pystylometry 0.1.0-py3-none-any.whl → 1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/syntactic/__init__.py
@@ -1,9 +1,13 @@
 """Syntactic analysis metrics (requires spaCy)."""

+from .advanced_syntactic import compute_advanced_syntactic
 from .pos_ratios import compute_pos_ratios
 from .sentence_stats import compute_sentence_stats
+from .sentence_types import compute_sentence_types

 __all__ = [
     "compute_pos_ratios",
     "compute_sentence_stats",
+    "compute_advanced_syntactic",
+    "compute_sentence_types",
 ]
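With the updated __init__.py, the two new analyzers are importable directly from pystylometry.syntactic. A minimal usage sketch, assuming pystylometry 1.1.0 is installed together with the optional spaCy dependency and the en_core_web_sm model; the sample text is illustrative, and the printed fields come from the function signature and result fields shown in the next hunk:

    from pystylometry.syntactic import compute_advanced_syntactic

    # The syntactic module needs spaCy plus a downloaded model, e.g.:
    #   pip install spacy && python -m spacy download en_core_web_sm
    sample = (
        "Although the draft was finished early, the committee, "
        "which rarely agrees, requested revisions."
    )
    result = compute_advanced_syntactic(sample, model="en_core_web_sm")
    print(result.t_unit_count, result.clausal_density, result.passive_voice_ratio)

compute_sentence_types is exported the same way; its implementation lives in sentence_types.py and is not shown in this hunk.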
pystylometry/syntactic/advanced_syntactic.py
@@ -0,0 +1,494 @@
+"""Advanced syntactic analysis using dependency parsing.
+
+This module provides sophisticated syntactic metrics beyond basic POS tagging.
+Using dependency parsing, it extracts features related to sentence complexity,
+grammatical sophistication, and syntactic style preferences.
+
+Related GitHub Issue:
+    #17 - Advanced Syntactic Analysis
+    https://github.com/craigtrim/pystylometry/issues/17
+
+Features implemented:
+    - Parse tree depth (sentence structural complexity)
+    - T-units (minimal terminable units - independent clauses with modifiers)
+    - Clausal density (clauses per T-unit)
+    - Dependent clause ratio
+    - Passive voice ratio
+    - Subordination and coordination indices
+    - Dependency distance metrics
+    - Branching direction (left vs. right)
+
+References:
+    Hunt, K. W. (1965). Grammatical structures written at three grade levels.
+        NCTE Research Report No. 3.
+    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+    Lu, X. (2010). Automatic analysis of syntactic complexity in second language
+        writing. International Journal of Corpus Linguistics, 15(4), 474-496.
+    Gibson, E. (2000). The dependency locality theory: A distance-based theory
+        of linguistic complexity. In Image, language, brain (pp. 95-126).
+"""
+
+from typing import Any
+
+from .._types import AdvancedSyntacticResult, Distribution, make_distribution
+from .._utils import check_optional_dependency
+
+# Type aliases for spaCy objects (loaded dynamically)
+_SpaCyToken = Any
+_SpaCyDoc = Any
+_SpaCySpan = Any
+
+
+def compute_advanced_syntactic(
+    text: str,
+    model: str = "en_core_web_sm",
+    chunk_size: int = 1000,
+) -> AdvancedSyntacticResult:
+    """
+    Compute advanced syntactic complexity metrics using dependency parsing.
+
+    This function uses spaCy's dependency parser to extract sophisticated
+    syntactic features that go beyond simple POS tagging. These features
+    capture sentence complexity, grammatical sophistication, and stylistic
+    preferences in syntactic structure.
+
+    Related GitHub Issue:
+        #17 - Advanced Syntactic Analysis
+        https://github.com/craigtrim/pystylometry/issues/17
+
+    Why syntactic complexity matters:
+        1. Correlates with writing proficiency and cognitive development
+        2. Distinguishes between genres (academic vs. conversational)
+        3. Captures authorial style preferences
+        4. Indicates text difficulty and readability
+        5. Varies systematically across languages and registers
+
+    Metrics computed:
+
+    Parse Tree Depth:
+        - Mean and maximum depth of dependency parse trees
+        - Deeper trees = more complex syntactic structures
+        - Indicates level of embedding and subordination
+
+    T-units:
+        - Minimal terminable units (Hunt 1965)
+        - Independent clause + all dependent clauses attached to it
+        - More reliable than sentence length for measuring complexity
+        - Mean T-unit length is standard complexity measure
+
+    Clausal Density:
+        - Number of clauses per T-unit
+        - Higher density = more complex, embedded structures
+        - Academic writing typically has higher clausal density
+
+    Passive Voice:
+        - Ratio of passive constructions to total sentences
+        - Academic/formal writing uses more passive voice
+        - Fiction/conversational writing uses more active voice
+
+    Subordination & Coordination:
+        - Subordination: Use of dependent clauses
+        - Coordination: Use of coordinate clauses (and, but, or)
+        - Balance indicates syntactic style
+
+    Dependency Distance:
+        - Average distance between heads and dependents
+        - Longer distances = more processing difficulty
+        - Related to working memory load
+
+    Branching Direction:
+        - Left-branching: Modifiers before head
+        - Right-branching: Modifiers after head
+        - English tends toward right-branching
+
+    Args:
+        text: Input text to analyze. Should contain multiple sentences for
+            reliable metrics. Very short texts may have unstable values.
+        model: spaCy model name with dependency parser. Default is "en_core_web_sm".
+            Larger models (en_core_web_md, en_core_web_lg) may provide better
+            parsing accuracy but are slower.
+
+    Returns:
+        AdvancedSyntacticResult containing:
+        - mean_parse_tree_depth: Average depth across all parse trees
+        - max_parse_tree_depth: Maximum depth in any parse tree
+        - t_unit_count: Number of T-units detected
+        - mean_t_unit_length: Average words per T-unit
+        - clausal_density: Clauses per T-unit
+        - dependent_clause_ratio: Dependent clauses / total clauses
+        - passive_voice_ratio: Passive sentences / total sentences
+        - subordination_index: Subordinate clauses / total clauses
+        - coordination_index: Coordinate clauses / total clauses
+        - sentence_complexity_score: Composite complexity metric
+        - dependency_distance: Mean distance between heads and dependents
+        - left_branching_ratio: Left-branching structures / total
+        - right_branching_ratio: Right-branching structures / total
+        - metadata: Parse tree details, clause counts, etc.
+
+    Example:
+        >>> result = compute_advanced_syntactic("Complex multi-clause text...")
+        >>> print(f"Parse tree depth: {result.mean_parse_tree_depth:.1f}")
+        Parse tree depth: 5.3
+        >>> print(f"T-units: {result.t_unit_count}")
+        T-units: 12
+        >>> print(f"Clausal density: {result.clausal_density:.2f}")
+        Clausal density: 2.4
+        >>> print(f"Passive voice: {result.passive_voice_ratio * 100:.1f}%")
+        Passive voice: 23.5%
+
+        >>> # Compare genres
+        >>> academic = compute_advanced_syntactic("Academic paper...")
+        >>> fiction = compute_advanced_syntactic("Fiction narrative...")
+        >>> print(f"Academic clausal density: {academic.clausal_density:.2f}")
+        >>> print(f"Fiction clausal density: {fiction.clausal_density:.2f}")
+        >>> # Academic typically higher
+
+    Note:
+        - Requires spaCy with dependency parser (small model minimum)
+        - Parse accuracy affects metrics (larger models are better)
+        - Very long sentences may have parsing errors
+        - Passive voice detection uses dependency patterns
+        - T-unit segmentation follows Hunt (1965) criteria
+        - Empty or very short texts return NaN for ratios
+    """
+    check_optional_dependency("spacy", "syntactic")
+
+    try:
+        import spacy  # type: ignore
+    except ImportError as e:
+        raise ImportError(
+            "spaCy is required for advanced syntactic analysis. "
+            "Install with: pip install spacy && python -m spacy download en_core_web_sm"
+        ) from e
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError as e:
+        raise OSError(
+            f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
+        ) from e
+
+    # Parse text
+    doc = nlp(text)
+    sentences = list(doc.sents)
+
+    # Handle empty text
+    if len(sentences) == 0 or len(doc) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+        return AdvancedSyntacticResult(
+            mean_parse_tree_depth=float("nan"),
+            max_parse_tree_depth=0,
+            t_unit_count=0,
+            mean_t_unit_length=float("nan"),
+            clausal_density=float("nan"),
+            dependent_clause_ratio=float("nan"),
+            passive_voice_ratio=float("nan"),
+            subordination_index=float("nan"),
+            coordination_index=float("nan"),
+            sentence_complexity_score=float("nan"),
+            dependency_distance=float("nan"),
+            left_branching_ratio=float("nan"),
+            right_branching_ratio=float("nan"),
+            mean_parse_tree_depth_dist=empty_dist,
+            max_parse_tree_depth_dist=empty_dist,
+            mean_t_unit_length_dist=empty_dist,
+            clausal_density_dist=empty_dist,
+            dependent_clause_ratio_dist=empty_dist,
+            passive_voice_ratio_dist=empty_dist,
+            subordination_index_dist=empty_dist,
+            coordination_index_dist=empty_dist,
+            sentence_complexity_score_dist=empty_dist,
+            dependency_distance_dist=empty_dist,
+            left_branching_ratio_dist=empty_dist,
+            right_branching_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
+            metadata={
+                "sentence_count": 0,
+                "word_count": 0,
+                "total_clauses": 0,
+                "warning": "Empty text or no sentences found",
+            },
+        )
+
+    # 1. Calculate parse tree depth
+    parse_depths = []
+    for sent in sentences:
+        depth = _calculate_max_tree_depth(sent.root)
+        parse_depths.append(depth)
+
+    mean_parse_tree_depth = sum(parse_depths) / len(parse_depths)
+    max_parse_tree_depth = max(parse_depths)
+
+    # 2. Calculate mean dependency distance
+    dependency_distances = []
+    for token in doc:
+        if token != token.head:  # Exclude root
+            distance = abs(token.i - token.head.i)
+            dependency_distances.append(distance)
+
+    if dependency_distances:
+        mean_dependency_distance = sum(dependency_distances) / len(dependency_distances)
+    else:
+        mean_dependency_distance = 0.0
+
+    # 3. Identify T-units and calculate mean T-unit length
+    t_units = _identify_t_units(doc)
+    t_unit_count = len(t_units)
+    t_unit_lengths = [len(t_unit) for t_unit in t_units]
+
+    if t_unit_count > 0:
+        mean_t_unit_length = sum(t_unit_lengths) / t_unit_count
+    else:
+        mean_t_unit_length = float("nan")
+
+    # 4. Count clauses (total, dependent, subordinate, coordinate)
+    total_clauses = 0
+    dependent_clause_count = 0
+    subordinate_clause_count = 0
+    coordinate_clause_count = 0
+
+    for sent in sentences:
+        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(sent)
+        total_clauses += sent_total
+        dependent_clause_count += sent_dependent
+        subordinate_clause_count += sent_subordinate
+        coordinate_clause_count += sent_coordinate
+
+    # Calculate ratios
+    if total_clauses > 0:
+        dependent_clause_ratio = dependent_clause_count / total_clauses
+        subordination_index = subordinate_clause_count / total_clauses
+        coordination_index = coordinate_clause_count / total_clauses
+    else:
+        dependent_clause_ratio = float("nan")
+        subordination_index = float("nan")
+        coordination_index = float("nan")
+
+    if t_unit_count > 0:
+        clausal_density = total_clauses / t_unit_count
+    else:
+        clausal_density = float("nan")
+
+    # 5. Detect passive voice
+    passive_sentence_count = sum(1 for sent in sentences if _is_passive_voice(sent))
+    passive_voice_ratio = passive_sentence_count / len(sentences)
+
+    # 6. Calculate branching direction
+    left_branching = 0
+    right_branching = 0
+
+    for token in doc:
+        if token != token.head:  # Exclude root
+            if token.i < token.head.i:
+                left_branching += 1
+            else:
+                right_branching += 1
+
+    total_branching = left_branching + right_branching
+    if total_branching > 0:
+        left_branching_ratio = left_branching / total_branching
+        right_branching_ratio = right_branching / total_branching
+    else:
+        left_branching_ratio = float("nan")
+        right_branching_ratio = float("nan")
+
+    # 7. Calculate sentence complexity score (composite metric)
+    # Normalize individual metrics to 0-1 range
+    normalized_parse_depth = min(mean_parse_tree_depth / 10, 1.0)
+    normalized_clausal_density = (
+        min(clausal_density / 3, 1.0)
+        if not isinstance(clausal_density, float) or not (clausal_density != clausal_density)
+        else 0.0
+    )
+    normalized_t_unit_length = (
+        min(mean_t_unit_length / 25, 1.0)
+        if not isinstance(mean_t_unit_length, float)
+        or not (mean_t_unit_length != mean_t_unit_length)
+        else 0.0
+    )
+    normalized_dependency_distance = min(mean_dependency_distance / 5, 1.0)
+    normalized_subordination = (
+        subordination_index
+        if not isinstance(subordination_index, float)
+        or not (subordination_index != subordination_index)
+        else 0.0
+    )
+
+    # Weighted combination
+    sentence_complexity_score = (
+        0.3 * normalized_parse_depth
+        + 0.3 * normalized_clausal_density
+        + 0.2 * normalized_t_unit_length
+        + 0.1 * normalized_subordination
+        + 0.1 * normalized_dependency_distance
+    )
+
+    # Create single-value distributions (analysis is done on full text)
+    mean_parse_tree_depth_dist = make_distribution([mean_parse_tree_depth])
+    max_parse_tree_depth_dist = make_distribution([float(max_parse_tree_depth)])
+    mean_t_unit_length_dist = make_distribution([mean_t_unit_length])
+    clausal_density_dist = make_distribution([clausal_density])
+    dependent_clause_ratio_dist = make_distribution([dependent_clause_ratio])
+    passive_voice_ratio_dist = make_distribution([passive_voice_ratio])
+    subordination_index_dist = make_distribution([subordination_index])
+    coordination_index_dist = make_distribution([coordination_index])
+    sentence_complexity_score_dist = make_distribution([sentence_complexity_score])
+    dependency_distance_dist = make_distribution([mean_dependency_distance])
+    left_branching_ratio_dist = make_distribution([left_branching_ratio])
+    right_branching_ratio_dist = make_distribution([right_branching_ratio])
+
+    # Collect metadata
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(doc),
+        "total_clauses": total_clauses,
+        "independent_clause_count": total_clauses - dependent_clause_count,
+        "dependent_clause_count": dependent_clause_count,
+        "subordinate_clause_count": subordinate_clause_count,
+        "coordinate_clause_count": coordinate_clause_count,
+        "passive_sentence_count": passive_sentence_count,
+        "parse_depths_per_sentence": parse_depths,
+        "t_unit_lengths": t_unit_lengths,
+        "t_unit_count": t_unit_count,
+        "dependency_distances": dependency_distances[:100],  # Sample for brevity
+        "left_branching_count": left_branching,
+        "right_branching_count": right_branching,
+        "model_used": model,
+    }
+
+    return AdvancedSyntacticResult(
+        mean_parse_tree_depth=mean_parse_tree_depth,
+        max_parse_tree_depth=max_parse_tree_depth,
+        t_unit_count=t_unit_count,
+        mean_t_unit_length=mean_t_unit_length,
+        clausal_density=clausal_density,
+        dependent_clause_ratio=dependent_clause_ratio,
+        passive_voice_ratio=passive_voice_ratio,
+        subordination_index=subordination_index,
+        coordination_index=coordination_index,
+        sentence_complexity_score=sentence_complexity_score,
+        dependency_distance=mean_dependency_distance,
+        left_branching_ratio=left_branching_ratio,
+        right_branching_ratio=right_branching_ratio,
+        mean_parse_tree_depth_dist=mean_parse_tree_depth_dist,
+        max_parse_tree_depth_dist=max_parse_tree_depth_dist,
+        mean_t_unit_length_dist=mean_t_unit_length_dist,
+        clausal_density_dist=clausal_density_dist,
+        dependent_clause_ratio_dist=dependent_clause_ratio_dist,
+        passive_voice_ratio_dist=passive_voice_ratio_dist,
+        subordination_index_dist=subordination_index_dist,
+        coordination_index_dist=coordination_index_dist,
+        sentence_complexity_score_dist=sentence_complexity_score_dist,
+        dependency_distance_dist=dependency_distance_dist,
+        left_branching_ratio_dist=left_branching_ratio_dist,
+        right_branching_ratio_dist=right_branching_ratio_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
+        metadata=metadata,
+    )
+
+
+def _calculate_max_tree_depth(token: _SpaCyToken) -> int:
+    """
+    Calculate maximum depth of dependency tree starting from token.
+
+    Args:
+        token: spaCy Token to start from (typically sentence root)
+
+    Returns:
+        Maximum depth of tree (root = 0, children = parent + 1)
+    """
+    if not list(token.children):
+        return 0
+
+    child_depths = [_calculate_max_tree_depth(child) for child in token.children]
+    return max(child_depths) + 1
+
+
+def _identify_t_units(doc: _SpaCyDoc) -> list[_SpaCySpan]:
+    """
+    Identify T-units (minimal terminable units) in document.
+
+    A T-unit is one main clause plus all subordinate clauses attached to it.
+    This follows Hunt (1965) definition.
+
+    Args:
+        doc: spaCy Doc object
+
+    Returns:
+        List of spaCy Span objects, each representing a T-unit
+    """
+    # For simplicity, treat each sentence as a T-unit
+    # More sophisticated approach would split compound sentences
+    # into separate T-units, but this requires complex coordination analysis
+    return list(doc.sents)
+
+
+def _count_clauses(sent: _SpaCySpan) -> tuple[int, int, int, int]:
+    """
+    Count different types of clauses in sentence.
+
+    Args:
+        sent: spaCy Span representing a sentence
+
+    Returns:
+        Tuple of (total_clauses, dependent_clauses, subordinate_clauses, coordinate_clauses)
+    """
+    # Start with 1 for the main clause
+    total = 1
+    dependent = 0
+    subordinate = 0
+    coordinate = 0
+
+    # Dependency labels that indicate clauses
+    dependent_clause_labels = {"csubj", "ccomp", "xcomp", "advcl", "acl", "relcl"}
+    subordinate_clause_labels = {"advcl", "acl", "relcl"}
+    coordinate_clause_labels = {"conj"}
+
+    for token in sent:
+        if token.dep_ in dependent_clause_labels:
+            total += 1
+            dependent += 1
+            if token.dep_ in subordinate_clause_labels:
+                subordinate += 1
+        elif token.dep_ in coordinate_clause_labels and token.pos_ == "VERB":
+            # Coordinate clause (conj) with verb = coordinated main clause
+            total += 1
+            coordinate += 1
+
+    return total, dependent, subordinate, coordinate
+
+
+def _is_passive_voice(sent: _SpaCySpan) -> bool:
+    """
+    Detect if sentence contains passive voice construction.
+
+    Args:
+        sent: spaCy Span representing a sentence
+
+    Returns:
+        True if passive voice detected, False otherwise
+    """
+    # Look for passive auxiliary + past participle pattern
+    for token in sent:
+        # Check for passive subject dependency (older spaCy versions)
+        if token.dep_ == "nsubjpass":
+            return True
+        # Check for passive auxiliary + past participle (newer spaCy versions)
+        # In newer spaCy, passive is marked with nsubj:pass or through aux:pass
+        if "pass" in token.dep_:
+            return True
+        # Alternative: check for "be" verb + past participle
+        if token.dep_ == "auxpass":
+            return True
+
+    return False
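The composite sentence_complexity_score computed above is a fixed weighted sum of five capped sub-metrics: mean parse depth (capped at 10), clausal density (capped at 3), mean T-unit length (capped at 25), subordination index, and mean dependency distance (capped at 5), with weights 0.3/0.3/0.2/0.1/0.1. A standalone re-derivation of that arithmetic, using made-up input values rather than any real text, shows how the score scales:

    # Mirrors the weighting in compute_advanced_syntactic; inputs are illustrative.
    def complexity_score(depth: float, density: float, t_unit_len: float,
                         subordination: float, dep_dist: float) -> float:
        # Each raw metric is squashed into [0, 1] before weighting.
        return (
            0.3 * min(depth / 10, 1.0)
            + 0.3 * min(density / 3, 1.0)
            + 0.2 * min(t_unit_len / 25, 1.0)
            + 0.1 * subordination
            + 0.1 * min(dep_dist / 5, 1.0)
        )

    # depth 5.0, density 2.4, T-unit length 18, subordination 0.3, distance 2.5
    print(complexity_score(5.0, 2.4, 18.0, 0.3, 2.5))  # about 0.614

Because every term is capped and the subordination index is already a ratio, the score stays in [0, 1], with deeper, denser, longer-range syntax pushing it toward 1.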