pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
@@ -0,0 +1,131 @@
1
+ """Stylistic markers for authorship attribution.
2
+
3
+ This module identifies and analyzes specific linguistic features that authors
4
+ use consistently and often subconsciously. These markers include contraction
5
+ preferences, intensifier usage, hedging patterns, modal auxiliaries, negation
6
+ patterns, and punctuation style habits.
7
+
8
+ Related GitHub Issue:
9
+ #20 - Stylistic Markers
10
+ https://github.com/craigtrim/pystylometry/issues/20
11
+
12
+ Categories of stylistic markers:
13
+ - Contraction patterns (can't vs. cannot, I'm vs. I am)
14
+ - Intensifiers (very, really, extremely, quite)
15
+ - Hedges (maybe, perhaps, probably, somewhat)
16
+ - Modal auxiliaries (can, could, may, might, must, should, will, would)
17
+ - Negation patterns (not, no, never, none, neither)
18
+ - Punctuation style (exclamations, questions, quotes, parentheticals)
19
+
20
+ References:
21
+ Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
22
+ words for authorship attribution. ACH/ALLC.
23
+ Pennebaker, J. W. (2011). The secret life of pronouns. Bloomsbury Press.
24
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
25
+ """
26
+
27
+ from .._types import StylisticMarkersResult
28
+
29
+
30
def compute_stylistic_markers(text: str) -> StylisticMarkersResult:
    """
    Analyze stylistic markers for authorship attribution.

    Quantifies linguistic habits that authors apply consistently and largely
    subconsciously — making them hard to disguise deliberately and therefore
    valuable for attribution (Argamon & Levitan 2005; Pennebaker 2011;
    Biber 1988). They are also genre-independent (more stable than content
    words) and psychologically meaningful, reflecting personality and
    emotional state per Pennebaker's research.

    Marker categories analyzed:
        1. Contractions — contracted vs. expanded forms (can't/cannot,
           I'm/I am, won't/will not); a formality indicator (more
           contractions = more informal).
        2. Intensifiers — amplifying words (very, really, extremely, quite,
           rather); signal an emphatic style.
        3. Hedges — qualifying words (maybe, perhaps, probably, somewhat,
           kind of); signal a tentative or cautious style.
        4. Modal auxiliaries — epistemic modals (may, might, could:
           possibility) vs. deontic modals (must, should, ought:
           obligation).
        5. Negation — frequency and type of negative expression (not, no,
           never, none, neither, nowhere).
        6. Punctuation style — exclamations (emphatic), questions
           (interactive/rhetorical), quotation marks (dialogue, scare
           quotes), parentheticals (asides), ellipses (trailing off),
           dashes (interruptions), semicolons/colons (sophisticated
           syntax).

    Related GitHub Issue:
        #20 - Stylistic Markers
        https://github.com/craigtrim/pystylometry/issues/20

    Args:
        text: Input text to analyze. Should contain at least 200+ words for
            reliable statistics; shorter texts may have unstable marker
            ratios.

    Returns:
        StylisticMarkersResult containing extensive marker statistics.
        See _types.py for the complete field list.

    Example:
        >>> result = compute_stylistic_markers("Sample text with markers...")
        >>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
        Contraction ratio: 42.3%
        >>> print(f"Intensifiers/100 words: {result.intensifier_density:.2f}")
        Intensifiers/100 words: 3.45
        >>> print(f"Top intensifiers: {result.top_intensifiers[:3]}")
        Top intensifiers: [('very', 12), ('really', 8), ('quite', 5)]

    Note:
        - Densities are reported per 100 words for interpretability
        - Contraction detection requires pattern matching
        - Modal auxiliaries are classified as epistemic or deontic
        - Punctuation counts include all occurrences
        - Empty text returns NaN for ratios, 0 for counts
    """
    # TODO: Implement stylistic marker analysis (GitHub Issue #20).
    # This is a large feature — implement it in logical sections per the
    # plan and curated word lists in the issue:
    # https://github.com/craigtrim/pystylometry/issues/20
    raise NotImplementedError(
        "Stylistic markers not yet implemented. "
        "See GitHub Issue #20: https://github.com/craigtrim/pystylometry/issues/20"
    )
@@ -0,0 +1,47 @@
1
+ """Vocabulary overlap and similarity metrics.
2
+
3
+ This module computes similarity measures between two texts based on their
4
+ shared vocabulary. Useful for authorship verification, plagiarism detection,
5
+ and measuring stylistic consistency.
6
+
7
+ Related GitHub Issue:
8
+ #21 - Vocabulary Overlap and Similarity Metrics
9
+ https://github.com/craigtrim/pystylometry/issues/21
10
+
11
+ References:
12
+ Jaccard, P. (1912). The distribution of the flora in the alpine zone.
13
+ Salton, G., & McGill, M. J. (1983). Introduction to Modern Information Retrieval.
14
+ """
15
+
16
+ from .._types import VocabularyOverlapResult
17
+
18
+
19
def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResult:
    """
    Compute vocabulary overlap and similarity between two texts.

    Measures how much vocabulary two texts share — useful for authorship
    verification, plagiarism detection, and stylistic-consistency checks.

    Related GitHub Issue:
        #21 - Vocabulary Overlap and Similarity Metrics
        https://github.com/craigtrim/pystylometry/issues/21

    Args:
        text1: First text to compare
        text2: Second text to compare

    Returns:
        VocabularyOverlapResult with Jaccard, Dice, cosine similarities,
        shared vocabulary statistics, and distinctive words for each text.

    Example:
        >>> result = compute_vocabulary_overlap(text1, text2)
        >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
        Jaccard similarity: 0.456
        >>> print(f"Shared words: {result.shared_vocab_size}")
        Shared words: 234
    """
    # TODO: Implement vocabulary overlap analysis (GitHub Issue #21).
    # https://github.com/craigtrim/pystylometry/issues/21
    raise NotImplementedError(
        "Vocabulary overlap not yet implemented. "
        "See GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21"
    )
@@ -1,9 +1,13 @@
"""Syntactic analysis metrics (requires spaCy)."""

from .advanced_syntactic import compute_advanced_syntactic
from .pos_ratios import compute_pos_ratios
from .sentence_stats import compute_sentence_stats
from .sentence_types import compute_sentence_types

# Public API of the syntactic subpackage; controls `from ... import *`.
__all__ = [
    "compute_pos_ratios",
    "compute_sentence_stats",
    "compute_advanced_syntactic",
    "compute_sentence_types",
]
@@ -0,0 +1,432 @@
1
+ """Advanced syntactic analysis using dependency parsing.
2
+
3
+ This module provides sophisticated syntactic metrics beyond basic POS tagging.
4
+ Using dependency parsing, it extracts features related to sentence complexity,
5
+ grammatical sophistication, and syntactic style preferences.
6
+
7
+ Related GitHub Issue:
8
+ #17 - Advanced Syntactic Analysis
9
+ https://github.com/craigtrim/pystylometry/issues/17
10
+
11
+ Features implemented:
12
+ - Parse tree depth (sentence structural complexity)
13
+ - T-units (minimal terminable units - independent clauses with modifiers)
14
+ - Clausal density (clauses per T-unit)
15
+ - Dependent clause ratio
16
+ - Passive voice ratio
17
+ - Subordination and coordination indices
18
+ - Dependency distance metrics
19
+ - Branching direction (left vs. right)
20
+
21
+ References:
22
+ Hunt, K. W. (1965). Grammatical structures written at three grade levels.
23
+ NCTE Research Report No. 3.
24
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
25
+ Lu, X. (2010). Automatic analysis of syntactic complexity in second language
26
+ writing. International Journal of Corpus Linguistics, 15(4), 474-496.
27
+ Gibson, E. (2000). The dependency locality theory: A distance-based theory
28
+ of linguistic complexity. In Image, language, brain (pp. 95-126).
29
+ """
30
+
31
+ from .._types import AdvancedSyntacticResult
32
+ from .._utils import check_optional_dependency
33
+
34
+
35
def compute_advanced_syntactic(
    text: str,
    model: str = "en_core_web_sm",
) -> AdvancedSyntacticResult:
    """
    Compute advanced syntactic complexity metrics using dependency parsing.

    Uses spaCy's dependency parser to extract features beyond simple POS
    tagging: sentence complexity, grammatical sophistication, and stylistic
    preferences in syntactic structure. Syntactic complexity correlates with
    writing proficiency, distinguishes genres (academic vs. conversational),
    and captures authorial style.

    Related GitHub Issue:
        #17 - Advanced Syntactic Analysis
        https://github.com/craigtrim/pystylometry/issues/17

    Metrics computed:
        - Parse tree depth: mean/max depth of dependency parse trees.
          Deeper trees = more embedding and subordination.
        - T-units: minimal terminable units (Hunt 1965) — one independent
          clause plus its attached dependent clauses. Mean T-unit length is
          a standard complexity measure, more reliable than sentence length.
        - Clausal density: clauses per T-unit; typically higher in academic
          writing.
        - Dependent clause ratio, subordination and coordination indices:
          the balance of dependent vs. coordinate clauses indicates
          syntactic style.
        - Passive voice ratio: passive sentences / total sentences; higher
          in academic/formal registers.
        - Dependency distance: mean head-dependent distance; longer
          distances imply greater working-memory load (Gibson 2000).
        - Branching direction: left- vs. right-branching ratios (English
          tends toward right-branching).
        - sentence_complexity_score: weighted composite of the above.

    Args:
        text: Input text to analyze. Should contain multiple sentences for
            reliable metrics; very short texts may yield unstable values.
        model: spaCy model name with a dependency parser. Default is
            "en_core_web_sm". Larger models (en_core_web_md, en_core_web_lg)
            may parse more accurately but are slower.

    Returns:
        AdvancedSyntacticResult with all metrics above plus a metadata dict
        (per-sentence parse depths, clause counts, branching counts, a
        sample of dependency distances, and the model name).

    Raises:
        ImportError: If spaCy is not installed.
        OSError: If the requested spaCy model is not downloaded.

    Note:
        - Empty or sentence-less text returns NaN for ratios and 0 counts.
        - Parse accuracy affects all metrics; larger models are better.
        - Passive voice detection uses dependency-label patterns.
        - T-unit segmentation follows Hunt (1965) criteria.
    """
    import math  # local import: keeps the module's import-time deps unchanged

    check_optional_dependency("spacy", "syntactic")

    try:
        import spacy  # type: ignore
    except ImportError as e:
        raise ImportError(
            "spaCy is required for advanced syntactic analysis. "
            "Install with: pip install spacy && python -m spacy download en_core_web_sm"
        ) from e

    # Load spaCy model
    try:
        nlp = spacy.load(model)
    except OSError as e:
        raise OSError(
            f"spaCy model '{model}' not found. "
            f"Download with: python -m spacy download {model}"
        ) from e

    # Parse text
    doc = nlp(text)
    sentences = list(doc.sents)

    # Handle empty text: all ratios NaN, all counts zero.
    if len(sentences) == 0 or len(doc) == 0:
        return AdvancedSyntacticResult(
            mean_parse_tree_depth=float("nan"),
            max_parse_tree_depth=0,
            t_unit_count=0,
            mean_t_unit_length=float("nan"),
            clausal_density=float("nan"),
            dependent_clause_ratio=float("nan"),
            passive_voice_ratio=float("nan"),
            subordination_index=float("nan"),
            coordination_index=float("nan"),
            sentence_complexity_score=float("nan"),
            dependency_distance=float("nan"),
            left_branching_ratio=float("nan"),
            right_branching_ratio=float("nan"),
            metadata={
                "sentence_count": 0,
                "word_count": 0,
                "total_clauses": 0,
                "warning": "Empty text or no sentences found",
            },
        )

    # 1. Parse tree depth per sentence
    parse_depths = [_calculate_max_tree_depth(sent.root) for sent in sentences]
    mean_parse_tree_depth = sum(parse_depths) / len(parse_depths)
    max_parse_tree_depth = max(parse_depths)

    # 2. Mean dependency distance (sentence roots are their own head; skip them)
    dependency_distances = [
        abs(token.i - token.head.i) for token in doc if token != token.head
    ]
    if dependency_distances:
        mean_dependency_distance = sum(dependency_distances) / len(dependency_distances)
    else:
        mean_dependency_distance = 0.0

    # 3. Identify T-units and calculate mean T-unit length
    t_units = _identify_t_units(doc)
    t_unit_count = len(t_units)
    t_unit_lengths = [len(t_unit) for t_unit in t_units]

    if t_unit_count > 0:
        mean_t_unit_length = sum(t_unit_lengths) / t_unit_count
    else:
        mean_t_unit_length = float("nan")

    # 4. Count clauses (total, dependent, subordinate, coordinate)
    total_clauses = 0
    dependent_clause_count = 0
    subordinate_clause_count = 0
    coordinate_clause_count = 0

    for sent in sentences:
        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(
            sent
        )
        total_clauses += sent_total
        dependent_clause_count += sent_dependent
        subordinate_clause_count += sent_subordinate
        coordinate_clause_count += sent_coordinate

    # Clause-type ratios (NaN when no clauses — defensive; total is >= 1
    # per sentence with the current _count_clauses implementation)
    if total_clauses > 0:
        dependent_clause_ratio = dependent_clause_count / total_clauses
        subordination_index = subordinate_clause_count / total_clauses
        coordination_index = coordinate_clause_count / total_clauses
    else:
        dependent_clause_ratio = float("nan")
        subordination_index = float("nan")
        coordination_index = float("nan")

    clausal_density = total_clauses / t_unit_count if t_unit_count > 0 else float("nan")

    # 5. Detect passive voice
    passive_sentence_count = sum(1 for sent in sentences if _is_passive_voice(sent))
    passive_voice_ratio = passive_sentence_count / len(sentences)

    # 6. Branching direction: dependent before its head = left-branching
    left_branching = 0
    right_branching = 0

    for token in doc:
        if token != token.head:  # Exclude root
            if token.i < token.head.i:
                left_branching += 1
            else:
                right_branching += 1

    total_branching = left_branching + right_branching
    if total_branching > 0:
        left_branching_ratio = left_branching / total_branching
        right_branching_ratio = right_branching / total_branching
    else:
        left_branching_ratio = float("nan")
        right_branching_ratio = float("nan")

    # 7. Composite complexity score.
    # Each component is normalized to [0, 1] (divisors are empirical "high
    # complexity" ceilings). NaN components contribute 0 instead of
    # poisoning the weighted sum.
    def _or_zero(value: float) -> float:
        # math.isnan is clearer and less error-prone than the previous
        # `not isinstance(x, float) or not (x != x)` double-negative.
        return 0.0 if math.isnan(value) else value

    normalized_parse_depth = min(mean_parse_tree_depth / 10, 1.0)
    normalized_clausal_density = min(_or_zero(clausal_density) / 3, 1.0)
    normalized_t_unit_length = min(_or_zero(mean_t_unit_length) / 25, 1.0)
    normalized_dependency_distance = min(mean_dependency_distance / 5, 1.0)
    normalized_subordination = _or_zero(subordination_index)

    # Weighted combination (weights sum to 1.0)
    sentence_complexity_score = (
        0.3 * normalized_parse_depth
        + 0.3 * normalized_clausal_density
        + 0.2 * normalized_t_unit_length
        + 0.1 * normalized_subordination
        + 0.1 * normalized_dependency_distance
    )

    # Collect metadata
    metadata = {
        "sentence_count": len(sentences),
        "word_count": len(doc),
        "total_clauses": total_clauses,
        "independent_clause_count": total_clauses - dependent_clause_count,
        "dependent_clause_count": dependent_clause_count,
        "subordinate_clause_count": subordinate_clause_count,
        "coordinate_clause_count": coordinate_clause_count,
        "passive_sentence_count": passive_sentence_count,
        "parse_depths_per_sentence": parse_depths,
        "t_unit_lengths": t_unit_lengths,
        "t_unit_count": t_unit_count,
        "dependency_distances": dependency_distances[:100],  # Sample for brevity
        "left_branching_count": left_branching,
        "right_branching_count": right_branching,
        "model_used": model,
    }

    return AdvancedSyntacticResult(
        mean_parse_tree_depth=mean_parse_tree_depth,
        max_parse_tree_depth=max_parse_tree_depth,
        t_unit_count=t_unit_count,
        mean_t_unit_length=mean_t_unit_length,
        clausal_density=clausal_density,
        dependent_clause_ratio=dependent_clause_ratio,
        passive_voice_ratio=passive_voice_ratio,
        subordination_index=subordination_index,
        coordination_index=coordination_index,
        sentence_complexity_score=sentence_complexity_score,
        dependency_distance=mean_dependency_distance,
        left_branching_ratio=left_branching_ratio,
        right_branching_ratio=right_branching_ratio,
        metadata=metadata,
    )
336
+
337
+
338
+ def _calculate_max_tree_depth(token) -> int:
339
+ """
340
+ Calculate maximum depth of dependency tree starting from token.
341
+
342
+ Args:
343
+ token: spaCy Token to start from (typically sentence root)
344
+
345
+ Returns:
346
+ Maximum depth of tree (root = 0, children = parent + 1)
347
+ """
348
+ if not list(token.children):
349
+ return 0
350
+
351
+ child_depths = [_calculate_max_tree_depth(child) for child in token.children]
352
+ return max(child_depths) + 1
353
+
354
+
355
+ def _identify_t_units(doc) -> list:
356
+ """
357
+ Identify T-units (minimal terminable units) in document.
358
+
359
+ A T-unit is one main clause plus all subordinate clauses attached to it.
360
+ This follows Hunt (1965) definition.
361
+
362
+ Args:
363
+ doc: spaCy Doc object
364
+
365
+ Returns:
366
+ List of spaCy Span objects, each representing a T-unit
367
+ """
368
+ # For simplicity, treat each sentence as a T-unit
369
+ # More sophisticated approach would split compound sentences
370
+ # into separate T-units, but this requires complex coordination analysis
371
+ return list(doc.sents)
372
+
373
+
374
+ def _count_clauses(sent) -> tuple[int, int, int, int]:
375
+ """
376
+ Count different types of clauses in sentence.
377
+
378
+ Args:
379
+ sent: spaCy Span representing a sentence
380
+
381
+ Returns:
382
+ Tuple of (total_clauses, dependent_clauses, subordinate_clauses, coordinate_clauses)
383
+ """
384
+ # Start with 1 for the main clause
385
+ total = 1
386
+ dependent = 0
387
+ subordinate = 0
388
+ coordinate = 0
389
+
390
+ # Dependency labels that indicate clauses
391
+ dependent_clause_labels = {"csubj", "ccomp", "xcomp", "advcl", "acl", "relcl"}
392
+ subordinate_clause_labels = {"advcl", "acl", "relcl"}
393
+ coordinate_clause_labels = {"conj"}
394
+
395
+ for token in sent:
396
+ if token.dep_ in dependent_clause_labels:
397
+ total += 1
398
+ dependent += 1
399
+ if token.dep_ in subordinate_clause_labels:
400
+ subordinate += 1
401
+ elif token.dep_ in coordinate_clause_labels and token.pos_ == "VERB":
402
+ # Coordinate clause (conj) with verb = coordinated main clause
403
+ total += 1
404
+ coordinate += 1
405
+
406
+ return total, dependent, subordinate, coordinate
407
+
408
+
409
+ def _is_passive_voice(sent) -> bool:
410
+ """
411
+ Detect if sentence contains passive voice construction.
412
+
413
+ Args:
414
+ sent: spaCy Span representing a sentence
415
+
416
+ Returns:
417
+ True if passive voice detected, False otherwise
418
+ """
419
+ # Look for passive auxiliary + past participle pattern
420
+ for token in sent:
421
+ # Check for passive subject dependency (older spaCy versions)
422
+ if token.dep_ == "nsubjpass":
423
+ return True
424
+ # Check for passive auxiliary + past participle (newer spaCy versions)
425
+ # In newer spaCy, passive is marked with nsubj:pass or through aux:pass
426
+ if "pass" in token.dep_:
427
+ return True
428
+ # Alternative: check for "be" verb + past participle
429
+ if token.dep_ == "auxpass":
430
+ return True
431
+
432
+ return False