pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/stylistic/vocabulary_overlap.py
@@ -10,15 +10,275 @@ Related GitHub Issue:
 
 References:
     Jaccard, P. (1912). The distribution of the flora in the alpine zone.
-    Salton, G., & McGill, M. J. (1983). Introduction to Modern Information Retrieval.
+        New Phytologist, 11(2), 37-50.
+    Sørensen, T. (1948). A method of establishing groups of equal amplitude in
+        plant sociology based on similarity of species. Kongelige Danske
+        Videnskabernes Selskab, 5(4), 1-34.
+    Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+        Retrieval. McGraw-Hill.
+    Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+        Annals of Mathematical Statistics, 22(1), 79-86.
+    Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+        MIT Press.
 """
 
+from __future__ import annotations
+
+import math
+import re
+from collections import Counter
+
 from .._types import VocabularyOverlapResult
 
 
-def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResult:
+def _tokenize(text: str) -> list[str]:
+    """Tokenize text into lowercase words.
+
+    Uses a simple regex-based tokenizer that extracts word characters.
+    Converts to lowercase for case-insensitive comparison.
+
+    Args:
+        text: Input text to tokenize
+
+    Returns:
+        List of lowercase word tokens
+    """
+    # Match word characters, convert to lowercase
+    tokens = re.findall(r"\b[a-zA-Z]+\b", text.lower())
+    return tokens
+
+
+def _compute_jaccard(set1: set[str], set2: set[str]) -> float:
+    """Compute Jaccard similarity coefficient.
+
+    The Jaccard index measures similarity as the size of the intersection
+    divided by the size of the union of two sets.
+
+    J(A, B) = |A ∩ B| / |A ∪ B|
+
+    Args:
+        set1: First vocabulary set
+        set2: Second vocabulary set
+
+    Returns:
+        Jaccard similarity coefficient (0.0 to 1.0)
+
+    References:
+        Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+    """
+    if not set1 and not set2:
+        return 1.0  # Both empty = identical
+
+    intersection = len(set1 & set2)
+    union = len(set1 | set2)
+
+    return intersection / union if union > 0 else 0.0
+
+
+def _compute_dice(set1: set[str], set2: set[str]) -> float:
+    """Compute Sørensen-Dice coefficient.
+
+    The Dice coefficient is similar to Jaccard but weights the intersection
+    more heavily. Also known as the Sørensen-Dice index.
+
+    D(A, B) = 2|A ∩ B| / (|A| + |B|)
+
+    Args:
+        set1: First vocabulary set
+        set2: Second vocabulary set
+
+    Returns:
+        Dice coefficient (0.0 to 1.0)
+
+    References:
+        Sørensen, T. (1948). A method of establishing groups of equal amplitude
+        in plant sociology based on similarity of species.
     """
-    Compute vocabulary overlap and similarity between two texts.
+    if not set1 and not set2:
+        return 1.0  # Both empty = identical
+
+    intersection = len(set1 & set2)
+    total_size = len(set1) + len(set2)
+
+    return (2 * intersection) / total_size if total_size > 0 else 0.0
+
+
+def _compute_overlap_coefficient(set1: set[str], set2: set[str]) -> float:
+    """Compute overlap coefficient.
+
+    The overlap coefficient measures the overlap relative to the smaller set.
+    Useful when comparing texts of very different lengths.
+
+    O(A, B) = |A ∩ B| / min(|A|, |B|)
+
+    Args:
+        set1: First vocabulary set
+        set2: Second vocabulary set
+
+    Returns:
+        Overlap coefficient (0.0 to 1.0)
+    """
+    if not set1 or not set2:
+        return 0.0 if set1 or set2 else 1.0
+
+    intersection = len(set1 & set2)
+    min_size = min(len(set1), len(set2))
+
+    return intersection / min_size if min_size > 0 else 0.0
+
+
+def _compute_cosine_similarity(freq1: Counter[str], freq2: Counter[str], vocab: set[str]) -> float:
+    """Compute cosine similarity between term frequency vectors.
+
+    Treats each text as a vector in vocabulary space where each dimension
+    is the frequency of a word. Computes the cosine of the angle between vectors.
+
+    cos(θ) = (A · B) / (||A|| × ||B||)
+
+    Args:
+        freq1: Word frequencies for text 1
+        freq2: Word frequencies for text 2
+        vocab: Combined vocabulary (union of both texts)
+
+    Returns:
+        Cosine similarity (-1.0 to 1.0, though word frequencies yield 0.0 to 1.0)
+
+    References:
+        Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+        Retrieval.
+    """
+    if not vocab:
+        return 1.0  # Both empty = identical
+
+    # Compute dot product and magnitudes
+    dot_product = 0.0
+    magnitude1 = 0.0
+    magnitude2 = 0.0
+
+    for word in vocab:
+        f1 = freq1.get(word, 0)
+        f2 = freq2.get(word, 0)
+        dot_product += f1 * f2
+        magnitude1 += f1 * f1
+        magnitude2 += f2 * f2
+
+    magnitude1 = math.sqrt(magnitude1)
+    magnitude2 = math.sqrt(magnitude2)
+
+    if magnitude1 == 0 or magnitude2 == 0:
+        return 0.0
+
+    return dot_product / (magnitude1 * magnitude2)
+
+
+def _compute_kl_divergence(
+    freq1: Counter[str], freq2: Counter[str], vocab: set[str], smoothing: float = 1e-10
+) -> float:
+    """Compute Kullback-Leibler divergence from text1 to text2.
+
+    KL divergence measures how one probability distribution diverges from
+    another. It is asymmetric: D_KL(P || Q) ≠ D_KL(Q || P).
+
+    D_KL(P || Q) = Σ P(x) log(P(x) / Q(x))
+
+    A small smoothing value is added to avoid division by zero when Q(x) = 0.
+
+    Args:
+        freq1: Word frequencies for text 1 (P distribution)
+        freq2: Word frequencies for text 2 (Q distribution)
+        vocab: Combined vocabulary (union of both texts)
+        smoothing: Small value added to probabilities to avoid log(0)
+
+    Returns:
+        KL divergence (non-negative, unbounded above)
+
+    Note:
+        Returns 0.0 for identical distributions. Higher values indicate
+        greater difference between distributions.
+
+    References:
+        Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+    """
+    if not vocab:
+        return 0.0  # Both empty = identical
+
+    # Convert frequencies to probabilities
+    total1 = sum(freq1.values())
+    total2 = sum(freq2.values())
+
+    if total1 == 0 or total2 == 0:
+        return 0.0
+
+    kl_div = 0.0
+    for word in vocab:
+        p = (freq1.get(word, 0) / total1) + smoothing
+        q = (freq2.get(word, 0) / total2) + smoothing
+        kl_div += p * math.log(p / q)
+
+    return max(0.0, kl_div)  # Ensure non-negative due to smoothing artifacts
+
+
+def _compute_tfidf_distinctive_words(
+    freq1: Counter[str],
+    freq2: Counter[str],
+    unique_to_1: set[str],
+    unique_to_2: set[str],
+    top_n: int = 20,
+) -> tuple[list[tuple[str, float]], list[tuple[str, float]]]:
+    """Compute distinctive words for each text using TF-IDF-like scoring.
+
+    Words unique to each text are scored by their frequency, providing
+    a measure of how "distinctive" they are for that text.
+
+    For texts with shared vocabulary, the scoring considers relative
+    frequency differences.
+
+    Args:
+        freq1: Word frequencies for text 1
+        freq2: Word frequencies for text 2
+        unique_to_1: Words appearing only in text 1
+        unique_to_2: Words appearing only in text 2
+        top_n: Number of top distinctive words to return
+
+    Returns:
+        Tuple of (text1_distinctive, text2_distinctive) lists,
+        each containing (word, score) tuples sorted by score descending
+    """
+    # For unique words, score by frequency
+    text1_scores: list[tuple[str, float]] = []
+    for word in unique_to_1:
+        score = float(freq1[word])
+        text1_scores.append((word, score))
+
+    text2_scores: list[tuple[str, float]] = []
+    for word in unique_to_2:
+        score = float(freq2[word])
+        text2_scores.append((word, score))
+
+    # Sort by score descending
+    text1_scores.sort(key=lambda x: x[1], reverse=True)
+    text2_scores.sort(key=lambda x: x[1], reverse=True)
+
+    return text1_scores[:top_n], text2_scores[:top_n]
+
+
+def compute_vocabulary_overlap(
+    text1: str,
+    text2: str,
+    top_distinctive: int = 20,
+) -> VocabularyOverlapResult:
+    """Compute vocabulary overlap and similarity between two texts.
+
+    This function computes multiple similarity metrics based on vocabulary
+    comparison, useful for authorship verification, plagiarism detection,
+    and measuring stylistic consistency across texts.
+
+    Metrics computed:
+    - Jaccard similarity: intersection / union (set-based)
+    - Sørensen-Dice coefficient: 2 * intersection / (size1 + size2)
+    - Overlap coefficient: intersection / min(size1, size2)
+    - Cosine similarity: dot product of frequency vectors
+    - KL divergence: distributional difference (asymmetric)
 
     Related GitHub Issue:
         #21 - Vocabulary Overlap and Similarity Metrics
@@ -27,21 +287,102 @@ def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResul
     Args:
         text1: First text to compare
         text2: Second text to compare
+        top_distinctive: Number of most distinctive words to return per text
 
     Returns:
-        VocabularyOverlapResult with Jaccard, Dice, cosine similarities,
-        shared vocabulary statistics, and distinctive words for each text.
+        VocabularyOverlapResult with similarity scores, vocabulary statistics,
+        shared vocabulary, and distinctive words for each text.
 
     Example:
-        >>> result = compute_vocabulary_overlap(text1, text2)
+        >>> result = compute_vocabulary_overlap(
+        ...     "The quick brown fox jumps over the lazy dog",
+        ...     "The fast brown fox leaps over the sleepy dog"
+        ... )
         >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
-        Jaccard similarity: 0.456
+        Jaccard similarity: 0.583
         >>> print(f"Shared words: {result.shared_vocab_size}")
-        Shared words: 234
+        Shared words: 7
+        >>> print(f"Text1 distinctive: {result.text1_distinctive_words}")
+        [('quick', 1.0), ('jumps', 1.0), ('lazy', 1.0)]
+
+    References:
+        Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+            New Phytologist, 11(2), 37-50.
+        Sørensen, T. (1948). A method of establishing groups of equal amplitude
+            in plant sociology based on similarity of species.
+        Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+            Retrieval. McGraw-Hill.
+        Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+            Annals of Mathematical Statistics, 22(1), 79-86.
+        Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+            MIT Press.
     """
-    # TODO: Implement vocabulary overlap analysis
-    # GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21
-    raise NotImplementedError(
-        "Vocabulary overlap not yet implemented. "
-        "See GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21"
+    # Tokenize texts
+    tokens1 = _tokenize(text1)
+    tokens2 = _tokenize(text2)
+
+    # Build frequency counters and vocabulary sets
+    freq1: Counter[str] = Counter(tokens1)
+    freq2: Counter[str] = Counter(tokens2)
+
+    vocab1 = set(freq1.keys())
+    vocab2 = set(freq2.keys())
+
+    # Compute set operations
+    shared = vocab1 & vocab2
+    union = vocab1 | vocab2
+    unique_to_1 = vocab1 - vocab2
+    unique_to_2 = vocab2 - vocab1
+
+    # Compute similarity metrics
+    jaccard = _compute_jaccard(vocab1, vocab2)
+    dice = _compute_dice(vocab1, vocab2)
+    overlap = _compute_overlap_coefficient(vocab1, vocab2)
+    cosine = _compute_cosine_similarity(freq1, freq2, union)
+    kl_div = _compute_kl_divergence(freq1, freq2, union)
+
+    # Compute coverage ratios
+    text1_coverage = len(shared) / len(vocab1) if vocab1 else 0.0
+    text2_coverage = len(shared) / len(vocab2) if vocab2 else 0.0
+
+    # Get distinctive words
+    text1_distinctive, text2_distinctive = _compute_tfidf_distinctive_words(
+        freq1, freq2, unique_to_1, unique_to_2, top_distinctive
+    )
+
+    # Build shared words list (sorted by combined frequency)
+    shared_with_freq = [(word, freq1[word] + freq2[word]) for word in shared]
+    shared_with_freq.sort(key=lambda x: x[1], reverse=True)
+    shared_words = [word for word, _ in shared_with_freq]
+
+    return VocabularyOverlapResult(
+        # Similarity scores
+        jaccard_similarity=jaccard,
+        dice_coefficient=dice,
+        overlap_coefficient=overlap,
+        cosine_similarity=cosine,
+        kl_divergence=kl_div,
+        # Vocabulary sizes
+        text1_vocab_size=len(vocab1),
+        text2_vocab_size=len(vocab2),
+        shared_vocab_size=len(shared),
+        union_vocab_size=len(union),
+        text1_unique_count=len(unique_to_1),
+        text2_unique_count=len(unique_to_2),
+        # Shared and distinctive vocabulary
+        shared_words=shared_words,
+        text1_distinctive_words=text1_distinctive,
+        text2_distinctive_words=text2_distinctive,
+        # Coverage ratios
+        text1_coverage=text1_coverage,
+        text2_coverage=text2_coverage,
+        # Metadata
+        metadata={
+            "text1_token_count": len(tokens1),
+            "text2_token_count": len(tokens2),
+            "text1_frequencies": dict(freq1),
+            "text2_frequencies": dict(freq2),
+            "unique_to_text1": sorted(unique_to_1),
+            "unique_to_text2": sorted(unique_to_2),
+        },
     )
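
The set-based helpers above are small enough to sanity-check by hand. The following is a minimal stand-alone sketch (it does not import pystylometry) that mirrors `_tokenize`, `_compute_jaccard`, `_compute_dice`, and `_compute_overlap_coefficient` from the diff on a toy pair of sentences; the expected values in the comments follow directly from the set sizes.

```python
import re

def vocab(text: str) -> set[str]:
    # Same tokenization rule as _tokenize above: ASCII word runs, lowercased.
    return set(re.findall(r"\b[a-zA-Z]+\b", text.lower()))

a = vocab("The cat sat on the mat")   # {the, cat, sat, on, mat} -> 5 words
b = vocab("The cat sat on the rug")   # {the, cat, sat, on, rug} -> 5 words

shared, union = a & b, a | b
print(len(shared) / len(union))             # Jaccard:  4/6 ≈ 0.667
print(2 * len(shared) / (len(a) + len(b)))  # Dice:     8/10 = 0.8
print(len(shared) / min(len(a), len(b)))    # Overlap:  4/5  = 0.8
```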
pystylometry/syntactic/README.md
@@ -0,0 +1,20 @@
+# syntactic
+
+![4 public functions](https://img.shields.io/badge/functions-4-blue)
+![Requires spaCy](https://img.shields.io/badge/requires-spaCy-orange)
+
+Sentence structure, part-of-speech, and parse tree analysis.
+
+## Catalogue
+
+| File | Function | What It Measures |
+|------|----------|-----------------|
+| `pos_ratios.py` | `compute_pos_ratios` | Noun/verb/adjective/adverb ratios |
+| `sentence_stats.py` | `compute_sentence_stats` | Sentence length, word length distributions |
+| `sentence_types.py` | `compute_sentence_types` | Declarative, interrogative, imperative, exclamatory classification |
+| `advanced_syntactic.py` | `compute_advanced_syntactic` | Parse tree depth, clausal density, passive voice, T-units, dependency distance, subordination/coordination ratios |
+
+## See Also
+
+- [`stylistic/`](../stylistic/) for higher-level style features built on syntactic foundations
+- [`ngrams/`](../ngrams/) for POS n-gram sequences via `compute_extended_ngrams(text, pos=True)`
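
For orientation, here is a hedged usage sketch for the `compute_advanced_syntactic` entry in the catalogue above. The signature (`text`, `model="en_core_web_sm"`, and the new `chunk_size` parameter) and the result fields printed below appear in the `advanced_syntactic.py` hunks that follow; the import path is an assumption based on the package layout in the file list, and the spaCy model must be installed separately.

```python
# Assumed import path; the function and its defaults are shown in the diff below.
from pystylometry.syntactic.advanced_syntactic import compute_advanced_syntactic

text = "The report was written by the committee, although nobody read it."

# Requires spaCy plus the model: python -m spacy download en_core_web_sm
result = compute_advanced_syntactic(text, model="en_core_web_sm")

print(result.mean_parse_tree_depth)   # mean dependency-tree depth
print(result.dependency_distance)     # mean dependency distance
print(result.left_branching_ratio)    # left- vs right-branching tendency
```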
pystylometry/syntactic/advanced_syntactic.py
@@ -28,13 +28,21 @@ References:
         of linguistic complexity. In Image, language, brain (pp. 95-126).
 """
 
-from .._types import AdvancedSyntacticResult
+from typing import Any
+
+from .._types import AdvancedSyntacticResult, Distribution, make_distribution
 from .._utils import check_optional_dependency
 
+# Type aliases for spaCy objects (loaded dynamically)
+_SpaCyToken = Any
+_SpaCyDoc = Any
+_SpaCySpan = Any
+
 
 def compute_advanced_syntactic(
     text: str,
     model: str = "en_core_web_sm",
+    chunk_size: int = 1000,
 ) -> AdvancedSyntacticResult:
     """
     Compute advanced syntactic complexity metrics using dependency parsing.
@@ -147,7 +155,6 @@ def compute_advanced_syntactic(
 
     try:
         import spacy  # type: ignore
-        from spacy.tokens import Doc, Span, Token  # type: ignore
     except ImportError as e:
         raise ImportError(
             "spaCy is required for advanced syntactic analysis. "
@@ -159,8 +166,7 @@ def compute_advanced_syntactic(
         nlp = spacy.load(model)
     except OSError as e:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
         ) from e
 
     # Parse text
@@ -169,6 +175,14 @@ def compute_advanced_syntactic(
 
     # Handle empty text
     if len(sentences) == 0 or len(doc) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return AdvancedSyntacticResult(
             mean_parse_tree_depth=float("nan"),
             max_parse_tree_depth=0,
@@ -183,6 +197,20 @@ def compute_advanced_syntactic(
             dependency_distance=float("nan"),
             left_branching_ratio=float("nan"),
             right_branching_ratio=float("nan"),
+            mean_parse_tree_depth_dist=empty_dist,
+            max_parse_tree_depth_dist=empty_dist,
+            mean_t_unit_length_dist=empty_dist,
+            clausal_density_dist=empty_dist,
+            dependent_clause_ratio_dist=empty_dist,
+            passive_voice_ratio_dist=empty_dist,
+            subordination_index_dist=empty_dist,
+            coordination_index_dist=empty_dist,
+            sentence_complexity_score_dist=empty_dist,
+            dependency_distance_dist=empty_dist,
+            left_branching_ratio_dist=empty_dist,
+            right_branching_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "sentence_count": 0,
                 "word_count": 0,
@@ -229,9 +257,7 @@ def compute_advanced_syntactic(
     coordinate_clause_count = 0
 
     for sent in sentences:
-        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(
-            sent
-        )
+        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(sent)
         total_clauses += sent_total
         dependent_clause_count += sent_dependent
         subordinate_clause_count += sent_subordinate
@@ -279,14 +305,22 @@ def compute_advanced_syntactic(
     # Normalize individual metrics to 0-1 range
     normalized_parse_depth = min(mean_parse_tree_depth / 10, 1.0)
     normalized_clausal_density = (
-        min(clausal_density / 3, 1.0) if not isinstance(clausal_density, float) or not (clausal_density != clausal_density) else 0.0
+        min(clausal_density / 3, 1.0)
+        if not isinstance(clausal_density, float) or not (clausal_density != clausal_density)
+        else 0.0
     )
     normalized_t_unit_length = (
-        min(mean_t_unit_length / 25, 1.0) if not isinstance(mean_t_unit_length, float) or not (mean_t_unit_length != mean_t_unit_length) else 0.0
+        min(mean_t_unit_length / 25, 1.0)
+        if not isinstance(mean_t_unit_length, float)
+        or not (mean_t_unit_length != mean_t_unit_length)
+        else 0.0
     )
     normalized_dependency_distance = min(mean_dependency_distance / 5, 1.0)
     normalized_subordination = (
-        subordination_index if not isinstance(subordination_index, float) or not (subordination_index != subordination_index) else 0.0
+        subordination_index
+        if not isinstance(subordination_index, float)
+        or not (subordination_index != subordination_index)
+        else 0.0
     )
 
     # Weighted combination
@@ -298,6 +332,20 @@ def compute_advanced_syntactic(
         + 0.1 * normalized_dependency_distance
     )
 
+    # Create single-value distributions (analysis is done on full text)
+    mean_parse_tree_depth_dist = make_distribution([mean_parse_tree_depth])
+    max_parse_tree_depth_dist = make_distribution([float(max_parse_tree_depth)])
+    mean_t_unit_length_dist = make_distribution([mean_t_unit_length])
+    clausal_density_dist = make_distribution([clausal_density])
+    dependent_clause_ratio_dist = make_distribution([dependent_clause_ratio])
+    passive_voice_ratio_dist = make_distribution([passive_voice_ratio])
+    subordination_index_dist = make_distribution([subordination_index])
+    coordination_index_dist = make_distribution([coordination_index])
+    sentence_complexity_score_dist = make_distribution([sentence_complexity_score])
+    dependency_distance_dist = make_distribution([mean_dependency_distance])
+    left_branching_ratio_dist = make_distribution([left_branching_ratio])
+    right_branching_ratio_dist = make_distribution([right_branching_ratio])
+
     # Collect metadata
     metadata = {
         "sentence_count": len(sentences),
@@ -331,11 +379,25 @@ def compute_advanced_syntactic(
         dependency_distance=mean_dependency_distance,
         left_branching_ratio=left_branching_ratio,
         right_branching_ratio=right_branching_ratio,
+        mean_parse_tree_depth_dist=mean_parse_tree_depth_dist,
+        max_parse_tree_depth_dist=max_parse_tree_depth_dist,
+        mean_t_unit_length_dist=mean_t_unit_length_dist,
+        clausal_density_dist=clausal_density_dist,
+        dependent_clause_ratio_dist=dependent_clause_ratio_dist,
+        passive_voice_ratio_dist=passive_voice_ratio_dist,
+        subordination_index_dist=subordination_index_dist,
+        coordination_index_dist=coordination_index_dist,
+        sentence_complexity_score_dist=sentence_complexity_score_dist,
+        dependency_distance_dist=dependency_distance_dist,
+        left_branching_ratio_dist=left_branching_ratio_dist,
+        right_branching_ratio_dist=right_branching_ratio_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def _calculate_max_tree_depth(token) -> int:
+def _calculate_max_tree_depth(token: _SpaCyToken) -> int:
     """
     Calculate maximum depth of dependency tree starting from token.
 
@@ -352,7 +414,7 @@ def _calculate_max_tree_depth(token) -> int:
     return max(child_depths) + 1
 
 
-def _identify_t_units(doc) -> list:
+def _identify_t_units(doc: _SpaCyDoc) -> list[_SpaCySpan]:
     """
     Identify T-units (minimal terminable units) in document.
 
@@ -371,7 +433,7 @@ def _identify_t_units(doc) -> list:
     return list(doc.sents)
 
 
-def _count_clauses(sent) -> tuple[int, int, int, int]:
+def _count_clauses(sent: _SpaCySpan) -> tuple[int, int, int, int]:
     """
     Count different types of clauses in sentence.
 
@@ -406,7 +468,7 @@ def _count_clauses(sent) -> tuple[int, int, int, int]:
     return total, dependent, subordinate, coordinate
 
 
-def _is_passive_voice(sent) -> bool:
+def _is_passive_voice(sent: _SpaCySpan) -> bool:
     """
     Detect if sentence contains passive voice construction.
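
The recurring `*_dist` fields added throughout `advanced_syntactic.py` wrap each scalar metric in a `Distribution`; from the empty-text branch above, a `Distribution` carries `values`, `mean`, `median`, `std`, `range`, and `iqr`, and `chunk_count` is 1 because the analysis is a single pass over the full text. A hedged sketch of reading those fields, continuing the import and `text` from the earlier example:

```python
result = compute_advanced_syntactic(text, chunk_size=1000)

# Each *_dist field is built from a single value via make_distribution([...]),
# so values has length 1 and mean equals that value.
dist = result.dependency_distance_dist
print(dist.values, dist.mean, dist.median, dist.std)

print(result.chunk_size)    # echoes the chunk_size argument (default 1000)
print(result.chunk_count)   # 1 for this single-pass analysis, 0 for empty text
```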