pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/lexical/hapax.py
@@ -1,15 +1,87 @@
-"""Hapax legomena and related vocabulary richness metrics."""
+"""Hapax legomena and related vocabulary richness metrics.
 
+This module implements hapax metrics with native chunked analysis for
+stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
 from collections import Counter
 
-from .._types import HapaxResult
-from .._utils import tokenize
+from .._types import (
+    Distribution,
+    HapaxLexiconResult,
+    HapaxResult,
+    LexiconCategories,
+    chunk_text,
+    make_distribution,
+)
+from .._utils import check_optional_dependency, tokenize
+
 
+def _compute_hapax_single(text: str) -> tuple[int, float, int, float, float, float, dict]:
+    """Compute hapax metrics for a single chunk of text.
 
-def compute_hapax_ratios(text: str) -> HapaxResult:
+    Returns:
+        Tuple of (hapax_count, hapax_ratio, dis_hapax_count, dis_hapax_ratio,
+        sichel_s, honore_r, metadata_dict).
+        Returns nans for ratios on empty input.
+    """
+    tokens = tokenize(text.lower())
+    N = len(tokens)  # noqa: N806
+
+    if N == 0:
+        return (
+            0,
+            float("nan"),
+            0,
+            float("nan"),
+            float("nan"),
+            float("nan"),
+            {"token_count": 0, "vocabulary_size": 0},
+        )
+
+    # Count frequency of each token
+    freq_counter = Counter(tokens)
+    V = len(freq_counter)  # noqa: N806
+
+    # Count hapax legomena (V₁) and dislegomena (V₂)
+    V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
+    V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
+
+    # Sichel's S: ratio of dislegomena to vocabulary size
+    sichel_s = V2 / V if V > 0 else 0.0
+
+    # Honoré's R: 100 × log(N) / (1 - V₁/V)
+    if V1 == V:
+        honore_r = float("inf")
+    else:
+        honore_r = 100 * math.log(N) / (1 - V1 / V)
+
+    hapax_ratio = V1 / N if N > 0 else 0.0
+    dis_hapax_ratio = V2 / N if N > 0 else 0.0
+
+    return (
+        V1,
+        hapax_ratio,
+        V2,
+        dis_hapax_ratio,
+        sichel_s,
+        honore_r,
+        {"token_count": N, "vocabulary_size": V},
+    )
+
+
+def compute_hapax_ratios(text: str, chunk_size: int = 1000) -> HapaxResult:
     """
     Compute hapax legomena, hapax dislegomena, and related richness metrics.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Hapax legomena = words appearing exactly once
     Hapax dislegomena = words appearing exactly twice
 
@@ -17,6 +89,10 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
     - Sichel's S: V₂ / V (ratio of dislegomena to total vocabulary)
     - Honoré's R: 100 × log(N) / (1 - V₁/V)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Sichel, H. S. (1975). On a distribution law for word frequencies.
         Journal of the American Statistical Association, 70(351a), 542-547.
@@ -26,50 +102,251 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        HapaxResult with counts, ratios, Sichel's S, Honoré's R, and metadata
+        HapaxResult with counts, ratios, distributions, and metadata
 
     Example:
-        >>> result = compute_hapax_ratios("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"Hapax ratio: {result.hapax_ratio:.3f}")
-        >>> print(f"Sichel's S: {result.sichel_s:.3f}")
+        >>> result = compute_hapax_ratios("Long text here...", chunk_size=1000)
+        >>> result.hapax_ratio  # Mean across chunks
+        0.45
+        >>> result.hapax_ratio_dist.std  # Variance reveals fingerprint
+        0.08
     """
-    tokens = tokenize(text.lower())
-    N = len(tokens)  # noqa: N806
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
 
-    if N == 0:
+    # Compute metrics per chunk
+    hapax_ratio_values = []
+    dis_hapax_ratio_values = []
+    sichel_s_values = []
+    honore_r_values = []
+    honore_r_inf_count = 0  # Track chunks where all words are unique (V₁ = V)
+    total_hapax_count = 0
+    total_dis_hapax_count = 0
+    total_tokens = 0
+    total_vocab = 0
+    valid_chunk_count = 0
+
+    for chunk in chunks:
+        h_cnt, h_rat, dh_cnt, dh_rat, sichel, honore, meta = _compute_hapax_single(chunk)
+        total_hapax_count += h_cnt
+        total_dis_hapax_count += dh_cnt
+        total_tokens += meta.get("token_count", 0)
+        total_vocab += meta.get("vocabulary_size", 0)
+
+        if not math.isnan(h_rat):
+            hapax_ratio_values.append(h_rat)
+            valid_chunk_count += 1
+        if not math.isnan(dh_rat):
+            dis_hapax_ratio_values.append(dh_rat)
+        if not math.isnan(sichel):
+            sichel_s_values.append(sichel)
+        if math.isinf(honore):
+            # Track infinite values (when V₁ = V, maximal vocabulary richness)
+            honore_r_inf_count += 1
+        elif not math.isnan(honore):
+            honore_r_values.append(honore)
+
+    # Handle empty or all-invalid chunks
+    if not hapax_ratio_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return HapaxResult(
             hapax_count=0,
-            hapax_ratio=0.0,
+            hapax_ratio=float("nan"),
             dis_hapax_count=0,
-            dis_hapax_ratio=0.0,
-            sichel_s=0.0,
-            honore_r=0.0,
-            metadata={"token_count": 0, "vocabulary_size": 0},
+            dis_hapax_ratio=float("nan"),
+            sichel_s=float("nan"),
+            honore_r=float("nan"),
+            hapax_ratio_dist=empty_dist,
+            dis_hapax_ratio_dist=empty_dist,
+            sichel_s_dist=empty_dist,
+            honore_r_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={"total_token_count": 0, "total_vocabulary_size": 0},
         )
 
-    # Count frequency of each token
+    # Build distributions
+    hapax_ratio_dist = make_distribution(hapax_ratio_values)
+    dis_hapax_ratio_dist = make_distribution(dis_hapax_ratio_values)
+    sichel_s_dist = (
+        make_distribution(sichel_s_values)
+        if sichel_s_values
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+
+    # Handle honore_r specially: if all valid chunks had V₁ = V (all unique words),
+    # return infinity to indicate maximal vocabulary richness
+    if honore_r_values:
+        honore_r_dist = make_distribution(honore_r_values)
+        honore_r_final = honore_r_dist.mean
+    elif honore_r_inf_count > 0 and honore_r_inf_count == valid_chunk_count:
+        # All valid chunks had infinite honore_r (all words unique)
+        honore_r_dist = Distribution(
+            values=[], mean=float("inf"), median=float("inf"), std=0.0, range=0.0, iqr=0.0
+        )
+        honore_r_final = float("inf")
+    else:
+        honore_r_dist = Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+        honore_r_final = float("nan")
+
+    return HapaxResult(
+        hapax_count=total_hapax_count,
+        hapax_ratio=hapax_ratio_dist.mean,
+        dis_hapax_count=total_dis_hapax_count,
+        dis_hapax_ratio=dis_hapax_ratio_dist.mean,
+        sichel_s=sichel_s_dist.mean,
+        honore_r=honore_r_final,
+        hapax_ratio_dist=hapax_ratio_dist,
+        dis_hapax_ratio_dist=dis_hapax_ratio_dist,
+        sichel_s_dist=sichel_s_dist,
+        honore_r_dist=honore_r_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "total_token_count": total_tokens,
+            "total_vocabulary_size": total_vocab,
+        },
+    )
+
+
+def compute_hapax_with_lexicon_analysis(text: str) -> HapaxLexiconResult:
+    """
+    Compute hapax legomena with lexicon-based categorization.
+
+    Extends standard hapax analysis by categorizing hapax legomena based on
+    presence in WordNet and British National Corpus (BNC). This distinguishes
+    between:
+
+    1. **Neologisms**: Words not in WordNet AND not in BNC
+       - True novel words or proper nouns
+       - High neologism ratio indicates vocabulary innovation
+
+    2. **Rare Words**: Words in BNC but not WordNet, or vice versa
+       - Technical jargon, specialized terminology
+       - Words at the edges of common vocabulary
+
+    3. **Common Words**: Words in both WordNet AND BNC
+       - Standard vocabulary that happens to appear once
+       - Low incidental usage of common words
+
+    This categorization is valuable for stylometric analysis:
+    - Authors with high neologism ratios are more innovative/creative
+    - Technical writing typically has higher rare word ratios
+    - Comparison of neologism vs common hapax distinguishes vocabulary
+      innovation from incidental word usage
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        HapaxLexiconResult with standard hapax metrics and lexicon categorization
+
+    Raises:
+        ImportError: If bnc-lookup or wordnet-lookup packages are not installed
+
+    Example:
+        >>> text = "The xyzbot platform facilitates interdepartmental synergy."
+        >>> result = compute_hapax_with_lexicon_analysis(text)
+        >>> result.lexicon_analysis.neologisms
+        ['xyzbot', 'platform']
+        >>> result.lexicon_analysis.rare_words
+        ['facilitates', 'interdepartmental']
+        >>> result.lexicon_analysis.common_words
+        ['synergy']
+        >>> print(f"Neologism ratio: {result.lexicon_analysis.neologism_ratio:.2%}")
+        Neologism ratio: 40.00%
+
+    References:
+        British National Corpus: http://www.natcorp.ox.ac.uk/
+        WordNet: https://wordnet.princeton.edu/
+    """
+    # Check dependencies
+    check_optional_dependency("bnc_lookup", "lexical")
+    check_optional_dependency("wordnet_lookup", "lexical")
+
+    from bnc_lookup import exists as is_bnc_term  # type: ignore[import-untyped]
+    from wordnet_lookup import is_wordnet_term  # type: ignore[import-untyped]
+
+    # First compute standard hapax metrics
+    hapax_result = compute_hapax_ratios(text)
+
+    # If no hapax legomena, return empty categorization
+    if hapax_result.hapax_count == 0:
+        return HapaxLexiconResult(
+            hapax_result=hapax_result,
+            lexicon_analysis=LexiconCategories(
+                neologisms=[],
+                rare_words=[],
+                common_words=[],
+                neologism_ratio=0.0,
+                rare_word_ratio=0.0,
+                metadata={"total_hapax": 0},
+            ),
+            metadata={"note": "No hapax legomena found"},
+        )
+
+    # Get tokens and identify hapax words
+    tokens = tokenize(text.lower())
     freq_counter = Counter(tokens)
-    V = len(freq_counter)  # noqa: N806
+    hapax_words = [word for word, count in freq_counter.items() if count == 1]
 
-    # Count hapax legomena (V₁) and dislegomena (V₂)
-    V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
-    V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
+    # Categorize each hapax word by lexicon presence
+    neologisms = []
+    rare_words = []
+    common_words = []
 
-    # TODO: Implement Sichel's S and Honoré's R
-    sichel_s = 0.0  # Placeholder
-    honore_r = 0.0  # Placeholder
+    for word in hapax_words:
+        in_bnc = is_bnc_term(word)
+        in_wordnet = is_wordnet_term(word)
 
-    return HapaxResult(
-        hapax_count=V1,
-        hapax_ratio=V1 / N if N > 0 else 0.0,
-        dis_hapax_count=V2,
-        dis_hapax_ratio=V2 / N if N > 0 else 0.0,
-        sichel_s=sichel_s,
-        honore_r=honore_r,
+        if not in_bnc and not in_wordnet:
+            # Not in either lexicon → true neologism
+            neologisms.append(word)
+        elif in_bnc and in_wordnet:
+            # In both lexicons → common word
+            common_words.append(word)
+        else:
+            # In one but not the other → rare word
+            rare_words.append(word)
+
+    # Calculate ratios
+    total_hapax = len(hapax_words)
+    neologism_ratio = len(neologisms) / total_hapax if total_hapax > 0 else 0.0
+    rare_word_ratio = len(rare_words) / total_hapax if total_hapax > 0 else 0.0
+    common_word_ratio = len(common_words) / total_hapax if total_hapax > 0 else 0.0
+
+    return HapaxLexiconResult(
+        hapax_result=hapax_result,
+        lexicon_analysis=LexiconCategories(
+            neologisms=sorted(neologisms),
+            rare_words=sorted(rare_words),
+            common_words=sorted(common_words),
+            neologism_ratio=neologism_ratio,
+            rare_word_ratio=rare_word_ratio,
+            metadata={
+                "total_hapax": total_hapax,
+                "neologism_count": len(neologisms),
+                "rare_word_count": len(rare_words),
+                "common_word_count": len(common_words),
+                "common_word_ratio": common_word_ratio,
+            },
+        ),
         metadata={
-            "token_count": N,
-            "vocabulary_size": V,
+            "lexicons_used": ["bnc", "wordnet"],
+            "note": "Lexicon categorization based on BNC and WordNet presence",
         },
     )
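
An illustrative aside, not part of the wheel contents: the two richness formulas added above, Sichel's S = V2 / V and Honoré's R = 100 * log(N) / (1 - V1/V), can be sanity-checked with a few lines of standalone Python. The sketch below mirrors the arithmetic of _compute_hapax_single, but it uses naive lowercase/whitespace splitting rather than the package's tokenize helper, and the hapax_sketch function name and sample sentence are invented for the example, so exact values may differ from pystylometry's own output.

import math
from collections import Counter

def hapax_sketch(text: str) -> dict:
    # Naive tokenization; pystylometry's tokenize() may split differently.
    # Assumes non-empty text (the package returns NaN ratios and zero counts for empty input).
    tokens = text.lower().split()
    n = len(tokens)
    freq = Counter(tokens)
    v = len(freq)                                 # vocabulary size V
    v1 = sum(1 for c in freq.values() if c == 1)  # hapax legomena V1
    v2 = sum(1 for c in freq.values() if c == 2)  # hapax dislegomena V2
    sichel_s = v2 / v                             # Sichel's S = V2 / V
    # Honoré's R = 100 * log(N) / (1 - V1/V); infinite when every word is unique
    honore_r = float("inf") if v1 == v else 100 * math.log(n) / (1 - v1 / v)
    return {"hapax_ratio": v1 / n, "dis_hapax_ratio": v2 / n,
            "sichel_s": sichel_s, "honore_r": honore_r}

print(hapax_sketch("the cat sat on the mat and the dog sat too"))

On real inputs the package goes further than this sketch: compute_hapax_ratios splits the text into chunk_size-word chunks and reports each metric's mean alongside a Distribution (values, mean, median, std, range, iqr) across chunks via the *_dist fields.
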
pystylometry/lexical/mtld.py
@@ -1,23 +1,130 @@
-"""MTLD (Measure of Textual Lexical Diversity) implementation."""
+"""MTLD (Measure of Textual Lexical Diversity) implementation.
 
-from .._types import MTLDResult
+This module implements MTLD with native chunked analysis for stylometric
+fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._types import Distribution, MTLDResult, chunk_text, make_distribution
 from .._utils import tokenize
 
 
+def _calculate_mtld_direction(tokens: list[str], threshold: float, forward: bool) -> float:
+    """
+    Calculate MTLD in one direction (forward or backward).
+
+    Args:
+        tokens: List of tokens to analyze
+        threshold: TTR threshold to maintain (must be in range (0, 1))
+        forward: If True, process forward; if False, process backward
+
+    Returns:
+        MTLD score for this direction
+    """
+    if len(tokens) == 0:
+        return 0.0
+
+    # Process tokens in the specified direction
+    token_list = tokens if forward else tokens[::-1]
+
+    factors = 0.0
+    current_count = 0
+    current_types = set()
+
+    for token in token_list:
+        current_count += 1
+        current_types.add(token)
+
+        # Calculate current TTR
+        ttr = len(current_types) / current_count
+
+        # If TTR drops below threshold, we've completed a factor
+        if ttr < threshold:
+            factors += 1.0
+            current_count = 0
+            current_types = set()
+
+    # Handle remaining partial factor
+    # Add proportion of a complete factor based on how close we are to threshold
+    if current_count > 0:
+        ttr = len(current_types) / current_count
+        # If we're still above threshold, add partial factor credit
+        # Formula: (1 - current_ttr) / (1 - threshold)
+        # This represents how far we've progressed toward completing a factor
+        # In theory, ttr should always be >= threshold here because drops below
+        # threshold are handled in the loop above (which resets current_count).
+        # Adding defensive check to prevent mathematical errors.
+        if ttr >= threshold:
+            factors += (1.0 - ttr) / (1.0 - threshold)
+
+    # MTLD is the mean length of factors
+    # Total tokens / number of factors
+    if factors > 0:
+        return len(tokens) / factors
+    else:
+        # If no factors were completed, return the text length
+        # This happens when TTR stays above threshold for the entire text
+        return float(len(tokens))
+
+
+def _compute_mtld_single(text: str, threshold: float) -> tuple[float, float, float, dict]:
+    """Compute MTLD for a single chunk of text.
+
+    Returns:
+        Tuple of (mtld_forward, mtld_backward, mtld_average, metadata_dict).
+        Returns (nan, nan, nan, metadata) for empty input.
+    """
+    tokens = tokenize(text.lower())
+
+    if len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            float("nan"),
+            {"token_count": 0},
+        )
+
+    mtld_forward = _calculate_mtld_direction(tokens, threshold, forward=True)
+    mtld_backward = _calculate_mtld_direction(tokens, threshold, forward=False)
+    mtld_average = (mtld_forward + mtld_backward) / 2
+
+    return (
+        mtld_forward,
+        mtld_backward,
+        mtld_average,
+        {"token_count": len(tokens)},
+    )
+
+
 def compute_mtld(
     text: str,
     threshold: float = 0.72,
+    chunk_size: int = 1000,
 ) -> MTLDResult:
     """
     Compute MTLD (Measure of Textual Lexical Diversity).
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     MTLD measures the mean length of sequential word strings that maintain
     a minimum threshold TTR. It's more robust than simple TTR for texts of
     varying lengths.
 
     Formula:
-        MTLD = mean(forward_factors, backward_factors)
-        where factors are word string lengths that maintain TTR >= threshold
+        MTLD = total_tokens / factor_count
+        where factor_count includes:
+        - Completed factors (segments where TTR dropped below threshold)
+        - Partial factor for any remaining incomplete segment (weighted by proximity to threshold)
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
 
     References:
         McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D:
@@ -26,36 +133,87 @@ def compute_mtld(
 
     Args:
         text: Input text to analyze
-        threshold: TTR threshold to maintain (default: 0.72)
+        threshold: TTR threshold to maintain (default: 0.72, must be in range (0, 1))
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        MTLDResult with forward, backward, and average MTLD scores
+        MTLDResult with forward, backward, average MTLD scores and distributions
+
+    Raises:
+        ValueError: If threshold is not in range (0, 1)
 
     Example:
-        >>> result = compute_mtld("The quick brown fox jumps over the lazy dog...")
-        >>> print(f"MTLD: {result.mtld_average:.2f}")
+        >>> result = compute_mtld("Long text here...", chunk_size=1000)
+        >>> result.mtld_average  # Mean across chunks
+        72.5
+        >>> result.mtld_average_dist.std  # Variance reveals fingerprint
+        8.3
     """
-    tokens = tokenize(text)
+    # Validate threshold parameter
+    if not (0 < threshold < 1):
+        raise ValueError(
+            f"Threshold must be in range (0, 1), got {threshold}. "
+            "Common values: 0.72 (default), 0.5-0.8"
+        )
 
-    if len(tokens) == 0:
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    forward_values = []
+    backward_values = []
+    average_values = []
+    total_tokens = 0
+
+    for chunk in chunks:
+        fwd, bwd, avg, meta = _compute_mtld_single(chunk, threshold)
+        if not math.isnan(fwd):
+            forward_values.append(fwd)
+            backward_values.append(bwd)
+            average_values.append(avg)
+        total_tokens += meta.get("token_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not forward_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return MTLDResult(
-            mtld_forward=0.0,
-            mtld_backward=0.0,
-            mtld_average=0.0,
-            metadata={"token_count": 0, "threshold": threshold},
+            mtld_forward=float("nan"),
+            mtld_backward=float("nan"),
+            mtld_average=float("nan"),
+            mtld_forward_dist=empty_dist,
+            mtld_backward_dist=empty_dist,
+            mtld_average_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                "total_token_count": 0,
+                "threshold": threshold,
+            },
        )
 
-    # TODO: Implement forward and backward MTLD calculation
-    mtld_forward = 0.0  # Placeholder
-    mtld_backward = 0.0  # Placeholder
-    mtld_average = (mtld_forward + mtld_backward) / 2
+    # Build distributions
+    forward_dist = make_distribution(forward_values)
+    backward_dist = make_distribution(backward_values)
+    average_dist = make_distribution(average_values)
 
     return MTLDResult(
-        mtld_forward=mtld_forward,
-        mtld_backward=mtld_backward,
-        mtld_average=mtld_average,
+        mtld_forward=forward_dist.mean,
+        mtld_backward=backward_dist.mean,
+        mtld_average=average_dist.mean,
+        mtld_forward_dist=forward_dist,
+        mtld_backward_dist=backward_dist,
+        mtld_average_dist=average_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "token_count": len(tokens),
+            "total_token_count": total_tokens,
             "threshold": threshold,
        },
    )
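
An illustrative aside, not part of the wheel contents: the factor-counting loop in _calculate_mtld_direction above is easier to follow on a toy input. The standalone sketch below re-implements the same idea: a factor completes whenever the running type-token ratio dips below the threshold, any trailing partial factor is credited as (1 - TTR) / (1 - threshold), and the score is total tokens divided by the factor count; the package then averages a forward and a backward pass per chunk. The helper name mtld_one_direction and the toy token list are invented for the example.

def mtld_one_direction(tokens: list[str], threshold: float = 0.72) -> float:
    # Walk the tokens, completing a "factor" each time the running TTR dips below threshold.
    factors = 0.0
    count = 0
    types: set[str] = set()
    for token in tokens:
        count += 1
        types.add(token)
        if len(types) / count < threshold:
            factors += 1.0              # factor complete
            count, types = 0, set()
    if count:                           # credit the trailing partial factor
        ttr = len(types) / count
        if ttr >= threshold:
            factors += (1.0 - ttr) / (1.0 - threshold)
    return len(tokens) / factors if factors else float(len(tokens))

toy = "a b c a b c a a a a b c d e f".split()
forward = mtld_one_direction(toy)
backward = mtld_one_direction(toy[::-1])
print(forward, backward, (forward + backward) / 2)  # the package averages forward and backward per chunk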