pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/lexical/advanced_diversity.py (new file)
@@ -0,0 +1,680 @@
+ """Advanced lexical diversity metrics.
+
+ This module provides sophisticated measures of lexical diversity that go beyond
+ simple Type-Token Ratio (TTR). These metrics are designed to control for text
+ length and provide more stable, comparable measures across texts of different sizes.
+
+ Related GitHub Issue:
+     #14 - Advanced Lexical Diversity Metrics
+     https://github.com/craigtrim/pystylometry/issues/14
+
+ Metrics implemented:
+     - voc-D: Mathematical model-based diversity estimate
+     - MATTR: Moving-Average Type-Token Ratio
+     - HD-D: Hypergeometric Distribution D
+     - MSTTR: Mean Segmental Type-Token Ratio
+
+ Each of these metrics addresses the "text length problem" that affects simple
+ TTR: longer texts tend to have lower TTR values because words repeat. These
+ advanced metrics normalize for length in different ways.
+
+ References:
+     McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
+         study of sophisticated approaches to lexical diversity assessment.
+         Behavior Research Methods, 42(2), 381-392.
+     Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004).
+         Lexical Diversity and Language Development. Palgrave Macmillan.
+     Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
+         The moving-average type-token ratio (MATTR). Journal of Quantitative
+         Linguistics, 17(2), 94-100.
+ """
+
+ import random
+ from typing import Optional
+
+ from .._types import (
+     HDDResult,
+     MATTRResult,
+     MSTTRResult,
+     VocdDResult,
+     make_distribution,
+ )
+
+
+ def _tokenize_for_diversity(text: str) -> list[str]:
+     """Tokenize text for lexical diversity analysis.
+
+     This helper function provides consistent tokenization across all
+     diversity metrics. It:
+         - Converts text to lowercase
+         - Splits on whitespace
+         - Strips punctuation from each token
+         - Returns list of clean tokens
+
+     Args:
+         text: Input text to tokenize
+
+     Returns:
+         List of lowercase tokens with punctuation removed
+     """
+     if not text or not text.strip():
+         return []
+
+     # Lowercase entire text
+     text_lower = text.lower()
+
+     # Split on whitespace
+     raw_tokens = text_lower.split()
+
+     # Comprehensive punctuation set for stripping
+     punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„""''‚'")
+
+     # Strip punctuation from each token
+     tokens = []
+     for token in raw_tokens:
+         # Strip leading and trailing punctuation
+         clean_token = token.strip("".join(punctuation_chars))
+         if clean_token: # Only add non-empty tokens
+             tokens.append(clean_token)
+
+     return tokens
+
+
+ def compute_vocd_d(
+     text: str,
+     sample_size: int = 35,
+     num_samples: int = 100,
+     min_tokens: int = 100,
+     random_seed: Optional[int] = None,
+     chunk_size: int = 1000,
+ ) -> VocdDResult:
+     """
+     Compute voc-D (vocabulary D) using curve-fitting approach.
+
+     voc-D estimates lexical diversity by fitting a mathematical model to the
+     relationship between tokens and types across multiple random samples.
+     The D parameter represents theoretical vocabulary size and is more stable
+     across text lengths than simple TTR.
+
+     Related GitHub Issue:
+         #14 - Advanced Lexical Diversity Metrics
+         https://github.com/craigtrim/pystylometry/issues/14
+
+     The algorithm:
+         1. Take multiple random samples of varying sizes from the text
+         2. For each sample size, calculate the mean TTR across samples
+         3. Fit a curve to the (sample_size, TTR) relationship
+         4. The D parameter is the best-fit curve parameter
+         5. Higher D values indicate greater lexical diversity
+
+     Advantages over TTR:
+         - Less sensitive to text length
+         - More comparable across texts of different sizes
+         - Theoretically grounded in vocabulary acquisition models
+         - Widely used in language development research
+
+     Disadvantages:
+         - Computationally expensive (requires many random samples)
+         - Requires sufficient text length (typically 100+ tokens)
+         - Can be unstable with very short texts
+         - Curve fitting may not converge in some cases
+
+     Args:
+         text: Input text to analyze. Should contain at least min_tokens words
+             for reliable D estimation. Texts with fewer tokens will return
+             NaN or raise an error.
+         sample_size: Size of random samples to draw. Default is 35 tokens,
+             following Malvern et al. (2004). Smaller sizes increase
+             variance; larger sizes may exceed text length.
+         num_samples: Number of random samples to draw for each sample size.
+             More samples increase accuracy but also computation time.
+             Default is 100 samples.
+         min_tokens: Minimum tokens required for D calculation. Texts shorter
+             than this will return NaN or error. Default is 100.
+
+     Returns:
+         VocdDResult containing:
+             - d_parameter: The D value (higher = more diverse)
+             - curve_fit_r_squared: Quality of curve fit (closer to 1.0 is better)
+             - sample_count: Number of samples actually used
+             - optimal_sample_size: Sample size used for calculation
+             - metadata: Sampling details, convergence info, curve parameters
+
+     Example:
+         >>> text = "Long sample text with sufficient tokens..."
+         >>> result = compute_vocd_d(text, sample_size=35, num_samples=100)
+         >>> print(f"D parameter: {result.d_parameter:.2f}")
+         D parameter: 67.34
+         >>> print(f"Curve fit R²: {result.curve_fit_r_squared:.3f}")
+         Curve fit R²: 0.987
+
+         >>> # Short text handling
+         >>> short_text = "Too short"
+         >>> result = compute_vocd_d(short_text)
+         >>> import math
+         >>> math.isnan(result.d_parameter)
+         True
+
+     Note:
+         - Requires random sampling, so results may vary slightly between runs
+         - Use a random seed in metadata for reproducibility
+         - Very short texts (< min_tokens) cannot be analyzed
+         - D values typically range from 10 (low diversity) to 100+ (high diversity)
+         - Curve fitting uses least-squares optimization
+         - Poor curve fits (low R²) indicate unreliable D estimates
+     """
+     # Set random seed for reproducibility
+     if random_seed is not None:
+         random.seed(random_seed)
+
+     # Step 1: Tokenize text
+     tokens = _tokenize_for_diversity(text)
+     total_tokens = len(tokens)
+     total_types = len(set(tokens))
+
+     # Step 2: Validate minimum length
+     if total_tokens < min_tokens:
+         raise ValueError(f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D")
+
+     # Step 3: Determine sample sizes to test
+     # Test from 10 tokens up to min(100, total_tokens - 10)
+     min_sample_size = 10
+     max_sample_size = min(100, total_tokens - 10)
+
+     # Create list of sample sizes (every 5 tokens)
+     sample_sizes = list(range(min_sample_size, max_sample_size + 1, 5))
+
+     # Ensure we have at least a few sample sizes
+     if len(sample_sizes) < 3:
+         # If text is very short, just use what we can
+         sample_sizes = list(range(min_sample_size, max_sample_size + 1))
+
+     # Step 4: For each sample size, take random samples and calculate mean TTR
+     sample_size_to_mean_ttr: dict[int, float] = {}
+
+     for size in sample_sizes:
+         ttrs = []
+         for _ in range(num_samples):
+             # Random sample of 'size' tokens
+             sample = random.sample(tokens, size)
+             sample_types = len(set(sample))
+             ttr = sample_types / size
+             ttrs.append(ttr)
+
+         # Mean TTR for this sample size
+         mean_ttr = sum(ttrs) / len(ttrs)
+         sample_size_to_mean_ttr[size] = mean_ttr
+
+     # Step 5: Fit curve using model: TTR = D / sqrt(sample_size)
+     # Using least-squares fitting for y = a/sqrt(x)
+     # Minimize: sum((y_i - a/sqrt(x_i))^2)
+     # Solution: a = sum(y_i/sqrt(x_i)) / sum(1/x_i)
+
+     numerator = 0.0
+     denominator = 0.0
+
+     for size, ttr in sample_size_to_mean_ttr.items():
+         numerator += ttr / (size**0.5)
+         denominator += 1.0 / size
+
+     d_param = numerator / denominator if denominator > 0 else 0.0
+
+     # Step 6: Calculate R² (goodness of fit)
+     # Predicted TTR = D / sqrt(sample_size)
+     y_actual = list(sample_size_to_mean_ttr.values())
+     y_predicted = [d_param / (size**0.5) for size in sample_sizes]
+
+     # R² calculation
+     mean_y = sum(y_actual) / len(y_actual)
+     ss_tot = sum((y - mean_y) ** 2 for y in y_actual)
+     ss_res = sum((y_actual[i] - y_predicted[i]) ** 2 for i in range(len(y_actual)))
+
+     r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0
+
+     # Step 7: Build metadata
+     metadata = {
+         "total_token_count": total_tokens,
+         "total_type_count": total_types,
+         "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
+         "sample_sizes_used": sample_sizes,
+         "mean_ttrs_per_sample_size": list(sample_size_to_mean_ttr.values()),
+         "num_samples_per_size": num_samples,
+         "random_seed": random_seed,
+     }
+
+     # Step 8: Create distributions (single-pass analysis)
+     d_parameter_dist = make_distribution([d_param])
+     curve_fit_r_squared_dist = make_distribution([r_squared])
+
+     # Step 9: Return result
+     return VocdDResult(
+         d_parameter=d_param,
+         curve_fit_r_squared=r_squared,
+         sample_count=len(sample_sizes),
+         optimal_sample_size=sample_size, # Input parameter
+         d_parameter_dist=d_parameter_dist,
+         curve_fit_r_squared_dist=curve_fit_r_squared_dist,
+         chunk_size=chunk_size,
+         chunk_count=1, # Single pass analysis
+         metadata=metadata,
+     )
+
+
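# Editorial usage sketch for compute_vocd_d, not part of the published file. It assumes
# the module is importable as pystylometry.lexical.advanced_diversity, matching the path
# in the file list above. Note that the implementation raises ValueError for texts with
# fewer than min_tokens tokens (the docstring's NaN example notwithstanding), so the
# short-text case is caught explicitly here.
from pystylometry.lexical.advanced_diversity import compute_vocd_d

text = "the quick brown fox jumps over the lazy dog " * 30  # 270 tokens, 8 types

result = compute_vocd_d(text, random_seed=42)  # seeding makes the random sampling repeatable
print(result.d_parameter, result.curve_fit_r_squared, result.sample_count)

try:
    compute_vocd_d("too short")  # fewer than min_tokens (default 100)
except ValueError as exc:
    print(exc)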
+ def compute_mattr(text: str, window_size: int = 50, chunk_size: int = 1000) -> MATTRResult:
+     """
+     Compute Moving-Average Type-Token Ratio (MATTR).
+
+     MATTR calculates TTR using a moving window of fixed size, then averages
+     across all windows. This provides a length-normalized measure that is
+     more stable than simple TTR and comparable across texts of different lengths.
+
+     Related GitHub Issue:
+         #14 - Advanced Lexical Diversity Metrics
+         https://github.com/craigtrim/pystylometry/issues/14
+
+     The algorithm:
+         1. Slide a window of fixed size across the text (token by token)
+         2. Calculate TTR for each window position
+         3. Average all window TTRs to get MATTR
+         4. Also compute statistics (std dev, min, max) across windows
+
+     Advantages over TTR:
+         - Controlled for text length (fixed window size)
+         - More comparable across texts
+         - Computationally simple and fast
+         - Intuitive interpretation (like TTR but normalized)
+
+     Disadvantages:
+         - Requires choosing window size (affects results)
+         - Not applicable to texts shorter than window size
+         - Adjacent windows overlap (not independent samples)
+
+     Args:
+         text: Input text to analyze. Must contain at least window_size tokens.
+             Texts shorter than window_size will return NaN.
+         window_size: Size of moving window in tokens. Default is 50, following
+             Covington & McFall (2010). Larger windows are more stable
+             but require longer texts. Smaller windows are noisier.
+
+     Returns:
+         MATTRResult containing:
+             - mattr_score: Average TTR across all windows
+             - window_size: Size of window used
+             - window_count: Number of windows analyzed
+             - ttr_std_dev: Standard deviation of TTR across windows
+             - min_ttr: Minimum TTR in any window
+             - max_ttr: Maximum TTR in any window
+             - metadata: Window-by-window TTR values
+
+     Example:
+         >>> result = compute_mattr("Sample text here...", window_size=50)
+         >>> print(f"MATTR score: {result.mattr_score:.3f}")
+         MATTR score: 0.847
+         >>> print(f"Windows analyzed: {result.window_count}")
+         Windows analyzed: 123
+         >>> print(f"TTR std dev: {result.ttr_std_dev:.3f}")
+         TTR std dev: 0.042
+
+         >>> # Short text handling
+         >>> short_text = "Too short for window"
+         >>> result = compute_mattr(short_text, window_size=50)
+         >>> import math
+         >>> math.isnan(result.mattr_score)
+         True
+
+     Note:
+         - Window size choice affects results (no universally optimal value)
+         - Standard window size is 50 tokens (Covington & McFall 2010)
+         - For very short texts, consider reducing window size or using different metric
+         - High TTR std dev suggests uneven lexical distribution
+         - MATTR values range from 0 (no diversity) to 1 (perfect diversity)
+     """
+     # Step 1: Tokenize text
+     tokens = _tokenize_for_diversity(text)
+     total_tokens = len(tokens)
+     total_types = len(set(tokens))
+
+     # Step 2: Validate minimum length
+     if total_tokens < window_size:
+         raise ValueError(
+             f"Text has {total_tokens} tokens, minimum {window_size} required for MATTR"
+         )
+
+     # Step 3: Slide window across text and calculate TTR for each position
+     window_ttrs = []
+
+     for i in range(total_tokens - window_size + 1):
+         # Extract window
+         window = tokens[i : i + window_size]
+
+         # Calculate TTR for this window
+         window_types = len(set(window))
+         ttr = window_types / window_size
+         window_ttrs.append(ttr)
+
+     # Step 4: Calculate MATTR (mean of all window TTRs)
+     mattr_score = sum(window_ttrs) / len(window_ttrs)
+
+     # Step 5: Calculate statistics
+     # Standard deviation
+     variance = sum((ttr - mattr_score) ** 2 for ttr in window_ttrs) / len(window_ttrs)
+     ttr_std_dev = variance**0.5
+
+     # Min and max
+     min_ttr = min(window_ttrs)
+     max_ttr = max(window_ttrs)
+
+     # Step 6: Build metadata
+     metadata = {
+         "total_token_count": total_tokens,
+         "total_type_count": total_types,
+         "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
+         "first_window_ttr": window_ttrs[0],
+         "last_window_ttr": window_ttrs[-1],
+     }
+
+     # Step 7: Create distributions (single-pass analysis)
+     mattr_score_dist = make_distribution([mattr_score])
+     ttr_std_dev_dist = make_distribution([ttr_std_dev])
+     min_ttr_dist = make_distribution([min_ttr])
+     max_ttr_dist = make_distribution([max_ttr])
+
+     # Step 8: Return result
+     return MATTRResult(
+         mattr_score=mattr_score,
+         window_size=window_size,
+         window_count=len(window_ttrs),
+         ttr_std_dev=ttr_std_dev,
+         min_ttr=min_ttr,
+         max_ttr=max_ttr,
+         mattr_score_dist=mattr_score_dist,
+         ttr_std_dev_dist=ttr_std_dev_dist,
+         min_ttr_dist=min_ttr_dist,
+         max_ttr_dist=max_ttr_dist,
+         chunk_size=chunk_size,
+         chunk_count=1, # Single pass analysis
+         metadata=metadata,
+     )
+
+
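# Editorial usage sketch for compute_mattr, not part of the published file; the import
# path is assumed from the file layout above. Because the loop slides the window one
# token at a time, window_count equals total_tokens - window_size + 1.
from pystylometry.lexical.advanced_diversity import compute_mattr

text = "the quick brown fox jumps over the lazy dog " * 30  # 270 punctuation-free tokens

result = compute_mattr(text, window_size=50)
print(result.mattr_score)              # mean TTR over all 50-token windows
print(result.window_count)             # 270 - 50 + 1 = 221 windows
print(result.min_ttr, result.max_ttr)  # spread of per-window TTRs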
+ def compute_hdd(text: str, sample_size: int = 42, chunk_size: int = 1000) -> HDDResult:
+     """
+     Compute HD-D (Hypergeometric Distribution D).
+
+     HD-D uses the hypergeometric distribution to model the probability of
+     encountering new word types as text length increases. It provides a
+     probabilistic measure of lexical diversity that is less sensitive to
+     text length than simple TTR.
+
+     Related GitHub Issue:
+         #14 - Advanced Lexical Diversity Metrics
+         https://github.com/craigtrim/pystylometry/issues/14
+
+     The algorithm:
+         1. For each word type in the text, calculate the probability that
+            it would NOT appear in a random sample of size N
+         2. Sum these probabilities across all types
+         3. This sum represents the expected number of new types in a sample
+         4. HD-D is derived from this expected value
+
+     The hypergeometric distribution P(X=0) gives the probability that a word
+     type with frequency f does not appear in a random sample of size N from
+     a text of length T.
+
+     Advantages over TTR:
+         - Mathematically rigorous (probability-based)
+         - Less sensitive to text length
+         - Well-defined statistical properties
+         - Good empirical performance (McCarthy & Jarvis 2010)
+
+     Disadvantages:
+         - Computationally complex
+         - Requires understanding of probability theory
+         - Sample size choice affects results
+         - Less intuitive than TTR
+
+     Args:
+         text: Input text to analyze. Should contain at least 50+ tokens
+             for reliable HD-D calculation.
+         sample_size: Size of hypothetical sample for calculation. Default is
+             42 tokens, following McCarthy & Jarvis (2010). The optimal
+             sample size is typically 35-50 tokens.
+
+     Returns:
+         HDDResult containing:
+             - hdd_score: The HD-D value (higher = more diverse)
+             - sample_size: Sample size used for calculation
+             - type_count: Number of unique types in text
+             - token_count: Number of tokens in text
+             - metadata: Probability distribution details
+
+     Example:
+         >>> result = compute_hdd("Sample text for analysis...")
+         >>> print(f"HD-D score: {result.hdd_score:.3f}")
+         HD-D score: 0.823
+         >>> print(f"Sample size: {result.sample_size}")
+         Sample size: 42
+         >>> print(f"Types: {result.type_count}, Tokens: {result.token_count}")
+         Types: 67, Tokens: 150
+
+         >>> # Empty text handling
+         >>> result = compute_hdd("")
+         >>> import math
+         >>> math.isnan(result.hdd_score)
+         True
+
+     Note:
+         - HD-D values range from 0 (no diversity) to 1 (perfect diversity)
+         - Requires scipy for hypergeometric distribution calculations
+         - Sample size should be smaller than text length
+         - Very short texts may produce unreliable HD-D values
+         - HD-D correlates highly with other diversity measures but is more stable
+     """
+     # Step 1: Tokenize text
+     tokens = _tokenize_for_diversity(text)
+     total_tokens = len(tokens)
+
+     # Step 2: Validate minimum length
+     if total_tokens < sample_size:
+         raise ValueError(f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D")
+
+     # Step 3: Build frequency distribution
+     type_counts: dict[str, int] = {}
+     for token in tokens:
+         type_counts[token] = type_counts.get(token, 0) + 1
+
+     total_types = len(type_counts)
+
+     # Step 4: Calculate HD-D using hypergeometric distribution
+     # HD-D = sum over all types of P(X = 0)
+     # where P(X = 0) is probability that type does NOT appear in random sample
+     #
+     # Using simplified formula (stable and no scipy required):
+     # P(X=0) = ((total_tokens - count) / total_tokens)^sample_size
+
+     hdd_sum = 0.0
+
+     for word_type, count in type_counts.items():
+         # Probability this type does NOT appear in sample of size sample_size
+         prob_not_appear = ((total_tokens - count) / total_tokens) ** sample_size
+         hdd_sum += prob_not_appear
+
+     # Step 5: Build metadata
+     metadata = {
+         "total_token_count": total_tokens,
+         "total_type_count": total_types,
+         "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
+         "hypergeometric_sum": hdd_sum,
+         "calculation_method": "simplified",
+     }
+
+     # Step 6: Create distribution (single-pass analysis)
+     hdd_score_dist = make_distribution([hdd_sum])
+
+     # Step 7: Return result
+     return HDDResult(
+         hdd_score=hdd_sum,
+         sample_size=sample_size,
+         type_count=total_types,
+         token_count=total_tokens,
+         hdd_score_dist=hdd_score_dist,
+         chunk_size=chunk_size,
+         chunk_count=1, # Single pass analysis
+         metadata=metadata,
+     )
+
+
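# Editorial usage sketch for compute_hdd, not part of the published file; the import path
# is assumed from the file layout above. The score is the sum, over word types, of the
# simplified "type absent from the sample" probability ((T - f) / T) ** sample_size, so
# for a punctuation-free text where str.split() matches the module's tokenizer it can be
# re-derived by hand.
from collections import Counter

from pystylometry.lexical.advanced_diversity import compute_hdd

text = "the quick brown fox jumps over the lazy dog " * 30
result = compute_hdd(text, sample_size=42)

tokens = text.lower().split()                   # same tokens the module sees here
counts = Counter(tokens)
manual = sum(((len(tokens) - f) / len(tokens)) ** 42 for f in counts.values())
print(result.hdd_score, manual)                 # the two values agree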
+ def compute_msttr(text: str, segment_size: int = 100, chunk_size: int = 1000) -> MSTTRResult:
+     """
+     Compute Mean Segmental Type-Token Ratio (MSTTR).
+
+     MSTTR divides text into sequential, non-overlapping segments of equal
+     length, calculates TTR for each segment, then averages across segments.
+     This normalizes for text length and provides a stable diversity measure.
+
+     Related GitHub Issue:
+         #14 - Advanced Lexical Diversity Metrics
+         https://github.com/craigtrim/pystylometry/issues/14
+
+     The algorithm:
+         1. Divide text into non-overlapping segments of segment_size tokens
+         2. Calculate TTR for each complete segment
+         3. Discard any remaining tokens that don't form a complete segment
+         4. Average TTRs across all segments
+         5. Compute statistics (std dev, min, max) across segments
+
+     Advantages over TTR:
+         - Normalized for text length (fixed segment size)
+         - Simple and intuitive
+         - Fast computation
+         - Independent segments (unlike MATTR's overlapping windows)
+
+     Disadvantages:
+         - Discards incomplete final segment (information loss)
+         - Requires choosing segment size (affects results)
+         - Needs longer texts to produce multiple segments
+         - Segment boundaries are arbitrary
+
+     Args:
+         text: Input text to analyze. Should contain at least segment_size tokens.
+             Texts shorter than segment_size will return NaN. Longer texts
+             will have leftover tokens discarded if they don't form a complete
+             segment.
+         segment_size: Size of each segment in tokens. Default is 100 following
+             Johnson (1944). Larger segments are more stable but need
+             longer texts. Smaller segments are noisier but work with
+             shorter texts.
+
+     Returns:
+         MSTTRResult containing:
+             - msttr_score: Mean TTR across all segments
+             - segment_size: Size of each segment used
+             - segment_count: Number of complete segments analyzed
+             - ttr_std_dev: Standard deviation of TTR across segments
+             - min_ttr: Minimum TTR in any segment
+             - max_ttr: Maximum TTR in any segment
+             - segment_ttrs: List of TTR for each segment
+             - metadata: Segment details, tokens used/discarded
+
+     Example:
+         >>> result = compute_msttr("Long text with many segments...", segment_size=100)
+         >>> print(f"MSTTR score: {result.msttr_score:.3f}")
+         MSTTR score: 0.734
+         >>> print(f"Segments: {result.segment_count}")
+         Segments: 8
+         >>> print(f"TTR range: {result.min_ttr:.3f} to {result.max_ttr:.3f}")
+         TTR range: 0.680 to 0.790
+
+         >>> # Short text handling
+         >>> short_text = "Too short"
+         >>> result = compute_msttr(short_text, segment_size=100)
+         >>> import math
+         >>> math.isnan(result.msttr_score)
+         True
+
+     Note:
+         - Segment size choice affects results (common values: 50, 100, 200)
+         - Standard segment size is 100 tokens (Johnson 1944)
+         - Leftover tokens are discarded (e.g., 250 tokens → 2 segments of 100)
+         - At least 1 complete segment required (min text length = segment_size)
+         - High TTR std dev suggests inconsistent lexical diversity across text
+         - MSTTR values range from 0 (no diversity) to 1 (perfect diversity)
+     """
+     # Step 1: Tokenize text
+     tokens = _tokenize_for_diversity(text)
+     total_tokens = len(tokens)
+     total_types = len(set(tokens))
+
+     # Step 2: Validate minimum length
+     if total_tokens < segment_size:
+         raise ValueError(
+             f"Text has {total_tokens} tokens, minimum {segment_size} required for MSTTR"
+         )
+
+     # Step 3: Calculate number of complete segments
+     segment_count = total_tokens // segment_size
+
+     # Step 4: Calculate TTR for each segment
+     segment_ttrs = []
+
+     for i in range(segment_count):
+         # Extract segment
+         start = i * segment_size
+         end = start + segment_size
+         segment = tokens[start:end]
+
+         # Calculate TTR for this segment
+         segment_types = len(set(segment))
+         ttr = segment_types / segment_size
+         segment_ttrs.append(ttr)
+
+     # Step 5: Calculate MSTTR (mean of segment TTRs)
+     msttr_score = sum(segment_ttrs) / len(segment_ttrs)
+
+     # Step 6: Calculate statistics
+     # Standard deviation
+     variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(segment_ttrs)
+     ttr_std_dev = variance**0.5
+
+     # Min and max
+     min_ttr = min(segment_ttrs)
+     max_ttr = max(segment_ttrs)
+
+     # Step 7: Calculate tokens used/discarded
+     tokens_used = segment_count * segment_size
+     tokens_discarded = total_tokens - tokens_used
+
+     # Step 8: Build metadata
+     metadata = {
+         "total_token_count": total_tokens,
+         "total_type_count": total_types,
+         "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
+         "tokens_used": tokens_used,
+         "tokens_discarded": tokens_discarded,
+         "first_segment_ttr": segment_ttrs[0],
+         "last_segment_ttr": segment_ttrs[-1],
+     }
+
+     # Step 9: Create distributions (single-pass analysis)
+     msttr_score_dist = make_distribution([msttr_score])
+     ttr_std_dev_dist = make_distribution([ttr_std_dev])
+     min_ttr_dist = make_distribution([min_ttr])
+     max_ttr_dist = make_distribution([max_ttr])
+
+     # Step 10: Return result
+     return MSTTRResult(
+         msttr_score=msttr_score,
+         segment_size=segment_size,
+         segment_count=segment_count,
+         ttr_std_dev=ttr_std_dev,
+         min_ttr=min_ttr,
+         max_ttr=max_ttr,
+         segment_ttrs=segment_ttrs,
+         msttr_score_dist=msttr_score_dist,
+         ttr_std_dev_dist=ttr_std_dev_dist,
+         min_ttr_dist=min_ttr_dist,
+         max_ttr_dist=max_ttr_dist,
+         chunk_size=chunk_size,
+         chunk_count=1, # Single pass analysis
+         metadata=metadata,
+     )
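# Editorial usage sketch for compute_msttr, not part of the published file; the import
# path is assumed from the file layout above. Only complete, non-overlapping segments are
# scored and leftover tokens are dropped, so 270 tokens with segment_size=100 give 2
# segments and 70 discarded tokens.
from pystylometry.lexical.advanced_diversity import compute_msttr

text = "the quick brown fox jumps over the lazy dog " * 30  # 270 tokens

result = compute_msttr(text, segment_size=100)
print(result.msttr_score)                         # mean of the per-segment TTRs
print(result.segment_count, result.segment_ttrs)  # 2 segments, one TTR per segment
print(result.metadata["tokens_discarded"])        # 70 tokens did not fill a segment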