pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
@@ -0,0 +1,641 @@
1
+ """Advanced lexical diversity metrics.
2
+
3
+ This module provides sophisticated measures of lexical diversity that go beyond
4
+ simple Type-Token Ratio (TTR). These metrics are designed to control for text
5
+ length and provide more stable, comparable measures across texts of different sizes.
6
+
7
+ Related GitHub Issue:
8
+ #14 - Advanced Lexical Diversity Metrics
9
+ https://github.com/craigtrim/pystylometry/issues/14
10
+
11
+ Metrics implemented:
12
+ - voc-D: Mathematical model-based diversity estimate
13
+ - MATTR: Moving-Average Type-Token Ratio
14
+ - HD-D: Hypergeometric Distribution D
15
+ - MSTTR: Mean Segmental Type-Token Ratio
16
+
17
+ Each of these metrics addresses the "text length problem" that affects simple
18
+ TTR: longer texts tend to have lower TTR values because words repeat. These
19
+ advanced metrics normalize for length in different ways.
20
+
21
+ References:
22
+ McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
23
+ study of sophisticated approaches to lexical diversity assessment.
24
+ Behavior Research Methods, 42(2), 381-392.
25
+ Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004).
26
+ Lexical Diversity and Language Development. Palgrave Macmillan.
27
+ Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
28
+ The moving-average type-token ratio (MATTR). Journal of Quantitative
29
+ Linguistics, 17(2), 94-100.
30
+ """
31
+
32
+ import random
33
+ from typing import Optional
34
+
35
+ from .._types import HDDResult, MATTRResult, MSTTRResult, VocdDResult
36
+
37
+
38
+ def _tokenize_for_diversity(text: str) -> list[str]:
39
+ """Tokenize text for lexical diversity analysis.
40
+
41
+ This helper function provides consistent tokenization across all
42
+ diversity metrics. It:
43
+ - Converts text to lowercase
44
+ - Splits on whitespace
45
+ - Strips punctuation from each token
46
+ - Returns list of clean tokens
47
+
48
+ Args:
49
+ text: Input text to tokenize
50
+
51
+ Returns:
52
+ List of lowercase tokens with punctuation removed
53
+ """
54
+ if not text or not text.strip():
55
+ return []
56
+
57
+ # Lowercase entire text
58
+ text_lower = text.lower()
59
+
60
+ # Split on whitespace
61
+ raw_tokens = text_lower.split()
62
+
63
+ # Comprehensive punctuation set for stripping
64
+ PUNCTUATION = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„\"\"''‚'")
65
+
66
+ # Strip punctuation from each token
67
+ tokens = []
68
+ for token in raw_tokens:
69
+ # Strip leading and trailing punctuation
70
+ clean_token = token.strip("".join(PUNCTUATION))
71
+ if clean_token: # Only add non-empty tokens
72
+ tokens.append(clean_token)
73
+
74
+ return tokens
75
+
76
+
77
def compute_vocd_d(
    text: str,
    sample_size: int = 35,
    num_samples: int = 100,
    min_tokens: int = 100,
    random_seed: Optional[int] = None,
) -> VocdDResult:
    """
    Compute voc-D (vocabulary D) using a curve-fitting approach.

    voc-D estimates lexical diversity by fitting a mathematical model to
    the relationship between sample size and mean TTR across many random
    samples. The D parameter is more stable across text lengths than
    simple TTR: higher D indicates greater lexical diversity.

    Related GitHub Issue:
        #14 - Advanced Lexical Diversity Metrics
        https://github.com/craigtrim/pystylometry/issues/14

    The algorithm:
        1. Take multiple random samples at a range of sample sizes
        2. For each sample size, calculate the mean TTR across samples
        3. Fit the model TTR = D / sqrt(sample_size) by least squares
        4. Report the best-fit D and the fit quality (R²)

    Args:
        text: Input text to analyze. Must contain at least ``min_tokens``
            tokens for reliable D estimation.
        sample_size: Nominal sample size, reported back as
            ``optimal_sample_size`` in the result. Default 35, following
            Malvern et al. (2004). The fitting itself sweeps sizes from
            10 up to min(100, token count - 10).
        num_samples: Number of random samples drawn per sample size.
            More samples increase accuracy and computation time.
            Default 100.
        min_tokens: Minimum tokens required for D calculation.
            Default 100.
        random_seed: Optional seed for the ``random`` module, making the
            sampling (and therefore D) reproducible across runs.

    Returns:
        VocdDResult containing:
        - d_parameter: The D value (higher = more diverse)
        - curve_fit_r_squared: Quality of curve fit (closer to 1.0 is better)
        - sample_count: Number of sample sizes actually used
        - optimal_sample_size: The ``sample_size`` argument, echoed back
        - metadata: Sampling details, per-size mean TTRs, seed used

    Raises:
        ValueError: If the text has fewer than ``min_tokens`` tokens.

    Example:
        >>> text = "Long sample text with sufficient tokens..."
        >>> result = compute_vocd_d(text, sample_size=35, num_samples=100)
        >>> print(f"D parameter: {result.d_parameter:.2f}")
        D parameter: 67.34

    Note:
        - Results vary slightly between runs unless ``random_seed`` is set
        - D values typically range from 10 (low) to 100+ (high diversity)
        - Poor curve fits (low R²) indicate unreliable D estimates
    """
    # Seed the global RNG so the random samples are reproducible.
    if random_seed is not None:
        random.seed(random_seed)

    # Step 1: Tokenize text
    tokens = _tokenize_for_diversity(text)
    total_tokens = len(tokens)
    total_types = len(set(tokens))

    # Step 2: Validate minimum length
    if total_tokens < min_tokens:
        raise ValueError(
            f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D"
        )

    # Step 3: Determine sample sizes to test
    # Test from 10 tokens up to min(100, total_tokens - 10)
    min_sample_size = 10
    max_sample_size = min(100, total_tokens - 10)

    # Create list of sample sizes (every 5 tokens)
    sample_sizes = list(range(min_sample_size, max_sample_size + 1, 5))

    # Ensure we have at least a few sample sizes; very short texts fall
    # back to a step of 1 so the fit still has enough points.
    if len(sample_sizes) < 3:
        sample_sizes = list(range(min_sample_size, max_sample_size + 1))

    # Step 4: For each sample size, take random samples and calculate mean TTR
    sample_size_to_mean_ttr: dict[int, float] = {}

    for size in sample_sizes:
        ttrs = []
        for _ in range(num_samples):
            # Random sample of 'size' tokens (without replacement)
            sample = random.sample(tokens, size)
            sample_types = len(set(sample))
            ttr = sample_types / size
            ttrs.append(ttr)

        # Mean TTR for this sample size
        mean_ttr = sum(ttrs) / len(ttrs)
        sample_size_to_mean_ttr[size] = mean_ttr

    # Step 5: Fit curve using model: TTR = D / sqrt(sample_size)
    # Least-squares for y = a/sqrt(x): minimize sum((y_i - a/sqrt(x_i))^2)
    # Closed-form solution: a = sum(y_i/sqrt(x_i)) / sum(1/x_i)
    numerator = 0.0
    denominator = 0.0

    for size, ttr in sample_size_to_mean_ttr.items():
        numerator += ttr / (size**0.5)
        denominator += 1.0 / size

    D = numerator / denominator if denominator > 0 else 0.0

    # Step 6: Calculate R² (goodness of fit) against the fitted model.
    # Dict preserves insertion order, so y_actual aligns with sample_sizes.
    y_actual = list(sample_size_to_mean_ttr.values())
    y_predicted = [D / (size**0.5) for size in sample_sizes]

    mean_y = sum(y_actual) / len(y_actual)
    ss_tot = sum((y - mean_y) ** 2 for y in y_actual)
    ss_res = sum((y_actual[i] - y_predicted[i]) ** 2 for i in range(len(y_actual)))

    r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0

    # Step 7: Build metadata
    metadata = {
        "total_token_count": total_tokens,
        "total_type_count": total_types,
        "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
        "sample_sizes_used": sample_sizes,
        "mean_ttrs_per_sample_size": list(sample_size_to_mean_ttr.values()),
        "num_samples_per_size": num_samples,
        "random_seed": random_seed,
    }

    # Step 8: Return result
    return VocdDResult(
        d_parameter=D,
        curve_fit_r_squared=r_squared,
        sample_count=len(sample_sizes),
        optimal_sample_size=sample_size,  # Input parameter, echoed back
        metadata=metadata,
    )
248
+
249
+
250
def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
    """
    Compute Moving-Average Type-Token Ratio (MATTR).

    MATTR calculates TTR in a moving window of fixed size, then averages
    across all window positions. This yields a length-normalized measure
    that is more stable than simple TTR and comparable across texts of
    different lengths (Covington & McFall 2010).

    Related GitHub Issue:
        #14 - Advanced Lexical Diversity Metrics
        https://github.com/craigtrim/pystylometry/issues/14

    The algorithm:
        1. Slide a window of fixed size across the text (token by token)
        2. Calculate TTR for each window position
        3. Average all window TTRs to get MATTR
        4. Also compute statistics (std dev, min, max) across windows

    Args:
        text: Input text to analyze. Must contain at least
            ``window_size`` tokens.
        window_size: Size of moving window in tokens. Default is 50,
            following Covington & McFall (2010). Larger windows are more
            stable but require longer texts; smaller windows are noisier.

    Returns:
        MATTRResult containing:
        - mattr_score: Average TTR across all windows
        - window_size: Size of window used
        - window_count: Number of windows analyzed
        - ttr_std_dev: Standard deviation of TTR across windows
        - min_ttr: Minimum TTR in any window
        - max_ttr: Maximum TTR in any window
        - metadata: Totals plus first/last window TTRs

    Raises:
        ValueError: If the text has fewer than ``window_size`` tokens.

    Example:
        >>> result = compute_mattr("Sample text here...", window_size=50)
        >>> print(f"MATTR score: {result.mattr_score:.3f}")
        MATTR score: 0.847

    Note:
        - Window size choice affects results (no universally optimal value)
        - Adjacent windows overlap, so they are not independent samples
        - High TTR std dev suggests uneven lexical distribution
        - MATTR values range from 0 (no diversity) to 1 (perfect diversity)
    """
    # Step 1: Tokenize text
    tokens = _tokenize_for_diversity(text)
    total_tokens = len(tokens)
    total_types = len(set(tokens))

    # Step 2: Validate minimum length
    if total_tokens < window_size:
        raise ValueError(
            f"Text has {total_tokens} tokens, minimum {window_size} required for MATTR"
        )

    # Step 3: Slide window across text and calculate TTR for each position
    window_ttrs = []

    for i in range(total_tokens - window_size + 1):
        # Extract window
        window = tokens[i : i + window_size]

        # TTR for this window (types / fixed window size)
        window_types = len(set(window))
        ttr = window_types / window_size
        window_ttrs.append(ttr)

    # Step 4: Calculate MATTR (mean of all window TTRs)
    mattr_score = sum(window_ttrs) / len(window_ttrs)

    # Step 5: Calculate statistics (population std dev, min, max)
    variance = sum((ttr - mattr_score) ** 2 for ttr in window_ttrs) / len(window_ttrs)
    ttr_std_dev = variance**0.5

    min_ttr = min(window_ttrs)
    max_ttr = max(window_ttrs)

    # Step 6: Build metadata
    metadata = {
        "total_token_count": total_tokens,
        "total_type_count": total_types,
        "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
        "first_window_ttr": window_ttrs[0],
        "last_window_ttr": window_ttrs[-1],
    }

    # Step 7: Return result
    return MATTRResult(
        mattr_score=mattr_score,
        window_size=window_size,
        window_count=len(window_ttrs),
        ttr_std_dev=ttr_std_dev,
        min_ttr=min_ttr,
        max_ttr=max_ttr,
        metadata=metadata,
    )
373
+
374
+
375
def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:
    """
    Compute HD-D (Hypergeometric Distribution D).

    HD-D models the probability of encountering each word type in a
    random sample of fixed size drawn from the text. Each type
    contributes its probability of appearing at least once, scaled by
    1/sample_size; the sum is the expected TTR of such a sample
    (McCarthy & Jarvis 2010). This is less sensitive to text length
    than simple TTR.

    Related GitHub Issue:
        #14 - Advanced Lexical Diversity Metrics
        https://github.com/craigtrim/pystylometry/issues/14

    The algorithm:
        1. For each word type, calculate the probability that it appears
           at least once in a random sample of ``sample_size`` tokens
        2. Sum these probabilities (the expected number of distinct
           types in the sample)
        3. Divide by ``sample_size`` to obtain HD-D, the expected TTR

    The per-type absence probability is approximated with the simplified
    formula ((total_tokens - count) / total_tokens) ** sample_size,
    which is numerically stable and requires no scipy dependency. The
    exact hypergeometric P(X=0) differs slightly because sampling is
    without replacement.

    Args:
        text: Input text to analyze. Must contain at least
            ``sample_size`` tokens.
        sample_size: Size of the hypothetical sample. Default is 42
            tokens, following McCarthy & Jarvis (2010); the optimal
            sample size is typically 35-50 tokens.

    Returns:
        HDDResult containing:
        - hdd_score: The HD-D value in (0, 1]; higher = more diverse
        - sample_size: Sample size used for calculation
        - type_count: Number of unique types in text
        - token_count: Number of tokens in text
        - metadata: Probability-sum details and calculation method

    Raises:
        ValueError: If the text has fewer than ``sample_size`` tokens.

    Example:
        >>> result = compute_hdd("Sample text for analysis...")
        >>> print(f"HD-D score: {result.hdd_score:.3f}")
        HD-D score: 0.823

    Note:
        - HD-D values range from 0 (no diversity) to 1 (perfect diversity)
        - Sample size must be smaller than text length
        - Very short texts may produce unreliable HD-D values
    """
    # Step 1: Tokenize text
    tokens = _tokenize_for_diversity(text)
    total_tokens = len(tokens)

    # Step 2: Validate minimum length
    if total_tokens < sample_size:
        raise ValueError(
            f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D"
        )

    # Step 3: Build frequency distribution
    type_counts: dict[str, int] = {}
    for token in tokens:
        type_counts[token] = type_counts.get(token, 0) + 1

    total_types = len(type_counts)

    # Step 4: Calculate HD-D.
    # For each type, P(appears at least once) = 1 - P(absent from sample).
    # Summing over types gives the expected number of distinct types in
    # the sample; dividing by sample_size gives the expected sample TTR.
    #
    # BUG FIX: the previous implementation summed P(absent) directly,
    # producing an unbounded score inverted relative to the documented
    # contract ("0 to 1, higher = more diverse").
    expected_types_in_sample = 0.0

    for count in type_counts.values():
        # Simplified (with-replacement) probability this type does NOT
        # appear in a random sample of sample_size tokens.
        prob_not_appear = ((total_tokens - count) / total_tokens) ** sample_size
        expected_types_in_sample += 1.0 - prob_not_appear

    hdd_score = expected_types_in_sample / sample_size

    # Step 5: Build metadata
    metadata = {
        "total_token_count": total_tokens,
        "total_type_count": total_types,
        "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
        "hypergeometric_sum": expected_types_in_sample,
        "calculation_method": "simplified",
    }

    # Step 6: Return result
    return HDDResult(
        hdd_score=hdd_score,
        sample_size=sample_size,
        type_count=total_types,
        token_count=total_tokens,
        metadata=metadata,
    )
496
+
497
+
498
def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
    """
    Compute Mean Segmental Type-Token Ratio (MSTTR).

    MSTTR divides text into sequential, non-overlapping segments of
    equal length, calculates TTR for each complete segment, then
    averages across segments. This normalizes for text length and
    provides a stable diversity measure (Johnson 1944).

    Related GitHub Issue:
        #14 - Advanced Lexical Diversity Metrics
        https://github.com/craigtrim/pystylometry/issues/14

    The algorithm:
        1. Divide text into non-overlapping segments of segment_size tokens
        2. Calculate TTR for each complete segment
        3. Discard any remaining tokens that don't form a complete segment
        4. Average TTRs across all segments
        5. Compute statistics (std dev, min, max) across segments

    Args:
        text: Input text to analyze. Must contain at least
            ``segment_size`` tokens. Leftover tokens that do not form a
            complete segment are discarded.
        segment_size: Size of each segment in tokens. Default is 100,
            following Johnson (1944). Larger segments are more stable
            but need longer texts; smaller segments are noisier.

    Returns:
        MSTTRResult containing:
        - msttr_score: Mean TTR across all segments
        - segment_size: Size of each segment used
        - segment_count: Number of complete segments analyzed
        - ttr_std_dev: Standard deviation of TTR across segments
        - min_ttr: Minimum TTR in any segment
        - max_ttr: Maximum TTR in any segment
        - segment_ttrs: List of TTR for each segment
        - metadata: Segment details, tokens used/discarded

    Raises:
        ValueError: If the text has fewer than ``segment_size`` tokens.

    Example:
        >>> result = compute_msttr("Long text with many segments...", segment_size=100)
        >>> print(f"MSTTR score: {result.msttr_score:.3f}")
        MSTTR score: 0.734

    Note:
        - Segment size choice affects results (common values: 50, 100, 200)
        - Leftover tokens are discarded (e.g., 250 tokens → 2 segments of 100)
        - Segments are independent (unlike MATTR's overlapping windows)
        - High TTR std dev suggests inconsistent diversity across the text
        - MSTTR values range from 0 (no diversity) to 1 (perfect diversity)
    """
    # Step 1: Tokenize text
    tokens = _tokenize_for_diversity(text)
    total_tokens = len(tokens)
    total_types = len(set(tokens))

    # Step 2: Validate minimum length
    if total_tokens < segment_size:
        raise ValueError(
            f"Text has {total_tokens} tokens, minimum {segment_size} required for MSTTR"
        )

    # Step 3: Calculate number of complete segments
    segment_count = total_tokens // segment_size

    # Step 4: Calculate TTR for each segment
    segment_ttrs = []

    for i in range(segment_count):
        # Extract segment
        start = i * segment_size
        end = start + segment_size
        segment = tokens[start:end]

        # TTR for this segment (types / fixed segment size)
        segment_types = len(set(segment))
        ttr = segment_types / segment_size
        segment_ttrs.append(ttr)

    # Step 5: Calculate MSTTR (mean of segment TTRs)
    msttr_score = sum(segment_ttrs) / len(segment_ttrs)

    # Step 6: Calculate statistics (population std dev, min, max)
    variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(
        segment_ttrs
    )
    ttr_std_dev = variance**0.5

    min_ttr = min(segment_ttrs)
    max_ttr = max(segment_ttrs)

    # Step 7: Calculate tokens used/discarded
    tokens_used = segment_count * segment_size
    tokens_discarded = total_tokens - tokens_used

    # Step 8: Build metadata
    metadata = {
        "total_token_count": total_tokens,
        "total_type_count": total_types,
        "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
        "tokens_used": tokens_used,
        "tokens_discarded": tokens_discarded,
        "first_segment_ttr": segment_ttrs[0],
        "last_segment_ttr": segment_ttrs[-1],
    }

    # Step 9: Return result
    return MSTTRResult(
        msttr_score=msttr_score,
        segment_size=segment_size,
        segment_count=segment_count,
        ttr_std_dev=ttr_std_dev,
        min_ttr=min_ttr,
        max_ttr=max_ttr,
        segment_ttrs=segment_ttrs,
        metadata=metadata,
    )