pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,680 @@
"""Advanced lexical diversity metrics.

This module provides sophisticated measures of lexical diversity that go beyond
simple Type-Token Ratio (TTR). These metrics are designed to control for text
length and provide more stable, comparable measures across texts of different sizes.

Related GitHub Issue:
    #14 - Advanced Lexical Diversity Metrics
    https://github.com/craigtrim/pystylometry/issues/14

Metrics implemented:
    - voc-D: Mathematical model-based diversity estimate
    - MATTR: Moving-Average Type-Token Ratio
    - HD-D: Hypergeometric Distribution D
    - MSTTR: Mean Segmental Type-Token Ratio

Each of these metrics addresses the "text length problem" that affects simple
TTR: longer texts tend to have lower TTR values because words repeat. These
advanced metrics normalize for length in different ways.

References:
    McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
        study of sophisticated approaches to lexical diversity assessment.
        Behavior Research Methods, 42(2), 381-392.
    Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004).
        Lexical Diversity and Language Development. Palgrave Macmillan.
    Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
        The moving-average type-token ratio (MATTR). Journal of Quantitative
        Linguistics, 17(2), 94-100.
"""

import random
from typing import Optional

from .._types import (
    HDDResult,
    MATTRResult,
    MSTTRResult,
    VocdDResult,
    make_distribution,
)


def _tokenize_for_diversity(text: str) -> list[str]:
    """Tokenize text for lexical diversity analysis.

    This helper function provides consistent tokenization across all
    diversity metrics. It:
    - Converts text to lowercase
    - Splits on whitespace
    - Strips punctuation from each token
    - Returns list of clean tokens

    Args:
        text: Input text to tokenize

    Returns:
        List of lowercase tokens with punctuation removed
    """
    if not text or not text.strip():
        return []

    # Lowercase entire text
    text_lower = text.lower()

    # Split on whitespace
    raw_tokens = text_lower.split()

    # Comprehensive punctuation set for stripping
    punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„\"\"''‚'")

    # Strip punctuation from each token
    tokens = []
    for token in raw_tokens:
        # Strip leading and trailing punctuation
        clean_token = token.strip("".join(punctuation_chars))
        if clean_token:  # Only add non-empty tokens
            tokens.append(clean_token)

    return tokens

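To make the helper's behaviour concrete, here is a small worked example, traced by hand from the code above rather than captured from a run: only leading and trailing punctuation is stripped, so interior apostrophes survive, and tokens that are pure punctuation reduce to empty strings and are dropped.

# Traced by hand from _tokenize_for_diversity above (illustrative, not a captured run)
sample = "Hello, world -- it's 'FINE'."
expected = ["hello", "world", "it's", "fine"]   # "--" strips to empty and is dropped
assert _tokenize_for_diversity(sample) == expected
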
def compute_vocd_d(
    text: str,
    sample_size: int = 35,
    num_samples: int = 100,
    min_tokens: int = 100,
    random_seed: Optional[int] = None,
    chunk_size: int = 1000,
) -> VocdDResult:
    """
    Compute voc-D (vocabulary D) using curve-fitting approach.

    voc-D estimates lexical diversity by fitting a mathematical model to the
    relationship between tokens and types across multiple random samples.
    The D parameter represents theoretical vocabulary size and is more stable
    across text lengths than simple TTR.

    Related GitHub Issue:
        #14 - Advanced Lexical Diversity Metrics
        https://github.com/craigtrim/pystylometry/issues/14

    The algorithm:
    1. Take multiple random samples of varying sizes from the text
    2. For each sample size, calculate the mean TTR across samples
    3. Fit a curve to the (sample_size, TTR) relationship
    4. The D parameter is the best-fit curve parameter
    5. Higher D values indicate greater lexical diversity

    Advantages over TTR:
    - Less sensitive to text length
    - More comparable across texts of different sizes
    - Theoretically grounded in vocabulary acquisition models
    - Widely used in language development research

    Disadvantages:
    - Computationally expensive (requires many random samples)
    - Requires sufficient text length (typically 100+ tokens)
    - Can be unstable with very short texts
    - Curve fitting may not converge in some cases

    Args:
        text: Input text to analyze. Should contain at least min_tokens words
            for reliable D estimation. Texts with fewer tokens will return
            NaN or raise an error.
        sample_size: Size of random samples to draw. Default is 35 tokens,
            following Malvern et al. (2004). Smaller sizes increase
            variance; larger sizes may exceed text length.
        num_samples: Number of random samples to draw for each sample size.
            More samples increase accuracy but also computation time.
            Default is 100 samples.
        min_tokens: Minimum tokens required for D calculation. Texts shorter
            than this will return NaN or error. Default is 100.

    Returns:
        VocdDResult containing:
        - d_parameter: The D value (higher = more diverse)
        - curve_fit_r_squared: Quality of curve fit (closer to 1.0 is better)
        - sample_count: Number of samples actually used
        - optimal_sample_size: Sample size used for calculation
        - metadata: Sampling details, convergence info, curve parameters

    Example:
        >>> text = "Long sample text with sufficient tokens..."
        >>> result = compute_vocd_d(text, sample_size=35, num_samples=100)
        >>> print(f"D parameter: {result.d_parameter:.2f}")
        D parameter: 67.34
        >>> print(f"Curve fit R²: {result.curve_fit_r_squared:.3f}")
        Curve fit R²: 0.987

        >>> # Short text handling
        >>> short_text = "Too short"
        >>> result = compute_vocd_d(short_text)
        >>> import math
        >>> math.isnan(result.d_parameter)
        True

    Note:
        - Requires random sampling, so results may vary slightly between runs
        - Use a random seed in metadata for reproducibility
        - Very short texts (< min_tokens) cannot be analyzed
        - D values typically range from 10 (low diversity) to 100+ (high diversity)
        - Curve fitting uses least-squares optimization
        - Poor curve fits (low R²) indicate unreliable D estimates
    """
    # Set random seed for reproducibility
    if random_seed is not None:
        random.seed(random_seed)

    # Step 1: Tokenize text
    tokens = _tokenize_for_diversity(text)
    total_tokens = len(tokens)
    total_types = len(set(tokens))

    # Step 2: Validate minimum length
    if total_tokens < min_tokens:
        raise ValueError(f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D")

    # Step 3: Determine sample sizes to test
    # Test from 10 tokens up to min(100, total_tokens - 10)
    min_sample_size = 10
    max_sample_size = min(100, total_tokens - 10)

    # Create list of sample sizes (every 5 tokens)
    sample_sizes = list(range(min_sample_size, max_sample_size + 1, 5))

    # Ensure we have at least a few sample sizes
    if len(sample_sizes) < 3:
        # If text is very short, just use what we can
        sample_sizes = list(range(min_sample_size, max_sample_size + 1))

    # Step 4: For each sample size, take random samples and calculate mean TTR
    sample_size_to_mean_ttr: dict[int, float] = {}

    for size in sample_sizes:
        ttrs = []
        for _ in range(num_samples):
            # Random sample of 'size' tokens
            sample = random.sample(tokens, size)
            sample_types = len(set(sample))
            ttr = sample_types / size
            ttrs.append(ttr)

        # Mean TTR for this sample size
        mean_ttr = sum(ttrs) / len(ttrs)
        sample_size_to_mean_ttr[size] = mean_ttr

    # Step 5: Fit curve using model: TTR = D / sqrt(sample_size)
    # Using least-squares fitting for y = a/sqrt(x)
    # Minimize: sum((y_i - a/sqrt(x_i))^2)
    # Solution: a = sum(y_i/sqrt(x_i)) / sum(1/x_i)

    numerator = 0.0
    denominator = 0.0

    for size, ttr in sample_size_to_mean_ttr.items():
        numerator += ttr / (size**0.5)
        denominator += 1.0 / size

    d_param = numerator / denominator if denominator > 0 else 0.0

    # Step 6: Calculate R² (goodness of fit)
    # Predicted TTR = D / sqrt(sample_size)
    y_actual = list(sample_size_to_mean_ttr.values())
    y_predicted = [d_param / (size**0.5) for size in sample_sizes]

    # R² calculation
    mean_y = sum(y_actual) / len(y_actual)
    ss_tot = sum((y - mean_y) ** 2 for y in y_actual)
    ss_res = sum((y_actual[i] - y_predicted[i]) ** 2 for i in range(len(y_actual)))

    r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0

    # Step 7: Build metadata
    metadata = {
        "total_token_count": total_tokens,
        "total_type_count": total_types,
        "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
        "sample_sizes_used": sample_sizes,
        "mean_ttrs_per_sample_size": list(sample_size_to_mean_ttr.values()),
        "num_samples_per_size": num_samples,
        "random_seed": random_seed,
    }

    # Step 8: Create distributions (single-pass analysis)
    d_parameter_dist = make_distribution([d_param])
    curve_fit_r_squared_dist = make_distribution([r_squared])

    # Step 9: Return result
    return VocdDResult(
        d_parameter=d_param,
        curve_fit_r_squared=r_squared,
        sample_count=len(sample_sizes),
        optimal_sample_size=sample_size,  # Input parameter
        d_parameter_dist=d_parameter_dist,
        curve_fit_r_squared_dist=curve_fit_r_squared_dist,
        chunk_size=chunk_size,
        chunk_count=1,  # Single pass analysis
        metadata=metadata,
    )

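The closed form in Step 5 comes from setting the derivative of the squared error to zero: for the model TTR = D/√n, minimizing Σ(yᵢ − D/√nᵢ)² gives D = Σ(yᵢ/√nᵢ) / Σ(1/nᵢ). A minimal, self-contained check of that algebra on noise-free synthetic points (illustrative only; it mirrors the arithmetic in the fitting loop above but is not taken from the package's tests):

# Minimal check of the Step 5 closed form on synthetic, noise-free data (illustrative only).
sizes = [10, 20, 30, 40, 50]
true_d = 3.0
mean_ttrs = [true_d / (n ** 0.5) for n in sizes]        # points lying exactly on TTR = D / sqrt(n)
d_hat = sum(y / (n ** 0.5) for n, y in zip(sizes, mean_ttrs)) / sum(1.0 / n for n in sizes)
assert abs(d_hat - true_d) < 1e-9                       # the estimator recovers D exactly
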
def compute_mattr(text: str, window_size: int = 50, chunk_size: int = 1000) -> MATTRResult:
    """
    Compute Moving-Average Type-Token Ratio (MATTR).

    MATTR calculates TTR using a moving window of fixed size, then averages
    across all windows. This provides a length-normalized measure that is
    more stable than simple TTR and comparable across texts of different lengths.

    Related GitHub Issue:
        #14 - Advanced Lexical Diversity Metrics
        https://github.com/craigtrim/pystylometry/issues/14

    The algorithm:
    1. Slide a window of fixed size across the text (token by token)
    2. Calculate TTR for each window position
    3. Average all window TTRs to get MATTR
    4. Also compute statistics (std dev, min, max) across windows

    Advantages over TTR:
    - Controlled for text length (fixed window size)
    - More comparable across texts
    - Computationally simple and fast
    - Intuitive interpretation (like TTR but normalized)

    Disadvantages:
    - Requires choosing window size (affects results)
    - Not applicable to texts shorter than window size
    - Adjacent windows overlap (not independent samples)

    Args:
        text: Input text to analyze. Must contain at least window_size tokens.
            Texts shorter than window_size will return NaN.
        window_size: Size of moving window in tokens. Default is 50, following
            Covington & McFall (2010). Larger windows are more stable
            but require longer texts. Smaller windows are noisier.

    Returns:
        MATTRResult containing:
        - mattr_score: Average TTR across all windows
        - window_size: Size of window used
        - window_count: Number of windows analyzed
        - ttr_std_dev: Standard deviation of TTR across windows
        - min_ttr: Minimum TTR in any window
        - max_ttr: Maximum TTR in any window
        - metadata: Window-by-window TTR values

    Example:
        >>> result = compute_mattr("Sample text here...", window_size=50)
        >>> print(f"MATTR score: {result.mattr_score:.3f}")
        MATTR score: 0.847
        >>> print(f"Windows analyzed: {result.window_count}")
        Windows analyzed: 123
        >>> print(f"TTR std dev: {result.ttr_std_dev:.3f}")
        TTR std dev: 0.042

        >>> # Short text handling
        >>> short_text = "Too short for window"
        >>> result = compute_mattr(short_text, window_size=50)
        >>> import math
        >>> math.isnan(result.mattr_score)
        True

    Note:
        - Window size choice affects results (no universally optimal value)
        - Standard window size is 50 tokens (Covington & McFall 2010)
        - For very short texts, consider reducing window size or using different metric
        - High TTR std dev suggests uneven lexical distribution
        - MATTR values range from 0 (no diversity) to 1 (perfect diversity)
    """
    # Step 1: Tokenize text
    tokens = _tokenize_for_diversity(text)
    total_tokens = len(tokens)
    total_types = len(set(tokens))

    # Step 2: Validate minimum length
    if total_tokens < window_size:
        raise ValueError(
            f"Text has {total_tokens} tokens, minimum {window_size} required for MATTR"
        )

    # Step 3: Slide window across text and calculate TTR for each position
    window_ttrs = []

    for i in range(total_tokens - window_size + 1):
        # Extract window
        window = tokens[i : i + window_size]

        # Calculate TTR for this window
        window_types = len(set(window))
        ttr = window_types / window_size
        window_ttrs.append(ttr)

    # Step 4: Calculate MATTR (mean of all window TTRs)
    mattr_score = sum(window_ttrs) / len(window_ttrs)

    # Step 5: Calculate statistics
    # Standard deviation
    variance = sum((ttr - mattr_score) ** 2 for ttr in window_ttrs) / len(window_ttrs)
    ttr_std_dev = variance**0.5

    # Min and max
    min_ttr = min(window_ttrs)
    max_ttr = max(window_ttrs)

    # Step 6: Build metadata
    metadata = {
        "total_token_count": total_tokens,
        "total_type_count": total_types,
        "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
        "first_window_ttr": window_ttrs[0],
        "last_window_ttr": window_ttrs[-1],
    }

    # Step 7: Create distributions (single-pass analysis)
    mattr_score_dist = make_distribution([mattr_score])
    ttr_std_dev_dist = make_distribution([ttr_std_dev])
    min_ttr_dist = make_distribution([min_ttr])
    max_ttr_dist = make_distribution([max_ttr])

    # Step 8: Return result
    return MATTRResult(
        mattr_score=mattr_score,
        window_size=window_size,
        window_count=len(window_ttrs),
        ttr_std_dev=ttr_std_dev,
        min_ttr=min_ttr,
        max_ttr=max_ttr,
        mattr_score_dist=mattr_score_dist,
        ttr_std_dev_dist=ttr_std_dev_dist,
        min_ttr_dist=min_ttr_dist,
        max_ttr_dist=max_ttr_dist,
        chunk_size=chunk_size,
        chunk_count=1,  # Single pass analysis
        metadata=metadata,
    )

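A hand-sized example of the sliding window above (illustrative only, with window_size=3 rather than the default 50): five tokens give three windows, and a repeated word inside a window lowers that window's TTR.

# Hand-sized MATTR walk-through (illustrative only), mirroring the window loop above.
tokens = ["the", "the", "cat", "sat", "the"]
w = 3
window_ttrs = [len(set(tokens[i:i + w])) / w for i in range(len(tokens) - w + 1)]
# windows: (the, the, cat) (the, cat, sat) (cat, sat, the)  ->  [2/3, 1.0, 1.0]
mattr = sum(window_ttrs) / len(window_ttrs)             # = 8/9 ≈ 0.889
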
def compute_hdd(text: str, sample_size: int = 42, chunk_size: int = 1000) -> HDDResult:
    """
    Compute HD-D (Hypergeometric Distribution D).

    HD-D uses the hypergeometric distribution to model the probability of
    encountering new word types as text length increases. It provides a
    probabilistic measure of lexical diversity that is less sensitive to
    text length than simple TTR.

    Related GitHub Issue:
        #14 - Advanced Lexical Diversity Metrics
        https://github.com/craigtrim/pystylometry/issues/14

    The algorithm:
    1. For each word type in the text, calculate the probability that
       it would NOT appear in a random sample of size N
    2. Sum these probabilities across all types
    3. This sum represents the expected number of new types in a sample
    4. HD-D is derived from this expected value

    The hypergeometric distribution P(X=0) gives the probability that a word
    type with frequency f does not appear in a random sample of size N from
    a text of length T.

    Advantages over TTR:
    - Mathematically rigorous (probability-based)
    - Less sensitive to text length
    - Well-defined statistical properties
    - Good empirical performance (McCarthy & Jarvis 2010)

    Disadvantages:
    - Computationally complex
    - Requires understanding of probability theory
    - Sample size choice affects results
    - Less intuitive than TTR

    Args:
        text: Input text to analyze. Should contain at least 50+ tokens
            for reliable HD-D calculation.
        sample_size: Size of hypothetical sample for calculation. Default is
            42 tokens, following McCarthy & Jarvis (2010). The optimal
            sample size is typically 35-50 tokens.

    Returns:
        HDDResult containing:
        - hdd_score: The HD-D value (higher = more diverse)
        - sample_size: Sample size used for calculation
        - type_count: Number of unique types in text
        - token_count: Number of tokens in text
        - metadata: Probability distribution details

    Example:
        >>> result = compute_hdd("Sample text for analysis...")
        >>> print(f"HD-D score: {result.hdd_score:.3f}")
        HD-D score: 0.823
        >>> print(f"Sample size: {result.sample_size}")
        Sample size: 42
        >>> print(f"Types: {result.type_count}, Tokens: {result.token_count}")
        Types: 67, Tokens: 150

        >>> # Empty text handling
        >>> result = compute_hdd("")
        >>> import math
        >>> math.isnan(result.hdd_score)
        True

    Note:
        - HD-D values range from 0 (no diversity) to 1 (perfect diversity)
        - Requires scipy for hypergeometric distribution calculations
        - Sample size should be smaller than text length
        - Very short texts may produce unreliable HD-D values
        - HD-D correlates highly with other diversity measures but is more stable
    """
    # Step 1: Tokenize text
    tokens = _tokenize_for_diversity(text)
    total_tokens = len(tokens)

    # Step 2: Validate minimum length
    if total_tokens < sample_size:
        raise ValueError(f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D")

    # Step 3: Build frequency distribution
    type_counts: dict[str, int] = {}
    for token in tokens:
        type_counts[token] = type_counts.get(token, 0) + 1

    total_types = len(type_counts)

    # Step 4: Calculate HD-D using hypergeometric distribution
    # HD-D = sum over all types of P(X = 0)
    # where P(X = 0) is probability that type does NOT appear in random sample
    #
    # Using simplified formula (stable and no scipy required):
    # P(X=0) = ((total_tokens - count) / total_tokens)^sample_size

    hdd_sum = 0.0

    for word_type, count in type_counts.items():
        # Probability this type does NOT appear in sample of size sample_size
        prob_not_appear = ((total_tokens - count) / total_tokens) ** sample_size
        hdd_sum += prob_not_appear

    # Step 5: Build metadata
    metadata = {
        "total_token_count": total_tokens,
        "total_type_count": total_types,
        "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
        "hypergeometric_sum": hdd_sum,
        "calculation_method": "simplified",
    }

    # Step 6: Create distribution (single-pass analysis)
    hdd_score_dist = make_distribution([hdd_sum])

    # Step 7: Return result
    return HDDResult(
        hdd_score=hdd_sum,
        sample_size=sample_size,
        type_count=total_types,
        token_count=total_tokens,
        hdd_score_dist=hdd_score_dist,
        chunk_size=chunk_size,
        chunk_count=1,  # Single pass analysis
        metadata=metadata,
    )

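The "simplified formula" in Step 4 replaces the exact hypergeometric P(X = 0) = C(T − f, N) / C(T, N) (sampling without replacement) with the with-replacement approximation ((T − f)/T)^N. The two are close when a type's frequency f is small relative to the text length T, with the approximation running slightly high. A standalone comparison using only the standard library (illustrative; the T, f, N values are arbitrary and not taken from the package):

# Exact hypergeometric P(X = 0) vs the with-replacement approximation used in Step 4
# (illustrative comparison only).
import math

T, f, N = 150, 3, 42                                    # text length, type frequency, sample size
exact = math.comb(T - f, N) / math.comb(T, N)           # P(type absent), sampling without replacement
approx = ((T - f) / T) ** N                             # simplified form from compute_hdd
print(f"exact={exact:.3f}  approx={approx:.3f}")        # ≈ 0.370 vs ≈ 0.428
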
def compute_msttr(text: str, segment_size: int = 100, chunk_size: int = 1000) -> MSTTRResult:
    """
    Compute Mean Segmental Type-Token Ratio (MSTTR).

    MSTTR divides text into sequential, non-overlapping segments of equal
    length, calculates TTR for each segment, then averages across segments.
    This normalizes for text length and provides a stable diversity measure.

    Related GitHub Issue:
        #14 - Advanced Lexical Diversity Metrics
        https://github.com/craigtrim/pystylometry/issues/14

    The algorithm:
    1. Divide text into non-overlapping segments of segment_size tokens
    2. Calculate TTR for each complete segment
    3. Discard any remaining tokens that don't form a complete segment
    4. Average TTRs across all segments
    5. Compute statistics (std dev, min, max) across segments

    Advantages over TTR:
    - Normalized for text length (fixed segment size)
    - Simple and intuitive
    - Fast computation
    - Independent segments (unlike MATTR's overlapping windows)

    Disadvantages:
    - Discards incomplete final segment (information loss)
    - Requires choosing segment size (affects results)
    - Needs longer texts to produce multiple segments
    - Segment boundaries are arbitrary

    Args:
        text: Input text to analyze. Should contain at least segment_size tokens.
            Texts shorter than segment_size will return NaN. Longer texts
            will have leftover tokens discarded if they don't form a complete
            segment.
        segment_size: Size of each segment in tokens. Default is 100 following
            Johnson (1944). Larger segments are more stable but need
            longer texts. Smaller segments are noisier but work with
            shorter texts.

    Returns:
        MSTTRResult containing:
        - msttr_score: Mean TTR across all segments
        - segment_size: Size of each segment used
        - segment_count: Number of complete segments analyzed
        - ttr_std_dev: Standard deviation of TTR across segments
        - min_ttr: Minimum TTR in any segment
        - max_ttr: Maximum TTR in any segment
        - segment_ttrs: List of TTR for each segment
        - metadata: Segment details, tokens used/discarded

    Example:
        >>> result = compute_msttr("Long text with many segments...", segment_size=100)
        >>> print(f"MSTTR score: {result.msttr_score:.3f}")
        MSTTR score: 0.734
        >>> print(f"Segments: {result.segment_count}")
        Segments: 8
        >>> print(f"TTR range: {result.min_ttr:.3f} to {result.max_ttr:.3f}")
        TTR range: 0.680 to 0.790

        >>> # Short text handling
        >>> short_text = "Too short"
        >>> result = compute_msttr(short_text, segment_size=100)
        >>> import math
        >>> math.isnan(result.msttr_score)
        True

    Note:
        - Segment size choice affects results (common values: 50, 100, 200)
        - Standard segment size is 100 tokens (Johnson 1944)
        - Leftover tokens are discarded (e.g., 250 tokens → 2 segments of 100)
        - At least 1 complete segment required (min text length = segment_size)
        - High TTR std dev suggests inconsistent lexical diversity across text
        - MSTTR values range from 0 (no diversity) to 1 (perfect diversity)
    """
    # Step 1: Tokenize text
    tokens = _tokenize_for_diversity(text)
    total_tokens = len(tokens)
    total_types = len(set(tokens))

    # Step 2: Validate minimum length
    if total_tokens < segment_size:
        raise ValueError(
            f"Text has {total_tokens} tokens, minimum {segment_size} required for MSTTR"
        )

    # Step 3: Calculate number of complete segments
    segment_count = total_tokens // segment_size

    # Step 4: Calculate TTR for each segment
    segment_ttrs = []

    for i in range(segment_count):
        # Extract segment
        start = i * segment_size
        end = start + segment_size
        segment = tokens[start:end]

        # Calculate TTR for this segment
        segment_types = len(set(segment))
        ttr = segment_types / segment_size
        segment_ttrs.append(ttr)

    # Step 5: Calculate MSTTR (mean of segment TTRs)
    msttr_score = sum(segment_ttrs) / len(segment_ttrs)

    # Step 6: Calculate statistics
    # Standard deviation
    variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(segment_ttrs)
    ttr_std_dev = variance**0.5

    # Min and max
    min_ttr = min(segment_ttrs)
    max_ttr = max(segment_ttrs)

    # Step 7: Calculate tokens used/discarded
    tokens_used = segment_count * segment_size
    tokens_discarded = total_tokens - tokens_used

    # Step 8: Build metadata
    metadata = {
        "total_token_count": total_tokens,
        "total_type_count": total_types,
        "simple_ttr": total_types / total_tokens if total_tokens > 0 else 0.0,
        "tokens_used": tokens_used,
        "tokens_discarded": tokens_discarded,
        "first_segment_ttr": segment_ttrs[0],
        "last_segment_ttr": segment_ttrs[-1],
    }

    # Step 9: Create distributions (single-pass analysis)
    msttr_score_dist = make_distribution([msttr_score])
    ttr_std_dev_dist = make_distribution([ttr_std_dev])
    min_ttr_dist = make_distribution([min_ttr])
    max_ttr_dist = make_distribution([max_ttr])

    # Step 10: Return result
    return MSTTRResult(
        msttr_score=msttr_score,
        segment_size=segment_size,
        segment_count=segment_count,
        ttr_std_dev=ttr_std_dev,
        min_ttr=min_ttr,
        max_ttr=max_ttr,
        segment_ttrs=segment_ttrs,
        msttr_score_dist=msttr_score_dist,
        ttr_std_dev_dist=ttr_std_dev_dist,
        min_ttr_dist=min_ttr_dist,
        max_ttr_dist=max_ttr_dist,
        chunk_size=chunk_size,
        chunk_count=1,  # Single pass analysis
        metadata=metadata,
    )
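Putting the four new metrics together, a minimal usage sketch, assuming the 1.1.0 wheel is installed as pystylometry and that the functions are imported from the module path shown in the file listing above; "sample.txt" is a placeholder for any document of a few hundred words, and the parameter values are simply the documented defaults (plus an arbitrary random_seed):

# Usage sketch for the new lexical diversity metrics (assumptions noted above).
from pystylometry.lexical.advanced_diversity import (
    compute_hdd,
    compute_mattr,
    compute_msttr,
    compute_vocd_d,
)

with open("sample.txt", encoding="utf-8") as fh:        # placeholder input file
    text = fh.read()

print(compute_vocd_d(text, sample_size=35, num_samples=100, random_seed=42).d_parameter)
print(compute_mattr(text, window_size=50).mattr_score)
print(compute_hdd(text, sample_size=42).hdd_score)
print(compute_msttr(text, segment_size=100).msttr_score)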