pystylometry: 0.1.0-py3-none-any.whl → 1.1.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that public registry.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/lexical/hapax.py
CHANGED
@@ -1,15 +1,87 @@
-"""Hapax legomena and related vocabulary richness metrics.
+"""Hapax legomena and related vocabulary richness metrics.

+This module implements hapax metrics with native chunked analysis for
+stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
 from collections import Counter

-from .._types import
-
+from .._types import (
+    Distribution,
+    HapaxLexiconResult,
+    HapaxResult,
+    LexiconCategories,
+    chunk_text,
+    make_distribution,
+)
+from .._utils import check_optional_dependency, tokenize
+

+def _compute_hapax_single(text: str) -> tuple[int, float, int, float, float, float, dict]:
+    """Compute hapax metrics for a single chunk of text.

-
+    Returns:
+        Tuple of (hapax_count, hapax_ratio, dis_hapax_count, dis_hapax_ratio,
+        sichel_s, honore_r, metadata_dict).
+        Returns nans for ratios on empty input.
+    """
+    tokens = tokenize(text.lower())
+    N = len(tokens)  # noqa: N806
+
+    if N == 0:
+        return (
+            0,
+            float("nan"),
+            0,
+            float("nan"),
+            float("nan"),
+            float("nan"),
+            {"token_count": 0, "vocabulary_size": 0},
+        )
+
+    # Count frequency of each token
+    freq_counter = Counter(tokens)
+    V = len(freq_counter)  # noqa: N806
+
+    # Count hapax legomena (V₁) and dislegomena (V₂)
+    V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
+    V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
+
+    # Sichel's S: ratio of dislegomena to vocabulary size
+    sichel_s = V2 / V if V > 0 else 0.0
+
+    # Honoré's R: 100 × log(N) / (1 - V₁/V)
+    if V1 == V:
+        honore_r = float("inf")
+    else:
+        honore_r = 100 * math.log(N) / (1 - V1 / V)
+
+    hapax_ratio = V1 / N if N > 0 else 0.0
+    dis_hapax_ratio = V2 / N if N > 0 else 0.0
+
+    return (
+        V1,
+        hapax_ratio,
+        V2,
+        dis_hapax_ratio,
+        sichel_s,
+        honore_r,
+        {"token_count": N, "vocabulary_size": V},
+    )
+
+
+def compute_hapax_ratios(text: str, chunk_size: int = 1000) -> HapaxResult:
     """
     Compute hapax legomena, hapax dislegomena, and related richness metrics.

+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Hapax legomena = words appearing exactly once
     Hapax dislegomena = words appearing exactly twice

@@ -17,6 +89,10 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
     - Sichel's S: V₂ / V (ratio of dislegomena to total vocabulary)
     - Honoré's R: 100 × log(N) / (1 - V₁/V)

+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Sichel, H. S. (1975). On a distribution law for word frequencies.
         Journal of the American Statistical Association, 70(351a), 542-547.
@@ -26,50 +102,251 @@ def compute_hapax_ratios(text: str) -> HapaxResult:

     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)

     Returns:
-        HapaxResult with counts, ratios,
+        HapaxResult with counts, ratios, distributions, and metadata

     Example:
-        >>> result = compute_hapax_ratios("
-        >>>
-
+        >>> result = compute_hapax_ratios("Long text here...", chunk_size=1000)
+        >>> result.hapax_ratio  # Mean across chunks
+        0.45
+        >>> result.hapax_ratio_dist.std  # Variance reveals fingerprint
+        0.08
     """
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)

-
+    # Compute metrics per chunk
+    hapax_ratio_values = []
+    dis_hapax_ratio_values = []
+    sichel_s_values = []
+    honore_r_values = []
+    honore_r_inf_count = 0  # Track chunks where all words are unique (V₁ = V)
+    total_hapax_count = 0
+    total_dis_hapax_count = 0
+    total_tokens = 0
+    total_vocab = 0
+    valid_chunk_count = 0
+
+    for chunk in chunks:
+        h_cnt, h_rat, dh_cnt, dh_rat, sichel, honore, meta = _compute_hapax_single(chunk)
+        total_hapax_count += h_cnt
+        total_dis_hapax_count += dh_cnt
+        total_tokens += meta.get("token_count", 0)
+        total_vocab += meta.get("vocabulary_size", 0)
+
+        if not math.isnan(h_rat):
+            hapax_ratio_values.append(h_rat)
+            valid_chunk_count += 1
+        if not math.isnan(dh_rat):
+            dis_hapax_ratio_values.append(dh_rat)
+        if not math.isnan(sichel):
+            sichel_s_values.append(sichel)
+        if math.isinf(honore):
+            # Track infinite values (when V₁ = V, maximal vocabulary richness)
+            honore_r_inf_count += 1
+        elif not math.isnan(honore):
+            honore_r_values.append(honore)
+
+    # Handle empty or all-invalid chunks
+    if not hapax_ratio_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return HapaxResult(
             hapax_count=0,
-            hapax_ratio=
+            hapax_ratio=float("nan"),
             dis_hapax_count=0,
-            dis_hapax_ratio=
-            sichel_s=
-            honore_r=
-
+            dis_hapax_ratio=float("nan"),
+            sichel_s=float("nan"),
+            honore_r=float("nan"),
+            hapax_ratio_dist=empty_dist,
+            dis_hapax_ratio_dist=empty_dist,
+            sichel_s_dist=empty_dist,
+            honore_r_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={"total_token_count": 0, "total_vocabulary_size": 0},
         )

-    #
+    # Build distributions
+    hapax_ratio_dist = make_distribution(hapax_ratio_values)
+    dis_hapax_ratio_dist = make_distribution(dis_hapax_ratio_values)
+    sichel_s_dist = (
+        make_distribution(sichel_s_values)
+        if sichel_s_values
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+
+    # Handle honore_r specially: if all valid chunks had V₁ = V (all unique words),
+    # return infinity to indicate maximal vocabulary richness
+    if honore_r_values:
+        honore_r_dist = make_distribution(honore_r_values)
+        honore_r_final = honore_r_dist.mean
+    elif honore_r_inf_count > 0 and honore_r_inf_count == valid_chunk_count:
+        # All valid chunks had infinite honore_r (all words unique)
+        honore_r_dist = Distribution(
+            values=[], mean=float("inf"), median=float("inf"), std=0.0, range=0.0, iqr=0.0
+        )
+        honore_r_final = float("inf")
+    else:
+        honore_r_dist = Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+        honore_r_final = float("nan")
+
+    return HapaxResult(
+        hapax_count=total_hapax_count,
+        hapax_ratio=hapax_ratio_dist.mean,
+        dis_hapax_count=total_dis_hapax_count,
+        dis_hapax_ratio=dis_hapax_ratio_dist.mean,
+        sichel_s=sichel_s_dist.mean,
+        honore_r=honore_r_final,
+        hapax_ratio_dist=hapax_ratio_dist,
+        dis_hapax_ratio_dist=dis_hapax_ratio_dist,
+        sichel_s_dist=sichel_s_dist,
+        honore_r_dist=honore_r_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "total_token_count": total_tokens,
+            "total_vocabulary_size": total_vocab,
+        },
+    )
+
+
+def compute_hapax_with_lexicon_analysis(text: str) -> HapaxLexiconResult:
+    """
+    Compute hapax legomena with lexicon-based categorization.
+
+    Extends standard hapax analysis by categorizing hapax legomena based on
+    presence in WordNet and British National Corpus (BNC). This distinguishes
+    between:
+
+    1. **Neologisms**: Words not in WordNet AND not in BNC
+       - True novel words or proper nouns
+       - High neologism ratio indicates vocabulary innovation
+
+    2. **Rare Words**: Words in BNC but not WordNet, or vice versa
+       - Technical jargon, specialized terminology
+       - Words at the edges of common vocabulary
+
+    3. **Common Words**: Words in both WordNet AND BNC
+       - Standard vocabulary that happens to appear once
+       - Low incidental usage of common words
+
+    This categorization is valuable for stylometric analysis:
+    - Authors with high neologism ratios are more innovative/creative
+    - Technical writing typically has higher rare word ratios
+    - Comparison of neologism vs common hapax distinguishes vocabulary
+      innovation from incidental word usage
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        HapaxLexiconResult with standard hapax metrics and lexicon categorization
+
+    Raises:
+        ImportError: If bnc-lookup or wordnet-lookup packages are not installed
+
+    Example:
+        >>> text = "The xyzbot platform facilitates interdepartmental synergy."
+        >>> result = compute_hapax_with_lexicon_analysis(text)
+        >>> result.lexicon_analysis.neologisms
+        ['xyzbot', 'platform']
+        >>> result.lexicon_analysis.rare_words
+        ['facilitates', 'interdepartmental']
+        >>> result.lexicon_analysis.common_words
+        ['synergy']
+        >>> print(f"Neologism ratio: {result.lexicon_analysis.neologism_ratio:.2%}")
+        Neologism ratio: 40.00%
+
+    References:
+        British National Corpus: http://www.natcorp.ox.ac.uk/
+        WordNet: https://wordnet.princeton.edu/
+    """
+    # Check dependencies
+    check_optional_dependency("bnc_lookup", "lexical")
+    check_optional_dependency("wordnet_lookup", "lexical")
+
+    from bnc_lookup import exists as is_bnc_term  # type: ignore[import-untyped]
+    from wordnet_lookup import is_wordnet_term  # type: ignore[import-untyped]
+
+    # First compute standard hapax metrics
+    hapax_result = compute_hapax_ratios(text)
+
+    # If no hapax legomena, return empty categorization
+    if hapax_result.hapax_count == 0:
+        return HapaxLexiconResult(
+            hapax_result=hapax_result,
+            lexicon_analysis=LexiconCategories(
+                neologisms=[],
+                rare_words=[],
+                common_words=[],
+                neologism_ratio=0.0,
+                rare_word_ratio=0.0,
+                metadata={"total_hapax": 0},
+            ),
+            metadata={"note": "No hapax legomena found"},
+        )
+
+    # Get tokens and identify hapax words
+    tokens = tokenize(text.lower())
     freq_counter = Counter(tokens)
-
+    hapax_words = [word for word, count in freq_counter.items() if count == 1]

-    #
-
-
+    # Categorize each hapax word by lexicon presence
+    neologisms = []
+    rare_words = []
+    common_words = []

-
-
-
+    for word in hapax_words:
+        in_bnc = is_bnc_term(word)
+        in_wordnet = is_wordnet_term(word)

-
-
-
-
-
-
-
+        if not in_bnc and not in_wordnet:
+            # Not in either lexicon → true neologism
+            neologisms.append(word)
+        elif in_bnc and in_wordnet:
+            # In both lexicons → common word
+            common_words.append(word)
+        else:
+            # In one but not the other → rare word
+            rare_words.append(word)
+
+    # Calculate ratios
+    total_hapax = len(hapax_words)
+    neologism_ratio = len(neologisms) / total_hapax if total_hapax > 0 else 0.0
+    rare_word_ratio = len(rare_words) / total_hapax if total_hapax > 0 else 0.0
+    common_word_ratio = len(common_words) / total_hapax if total_hapax > 0 else 0.0
+
+    return HapaxLexiconResult(
+        hapax_result=hapax_result,
+        lexicon_analysis=LexiconCategories(
+            neologisms=sorted(neologisms),
+            rare_words=sorted(rare_words),
+            common_words=sorted(common_words),
+            neologism_ratio=neologism_ratio,
+            rare_word_ratio=rare_word_ratio,
+            metadata={
+                "total_hapax": total_hapax,
+                "neologism_count": len(neologisms),
+                "rare_word_count": len(rare_words),
+                "common_word_count": len(common_words),
+                "common_word_ratio": common_word_ratio,
+            },
+        ),
         metadata={
-            "
-            "
+            "lexicons_used": ["bnc", "wordnet"],
+            "note": "Lexicon categorization based on BNC and WordNet presence",
         },
     )
pystylometry/lexical/mtld.py
CHANGED
@@ -1,23 +1,130 @@
-"""MTLD (Measure of Textual Lexical Diversity) implementation.
+"""MTLD (Measure of Textual Lexical Diversity) implementation.

-
+This module implements MTLD with native chunked analysis for stylometric
+fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._types import Distribution, MTLDResult, chunk_text, make_distribution
 from .._utils import tokenize


+def _calculate_mtld_direction(tokens: list[str], threshold: float, forward: bool) -> float:
+    """
+    Calculate MTLD in one direction (forward or backward).
+
+    Args:
+        tokens: List of tokens to analyze
+        threshold: TTR threshold to maintain (must be in range (0, 1))
+        forward: If True, process forward; if False, process backward
+
+    Returns:
+        MTLD score for this direction
+    """
+    if len(tokens) == 0:
+        return 0.0
+
+    # Process tokens in the specified direction
+    token_list = tokens if forward else tokens[::-1]
+
+    factors = 0.0
+    current_count = 0
+    current_types = set()
+
+    for token in token_list:
+        current_count += 1
+        current_types.add(token)
+
+        # Calculate current TTR
+        ttr = len(current_types) / current_count
+
+        # If TTR drops below threshold, we've completed a factor
+        if ttr < threshold:
+            factors += 1.0
+            current_count = 0
+            current_types = set()
+
+    # Handle remaining partial factor
+    # Add proportion of a complete factor based on how close we are to threshold
+    if current_count > 0:
+        ttr = len(current_types) / current_count
+        # If we're still above threshold, add partial factor credit
+        # Formula: (1 - current_ttr) / (1 - threshold)
+        # This represents how far we've progressed toward completing a factor
+        # In theory, ttr should always be >= threshold here because drops below
+        # threshold are handled in the loop above (which resets current_count).
+        # Adding defensive check to prevent mathematical errors.
+        if ttr >= threshold:
+            factors += (1.0 - ttr) / (1.0 - threshold)
+
+    # MTLD is the mean length of factors
+    # Total tokens / number of factors
+    if factors > 0:
+        return len(tokens) / factors
+    else:
+        # If no factors were completed, return the text length
+        # This happens when TTR stays above threshold for the entire text
+        return float(len(tokens))
+
+
+def _compute_mtld_single(text: str, threshold: float) -> tuple[float, float, float, dict]:
+    """Compute MTLD for a single chunk of text.
+
+    Returns:
+        Tuple of (mtld_forward, mtld_backward, mtld_average, metadata_dict).
+        Returns (nan, nan, nan, metadata) for empty input.
+    """
+    tokens = tokenize(text.lower())
+
+    if len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            float("nan"),
+            {"token_count": 0},
+        )
+
+    mtld_forward = _calculate_mtld_direction(tokens, threshold, forward=True)
+    mtld_backward = _calculate_mtld_direction(tokens, threshold, forward=False)
+    mtld_average = (mtld_forward + mtld_backward) / 2
+
+    return (
+        mtld_forward,
+        mtld_backward,
+        mtld_average,
+        {"token_count": len(tokens)},
+    )
+
+
 def compute_mtld(
     text: str,
     threshold: float = 0.72,
+    chunk_size: int = 1000,
 ) -> MTLDResult:
     """
     Compute MTLD (Measure of Textual Lexical Diversity).

+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     MTLD measures the mean length of sequential word strings that maintain
     a minimum threshold TTR. It's more robust than simple TTR for texts of
     varying lengths.

     Formula:
-        MTLD =
-        where
+        MTLD = total_tokens / factor_count
+        where factor_count includes:
+        - Completed factors (segments where TTR dropped below threshold)
+        - Partial factor for any remaining incomplete segment (weighted by proximity to threshold)
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27

     References:
         McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D:
@@ -26,36 +133,87 @@ def compute_mtld(

     Args:
         text: Input text to analyze
-        threshold: TTR threshold to maintain (default: 0.72)
+        threshold: TTR threshold to maintain (default: 0.72, must be in range (0, 1))
+        chunk_size: Number of words per chunk (default: 1000)

     Returns:
-        MTLDResult with forward, backward,
+        MTLDResult with forward, backward, average MTLD scores and distributions
+
+    Raises:
+        ValueError: If threshold is not in range (0, 1)

     Example:
-        >>> result = compute_mtld("
-        >>>
+        >>> result = compute_mtld("Long text here...", chunk_size=1000)
+        >>> result.mtld_average  # Mean across chunks
+        72.5
+        >>> result.mtld_average_dist.std  # Variance reveals fingerprint
+        8.3
     """
-
+    # Validate threshold parameter
+    if not (0 < threshold < 1):
+        raise ValueError(
+            f"Threshold must be in range (0, 1), got {threshold}. "
+            "Common values: 0.72 (default), 0.5-0.8"
+        )

-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    forward_values = []
+    backward_values = []
+    average_values = []
+    total_tokens = 0
+
+    for chunk in chunks:
+        fwd, bwd, avg, meta = _compute_mtld_single(chunk, threshold)
+        if not math.isnan(fwd):
+            forward_values.append(fwd)
+            backward_values.append(bwd)
+            average_values.append(avg)
+        total_tokens += meta.get("token_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not forward_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return MTLDResult(
-            mtld_forward=
-            mtld_backward=
-            mtld_average=
-
+            mtld_forward=float("nan"),
+            mtld_backward=float("nan"),
+            mtld_average=float("nan"),
+            mtld_forward_dist=empty_dist,
+            mtld_backward_dist=empty_dist,
+            mtld_average_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                "total_token_count": 0,
+                "threshold": threshold,
+            },
         )

-    #
-
-
-
+    # Build distributions
+    forward_dist = make_distribution(forward_values)
+    backward_dist = make_distribution(backward_values)
+    average_dist = make_distribution(average_values)

     return MTLDResult(
-        mtld_forward=
-        mtld_backward=
-        mtld_average=
+        mtld_forward=forward_dist.mean,
+        mtld_backward=backward_dist.mean,
+        mtld_average=average_dist.mean,
+        mtld_forward_dist=forward_dist,
+        mtld_backward_dist=backward_dist,
+        mtld_average_dist=average_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "
+            "total_token_count": total_tokens,
             "threshold": threshold,
         },
     )