pystylometry 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +29 -3
- pystylometry/_types.py +963 -259
- pystylometry/authorship/__init__.py +23 -2
- pystylometry/authorship/additional_methods.py +4 -29
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/lexical/hapax.py
CHANGED
@@ -1,16 +1,87 @@
-"""Hapax legomena and related vocabulary richness metrics.
+"""Hapax legomena and related vocabulary richness metrics.
+
+This module implements hapax metrics with native chunked analysis for
+stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
 """
 
 import math
 from collections import Counter
 
-from .._types import
+from .._types import (
+    Distribution,
+    HapaxLexiconResult,
+    HapaxResult,
+    LexiconCategories,
+    chunk_text,
+    make_distribution,
+)
 from .._utils import check_optional_dependency, tokenize
 
 
-def
+def _compute_hapax_single(text: str) -> tuple[int, float, int, float, float, float, dict]:
+    """Compute hapax metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (hapax_count, hapax_ratio, dis_hapax_count, dis_hapax_ratio,
+        sichel_s, honore_r, metadata_dict).
+        Returns nans for ratios on empty input.
+    """
+    tokens = tokenize(text.lower())
+    N = len(tokens)  # noqa: N806
+
+    if N == 0:
+        return (
+            0,
+            float("nan"),
+            0,
+            float("nan"),
+            float("nan"),
+            float("nan"),
+            {"token_count": 0, "vocabulary_size": 0},
+        )
+
+    # Count frequency of each token
+    freq_counter = Counter(tokens)
+    V = len(freq_counter)  # noqa: N806
+
+    # Count hapax legomena (V₁) and dislegomena (V₂)
+    V1 = sum(1 for count in freq_counter.values() if count == 1)  # noqa: N806
+    V2 = sum(1 for count in freq_counter.values() if count == 2)  # noqa: N806
+
+    # Sichel's S: ratio of dislegomena to vocabulary size
+    sichel_s = V2 / V if V > 0 else 0.0
+
+    # Honoré's R: 100 × log(N) / (1 - V₁/V)
+    if V1 == V:
+        honore_r = float("inf")
+    else:
+        honore_r = 100 * math.log(N) / (1 - V1 / V)
+
+    hapax_ratio = V1 / N if N > 0 else 0.0
+    dis_hapax_ratio = V2 / N if N > 0 else 0.0
+
+    return (
+        V1,
+        hapax_ratio,
+        V2,
+        dis_hapax_ratio,
+        sichel_s,
+        honore_r,
+        {"token_count": N, "vocabulary_size": V},
+    )
+
+
+def compute_hapax_ratios(text: str, chunk_size: int = 1000) -> HapaxResult:
     """
     Compute hapax legomena, hapax dislegomena, and related richness metrics.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Hapax legomena = words appearing exactly once
     Hapax dislegomena = words appearing exactly twice
 
@@ -18,6 +89,10 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
     - Sichel's S: V₂ / V (ratio of dislegomena to total vocabulary)
     - Honoré's R: 100 × log(N) / (1 - V₁/V)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Sichel, H. S. (1975). On a distribution law for word frequencies.
         Journal of the American Statistical Association, 70(351a), 542-547.
@@ -27,68 +102,123 @@ def compute_hapax_ratios(text: str) -> HapaxResult:
 
     Args:
        text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        HapaxResult with counts, ratios,
-
-    Note: When all words are unique (V₁ = V), Honoré's R returns float('inf')
-    to indicate maximal vocabulary richness (division by zero case).
+        HapaxResult with counts, ratios, distributions, and metadata
 
     Example:
-        >>>
-        >>> result
-
-
-
-        1
-        >>> print(f"Sichel's S: {result.sichel_s:.3f}")
-        Sichel's S: 0.125
+        >>> result = compute_hapax_ratios("Long text here...", chunk_size=1000)
+        >>> result.hapax_ratio  # Mean across chunks
+        0.45
+        >>> result.hapax_ratio_dist.std  # Variance reveals fingerprint
+        0.08
     """
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    hapax_ratio_values = []
+    dis_hapax_ratio_values = []
+    sichel_s_values = []
+    honore_r_values = []
+    honore_r_inf_count = 0  # Track chunks where all words are unique (V₁ = V)
+    total_hapax_count = 0
+    total_dis_hapax_count = 0
+    total_tokens = 0
+    total_vocab = 0
+    valid_chunk_count = 0
+
+    for chunk in chunks:
+        h_cnt, h_rat, dh_cnt, dh_rat, sichel, honore, meta = _compute_hapax_single(chunk)
+        total_hapax_count += h_cnt
+        total_dis_hapax_count += dh_cnt
+        total_tokens += meta.get("token_count", 0)
+        total_vocab += meta.get("vocabulary_size", 0)
+
+        if not math.isnan(h_rat):
+            hapax_ratio_values.append(h_rat)
+            valid_chunk_count += 1
+        if not math.isnan(dh_rat):
+            dis_hapax_ratio_values.append(dh_rat)
+        if not math.isnan(sichel):
+            sichel_s_values.append(sichel)
+        if math.isinf(honore):
+            # Track infinite values (when V₁ = V, maximal vocabulary richness)
+            honore_r_inf_count += 1
+        elif not math.isnan(honore):
+            honore_r_values.append(honore)
+
+    # Handle empty or all-invalid chunks
+    if not hapax_ratio_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
        return HapaxResult(
            hapax_count=0,
-            hapax_ratio=
+            hapax_ratio=float("nan"),
            dis_hapax_count=0,
-            dis_hapax_ratio=
-            sichel_s=
-            honore_r=
-
+            dis_hapax_ratio=float("nan"),
+            sichel_s=float("nan"),
+            honore_r=float("nan"),
+            hapax_ratio_dist=empty_dist,
+            dis_hapax_ratio_dist=empty_dist,
+            sichel_s_dist=empty_dist,
+            honore_r_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={"total_token_count": 0, "total_vocabulary_size": 0},
        )
 
-    #
-
-
-
-
-
-
-
-
-
-    sichel_s = V2 / V if V > 0 else 0.0
+    # Build distributions
+    hapax_ratio_dist = make_distribution(hapax_ratio_values)
+    dis_hapax_ratio_dist = make_distribution(dis_hapax_ratio_values)
+    sichel_s_dist = (
+        make_distribution(sichel_s_values)
+        if sichel_s_values
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
 
-    #
-    #
-
-
-
-
+    # Handle honore_r specially: if all valid chunks had V₁ = V (all unique words),
+    # return infinity to indicate maximal vocabulary richness
+    if honore_r_values:
+        honore_r_dist = make_distribution(honore_r_values)
+        honore_r_final = honore_r_dist.mean
+    elif honore_r_inf_count > 0 and honore_r_inf_count == valid_chunk_count:
+        # All valid chunks had infinite honore_r (all words unique)
+        honore_r_dist = Distribution(
+            values=[], mean=float("inf"), median=float("inf"), std=0.0, range=0.0, iqr=0.0
+        )
+        honore_r_final = float("inf")
    else:
-
+        honore_r_dist = Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+        honore_r_final = float("nan")
 
    return HapaxResult(
-        hapax_count=
-        hapax_ratio=
-        dis_hapax_count=
-        dis_hapax_ratio=
-        sichel_s=
-        honore_r=
+        hapax_count=total_hapax_count,
+        hapax_ratio=hapax_ratio_dist.mean,
+        dis_hapax_count=total_dis_hapax_count,
+        dis_hapax_ratio=dis_hapax_ratio_dist.mean,
+        sichel_s=sichel_s_dist.mean,
+        honore_r=honore_r_final,
+        hapax_ratio_dist=hapax_ratio_dist,
+        dis_hapax_ratio_dist=dis_hapax_ratio_dist,
+        sichel_s_dist=sichel_s_dist,
+        honore_r_dist=honore_r_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
        metadata={
-            "
-            "
+            "total_token_count": total_tokens,
+            "total_vocabulary_size": total_vocab,
        },
    )
 
@@ -148,8 +278,8 @@ def compute_hapax_with_lexicon_analysis(text: str) -> HapaxLexiconResult:
    check_optional_dependency("bnc_lookup", "lexical")
    check_optional_dependency("wordnet_lookup", "lexical")
 
-    from bnc_lookup import is_bnc_term  # type: ignore[import-
-    from wordnet_lookup import is_wordnet_term  # type: ignore[import-
+    from bnc_lookup import exists as is_bnc_term  # type: ignore[import-untyped]
+    from wordnet_lookup import is_wordnet_term  # type: ignore[import-untyped]
 
    # First compute standard hapax metrics
    hapax_result = compute_hapax_ratios(text)
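
The per-chunk arithmetic introduced in _compute_hapax_single above can be replayed with the standard library alone. The sketch below is illustrative only: it substitutes a plain str.split() for the package's tokenize() helper, and the sample sentence and printed values are not taken from the package or its tests.

import math
from collections import Counter

# Toy chunk; the package would tokenize a full 1000-word chunk instead.
tokens = "the cat sat on the mat and the dog sat".lower().split()
N = len(tokens)                                # 10 tokens
freq = Counter(tokens)
V = len(freq)                                  # 7 distinct words
V1 = sum(1 for c in freq.values() if c == 1)   # 5 words occur exactly once (hapax legomena)
V2 = sum(1 for c in freq.values() if c == 2)   # 1 word ("sat") occurs exactly twice (dislegomena)

hapax_ratio = V1 / N                           # 0.5
sichel_s = V2 / V                              # 1/7 ~= 0.143
honore_r = float("inf") if V1 == V else 100 * math.log(N) / (1 - V1 / V)
print(hapax_ratio, sichel_s, honore_r)         # Honoré's R = 100*ln(10)/(1 - 5/7) ~= 805.9

compute_hapax_ratios() repeats this for every chunk and summarizes the per-chunk values via make_distribution, so the reported hapax_ratio is a mean across chunks and hapax_ratio_dist carries the spread.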
pystylometry/lexical/mtld.py
CHANGED
@@ -1,6 +1,16 @@
-"""MTLD (Measure of Textual Lexical Diversity) implementation.
+"""MTLD (Measure of Textual Lexical Diversity) implementation.
 
-
+This module implements MTLD with native chunked analysis for stylometric
+fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
+
+from .._types import Distribution, MTLDResult, chunk_text, make_distribution
 from .._utils import tokenize
 
 
@@ -62,13 +72,46 @@ def _calculate_mtld_direction(tokens: list[str], threshold: float, forward: bool
        return float(len(tokens))
 
 
+def _compute_mtld_single(text: str, threshold: float) -> tuple[float, float, float, dict]:
+    """Compute MTLD for a single chunk of text.
+
+    Returns:
+        Tuple of (mtld_forward, mtld_backward, mtld_average, metadata_dict).
+        Returns (nan, nan, nan, metadata) for empty input.
+    """
+    tokens = tokenize(text.lower())
+
+    if len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            float("nan"),
+            {"token_count": 0},
+        )
+
+    mtld_forward = _calculate_mtld_direction(tokens, threshold, forward=True)
+    mtld_backward = _calculate_mtld_direction(tokens, threshold, forward=False)
+    mtld_average = (mtld_forward + mtld_backward) / 2
+
+    return (
+        mtld_forward,
+        mtld_backward,
+        mtld_average,
+        {"token_count": len(tokens)},
+    )
+
+
 def compute_mtld(
    text: str,
    threshold: float = 0.72,
+    chunk_size: int = 1000,
 ) -> MTLDResult:
    """
    Compute MTLD (Measure of Textual Lexical Diversity).
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
    MTLD measures the mean length of sequential word strings that maintain
    a minimum threshold TTR. It's more robust than simple TTR for texts of
    varying lengths.
@@ -79,6 +122,10 @@ def compute_mtld(
    - Completed factors (segments where TTR dropped below threshold)
    - Partial factor for any remaining incomplete segment (weighted by proximity to threshold)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
    References:
        McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D:
        A validation study of sophisticated approaches to lexical diversity assessment.
@@ -87,16 +134,20 @@ def compute_mtld(
    Args:
        text: Input text to analyze
        threshold: TTR threshold to maintain (default: 0.72, must be in range (0, 1))
+        chunk_size: Number of words per chunk (default: 1000)
 
    Returns:
-        MTLDResult with forward, backward,
+        MTLDResult with forward, backward, average MTLD scores and distributions
 
    Raises:
        ValueError: If threshold is not in range (0, 1)
 
    Example:
-        >>> result = compute_mtld("
-        >>>
+        >>> result = compute_mtld("Long text here...", chunk_size=1000)
+        >>> result.mtld_average  # Mean across chunks
+        72.5
+        >>> result.mtld_average_dist.std  # Variance reveals fingerprint
+        8.3
    """
    # Validate threshold parameter
    if not (0 < threshold < 1):
@@ -105,33 +156,64 @@ def compute_mtld(
            "Common values: 0.72 (default), 0.5-0.8"
        )
 
-    #
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    forward_values = []
+    backward_values = []
+    average_values = []
+    total_tokens = 0
+
+    for chunk in chunks:
+        fwd, bwd, avg, meta = _compute_mtld_single(chunk, threshold)
+        if not math.isnan(fwd):
+            forward_values.append(fwd)
+            backward_values.append(bwd)
+            average_values.append(avg)
+        total_tokens += meta.get("token_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not forward_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
        return MTLDResult(
-            mtld_forward=
-            mtld_backward=
-            mtld_average=
-
+            mtld_forward=float("nan"),
+            mtld_backward=float("nan"),
+            mtld_average=float("nan"),
+            mtld_forward_dist=empty_dist,
+            mtld_backward_dist=empty_dist,
+            mtld_average_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                "total_token_count": 0,
+                "threshold": threshold,
+            },
        )
 
-    #
-
-
-
-    mtld_backward = _calculate_mtld_direction(tokens, threshold, forward=False)
-
-    # Average of forward and backward
-    mtld_average = (mtld_forward + mtld_backward) / 2
+    # Build distributions
+    forward_dist = make_distribution(forward_values)
+    backward_dist = make_distribution(backward_values)
+    average_dist = make_distribution(average_values)
 
    return MTLDResult(
-        mtld_forward=
-        mtld_backward=
-        mtld_average=
+        mtld_forward=forward_dist.mean,
+        mtld_backward=backward_dist.mean,
+        mtld_average=average_dist.mean,
+        mtld_forward_dist=forward_dist,
+        mtld_backward_dist=backward_dist,
+        mtld_average_dist=average_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
        metadata={
-            "
+            "total_token_count": total_tokens,
            "threshold": threshold,
        },
    )
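
The docstring above describes MTLD as completed factors plus a partial factor; the sketch below spells out that forward pass in the standard formulation from McCarthy & Jarvis (2010). It is an illustration only: _calculate_mtld_direction itself is not shown in this diff, so the package's exact tie-breaking and tokenization may differ.

def mtld_forward(tokens: list[str], threshold: float = 0.72) -> float:
    factors = 0.0
    types: set[str] = set()
    count = 0
    for tok in tokens:
        count += 1
        types.add(tok)
        if len(types) / count <= threshold:    # running TTR fell to the threshold
            factors += 1.0                     # one completed factor
            types.clear()
            count = 0
    if count > 0:                              # leftover segment contributes a partial factor,
        ttr = len(types) / count               # weighted by proximity to the threshold
        factors += (1 - ttr) / (1 - threshold)
    return len(tokens) / factors if factors > 0 else float(len(tokens))

print(mtld_forward("to be or not to be that is the question".split()))  # 10.0

The backward score is conventionally the same procedure over the reversed token list, and the diff averages the two; compute_mtld then runs this per chunk (1000 words by default) and exposes the spread through mtld_forward_dist, mtld_backward_dist, and mtld_average_dist.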
pystylometry/lexical/ttr.py
CHANGED
@@ -2,12 +2,18 @@
 
 This module provides a facade wrapper around the stylometry-ttr package,
 maintaining consistent API patterns with other pystylometry metrics.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
 """
 
-from
+from __future__ import annotations
+
+from .._types import Distribution, TTRResult, make_distribution
 
 
-def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
+def compute_ttr(text: str, text_id: str | None = None, chunk_size: int = 1000) -> TTRResult:
    """
    Compute Type-Token Ratio (TTR) metrics for vocabulary richness.
 
@@ -22,6 +28,10 @@ def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
    - STTR: Standardized TTR across fixed-size chunks (reduces length bias)
    - Delta Std: Standard deviation of TTR across chunks (vocabulary consistency)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
    References:
        Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
        Herdan, G. (1960). Type-token Mathematics: A Textbook of Mathematical
@@ -32,9 +42,14 @@ def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
    Args:
        text: Input text to analyze
        text_id: Optional identifier for the text (for tracking purposes)
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: The stylometry-ttr package handles its own internal chunking,
+            so this parameter is included for API consistency but actual chunking
+            behavior is delegated to stylometry-ttr.
 
    Returns:
-        TTRResult with all TTR variants and metadata
+        TTRResult with all TTR variants and metadata, including Distribution
+        objects for stylometric fingerprinting.
 
    Example:
        >>> result = compute_ttr("The quick brown fox jumps over the lazy dog.")
@@ -63,17 +78,68 @@ def compute_ttr(text: str, text_id: str | None = None) -> TTRResult:
    # Note: stylometry-ttr requires text_id to be a string, not None
    ttr_result = _compute_ttr(text, text_id=text_id or "")
 
+    # Extract values, handling None for short texts
+    ttr_val = ttr_result.ttr
+    root_ttr_val = ttr_result.root_ttr
+    log_ttr_val = ttr_result.log_ttr
+    sttr_val = ttr_result.sttr if ttr_result.sttr is not None else 0.0
+    delta_std_val = ttr_result.delta_std if ttr_result.delta_std is not None else 0.0
+
+    # Create single-value distributions from stylometry-ttr results
+    # The stylometry-ttr package handles its own internal chunking for STTR
+    # so we wrap the aggregate results in Distribution objects
+    ttr_dist = (
+        make_distribution([ttr_val])
+        if ttr_val is not None
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    root_ttr_dist = (
+        make_distribution([root_ttr_val])
+        if root_ttr_val is not None
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    log_ttr_dist = (
+        make_distribution([log_ttr_val])
+        if log_ttr_val is not None
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    sttr_dist = (
+        make_distribution([sttr_val])
+        if ttr_result.sttr is not None
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    delta_std_dist = (
+        make_distribution([delta_std_val])
+        if ttr_result.delta_std is not None
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+
    # Convert to our TTRResult dataclass
-    # The stylometry-ttr result has attributes we can access
-    # Some fields (sttr, delta_std) may be None for short texts
    return TTRResult(
        total_words=ttr_result.total_words,
        unique_words=ttr_result.unique_words,
-        ttr=
-        root_ttr=
-        log_ttr=
-        sttr=
-        delta_std=
+        ttr=ttr_val if ttr_val is not None else float("nan"),
+        root_ttr=root_ttr_val if root_ttr_val is not None else float("nan"),
+        log_ttr=log_ttr_val if log_ttr_val is not None else float("nan"),
+        sttr=sttr_val,
+        delta_std=delta_std_val,
+        ttr_dist=ttr_dist,
+        root_ttr_dist=root_ttr_dist,
+        log_ttr_dist=log_ttr_dist,
+        sttr_dist=sttr_dist,
+        delta_std_dist=delta_std_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # stylometry-ttr returns aggregate results
        metadata={
            "text_id": text_id or "",
            "source": "stylometry-ttr",