pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/_types.py
CHANGED
@@ -1,42 +1,281 @@
-"""Result dataclasses for all pystylometry metrics.
+"""Result dataclasses for all pystylometry metrics.
+
+This module defines dataclasses for all metric results in pystylometry.
+
+Native Chunked Analysis (Issue #27):
+All metrics support chunked analysis by default. Results include:
+- Convenient access to the mean value (e.g., result.reading_ease)
+- Full distribution with per-chunk values and statistics (e.g., result.reading_ease_dist)
+
+The Distribution dataclass provides:
+- values: list of per-chunk metric values
+- mean, median, std: central tendency and variability
+- range, iqr: spread measures
+
+This design captures the variance and rhythm in writing style, which is
+essential for authorship attribution and linguistic fingerprinting.
+
+References:
+    STTR methodology: Johnson, W. (1944). Studies in language behavior.
 """
 from __future__ import annotations
 
+import statistics
 from dataclasses import dataclass
 from typing import Any
 
+# ===== Distribution and Chunking =====
+# Related to GitHub Issue #27: Native chunked analysis with Distribution dataclass
+# https://github.com/craigtrim/pystylometry/issues/27
+
+
+@dataclass
+class Distribution:
+    """Distribution of metric values across chunks.
+
+    This dataclass captures the variance and rhythm in writing style by storing
+    per-chunk values along with descriptive statistics. The variance across chunks
+    is often more revealing of authorial fingerprint than aggregate values.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Attributes:
+        values: Raw per-chunk metric values
+        mean: Arithmetic mean of values
+        median: Middle value when sorted
+        std: Standard deviation (0.0 for single-chunk)
+        range: max - min (spread measure)
+        iqr: Interquartile range (Q3 - Q1), robust spread measure
+
+    Note:
+        min/max are omitted as trivial operations on values:
+        - min(dist.values), max(dist.values)
+
+    Example:
+        >>> dist = Distribution(
+        ...     values=[65.2, 71.1, 68.8, 70.5],
+        ...     mean=68.9, median=69.65, std=2.57,
+        ...     range=5.9, iqr=3.15
+        ... )
+        >>> dist.std  # variance reveals authorial fingerprint
+        2.57
+    """
+
+    values: list[float]
+    mean: float
+    median: float
+    std: float
+    range: float
+    iqr: float
+
+
+def chunk_text(text: str, chunk_size: int) -> list[str]:
+    """Split text into word-based chunks of approximately equal size.
+
+    Chunks are created by splitting on whitespace and grouping words.
+    The last chunk may be smaller than chunk_size if the text doesn't
+    divide evenly.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Args:
+        text: The text to chunk
+        chunk_size: Target number of words per chunk (default: 1000)
+
+    Returns:
+        List of text chunks. For text smaller than chunk_size,
+        returns a single-element list with the entire text.
+
+    Example:
+        >>> chunks = chunk_text("word " * 2500, chunk_size=1000)
+        >>> len(chunks)
+        3
+        >>> # First two chunks have ~1000 words, last has ~500
+    """
+    words = text.split()
+    if not words:
+        return [""]
+
+    chunks = []
+    for i in range(0, len(words), chunk_size):
+        chunk_words = words[i : i + chunk_size]
+        chunks.append(" ".join(chunk_words))
+
+    return chunks
+
+
+def make_distribution(values: list[float]) -> Distribution:
+    """Create a Distribution from a list of values.
+
+    Computes all descriptive statistics for the distribution.
+    Handles single-value lists by setting std, range, and iqr to 0.0.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Args:
+        values: List of numeric values (must be non-empty)
+
+    Returns:
+        Distribution with computed statistics
+
+    Raises:
+        ValueError: If values is empty
+
+    Example:
+        >>> dist = make_distribution([65.2, 71.1, 68.8, 70.5])
+        >>> dist.mean
+        68.9
+        >>> dist.std  # reveals variance in the signal
+        2.57...
+    """
+    if not values:
+        raise ValueError("Cannot create distribution from empty values")
+
+    if len(values) == 1:
+        return Distribution(
+            values=values,
+            mean=values[0],
+            median=values[0],
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+
+    # For 2-3 values, quantiles() needs special handling
+    if len(values) < 4:
+        q1 = values[0]
+        q3 = values[-1]
+    else:
+        q = statistics.quantiles(values, n=4)
+        q1, q3 = q[0], q[2]
+
+    return Distribution(
+        values=values,
+        mean=statistics.mean(values),
+        median=statistics.median(values),
+        std=statistics.stdev(values),
+        range=max(values) - min(values),
+        iqr=q3 - q1,
+    )
+
+
 # ===== Lexical Results =====
 
 
 @dataclass
 class MTLDResult:
-    """Result from MTLD (Measure of Textual Lexical Diversity) computation.
+    """Result from MTLD (Measure of Textual Lexical Diversity) computation.
 
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Example:
+        >>> result = compute_mtld(text, chunk_size=1000)
+        >>> result.mtld_average  # mean MTLD across chunks
+        72.5
+        >>> result.mtld_average_dist.std  # MTLD variance
+        8.3
+    """
+
+    # Convenient access (mean values)
     mtld_forward: float
     mtld_backward: float
     mtld_average: float
+
+    # Full distributions
+    mtld_forward_dist: Distribution
+    mtld_backward_dist: Distribution
+    mtld_average_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
 @dataclass
 class YuleResult:
-    """Result from Yule's K and I computation.
+    """Result from Yule's K and I computation.
+
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Example:
+        >>> result = compute_yule(text, chunk_size=1000)
+        >>> result.yule_k  # mean across chunks
+        120.5
+        >>> result.yule_k_dist.std  # variance reveals fingerprint
+        15.2
+    """
+
+    # Convenient access (mean values)
     yule_k: float
     yule_i: float
+
+    # Full distributions
+    yule_k_dist: Distribution
+    yule_i_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
 @dataclass
 class HapaxResult:
-    """Result from Hapax Legomena analysis.
-
-
-
-
-
-
-
+    """Result from Hapax Legomena analysis.
+
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Example:
+        >>> result = compute_hapax(text, chunk_size=1000)
+        >>> result.hapax_ratio  # mean across chunks
+        0.45
+        >>> result.hapax_ratio_dist.std  # variance
+        0.08
+    """
+
+    # Convenient access (mean/total values)
+    hapax_count: int  # Total across all chunks
+    hapax_ratio: float  # Mean ratio
+    dis_hapax_count: int  # Total across all chunks
+    dis_hapax_ratio: float  # Mean ratio
+    sichel_s: float  # Mean
+    honore_r: float  # Mean
+
+    # Full distributions (ratios only - counts don't distribute meaningfully)
+    hapax_ratio_dist: Distribution
+    dis_hapax_ratio_dist: Distribution
+    sichel_s_dist: Distribution
+    honore_r_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
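The new Distribution dataclass and the chunk_text/make_distribution helpers added in this hunk compose into a simple pipeline: split the text into word chunks, score each chunk, and summarize the per-chunk values. A minimal sketch, assuming the helpers are importable from pystylometry._types, where this diff places them; the per-chunk scorer here is a hypothetical stand-in, not one of the package's metrics:

from pystylometry._types import chunk_text, make_distribution

def avg_word_length(chunk: str) -> float:
    # Hypothetical per-chunk scorer: mean word length in characters.
    words = chunk.split()
    return sum(len(w) for w in words) / len(words) if words else 0.0

text = "the quick brown fox jumps over the lazy dog " * 300  # ~2700 words
chunks = chunk_text(text, chunk_size=1000)  # 3 chunks; the last is shorter
dist = make_distribution([avg_word_length(c) for c in chunks])
print(dist.mean, dist.std)  # mean for convenience, std for the variance fingerprint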
@@ -80,6 +319,9 @@ class TTRResult:
     Wraps stylometry-ttr package functionality to measure vocabulary richness
     through the ratio of unique words (types) to total words (tokens).
 
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
+
     Includes multiple TTR variants for length normalization:
     - Raw TTR: Direct ratio of unique to total words
     - Root TTR (Guiraud's index): types / sqrt(tokens)
@@ -87,18 +329,44 @@ class TTRResult:
     - STTR: Standardized TTR across fixed-size chunks
     - Delta Std: Measures vocabulary consistency across chunks
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
         Herdan, G. (1960). Type-token Mathematics.
+
+    Example:
+        >>> result = compute_ttr(text, chunk_size=1000)
+        >>> result.ttr  # mean TTR across chunks
+        0.42
+        >>> result.ttr_dist.std  # TTR variance reveals fingerprint
+        0.05
+        >>> result.chunk_count
+        59
     """
 
+    # Convenient access (mean values)
     total_words: int
     unique_words: int
-    ttr: float  # Raw TTR
-    root_ttr: float  # Guiraud's index
-    log_ttr: float  # Herdan's C
-    sttr: float  # Standardized TTR
-    delta_std: float  # Vocabulary consistency
+    ttr: float  # Raw TTR (mean)
+    root_ttr: float  # Guiraud's index (mean)
+    log_ttr: float  # Herdan's C (mean)
+    sttr: float  # Standardized TTR (mean)
+    delta_std: float  # Vocabulary consistency (mean)
+
+    # Full distributions with per-chunk values
+    ttr_dist: Distribution
+    root_ttr_dist: Distribution
+    log_ttr_dist: Distribution
+    sttr_dist: Distribution
+    delta_std_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
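The TTR variants named in TTRResult's docstring follow standard formulas: raw TTR is V/N, Root TTR (Guiraud's index) is V/sqrt(N), and Log TTR (Herdan's C) is log V / log N, where V is the number of types and N the number of tokens. A quick sketch of the three length normalizations, independent of the package's own implementation:

import math

def ttr_variants(tokens: list[str]) -> dict[str, float]:
    # V = unique types, N = total tokens; assumes a non-empty token list.
    n, v = len(tokens), len(set(tokens))
    return {
        "ttr": v / n,                          # raw type-token ratio
        "root_ttr": v / math.sqrt(n),          # Guiraud's index
        "log_ttr": math.log(v) / math.log(n),  # Herdan's C
    }

print(ttr_variants("the cat sat on the mat and the dog sat too".split()))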
@@ -107,48 +375,135 @@ class TTRResult:
 
 @dataclass
 class FleschResult:
-    """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation.
+    """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation.
+
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
 
+    Example:
+        >>> result = compute_flesch(text, chunk_size=1000)
+        >>> result.reading_ease  # mean across chunks
+        68.54
+        >>> result.reading_ease_dist.std  # variance reveals fingerprint
+        4.2
+        >>> result.reading_ease_dist.values  # per-chunk values
+        [65.2, 71.1, 68.8, ...]
+    """
+
+    # Convenient access (mean values)
     reading_ease: float
     grade_level: float
-    difficulty: str  #
+    difficulty: str  # Based on mean reading_ease
+
+    # Full distributions
+    reading_ease_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
 @dataclass
 class SMOGResult:
-    """Result from SMOG Index computation.
+    """Result from SMOG Index computation.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
+
+    # Convenient access (mean values)
     smog_index: float
     grade_level: float
+
+    # Full distributions
+    smog_index_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
 @dataclass
 class GunningFogResult:
-    """Result from Gunning Fog Index computation.
+    """Result from Gunning Fog Index computation.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
+
+    # Convenient access (mean values)
     fog_index: float
     grade_level: float
+
+    # Full distributions
+    fog_index_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
 @dataclass
 class ColemanLiauResult:
-    """Result from Coleman-Liau Index computation.
+    """Result from Coleman-Liau Index computation.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
 
+    # Convenient access (mean values)
     cli_index: float
-    grade_level: int
+    grade_level: float  # Changed to float for mean across chunks
+
+    # Full distributions
+    cli_index_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
 @dataclass
 class ARIResult:
-    """Result from Automated Readability Index computation.
+    """Result from Automated Readability Index computation.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
+
+    # Convenient access (mean values)
     ari_score: float
-    grade_level: int
-    age_range: str
+    grade_level: float  # Changed to float for mean across chunks
+    age_range: str  # Based on mean grade level
+
+    # Full distributions
+    ari_score_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
@@ -157,8 +512,14 @@ class ARIResult:
 
 @dataclass
 class POSResult:
-    """Result from Part-of-Speech ratio analysis.
+    """Result from Part-of-Speech ratio analysis.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
+
+    # Convenient access (mean values)
     noun_ratio: float
     verb_ratio: float
     adjective_ratio: float
@@ -167,19 +528,52 @@ class POSResult:
     adjective_noun_ratio: float
     lexical_density: float
     function_word_ratio: float
+
+    # Full distributions
+    noun_ratio_dist: Distribution
+    verb_ratio_dist: Distribution
+    adjective_ratio_dist: Distribution
+    adverb_ratio_dist: Distribution
+    noun_verb_ratio_dist: Distribution
+    adjective_noun_ratio_dist: Distribution
+    lexical_density_dist: Distribution
+    function_word_ratio_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
 @dataclass
 class SentenceStatsResult:
-    """Result from sentence-level statistics.
+    """Result from sentence-level statistics.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
 
+    # Convenient access (mean values)
     mean_sentence_length: float
     sentence_length_std: float
-    sentence_length_range: int
-    min_sentence_length: int
-    max_sentence_length: int
-    sentence_count: int
+    sentence_length_range: float  # Changed to float for mean across chunks
+    min_sentence_length: float  # Changed to float for mean across chunks
+    max_sentence_length: float  # Changed to float for mean across chunks
+    sentence_count: int  # Total across all chunks
+
+    # Full distributions
+    mean_sentence_length_dist: Distribution
+    sentence_length_std_dist: Distribution
+    sentence_length_range_dist: Distribution
+    min_sentence_length_dist: Distribution
+    max_sentence_length_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
@@ -211,11 +605,26 @@ class ZetaResult:
 
 @dataclass
 class EntropyResult:
-    """Result from n-gram entropy computation.
+    """Result from n-gram entropy computation.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
 
+    # Convenient access (mean values)
    entropy: float
     perplexity: float
     ngram_type: str  # "character_bigram", "word_bigram", "word_trigram"
+
+    # Full distributions
+    entropy_dist: Distribution
+    perplexity_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]
 
 
@@ -233,9 +642,9 @@ class CharacterMetricsResult:
     fundamental for authorship attribution and can capture distinctive
     patterns in punctuation, formatting, and word construction.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #12 - Character-Level Metrics
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     Metrics included:
     - Average word length (characters per word)
@@ -253,25 +662,35 @@ class CharacterMetricsResult:
         of techniques. Literary and Linguistic Computing, 22(3), 251-270.
         Stamatatos, E. (2009). A survey of modern authorship attribution methods.
         JASIST, 60(3), 538-556.
-
-    Example:
-        >>> result = compute_character_metrics("Sample text here.")
-        >>> print(f"Avg word length: {result.avg_word_length:.2f} chars")
-        >>> print(f"Punctuation density: {result.punctuation_density:.2f}")
-        >>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
     """
 
-
-
-
-
-
-
-
-
-
-
-
+    # Convenient access (mean values)
+    avg_word_length: float
+    avg_sentence_length_chars: float
+    punctuation_density: float
+    punctuation_variety: float  # Changed to float for mean across chunks
+    letter_frequency: dict[str, float]  # Aggregate frequency
+    vowel_consonant_ratio: float
+    digit_count: int  # Total across all chunks
+    digit_ratio: float
+    uppercase_ratio: float
+    whitespace_ratio: float
+
+    # Full distributions
+    avg_word_length_dist: Distribution
+    avg_sentence_length_chars_dist: Distribution
+    punctuation_density_dist: Distribution
+    punctuation_variety_dist: Distribution
+    vowel_consonant_ratio_dist: Distribution
+    digit_ratio_dist: Distribution
+    uppercase_ratio_dist: Distribution
+    whitespace_ratio_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 # ===== Function Word Results =====
@@ -288,9 +707,9 @@ class FunctionWordResult:
     subconsciously. They are considered strong authorship markers because authors
     use them consistently across different topics and genres.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
        #13 - Function Word Analysis
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     This analysis computes:
     - Frequency profiles for all function word categories
@@ -311,26 +730,36 @@ class FunctionWordResult:
         The Federalist. Addison-Wesley.
         Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
         to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
-
-    Example:
-        >>> result = compute_function_words("Sample text for analysis.")
-        >>> print(f"Determiner ratio: {result.determiner_ratio:.3f}")
-        >>> print(f"Preposition ratio: {result.preposition_ratio:.3f}")
-        >>> print(f"Most frequent: {result.most_frequent_function_words[:5]}")
     """
 
-
-
-
-
-
-
-
-
-
-
-
-
+    # Convenient access (mean values)
+    determiner_ratio: float
+    preposition_ratio: float
+    conjunction_ratio: float
+    pronoun_ratio: float
+    auxiliary_ratio: float
+    particle_ratio: float
+    total_function_word_ratio: float
+    function_word_diversity: float
+    most_frequent_function_words: list[tuple[str, int]]  # Aggregate
+    least_frequent_function_words: list[tuple[str, int]]  # Aggregate
+    function_word_distribution: dict[str, int]  # Aggregate
+
+    # Full distributions
+    determiner_ratio_dist: Distribution
+    preposition_ratio_dist: Distribution
+    conjunction_ratio_dist: Distribution
+    pronoun_ratio_dist: Distribution
+    auxiliary_ratio_dist: Distribution
+    particle_ratio_dist: Distribution
+    total_function_word_ratio_dist: Distribution
+    function_word_diversity_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 # ===== Advanced Lexical Diversity Results =====
@@ -347,9 +776,9 @@ class VocdDResult:
     It fits a curve to the relationship between tokens and types across multiple
     random samples of the text.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #14 - Advanced Lexical Diversity Metrics
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     The D parameter represents the theoretical vocabulary size and is more
     stable across different text lengths than simple TTR measures.
@@ -360,18 +789,23 @@ class VocdDResult:
         McKee, G., Malvern, D., & Richards, B. (2000). Measuring vocabulary
         diversity using dedicated software. Literary and Linguistic Computing,
         15(3), 323-337.
-
-    Example:
-        >>> result = compute_vocd_d("Long sample text for voc-D analysis...")
-        >>> print(f"D parameter: {result.d_parameter:.2f}")
-        >>> print(f"Curve fit R²: {result.curve_fit_r_squared:.3f}")
     """
 
-
-
-
-
-
+    # Convenient access (mean values)
+    d_parameter: float
+    curve_fit_r_squared: float
+    sample_count: int  # Total across all chunks
+    optimal_sample_size: int
+
+    # Full distributions
+    d_parameter_dist: Distribution
+    curve_fit_r_squared_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 @dataclass
@@ -383,28 +817,35 @@ class MATTRResult:
     for longer texts. The moving window approach reduces the impact of text
     length on the TTR calculation.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #14 - Advanced Lexical Diversity Metrics
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     References:
         Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
         The moving-average type-token ratio (MATTR). Journal of Quantitative
         Linguistics, 17(2), 94-100.
-
-    Example:
-        >>> result = compute_mattr("Sample text here...", window_size=50)
-        >>> print(f"MATTR score: {result.mattr_score:.3f}")
-        >>> print(f"Window size: {result.window_size}")
     """
 
-
-
-
-
-
-
-
+    # Convenient access (mean values)
+    mattr_score: float
+    window_size: int
+    window_count: int  # Total across all chunks
+    ttr_std_dev: float
+    min_ttr: float
+    max_ttr: float
+
+    # Full distributions
+    mattr_score_dist: Distribution
+    ttr_std_dev_dist: Distribution
+    min_ttr_dist: Distribution
+    max_ttr_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 @dataclass
@@ -416,26 +857,30 @@ class HDDResult:
     new word types as text length increases, providing a mathematically
     rigorous measure that is less sensitive to text length than TTR.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #14 - Advanced Lexical Diversity Metrics
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     References:
         McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
         study of sophisticated approaches to lexical diversity assessment.
         Behavior Research Methods, 42(2), 381-392.
-
-    Example:
-        >>> result = compute_hdd("Sample text for HD-D analysis...")
-        >>> print(f"HD-D score: {result.hdd_score:.3f}")
-        >>> print(f"Sample size: {result.sample_size}")
     """
 
-
-
-
-
-
+    # Convenient access (mean values)
+    hdd_score: float
+    sample_size: int
+    type_count: int  # Total unique across all chunks
+    token_count: int  # Total across all chunks
+
+    # Full distributions
+    hdd_score_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 @dataclass
@@ -447,28 +892,35 @@ class MSTTRResult:
     normalized measure of lexical diversity that is more comparable across
     texts of different lengths.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #14 - Advanced Lexical Diversity Metrics
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     References:
         Johnson, W. (1944). Studies in language behavior: I. A program of research.
         Psychological Monographs, 56(2), 1-15.
-
-    Example:
-        >>> result = compute_msttr("Sample text...", segment_size=100)
-        >>> print(f"MSTTR score: {result.msttr_score:.3f}")
-        >>> print(f"Segments analyzed: {result.segment_count}")
     """
 
-
-
-
-
-
-
-
-
+    # Convenient access (mean values)
+    msttr_score: float
+    segment_size: int
+    segment_count: int  # Total across all chunks
+    ttr_std_dev: float
+    min_ttr: float
+    max_ttr: float
+    segment_ttrs: list[float]  # Aggregate from all chunks
+
+    # Full distributions
+    msttr_score_dist: Distribution
+    ttr_std_dev_dist: Distribution
+    min_ttr_dist: Distribution
+    max_ttr_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 # ===== Word Frequency Sophistication Results =====
@@ -485,9 +937,9 @@ class WordFrequencySophisticationResult:
     large corpora. Authors who use less frequent (more sophisticated) words
     score higher on these metrics.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #15 - Word Frequency Sophistication Metrics
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     This analysis uses reference frequency data from:
     - COCA (Corpus of Contemporary American English)
@@ -507,24 +959,32 @@ class WordFrequencySophisticationResult:
         A critical evaluation of current word frequency norms. Behavior
         Research Methods, Instruments, & Computers, 41(4), 977-990.
         Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
-
-    Example:
-        >>> result = compute_word_frequency_sophistication("Sample text...")
-        >>> print(f"Mean frequency rank: {result.mean_frequency_rank:.1f}")
-        >>> print(f"Rare word ratio: {result.rare_word_ratio:.3f}")
-        >>> print(f"Academic word ratio: {result.academic_word_ratio:.3f}")
     """
 
-
-
-
-
-
-
-
-
-
-
+    # Convenient access (mean values)
+    mean_frequency_rank: float
+    median_frequency_rank: float
+    rare_word_ratio: float
+    common_word_ratio: float
+    academic_word_ratio: float
+    advanced_word_ratio: float
+    frequency_band_distribution: dict[str, float]  # Aggregate
+    rarest_words: list[tuple[str, float]]  # Aggregate
+    most_common_words: list[tuple[str, float]]  # Aggregate
+
+    # Full distributions
+    mean_frequency_rank_dist: Distribution
+    median_frequency_rank_dist: Distribution
+    rare_word_ratio_dist: Distribution
+    common_word_ratio_dist: Distribution
+    academic_word_ratio_dist: Distribution
+    advanced_word_ratio_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 # ===== Additional Readability Results =====
@@ -541,9 +1001,9 @@ class DaleChallResult:
     The formula provides a grade level estimate based on sentence length and
     the percentage of difficult words.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #16 - Additional Readability Formulas
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     Formula: 0.1579 * (difficult_words / total_words * 100) + 0.0496 * avg_sentence_length
 
@@ -554,21 +1014,26 @@ class DaleChallResult:
         Educational Research Bulletin, 27(1), 11-28.
         Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall
         readability formula. Brookline Books.
-
-    Example:
-        >>> result = compute_dale_chall("Sample text to analyze...")
-        >>> print(f"Dale-Chall score: {result.dale_chall_score:.2f}")
-        >>> print(f"Grade level: {result.grade_level}")
-        >>> print(f"Difficult word %: {result.difficult_word_ratio * 100:.1f}%")
     """
 
-
-
-
-
-
-
-
+    # Convenient access (mean values)
+    dale_chall_score: float
+    grade_level: str  # Based on mean score
+    difficult_word_count: int  # Total across all chunks
+    difficult_word_ratio: float  # Mean ratio
+    avg_sentence_length: float  # Mean
+    total_words: int  # Total across all chunks
+
+    # Full distributions
+    dale_chall_score_dist: Distribution
+    difficult_word_ratio_dist: Distribution
+    avg_sentence_length_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 @dataclass
@@ -580,26 +1045,32 @@ class LinsearWriteResult:
     syllables) or "hard" (3+ syllables) and uses sentence length to estimate
     grade level. It's particularly effective for technical writing.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #16 - Additional Readability Formulas
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     References:
         Klare, G. R. (1974-1975). Assessing readability. Reading Research Quarterly,
         10(1), 62-102.
-
-    Example:
-        >>> result = compute_linsear_write("Technical manual text...")
-        >>> print(f"Linsear Write score: {result.linsear_score:.2f}")
-        >>> print(f"Grade level: {result.grade_level}")
     """
 
-
-
-
-
-
-
+    # Convenient access (mean values)
+    linsear_score: float
+    grade_level: float  # Changed to float for mean across chunks
+    easy_word_count: int  # Total across all chunks
+    hard_word_count: int  # Total across all chunks
+    avg_sentence_length: float  # Mean
+
+    # Full distributions
+    linsear_score_dist: Distribution
+    grade_level_dist: Distribution
+    avg_sentence_length_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 @dataclass
@@ -611,28 +1082,32 @@ class FryResult:
     to determine the grade level. This implementation provides the numerical
     coordinates and estimated grade level.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #16 - Additional Readability Formulas
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     References:
         Fry, E. (1968). A readability formula that saves time. Journal of Reading,
         11(7), 513-578.
         Fry, E. (1977). Fry's readability graph: Clarifications, validity, and
         extension to level 17. Journal of Reading, 21(3), 242-252.
-
-    Example:
-        >>> result = compute_fry("Sample educational text...")
-        >>> print(f"Avg sentence length: {result.avg_sentence_length:.1f}")
-        >>> print(f"Avg syllables/100 words: {result.avg_syllables_per_100:.1f}")
-        >>> print(f"Grade level: {result.grade_level}")
     """
 
-
-
-
-
-
+    # Convenient access (mean values)
+    avg_sentence_length: float
+    avg_syllables_per_100: float
+    grade_level: str  # Based on mean coordinates
+    graph_zone: str  # Based on mean coordinates
+
+    # Full distributions
+    avg_sentence_length_dist: Distribution
+    avg_syllables_per_100_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 @dataclass
@@ -644,9 +1119,9 @@ class FORCASTResult:
     words as a measure, making it faster to compute than syllable-based formulas.
     Particularly useful for technical and military documents.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #16 - Additional Readability Formulas
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     Formula: 20 - (N / 10), where N is the number of single-syllable words
     per 150-word sample.
@@ -655,19 +1130,25 @@ class FORCASTResult:
         Caylor, J. S., Sticht, T. G., Fox, L. C., & Ford, J. P. (1973).
         Methodologies for determining reading requirements of military
         occupational specialties. Human Resources Research Organization.
-
-    Example:
-        >>> result = compute_forcast("Military technical document text...")
-        >>> print(f"FORCAST score: {result.forcast_score:.2f}")
-        >>> print(f"Grade level: {result.grade_level}")
     """
 
-
-
-
-
-
-
+    # Convenient access (mean values)
+    forcast_score: float
+    grade_level: float  # Changed to float for mean across chunks
+    single_syllable_ratio: float  # Mean ratio
+    single_syllable_count: int  # Total across all chunks
+    total_words: int  # Total across all chunks
+
+    # Full distributions
+    forcast_score_dist: Distribution
+    grade_level_dist: Distribution
+    single_syllable_ratio_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 @dataclass
@@ -679,9 +1160,9 @@ class PowersSumnerKearlResult:
     average sentence length and average syllables per word, but with different
     coefficients optimized for younger readers.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #16 - Additional Readability Formulas
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     Formula: 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
 
@@ -689,21 +1170,28 @@ class PowersSumnerKearlResult:
         Powers, R. D., Sumner, W. A., & Kearl, B. E. (1958). A recalculation of
         four adult readability formulas. Journal of Educational Psychology,
         49(2), 99-105.
-
-    Example:
-        >>> result = compute_powers_sumner_kearl("Children's book text...")
-        >>> print(f"PSK score: {result.psk_score:.2f}")
-        >>> print(f"Grade level: {result.grade_level}")
     """
 
-
-
-
-
-
-
-
-
+    # Convenient access (mean values)
+    psk_score: float
+    grade_level: float
+    avg_sentence_length: float
+    avg_syllables_per_word: float
+    total_sentences: int  # Total across all chunks
+    total_words: int  # Total across all chunks
+    total_syllables: int  # Total across all chunks
+
+    # Full distributions
+    psk_score_dist: Distribution
+    grade_level_dist: Distribution
+    avg_sentence_length_dist: Distribution
+    avg_syllables_per_word_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 # ===== Advanced Syntactic Results =====
@@ -720,9 +1208,9 @@ class AdvancedSyntacticResult:
     capture sentence complexity, grammatical sophistication, and syntactic
     style preferences.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #17 - Advanced Syntactic Analysis
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     Features analyzed:
     - Parse tree depth (sentence structural complexity)
@@ -740,28 +1228,42 @@ class AdvancedSyntacticResult:
         Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
         Lu, X. (2010). Automatic analysis of syntactic complexity in second language
         writing. International Journal of Corpus Linguistics, 15(4), 474-496.
-
-    Example:
-        >>> result = compute_advanced_syntactic("Complex sentence structures...")
-        >>> print(f"Parse tree depth: {result.mean_parse_tree_depth:.1f}")
-        >>> print(f"T-units: {result.t_unit_count}")
-        >>> print(f"Passive voice %: {result.passive_voice_ratio * 100:.1f}%")
     """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Convenient access (mean values)
+    mean_parse_tree_depth: float
+    max_parse_tree_depth: float  # Changed to float for mean across chunks
+    t_unit_count: int  # Total across all chunks
+    mean_t_unit_length: float
+    clausal_density: float
+    dependent_clause_ratio: float
+    passive_voice_ratio: float
+    subordination_index: float
+    coordination_index: float
+    sentence_complexity_score: float
+    dependency_distance: float
+    left_branching_ratio: float
+    right_branching_ratio: float
+
+    # Full distributions
+    mean_parse_tree_depth_dist: Distribution
+    max_parse_tree_depth_dist: Distribution
+    mean_t_unit_length_dist: Distribution
+    clausal_density_dist: Distribution
+    dependent_clause_ratio_dist: Distribution
+    passive_voice_ratio_dist: Distribution
+    subordination_index_dist: Distribution
+    coordination_index_dist: Distribution
+    sentence_complexity_score_dist: Distribution
+    dependency_distance_dist: Distribution
+    left_branching_ratio_dist: Distribution
+    right_branching_ratio_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
 
 
 # ===== Sentence Type Results =====
@@ -778,15 +1280,16 @@ class SentenceTypeResult:
     function (declarative, interrogative, imperative, exclamatory). Different
     authors and genres show distinct patterns in sentence type distribution.
 
-    Related GitHub Issue:
+    Related GitHub Issues:
         #18 - Sentence Type Classification
-
+        #27 - Native chunked analysis with Distribution dataclass
 
     Structural types:
     - Simple: One independent clause (e.g., "The cat sat.")
     - Compound: Multiple independent clauses (e.g., "I came, I saw, I conquered.")
     - Complex: One independent + dependent clause(s) (e.g., "When I arrived, I saw her.")
-    - Compound-Complex: Multiple independent + dependent
+    - Compound-Complex: Multiple independent + dependent
+      (e.g., "I came when called, and I stayed.")
 
     Functional types:
     - Declarative: Statement (e.g., "The sky is blue.")
@@ -797,27 +1300,19 @@ class SentenceTypeResult:
     References:
         Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
         Huddleston, R., & Pullum, G. K. (2002). The Cambridge Grammar of the English Language.
-
-    Example:
-        >>> result = compute_sentence_types("Mix of sentence types here...")
-        >>> print(f"Simple: {result.simple_ratio * 100:.1f}%")
-        >>> print(f"Complex: {result.complex_ratio * 100:.1f}%")
-        >>> print(f"Questions: {result.interrogative_ratio * 100:.1f}%")
     """
 
-    #
-    simple_ratio: float
-    compound_ratio: float
-    complex_ratio: float
-    compound_complex_ratio: float
-
-
-
-
-
-
-
-    # Counts
+    # Convenient access (mean ratios)
+    simple_ratio: float
+    compound_ratio: float
+    complex_ratio: float
+    compound_complex_ratio: float
+    declarative_ratio: float
+    interrogative_ratio: float
+    imperative_ratio: float
+    exclamatory_ratio: float
+
+    # Counts (totals across all chunks)
     simple_count: int
     compound_count: int
     complex_count: int
@@ -828,11 +1323,27 @@ class SentenceTypeResult:
     exclamatory_count: int
     total_sentences: int
 
-    # Diversity
-    structural_diversity: float
-    functional_diversity: float
+    # Diversity (mean across chunks)
+    structural_diversity: float
+    functional_diversity: float
+
+    # Full distributions
+    simple_ratio_dist: Distribution
+    compound_ratio_dist: Distribution
+    complex_ratio_dist: Distribution
+    compound_complex_ratio_dist: Distribution
+    declarative_ratio_dist: Distribution
+    interrogative_ratio_dist: Distribution
+    imperative_ratio_dist: Distribution
+    exclamatory_ratio_dist: Distribution
+    structural_diversity_dist: Distribution
+    functional_diversity_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
 
-    metadata: dict[str, Any]
+    metadata: dict[str, Any]
 
 
 # ===== Extended N-gram Results =====
@@ -1006,6 +1517,7 @@ class VocabularyOverlapResult:
     - Dice coefficient (2 * intersection / sum of sizes)
     - Overlap coefficient (intersection / min(size1, size2))
     - Cosine similarity (using word frequency vectors)
+    - KL divergence (asymmetric distributional difference)
     - Shared vocabulary size and ratio
     - Unique words in each text
     - Most distinctive words for each text
@@ -1015,6 +1527,10 @@ class VocabularyOverlapResult:
         New Phytologist, 11(2), 37-50.
         Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
         Retrieval. McGraw-Hill.
+        Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+        Annals of Mathematical Statistics, 22(1), 79-86.
+        Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+        MIT Press.
 
     Example:
         >>> result = compute_vocabulary_overlap(text1, text2)
@@ -1028,6 +1544,7 @@ class VocabularyOverlapResult:
     dice_coefficient: float  # 2 * intersection / (size1 + size2)
     overlap_coefficient: float  # Intersection / min(size1, size2)
     cosine_similarity: float  # Cosine of frequency vectors
+    kl_divergence: float  # Kullback-Leibler divergence (asymmetric, text1 || text2)
 
     # Vocabulary sizes
     text1_vocab_size: int  # Unique words in text 1
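The new kl_divergence field is the Kullback-Leibler divergence D(P || Q) = Σ P(w) log(P(w)/Q(w)) between the two texts' word-frequency distributions, asymmetric in the direction text1 || text2. A sketch of the computation; the add-one smoothing that keeps Q(w) nonzero is an assumption here, since the diff does not show the package's smoothing strategy:

import math
from collections import Counter

def kl_divergence(tokens1: list[str], tokens2: list[str]) -> float:
    # D(text1 || text2) over add-one-smoothed unigram distributions.
    vocab = set(tokens1) | set(tokens2)
    c1, c2 = Counter(tokens1), Counter(tokens2)
    n1 = len(tokens1) + len(vocab)  # smoothed totals
    n2 = len(tokens2) + len(vocab)
    total = 0.0
    for w in vocab:
        p = (c1[w] + 1) / n1
        q = (c2[w] + 1) / n2
        total += p * math.log(p / q)
    return total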
@@ -1249,6 +1766,87 @@ class KilgarriffResult:
     metadata: dict[str, Any]  # Frequency tables, expected values, etc.


+@dataclass
+class KilgarriffDriftResult:
+    """Result from Kilgarriff chi-squared drift detection within a single document.
+
+    This result captures stylistic drift patterns by comparing sequential chunks
+    of text using Kilgarriff's chi-squared method. It enables detection of
+    inconsistent authorship, heavy editing, pasted content, and AI-generated
+    text signatures.
+
+    Related GitHub Issues:
+        #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
+             https://github.com/craigtrim/pystylometry/issues/36
+        #31 - Classical Stylometric Methods from Programming Historian
+             https://github.com/craigtrim/pystylometry/issues/31
+
+    Pattern Signatures:
+        - consistent: Low, stable χ² across pairs (natural human writing)
+        - gradual_drift: Slowly increasing trend (author fatigue, topic shift)
+        - sudden_spike: One pair has high χ² (pasted content, different author)
+        - suspiciously_uniform: Near-zero variance (possible AI generation)
+        - unknown: Insufficient data for classification
+
+    Marketing Name: "Style Drift Detector" / "Consistency Fingerprint"
+
+    References:
+        Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus
+        Linguistics, 6(1), 97-133.
+
+    Example:
+        >>> result = compute_kilgarriff_drift(text, window_size=1000, stride=500)
+        >>> result.pattern  # "consistent", "gradual_drift", "sudden_spike", etc.
+        'consistent'
+        >>> result.mean_chi_squared  # Average χ² across chunk pairs
+        45.2
+        >>> result.status  # "success", "marginal_data", "insufficient_data"
+        'success'
+    """
+
+    # Status (graceful handling of edge cases)
+    status: str  # "success", "marginal_data", "insufficient_data"
+    status_message: str  # Human-readable explanation
+
+    # Pattern classification
+    pattern: str  # "consistent", "gradual_drift", "sudden_spike", "suspiciously_uniform", "unknown"
+    pattern_confidence: float  # 0.0-1.0 confidence in classification
+
+    # Holistic metrics (may be NaN if insufficient data)
+    mean_chi_squared: float  # Average χ² across all chunk pairs
+    std_chi_squared: float  # Standard deviation of χ² values
+    max_chi_squared: float  # Highest χ² between any two chunks
+    min_chi_squared: float  # Lowest χ² between any two chunks
+    max_location: int  # Index of chunk boundary with max χ² (0-indexed)
+    trend: float  # Linear regression slope of χ² over chunk pairs
+
+    # Pairwise comparison data
+    pairwise_scores: list[dict]  # [{"chunk_pair": (0, 1), "chi_squared": 45.2, "top_words": [...]}]
+
+    # Window configuration (for reproducibility)
+    window_size: int
+    stride: int
+    overlap_ratio: float  # Computed: max(0, 1 - stride/window_size)
+    comparison_mode: str  # "sequential", "all_pairs", "fixed_lag"
+    window_count: int
+
+    # For all_pairs mode only
+    distance_matrix: list[list[float]] | None  # None for sequential/fixed_lag
+
+    # Thresholds used for pattern classification (for transparency)
+    thresholds: dict[str, float]
+
+    metadata: dict[str, Any]
+
+
+# ===== Consistency Module Thresholds =====
+# Related to GitHub Issue #36
+# These are calibration constants for pattern classification
+
+MIN_WINDOWS = 3  # Bare minimum for variance calculation
+RECOMMENDED_WINDOWS = 5  # For reliable pattern classification
+
+
 @dataclass
 class MinMaxResult:
     """Result from Min-Max distance method (Burrows' original method).
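Two fields above carry explicit formulas: `overlap_ratio` is max(0, 1 - stride/window_size), and `trend` is a least-squares slope of χ² over pair index. A sketch of that bookkeeping plus an illustrative pattern classifier; every threshold below is invented for demonstration, since the real calibration constants live in `consistency/_thresholds.py`:

    from statistics import mean, pstdev

    MIN_WINDOWS = 3  # mirrors the constant added in this diff

    def overlap_ratio(window_size: int, stride: int) -> float:
        return max(0.0, 1.0 - stride / window_size)

    def ols_slope(ys: list[float]) -> float:
        """Least-squares slope of y over x = 0, 1, 2, ... (the documented trend)."""
        n = len(ys)
        x_bar, y_bar = (n - 1) / 2, mean(ys)
        num = sum((x - x_bar) * (y - y_bar) for x, y in enumerate(ys))
        den = sum((x - x_bar) ** 2 for x in range(n))
        return num / den

    def classify(chi_squared_per_pair: list[float]) -> str:
        """Toy classifier over the documented pattern labels; thresholds are illustrative."""
        if len(chi_squared_per_pair) < MIN_WINDOWS - 1:  # k windows -> k-1 sequential pairs
            return "unknown"
        m, s = mean(chi_squared_per_pair), pstdev(chi_squared_per_pair)
        if s < 0.05 * m:                          # near-zero variance
            return "suspiciously_uniform"
        if max(chi_squared_per_pair) > 2.0 * m:   # one pair far above the rest
            return "sudden_spike"
        if ols_slope(chi_squared_per_pair) > 0.1 * m:  # steadily rising
            return "gradual_drift"
        return "consistent"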
@@ -1305,6 +1903,54 @@ class JohnsBurrowsResult:
     metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.


+@dataclass
+class CompressionResult:
+    """Result from compression-based authorship attribution.
+
+    Compression-based methods use the Normalized Compression Distance (NCD) to
+    measure similarity between texts. The intuition is that if two texts are
+    similar, compressing them together will yield better compression than
+    compressing separately. This approach is language-independent and captures
+    deep statistical regularities.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+             https://github.com/craigtrim/pystylometry/issues/24
+
+    Formula:
+        NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+        where C(x) is the compressed size of x, and C(xy) is the compressed
+        size of x and y concatenated.
+
+    Interpretation:
+        - NCD ≈ 0: Texts are very similar
+        - NCD ≈ 1: Texts are very different
+        - Typical same-author pairs: 0.3-0.6
+        - Typical different-author pairs: 0.6-0.9
+
+    References:
+        Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+        IEEE Transactions on Information Theory, 51(4), 1523-1545.
+
+        Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+        zipping. Physical Review Letters, 88(4), 048702.
+
+    Example:
+        >>> result = compute_compression_distance(text1, text2)
+        >>> print(f"NCD: {result.ncd:.3f}")
+        >>> if result.ncd < 0.5:
+        ...     print("Texts likely by same author")
+    """
+
+    ncd: float  # Normalized Compression Distance [0, 1+]
+    compressor: str  # Compression algorithm used (e.g., "gzip", "zlib", "bz2")
+    text1_compressed_size: int  # Compressed size of text1 alone
+    text2_compressed_size: int  # Compressed size of text2 alone
+    combined_compressed_size: int  # Compressed size of concatenated texts
+    metadata: dict[str, Any]  # Raw sizes, compression ratios, etc.
+
+
 # ===== Rhythm and Prosody Results =====
 # Related to GitHub Issue #25: Rhythm and Prosody Metrics
 # https://github.com/craigtrim/pystylometry/issues/25
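The NCD formula in the docstring above transcribes directly into code. A sketch using zlib; the diff lists gzip, zlib, and bz2 as possible compressors but does not show the package's default, so the compressor choice here is an assumption:

    import zlib

    def ncd(text1: str, text2: str) -> float:
        """NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))."""
        def c(s: str) -> int:
            return len(zlib.compress(s.encode("utf-8"), 9))  # C(x): compressed size
        c1, c2, c12 = c(text1), c(text2), c(text1 + text2)
        return (c12 - min(c1, c2)) / max(c1, c2)

    # Similar texts compress well together, pulling NCD toward 0.
    print(ncd("the quick brown fox " * 50, "the quick brown fox " * 40))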
@@ -1379,6 +2025,118 @@ class RhythmProsodyResult:
     metadata: dict[str, Any]  # Syllable counts, stress patterns, phoneme data, etc.


+# ===== Dialect Detection Results =====
+# Related to GitHub Issue #35: Dialect detection with extensible JSON markers
+# https://github.com/craigtrim/pystylometry/issues/35
+# Related to GitHub Issue #30: Whonix stylometry features
+# https://github.com/craigtrim/pystylometry/issues/30
+
+
+@dataclass
+class DialectResult:
+    """Result from dialect detection analysis.
+
+    Dialect detection identifies regional linguistic preferences (British vs.
+    American English) and measures text markedness - how far the text deviates
+    from "unmarked" standard English. This analysis uses an extensible JSON-based
+    marker database covering vocabulary, spelling patterns, grammar patterns,
+    punctuation conventions, and idiomatic expressions.
+
+    The analysis follows the chunking pattern from Issue #27, computing metrics
+    per chunk and providing distributions for stylometric fingerprinting. Dialect
+    markers are sparse, so variance across chunks can reveal mixed authorship
+    (e.g., a UK speaker using ChatGPT-generated American English content).
+
+    Related GitHub Issues:
+        #35 - Dialect detection with extensible JSON markers
+             https://github.com/craigtrim/pystylometry/issues/35
+        #30 - Whonix stylometry features (regional linguistic preferences)
+             https://github.com/craigtrim/pystylometry/issues/30
+        #27 - Native chunked analysis with Distribution dataclass
+             https://github.com/craigtrim/pystylometry/issues/27
+
+    Theoretical Background:
+        Markedness theory (Battistella, 1990) informs the markedness_score:
+        marked forms stand out against "standard" written English. High
+        markedness suggests intentional stylistic choice or strong dialect
+        identity. Dialectometry (Goebl, 1982; Nerbonne, 2009) provides the
+        quantitative framework for holistic dialect measurement.
+
+    Feature Levels:
+        Markers are categorized by linguistic level for fine-grained analysis:
+        - Phonological: Spelling reflecting pronunciation (colour/color)
+        - Morphological: Word formation (-ise/-ize, -our/-or, doubled L)
+        - Lexical: Different words for same concept (flat/apartment)
+        - Syntactic: Grammar differences (have got/have, collective nouns)
+
+    Eye Dialect vs. True Dialect:
+        Following Encyclopedia.com's distinction, "eye dialect" (gonna, wanna)
+        indicates informal register, not regional dialect. True dialect markers
+        (colour, flat, lorry) indicate actual regional preference.
+
+    References:
+        Battistella, Edwin L. "Markedness: The Evaluative Superstructure of
+        Language." State University of New York Press, 1990.
+        Goebl, Hans. "Dialektometrie: Prinzipien und Methoden des Einsatzes der
+        numerischen Taxonomie im Bereich der Dialektgeographie." Verlag der
+        Österreichischen Akademie der Wissenschaften, 1982.
+        Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
+        Compass, vol. 3, no. 1, 2009, pp. 175-198.
+        Labov, William. "The Social Stratification of English in New York City."
+        Cambridge University Press, 2006.
+        Whonix Project. "Stylometry: Deanonymization Techniques." Whonix Wiki,
+        https://www.whonix.org/wiki/Stylometry
+
+    Example:
+        >>> result = compute_dialect(text, chunk_size=1000)
+        >>> result.dialect  # "british", "american", "mixed", or "neutral"
+        'british'
+        >>> result.british_score  # Mean across chunks
+        0.72
+        >>> result.british_score_dist.std  # Variance reveals fingerprint
+        0.05
+        >>> result.markedness_score  # Deviation from standard English
+        0.35
+    """
+
+    # Classification result
+    dialect: str  # "british", "american", "mixed", "neutral"
+    confidence: float  # 0.0-1.0, how confident the classification is
+
+    # Convenient access (mean values across chunks)
+    british_score: float  # Mean British marker density (0.0-1.0)
+    american_score: float  # Mean American marker density (0.0-1.0)
+    markedness_score: float  # Mean deviation from unmarked standard (0.0-1.0)
+
+    # Full distributions for stylometric fingerprinting
+    british_score_dist: Distribution
+    american_score_dist: Distribution
+    markedness_score_dist: Distribution
+
+    # Marker breakdown by linguistic level (aggregated across chunks)
+    # Keys: "phonological", "morphological", "lexical", "syntactic"
+    markers_by_level: dict[str, dict[str, int]]
+
+    # Detailed marker counts (aggregated across chunks)
+    spelling_markers: dict[str, int]  # {"colour": 2, "color": 1}
+    vocabulary_markers: dict[str, int]  # {"flat": 1, "apartment": 0}
+    grammar_markers: dict[str, int]  # {"have got": 1}
+
+    # Eye dialect (informal register indicators, not true dialect)
+    eye_dialect_count: int  # Total eye dialect markers (gonna, wanna, etc.)
+    eye_dialect_ratio: float  # Eye dialect per 1000 words
+
+    # Register analysis hints
+    register_hints: dict[str, Any]  # {"formality": 0.7, "hedging_density": 0.05}
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    # Extensible metadata
+    metadata: dict[str, Any]
+
+
 # ===== Unified Analysis Result =====

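As a rough picture of how the British/American scores above could be produced: count markers from each side, normalize to relative densities, and label the text by whichever side dominates. Everything in this sketch is illustrative; the marker sets are toy stand-ins for the package's JSON database, and the 0.1 "mixed" margin is an invented threshold:

    BRITISH = {"colour", "flat", "lorry", "whilst"}      # toy subset
    AMERICAN = {"color", "apartment", "truck", "gotten"}

    def classify_dialect(words: list[str]) -> tuple[str, float, float]:
        """Return (dialect, british_score, american_score) for a token list."""
        hits_br = sum(w in BRITISH for w in words)
        hits_am = sum(w in AMERICAN for w in words)
        total = hits_br + hits_am
        if total == 0:
            return "neutral", 0.0, 0.0
        br, am = hits_br / total, hits_am / total
        if abs(br - am) < 0.1:  # no clear winner
            return "mixed", br, am
        return ("british" if br > am else "american"), br, am

    classify_dialect("the colour of the flat was grey".split())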