pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
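The headline change in this release is native chunked analysis (GitHub issue #27): each result dataclass now carries both a mean value and a per-chunk Distribution, plus chunk_size/chunk_count context. A minimal consumption sketch follows, based only on the docstring examples in the pystylometry/_types.py diff below; the compute_flesch import path and signature are assumptions inferred from those examples, not verified against the released wheel.

# Hedged sketch: reading a chunked result the way the new docstrings describe.
# Assumption: compute_flesch is exposed by pystylometry.readability and accepts
# a chunk_size keyword, as the Example sections in _types.py suggest.
from pystylometry.readability import compute_flesch

with open("sample.txt", encoding="utf-8") as fh:
    result = compute_flesch(fh.read(), chunk_size=1000)

print(result.reading_ease)                  # mean across chunks
print(result.reading_ease_dist.std)         # per-chunk variance (the "fingerprint")
print(result.reading_ease_dist.values[:5])  # individual chunk values
print(result.chunk_count, result.chunk_size)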
pystylometry/_types.py
CHANGED
@@ -1,42 +1,372 @@
-"""Result dataclasses for all pystylometry metrics.
+"""Result dataclasses for all pystylometry metrics.
+
+This module defines dataclasses for all metric results in pystylometry.
+
+Native Chunked Analysis (Issue #27):
+    All metrics support chunked analysis by default. Results include:
+    - Convenient access to the mean value (e.g., result.reading_ease)
+    - Full distribution with per-chunk values and statistics (e.g., result.reading_ease_dist)
+
+The Distribution dataclass provides:
+    - values: list of per-chunk metric values
+    - mean, median, std: central tendency and variability
+    - range, iqr: spread measures
+
+This design captures the variance and rhythm in writing style, which is
+essential for authorship attribution and linguistic fingerprinting.
+
+References:
+    STTR methodology: Johnson, W. (1944). Studies in language behavior.
+"""

 from __future__ import annotations

+import statistics
 from dataclasses import dataclass
 from typing import Any

+# ===== Distribution and Chunking =====
+# Related to GitHub Issue #27: Native chunked analysis with Distribution dataclass
+# https://github.com/craigtrim/pystylometry/issues/27
+
+
+@dataclass
+class Distribution:
+    """Distribution of metric values across chunks.
+
+    This dataclass captures the variance and rhythm in writing style by storing
+    per-chunk values along with descriptive statistics. The variance across chunks
+    is often more revealing of authorial fingerprint than aggregate values.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Attributes:
+        values: Raw per-chunk metric values
+        mean: Arithmetic mean of values
+        median: Middle value when sorted
+        std: Standard deviation (0.0 for single-chunk)
+        range: max - min (spread measure)
+        iqr: Interquartile range (Q3 - Q1), robust spread measure
+
+    Note:
+        min/max are omitted as trivial operations on values:
+        - min(dist.values), max(dist.values)
+
+    Example:
+        >>> dist = Distribution(
+        ...     values=[65.2, 71.1, 68.8, 70.5],
+        ...     mean=68.9, median=69.65, std=2.57,
+        ...     range=5.9, iqr=3.15
+        ... )
+        >>> dist.std  # variance reveals authorial fingerprint
+        2.57
+    """
+
+    values: list[float]
+    mean: float
+    median: float
+    std: float
+    range: float
+    iqr: float
+
+
+def chunk_text(text: str, chunk_size: int) -> list[str]:
+    """Split text into word-based chunks of approximately equal size.
+
+    Chunks are created by splitting on whitespace and grouping words.
+    The last chunk may be smaller than chunk_size if the text doesn't
+    divide evenly.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Args:
+        text: The text to chunk
+        chunk_size: Target number of words per chunk (default: 1000)
+
+    Returns:
+        List of text chunks. For text smaller than chunk_size,
+        returns a single-element list with the entire text.
+
+    Example:
+        >>> chunks = chunk_text("word " * 2500, chunk_size=1000)
+        >>> len(chunks)
+        3
+        >>> # First two chunks have ~1000 words, last has ~500
+    """
+    words = text.split()
+    if not words:
+        return [""]
+
+    chunks = []
+    for i in range(0, len(words), chunk_size):
+        chunk_words = words[i : i + chunk_size]
+        chunks.append(" ".join(chunk_words))
+
+    return chunks
+
+
+def make_distribution(values: list[float]) -> Distribution:
+    """Create a Distribution from a list of values.
+
+    Computes all descriptive statistics for the distribution.
+    Handles single-value lists by setting std, range, and iqr to 0.0.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Args:
+        values: List of numeric values (must be non-empty)
+
+    Returns:
+        Distribution with computed statistics
+
+    Raises:
+        ValueError: If values is empty
+
+    Example:
+        >>> dist = make_distribution([65.2, 71.1, 68.8, 70.5])
+        >>> dist.mean
+        68.9
+        >>> dist.std  # reveals variance in the signal
+        2.57...
+    """
+    if not values:
+        raise ValueError("Cannot create distribution from empty values")
+
+    if len(values) == 1:
+        return Distribution(
+            values=values,
+            mean=values[0],
+            median=values[0],
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+
+    # For 2-3 values, quantiles() needs special handling
+    if len(values) < 4:
+        q1 = values[0]
+        q3 = values[-1]
+    else:
+        q = statistics.quantiles(values, n=4)
+        q1, q3 = q[0], q[2]
+
+    return Distribution(
+        values=values,
+        mean=statistics.mean(values),
+        median=statistics.median(values),
+        std=statistics.stdev(values),
+        range=max(values) - min(values),
+        iqr=q3 - q1,
+    )
+
+
 # ===== Lexical Results =====


 @dataclass
 class MTLDResult:
-    """Result from MTLD (Measure of Textual Lexical Diversity) computation.
+    """Result from MTLD (Measure of Textual Lexical Diversity) computation.
+
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27

+    Example:
+        >>> result = compute_mtld(text, chunk_size=1000)
+        >>> result.mtld_average  # mean MTLD across chunks
+        72.5
+        >>> result.mtld_average_dist.std  # MTLD variance
+        8.3
+    """
+
+    # Convenient access (mean values)
     mtld_forward: float
     mtld_backward: float
     mtld_average: float
+
+    # Full distributions
+    mtld_forward_dist: Distribution
+    mtld_backward_dist: Distribution
+    mtld_average_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]


 @dataclass
 class YuleResult:
-    """Result from Yule's K and I computation.
+    """Result from Yule's K and I computation.
+
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Example:
+        >>> result = compute_yule(text, chunk_size=1000)
+        >>> result.yule_k  # mean across chunks
+        120.5
+        >>> result.yule_k_dist.std  # variance reveals fingerprint
+        15.2
+    """

+    # Convenient access (mean values)
     yule_k: float
     yule_i: float
+
+    # Full distributions
+    yule_k_dist: Distribution
+    yule_i_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]


 @dataclass
 class HapaxResult:
-    """Result from Hapax Legomena analysis.
+    """Result from Hapax Legomena analysis.
+
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Example:
+        >>> result = compute_hapax(text, chunk_size=1000)
+        >>> result.hapax_ratio  # mean across chunks
+        0.45
+        >>> result.hapax_ratio_dist.std  # variance
+        0.08
+    """
+
+    # Convenient access (mean/total values)
+    hapax_count: int  # Total across all chunks
+    hapax_ratio: float  # Mean ratio
+    dis_hapax_count: int  # Total across all chunks
+    dis_hapax_ratio: float  # Mean ratio
+    sichel_s: float  # Mean
+    honore_r: float  # Mean
+
+    # Full distributions (ratios only - counts don't distribute meaningfully)
+    hapax_ratio_dist: Distribution
+    dis_hapax_ratio_dist: Distribution
+    sichel_s_dist: Distribution
+    honore_r_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
+    metadata: dict[str, Any]
+
+
+@dataclass
+class LexiconCategories:
+    """Categorization of words by lexicon presence."""
+
+    neologisms: list[str]  # Not in WordNet AND not in BNC
+    rare_words: list[str]  # In one lexicon but not both
+    common_words: list[str]  # In both WordNet AND BNC
+    neologism_ratio: float  # Ratio of neologisms to total hapax
+    rare_word_ratio: float  # Ratio of rare words to total hapax
+    metadata: dict[str, Any]
+
+
+@dataclass
+class HapaxLexiconResult:
+    """Result from Hapax Legomena analysis with lexicon categorization.
+
+    Extends basic hapax analysis by categorizing hapax legomena based on
+    presence in WordNet and British National Corpus (BNC):
+
+    - Neologisms: Words not in WordNet AND not in BNC (true novel words)
+    - Rare words: Words in BNC but not WordNet, or vice versa
+    - Common words: Words in both lexicons (just happen to appear once in text)
+
+    This categorization is valuable for stylometric analysis as it distinguishes
+    between vocabulary innovation (neologisms) and incidental hapax occurrence
+    (common words that appear once).
+    """
+
+    hapax_result: HapaxResult  # Standard hapax metrics
+    lexicon_analysis: LexiconCategories  # Lexicon-based categorization
+    metadata: dict[str, Any]
+
+
+@dataclass
+class TTRResult:
+    """Result from Type-Token Ratio (TTR) analysis.
+
+    Wraps stylometry-ttr package functionality to measure vocabulary richness
+    through the ratio of unique words (types) to total words (tokens).
+
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.
+
+    Includes multiple TTR variants for length normalization:
+    - Raw TTR: Direct ratio of unique to total words
+    - Root TTR (Guiraud's index): types / sqrt(tokens)
+    - Log TTR (Herdan's C): log(types) / log(tokens)
+    - STTR: Standardized TTR across fixed-size chunks
+    - Delta Std: Measures vocabulary consistency across chunks
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    References:
+        Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
+        Herdan, G. (1960). Type-token Mathematics.
+
+    Example:
+        >>> result = compute_ttr(text, chunk_size=1000)
+        >>> result.ttr  # mean TTR across chunks
+        0.42
+        >>> result.ttr_dist.std  # TTR variance reveals fingerprint
+        0.05
+        >>> result.chunk_count
+        59
+    """
+
+    # Convenient access (mean values)
+    total_words: int
+    unique_words: int
+    ttr: float  # Raw TTR (mean)
+    root_ttr: float  # Guiraud's index (mean)
+    log_ttr: float  # Herdan's C (mean)
+    sttr: float  # Standardized TTR (mean)
+    delta_std: float  # Vocabulary consistency (mean)
+
+    # Full distributions with per-chunk values
+    ttr_dist: Distribution
+    root_ttr_dist: Distribution
+    log_ttr_dist: Distribution
+    sttr_dist: Distribution
+    delta_std_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int

-    hapax_count: int
-    hapax_ratio: float
-    dis_hapax_count: int
-    dis_hapax_ratio: float
-    sichel_s: float
-    honore_r: float
     metadata: dict[str, Any]

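The hunk above introduces the chunking primitives that the rest of this diff builds on: Distribution, chunk_text(), and make_distribution(). A minimal sketch of how they compose, assuming the three names are importable from pystylometry._types (the module shown here; whether they are re-exported elsewhere is not confirmed by this diff):

# Sketch: fold a per-chunk metric into a Distribution.
# Assumption: Distribution, chunk_text, and make_distribution are importable
# from pystylometry._types exactly as defined in the hunk above.
from pystylometry._types import Distribution, chunk_text, make_distribution

def ttr_distribution(text: str, chunk_size: int = 1000) -> Distribution:
    # One plain type-token ratio per word-based chunk (illustration only;
    # the package's own TTR metric is richer than this).
    values = []
    for chunk in chunk_text(text, chunk_size):
        words = chunk.split()
        values.append(len(set(words)) / len(words) if words else 0.0)
    return make_distribution(values)  # mean, median, std, range, iqr

dist = ttr_distribution("one two two three three three " * 400)
print(dist.mean, dist.std, len(dist.values))

The same pattern, one value per chunk summarized by make_distribution(), is what the _dist fields added throughout the remaining hunks store.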
@@ -45,48 +375,135 @@ class HapaxResult:

 @dataclass
 class FleschResult:
-    """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation.
+    """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation.
+
+    All numeric metrics include both a mean value (convenient access) and
+    a full distribution with per-chunk values and statistics.

+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+    Example:
+        >>> result = compute_flesch(text, chunk_size=1000)
+        >>> result.reading_ease  # mean across chunks
+        68.54
+        >>> result.reading_ease_dist.std  # variance reveals fingerprint
+        4.2
+        >>> result.reading_ease_dist.values  # per-chunk values
+        [65.2, 71.1, 68.8, ...]
+    """
+
+    # Convenient access (mean values)
     reading_ease: float
     grade_level: float
-    difficulty: str  #
+    difficulty: str  # Based on mean reading_ease
+
+    # Full distributions
+    reading_ease_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]


 @dataclass
 class SMOGResult:
-    """Result from SMOG Index computation.
+    """Result from SMOG Index computation.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """

+    # Convenient access (mean values)
     smog_index: float
-    grade_level:
+    grade_level: float
+
+    # Full distributions
+    smog_index_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]


 @dataclass
 class GunningFogResult:
-    """Result from Gunning Fog Index computation.
+    """Result from Gunning Fog Index computation.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """

+    # Convenient access (mean values)
     fog_index: float
-    grade_level:
+    grade_level: float
+
+    # Full distributions
+    fog_index_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]


 @dataclass
 class ColemanLiauResult:
-    """Result from Coleman-Liau Index computation.
+    """Result from Coleman-Liau Index computation.

+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
+
+    # Convenient access (mean values)
     cli_index: float
-    grade_level:
+    grade_level: float  # Changed to float for mean across chunks
+
+    # Full distributions
+    cli_index_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]


 @dataclass
 class ARIResult:
-    """Result from Automated Readability Index computation.
+    """Result from Automated Readability Index computation.

+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """
+
+    # Convenient access (mean values)
     ari_score: float
-    grade_level:
-    age_range: str
+    grade_level: float  # Changed to float for mean across chunks
+    age_range: str  # Based on mean grade level
+
+    # Full distributions
+    ari_score_dist: Distribution
+    grade_level_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]

@@ -95,8 +512,14 @@ class ARIResult:

 @dataclass
 class POSResult:
-    """Result from Part-of-Speech ratio analysis.
+    """Result from Part-of-Speech ratio analysis.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """

+    # Convenient access (mean values)
     noun_ratio: float
     verb_ratio: float
     adjective_ratio: float
@@ -105,19 +528,52 @@ class POSResult:
     adjective_noun_ratio: float
     lexical_density: float
     function_word_ratio: float
+
+    # Full distributions
+    noun_ratio_dist: Distribution
+    verb_ratio_dist: Distribution
+    adjective_ratio_dist: Distribution
+    adverb_ratio_dist: Distribution
+    noun_verb_ratio_dist: Distribution
+    adjective_noun_ratio_dist: Distribution
+    lexical_density_dist: Distribution
+    function_word_ratio_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]


 @dataclass
 class SentenceStatsResult:
-    """Result from sentence-level statistics.
+    """Result from sentence-level statistics.
+
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+    """

+    # Convenient access (mean values)
     mean_sentence_length: float
     sentence_length_std: float
-    sentence_length_range:
-    min_sentence_length:
-    max_sentence_length:
-    sentence_count: int
+    sentence_length_range: float  # Changed to float for mean across chunks
+    min_sentence_length: float  # Changed to float for mean across chunks
+    max_sentence_length: float  # Changed to float for mean across chunks
+    sentence_count: int  # Total across all chunks
+
+    # Full distributions
+    mean_sentence_length_dist: Distribution
+    sentence_length_std_dist: Distribution
+    sentence_length_range_dist: Distribution
+    min_sentence_length_dist: Distribution
+    max_sentence_length_dist: Distribution
+
+    # Chunking context
+    chunk_size: int
+    chunk_count: int
+
     metadata: dict[str, Any]

@@ -149,11 +605,1481 @@ class ZetaResult:
|
|
|
149
605
|
|
|
150
606
|
@dataclass
|
|
151
607
|
class EntropyResult:
|
|
152
|
-
"""Result from n-gram entropy computation.
|
|
608
|
+
"""Result from n-gram entropy computation.
|
|
153
609
|
|
|
610
|
+
Related GitHub Issue:
|
|
611
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
612
|
+
https://github.com/craigtrim/pystylometry/issues/27
|
|
613
|
+
"""
|
|
614
|
+
|
|
615
|
+
# Convenient access (mean values)
|
|
154
616
|
entropy: float
|
|
155
617
|
perplexity: float
|
|
156
618
|
ngram_type: str # "character_bigram", "word_bigram", "word_trigram"
|
|
619
|
+
|
|
620
|
+
# Full distributions
|
|
621
|
+
entropy_dist: Distribution
|
|
622
|
+
perplexity_dist: Distribution
|
|
623
|
+
|
|
624
|
+
# Chunking context
|
|
625
|
+
chunk_size: int
|
|
626
|
+
chunk_count: int
|
|
627
|
+
|
|
628
|
+
metadata: dict[str, Any]
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
# ===== Character-Level Results =====
|
|
632
|
+
# Related to GitHub Issue #12: Character-Level Metrics
|
|
633
|
+
# https://github.com/craigtrim/pystylometry/issues/12
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
@dataclass
|
|
637
|
+
class CharacterMetricsResult:
|
|
638
|
+
"""Result from character-level metrics analysis.
|
|
639
|
+
|
|
640
|
+
This dataclass holds character-level stylometric features that provide
|
|
641
|
+
low-level insights into writing style. Character-level metrics are
|
|
642
|
+
fundamental for authorship attribution and can capture distinctive
|
|
643
|
+
patterns in punctuation, formatting, and word construction.
|
|
644
|
+
|
|
645
|
+
Related GitHub Issues:
|
|
646
|
+
#12 - Character-Level Metrics
|
|
647
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
648
|
+
|
|
649
|
+
Metrics included:
|
|
650
|
+
- Average word length (characters per word)
|
|
651
|
+
- Average sentence length (characters per sentence)
|
|
652
|
+
- Punctuation density (punctuation marks per 100 words)
|
|
653
|
+
- Punctuation variety (count of unique punctuation types)
|
|
654
|
+
- Letter frequency distribution (26-element vector for a-z)
|
|
655
|
+
- Vowel-to-consonant ratio
|
|
656
|
+
- Digit frequency (count/ratio of numeric characters)
|
|
657
|
+
- Uppercase ratio (uppercase letters / total letters)
|
|
658
|
+
- Whitespace ratio (whitespace characters / total characters)
|
|
659
|
+
|
|
660
|
+
References:
|
|
661
|
+
Grieve, J. (2007). Quantitative authorship attribution: An evaluation
|
|
662
|
+
of techniques. Literary and Linguistic Computing, 22(3), 251-270.
|
|
663
|
+
Stamatatos, E. (2009). A survey of modern authorship attribution methods.
|
|
664
|
+
JASIST, 60(3), 538-556.
|
|
665
|
+
"""
|
|
666
|
+
|
|
667
|
+
# Convenient access (mean values)
|
|
668
|
+
avg_word_length: float
|
|
669
|
+
avg_sentence_length_chars: float
|
|
670
|
+
punctuation_density: float
|
|
671
|
+
punctuation_variety: float # Changed to float for mean across chunks
|
|
672
|
+
letter_frequency: dict[str, float] # Aggregate frequency
|
|
673
|
+
vowel_consonant_ratio: float
|
|
674
|
+
digit_count: int # Total across all chunks
|
|
675
|
+
digit_ratio: float
|
|
676
|
+
uppercase_ratio: float
|
|
677
|
+
whitespace_ratio: float
|
|
678
|
+
|
|
679
|
+
# Full distributions
|
|
680
|
+
avg_word_length_dist: Distribution
|
|
681
|
+
avg_sentence_length_chars_dist: Distribution
|
|
682
|
+
punctuation_density_dist: Distribution
|
|
683
|
+
punctuation_variety_dist: Distribution
|
|
684
|
+
vowel_consonant_ratio_dist: Distribution
|
|
685
|
+
digit_ratio_dist: Distribution
|
|
686
|
+
uppercase_ratio_dist: Distribution
|
|
687
|
+
whitespace_ratio_dist: Distribution
|
|
688
|
+
|
|
689
|
+
# Chunking context
|
|
690
|
+
chunk_size: int
|
|
691
|
+
chunk_count: int
|
|
692
|
+
|
|
693
|
+
metadata: dict[str, Any]
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
# ===== Function Word Results =====
|
|
697
|
+
# Related to GitHub Issue #13: Function Word Analysis
|
|
698
|
+
# https://github.com/craigtrim/pystylometry/issues/13
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
@dataclass
|
|
702
|
+
class FunctionWordResult:
|
|
703
|
+
"""Result from function word analysis.
|
|
704
|
+
|
|
705
|
+
Function words (determiners, prepositions, conjunctions, pronouns, auxiliary
|
|
706
|
+
verbs) are highly frequent, content-independent words that are often used
|
|
707
|
+
subconsciously. They are considered strong authorship markers because authors
|
|
708
|
+
use them consistently across different topics and genres.
|
|
709
|
+
|
|
710
|
+
Related GitHub Issues:
|
|
711
|
+
#13 - Function Word Analysis
|
|
712
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
713
|
+
|
|
714
|
+
This analysis computes:
|
|
715
|
+
- Frequency profiles for all function word categories
|
|
716
|
+
- Ratios for specific grammatical categories
|
|
717
|
+
- Most/least frequently used function words
|
|
718
|
+
- Function word diversity metrics
|
|
719
|
+
|
|
720
|
+
Function word categories analyzed:
|
|
721
|
+
- Determiners: the, a, an, this, that, these, those, etc.
|
|
722
|
+
- Prepositions: in, on, at, by, for, with, from, to, etc.
|
|
723
|
+
- Conjunctions: and, but, or, nor, for, yet, so, etc.
|
|
724
|
+
- Pronouns: I, you, he, she, it, we, they, etc.
|
|
725
|
+
- Auxiliary verbs: be, have, do, can, will, shall, may, etc.
|
|
726
|
+
- Particles: up, down, out, off, over, etc.
|
|
727
|
+
|
|
728
|
+
References:
|
|
729
|
+
Mosteller, F., & Wallace, D. L. (1964). Inference and disputed authorship:
|
|
730
|
+
The Federalist. Addison-Wesley.
|
|
731
|
+
Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
|
|
732
|
+
to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
|
|
733
|
+
"""
|
|
734
|
+
|
|
735
|
+
# Convenient access (mean values)
|
|
736
|
+
determiner_ratio: float
|
|
737
|
+
preposition_ratio: float
|
|
738
|
+
conjunction_ratio: float
|
|
739
|
+
pronoun_ratio: float
|
|
740
|
+
auxiliary_ratio: float
|
|
741
|
+
particle_ratio: float
|
|
742
|
+
total_function_word_ratio: float
|
|
743
|
+
function_word_diversity: float
|
|
744
|
+
most_frequent_function_words: list[tuple[str, int]] # Aggregate
|
|
745
|
+
least_frequent_function_words: list[tuple[str, int]] # Aggregate
|
|
746
|
+
function_word_distribution: dict[str, int] # Aggregate
|
|
747
|
+
|
|
748
|
+
# Full distributions
|
|
749
|
+
determiner_ratio_dist: Distribution
|
|
750
|
+
preposition_ratio_dist: Distribution
|
|
751
|
+
conjunction_ratio_dist: Distribution
|
|
752
|
+
pronoun_ratio_dist: Distribution
|
|
753
|
+
auxiliary_ratio_dist: Distribution
|
|
754
|
+
particle_ratio_dist: Distribution
|
|
755
|
+
total_function_word_ratio_dist: Distribution
|
|
756
|
+
function_word_diversity_dist: Distribution
|
|
757
|
+
|
|
758
|
+
# Chunking context
|
|
759
|
+
chunk_size: int
|
|
760
|
+
chunk_count: int
|
|
761
|
+
|
|
762
|
+
metadata: dict[str, Any]
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
# ===== Advanced Lexical Diversity Results =====
|
|
766
|
+
# Related to GitHub Issue #14: Advanced Lexical Diversity Metrics
|
|
767
|
+
# https://github.com/craigtrim/pystylometry/issues/14
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
@dataclass
|
|
771
|
+
class VocdDResult:
|
|
772
|
+
"""Result from voc-D computation.
|
|
773
|
+
|
|
774
|
+
voc-D is a sophisticated measure of lexical diversity that uses a mathematical
|
|
775
|
+
model to estimate vocabulary richness while controlling for text length.
|
|
776
|
+
It fits a curve to the relationship between tokens and types across multiple
|
|
777
|
+
random samples of the text.
|
|
778
|
+
|
|
779
|
+
Related GitHub Issues:
|
|
780
|
+
#14 - Advanced Lexical Diversity Metrics
|
|
781
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
782
|
+
|
|
783
|
+
The D parameter represents the theoretical vocabulary size and is more
|
|
784
|
+
stable across different text lengths than simple TTR measures.
|
|
785
|
+
|
|
786
|
+
References:
|
|
787
|
+
Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004).
|
|
788
|
+
Lexical Diversity and Language Development. Palgrave Macmillan.
|
|
789
|
+
McKee, G., Malvern, D., & Richards, B. (2000). Measuring vocabulary
|
|
790
|
+
diversity using dedicated software. Literary and Linguistic Computing,
|
|
791
|
+
15(3), 323-337.
|
|
792
|
+
"""
|
|
793
|
+
|
|
794
|
+
# Convenient access (mean values)
|
|
795
|
+
d_parameter: float
|
|
796
|
+
curve_fit_r_squared: float
|
|
797
|
+
sample_count: int # Total across all chunks
|
|
798
|
+
optimal_sample_size: int
|
|
799
|
+
|
|
800
|
+
# Full distributions
|
|
801
|
+
d_parameter_dist: Distribution
|
|
802
|
+
curve_fit_r_squared_dist: Distribution
|
|
803
|
+
|
|
804
|
+
# Chunking context
|
|
805
|
+
chunk_size: int
|
|
806
|
+
chunk_count: int
|
|
807
|
+
|
|
808
|
+
metadata: dict[str, Any]
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
@dataclass
|
|
812
|
+
class MATTRResult:
|
|
813
|
+
"""Result from MATTR (Moving-Average Type-Token Ratio) computation.
|
|
814
|
+
|
|
815
|
+
MATTR computes TTR using a moving window of fixed size, which provides
|
|
816
|
+
a more stable measure of lexical diversity than simple TTR, especially
|
|
817
|
+
for longer texts. The moving window approach reduces the impact of text
|
|
818
|
+
length on the TTR calculation.
|
|
819
|
+
|
|
820
|
+
Related GitHub Issues:
|
|
821
|
+
#14 - Advanced Lexical Diversity Metrics
|
|
822
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
823
|
+
|
|
824
|
+
References:
|
|
825
|
+
Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
|
|
826
|
+
The moving-average type-token ratio (MATTR). Journal of Quantitative
|
|
827
|
+
Linguistics, 17(2), 94-100.
|
|
828
|
+
"""
|
|
829
|
+
|
|
830
|
+
# Convenient access (mean values)
|
|
831
|
+
mattr_score: float
|
|
832
|
+
window_size: int
|
|
833
|
+
window_count: int # Total across all chunks
|
|
834
|
+
ttr_std_dev: float
|
|
835
|
+
min_ttr: float
|
|
836
|
+
max_ttr: float
|
|
837
|
+
|
|
838
|
+
# Full distributions
|
|
839
|
+
mattr_score_dist: Distribution
|
|
840
|
+
ttr_std_dev_dist: Distribution
|
|
841
|
+
min_ttr_dist: Distribution
|
|
842
|
+
max_ttr_dist: Distribution
|
|
843
|
+
|
|
844
|
+
# Chunking context
|
|
845
|
+
chunk_size: int
|
|
846
|
+
chunk_count: int
|
|
847
|
+
|
|
848
|
+
metadata: dict[str, Any]
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
@dataclass
|
|
852
|
+
class HDDResult:
|
|
853
|
+
"""Result from HD-D (Hypergeometric Distribution D) computation.
|
|
854
|
+
|
|
855
|
+
HD-D is a probabilistic measure of lexical diversity based on the
|
|
856
|
+
hypergeometric distribution. It estimates the probability of encountering
|
|
857
|
+
new word types as text length increases, providing a mathematically
|
|
858
|
+
rigorous measure that is less sensitive to text length than TTR.
|
|
859
|
+
|
|
860
|
+
Related GitHub Issues:
|
|
861
|
+
#14 - Advanced Lexical Diversity Metrics
|
|
862
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
863
|
+
|
|
864
|
+
References:
|
|
865
|
+
McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
|
|
866
|
+
study of sophisticated approaches to lexical diversity assessment.
|
|
867
|
+
Behavior Research Methods, 42(2), 381-392.
|
|
868
|
+
"""
|
|
869
|
+
|
|
870
|
+
# Convenient access (mean values)
|
|
871
|
+
hdd_score: float
|
|
872
|
+
sample_size: int
|
|
873
|
+
type_count: int # Total unique across all chunks
|
|
874
|
+
token_count: int # Total across all chunks
|
|
875
|
+
|
|
876
|
+
# Full distributions
|
|
877
|
+
hdd_score_dist: Distribution
|
|
878
|
+
|
|
879
|
+
# Chunking context
|
|
880
|
+
chunk_size: int
|
|
881
|
+
chunk_count: int
|
|
882
|
+
|
|
883
|
+
metadata: dict[str, Any]
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
@dataclass
|
|
887
|
+
class MSTTRResult:
|
|
888
|
+
"""Result from MSTTR (Mean Segmental Type-Token Ratio) computation.
|
|
889
|
+
|
|
890
|
+
MSTTR divides the text into sequential segments of equal length and
|
|
891
|
+
computes the average TTR across all segments. This provides a length-
|
|
892
|
+
normalized measure of lexical diversity that is more comparable across
|
|
893
|
+
texts of different lengths.
|
|
894
|
+
|
|
895
|
+
Related GitHub Issues:
|
|
896
|
+
#14 - Advanced Lexical Diversity Metrics
|
|
897
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
898
|
+
|
|
899
|
+
References:
|
|
900
|
+
Johnson, W. (1944). Studies in language behavior: I. A program of research.
|
|
901
|
+
Psychological Monographs, 56(2), 1-15.
|
|
902
|
+
"""
|
|
903
|
+
|
|
904
|
+
# Convenient access (mean values)
|
|
905
|
+
msttr_score: float
|
|
906
|
+
segment_size: int
|
|
907
|
+
segment_count: int # Total across all chunks
|
|
908
|
+
ttr_std_dev: float
|
|
909
|
+
min_ttr: float
|
|
910
|
+
max_ttr: float
|
|
911
|
+
segment_ttrs: list[float] # Aggregate from all chunks
|
|
912
|
+
|
|
913
|
+
# Full distributions
|
|
914
|
+
msttr_score_dist: Distribution
|
|
915
|
+
ttr_std_dev_dist: Distribution
|
|
916
|
+
min_ttr_dist: Distribution
|
|
917
|
+
max_ttr_dist: Distribution
|
|
918
|
+
|
|
919
|
+
# Chunking context
|
|
920
|
+
chunk_size: int
|
|
921
|
+
chunk_count: int
|
|
922
|
+
|
|
923
|
+
metadata: dict[str, Any]
|
|
924
|
+
|
|
925
|
+
|
|
926
|
+
# ===== Word Frequency Sophistication Results =====
|
|
927
|
+
# Related to GitHub Issue #15: Word Frequency Sophistication Metrics
|
|
928
|
+
# https://github.com/craigtrim/pystylometry/issues/15
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
@dataclass
|
|
932
|
+
class WordFrequencySophisticationResult:
|
|
933
|
+
"""Result from word frequency sophistication analysis.
|
|
934
|
+
|
|
935
|
+
Word frequency sophistication metrics measure how common or rare the
|
|
936
|
+
vocabulary used in a text is, based on reference frequency lists from
|
|
937
|
+
large corpora. Authors who use less frequent (more sophisticated) words
|
|
938
|
+
score higher on these metrics.
|
|
939
|
+
|
|
940
|
+
Related GitHub Issues:
|
|
941
|
+
#15 - Word Frequency Sophistication Metrics
|
|
942
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
943
|
+
|
|
944
|
+
This analysis uses reference frequency data from:
|
|
945
|
+
- COCA (Corpus of Contemporary American English)
|
|
946
|
+
- BNC (British National Corpus)
|
|
947
|
+
- Google N-grams
|
|
948
|
+
- SUBTLEXus (subtitle frequencies)
|
|
949
|
+
|
|
950
|
+
Metrics computed:
|
|
951
|
+
- Mean word frequency (average frequency rank)
|
|
952
|
+
- Median word frequency
|
|
953
|
+
- Rare word ratio (words beyond frequency threshold)
|
|
954
|
+
- Academic word ratio (from Academic Word List)
|
|
955
|
+
- Advanced word ratio (sophisticated vocabulary)
|
|
956
|
+
|
|
957
|
+
References:
|
|
958
|
+
Brysbaert, M., & New, B. (2009). Moving beyond Kučera and Francis:
|
|
959
|
+
A critical evaluation of current word frequency norms. Behavior
|
|
960
|
+
Research Methods, Instruments, & Computers, 41(4), 977-990.
|
|
961
|
+
Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
|
|
962
|
+
"""
|
|
963
|
+
|
|
964
|
+
# Convenient access (mean values)
|
|
965
|
+
mean_frequency_rank: float
|
|
966
|
+
median_frequency_rank: float
|
|
967
|
+
rare_word_ratio: float
|
|
968
|
+
common_word_ratio: float
|
|
969
|
+
academic_word_ratio: float
|
|
970
|
+
advanced_word_ratio: float
|
|
971
|
+
frequency_band_distribution: dict[str, float] # Aggregate
|
|
972
|
+
rarest_words: list[tuple[str, float]] # Aggregate
|
|
973
|
+
most_common_words: list[tuple[str, float]] # Aggregate
|
|
974
|
+
|
|
975
|
+
# Full distributions
|
|
976
|
+
mean_frequency_rank_dist: Distribution
|
|
977
|
+
median_frequency_rank_dist: Distribution
|
|
978
|
+
rare_word_ratio_dist: Distribution
|
|
979
|
+
common_word_ratio_dist: Distribution
|
|
980
|
+
academic_word_ratio_dist: Distribution
|
|
981
|
+
advanced_word_ratio_dist: Distribution
|
|
982
|
+
|
|
983
|
+
# Chunking context
|
|
984
|
+
chunk_size: int
|
|
985
|
+
chunk_count: int
|
|
986
|
+
|
|
987
|
+
metadata: dict[str, Any]
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
# ===== Additional Readability Results =====
|
|
991
|
+
# Related to GitHub Issue #16: Additional Readability Formulas
|
|
992
|
+
# https://github.com/craigtrim/pystylometry/issues/16
|
|
993
|
+
|
|
994
|
+
|
|
995
|
+
@dataclass
|
|
996
|
+
class DaleChallResult:
|
|
997
|
+
"""Result from Dale-Chall Readability Formula.
|
|
998
|
+
|
|
999
|
+
The Dale-Chall formula uses a list of 3000 familiar words that 80% of
|
|
1000
|
+
fourth-graders understand. Words not on this list are considered "difficult."
|
|
1001
|
+
The formula provides a grade level estimate based on sentence length and
|
|
1002
|
+
the percentage of difficult words.
|
|
1003
|
+
|
|
1004
|
+
Related GitHub Issues:
|
|
1005
|
+
#16 - Additional Readability Formulas
|
|
1006
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
1007
|
+
|
|
1008
|
+
Formula: 0.1579 * (difficult_words / total_words * 100) + 0.0496 * avg_sentence_length
|
|
1009
|
+
|
|
1010
|
+
If % difficult words > 5%, add 3.6365 to the raw score.
|
|
1011
|
+
|
|
1012
|
+
References:
|
|
1013
|
+
Dale, E., & Chall, J. S. (1948). A formula for predicting readability.
|
|
1014
|
+
Educational Research Bulletin, 27(1), 11-28.
|
|
1015
|
+
Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall
|
|
1016
|
+
readability formula. Brookline Books.
|
|
1017
|
+
"""
|
|
1018
|
+
|
|
1019
|
+
# Convenient access (mean values)
|
|
1020
|
+
dale_chall_score: float
|
|
1021
|
+
grade_level: str # Based on mean score
|
|
1022
|
+
difficult_word_count: int # Total across all chunks
|
|
1023
|
+
difficult_word_ratio: float # Mean ratio
|
|
1024
|
+
avg_sentence_length: float # Mean
|
|
1025
|
+
total_words: int # Total across all chunks
|
|
1026
|
+
|
|
1027
|
+
# Full distributions
|
|
1028
|
+
dale_chall_score_dist: Distribution
|
|
1029
|
+
difficult_word_ratio_dist: Distribution
|
|
1030
|
+
avg_sentence_length_dist: Distribution
|
|
1031
|
+
|
|
1032
|
+
# Chunking context
|
|
1033
|
+
chunk_size: int
|
|
1034
|
+
chunk_count: int
|
|
1035
|
+
|
|
1036
|
+
metadata: dict[str, Any]
|
|
1037
|
+
|
|
1038
|
+
|
|
1039
|
+
@dataclass
|
|
1040
|
+
class LinsearWriteResult:
|
|
1041
|
+
"""Result from Linsear Write Formula.
|
|
1042
|
+
|
|
1043
|
+
The Linsear Write Formula was developed for the U.S. Air Force to calculate
|
|
1044
|
+
the readability of technical manuals. It categorizes words as "easy" (1-2
|
|
1045
|
+
syllables) or "hard" (3+ syllables) and uses sentence length to estimate
|
|
1046
|
+
grade level. It's particularly effective for technical writing.
|
|
1047
|
+
|
|
1048
|
+
Related GitHub Issues:
|
|
1049
|
+
#16 - Additional Readability Formulas
|
|
1050
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
1051
|
+
|
|
1052
|
+
References:
|
|
1053
|
+
Klare, G. R. (1974-1975). Assessing readability. Reading Research Quarterly,
|
|
1054
|
+
10(1), 62-102.
|
|
1055
|
+
"""
|
|
1056
|
+
|
|
1057
|
+
# Convenient access (mean values)
|
|
1058
|
+
linsear_score: float
|
|
1059
|
+
grade_level: float # Changed to float for mean across chunks
|
|
1060
|
+
easy_word_count: int # Total across all chunks
|
|
1061
|
+
hard_word_count: int # Total across all chunks
|
|
1062
|
+
avg_sentence_length: float # Mean
|
|
1063
|
+
|
|
1064
|
+
# Full distributions
|
|
1065
|
+
linsear_score_dist: Distribution
|
|
1066
|
+
grade_level_dist: Distribution
|
|
1067
|
+
avg_sentence_length_dist: Distribution
|
|
1068
|
+
|
|
1069
|
+
# Chunking context
|
|
1070
|
+
chunk_size: int
|
|
1071
|
+
chunk_count: int
|
|
1072
|
+
|
|
1073
|
+
metadata: dict[str, Any]
|
|
1074
|
+
|
|
1075
|
+
|
|
1076
|
+
@dataclass
|
|
1077
|
+
class FryResult:
|
|
1078
|
+
"""Result from Fry Readability Graph.
|
|
1079
|
+
|
|
1080
|
+
The Fry Readability Graph uses average sentence length and average syllables
|
|
1081
|
+
per word to determine reading difficulty. It plots these values on a graph
|
|
1082
|
+
to determine the grade level. This implementation provides the numerical
|
|
1083
|
+
coordinates and estimated grade level.
|
|
1084
|
+
|
|
1085
|
+
Related GitHub Issues:
|
|
1086
|
+
#16 - Additional Readability Formulas
|
|
1087
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
1088
|
+
|
|
1089
|
+
References:
|
|
1090
|
+
Fry, E. (1968). A readability formula that saves time. Journal of Reading,
|
|
1091
|
+
11(7), 513-578.
|
|
1092
|
+
Fry, E. (1977). Fry's readability graph: Clarifications, validity, and
|
|
1093
|
+
extension to level 17. Journal of Reading, 21(3), 242-252.
|
|
1094
|
+
"""
|
|
1095
|
+
|
|
1096
|
+
# Convenient access (mean values)
|
|
1097
|
+
avg_sentence_length: float
|
|
1098
|
+
avg_syllables_per_100: float
|
|
1099
|
+
grade_level: str # Based on mean coordinates
|
|
1100
|
+
graph_zone: str # Based on mean coordinates
|
|
1101
|
+
|
|
1102
|
+
# Full distributions
|
|
1103
|
+
avg_sentence_length_dist: Distribution
|
|
1104
|
+
avg_syllables_per_100_dist: Distribution
|
|
1105
|
+
|
|
1106
|
+
# Chunking context
|
|
1107
|
+
chunk_size: int
|
|
1108
|
+
chunk_count: int
|
|
1109
|
+
|
|
1110
|
+
metadata: dict[str, Any]
|
|
1111
|
+
|
|
1112
|
+
|
|
1113
|
+
@dataclass
|
|
1114
|
+
class FORCASTResult:
|
|
1115
|
+
"""Result from FORCAST Readability Formula.
|
|
1116
|
+
|
|
1117
|
+
FORCAST (FORmula for CASTing readability) was developed by the U.S. military
|
|
1118
|
+
to assess readability without counting syllables. It uses only single-syllable
|
|
1119
|
+
words as a measure, making it faster to compute than syllable-based formulas.
|
|
1120
|
+
Particularly useful for technical and military documents.
|
|
1121
|
+
|
|
1122
|
+
Related GitHub Issues:
|
|
1123
|
+
#16 - Additional Readability Formulas
|
|
1124
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
1125
|
+
|
|
1126
|
+
Formula: 20 - (N / 10), where N is the number of single-syllable words
|
|
1127
|
+
per 150-word sample.
|
|
1128
|
+
|
|
1129
|
+
References:
|
|
1130
|
+
Caylor, J. S., Sticht, T. G., Fox, L. C., & Ford, J. P. (1973).
|
|
1131
|
+
Methodologies for determining reading requirements of military
|
|
1132
|
+
occupational specialties. Human Resources Research Organization.
|
|
1133
|
+
"""
|
|
1134
|
+
|
|
1135
|
+
# Convenient access (mean values)
|
|
1136
|
+
forcast_score: float
|
|
1137
|
+
grade_level: float # Changed to float for mean across chunks
|
|
1138
|
+
single_syllable_ratio: float # Mean ratio
|
|
1139
|
+
single_syllable_count: int # Total across all chunks
|
|
1140
|
+
total_words: int # Total across all chunks
|
|
1141
|
+
|
|
1142
|
+
# Full distributions
|
|
1143
|
+
forcast_score_dist: Distribution
|
|
1144
|
+
grade_level_dist: Distribution
|
|
1145
|
+
single_syllable_ratio_dist: Distribution
|
|
1146
|
+
|
|
1147
|
+
# Chunking context
|
|
1148
|
+
chunk_size: int
|
|
1149
|
+
chunk_count: int
|
|
1150
|
+
|
|
1151
|
+
metadata: dict[str, Any]
|
|
1152
|
+
|
|
1153
|
+
|
|
1154
|
+
@dataclass
|
|
1155
|
+
class PowersSumnerKearlResult:
|
|
1156
|
+
"""Result from Powers-Sumner-Kearl Readability Formula.
|
|
1157
|
+
|
|
1158
|
+
The Powers-Sumner-Kearl formula is a variation of the Flesch Reading Ease
|
|
1159
|
+
formula, recalibrated for primary grade levels (grades 1-4). It uses
|
|
1160
|
+
average sentence length and average syllables per word, but with different
|
|
1161
|
+
coefficients optimized for younger readers.
|
|
1162
|
+
|
|
1163
|
+
Related GitHub Issues:
|
|
1164
|
+
#16 - Additional Readability Formulas
|
|
1165
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
1166
|
+
|
|
1167
|
+
Formula: 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
|
|
1168
|
+
|
|
1169
|
+
References:
|
|
1170
|
+
Powers, R. D., Sumner, W. A., & Kearl, B. E. (1958). A recalculation of
|
|
1171
|
+
four adult readability formulas. Journal of Educational Psychology,
|
|
1172
|
+
49(2), 99-105.
|
|
1173
|
+
"""
|
|
1174
|
+
|
|
1175
|
+
# Convenient access (mean values)
|
|
1176
|
+
psk_score: float
|
|
1177
|
+
grade_level: float
|
|
1178
|
+
avg_sentence_length: float
|
|
1179
|
+
avg_syllables_per_word: float
|
|
1180
|
+
total_sentences: int # Total across all chunks
|
|
1181
|
+
total_words: int # Total across all chunks
|
|
1182
|
+
total_syllables: int # Total across all chunks
|
|
1183
|
+
|
|
1184
|
+
# Full distributions
|
|
1185
|
+
psk_score_dist: Distribution
|
|
1186
|
+
grade_level_dist: Distribution
|
|
1187
|
+
avg_sentence_length_dist: Distribution
|
|
1188
|
+
avg_syllables_per_word_dist: Distribution
|
|
1189
|
+
|
|
1190
|
+
# Chunking context
|
|
1191
|
+
chunk_size: int
|
|
1192
|
+
chunk_count: int
|
|
1193
|
+
|
|
1194
|
+
metadata: dict[str, Any]
|
|
1195
|
+
|
|
1196
|
+
|
|
1197
|
+
# ===== Advanced Syntactic Results =====
|
|
1198
|
+
# Related to GitHub Issue #17: Advanced Syntactic Analysis
|
|
1199
|
+
# https://github.com/craigtrim/pystylometry/issues/17
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
@dataclass
|
|
1203
|
+
class AdvancedSyntacticResult:
|
|
1204
|
+
"""Result from advanced syntactic analysis using dependency parsing.
|
|
1205
|
+
|
|
1206
|
+
Advanced syntactic analysis uses dependency parsing to extract sophisticated
|
|
1207
|
+
grammatical features that go beyond simple POS tagging. These features
|
|
1208
|
+
capture sentence complexity, grammatical sophistication, and syntactic
|
|
1209
|
+
style preferences.
|
|
1210
|
+
|
|
1211
|
+
Related GitHub Issues:
|
|
1212
|
+
#17 - Advanced Syntactic Analysis
|
|
1213
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
1214
|
+
|
|
1215
|
+
Features analyzed:
|
|
1216
|
+
- Parse tree depth (sentence structural complexity)
|
|
1217
|
+
- T-units (minimal terminable units - independent clauses with modifiers)
|
|
1218
|
+
- Clausal density (clauses per T-unit)
|
|
1219
|
+
- Dependent clause ratio
|
|
1220
|
+
- Passive voice ratio
|
|
1221
|
+
- Subordination index
|
|
1222
|
+
- Coordination index
|
|
1223
|
+
- Sentence complexity score
|
|
1224
|
+
|
|
1225
|
+
References:
|
|
1226
|
+
Hunt, K. W. (1965). Grammatical structures written at three grade levels.
|
|
1227
|
+
NCTE Research Report No. 3.
|
|
1228
|
+
Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
|
|
1229
|
+
Lu, X. (2010). Automatic analysis of syntactic complexity in second language
|
|
1230
|
+
writing. International Journal of Corpus Linguistics, 15(4), 474-496.
|
|
1231
|
+
"""
|
|
1232
|
+
|
|
1233
|
+
# Convenient access (mean values)
|
|
1234
|
+
mean_parse_tree_depth: float
|
|
1235
|
+
max_parse_tree_depth: float # Changed to float for mean across chunks
|
|
1236
|
+
t_unit_count: int # Total across all chunks
|
|
1237
|
+
mean_t_unit_length: float
|
|
1238
|
+
clausal_density: float
|
|
1239
|
+
dependent_clause_ratio: float
|
|
1240
|
+
passive_voice_ratio: float
|
|
1241
|
+
subordination_index: float
|
|
1242
|
+
coordination_index: float
|
|
1243
|
+
sentence_complexity_score: float
|
|
1244
|
+
dependency_distance: float
|
|
1245
|
+
left_branching_ratio: float
|
|
1246
|
+
right_branching_ratio: float
|
|
1247
|
+
|
|
1248
|
+
# Full distributions
|
|
1249
|
+
mean_parse_tree_depth_dist: Distribution
|
|
1250
|
+
max_parse_tree_depth_dist: Distribution
|
|
1251
|
+
mean_t_unit_length_dist: Distribution
|
|
1252
|
+
clausal_density_dist: Distribution
|
|
1253
|
+
dependent_clause_ratio_dist: Distribution
|
|
1254
|
+
passive_voice_ratio_dist: Distribution
|
|
1255
|
+
subordination_index_dist: Distribution
|
|
1256
|
+
coordination_index_dist: Distribution
|
|
1257
|
+
sentence_complexity_score_dist: Distribution
|
|
1258
|
+
dependency_distance_dist: Distribution
|
|
1259
|
+
left_branching_ratio_dist: Distribution
|
|
1260
|
+
right_branching_ratio_dist: Distribution
|
|
1261
|
+
|
|
1262
|
+
# Chunking context
|
|
1263
|
+
chunk_size: int
|
|
1264
|
+
chunk_count: int
|
|
1265
|
+
|
|
1266
|
+
metadata: dict[str, Any]
|
|
1267
|
+
|
|
1268
|
+
|
|
1269
|
+
# ===== Sentence Type Results =====
|
|
1270
|
+
# Related to GitHub Issue #18: Sentence Type Classification
|
|
1271
|
+
# https://github.com/craigtrim/pystylometry/issues/18
|
|
1272
|
+
|
|
1273
|
+
|
|
1274
|
+
@dataclass
|
|
1275
|
+
class SentenceTypeResult:
|
|
1276
|
+
"""Result from sentence type classification analysis.
|
|
1277
|
+
|
|
1278
|
+
Sentence type classification categorizes sentences by their grammatical
|
|
1279
|
+
structure (simple, compound, complex, compound-complex) and communicative
|
|
1280
|
+
function (declarative, interrogative, imperative, exclamatory). Different
|
|
1281
|
+
authors and genres show distinct patterns in sentence type distribution.
|
|
1282
|
+
|
|
1283
|
+
Related GitHub Issues:
|
|
1284
|
+
#18 - Sentence Type Classification
|
|
1285
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
1286
|
+
|
|
1287
|
+
Structural types:
|
|
1288
|
+
- Simple: One independent clause (e.g., "The cat sat.")
|
|
1289
|
+
- Compound: Multiple independent clauses (e.g., "I came, I saw, I conquered.")
|
|
1290
|
+
- Complex: One independent + dependent clause(s) (e.g., "When I arrived, I saw her.")
|
|
1291
|
+
- Compound-Complex: Multiple independent + dependent
|
|
1292
|
+
(e.g., "I came when called, and I stayed.")
|
|
1293
|
+
|
|
1294
|
+
Functional types:
|
|
1295
|
+
- Declarative: Statement (e.g., "The sky is blue.")
|
|
1296
|
+
- Interrogative: Question (e.g., "Is the sky blue?")
|
|
1297
|
+
- Imperative: Command (e.g., "Look at the sky!")
|
|
1298
|
+
- Exclamatory: Exclamation (e.g., "What a blue sky!")
|
|
1299
|
+
|
|
1300
|
+
References:
|
|
1301
|
+
Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
|
|
1302
|
+
Huddleston, R., & Pullum, G. K. (2002). The Cambridge Grammar of the English Language.
|
|
1303
|
+
"""
|
|
1304
|
+
|
|
1305
|
+
# Convenient access (mean ratios)
|
|
1306
|
+
simple_ratio: float
|
|
1307
|
+
compound_ratio: float
|
|
1308
|
+
complex_ratio: float
|
|
1309
|
+
compound_complex_ratio: float
|
|
1310
|
+
declarative_ratio: float
|
|
1311
|
+
interrogative_ratio: float
|
|
1312
|
+
imperative_ratio: float
|
|
1313
|
+
exclamatory_ratio: float
|
|
1314
|
+
|
|
1315
|
+
# Counts (totals across all chunks)
|
|
1316
|
+
simple_count: int
|
|
1317
|
+
compound_count: int
|
|
1318
|
+
complex_count: int
|
|
1319
|
+
compound_complex_count: int
|
|
1320
|
+
declarative_count: int
|
|
1321
|
+
interrogative_count: int
|
|
1322
|
+
imperative_count: int
|
|
1323
|
+
exclamatory_count: int
|
|
1324
|
+
total_sentences: int
|
|
1325
|
+
|
|
1326
|
+
# Diversity (mean across chunks)
|
|
1327
|
+
structural_diversity: float
|
|
1328
|
+
functional_diversity: float
|
|
1329
|
+
|
|
1330
|
+
# Full distributions
|
|
1331
|
+
simple_ratio_dist: Distribution
|
|
1332
|
+
compound_ratio_dist: Distribution
|
|
1333
|
+
complex_ratio_dist: Distribution
|
|
1334
|
+
compound_complex_ratio_dist: Distribution
|
|
1335
|
+
declarative_ratio_dist: Distribution
|
|
1336
|
+
interrogative_ratio_dist: Distribution
|
|
1337
|
+
imperative_ratio_dist: Distribution
|
|
1338
|
+
exclamatory_ratio_dist: Distribution
|
|
1339
|
+
structural_diversity_dist: Distribution
|
|
1340
|
+
functional_diversity_dist: Distribution
|
|
1341
|
+
|
|
1342
|
+
# Chunking context
|
|
1343
|
+
chunk_size: int
|
|
1344
|
+
chunk_count: int
|
|
1345
|
+
|
|
1346
|
+
metadata: dict[str, Any]


# ===== Extended N-gram Results =====
# Related to GitHub Issue #19: Extended N-gram Features
# https://github.com/craigtrim/pystylometry/issues/19


@dataclass
class ExtendedNgramResult:
    """Result from extended n-gram analysis.

    Extended n-gram analysis goes beyond basic bigram/trigram entropy to provide
    comprehensive n-gram statistics including frequency distributions, most
    distinctive n-grams, skipgrams, and part-of-speech n-grams. These features
    are valuable for authorship attribution and style analysis.

    Related GitHub Issue:
        #19 - Extended N-gram Features
        https://github.com/craigtrim/pystylometry/issues/19

    Features computed:
        - Trigram frequency distributions and top trigrams
        - 4-gram frequency distributions and top 4-grams
        - Skipgrams (n-grams with gaps, e.g., "the * dog")
        - POS n-grams (e.g., "DET ADJ NOUN")
        - Character trigrams and 4-grams
        - N-gram diversity metrics
        - Entropy for each n-gram order

    References:
        Guthrie, D., Allison, B., Liu, W., Guthrie, L., & Wilks, Y. (2006).
            A closer look at skip-gram modelling. LREC.
        Stamatatos, E. (2009). A survey of modern authorship attribution methods.
            JASIST, 60(3), 538-556.

    Example:
        >>> result = compute_extended_ngrams("Sample text for n-gram analysis...")
        >>> print(f"Top trigrams: {result.top_word_trigrams[:5]}")
        >>> print(f"Trigram entropy: {result.word_trigram_entropy:.2f}")
    """

    # Word n-grams
    top_word_trigrams: list[tuple[str, int]]  # Most frequent word trigrams
    top_word_4grams: list[tuple[str, int]]  # Most frequent word 4-grams
    word_trigram_count: int  # Total unique word trigrams
    word_4gram_count: int  # Total unique word 4-grams
    word_trigram_entropy: float  # Shannon entropy of trigram distribution
    word_4gram_entropy: float  # Shannon entropy of 4-gram distribution

    # Skipgrams (n-grams with gaps)
    top_skipgrams_2_1: list[tuple[str, int]]  # Top 2-skipgrams (gap of 1)
    top_skipgrams_3_1: list[tuple[str, int]]  # Top 3-skipgrams (gap of 1)
    skipgram_2_1_count: int  # Unique 2-skipgrams
    skipgram_3_1_count: int  # Unique 3-skipgrams

    # POS n-grams
    top_pos_trigrams: list[tuple[str, int]]  # Most frequent POS trigrams
    top_pos_4grams: list[tuple[str, int]]  # Most frequent POS 4-grams
    pos_trigram_count: int  # Unique POS trigrams
    pos_4gram_count: int  # Unique POS 4-grams
    pos_trigram_entropy: float  # Shannon entropy of POS trigram distribution

    # Character n-grams
    top_char_trigrams: list[tuple[str, int]]  # Most frequent character trigrams
    top_char_4grams: list[tuple[str, int]]  # Most frequent character 4-grams
    char_trigram_entropy: float  # Shannon entropy of char trigram distribution
    char_4gram_entropy: float  # Shannon entropy of char 4-gram distribution

    metadata: dict[str, Any]  # Full frequency distributions, parameters, etc.
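

# Illustrative sketch (not the package's implementation): how word n-grams,
# their Shannon entropy, and 1-skip bigrams might be derived. Assumes naive
# whitespace tokenization; function names are illustrative only.
def _example_ngram_stats(text: str, n: int = 3) -> tuple[list[tuple[str, int]], float]:
    """Return the top word n-grams and the Shannon entropy of their distribution."""
    import math
    from collections import Counter

    tokens = text.lower().split()
    ngrams = [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    counts = Counter(ngrams)
    total = sum(counts.values())
    entropy = 0.0
    if total:
        # Shannon entropy: H = -sum(p * log2(p)) over the n-gram probabilities
        entropy = -sum((c / total) * math.log2(c / total) for c in counts.values())
    return counts.most_common(10), entropy


def _example_skipgrams_2_1(tokens: list[str]) -> list[tuple[str, str]]:
    """2-skipgrams with a gap of 1: each token paired with the token two positions ahead."""
    return [(tokens[i], tokens[i + 2]) for i in range(len(tokens) - 2)]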


# ===== Stylistic Markers Results =====
# Related to GitHub Issue #20: Stylistic Markers
# https://github.com/craigtrim/pystylometry/issues/20


@dataclass
class StylisticMarkersResult:
    """Result from stylistic markers analysis.

    Stylistic markers are specific linguistic features that authors tend to use
    consistently and often subconsciously. These include contraction usage,
    intensifier preferences, hedging expressions, punctuation habits, and more.
    They are powerful indicators of authorial identity.

    Related GitHub Issue:
        #20 - Stylistic Markers
        https://github.com/craigtrim/pystylometry/issues/20

    Markers analyzed:
        - Contraction usage (don't vs. do not, I'm vs. I am, etc.)
        - Intensifiers (very, really, extremely, quite, etc.)
        - Hedges (maybe, perhaps, probably, somewhat, etc.)
        - Modal auxiliaries (can, could, may, might, must, should, will, would)
        - Negation patterns (not, no, never, none, neither, etc.)
        - Exclamation frequency
        - Question frequency
        - Quotation usage
        - Parenthetical expressions
        - Ellipses and dashes

    References:
        Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
            words for authorship attribution. ACH/ALLC.
        Pennebaker, J. W. (2011). The secret life of pronouns. Bloomsbury Press.

    Example:
        >>> result = compute_stylistic_markers("Sample text with various markers...")
        >>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
        >>> print(f"Intensifier density: {result.intensifier_density:.2f}")
        >>> print(f"Hedging density: {result.hedging_density:.2f}")
    """

    # Contraction patterns
    contraction_ratio: float  # Contractions / (contractions + full forms)
    contraction_count: int  # Total contractions
    expanded_form_count: int  # Total expanded forms (e.g., "do not" vs "don't")
    top_contractions: list[tuple[str, int]]  # Most frequent contractions

    # Intensifiers and hedges
    intensifier_density: float  # Intensifiers per 100 words
    intensifier_count: int  # Total intensifier count
    top_intensifiers: list[tuple[str, int]]  # Most frequent intensifiers
    hedging_density: float  # Hedges per 100 words
    hedging_count: int  # Total hedge count
    top_hedges: list[tuple[str, int]]  # Most frequent hedges

    # Modal auxiliaries
    modal_density: float  # Modal auxiliaries per 100 words
    modal_distribution: dict[str, int]  # Count per modal (can, could, may, etc.)
    epistemic_modal_ratio: float  # Epistemic modals / all modals
    deontic_modal_ratio: float  # Deontic modals / all modals

    # Negation
    negation_density: float  # Negation markers per 100 words
    negation_count: int  # Total negation markers
    negation_types: dict[str, int]  # not, no, never, etc. with counts

    # Punctuation style
    exclamation_density: float  # Exclamation marks per 100 words
    question_density: float  # Question marks per 100 words
    quotation_density: float  # Quotation marks per 100 words
    parenthetical_density: float  # Parentheses per 100 words
    ellipsis_density: float  # Ellipses per 100 words
    dash_density: float  # Dashes (em/en) per 100 words
    semicolon_density: float  # Semicolons per 100 words
    colon_density: float  # Colons per 100 words

    metadata: dict[str, Any]  # Full lists, total word count, etc.
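

# Illustrative sketch (not the package's implementation): the "per 100 words"
# density fields above can all be derived the same way; only the marker list
# changes. Assumes whitespace tokenization and a toy marker set.
def _example_density_per_100(text: str, markers: set[str]) -> float:
    """Count marker tokens and normalize to a per-100-words rate."""
    tokens = [t.strip(".,!?;:\"'()").lower() for t in text.split()]
    if not tokens:
        return 0.0
    hits = sum(1 for t in tokens if t in markers)
    return 100.0 * hits / len(tokens)


# Usage, e.g. intensifier density with a toy list (the real list is larger):
# _example_density_per_100(text, {"very", "really", "extremely", "quite"})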


# ===== Vocabulary Overlap Results =====
# Related to GitHub Issue #21: Vocabulary Overlap and Similarity Metrics
# https://github.com/craigtrim/pystylometry/issues/21


@dataclass
class VocabularyOverlapResult:
    """Result from vocabulary overlap and similarity analysis.

    Vocabulary overlap metrics measure the similarity between two texts based on
    their shared vocabulary. These metrics are useful for authorship verification,
    plagiarism detection, and measuring stylistic consistency across texts.

    Related GitHub Issue:
        #21 - Vocabulary Overlap and Similarity Metrics
        https://github.com/craigtrim/pystylometry/issues/21

    Metrics computed:
        - Jaccard similarity (intersection / union)
        - Dice coefficient (2 * intersection / sum of sizes)
        - Overlap coefficient (intersection / min(size1, size2))
        - Cosine similarity (using word frequency vectors)
        - Shared vocabulary size and ratio
        - Unique words in each text
        - Most distinctive words for each text

    References:
        Jaccard, P. (1912). The distribution of the flora in the alpine zone.
            New Phytologist, 11(2), 37-50.
        Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
            Retrieval. McGraw-Hill.

    Example:
        >>> result = compute_vocabulary_overlap(text1, text2)
        >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
        >>> print(f"Shared vocabulary: {result.shared_vocab_size} words")
        >>> print(f"Text1 unique: {result.text1_unique_count}")
    """

    # Similarity scores (0-1 range)
    jaccard_similarity: float  # Intersection / union
    dice_coefficient: float  # 2 * intersection / (size1 + size2)
    overlap_coefficient: float  # Intersection / min(size1, size2)
    cosine_similarity: float  # Cosine of frequency vectors

    # Vocabulary sizes
    text1_vocab_size: int  # Unique words in text 1
    text2_vocab_size: int  # Unique words in text 2
    shared_vocab_size: int  # Words in both texts
    union_vocab_size: int  # Words in either text
    text1_unique_count: int  # Words only in text 1
    text2_unique_count: int  # Words only in text 2

    # Shared and distinctive vocabulary
    shared_words: list[str]  # Words appearing in both texts
    text1_distinctive_words: list[tuple[str, float]]  # Words + TF-IDF scores for text 1
    text2_distinctive_words: list[tuple[str, float]]  # Words + TF-IDF scores for text 2

    # Ratios
    text1_coverage: float  # Shared / text1_vocab (how much of text1 is shared)
    text2_coverage: float  # Shared / text2_vocab (how much of text2 is shared)

    metadata: dict[str, Any]  # Full vocabulary sets, frequency vectors, etc.
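

# Illustrative sketch (not the package's implementation): the set-based scores
# above follow directly from the two vocabularies, and cosine similarity from
# the two frequency vectors. Assumes pre-tokenized, lowercased input.
def _example_overlap_scores(tokens1: list[str], tokens2: list[str]) -> dict[str, float]:
    import math
    from collections import Counter

    v1, v2 = set(tokens1), set(tokens2)
    inter, union = v1 & v2, v1 | v2
    f1, f2 = Counter(tokens1), Counter(tokens2)
    dot = sum(f1[w] * f2[w] for w in inter)
    norm = math.sqrt(sum(c * c for c in f1.values())) * math.sqrt(sum(c * c for c in f2.values()))
    return {
        "jaccard": len(inter) / len(union) if union else 0.0,
        "dice": 2 * len(inter) / (len(v1) + len(v2)) if (v1 or v2) else 0.0,
        "overlap": len(inter) / min(len(v1), len(v2)) if (v1 and v2) else 0.0,
        "cosine": dot / norm if norm else 0.0,
    }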


# ===== Cohesion and Coherence Results =====
# Related to GitHub Issue #22: Cohesion and Coherence Metrics
# https://github.com/craigtrim/pystylometry/issues/22


@dataclass
class CohesionCoherenceResult:
    """Result from cohesion and coherence analysis.

    Cohesion and coherence metrics measure how well a text holds together
    structurally (cohesion) and semantically (coherence). These metrics are
    important for analyzing writing quality, readability, and authorial
    sophistication.

    Related GitHub Issue:
        #22 - Cohesion and Coherence Metrics
        https://github.com/craigtrim/pystylometry/issues/22

    Cohesion features:
        - Referential cohesion (pronouns, demonstratives pointing back)
        - Lexical cohesion (word repetition, synonyms, semantic relatedness)
        - Connective density (discourse markers, conjunctions)
        - Anaphora resolution success rate
        - Lexical chains (sequences of semantically related words)

    Coherence features:
        - Sentence-to-sentence semantic similarity
        - Topic consistency across paragraphs
        - Discourse structure (thesis, support, conclusion)
        - Semantic overlap between adjacent sentences

    References:
        Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
        Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix:
            Providing multilevel analyses of text characteristics. Educational
            Researcher, 40(5), 223-234.

    Example:
        >>> result = compute_cohesion_coherence("Multi-paragraph text...")
        >>> print(f"Pronoun density: {result.pronoun_density:.2f}")
        >>> print(f"Lexical overlap: {result.adjacent_sentence_overlap:.3f}")
        >>> print(f"Connective density: {result.connective_density:.2f}")
    """

    # Referential cohesion
    pronoun_density: float  # Pronouns per 100 words
    demonstrative_density: float  # Demonstratives (this, that, these, those) per 100 words
    anaphora_count: int  # Anaphoric references detected
    anaphora_resolution_ratio: float  # Successfully resolved / total

    # Lexical cohesion
    word_repetition_ratio: float  # Repeated content words / total content words
    synonym_density: float  # Synonym pairs per 100 words
    lexical_chain_count: int  # Number of lexical chains detected
    mean_chain_length: float  # Average length of lexical chains
    content_word_overlap: float  # Content word overlap between sentences

    # Connectives and discourse markers
    connective_density: float  # Discourse connectives per 100 words
    additive_connective_ratio: float  # "and", "also", "furthermore" / total connectives
    adversative_connective_ratio: float  # "but", "however", "nevertheless" / total
    causal_connective_ratio: float  # "because", "therefore", "thus" / total
    temporal_connective_ratio: float  # "then", "after", "before" / total

    # Coherence measures
    adjacent_sentence_overlap: float  # Mean semantic overlap between adjacent sentences
    paragraph_topic_consistency: float  # Mean topic consistency within paragraphs
    mean_sentence_similarity: float  # Mean cosine similarity between all sentence pairs
    semantic_coherence_score: float  # Composite coherence metric (0-1)

    # Structural coherence
    paragraph_count: int  # Number of paragraphs detected
    mean_paragraph_length: float  # Mean sentences per paragraph
    discourse_structure_score: float  # Quality of intro/body/conclusion structure

    metadata: dict[str, Any]  # Lexical chains, connective lists, similarity matrices, etc.
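

# Illustrative sketch (not the package's implementation): adjacent-sentence
# overlap approximated as the mean Jaccard overlap of content words in
# neighbouring sentences. The stopword list is a toy stand-in.
def _example_adjacent_overlap(sentences: list[str]) -> float:
    stopwords = {"the", "a", "an", "and", "or", "but", "of", "in", "to", "is", "was"}
    sets = [
        {w.strip(".,!?;:").lower() for w in sentence.split()} - stopwords
        for sentence in sentences
    ]
    pairs = list(zip(sets, sets[1:]))
    if not pairs:
        return 0.0
    overlaps = [len(a & b) / len(a | b) if (a | b) else 0.0 for a, b in pairs]
    return sum(overlaps) / len(overlaps)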


# ===== Genre and Register Results =====
# Related to GitHub Issue #23: Genre and Register Features
# https://github.com/craigtrim/pystylometry/issues/23


@dataclass
class GenreRegisterResult:
    """Result from genre and register classification analysis.

    Genre and register features distinguish between different types of texts
    (academic, journalistic, fiction, legal, etc.) based on linguistic patterns.
    These features can help identify the context and formality level of a text,
    and are useful for authorship attribution when combined with other metrics.

    Related GitHub Issue:
        #23 - Genre and Register Features
        https://github.com/craigtrim/pystylometry/issues/23

    Features analyzed:
        - Formality markers (Latinate words, nominalizations, passive voice)
        - Personal vs. impersonal style (1st/2nd person vs. 3rd person)
        - Abstract vs. concrete vocabulary
        - Technical term density
        - Narrative vs. expository markers
        - Dialogue presence and ratio
        - Register classification (frozen, formal, consultative, casual, intimate)

    References:
        Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
        Biber, D., & Conrad, S. (2009). Register, genre, and style. Cambridge
            University Press.
        Heylighen, F., & Dewaele, J. M. (1999). Formality of language: Definition,
            measurement and behavioral determinants. Internal Report, Center "Leo
            Apostel", Free University of Brussels.

    Example:
        >>> result = compute_genre_register("Academic paper text...")
        >>> print(f"Formality score: {result.formality_score:.2f}")
        >>> print(f"Register: {result.register_classification}")
        >>> print(f"Genre prediction: {result.predicted_genre}")
    """

    # Formality indicators
    formality_score: float  # Composite formality score (0-100)
    latinate_ratio: float  # Latinate words / total words
    nominalization_density: float  # Nominalizations per 100 words
    passive_voice_density: float  # Passive constructions per 100 words

    # Personal vs. impersonal
    first_person_ratio: float  # 1st person pronouns / total pronouns
    second_person_ratio: float  # 2nd person pronouns / total pronouns
    third_person_ratio: float  # 3rd person pronouns / total pronouns
    impersonal_construction_density: float  # "It is...", "There are..." per 100 words

    # Abstract vs. concrete
    abstract_noun_ratio: float  # Abstract nouns / total nouns
    concrete_noun_ratio: float  # Concrete nouns / total nouns
    abstractness_score: float  # Composite abstractness (based on word concreteness ratings)

    # Technical and specialized
    technical_term_density: float  # Technical/specialized terms per 100 words
    jargon_density: float  # Domain-specific jargon per 100 words

    # Narrative vs. expository
    narrative_marker_density: float  # Past tense, action verbs per 100 words
    expository_marker_density: float  # Present tense, linking verbs per 100 words
    narrative_expository_ratio: float  # Narrative / expository markers

    # Dialogue and quotation
    dialogue_ratio: float  # Dialogue / total text (estimated)
    quotation_density: float  # Quotations per 100 words

    # Classification results
    register_classification: str  # frozen, formal, consultative, casual, intimate
    predicted_genre: str  # academic, journalistic, fiction, legal, conversational, etc.
    genre_confidence: float  # Confidence in genre prediction (0-1)

    # Feature scores for major genres (0-1 scores for each)
    academic_score: float
    journalistic_score: float
    fiction_score: float
    legal_score: float
    conversational_score: float

    metadata: dict[str, Any]  # Feature details, word lists, classification probabilities, etc.
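

# Illustrative sketch (not the package's implementation): the pronoun-person
# ratios above reduce to counting pronoun classes against the total pronoun
# count. The pronoun lists are abbreviated for illustration.
def _example_person_ratios(tokens: list[str]) -> dict[str, float]:
    first = {"i", "me", "my", "mine", "we", "us", "our", "ours"}
    second = {"you", "your", "yours"}
    third = {"he", "she", "it", "they", "him", "her", "them", "his", "hers", "its", "their"}
    counts = {"first": 0, "second": 0, "third": 0}
    for token in (t.lower() for t in tokens):
        if token in first:
            counts["first"] += 1
        elif token in second:
            counts["second"] += 1
        elif token in third:
            counts["third"] += 1
    total = sum(counts.values())
    return {person: (n / total if total else 0.0) for person, n in counts.items()}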


# ===== Additional Authorship Results =====
# Related to GitHub Issue #24: Additional Authorship Attribution Methods
# https://github.com/craigtrim/pystylometry/issues/24


@dataclass
class KilgarriffResult:
    """Result from Kilgarriff's Chi-squared method.

    Kilgarriff's chi-squared method compares word frequency distributions between
    texts using the chi-squared test. It's particularly effective for authorship
    attribution when comparing frequency profiles of common words.

    Related GitHub Issue:
        #24 - Additional Authorship Attribution Methods
        https://github.com/craigtrim/pystylometry/issues/24

    References:
        Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus
            Linguistics, 6(1), 97-133.

    Example:
        >>> result = compute_kilgarriff(text1, text2)
        >>> print(f"Chi-squared: {result.chi_squared:.2f}")
        >>> print(f"P-value: {result.p_value:.4f}")
    """

    chi_squared: float  # Chi-squared statistic
    p_value: float  # Statistical significance (p-value)
    degrees_of_freedom: int  # df for chi-squared test
    feature_count: int  # Number of features (words) compared
    most_distinctive_features: list[tuple[str, float]]  # Words + chi-squared contributions
    metadata: dict[str, Any]  # Frequency tables, expected values, etc.
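

# Illustrative sketch (not the package's implementation): a Kilgarriff-style
# chi-squared over the pooled top-N word list. Expected counts are allocated in
# proportion to each text's length; no p-value is derived here.
def _example_chi_squared(tokens1: list[str], tokens2: list[str], top_n: int = 100) -> float:
    from collections import Counter

    n1, n2 = len(tokens1), len(tokens2)
    if not n1 or not n2:
        return 0.0
    f1, f2 = Counter(tokens1), Counter(tokens2)
    pooled = f1 + f2
    chi = 0.0
    for word, _ in pooled.most_common(top_n):
        for observed, size in ((f1[word], n1), (f2[word], n2)):
            expected = pooled[word] * size / (n1 + n2)
            chi += (observed - expected) ** 2 / expected
    return chi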


@dataclass
class KilgarriffDriftResult:
    """Result from Kilgarriff chi-squared drift detection within a single document.

    This result captures stylistic drift patterns by comparing sequential chunks
    of text using Kilgarriff's chi-squared method. It enables detection of
    inconsistent authorship, heavy editing, pasted content, and AI-generated
    text signatures.

    Related GitHub Issues:
        #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
        https://github.com/craigtrim/pystylometry/issues/36
        #31 - Classical Stylometric Methods from Programming Historian
        https://github.com/craigtrim/pystylometry/issues/31

    Pattern Signatures:
        - consistent: Low, stable χ² across pairs (natural human writing)
        - gradual_drift: Slowly increasing trend (author fatigue, topic shift)
        - sudden_spike: One pair has high χ² (pasted content, different author)
        - suspiciously_uniform: Near-zero variance (possible AI generation)
        - unknown: Insufficient data for classification

    Marketing Name: "Style Drift Detector" / "Consistency Fingerprint"

    References:
        Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus
            Linguistics, 6(1), 97-133.

    Example:
        >>> result = compute_kilgarriff_drift(text, window_size=1000, stride=500)
        >>> result.pattern  # "consistent", "gradual_drift", "sudden_spike", etc.
        'consistent'
        >>> result.mean_chi_squared  # Average χ² across chunk pairs
        45.2
        >>> result.status  # "success", "marginal_data", "insufficient_data"
        'success'
    """

    # Status (graceful handling of edge cases)
    status: str  # "success", "marginal_data", "insufficient_data"
    status_message: str  # Human-readable explanation

    # Pattern classification
    pattern: str  # "consistent", "gradual_drift", "sudden_spike", "suspiciously_uniform", "unknown"
    pattern_confidence: float  # 0.0-1.0 confidence in classification

    # Holistic metrics (may be NaN if insufficient data)
    mean_chi_squared: float  # Average χ² across all chunk pairs
    std_chi_squared: float  # Standard deviation of χ² values
    max_chi_squared: float  # Highest χ² between any two chunks
    min_chi_squared: float  # Lowest χ² between any two chunks
    max_location: int  # Index of chunk boundary with max χ² (0-indexed)
    trend: float  # Linear regression slope of χ² over chunk pairs

    # Pairwise comparison data
    pairwise_scores: list[dict]  # [{"chunk_pair": (0, 1), "chi_squared": 45.2, "top_words": [...]}]

    # Window configuration (for reproducibility)
    window_size: int
    stride: int
    overlap_ratio: float  # Computed: max(0, 1 - stride/window_size)
    comparison_mode: str  # "sequential", "all_pairs", "fixed_lag"
    window_count: int

    # For all_pairs mode only
    distance_matrix: list[list[float]] | None  # None for sequential/fixed_lag

    # Thresholds used for pattern classification (for transparency)
    thresholds: dict[str, float]

    metadata: dict[str, Any]


# ===== Consistency Module Thresholds =====
# Related to GitHub Issue #36
# These are calibration constants for pattern classification

MIN_WINDOWS = 3  # Bare minimum for variance calculation
RECOMMENDED_WINDOWS = 5  # For reliable pattern classification
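

# Illustrative sketch (not the package's implementation): drift detection slides
# a window over the token stream, scores adjacent windows with a distance
# function (the chi-squared sketch above would do), and summarizes the series.
# The returned slope corresponds to the `trend` field on KilgarriffDriftResult.
def _example_drift_series(tokens: list[str], distance, window_size: int = 1000,
                          stride: int = 500) -> tuple[list[float], float]:
    """Return pairwise scores for adjacent windows and their least-squares trend."""
    windows = [tokens[i:i + window_size]
               for i in range(0, max(len(tokens) - window_size + 1, 1), stride)]
    scores = [distance(a, b) for a, b in zip(windows, windows[1:])]
    if len(scores) < 2:  # fewer pairs than a slope needs (cf. MIN_WINDOWS)
        return scores, float("nan")
    n = len(scores)
    x_bar, y_bar = (n - 1) / 2, sum(scores) / n
    denom = sum((i - x_bar) ** 2 for i in range(n))
    slope = sum((i - x_bar) * (s - y_bar) for i, s in enumerate(scores)) / denom
    return scores, slope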


@dataclass
class MinMaxResult:
    """Result from Min-Max distance method (Burrows' original method).

    The Min-Max method normalizes feature frequencies using min-max scaling,
    then computes distance between texts. This was Burrows' original approach
    before developing Delta.

    Related GitHub Issue:
        #24 - Additional Authorship Attribution Methods
        https://github.com/craigtrim/pystylometry/issues/24

    References:
        Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
            nexus between analysis and information. Literary and Linguistic
            Computing, 7(2), 91-109.

    Example:
        >>> result = compute_minmax(text1, text2)
        >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
    """

    minmax_distance: float  # Min-max normalized distance
    feature_count: int  # Number of features used
    most_distinctive_features: list[tuple[str, float]]  # Features + contributions
    metadata: dict[str, Any]  # Normalized frequencies, scaling parameters, etc.
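

# Illustrative sketch (not the package's implementation): one reading of a
# min-max normalized distance. Each feature's relative frequency is rescaled to
# [0, 1] against a supplied (min, max) reference range, then the two texts are
# compared with a mean absolute difference. All parameter names are hypothetical.
def _example_minmax_distance(freqs1: dict[str, float], freqs2: dict[str, float],
                             ranges: dict[str, tuple[float, float]]) -> float:
    diffs = []
    for feature, (lo, hi) in ranges.items():
        span = (hi - lo) or 1.0
        a = (freqs1.get(feature, 0.0) - lo) / span
        b = (freqs2.get(feature, 0.0) - lo) / span
        diffs.append(abs(a - b))
    return sum(diffs) / len(diffs) if diffs else 0.0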


@dataclass
class JohnsBurrowsResult:
    """Result from John Burrows' variations of Delta.

    John Burrows has developed several variations of the Delta method over
    the years. This result captures alternative formulations, including
    Quadratic Delta and other distance measures.

    Related GitHub Issue:
        #24 - Additional Authorship Attribution Methods
        https://github.com/craigtrim/pystylometry/issues/24

    References:
        Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
            parodic text. Literary and Linguistic Computing, 20(4), 437-450.

    Example:
        >>> result = compute_johns_delta(text1, text2, method="quadratic")
        >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
    """

    delta_score: float  # Delta distance score
    method: str  # "quadratic", "weighted", "rotated", etc.
    feature_count: int  # Number of MFW used
    most_distinctive_features: list[tuple[str, float]]  # Features + contributions
    metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.
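

# Illustrative sketch (not the package's implementation): Quadratic Delta is
# commonly described as the sum of squared differences between z-scores of the
# most frequent words, with z-scores taken against a reference corpus. The
# z-score dictionaries here are assumed to be precomputed elsewhere.
def _example_quadratic_delta(z_scores_1: dict[str, float], z_scores_2: dict[str, float]) -> float:
    shared = z_scores_1.keys() & z_scores_2.keys()
    return sum((z_scores_1[w] - z_scores_2[w]) ** 2 for w in shared)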


# ===== Rhythm and Prosody Results =====
# Related to GitHub Issue #25: Rhythm and Prosody Metrics
# https://github.com/craigtrim/pystylometry/issues/25


@dataclass
class RhythmProsodyResult:
    """Result from rhythm and prosody analysis.

    Rhythm and prosody metrics capture the musical qualities of written language,
    including stress patterns, syllable rhythms, and phonological features. While
    these are typically studied in spoken language, written text preserves many
    rhythmic patterns that vary by author and genre.

    Related GitHub Issue:
        #25 - Rhythm and Prosody Metrics
        https://github.com/craigtrim/pystylometry/issues/25

    Features analyzed:
        - Syllable patterns and stress patterns
        - Rhythmic regularity (coefficient of variation of syllable counts)
        - Phonological features (alliteration, assonance)
        - Syllable complexity (consonant clusters)
        - Sentence rhythm (alternating long/short sentences)
        - Polysyllabic word ratio

    References:
        Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm and
            text comprehension. Memory & Cognition, 33(3), 388-396.
        Louwerse, M. M., & Benesh, N. (2012). Representing spatial structure through
            maps and language: Lord of the Rings encodes the spatial structure of
            Middle Earth. Cognitive Science, 36(8), 1556-1569.

    Example:
        >>> result = compute_rhythm_prosody("Sample text with rhythm...")
        >>> print(f"Syllables per word: {result.mean_syllables_per_word:.2f}")
        >>> print(f"Rhythmic regularity: {result.rhythmic_regularity:.3f}")
        >>> print(f"Alliteration density: {result.alliteration_density:.2f}")
    """

    # Syllable patterns
    mean_syllables_per_word: float  # Average syllables per word
    syllable_std_dev: float  # Std dev of syllables per word
    polysyllabic_ratio: float  # Words with 3+ syllables / total
    monosyllabic_ratio: float  # Single-syllable words / total

    # Rhythmic regularity
    rhythmic_regularity: float  # 1 / CV of syllable counts (higher = more regular)
    syllable_cv: float  # Coefficient of variation of syllables per word
    stress_pattern_entropy: float  # Entropy of stress patterns

    # Sentence rhythm
    sentence_length_alternation: float  # Degree of long/short alternation
    sentence_rhythm_score: float  # Composite rhythm score

    # Phonological features
    alliteration_density: float  # Alliterative word pairs per 100 words
    assonance_density: float  # Assonant word pairs per 100 words
    consonance_density: float  # Consonant word pairs per 100 words

    # Syllable complexity
    mean_consonant_cluster_length: float  # Avg consonants in clusters
    initial_cluster_ratio: float  # Words starting with clusters / total
    final_cluster_ratio: float  # Words ending with clusters / total

    # Stress patterns (estimated for written text)
    iambic_ratio: float  # Iambic patterns (unstressed-stressed) / total
    trochaic_ratio: float  # Trochaic patterns (stressed-unstressed) / total
    dactylic_ratio: float  # Dactylic patterns / total
    anapestic_ratio: float  # Anapestic patterns / total

    metadata: dict[str, Any]  # Syllable counts, stress patterns, phoneme data, etc.
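

# Illustrative sketch (not the package's implementation): rhythmic_regularity is
# documented above as the inverse coefficient of variation of per-word syllable
# counts; given counts from any syllable estimator, the reduction is direct.
def _example_rhythmic_regularity(syllable_counts: list[int]) -> float:
    import statistics

    if len(syllable_counts) < 2:
        return 0.0
    mean = statistics.mean(syllable_counts)
    if mean == 0:
        return 0.0
    cv = statistics.stdev(syllable_counts) / mean
    # Perfectly uniform counts give zero CV; treat that as maximal regularity.
    return 1.0 / cv if cv else float("inf")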


# ===== Dialect Detection Results =====
# Related to GitHub Issue #35: Dialect detection with extensible JSON markers
# https://github.com/craigtrim/pystylometry/issues/35
# Related to GitHub Issue #30: Whonix stylometry features
# https://github.com/craigtrim/pystylometry/issues/30


@dataclass
class DialectResult:
    """Result from dialect detection analysis.

    Dialect detection identifies regional linguistic preferences (British vs.
    American English) and measures text markedness - how far the text deviates
    from "unmarked" standard English. This analysis uses an extensible JSON-based
    marker database covering vocabulary, spelling patterns, grammar patterns,
    punctuation conventions, and idiomatic expressions.

    The analysis follows the chunking pattern from Issue #27, computing metrics
    per chunk and providing distributions for stylometric fingerprinting. Dialect
    markers are sparse, so variance across chunks can reveal mixed authorship
    (e.g., a UK speaker using ChatGPT-generated American English content).

    Related GitHub Issues:
        #35 - Dialect detection with extensible JSON markers
        https://github.com/craigtrim/pystylometry/issues/35
        #30 - Whonix stylometry features (regional linguistic preferences)
        https://github.com/craigtrim/pystylometry/issues/30
        #27 - Native chunked analysis with Distribution dataclass
        https://github.com/craigtrim/pystylometry/issues/27

    Theoretical Background:
        Markedness theory (Battistella, 1990) informs the markedness_score:
        marked forms stand out against "standard" written English. High
        markedness suggests intentional stylistic choice or strong dialect
        identity. Dialectometry (Goebl, 1982; Nerbonne, 2009) provides the
        quantitative framework for holistic dialect measurement.

    Feature Levels:
        Markers are categorized by linguistic level for fine-grained analysis:
        - Phonological: Spelling reflecting pronunciation (colour/color)
        - Morphological: Word formation (-ise/-ize, -our/-or, doubled L)
        - Lexical: Different words for same concept (flat/apartment)
        - Syntactic: Grammar differences (have got/have, collective nouns)

    Eye Dialect vs. True Dialect:
        Following Encyclopedia.com's distinction, "eye dialect" (gonna, wanna)
        indicates informal register, not regional dialect. True dialect markers
        (colour, flat, lorry) indicate actual regional preference.

    References:
        Battistella, Edwin L. "Markedness: The Evaluative Superstructure of
            Language." State University of New York Press, 1990.
        Goebl, Hans. "Dialektometrie: Prinzipien und Methoden des Einsatzes der
            numerischen Taxonomie im Bereich der Dialektgeographie." Verlag der
            Österreichischen Akademie der Wissenschaften, 1982.
        Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
            Compass, vol. 3, no. 1, 2009, pp. 175-198.
        Labov, William. "The Social Stratification of English in New York City."
            Cambridge University Press, 2006.
        Whonix Project. "Stylometry: Deanonymization Techniques." Whonix Wiki,
            https://www.whonix.org/wiki/Stylometry

    Example:
        >>> result = compute_dialect(text, chunk_size=1000)
        >>> result.dialect  # "british", "american", "mixed", or "neutral"
        'british'
        >>> result.british_score  # Mean across chunks
        0.72
        >>> result.british_score_dist.std  # Variance reveals fingerprint
        0.05
        >>> result.markedness_score  # Deviation from standard English
        0.35
    """

    # Classification result
    dialect: str  # "british", "american", "mixed", "neutral"
    confidence: float  # 0.0-1.0, how confident the classification is

    # Convenient access (mean values across chunks)
    british_score: float  # Mean British marker density (0.0-1.0)
    american_score: float  # Mean American marker density (0.0-1.0)
    markedness_score: float  # Mean deviation from unmarked standard (0.0-1.0)

    # Full distributions for stylometric fingerprinting
    british_score_dist: Distribution
    american_score_dist: Distribution
    markedness_score_dist: Distribution

    # Marker breakdown by linguistic level (aggregated across chunks)
    # Keys: "phonological", "morphological", "lexical", "syntactic"
    markers_by_level: dict[str, dict[str, int]]

    # Detailed marker counts (aggregated across chunks)
    spelling_markers: dict[str, int]  # {"colour": 2, "color": 1}
    vocabulary_markers: dict[str, int]  # {"flat": 1, "apartment": 0}
    grammar_markers: dict[str, int]  # {"have got": 1}

    # Eye dialect (informal register indicators, not true dialect)
    eye_dialect_count: int  # Total eye dialect markers (gonna, wanna, etc.)
    eye_dialect_ratio: float  # Eye dialect per 1000 words

    # Register analysis hints
    register_hints: dict[str, Any]  # {"formality": 0.7, "hedging_density": 0.05}

    # Chunking context
    chunk_size: int
    chunk_count: int

    # Extensible metadata
    metadata: dict[str, Any]
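

# Illustrative sketch (not the package's implementation, which reads its marker
# lists from dialect_markers.json): per-chunk marker densities of the kind that
# feed british_score / american_score. The marker sets below are toy stand-ins.
def _example_dialect_scores(chunks: list[list[str]]) -> list[tuple[float, float]]:
    """Return (british_share, american_share) of dialect-marked tokens per chunk."""
    british = {"colour", "flavour", "flat", "lorry", "organise", "whilst"}
    american = {"color", "flavor", "apartment", "truck", "organize"}
    scores: list[tuple[float, float]] = []
    for chunk in chunks:
        words = [w.lower() for w in chunk]
        b = sum(1 for w in words if w in british)
        a = sum(1 for w in words if w in american)
        total = b + a
        scores.append((b / total, a / total) if total else (0.0, 0.0))
    return scores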