pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/_types.py CHANGED
@@ -1,42 +1,372 @@
- """Result dataclasses for all pystylometry metrics."""
+ """Result dataclasses for all pystylometry metrics.
+
+ This module defines dataclasses for all metric results in pystylometry.
+
+ Native Chunked Analysis (Issue #27):
+ All metrics support chunked analysis by default. Results include:
+ - Convenient access to the mean value (e.g., result.reading_ease)
+ - Full distribution with per-chunk values and statistics (e.g., result.reading_ease_dist)
+
+ The Distribution dataclass provides:
+ - values: list of per-chunk metric values
+ - mean, median, std: central tendency and variability
+ - range, iqr: spread measures
+
+ This design captures the variance and rhythm in writing style, which is
+ essential for authorship attribution and linguistic fingerprinting.
+
+ References:
+ STTR methodology: Johnson, W. (1944). Studies in language behavior.
+ """

  from __future__ import annotations

+ import statistics
  from dataclasses import dataclass
  from typing import Any

+ # ===== Distribution and Chunking =====
+ # Related to GitHub Issue #27: Native chunked analysis with Distribution dataclass
+ # https://github.com/craigtrim/pystylometry/issues/27
+
+
+ @dataclass
+ class Distribution:
+ """Distribution of metric values across chunks.
+
+ This dataclass captures the variance and rhythm in writing style by storing
+ per-chunk values along with descriptive statistics. The variance across chunks
+ is often more revealing of authorial fingerprint than aggregate values.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ Attributes:
+ values: Raw per-chunk metric values
+ mean: Arithmetic mean of values
+ median: Middle value when sorted
+ std: Standard deviation (0.0 for single-chunk)
+ range: max - min (spread measure)
+ iqr: Interquartile range (Q3 - Q1), robust spread measure
+
+ Note:
+ min/max are omitted as trivial operations on values:
+ - min(dist.values), max(dist.values)
+
+ Example:
+ >>> dist = Distribution(
+ ... values=[65.2, 71.1, 68.8, 70.5],
+ ... mean=68.9, median=69.65, std=2.57,
+ ... range=5.9, iqr=3.15
+ ... )
+ >>> dist.std # variance reveals authorial fingerprint
+ 2.57
+ """
+
+ values: list[float]
+ mean: float
+ median: float
+ std: float
+ range: float
+ iqr: float
+
+
+ def chunk_text(text: str, chunk_size: int) -> list[str]:
+ """Split text into word-based chunks of approximately equal size.
+
+ Chunks are created by splitting on whitespace and grouping words.
+ The last chunk may be smaller than chunk_size if the text doesn't
+ divide evenly.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ Args:
+ text: The text to chunk
+ chunk_size: Target number of words per chunk (default: 1000)
+
+ Returns:
+ List of text chunks. For text smaller than chunk_size,
+ returns a single-element list with the entire text.
+
+ Example:
+ >>> chunks = chunk_text("word " * 2500, chunk_size=1000)
+ >>> len(chunks)
+ 3
+ >>> # First two chunks have ~1000 words, last has ~500
+ """
+ words = text.split()
+ if not words:
+ return [""]
+
+ chunks = []
+ for i in range(0, len(words), chunk_size):
+ chunk_words = words[i : i + chunk_size]
+ chunks.append(" ".join(chunk_words))
+
+ return chunks
+
+
+ def make_distribution(values: list[float]) -> Distribution:
+ """Create a Distribution from a list of values.
+
+ Computes all descriptive statistics for the distribution.
+ Handles single-value lists by setting std, range, and iqr to 0.0.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ Args:
+ values: List of numeric values (must be non-empty)
+
+ Returns:
+ Distribution with computed statistics
+
+ Raises:
+ ValueError: If values is empty
+
+ Example:
+ >>> dist = make_distribution([65.2, 71.1, 68.8, 70.5])
+ >>> dist.mean
+ 68.9
+ >>> dist.std # reveals variance in the signal
+ 2.57...
+ """
+ if not values:
+ raise ValueError("Cannot create distribution from empty values")
+
+ if len(values) == 1:
+ return Distribution(
+ values=values,
+ mean=values[0],
+ median=values[0],
+ std=0.0,
+ range=0.0,
+ iqr=0.0,
+ )
+
+ # For 2-3 values, quantiles() needs special handling
+ if len(values) < 4:
+ q1 = values[0]
+ q3 = values[-1]
+ else:
+ q = statistics.quantiles(values, n=4)
+ q1, q3 = q[0], q[2]
+
+ return Distribution(
+ values=values,
+ mean=statistics.mean(values),
+ median=statistics.median(values),
+ std=statistics.stdev(values),
+ range=max(values) - min(values),
+ iqr=q3 - q1,
+ )
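
As a quick illustration of how the two helpers above are meant to compose into the chunk -> per-chunk score -> Distribution flow described in the module docstring (raw_ttr and sample_text below are stand-ins for this note, not pystylometry APIs):

    sample_text = "word " * 2500                     # any long string works here

    def raw_ttr(chunk: str) -> float:
        # toy per-chunk scorer: unique words / total words
        words = chunk.split()
        return len(set(words)) / len(words)

    chunks = chunk_text(sample_text, chunk_size=1000)
    dist = make_distribution([raw_ttr(c) for c in chunks])
    print(dist.mean, dist.std, dist.iqr)             # per-chunk spread, not just the mean
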
+
+
  # ===== Lexical Results =====


  @dataclass
  class MTLDResult:
- """Result from MTLD (Measure of Textual Lexical Diversity) computation."""
+ """Result from MTLD (Measure of Textual Lexical Diversity) computation.
+
+ All numeric metrics include both a mean value (convenient access) and
+ a full distribution with per-chunk values and statistics.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27

+ Example:
+ >>> result = compute_mtld(text, chunk_size=1000)
+ >>> result.mtld_average # mean MTLD across chunks
+ 72.5
+ >>> result.mtld_average_dist.std # MTLD variance
+ 8.3
+ """
+
+ # Convenient access (mean values)
  mtld_forward: float
  mtld_backward: float
  mtld_average: float
+
+ # Full distributions
+ mtld_forward_dist: Distribution
+ mtld_backward_dist: Distribution
+ mtld_average_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


  @dataclass
  class YuleResult:
- """Result from Yule's K and I computation."""
+ """Result from Yule's K and I computation.
+
+ All numeric metrics include both a mean value (convenient access) and
+ a full distribution with per-chunk values and statistics.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ Example:
+ >>> result = compute_yule(text, chunk_size=1000)
+ >>> result.yule_k # mean across chunks
+ 120.5
+ >>> result.yule_k_dist.std # variance reveals fingerprint
+ 15.2
+ """

+ # Convenient access (mean values)
  yule_k: float
  yule_i: float
+
+ # Full distributions
+ yule_k_dist: Distribution
+ yule_i_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


  @dataclass
  class HapaxResult:
- """Result from Hapax Legomena analysis."""
+ """Result from Hapax Legomena analysis.
+
+ All numeric metrics include both a mean value (convenient access) and
+ a full distribution with per-chunk values and statistics.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ Example:
+ >>> result = compute_hapax(text, chunk_size=1000)
+ >>> result.hapax_ratio # mean across chunks
+ 0.45
+ >>> result.hapax_ratio_dist.std # variance
+ 0.08
+ """
+
+ # Convenient access (mean/total values)
+ hapax_count: int # Total across all chunks
+ hapax_ratio: float # Mean ratio
+ dis_hapax_count: int # Total across all chunks
+ dis_hapax_ratio: float # Mean ratio
+ sichel_s: float # Mean
+ honore_r: float # Mean
+
+ # Full distributions (ratios only - counts don't distribute meaningfully)
+ hapax_ratio_dist: Distribution
+ dis_hapax_ratio_dist: Distribution
+ sichel_s_dist: Distribution
+ honore_r_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ @dataclass
+ class LexiconCategories:
+ """Categorization of words by lexicon presence."""
+
+ neologisms: list[str] # Not in WordNet AND not in BNC
+ rare_words: list[str] # In one lexicon but not both
+ common_words: list[str] # In both WordNet AND BNC
+ neologism_ratio: float # Ratio of neologisms to total hapax
+ rare_word_ratio: float # Ratio of rare words to total hapax
+ metadata: dict[str, Any]
+
+
+ @dataclass
+ class HapaxLexiconResult:
+ """Result from Hapax Legomena analysis with lexicon categorization.
+
+ Extends basic hapax analysis by categorizing hapax legomena based on
+ presence in WordNet and British National Corpus (BNC):
+
+ - Neologisms: Words not in WordNet AND not in BNC (true novel words)
+ - Rare words: Words in BNC but not WordNet, or vice versa
+ - Common words: Words in both lexicons (just happen to appear once in text)
+
+ This categorization is valuable for stylometric analysis as it distinguishes
+ between vocabulary innovation (neologisms) and incidental hapax occurrence
+ (common words that appear once).
+ """
+
+ hapax_result: HapaxResult # Standard hapax metrics
+ lexicon_analysis: LexiconCategories # Lexicon-based categorization
+ metadata: dict[str, Any]
+
+
+ @dataclass
+ class TTRResult:
+ """Result from Type-Token Ratio (TTR) analysis.
+
+ Wraps stylometry-ttr package functionality to measure vocabulary richness
+ through the ratio of unique words (types) to total words (tokens).
+
+ All numeric metrics include both a mean value (convenient access) and
+ a full distribution with per-chunk values and statistics.
+
+ Includes multiple TTR variants for length normalization:
+ - Raw TTR: Direct ratio of unique to total words
+ - Root TTR (Guiraud's index): types / sqrt(tokens)
+ - Log TTR (Herdan's C): log(types) / log(tokens)
+ - STTR: Standardized TTR across fixed-size chunks
+ - Delta Std: Measures vocabulary consistency across chunks
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ References:
+ Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
+ Herdan, G. (1960). Type-token Mathematics.
+
+ Example:
+ >>> result = compute_ttr(text, chunk_size=1000)
+ >>> result.ttr # mean TTR across chunks
+ 0.42
+ >>> result.ttr_dist.std # TTR variance reveals fingerprint
+ 0.05
+ >>> result.chunk_count
+ 59
+ """
+
+ # Convenient access (mean values)
+ total_words: int
+ unique_words: int
+ ttr: float # Raw TTR (mean)
+ root_ttr: float # Guiraud's index (mean)
+ log_ttr: float # Herdan's C (mean)
+ sttr: float # Standardized TTR (mean)
+ delta_std: float # Vocabulary consistency (mean)
+
+ # Full distributions with per-chunk values
+ ttr_dist: Distribution
+ root_ttr_dist: Distribution
+ log_ttr_dist: Distribution
+ sttr_dist: Distribution
+ delta_std_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int

- hapax_count: int
- hapax_ratio: float
- dis_hapax_count: int
- dis_hapax_ratio: float
- sichel_s: float
- honore_r: float
  metadata: dict[str, Any]
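
The raw, root, and log TTR variants named in the TTRResult docstring reduce to simple arithmetic on type and token counts; a small worked example with toy values (illustrative only, not pystylometry output):

    import math

    tokens = ["the", "cat", "sat", "on", "the", "mat"]        # 6 tokens
    types = set(tokens)                                        # 5 types
    ttr = len(types) / len(tokens)                             # 0.833 (raw TTR)
    root_ttr = len(types) / math.sqrt(len(tokens))             # 2.041 (Guiraud's index)
    log_ttr = math.log(len(types)) / math.log(len(tokens))     # 0.898 (Herdan's C)
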


@@ -45,48 +375,135 @@ class HapaxResult:

  @dataclass
  class FleschResult:
- """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation."""
+ """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation.
+
+ All numeric metrics include both a mean value (convenient access) and
+ a full distribution with per-chunk values and statistics.

+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ Example:
+ >>> result = compute_flesch(text, chunk_size=1000)
+ >>> result.reading_ease # mean across chunks
+ 68.54
+ >>> result.reading_ease_dist.std # variance reveals fingerprint
+ 4.2
+ >>> result.reading_ease_dist.values # per-chunk values
+ [65.2, 71.1, 68.8, ...]
+ """
+
+ # Convenient access (mean values)
  reading_ease: float
  grade_level: float
- difficulty: str # "Very Easy", "Easy", "Fairly Easy", "Standard", etc.
+ difficulty: str # Based on mean reading_ease
+
+ # Full distributions
+ reading_ease_dist: Distribution
+ grade_level_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


  @dataclass
  class SMOGResult:
- """Result from SMOG Index computation."""
+ """Result from SMOG Index computation.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """

+ # Convenient access (mean values)
  smog_index: float
- grade_level: int
+ grade_level: float
+
+ # Full distributions
+ smog_index_dist: Distribution
+ grade_level_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


  @dataclass
  class GunningFogResult:
- """Result from Gunning Fog Index computation."""
+ """Result from Gunning Fog Index computation.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """

+ # Convenient access (mean values)
  fog_index: float
- grade_level: int
+ grade_level: float
+
+ # Full distributions
+ fog_index_dist: Distribution
+ grade_level_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


  @dataclass
  class ColemanLiauResult:
- """Result from Coleman-Liau Index computation."""
+ """Result from Coleman-Liau Index computation.

+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """
+
+ # Convenient access (mean values)
  cli_index: float
- grade_level: int
+ grade_level: float # Changed to float for mean across chunks
+
+ # Full distributions
+ cli_index_dist: Distribution
+ grade_level_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


  @dataclass
  class ARIResult:
- """Result from Automated Readability Index computation."""
+ """Result from Automated Readability Index computation.

+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """
+
+ # Convenient access (mean values)
  ari_score: float
- grade_level: int
- age_range: str
+ grade_level: float # Changed to float for mean across chunks
+ age_range: str # Based on mean grade level
+
+ # Full distributions
+ ari_score_dist: Distribution
+ grade_level_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


@@ -95,8 +512,14 @@ class ARIResult:

  @dataclass
  class POSResult:
- """Result from Part-of-Speech ratio analysis."""
+ """Result from Part-of-Speech ratio analysis.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """

+ # Convenient access (mean values)
  noun_ratio: float
  verb_ratio: float
  adjective_ratio: float
@@ -105,19 +528,52 @@ class POSResult:
  adjective_noun_ratio: float
  lexical_density: float
  function_word_ratio: float
+
+ # Full distributions
+ noun_ratio_dist: Distribution
+ verb_ratio_dist: Distribution
+ adjective_ratio_dist: Distribution
+ adverb_ratio_dist: Distribution
+ noun_verb_ratio_dist: Distribution
+ adjective_noun_ratio_dist: Distribution
+ lexical_density_dist: Distribution
+ function_word_ratio_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


  @dataclass
  class SentenceStatsResult:
- """Result from sentence-level statistics."""
+ """Result from sentence-level statistics.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """

+ # Convenient access (mean values)
  mean_sentence_length: float
  sentence_length_std: float
- sentence_length_range: int
- min_sentence_length: int
- max_sentence_length: int
- sentence_count: int
+ sentence_length_range: float # Changed to float for mean across chunks
+ min_sentence_length: float # Changed to float for mean across chunks
+ max_sentence_length: float # Changed to float for mean across chunks
+ sentence_count: int # Total across all chunks
+
+ # Full distributions
+ mean_sentence_length_dist: Distribution
+ sentence_length_std_dist: Distribution
+ sentence_length_range_dist: Distribution
+ min_sentence_length_dist: Distribution
+ max_sentence_length_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
  metadata: dict[str, Any]


@@ -149,11 +605,1481 @@ class ZetaResult:

  @dataclass
  class EntropyResult:
- """Result from n-gram entropy computation."""
+ """Result from n-gram entropy computation.

+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """
+
+ # Convenient access (mean values)
  entropy: float
  perplexity: float
  ngram_type: str # "character_bigram", "word_bigram", "word_trigram"
+
+ # Full distributions
+ entropy_dist: Distribution
+ perplexity_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
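
Entropy and perplexity are linked by the standard information-theoretic relationship perplexity = 2 ** entropy (for entropy measured in bits). This diff does not show how pystylometry computes the two fields, but the relationship itself is easy to check by hand:

    import math

    probs = [0.5, 0.25, 0.25]                            # toy bigram distribution
    entropy = -sum(p * math.log2(p) for p in probs)      # 1.5 bits
    perplexity = 2 ** entropy                            # ~2.83
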
+
+
+ # ===== Character-Level Results =====
+ # Related to GitHub Issue #12: Character-Level Metrics
+ # https://github.com/craigtrim/pystylometry/issues/12
+
+
+ @dataclass
+ class CharacterMetricsResult:
+ """Result from character-level metrics analysis.
+
+ This dataclass holds character-level stylometric features that provide
+ low-level insights into writing style. Character-level metrics are
+ fundamental for authorship attribution and can capture distinctive
+ patterns in punctuation, formatting, and word construction.
+
+ Related GitHub Issues:
+ #12 - Character-Level Metrics
+ #27 - Native chunked analysis with Distribution dataclass
+
+ Metrics included:
+ - Average word length (characters per word)
+ - Average sentence length (characters per sentence)
+ - Punctuation density (punctuation marks per 100 words)
+ - Punctuation variety (count of unique punctuation types)
+ - Letter frequency distribution (26-element vector for a-z)
+ - Vowel-to-consonant ratio
+ - Digit frequency (count/ratio of numeric characters)
+ - Uppercase ratio (uppercase letters / total letters)
+ - Whitespace ratio (whitespace characters / total characters)
+
+ References:
+ Grieve, J. (2007). Quantitative authorship attribution: An evaluation
+ of techniques. Literary and Linguistic Computing, 22(3), 251-270.
+ Stamatatos, E. (2009). A survey of modern authorship attribution methods.
+ JASIST, 60(3), 538-556.
+ """
+
+ # Convenient access (mean values)
+ avg_word_length: float
+ avg_sentence_length_chars: float
+ punctuation_density: float
+ punctuation_variety: float # Changed to float for mean across chunks
+ letter_frequency: dict[str, float] # Aggregate frequency
+ vowel_consonant_ratio: float
+ digit_count: int # Total across all chunks
+ digit_ratio: float
+ uppercase_ratio: float
+ whitespace_ratio: float
+
+ # Full distributions
+ avg_word_length_dist: Distribution
+ avg_sentence_length_chars_dist: Distribution
+ punctuation_density_dist: Distribution
+ punctuation_variety_dist: Distribution
+ vowel_consonant_ratio_dist: Distribution
+ digit_ratio_dist: Distribution
+ uppercase_ratio_dist: Distribution
+ whitespace_ratio_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ # ===== Function Word Results =====
+ # Related to GitHub Issue #13: Function Word Analysis
+ # https://github.com/craigtrim/pystylometry/issues/13
+
+
+ @dataclass
+ class FunctionWordResult:
+ """Result from function word analysis.
+
+ Function words (determiners, prepositions, conjunctions, pronouns, auxiliary
+ verbs) are highly frequent, content-independent words that are often used
+ subconsciously. They are considered strong authorship markers because authors
+ use them consistently across different topics and genres.
+
+ Related GitHub Issues:
+ #13 - Function Word Analysis
+ #27 - Native chunked analysis with Distribution dataclass
+
+ This analysis computes:
+ - Frequency profiles for all function word categories
+ - Ratios for specific grammatical categories
+ - Most/least frequently used function words
+ - Function word diversity metrics
+
+ Function word categories analyzed:
+ - Determiners: the, a, an, this, that, these, those, etc.
+ - Prepositions: in, on, at, by, for, with, from, to, etc.
+ - Conjunctions: and, but, or, nor, for, yet, so, etc.
+ - Pronouns: I, you, he, she, it, we, they, etc.
+ - Auxiliary verbs: be, have, do, can, will, shall, may, etc.
+ - Particles: up, down, out, off, over, etc.
+
+ References:
+ Mosteller, F., & Wallace, D. L. (1964). Inference and disputed authorship:
+ The Federalist. Addison-Wesley.
+ Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
+ to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
+ """
+
+ # Convenient access (mean values)
+ determiner_ratio: float
+ preposition_ratio: float
+ conjunction_ratio: float
+ pronoun_ratio: float
+ auxiliary_ratio: float
+ particle_ratio: float
+ total_function_word_ratio: float
+ function_word_diversity: float
+ most_frequent_function_words: list[tuple[str, int]] # Aggregate
+ least_frequent_function_words: list[tuple[str, int]] # Aggregate
+ function_word_distribution: dict[str, int] # Aggregate
+
+ # Full distributions
+ determiner_ratio_dist: Distribution
+ preposition_ratio_dist: Distribution
+ conjunction_ratio_dist: Distribution
+ pronoun_ratio_dist: Distribution
+ auxiliary_ratio_dist: Distribution
+ particle_ratio_dist: Distribution
+ total_function_word_ratio_dist: Distribution
+ function_word_diversity_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ # ===== Advanced Lexical Diversity Results =====
+ # Related to GitHub Issue #14: Advanced Lexical Diversity Metrics
+ # https://github.com/craigtrim/pystylometry/issues/14
+
+
+ @dataclass
+ class VocdDResult:
+ """Result from voc-D computation.
+
+ voc-D is a sophisticated measure of lexical diversity that uses a mathematical
+ model to estimate vocabulary richness while controlling for text length.
+ It fits a curve to the relationship between tokens and types across multiple
+ random samples of the text.
+
+ Related GitHub Issues:
+ #14 - Advanced Lexical Diversity Metrics
+ #27 - Native chunked analysis with Distribution dataclass
+
+ The D parameter represents the theoretical vocabulary size and is more
+ stable across different text lengths than simple TTR measures.
+
+ References:
+ Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004).
+ Lexical Diversity and Language Development. Palgrave Macmillan.
+ McKee, G., Malvern, D., & Richards, B. (2000). Measuring vocabulary
+ diversity using dedicated software. Literary and Linguistic Computing,
+ 15(3), 323-337.
+ """
+
+ # Convenient access (mean values)
+ d_parameter: float
+ curve_fit_r_squared: float
+ sample_count: int # Total across all chunks
+ optimal_sample_size: int
+
+ # Full distributions
+ d_parameter_dist: Distribution
+ curve_fit_r_squared_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ @dataclass
+ class MATTRResult:
+ """Result from MATTR (Moving-Average Type-Token Ratio) computation.
+
+ MATTR computes TTR using a moving window of fixed size, which provides
+ a more stable measure of lexical diversity than simple TTR, especially
+ for longer texts. The moving window approach reduces the impact of text
+ length on the TTR calculation.
+
+ Related GitHub Issues:
+ #14 - Advanced Lexical Diversity Metrics
+ #27 - Native chunked analysis with Distribution dataclass
+
+ References:
+ Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
+ The moving-average type-token ratio (MATTR). Journal of Quantitative
+ Linguistics, 17(2), 94-100.
+ """
+
+ # Convenient access (mean values)
+ mattr_score: float
+ window_size: int
+ window_count: int # Total across all chunks
+ ttr_std_dev: float
+ min_ttr: float
+ max_ttr: float
+
+ # Full distributions
+ mattr_score_dist: Distribution
+ ttr_std_dev_dist: Distribution
+ min_ttr_dist: Distribution
+ max_ttr_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
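
A minimal sketch of the moving-window idea described in the MATTRResult docstring (illustrative only, not the package's implementation; the window size is arbitrary here):

    def mattr(tokens: list[str], window: int = 50) -> float:
        # average TTR over every window-sized slice of the token sequence
        if len(tokens) <= window:
            return len(set(tokens)) / len(tokens)
        ttrs = [
            len(set(tokens[i:i + window])) / window
            for i in range(len(tokens) - window + 1)
        ]
        return sum(ttrs) / len(ttrs)
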
+
+
+ @dataclass
+ class HDDResult:
+ """Result from HD-D (Hypergeometric Distribution D) computation.
+
+ HD-D is a probabilistic measure of lexical diversity based on the
+ hypergeometric distribution. It estimates the probability of encountering
+ new word types as text length increases, providing a mathematically
+ rigorous measure that is less sensitive to text length than TTR.
+
+ Related GitHub Issues:
+ #14 - Advanced Lexical Diversity Metrics
+ #27 - Native chunked analysis with Distribution dataclass
+
+ References:
+ McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
+ study of sophisticated approaches to lexical diversity assessment.
+ Behavior Research Methods, 42(2), 381-392.
+ """
+
+ # Convenient access (mean values)
+ hdd_score: float
+ sample_size: int
+ type_count: int # Total unique across all chunks
+ token_count: int # Total across all chunks
+
+ # Full distributions
+ hdd_score_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ @dataclass
+ class MSTTRResult:
+ """Result from MSTTR (Mean Segmental Type-Token Ratio) computation.
+
+ MSTTR divides the text into sequential segments of equal length and
+ computes the average TTR across all segments. This provides a length-
+ normalized measure of lexical diversity that is more comparable across
+ texts of different lengths.
+
+ Related GitHub Issues:
+ #14 - Advanced Lexical Diversity Metrics
+ #27 - Native chunked analysis with Distribution dataclass
+
+ References:
+ Johnson, W. (1944). Studies in language behavior: I. A program of research.
+ Psychological Monographs, 56(2), 1-15.
+ """
+
+ # Convenient access (mean values)
+ msttr_score: float
+ segment_size: int
+ segment_count: int # Total across all chunks
+ ttr_std_dev: float
+ min_ttr: float
+ max_ttr: float
+ segment_ttrs: list[float] # Aggregate from all chunks
+
+ # Full distributions
+ msttr_score_dist: Distribution
+ ttr_std_dev_dist: Distribution
+ min_ttr_dist: Distribution
+ max_ttr_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ # ===== Word Frequency Sophistication Results =====
+ # Related to GitHub Issue #15: Word Frequency Sophistication Metrics
+ # https://github.com/craigtrim/pystylometry/issues/15
+
+
+ @dataclass
+ class WordFrequencySophisticationResult:
+ """Result from word frequency sophistication analysis.
+
+ Word frequency sophistication metrics measure how common or rare the
+ vocabulary used in a text is, based on reference frequency lists from
+ large corpora. Authors who use less frequent (more sophisticated) words
+ score higher on these metrics.
+
+ Related GitHub Issues:
+ #15 - Word Frequency Sophistication Metrics
+ #27 - Native chunked analysis with Distribution dataclass
+
+ This analysis uses reference frequency data from:
+ - COCA (Corpus of Contemporary American English)
+ - BNC (British National Corpus)
+ - Google N-grams
+ - SUBTLEXus (subtitle frequencies)
+
+ Metrics computed:
+ - Mean word frequency (average frequency rank)
+ - Median word frequency
+ - Rare word ratio (words beyond frequency threshold)
+ - Academic word ratio (from Academic Word List)
+ - Advanced word ratio (sophisticated vocabulary)
+
+ References:
+ Brysbaert, M., & New, B. (2009). Moving beyond Kučera and Francis:
+ A critical evaluation of current word frequency norms. Behavior
+ Research Methods, Instruments, & Computers, 41(4), 977-990.
+ Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
+ """
+
+ # Convenient access (mean values)
+ mean_frequency_rank: float
+ median_frequency_rank: float
+ rare_word_ratio: float
+ common_word_ratio: float
+ academic_word_ratio: float
+ advanced_word_ratio: float
+ frequency_band_distribution: dict[str, float] # Aggregate
+ rarest_words: list[tuple[str, float]] # Aggregate
+ most_common_words: list[tuple[str, float]] # Aggregate
+
+ # Full distributions
+ mean_frequency_rank_dist: Distribution
+ median_frequency_rank_dist: Distribution
+ rare_word_ratio_dist: Distribution
+ common_word_ratio_dist: Distribution
+ academic_word_ratio_dist: Distribution
+ advanced_word_ratio_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ # ===== Additional Readability Results =====
+ # Related to GitHub Issue #16: Additional Readability Formulas
+ # https://github.com/craigtrim/pystylometry/issues/16
+
+
+ @dataclass
+ class DaleChallResult:
+ """Result from Dale-Chall Readability Formula.
+
+ The Dale-Chall formula uses a list of 3000 familiar words that 80% of
+ fourth-graders understand. Words not on this list are considered "difficult."
+ The formula provides a grade level estimate based on sentence length and
+ the percentage of difficult words.
+
+ Related GitHub Issues:
+ #16 - Additional Readability Formulas
+ #27 - Native chunked analysis with Distribution dataclass
+
+ Formula: 0.1579 * (difficult_words / total_words * 100) + 0.0496 * avg_sentence_length
+
+ If % difficult words > 5%, add 3.6365 to the raw score.
+
+ References:
+ Dale, E., & Chall, J. S. (1948). A formula for predicting readability.
+ Educational Research Bulletin, 27(1), 11-28.
+ Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall
+ readability formula. Brookline Books.
+ """
+
+ # Convenient access (mean values)
+ dale_chall_score: float
+ grade_level: str # Based on mean score
+ difficult_word_count: int # Total across all chunks
+ difficult_word_ratio: float # Mean ratio
+ avg_sentence_length: float # Mean
+ total_words: int # Total across all chunks
+
+ # Full distributions
+ dale_chall_score_dist: Distribution
+ difficult_word_ratio_dist: Distribution
+ avg_sentence_length_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
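
Plugging hypothetical counts into the Dale-Chall formula quoted in the docstring above (numbers chosen only to show the arithmetic):

    difficult_words, total_words, sentences = 12, 100, 5
    pct_difficult = difficult_words / total_words * 100            # 12.0%
    avg_sentence_length = total_words / sentences                   # 20.0
    raw = 0.1579 * pct_difficult + 0.0496 * avg_sentence_length     # 2.8868
    score = raw + 3.6365 if pct_difficult > 5 else raw              # 6.5233
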
+
+
+ @dataclass
+ class LinsearWriteResult:
+ """Result from Linsear Write Formula.
+
+ The Linsear Write Formula was developed for the U.S. Air Force to calculate
+ the readability of technical manuals. It categorizes words as "easy" (1-2
+ syllables) or "hard" (3+ syllables) and uses sentence length to estimate
+ grade level. It's particularly effective for technical writing.
+
+ Related GitHub Issues:
+ #16 - Additional Readability Formulas
+ #27 - Native chunked analysis with Distribution dataclass
+
+ References:
+ Klare, G. R. (1974-1975). Assessing readability. Reading Research Quarterly,
+ 10(1), 62-102.
+ """
+
+ # Convenient access (mean values)
+ linsear_score: float
+ grade_level: float # Changed to float for mean across chunks
+ easy_word_count: int # Total across all chunks
+ hard_word_count: int # Total across all chunks
+ avg_sentence_length: float # Mean
+
+ # Full distributions
+ linsear_score_dist: Distribution
+ grade_level_dist: Distribution
+ avg_sentence_length_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ @dataclass
+ class FryResult:
+ """Result from Fry Readability Graph.
+
+ The Fry Readability Graph uses average sentence length and average syllables
+ per word to determine reading difficulty. It plots these values on a graph
+ to determine the grade level. This implementation provides the numerical
+ coordinates and estimated grade level.
+
+ Related GitHub Issues:
+ #16 - Additional Readability Formulas
+ #27 - Native chunked analysis with Distribution dataclass
+
+ References:
+ Fry, E. (1968). A readability formula that saves time. Journal of Reading,
+ 11(7), 513-578.
+ Fry, E. (1977). Fry's readability graph: Clarifications, validity, and
+ extension to level 17. Journal of Reading, 21(3), 242-252.
+ """
+
+ # Convenient access (mean values)
+ avg_sentence_length: float
+ avg_syllables_per_100: float
+ grade_level: str # Based on mean coordinates
+ graph_zone: str # Based on mean coordinates
+
+ # Full distributions
+ avg_sentence_length_dist: Distribution
+ avg_syllables_per_100_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ @dataclass
+ class FORCASTResult:
+ """Result from FORCAST Readability Formula.
+
+ FORCAST (FORmula for CASTing readability) was developed by the U.S. military
+ to assess readability without counting syllables. It uses only single-syllable
+ words as a measure, making it faster to compute than syllable-based formulas.
+ Particularly useful for technical and military documents.
+
+ Related GitHub Issues:
+ #16 - Additional Readability Formulas
+ #27 - Native chunked analysis with Distribution dataclass
+
+ Formula: 20 - (N / 10), where N is the number of single-syllable words
+ per 150-word sample.
+
+ References:
+ Caylor, J. S., Sticht, T. G., Fox, L. C., & Ford, J. P. (1973).
+ Methodologies for determining reading requirements of military
+ occupational specialties. Human Resources Research Organization.
+ """
+
+ # Convenient access (mean values)
+ forcast_score: float
+ grade_level: float # Changed to float for mean across chunks
+ single_syllable_ratio: float # Mean ratio
+ single_syllable_count: int # Total across all chunks
+ total_words: int # Total across all chunks
+
+ # Full distributions
+ forcast_score_dist: Distribution
+ grade_level_dist: Distribution
+ single_syllable_ratio_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
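
Applying the FORCAST formula quoted above to a hypothetical 150-word sample:

    single_syllable_words = 105                     # N, counted in a 150-word sample
    grade = 20 - single_syllable_words / 10         # 9.5
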
+
+
+ @dataclass
+ class PowersSumnerKearlResult:
+ """Result from Powers-Sumner-Kearl Readability Formula.
+
+ The Powers-Sumner-Kearl formula is a variation of the Flesch Reading Ease
+ formula, recalibrated for primary grade levels (grades 1-4). It uses
+ average sentence length and average syllables per word, but with different
+ coefficients optimized for younger readers.
+
+ Related GitHub Issues:
+ #16 - Additional Readability Formulas
+ #27 - Native chunked analysis with Distribution dataclass
+
+ Formula: 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
+
+ References:
+ Powers, R. D., Sumner, W. A., & Kearl, B. E. (1958). A recalculation of
+ four adult readability formulas. Journal of Educational Psychology,
+ 49(2), 99-105.
+ """
+
+ # Convenient access (mean values)
+ psk_score: float
+ grade_level: float
+ avg_sentence_length: float
+ avg_syllables_per_word: float
+ total_sentences: int # Total across all chunks
+ total_words: int # Total across all chunks
+ total_syllables: int # Total across all chunks
+
+ # Full distributions
+ psk_score_dist: Distribution
+ grade_level_dist: Distribution
+ avg_sentence_length_dist: Distribution
+ avg_syllables_per_word_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ # ===== Advanced Syntactic Results =====
+ # Related to GitHub Issue #17: Advanced Syntactic Analysis
+ # https://github.com/craigtrim/pystylometry/issues/17
+
+
+ @dataclass
+ class AdvancedSyntacticResult:
+ """Result from advanced syntactic analysis using dependency parsing.
+
+ Advanced syntactic analysis uses dependency parsing to extract sophisticated
+ grammatical features that go beyond simple POS tagging. These features
+ capture sentence complexity, grammatical sophistication, and syntactic
+ style preferences.
+
+ Related GitHub Issues:
+ #17 - Advanced Syntactic Analysis
+ #27 - Native chunked analysis with Distribution dataclass
+
+ Features analyzed:
+ - Parse tree depth (sentence structural complexity)
+ - T-units (minimal terminable units - independent clauses with modifiers)
+ - Clausal density (clauses per T-unit)
+ - Dependent clause ratio
+ - Passive voice ratio
+ - Subordination index
+ - Coordination index
+ - Sentence complexity score
+
+ References:
+ Hunt, K. W. (1965). Grammatical structures written at three grade levels.
+ NCTE Research Report No. 3.
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+ Lu, X. (2010). Automatic analysis of syntactic complexity in second language
+ writing. International Journal of Corpus Linguistics, 15(4), 474-496.
+ """
+
+ # Convenient access (mean values)
+ mean_parse_tree_depth: float
+ max_parse_tree_depth: float # Changed to float for mean across chunks
+ t_unit_count: int # Total across all chunks
+ mean_t_unit_length: float
+ clausal_density: float
+ dependent_clause_ratio: float
+ passive_voice_ratio: float
+ subordination_index: float
+ coordination_index: float
+ sentence_complexity_score: float
+ dependency_distance: float
+ left_branching_ratio: float
+ right_branching_ratio: float
+
+ # Full distributions
+ mean_parse_tree_depth_dist: Distribution
+ max_parse_tree_depth_dist: Distribution
+ mean_t_unit_length_dist: Distribution
+ clausal_density_dist: Distribution
+ dependent_clause_ratio_dist: Distribution
+ passive_voice_ratio_dist: Distribution
+ subordination_index_dist: Distribution
+ coordination_index_dist: Distribution
+ sentence_complexity_score_dist: Distribution
+ dependency_distance_dist: Distribution
+ left_branching_ratio_dist: Distribution
+ right_branching_ratio_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ # ===== Sentence Type Results =====
+ # Related to GitHub Issue #18: Sentence Type Classification
+ # https://github.com/craigtrim/pystylometry/issues/18
+
+
+ @dataclass
+ class SentenceTypeResult:
+ """Result from sentence type classification analysis.
+
+ Sentence type classification categorizes sentences by their grammatical
+ structure (simple, compound, complex, compound-complex) and communicative
+ function (declarative, interrogative, imperative, exclamatory). Different
+ authors and genres show distinct patterns in sentence type distribution.
+
+ Related GitHub Issues:
+ #18 - Sentence Type Classification
+ #27 - Native chunked analysis with Distribution dataclass
+
+ Structural types:
+ - Simple: One independent clause (e.g., "The cat sat.")
+ - Compound: Multiple independent clauses (e.g., "I came, I saw, I conquered.")
+ - Complex: One independent + dependent clause(s) (e.g., "When I arrived, I saw her.")
+ - Compound-Complex: Multiple independent + dependent
+ (e.g., "I came when called, and I stayed.")
+
+ Functional types:
+ - Declarative: Statement (e.g., "The sky is blue.")
+ - Interrogative: Question (e.g., "Is the sky blue?")
+ - Imperative: Command (e.g., "Look at the sky!")
+ - Exclamatory: Exclamation (e.g., "What a blue sky!")
+
+ References:
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+ Huddleston, R., & Pullum, G. K. (2002). The Cambridge Grammar of the English Language.
+ """
+
+ # Convenient access (mean ratios)
+ simple_ratio: float
+ compound_ratio: float
+ complex_ratio: float
+ compound_complex_ratio: float
+ declarative_ratio: float
+ interrogative_ratio: float
+ imperative_ratio: float
+ exclamatory_ratio: float
+
+ # Counts (totals across all chunks)
+ simple_count: int
+ compound_count: int
+ complex_count: int
+ compound_complex_count: int
+ declarative_count: int
+ interrogative_count: int
+ imperative_count: int
+ exclamatory_count: int
+ total_sentences: int
+
+ # Diversity (mean across chunks)
+ structural_diversity: float
+ functional_diversity: float
+
+ # Full distributions
+ simple_ratio_dist: Distribution
+ compound_ratio_dist: Distribution
+ complex_ratio_dist: Distribution
+ compound_complex_ratio_dist: Distribution
+ declarative_ratio_dist: Distribution
+ interrogative_ratio_dist: Distribution
+ imperative_ratio_dist: Distribution
+ exclamatory_ratio_dist: Distribution
+ structural_diversity_dist: Distribution
+ functional_diversity_dist: Distribution
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ metadata: dict[str, Any]
+
+
+ # ===== Extended N-gram Results =====
+ # Related to GitHub Issue #19: Extended N-gram Features
+ # https://github.com/craigtrim/pystylometry/issues/19
+
+
+ @dataclass
+ class ExtendedNgramResult:
+ """Result from extended n-gram analysis.
+
+ Extended n-gram analysis goes beyond basic bigram/trigram entropy to provide
+ comprehensive n-gram statistics including frequency distributions, most
+ distinctive n-grams, skipgrams, and part-of-speech n-grams. These features
+ are valuable for authorship attribution and style analysis.
+
+ Related GitHub Issue:
+ #19 - Extended N-gram Features
+ https://github.com/craigtrim/pystylometry/issues/19
+
+ Features computed:
+ - Trigram frequency distributions and top trigrams
+ - 4-gram frequency distributions and top 4-grams
+ - Skipgrams (n-grams with gaps, e.g., "the * dog")
+ - POS n-grams (e.g., "DET ADJ NOUN")
+ - Character trigrams and 4-grams
+ - N-gram diversity metrics
+ - Entropy for each n-gram order
+
+ References:
+ Guthrie, D., Allison, B., Liu, W., Guthrie, L., & Wilks, Y. (2006).
+ A closer look at skip-gram modelling. LREC.
+ Stamatatos, E. (2009). A survey of modern authorship attribution methods.
+ JASIST, 60(3), 538-556.
+
+ Example:
+ >>> result = compute_extended_ngrams("Sample text for n-gram analysis...")
+ >>> print(f"Top trigrams: {result.top_word_trigrams[:5]}")
+ >>> print(f"Trigram entropy: {result.word_trigram_entropy:.2f}")
+ """
+
+ # Word n-grams
+ top_word_trigrams: list[tuple[str, int]] # Most frequent word trigrams
+ top_word_4grams: list[tuple[str, int]] # Most frequent word 4-grams
+ word_trigram_count: int # Total unique word trigrams
+ word_4gram_count: int # Total unique word 4-grams
+ word_trigram_entropy: float # Shannon entropy of trigram distribution
+ word_4gram_entropy: float # Shannon entropy of 4-gram distribution
+
+ # Skipgrams (n-grams with gaps)
+ top_skipgrams_2_1: list[tuple[str, int]] # Top 2-skipgrams (gap of 1)
+ top_skipgrams_3_1: list[tuple[str, int]] # Top 3-skipgrams (gap of 1)
+ skipgram_2_1_count: int # Unique 2-skipgrams
+ skipgram_3_1_count: int # Unique 3-skipgrams
+
+ # POS n-grams
+ top_pos_trigrams: list[tuple[str, int]] # Most frequent POS trigrams
+ top_pos_4grams: list[tuple[str, int]] # Most frequent POS 4-grams
+ pos_trigram_count: int # Unique POS trigrams
+ pos_4gram_count: int # Unique POS 4-grams
+ pos_trigram_entropy: float # Shannon entropy of POS trigram distribution
+
+ # Character n-grams
+ top_char_trigrams: list[tuple[str, int]] # Most frequent character trigrams
+ top_char_4grams: list[tuple[str, int]] # Most frequent character 4-grams
+ char_trigram_entropy: float # Shannon entropy of char trigram distribution
+ char_4gram_entropy: float # Shannon entropy of char 4-gram distribution
+
+ metadata: dict[str, Any] # Full frequency distributions, parameters, etc.
+
+
+ # ===== Stylistic Markers Results =====
+ # Related to GitHub Issue #20: Stylistic Markers
+ # https://github.com/craigtrim/pystylometry/issues/20
+
+
+ @dataclass
+ class StylisticMarkersResult:
+ """Result from stylistic markers analysis.
+
+ Stylistic markers are specific linguistic features that authors tend to use
+ consistently and often subconsciously. These include contraction usage,
+ intensifier preferences, hedging expressions, punctuation habits, and more.
+ They are powerful indicators of authorial identity.
+
+ Related GitHub Issue:
+ #20 - Stylistic Markers
+ https://github.com/craigtrim/pystylometry/issues/20
+
+ Markers analyzed:
+ - Contraction usage (don't vs. do not, I'm vs. I am, etc.)
+ - Intensifiers (very, really, extremely, quite, etc.)
+ - Hedges (maybe, perhaps, probably, somewhat, etc.)
+ - Modal auxiliaries (can, could, may, might, must, should, will, would)
+ - Negation patterns (not, no, never, none, neither, etc.)
+ - Exclamation frequency
+ - Question frequency
+ - Quotation usage
+ - Parenthetical expressions
+ - Ellipses and dashes
+
+ References:
+ Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
+ words for authorship attribution. ACH/ALLC.
+ Pennebaker, J. W. (2011). The secret life of pronouns. Bloomsbury Press.
+
+ Example:
+ >>> result = compute_stylistic_markers("Sample text with various markers...")
+ >>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
+ >>> print(f"Intensifier density: {result.intensifier_density:.2f}")
+ >>> print(f"Hedging density: {result.hedging_density:.2f}")
+ """
+
+ # Contraction patterns
+ contraction_ratio: float # Contractions / (contractions + full forms)
+ contraction_count: int # Total contractions
+ expanded_form_count: int # Total expanded forms (e.g., "do not" vs "don't")
+ top_contractions: list[tuple[str, int]] # Most frequent contractions
+
+ # Intensifiers and hedges
+ intensifier_density: float # Intensifiers per 100 words
+ intensifier_count: int # Total intensifier count
+ top_intensifiers: list[tuple[str, int]] # Most frequent intensifiers
+ hedging_density: float # Hedges per 100 words
+ hedging_count: int # Total hedge count
+ top_hedges: list[tuple[str, int]] # Most frequent hedges
+
+ # Modal auxiliaries
+ modal_density: float # Modal auxiliaries per 100 words
+ modal_distribution: dict[str, int] # Count per modal (can, could, may, etc.)
+ epistemic_modal_ratio: float # Epistemic modals / all modals
+ deontic_modal_ratio: float # Deontic modals / all modals
+
+ # Negation
+ negation_density: float # Negation markers per 100 words
+ negation_count: int # Total negation markers
+ negation_types: dict[str, int] # not, no, never, etc. with counts
+
+ # Punctuation style
+ exclamation_density: float # Exclamation marks per 100 words
+ question_density: float # Question marks per 100 words
+ quotation_density: float # Quotation marks per 100 words
+ parenthetical_density: float # Parentheses per 100 words
+ ellipsis_density: float # Ellipses per 100 words
+ dash_density: float # Dashes (em/en) per 100 words
+ semicolon_density: float # Semicolons per 100 words
+ colon_density: float # Colons per 100 words
+
+ metadata: dict[str, Any] # Full lists, total word count, etc.
+
+
+ # ===== Vocabulary Overlap Results =====
+ # Related to GitHub Issue #21: Vocabulary Overlap and Similarity Metrics
+ # https://github.com/craigtrim/pystylometry/issues/21
+
+
+ @dataclass
+ class VocabularyOverlapResult:
+ """Result from vocabulary overlap and similarity analysis.
+
+ Vocabulary overlap metrics measure the similarity between two texts based on
+ their shared vocabulary. These metrics are useful for authorship verification,
+ plagiarism detection, and measuring stylistic consistency across texts.
+
+ Related GitHub Issue:
+ #21 - Vocabulary Overlap and Similarity Metrics
+ https://github.com/craigtrim/pystylometry/issues/21
+
+ Metrics computed:
+ - Jaccard similarity (intersection / union)
+ - Dice coefficient (2 * intersection / sum of sizes)
+ - Overlap coefficient (intersection / min(size1, size2))
+ - Cosine similarity (using word frequency vectors)
+ - Shared vocabulary size and ratio
+ - Unique words in each text
+ - Most distinctive words for each text
+
+ References:
+ Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+ New Phytologist, 11(2), 37-50.
+ Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
+ Retrieval. McGraw-Hill.
+
+ Example:
+ >>> result = compute_vocabulary_overlap(text1, text2)
+ >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
+ >>> print(f"Shared vocabulary: {result.shared_vocab_size} words")
+ >>> print(f"Text1 unique: {result.text1_unique_count}")
+ """
+
+ # Similarity scores (0-1 range)
+ jaccard_similarity: float # Intersection / union
+ dice_coefficient: float # 2 * intersection / (size1 + size2)
+ overlap_coefficient: float # Intersection / min(size1, size2)
+ cosine_similarity: float # Cosine of frequency vectors
+
+ # Vocabulary sizes
+ text1_vocab_size: int # Unique words in text 1
+ text2_vocab_size: int # Unique words in text 2
+ shared_vocab_size: int # Words in both texts
+ union_vocab_size: int # Words in either text
+ text1_unique_count: int # Words only in text 1
+ text2_unique_count: int # Words only in text 2
+
+ # Shared and distinctive vocabulary
+ shared_words: list[str] # Words appearing in both texts
+ text1_distinctive_words: list[tuple[str, float]] # Words + TF-IDF scores for text 1
+ text2_distinctive_words: list[tuple[str, float]] # Words + TF-IDF scores for text 2
+
+ # Ratios
+ text1_coverage: float # Shared / text1_vocab (how much of text1 is shared)
+ text2_coverage: float # Shared / text2_vocab (how much of text2 is shared)
+
+ metadata: dict[str, Any] # Full vocabulary sets, frequency vectors, etc.
1561
+
1562
+
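# --- Editor's illustrative sketch (not part of the package diff) ---
# The four similarity scores documented above have closed-form definitions.
# This sketch computes them over naive whitespace tokens; the tokenization and
# any normalization used by the real compute_vocabulary_overlap are assumptions.
import math
from collections import Counter

def sketch_overlap_scores(text1: str, text2: str) -> dict[str, float]:
    t1, t2 = text1.lower().split(), text2.lower().split()
    v1, v2 = set(t1), set(t2)
    inter, union = v1 & v2, v1 | v2
    f1, f2 = Counter(t1), Counter(t2)
    dot = sum(f1[w] * f2[w] for w in inter)
    norm = math.sqrt(sum(c * c for c in f1.values())) * math.sqrt(sum(c * c for c in f2.values()))
    return {
        "jaccard_similarity": len(inter) / len(union) if union else 0.0,
        "dice_coefficient": 2 * len(inter) / (len(v1) + len(v2)) if (v1 or v2) else 0.0,
        "overlap_coefficient": len(inter) / min(len(v1), len(v2)) if v1 and v2 else 0.0,
        "cosine_similarity": dot / norm if norm else 0.0,
    }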
1563
+ # ===== Cohesion and Coherence Results =====
+ # Related to GitHub Issue #22: Cohesion and Coherence Metrics
+ # https://github.com/craigtrim/pystylometry/issues/22
+
+
+ @dataclass
+ class CohesionCoherenceResult:
+ """Result from cohesion and coherence analysis.
+
+ Cohesion and coherence metrics measure how well a text holds together
+ structurally (cohesion) and semantically (coherence). These metrics are
+ important for analyzing writing quality, readability, and authorial
+ sophistication.
+
+ Related GitHub Issue:
+ #22 - Cohesion and Coherence Metrics
+ https://github.com/craigtrim/pystylometry/issues/22
+
+ Cohesion features:
+ - Referential cohesion (pronouns, demonstratives pointing back)
+ - Lexical cohesion (word repetition, synonyms, semantic relatedness)
+ - Connective density (discourse markers, conjunctions)
+ - Anaphora resolution success rate
+ - Lexical chains (sequences of semantically related words)
+
+ Coherence features:
+ - Sentence-to-sentence semantic similarity
+ - Topic consistency across paragraphs
+ - Discourse structure (thesis, support, conclusion)
+ - Semantic overlap between adjacent sentences
+
+ References:
+ Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
+ Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix:
+ Providing multilevel analyses of text characteristics. Educational
+ Researcher, 40(5), 223-234.
+
+ Example:
+ >>> result = compute_cohesion_coherence("Multi-paragraph text...")
+ >>> print(f"Pronoun density: {result.pronoun_density:.2f}")
+ >>> print(f"Lexical overlap: {result.adjacent_sentence_overlap:.3f}")
+ >>> print(f"Connective density: {result.connective_density:.2f}")
+ """
+
+ # Referential cohesion
+ pronoun_density: float # Pronouns per 100 words
+ demonstrative_density: float # Demonstratives (this, that, these, those) per 100 words
+ anaphora_count: int # Anaphoric references detected
+ anaphora_resolution_ratio: float # Successfully resolved / total
+
+ # Lexical cohesion
+ word_repetition_ratio: float # Repeated content words / total content words
+ synonym_density: float # Synonym pairs per 100 words
+ lexical_chain_count: int # Number of lexical chains detected
+ mean_chain_length: float # Average length of lexical chains
+ content_word_overlap: float # Content word overlap between sentences
+
+ # Connectives and discourse markers
+ connective_density: float # Discourse connectives per 100 words
+ additive_connective_ratio: float # "and", "also", "furthermore" / total connectives
+ adversative_connective_ratio: float # "but", "however", "nevertheless" / total
+ causal_connective_ratio: float # "because", "therefore", "thus" / total
+ temporal_connective_ratio: float # "then", "after", "before" / total
+
+ # Coherence measures
+ adjacent_sentence_overlap: float # Mean semantic overlap between adjacent sentences
+ paragraph_topic_consistency: float # Mean topic consistency within paragraphs
+ mean_sentence_similarity: float # Mean cosine similarity between all sentence pairs
+ semantic_coherence_score: float # Composite coherence metric (0-1)
+
+ # Structural coherence
+ paragraph_count: int # Number of paragraphs detected
+ mean_paragraph_length: float # Mean sentences per paragraph
+ discourse_structure_score: float # Quality of intro/body/conclusion structure
+
+ metadata: dict[str, Any] # Lexical chains, connective lists, similarity matrices, etc.
+
+
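# --- Editor's illustrative sketch (not part of the package diff) ---
# adjacent_sentence_overlap can be read as the mean Jaccard overlap of content
# words between each pair of adjacent sentences. The sentence splitter and the
# stopword list below are simplifying assumptions.
import re

_STOPWORDS = {"the", "a", "an", "and", "or", "but", "of", "to", "in", "is", "it"}

def sketch_adjacent_sentence_overlap(text: str) -> float:
    sentences = [s for s in re.split(r"[.!?]+\s*", text) if s.strip()]
    content = [
        {w for w in re.findall(r"[a-z']+", s.lower()) if w not in _STOPWORDS}
        for s in sentences
    ]
    pairs = [len(a & b) / len(a | b) for a, b in zip(content, content[1:]) if a | b]
    return sum(pairs) / len(pairs) if pairs else 0.0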
1641
+ # ===== Genre and Register Results =====
+ # Related to GitHub Issue #23: Genre and Register Features
+ # https://github.com/craigtrim/pystylometry/issues/23
+
+
+ @dataclass
+ class GenreRegisterResult:
+ """Result from genre and register classification analysis.
+
+ Genre and register features distinguish between different types of texts
+ (academic, journalistic, fiction, legal, etc.) based on linguistic patterns.
+ These features can help identify the context and formality level of a text,
+ and are useful for authorship attribution when combined with other metrics.
+
+ Related GitHub Issue:
+ #23 - Genre and Register Features
+ https://github.com/craigtrim/pystylometry/issues/23
+
+ Features analyzed:
+ - Formality markers (Latinate words, nominalizations, passive voice)
+ - Personal vs. impersonal style (1st/2nd person vs. 3rd person)
+ - Abstract vs. concrete vocabulary
+ - Technical term density
+ - Narrative vs. expository markers
+ - Dialogue presence and ratio
+ - Register classification (frozen, formal, consultative, casual, intimate)
+
+ References:
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+ Biber, D., & Conrad, S. (2009). Register, genre, and style. Cambridge
+ University Press.
+ Heylighen, F., & Dewaele, J. M. (1999). Formality of language: Definition,
+ measurement and behavioral determinants. Internal Report, Center "Leo
+ Apostel", Free University of Brussels.
+
+ Example:
+ >>> result = compute_genre_register("Academic paper text...")
+ >>> print(f"Formality score: {result.formality_score:.2f}")
+ >>> print(f"Register: {result.register_classification}")
+ >>> print(f"Genre prediction: {result.predicted_genre}")
+ """
+
+ # Formality indicators
+ formality_score: float # Composite formality score (0-100)
+ latinate_ratio: float # Latinate words / total words
+ nominalization_density: float # Nominalizations per 100 words
+ passive_voice_density: float # Passive constructions per 100 words
+
+ # Personal vs. impersonal
+ first_person_ratio: float # 1st person pronouns / total pronouns
+ second_person_ratio: float # 2nd person pronouns / total pronouns
+ third_person_ratio: float # 3rd person pronouns / total pronouns
+ impersonal_construction_density: float # "It is...", "There are..." per 100 words
+
+ # Abstract vs. concrete
+ abstract_noun_ratio: float # Abstract nouns / total nouns
+ concrete_noun_ratio: float # Concrete nouns / total nouns
+ abstractness_score: float # Composite abstractness (based on word concreteness ratings)
+
+ # Technical and specialized
+ technical_term_density: float # Technical/specialized terms per 100 words
+ jargon_density: float # Domain-specific jargon per 100 words
+
+ # Narrative vs. expository
+ narrative_marker_density: float # Past tense, action verbs per 100 words
+ expository_marker_density: float # Present tense, linking verbs per 100 words
+ narrative_expository_ratio: float # Narrative / expository markers
+
+ # Dialogue and quotation
+ dialogue_ratio: float # Dialogue / total text (estimated)
+ quotation_density: float # Quotations per 100 words
+
+ # Classification results
+ register_classification: str # frozen, formal, consultative, casual, intimate
+ predicted_genre: str # academic, journalistic, fiction, legal, conversational, etc.
+ genre_confidence: float # Confidence in genre prediction (0-1)
+
+ # Feature scores for major genres (0-1 scores for each)
+ academic_score: float
+ journalistic_score: float
+ fiction_score: float
+ legal_score: float
+ conversational_score: float
+
+ metadata: dict[str, Any] # Feature details, word lists, classification probabilities, etc.
+
+
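# --- Editor's illustrative sketch (not part of the package diff) ---
# The person ratios are plain pronoun counts over all pronouns detected. The
# pronoun inventory below is a simplifying assumption; the real detector may
# treat possessives and case variants differently.
import re

_FIRST = {"i", "me", "my", "mine", "we", "us", "our", "ours"}
_SECOND = {"you", "your", "yours"}
_THIRD = {"he", "him", "his", "she", "her", "hers", "it", "its", "they", "them", "their", "theirs"}

def sketch_person_ratios(text: str) -> dict[str, float]:
    words = re.findall(r"[a-z']+", text.lower())
    counts = {
        "first_person_ratio": sum(w in _FIRST for w in words),
        "second_person_ratio": sum(w in _SECOND for w in words),
        "third_person_ratio": sum(w in _THIRD for w in words),
    }
    total = sum(counts.values())
    return {k: (v / total if total else 0.0) for k, v in counts.items()}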
1728
+ # ===== Additional Authorship Results =====
+ # Related to GitHub Issue #24: Additional Authorship Attribution Methods
+ # https://github.com/craigtrim/pystylometry/issues/24
+
+
+ @dataclass
+ class KilgarriffResult:
+ """Result from Kilgarriff's chi-squared method.
+
+ Kilgarriff's chi-squared method compares word frequency distributions between
+ texts using the chi-squared test. It's particularly effective for authorship
+ attribution when comparing frequency profiles of common words.
+
+ Related GitHub Issue:
+ #24 - Additional Authorship Attribution Methods
+ https://github.com/craigtrim/pystylometry/issues/24
+
+ References:
+ Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus
+ Linguistics, 6(1), 97-133.
+
+ Example:
+ >>> result = compute_kilgarriff(text1, text2)
+ >>> print(f"Chi-squared: {result.chi_squared:.2f}")
+ >>> print(f"P-value: {result.p_value:.4f}")
+ """
+
+ chi_squared: float # Chi-squared statistic
+ p_value: float # Statistical significance (p-value)
+ degrees_of_freedom: int # df for chi-squared test
+ feature_count: int # Number of features (words) compared
+ most_distinctive_features: list[tuple[str, float]] # Words + chi-squared contributions
+ metadata: dict[str, Any] # Frequency tables, expected values, etc.
+
+
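# --- Editor's illustrative sketch (not part of the package diff) ---
# One common reading of Kilgarriff (2001): a chi-squared statistic over the N
# most frequent words, with expected counts taken from the pooled frequencies
# scaled by each text's size. Feature selection and smoothing in the real
# compute_kilgarriff are assumptions here.
from collections import Counter

def sketch_kilgarriff_chi2(tokens1: list[str], tokens2: list[str], n_features: int = 500) -> float:
    f1, f2 = Counter(tokens1), Counter(tokens2)
    n1, n2 = len(tokens1), len(tokens2)
    chi2 = 0.0
    for word, _ in (f1 + f2).most_common(n_features):
        observed = (f1[word], f2[word])
        total = sum(observed)
        expected = (total * n1 / (n1 + n2), total * n2 / (n1 + n2))
        for obs, exp in zip(observed, expected):
            if exp > 0:
                chi2 += (obs - exp) ** 2 / exp
    return chi2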
1763
+ @dataclass
+ class KilgarriffDriftResult:
+ """Result from Kilgarriff chi-squared drift detection within a single document.
+
+ This result captures stylistic drift patterns by comparing sequential chunks
+ of text using Kilgarriff's chi-squared method. It enables detection of
+ inconsistent authorship, heavy editing, pasted content, and AI-generated
+ text signatures.
+
+ Related GitHub Issues:
+ #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
+ https://github.com/craigtrim/pystylometry/issues/36
+ #31 - Classical Stylometric Methods from Programming Historian
+ https://github.com/craigtrim/pystylometry/issues/31
+
+ Pattern Signatures:
+ - consistent: Low, stable χ² across pairs (natural human writing)
+ - gradual_drift: Slowly increasing trend (author fatigue, topic shift)
+ - sudden_spike: One pair has high χ² (pasted content, different author)
+ - suspiciously_uniform: Near-zero variance (possible AI generation)
+ - unknown: Insufficient data for classification
+
+ Marketing Name: "Style Drift Detector" / "Consistency Fingerprint"
+
+ References:
+ Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus
+ Linguistics, 6(1), 97-133.
+
+ Example:
+ >>> result = compute_kilgarriff_drift(text, window_size=1000, stride=500)
+ >>> result.pattern # "consistent", "gradual_drift", "sudden_spike", etc.
+ 'consistent'
+ >>> result.mean_chi_squared # Average χ² across chunk pairs
+ 45.2
+ >>> result.status # "success", "marginal_data", "insufficient_data"
+ 'success'
+ """
+
+ # Status (graceful handling of edge cases)
+ status: str # "success", "marginal_data", "insufficient_data"
+ status_message: str # Human-readable explanation
+
+ # Pattern classification
+ pattern: str # "consistent", "gradual_drift", "sudden_spike", "suspiciously_uniform", "unknown"
+ pattern_confidence: float # 0.0-1.0 confidence in classification
+
+ # Holistic metrics (may be NaN if insufficient data)
+ mean_chi_squared: float # Average χ² across all chunk pairs
+ std_chi_squared: float # Standard deviation of χ² values
+ max_chi_squared: float # Highest χ² between any two chunks
+ min_chi_squared: float # Lowest χ² between any two chunks
+ max_location: int # Index of chunk boundary with max χ² (0-indexed)
+ trend: float # Linear regression slope of χ² over chunk pairs
+
+ # Pairwise comparison data
+ pairwise_scores: list[dict] # [{"chunk_pair": (0, 1), "chi_squared": 45.2, "top_words": [...]}]
+
+ # Window configuration (for reproducibility)
+ window_size: int
+ stride: int
+ overlap_ratio: float # Computed: max(0, 1 - stride/window_size)
+ comparison_mode: str # "sequential", "all_pairs", "fixed_lag"
+ window_count: int
+
+ # For all_pairs mode only
+ distance_matrix: list[list[float]] | None # None for sequential/fixed_lag
+
+ # Thresholds used for pattern classification (for transparency)
+ thresholds: dict[str, float]
+
+ metadata: dict[str, Any]
+
+
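# --- Editor's illustrative sketch (not part of the package diff) ---
# The drift result compares overlapping windows pairwise and summarizes the
# resulting chi-squared series; `trend` is documented as a linear regression
# slope over that series. Token-based windowing is an assumption here.
def sketch_windows(tokens: list[str], window_size: int = 1000, stride: int = 500) -> list[list[str]]:
    last_start = max(len(tokens) - window_size, 0)
    return [tokens[i:i + window_size] for i in range(0, last_start + 1, stride)]

def sketch_trend(scores: list[float]) -> float:
    # Ordinary least-squares slope of chi-squared score against pair index.
    n = len(scores)
    if n < 2:
        return float("nan")
    mean_x = (n - 1) / 2
    mean_y = sum(scores) / n
    num = sum((i - mean_x) * (y - mean_y) for i, y in enumerate(scores))
    den = sum((i - mean_x) ** 2 for i in range(n))
    return num / den if den else 0.0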
1836
+ # ===== Consistency Module Thresholds =====
+ # Related to GitHub Issue #36
+ # These are calibration constants for pattern classification
+
+ MIN_WINDOWS = 3 # Bare minimum for variance calculation
+ RECOMMENDED_WINDOWS = 5 # For reliable pattern classification
+
+
+ @dataclass
+ class MinMaxResult:
+ """Result from Min-Max distance method (Burrows' original method).
+
+ The Min-Max method normalizes feature frequencies using min-max scaling,
+ then computes the distance between texts. This was Burrows' original
+ approach before he developed Delta.
+
+ Related GitHub Issue:
+ #24 - Additional Authorship Attribution Methods
+ https://github.com/craigtrim/pystylometry/issues/24
+
+ References:
+ Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
+ nexus between analysis and information. Literary and Linguistic
+ Computing, 7(2), 91-109.
+
+ Example:
+ >>> result = compute_minmax(text1, text2)
+ >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
+ """
+
+ minmax_distance: float # Min-max normalized distance
+ feature_count: int # Number of features used
+ most_distinctive_features: list[tuple[str, float]] # Features + contributions
+ metadata: dict[str, Any] # Normalized frequencies, scaling parameters, etc.
+
+
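# --- Editor's illustrative sketch (not part of the package diff) ---
# One plausible reading of the method: min-max scale each feature's relative
# frequency across a set of text profiles, then average the absolute
# differences between two scaled profiles. Whether compute_minmax scales
# against a larger reference set, and which distance it sums, are assumptions.
def sketch_minmax_distance(profiles: list[dict[str, float]], i: int, j: int) -> float:
    features = set().union(*profiles)
    total = 0.0
    for feature in features:
        values = [p.get(feature, 0.0) for p in profiles]
        lo, hi = min(values), max(values)
        span = hi - lo
        a = (profiles[i].get(feature, 0.0) - lo) / span if span else 0.0
        b = (profiles[j].get(feature, 0.0) - lo) / span if span else 0.0
        total += abs(a - b)
    return total / len(features) if features else 0.0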
1872
+ @dataclass
+ class JohnsBurrowsResult:
+ """Result from John Burrows' variations of Delta.
+
+ John Burrows has developed several variations of the Delta method over
+ the years. This result captures alternative formulations, including
+ Quadratic Delta and other distance measures.
+
+ Related GitHub Issue:
+ #24 - Additional Authorship Attribution Methods
+ https://github.com/craigtrim/pystylometry/issues/24
+
+ References:
+ Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
+ parodic text. Literary and Linguistic Computing, 20(4), 437-450.
+
+ Example:
+ >>> result = compute_johns_delta(text1, text2, method="quadratic")
+ >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
+ """
+
+ delta_score: float # Delta distance score
+ method: str # "quadratic", "weighted", "rotated", etc.
+ feature_count: int # Number of MFW used
+ most_distinctive_features: list[tuple[str, float]] # Features + contributions
+ metadata: dict[str, Any] # Method-specific parameters, z-scores, etc.
+
+
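# --- Editor's illustrative sketch (not part of the package diff) ---
# Quadratic Delta is usually described as the sum of squared differences of
# feature z-scores, standardized against a reference corpus's mean and standard
# deviation for each of the most frequent words. How compute_johns_delta obtains
# those reference statistics is an assumption here.
def sketch_quadratic_delta(
    freqs1: dict[str, float],
    freqs2: dict[str, float],
    corpus_mean: dict[str, float],
    corpus_std: dict[str, float],
) -> float:
    delta = 0.0
    for word, mu in corpus_mean.items():
        sigma = corpus_std.get(word, 0.0)
        if sigma == 0.0:
            continue  # skip features with no variance in the reference corpus
        z1 = (freqs1.get(word, 0.0) - mu) / sigma
        z2 = (freqs2.get(word, 0.0) - mu) / sigma
        delta += (z1 - z2) ** 2
    return delta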
1900
+ # ===== Rhythm and Prosody Results =====
+ # Related to GitHub Issue #25: Rhythm and Prosody Metrics
+ # https://github.com/craigtrim/pystylometry/issues/25
+
+
+ @dataclass
+ class RhythmProsodyResult:
+ """Result from rhythm and prosody analysis.
+
+ Rhythm and prosody metrics capture the musical qualities of written language,
+ including stress patterns, syllable rhythms, and phonological features. While
+ these are typically studied in spoken language, written text preserves many
+ rhythmic patterns that vary by author and genre.
+
+ Related GitHub Issue:
+ #25 - Rhythm and Prosody Metrics
+ https://github.com/craigtrim/pystylometry/issues/25
+
+ Features analyzed:
+ - Syllable patterns and stress patterns
+ - Rhythmic regularity (coefficient of variation of syllable counts)
+ - Phonological features (alliteration, assonance)
+ - Syllable complexity (consonant clusters)
+ - Sentence rhythm (alternating long/short sentences)
+ - Polysyllabic word ratio
+
+ References:
+ Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm and
+ text comprehension. Memory & Cognition, 33(3), 388-396.
+ Louwerse, M. M., & Benesh, N. (2012). Representing spatial structure through
+ maps and language: Lord of the Rings encodes the spatial structure of
+ Middle Earth. Cognitive Science, 36(8), 1556-1569.
+
+ Example:
+ >>> result = compute_rhythm_prosody("Sample text with rhythm...")
+ >>> print(f"Syllables per word: {result.mean_syllables_per_word:.2f}")
+ >>> print(f"Rhythmic regularity: {result.rhythmic_regularity:.3f}")
+ >>> print(f"Alliteration density: {result.alliteration_density:.2f}")
+ """
+
+ # Syllable patterns
+ mean_syllables_per_word: float # Average syllables per word
+ syllable_std_dev: float # Std dev of syllables per word
+ polysyllabic_ratio: float # Words with 3+ syllables / total
+ monosyllabic_ratio: float # Single-syllable words / total
+
+ # Rhythmic regularity
+ rhythmic_regularity: float # 1 / CV of syllable counts (higher = more regular)
+ syllable_cv: float # Coefficient of variation of syllables per word
+ stress_pattern_entropy: float # Entropy of stress patterns
+
+ # Sentence rhythm
+ sentence_length_alternation: float # Degree of long/short alternation
+ sentence_rhythm_score: float # Composite rhythm score
+
+ # Phonological features
+ alliteration_density: float # Alliterative word pairs per 100 words
+ assonance_density: float # Assonant word pairs per 100 words
+ consonance_density: float # Consonant word pairs per 100 words
+
+ # Syllable complexity
+ mean_consonant_cluster_length: float # Avg consonants in clusters
+ initial_cluster_ratio: float # Words starting with clusters / total
+ final_cluster_ratio: float # Words ending with clusters / total
+
+ # Stress patterns (estimated for written text)
+ iambic_ratio: float # Iambic patterns (unstressed-stressed) / total
+ trochaic_ratio: float # Trochaic patterns (stressed-unstressed) / total
+ dactylic_ratio: float # Dactylic patterns / total
+ anapestic_ratio: float # Anapestic patterns / total
+
+ metadata: dict[str, Any] # Syllable counts, stress patterns, phoneme data, etc.
+
+
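# --- Editor's illustrative sketch (not part of the package diff) ---
# rhythmic_regularity is documented as the reciprocal of the coefficient of
# variation of per-word syllable counts. The vowel-group counter below is a
# rough stand-in for the package's syllable estimator.
import re
import statistics

def sketch_syllable_count(word: str) -> int:
    return max(len(re.findall(r"[aeiouy]+", word.lower())), 1)

def sketch_rhythmic_regularity(words: list[str]) -> float:
    counts = [sketch_syllable_count(w) for w in words]
    if len(counts) < 2:
        return float("nan")
    mean = statistics.mean(counts)
    cv = statistics.stdev(counts) / mean if mean else float("inf")
    # A perfectly uniform syllable pattern has CV == 0; report infinity there.
    return 1.0 / cv if cv else float("inf")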
1974
+ # ===== Dialect Detection Results =====
+ # Related to GitHub Issue #35: Dialect detection with extensible JSON markers
+ # https://github.com/craigtrim/pystylometry/issues/35
+ # Related to GitHub Issue #30: Whonix stylometry features
+ # https://github.com/craigtrim/pystylometry/issues/30
+
+
+ @dataclass
+ class DialectResult:
+ """Result from dialect detection analysis.
+
+ Dialect detection identifies regional linguistic preferences (British vs.
+ American English) and measures text markedness - how far the text deviates
+ from "unmarked" standard English. This analysis uses an extensible JSON-based
+ marker database covering vocabulary, spelling patterns, grammar patterns,
+ punctuation conventions, and idiomatic expressions.
+
+ The analysis follows the chunking pattern from Issue #27, computing metrics
+ per chunk and providing distributions for stylometric fingerprinting. Dialect
+ markers are sparse, so variance across chunks can reveal mixed authorship
+ (e.g., a UK speaker using ChatGPT-generated American English content).
+
+ Related GitHub Issues:
+ #35 - Dialect detection with extensible JSON markers
+ https://github.com/craigtrim/pystylometry/issues/35
+ #30 - Whonix stylometry features (regional linguistic preferences)
+ https://github.com/craigtrim/pystylometry/issues/30
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ Theoretical Background:
+ Markedness theory (Battistella, 1990) informs the markedness_score:
+ marked forms stand out against "standard" written English. High
+ markedness suggests intentional stylistic choice or strong dialect
+ identity. Dialectometry (Goebl, 1982; Nerbonne, 2009) provides the
+ quantitative framework for holistic dialect measurement.
+
+ Feature Levels:
+ Markers are categorized by linguistic level for fine-grained analysis:
+ - Phonological: Spelling reflecting pronunciation (colour/color)
+ - Morphological: Word formation (-ise/-ize, -our/-or, doubled L)
+ - Lexical: Different words for same concept (flat/apartment)
+ - Syntactic: Grammar differences (have got/have, collective nouns)
+
+ Eye Dialect vs. True Dialect:
+ Following Encyclopedia.com's distinction, "eye dialect" (gonna, wanna)
+ indicates informal register, not regional dialect. True dialect markers
+ (colour, flat, lorry) indicate actual regional preference.
+
+ References:
+ Battistella, Edwin L. "Markedness: The Evaluative Superstructure of
+ Language." State University of New York Press, 1990.
+ Goebl, Hans. "Dialektometrie: Prinzipien und Methoden des Einsatzes der
+ numerischen Taxonomie im Bereich der Dialektgeographie." Verlag der
+ Österreichischen Akademie der Wissenschaften, 1982.
+ Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
+ Compass, vol. 3, no. 1, 2009, pp. 175-198.
+ Labov, William. "The Social Stratification of English in New York City."
+ Cambridge University Press, 2006.
+ Whonix Project. "Stylometry: Deanonymization Techniques." Whonix Wiki,
+ https://www.whonix.org/wiki/Stylometry
+
+ Example:
+ >>> result = compute_dialect(text, chunk_size=1000)
+ >>> result.dialect # "british", "american", "mixed", or "neutral"
+ 'british'
+ >>> result.british_score # Mean across chunks
+ 0.72
+ >>> result.british_score_dist.std # Variance reveals fingerprint
+ 0.05
+ >>> result.markedness_score # Deviation from standard English
+ 0.35
+ """
+
+ # Classification result
+ dialect: str # "british", "american", "mixed", "neutral"
+ confidence: float # 0.0-1.0, how confident the classification is
+
+ # Convenient access (mean values across chunks)
+ british_score: float # Mean British marker density (0.0-1.0)
+ american_score: float # Mean American marker density (0.0-1.0)
+ markedness_score: float # Mean deviation from unmarked standard (0.0-1.0)
+
+ # Full distributions for stylometric fingerprinting
+ british_score_dist: Distribution
+ american_score_dist: Distribution
+ markedness_score_dist: Distribution
+
+ # Marker breakdown by linguistic level (aggregated across chunks)
+ # Keys: "phonological", "morphological", "lexical", "syntactic"
+ markers_by_level: dict[str, dict[str, int]]
+
+ # Detailed marker counts (aggregated across chunks)
+ spelling_markers: dict[str, int] # {"colour": 2, "color": 1}
+ vocabulary_markers: dict[str, int] # {"flat": 1, "apartment": 0}
+ grammar_markers: dict[str, int] # {"have got": 1}
+
+ # Eye dialect (informal register indicators, not true dialect)
+ eye_dialect_count: int # Total eye dialect markers (gonna, wanna, etc.)
+ eye_dialect_ratio: float # Eye dialect per 1000 words
+
+ # Register analysis hints
+ register_hints: dict[str, Any] # {"formality": 0.7, "hedging_density": 0.05}
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ # Extensible metadata
  metadata: dict[str, Any]
 
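# --- Editor's illustrative sketch (not part of the package diff) ---
# A per-chunk dialect score can be read as marker hits per word, with the label
# taken from whichever side dominates. The tiny marker lists and the dominance
# thresholds below are assumptions, not the JSON database shipped in
# pystylometry/dialect/_data/dialect_markers.json.
import re

_BRITISH = {"colour", "favourite", "centre", "flat", "lorry"}
_AMERICAN = {"color", "favorite", "center", "apartment", "truck"}

def sketch_dialect(text: str) -> tuple[str, float, float]:
    words = re.findall(r"[a-z]+", text.lower())
    total = max(len(words), 1)
    british = sum(w in _BRITISH for w in words) / total
    american = sum(w in _AMERICAN for w in words) / total
    if british == 0.0 and american == 0.0:
        return "neutral", british, american
    if british > 2 * american:
        return "british", british, american
    if american > 2 * british:
        return "american", british, american
    return "mixed", british, american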