pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/_types.py CHANGED
@@ -1,42 +1,281 @@
1
- """Result dataclasses for all pystylometry metrics."""
1
+ """Result dataclasses for all pystylometry metrics.
2
+
3
+ This module defines dataclasses for all metric results in pystylometry.
4
+
5
+ Native Chunked Analysis (Issue #27):
6
+ All metrics support chunked analysis by default. Results include:
7
+ - Convenient access to the mean value (e.g., result.reading_ease)
8
+ - Full distribution with per-chunk values and statistics (e.g., result.reading_ease_dist)
9
+
10
+ The Distribution dataclass provides:
11
+ - values: list of per-chunk metric values
12
+ - mean, median, std: central tendency and variability
13
+ - range, iqr: spread measures
14
+
15
+ This design captures the variance and rhythm in writing style, which is
16
+ essential for authorship attribution and linguistic fingerprinting.
17
+
18
+ References:
19
+ STTR methodology: Johnson, W. (1944). Studies in language behavior.
20
+ """
2
21
 
3
22
  from __future__ import annotations
4
23
 
24
+ import statistics
5
25
  from dataclasses import dataclass
6
26
  from typing import Any
7
27
 
28
+ # ===== Distribution and Chunking =====
29
+ # Related to GitHub Issue #27: Native chunked analysis with Distribution dataclass
30
+ # https://github.com/craigtrim/pystylometry/issues/27
31
+
32
+
33
+ @dataclass
34
+ class Distribution:
35
+ """Distribution of metric values across chunks.
36
+
37
+ This dataclass captures the variance and rhythm in writing style by storing
38
+ per-chunk values along with descriptive statistics. The variance across chunks
39
+ is often more revealing of authorial fingerprint than aggregate values.
40
+
41
+ Related GitHub Issue:
42
+ #27 - Native chunked analysis with Distribution dataclass
43
+ https://github.com/craigtrim/pystylometry/issues/27
44
+
45
+ Attributes:
46
+ values: Raw per-chunk metric values
47
+ mean: Arithmetic mean of values
48
+ median: Middle value when sorted
49
+ std: Standard deviation (0.0 for single-chunk)
50
+ range: max - min (spread measure)
51
+ iqr: Interquartile range (Q3 - Q1), robust spread measure
52
+
53
+ Note:
54
+ min/max are omitted as trivial operations on values:
55
+ - min(dist.values), max(dist.values)
56
+
57
+ Example:
58
+ >>> dist = Distribution(
59
+ ... values=[65.2, 71.1, 68.8, 70.5],
60
+ ... mean=68.9, median=69.65, std=2.57,
61
+ ... range=5.9, iqr=3.15
62
+ ... )
63
+ >>> dist.std # variance reveals authorial fingerprint
64
+ 2.57
65
+ """
66
+
67
+ values: list[float]
68
+ mean: float
69
+ median: float
70
+ std: float
71
+ range: float
72
+ iqr: float
73
+
74
+
75
+ def chunk_text(text: str, chunk_size: int) -> list[str]:
76
+ """Split text into word-based chunks of approximately equal size.
77
+
78
+ Chunks are created by splitting on whitespace and grouping words.
79
+ The last chunk may be smaller than chunk_size if the text doesn't
80
+ divide evenly.
81
+
82
+ Related GitHub Issue:
83
+ #27 - Native chunked analysis with Distribution dataclass
84
+ https://github.com/craigtrim/pystylometry/issues/27
85
+
86
+ Args:
87
+ text: The text to chunk
88
+ chunk_size: Target number of words per chunk (default: 1000)
89
+
90
+ Returns:
91
+ List of text chunks. For text smaller than chunk_size,
92
+ returns a single-element list with the entire text.
93
+
94
+ Example:
95
+ >>> chunks = chunk_text("word " * 2500, chunk_size=1000)
96
+ >>> len(chunks)
97
+ 3
98
+ >>> # First two chunks have ~1000 words, last has ~500
99
+ """
100
+ words = text.split()
101
+ if not words:
102
+ return [""]
103
+
104
+ chunks = []
105
+ for i in range(0, len(words), chunk_size):
106
+ chunk_words = words[i : i + chunk_size]
107
+ chunks.append(" ".join(chunk_words))
108
+
109
+ return chunks
110
+
111
+
112
+ def make_distribution(values: list[float]) -> Distribution:
113
+ """Create a Distribution from a list of values.
114
+
115
+ Computes all descriptive statistics for the distribution.
116
+ Handles single-value lists by setting std, range, and iqr to 0.0.
117
+
118
+ Related GitHub Issue:
119
+ #27 - Native chunked analysis with Distribution dataclass
120
+ https://github.com/craigtrim/pystylometry/issues/27
121
+
122
+ Args:
123
+ values: List of numeric values (must be non-empty)
124
+
125
+ Returns:
126
+ Distribution with computed statistics
127
+
128
+ Raises:
129
+ ValueError: If values is empty
130
+
131
+ Example:
132
+ >>> dist = make_distribution([65.2, 71.1, 68.8, 70.5])
133
+ >>> dist.mean
134
+ 68.9
135
+ >>> dist.std # reveals variance in the signal
136
+ 2.57...
137
+ """
138
+ if not values:
139
+ raise ValueError("Cannot create distribution from empty values")
140
+
141
+ if len(values) == 1:
142
+ return Distribution(
143
+ values=values,
144
+ mean=values[0],
145
+ median=values[0],
146
+ std=0.0,
147
+ range=0.0,
148
+ iqr=0.0,
149
+ )
150
+
151
+ # For 2-3 values, quantiles() needs special handling
152
+ if len(values) < 4:
153
+ q1 = values[0]
154
+ q3 = values[-1]
155
+ else:
156
+ q = statistics.quantiles(values, n=4)
157
+ q1, q3 = q[0], q[2]
158
+
159
+ return Distribution(
160
+ values=values,
161
+ mean=statistics.mean(values),
162
+ median=statistics.median(values),
163
+ std=statistics.stdev(values),
164
+ range=max(values) - min(values),
165
+ iqr=q3 - q1,
166
+ )
167
+
168
+
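Taken together, chunk_text and make_distribution are the backbone of the chunked-analysis design described in the module docstring. Below is a minimal sketch of how a metric function could use them, assuming the helpers are importable from pystylometry._types as defined above; compute_avg_word_length is a hypothetical per-chunk metric invented here for illustration:

    from pystylometry._types import chunk_text, make_distribution

    def compute_avg_word_length(chunk: str) -> float:
        # Hypothetical per-chunk metric: mean characters per word.
        words = chunk.split()
        return sum(len(w) for w in words) / len(words) if words else 0.0

    text = "word " * 2500
    chunks = chunk_text(text, chunk_size=1000)             # three chunks: ~1000, ~1000, ~500 words
    per_chunk = [compute_avg_word_length(c) for c in chunks]
    dist = make_distribution(per_chunk)                    # mean, median, std, range, iqr
    # Note: with fewer than 4 chunks, iqr falls back to last value minus first value
    # rather than true quartiles, per the special case above.
    print(dist.mean, dist.std, dist.values)

A result dataclass such as FleschResult would then carry dist.mean in its convenience field and the Distribution itself in the corresponding *_dist field.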
8
169
  # ===== Lexical Results =====
9
170
 
10
171
 
11
172
  @dataclass
12
173
  class MTLDResult:
13
- """Result from MTLD (Measure of Textual Lexical Diversity) computation."""
174
+ """Result from MTLD (Measure of Textual Lexical Diversity) computation.
14
175
 
176
+ All numeric metrics include both a mean value (convenient access) and
177
+ a full distribution with per-chunk values and statistics.
178
+
179
+ Related GitHub Issue:
180
+ #27 - Native chunked analysis with Distribution dataclass
181
+ https://github.com/craigtrim/pystylometry/issues/27
182
+
183
+ Example:
184
+ >>> result = compute_mtld(text, chunk_size=1000)
185
+ >>> result.mtld_average # mean MTLD across chunks
186
+ 72.5
187
+ >>> result.mtld_average_dist.std # MTLD variance
188
+ 8.3
189
+ """
190
+
191
+ # Convenient access (mean values)
15
192
  mtld_forward: float
16
193
  mtld_backward: float
17
194
  mtld_average: float
195
+
196
+ # Full distributions
197
+ mtld_forward_dist: Distribution
198
+ mtld_backward_dist: Distribution
199
+ mtld_average_dist: Distribution
200
+
201
+ # Chunking context
202
+ chunk_size: int
203
+ chunk_count: int
204
+
18
205
  metadata: dict[str, Any]
19
206
 
20
207
 
21
208
  @dataclass
22
209
  class YuleResult:
23
- """Result from Yule's K and I computation."""
210
+ """Result from Yule's K and I computation.
211
+
212
+ All numeric metrics include both a mean value (convenient access) and
213
+ a full distribution with per-chunk values and statistics.
24
214
 
215
+ Related GitHub Issue:
216
+ #27 - Native chunked analysis with Distribution dataclass
217
+ https://github.com/craigtrim/pystylometry/issues/27
218
+
219
+ Example:
220
+ >>> result = compute_yule(text, chunk_size=1000)
221
+ >>> result.yule_k # mean across chunks
222
+ 120.5
223
+ >>> result.yule_k_dist.std # variance reveals fingerprint
224
+ 15.2
225
+ """
226
+
227
+ # Convenient access (mean values)
25
228
  yule_k: float
26
229
  yule_i: float
230
+
231
+ # Full distributions
232
+ yule_k_dist: Distribution
233
+ yule_i_dist: Distribution
234
+
235
+ # Chunking context
236
+ chunk_size: int
237
+ chunk_count: int
238
+
27
239
  metadata: dict[str, Any]
28
240
 
29
241
 
30
242
  @dataclass
31
243
  class HapaxResult:
32
- """Result from Hapax Legomena analysis."""
33
-
34
- hapax_count: int
35
- hapax_ratio: float
36
- dis_hapax_count: int
37
- dis_hapax_ratio: float
38
- sichel_s: float
39
- honore_r: float
244
+ """Result from Hapax Legomena analysis.
245
+
246
+ All numeric metrics include both a mean value (convenient access) and
247
+ a full distribution with per-chunk values and statistics.
248
+
249
+ Related GitHub Issue:
250
+ #27 - Native chunked analysis with Distribution dataclass
251
+ https://github.com/craigtrim/pystylometry/issues/27
252
+
253
+ Example:
254
+ >>> result = compute_hapax(text, chunk_size=1000)
255
+ >>> result.hapax_ratio # mean across chunks
256
+ 0.45
257
+ >>> result.hapax_ratio_dist.std # variance
258
+ 0.08
259
+ """
260
+
261
+ # Convenient access (mean/total values)
262
+ hapax_count: int # Total across all chunks
263
+ hapax_ratio: float # Mean ratio
264
+ dis_hapax_count: int # Total across all chunks
265
+ dis_hapax_ratio: float # Mean ratio
266
+ sichel_s: float # Mean
267
+ honore_r: float # Mean
268
+
269
+ # Full distributions (ratios only - counts don't distribute meaningfully)
270
+ hapax_ratio_dist: Distribution
271
+ dis_hapax_ratio_dist: Distribution
272
+ sichel_s_dist: Distribution
273
+ honore_r_dist: Distribution
274
+
275
+ # Chunking context
276
+ chunk_size: int
277
+ chunk_count: int
278
+
40
279
  metadata: dict[str, Any]
41
280
 
42
281
 
@@ -80,6 +319,9 @@ class TTRResult:
80
319
  Wraps stylometry-ttr package functionality to measure vocabulary richness
81
320
  through the ratio of unique words (types) to total words (tokens).
82
321
 
322
+ All numeric metrics include both a mean value (convenient access) and
323
+ a full distribution with per-chunk values and statistics.
324
+
83
325
  Includes multiple TTR variants for length normalization:
84
326
  - Raw TTR: Direct ratio of unique to total words
85
327
  - Root TTR (Guiraud's index): types / sqrt(tokens)
@@ -87,18 +329,44 @@ class TTRResult:
87
329
  - STTR: Standardized TTR across fixed-size chunks
88
330
  - Delta Std: Measures vocabulary consistency across chunks
89
331
 
332
+ Related GitHub Issue:
333
+ #27 - Native chunked analysis with Distribution dataclass
334
+ https://github.com/craigtrim/pystylometry/issues/27
335
+
90
336
  References:
91
337
  Guiraud, P. (1960). Problèmes et méthodes de la statistique linguistique.
92
338
  Herdan, G. (1960). Type-token Mathematics.
339
+
340
+ Example:
341
+ >>> result = compute_ttr(text, chunk_size=1000)
342
+ >>> result.ttr # mean TTR across chunks
343
+ 0.42
344
+ >>> result.ttr_dist.std # TTR variance reveals fingerprint
345
+ 0.05
346
+ >>> result.chunk_count
347
+ 59
93
348
  """
94
349
 
350
+ # Convenient access (mean values)
95
351
  total_words: int
96
352
  unique_words: int
97
- ttr: float # Raw TTR
98
- root_ttr: float # Guiraud's index
99
- log_ttr: float # Herdan's C
100
- sttr: float # Standardized TTR
101
- delta_std: float # Vocabulary consistency
353
+ ttr: float # Raw TTR (mean)
354
+ root_ttr: float # Guiraud's index (mean)
355
+ log_ttr: float # Herdan's C (mean)
356
+ sttr: float # Standardized TTR (mean)
357
+ delta_std: float # Vocabulary consistency (mean)
358
+
359
+ # Full distributions with per-chunk values
360
+ ttr_dist: Distribution
361
+ root_ttr_dist: Distribution
362
+ log_ttr_dist: Distribution
363
+ sttr_dist: Distribution
364
+ delta_std_dist: Distribution
365
+
366
+ # Chunking context
367
+ chunk_size: int
368
+ chunk_count: int
369
+
102
370
  metadata: dict[str, Any]
103
371
 
104
372
 
@@ -107,48 +375,135 @@ class TTRResult:
107
375
 
108
376
  @dataclass
109
377
  class FleschResult:
110
- """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation."""
378
+ """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation.
379
+
380
+ All numeric metrics include both a mean value (convenient access) and
381
+ a full distribution with per-chunk values and statistics.
382
+
383
+ Related GitHub Issue:
384
+ #27 - Native chunked analysis with Distribution dataclass
385
+ https://github.com/craigtrim/pystylometry/issues/27
111
386
 
387
+ Example:
388
+ >>> result = compute_flesch(text, chunk_size=1000)
389
+ >>> result.reading_ease # mean across chunks
390
+ 68.54
391
+ >>> result.reading_ease_dist.std # variance reveals fingerprint
392
+ 4.2
393
+ >>> result.reading_ease_dist.values # per-chunk values
394
+ [65.2, 71.1, 68.8, ...]
395
+ """
396
+
397
+ # Convenient access (mean values)
112
398
  reading_ease: float
113
399
  grade_level: float
114
- difficulty: str # "Very Easy", "Easy", "Fairly Easy", "Standard", etc.
400
+ difficulty: str # Based on mean reading_ease
401
+
402
+ # Full distributions
403
+ reading_ease_dist: Distribution
404
+ grade_level_dist: Distribution
405
+
406
+ # Chunking context
407
+ chunk_size: int
408
+ chunk_count: int
409
+
115
410
  metadata: dict[str, Any]
116
411
 
117
412
 
118
413
  @dataclass
119
414
  class SMOGResult:
120
- """Result from SMOG Index computation."""
415
+ """Result from SMOG Index computation.
121
416
 
417
+ Related GitHub Issue:
418
+ #27 - Native chunked analysis with Distribution dataclass
419
+ https://github.com/craigtrim/pystylometry/issues/27
420
+ """
421
+
422
+ # Convenient access (mean values)
122
423
  smog_index: float
123
424
  grade_level: float
425
+
426
+ # Full distributions
427
+ smog_index_dist: Distribution
428
+ grade_level_dist: Distribution
429
+
430
+ # Chunking context
431
+ chunk_size: int
432
+ chunk_count: int
433
+
124
434
  metadata: dict[str, Any]
125
435
 
126
436
 
127
437
  @dataclass
128
438
  class GunningFogResult:
129
- """Result from Gunning Fog Index computation."""
439
+ """Result from Gunning Fog Index computation.
130
440
 
441
+ Related GitHub Issue:
442
+ #27 - Native chunked analysis with Distribution dataclass
443
+ https://github.com/craigtrim/pystylometry/issues/27
444
+ """
445
+
446
+ # Convenient access (mean values)
131
447
  fog_index: float
132
448
  grade_level: float
449
+
450
+ # Full distributions
451
+ fog_index_dist: Distribution
452
+ grade_level_dist: Distribution
453
+
454
+ # Chunking context
455
+ chunk_size: int
456
+ chunk_count: int
457
+
133
458
  metadata: dict[str, Any]
134
459
 
135
460
 
136
461
  @dataclass
137
462
  class ColemanLiauResult:
138
- """Result from Coleman-Liau Index computation."""
463
+ """Result from Coleman-Liau Index computation.
464
+
465
+ Related GitHub Issue:
466
+ #27 - Native chunked analysis with Distribution dataclass
467
+ https://github.com/craigtrim/pystylometry/issues/27
468
+ """
139
469
 
470
+ # Convenient access (mean values)
140
471
  cli_index: float
141
- grade_level: int
472
+ grade_level: float # Changed to float for mean across chunks
473
+
474
+ # Full distributions
475
+ cli_index_dist: Distribution
476
+ grade_level_dist: Distribution
477
+
478
+ # Chunking context
479
+ chunk_size: int
480
+ chunk_count: int
481
+
142
482
  metadata: dict[str, Any]
143
483
 
144
484
 
145
485
  @dataclass
146
486
  class ARIResult:
147
- """Result from Automated Readability Index computation."""
487
+ """Result from Automated Readability Index computation.
148
488
 
489
+ Related GitHub Issue:
490
+ #27 - Native chunked analysis with Distribution dataclass
491
+ https://github.com/craigtrim/pystylometry/issues/27
492
+ """
493
+
494
+ # Convenient access (mean values)
149
495
  ari_score: float
150
- grade_level: int
151
- age_range: str
496
+ grade_level: float # Changed to float for mean across chunks
497
+ age_range: str # Based on mean grade level
498
+
499
+ # Full distributions
500
+ ari_score_dist: Distribution
501
+ grade_level_dist: Distribution
502
+
503
+ # Chunking context
504
+ chunk_size: int
505
+ chunk_count: int
506
+
152
507
  metadata: dict[str, Any]
153
508
 
154
509
 
@@ -157,8 +512,14 @@ class ARIResult:
157
512
 
158
513
  @dataclass
159
514
  class POSResult:
160
- """Result from Part-of-Speech ratio analysis."""
515
+ """Result from Part-of-Speech ratio analysis.
161
516
 
517
+ Related GitHub Issue:
518
+ #27 - Native chunked analysis with Distribution dataclass
519
+ https://github.com/craigtrim/pystylometry/issues/27
520
+ """
521
+
522
+ # Convenient access (mean values)
162
523
  noun_ratio: float
163
524
  verb_ratio: float
164
525
  adjective_ratio: float
@@ -167,19 +528,52 @@ class POSResult:
167
528
  adjective_noun_ratio: float
168
529
  lexical_density: float
169
530
  function_word_ratio: float
531
+
532
+ # Full distributions
533
+ noun_ratio_dist: Distribution
534
+ verb_ratio_dist: Distribution
535
+ adjective_ratio_dist: Distribution
536
+ adverb_ratio_dist: Distribution
537
+ noun_verb_ratio_dist: Distribution
538
+ adjective_noun_ratio_dist: Distribution
539
+ lexical_density_dist: Distribution
540
+ function_word_ratio_dist: Distribution
541
+
542
+ # Chunking context
543
+ chunk_size: int
544
+ chunk_count: int
545
+
170
546
  metadata: dict[str, Any]
171
547
 
172
548
 
173
549
  @dataclass
174
550
  class SentenceStatsResult:
175
- """Result from sentence-level statistics."""
551
+ """Result from sentence-level statistics.
552
+
553
+ Related GitHub Issue:
554
+ #27 - Native chunked analysis with Distribution dataclass
555
+ https://github.com/craigtrim/pystylometry/issues/27
556
+ """
176
557
 
558
+ # Convenient access (mean values)
177
559
  mean_sentence_length: float
178
560
  sentence_length_std: float
179
- sentence_length_range: int
180
- min_sentence_length: int
181
- max_sentence_length: int
182
- sentence_count: int
561
+ sentence_length_range: float # Changed to float for mean across chunks
562
+ min_sentence_length: float # Changed to float for mean across chunks
563
+ max_sentence_length: float # Changed to float for mean across chunks
564
+ sentence_count: int # Total across all chunks
565
+
566
+ # Full distributions
567
+ mean_sentence_length_dist: Distribution
568
+ sentence_length_std_dist: Distribution
569
+ sentence_length_range_dist: Distribution
570
+ min_sentence_length_dist: Distribution
571
+ max_sentence_length_dist: Distribution
572
+
573
+ # Chunking context
574
+ chunk_size: int
575
+ chunk_count: int
576
+
183
577
  metadata: dict[str, Any]
184
578
 
185
579
 
@@ -211,11 +605,26 @@ class ZetaResult:
211
605
 
212
606
  @dataclass
213
607
  class EntropyResult:
214
- """Result from n-gram entropy computation."""
608
+ """Result from n-gram entropy computation.
609
+
610
+ Related GitHub Issue:
611
+ #27 - Native chunked analysis with Distribution dataclass
612
+ https://github.com/craigtrim/pystylometry/issues/27
613
+ """
215
614
 
615
+ # Convenient access (mean values)
216
616
  entropy: float
217
617
  perplexity: float
218
618
  ngram_type: str # "character_bigram", "word_bigram", "word_trigram"
619
+
620
+ # Full distributions
621
+ entropy_dist: Distribution
622
+ perplexity_dist: Distribution
623
+
624
+ # Chunking context
625
+ chunk_size: int
626
+ chunk_count: int
627
+
219
628
  metadata: dict[str, Any]
220
629
 
221
630
 
@@ -233,9 +642,9 @@ class CharacterMetricsResult:
233
642
  fundamental for authorship attribution and can capture distinctive
234
643
  patterns in punctuation, formatting, and word construction.
235
644
 
236
- Related GitHub Issue:
645
+ Related GitHub Issues:
237
646
  #12 - Character-Level Metrics
238
- https://github.com/craigtrim/pystylometry/issues/12
647
+ #27 - Native chunked analysis with Distribution dataclass
239
648
 
240
649
  Metrics included:
241
650
  - Average word length (characters per word)
@@ -253,25 +662,35 @@ class CharacterMetricsResult:
253
662
  of techniques. Literary and Linguistic Computing, 22(3), 251-270.
254
663
  Stamatatos, E. (2009). A survey of modern authorship attribution methods.
255
664
  JASIST, 60(3), 538-556.
256
-
257
- Example:
258
- >>> result = compute_character_metrics("Sample text here.")
259
- >>> print(f"Avg word length: {result.avg_word_length:.2f} chars")
260
- >>> print(f"Punctuation density: {result.punctuation_density:.2f}")
261
- >>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
262
665
  """
263
666
 
264
- avg_word_length: float # Mean characters per word
265
- avg_sentence_length_chars: float # Mean characters per sentence
266
- punctuation_density: float # Punctuation marks per 100 words
267
- punctuation_variety: int # Count of unique punctuation types used
268
- letter_frequency: dict[str, float] # Frequency distribution for a-z
269
- vowel_consonant_ratio: float # Ratio of vowels to consonants
270
- digit_count: int # Total count of digit characters (0-9)
271
- digit_ratio: float # Digits / total characters
272
- uppercase_ratio: float # Uppercase letters / total letters
273
- whitespace_ratio: float # Whitespace characters / total characters
274
- metadata: dict[str, Any] # Additional info (character counts, etc.)
667
+ # Convenient access (mean values)
668
+ avg_word_length: float
669
+ avg_sentence_length_chars: float
670
+ punctuation_density: float
671
+ punctuation_variety: float # Changed to float for mean across chunks
672
+ letter_frequency: dict[str, float] # Aggregate frequency
673
+ vowel_consonant_ratio: float
674
+ digit_count: int # Total across all chunks
675
+ digit_ratio: float
676
+ uppercase_ratio: float
677
+ whitespace_ratio: float
678
+
679
+ # Full distributions
680
+ avg_word_length_dist: Distribution
681
+ avg_sentence_length_chars_dist: Distribution
682
+ punctuation_density_dist: Distribution
683
+ punctuation_variety_dist: Distribution
684
+ vowel_consonant_ratio_dist: Distribution
685
+ digit_ratio_dist: Distribution
686
+ uppercase_ratio_dist: Distribution
687
+ whitespace_ratio_dist: Distribution
688
+
689
+ # Chunking context
690
+ chunk_size: int
691
+ chunk_count: int
692
+
693
+ metadata: dict[str, Any]
275
694
 
276
695
 
277
696
  # ===== Function Word Results =====
@@ -288,9 +707,9 @@ class FunctionWordResult:
288
707
  subconsciously. They are considered strong authorship markers because authors
289
708
  use them consistently across different topics and genres.
290
709
 
291
- Related GitHub Issue:
710
+ Related GitHub Issues:
292
711
  #13 - Function Word Analysis
293
- https://github.com/craigtrim/pystylometry/issues/13
712
+ #27 - Native chunked analysis with Distribution dataclass
294
713
 
295
714
  This analysis computes:
296
715
  - Frequency profiles for all function word categories
@@ -311,26 +730,36 @@ class FunctionWordResult:
311
730
  The Federalist. Addison-Wesley.
312
731
  Burrows, J. (2002). 'Delta': A measure of stylistic difference and a guide
313
732
  to likely authorship. Literary and Linguistic Computing, 17(3), 267-287.
314
-
315
- Example:
316
- >>> result = compute_function_words("Sample text for analysis.")
317
- >>> print(f"Determiner ratio: {result.determiner_ratio:.3f}")
318
- >>> print(f"Preposition ratio: {result.preposition_ratio:.3f}")
319
- >>> print(f"Most frequent: {result.most_frequent_function_words[:5]}")
320
733
  """
321
734
 
322
- determiner_ratio: float # Determiners / total words
323
- preposition_ratio: float # Prepositions / total words
324
- conjunction_ratio: float # Conjunctions / total words
325
- pronoun_ratio: float # Pronouns / total words
326
- auxiliary_ratio: float # Auxiliary verbs / total words
327
- particle_ratio: float # Particles / total words
328
- total_function_word_ratio: float # All function words / total words
329
- function_word_diversity: float # Unique function words / total function words
330
- most_frequent_function_words: list[tuple[str, int]] # Top N with counts
331
- least_frequent_function_words: list[tuple[str, int]] # Bottom N with counts
332
- function_word_distribution: dict[str, int] # All function words with counts
333
- metadata: dict[str, Any] # Category-specific counts, total counts, etc.
735
+ # Convenient access (mean values)
736
+ determiner_ratio: float
737
+ preposition_ratio: float
738
+ conjunction_ratio: float
739
+ pronoun_ratio: float
740
+ auxiliary_ratio: float
741
+ particle_ratio: float
742
+ total_function_word_ratio: float
743
+ function_word_diversity: float
744
+ most_frequent_function_words: list[tuple[str, int]] # Aggregate
745
+ least_frequent_function_words: list[tuple[str, int]] # Aggregate
746
+ function_word_distribution: dict[str, int] # Aggregate
747
+
748
+ # Full distributions
749
+ determiner_ratio_dist: Distribution
750
+ preposition_ratio_dist: Distribution
751
+ conjunction_ratio_dist: Distribution
752
+ pronoun_ratio_dist: Distribution
753
+ auxiliary_ratio_dist: Distribution
754
+ particle_ratio_dist: Distribution
755
+ total_function_word_ratio_dist: Distribution
756
+ function_word_diversity_dist: Distribution
757
+
758
+ # Chunking context
759
+ chunk_size: int
760
+ chunk_count: int
761
+
762
+ metadata: dict[str, Any]
334
763
 
335
764
 
336
765
  # ===== Advanced Lexical Diversity Results =====
@@ -347,9 +776,9 @@ class VocdDResult:
347
776
  It fits a curve to the relationship between tokens and types across multiple
348
777
  random samples of the text.
349
778
 
350
- Related GitHub Issue:
779
+ Related GitHub Issues:
351
780
  #14 - Advanced Lexical Diversity Metrics
352
- https://github.com/craigtrim/pystylometry/issues/14
781
+ #27 - Native chunked analysis with Distribution dataclass
353
782
 
354
783
  The D parameter represents the theoretical vocabulary size and is more
355
784
  stable across different text lengths than simple TTR measures.
@@ -360,18 +789,23 @@ class VocdDResult:
360
789
  McKee, G., Malvern, D., & Richards, B. (2000). Measuring vocabulary
361
790
  diversity using dedicated software. Literary and Linguistic Computing,
362
791
  15(3), 323-337.
363
-
364
- Example:
365
- >>> result = compute_vocd_d("Long sample text for voc-D analysis...")
366
- >>> print(f"D parameter: {result.d_parameter:.2f}")
367
- >>> print(f"Curve fit R²: {result.curve_fit_r_squared:.3f}")
368
792
  """
369
793
 
370
- d_parameter: float # The D value (theoretical vocabulary size)
371
- curve_fit_r_squared: float # Quality of curve fit (0-1)
372
- sample_count: int # Number of random samples used
373
- optimal_sample_size: int # Optimal token sample size used
374
- metadata: dict[str, Any] # Sampling parameters, convergence info, etc.
794
+ # Convenient access (mean values)
795
+ d_parameter: float
796
+ curve_fit_r_squared: float
797
+ sample_count: int # Total across all chunks
798
+ optimal_sample_size: int
799
+
800
+ # Full distributions
801
+ d_parameter_dist: Distribution
802
+ curve_fit_r_squared_dist: Distribution
803
+
804
+ # Chunking context
805
+ chunk_size: int
806
+ chunk_count: int
807
+
808
+ metadata: dict[str, Any]
375
809
 
376
810
 
377
811
  @dataclass
@@ -383,28 +817,35 @@ class MATTRResult:
383
817
  for longer texts. The moving window approach reduces the impact of text
384
818
  length on the TTR calculation.
385
819
 
386
- Related GitHub Issue:
820
+ Related GitHub Issues:
387
821
  #14 - Advanced Lexical Diversity Metrics
388
- https://github.com/craigtrim/pystylometry/issues/14
822
+ #27 - Native chunked analysis with Distribution dataclass
389
823
 
390
824
  References:
391
825
  Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian knot:
392
826
  The moving-average type-token ratio (MATTR). Journal of Quantitative
393
827
  Linguistics, 17(2), 94-100.
394
-
395
- Example:
396
- >>> result = compute_mattr("Sample text here...", window_size=50)
397
- >>> print(f"MATTR score: {result.mattr_score:.3f}")
398
- >>> print(f"Window size: {result.window_size}")
399
828
  """
400
829
 
401
- mattr_score: float # Average TTR across all windows
402
- window_size: int # Size of moving window used
403
- window_count: int # Number of windows analyzed
404
- ttr_std_dev: float # Standard deviation of TTR across windows
405
- min_ttr: float # Minimum TTR in any window
406
- max_ttr: float # Maximum TTR in any window
407
- metadata: dict[str, Any] # Window-by-window TTR values, etc.
830
+ # Convenient access (mean values)
831
+ mattr_score: float
832
+ window_size: int
833
+ window_count: int # Total across all chunks
834
+ ttr_std_dev: float
835
+ min_ttr: float
836
+ max_ttr: float
837
+
838
+ # Full distributions
839
+ mattr_score_dist: Distribution
840
+ ttr_std_dev_dist: Distribution
841
+ min_ttr_dist: Distribution
842
+ max_ttr_dist: Distribution
843
+
844
+ # Chunking context
845
+ chunk_size: int
846
+ chunk_count: int
847
+
848
+ metadata: dict[str, Any]
408
849
 
409
850
 
410
851
  @dataclass
@@ -416,26 +857,30 @@ class HDDResult:
416
857
  new word types as text length increases, providing a mathematically
417
858
  rigorous measure that is less sensitive to text length than TTR.
418
859
 
419
- Related GitHub Issue:
860
+ Related GitHub Issues:
420
861
  #14 - Advanced Lexical Diversity Metrics
421
- https://github.com/craigtrim/pystylometry/issues/14
862
+ #27 - Native chunked analysis with Distribution dataclass
422
863
 
423
864
  References:
424
865
  McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation
425
866
  study of sophisticated approaches to lexical diversity assessment.
426
867
  Behavior Research Methods, 42(2), 381-392.
427
-
428
- Example:
429
- >>> result = compute_hdd("Sample text for HD-D analysis...")
430
- >>> print(f"HD-D score: {result.hdd_score:.3f}")
431
- >>> print(f"Sample size: {result.sample_size}")
432
868
  """
433
869
 
434
- hdd_score: float # The HD-D value
435
- sample_size: int # Sample size used for calculation
436
- type_count: int # Number of unique types in sample
437
- token_count: int # Number of tokens in sample
438
- metadata: dict[str, Any] # Probability distribution info, etc.
870
+ # Convenient access (mean values)
871
+ hdd_score: float
872
+ sample_size: int
873
+ type_count: int # Total unique across all chunks
874
+ token_count: int # Total across all chunks
875
+
876
+ # Full distributions
877
+ hdd_score_dist: Distribution
878
+
879
+ # Chunking context
880
+ chunk_size: int
881
+ chunk_count: int
882
+
883
+ metadata: dict[str, Any]
439
884
 
440
885
 
441
886
  @dataclass
@@ -447,28 +892,35 @@ class MSTTRResult:
447
892
  normalized measure of lexical diversity that is more comparable across
448
893
  texts of different lengths.
449
894
 
450
- Related GitHub Issue:
895
+ Related GitHub Issues:
451
896
  #14 - Advanced Lexical Diversity Metrics
452
- https://github.com/craigtrim/pystylometry/issues/14
897
+ #27 - Native chunked analysis with Distribution dataclass
453
898
 
454
899
  References:
455
900
  Johnson, W. (1944). Studies in language behavior: I. A program of research.
456
901
  Psychological Monographs, 56(2), 1-15.
457
-
458
- Example:
459
- >>> result = compute_msttr("Sample text...", segment_size=100)
460
- >>> print(f"MSTTR score: {result.msttr_score:.3f}")
461
- >>> print(f"Segments analyzed: {result.segment_count}")
462
902
  """
463
903
 
464
- msttr_score: float # Mean TTR across all segments
465
- segment_size: int # Size of each segment
466
- segment_count: int # Number of segments analyzed
467
- ttr_std_dev: float # Standard deviation of TTR across segments
468
- min_ttr: float # Minimum TTR in any segment
469
- max_ttr: float # Maximum TTR in any segment
470
- segment_ttrs: list[float] # TTR for each individual segment
471
- metadata: dict[str, Any] # Segment details, remaining tokens, etc.
904
+ # Convenient access (mean values)
905
+ msttr_score: float
906
+ segment_size: int
907
+ segment_count: int # Total across all chunks
908
+ ttr_std_dev: float
909
+ min_ttr: float
910
+ max_ttr: float
911
+ segment_ttrs: list[float] # Aggregate from all chunks
912
+
913
+ # Full distributions
914
+ msttr_score_dist: Distribution
915
+ ttr_std_dev_dist: Distribution
916
+ min_ttr_dist: Distribution
917
+ max_ttr_dist: Distribution
918
+
919
+ # Chunking context
920
+ chunk_size: int
921
+ chunk_count: int
922
+
923
+ metadata: dict[str, Any]
472
924
 
473
925
 
474
926
  # ===== Word Frequency Sophistication Results =====
@@ -485,9 +937,9 @@ class WordFrequencySophisticationResult:
485
937
  large corpora. Authors who use less frequent (more sophisticated) words
486
938
  score higher on these metrics.
487
939
 
488
- Related GitHub Issue:
940
+ Related GitHub Issues:
489
941
  #15 - Word Frequency Sophistication Metrics
490
- https://github.com/craigtrim/pystylometry/issues/15
942
+ #27 - Native chunked analysis with Distribution dataclass
491
943
 
492
944
  This analysis uses reference frequency data from:
493
945
  - COCA (Corpus of Contemporary American English)
@@ -507,24 +959,32 @@ class WordFrequencySophisticationResult:
507
959
  A critical evaluation of current word frequency norms. Behavior
508
960
  Research Methods, Instruments, & Computers, 41(4), 977-990.
509
961
  Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
510
-
511
- Example:
512
- >>> result = compute_word_frequency_sophistication("Sample text...")
513
- >>> print(f"Mean frequency rank: {result.mean_frequency_rank:.1f}")
514
- >>> print(f"Rare word ratio: {result.rare_word_ratio:.3f}")
515
- >>> print(f"Academic word ratio: {result.academic_word_ratio:.3f}")
516
962
  """
517
963
 
518
- mean_frequency_rank: float # Average frequency rank of words
519
- median_frequency_rank: float # Median frequency rank
520
- rare_word_ratio: float # Words beyond frequency threshold / total
521
- common_word_ratio: float # High-frequency words / total
522
- academic_word_ratio: float # Academic Word List words / total
523
- advanced_word_ratio: float # Sophisticated vocabulary / total
524
- frequency_band_distribution: dict[str, float] # Distribution across frequency bands
525
- rarest_words: list[tuple[str, float]] # Least frequent words with ranks
526
- most_common_words: list[tuple[str, float]] # Most frequent words with ranks
527
- metadata: dict[str, Any] # Corpus source, band thresholds, total words, etc.
964
+ # Convenient access (mean values)
965
+ mean_frequency_rank: float
966
+ median_frequency_rank: float
967
+ rare_word_ratio: float
968
+ common_word_ratio: float
969
+ academic_word_ratio: float
970
+ advanced_word_ratio: float
971
+ frequency_band_distribution: dict[str, float] # Aggregate
972
+ rarest_words: list[tuple[str, float]] # Aggregate
973
+ most_common_words: list[tuple[str, float]] # Aggregate
974
+
975
+ # Full distributions
976
+ mean_frequency_rank_dist: Distribution
977
+ median_frequency_rank_dist: Distribution
978
+ rare_word_ratio_dist: Distribution
979
+ common_word_ratio_dist: Distribution
980
+ academic_word_ratio_dist: Distribution
981
+ advanced_word_ratio_dist: Distribution
982
+
983
+ # Chunking context
984
+ chunk_size: int
985
+ chunk_count: int
986
+
987
+ metadata: dict[str, Any]
528
988
 
529
989
 
530
990
  # ===== Additional Readability Results =====
@@ -541,9 +1001,9 @@ class DaleChallResult:
541
1001
  The formula provides a grade level estimate based on sentence length and
542
1002
  the percentage of difficult words.
543
1003
 
544
- Related GitHub Issue:
1004
+ Related GitHub Issues:
545
1005
  #16 - Additional Readability Formulas
546
- https://github.com/craigtrim/pystylometry/issues/16
1006
+ #27 - Native chunked analysis with Distribution dataclass
547
1007
 
548
1008
  Formula: 0.1579 * (difficult_words / total_words * 100) + 0.0496 * avg_sentence_length
549
1009
 
@@ -554,21 +1014,26 @@ class DaleChallResult:
554
1014
  Educational Research Bulletin, 27(1), 11-28.
555
1015
  Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall
556
1016
  readability formula. Brookline Books.
557
-
558
- Example:
559
- >>> result = compute_dale_chall("Sample text to analyze...")
560
- >>> print(f"Dale-Chall score: {result.dale_chall_score:.2f}")
561
- >>> print(f"Grade level: {result.grade_level}")
562
- >>> print(f"Difficult word %: {result.difficult_word_ratio * 100:.1f}%")
563
1017
  """
564
1018
 
565
- dale_chall_score: float # The Dale-Chall readability score
566
- grade_level: str # Corresponding grade level (e.g., "7-8", "College")
567
- difficult_word_count: int # Words not on Dale-Chall list
568
- difficult_word_ratio: float # Difficult words / total words
569
- avg_sentence_length: float # Average words per sentence
570
- total_words: int # Total word count
571
- metadata: dict[str, Any] # List of difficult words, adjusted score, etc.
1019
+ # Convenient access (mean values)
1020
+ dale_chall_score: float
1021
+ grade_level: str # Based on mean score
1022
+ difficult_word_count: int # Total across all chunks
1023
+ difficult_word_ratio: float # Mean ratio
1024
+ avg_sentence_length: float # Mean
1025
+ total_words: int # Total across all chunks
1026
+
1027
+ # Full distributions
1028
+ dale_chall_score_dist: Distribution
1029
+ difficult_word_ratio_dist: Distribution
1030
+ avg_sentence_length_dist: Distribution
1031
+
1032
+ # Chunking context
1033
+ chunk_size: int
1034
+ chunk_count: int
1035
+
1036
+ metadata: dict[str, Any]
572
1037
 
573
1038
 
574
1039
  @dataclass
@@ -580,26 +1045,32 @@ class LinsearWriteResult:
580
1045
  syllables) or "hard" (3+ syllables) and uses sentence length to estimate
581
1046
  grade level. It's particularly effective for technical writing.
582
1047
 
583
- Related GitHub Issue:
1048
+ Related GitHub Issues:
584
1049
  #16 - Additional Readability Formulas
585
- https://github.com/craigtrim/pystylometry/issues/16
1050
+ #27 - Native chunked analysis with Distribution dataclass
586
1051
 
587
1052
  References:
588
1053
  Klare, G. R. (1974-1975). Assessing readability. Reading Research Quarterly,
589
1054
  10(1), 62-102.
590
-
591
- Example:
592
- >>> result = compute_linsear_write("Technical manual text...")
593
- >>> print(f"Linsear Write score: {result.linsear_score:.2f}")
594
- >>> print(f"Grade level: {result.grade_level}")
595
1055
  """
596
1056
 
597
- linsear_score: float # The Linsear Write score
598
- grade_level: int # Corresponding U.S. grade level
599
- easy_word_count: int # Words with 1-2 syllables
600
- hard_word_count: int # Words with 3+ syllables
601
- avg_sentence_length: float # Average words per sentence
602
- metadata: dict[str, Any] # Calculation details, sentence count, etc.
1057
+ # Convenient access (mean values)
1058
+ linsear_score: float
1059
+ grade_level: float # Changed to float for mean across chunks
1060
+ easy_word_count: int # Total across all chunks
1061
+ hard_word_count: int # Total across all chunks
1062
+ avg_sentence_length: float # Mean
1063
+
1064
+ # Full distributions
1065
+ linsear_score_dist: Distribution
1066
+ grade_level_dist: Distribution
1067
+ avg_sentence_length_dist: Distribution
1068
+
1069
+ # Chunking context
1070
+ chunk_size: int
1071
+ chunk_count: int
1072
+
1073
+ metadata: dict[str, Any]
603
1074
 
604
1075
 
605
1076
  @dataclass
@@ -611,28 +1082,32 @@ class FryResult:
611
1082
  to determine the grade level. This implementation provides the numerical
612
1083
  coordinates and estimated grade level.
613
1084
 
614
- Related GitHub Issue:
1085
+ Related GitHub Issues:
615
1086
  #16 - Additional Readability Formulas
616
- https://github.com/craigtrim/pystylometry/issues/16
1087
+ #27 - Native chunked analysis with Distribution dataclass
617
1088
 
618
1089
  References:
619
1090
  Fry, E. (1968). A readability formula that saves time. Journal of Reading,
620
1091
  11(7), 513-578.
621
1092
  Fry, E. (1977). Fry's readability graph: Clarifications, validity, and
622
1093
  extension to level 17. Journal of Reading, 21(3), 242-252.
623
-
624
- Example:
625
- >>> result = compute_fry("Sample educational text...")
626
- >>> print(f"Avg sentence length: {result.avg_sentence_length:.1f}")
627
- >>> print(f"Avg syllables/100 words: {result.avg_syllables_per_100:.1f}")
628
- >>> print(f"Grade level: {result.grade_level}")
629
1094
  """
630
1095
 
631
- avg_sentence_length: float # Average words per sentence
632
- avg_syllables_per_100: float # Average syllables per 100 words
633
- grade_level: str # Estimated grade level (e.g., "5", "7", "College")
634
- graph_zone: str # Which zone of Fry graph (for validity checking)
635
- metadata: dict[str, Any] # Total sentences, total syllables, etc.
1096
+ # Convenient access (mean values)
1097
+ avg_sentence_length: float
1098
+ avg_syllables_per_100: float
1099
+ grade_level: str # Based on mean coordinates
1100
+ graph_zone: str # Based on mean coordinates
1101
+
1102
+ # Full distributions
1103
+ avg_sentence_length_dist: Distribution
1104
+ avg_syllables_per_100_dist: Distribution
1105
+
1106
+ # Chunking context
1107
+ chunk_size: int
1108
+ chunk_count: int
1109
+
1110
+ metadata: dict[str, Any]
636
1111
 
637
1112
 
638
1113
  @dataclass
@@ -644,9 +1119,9 @@ class FORCASTResult:
644
1119
  words as a measure, making it faster to compute than syllable-based formulas.
645
1120
  Particularly useful for technical and military documents.
646
1121
 
647
- Related GitHub Issue:
1122
+ Related GitHub Issues:
648
1123
  #16 - Additional Readability Formulas
649
- https://github.com/craigtrim/pystylometry/issues/16
1124
+ #27 - Native chunked analysis with Distribution dataclass
650
1125
 
651
1126
  Formula: 20 - (N / 10), where N is the number of single-syllable words
652
1127
  per 150-word sample.
@@ -655,19 +1130,25 @@ class FORCASTResult:
655
1130
  Caylor, J. S., Sticht, T. G., Fox, L. C., & Ford, J. P. (1973).
656
1131
  Methodologies for determining reading requirements of military
657
1132
  occupational specialties. Human Resources Research Organization.
658
-
659
- Example:
660
- >>> result = compute_forcast("Military technical document text...")
661
- >>> print(f"FORCAST score: {result.forcast_score:.2f}")
662
- >>> print(f"Grade level: {result.grade_level}")
663
1133
  """
664
1134
 
665
- forcast_score: float # The FORCAST readability score
666
- grade_level: int # Corresponding U.S. grade level
667
- single_syllable_ratio: float # Single-syllable words / total words
668
- single_syllable_count: int # Count of single-syllable words
669
- total_words: int # Total word count
670
- metadata: dict[str, Any] # Samples used, calculation details, etc.
1135
+ # Convenient access (mean values)
1136
+ forcast_score: float
1137
+ grade_level: float # Changed to float for mean across chunks
1138
+ single_syllable_ratio: float # Mean ratio
1139
+ single_syllable_count: int # Total across all chunks
1140
+ total_words: int # Total across all chunks
1141
+
1142
+ # Full distributions
1143
+ forcast_score_dist: Distribution
1144
+ grade_level_dist: Distribution
1145
+ single_syllable_ratio_dist: Distribution
1146
+
1147
+ # Chunking context
1148
+ chunk_size: int
1149
+ chunk_count: int
1150
+
1151
+ metadata: dict[str, Any]
671
1152
 
672
1153
 
673
1154
  @dataclass
@@ -679,9 +1160,9 @@ class PowersSumnerKearlResult:
679
1160
  average sentence length and average syllables per word, but with different
680
1161
  coefficients optimized for younger readers.
681
1162
 
682
- Related GitHub Issue:
1163
+ Related GitHub Issues:
683
1164
  #16 - Additional Readability Formulas
684
- https://github.com/craigtrim/pystylometry/issues/16
1165
+ #27 - Native chunked analysis with Distribution dataclass
685
1166
 
686
1167
  Formula: 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
687
1168
 
@@ -689,21 +1170,28 @@ class PowersSumnerKearlResult:
689
1170
  Powers, R. D., Sumner, W. A., & Kearl, B. E. (1958). A recalculation of
690
1171
  four adult readability formulas. Journal of Educational Psychology,
691
1172
  49(2), 99-105.
692
-
693
- Example:
694
- >>> result = compute_powers_sumner_kearl("Children's book text...")
695
- >>> print(f"PSK score: {result.psk_score:.2f}")
696
- >>> print(f"Grade level: {result.grade_level}")
697
1173
  """
698
1174
 
699
- psk_score: float # The Powers-Sumner-Kearl score
700
- grade_level: float # Corresponding grade level (can be decimal for primary grades)
701
- avg_sentence_length: float # Average words per sentence
702
- avg_syllables_per_word: float # Average syllables per word
703
- total_sentences: int # Total sentence count
704
- total_words: int # Total word count
705
- total_syllables: int # Total syllable count
706
- metadata: dict[str, Any] # Calculation details, comparison to Flesch, etc.
1175
+ # Convenient access (mean values)
1176
+ psk_score: float
1177
+ grade_level: float
1178
+ avg_sentence_length: float
1179
+ avg_syllables_per_word: float
1180
+ total_sentences: int # Total across all chunks
1181
+ total_words: int # Total across all chunks
1182
+ total_syllables: int # Total across all chunks
1183
+
1184
+ # Full distributions
1185
+ psk_score_dist: Distribution
1186
+ grade_level_dist: Distribution
1187
+ avg_sentence_length_dist: Distribution
1188
+ avg_syllables_per_word_dist: Distribution
1189
+
1190
+ # Chunking context
1191
+ chunk_size: int
1192
+ chunk_count: int
1193
+
1194
+ metadata: dict[str, Any]
707
1195
 
708
1196
 
709
1197
  # ===== Advanced Syntactic Results =====
@@ -720,9 +1208,9 @@ class AdvancedSyntacticResult:
720
1208
  capture sentence complexity, grammatical sophistication, and syntactic
721
1209
  style preferences.
722
1210
 
723
- Related GitHub Issue:
1211
+ Related GitHub Issues:
724
1212
  #17 - Advanced Syntactic Analysis
725
- https://github.com/craigtrim/pystylometry/issues/17
1213
+ #27 - Native chunked analysis with Distribution dataclass
726
1214
 
727
1215
  Features analyzed:
728
1216
  - Parse tree depth (sentence structural complexity)
@@ -740,28 +1228,42 @@ class AdvancedSyntacticResult:
740
1228
  Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
741
1229
  Lu, X. (2010). Automatic analysis of syntactic complexity in second language
742
1230
  writing. International Journal of Corpus Linguistics, 15(4), 474-496.
743
-
744
- Example:
745
- >>> result = compute_advanced_syntactic("Complex sentence structures...")
746
- >>> print(f"Parse tree depth: {result.mean_parse_tree_depth:.1f}")
747
- >>> print(f"T-units: {result.t_unit_count}")
748
- >>> print(f"Passive voice %: {result.passive_voice_ratio * 100:.1f}%")
749
1231
  """
750
1232
 
751
- mean_parse_tree_depth: float # Average depth of dependency parse trees
752
- max_parse_tree_depth: int # Maximum parse tree depth in text
753
- t_unit_count: int # Number of T-units (minimal terminable units)
754
- mean_t_unit_length: float # Average words per T-unit
755
- clausal_density: float # Clauses per T-unit
756
- dependent_clause_ratio: float # Dependent clauses / total clauses
757
- passive_voice_ratio: float # Passive constructions / total sentences
758
- subordination_index: float # Subordinate clauses / total clauses
759
- coordination_index: float # Coordinate clauses / total clauses
760
- sentence_complexity_score: float # Composite complexity metric
761
- dependency_distance: float # Mean distance between heads and dependents
762
- left_branching_ratio: float # Left-branching structures / total
763
- right_branching_ratio: float # Right-branching structures / total
764
- metadata: dict[str, Any] # Parse tree details, clause counts, etc.
1233
+ # Convenient access (mean values)
1234
+ mean_parse_tree_depth: float
1235
+ max_parse_tree_depth: float # Changed to float for mean across chunks
1236
+ t_unit_count: int # Total across all chunks
1237
+ mean_t_unit_length: float
1238
+ clausal_density: float
1239
+ dependent_clause_ratio: float
1240
+ passive_voice_ratio: float
1241
+ subordination_index: float
1242
+ coordination_index: float
1243
+ sentence_complexity_score: float
1244
+ dependency_distance: float
1245
+ left_branching_ratio: float
1246
+ right_branching_ratio: float
1247
+
1248
+ # Full distributions
1249
+ mean_parse_tree_depth_dist: Distribution
1250
+ max_parse_tree_depth_dist: Distribution
1251
+ mean_t_unit_length_dist: Distribution
1252
+ clausal_density_dist: Distribution
1253
+ dependent_clause_ratio_dist: Distribution
1254
+ passive_voice_ratio_dist: Distribution
1255
+ subordination_index_dist: Distribution
1256
+ coordination_index_dist: Distribution
1257
+ sentence_complexity_score_dist: Distribution
1258
+ dependency_distance_dist: Distribution
1259
+ left_branching_ratio_dist: Distribution
1260
+ right_branching_ratio_dist: Distribution
1261
+
1262
+ # Chunking context
1263
+ chunk_size: int
1264
+ chunk_count: int
1265
+
1266
+ metadata: dict[str, Any]
765
1267
 
766
1268
 
767
1269
  # ===== Sentence Type Results =====
@@ -778,15 +1280,16 @@ class SentenceTypeResult:
778
1280
  function (declarative, interrogative, imperative, exclamatory). Different
779
1281
  authors and genres show distinct patterns in sentence type distribution.
780
1282
 
781
- Related GitHub Issue:
1283
+ Related GitHub Issues:
782
1284
  #18 - Sentence Type Classification
783
- https://github.com/craigtrim/pystylometry/issues/18
1285
+ #27 - Native chunked analysis with Distribution dataclass
784
1286
 
785
1287
  Structural types:
786
1288
  - Simple: One independent clause (e.g., "The cat sat.")
787
1289
  - Compound: Multiple independent clauses (e.g., "I came, I saw, I conquered.")
788
1290
  - Complex: One independent + dependent clause(s) (e.g., "When I arrived, I saw her.")
789
- - Compound-Complex: Multiple independent + dependent (e.g., "I came when called, and I stayed.")
1291
+ - Compound-Complex: Multiple independent + dependent
1292
+ (e.g., "I came when called, and I stayed.")
790
1293
 
791
1294
  Functional types:
792
1295
  - Declarative: Statement (e.g., "The sky is blue.")
@@ -797,27 +1300,19 @@ class SentenceTypeResult:
797
1300
  References:
798
1301
  Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
799
1302
  Huddleston, R., & Pullum, G. K. (2002). The Cambridge Grammar of the English Language.
800
-
801
- Example:
802
- >>> result = compute_sentence_types("Mix of sentence types here...")
803
- >>> print(f"Simple: {result.simple_ratio * 100:.1f}%")
804
- >>> print(f"Complex: {result.complex_ratio * 100:.1f}%")
805
- >>> print(f"Questions: {result.interrogative_ratio * 100:.1f}%")
806
1303
  """
807
1304
 
808
- # Structural type ratios (sum to 1.0)
809
- simple_ratio: float # Simple sentences / total
810
- compound_ratio: float # Compound sentences / total
811
- complex_ratio: float # Complex sentences / total
812
- compound_complex_ratio: float # Compound-complex / total
813
-
814
- # Functional type ratios (sum to 1.0)
815
- declarative_ratio: float # Declarative sentences / total
816
- interrogative_ratio: float # Interrogative (questions) / total
817
- imperative_ratio: float # Imperative (commands) / total
818
- exclamatory_ratio: float # Exclamatory sentences / total
819
-
820
- # Counts
1305
+ # Convenient access (mean ratios)
1306
+ simple_ratio: float
1307
+ compound_ratio: float
1308
+ complex_ratio: float
1309
+ compound_complex_ratio: float
1310
+ declarative_ratio: float
1311
+ interrogative_ratio: float
1312
+ imperative_ratio: float
1313
+ exclamatory_ratio: float
1314
+
1315
+ # Counts (totals across all chunks)
821
1316
  simple_count: int
822
1317
  compound_count: int
823
1318
  complex_count: int
@@ -828,11 +1323,27 @@ class SentenceTypeResult:
828
1323
  exclamatory_count: int
829
1324
  total_sentences: int
830
1325
 
831
- # Diversity
832
- structural_diversity: float # Shannon entropy of structural type distribution
833
- functional_diversity: float # Shannon entropy of functional type distribution
1326
+ # Diversity (mean across chunks)
1327
+ structural_diversity: float
1328
+ functional_diversity: float
1329
+
1330
+ # Full distributions
1331
+ simple_ratio_dist: Distribution
1332
+ compound_ratio_dist: Distribution
1333
+ complex_ratio_dist: Distribution
1334
+ compound_complex_ratio_dist: Distribution
1335
+ declarative_ratio_dist: Distribution
1336
+ interrogative_ratio_dist: Distribution
1337
+ imperative_ratio_dist: Distribution
1338
+ exclamatory_ratio_dist: Distribution
1339
+ structural_diversity_dist: Distribution
1340
+ functional_diversity_dist: Distribution
1341
+
1342
+ # Chunking context
1343
+ chunk_size: int
1344
+ chunk_count: int
834
1345
 
835
- metadata: dict[str, Any] # Sentence-by-sentence classifications, etc.
1346
+ metadata: dict[str, Any]
836
1347
 
837
1348
 
838
1349
  # ===== Extended N-gram Results =====
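The structural_diversity and functional_diversity fields above are documented as the Shannon entropy of the sentence-type distribution. A minimal sketch of that calculation, assuming base-2 logarithms over the type ratios (the base and any normalization used by sentence_types.py are not shown in this diff):

    import math

    def shannon_entropy(ratios: list[float]) -> float:
        # H = -sum(p * log2(p)) over the non-zero type proportions.
        return -sum(p * math.log2(p) for p in ratios if p > 0)

    # Structural proportions for one chunk: simple, compound, complex, compound-complex.
    print(shannon_entropy([0.5, 0.2, 0.2, 0.1]))  # ~1.76 bits; higher means more varied structure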
@@ -1006,6 +1517,7 @@ class VocabularyOverlapResult:
1006
1517
  - Dice coefficient (2 * intersection / sum of sizes)
1007
1518
  - Overlap coefficient (intersection / min(size1, size2))
1008
1519
  - Cosine similarity (using word frequency vectors)
1520
+ - KL divergence (asymmetric distributional difference)
1009
1521
  - Shared vocabulary size and ratio
1010
1522
  - Unique words in each text
1011
1523
  - Most distinctive words for each text
@@ -1015,6 +1527,10 @@ class VocabularyOverlapResult:
1015
1527
  New Phytologist, 11(2), 37-50.
1016
1528
  Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
1017
1529
  Retrieval. McGraw-Hill.
1530
+ Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
1531
+ Annals of Mathematical Statistics, 22(1), 79-86.
1532
+ Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
1533
+ MIT Press.
1018
1534
 
1019
1535
  Example:
1020
1536
  >>> result = compute_vocabulary_overlap(text1, text2)
@@ -1028,6 +1544,7 @@ class VocabularyOverlapResult:
1028
1544
  dice_coefficient: float # 2 * intersection / (size1 + size2)
1029
1545
  overlap_coefficient: float # Intersection / min(size1, size2)
1030
1546
  cosine_similarity: float # Cosine of frequency vectors
1547
+ kl_divergence: float # Kullback-Leibler divergence (asymmetric, text1 || text2)
1031
1548
 
1032
1549
  # Vocabulary sizes
1033
1550
  text1_vocab_size: int # Unique words in text 1
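The kl_divergence field added here is described as the asymmetric Kullback-Leibler divergence D(text1 || text2) over word-frequency distributions. A minimal sketch, assuming add-one smoothing over the joint vocabulary so no term divides by zero (the smoothing actually used by vocabulary_overlap.py is not visible in this diff):

    import math
    from collections import Counter

    def kl_divergence(tokens1: list[str], tokens2: list[str]) -> float:
        # D(P || Q): how much text1's word distribution diverges from text2's.
        vocab = set(tokens1) | set(tokens2)
        c1, c2 = Counter(tokens1), Counter(tokens2)
        n1 = len(tokens1) + len(vocab)  # add-one smoothing denominators
        n2 = len(tokens2) + len(vocab)
        return sum(
            ((c1[w] + 1) / n1) * math.log(((c1[w] + 1) / n1) / ((c2[w] + 1) / n2))
            for w in vocab
        )

    print(kl_divergence("the cat sat on the mat".split(), "the dog sat down".split()))

Because the measure is asymmetric, swapping the arguments generally gives a different value, which is why the field comment pins the direction to text1 || text2.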
@@ -1249,6 +1766,87 @@ class KilgarriffResult:
  metadata: dict[str, Any] # Frequency tables, expected values, etc.


+ @dataclass
+ class KilgarriffDriftResult:
+ """Result from Kilgarriff chi-squared drift detection within a single document.
+
+ This result captures stylistic drift patterns by comparing sequential chunks
+ of text using Kilgarriff's chi-squared method. It enables detection of
+ inconsistent authorship, heavy editing, pasted content, and AI-generated
+ text signatures.
+
+ Related GitHub Issues:
+ #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
+ https://github.com/craigtrim/pystylometry/issues/36
+ #31 - Classical Stylometric Methods from Programming Historian
+ https://github.com/craigtrim/pystylometry/issues/31
+
+ Pattern Signatures:
+ - consistent: Low, stable χ² across pairs (natural human writing)
+ - gradual_drift: Slowly increasing trend (author fatigue, topic shift)
+ - sudden_spike: One pair has high χ² (pasted content, different author)
+ - suspiciously_uniform: Near-zero variance (possible AI generation)
+ - unknown: Insufficient data for classification
+
+ Marketing Name: "Style Drift Detector" / "Consistency Fingerprint"
+
+ References:
+ Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus
+ Linguistics, 6(1), 97-133.
+
+ Example:
+ >>> result = compute_kilgarriff_drift(text, window_size=1000, stride=500)
+ >>> result.pattern # "consistent", "gradual_drift", "sudden_spike", etc.
+ 'consistent'
+ >>> result.mean_chi_squared # Average χ² across chunk pairs
+ 45.2
+ >>> result.status # "success", "marginal_data", "insufficient_data"
+ 'success'
+ """
+
+ # Status (graceful handling of edge cases)
+ status: str # "success", "marginal_data", "insufficient_data"
+ status_message: str # Human-readable explanation
+
+ # Pattern classification
+ pattern: str # "consistent", "gradual_drift", "sudden_spike", "suspiciously_uniform", "unknown"
+ pattern_confidence: float # 0.0-1.0 confidence in classification
+
+ # Holistic metrics (may be NaN if insufficient data)
+ mean_chi_squared: float # Average χ² across all chunk pairs
+ std_chi_squared: float # Standard deviation of χ² values
+ max_chi_squared: float # Highest χ² between any two chunks
+ min_chi_squared: float # Lowest χ² between any two chunks
+ max_location: int # Index of chunk boundary with max χ² (0-indexed)
+ trend: float # Linear regression slope of χ² over chunk pairs
+
+ # Pairwise comparison data
+ pairwise_scores: list[dict] # [{"chunk_pair": (0, 1), "chi_squared": 45.2, "top_words": [...]}]
+
+ # Window configuration (for reproducibility)
+ window_size: int
+ stride: int
+ overlap_ratio: float # Computed: max(0, 1 - stride/window_size)
+ comparison_mode: str # "sequential", "all_pairs", "fixed_lag"
+ window_count: int
+
+ # For all_pairs mode only
+ distance_matrix: list[list[float]] | None # None for sequential/fixed_lag
+
+ # Thresholds used for pattern classification (for transparency)
+ thresholds: dict[str, float]
+
+ metadata: dict[str, Any]
+
+
+ # ===== Consistency Module Thresholds =====
+ # Related to GitHub Issue #36
+ # These are calibration constants for pattern classification
+
+ MIN_WINDOWS = 3 # Bare minimum for variance calculation
+ RECOMMENDED_WINDOWS = 5 # For reliable pattern classification
+
+
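The fields above describe a windowed pipeline: split the document into overlapping windows (overlap_ratio = max(0, 1 - stride/window_size), so window_size=1000 with stride=500 gives 0.5), score adjacent windows with Kilgarriff's chi-squared, and classify the resulting series into one of the pattern signatures. The sketch below follows that flow in sequential mode with illustrative thresholds; the calibrated values the package actually applies are the ones reported in the thresholds field:

    from collections import Counter
    from statistics import mean, pstdev

    def windows(tokens: list[str], window_size: int, stride: int) -> list[list[str]]:
        """Overlapping token windows covering the document."""
        last_start = max(len(tokens) - window_size, 0)
        return [tokens[i:i + window_size] for i in range(0, last_start + 1, stride)]

    def kilgarriff_chi_squared(a: list[str], b: list[str], top_n: int = 50) -> float:
        """Chi-squared over the most frequent words of the pooled window pair."""
        ca, cb = Counter(a), Counter(b)
        joint = ca + cb
        na, nb = len(a), len(b)
        chi2 = 0.0
        for word, _ in joint.most_common(top_n):
            exp_a = joint[word] * na / (na + nb)  # expected count in window a
            exp_b = joint[word] * nb / (na + nb)  # expected count in window b
            chi2 += (ca[word] - exp_a) ** 2 / exp_a + (cb[word] - exp_b) ** 2 / exp_b
        return chi2

    def classify(scores: list[float], min_windows: int = 3) -> str:
        """Map adjacent-pair chi-squared scores to a drift pattern (toy thresholds)."""
        if len(scores) < min_windows - 1:          # scores come from adjacent window pairs
            return "unknown"
        mu, sigma = mean(scores), pstdev(scores)
        xs = list(range(len(scores)))
        x_bar = mean(xs)
        var_x = sum((x - x_bar) ** 2 for x in xs)
        slope = sum((x - x_bar) * (y - mu) for x, y in zip(xs, scores)) / var_x
        if sigma < 0.05 * mu:                      # near-zero variance
            return "suspiciously_uniform"
        if max(scores) > mu + 2 * sigma:           # one pair far above the rest
            return "sudden_spike"
        if slope > 0.1 * mu:                       # steadily rising chi-squared
            return "gradual_drift"
        return "consistent"

    text = "the quick brown fox jumps over the lazy dog " * 300   # synthetic filler
    window_size, stride = 1000, 500
    wins = windows(text.split(), window_size, stride)
    overlap_ratio = max(0.0, 1 - stride / window_size)            # 0.5 for these settings
    scores = [kilgarriff_chi_squared(wins[i], wins[i + 1]) for i in range(len(wins) - 1)]
    print(len(wins), overlap_ratio, classify(scores))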
  @dataclass
  class MinMaxResult:
  """Result from Min-Max distance method (Burrows' original method).
@@ -1305,6 +1903,54 @@ class JohnsBurrowsResult:
  metadata: dict[str, Any] # Method-specific parameters, z-scores, etc.


+ @dataclass
+ class CompressionResult:
+ """Result from compression-based authorship attribution.
+
+ Compression-based methods use the Normalized Compression Distance (NCD) to
+ measure similarity between texts. The intuition is that if two texts are
+ similar, compressing them together will yield better compression than
+ compressing separately. This approach is language-independent and captures
+ deep statistical regularities.
+
+ Related GitHub Issue:
+ #24 - Additional Authorship Attribution Methods
+ https://github.com/craigtrim/pystylometry/issues/24
+
+ Formula:
+ NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+ where C(x) is the compressed size of x, and C(xy) is the compressed
+ size of x and y concatenated.
+
+ Interpretation:
+ - NCD ≈ 0: Texts are very similar
+ - NCD ≈ 1: Texts are very different
+ - Typical same-author pairs: 0.3-0.6
+ - Typical different-author pairs: 0.6-0.9
+
+ References:
+ Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+ IEEE Transactions on Information Theory, 51(4), 1523-1545.
+
+ Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+ zipping. Physical Review Letters, 88(4), 048702.
+
+ Example:
+ >>> result = compute_compression_distance(text1, text2)
+ >>> print(f"NCD: {result.ncd:.3f}")
+ >>> if result.ncd < 0.5:
+ ... print("Texts likely by same author")
+ """
+
+ ncd: float # Normalized Compression Distance [0, 1+]
+ compressor: str # Compression algorithm used (e.g., "gzip", "zlib", "bz2")
+ text1_compressed_size: int # Compressed size of text1 alone
+ text2_compressed_size: int # Compressed size of text2 alone
+ combined_compressed_size: int # Compressed size of concatenated texts
+ metadata: dict[str, Any] # Raw sizes, compression ratios, etc.
+
+
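A standalone sketch of the NCD formula above, using zlib as the compressor C(.). compute_compression_distance is the package's entry point; this snippet only makes the arithmetic concrete and is not its implementation:

    import zlib

    def compressed_size(s: str) -> int:
        """C(s): size of s in bytes after zlib compression."""
        return len(zlib.compress(s.encode("utf-8"), 9))

    def ncd(x: str, y: str) -> float:
        """Normalized Compression Distance per the formula in the docstring."""
        cx, cy = compressed_size(x), compressed_size(y)
        cxy = compressed_size(x + y)          # C(xy): both texts concatenated
        return (cxy - min(cx, cy)) / max(cx, cy)

    near = ncd("she walked slowly along the shore", "she walked slowly along the shore at dawn")
    far = ncd("she walked slowly along the shore", "quarterly revenue exceeded all projections")
    print(f"overlapping pair: {near:.3f}, unrelated pair: {far:.3f}")

Real compressors only approximate Kolmogorov complexity, so scores can drift slightly above 1.0; hence the [0, 1+] range noted on the ncd field.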
  # ===== Rhythm and Prosody Results =====
  # Related to GitHub Issue #25: Rhythm and Prosody Metrics
  # https://github.com/craigtrim/pystylometry/issues/25
@@ -1379,6 +2025,118 @@ class RhythmProsodyResult:
  metadata: dict[str, Any] # Syllable counts, stress patterns, phoneme data, etc.


+ # ===== Dialect Detection Results =====
+ # Related to GitHub Issue #35: Dialect detection with extensible JSON markers
+ # https://github.com/craigtrim/pystylometry/issues/35
+ # Related to GitHub Issue #30: Whonix stylometry features
+ # https://github.com/craigtrim/pystylometry/issues/30
+
+
+ @dataclass
+ class DialectResult:
+ """Result from dialect detection analysis.
+
+ Dialect detection identifies regional linguistic preferences (British vs.
+ American English) and measures text markedness - how far the text deviates
+ from "unmarked" standard English. This analysis uses an extensible JSON-based
+ marker database covering vocabulary, spelling patterns, grammar patterns,
+ punctuation conventions, and idiomatic expressions.
+
+ The analysis follows the chunking pattern from Issue #27, computing metrics
+ per chunk and providing distributions for stylometric fingerprinting. Dialect
+ markers are sparse, so variance across chunks can reveal mixed authorship
+ (e.g., a UK speaker using ChatGPT-generated American English content).
+
+ Related GitHub Issues:
+ #35 - Dialect detection with extensible JSON markers
+ https://github.com/craigtrim/pystylometry/issues/35
+ #30 - Whonix stylometry features (regional linguistic preferences)
+ https://github.com/craigtrim/pystylometry/issues/30
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
+ Theoretical Background:
+ Markedness theory (Battistella, 1990) informs the markedness_score:
+ marked forms stand out against "standard" written English. High
+ markedness suggests intentional stylistic choice or strong dialect
+ identity. Dialectometry (Goebl, 1982; Nerbonne, 2009) provides the
+ quantitative framework for holistic dialect measurement.
+
+ Feature Levels:
+ Markers are categorized by linguistic level for fine-grained analysis:
+ - Phonological: Spelling reflecting pronunciation (colour/color)
+ - Morphological: Word formation (-ise/-ize, -our/-or, doubled L)
+ - Lexical: Different words for same concept (flat/apartment)
+ - Syntactic: Grammar differences (have got/have, collective nouns)
+
+ Eye Dialect vs. True Dialect:
+ Following Encyclopedia.com's distinction, "eye dialect" (gonna, wanna)
+ indicates informal register, not regional dialect. True dialect markers
+ (colour, flat, lorry) indicate actual regional preference.
+
+ References:
+ Battistella, Edwin L. "Markedness: The Evaluative Superstructure of
+ Language." State University of New York Press, 1990.
+ Goebl, Hans. "Dialektometrie: Prinzipien und Methoden des Einsatzes der
+ numerischen Taxonomie im Bereich der Dialektgeographie." Verlag der
+ Österreichischen Akademie der Wissenschaften, 1982.
+ Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
+ Compass, vol. 3, no. 1, 2009, pp. 175-198.
+ Labov, William. "The Social Stratification of English in New York City."
+ Cambridge University Press, 2006.
+ Whonix Project. "Stylometry: Deanonymization Techniques." Whonix Wiki,
+ https://www.whonix.org/wiki/Stylometry
+
+ Example:
+ >>> result = compute_dialect(text, chunk_size=1000)
+ >>> result.dialect # "british", "american", "mixed", or "neutral"
+ 'british'
+ >>> result.british_score # Mean across chunks
+ 0.72
+ >>> result.british_score_dist.std # Variance reveals fingerprint
+ 0.05
+ >>> result.markedness_score # Deviation from standard English
+ 0.35
+ """
+
+ # Classification result
+ dialect: str # "british", "american", "mixed", "neutral"
+ confidence: float # 0.0-1.0, how confident the classification is
+
+ # Convenient access (mean values across chunks)
+ british_score: float # Mean British marker density (0.0-1.0)
+ american_score: float # Mean American marker density (0.0-1.0)
+ markedness_score: float # Mean deviation from unmarked standard (0.0-1.0)
+
+ # Full distributions for stylometric fingerprinting
+ british_score_dist: Distribution
+ american_score_dist: Distribution
+ markedness_score_dist: Distribution
+
+ # Marker breakdown by linguistic level (aggregated across chunks)
+ # Keys: "phonological", "morphological", "lexical", "syntactic"
+ markers_by_level: dict[str, dict[str, int]]
+
+ # Detailed marker counts (aggregated across chunks)
+ spelling_markers: dict[str, int] # {"colour": 2, "color": 1}
+ vocabulary_markers: dict[str, int] # {"flat": 1, "apartment": 0}
+ grammar_markers: dict[str, int] # {"have got": 1}
+
+ # Eye dialect (informal register indicators, not true dialect)
+ eye_dialect_count: int # Total eye dialect markers (gonna, wanna, etc.)
+ eye_dialect_ratio: float # Eye dialect per 1000 words
+
+ # Register analysis hints
+ register_hints: dict[str, Any] # {"formality": 0.7, "hedging_density": 0.05}
+
+ # Chunking context
+ chunk_size: int
+ chunk_count: int
+
+ # Extensible metadata
+ metadata: dict[str, Any]
+
+
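A toy version of the per-chunk scoring that DialectResult aggregates: count British and American markers in each chunk, convert to densities, then keep both the mean (the flat convenience fields) and the per-chunk spread that the *_dist fields expose. The marker sets and decision rule here are illustrative stand-ins for the extensible JSON marker database, not the package's data or calibration:

    from statistics import mean, pstdev

    BRITISH = {"colour", "flat", "lorry", "whilst", "favour"}     # toy marker sets
    AMERICAN = {"color", "apartment", "truck", "favor", "gotten"}

    def chunk_densities(text: str, chunk_size: int = 1000) -> tuple[list[float], list[float]]:
        """British/American marker density per chunk of roughly chunk_size words."""
        tokens = text.lower().split()
        chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)] or [[]]
        brit, amer = [], []
        for chunk in chunks:
            n = max(len(chunk), 1)
            brit.append(sum(t in BRITISH for t in chunk) / n)
            amer.append(sum(t in AMERICAN for t in chunk) / n)
        return brit, amer

    brit, amer = chunk_densities("the colour of the flat was grey " * 200)
    british_score, american_score = mean(brit), mean(amer)
    if british_score > american_score:
        dialect = "british"
    elif american_score > british_score:
        dialect = "american"
    else:
        dialect = "neutral"
    print(dialect, round(british_score, 3), round(pstdev(brit), 3), len(brit))

A small spread across chunks suggests one consistent register; a large spread on such sparse markers is exactly the mixed-authorship signal the docstring describes.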
  # ===== Unified Analysis Result =====
