pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/additional_formulas.py (new file)
@@ -0,0 +1,2110 @@
+ """Additional readability formulas.
+
+ This module provides additional readability metrics beyond the core formulas
+ (Flesch, SMOG, Gunning Fog, Coleman-Liau, ARI). These formulas offer alternative
+ approaches to measuring text difficulty and are valuable for cross-validation
+ and comprehensive readability assessment.
+
+ Related GitHub Issues:
+ #16 - Additional Readability Formulas
+ #27 - Native chunked analysis with Distribution dataclass
+
+ Formulas implemented:
+ - Dale-Chall: Based on list of 3000 familiar words
+ - Linsear Write: Developed for technical writing assessment
+ - Fry Readability Graph: Visual graph-based assessment
+ - FORCAST: Military formula using only single-syllable words
+ - Powers-Sumner-Kearl: Recalibrated Flesch for primary grades
+
+ References:
+ Dale, E., & Chall, J. S. (1948). A formula for predicting readability.
+ Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall
+ readability formula. Brookline Books.
+ Klare, G. R. (1974-1975). Assessing readability. Reading Research Quarterly.
+ Fry, E. (1968). A readability formula that saves time. Journal of Reading.
+ Caylor, J. S., et al. (1973). Methodologies for determining reading requirements
+ of military occupational specialties. Human Resources Research Organization.
+ Powers, R. D., Sumner, W. A., & Kearl, B. E. (1958). A recalculation of four
+ adult readability formulas. Journal of Educational Psychology.
+ """
+
+ import math
+
+ from .._normalize import normalize_for_readability
+ from .._types import (
+ DaleChallResult,
+ Distribution,
+ FORCASTResult,
+ FryResult,
+ LinsearWriteResult,
+ PowersSumnerKearlResult,
+ chunk_text,
+ make_distribution,
+ )
+ from .._utils import split_sentences, tokenize
+ from .syllables import count_syllables
+
+ # Dale-Chall List of Familiar Words (subset of ~1200 words)
+ # GitHub Issue #16: https://github.com/craigtrim/pystylometry/issues/16
+ # Full Dale-Chall list has 3000 words that 80% of 4th graders understand.
+ # This is a representative subset covering most common everyday words.
+ DALE_CHALL_FAMILIAR_WORDS = {
52
+ # Articles, pronouns, determiners
53
+ "a",
54
+ "an",
55
+ "the",
56
+ "this",
57
+ "that",
58
+ "these",
59
+ "those",
60
+ "some",
61
+ "any",
62
+ "all",
63
+ "each",
64
+ "every",
65
+ "both",
66
+ "few",
67
+ "many",
68
+ "much",
69
+ "more",
70
+ "most",
71
+ "other",
72
+ "another",
73
+ "such",
74
+ "what",
75
+ "which",
76
+ "who",
77
+ "whom",
78
+ "whose",
79
+ "whoever",
80
+ "i",
81
+ "me",
82
+ "my",
83
+ "mine",
84
+ "myself",
85
+ "we",
86
+ "us",
87
+ "our",
88
+ "ours",
89
+ "ourselves",
90
+ "you",
91
+ "your",
92
+ "yours",
93
+ "yourself",
94
+ "yourselves",
95
+ "he",
96
+ "him",
97
+ "his",
98
+ "himself",
99
+ "she",
100
+ "her",
101
+ "hers",
102
+ "herself",
103
+ "it",
104
+ "its",
105
+ "itself",
106
+ "they",
107
+ "them",
108
+ "their",
109
+ "theirs",
110
+ "themselves",
111
+ "one",
112
+ "ones",
113
+ "someone",
114
+ "somebody",
115
+ "something",
116
+ "anyone",
117
+ "anybody",
118
+ "anything",
119
+ "everyone",
120
+ "everybody",
121
+ "everything",
122
+ "no",
123
+ "none",
124
+ "nobody",
125
+ "nothing",
126
+ # Conjunctions and prepositions
127
+ "and",
128
+ "or",
129
+ "but",
130
+ "if",
131
+ "when",
132
+ "where",
133
+ "why",
134
+ "how",
135
+ "because",
136
+ "so",
137
+ "for",
138
+ "nor",
139
+ "yet",
140
+ "after",
141
+ "before",
142
+ "while",
143
+ "since",
144
+ "until",
145
+ "unless",
146
+ "though",
147
+ "although",
148
+ "whether",
149
+ "than",
150
+ "as",
151
+ "like",
152
+ "of",
153
+ "to",
154
+ "in",
155
+ "on",
156
+ "at",
157
+ "by",
158
+ "with",
159
+ "from",
160
+ "about",
161
+ "into",
162
+ "through",
163
+ "over",
164
+ "under",
165
+ "above",
166
+ "below",
167
+ "between",
168
+ "among",
169
+ "against",
170
+ "during",
171
+ "without",
172
+ "within",
173
+ "along",
174
+ "across",
175
+ "behind",
176
+ "beside",
177
+ "near",
178
+ "off",
179
+ "out",
180
+ "up",
181
+ "down",
182
+ "around",
183
+ "past",
184
+ "toward",
185
+ "upon",
186
+ # Common verbs (base, past, -ing, -ed forms included)
187
+ "be",
188
+ "am",
189
+ "is",
190
+ "are",
191
+ "was",
192
+ "were",
193
+ "been",
194
+ "being",
195
+ "have",
196
+ "has",
197
+ "had",
198
+ "having",
199
+ "do",
200
+ "does",
201
+ "did",
202
+ "doing",
203
+ "done",
204
+ "will",
205
+ "would",
206
+ "shall",
207
+ "should",
208
+ "may",
209
+ "might",
210
+ "must",
211
+ "can",
212
+ "could",
213
+ "go",
214
+ "goes",
215
+ "went",
216
+ "gone",
217
+ "going",
218
+ "come",
219
+ "comes",
220
+ "came",
221
+ "coming",
222
+ "make",
223
+ "makes",
224
+ "made",
225
+ "making",
226
+ "get",
227
+ "gets",
228
+ "got",
229
+ "getting",
230
+ "gotten",
231
+ "know",
232
+ "knows",
233
+ "knew",
234
+ "known",
235
+ "knowing",
236
+ "think",
237
+ "thinks",
238
+ "thought",
239
+ "thinking",
240
+ "see",
241
+ "sees",
242
+ "saw",
243
+ "seen",
244
+ "seeing",
245
+ "look",
246
+ "looks",
247
+ "looked",
248
+ "looking",
249
+ "take",
250
+ "takes",
251
+ "took",
252
+ "taken",
253
+ "taking",
254
+ "give",
255
+ "gives",
256
+ "gave",
257
+ "given",
258
+ "giving",
259
+ "find",
260
+ "finds",
261
+ "found",
262
+ "finding",
263
+ "tell",
264
+ "tells",
265
+ "told",
266
+ "telling",
267
+ "ask",
268
+ "asks",
269
+ "asked",
270
+ "asking",
271
+ "work",
272
+ "works",
273
+ "worked",
274
+ "working",
275
+ "seem",
276
+ "seems",
277
+ "seemed",
278
+ "seeming",
279
+ "feel",
280
+ "feels",
281
+ "felt",
282
+ "feeling",
283
+ "try",
284
+ "tries",
285
+ "tried",
286
+ "trying",
287
+ "leave",
288
+ "leaves",
289
+ "left",
290
+ "leaving",
291
+ "call",
292
+ "calls",
293
+ "called",
294
+ "calling",
295
+ "use",
296
+ "uses",
297
+ "used",
298
+ "using",
299
+ "want",
300
+ "wants",
301
+ "wanted",
302
+ "wanting",
303
+ "need",
304
+ "needs",
305
+ "needed",
306
+ "needing",
307
+ "say",
308
+ "says",
309
+ "said",
310
+ "saying",
311
+ "talk",
312
+ "talks",
313
+ "talked",
314
+ "talking",
315
+ "turn",
316
+ "turns",
317
+ "turned",
318
+ "turning",
319
+ "run",
320
+ "runs",
321
+ "ran",
322
+ "running",
323
+ "move",
324
+ "moves",
325
+ "moved",
326
+ "moving",
327
+ "live",
328
+ "lives",
329
+ "lived",
330
+ "living",
331
+ "believe",
332
+ "believes",
333
+ "believed",
334
+ "believing",
335
+ "hold",
336
+ "holds",
337
+ "held",
338
+ "holding",
339
+ "bring",
340
+ "brings",
341
+ "brought",
342
+ "bringing",
343
+ "happen",
344
+ "happens",
345
+ "happened",
346
+ "happening",
347
+ "write",
348
+ "writes",
349
+ "wrote",
350
+ "written",
351
+ "writing",
352
+ "sit",
353
+ "sits",
354
+ "sat",
355
+ "sitting",
356
+ "stand",
357
+ "stands",
358
+ "stood",
359
+ "standing",
360
+ "hear",
361
+ "hears",
362
+ "heard",
363
+ "hearing",
364
+ "let",
365
+ "lets",
366
+ "letting",
367
+ "help",
368
+ "helps",
369
+ "helped",
370
+ "helping",
371
+ "show",
372
+ "shows",
373
+ "showed",
374
+ "shown",
375
+ "showing",
376
+ "play",
377
+ "plays",
378
+ "played",
379
+ "playing",
380
+ "read",
381
+ "reads",
382
+ "reading",
383
+ "change",
384
+ "changes",
385
+ "changed",
386
+ "changing",
387
+ "keep",
388
+ "keeps",
389
+ "kept",
390
+ "keeping",
391
+ "start",
392
+ "starts",
393
+ "started",
394
+ "starting",
395
+ "stop",
396
+ "stops",
397
+ "stopped",
398
+ "stopping",
399
+ "learn",
400
+ "learns",
401
+ "learned",
402
+ "learning",
403
+ "grow",
404
+ "grows",
405
+ "grew",
406
+ "grown",
407
+ "growing",
408
+ "open",
409
+ "opens",
410
+ "opened",
411
+ "opening",
412
+ "close",
413
+ "closes",
414
+ "closed",
415
+ "closing",
416
+ "walk",
417
+ "walks",
418
+ "walked",
419
+ "walking",
420
+ "win",
421
+ "wins",
422
+ "won",
423
+ "winning",
424
+ "begin",
425
+ "begins",
426
+ "began",
427
+ "begun",
428
+ "beginning",
429
+ "end",
430
+ "ends",
431
+ "ended",
432
+ "ending",
433
+ "lose",
434
+ "loses",
435
+ "lost",
436
+ "losing",
437
+ "send",
438
+ "sends",
439
+ "sent",
440
+ "sending",
441
+ "buy",
442
+ "buys",
443
+ "bought",
444
+ "buying",
445
+ "pay",
446
+ "pays",
447
+ "paid",
448
+ "paying",
449
+ "eat",
450
+ "eats",
451
+ "ate",
452
+ "eaten",
453
+ "eating",
454
+ "drink",
455
+ "drinks",
456
+ "drank",
457
+ "drinking",
458
+ "sleep",
459
+ "sleeps",
460
+ "slept",
461
+ "sleeping",
462
+ "wake",
463
+ "wakes",
464
+ "woke",
465
+ "waking",
466
+ "sing",
467
+ "sings",
468
+ "sang",
469
+ "sung",
470
+ "singing",
471
+ "dance",
472
+ "dances",
473
+ "danced",
474
+ "dancing",
475
+ "wait",
476
+ "waits",
477
+ "waited",
478
+ "waiting",
479
+ "stay",
480
+ "stays",
481
+ "stayed",
482
+ "staying",
483
+ "fly",
484
+ "flies",
485
+ "flew",
486
+ "flown",
487
+ "flying",
488
+ "fall",
489
+ "falls",
490
+ "fell",
491
+ "fallen",
492
+ "falling",
493
+ "cut",
494
+ "cuts",
495
+ "cutting",
496
+ "break",
497
+ "breaks",
498
+ "broke",
499
+ "broken",
500
+ "breaking",
501
+ "watch",
502
+ "watches",
503
+ "watched",
504
+ "watching",
505
+ "listen",
506
+ "listens",
507
+ "listened",
508
+ "listening",
509
+ "remember",
510
+ "remembers",
511
+ "remembered",
512
+ "remembering",
513
+ "forget",
514
+ "forgets",
515
+ "forgot",
516
+ "forgotten",
517
+ "forgetting",
518
+ "meet",
519
+ "meets",
520
+ "met",
521
+ "meeting",
522
+ "follow",
523
+ "follows",
524
+ "followed",
525
+ "following",
526
+ "carry",
527
+ "carries",
528
+ "carried",
529
+ "carrying",
530
+ "catch",
531
+ "catches",
532
+ "caught",
533
+ "catching",
534
+ "draw",
535
+ "draws",
536
+ "drew",
537
+ "drawn",
538
+ "drawing",
539
+ "drive",
540
+ "drives",
541
+ "drove",
542
+ "driven",
543
+ "driving",
544
+ "ride",
545
+ "rides",
546
+ "rode",
547
+ "ridden",
548
+ "riding",
549
+ "wear",
550
+ "wears",
551
+ "wore",
552
+ "worn",
553
+ "wearing",
554
+ "pull",
555
+ "pulls",
556
+ "pulled",
557
+ "pulling",
558
+ "push",
559
+ "pushes",
560
+ "pushed",
561
+ "pushing",
562
+ "throw",
563
+ "throws",
564
+ "threw",
565
+ "thrown",
566
+ "throwing",
567
+ "reach",
568
+ "reaches",
569
+ "reached",
570
+ "reaching",
571
+ "pass",
572
+ "passes",
573
+ "passed",
574
+ "passing",
575
+ "shoot",
576
+ "shoots",
577
+ "shot",
578
+ "shooting",
579
+ "rise",
580
+ "rises",
581
+ "rose",
582
+ "risen",
583
+ "rising",
584
+ "blow",
585
+ "blows",
586
+ "blew",
587
+ "blown",
588
+ "blowing",
589
+ "grow",
590
+ "grows",
591
+ "grew",
592
+ "grown",
593
+ "growing",
594
+ "hit",
595
+ "hits",
596
+ "hitting",
597
+ "fight",
598
+ "fights",
599
+ "fought",
600
+ "fighting",
601
+ "die",
602
+ "dies",
603
+ "died",
604
+ "dying",
605
+ "kill",
606
+ "kills",
607
+ "killed",
608
+ "killing",
609
+ "speak",
610
+ "speaks",
611
+ "spoke",
612
+ "spoken",
613
+ "speaking",
614
+ # Common nouns
615
+ "time",
616
+ "times",
617
+ "year",
618
+ "years",
619
+ "day",
620
+ "days",
621
+ "week",
622
+ "weeks",
623
+ "month",
624
+ "months",
625
+ "hour",
626
+ "hours",
627
+ "minute",
628
+ "minutes",
629
+ "second",
630
+ "seconds",
631
+ "morning",
632
+ "afternoon",
633
+ "evening",
634
+ "night",
635
+ "today",
636
+ "yesterday",
637
+ "tomorrow",
638
+ "people",
639
+ "person",
640
+ "man",
641
+ "men",
642
+ "woman",
643
+ "women",
644
+ "child",
645
+ "children",
646
+ "boy",
647
+ "boys",
648
+ "girl",
649
+ "girls",
650
+ "baby",
651
+ "babies",
652
+ "friend",
653
+ "friends",
654
+ "family",
655
+ "families",
656
+ "mother",
657
+ "father",
658
+ "parent",
659
+ "parents",
660
+ "brother",
661
+ "brothers",
662
+ "sister",
663
+ "sisters",
664
+ "son",
665
+ "daughter",
666
+ "place",
667
+ "places",
668
+ "home",
669
+ "house",
670
+ "houses",
671
+ "room",
672
+ "rooms",
673
+ "school",
674
+ "schools",
675
+ "class",
676
+ "classes",
677
+ "student",
678
+ "students",
679
+ "teacher",
680
+ "teachers",
681
+ "way",
682
+ "ways",
683
+ "thing",
684
+ "things",
685
+ "part",
686
+ "parts",
687
+ "group",
688
+ "groups",
689
+ "number",
690
+ "numbers",
691
+ "side",
692
+ "sides",
693
+ "kind",
694
+ "kinds",
695
+ "head",
696
+ "heads",
697
+ "hand",
698
+ "hands",
699
+ "eye",
700
+ "eyes",
701
+ "face",
702
+ "faces",
703
+ "body",
704
+ "bodies",
705
+ "foot",
706
+ "feet",
707
+ "arm",
708
+ "arms",
709
+ "leg",
710
+ "legs",
711
+ "ear",
712
+ "ears",
713
+ "mouth",
714
+ "water",
715
+ "food",
716
+ "air",
717
+ "land",
718
+ "earth",
719
+ "ground",
720
+ "world",
721
+ "country",
722
+ "countries",
723
+ "state",
724
+ "states",
725
+ "city",
726
+ "cities",
727
+ "town",
728
+ "towns",
729
+ "name",
730
+ "names",
731
+ "word",
732
+ "words",
733
+ "line",
734
+ "lines",
735
+ "page",
736
+ "pages",
737
+ "book",
738
+ "books",
739
+ "story",
740
+ "stories",
741
+ "letter",
742
+ "letters",
743
+ "paper",
744
+ "papers",
745
+ "point",
746
+ "points",
747
+ "end",
748
+ "ends",
749
+ "top",
750
+ "bottom",
751
+ "front",
752
+ "back",
753
+ "life",
754
+ "lives",
755
+ "problem",
756
+ "problems",
757
+ "question",
758
+ "questions",
759
+ "answer",
760
+ "answers",
761
+ "work",
762
+ "works",
763
+ "job",
764
+ "jobs",
765
+ "money",
766
+ "door",
767
+ "doors",
768
+ "window",
769
+ "windows",
770
+ "car",
771
+ "cars",
772
+ "road",
773
+ "roads",
774
+ "street",
775
+ "streets",
776
+ "tree",
777
+ "trees",
778
+ "animal",
779
+ "animals",
780
+ "bird",
781
+ "birds",
782
+ "fish",
783
+ "dog",
784
+ "dogs",
785
+ "cat",
786
+ "cats",
787
+ "horse",
788
+ "horses",
789
+ "sea",
790
+ "mountain",
791
+ "mountains",
792
+ "river",
793
+ "rivers",
794
+ "sun",
795
+ "moon",
796
+ "star",
797
+ "stars",
798
+ "sky",
799
+ "cloud",
800
+ "clouds",
801
+ "rain",
802
+ "snow",
803
+ "wind",
804
+ "fire",
805
+ "light",
806
+ "dark",
807
+ "sound",
808
+ "sounds",
809
+ "color",
810
+ "colors",
811
+ "white",
812
+ "black",
813
+ "red",
814
+ "blue",
815
+ "green",
816
+ "yellow",
817
+ "brown",
818
+ "orange",
819
+ "game",
820
+ "games",
821
+ "ball",
822
+ "music",
823
+ "song",
824
+ "songs",
825
+ "picture",
826
+ "pictures",
827
+ "table",
828
+ "tables",
829
+ "chair",
830
+ "chairs",
831
+ "bed",
832
+ "beds",
833
+ "floor",
834
+ "wall",
835
+ "walls",
836
+ "minute",
837
+ "power",
838
+ "war",
839
+ "force",
840
+ "age",
841
+ "care",
842
+ "order",
843
+ "case",
844
+ # Common adjectives
845
+ "good",
846
+ "better",
847
+ "best",
848
+ "bad",
849
+ "worse",
850
+ "worst",
851
+ "big",
852
+ "bigger",
853
+ "biggest",
854
+ "small",
855
+ "smaller",
856
+ "smallest",
857
+ "large",
858
+ "larger",
859
+ "largest",
860
+ "little",
861
+ "less",
862
+ "least",
863
+ "long",
864
+ "longer",
865
+ "longest",
866
+ "short",
867
+ "shorter",
868
+ "shortest",
869
+ "high",
870
+ "higher",
871
+ "highest",
872
+ "low",
873
+ "lower",
874
+ "lowest",
875
+ "old",
876
+ "older",
877
+ "oldest",
878
+ "young",
879
+ "younger",
880
+ "youngest",
881
+ "new",
882
+ "newer",
883
+ "newest",
884
+ "great",
885
+ "greater",
886
+ "greatest",
887
+ "important",
888
+ "right",
889
+ "left",
890
+ "own",
891
+ "other",
892
+ "different",
893
+ "same",
894
+ "next",
895
+ "last",
896
+ "first",
897
+ "second",
898
+ "third",
899
+ "early",
900
+ "earlier",
901
+ "earliest",
902
+ "late",
903
+ "later",
904
+ "latest",
905
+ "easy",
906
+ "easier",
907
+ "easiest",
908
+ "hard",
909
+ "harder",
910
+ "hardest",
911
+ "hot",
912
+ "hotter",
913
+ "hottest",
914
+ "cold",
915
+ "colder",
916
+ "coldest",
917
+ "warm",
918
+ "warmer",
919
+ "warmest",
920
+ "cool",
921
+ "cooler",
922
+ "coolest",
923
+ "fast",
924
+ "faster",
925
+ "fastest",
926
+ "slow",
927
+ "slower",
928
+ "slowest",
929
+ "strong",
930
+ "stronger",
931
+ "strongest",
932
+ "weak",
933
+ "weaker",
934
+ "weakest",
935
+ "happy",
936
+ "happier",
937
+ "happiest",
938
+ "sad",
939
+ "sadder",
940
+ "saddest",
941
+ "nice",
942
+ "nicer",
943
+ "nicest",
944
+ "kind",
945
+ "kinder",
946
+ "kindest",
947
+ "sure",
948
+ "free",
949
+ "full",
950
+ "whole",
951
+ "ready",
952
+ "simple",
953
+ "clear",
954
+ "real",
955
+ "true",
956
+ "certain",
957
+ "public",
958
+ "able",
959
+ "several",
960
+ "open",
961
+ "closed",
962
+ "deep",
963
+ "wide",
964
+ "bright",
965
+ "dark",
966
+ "heavy",
967
+ "light",
968
+ "clean",
969
+ "dirty",
970
+ "wet",
971
+ "dry",
972
+ "soft",
973
+ "hard",
974
+ "quiet",
975
+ "loud",
976
+ "quick",
977
+ "slow",
978
+ "rich",
979
+ "poor",
980
+ "sick",
981
+ "well",
982
+ "dead",
983
+ "alive",
984
+ "empty",
985
+ "busy",
986
+ "pretty",
987
+ "beautiful",
988
+ "ugly",
989
+ # Common adverbs
990
+ "very",
991
+ "too",
992
+ "so",
993
+ "more",
994
+ "most",
995
+ "less",
996
+ "least",
997
+ "well",
998
+ "better",
999
+ "best",
1000
+ "just",
1001
+ "only",
1002
+ "even",
1003
+ "still",
1004
+ "also",
1005
+ "just",
1006
+ "now",
1007
+ "then",
1008
+ "here",
1009
+ "there",
1010
+ "where",
1011
+ "how",
1012
+ "when",
1013
+ "why",
1014
+ "not",
1015
+ "never",
1016
+ "always",
1017
+ "often",
1018
+ "sometimes",
1019
+ "usually",
1020
+ "ever",
1021
+ "again",
1022
+ "back",
1023
+ "away",
1024
+ "together",
1025
+ "once",
1026
+ "twice",
1027
+ "soon",
1028
+ "today",
1029
+ "yesterday",
1030
+ "tomorrow",
1031
+ "already",
1032
+ "almost",
1033
+ "enough",
1034
+ "quite",
1035
+ "rather",
1036
+ "really",
1037
+ "perhaps",
1038
+ "maybe",
1039
+ "probably",
1040
+ "certainly",
1041
+ "surely",
1042
+ "yes",
1043
+ "no",
1044
+ "please",
1045
+ "thank",
1046
+ "sorry",
1047
+ # Numbers
1048
+ "zero",
1049
+ "one",
1050
+ "two",
1051
+ "three",
1052
+ "four",
1053
+ "five",
1054
+ "six",
1055
+ "seven",
1056
+ "eight",
1057
+ "nine",
1058
+ "ten",
1059
+ "eleven",
1060
+ "twelve",
1061
+ "thirteen",
1062
+ "fourteen",
1063
+ "fifteen",
1064
+ "sixteen",
1065
+ "seventeen",
1066
+ "eighteen",
1067
+ "nineteen",
1068
+ "twenty",
1069
+ "thirty",
1070
+ "forty",
1071
+ "fifty",
1072
+ "sixty",
1073
+ "seventy",
1074
+ "eighty",
1075
+ "ninety",
1076
+ "hundred",
1077
+ "thousand",
1078
+ "million",
1079
+ "first",
1080
+ "second",
1081
+ "third",
1082
+ "fourth",
1083
+ "fifth",
1084
+ "sixth",
1085
+ "seventh",
1086
+ "eighth",
1087
+ "ninth",
1088
+ "tenth",
1089
+ # Additional common words
1090
+ "able",
1091
+ "accept",
1092
+ "across",
1093
+ "act",
1094
+ "add",
1095
+ "afraid",
1096
+ "against",
1097
+ "agree",
1098
+ "allow",
1099
+ "alone",
1100
+ "appear",
1101
+ "apple",
1102
+ "area",
1103
+ "arm",
1104
+ "arrive",
1105
+ "art",
1106
+ "aunt",
1107
+ "ball",
1108
+ "become",
1109
+ "believe",
1110
+ "belong",
1111
+ "boat",
1112
+ "build",
1113
+ "burn",
1114
+ "business",
1115
+ "chair",
1116
+ "chance",
1117
+ "church",
1118
+ "clear",
1119
+ "climb",
1120
+ "clothe",
1121
+ "clothes",
1122
+ "company",
1123
+ "contain",
1124
+ "continue",
1125
+ "control",
1126
+ "cook",
1127
+ "corner",
1128
+ "cost",
1129
+ "count",
1130
+ "course",
1131
+ "cover",
1132
+ "create",
1133
+ "cross",
1134
+ "crowd",
1135
+ "cry",
1136
+ "decide",
1137
+ "depend",
1138
+ "describe",
1139
+ "develop",
1140
+ "die",
1141
+ "direction",
1142
+ "discover",
1143
+ "doctor",
1144
+ "double",
1145
+ "drop",
1146
+ "during",
1147
+ "edge",
1148
+ "effect",
1149
+ "eight",
1150
+ "either",
1151
+ "else",
1152
+ "enjoy",
1153
+ "enough",
1154
+ "enter",
1155
+ "example",
1156
+ "except",
1157
+ "excite",
1158
+ "expect",
1159
+ "explain",
1160
+ "express",
1161
+ "fact",
1162
+ "fair",
1163
+ "farm",
1164
+ "fear",
1165
+ "field",
1166
+ "fill",
1167
+ "final",
1168
+ "fine",
1169
+ "finger",
1170
+ "finish",
1171
+ "flower",
1172
+ "force",
1173
+ "foreign",
1174
+ "forest",
1175
+ "form",
1176
+ "fresh",
1177
+ "front",
1178
+ "garden",
1179
+ "general",
1180
+ "glass",
1181
+ "god",
1182
+ "gold",
1183
+ "hang",
1184
+ "hat",
1185
+ "hope",
1186
+ "hot",
1187
+ "idea",
1188
+ "include",
1189
+ "increase",
1190
+ "instead",
1191
+ "interest",
1192
+ "island",
1193
+ "join",
1194
+ "laugh",
1195
+ "law",
1196
+ "lead",
1197
+ "lie",
1198
+ "lift",
1199
+ "list",
1200
+ "lock",
1201
+ "love",
1202
+ "machine",
1203
+ "mark",
1204
+ "matter",
1205
+ "mean",
1206
+ "measure",
1207
+ "member",
1208
+ "mention",
1209
+ "middle",
1210
+ "mile",
1211
+ "mind",
1212
+ "miss",
1213
+ "moment",
1214
+ "nation",
1215
+ "natural",
1216
+ "nature",
1217
+ "necessary",
1218
+ "neighbor",
1219
+ "notice",
1220
+ "object",
1221
+ "ocean",
1222
+ "offer",
1223
+ "office",
1224
+ "opinion",
1225
+ "paint",
1226
+ "pair",
1227
+ "party",
1228
+ "pattern",
1229
+ "period",
1230
+ "pick",
1231
+ "plan",
1232
+ "plant",
1233
+ "position",
1234
+ "possible",
1235
+ "pound",
1236
+ "prepare",
1237
+ "present",
1238
+ "president",
1239
+ "press",
1240
+ "prince",
1241
+ "print",
1242
+ "probable",
1243
+ "produce",
1244
+ "promise",
1245
+ "proper",
1246
+ "protect",
1247
+ "prove",
1248
+ "purpose",
1249
+ "quarter",
1250
+ "queen",
1251
+ "question",
1252
+ "quick",
1253
+ "quiet",
1254
+ "race",
1255
+ "raise",
1256
+ "range",
1257
+ "rate",
1258
+ "reason",
1259
+ "receive",
1260
+ "record",
1261
+ "region",
1262
+ "remain",
1263
+ "reply",
1264
+ "report",
1265
+ "represent",
1266
+ "require",
1267
+ "rest",
1268
+ "result",
1269
+ "return",
1270
+ "roll",
1271
+ "rule",
1272
+ "sail",
1273
+ "salt",
1274
+ "save",
1275
+ "science",
1276
+ "season",
1277
+ "seat",
1278
+ "seem",
1279
+ "sell",
1280
+ "sense",
1281
+ "sentence",
1282
+ "separate",
1283
+ "serve",
1284
+ "set",
1285
+ "settle",
1286
+ "seven",
1287
+ "shape",
1288
+ "share",
1289
+ "ship",
1290
+ "shore",
1291
+ "sign",
1292
+ "silver",
1293
+ "single",
1294
+ "sir",
1295
+ "six",
1296
+ "size",
1297
+ "skin",
1298
+ "soldier",
1299
+ "solve",
1300
+ "south",
1301
+ "space",
1302
+ "special",
1303
+ "speed",
1304
+ "spell",
1305
+ "spend",
1306
+ "spread",
1307
+ "spring",
1308
+ "square",
1309
+ "step",
1310
+ "stone",
1311
+ "straight",
1312
+ "strange",
1313
+ "stream",
1314
+ "strength",
1315
+ "strike",
1316
+ "subject",
1317
+ "success",
1318
+ "sudden",
1319
+ "suffer",
1320
+ "suggest",
1321
+ "suit",
1322
+ "summer",
1323
+ "supply",
1324
+ "support",
1325
+ "suppose",
1326
+ "surface",
1327
+ "surprise",
1328
+ "sweet",
1329
+ "swim",
1330
+ "system",
1331
+ "tail",
1332
+ "taste",
1333
+ "teach",
1334
+ "team",
1335
+ "telephone",
1336
+ "television",
1337
+ "temperature",
1338
+ "ten",
1339
+ "test",
1340
+ "thick",
1341
+ "thin",
1342
+ "though",
1343
+ "thousand",
1344
+ "three",
1345
+ "tire",
1346
+ "total",
1347
+ "touch",
1348
+ "track",
1349
+ "train",
1350
+ "travel",
1351
+ "trip",
1352
+ "trouble",
1353
+ "type",
1354
+ "uncle",
1355
+ "understand",
1356
+ "unit",
1357
+ "universe",
1358
+ "value",
1359
+ "various",
1360
+ "view",
1361
+ "village",
1362
+ "visit",
1363
+ "voice",
1364
+ "vote",
1365
+ "wagon",
1366
+ "wander",
1367
+ "warm",
1368
+ "wash",
1369
+ "wave",
1370
+ "wealth",
1371
+ "weather",
1372
+ "weight",
1373
+ "welcome",
1374
+ "west",
1375
+ "wheel",
1376
+ "wild",
1377
+ "wind",
1378
+ "winter",
1379
+ "wish",
1380
+ "wonder",
1381
+ "wood",
1382
+ "yard",
1383
+ "yellow",
1384
+ }
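
The set above is consulted as a plain membership test when classifying tokens, as in the Dale-Chall routine that follows. A minimal illustration (the two words are chosen here for illustration only, not taken from the package's tests):

    # Illustrative membership checks against the familiar-word subset above.
    "house" in DALE_CHALL_FAMILIAR_WORDS            # True  -> counted as familiar
    "photosynthesis" in DALE_CHALL_FAMILIAR_WORDS   # False -> counted as difficult
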
1385
+
1386
+
1387
+ def _compute_dale_chall_single(text: str) -> tuple[float, int, float, float, dict]:
1388
+ """Compute Dale-Chall for a single chunk."""
1389
+ sentences = split_sentences(text)
1390
+ tokens = tokenize(text)
1391
+ word_tokens = normalize_for_readability(tokens)
1392
+
1393
+ if len(sentences) == 0 or len(word_tokens) == 0:
1394
+ return (float("nan"), 0, float("nan"), float("nan"), {"sentence_count": 0, "word_count": 0})
1395
+
1396
+ difficult_words = [w for w in word_tokens if w.lower() not in DALE_CHALL_FAMILIAR_WORDS]
1397
+ difficult_word_count = len(difficult_words)
1398
+ difficult_word_ratio = difficult_word_count / len(word_tokens)
1399
+ difficult_word_pct = difficult_word_ratio * 100
1400
+ avg_sentence_length = len(word_tokens) / len(sentences)
1401
+ raw_score = 0.1579 * difficult_word_pct + 0.0496 * avg_sentence_length
1402
+ adjusted = difficult_word_pct > 5.0
1403
+ dale_chall_score = raw_score + 3.6365 if adjusted else raw_score
1404
+
1405
+ return (
1406
+ dale_chall_score,
1407
+ difficult_word_count,
1408
+ difficult_word_ratio,
1409
+ avg_sentence_length,
1410
+ {
1411
+ "sentence_count": len(sentences),
1412
+ "word_count": len(word_tokens),
1413
+ "adjusted": adjusted,
1414
+ "raw_score": raw_score,
1415
+ "difficult_word_pct": difficult_word_pct,
1416
+ },
1417
+ )
1418
+
1419
+
1420
+ def _get_dale_chall_grade_level(score: float) -> str:
1421
+ """Map Dale-Chall score to grade level."""
1422
+ if math.isnan(score):
1423
+ return "Unknown"
1424
+ if score < 5.0:
1425
+ return "4 and below"
1426
+ elif score < 6.0:
1427
+ return "5-6"
1428
+ elif score < 7.0:
1429
+ return "7-8"
1430
+ elif score < 8.0:
1431
+ return "9-10"
1432
+ elif score < 9.0:
1433
+ return "11-12"
1434
+ elif score < 10.0:
1435
+ return "College"
1436
+ else:
1437
+ return "College Graduate"
1438
+
1439
+
1440
+ def compute_dale_chall(text: str, chunk_size: int = 1000) -> DaleChallResult:
1441
+ """
1442
+ Compute Dale-Chall Readability Formula.
1443
+
1444
+ This function uses native chunked analysis to capture variance and patterns
1445
+ across the text, which is essential for stylometric fingerprinting.
1446
+
1447
+ Related GitHub Issues:
1448
+ #16 - Additional Readability Formulas
1449
+ #27 - Native chunked analysis with Distribution dataclass
1450
+
1451
+ Formula:
1452
+ Raw Score = 0.1579 * (difficult_words_pct) + 0.0496 * (avg_sentence_length)
1453
+
1454
+ If difficult_words_pct > 5%:
1455
+ Adjusted Score = Raw Score + 3.6365
1456
+
1457
+ Args:
1458
+ text: Input text to analyze
1459
+ chunk_size: Number of words per chunk (default: 1000)
1460
+
1461
+ Returns:
1462
+ DaleChallResult with dale_chall_score, grade_level, distributions, and metadata
1463
+
1464
+ Example:
1465
+ >>> result = compute_dale_chall("Long text here...", chunk_size=1000)
1466
+ >>> result.dale_chall_score # Mean across chunks
1467
+ 7.3
1468
+ >>> result.dale_chall_score_dist.std # Variance reveals fingerprint
1469
+ 0.5
1470
+ """
1471
+ chunks = chunk_text(text, chunk_size)
1472
+ score_values = []
1473
+ ratio_values = []
1474
+ sent_len_values = []
1475
+ total_difficult = 0
1476
+ total_words = 0
1477
+ total_sentences = 0
1478
+
1479
+ for chunk in chunks:
1480
+ sc, diff_cnt, diff_rat, sent_len, meta = _compute_dale_chall_single(chunk)
1481
+ if not math.isnan(sc):
1482
+ score_values.append(sc)
1483
+ ratio_values.append(diff_rat)
1484
+ sent_len_values.append(sent_len)
1485
+ total_difficult += diff_cnt
1486
+ total_words += meta.get("word_count", 0)
1487
+ total_sentences += meta.get("sentence_count", 0)
1488
+
1489
+ if not score_values:
1490
+ empty_dist = Distribution(
1491
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
1492
+ )
1493
+ return DaleChallResult(
1494
+ dale_chall_score=float("nan"),
1495
+ grade_level="Unknown",
1496
+ difficult_word_count=0,
1497
+ difficult_word_ratio=float("nan"),
1498
+ avg_sentence_length=float("nan"),
1499
+ total_words=0,
1500
+ dale_chall_score_dist=empty_dist,
1501
+ difficult_word_ratio_dist=empty_dist,
1502
+ avg_sentence_length_dist=empty_dist,
1503
+ chunk_size=chunk_size,
1504
+ chunk_count=len(chunks),
1505
+ metadata={
1506
+ "sentence_count": 0,
1507
+ "raw_score": float("nan"),
1508
+ "adjusted": False,
1509
+ "difficult_word_pct": float("nan"),
1510
+ "reliable": False,
1511
+ },
1512
+ )
1513
+
1514
+ score_dist = make_distribution(score_values)
1515
+ ratio_dist = make_distribution(ratio_values)
1516
+ sent_len_dist = make_distribution(sent_len_values)
1517
+
1518
+ # Calculate overall raw score and adjusted status for metadata
1519
+ overall_difficult_pct = (total_difficult / total_words * 100) if total_words > 0 else 0.0
1520
+ overall_raw_score = 0.1579 * overall_difficult_pct + 0.0496 * sent_len_dist.mean
1521
+ overall_adjusted = overall_difficult_pct > 5.0
1522
+
1523
+ return DaleChallResult(
1524
+ dale_chall_score=score_dist.mean,
1525
+ grade_level=_get_dale_chall_grade_level(score_dist.mean),
1526
+ difficult_word_count=total_difficult,
1527
+ difficult_word_ratio=ratio_dist.mean,
1528
+ avg_sentence_length=sent_len_dist.mean,
1529
+ total_words=total_words,
1530
+ dale_chall_score_dist=score_dist,
1531
+ difficult_word_ratio_dist=ratio_dist,
1532
+ avg_sentence_length_dist=sent_len_dist,
1533
+ chunk_size=chunk_size,
1534
+ chunk_count=len(chunks),
1535
+ metadata={
1536
+ "sentence_count": total_sentences,
1537
+ "raw_score": overall_raw_score,
1538
+ "adjusted": overall_adjusted,
1539
+ "difficult_word_pct": overall_difficult_pct,
1540
+ "total_sentence_count": total_sentences,
1541
+ "total_word_count": total_words,
1542
+ "total_difficult_word_count": total_difficult,
1543
+ "reliable": total_words >= 100,
1544
+ },
1545
+ )
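
A worked instance of the Dale-Chall arithmetic implemented above, using illustrative numbers rather than output from the package:

    # Suppose a chunk has 20% unfamiliar words and 15 words per sentence.
    difficult_word_pct = 20.0
    avg_sentence_length = 15.0
    raw = 0.1579 * difficult_word_pct + 0.0496 * avg_sentence_length  # 3.902
    # The percentage exceeds 5%, so the 3.6365 adjustment applies.
    score = raw + 3.6365                                              # 7.5385
    # _get_dale_chall_grade_level(7.5385) -> "9-10"
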
1546
+
1547
+
1548
+ def _compute_linsear_single(text: str) -> tuple[float, float, int, int, float, dict]:
1549
+ """Compute Linsear Write for a single chunk."""
1550
+ sentences = split_sentences(text)
1551
+ tokens = tokenize(text)
1552
+ word_tokens = normalize_for_readability(tokens)
1553
+
1554
+ if len(sentences) == 0 or len(word_tokens) == 0:
1555
+ return (
1556
+ float("nan"),
1557
+ float("nan"),
1558
+ 0,
1559
+ 0,
1560
+ float("nan"),
1561
+ {"sentence_count": 0, "word_count": 0},
1562
+ )
1563
+
1564
+ easy_word_count = sum(1 for w in word_tokens if count_syllables(w) <= 2)
1565
+ hard_word_count = len(word_tokens) - easy_word_count
1566
+ weighted_sum = easy_word_count + hard_word_count * 3
1567
+ raw_score = weighted_sum / len(sentences)
1568
+ grade_level_raw = round(raw_score / 2) if raw_score > 20 else round((raw_score - 2) / 2)
1569
+ grade_level = max(0.0, float(grade_level_raw))
1570
+ avg_sentence_length = len(word_tokens) / len(sentences)
1571
+
1572
+ return (
1573
+ raw_score,
1574
+ grade_level,
1575
+ easy_word_count,
1576
+ hard_word_count,
1577
+ avg_sentence_length,
1578
+ {"sentence_count": len(sentences), "word_count": len(word_tokens)},
1579
+ )
1580
+
1581
+
1582
+ def compute_linsear_write(text: str, chunk_size: int = 1000) -> LinsearWriteResult:
1583
+ """
1584
+ Compute Linsear Write Readability Formula.
1585
+
1586
+ This function uses native chunked analysis to capture variance and patterns
1587
+ across the text, which is essential for stylometric fingerprinting.
1588
+
1589
+ Related GitHub Issues:
1590
+ #16 - Additional Readability Formulas
1591
+ #27 - Native chunked analysis with Distribution dataclass
1592
+
1593
+ Args:
1594
+ text: Input text to analyze
1595
+ chunk_size: Number of words per chunk (default: 1000)
1596
+
1597
+ Returns:
1598
+ LinsearWriteResult with score, grade_level, distributions, and metadata
1599
+
1600
+ Example:
1601
+ >>> result = compute_linsear_write("Long text here...", chunk_size=1000)
1602
+ >>> result.linsear_score # Mean across chunks
1603
+ 11.3
1604
+ """
1605
+ chunks = chunk_text(text, chunk_size)
1606
+ score_values = []
1607
+ grade_values = []
1608
+ sent_len_values = []
1609
+ total_easy = 0
1610
+ total_hard = 0
1611
+ total_words = 0
1612
+
1613
+ for chunk in chunks:
1614
+ sc, gr, easy, hard, sent_len, meta = _compute_linsear_single(chunk)
1615
+ if not math.isnan(sc):
1616
+ score_values.append(sc)
1617
+ grade_values.append(gr)
1618
+ sent_len_values.append(sent_len)
1619
+ total_easy += easy
1620
+ total_hard += hard
1621
+ total_words += meta.get("word_count", 0)
1622
+
1623
+ if not score_values:
1624
+ empty_dist = Distribution(
1625
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
1626
+ )
1627
+ return LinsearWriteResult(
1628
+ linsear_score=float("nan"),
1629
+ grade_level=float("nan"),
1630
+ easy_word_count=0,
1631
+ hard_word_count=0,
1632
+ avg_sentence_length=float("nan"),
1633
+ linsear_score_dist=empty_dist,
1634
+ grade_level_dist=empty_dist,
1635
+ avg_sentence_length_dist=empty_dist,
1636
+ chunk_size=chunk_size,
1637
+ chunk_count=len(chunks),
1638
+ metadata={"total_words": 0, "reliable": False},
1639
+ )
1640
+
1641
+ score_dist = make_distribution(score_values)
1642
+ grade_dist = make_distribution(grade_values)
1643
+ sent_len_dist = make_distribution(sent_len_values)
1644
+
1645
+ return LinsearWriteResult(
1646
+ linsear_score=score_dist.mean,
1647
+ grade_level=grade_dist.mean,
1648
+ easy_word_count=total_easy,
1649
+ hard_word_count=total_hard,
1650
+ avg_sentence_length=sent_len_dist.mean,
1651
+ linsear_score_dist=score_dist,
1652
+ grade_level_dist=grade_dist,
1653
+ avg_sentence_length_dist=sent_len_dist,
1654
+ chunk_size=chunk_size,
1655
+ chunk_count=len(chunks),
1656
+ metadata={"total_words": total_words, "reliable": total_words >= 100},
1657
+ )
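
The Linsear Write scoring in _compute_linsear_single reduces to a weighted word count per sentence; a worked instance with illustrative numbers:

    # 100-word chunk: 80 "easy" words (<= 2 syllables), 20 "hard" words, 5 sentences.
    weighted_sum = 80 + 20 * 3          # 140
    raw_score = weighted_sum / 5        # 28.0
    # raw_score > 20, so the grade level is raw_score / 2, rounded.
    grade_level = round(raw_score / 2)  # 14
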
1658
+
1659
+
1660
+ def _get_fry_grade_level(avg_sent_len: float, avg_syl_100: float) -> tuple[str, str]:
1661
+ """Get Fry grade level and zone from coordinates."""
1662
+ if math.isnan(avg_sent_len) or math.isnan(avg_syl_100):
1663
+ return ("Unknown", "invalid")
1664
+
1665
+ if avg_syl_100 < 125:
1666
+ if avg_sent_len < 7:
1667
+ grade, zone = "1", "valid"
1668
+ elif avg_sent_len < 11:
1669
+ grade, zone = "2", "valid"
1670
+ else:
1671
+ grade, zone = "3", "valid"
1672
+ elif avg_syl_100 < 135:
1673
+ if avg_sent_len < 8:
1674
+ grade, zone = "2", "valid"
1675
+ elif avg_sent_len < 12:
1676
+ grade, zone = "3", "valid"
1677
+ else:
1678
+ grade, zone = "4", "valid"
1679
+ elif avg_syl_100 < 145:
1680
+ if avg_sent_len < 9:
1681
+ grade, zone = "3", "valid"
1682
+ elif avg_sent_len < 13:
1683
+ grade, zone = "5", "valid"
1684
+ else:
1685
+ grade, zone = "6", "valid"
1686
+ elif avg_syl_100 < 155:
1687
+ if avg_sent_len < 10:
1688
+ grade, zone = "4", "valid"
1689
+ elif avg_sent_len < 14:
1690
+ grade, zone = "7", "valid"
1691
+ else:
1692
+ grade, zone = "8", "valid"
1693
+ elif avg_syl_100 < 165:
1694
+ if avg_sent_len < 12:
1695
+ grade, zone = "6", "valid"
1696
+ elif avg_sent_len < 16:
1697
+ grade, zone = "9", "valid"
1698
+ else:
1699
+ grade, zone = "10", "valid"
1700
+ elif avg_syl_100 < 175:
1701
+ if avg_sent_len < 14:
1702
+ grade, zone = "8", "valid"
1703
+ elif avg_sent_len < 18:
1704
+ grade, zone = "11", "valid"
1705
+ else:
1706
+ grade, zone = "12", "valid"
1707
+ else:
1708
+ if avg_sent_len < 16:
1709
+ grade, zone = "10", "valid"
1710
+ elif avg_sent_len < 20:
1711
+ grade, zone = "College", "valid"
1712
+ else:
1713
+ grade, zone = "College+", "valid"
1714
+
1715
+ if avg_syl_100 > 185 or avg_sent_len > 25:
1716
+ zone = "above_graph"
1717
+ elif avg_syl_100 < 110:
1718
+ zone = "below_graph"
1719
+
1720
+ return (grade, zone)
1721
+
1722
+
1723
+ def _compute_fry_single(text: str) -> tuple[float, float, dict]:
1724
+ """Compute Fry for a single chunk. Returns (avg_sent_len, avg_syl_100, meta)."""
1725
+ sentences = split_sentences(text)
1726
+ tokens = tokenize(text)
1727
+ word_tokens = normalize_for_readability(tokens)
1728
+
1729
+ if len(sentences) == 0 or len(word_tokens) == 0:
1730
+ return (
1731
+ float("nan"),
1732
+ float("nan"),
1733
+ {"sentence_count": 0, "word_count": 0, "syllable_count": 0, "sample_size": 0},
1734
+ )
1735
+
1736
+ sample_size = min(100, len(word_tokens))
1737
+ sample_tokens = word_tokens[:sample_size]
1738
+ total_syllables = sum(count_syllables(w) for w in sample_tokens)
1739
+
1740
+ word_count_so_far = 0
1741
+ sentences_in_sample = 0
1742
+ for sent in sentences:
1743
+ sent_tokens = normalize_for_readability(tokenize(sent))
1744
+ if word_count_so_far + len(sent_tokens) <= sample_size:
1745
+ sentences_in_sample += 1
1746
+ word_count_so_far += len(sent_tokens)
1747
+ else:
1748
+ if word_count_so_far < sample_size:
1749
+ sentences_in_sample += 1
1750
+ break
1751
+
1752
+ sentences_in_sample = max(1, sentences_in_sample)
1753
+ avg_sentence_length = sample_size / sentences_in_sample
1754
+ avg_syllables_per_100 = (total_syllables / sample_size) * 100
1755
+
1756
+ return (
1757
+ avg_sentence_length,
1758
+ avg_syllables_per_100,
1759
+ {
1760
+ "sentence_count": len(sentences),
1761
+ "word_count": len(word_tokens),
1762
+ "syllable_count": total_syllables,
1763
+ "sample_size": sample_size,
1764
+ },
1765
+ )
1766
+
1767
+
1768
+ def compute_fry(text: str, chunk_size: int = 1000) -> FryResult:
1769
+ """
1770
+ Compute Fry Readability Graph metrics.
1771
+
1772
+ This function uses native chunked analysis to capture variance and patterns
1773
+ across the text, which is essential for stylometric fingerprinting.
1774
+
1775
+ Related GitHub Issues:
1776
+ #16 - Additional Readability Formulas
1777
+ #27 - Native chunked analysis with Distribution dataclass
1778
+
1779
+ Args:
1780
+ text: Input text to analyze
1781
+ chunk_size: Number of words per chunk (default: 1000)
1782
+
1783
+ Returns:
1784
+ FryResult with avg_sentence_length, avg_syllables_per_100, distributions, and metadata
1785
+
1786
+ Example:
1787
+ >>> result = compute_fry("Long text here...", chunk_size=1000)
1788
+ >>> result.avg_sentence_length # Mean across chunks
1789
+ 14.3
1790
+ """
1791
+ chunks = chunk_text(text, chunk_size)
1792
+ sent_len_values = []
1793
+ syl_100_values = []
1794
+ total_words = 0
1795
+ total_sentences = 0
1796
+ total_syllables = 0
1797
+
1798
+ for chunk in chunks:
1799
+ sent_len, syl_100, meta = _compute_fry_single(chunk)
1800
+ if not math.isnan(sent_len):
1801
+ sent_len_values.append(sent_len)
1802
+ syl_100_values.append(syl_100)
1803
+ total_words += meta.get("word_count", 0)
1804
+ total_sentences += meta.get("sentence_count", 0)
1805
+ total_syllables += meta.get("syllable_count", 0)
1806
+
1807
+ if not sent_len_values:
1808
+ empty_dist = Distribution(
1809
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
1810
+ )
1811
+ return FryResult(
1812
+ avg_sentence_length=float("nan"),
1813
+ avg_syllables_per_100=float("nan"),
1814
+ grade_level="Unknown",
1815
+ graph_zone="invalid",
1816
+ avg_sentence_length_dist=empty_dist,
1817
+ avg_syllables_per_100_dist=empty_dist,
1818
+ chunk_size=chunk_size,
1819
+ chunk_count=len(chunks),
1820
+ metadata={"total_sentences": 0, "total_words": 0, "sample_size": 0, "reliable": False},
1821
+ )
1822
+
1823
+ sent_len_dist = make_distribution(sent_len_values)
1824
+ syl_100_dist = make_distribution(syl_100_values)
1825
+ grade_level, graph_zone = _get_fry_grade_level(sent_len_dist.mean, syl_100_dist.mean)
1826
+
1827
+ # Calculate sample size (min of 100 or total_words for overall)
1828
+ sample_size = min(100, total_words)
1829
+
1830
+ return FryResult(
1831
+ avg_sentence_length=sent_len_dist.mean,
1832
+ avg_syllables_per_100=syl_100_dist.mean,
1833
+ grade_level=grade_level,
1834
+ graph_zone=graph_zone,
1835
+ avg_sentence_length_dist=sent_len_dist,
1836
+ avg_syllables_per_100_dist=syl_100_dist,
1837
+ chunk_size=chunk_size,
1838
+ chunk_count=len(chunks),
1839
+ metadata={
1840
+ "total_sentences": total_sentences,
1841
+ "total_words": total_words,
1842
+ "total_syllables": total_syllables,
1843
+ "sample_size": sample_size,
1844
+ "reliable": total_words >= 100,
1845
+ },
1846
+ )
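
A worked reading of the Fry lookup above, with illustrative coordinates:

    # Chunk averages: 14.3 words per sentence, 155 syllables per 100 words.
    # 155 falls in the 155-164 band, a sentence length under 16 maps to grade "9",
    # and the point lies inside the graph (syllables <= 185, sentence length <= 25).
    _get_fry_grade_level(14.3, 155.0)   # -> ("9", "valid")
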
1847
+
1848
+
1849
+ def _compute_forcast_single(text: str) -> tuple[float, float, int, float, dict]:
1850
+ """Compute FORCAST for a single chunk."""
1851
+ tokens = tokenize(text)
1852
+ word_tokens = normalize_for_readability(tokens)
1853
+
1854
+ if len(word_tokens) == 0:
1855
+ return (
1856
+ float("nan"),
1857
+ float("nan"),
1858
+ 0,
1859
+ float("nan"),
1860
+ {"word_count": 0, "sample_size": 0, "scaled_n": 0.0},
1861
+ )
1862
+
1863
+ sample_size = min(150, len(word_tokens))
1864
+ sample_tokens = word_tokens[:sample_size]
1865
+ single_syllable_count = sum(1 for w in sample_tokens if count_syllables(w) == 1)
1866
+ scaled_n = (
1867
+ single_syllable_count * (150 / sample_size) if sample_size < 150 else single_syllable_count
1868
+ )
1869
+ forcast_score = 20 - (scaled_n / 10)
1870
+ grade_level = float(max(0, min(20, round(forcast_score))))
1871
+ single_syllable_ratio = single_syllable_count / sample_size
1872
+
1873
+ return (
1874
+ forcast_score,
1875
+ grade_level,
1876
+ single_syllable_count,
1877
+ single_syllable_ratio,
1878
+ {"word_count": len(word_tokens), "sample_size": sample_size, "scaled_n": scaled_n},
1879
+ )
1880
+
1881
+
1882
+ def compute_forcast(text: str, chunk_size: int = 1000) -> FORCASTResult:
1883
+ """
1884
+ Compute FORCAST Readability Formula.
1885
+
1886
+ This function uses native chunked analysis to capture variance and patterns
1887
+ across the text, which is essential for stylometric fingerprinting.
1888
+
1889
+ Related GitHub Issues:
1890
+ #16 - Additional Readability Formulas
1891
+ #27 - Native chunked analysis with Distribution dataclass
1892
+
1893
+ Formula:
1894
+ Grade Level = 20 - (N / 10)
1895
+ Where N is the number of single-syllable words in a 150-word sample.
1896
+
1897
+ Args:
1898
+ text: Input text to analyze
1899
+ chunk_size: Number of words per chunk (default: 1000)
1900
+
1901
+ Returns:
1902
+ FORCASTResult with score, grade_level, distributions, and metadata
1903
+
1904
+ Example:
1905
+ >>> result = compute_forcast("Long text here...", chunk_size=1000)
1906
+ >>> result.forcast_score # Mean across chunks
1907
+ 9.7
1908
+ """
1909
+ chunks = chunk_text(text, chunk_size)
1910
+ score_values = []
1911
+ grade_values = []
1912
+ ratio_values = []
1913
+ total_single = 0
1914
+ total_words = 0
1915
+
1916
+ for chunk in chunks:
1917
+ sc, gr, single_cnt, single_rat, meta = _compute_forcast_single(chunk)
1918
+ if not math.isnan(sc):
1919
+ score_values.append(sc)
1920
+ grade_values.append(gr)
1921
+ ratio_values.append(single_rat)
1922
+ total_single += single_cnt
1923
+ total_words += meta.get("word_count", 0)
1924
+
1925
+ if not score_values:
1926
+ empty_dist = Distribution(
1927
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
1928
+ )
1929
+ return FORCASTResult(
1930
+ forcast_score=float("nan"),
1931
+ grade_level=float("nan"),
1932
+ single_syllable_ratio=float("nan"),
1933
+ single_syllable_count=0,
1934
+ total_words=0,
1935
+ forcast_score_dist=empty_dist,
1936
+ grade_level_dist=empty_dist,
1937
+ single_syllable_ratio_dist=empty_dist,
1938
+ chunk_size=chunk_size,
1939
+ chunk_count=len(chunks),
1940
+ metadata={"sample_size": 0, "scaled_n": 0.0, "reliable": False},
1941
+ )
1942
+
1943
+ score_dist = make_distribution(score_values)
1944
+ grade_dist = make_distribution(grade_values)
1945
+ ratio_dist = make_distribution(ratio_values)
1946
+
1947
+ # Calculate overall sample_size and scaled_n for metadata
1948
+ overall_sample_size = min(150, total_words)
1949
+ overall_scaled_n = (
1950
+ total_single * (150 / overall_sample_size)
1951
+ if overall_sample_size < 150
1952
+ else float(total_single)
1953
+ )
1954
+
1955
+ return FORCASTResult(
1956
+ forcast_score=score_dist.mean,
1957
+ grade_level=grade_dist.mean,
1958
+ single_syllable_ratio=ratio_dist.mean,
1959
+ single_syllable_count=total_single,
1960
+ total_words=total_words,
1961
+ forcast_score_dist=score_dist,
1962
+ grade_level_dist=grade_dist,
1963
+ single_syllable_ratio_dist=ratio_dist,
1964
+ chunk_size=chunk_size,
1965
+ chunk_count=len(chunks),
1966
+ metadata={
1967
+ "sample_size": overall_sample_size,
1968
+ "scaled_n": overall_scaled_n,
1969
+ "reliable": total_words >= 100,
1970
+ },
1971
+ )
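
A worked instance of the FORCAST formula above, with illustrative counts:

    # 150-word sample containing 105 single-syllable words.
    N = 105
    forcast_score = 20 - (N / 10)   # 9.5; the code rounds this to grade level 10.0
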
1972
+
1973
+
1974
+ def _compute_psk_single(text: str) -> tuple[float, float, float, float, int, dict]:
1975
+ """Compute PSK for a single chunk."""
1976
+ sentences = split_sentences(text)
1977
+ tokens = tokenize(text)
1978
+ word_tokens = normalize_for_readability(tokens)
1979
+
1980
+ if len(sentences) == 0 or len(word_tokens) == 0:
1981
+ return (
1982
+ float("nan"),
1983
+ float("nan"),
1984
+ float("nan"),
1985
+ float("nan"),
1986
+ 0,
1987
+ {"sentence_count": 0, "word_count": 0},
1988
+ )
1989
+
1990
+ total_syllables = sum(count_syllables(w) for w in word_tokens)
1991
+ avg_sentence_length = len(word_tokens) / len(sentences)
1992
+ avg_syllables_per_word = total_syllables / len(word_tokens)
1993
+ psk_score = 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
1994
+ grade_level = round(psk_score, 1)
1995
+
1996
+ return (
1997
+ psk_score,
1998
+ grade_level,
1999
+ avg_sentence_length,
2000
+ avg_syllables_per_word,
2001
+ total_syllables,
2002
+ {"sentence_count": len(sentences), "word_count": len(word_tokens)},
2003
+ )
2004
+
2005
+
2006
+ def compute_powers_sumner_kearl(text: str, chunk_size: int = 1000) -> PowersSumnerKearlResult:
2007
+ """
2008
+ Compute Powers-Sumner-Kearl Readability Formula.
2009
+
2010
+ This function uses native chunked analysis to capture variance and patterns
2011
+ across the text, which is essential for stylometric fingerprinting.
2012
+
2013
+ Related GitHub Issues:
2014
+ #16 - Additional Readability Formulas
2015
+ #27 - Native chunked analysis with Distribution dataclass
2016
+
2017
+ Formula:
2018
+ Grade Level = 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
2019
+
2020
+ Args:
2021
+ text: Input text to analyze
2022
+ chunk_size: Number of words per chunk (default: 1000)
2023
+
2024
+ Returns:
2025
+ PowersSumnerKearlResult with score, grade_level, distributions, and metadata
2026
+
2027
+ Example:
2028
+ >>> result = compute_powers_sumner_kearl("Long text here...", chunk_size=1000)
2029
+ >>> result.psk_score # Mean across chunks
2030
+ 2.3
2031
+ """
2032
+ chunks = chunk_text(text, chunk_size)
2033
+ score_values = []
2034
+ grade_values = []
2035
+ sent_len_values = []
2036
+ syl_per_word_values = []
2037
+ total_sentences = 0
2038
+ total_words = 0
2039
+ total_syllables = 0
2040
+
2041
+ for chunk in chunks:
2042
+ sc, gr, sent_len, syl_word, syls, meta = _compute_psk_single(chunk)
2043
+ if not math.isnan(sc):
2044
+ score_values.append(sc)
2045
+ grade_values.append(gr)
2046
+ sent_len_values.append(sent_len)
2047
+ syl_per_word_values.append(syl_word)
2048
+ total_sentences += meta.get("sentence_count", 0)
2049
+ total_words += meta.get("word_count", 0)
2050
+ total_syllables += syls
2051
+
2052
+ if not score_values:
2053
+ empty_dist = Distribution(
2054
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
2055
+ )
2056
+ return PowersSumnerKearlResult(
2057
+ psk_score=float("nan"),
2058
+ grade_level=float("nan"),
2059
+ avg_sentence_length=float("nan"),
2060
+ avg_syllables_per_word=float("nan"),
2061
+ total_sentences=0,
2062
+ total_words=0,
2063
+ total_syllables=0,
2064
+ psk_score_dist=empty_dist,
2065
+ grade_level_dist=empty_dist,
2066
+ avg_sentence_length_dist=empty_dist,
2067
+ avg_syllables_per_word_dist=empty_dist,
2068
+ chunk_size=chunk_size,
2069
+ chunk_count=len(chunks),
2070
+ metadata={
2071
+ "flesch_reading_ease": float("nan"),
2072
+ "flesch_kincaid_grade": float("nan"),
2073
+ "difference_from_flesch": float("nan"),
2074
+ "reliable": False,
2075
+ },
2076
+ )
2077
+
2078
+ score_dist = make_distribution(score_values)
2079
+ grade_dist = make_distribution(grade_values)
2080
+ sent_len_dist = make_distribution(sent_len_values)
2081
+ syl_word_dist = make_distribution(syl_per_word_values)
2082
+
2083
+ # Compute Flesch metrics for comparison (using the same avg values)
2084
+ # Flesch Reading Ease: 206.835 - 1.015 * ASL - 84.6 * ASW
2085
+ # Flesch-Kincaid Grade: 0.39 * ASL + 11.8 * ASW - 15.59
2086
+ flesch_reading_ease = 206.835 - 1.015 * sent_len_dist.mean - 84.6 * syl_word_dist.mean
2087
+ flesch_kincaid_grade = 0.39 * sent_len_dist.mean + 11.8 * syl_word_dist.mean - 15.59
2088
+ difference_from_flesch = grade_dist.mean - flesch_kincaid_grade
2089
+
2090
+ return PowersSumnerKearlResult(
2091
+ psk_score=score_dist.mean,
2092
+ grade_level=grade_dist.mean,
2093
+ avg_sentence_length=sent_len_dist.mean,
2094
+ avg_syllables_per_word=syl_word_dist.mean,
2095
+ total_sentences=total_sentences,
2096
+ total_words=total_words,
2097
+ total_syllables=total_syllables,
2098
+ psk_score_dist=score_dist,
2099
+ grade_level_dist=grade_dist,
2100
+ avg_sentence_length_dist=sent_len_dist,
2101
+ avg_syllables_per_word_dist=syl_word_dist,
2102
+ chunk_size=chunk_size,
2103
+ chunk_count=len(chunks),
2104
+ metadata={
2105
+ "flesch_reading_ease": flesch_reading_ease,
2106
+ "flesch_kincaid_grade": flesch_kincaid_grade,
2107
+ "difference_from_flesch": difference_from_flesch,
2108
+ "reliable": total_words >= 100,
2109
+ },
2110
+ )
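
Taken together, the new module adds five chunked readability metrics. A minimal usage sketch, assuming the functions are imported from the module path shown in the file listing above (whether pystylometry.readability re-exports them at the package level is not shown in this diff):

    from pystylometry.readability.additional_formulas import (
        compute_dale_chall,
        compute_forcast,
        compute_fry,
        compute_linsear_write,
        compute_powers_sumner_kearl,
    )

    text = open("sample.txt").read()

    dc = compute_dale_chall(text, chunk_size=1000)
    print(dc.dale_chall_score, dc.grade_level, dc.dale_chall_score_dist.std)

    fc = compute_forcast(text)
    print(fc.forcast_score, fc.grade_level)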