pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
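
For reference, the snippet below is a minimal usage sketch of the chunked readability API introduced in this release, based on the compute_dale_chall(text, chunk_size=1000) signature and the DaleChallResult fields shown in the pystylometry/readability/additional_formulas.py diff that follows. The import path and the sample file name are assumptions, not taken from the package metadata.

    # Hypothetical usage sketch; import path assumed from the file layout listed above.
    from pystylometry.readability.additional_formulas import compute_dale_chall

    with open("sample.txt", encoding="utf-8") as fh:
        text = fh.read()

    # chunk_size is the number of words per chunk (default 1000 in the new signature).
    result = compute_dale_chall(text, chunk_size=1000)

    print(result.dale_chall_score)            # mean Dale-Chall score across chunks
    print(result.grade_level)                 # e.g. "7-8"
    print(result.dale_chall_score_dist.std)   # per-chunk spread from the Distribution dataclass
    print(result.chunk_count)                 # number of chunks analyzed
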
@@ -5,9 +5,9 @@ This module provides additional readability metrics beyond the core formulas
  approaches to measuring text difficulty and are valuable for cross-validation
  and comprehensive readability assessment.

- Related GitHub Issue:
+ Related GitHub Issues:
  #16 - Additional Readability Formulas
- https://github.com/craigtrim/pystylometry/issues/16
+ #27 - Native chunked analysis with Distribution dataclass

  Formulas implemented:
  - Dale-Chall: Based on list of 3000 familiar words
@@ -28,230 +28,1425 @@ References:
  adult readability formulas. Journal of Educational Psychology.
  """

+ import math
+
  from .._normalize import normalize_for_readability
  from .._types import (
  DaleChallResult,
+ Distribution,
  FORCASTResult,
  FryResult,
  LinsearWriteResult,
  PowersSumnerKearlResult,
+ chunk_text,
+ make_distribution,
  )
  from .._utils import split_sentences, tokenize
  from .syllables import count_syllables

-
  # Dale-Chall List of Familiar Words (subset of ~1200 words)
  # GitHub Issue #16: https://github.com/craigtrim/pystylometry/issues/16
  # Full Dale-Chall list has 3000 words that 80% of 4th graders understand.
  # This is a representative subset covering most common everyday words.
  DALE_CHALL_FAMILIAR_WORDS = {
  # Articles, pronouns, determiners
- "a", "an", "the", "this", "that", "these", "those", "some", "any", "all",
50
- "each", "every", "both", "few", "many", "much", "more", "most", "other",
51
- "another", "such", "what", "which", "who", "whom", "whose", "whoever",
52
- "i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves",
53
- "you", "your", "yours", "yourself", "yourselves",
54
- "he", "him", "his", "himself", "she", "her", "hers", "herself",
55
- "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
56
- "one", "ones", "someone", "somebody", "something", "anyone", "anybody", "anything",
57
- "everyone", "everybody", "everything", "no", "none", "nobody", "nothing",
58
-
53
+ "a",
54
+ "an",
55
+ "the",
56
+ "this",
57
+ "that",
58
+ "these",
59
+ "those",
60
+ "some",
61
+ "any",
62
+ "all",
63
+ "each",
64
+ "every",
65
+ "both",
66
+ "few",
67
+ "many",
68
+ "much",
69
+ "more",
70
+ "most",
71
+ "other",
72
+ "another",
73
+ "such",
74
+ "what",
75
+ "which",
76
+ "who",
77
+ "whom",
78
+ "whose",
79
+ "whoever",
80
+ "i",
81
+ "me",
82
+ "my",
83
+ "mine",
84
+ "myself",
85
+ "we",
86
+ "us",
87
+ "our",
88
+ "ours",
89
+ "ourselves",
90
+ "you",
91
+ "your",
92
+ "yours",
93
+ "yourself",
94
+ "yourselves",
95
+ "he",
96
+ "him",
97
+ "his",
98
+ "himself",
99
+ "she",
100
+ "her",
101
+ "hers",
102
+ "herself",
103
+ "it",
104
+ "its",
105
+ "itself",
106
+ "they",
107
+ "them",
108
+ "their",
109
+ "theirs",
110
+ "themselves",
111
+ "one",
112
+ "ones",
113
+ "someone",
114
+ "somebody",
115
+ "something",
116
+ "anyone",
117
+ "anybody",
118
+ "anything",
119
+ "everyone",
120
+ "everybody",
121
+ "everything",
122
+ "no",
123
+ "none",
124
+ "nobody",
125
+ "nothing",
59
126
  # Conjunctions and prepositions
60
- "and", "or", "but", "if", "when", "where", "why", "how", "because", "so",
61
- "for", "nor", "yet", "after", "before", "while", "since", "until", "unless",
62
- "though", "although", "whether", "than", "as", "like",
63
- "of", "to", "in", "on", "at", "by", "with", "from", "about", "into",
64
- "through", "over", "under", "above", "below", "between", "among", "against",
65
- "during", "without", "within", "along", "across", "behind", "beside", "near",
66
- "off", "out", "up", "down", "around", "past", "toward", "upon",
67
-
127
+ "and",
128
+ "or",
129
+ "but",
130
+ "if",
131
+ "when",
132
+ "where",
133
+ "why",
134
+ "how",
135
+ "because",
136
+ "so",
137
+ "for",
138
+ "nor",
139
+ "yet",
140
+ "after",
141
+ "before",
142
+ "while",
143
+ "since",
144
+ "until",
145
+ "unless",
146
+ "though",
147
+ "although",
148
+ "whether",
149
+ "than",
150
+ "as",
151
+ "like",
152
+ "of",
153
+ "to",
154
+ "in",
155
+ "on",
156
+ "at",
157
+ "by",
158
+ "with",
159
+ "from",
160
+ "about",
161
+ "into",
162
+ "through",
163
+ "over",
164
+ "under",
165
+ "above",
166
+ "below",
167
+ "between",
168
+ "among",
169
+ "against",
170
+ "during",
171
+ "without",
172
+ "within",
173
+ "along",
174
+ "across",
175
+ "behind",
176
+ "beside",
177
+ "near",
178
+ "off",
179
+ "out",
180
+ "up",
181
+ "down",
182
+ "around",
183
+ "past",
184
+ "toward",
185
+ "upon",
68
186
  # Common verbs (base, past, -ing, -ed forms included)
69
- "be", "am", "is", "are", "was", "were", "been", "being",
70
- "have", "has", "had", "having", "do", "does", "did", "doing", "done",
71
- "will", "would", "shall", "should", "may", "might", "must", "can", "could",
72
- "go", "goes", "went", "gone", "going", "come", "comes", "came", "coming",
73
- "make", "makes", "made", "making", "get", "gets", "got", "getting", "gotten",
74
- "know", "knows", "knew", "known", "knowing",
75
- "think", "thinks", "thought", "thinking",
76
- "see", "sees", "saw", "seen", "seeing", "look", "looks", "looked", "looking",
77
- "take", "takes", "took", "taken", "taking", "give", "gives", "gave", "given", "giving",
78
- "find", "finds", "found", "finding", "tell", "tells", "told", "telling",
79
- "ask", "asks", "asked", "asking", "work", "works", "worked", "working",
80
- "seem", "seems", "seemed", "seeming", "feel", "feels", "felt", "feeling",
81
- "try", "tries", "tried", "trying", "leave", "leaves", "left", "leaving",
82
- "call", "calls", "called", "calling", "use", "uses", "used", "using",
83
- "want", "wants", "wanted", "wanting", "need", "needs", "needed", "needing",
84
- "say", "says", "said", "saying", "talk", "talks", "talked", "talking",
85
- "turn", "turns", "turned", "turning", "run", "runs", "ran", "running",
86
- "move", "moves", "moved", "moving", "live", "lives", "lived", "living",
87
- "believe", "believes", "believed", "believing",
88
- "hold", "holds", "held", "holding", "bring", "brings", "brought", "bringing",
89
- "happen", "happens", "happened", "happening",
90
- "write", "writes", "wrote", "written", "writing",
91
- "sit", "sits", "sat", "sitting", "stand", "stands", "stood", "standing",
92
- "hear", "hears", "heard", "hearing", "let", "lets", "letting",
93
- "help", "helps", "helped", "helping", "show", "shows", "showed", "shown", "showing",
94
- "play", "plays", "played", "playing", "read", "reads", "reading",
95
- "change", "changes", "changed", "changing", "keep", "keeps", "kept", "keeping",
96
- "start", "starts", "started", "starting", "stop", "stops", "stopped", "stopping",
97
- "learn", "learns", "learned", "learning", "grow", "grows", "grew", "grown", "growing",
98
- "open", "opens", "opened", "opening", "close", "closes", "closed", "closing",
99
- "walk", "walks", "walked", "walking", "win", "wins", "won", "winning",
100
- "begin", "begins", "began", "begun", "beginning", "end", "ends", "ended", "ending",
101
- "lose", "loses", "lost", "losing", "send", "sends", "sent", "sending",
102
- "buy", "buys", "bought", "buying", "pay", "pays", "paid", "paying",
103
- "eat", "eats", "ate", "eaten", "eating", "drink", "drinks", "drank", "drinking",
104
- "sleep", "sleeps", "slept", "sleeping", "wake", "wakes", "woke", "waking",
105
- "sing", "sings", "sang", "sung", "singing", "dance", "dances", "danced", "dancing",
106
- "wait", "waits", "waited", "waiting", "stay", "stays", "stayed", "staying",
107
- "fly", "flies", "flew", "flown", "flying", "fall", "falls", "fell", "fallen", "falling",
108
- "cut", "cuts", "cutting", "break", "breaks", "broke", "broken", "breaking",
109
- "watch", "watches", "watched", "watching", "listen", "listens", "listened", "listening",
110
- "remember", "remembers", "remembered", "remembering",
111
- "forget", "forgets", "forgot", "forgotten", "forgetting",
112
- "meet", "meets", "met", "meeting", "follow", "follows", "followed", "following",
113
- "carry", "carries", "carried", "carrying", "catch", "catches", "caught", "catching",
114
- "draw", "draws", "drew", "drawn", "drawing", "drive", "drives", "drove", "driven", "driving",
115
- "ride", "rides", "rode", "ridden", "riding", "wear", "wears", "wore", "worn", "wearing",
116
- "pull", "pulls", "pulled", "pulling", "push", "pushes", "pushed", "pushing",
117
- "throw", "throws", "threw", "thrown", "throwing",
118
- "reach", "reaches", "reached", "reaching", "pass", "passes", "passed", "passing",
119
- "shoot", "shoots", "shot", "shooting", "rise", "rises", "rose", "risen", "rising",
120
- "blow", "blows", "blew", "blown", "blowing", "grow", "grows", "grew", "grown", "growing",
121
- "hit", "hits", "hitting", "fight", "fights", "fought", "fighting",
122
- "die", "dies", "died", "dying", "kill", "kills", "killed", "killing",
123
- "speak", "speaks", "spoke", "spoken", "speaking",
124
-
187
+ "be",
188
+ "am",
189
+ "is",
190
+ "are",
191
+ "was",
192
+ "were",
193
+ "been",
194
+ "being",
195
+ "have",
196
+ "has",
197
+ "had",
198
+ "having",
199
+ "do",
200
+ "does",
201
+ "did",
202
+ "doing",
203
+ "done",
204
+ "will",
205
+ "would",
206
+ "shall",
207
+ "should",
208
+ "may",
209
+ "might",
210
+ "must",
211
+ "can",
212
+ "could",
213
+ "go",
214
+ "goes",
215
+ "went",
216
+ "gone",
217
+ "going",
218
+ "come",
219
+ "comes",
220
+ "came",
221
+ "coming",
222
+ "make",
223
+ "makes",
224
+ "made",
225
+ "making",
226
+ "get",
227
+ "gets",
228
+ "got",
229
+ "getting",
230
+ "gotten",
231
+ "know",
232
+ "knows",
233
+ "knew",
234
+ "known",
235
+ "knowing",
236
+ "think",
237
+ "thinks",
238
+ "thought",
239
+ "thinking",
240
+ "see",
241
+ "sees",
242
+ "saw",
243
+ "seen",
244
+ "seeing",
245
+ "look",
246
+ "looks",
247
+ "looked",
248
+ "looking",
249
+ "take",
250
+ "takes",
251
+ "took",
252
+ "taken",
253
+ "taking",
254
+ "give",
255
+ "gives",
256
+ "gave",
257
+ "given",
258
+ "giving",
259
+ "find",
260
+ "finds",
261
+ "found",
262
+ "finding",
263
+ "tell",
264
+ "tells",
265
+ "told",
266
+ "telling",
267
+ "ask",
268
+ "asks",
269
+ "asked",
270
+ "asking",
271
+ "work",
272
+ "works",
273
+ "worked",
274
+ "working",
275
+ "seem",
276
+ "seems",
277
+ "seemed",
278
+ "seeming",
279
+ "feel",
280
+ "feels",
281
+ "felt",
282
+ "feeling",
283
+ "try",
284
+ "tries",
285
+ "tried",
286
+ "trying",
287
+ "leave",
288
+ "leaves",
289
+ "left",
290
+ "leaving",
291
+ "call",
292
+ "calls",
293
+ "called",
294
+ "calling",
295
+ "use",
296
+ "uses",
297
+ "used",
298
+ "using",
299
+ "want",
300
+ "wants",
301
+ "wanted",
302
+ "wanting",
303
+ "need",
304
+ "needs",
305
+ "needed",
306
+ "needing",
307
+ "say",
308
+ "says",
309
+ "said",
310
+ "saying",
311
+ "talk",
312
+ "talks",
313
+ "talked",
314
+ "talking",
315
+ "turn",
316
+ "turns",
317
+ "turned",
318
+ "turning",
319
+ "run",
320
+ "runs",
321
+ "ran",
322
+ "running",
323
+ "move",
324
+ "moves",
325
+ "moved",
326
+ "moving",
327
+ "live",
328
+ "lives",
329
+ "lived",
330
+ "living",
331
+ "believe",
332
+ "believes",
333
+ "believed",
334
+ "believing",
335
+ "hold",
336
+ "holds",
337
+ "held",
338
+ "holding",
339
+ "bring",
340
+ "brings",
341
+ "brought",
342
+ "bringing",
343
+ "happen",
344
+ "happens",
345
+ "happened",
346
+ "happening",
347
+ "write",
348
+ "writes",
349
+ "wrote",
350
+ "written",
351
+ "writing",
352
+ "sit",
353
+ "sits",
354
+ "sat",
355
+ "sitting",
356
+ "stand",
357
+ "stands",
358
+ "stood",
359
+ "standing",
360
+ "hear",
361
+ "hears",
362
+ "heard",
363
+ "hearing",
364
+ "let",
365
+ "lets",
366
+ "letting",
367
+ "help",
368
+ "helps",
369
+ "helped",
370
+ "helping",
371
+ "show",
372
+ "shows",
373
+ "showed",
374
+ "shown",
375
+ "showing",
376
+ "play",
377
+ "plays",
378
+ "played",
379
+ "playing",
380
+ "read",
381
+ "reads",
382
+ "reading",
383
+ "change",
384
+ "changes",
385
+ "changed",
386
+ "changing",
387
+ "keep",
388
+ "keeps",
389
+ "kept",
390
+ "keeping",
391
+ "start",
392
+ "starts",
393
+ "started",
394
+ "starting",
395
+ "stop",
396
+ "stops",
397
+ "stopped",
398
+ "stopping",
399
+ "learn",
400
+ "learns",
401
+ "learned",
402
+ "learning",
403
+ "grow",
404
+ "grows",
405
+ "grew",
406
+ "grown",
407
+ "growing",
408
+ "open",
409
+ "opens",
410
+ "opened",
411
+ "opening",
412
+ "close",
413
+ "closes",
414
+ "closed",
415
+ "closing",
416
+ "walk",
417
+ "walks",
418
+ "walked",
419
+ "walking",
420
+ "win",
421
+ "wins",
422
+ "won",
423
+ "winning",
424
+ "begin",
425
+ "begins",
426
+ "began",
427
+ "begun",
428
+ "beginning",
429
+ "end",
430
+ "ends",
431
+ "ended",
432
+ "ending",
433
+ "lose",
434
+ "loses",
435
+ "lost",
436
+ "losing",
437
+ "send",
438
+ "sends",
439
+ "sent",
440
+ "sending",
441
+ "buy",
442
+ "buys",
443
+ "bought",
444
+ "buying",
445
+ "pay",
446
+ "pays",
447
+ "paid",
448
+ "paying",
449
+ "eat",
450
+ "eats",
451
+ "ate",
452
+ "eaten",
453
+ "eating",
454
+ "drink",
455
+ "drinks",
456
+ "drank",
457
+ "drinking",
458
+ "sleep",
459
+ "sleeps",
460
+ "slept",
461
+ "sleeping",
462
+ "wake",
463
+ "wakes",
464
+ "woke",
465
+ "waking",
466
+ "sing",
467
+ "sings",
468
+ "sang",
469
+ "sung",
470
+ "singing",
471
+ "dance",
472
+ "dances",
473
+ "danced",
474
+ "dancing",
475
+ "wait",
476
+ "waits",
477
+ "waited",
478
+ "waiting",
479
+ "stay",
480
+ "stays",
481
+ "stayed",
482
+ "staying",
483
+ "fly",
484
+ "flies",
485
+ "flew",
486
+ "flown",
487
+ "flying",
488
+ "fall",
489
+ "falls",
490
+ "fell",
491
+ "fallen",
492
+ "falling",
493
+ "cut",
494
+ "cuts",
495
+ "cutting",
496
+ "break",
497
+ "breaks",
498
+ "broke",
499
+ "broken",
500
+ "breaking",
501
+ "watch",
502
+ "watches",
503
+ "watched",
504
+ "watching",
505
+ "listen",
506
+ "listens",
507
+ "listened",
508
+ "listening",
509
+ "remember",
510
+ "remembers",
511
+ "remembered",
512
+ "remembering",
513
+ "forget",
514
+ "forgets",
515
+ "forgot",
516
+ "forgotten",
517
+ "forgetting",
518
+ "meet",
519
+ "meets",
520
+ "met",
521
+ "meeting",
522
+ "follow",
523
+ "follows",
524
+ "followed",
525
+ "following",
526
+ "carry",
527
+ "carries",
528
+ "carried",
529
+ "carrying",
530
+ "catch",
531
+ "catches",
532
+ "caught",
533
+ "catching",
534
+ "draw",
535
+ "draws",
536
+ "drew",
537
+ "drawn",
538
+ "drawing",
539
+ "drive",
540
+ "drives",
541
+ "drove",
542
+ "driven",
543
+ "driving",
544
+ "ride",
545
+ "rides",
546
+ "rode",
547
+ "ridden",
548
+ "riding",
549
+ "wear",
550
+ "wears",
551
+ "wore",
552
+ "worn",
553
+ "wearing",
554
+ "pull",
555
+ "pulls",
556
+ "pulled",
557
+ "pulling",
558
+ "push",
559
+ "pushes",
560
+ "pushed",
561
+ "pushing",
562
+ "throw",
563
+ "throws",
564
+ "threw",
565
+ "thrown",
566
+ "throwing",
567
+ "reach",
568
+ "reaches",
569
+ "reached",
570
+ "reaching",
571
+ "pass",
572
+ "passes",
573
+ "passed",
574
+ "passing",
575
+ "shoot",
576
+ "shoots",
577
+ "shot",
578
+ "shooting",
579
+ "rise",
580
+ "rises",
581
+ "rose",
582
+ "risen",
583
+ "rising",
584
+ "blow",
585
+ "blows",
586
+ "blew",
587
+ "blown",
588
+ "blowing",
589
+ "grow",
590
+ "grows",
591
+ "grew",
592
+ "grown",
593
+ "growing",
594
+ "hit",
595
+ "hits",
596
+ "hitting",
597
+ "fight",
598
+ "fights",
599
+ "fought",
600
+ "fighting",
601
+ "die",
602
+ "dies",
603
+ "died",
604
+ "dying",
605
+ "kill",
606
+ "kills",
607
+ "killed",
608
+ "killing",
609
+ "speak",
610
+ "speaks",
611
+ "spoke",
612
+ "spoken",
613
+ "speaking",
125
614
  # Common nouns
126
- "time", "times", "year", "years", "day", "days", "week", "weeks",
127
- "month", "months", "hour", "hours", "minute", "minutes", "second", "seconds",
128
- "morning", "afternoon", "evening", "night", "today", "yesterday", "tomorrow",
129
- "people", "person", "man", "men", "woman", "women", "child", "children",
130
- "boy", "boys", "girl", "girls", "baby", "babies", "friend", "friends",
131
- "family", "families", "mother", "father", "parent", "parents",
132
- "brother", "brothers", "sister", "sisters", "son", "daughter",
133
- "place", "places", "home", "house", "houses", "room", "rooms",
134
- "school", "schools", "class", "classes", "student", "students", "teacher", "teachers",
135
- "way", "ways", "thing", "things", "part", "parts", "group", "groups",
136
- "number", "numbers", "side", "sides", "kind", "kinds", "head", "heads",
137
- "hand", "hands", "eye", "eyes", "face", "faces", "body", "bodies",
138
- "foot", "feet", "arm", "arms", "leg", "legs", "ear", "ears", "mouth",
139
- "water", "food", "air", "land", "earth", "ground", "world",
140
- "country", "countries", "state", "states", "city", "cities", "town", "towns",
141
- "name", "names", "word", "words", "line", "lines", "page", "pages",
142
- "book", "books", "story", "stories", "letter", "letters", "paper", "papers",
143
- "point", "points", "end", "ends", "top", "bottom", "front", "back",
144
- "life", "lives", "problem", "problems", "question", "questions", "answer", "answers",
145
- "work", "works", "job", "jobs", "money", "door", "doors", "window", "windows",
146
- "car", "cars", "road", "roads", "street", "streets", "tree", "trees",
147
- "animal", "animals", "bird", "birds", "fish", "dog", "dogs", "cat", "cats",
148
- "horse", "horses", "sea", "mountain", "mountains", "river", "rivers",
149
- "sun", "moon", "star", "stars", "sky", "cloud", "clouds", "rain", "snow",
150
- "wind", "fire", "light", "dark", "sound", "sounds", "color", "colors",
151
- "white", "black", "red", "blue", "green", "yellow", "brown", "orange",
152
- "game", "games", "ball", "music", "song", "songs", "picture", "pictures",
153
- "table", "tables", "chair", "chairs", "bed", "beds", "floor", "wall", "walls",
154
- "minute", "power", "war", "force", "age", "care", "order", "case",
155
-
615
+ "time",
616
+ "times",
617
+ "year",
618
+ "years",
619
+ "day",
620
+ "days",
621
+ "week",
622
+ "weeks",
623
+ "month",
624
+ "months",
625
+ "hour",
626
+ "hours",
627
+ "minute",
628
+ "minutes",
629
+ "second",
630
+ "seconds",
631
+ "morning",
632
+ "afternoon",
633
+ "evening",
634
+ "night",
635
+ "today",
636
+ "yesterday",
637
+ "tomorrow",
638
+ "people",
639
+ "person",
640
+ "man",
641
+ "men",
642
+ "woman",
643
+ "women",
644
+ "child",
645
+ "children",
646
+ "boy",
647
+ "boys",
648
+ "girl",
649
+ "girls",
650
+ "baby",
651
+ "babies",
652
+ "friend",
653
+ "friends",
654
+ "family",
655
+ "families",
656
+ "mother",
657
+ "father",
658
+ "parent",
659
+ "parents",
660
+ "brother",
661
+ "brothers",
662
+ "sister",
663
+ "sisters",
664
+ "son",
665
+ "daughter",
666
+ "place",
667
+ "places",
668
+ "home",
669
+ "house",
670
+ "houses",
671
+ "room",
672
+ "rooms",
673
+ "school",
674
+ "schools",
675
+ "class",
676
+ "classes",
677
+ "student",
678
+ "students",
679
+ "teacher",
680
+ "teachers",
681
+ "way",
682
+ "ways",
683
+ "thing",
684
+ "things",
685
+ "part",
686
+ "parts",
687
+ "group",
688
+ "groups",
689
+ "number",
690
+ "numbers",
691
+ "side",
692
+ "sides",
693
+ "kind",
694
+ "kinds",
695
+ "head",
696
+ "heads",
697
+ "hand",
698
+ "hands",
699
+ "eye",
700
+ "eyes",
701
+ "face",
702
+ "faces",
703
+ "body",
704
+ "bodies",
705
+ "foot",
706
+ "feet",
707
+ "arm",
708
+ "arms",
709
+ "leg",
710
+ "legs",
711
+ "ear",
712
+ "ears",
713
+ "mouth",
714
+ "water",
715
+ "food",
716
+ "air",
717
+ "land",
718
+ "earth",
719
+ "ground",
720
+ "world",
721
+ "country",
722
+ "countries",
723
+ "state",
724
+ "states",
725
+ "city",
726
+ "cities",
727
+ "town",
728
+ "towns",
729
+ "name",
730
+ "names",
731
+ "word",
732
+ "words",
733
+ "line",
734
+ "lines",
735
+ "page",
736
+ "pages",
737
+ "book",
738
+ "books",
739
+ "story",
740
+ "stories",
741
+ "letter",
742
+ "letters",
743
+ "paper",
744
+ "papers",
745
+ "point",
746
+ "points",
747
+ "end",
748
+ "ends",
749
+ "top",
750
+ "bottom",
751
+ "front",
752
+ "back",
753
+ "life",
754
+ "lives",
755
+ "problem",
756
+ "problems",
757
+ "question",
758
+ "questions",
759
+ "answer",
760
+ "answers",
761
+ "work",
762
+ "works",
763
+ "job",
764
+ "jobs",
765
+ "money",
766
+ "door",
767
+ "doors",
768
+ "window",
769
+ "windows",
770
+ "car",
771
+ "cars",
772
+ "road",
773
+ "roads",
774
+ "street",
775
+ "streets",
776
+ "tree",
777
+ "trees",
778
+ "animal",
779
+ "animals",
780
+ "bird",
781
+ "birds",
782
+ "fish",
783
+ "dog",
784
+ "dogs",
785
+ "cat",
786
+ "cats",
787
+ "horse",
788
+ "horses",
789
+ "sea",
790
+ "mountain",
791
+ "mountains",
792
+ "river",
793
+ "rivers",
794
+ "sun",
795
+ "moon",
796
+ "star",
797
+ "stars",
798
+ "sky",
799
+ "cloud",
800
+ "clouds",
801
+ "rain",
802
+ "snow",
803
+ "wind",
804
+ "fire",
805
+ "light",
806
+ "dark",
807
+ "sound",
808
+ "sounds",
809
+ "color",
810
+ "colors",
811
+ "white",
812
+ "black",
813
+ "red",
814
+ "blue",
815
+ "green",
816
+ "yellow",
817
+ "brown",
818
+ "orange",
819
+ "game",
820
+ "games",
821
+ "ball",
822
+ "music",
823
+ "song",
824
+ "songs",
825
+ "picture",
826
+ "pictures",
827
+ "table",
828
+ "tables",
829
+ "chair",
830
+ "chairs",
831
+ "bed",
832
+ "beds",
833
+ "floor",
834
+ "wall",
835
+ "walls",
836
+ "minute",
837
+ "power",
838
+ "war",
839
+ "force",
840
+ "age",
841
+ "care",
842
+ "order",
843
+ "case",
156
844
  # Common adjectives
157
- "good", "better", "best", "bad", "worse", "worst",
158
- "big", "bigger", "biggest", "small", "smaller", "smallest",
159
- "large", "larger", "largest", "little", "less", "least",
160
- "long", "longer", "longest", "short", "shorter", "shortest",
161
- "high", "higher", "highest", "low", "lower", "lowest",
162
- "old", "older", "oldest", "young", "younger", "youngest", "new", "newer", "newest",
163
- "great", "greater", "greatest", "important", "right", "left", "own",
164
- "other", "different", "same", "next", "last", "first", "second", "third",
165
- "early", "earlier", "earliest", "late", "later", "latest",
166
- "easy", "easier", "easiest", "hard", "harder", "hardest",
167
- "hot", "hotter", "hottest", "cold", "colder", "coldest",
168
- "warm", "warmer", "warmest", "cool", "cooler", "coolest",
169
- "fast", "faster", "fastest", "slow", "slower", "slowest",
170
- "strong", "stronger", "strongest", "weak", "weaker", "weakest",
171
- "happy", "happier", "happiest", "sad", "sadder", "saddest",
172
- "nice", "nicer", "nicest", "kind", "kinder", "kindest",
173
- "sure", "free", "full", "whole", "ready", "simple", "clear",
174
- "real", "true", "certain", "public", "able", "several",
175
- "open", "closed", "deep", "wide", "bright", "dark", "heavy", "light",
176
- "clean", "dirty", "wet", "dry", "soft", "hard", "quiet", "loud",
177
- "quick", "slow", "rich", "poor", "sick", "well", "dead", "alive",
178
- "empty", "busy", "pretty", "beautiful", "ugly",
179
-
845
+ "good",
846
+ "better",
847
+ "best",
848
+ "bad",
849
+ "worse",
850
+ "worst",
851
+ "big",
852
+ "bigger",
853
+ "biggest",
854
+ "small",
855
+ "smaller",
856
+ "smallest",
857
+ "large",
858
+ "larger",
859
+ "largest",
860
+ "little",
861
+ "less",
862
+ "least",
863
+ "long",
864
+ "longer",
865
+ "longest",
866
+ "short",
867
+ "shorter",
868
+ "shortest",
869
+ "high",
870
+ "higher",
871
+ "highest",
872
+ "low",
873
+ "lower",
874
+ "lowest",
875
+ "old",
876
+ "older",
877
+ "oldest",
878
+ "young",
879
+ "younger",
880
+ "youngest",
881
+ "new",
882
+ "newer",
883
+ "newest",
884
+ "great",
885
+ "greater",
886
+ "greatest",
887
+ "important",
888
+ "right",
889
+ "left",
890
+ "own",
891
+ "other",
892
+ "different",
893
+ "same",
894
+ "next",
895
+ "last",
896
+ "first",
897
+ "second",
898
+ "third",
899
+ "early",
900
+ "earlier",
901
+ "earliest",
902
+ "late",
903
+ "later",
904
+ "latest",
905
+ "easy",
906
+ "easier",
907
+ "easiest",
908
+ "hard",
909
+ "harder",
910
+ "hardest",
911
+ "hot",
912
+ "hotter",
913
+ "hottest",
914
+ "cold",
915
+ "colder",
916
+ "coldest",
917
+ "warm",
918
+ "warmer",
919
+ "warmest",
920
+ "cool",
921
+ "cooler",
922
+ "coolest",
923
+ "fast",
924
+ "faster",
925
+ "fastest",
926
+ "slow",
927
+ "slower",
928
+ "slowest",
929
+ "strong",
930
+ "stronger",
931
+ "strongest",
932
+ "weak",
933
+ "weaker",
934
+ "weakest",
935
+ "happy",
936
+ "happier",
937
+ "happiest",
938
+ "sad",
939
+ "sadder",
940
+ "saddest",
941
+ "nice",
942
+ "nicer",
943
+ "nicest",
944
+ "kind",
945
+ "kinder",
946
+ "kindest",
947
+ "sure",
948
+ "free",
949
+ "full",
950
+ "whole",
951
+ "ready",
952
+ "simple",
953
+ "clear",
954
+ "real",
955
+ "true",
956
+ "certain",
957
+ "public",
958
+ "able",
959
+ "several",
960
+ "open",
961
+ "closed",
962
+ "deep",
963
+ "wide",
964
+ "bright",
965
+ "dark",
966
+ "heavy",
967
+ "light",
968
+ "clean",
969
+ "dirty",
970
+ "wet",
971
+ "dry",
972
+ "soft",
973
+ "hard",
974
+ "quiet",
975
+ "loud",
976
+ "quick",
977
+ "slow",
978
+ "rich",
979
+ "poor",
980
+ "sick",
981
+ "well",
982
+ "dead",
983
+ "alive",
984
+ "empty",
985
+ "busy",
986
+ "pretty",
987
+ "beautiful",
988
+ "ugly",
180
989
  # Common adverbs
181
- "very", "too", "so", "more", "most", "less", "least",
182
- "well", "better", "best", "just", "only", "even", "still",
183
- "also", "just", "now", "then", "here", "there", "where",
184
- "how", "when", "why", "not", "never", "always", "often",
185
- "sometimes", "usually", "ever", "again", "back", "away",
186
- "together", "once", "twice", "soon", "today", "yesterday", "tomorrow",
187
- "already", "almost", "enough", "quite", "rather", "really",
188
- "perhaps", "maybe", "probably", "certainly", "surely",
189
- "yes", "no", "please", "thank", "sorry",
190
-
990
+ "very",
991
+ "too",
992
+ "so",
993
+ "more",
994
+ "most",
995
+ "less",
996
+ "least",
997
+ "well",
998
+ "better",
999
+ "best",
1000
+ "just",
1001
+ "only",
1002
+ "even",
1003
+ "still",
1004
+ "also",
1005
+ "just",
1006
+ "now",
1007
+ "then",
1008
+ "here",
1009
+ "there",
1010
+ "where",
1011
+ "how",
1012
+ "when",
1013
+ "why",
1014
+ "not",
1015
+ "never",
1016
+ "always",
1017
+ "often",
1018
+ "sometimes",
1019
+ "usually",
1020
+ "ever",
1021
+ "again",
1022
+ "back",
1023
+ "away",
1024
+ "together",
1025
+ "once",
1026
+ "twice",
1027
+ "soon",
1028
+ "today",
1029
+ "yesterday",
1030
+ "tomorrow",
1031
+ "already",
1032
+ "almost",
1033
+ "enough",
1034
+ "quite",
1035
+ "rather",
1036
+ "really",
1037
+ "perhaps",
1038
+ "maybe",
1039
+ "probably",
1040
+ "certainly",
1041
+ "surely",
1042
+ "yes",
1043
+ "no",
1044
+ "please",
1045
+ "thank",
1046
+ "sorry",
191
1047
  # Numbers
192
- "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
193
- "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
194
- "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
195
- "hundred", "thousand", "million",
196
- "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth",
197
-
1048
+ "zero",
1049
+ "one",
1050
+ "two",
1051
+ "three",
1052
+ "four",
1053
+ "five",
1054
+ "six",
1055
+ "seven",
1056
+ "eight",
1057
+ "nine",
1058
+ "ten",
1059
+ "eleven",
1060
+ "twelve",
1061
+ "thirteen",
1062
+ "fourteen",
1063
+ "fifteen",
1064
+ "sixteen",
1065
+ "seventeen",
1066
+ "eighteen",
1067
+ "nineteen",
1068
+ "twenty",
1069
+ "thirty",
1070
+ "forty",
1071
+ "fifty",
1072
+ "sixty",
1073
+ "seventy",
1074
+ "eighty",
1075
+ "ninety",
1076
+ "hundred",
1077
+ "thousand",
1078
+ "million",
1079
+ "first",
1080
+ "second",
1081
+ "third",
1082
+ "fourth",
1083
+ "fifth",
1084
+ "sixth",
1085
+ "seventh",
1086
+ "eighth",
1087
+ "ninth",
1088
+ "tenth",
198
1089
  # Additional common words
199
- "able", "accept", "across", "act", "add", "afraid", "against", "agree",
200
- "allow", "alone", "appear", "apple", "area", "arm", "arrive", "art",
201
- "aunt", "ball", "become", "believe", "belong", "boat", "build",
202
- "burn", "business", "chair", "chance", "church", "clear", "climb",
203
- "clothe", "clothes", "company", "contain", "continue", "control",
204
- "cook", "corner", "cost", "count", "course", "cover", "create",
205
- "cross", "crowd", "cry", "decide", "depend", "describe", "develop",
206
- "die", "direction", "discover", "doctor", "double", "drop", "during",
207
- "edge", "effect", "eight", "either", "else", "enjoy", "enough",
208
- "enter", "example", "except", "excite", "expect", "explain", "express",
209
- "fact", "fair", "farm", "fear", "field", "fill", "final", "fine",
210
- "finger", "finish", "flower", "force", "foreign", "forest", "form",
211
- "fresh", "front", "garden", "general", "glass", "god", "gold",
212
- "hang", "hat", "hope", "hot", "idea", "include", "increase",
213
- "instead", "interest", "island", "join", "laugh", "law", "lead",
214
- "lie", "lift", "list", "lock", "love", "machine", "mark",
215
- "matter", "mean", "measure", "member", "mention", "middle", "mile",
216
- "mind", "miss", "moment", "nation", "natural", "nature", "necessary",
217
- "neighbor", "notice", "object", "ocean", "offer", "office", "opinion",
218
- "paint", "pair", "party", "pattern", "period", "pick", "plan",
219
- "plant", "position", "possible", "pound", "prepare", "present", "president",
220
- "press", "prince", "print", "probable", "produce", "promise", "proper",
221
- "protect", "prove", "purpose", "quarter", "queen", "question", "quick",
222
- "quiet", "race", "raise", "range", "rate", "reason", "receive",
223
- "record", "region", "remain", "reply", "report", "represent", "require",
224
- "rest", "result", "return", "roll", "rule", "sail", "salt",
225
- "save", "science", "season", "seat", "seem", "sell", "sense",
226
- "sentence", "separate", "serve", "set", "settle", "seven", "shape",
227
- "share", "ship", "shore", "sign", "silver", "single", "sir",
228
- "six", "size", "skin", "soldier", "solve", "south", "space",
229
- "special", "speed", "spell", "spend", "spread", "spring", "square",
230
- "step", "stone", "straight", "strange", "stream", "strength", "strike",
231
- "subject", "success", "sudden", "suffer", "suggest", "suit", "summer",
232
- "supply", "support", "suppose", "surface", "surprise", "sweet", "swim",
233
- "system", "tail", "taste", "teach", "team", "telephone", "television",
234
- "temperature", "ten", "test", "thick", "thin", "though", "thousand",
235
- "three", "tire", "total", "touch", "track", "train", "travel",
236
- "trip", "trouble", "type", "uncle", "understand", "unit", "universe",
237
- "value", "various", "view", "village", "visit", "voice", "vote",
238
- "wagon", "wander", "warm", "wash", "wave", "wealth", "weather",
239
- "weight", "welcome", "west", "wheel", "wild", "wind", "winter",
240
- "wish", "wonder", "wood", "yard", "yellow",
1090
+ "able",
1091
+ "accept",
1092
+ "across",
1093
+ "act",
1094
+ "add",
1095
+ "afraid",
1096
+ "against",
1097
+ "agree",
1098
+ "allow",
1099
+ "alone",
1100
+ "appear",
1101
+ "apple",
1102
+ "area",
1103
+ "arm",
1104
+ "arrive",
1105
+ "art",
1106
+ "aunt",
1107
+ "ball",
1108
+ "become",
1109
+ "believe",
1110
+ "belong",
1111
+ "boat",
1112
+ "build",
1113
+ "burn",
1114
+ "business",
1115
+ "chair",
1116
+ "chance",
1117
+ "church",
1118
+ "clear",
1119
+ "climb",
1120
+ "clothe",
1121
+ "clothes",
1122
+ "company",
1123
+ "contain",
1124
+ "continue",
1125
+ "control",
1126
+ "cook",
1127
+ "corner",
1128
+ "cost",
1129
+ "count",
1130
+ "course",
1131
+ "cover",
1132
+ "create",
1133
+ "cross",
1134
+ "crowd",
1135
+ "cry",
1136
+ "decide",
1137
+ "depend",
1138
+ "describe",
1139
+ "develop",
1140
+ "die",
1141
+ "direction",
1142
+ "discover",
1143
+ "doctor",
1144
+ "double",
1145
+ "drop",
1146
+ "during",
1147
+ "edge",
1148
+ "effect",
1149
+ "eight",
1150
+ "either",
1151
+ "else",
1152
+ "enjoy",
1153
+ "enough",
1154
+ "enter",
1155
+ "example",
1156
+ "except",
1157
+ "excite",
1158
+ "expect",
1159
+ "explain",
1160
+ "express",
1161
+ "fact",
1162
+ "fair",
1163
+ "farm",
1164
+ "fear",
1165
+ "field",
1166
+ "fill",
1167
+ "final",
1168
+ "fine",
1169
+ "finger",
1170
+ "finish",
1171
+ "flower",
1172
+ "force",
1173
+ "foreign",
1174
+ "forest",
1175
+ "form",
1176
+ "fresh",
1177
+ "front",
1178
+ "garden",
1179
+ "general",
1180
+ "glass",
1181
+ "god",
1182
+ "gold",
1183
+ "hang",
1184
+ "hat",
1185
+ "hope",
1186
+ "hot",
1187
+ "idea",
1188
+ "include",
1189
+ "increase",
1190
+ "instead",
1191
+ "interest",
1192
+ "island",
1193
+ "join",
1194
+ "laugh",
1195
+ "law",
1196
+ "lead",
1197
+ "lie",
1198
+ "lift",
1199
+ "list",
1200
+ "lock",
1201
+ "love",
1202
+ "machine",
1203
+ "mark",
1204
+ "matter",
1205
+ "mean",
1206
+ "measure",
1207
+ "member",
1208
+ "mention",
1209
+ "middle",
1210
+ "mile",
1211
+ "mind",
1212
+ "miss",
1213
+ "moment",
1214
+ "nation",
1215
+ "natural",
1216
+ "nature",
1217
+ "necessary",
1218
+ "neighbor",
1219
+ "notice",
1220
+ "object",
1221
+ "ocean",
1222
+ "offer",
1223
+ "office",
1224
+ "opinion",
1225
+ "paint",
1226
+ "pair",
1227
+ "party",
1228
+ "pattern",
1229
+ "period",
1230
+ "pick",
1231
+ "plan",
1232
+ "plant",
1233
+ "position",
1234
+ "possible",
1235
+ "pound",
1236
+ "prepare",
1237
+ "present",
1238
+ "president",
1239
+ "press",
1240
+ "prince",
1241
+ "print",
1242
+ "probable",
1243
+ "produce",
1244
+ "promise",
1245
+ "proper",
1246
+ "protect",
1247
+ "prove",
1248
+ "purpose",
1249
+ "quarter",
1250
+ "queen",
1251
+ "question",
1252
+ "quick",
1253
+ "quiet",
1254
+ "race",
1255
+ "raise",
1256
+ "range",
1257
+ "rate",
1258
+ "reason",
1259
+ "receive",
1260
+ "record",
1261
+ "region",
1262
+ "remain",
1263
+ "reply",
1264
+ "report",
1265
+ "represent",
1266
+ "require",
1267
+ "rest",
1268
+ "result",
1269
+ "return",
1270
+ "roll",
1271
+ "rule",
1272
+ "sail",
1273
+ "salt",
1274
+ "save",
1275
+ "science",
1276
+ "season",
1277
+ "seat",
1278
+ "seem",
1279
+ "sell",
1280
+ "sense",
1281
+ "sentence",
1282
+ "separate",
1283
+ "serve",
1284
+ "set",
1285
+ "settle",
1286
+ "seven",
1287
+ "shape",
1288
+ "share",
1289
+ "ship",
1290
+ "shore",
1291
+ "sign",
1292
+ "silver",
1293
+ "single",
1294
+ "sir",
1295
+ "six",
1296
+ "size",
1297
+ "skin",
1298
+ "soldier",
1299
+ "solve",
1300
+ "south",
1301
+ "space",
1302
+ "special",
1303
+ "speed",
1304
+ "spell",
1305
+ "spend",
1306
+ "spread",
1307
+ "spring",
1308
+ "square",
1309
+ "step",
1310
+ "stone",
1311
+ "straight",
1312
+ "strange",
1313
+ "stream",
1314
+ "strength",
1315
+ "strike",
1316
+ "subject",
1317
+ "success",
1318
+ "sudden",
1319
+ "suffer",
1320
+ "suggest",
1321
+ "suit",
1322
+ "summer",
1323
+ "supply",
1324
+ "support",
1325
+ "suppose",
1326
+ "surface",
1327
+ "surprise",
1328
+ "sweet",
1329
+ "swim",
1330
+ "system",
1331
+ "tail",
1332
+ "taste",
1333
+ "teach",
1334
+ "team",
1335
+ "telephone",
1336
+ "television",
1337
+ "temperature",
1338
+ "ten",
1339
+ "test",
1340
+ "thick",
1341
+ "thin",
1342
+ "though",
1343
+ "thousand",
1344
+ "three",
1345
+ "tire",
1346
+ "total",
1347
+ "touch",
1348
+ "track",
1349
+ "train",
1350
+ "travel",
1351
+ "trip",
1352
+ "trouble",
1353
+ "type",
1354
+ "uncle",
1355
+ "understand",
1356
+ "unit",
1357
+ "universe",
1358
+ "value",
1359
+ "various",
1360
+ "view",
1361
+ "village",
1362
+ "visit",
1363
+ "voice",
1364
+ "vote",
1365
+ "wagon",
1366
+ "wander",
1367
+ "warm",
1368
+ "wash",
1369
+ "wave",
1370
+ "wealth",
1371
+ "weather",
1372
+ "weight",
1373
+ "welcome",
1374
+ "west",
1375
+ "wheel",
1376
+ "wild",
1377
+ "wind",
1378
+ "winter",
1379
+ "wish",
1380
+ "wonder",
1381
+ "wood",
1382
+ "yard",
1383
+ "yellow",
  }


- def compute_dale_chall(text: str) -> DaleChallResult:
+ def _compute_dale_chall_single(text: str) -> tuple[float, int, float, float, dict]:
+ """Compute Dale-Chall for a single chunk."""
+ sentences = split_sentences(text)
+ tokens = tokenize(text)
+ word_tokens = normalize_for_readability(tokens)
+
+ if len(sentences) == 0 or len(word_tokens) == 0:
+ return (float("nan"), 0, float("nan"), float("nan"), {"sentence_count": 0, "word_count": 0})
+
+ difficult_words = [w for w in word_tokens if w.lower() not in DALE_CHALL_FAMILIAR_WORDS]
+ difficult_word_count = len(difficult_words)
+ difficult_word_ratio = difficult_word_count / len(word_tokens)
+ difficult_word_pct = difficult_word_ratio * 100
+ avg_sentence_length = len(word_tokens) / len(sentences)
+ raw_score = 0.1579 * difficult_word_pct + 0.0496 * avg_sentence_length
+ adjusted = difficult_word_pct > 5.0
+ dale_chall_score = raw_score + 3.6365 if adjusted else raw_score
+
+ return (
+ dale_chall_score,
+ difficult_word_count,
+ difficult_word_ratio,
+ avg_sentence_length,
+ {
+ "sentence_count": len(sentences),
+ "word_count": len(word_tokens),
+ "adjusted": adjusted,
+ "raw_score": raw_score,
+ "difficult_word_pct": difficult_word_pct,
+ },
+ )
+
+
+ def _get_dale_chall_grade_level(score: float) -> str:
+ """Map Dale-Chall score to grade level."""
+ if math.isnan(score):
+ return "Unknown"
+ if score < 5.0:
+ return "4 and below"
+ elif score < 6.0:
+ return "5-6"
+ elif score < 7.0:
+ return "7-8"
+ elif score < 8.0:
+ return "9-10"
+ elif score < 9.0:
+ return "11-12"
+ elif score < 10.0:
+ return "College"
+ else:
+ return "College Graduate"
+
+
+ def compute_dale_chall(text: str, chunk_size: int = 1000) -> DaleChallResult:
  """
  Compute Dale-Chall Readability Formula.

- The Dale-Chall formula estimates reading difficulty based on the percentage
- of words that are NOT on a list of 3000 familiar words (words understood
- by 80% of 4th graders). It also considers average sentence length.
+ This function uses native chunked analysis to capture variance and patterns
+ across the text, which is essential for stylometric fingerprinting.

- Related GitHub Issue:
+ Related GitHub Issues:
  #16 - Additional Readability Formulas
- https://github.com/craigtrim/pystylometry/issues/16
+ #27 - Native chunked analysis with Distribution dataclass

  Formula:
  Raw Score = 0.1579 * (difficult_words_pct) + 0.0496 * (avg_sentence_length)
@@ -259,62 +1454,42 @@ def compute_dale_chall(text: str) -> DaleChallResult:
  If difficult_words_pct > 5%:
  Adjusted Score = Raw Score + 3.6365

- Grade Level Correspondence:
- 4.9 or lower: Grade 4 and below
- 5.0-5.9: Grades 5-6
- 6.0-6.9: Grades 7-8
- 7.0-7.9: Grades 9-10
- 8.0-8.9: Grades 11-12
- 9.0-9.9: Grades 13-15 (College)
- 10.0+: Grade 16+ (College Graduate)
-
- Advantages:
- - Based on empirical word familiarity data
- - Works well for educational materials
- - Well-validated across grade levels
- - Considers both vocabulary and syntax
-
- Disadvantages:
- - Requires maintaining 3000-word familiar list
- - List is dated (1948, updated 1995)
- - May not reflect modern vocabulary
- - Doesn't account for concept difficulty
-
  Args:
- text: Input text to analyze. Should contain at least one complete
- sentence. Empty text returns NaN values.
+ text: Input text to analyze
+ chunk_size: Number of words per chunk (default: 1000)

  Returns:
- DaleChallResult containing:
- - dale_chall_score: The Dale-Chall readability score
- - grade_level: Grade range (e.g., "7-8", "College")
- - difficult_word_count: Words not on familiar list
- - difficult_word_ratio: Difficult words / total words
- - avg_sentence_length: Average words per sentence
- - total_words: Total word count
- - metadata: List of difficult words, adjusted score flag, etc.
+ DaleChallResult with dale_chall_score, grade_level, distributions, and metadata

  Example:
- >>> result = compute_dale_chall("Sample educational text...")
- >>> print(f"Dale-Chall score: {result.dale_chall_score:.2f}")
- Dale-Chall score: 7.3
- >>> print(f"Grade level: {result.grade_level}")
- Grade level: 7-8
- >>> print(f"Difficult words: {result.difficult_word_ratio * 100:.1f}%")
- Difficult words: 12.4%
-
- Note:
- - Case-insensitive word matching
- - Punctuation stripped before word lookup
- - Proper nouns may be flagged as difficult even if well-known
- - Technical/specialized texts score higher than general texts
+ >>> result = compute_dale_chall("Long text here...", chunk_size=1000)
+ >>> result.dale_chall_score # Mean across chunks
+ 7.3
+ >>> result.dale_chall_score_dist.std # Variance reveals fingerprint
+ 0.5
  """
- # Tokenize and segment
- sentences = split_sentences(text)
- tokens = tokenize(text)
- word_tokens = normalize_for_readability(tokens)
-
- if len(sentences) == 0 or len(word_tokens) == 0:
+ chunks = chunk_text(text, chunk_size)
+ score_values = []
+ ratio_values = []
+ sent_len_values = []
+ total_difficult = 0
+ total_words = 0
+ total_sentences = 0
+
+ for chunk in chunks:
+ sc, diff_cnt, diff_rat, sent_len, meta = _compute_dale_chall_single(chunk)
+ if not math.isnan(sc):
+ score_values.append(sc)
+ ratio_values.append(diff_rat)
+ sent_len_values.append(sent_len)
+ total_difficult += diff_cnt
+ total_words += meta.get("word_count", 0)
+ total_sentences += meta.get("sentence_count", 0)
+
+ if not score_values:
+ empty_dist = Distribution(
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+ )
  return DaleChallResult(
  dale_chall_score=float("nan"),
  grade_level="Unknown",
@@ -322,612 +1497,562 @@ def compute_dale_chall(text: str) -> DaleChallResult:
  difficult_word_ratio=float("nan"),
  avg_sentence_length=float("nan"),
  total_words=0,
+ dale_chall_score_dist=empty_dist,
+ difficult_word_ratio_dist=empty_dist,
+ avg_sentence_length_dist=empty_dist,
+ chunk_size=chunk_size,
+ chunk_count=len(chunks),
  metadata={
  "sentence_count": 0,
  "raw_score": float("nan"),
  "adjusted": False,
- "difficult_words_sample": [],
+ "difficult_word_pct": float("nan"),
+ "reliable": False,
  },
  )

- # Count difficult words (not in familiar list)
- difficult_words = []
- for word in word_tokens:
- word_lower = word.lower()
- if word_lower not in DALE_CHALL_FAMILIAR_WORDS:
- difficult_words.append(word)
+ score_dist = make_distribution(score_values)
+ ratio_dist = make_distribution(ratio_values)
+ sent_len_dist = make_distribution(sent_len_values)

- difficult_word_count = len(difficult_words)
- difficult_word_ratio = difficult_word_count / len(word_tokens)
- difficult_word_pct = difficult_word_ratio * 100
+ # Calculate overall raw score and adjusted status for metadata
+ overall_difficult_pct = (total_difficult / total_words * 100) if total_words > 0 else 0.0
+ overall_raw_score = 0.1579 * overall_difficult_pct + 0.0496 * sent_len_dist.mean
+ overall_adjusted = overall_difficult_pct > 5.0

- # Calculate average sentence length
- avg_sentence_length = len(word_tokens) / len(sentences)
+ return DaleChallResult(
+ dale_chall_score=score_dist.mean,
+ grade_level=_get_dale_chall_grade_level(score_dist.mean),
+ difficult_word_count=total_difficult,
+ difficult_word_ratio=ratio_dist.mean,
+ avg_sentence_length=sent_len_dist.mean,
+ total_words=total_words,
+ dale_chall_score_dist=score_dist,
+ difficult_word_ratio_dist=ratio_dist,
+ avg_sentence_length_dist=sent_len_dist,
+ chunk_size=chunk_size,
+ chunk_count=len(chunks),
+ metadata={
+ "sentence_count": total_sentences,
+ "raw_score": overall_raw_score,
+ "adjusted": overall_adjusted,
+ "difficult_word_pct": overall_difficult_pct,
+ "total_sentence_count": total_sentences,
+ "total_word_count": total_words,
+ "total_difficult_word_count": total_difficult,
+ "reliable": total_words >= 100,
+ },
+ )

347
- # Calculate raw score
348
- raw_score = 0.1579 * difficult_word_pct + 0.0496 * avg_sentence_length
349
1547
 
350
- # Apply adjustment if difficult word % > 5.0
351
- adjusted = difficult_word_pct > 5.0
352
- if adjusted:
353
- dale_chall_score = raw_score + 3.6365
354
- else:
355
- dale_chall_score = raw_score
356
-
357
- # Map score to grade level
358
- if dale_chall_score < 5.0:
359
- grade_level = "4 and below"
360
- elif dale_chall_score < 6.0:
361
- grade_level = "5-6"
362
- elif dale_chall_score < 7.0:
363
- grade_level = "7-8"
364
- elif dale_chall_score < 8.0:
365
- grade_level = "9-10"
366
- elif dale_chall_score < 9.0:
367
- grade_level = "11-12"
368
- elif dale_chall_score < 10.0:
369
- grade_level = "College"
370
- else:
371
- grade_level = "College Graduate"
1548
+ def _compute_linsear_single(text: str) -> tuple[float, float, int, int, float, dict]:
1549
+ """Compute Linsear Write for a single chunk."""
1550
+ sentences = split_sentences(text)
1551
+ tokens = tokenize(text)
1552
+ word_tokens = normalize_for_readability(tokens)
372
1553
 
373
- # Build metadata
374
- # Sample up to 20 difficult words for metadata (avoid huge lists)
375
- difficult_words_sample = list(set(difficult_words))[:20]
1554
+ if len(sentences) == 0 or len(word_tokens) == 0:
1555
+ return (
1556
+ float("nan"),
1557
+ float("nan"),
1558
+ 0,
1559
+ 0,
1560
+ float("nan"),
1561
+ {"sentence_count": 0, "word_count": 0},
1562
+ )
376
1563
 
377
- metadata = {
378
- "sentence_count": len(sentences),
379
- "raw_score": raw_score,
380
- "adjusted": adjusted,
381
- "difficult_word_pct": difficult_word_pct,
382
- "difficult_words_sample": difficult_words_sample,
383
- "familiar_word_list_size": len(DALE_CHALL_FAMILIAR_WORDS),
384
- }
1564
+ easy_word_count = sum(1 for w in word_tokens if count_syllables(w) <= 2)
1565
+ hard_word_count = len(word_tokens) - easy_word_count
1566
+ weighted_sum = easy_word_count + hard_word_count * 3
1567
+ raw_score = weighted_sum / len(sentences)
1568
+ grade_level_raw = round(raw_score / 2) if raw_score > 20 else round((raw_score - 2) / 2)
1569
+ grade_level = max(0.0, float(grade_level_raw))
1570
+ avg_sentence_length = len(word_tokens) / len(sentences)
385
1571
 
386
- return DaleChallResult(
387
- dale_chall_score=dale_chall_score,
388
- grade_level=grade_level,
389
- difficult_word_count=difficult_word_count,
390
- difficult_word_ratio=difficult_word_ratio,
391
- avg_sentence_length=avg_sentence_length,
392
- total_words=len(word_tokens),
393
- metadata=metadata,
1572
+ return (
1573
+ raw_score,
1574
+ grade_level,
1575
+ easy_word_count,
1576
+ hard_word_count,
1577
+ avg_sentence_length,
1578
+ {"sentence_count": len(sentences), "word_count": len(word_tokens)},
394
1579
  )
395
1580
 
396
1581
 
397
- def compute_linsear_write(text: str) -> LinsearWriteResult:
1582
+ def compute_linsear_write(text: str, chunk_size: int = 1000) -> LinsearWriteResult:
398
1583
  """
399
1584
  Compute Linsear Write Readability Formula.
400
1585
 
401
- Developed for the U.S. Air Force to assess technical writing, the Linsear
402
- Write formula classifies words as "easy" (1-2 syllables) or "hard" (3+
403
- syllables) and uses sentence length to estimate grade level.
1586
+ This function uses native chunked analysis to capture variance and patterns
1587
+ across the text, which is essential for stylometric fingerprinting.
404
1588
 
405
- Related GitHub Issue:
1589
+ Related GitHub Issues:
406
1590
  #16 - Additional Readability Formulas
407
- https://github.com/craigtrim/pystylometry/issues/16
408
-
409
- Formula:
410
- 1. Count "easy" words (1-2 syllables): multiply count by 1
411
- 2. Count "hard" words (3+ syllables): multiply count by 3
412
- 3. Divide sum by number of sentences
413
- 4. If result > 20, divide by 2 to get grade level
414
- 5. If result <= 20, subtract 2, then divide by 2
415
-
416
- The formula is optimized for technical writing and works best with
417
- passages of about 100 words.
418
-
419
- Advantages:
420
- - Simple binary classification (easy/hard)
421
- - Effective for technical documents
422
- - Fast computation
423
- - Developed specifically for instructional materials
424
-
425
- Disadvantages:
426
- - Less well-known than other formulas
427
- - Binary word classification is crude
428
- - May overestimate difficulty of technical terms
429
- - Limited validation compared to Flesch or Dale-Chall
1591
+ #27 - Native chunked analysis with Distribution dataclass
430
1592
 
431
1593
  Args:
432
- text: Input text to analyze. Works best with 100-word samples.
433
- Empty text returns NaN values.
1594
+ text: Input text to analyze
1595
+ chunk_size: Number of words per chunk (default: 1000)
434
1596
 
435
1597
  Returns:
436
- LinsearWriteResult containing:
437
- - linsear_score: The Linsear Write score
438
- - grade_level: Corresponding U.S. grade level (integer)
439
- - easy_word_count: Words with 1-2 syllables
440
- - hard_word_count: Words with 3+ syllables
441
- - avg_sentence_length: Average words per sentence
442
- - metadata: Calculation details, sentence count, etc.
1598
+ LinsearWriteResult with score, grade_level, distributions, and metadata
443
1599
 
444
1600
  Example:
445
- >>> result = compute_linsear_write("Technical manual text...")
446
- >>> print(f"Linsear Write score: {result.linsear_score:.2f}")
447
- Linsear Write score: 11.3
448
- >>> print(f"Grade level: {result.grade_level}")
449
- Grade level: 11
450
- >>> print(f"Easy words: {result.easy_word_count}")
451
- Easy words: 78
452
- >>> print(f"Hard words: {result.hard_word_count}")
453
- Hard words: 22
454
-
455
- Note:
456
- - Syllable counting required (use existing syllable module)
457
- - Punctuation and numbers typically excluded
458
- - Most accurate with 100-word samples
459
- - Grade level is rounded to nearest integer
1601
+ >>> result = compute_linsear_write("Long text here...", chunk_size=1000)
1602
+ >>> result.linsear_score # Mean across chunks
1603
+ 11.3
460
1604
  """
461
- # Tokenize and segment
462
- sentences = split_sentences(text)
463
- tokens = tokenize(text)
464
- word_tokens = normalize_for_readability(tokens)
465
-
466
- if len(sentences) == 0 or len(word_tokens) == 0:
1605
+ chunks = chunk_text(text, chunk_size)
1606
+ score_values = []
1607
+ grade_values = []
1608
+ sent_len_values = []
1609
+ total_easy = 0
1610
+ total_hard = 0
1611
+ total_words = 0
1612
+
1613
+ for chunk in chunks:
1614
+ sc, gr, easy, hard, sent_len, meta = _compute_linsear_single(chunk)
1615
+ if not math.isnan(sc):
1616
+ score_values.append(sc)
1617
+ grade_values.append(gr)
1618
+ sent_len_values.append(sent_len)
1619
+ total_easy += easy
1620
+ total_hard += hard
1621
+ total_words += meta.get("word_count", 0)
1622
+
1623
+ if not score_values:
1624
+ empty_dist = Distribution(
1625
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
1626
+ )
467
1627
  return LinsearWriteResult(
468
1628
  linsear_score=float("nan"),
469
- grade_level=0,
1629
+ grade_level=float("nan"),
470
1630
  easy_word_count=0,
471
1631
  hard_word_count=0,
472
1632
  avg_sentence_length=float("nan"),
473
- metadata={"sentence_count": 0, "total_words": 0, "raw_score": float("nan")},
1633
+ linsear_score_dist=empty_dist,
1634
+ grade_level_dist=empty_dist,
1635
+ avg_sentence_length_dist=empty_dist,
1636
+ chunk_size=chunk_size,
1637
+ chunk_count=len(chunks),
1638
+ metadata={"total_words": 0, "reliable": False},
474
1639
  )
475
1640
 
476
- # Classify words as easy (1-2 syllables) or hard (3+ syllables)
477
- easy_word_count = 0
478
- hard_word_count = 0
1641
+ score_dist = make_distribution(score_values)
1642
+ grade_dist = make_distribution(grade_values)
1643
+ sent_len_dist = make_distribution(sent_len_values)
479
1644
 
480
- for word in word_tokens:
481
- syllable_count = count_syllables(word)
482
- if syllable_count <= 2:
483
- easy_word_count += 1
484
- else:
485
- hard_word_count += 1
1645
+ return LinsearWriteResult(
1646
+ linsear_score=score_dist.mean,
1647
+ grade_level=grade_dist.mean,
1648
+ easy_word_count=total_easy,
1649
+ hard_word_count=total_hard,
1650
+ avg_sentence_length=sent_len_dist.mean,
1651
+ linsear_score_dist=score_dist,
1652
+ grade_level_dist=grade_dist,
1653
+ avg_sentence_length_dist=sent_len_dist,
1654
+ chunk_size=chunk_size,
1655
+ chunk_count=len(chunks),
1656
+ metadata={"total_words": total_words, "reliable": total_words >= 100},
1657
+ )
486
1658
 
487
- # Calculate weighted sum
488
- weighted_sum = (easy_word_count * 1) + (hard_word_count * 3)
489
1659
 
490
- # Calculate score
491
- raw_score = weighted_sum / len(sentences)
1660
+ def _get_fry_grade_level(avg_sent_len: float, avg_syl_100: float) -> tuple[str, str]:
1661
+ """Get Fry grade level and zone from coordinates."""
1662
+ if math.isnan(avg_sent_len) or math.isnan(avg_syl_100):
1663
+ return ("Unknown", "invalid")
492
1664
 
493
- # Convert to grade level
494
- if raw_score > 20:
495
- grade_level = round(raw_score / 2)
1665
+ if avg_syl_100 < 125:
1666
+ if avg_sent_len < 7:
1667
+ grade, zone = "1", "valid"
1668
+ elif avg_sent_len < 11:
1669
+ grade, zone = "2", "valid"
1670
+ else:
1671
+ grade, zone = "3", "valid"
1672
+ elif avg_syl_100 < 135:
1673
+ if avg_sent_len < 8:
1674
+ grade, zone = "2", "valid"
1675
+ elif avg_sent_len < 12:
1676
+ grade, zone = "3", "valid"
1677
+ else:
1678
+ grade, zone = "4", "valid"
1679
+ elif avg_syl_100 < 145:
1680
+ if avg_sent_len < 9:
1681
+ grade, zone = "3", "valid"
1682
+ elif avg_sent_len < 13:
1683
+ grade, zone = "5", "valid"
1684
+ else:
1685
+ grade, zone = "6", "valid"
1686
+ elif avg_syl_100 < 155:
1687
+ if avg_sent_len < 10:
1688
+ grade, zone = "4", "valid"
1689
+ elif avg_sent_len < 14:
1690
+ grade, zone = "7", "valid"
1691
+ else:
1692
+ grade, zone = "8", "valid"
1693
+ elif avg_syl_100 < 165:
1694
+ if avg_sent_len < 12:
1695
+ grade, zone = "6", "valid"
1696
+ elif avg_sent_len < 16:
1697
+ grade, zone = "9", "valid"
1698
+ else:
1699
+ grade, zone = "10", "valid"
1700
+ elif avg_syl_100 < 175:
1701
+ if avg_sent_len < 14:
1702
+ grade, zone = "8", "valid"
1703
+ elif avg_sent_len < 18:
1704
+ grade, zone = "11", "valid"
1705
+ else:
1706
+ grade, zone = "12", "valid"
496
1707
  else:
497
- grade_level = round((raw_score - 2) / 2)
1708
+ if avg_sent_len < 16:
1709
+ grade, zone = "10", "valid"
1710
+ elif avg_sent_len < 20:
1711
+ grade, zone = "College", "valid"
1712
+ else:
1713
+ grade, zone = "College+", "valid"
498
1714
 
499
- # Ensure grade level is non-negative
500
- grade_level = max(0, grade_level)
1715
+ if avg_syl_100 > 185 or avg_sent_len > 25:
1716
+ zone = "above_graph"
1717
+ elif avg_syl_100 < 110:
1718
+ zone = "below_graph"
501
1719
 
502
- # Calculate average sentence length
503
- avg_sentence_length = len(word_tokens) / len(sentences)
1720
+ return (grade, zone)
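The threshold table above is a numerical stand-in for reading the Fry graph. The coordinates used in the 1.0.0 docstring example (14.3 words per sentence, 142.7 syllables per 100 words) fall in the grade-6 band of this lookup:

    >>> _get_fry_grade_level(14.3, 142.7)
    ('6', 'valid')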
504
1721
 
505
- # Build metadata
506
- metadata = {
507
- "total_words": len(word_tokens),
508
- "sentence_count": len(sentences),
509
- "raw_score": raw_score,
510
- "weighted_sum": weighted_sum,
511
- }
512
1722
 
513
- return LinsearWriteResult(
514
- linsear_score=raw_score,
515
- grade_level=grade_level,
516
- easy_word_count=easy_word_count,
517
- hard_word_count=hard_word_count,
518
- avg_sentence_length=avg_sentence_length,
519
- metadata=metadata,
1723
+ def _compute_fry_single(text: str) -> tuple[float, float, dict]:
1724
+ """Compute Fry for a single chunk. Returns (avg_sent_len, avg_syl_100, meta)."""
1725
+ sentences = split_sentences(text)
1726
+ tokens = tokenize(text)
1727
+ word_tokens = normalize_for_readability(tokens)
1728
+
1729
+ if len(sentences) == 0 or len(word_tokens) == 0:
1730
+ return (
1731
+ float("nan"),
1732
+ float("nan"),
1733
+ {"sentence_count": 0, "word_count": 0, "syllable_count": 0, "sample_size": 0},
1734
+ )
1735
+
1736
+ sample_size = min(100, len(word_tokens))
1737
+ sample_tokens = word_tokens[:sample_size]
1738
+ total_syllables = sum(count_syllables(w) for w in sample_tokens)
1739
+
1740
+ word_count_so_far = 0
1741
+ sentences_in_sample = 0
1742
+ for sent in sentences:
1743
+ sent_tokens = normalize_for_readability(tokenize(sent))
1744
+ if word_count_so_far + len(sent_tokens) <= sample_size:
1745
+ sentences_in_sample += 1
1746
+ word_count_so_far += len(sent_tokens)
1747
+ else:
1748
+ if word_count_so_far < sample_size:
1749
+ sentences_in_sample += 1
1750
+ break
1751
+
1752
+ sentences_in_sample = max(1, sentences_in_sample)
1753
+ avg_sentence_length = sample_size / sentences_in_sample
1754
+ avg_syllables_per_100 = (total_syllables / sample_size) * 100
1755
+
1756
+ return (
1757
+ avg_sentence_length,
1758
+ avg_syllables_per_100,
1759
+ {
1760
+ "sentence_count": len(sentences),
1761
+ "word_count": len(word_tokens),
1762
+ "syllable_count": total_syllables,
1763
+ "sample_size": sample_size,
1764
+ },
520
1765
  )
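The per-100 normalization in `_compute_fry_single` lets a short trailing chunk still contribute comparable coordinates. A quick sketch of the scaling, with hypothetical counts:

    # A full 100-word sample with 143 syllables and a 60-word tail with 86 syllables
    (143 / 100) * 100   # 143.0 syllables per 100 words
    (86 / 60) * 100     # ~143.3 syllables per 100 words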
521
1766
 
522
1767
 
523
- def compute_fry(text: str) -> FryResult:
1768
+ def compute_fry(text: str, chunk_size: int = 1000) -> FryResult:
524
1769
  """
525
1770
  Compute Fry Readability Graph metrics.
526
1771
 
527
- The Fry Readability Graph plots average sentence length against average
528
- syllables per 100 words to determine reading difficulty. This implementation
529
- provides the numerical coordinates and estimated grade level.
1772
+ This function uses native chunked analysis to capture variance and patterns
1773
+ across the text, which is essential for stylometric fingerprinting.
530
1774
 
531
- Related GitHub Issue:
1775
+ Related GitHub Issues:
532
1776
  #16 - Additional Readability Formulas
533
- https://github.com/craigtrim/pystylometry/issues/16
534
-
535
- Method:
536
- 1. Select three 100-word samples from text
537
- 2. Count average sentence length across samples
538
- 3. Count average syllables per 100 words across samples
539
- 4. Plot coordinates on Fry graph (or use numerical approximation)
540
- 5. Determine grade level from graph zone
541
-
542
- The original Fry graph has zones corresponding to grade levels 1-17+.
543
- This implementation uses numerical approximation to estimate grade level.
544
-
545
- Advantages:
546
- - Visual/graphical approach (intuitive)
547
- - Uses two independent dimensions (length & syllables)
548
- - Well-validated for educational materials
549
- - Covers wide range of grade levels (1-17+)
550
-
551
- Disadvantages:
552
- - Requires exactly 100-word samples (padding/truncation needed)
553
- - Graph reading can be subjective
554
- - Less precise than formula-based methods
555
- - Multiple samples needed for reliability
1777
+ #27 - Native chunked analysis with Distribution dataclass
556
1778
 
557
1779
  Args:
558
- text: Input text to analyze. Should contain at least 100 words.
559
- Shorter texts are padded or return limited results.
1780
+ text: Input text to analyze
1781
+ chunk_size: Number of words per chunk (default: 1000)
560
1782
 
561
1783
  Returns:
562
- FryResult containing:
563
- - avg_sentence_length: Average words per sentence
564
- - avg_syllables_per_100: Average syllables per 100 words
565
- - grade_level: Estimated grade level (e.g., "5", "7", "College")
566
- - graph_zone: Which zone of Fry graph (for validity checking)
567
- - metadata: Sample details, total sentences, syllables, etc.
1784
+ FryResult with avg_sentence_length, avg_syllables_per_100, distributions, and metadata
568
1785
 
569
1786
  Example:
570
- >>> result = compute_fry("Educational text for grade assessment...")
571
- >>> print(f"Avg sentence length: {result.avg_sentence_length:.1f}")
572
- Avg sentence length: 14.3
573
- >>> print(f"Syllables/100 words: {result.avg_syllables_per_100:.1f}")
574
- Syllables/100 words: 142.7
575
- >>> print(f"Grade level: {result.grade_level}")
576
- Grade level: 6
577
-
578
- Note:
579
- - Original method uses three 100-word samples
580
- - Implementation may use single sample or whole text
581
- - Syllable counting required
582
- - Grade level estimation uses zone boundaries
583
- - Some texts fall outside graph zones (marked as invalid)
1787
+ >>> result = compute_fry("Long text here...", chunk_size=1000)
1788
+ >>> result.avg_sentence_length # Mean across chunks
1789
+ 14.3
584
1790
  """
585
- # Tokenize and segment
586
- sentences = split_sentences(text)
587
- tokens = tokenize(text)
588
- word_tokens = normalize_for_readability(tokens)
589
-
590
- if len(sentences) == 0 or len(word_tokens) == 0:
1791
+ chunks = chunk_text(text, chunk_size)
1792
+ sent_len_values = []
1793
+ syl_100_values = []
1794
+ total_words = 0
1795
+ total_sentences = 0
1796
+ total_syllables = 0
1797
+
1798
+ for chunk in chunks:
1799
+ sent_len, syl_100, meta = _compute_fry_single(chunk)
1800
+ if not math.isnan(sent_len):
1801
+ sent_len_values.append(sent_len)
1802
+ syl_100_values.append(syl_100)
1803
+ total_words += meta.get("word_count", 0)
1804
+ total_sentences += meta.get("sentence_count", 0)
1805
+ total_syllables += meta.get("syllable_count", 0)
1806
+
1807
+ if not sent_len_values:
1808
+ empty_dist = Distribution(
1809
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
1810
+ )
591
1811
  return FryResult(
592
1812
  avg_sentence_length=float("nan"),
593
1813
  avg_syllables_per_100=float("nan"),
594
1814
  grade_level="Unknown",
595
1815
  graph_zone="invalid",
596
- metadata={
597
- "total_sentences": 0,
598
- "total_syllables": 0,
599
- "total_words": 0,
600
- "sample_size": 0,
601
- },
1816
+ avg_sentence_length_dist=empty_dist,
1817
+ avg_syllables_per_100_dist=empty_dist,
1818
+ chunk_size=chunk_size,
1819
+ chunk_count=len(chunks),
1820
+ metadata={"total_sentences": 0, "total_words": 0, "sample_size": 0, "reliable": False},
602
1821
  )
603
1822
 
604
- # Use first 100 words for sample (or entire text if < 100 words)
605
- sample_size = min(100, len(word_tokens))
606
- sample_tokens = word_tokens[:sample_size]
1823
+ sent_len_dist = make_distribution(sent_len_values)
1824
+ syl_100_dist = make_distribution(syl_100_values)
1825
+ grade_level, graph_zone = _get_fry_grade_level(sent_len_dist.mean, syl_100_dist.mean)
607
1826
 
608
- # Count syllables in sample
609
- total_syllables = sum(count_syllables(word) for word in sample_tokens)
1827
+ # Calculate sample size (min of 100 or total_words for overall)
1828
+ # Overall sample size for metadata: the lesser of 100 and total_words
610
1829
 
611
- # Count sentences within the sample
612
- # We need to determine how many sentences are in the first sample_size words
613
- word_count_so_far = 0
614
- sentences_in_sample = 0
615
- for sent in sentences:
616
- sent_tokens = tokenize(sent)
617
- sent_word_tokens = normalize_for_readability(sent_tokens)
618
- if word_count_so_far + len(sent_word_tokens) <= sample_size:
619
- sentences_in_sample += 1
620
- word_count_so_far += len(sent_word_tokens)
621
- else:
622
- # Partial sentence in sample
623
- if word_count_so_far < sample_size:
624
- sentences_in_sample += 1
625
- break
1830
+ return FryResult(
1831
+ avg_sentence_length=sent_len_dist.mean,
1832
+ avg_syllables_per_100=syl_100_dist.mean,
1833
+ grade_level=grade_level,
1834
+ graph_zone=graph_zone,
1835
+ avg_sentence_length_dist=sent_len_dist,
1836
+ avg_syllables_per_100_dist=syl_100_dist,
1837
+ chunk_size=chunk_size,
1838
+ chunk_count=len(chunks),
1839
+ metadata={
1840
+ "total_sentences": total_sentences,
1841
+ "total_words": total_words,
1842
+ "total_syllables": total_syllables,
1843
+ "sample_size": sample_size,
1844
+ "reliable": total_words >= 100,
1845
+ },
1846
+ )
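Because each chunk's coordinates are folded into Distribution objects via make_distribution, callers can read spread as well as central tendency from the result. A usage sketch (field names as in the return statement above; `long_text` is a placeholder):

    result = compute_fry(long_text, chunk_size=1000)
    result.grade_level                    # zone lookup on the mean coordinates, e.g. "6"
    result.avg_sentence_length_dist.std   # sentence-length variability across chunks
    result.metadata["reliable"]           # False if fewer than 100 words were analyzed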
626
1847
 
627
- # Ensure at least 1 sentence for division
628
- sentences_in_sample = max(1, sentences_in_sample)
629
1848
 
630
- # Calculate avg_sentence_length (for the sample)
631
- avg_sentence_length = sample_size / sentences_in_sample
1849
+ def _compute_forcast_single(text: str) -> tuple[float, float, int, float, dict]:
1850
+ """Compute FORCAST for a single chunk. Returns (score, grade, single_count, single_ratio, meta)."""
1851
+ tokens = tokenize(text)
1852
+ word_tokens = normalize_for_readability(tokens)
632
1853
 
633
- # Calculate avg_syllables_per_100 (scale if sample < 100)
634
- avg_syllables_per_100 = (total_syllables / sample_size) * 100
1854
+ if len(word_tokens) == 0:
1855
+ return (
1856
+ float("nan"),
1857
+ float("nan"),
1858
+ 0,
1859
+ float("nan"),
1860
+ {"word_count": 0, "sample_size": 0, "scaled_n": 0.0},
1861
+ )
635
1862
 
636
- # Map to grade level using Fry graph approximation
637
- # Fry graph zones (simplified numerical approximation):
638
- # These are rough boundaries based on Fry graph zones
639
- # X-axis: avg sentences per 100 words (inverse of avg_sentence_length)
640
- # Y-axis: avg syllables per 100 words
641
-
642
- # Determine grade level based on avg_sentence_length and avg_syllables_per_100
643
- # Higher syllables per 100 = higher grade
644
- # Longer sentences = higher grade
645
- # Simplified zone mapping:
646
- if avg_syllables_per_100 < 125:
647
- if avg_sentence_length < 7:
648
- grade_level = "1"
649
- graph_zone = "valid"
650
- elif avg_sentence_length < 11:
651
- grade_level = "2"
652
- graph_zone = "valid"
653
- else:
654
- grade_level = "3"
655
- graph_zone = "valid"
656
- elif avg_syllables_per_100 < 135:
657
- if avg_sentence_length < 8:
658
- grade_level = "2"
659
- graph_zone = "valid"
660
- elif avg_sentence_length < 12:
661
- grade_level = "3"
662
- graph_zone = "valid"
663
- else:
664
- grade_level = "4"
665
- graph_zone = "valid"
666
- elif avg_syllables_per_100 < 145:
667
- if avg_sentence_length < 9:
668
- grade_level = "3"
669
- graph_zone = "valid"
670
- elif avg_sentence_length < 13:
671
- grade_level = "5"
672
- graph_zone = "valid"
673
- else:
674
- grade_level = "6"
675
- graph_zone = "valid"
676
- elif avg_syllables_per_100 < 155:
677
- if avg_sentence_length < 10:
678
- grade_level = "4"
679
- graph_zone = "valid"
680
- elif avg_sentence_length < 14:
681
- grade_level = "7"
682
- graph_zone = "valid"
683
- else:
684
- grade_level = "8"
685
- graph_zone = "valid"
686
- elif avg_syllables_per_100 < 165:
687
- if avg_sentence_length < 12:
688
- grade_level = "6"
689
- graph_zone = "valid"
690
- elif avg_sentence_length < 16:
691
- grade_level = "9"
692
- graph_zone = "valid"
693
- else:
694
- grade_level = "10"
695
- graph_zone = "valid"
696
- elif avg_syllables_per_100 < 175:
697
- if avg_sentence_length < 14:
698
- grade_level = "8"
699
- graph_zone = "valid"
700
- elif avg_sentence_length < 18:
701
- grade_level = "11"
702
- graph_zone = "valid"
703
- else:
704
- grade_level = "12"
705
- graph_zone = "valid"
706
- else: # avg_syllables_per_100 >= 175
707
- if avg_sentence_length < 16:
708
- grade_level = "10"
709
- graph_zone = "valid"
710
- elif avg_sentence_length < 20:
711
- grade_level = "College"
712
- graph_zone = "valid"
713
- else:
714
- grade_level = "College+"
715
- graph_zone = "valid"
716
-
717
- # Check if outside typical graph bounds
718
- if avg_syllables_per_100 > 185 or avg_sentence_length > 25:
719
- graph_zone = "above_graph"
720
- elif avg_syllables_per_100 < 110:
721
- graph_zone = "below_graph"
722
-
723
- # Build metadata
724
- metadata = {
725
- "total_sentences": len(sentences),
726
- "total_syllables": sum(count_syllables(w) for w in word_tokens),
727
- "total_words": len(word_tokens),
728
- "sample_size": sample_size,
729
- "sentences_in_sample": sentences_in_sample,
730
- "syllables_in_sample": total_syllables,
731
- }
1863
+ sample_size = min(150, len(word_tokens))
1864
+ sample_tokens = word_tokens[:sample_size]
1865
+ single_syllable_count = sum(1 for w in sample_tokens if count_syllables(w) == 1)
1866
+ scaled_n = (
1867
+ single_syllable_count * (150 / sample_size) if sample_size < 150 else single_syllable_count
1868
+ )
1869
+ forcast_score = 20 - (scaled_n / 10)
1870
+ grade_level = float(max(0, min(20, round(forcast_score))))
1871
+ single_syllable_ratio = single_syllable_count / sample_size
732
1872
 
733
- return FryResult(
734
- avg_sentence_length=avg_sentence_length,
735
- avg_syllables_per_100=avg_syllables_per_100,
736
- grade_level=grade_level,
737
- graph_zone=graph_zone,
738
- metadata=metadata,
1873
+ return (
1874
+ forcast_score,
1875
+ grade_level,
1876
+ single_syllable_count,
1877
+ single_syllable_ratio,
1878
+ {"word_count": len(word_tokens), "sample_size": sample_size, "scaled_n": scaled_n},
739
1879
  )
740
1880
 
741
1881
 
742
- def compute_forcast(text: str) -> FORCASTResult:
1882
+ def compute_forcast(text: str, chunk_size: int = 1000) -> FORCASTResult:
743
1883
  """
744
1884
  Compute FORCAST Readability Formula.
745
1885
 
746
- FORCAST (FORmula for CASTing readability) was developed by the U.S. military
747
- to assess readability without counting syllables. It uses only the count of
748
- single-syllable words as its metric, making it fast and simple.
1886
+ This function uses native chunked analysis to capture variance and patterns
1887
+ across the text, which is essential for stylometric fingerprinting.
749
1888
 
750
- Related GitHub Issue:
1889
+ Related GitHub Issues:
751
1890
  #16 - Additional Readability Formulas
752
- https://github.com/craigtrim/pystylometry/issues/16
1891
+ #27 - Native chunked analysis with Distribution dataclass
753
1892
 
754
1893
  Formula:
755
1894
  Grade Level = 20 - (N / 10)
756
-
757
1895
  Where N is the number of single-syllable words in a 150-word sample.
758
1896
 
759
- The formula is optimized for technical and military documents and works
760
- best with standardized 150-word samples.
761
-
762
- Advantages:
763
- - Extremely simple (only counts single-syllable words)
764
- - No sentence segmentation required
765
- - Fast computation
766
- - Developed specifically for military/technical texts
767
-
768
- Disadvantages:
769
- - Less well-known and validated than other formulas
770
- - Requires exactly 150-word samples
771
- - Single dimension (doesn't consider sentence length)
772
- - May not generalize well beyond military context
773
-
774
1897
  Args:
775
- text: Input text to analyze. Works best with 150-word samples.
776
- Shorter texts are padded or scored proportionally.
777
- Longer texts use first 150 words or multiple samples.
1898
+ text: Input text to analyze
1899
+ chunk_size: Number of words per chunk (default: 1000)
778
1900
 
779
1901
  Returns:
780
- FORCASTResult containing:
781
- - forcast_score: The FORCAST readability score
782
- - grade_level: Corresponding U.S. grade level (integer)
783
- - single_syllable_ratio: Single-syllable words / total words
784
- - single_syllable_count: Count of single-syllable words
785
- - total_words: Total word count analyzed
786
- - metadata: Sample details, calculation specifics, etc.
1902
+ FORCASTResult with score, grade_level, distributions, and metadata
787
1903
 
788
1904
  Example:
789
- >>> result = compute_forcast("Military technical document...")
790
- >>> print(f"FORCAST score: {result.forcast_score:.2f}")
791
- FORCAST score: 9.7
792
- >>> print(f"Grade level: {result.grade_level}")
793
- Grade level: 10
794
- >>> print(f"Single-syllable ratio: {result.single_syllable_ratio:.3f}")
795
- Single-syllable ratio: 0.687
796
-
797
- Note:
798
- - Syllable counting required (but only to identify 1-syllable words)
799
- - Recommended sample size is 150 words
800
- - Multiple samples can be averaged for longer texts
801
- - Simpler than most readability formulas
802
- - Grade levels typically range from 5-12
1905
+ >>> result = compute_forcast("Long text here...", chunk_size=1000)
1906
+ >>> result.forcast_score # Mean across chunks
1907
+ 9.7
803
1908
  """
804
- # Tokenize
805
- tokens = tokenize(text)
806
- word_tokens = normalize_for_readability(tokens)
807
-
808
- if len(word_tokens) == 0:
1909
+ chunks = chunk_text(text, chunk_size)
1910
+ score_values = []
1911
+ grade_values = []
1912
+ ratio_values = []
1913
+ total_single = 0
1914
+ total_words = 0
1915
+
1916
+ for chunk in chunks:
1917
+ sc, gr, single_cnt, single_rat, meta = _compute_forcast_single(chunk)
1918
+ if not math.isnan(sc):
1919
+ score_values.append(sc)
1920
+ grade_values.append(gr)
1921
+ ratio_values.append(single_rat)
1922
+ total_single += single_cnt
1923
+ total_words += meta.get("word_count", 0)
1924
+
1925
+ if not score_values:
1926
+ empty_dist = Distribution(
1927
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
1928
+ )
809
1929
  return FORCASTResult(
810
1930
  forcast_score=float("nan"),
811
- grade_level=0,
1931
+ grade_level=float("nan"),
812
1932
  single_syllable_ratio=float("nan"),
813
1933
  single_syllable_count=0,
814
1934
  total_words=0,
815
- metadata={"sample_size": 0, "scaled_n": float("nan")},
1935
+ forcast_score_dist=empty_dist,
1936
+ grade_level_dist=empty_dist,
1937
+ single_syllable_ratio_dist=empty_dist,
1938
+ chunk_size=chunk_size,
1939
+ chunk_count=len(chunks),
1940
+ metadata={"sample_size": 0, "scaled_n": 0.0, "reliable": False},
816
1941
  )
817
1942
 
818
- # Use first 150 words for sample (or entire text if < 150 words)
819
- sample_size = min(150, len(word_tokens))
820
- sample_tokens = word_tokens[:sample_size]
821
-
822
- # Count single-syllable words in sample
823
- single_syllable_count = 0
824
- for word in sample_tokens:
825
- if count_syllables(word) == 1:
826
- single_syllable_count += 1
1943
+ score_dist = make_distribution(score_values)
1944
+ grade_dist = make_distribution(grade_values)
1945
+ ratio_dist = make_distribution(ratio_values)
827
1946
 
828
- # Scale N to 150-word basis if sample < 150
829
- if sample_size < 150:
830
- scaled_n = single_syllable_count * (150 / sample_size)
831
- else:
832
- scaled_n = single_syllable_count
1947
+ # Calculate overall sample_size and scaled_n for metadata
1948
+ overall_sample_size = min(150, total_words)
1949
+ overall_scaled_n = (
1950
+ total_single * (150 / overall_sample_size)
1951
+ if overall_sample_size < 150
1952
+ else float(total_single)
1953
+ )
833
1954
 
834
- # Calculate grade level: 20 - (N / 10)
835
- forcast_score = 20 - (scaled_n / 10)
836
- grade_level = round(forcast_score)
1955
+ return FORCASTResult(
1956
+ forcast_score=score_dist.mean,
1957
+ grade_level=grade_dist.mean,
1958
+ single_syllable_ratio=ratio_dist.mean,
1959
+ single_syllable_count=total_single,
1960
+ total_words=total_words,
1961
+ forcast_score_dist=score_dist,
1962
+ grade_level_dist=grade_dist,
1963
+ single_syllable_ratio_dist=ratio_dist,
1964
+ chunk_size=chunk_size,
1965
+ chunk_count=len(chunks),
1966
+ metadata={
1967
+ "sample_size": overall_sample_size,
1968
+ "scaled_n": overall_scaled_n,
1969
+ "reliable": total_words >= 100,
1970
+ },
1971
+ )
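A worked pass through the FORCAST arithmetic as implemented in `_compute_forcast_single`, using the single-syllable count implied by the 1.0.0 docstring example (103 of 150 sample words) plus a hypothetical short chunk to show the 150-word scaling:

    # Full 150-word sample with 103 single-syllable words
    20 - (103 / 10)              # 9.7 -> grade level round(9.7) == 10
    # Short 90-word chunk with 60 single-syllable words, scaled to a 150-word basis
    scaled_n = 60 * (150 / 90)   # 100.0
    20 - (scaled_n / 10)         # 10.0 -> grade level 10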
837
1972
 
838
- # Ensure grade level is in reasonable range (0-20)
839
- grade_level = max(0, min(20, grade_level))
840
1973
 
841
- # Calculate single syllable ratio (for the sample)
842
- single_syllable_ratio = single_syllable_count / sample_size
1974
+ def _compute_psk_single(text: str) -> tuple[float, float, float, float, int, dict]:
1975
+ """Compute PSK for a single chunk."""
1976
+ sentences = split_sentences(text)
1977
+ tokens = tokenize(text)
1978
+ word_tokens = normalize_for_readability(tokens)
843
1979
 
844
- # Build metadata
845
- metadata = {
846
- "sample_size": sample_size,
847
- "scaled_n": scaled_n,
848
- "total_words_in_text": len(word_tokens),
849
- }
1980
+ if len(sentences) == 0 or len(word_tokens) == 0:
1981
+ return (
1982
+ float("nan"),
1983
+ float("nan"),
1984
+ float("nan"),
1985
+ float("nan"),
1986
+ 0,
1987
+ {"sentence_count": 0, "word_count": 0},
1988
+ )
850
1989
 
851
- return FORCASTResult(
852
- forcast_score=forcast_score,
853
- grade_level=grade_level,
854
- single_syllable_ratio=single_syllable_ratio,
855
- single_syllable_count=single_syllable_count,
856
- total_words=sample_size,
857
- metadata=metadata,
1990
+ total_syllables = sum(count_syllables(w) for w in word_tokens)
1991
+ avg_sentence_length = len(word_tokens) / len(sentences)
1992
+ avg_syllables_per_word = total_syllables / len(word_tokens)
1993
+ psk_score = 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
1994
+ grade_level = round(psk_score, 1)
1995
+
1996
+ return (
1997
+ psk_score,
1998
+ grade_level,
1999
+ avg_sentence_length,
2000
+ avg_syllables_per_word,
2001
+ total_syllables,
2002
+ {"sentence_count": len(sentences), "word_count": len(word_tokens)},
858
2003
  )
859
2004
 
860
2005
 
861
- def compute_powers_sumner_kearl(text: str) -> PowersSumnerKearlResult:
2006
+ def compute_powers_sumner_kearl(text: str, chunk_size: int = 1000) -> PowersSumnerKearlResult:
862
2007
  """
863
2008
  Compute Powers-Sumner-Kearl Readability Formula.
864
2009
 
865
- The Powers-Sumner-Kearl (PSK) formula is a recalibration of the Flesch
866
- Reading Ease formula, optimized for primary grade levels (grades 1-4).
867
- It uses the same inputs (sentence length, syllables per word) but with
868
- different coefficients.
2010
+ This function uses native chunked analysis to capture variance and patterns
2011
+ across the text, which is essential for stylometric fingerprinting.
869
2012
 
870
- Related GitHub Issue:
2013
+ Related GitHub Issues:
871
2014
  #16 - Additional Readability Formulas
872
- https://github.com/craigtrim/pystylometry/issues/16
2015
+ #27 - Native chunked analysis with Distribution dataclass
873
2016
 
874
2017
  Formula:
875
2018
  Grade Level = 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
876
2019
 
877
- The formula was derived from analysis of primary-grade texts and provides
878
- more accurate grade-level estimates for beginning readers than the original
879
- Flesch formula.
880
-
881
- Advantages:
882
- - Optimized for primary grades (1-4)
883
- - More accurate than Flesch for young readers
884
- - Uses same inputs as Flesch (easy to compare)
885
- - Well-validated on educational materials
886
-
887
- Disadvantages:
888
- - Less accurate for higher grade levels
889
- - Less well-known than Flesch
890
- - Limited range (not suitable for college-level texts)
891
- - Requires syllable counting
892
-
893
2020
  Args:
894
- text: Input text to analyze. Optimized for children's literature
895
- and primary-grade educational materials. Empty text returns
896
- NaN values.
2021
+ text: Input text to analyze
2022
+ chunk_size: Number of words per chunk (default: 1000)
897
2023
 
898
2024
  Returns:
899
- PowersSumnerKearlResult containing:
900
- - psk_score: The Powers-Sumner-Kearl score
901
- - grade_level: Corresponding grade (decimal, e.g., 2.5 = mid-2nd grade)
902
- - avg_sentence_length: Average words per sentence
903
- - avg_syllables_per_word: Average syllables per word
904
- - total_sentences: Total sentence count
905
- - total_words: Total word count
906
- - total_syllables: Total syllable count
907
- - metadata: Comparison to Flesch, calculation details, etc.
2025
+ PowersSumnerKearlResult with score, grade_level, distributions, and metadata
908
2026
 
909
2027
  Example:
910
- >>> result = compute_powers_sumner_kearl("Children's book text...")
911
- >>> print(f"PSK score: {result.psk_score:.2f}")
912
- PSK score: 2.3
913
- >>> print(f"Grade level: {result.grade_level:.1f}")
914
- Grade level: 2.3
915
- >>> print(f"Avg sentence length: {result.avg_sentence_length:.1f}")
916
- Avg sentence length: 8.5
917
-
918
- Note:
919
- - Most accurate for grades 1-4
920
- - Can produce negative scores for very simple texts
921
- - Grade level is continuous (can be decimal)
922
- - Syllable counting required (same as Flesch)
923
- - Compare to Flesch results for validation
2028
+ >>> result = compute_powers_sumner_kearl("Long text here...", chunk_size=1000)
2029
+ >>> result.psk_score # Mean across chunks
2030
+ 2.3
924
2031
  """
925
- # Tokenize and segment
926
- sentences = split_sentences(text)
927
- tokens = tokenize(text)
928
- word_tokens = normalize_for_readability(tokens)
929
-
930
- if len(sentences) == 0 or len(word_tokens) == 0:
2032
+ chunks = chunk_text(text, chunk_size)
2033
+ score_values = []
2034
+ grade_values = []
2035
+ sent_len_values = []
2036
+ syl_per_word_values = []
2037
+ total_sentences = 0
2038
+ total_words = 0
2039
+ total_syllables = 0
2040
+
2041
+ for chunk in chunks:
2042
+ sc, gr, sent_len, syl_word, syls, meta = _compute_psk_single(chunk)
2043
+ if not math.isnan(sc):
2044
+ score_values.append(sc)
2045
+ grade_values.append(gr)
2046
+ sent_len_values.append(sent_len)
2047
+ syl_per_word_values.append(syl_word)
2048
+ total_sentences += meta.get("sentence_count", 0)
2049
+ total_words += meta.get("word_count", 0)
2050
+ total_syllables += syls
2051
+
2052
+ if not score_values:
2053
+ empty_dist = Distribution(
2054
+ values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
2055
+ )
931
2056
  return PowersSumnerKearlResult(
932
2057
  psk_score=float("nan"),
933
2058
  grade_level=float("nan"),
@@ -936,50 +2061,50 @@ def compute_powers_sumner_kearl(text: str) -> PowersSumnerKearlResult:
936
2061
  total_sentences=0,
937
2062
  total_words=0,
938
2063
  total_syllables=0,
2064
+ psk_score_dist=empty_dist,
2065
+ grade_level_dist=empty_dist,
2066
+ avg_sentence_length_dist=empty_dist,
2067
+ avg_syllables_per_word_dist=empty_dist,
2068
+ chunk_size=chunk_size,
2069
+ chunk_count=len(chunks),
939
2070
  metadata={
940
2071
  "flesch_reading_ease": float("nan"),
941
2072
  "flesch_kincaid_grade": float("nan"),
2073
+ "difference_from_flesch": float("nan"),
2074
+ "reliable": False,
942
2075
  },
943
2076
  )
944
2077
 
945
- # Count syllables
946
- total_syllables = sum(count_syllables(word) for word in word_tokens)
947
-
948
- # Calculate metrics
949
- avg_sentence_length = len(word_tokens) / len(sentences)
950
- avg_syllables_per_word = total_syllables / len(word_tokens)
2078
+ score_dist = make_distribution(score_values)
2079
+ grade_dist = make_distribution(grade_values)
2080
+ sent_len_dist = make_distribution(sent_len_values)
2081
+ syl_word_dist = make_distribution(syl_per_word_values)
951
2082
 
952
- # Apply Powers-Sumner-Kearl formula
953
- # Grade = 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
954
- psk_score = (
955
- 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
956
- )
957
- grade_level = round(psk_score, 1) # Round to 1 decimal place
958
-
959
- # Optional: Calculate Flesch scores for comparison
960
- flesch_reading_ease = (
961
- 206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables_per_word
962
- )
963
- flesch_kincaid_grade = (
964
- 0.39 * avg_sentence_length + 11.8 * avg_syllables_per_word - 15.59
965
- )
966
-
967
- # Build metadata
968
- metadata = {
969
- "flesch_reading_ease": flesch_reading_ease,
970
- "flesch_kincaid_grade": flesch_kincaid_grade,
971
- "difference_from_flesch": psk_score - flesch_kincaid_grade,
972
- "words_per_sentence": avg_sentence_length,
973
- "syllables_per_word": avg_syllables_per_word,
974
- }
2083
+ # Compute Flesch metrics for comparison (using the same avg values)
2084
+ # Flesch Reading Ease: 206.835 - 1.015 * ASL - 84.6 * ASW
2085
+ # Flesch-Kincaid Grade: 0.39 * ASL + 11.8 * ASW - 15.59
2086
+ flesch_reading_ease = 206.835 - 1.015 * sent_len_dist.mean - 84.6 * syl_word_dist.mean
2087
+ flesch_kincaid_grade = 0.39 * sent_len_dist.mean + 11.8 * syl_word_dist.mean - 15.59
2088
+ difference_from_flesch = grade_dist.mean - flesch_kincaid_grade
975
2089
 
976
2090
  return PowersSumnerKearlResult(
977
- psk_score=psk_score,
978
- grade_level=grade_level,
979
- avg_sentence_length=avg_sentence_length,
980
- avg_syllables_per_word=avg_syllables_per_word,
981
- total_sentences=len(sentences),
982
- total_words=len(word_tokens),
2091
+ psk_score=score_dist.mean,
2092
+ grade_level=grade_dist.mean,
2093
+ avg_sentence_length=sent_len_dist.mean,
2094
+ avg_syllables_per_word=syl_word_dist.mean,
2095
+ total_sentences=total_sentences,
2096
+ total_words=total_words,
983
2097
  total_syllables=total_syllables,
984
- metadata=metadata,
2098
+ psk_score_dist=score_dist,
2099
+ grade_level_dist=grade_dist,
2100
+ avg_sentence_length_dist=sent_len_dist,
2101
+ avg_syllables_per_word_dist=syl_word_dist,
2102
+ chunk_size=chunk_size,
2103
+ chunk_count=len(chunks),
2104
+ metadata={
2105
+ "flesch_reading_ease": flesch_reading_ease,
2106
+ "flesch_kincaid_grade": flesch_kincaid_grade,
2107
+ "difference_from_flesch": difference_from_flesch,
2108
+ "reliable": total_words >= 100,
2109
+ },
985
2110
  )
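The metadata block recomputes Flesch figures from the same chunk means so the PSK grade can be compared directly. With hypothetical means of 15 words per sentence and 1.5 syllables per word, the comparison values would be:

    asl, asw = 15.0, 1.5                                       # hypothetical chunk means
    flesch_reading_ease = 206.835 - 1.015 * asl - 84.6 * asw   # 64.71
    flesch_kincaid_grade = 0.39 * asl + 11.8 * asw - 15.59     # 7.96
    # difference_from_flesch is then the PSK grade mean minus flesch_kincaid_grade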