pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/lexical/word_frequency_sophistication.py (new file)
@@ -0,0 +1,1805 @@
+ """Word frequency sophistication metrics for vocabulary analysis.
+
+ This module measures vocabulary sophistication by analyzing how common or rare
+ the words in a text are, based on reference frequency lists from large corpora.
+ Authors who consistently use less frequent (more sophisticated) vocabulary
+ will score higher on these metrics.
+
+ Related GitHub Issue:
+     #15 - Word Frequency Sophistication Metrics
+     https://github.com/craigtrim/pystylometry/issues/15
+
+ Frequency data sources:
+     - COCA (Corpus of Contemporary American English)
+     - BNC (British National Corpus)
+     - Google N-grams
+     - SUBTLEXus (subtitle word frequencies)
+     - Academic Word List (AWL)
+
+ References:
+     Brysbaert, M., & New, B. (2009). Moving beyond Kučera and Francis:
+         A critical evaluation of current word frequency norms. Behavior
+         Research Methods, 41(4), 977-990.
+     Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
+     Davies, M. (2008-). The Corpus of Contemporary American English (COCA).
+ """
+
+ from .._types import WordFrequencySophisticationResult, make_distribution
+
+ # Academic Word List (AWL) - Coxhead (2000)
+ # GitHub Issue #15: https://github.com/craigtrim/pystylometry/issues/15
+ # This is a subset of common academic words. The full AWL contains 570 word families.
+ # Consider loading from external file for complete list.
+ ACADEMIC_WORD_LIST = {
+     "analyze", "analysis", "analytical", "approach", "area", "assess",
+     "assessment", "assume", "assumption", "authority", "available", "benefit",
+     "category", "chapter", "commission", "community", "complex", "compute",
+     "computer", "conclude", "conclusion", "conduct", "consequence", "considerable",
+     "consist", "consistent", "constitute", "constitutional", "construct", "construction",
+     "consumer", "context", "contract", "contrast", "contribute", "contribution",
+     "controversial", "controversy", "convert", "create", "creation", "creative",
+     "credit", "criteria", "cultural", "culture", "data", "debate",
+     "define", "definition", "demonstrate", "demonstration", "derive", "derived",
+     "design", "despite", "detect", "dimension", "diminish", "distinct",
+     "distinction", "distribute", "distribution", "diverse", "diversity", "document",
+     "documentation", "domestic", "dominate", "economy", "economic", "edit",
+     "element", "eliminate", "emerge", "emphasis", "emphasize", "empirical",
+     "enable", "encounter", "energy", "enforce", "enhance", "enormous",
+     "ensure", "environment", "environmental", "equation", "equate", "error",
+     "establish", "estate", "estimate", "ethic", "ethnic", "evaluate",
+     "evaluation", "eventual", "eventually", "evident", "evidence", "evolve",
+     "evolution", "exceed", "exclude", "exclusive", "expand", "expansion",
+     "explicit", "exploit", "export", "expose", "external", "extract",
+     "facilitate", "factor", "feature", "federal", "fee", "file",
+     "final", "finance", "financial", "finite", "flexible", "fluctuate",
+     "focus", "format", "formula", "forthcoming", "foundation", "found",
+     "framework", "function", "functional", "fund", "fundamental", "gender",
+     "generate", "generation", "global", "goal", "grant", "guarantee",
+     "guideline", "hence", "hypothesis", "hypothetical", "identical", "identify",
+     "identity", "ideology", "ignorance", "illustrate", "image", "immigrate",
+     "impact", "implement", "implicate", "implicit", "imply", "impose",
+     "incentive", "incidence", "incline", "income", "incorporate", "index",
+     "indicate", "indication", "individual", "individualism", "induce", "inevitable",
+     "infer", "infrastructure", "inherent", "inherit", "initial", "initially",
+     "initiate", "injure", "innovate", "innovation", "input", "insert",
+     "insight", "inspect", "instance", "institute", "institution", "instruct",
+     "integral", "integrate", "integration", "integrity", "intelligence", "intense",
+     "intensity", "interact", "interaction", "intermediate", "internal", "interpret",
+     "interpretation", "interval", "intervene", "intervention", "intrinsic", "invest",
+     "investigate", "investigation", "investment", "invoke", "involve", "involvement",
+     "isolate", "isolation", "issue", "item", "job", "journal",
+     "justify", "label", "labor", "layer", "lecture", "legal",
+     "legislate", "legislation", "legislative", "levy", "liberal", "license",
+     "likewise", "link", "locate", "location", "logic", "maintain",
+     "maintenance", "major", "majority", "manipulate", "manual", "margin",
+     "mature", "maturity", "maximize", "mechanism", "media", "mediate",
+     "medical", "medium", "mental", "method", "methodology", "migrate",
+     "military", "minimal", "minimize", "minimum", "ministry", "minor",
+     "minority", "mode", "modify", "monitor", "motive", "mutual",
+     "negate", "network", "neutral", "nevertheless", "nonetheless", "normal",
+     "normally", "notion", "notwithstanding", "nuclear", "objective", "obtain",
+     "obvious", "obviously", "occupy", "occur", "odd", "offset",
+     "ongoing", "option", "orient", "orientation", "origin", "original",
+     "output", "overall", "overlap", "overseas", "panel", "paradigm",
+     "paragraph", "parallel", "parameter", "participate", "participation", "particular",
+     "partner", "passive", "perceive", "percent", "percentage", "perception",
+     "period", "periodic", "persist", "perspective", "phase", "phenomena",
+     "phenomenon", "philosophy", "physical", "plus", "policy", "portion",
+     "pose", "positive", "potential", "practitioner", "precede", "preceding",
+     "precise", "predict", "prediction", "predominant", "preliminary", "presume",
+     "previous", "primarily", "primary", "prime", "principal", "principle",
+     "prior", "priority", "proceed", "process", "professional", "prohibit",
+     "project", "projection", "promote", "promotion", "proportion", "prospect",
+     "protocol", "psychology", "publication", "publish", "purchase", "pursue",
+     "qualitative", "quote", "radical", "random", "range", "ratio",
+     "rational", "react", "reaction", "recover", "refine", "reform",
+     "regime", "region", "regional", "register", "regulate", "regulation",
+     "reinforce", "reject", "relax", "release", "relevant", "reluctance",
+     "rely", "remove", "require", "requirement", "research", "researcher",
+     "reside", "resolve", "resource", "respond", "response", "restore",
+     "restrain", "restrict", "restriction", "retain", "reveal", "revenue",
+     "reverse", "revise", "revolution", "rigid", "role", "route",
+     "scenario", "schedule", "scheme", "scope", "section", "sector",
+     "secure", "security", "seek", "select", "selection", "sequence",
+     "series", "sex", "shift", "significant", "significantly", "similar",
+     "similarly", "simulate", "simulation", "site", "so-called", "sole",
+     "solely", "somewhat", "source", "specific", "specifically", "specify",
+     "sphere", "stable", "statistics", "status", "straightforward", "strategy",
+     "stress", "structural", "structure", "style", "submit", "subordinate",
+     "subsequent", "subsequently", "subsidy", "substitute", "successor", "sufficient",
+     "sum", "summary", "supplement", "survey", "survive", "suspend",
+     "sustain", "symbol", "tape", "target", "task", "team",
+     "technical", "technique", "technology", "temporary", "tense", "terminate",
+     "text", "theme", "theory", "thereby", "thesis", "topic",
+     "trace", "tradition", "traditional", "transfer", "transform", "transformation",
+     "transit", "transition", "transmit", "transport", "trend", "trigger",
+     "ultimate", "ultimately", "undergo", "underlie", "underlying", "undertake",
+     "uniform", "unify", "unique", "utilize", "valid", "validity",
+     "vary", "variation", "vehicle", "version", "via", "violate",
+     "virtual", "virtually", "visible", "vision", "visual", "volume",
+     "voluntary", "welfare", "whereas", "whereby", "widespread",
+ }
+
+
+ # COCA Frequency Ranks - Top 5000 most common English words
+ # GitHub Issue #15: https://github.com/craigtrim/pystylometry/issues/15
+ # Based on Corpus of Contemporary American English (COCA)
+ # Words are mapped to their frequency rank (1 = most common)
+ # This is an embedded subset for MVP. Full COCA has 60,000+ words.
+ COCA_FREQUENCY_RANKS = {
+     # Top 100 - Function words and most common verbs
+     "the": 1, "be": 2, "to": 3, "of": 4, "and": 5, "a": 6, "in": 7, "that": 8,
+     "have": 9, "i": 10, "it": 11, "for": 12, "not": 13, "on": 14, "with": 15,
+     "he": 16, "as": 17, "you": 18, "do": 19, "at": 20, "this": 21, "but": 22,
+     "his": 23, "by": 24, "from": 25, "they": 26, "we": 27, "say": 28, "her": 29,
+     "she": 30, "or": 31, "an": 32, "will": 33, "my": 34, "one": 35, "all": 36,
+     "would": 37, "there": 38, "their": 39, "what": 40, "so": 41, "up": 42,
+     "out": 43, "if": 44, "about": 45, "who": 46, "get": 47, "which": 48,
+     "go": 49, "me": 50, "when": 51, "make": 52, "can": 53, "like": 54,
+     "time": 55, "no": 56, "just": 57, "him": 58, "know": 59, "take": 60,
+     "people": 61, "into": 62, "year": 63, "your": 64, "good": 65, "some": 66,
+     "could": 67, "them": 68, "see": 69, "other": 70, "than": 71, "then": 72,
+     "now": 73, "look": 74, "only": 75, "come": 76, "its": 77, "over": 78,
+     "think": 79, "also": 80, "back": 81, "after": 82, "use": 83, "two": 84,
+     "how": 85, "our": 86, "work": 87, "first": 88, "well": 89, "way": 90,
+     "even": 91, "new": 92, "want": 93, "because": 94, "any": 95, "these": 96,
+     "give": 97, "day": 98, "most": 99, "us": 100,
+     # 101-500 - Common words
+     "is": 101, "was": 102, "are": 103, "been": 104, "has": 105, "had": 106,
+     "were": 107, "said": 108, "did": 109, "having": 110, "may": 111,
+     "should": 112, "each": 113, "such": 114, "through": 115, "where": 116,
+     "much": 117, "before": 118, "right": 119, "too": 120, "means": 121,
+     "old": 122, "same": 124, "tell": 125, "boy": 126, "follow": 127,
+     "came": 128, "show": 129, "every": 130, "under": 135, "name": 136,
+     "very": 137, "form": 140, "great": 141, "help": 144, "low": 145,
+     "line": 146, "turn": 148, "cause": 149, "mean": 151, "differ": 152,
+     "move": 153, "does": 158, "sentence": 160, "set": 161, "three": 162,
+     "air": 164, "play": 167, "small": 168, "end": 169, "put": 170,
+     "home": 171, "read": 172, "hand": 173, "port": 174, "large": 175,
+     "spell": 176, "add": 177, "land": 179, "here": 180, "must": 181,
+     "big": 182, "high": 183, "act": 186, "why": 187, "ask": 188,
+     "men": 189, "change": 190, "went": 191, "light": 192, "kind": 193,
+     "off": 194, "need": 195, "house": 196, "picture": 197, "try": 198,
+     "again": 200, "animal": 201, "point": 202, "mother": 203, "world": 204,
+     "near": 205, "build": 206, "self": 207, "earth": 208, "father": 209,
+     "head": 210, "stand": 211, "own": 212, "page": 213, "country": 215,
+     "found": 216, "answer": 217, "school": 218, "grow": 219, "study": 220,
+     "still": 221, "learn": 222, "plant": 223, "cover": 224, "food": 225,
+     "sun": 226, "four": 227, "thought": 228, "let": 229, "keep": 230,
+     "eye": 231, "never": 232, "last": 233, "door": 234, "between": 235,
+     "city": 236, "tree": 237, "cross": 238, "since": 239, "hard": 240,
+     "start": 241, "might": 242, "story": 243, "saw": 244, "far": 245,
+     "sea": 246, "draw": 247, "left": 248, "late": 249, "run": 250,
+     "while": 251, "press": 252, "close": 253, "night": 254, "real": 255,
+     "life": 256, "few": 257, "stop": 258, "open": 259, "seem": 260,
+     "together": 261, "next": 262, "white": 263, "children": 264, "begin": 265,
+     "got": 266, "walk": 267, "example": 268, "ease": 269, "paper": 270,
+     "often": 271, "always": 272, "music": 273, "those": 274, "both": 275,
+     "mark": 276, "book": 277, "letter": 278, "until": 279, "mile": 280,
+     "river": 281, "car": 282, "feet": 283, "care": 284, "second": 285,
+     "group": 286, "carry": 287, "took": 288, "rain": 289, "eat": 290,
+     "room": 291, "friend": 292, "began": 293, "idea": 294, "fish": 295,
+     "mountain": 296, "north": 297, "once": 298, "base": 299, "hear": 300,
+     "horse": 301, "cut": 302, "sure": 303, "watch": 304, "color": 305,
+     "face": 306, "wood": 307, "main": 308, "enough": 309, "plain": 310,
+     "girl": 311, "usual": 312, "young": 313, "ready": 314, "above": 315,
+     "ever": 316, "red": 317, "list": 318, "though": 319, "feel": 320,
+     "talk": 321, "bird": 322, "soon": 323, "body": 324, "dog": 325,
+     "family": 326, "direct": 327, "pose": 328, "leave": 329, "song": 330,
+     "measure": 331, "state": 332, "product": 333, "black": 334, "short": 335,
+     "numeral": 336, "class": 337, "wind": 338, "question": 339, "happen": 340,
+     "complete": 341, "ship": 342, "area": 343, "half": 344, "rock": 345,
+     "order": 346, "fire": 347, "south": 348, "problem": 349, "piece": 350,
+     "told": 351, "knew": 352, "pass": 353, "farm": 354, "top": 355,
+     "whole": 356, "king": 357, "size": 358, "heard": 359, "best": 360,
+     "hour": 361, "better": 362, "true": 363, "during": 364, "hundred": 365,
+     "am": 366, "remember": 367, "step": 368, "early": 369, "hold": 370,
+     "west": 371, "ground": 372, "interest": 373, "reach": 374, "fast": 375,
+     "five": 376, "sing": 377, "listen": 378, "six": 379, "table": 380,
+     "travel": 381, "less": 382, "morning": 383, "ten": 384, "simple": 385,
+     "several": 386, "vowel": 387, "toward": 388, "war": 389, "lay": 390,
+     "against": 391, "pattern": 392, "slow": 393, "center": 394, "love": 395,
+     "person": 396, "money": 397, "serve": 398, "appear": 399, "road": 400,
+     "map": 401, "science": 402, "rule": 403, "govern": 404, "pull": 405,
+     "cold": 406, "notice": 407, "voice": 408, "fall": 409, "power": 410,
+     "town": 411, "fine": 412, "certain": 413, "fly": 414, "unit": 415,
+     "lead": 416, "cry": 417, "dark": 418, "machine": 419, "note": 420,
+     "wait": 421, "plan": 422, "figure": 423, "star": 424, "box": 425,
+     "noun": 426, "field": 427, "rest": 428, "correct": 429, "able": 430,
+     "pound": 431, "done": 432, "beauty": 433, "drive": 434, "stood": 435,
+     "contain": 436, "front": 437, "teach": 438, "week": 439, "final": 440,
+     "gave": 441, "green": 442, "oh": 443, "quick": 444, "develop": 445,
+     "sleep": 446, "warm": 447, "free": 448, "minute": 449, "strong": 450,
+     "special": 451, "mind": 452, "behind": 453, "clear": 454, "tail": 455,
+     "produce": 456, "fact": 457, "street": 458, "inch": 459, "lot": 460,
+     "nothing": 461, "course": 462, "stay": 463, "wheel": 464, "full": 465,
+     "force": 466, "blue": 467, "object": 468, "decide": 469, "surface": 470,
+     "deep": 471, "moon": 472, "island": 473, "foot": 474, "yet": 475,
+     "busy": 476, "test": 477, "record": 478, "boat": 479, "common": 480,
+     "gold": 481, "possible": 482, "plane": 483, "age": 484, "dry": 485,
+     "wonder": 486, "laugh": 487, "thousand": 488, "ago": 489, "ran": 490,
+     "check": 491, "game": 492, "shape": 493, "yes": 494, "hot": 495,
+     "miss": 496, "brought": 497, "heat": 498, "snow": 499, "bed": 500,
+     # 501-1000 - Common vocabulary
+     "bring": 501, "sit": 502, "perhaps": 503, "fill": 504, "east": 505,
+     "weight": 506, "language": 507, "among": 508, "cat": 509, "ball": 510,
+     "human": 511, "doctor": 513, "office": 515, "break": 516, "die": 517,
+     "radio": 518, "speak": 519, "atom": 520, "blood": 521, "felt": 522,
+     "type": 523, "forward": 524, "century": 525, "milk": 526, "corner": 527,
+     "speed": 528, "method": 529, "organ": 530, "pay": 531, "single": 532,
+     "touch": 533, "control": 534, "bottom": 535, "design": 536, "coat": 537,
+     "else": 538, "quite": 539, "broke": 540, "case": 541, "middle": 542,
+     "kill": 543, "son": 544, "lake": 545, "moment": 546, "scale": 547,
+     "loud": 548, "spring": 549, "observe": 550, "child": 551, "straight": 552,
+     "consonant": 553, "nation": 554, "dictionary": 555, "bit": 556, "coast": 557,
+     "copy": 558, "phrase": 559, "silent": 560, "tall": 561, "sand": 562,
+     "soil": 563, "roll": 564, "temperature": 565, "finger": 566, "industry": 567,
+     "value": 568, "fight": 569, "lie": 570, "beat": 571, "excite": 572,
+     "natural": 573, "view": 574, "sense": 575, "capital": 576, "chair": 578,
+     "danger": 579, "fruit": 580, "rich": 581, "thick": 582, "soldier": 583,
+     "process": 584, "operate": 585, "practice": 586, "separate": 587,
+     "difficult": 588, "visit": 589, "spread": 590, "particular": 591,
+     "catch": 592, "square": 593, "reason": 594, "length": 595, "represent": 596,
+     "art": 597, "subject": 598, "region": 599, "vary": 601, "settle": 602,
+     "general": 605, "ice": 606, "matter": 607, "circle": 608, "pair": 609,
+     "include": 610, "divide": 611, "syllable": 612, "grand": 614, "wave": 617,
+     "drop": 618, "heart": 619, "present": 620, "heavy": 621, "dance": 622,
+     "engine": 623, "position": 624, "arm": 625, "wide": 626, "sail": 627,
+     "material": 628, "fraction": 629, "forest": 630, "race": 632, "window": 633,
+     "store": 634, "summer": 635, "train": 636, "prove": 638, "lone": 639,
+     "leg": 640, "exercise": 641, "wall": 642, "mount": 644, "wish": 645,
+     "sky": 646, "board": 647, "joy": 648, "winter": 649, "sat": 650,
+     "written": 651, "wild": 652, "instrument": 653, "kept": 654, "glass": 655,
+     "grass": 656, "cow": 657, "job": 658, "edge": 659, "sign": 660,
+     "past": 662, "soft": 663, "fun": 664, "bright": 665, "gas": 666,
+     "weather": 667, "month": 668, "million": 669, "bear": 670, "finish": 671,
+     "happy": 672, "hope": 673, "flower": 674, "clothe": 675, "strange": 676,
+     "gone": 677, "trade": 678, "melody": 679, "trip": 680, "receive": 682,
+     "row": 683, "mouth": 684, "exact": 685, "symbol": 686, "least": 688,
+     "trouble": 689, "shout": 690, "except": 691, "wrote": 692, "seed": 693,
+     "tone": 694, "join": 695, "suggest": 696, "clean": 697, "lady": 699,
+     "yard": 700, "rise": 701, "bad": 702, "blow": 703, "oil": 704,
+     "grew": 707, "cent": 708, "mix": 709, "team": 710, "wire": 711,
+     "cost": 712, "lost": 713, "brown": 714, "wear": 715, "garden": 716,
+     "equal": 717, "sent": 718, "choose": 719, "fell": 720, "fit": 721,
+     "flow": 722, "fair": 723, "bank": 724, "collect": 725, "save": 726,
+     "decimal": 728, "ear": 729, "paragraph": 748, "parent": 749, "shore": 750,
+     "division": 751, "sheet": 752, "substance": 753, "favor": 754,
+     "connect": 755, "post": 756, "spend": 757, "chord": 758, "fat": 759,
+     "glad": 760, "original": 761, "share": 762, "station": 763, "dad": 764,
+     "bread": 765, "charge": 766, "proper": 767, "bar": 768, "offer": 769,
+     "segment": 770, "slave": 771, "duck": 772, "instant": 773, "market": 774,
+     "degree": 775, "populate": 776, "chick": 777, "dear": 778, "enemy": 779,
+     "reply": 780, "drink": 781, "occur": 782, "support": 783, "speech": 784,
+     "nature": 785, "range": 786, "steam": 787, "motion": 788, "path": 789,
+     "liquid": 790, "log": 791, "meant": 792, "quotient": 793, "teeth": 794,
+     "shell": 795, "neck": 796, "oxygen": 797, "sugar": 798, "death": 799,
+     "pretty": 800, "skill": 801, "women": 802, "season": 803, "solution": 804,
+     "magnet": 805, "silver": 806, "thank": 807, "branch": 808, "match": 809,
+     "suffix": 810, "especially": 811, "fig": 812, "afraid": 813, "huge": 814,
+     "sister": 815, "steel": 816, "discuss": 817, "similar": 819, "guide": 820,
+     "experience": 821, "score": 822, "apple": 823, "bought": 824, "led": 825,
+     "pitch": 826, "mass": 828, "card": 829, "band": 830, "rope": 831,
+     "slip": 832, "win": 833, "dream": 834, "evening": 835, "condition": 836,
+     "feed": 837, "tool": 838, "total": 839, "basic": 840, "smell": 841,
+     "valley": 842, "nor": 843, "double": 844, "seat": 845, "continue": 846,
+     "block": 847, "chart": 848, "hat": 849, "sell": 850, "success": 851,
+     "company": 852, "subtract": 853, "event": 854, "deal": 856, "swim": 857,
+     "term": 858, "opposite": 859, "wife": 860, "shoe": 861, "shoulder": 862,
+     "arrange": 864, "camp": 865, "invent": 866, "cotton": 867, "born": 868,
+     "determine": 869, "quart": 870, "nine": 871, "truck": 872, "noise": 873,
+     "level": 874, "chance": 875, "gather": 876, "shop": 877, "stretch": 878,
+     "throw": 879, "shine": 880, "property": 881, "column": 882, "molecule": 883,
+     "select": 884, "wrong": 885, "gray": 886, "repeat": 887, "require": 888,
+     "broad": 889, "prepare": 890, "salt": 891, "nose": 892, "plural": 893,
+     "anger": 894, "claim": 895, "continent": 896, "mom": 897, "rail": 913,
+     "please": 1023, "protect": 1024, "noon": 1025, "crop": 1026,
+     "modern": 1027, "element": 1028, "hit": 1029, "student": 1030,
+     "party": 1032, "supply": 1033, "bone": 1034, "tube": 1035,
+     "famous": 1036, "dollar": 1037, "stream": 1038, "fear": 1039,
+     "sight": 1040, "thin": 1041, "triangle": 1042, "planet": 1043,
+     "hurry": 1044, "chief": 1045, "colony": 1046, "clock": 1047,
+     "mine": 1048, "tie": 1049, "enter": 1050, "major": 1051,
+     "fresh": 1052, "search": 1053, "send": 1054, "yellow": 1055,
+     "gun": 1056, "allow": 1057, "print": 1058, "dead": 1059,
+     "spot": 1060, "desert": 1061, "suit": 1062, "current": 1063,
+     "lift": 1064, "rose": 1065, "arrive": 1066, "master": 1067,
+     "track": 1068, "locate": 1069, "ring": 1070, "believe": 1071,
+     "gentle": 1072, "woman": 1073, "captain": 1074, "guess": 1075,
+     "necessary": 1076, "sharp": 1077, "wing": 1078, "create": 1079,
+     "neighbor": 1080, "wash": 1081, "bat": 1082, "rather": 1083,
+     "crowd": 1084, "corn": 1085, "compare": 1086, "poem": 1087,
+     "string": 1088, "bell": 1089, "depend": 1090, "meat": 1091,
+     "rub": 1092, "indicate": 1096, "metal": 1097, "whether": 1098,
+     "push": 1099, "seven": 1100,
+     # Additional common words 1101-5000
+     "village": 1101, "meet": 1102, "root": 1103, "buy": 1104, "raise": 1105,
+     "solve": 1106, "understand": 1107, "member": 1108, "describe": 1112,
+     "ocean": 1114, "electric": 1115, "expect": 1116, "imagine": 1119,
+     "provide": 1120, "agree": 1121, "thus": 1122,
+     # For brevity, jumping to approximate ranks for less common words
+     "political": 1500, "social": 1501, "business": 1502, "service": 1503,
+     "attention": 1504, "international": 1505, "various": 1506,
+     "community": 1507, "national": 1508, "american": 1509, "president": 1510,
+     "available": 1511, "information": 1512, "development": 1513,
+     "different": 1515, "important": 1516, "education": 1517, "director": 1518,
+     "economic": 1519, "evidence": 1520, "management": 1521, "hospital": 1522,
+     "personal": 1523, "professional": 1526, "performance": 1527,
+     "individual": 1528, "organization": 1529, "structure": 1530,
+     "responsibility": 1531, "technology": 1532, "democratic": 1533,
+     "relationship": 1534, "environmental": 1535, "significantly": 1536,
+     "particularly": 1537, "approximately": 1538, "ultimately": 1539,
+     "comprehensive": 1540, "substantial": 1541, "fundamental": 1542,
+     "analysis": 1543, "investigation": 1544, "demonstrate": 1546,
+     "theoretical": 1547, "significant": 1548, "hypothesis": 1549,
+     "empirical": 1550, "methodology": 1551, "framework": 1552,
+     "implications": 1553, "phenomena": 1554, "parameters": 1555,
+     "correlation": 1556, "variables": 1557, "statistical": 1558,
+     "preliminary": 1559,
+ }
+
+
+ def _tokenize_for_frequency_analysis(text: str) -> list[str]:
+     """Tokenize text for frequency analysis.
+
+     Args:
+         text: Input text to tokenize
+
+     Returns:
+         List of clean, lowercase tokens
+
+     Process:
+         - Lowercase entire text
+         - Split on whitespace
+         - Strip punctuation from each token
+         - Filter out empty tokens
+     """
+     if not text or not text.strip():
+         return []
+
+     text_lower = text.lower()
+     raw_tokens = text_lower.split()
+
+     # Comprehensive punctuation set
+     punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„""''‚'")
+
+     tokens = []
+     for token in raw_tokens:
+         clean_token = token.strip("".join(punctuation_chars))
+         if clean_token:
+             tokens.append(clean_token)
+
+     return tokens
+
+
+ def _get_frequency_rank(word: str, frequency_dict: dict[str, int], max_rank: int) -> int:
+     """Get frequency rank for a word, or 50000 if unknown.
+
+     Args:
+         word: Word to look up (should be lowercase)
+         frequency_dict: Dictionary mapping words to frequency ranks
+         max_rank: Maximum rank in the frequency dictionary
+
+     Returns:
+         Frequency rank (1 = most common), or 50000 if word not found
+         (Unknown words are treated as very rare)
+     """
+     # max_rank is accepted but not currently used; unknown words always
+     # receive the fixed fallback rank of 50000.
+     return frequency_dict.get(word, 50000)
+
+
+ def compute_word_frequency_sophistication(
+     text: str,
+     frequency_corpus: str = "coca",
+     rare_threshold: int = 10000,
+     common_threshold: int = 1000,
+     chunk_size: int = 1000,
+ ) -> WordFrequencySophisticationResult:
+     """
+     Compute word frequency sophistication metrics.
+
+     Analyzes vocabulary sophistication by comparing text words against
+     reference frequency lists from large corpora. Words are classified
+     as common, rare, or academic based on their frequency ranks in the
+     reference corpus.
+
+     Related GitHub Issue:
+         #15 - Word Frequency Sophistication Metrics
+         https://github.com/craigtrim/pystylometry/issues/15
+
+     Sophistication is a key indicator of writing quality and expertise:
+         - Academic writing uses more low-frequency, technical words
+         - Fiction uses moderate-frequency, descriptive words
+         - Journalism uses high-frequency, accessible words
+         - Authors with larger vocabularies use rarer words
+         - Native speakers use different frequency profiles than learners
+
+     Applications:
+         - Assessing vocabulary richness beyond simple TTR
+         - Comparing writing sophistication across authors or genres
+         - Tracking vocabulary development over time
+         - Identifying register (formal vs. informal)
+         - Detecting text difficulty level
+
+     Frequency bands (example for 100,000-word corpus):
+         - Very common: Rank 1-1,000 (top 1%)
+         - Common: Rank 1,001-5,000 (top 5%)
+         - Moderate: Rank 5,001-10,000 (top 10%)
+         - Rare: Rank 10,001-20,000 (top 20%)
+         - Very rare: Rank 20,001+ (bottom 80%)
+
+     Args:
+         text: Input text to analyze. Should contain at least 50+ words
+             for meaningful statistics. Shorter texts may have unreliable
+             sophistication metrics.
+         frequency_corpus: Reference corpus to use for frequency data.
+             Options: "coca", "bnc", "google_ngrams", "subtlex"
+             Default is "coca" (Corpus of Contemporary American English).
+         rare_threshold: Frequency rank threshold for "rare" words. Words with
+             rank > rare_threshold are considered rare. Default 10,000.
+         common_threshold: Frequency rank threshold for "common" words. Words with
+             rank <= common_threshold are considered common. Default 1,000.
+         chunk_size: Recorded in the result for parity with chunked metrics;
+             this analysis always runs as a single pass over the full text.
+             Default 1,000.
+
+     Returns:
+         WordFrequencySophisticationResult containing:
+         - mean_frequency_rank: Average frequency rank (lower = more common)
+         - median_frequency_rank: Median frequency rank
+         - rare_word_ratio: Proportion of words beyond rare_threshold
+         - common_word_ratio: Proportion of words within common_threshold
+         - academic_word_ratio: Proportion of Academic Word List words
+         - advanced_word_ratio: Proportion of sophisticated vocabulary
+         - frequency_band_distribution: Distribution across frequency bands
+         - rarest_words: Least frequent words with their ranks
+         - most_common_words: Most frequent words with their ranks
+         - metadata: Corpus info, thresholds, counts, etc.
+
+     Example:
+         >>> result = compute_word_frequency_sophistication("Sample academic text...")
+         >>> print(f"Mean frequency rank: {result.mean_frequency_rank:.1f}")
+         Mean frequency rank: 4523.7
+         >>> print(f"Rare word ratio: {result.rare_word_ratio:.3f}")
+         Rare word ratio: 0.234
+         >>> print(f"Academic words: {result.academic_word_ratio:.3f}")
+         Academic words: 0.156
+
+         >>> # Compare authors
+         >>> author1 = compute_word_frequency_sophistication("Text by author 1...")
+         >>> author2 = compute_word_frequency_sophistication("Text by author 2...")
+         >>> print(f"Author 1 mean rank: {author1.mean_frequency_rank:.1f}")
+         >>> print(f"Author 2 mean rank: {author2.mean_frequency_rank:.1f}")
+         >>> # Lower rank = uses more common words
+
+     Note:
+         - Frequency ranks are corpus-specific (COCA ranks differ from BNC ranks)
+         - Words not in the reference corpus are assigned a fixed rank of 50,000
+           (treated as very rare)
+         - Case-insensitive matching (all words lowercased)
+         - Lemmatization recommended but not required
+         - Function words (the, of, and) dominate high-frequency ranks
+         - Stopword removal can provide cleaner sophistication metrics
+         - Academic Word List is field-independent academic vocabulary
+     """
+     # Validate corpus parameter
+     if frequency_corpus != "coca":
+         raise ValueError(f"Only 'coca' corpus is currently supported, got '{frequency_corpus}'")
+
+     # Load frequency dictionary
+     frequency_dict = COCA_FREQUENCY_RANKS
+     max_rank = max(frequency_dict.values())
+     # Unknown words are assigned rank 50000 (treated as very rare)
+     unknown_rank = 50000
+
+     # Tokenize text
+     tokens = _tokenize_for_frequency_analysis(text)
+     total_words = len(tokens)
+
+     if total_words == 0:
+         raise ValueError("Text contains no valid tokens")
+
+     # Look up frequency rank for each word
+     word_ranks = [_get_frequency_rank(word, frequency_dict, max_rank) for word in tokens]
+
+     # Calculate mean and median frequency ranks
+     mean_rank = sum(word_ranks) / len(word_ranks)
+     sorted_ranks = sorted(word_ranks)
+     n = len(sorted_ranks)
+     if n % 2 == 0:
+         median_rank = (sorted_ranks[n // 2 - 1] + sorted_ranks[n // 2]) / 2.0
+     else:
+         median_rank = float(sorted_ranks[n // 2])
+
+     # Count words in different categories
+     rare_count = sum(1 for rank in word_ranks if rank > rare_threshold)
+     common_count = sum(1 for rank in word_ranks if rank <= common_threshold)
+     academic_count = sum(1 for word in tokens if word in ACADEMIC_WORD_LIST)
+     unknown_count = sum(1 for rank in word_ranks if rank == unknown_rank)
+
+     # Calculate ratios
+     rare_word_ratio = rare_count / total_words
+     common_word_ratio = common_count / total_words
+     academic_word_ratio = academic_count / total_words
+
+     # Advanced words = words that are either rare OR academic (union)
+     advanced_words = set()
+     for i, word in enumerate(tokens):
+         if word_ranks[i] > rare_threshold or word in ACADEMIC_WORD_LIST:
+             advanced_words.add(word)
+     # Count token occurrences of advanced words
+     advanced_count = sum(1 for w in tokens if w in advanced_words)
+     advanced_word_ratio = advanced_count / total_words
+
+     # Frequency band distribution
+     band_counts = {
+         "very_common": sum(1 for r in word_ranks if r <= 1000),
+         "common": sum(1 for r in word_ranks if 1000 < r <= 5000),
+         "moderate": sum(1 for r in word_ranks if 5000 < r <= 10000),
+         "rare": sum(1 for r in word_ranks if 10000 < r <= 20000),
+         "very_rare": sum(1 for r in word_ranks if r > 20000),
+     }
+     frequency_band_distribution = {band: count / total_words for band, count in band_counts.items()}
+
+     # Find rarest and most common words (top 10 each, deduplicated)
+     word_rank_pairs = list(zip(tokens, word_ranks))
+     # Create unique word-rank mapping (keeps the first occurrence if a word repeats)
+     unique_pairs: dict[str, int] = {}
+     for word, rank in word_rank_pairs:
+         if word not in unique_pairs:
+             unique_pairs[word] = rank
+
+     # Rarest: highest ranks
+     sorted_by_rarest = sorted(unique_pairs.items(), key=lambda x: x[1], reverse=True)
+     rarest_words = [(word, float(rank)) for word, rank in sorted_by_rarest[:10]]
+
+     # Most common: lowest ranks
+     sorted_by_common = sorted(unique_pairs.items(), key=lambda x: x[1])
+     most_common_words = [(word, float(rank)) for word, rank in sorted_by_common[:10]]
+
+     # Create single-value distributions (analysis is done on full text)
+     mean_frequency_rank_dist = make_distribution([mean_rank])
+     median_frequency_rank_dist = make_distribution([median_rank])
+     rare_word_ratio_dist = make_distribution([rare_word_ratio])
+     common_word_ratio_dist = make_distribution([common_word_ratio])
+     academic_word_ratio_dist = make_distribution([academic_word_ratio])
+     advanced_word_ratio_dist = make_distribution([advanced_word_ratio])
+
+     # Metadata
+     metadata = {
+         "frequency_corpus": frequency_corpus,
+         "rare_threshold": rare_threshold,
+         "common_threshold": common_threshold,
+         "total_words": total_words,
+         "unique_words": len(set(tokens)),
+         "unknown_words": unknown_count,
+         "unknown_word_ratio": unknown_count / total_words,
+         "frequency_list_size": len(frequency_dict),
+         "max_frequency_rank": max_rank,
+     }
+
+     return WordFrequencySophisticationResult(
+         mean_frequency_rank=mean_rank,
+         median_frequency_rank=median_rank,
+         rare_word_ratio=rare_word_ratio,
+         common_word_ratio=common_word_ratio,
+         academic_word_ratio=academic_word_ratio,
+         advanced_word_ratio=advanced_word_ratio,
+         frequency_band_distribution=frequency_band_distribution,
+         rarest_words=rarest_words,
+         most_common_words=most_common_words,
+         mean_frequency_rank_dist=mean_frequency_rank_dist,
+         median_frequency_rank_dist=median_frequency_rank_dist,
+         rare_word_ratio_dist=rare_word_ratio_dist,
+         common_word_ratio_dist=common_word_ratio_dist,
+         academic_word_ratio_dist=academic_word_ratio_dist,
+         advanced_word_ratio_dist=advanced_word_ratio_dist,
+         chunk_size=chunk_size,
+         chunk_count=1,  # Single pass analysis
+         metadata=metadata,
+     )
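
The module above is self-contained, so the new API can be exercised directly from the 1.1.0 wheel. A minimal usage sketch (illustrative only: the sample text and the comments' expected behavior are ours, but the import path, function name, parameters, and result fields all appear in the hunk above):

    from pystylometry.lexical.word_frequency_sophistication import (
        compute_word_frequency_sophistication,
    )

    text = (
        "The preliminary investigation demonstrates a significant correlation "
        "between the empirical variables and the underlying theoretical framework."
    )

    result = compute_word_frequency_sophistication(
        text,
        rare_threshold=10000,
        common_threshold=1000,
    )

    print(result.mean_frequency_rank)          # average COCA rank across all tokens
    print(result.academic_word_ratio)          # share of tokens on the embedded AWL subset
    print(result.frequency_band_distribution)  # proportions per band, e.g. "very_common"
    print(result.metadata["unknown_words"])    # tokens absent from the embedded list

Because the embedded COCA subset tops out at rank 1559, any token outside it falls back to the fixed rank of 50000, so mean_frequency_rank and rare_word_ratio climb quickly for texts with out-of-list vocabulary; metadata["unknown_word_ratio"] indicates how much of the score comes from that fallback.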