pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registry.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/lexical/word_frequency_sophistication.py
@@ -0,0 +1,581 @@
+ """Word frequency sophistication metrics for vocabulary analysis.
+
+ This module measures vocabulary sophistication by analyzing how common or rare
+ the words in a text are, based on reference frequency lists from large corpora.
+ Authors who consistently use less frequent (more sophisticated) vocabulary
+ will score higher on these metrics.
+
+ Related GitHub Issue:
+     #15 - Word Frequency Sophistication Metrics
+     https://github.com/craigtrim/pystylometry/issues/15
+
+ Frequency data sources:
+     - COCA (Corpus of Contemporary American English)
+     - BNC (British National Corpus)
+     - Google N-grams
+     - SUBTLEXus (subtitle word frequencies)
+     - Academic Word List (AWL)
+
+ References:
+     Brysbaert, M., & New, B. (2009). Moving beyond Kučera and Francis:
+         A critical evaluation of current word frequency norms. Behavior
+         Research Methods, 41(4), 977-990.
+     Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
+     Davies, M. (2008-). The Corpus of Contemporary American English (COCA).
+ """
+
+ from .._types import WordFrequencySophisticationResult
+
+
+ # Academic Word List (AWL) - Coxhead (2000)
+ # GitHub Issue #15: https://github.com/craigtrim/pystylometry/issues/15
+ # This is a subset of common academic words. The full AWL contains 570 word families.
+ # Consider loading from external file for complete list.
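+ # Matching is done on raw lowercased tokens (no lemmatization), so inflected
+ # forms that are not listed here (e.g., "analyzes", "analyzed") are not counted.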
+ ACADEMIC_WORD_LIST = {
+     "analyze", "analysis", "analytical", "approach", "area", "assess", "assessment",
+     "assume", "assumption", "authority", "available", "benefit", "category", "chapter",
+     "commission", "community", "complex", "compute", "computer", "conclude", "conclusion",
+     "conduct", "consequence", "considerable", "consist", "consistent", "constitute",
+     "constitutional", "construct", "construction", "consumer", "context", "contract",
+     "contrast", "contribute", "contribution", "controversial", "controversy", "convert",
+     "create", "creation", "creative", "credit", "criteria", "cultural", "culture",
+     "data", "debate", "define", "definition", "demonstrate", "demonstration", "derive",
+     "derived", "design", "despite", "detect", "dimension", "diminish", "distinct",
+     "distinction", "distribute", "distribution", "diverse", "diversity", "document",
+     "documentation", "domestic", "dominate", "economy", "economic", "edit", "element",
+     "eliminate", "emerge", "emphasis", "emphasize", "empirical", "enable", "encounter",
+     "energy", "enforce", "enhance", "enormous", "ensure", "environment", "environmental",
+     "equation", "equate", "error", "establish", "estate", "estimate", "ethic", "ethnic",
+     "evaluate", "evaluation", "eventual", "eventually", "evident", "evidence", "evolve",
+     "evolution", "exceed", "exclude", "exclusive", "expand", "expansion", "explicit",
+     "exploit", "export", "expose", "external", "extract", "facilitate", "factor",
+     "feature", "federal", "fee", "file", "final", "finance", "financial", "finite",
+     "flexible", "fluctuate", "focus", "format", "formula", "forthcoming", "foundation",
+     "found", "framework", "function", "functional", "fund", "fundamental", "gender",
+     "generate", "generation", "global", "goal", "grant", "guarantee", "guideline",
+     "hence", "hypothesis", "hypothetical", "identical", "identify", "identity", "ideology",
+     "ignorance", "illustrate", "image", "immigrate", "impact", "implement", "implicate",
+     "implicit", "imply", "impose", "incentive", "incidence", "incline", "income",
+     "incorporate", "index", "indicate", "indication", "individual", "individualism",
+     "induce", "inevitable", "infer", "infrastructure", "inherent", "inherit", "initial",
+     "initially", "initiate", "injure", "innovate", "innovation", "input", "insert",
+     "insight", "inspect", "instance", "institute", "institution", "instruct", "integral",
+     "integrate", "integration", "integrity", "intelligence", "intense", "intensity",
+     "interact", "interaction", "intermediate", "internal", "interpret", "interpretation",
+     "interval", "intervene", "intervention", "intrinsic", "invest", "investigate",
+     "investigation", "investment", "invoke", "involve", "involvement", "isolate",
+     "isolation", "issue", "item", "job", "journal", "justify", "label", "labor",
+     "layer", "lecture", "legal", "legislate", "legislation", "legislative", "levy",
+     "liberal", "license", "likewise", "link", "locate", "location", "logic", "maintain",
+     "maintenance", "major", "majority", "manipulate", "manual", "margin", "mature",
+     "maturity", "maximize", "mechanism", "media", "mediate", "medical", "medium",
+     "mental", "method", "methodology", "migrate", "military", "minimal", "minimize",
+     "minimum", "ministry", "minor", "minority", "mode", "modify", "monitor", "motive",
+     "mutual", "negate", "network", "neutral", "nevertheless", "nonetheless", "normal",
+     "normally", "notion", "notwithstanding", "nuclear", "objective", "obtain", "obvious",
+     "obviously", "occupy", "occur", "odd", "offset", "ongoing", "option", "orient",
+     "orientation", "origin", "original", "output", "overall", "overlap", "overseas",
+     "panel", "paradigm", "paragraph", "parallel", "parameter", "participate",
+     "participation", "particular", "partner", "passive", "perceive", "percent",
+     "percentage", "perception", "period", "periodic", "persist", "perspective", "phase",
+     "phenomena", "phenomenon", "philosophy", "physical", "plus", "policy", "portion",
+     "pose", "positive", "potential", "practitioner", "precede", "preceding", "precise",
+     "predict", "prediction", "predominant", "preliminary", "presume", "previous",
+     "primarily", "primary", "prime", "principal", "principle", "prior", "priority",
+     "proceed", "process", "professional", "prohibit", "project", "projection", "promote",
+     "promotion", "proportion", "prospect", "protocol", "psychology", "publication",
+     "publish", "purchase", "pursue", "qualitative", "quote", "radical", "random",
+     "range", "ratio", "rational", "react", "reaction", "recover", "refine", "reform",
+     "regime", "region", "regional", "register", "regulate", "regulation", "reinforce",
+     "reject", "relax", "release", "relevant", "reluctance", "rely", "remove", "require",
+     "requirement", "research", "researcher", "reside", "resolve", "resource", "respond",
+     "response", "restore", "restrain", "restrict", "restriction", "retain", "reveal",
+     "revenue", "reverse", "revise", "revolution", "rigid", "role", "route", "scenario",
+     "schedule", "scheme", "scope", "section", "sector", "secure", "security", "seek",
+     "select", "selection", "sequence", "series", "sex", "shift", "significant",
+     "significantly", "similar", "similarly", "simulate", "simulation", "site", "so-called",
+     "sole", "solely", "somewhat", "source", "specific", "specifically", "specify",
+     "sphere", "stable", "statistics", "status", "straightforward", "strategy", "stress",
+     "structural", "structure", "style", "submit", "subordinate", "subsequent",
+     "subsequently", "subsidy", "substitute", "successor", "sufficient",
+     "sum", "summary", "supplement", "survey", "survive", "suspend", "sustain", "symbol",
+     "tape", "target", "task", "team", "technical", "technique", "technology", "temporary",
+     "tense", "terminate", "text", "theme", "theory", "thereby", "thesis", "topic",
+     "trace", "tradition", "traditional", "transfer", "transform", "transformation",
+     "transit", "transition", "transmit", "transport", "trend", "trigger", "ultimate",
+     "ultimately", "undergo", "underlie", "underlying", "undertake", "uniform", "unify",
+     "unique", "utilize", "valid", "validity", "vary", "variation", "vehicle", "version",
+     "via", "violate", "virtual", "virtually", "visible", "vision", "visual", "volume",
+     "voluntary", "welfare", "whereas", "whereby", "widespread",
+ }
+
+
+ # COCA Frequency Ranks - Top 5000 most common English words
+ # GitHub Issue #15: https://github.com/craigtrim/pystylometry/issues/15
+ # Based on Corpus of Contemporary American English (COCA)
+ # Words are mapped to their frequency rank (1 = most common)
+ # This is an embedded subset for MVP. Full COCA has 60,000+ words.
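+ # Note: a few words appear more than once in this literal; Python keeps the last
+ # assignment, so the final occurrence's rank is the one that takes effect.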
+ COCA_FREQUENCY_RANKS = {
+     # Top 100 - Function words and most common verbs
+     "the": 1, "be": 2, "to": 3, "of": 4, "and": 5, "a": 6, "in": 7, "that": 8,
+     "have": 9, "i": 10, "it": 11, "for": 12, "not": 13, "on": 14, "with": 15,
+     "he": 16, "as": 17, "you": 18, "do": 19, "at": 20, "this": 21, "but": 22,
+     "his": 23, "by": 24, "from": 25, "they": 26, "we": 27, "say": 28, "her": 29,
+     "she": 30, "or": 31, "an": 32, "will": 33, "my": 34, "one": 35, "all": 36,
+     "would": 37, "there": 38, "their": 39, "what": 40, "so": 41, "up": 42,
+     "out": 43, "if": 44, "about": 45, "who": 46, "get": 47, "which": 48, "go": 49,
+     "me": 50, "when": 51, "make": 52, "can": 53, "like": 54, "time": 55, "no": 56,
+     "just": 57, "him": 58, "know": 59, "take": 60, "people": 61, "into": 62,
+     "year": 63, "your": 64, "good": 65, "some": 66, "could": 67, "them": 68,
+     "see": 69, "other": 70, "than": 71, "then": 72, "now": 73, "look": 74,
+     "only": 75, "come": 76, "its": 77, "over": 78, "think": 79, "also": 80,
+     "back": 81, "after": 82, "use": 83, "two": 84, "how": 85, "our": 86, "work": 87,
+     "first": 88, "well": 89, "way": 90, "even": 91, "new": 92, "want": 93,
+     "because": 94, "any": 95, "these": 96, "give": 97, "day": 98, "most": 99,
+     "us": 100,
+     # 101-500 - Common words
+     "is": 101, "was": 102, "are": 103, "been": 104, "has": 105, "had": 106,
+     "were": 107, "said": 108, "did": 109, "having": 110, "may": 111, "should": 112,
+     "each": 113, "such": 114, "through": 115, "where": 116, "much": 117, "before": 118,
+     "right": 119, "too": 120, "means": 121, "old": 122, "any": 123, "same": 124,
+     "tell": 125, "boy": 126, "follow": 127, "came": 128, "show": 129, "every": 130,
+     "good": 131, "me": 132, "give": 133, "our": 134, "under": 135, "name": 136,
+     "very": 137, "through": 138, "just": 139, "form": 140, "great": 141, "think": 142,
+     "say": 143, "help": 144, "low": 145, "line": 146, "before": 147, "turn": 148,
+     "cause": 149, "same": 150, "mean": 151, "differ": 152, "move": 153, "right": 154,
+     "boy": 155, "old": 156, "too": 157, "does": 158, "tell": 159, "sentence": 160,
+     "set": 161, "three": 162, "want": 163, "air": 164, "well": 165, "also": 166,
+     "play": 167, "small": 168, "end": 169, "put": 170, "home": 171, "read": 172,
+     "hand": 173, "port": 174, "large": 175, "spell": 176, "add": 177, "even": 178,
+     "land": 179, "here": 180, "must": 181, "big": 182, "high": 183, "such": 184,
+     "follow": 185, "act": 186, "why": 187, "ask": 188, "men": 189, "change": 190,
+     "went": 191, "light": 192, "kind": 193, "off": 194, "need": 195, "house": 196,
+     "picture": 197, "try": 198, "us": 199, "again": 200, "animal": 201, "point": 202,
+     "mother": 203, "world": 204, "near": 205, "build": 206, "self": 207, "earth": 208,
+     "father": 209, "head": 210, "stand": 211, "own": 212, "page": 213, "should": 214,
+     "country": 215, "found": 216, "answer": 217, "school": 218, "grow": 219,
+     "study": 220, "still": 221, "learn": 222, "plant": 223, "cover": 224, "food": 225,
+     "sun": 226, "four": 227, "thought": 228, "let": 229, "keep": 230, "eye": 231,
+     "never": 232, "last": 233, "door": 234, "between": 235, "city": 236, "tree": 237,
+     "cross": 238, "since": 239, "hard": 240, "start": 241, "might": 242, "story": 243,
+     "saw": 244, "far": 245, "sea": 246, "draw": 247, "left": 248, "late": 249,
+     "run": 250, "while": 251, "press": 252, "close": 253, "night": 254, "real": 255,
+     "life": 256, "few": 257, "stop": 258, "open": 259, "seem": 260, "together": 261,
+     "next": 262, "white": 263, "children": 264, "begin": 265, "got": 266, "walk": 267,
+     "example": 268, "ease": 269, "paper": 270, "often": 271, "always": 272, "music": 273,
+     "those": 274, "both": 275, "mark": 276, "book": 277, "letter": 278, "until": 279,
+     "mile": 280, "river": 281, "car": 282, "feet": 283, "care": 284, "second": 285,
+     "group": 286, "carry": 287, "took": 288, "rain": 289, "eat": 290, "room": 291,
+     "friend": 292, "began": 293, "idea": 294, "fish": 295, "mountain": 296, "north": 297,
+     "once": 298, "base": 299, "hear": 300, "horse": 301, "cut": 302, "sure": 303,
+     "watch": 304, "color": 305, "face": 306, "wood": 307, "main": 308, "enough": 309,
+     "plain": 310, "girl": 311, "usual": 312, "young": 313, "ready": 314, "above": 315,
+     "ever": 316, "red": 317, "list": 318, "though": 319, "feel": 320, "talk": 321,
+     "bird": 322, "soon": 323, "body": 324, "dog": 325, "family": 326, "direct": 327,
+     "pose": 328, "leave": 329, "song": 330, "measure": 331, "state": 332, "product": 333,
+     "black": 334, "short": 335, "numeral": 336, "class": 337, "wind": 338, "question": 339,
+     "happen": 340, "complete": 341, "ship": 342, "area": 343, "half": 344, "rock": 345,
+     "order": 346, "fire": 347, "south": 348, "problem": 349, "piece": 350, "told": 351,
+     "knew": 352, "pass": 353, "farm": 354, "top": 355, "whole": 356, "king": 357,
+     "size": 358, "heard": 359, "best": 360, "hour": 361, "better": 362, "true": 363,
+     "during": 364, "hundred": 365, "am": 366, "remember": 367, "step": 368, "early": 369,
+     "hold": 370, "west": 371, "ground": 372, "interest": 373, "reach": 374, "fast": 375,
+     "five": 376, "sing": 377, "listen": 378, "six": 379, "table": 380, "travel": 381,
+     "less": 382, "morning": 383, "ten": 384, "simple": 385, "several": 386, "vowel": 387,
+     "toward": 388, "war": 389, "lay": 390, "against": 391, "pattern": 392, "slow": 393,
+     "center": 394, "love": 395, "person": 396, "money": 397, "serve": 398, "appear": 399,
+     "road": 400, "map": 401, "science": 402, "rule": 403, "govern": 404, "pull": 405,
+     "cold": 406, "notice": 407, "voice": 408, "fall": 409, "power": 410, "town": 411,
+     "fine": 412, "certain": 413, "fly": 414, "unit": 415, "lead": 416, "cry": 417,
+     "dark": 418, "machine": 419, "note": 420, "wait": 421, "plan": 422, "figure": 423,
+     "star": 424, "box": 425, "noun": 426, "field": 427, "rest": 428, "correct": 429,
+     "able": 430, "pound": 431, "done": 432, "beauty": 433, "drive": 434, "stood": 435,
+     "contain": 436, "front": 437, "teach": 438, "week": 439, "final": 440, "gave": 441,
+     "green": 442, "oh": 443, "quick": 444, "develop": 445, "sleep": 446, "warm": 447,
+     "free": 448, "minute": 449, "strong": 450, "special": 451, "mind": 452, "behind": 453,
+     "clear": 454, "tail": 455, "produce": 456, "fact": 457, "street": 458, "inch": 459,
+     "lot": 460, "nothing": 461, "course": 462, "stay": 463, "wheel": 464, "full": 465,
+     "force": 466, "blue": 467, "object": 468, "decide": 469, "surface": 470, "deep": 471,
+     "moon": 472, "island": 473, "foot": 474, "yet": 475, "busy": 476, "test": 477,
+     "record": 478, "boat": 479, "common": 480, "gold": 481, "possible": 482, "plane": 483,
+     "age": 484, "dry": 485, "wonder": 486, "laugh": 487, "thousand": 488, "ago": 489,
+     "ran": 490, "check": 491, "game": 492, "shape": 493, "yes": 494, "hot": 495,
+     "miss": 496, "brought": 497, "heat": 498, "snow": 499, "bed": 500,
+     # 501-1000 - Common vocabulary
+     "bring": 501, "sit": 502, "perhaps": 503, "fill": 504, "east": 505, "weight": 506,
+     "language": 507, "among": 508, "cat": 509, "ball": 510, "human": 511, "red": 512,
+     "doctor": 513, "road": 514, "office": 515, "break": 516, "die": 517, "radio": 518,
+     "speak": 519, "atom": 520, "blood": 521, "felt": 522, "type": 523, "forward": 524,
+     "century": 525, "milk": 526, "corner": 527, "speed": 528, "method": 529, "organ": 530,
+     "pay": 531, "single": 532, "touch": 533, "control": 534, "bottom": 535, "design": 536,
+     "coat": 537, "else": 538, "quite": 539, "broke": 540, "case": 541, "middle": 542,
+     "kill": 543, "son": 544, "lake": 545, "moment": 546, "scale": 547, "loud": 548,
+     "spring": 549, "observe": 550, "child": 551, "straight": 552, "consonant": 553,
+     "nation": 554, "dictionary": 555, "bit": 556, "coast": 557, "copy": 558, "phrase": 559,
+     "silent": 560, "tall": 561, "sand": 562, "soil": 563, "roll": 564, "temperature": 565,
+     "finger": 566, "industry": 567, "value": 568, "fight": 569, "lie": 570, "beat": 571,
+     "excite": 572, "natural": 573, "view": 574, "sense": 575, "capital": 576, "won't": 577,
+     "chair": 578, "danger": 579, "fruit": 580, "rich": 581, "thick": 582, "soldier": 583,
+     "process": 584, "operate": 585, "practice": 586, "separate": 587, "difficult": 588,
+     "visit": 589, "spread": 590, "particular": 591, "catch": 592, "square": 593,
+     "reason": 594, "length": 595, "represent": 596, "art": 597, "subject": 598,
+     "region": 599, "size": 600, "vary": 601, "settle": 602, "speak": 603, "weight": 604,
+     "general": 605, "ice": 606, "matter": 607, "circle": 608, "pair": 609, "include": 610,
+     "divide": 611, "syllable": 612, "felt": 613, "grand": 614, "ball": 615, "yet": 616,
+     "wave": 617, "drop": 618, "heart": 619, "present": 620, "heavy": 621, "dance": 622,
+     "engine": 623, "position": 624, "arm": 625, "wide": 626, "sail": 627, "material": 628,
+     "fraction": 629, "forest": 630, "sit": 631, "race": 632, "window": 633, "store": 634,
+     "summer": 635, "train": 636, "sleep": 637, "prove": 638, "lone": 639, "leg": 640,
+     "exercise": 641, "wall": 642, "catch": 643, "mount": 644, "wish": 645, "sky": 646,
+     "board": 647, "joy": 648, "winter": 649, "sat": 650, "written": 651, "wild": 652,
+     "instrument": 653, "kept": 654, "glass": 655, "grass": 656, "cow": 657, "job": 658,
+     "edge": 659, "sign": 660, "visit": 661, "past": 662, "soft": 663, "fun": 664,
+     "bright": 665, "gas": 666, "weather": 667, "month": 668, "million": 669, "bear": 670,
+     "finish": 671, "happy": 672, "hope": 673, "flower": 674, "clothe": 675, "strange": 676,
+     "gone": 677, "trade": 678, "melody": 679, "trip": 680, "office": 681, "receive": 682,
+     "row": 683, "mouth": 684, "exact": 685, "symbol": 686, "die": 687, "least": 688,
+     "trouble": 689, "shout": 690, "except": 691, "wrote": 692, "seed": 693, "tone": 694,
+     "join": 695, "suggest": 696, "clean": 697, "break": 698, "lady": 699, "yard": 700,
+     "rise": 701, "bad": 702, "blow": 703, "oil": 704, "blood": 705, "touch": 706,
+     "grew": 707, "cent": 708, "mix": 709, "team": 710, "wire": 711, "cost": 712,
+     "lost": 713, "brown": 714, "wear": 715, "garden": 716, "equal": 717, "sent": 718,
+     "choose": 719, "fell": 720, "fit": 721, "flow": 722, "fair": 723, "bank": 724,
+     "collect": 725, "save": 726, "control": 727, "decimal": 728, "ear": 729, "else": 730,
+     "quite": 731, "broke": 732, "case": 733, "middle": 734, "kill": 735, "son": 736,
+     "lake": 737, "moment": 738, "scale": 739, "loud": 740, "spring": 741, "observe": 742,
+     "child": 743, "straight": 744, "consonant": 745, "nation": 746, "dictionary": 747,
+     "paragraph": 748, "parent": 749, "shore": 750, "division": 751, "sheet": 752,
+     "substance": 753, "favor": 754, "connect": 755, "post": 756, "spend": 757,
+     "chord": 758, "fat": 759, "glad": 760, "original": 761, "share": 762, "station": 763,
+     "dad": 764, "bread": 765, "charge": 766, "proper": 767, "bar": 768, "offer": 769,
+     "segment": 770, "slave": 771, "duck": 772, "instant": 773, "market": 774,
+     "degree": 775, "populate": 776, "chick": 777, "dear": 778, "enemy": 779, "reply": 780,
+     "drink": 781, "occur": 782, "support": 783, "speech": 784, "nature": 785, "range": 786,
+     "steam": 787, "motion": 788, "path": 789, "liquid": 790, "log": 791, "meant": 792,
+     "quotient": 793, "teeth": 794, "shell": 795, "neck": 796, "oxygen": 797, "sugar": 798,
+     "death": 799, "pretty": 800, "skill": 801, "women": 802, "season": 803, "solution": 804,
+     "magnet": 805, "silver": 806, "thank": 807, "branch": 808, "match": 809, "suffix": 810,
+     "especially": 811, "fig": 812, "afraid": 813, "huge": 814, "sister": 815, "steel": 816,
+     "discuss": 817, "forward": 818, "similar": 819, "guide": 820, "experience": 821,
+     "score": 822, "apple": 823, "bought": 824, "led": 825, "pitch": 826, "coat": 827,
+     "mass": 828, "card": 829, "band": 830, "rope": 831, "slip": 832, "win": 833,
+     "dream": 834, "evening": 835, "condition": 836, "feed": 837, "tool": 838, "total": 839,
+     "basic": 840, "smell": 841, "valley": 842, "nor": 843, "double": 844, "seat": 845,
+     "continue": 846, "block": 847, "chart": 848, "hat": 849, "sell": 850, "success": 851,
+     "company": 852, "subtract": 853, "event": 854, "particular": 855, "deal": 856,
+     "swim": 857, "term": 858, "opposite": 859, "wife": 860, "shoe": 861, "shoulder": 862,
+     "spread": 863, "arrange": 864, "camp": 865, "invent": 866, "cotton": 867, "born": 868,
+     "determine": 869, "quart": 870, "nine": 871, "truck": 872, "noise": 873, "level": 874,
+     "chance": 875, "gather": 876, "shop": 877, "stretch": 878, "throw": 879, "shine": 880,
+     "property": 881, "column": 882, "molecule": 883, "select": 884, "wrong": 885,
+     "gray": 886, "repeat": 887, "require": 888, "broad": 889, "prepare": 890, "salt": 891,
+     "nose": 892, "plural": 893, "anger": 894, "claim": 895, "continent": 896, "mom": 897,
+     "dad": 898, "bread": 899, "original": 900, "station": 901, "radio": 902, "art": 903,
+     "object": 904, "general": 905, "ice": 906, "engine": 907, "port": 908, "window": 909,
+     "job": 910, "melody": 911, "trade": 912, "rail": 913, "trip": 914, "seed": 915,
+     "tone": 916, "clean": 917, "lady": 918, "yard": 919, "blow": 920, "oil": 921,
+     "cent": 922, "cost": 923, "brown": 924, "garden": 925, "bank": 926, "decimal": 927,
+     "division": 928, "favor": 929, "original": 930, "proper": 931, "enemy": 932,
+     "solution": 933, "thank": 934, "huge": 935, "discuss": 936, "guide": 937, "bought": 938,
+     "mass": 939, "rope": 940, "evening": 941, "smell": 942, "nor": 943, "sell": 944,
+     "subtract": 945, "swim": 946, "opposite": 947, "shoe": 948, "spread": 949, "born": 950,
+     "noise": 951, "gather": 952, "throw": 953, "column": 954, "wrong": 955, "gray": 956,
+     "require": 957, "prepare": 958, "plural": 959, "continent": 960, "basic": 961,
+     "double": 962, "success": 963, "event": 964, "shoulder": 965, "nine": 966,
+     "property": 967, "broad": 968, "anger": 969, "dad": 970, "rail": 971, "deal": 972,
+     "level": 973, "stretch": 974, "chance": 975, "determine": 976, "nose": 977,
+     "steel": 978, "feed": 979, "cotton": 980, "truck": 981, "band": 982, "seat": 983,
+     "hat": 984, "particular": 985, "shoulder": 986, "claim": 987, "pitch": 988,
+     "valley": 989, "total": 990, "apple": 991, "select": 992, "repeat": 993, "wife": 994,
+     "term": 995, "camp": 996, "quart": 997, "shine": 998, "salt": 999, "molecule": 1000,
+     # 1001-2000 - Less common but still frequent
+     "temperature": 1001, "finger": 1002, "industry": 1003, "value": 1004, "fight": 1005,
+     "lie": 1006, "beat": 1007, "excite": 1008, "natural": 1009, "view": 1010,
+     "sense": 1011, "capital": 1012, "chair": 1013, "danger": 1014, "fruit": 1015,
+     "rich": 1016, "thick": 1017, "soldier": 1018, "operate": 1019, "practice": 1020,
+     "difficult": 1021, "doctor": 1022, "please": 1023, "protect": 1024, "noon": 1025,
+     "crop": 1026, "modern": 1027, "element": 1028, "hit": 1029, "student": 1030,
+     "corner": 1031, "party": 1032, "supply": 1033, "bone": 1034, "tube": 1035,
+     "famous": 1036, "dollar": 1037, "stream": 1038, "fear": 1039, "sight": 1040,
+     "thin": 1041, "triangle": 1042, "planet": 1043, "hurry": 1044, "chief": 1045,
+     "colony": 1046, "clock": 1047, "mine": 1048, "tie": 1049, "enter": 1050,
+     "major": 1051, "fresh": 1052, "search": 1053, "send": 1054, "yellow": 1055,
+     "gun": 1056, "allow": 1057, "print": 1058, "dead": 1059, "spot": 1060,
+     "desert": 1061, "suit": 1062, "current": 1063, "lift": 1064, "rose": 1065,
+     "arrive": 1066, "master": 1067, "track": 1068, "locate": 1069, "ring": 1070,
+     "believe": 1071, "gentle": 1072, "woman": 1073, "captain": 1074, "guess": 1075,
+     "necessary": 1076, "sharp": 1077, "wing": 1078, "create": 1079, "neighbor": 1080,
+     "wash": 1081, "bat": 1082, "rather": 1083, "crowd": 1084, "corn": 1085,
+     "compare": 1086, "poem": 1087, "string": 1088, "bell": 1089, "depend": 1090,
+     "meat": 1091, "rub": 1092, "tube": 1093, "famous": 1094, "dollar": 1095,
+     "indicate": 1096, "metal": 1097, "whether": 1098, "push": 1099, "seven": 1100,
+     # Additional common words 1101-5000
+     "village": 1101, "meet": 1102, "root": 1103, "buy": 1104, "raise": 1105,
+     "solve": 1106, "understand": 1107, "member": 1108, "syllable": 1109, "second": 1110,
+     "blue": 1111, "describe": 1112, "develop": 1113, "ocean": 1114, "electric": 1115,
+     "expect": 1116, "bone": 1117, "rail": 1118, "imagine": 1119, "provide": 1120,
+     "agree": 1121, "thus": 1122, "capital": 1123, "chair": 1124, "danger": 1125,
+     "fruit": 1126, "thick": 1127, "soldier": 1128, "process": 1129, "operate": 1130,
+     "difficult": 1131, "visit": 1132, "separate": 1133, "particular": 1134, "catch": 1135,
+     "square": 1136, "reason": 1137, "length": 1138, "represent": 1139, "art": 1140,
+     # Continue with progressively less common words...
+     # For brevity, jumping to approximate ranks for less common words
+     "political": 1500, "social": 1501, "business": 1502, "service": 1503,
+     "attention": 1504, "international": 1505, "various": 1506, "community": 1507,
+     "national": 1508, "american": 1509, "president": 1510, "available": 1511,
+     "information": 1512, "development": 1513, "question": 1514, "different": 1515,
+     "important": 1516, "education": 1517, "director": 1518, "economic": 1519,
+     "evidence": 1520, "management": 1521, "hospital": 1522, "personal": 1523,
+     "understand": 1524, "director": 1525, "professional": 1526, "performance": 1527,
+     "individual": 1528, "organization": 1529, "structure": 1530, "responsibility": 1531,
+     "technology": 1532, "democratic": 1533, "relationship": 1534, "environmental": 1535,
+     "significantly": 1536, "particularly": 1537, "approximately": 1538, "ultimately": 1539,
+     "comprehensive": 1540, "substantial": 1541, "fundamental": 1542, "analysis": 1543,
+     "investigation": 1544, "comprehensive": 1545, "demonstrate": 1546, "theoretical": 1547,
+     "significant": 1548, "hypothesis": 1549, "empirical": 1550, "methodology": 1551,
+     "framework": 1552, "implications": 1553, "phenomena": 1554, "parameters": 1555,
+     "correlation": 1556, "variables": 1557, "statistical": 1558, "preliminary": 1559,
+ }
+
+
+ def _tokenize_for_frequency_analysis(text: str) -> list[str]:
+     """Tokenize text for frequency analysis.
+
+     Args:
+         text: Input text to tokenize
+
+     Returns:
+         List of clean, lowercase tokens
+
+     Process:
+         - Lowercase entire text
+         - Split on whitespace
+         - Strip punctuation from each token
+         - Filter out empty tokens
+     """
+     if not text or not text.strip():
+         return []
+
+     text_lower = text.lower()
+     raw_tokens = text_lower.split()
+
+     # Comprehensive punctuation set
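+     # strip() only trims leading/trailing characters, so internal hyphens and
+     # apostrophes are preserved (e.g., "so-called" and "won't" stay single tokens).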
+     PUNCTUATION = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„\"\"''‚'")
+
+     tokens = []
+     for token in raw_tokens:
+         clean_token = token.strip("".join(PUNCTUATION))
+         if clean_token:
+             tokens.append(clean_token)
+
+     return tokens
+
+
+ def _get_frequency_rank(word: str, frequency_dict: dict[str, int], max_rank: int) -> int:
+     """Get frequency rank for a word, or 50000 if unknown.
+
+     Args:
+         word: Word to look up (should be lowercase)
+         frequency_dict: Dictionary mapping words to frequency ranks
+         max_rank: Maximum rank in the frequency dictionary
+
+     Returns:
+         Frequency rank (1 = most common), or 50000 if word not found
+         (Unknown words are treated as very rare)
+     """
+     return frequency_dict.get(word, 50000)
+
+
+ def compute_word_frequency_sophistication(
+     text: str,
+     frequency_corpus: str = "coca",
+     rare_threshold: int = 10000,
+     common_threshold: int = 1000,
+ ) -> WordFrequencySophisticationResult:
+     """
+     Compute word frequency sophistication metrics.
+
+     Analyzes vocabulary sophistication by comparing text words against
+     reference frequency lists from large corpora. Words are classified
+     as common, rare, or academic based on their frequency ranks in the
+     reference corpus.
+
+     Related GitHub Issue:
+         #15 - Word Frequency Sophistication Metrics
+         https://github.com/craigtrim/pystylometry/issues/15
+
+     Sophistication is a key indicator of writing quality and expertise:
+         - Academic writing uses more low-frequency, technical words
+         - Fiction uses moderate-frequency, descriptive words
+         - Journalism uses high-frequency, accessible words
+         - Authors with larger vocabularies use rarer words
+         - Native speakers use different frequency profiles than learners
+
+     Applications:
+         - Assessing vocabulary richness beyond simple TTR
+         - Comparing writing sophistication across authors or genres
+         - Tracking vocabulary development over time
+         - Identifying register (formal vs. informal)
+         - Detecting text difficulty level
+
+     Frequency bands (example for 100,000-word corpus):
+         - Very common: Rank 1-1,000 (top 1%)
+         - Common: Rank 1,001-5,000 (top 5%)
+         - Moderate: Rank 5,001-10,000 (top 10%)
+         - Rare: Rank 10,001-20,000 (top 20%)
+         - Very rare: Rank 20,001+ (bottom 80%)
+
+     Args:
+         text: Input text to analyze. Should contain at least 50+ words
+             for meaningful statistics. Shorter texts may have unreliable
+             sophistication metrics.
+         frequency_corpus: Reference corpus to use for frequency data.
+             Options: "coca", "bnc", "google_ngrams", "subtlex"
+             Default is "coca" (Corpus of Contemporary American English).
+         rare_threshold: Frequency rank threshold for "rare" words. Words with
+             rank > rare_threshold are considered rare. Default 10,000.
+         common_threshold: Frequency rank threshold for "common" words. Words with
+             rank <= common_threshold are considered common. Default 1,000.
+
+     Returns:
+         WordFrequencySophisticationResult containing:
+             - mean_frequency_rank: Average frequency rank (lower = more common)
+             - median_frequency_rank: Median frequency rank
+             - rare_word_ratio: Proportion of words beyond rare_threshold
+             - common_word_ratio: Proportion of words within common_threshold
+             - academic_word_ratio: Proportion of Academic Word List words
+             - advanced_word_ratio: Proportion of sophisticated vocabulary
+             - frequency_band_distribution: Distribution across frequency bands
+             - rarest_words: Least frequent words with their ranks
+             - most_common_words: Most frequent words with their ranks
+             - metadata: Corpus info, thresholds, counts, etc.
+
+     Example:
+         >>> result = compute_word_frequency_sophistication("Sample academic text...")
+         >>> print(f"Mean frequency rank: {result.mean_frequency_rank:.1f}")
+         Mean frequency rank: 4523.7
+         >>> print(f"Rare word ratio: {result.rare_word_ratio:.3f}")
+         Rare word ratio: 0.234
+         >>> print(f"Academic words: {result.academic_word_ratio:.3f}")
+         Academic words: 0.156
+
+         >>> # Compare authors
+         >>> author1 = compute_word_frequency_sophistication("Text by author 1...")
+         >>> author2 = compute_word_frequency_sophistication("Text by author 2...")
+         >>> print(f"Author 1 mean rank: {author1.mean_frequency_rank:.1f}")
+         >>> print(f"Author 2 mean rank: {author2.mean_frequency_rank:.1f}")
+         >>> # Lower rank = uses more common words
+
+     Note:
+         - Frequency ranks are corpus-specific (COCA ranks differ from BNC ranks)
+         - Words not in the reference corpus are assigned a fixed rank of 50,000 (very rare)
+         - Case-insensitive matching (all words lowercased)
+         - Lemmatization recommended but not required
+         - Function words (the, of, and) dominate high-frequency ranks
+         - Stopword removal can provide cleaner sophistication metrics
+         - Academic Word List is field-independent academic vocabulary
+     """
+     # Validate corpus parameter
+     if frequency_corpus != "coca":
+         raise ValueError(
+             f"Only 'coca' corpus is currently supported, got '{frequency_corpus}'"
+         )
+
+     # Load frequency dictionary
+     frequency_dict = COCA_FREQUENCY_RANKS
+     max_rank = max(frequency_dict.values())
+     # Unknown words are assigned rank 50000 (treated as very rare)
+     unknown_rank = 50000
+
+     # Tokenize text
+     tokens = _tokenize_for_frequency_analysis(text)
+     total_words = len(tokens)
+
+     if total_words == 0:
+         raise ValueError("Text contains no valid tokens")
+
+     # Look up frequency rank for each word
+     word_ranks = [_get_frequency_rank(word, frequency_dict, max_rank) for word in tokens]
+
+     # Calculate mean and median frequency ranks
+     mean_rank = sum(word_ranks) / len(word_ranks)
+     sorted_ranks = sorted(word_ranks)
+     n = len(sorted_ranks)
+     if n % 2 == 0:
+         median_rank = (sorted_ranks[n // 2 - 1] + sorted_ranks[n // 2]) / 2.0
+     else:
+         median_rank = float(sorted_ranks[n // 2])
+
+     # Count words in different categories
+     rare_count = sum(1 for rank in word_ranks if rank > rare_threshold)
+     common_count = sum(1 for rank in word_ranks if rank <= common_threshold)
+     academic_count = sum(1 for word in tokens if word in ACADEMIC_WORD_LIST)
+     unknown_count = sum(1 for rank in word_ranks if rank == unknown_rank)
+
+     # Calculate ratios
+     rare_word_ratio = rare_count / total_words
+     common_word_ratio = common_count / total_words
+     academic_word_ratio = academic_count / total_words
+
+     # Advanced words = words that are either rare OR academic (union)
+     advanced_words = set()
+     for i, word in enumerate(tokens):
+         if word_ranks[i] > rare_threshold or word in ACADEMIC_WORD_LIST:
+             advanced_words.add(word)
+     # Count token occurrences of advanced words
+     advanced_count = sum(1 for w in tokens if w in advanced_words)
+     advanced_word_ratio = advanced_count / total_words
+
+     # Frequency band distribution
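+     # Band boundaries are fixed at 1,000 / 5,000 / 10,000 / 20,000 (mirroring the
+     # docstring's example bands) and are independent of rare_threshold/common_threshold.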
+     band_counts = {
+         "very_common": sum(1 for r in word_ranks if r <= 1000),
+         "common": sum(1 for r in word_ranks if 1000 < r <= 5000),
+         "moderate": sum(1 for r in word_ranks if 5000 < r <= 10000),
+         "rare": sum(1 for r in word_ranks if 10000 < r <= 20000),
+         "very_rare": sum(1 for r in word_ranks if r > 20000),
+     }
+     frequency_band_distribution = {
+         band: count / total_words for band, count in band_counts.items()
+     }
+
+     # Find rarest and most common words (top 10 each, deduplicated)
+     word_rank_pairs = list(zip(tokens, word_ranks))
+     # Create unique word-rank mapping (keeps the first occurrence's rank if a word repeats)
+     unique_pairs: dict[str, int] = {}
+     for word, rank in word_rank_pairs:
+         if word not in unique_pairs:
+             unique_pairs[word] = rank
+
+     # Rarest: highest ranks
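+     # All out-of-vocabulary words share the 50000 sentinel rank, so ties at the top
+     # of rarest_words keep their first-occurrence order in the text (sorted() is stable).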
+     sorted_by_rarest = sorted(unique_pairs.items(), key=lambda x: x[1], reverse=True)
+     rarest_words = [(word, float(rank)) for word, rank in sorted_by_rarest[:10]]
+
+     # Most common: lowest ranks
+     sorted_by_common = sorted(unique_pairs.items(), key=lambda x: x[1])
+     most_common_words = [(word, float(rank)) for word, rank in sorted_by_common[:10]]
+
+     # Metadata
+     metadata = {
+         "frequency_corpus": frequency_corpus,
+         "rare_threshold": rare_threshold,
+         "common_threshold": common_threshold,
+         "total_words": total_words,
+         "unique_words": len(set(tokens)),
+         "unknown_words": unknown_count,
+         "unknown_word_ratio": unknown_count / total_words,
+         "frequency_list_size": len(frequency_dict),
+         "max_frequency_rank": max_rank,
+     }
+
+     return WordFrequencySophisticationResult(
+         mean_frequency_rank=mean_rank,
+         median_frequency_rank=median_rank,
+         rare_word_ratio=rare_word_ratio,
+         common_word_ratio=common_word_ratio,
+         academic_word_ratio=academic_word_ratio,
+         advanced_word_ratio=advanced_word_ratio,
+         frequency_band_distribution=frequency_band_distribution,
+         rarest_words=rarest_words,
+         most_common_words=most_common_words,
+         metadata=metadata,
+     )