pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registry.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/lexical/word_frequency_sophistication.py
@@ -0,0 +1,581 @@
+ """Word frequency sophistication metrics for vocabulary analysis.
+
+ This module measures vocabulary sophistication by analyzing how common or rare
+ the words in a text are, based on reference frequency lists from large corpora.
+ Authors who consistently use less frequent (more sophisticated) vocabulary
+ will score higher on these metrics.
+
+ Related GitHub Issue:
+     #15 - Word Frequency Sophistication Metrics
+     https://github.com/craigtrim/pystylometry/issues/15
+
+ Frequency data sources:
+     - COCA (Corpus of Contemporary American English)
+     - BNC (British National Corpus)
+     - Google N-grams
+     - SUBTLEXus (subtitle word frequencies)
+     - Academic Word List (AWL)
+
+ References:
+     Brysbaert, M., & New, B. (2009). Moving beyond Kučera and Francis:
+         A critical evaluation of current word frequency norms. Behavior
+         Research Methods, 41(4), 977-990.
+     Coxhead, A. (2000). A new academic word list. TESOL Quarterly, 34(2), 213-238.
+     Davies, M. (2008-). The Corpus of Contemporary American English (COCA).
+ """
+
+ from .._types import WordFrequencySophisticationResult
+
+
+ # Academic Word List (AWL) - Coxhead (2000)
+ # GitHub Issue #15: https://github.com/craigtrim/pystylometry/issues/15
+ # This is a subset of common academic words. The full AWL contains 570 word families.
+ # Consider loading from external file for complete list.
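+ # Matching is done on raw lowercased tokens (no lemmatization), so inflected
+ # forms that are not listed here (e.g., "analyzes", "analyzed") are not counted.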
+ ACADEMIC_WORD_LIST = {
+     "analyze", "analysis", "analytical", "approach", "area", "assess", "assessment",
+     "assume", "assumption", "authority", "available", "benefit", "category", "chapter",
+     "commission", "community", "complex", "compute", "computer", "conclude", "conclusion",
+     "conduct", "consequence", "considerable", "consist", "consistent", "constitute",
+     "constitutional", "construct", "construction", "consumer", "context", "contract",
+     "contrast", "contribute", "contribution", "controversial", "controversy", "convert",
+     "create", "creation", "creative", "credit", "criteria", "cultural", "culture",
+     "data", "debate", "define", "definition", "demonstrate", "demonstration", "derive",
+     "derived", "design", "despite", "detect", "dimension", "diminish", "distinct",
+     "distinction", "distribute", "distribution", "diverse", "diversity", "document",
+     "documentation", "domestic", "dominate", "economy", "economic", "edit", "element",
+     "eliminate", "emerge", "emphasis", "emphasize", "empirical", "enable", "encounter",
+     "energy", "enforce", "enhance", "enormous", "ensure", "environment", "environmental",
+     "equation", "equate", "error", "establish", "estate", "estimate", "ethic", "ethnic",
+     "evaluate", "evaluation", "eventual", "eventually", "evident", "evidence", "evolve",
+     "evolution", "exceed", "exclude", "exclusive", "expand", "expansion", "explicit",
+     "exploit", "export", "expose", "external", "extract", "facilitate", "factor",
+     "feature", "federal", "fee", "file", "final", "finance", "financial", "finite",
+     "flexible", "fluctuate", "focus", "format", "formula", "forthcoming", "foundation",
+     "found", "framework", "function", "functional", "fund", "fundamental", "gender",
+     "generate", "generation", "global", "goal", "grant", "guarantee", "guideline",
+     "hence", "hypothesis", "hypothetical", "identical", "identify", "identity", "ideology",
+     "ignorance", "illustrate", "image", "immigrate", "impact", "implement", "implicate",
+     "implicit", "imply", "impose", "incentive", "incidence", "incline", "income",
+     "incorporate", "index", "indicate", "indication", "individual", "individualism",
+     "induce", "inevitable", "infer", "infrastructure", "inherent", "inherit", "initial",
+     "initially", "initiate", "injure", "innovate", "innovation", "input", "insert",
+     "insight", "inspect", "instance", "institute", "institution", "instruct", "integral",
+     "integrate", "integration", "integrity", "intelligence", "intense", "intensity",
+     "interact", "interaction", "intermediate", "internal", "interpret", "interpretation",
+     "interval", "intervene", "intervention", "intrinsic", "invest", "investigate",
+     "investigation", "investment", "invoke", "involve", "involvement", "isolate",
+     "isolation", "issue", "item", "job", "journal", "justify", "label", "labor",
+     "layer", "lecture", "legal", "legislate", "legislation", "legislative", "levy",
+     "liberal", "license", "likewise", "link", "locate", "location", "logic", "maintain",
+     "maintenance", "major", "majority", "manipulate", "manual", "margin", "mature",
+     "maturity", "maximize", "mechanism", "media", "mediate", "medical", "medium",
+     "mental", "method", "methodology", "migrate", "military", "minimal", "minimize",
+     "minimum", "ministry", "minor", "minority", "mode", "modify", "monitor", "motive",
+     "mutual", "negate", "network", "neutral", "nevertheless", "nonetheless", "normal",
+     "normally", "notion", "notwithstanding", "nuclear", "objective", "obtain", "obvious",
+     "obviously", "occupy", "occur", "odd", "offset", "ongoing", "option", "orient",
+     "orientation", "origin", "original", "output", "overall", "overlap", "overseas",
+     "panel", "paradigm", "paragraph", "parallel", "parameter", "participate",
+     "participation", "particular", "partner", "passive", "perceive", "percent",
+     "percentage", "perception", "period", "periodic", "persist", "perspective", "phase",
+     "phenomena", "phenomenon", "philosophy", "physical", "plus", "policy", "portion",
+     "pose", "positive", "potential", "practitioner", "precede", "preceding", "precise",
+     "predict", "prediction", "predominant", "preliminary", "presume", "previous",
+     "primarily", "primary", "prime", "principal", "principle", "prior", "priority",
+     "proceed", "process", "professional", "prohibit", "project", "projection", "promote",
+     "promotion", "proportion", "prospect", "protocol", "psychology", "publication",
+     "publish", "purchase", "pursue", "qualitative", "quote", "radical", "random",
+     "range", "ratio", "rational", "react", "reaction", "recover", "refine", "reform",
+     "regime", "region", "regional", "register", "regulate", "regulation", "reinforce",
+     "reject", "relax", "release", "relevant", "reluctance", "rely", "remove", "require",
+     "requirement", "research", "researcher", "reside", "resolve", "resource", "respond",
+     "response", "restore", "restrain", "restrict", "restriction", "retain", "reveal",
+     "revenue", "reverse", "revise", "revolution", "rigid", "role", "route", "scenario",
+     "schedule", "scheme", "scope", "section", "sector", "secure", "security", "seek",
+     "select", "selection", "sequence", "series", "sex", "shift", "significant",
+     "significantly", "similar", "similarly", "simulate", "simulation", "site", "so-called",
+     "sole", "solely", "somewhat", "source", "specific", "specifically", "specify",
+     "sphere", "stable", "statistics", "status", "straightforward", "strategy", "stress",
+     "structural", "structure", "style", "submit", "subordinate", "subsequent",
+     "subsequently", "subsidy", "substitute", "successor", "sufficient",
+     "sum", "summary", "supplement", "survey", "survive", "suspend", "sustain", "symbol",
+     "tape", "target", "task", "team", "technical", "technique", "technology", "temporary",
+     "tense", "terminate", "text", "theme", "theory", "thereby", "thesis", "topic",
+     "trace", "tradition", "traditional", "transfer", "transform", "transformation",
+     "transit", "transition", "transmit", "transport", "trend", "trigger", "ultimate",
+     "ultimately", "undergo", "underlie", "underlying", "undertake", "uniform", "unify",
+     "unique", "utilize", "valid", "validity", "vary", "variation", "vehicle", "version",
+     "via", "violate", "virtual", "virtually", "visible", "vision", "visual", "volume",
+     "voluntary", "welfare", "whereas", "whereby", "widespread",
+ }
+
+
+ # COCA Frequency Ranks - Top 5000 most common English words
+ # GitHub Issue #15: https://github.com/craigtrim/pystylometry/issues/15
+ # Based on Corpus of Contemporary American English (COCA)
+ # Words are mapped to their frequency rank (1 = most common)
+ # This is an embedded subset for MVP. Full COCA has 60,000+ words.
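+ # Note: a few words appear more than once in this literal; Python keeps the last
+ # assignment, so the final occurrence's rank is the one that takes effect.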
+ COCA_FREQUENCY_RANKS = {
+     # Top 100 - Function words and most common verbs
+     "the": 1, "be": 2, "to": 3, "of": 4, "and": 5, "a": 6, "in": 7, "that": 8,
+     "have": 9, "i": 10, "it": 11, "for": 12, "not": 13, "on": 14, "with": 15,
+     "he": 16, "as": 17, "you": 18, "do": 19, "at": 20, "this": 21, "but": 22,
+     "his": 23, "by": 24, "from": 25, "they": 26, "we": 27, "say": 28, "her": 29,
+     "she": 30, "or": 31, "an": 32, "will": 33, "my": 34, "one": 35, "all": 36,
+     "would": 37, "there": 38, "their": 39, "what": 40, "so": 41, "up": 42,
+     "out": 43, "if": 44, "about": 45, "who": 46, "get": 47, "which": 48, "go": 49,
+     "me": 50, "when": 51, "make": 52, "can": 53, "like": 54, "time": 55, "no": 56,
+     "just": 57, "him": 58, "know": 59, "take": 60, "people": 61, "into": 62,
+     "year": 63, "your": 64, "good": 65, "some": 66, "could": 67, "them": 68,
+     "see": 69, "other": 70, "than": 71, "then": 72, "now": 73, "look": 74,
+     "only": 75, "come": 76, "its": 77, "over": 78, "think": 79, "also": 80,
+     "back": 81, "after": 82, "use": 83, "two": 84, "how": 85, "our": 86, "work": 87,
+     "first": 88, "well": 89, "way": 90, "even": 91, "new": 92, "want": 93,
+     "because": 94, "any": 95, "these": 96, "give": 97, "day": 98, "most": 99,
+     "us": 100,
+     # 101-500 - Common words
+     "is": 101, "was": 102, "are": 103, "been": 104, "has": 105, "had": 106,
+     "were": 107, "said": 108, "did": 109, "having": 110, "may": 111, "should": 112,
+     "each": 113, "such": 114, "through": 115, "where": 116, "much": 117, "before": 118,
+     "right": 119, "too": 120, "means": 121, "old": 122, "any": 123, "same": 124,
+     "tell": 125, "boy": 126, "follow": 127, "came": 128, "show": 129, "every": 130,
+     "good": 131, "me": 132, "give": 133, "our": 134, "under": 135, "name": 136,
+     "very": 137, "through": 138, "just": 139, "form": 140, "great": 141, "think": 142,
+     "say": 143, "help": 144, "low": 145, "line": 146, "before": 147, "turn": 148,
+     "cause": 149, "same": 150, "mean": 151, "differ": 152, "move": 153, "right": 154,
+     "boy": 155, "old": 156, "too": 157, "does": 158, "tell": 159, "sentence": 160,
+     "set": 161, "three": 162, "want": 163, "air": 164, "well": 165, "also": 166,
+     "play": 167, "small": 168, "end": 169, "put": 170, "home": 171, "read": 172,
+     "hand": 173, "port": 174, "large": 175, "spell": 176, "add": 177, "even": 178,
+     "land": 179, "here": 180, "must": 181, "big": 182, "high": 183, "such": 184,
+     "follow": 185, "act": 186, "why": 187, "ask": 188, "men": 189, "change": 190,
+     "went": 191, "light": 192, "kind": 193, "off": 194, "need": 195, "house": 196,
+     "picture": 197, "try": 198, "us": 199, "again": 200, "animal": 201, "point": 202,
+     "mother": 203, "world": 204, "near": 205, "build": 206, "self": 207, "earth": 208,
+     "father": 209, "head": 210, "stand": 211, "own": 212, "page": 213, "should": 214,
+     "country": 215, "found": 216, "answer": 217, "school": 218, "grow": 219,
+     "study": 220, "still": 221, "learn": 222, "plant": 223, "cover": 224, "food": 225,
+     "sun": 226, "four": 227, "thought": 228, "let": 229, "keep": 230, "eye": 231,
+     "never": 232, "last": 233, "door": 234, "between": 235, "city": 236, "tree": 237,
+     "cross": 238, "since": 239, "hard": 240, "start": 241, "might": 242, "story": 243,
+     "saw": 244, "far": 245, "sea": 246, "draw": 247, "left": 248, "late": 249,
+     "run": 250, "while": 251, "press": 252, "close": 253, "night": 254, "real": 255,
+     "life": 256, "few": 257, "stop": 258, "open": 259, "seem": 260, "together": 261,
+     "next": 262, "white": 263, "children": 264, "begin": 265, "got": 266, "walk": 267,
+     "example": 268, "ease": 269, "paper": 270, "often": 271, "always": 272, "music": 273,
+     "those": 274, "both": 275, "mark": 276, "book": 277, "letter": 278, "until": 279,
+     "mile": 280, "river": 281, "car": 282, "feet": 283, "care": 284, "second": 285,
+     "group": 286, "carry": 287, "took": 288, "rain": 289, "eat": 290, "room": 291,
+     "friend": 292, "began": 293, "idea": 294, "fish": 295, "mountain": 296, "north": 297,
+     "once": 298, "base": 299, "hear": 300, "horse": 301, "cut": 302, "sure": 303,
+     "watch": 304, "color": 305, "face": 306, "wood": 307, "main": 308, "enough": 309,
+     "plain": 310, "girl": 311, "usual": 312, "young": 313, "ready": 314, "above": 315,
+     "ever": 316, "red": 317, "list": 318, "though": 319, "feel": 320, "talk": 321,
+     "bird": 322, "soon": 323, "body": 324, "dog": 325, "family": 326, "direct": 327,
+     "pose": 328, "leave": 329, "song": 330, "measure": 331, "state": 332, "product": 333,
+     "black": 334, "short": 335, "numeral": 336, "class": 337, "wind": 338, "question": 339,
+     "happen": 340, "complete": 341, "ship": 342, "area": 343, "half": 344, "rock": 345,
+     "order": 346, "fire": 347, "south": 348, "problem": 349, "piece": 350, "told": 351,
+     "knew": 352, "pass": 353, "farm": 354, "top": 355, "whole": 356, "king": 357,
+     "size": 358, "heard": 359, "best": 360, "hour": 361, "better": 362, "true": 363,
+     "during": 364, "hundred": 365, "am": 366, "remember": 367, "step": 368, "early": 369,
+     "hold": 370, "west": 371, "ground": 372, "interest": 373, "reach": 374, "fast": 375,
+     "five": 376, "sing": 377, "listen": 378, "six": 379, "table": 380, "travel": 381,
+     "less": 382, "morning": 383, "ten": 384, "simple": 385, "several": 386, "vowel": 387,
+     "toward": 388, "war": 389, "lay": 390, "against": 391, "pattern": 392, "slow": 393,
+     "center": 394, "love": 395, "person": 396, "money": 397, "serve": 398, "appear": 399,
+     "road": 400, "map": 401, "science": 402, "rule": 403, "govern": 404, "pull": 405,
+     "cold": 406, "notice": 407, "voice": 408, "fall": 409, "power": 410, "town": 411,
+     "fine": 412, "certain": 413, "fly": 414, "unit": 415, "lead": 416, "cry": 417,
+     "dark": 418, "machine": 419, "note": 420, "wait": 421, "plan": 422, "figure": 423,
+     "star": 424, "box": 425, "noun": 426, "field": 427, "rest": 428, "correct": 429,
+     "able": 430, "pound": 431, "done": 432, "beauty": 433, "drive": 434, "stood": 435,
+     "contain": 436, "front": 437, "teach": 438, "week": 439, "final": 440, "gave": 441,
+     "green": 442, "oh": 443, "quick": 444, "develop": 445, "sleep": 446, "warm": 447,
+     "free": 448, "minute": 449, "strong": 450, "special": 451, "mind": 452, "behind": 453,
+     "clear": 454, "tail": 455, "produce": 456, "fact": 457, "street": 458, "inch": 459,
+     "lot": 460, "nothing": 461, "course": 462, "stay": 463, "wheel": 464, "full": 465,
+     "force": 466, "blue": 467, "object": 468, "decide": 469, "surface": 470, "deep": 471,
+     "moon": 472, "island": 473, "foot": 474, "yet": 475, "busy": 476, "test": 477,
+     "record": 478, "boat": 479, "common": 480, "gold": 481, "possible": 482, "plane": 483,
+     "age": 484, "dry": 485, "wonder": 486, "laugh": 487, "thousand": 488, "ago": 489,
+     "ran": 490, "check": 491, "game": 492, "shape": 493, "yes": 494, "hot": 495,
+     "miss": 496, "brought": 497, "heat": 498, "snow": 499, "bed": 500,
+     # 501-1000 - Common vocabulary
+     "bring": 501, "sit": 502, "perhaps": 503, "fill": 504, "east": 505, "weight": 506,
+     "language": 507, "among": 508, "cat": 509, "ball": 510, "human": 511, "red": 512,
+     "doctor": 513, "road": 514, "office": 515, "break": 516, "die": 517, "radio": 518,
+     "speak": 519, "atom": 520, "blood": 521, "felt": 522, "type": 523, "forward": 524,
+     "century": 525, "milk": 526, "corner": 527, "speed": 528, "method": 529, "organ": 530,
+     "pay": 531, "single": 532, "touch": 533, "control": 534, "bottom": 535, "design": 536,
+     "coat": 537, "else": 538, "quite": 539, "broke": 540, "case": 541, "middle": 542,
+     "kill": 543, "son": 544, "lake": 545, "moment": 546, "scale": 547, "loud": 548,
+     "spring": 549, "observe": 550, "child": 551, "straight": 552, "consonant": 553,
+     "nation": 554, "dictionary": 555, "bit": 556, "coast": 557, "copy": 558, "phrase": 559,
+     "silent": 560, "tall": 561, "sand": 562, "soil": 563, "roll": 564, "temperature": 565,
+     "finger": 566, "industry": 567, "value": 568, "fight": 569, "lie": 570, "beat": 571,
+     "excite": 572, "natural": 573, "view": 574, "sense": 575, "capital": 576, "won't": 577,
+     "chair": 578, "danger": 579, "fruit": 580, "rich": 581, "thick": 582, "soldier": 583,
+     "process": 584, "operate": 585, "practice": 586, "separate": 587, "difficult": 588,
+     "visit": 589, "spread": 590, "particular": 591, "catch": 592, "square": 593,
+     "reason": 594, "length": 595, "represent": 596, "art": 597, "subject": 598,
+     "region": 599, "size": 600, "vary": 601, "settle": 602, "speak": 603, "weight": 604,
+     "general": 605, "ice": 606, "matter": 607, "circle": 608, "pair": 609, "include": 610,
+     "divide": 611, "syllable": 612, "felt": 613, "grand": 614, "ball": 615, "yet": 616,
+     "wave": 617, "drop": 618, "heart": 619, "present": 620, "heavy": 621, "dance": 622,
+     "engine": 623, "position": 624, "arm": 625, "wide": 626, "sail": 627, "material": 628,
+     "fraction": 629, "forest": 630, "sit": 631, "race": 632, "window": 633, "store": 634,
+     "summer": 635, "train": 636, "sleep": 637, "prove": 638, "lone": 639, "leg": 640,
+     "exercise": 641, "wall": 642, "catch": 643, "mount": 644, "wish": 645, "sky": 646,
+     "board": 647, "joy": 648, "winter": 649, "sat": 650, "written": 651, "wild": 652,
+     "instrument": 653, "kept": 654, "glass": 655, "grass": 656, "cow": 657, "job": 658,
+     "edge": 659, "sign": 660, "visit": 661, "past": 662, "soft": 663, "fun": 664,
+     "bright": 665, "gas": 666, "weather": 667, "month": 668, "million": 669, "bear": 670,
+     "finish": 671, "happy": 672, "hope": 673, "flower": 674, "clothe": 675, "strange": 676,
+     "gone": 677, "trade": 678, "melody": 679, "trip": 680, "office": 681, "receive": 682,
+     "row": 683, "mouth": 684, "exact": 685, "symbol": 686, "die": 687, "least": 688,
+     "trouble": 689, "shout": 690, "except": 691, "wrote": 692, "seed": 693, "tone": 694,
+     "join": 695, "suggest": 696, "clean": 697, "break": 698, "lady": 699, "yard": 700,
+     "rise": 701, "bad": 702, "blow": 703, "oil": 704, "blood": 705, "touch": 706,
+     "grew": 707, "cent": 708, "mix": 709, "team": 710, "wire": 711, "cost": 712,
+     "lost": 713, "brown": 714, "wear": 715, "garden": 716, "equal": 717, "sent": 718,
+     "choose": 719, "fell": 720, "fit": 721, "flow": 722, "fair": 723, "bank": 724,
+     "collect": 725, "save": 726, "control": 727, "decimal": 728, "ear": 729, "else": 730,
+     "quite": 731, "broke": 732, "case": 733, "middle": 734, "kill": 735, "son": 736,
+     "lake": 737, "moment": 738, "scale": 739, "loud": 740, "spring": 741, "observe": 742,
+     "child": 743, "straight": 744, "consonant": 745, "nation": 746, "dictionary": 747,
+     "paragraph": 748, "parent": 749, "shore": 750, "division": 751, "sheet": 752,
+     "substance": 753, "favor": 754, "connect": 755, "post": 756, "spend": 757,
+     "chord": 758, "fat": 759, "glad": 760, "original": 761, "share": 762, "station": 763,
+     "dad": 764, "bread": 765, "charge": 766, "proper": 767, "bar": 768, "offer": 769,
+     "segment": 770, "slave": 771, "duck": 772, "instant": 773, "market": 774,
+     "degree": 775, "populate": 776, "chick": 777, "dear": 778, "enemy": 779, "reply": 780,
+     "drink": 781, "occur": 782, "support": 783, "speech": 784, "nature": 785, "range": 786,
+     "steam": 787, "motion": 788, "path": 789, "liquid": 790, "log": 791, "meant": 792,
+     "quotient": 793, "teeth": 794, "shell": 795, "neck": 796, "oxygen": 797, "sugar": 798,
+     "death": 799, "pretty": 800, "skill": 801, "women": 802, "season": 803, "solution": 804,
+     "magnet": 805, "silver": 806, "thank": 807, "branch": 808, "match": 809, "suffix": 810,
+     "especially": 811, "fig": 812, "afraid": 813, "huge": 814, "sister": 815, "steel": 816,
+     "discuss": 817, "forward": 818, "similar": 819, "guide": 820, "experience": 821,
+     "score": 822, "apple": 823, "bought": 824, "led": 825, "pitch": 826, "coat": 827,
+     "mass": 828, "card": 829, "band": 830, "rope": 831, "slip": 832, "win": 833,
+     "dream": 834, "evening": 835, "condition": 836, "feed": 837, "tool": 838, "total": 839,
+     "basic": 840, "smell": 841, "valley": 842, "nor": 843, "double": 844, "seat": 845,
+     "continue": 846, "block": 847, "chart": 848, "hat": 849, "sell": 850, "success": 851,
+     "company": 852, "subtract": 853, "event": 854, "particular": 855, "deal": 856,
+     "swim": 857, "term": 858, "opposite": 859, "wife": 860, "shoe": 861, "shoulder": 862,
+     "spread": 863, "arrange": 864, "camp": 865, "invent": 866, "cotton": 867, "born": 868,
+     "determine": 869, "quart": 870, "nine": 871, "truck": 872, "noise": 873, "level": 874,
+     "chance": 875, "gather": 876, "shop": 877, "stretch": 878, "throw": 879, "shine": 880,
+     "property": 881, "column": 882, "molecule": 883, "select": 884, "wrong": 885,
+     "gray": 886, "repeat": 887, "require": 888, "broad": 889, "prepare": 890, "salt": 891,
+     "nose": 892, "plural": 893, "anger": 894, "claim": 895, "continent": 896, "mom": 897,
+     "dad": 898, "bread": 899, "original": 900, "station": 901, "radio": 902, "art": 903,
+     "object": 904, "general": 905, "ice": 906, "engine": 907, "port": 908, "window": 909,
+     "job": 910, "melody": 911, "trade": 912, "rail": 913, "trip": 914, "seed": 915,
+     "tone": 916, "clean": 917, "lady": 918, "yard": 919, "blow": 920, "oil": 921,
+     "cent": 922, "cost": 923, "brown": 924, "garden": 925, "bank": 926, "decimal": 927,
+     "division": 928, "favor": 929, "original": 930, "proper": 931, "enemy": 932,
+     "solution": 933, "thank": 934, "huge": 935, "discuss": 936, "guide": 937, "bought": 938,
+     "mass": 939, "rope": 940, "evening": 941, "smell": 942, "nor": 943, "sell": 944,
+     "subtract": 945, "swim": 946, "opposite": 947, "shoe": 948, "spread": 949, "born": 950,
+     "noise": 951, "gather": 952, "throw": 953, "column": 954, "wrong": 955, "gray": 956,
+     "require": 957, "prepare": 958, "plural": 959, "continent": 960, "basic": 961,
+     "double": 962, "success": 963, "event": 964, "shoulder": 965, "nine": 966,
+     "property": 967, "broad": 968, "anger": 969, "dad": 970, "rail": 971, "deal": 972,
+     "level": 973, "stretch": 974, "chance": 975, "determine": 976, "nose": 977,
+     "steel": 978, "feed": 979, "cotton": 980, "truck": 981, "band": 982, "seat": 983,
+     "hat": 984, "particular": 985, "shoulder": 986, "claim": 987, "pitch": 988,
+     "valley": 989, "total": 990, "apple": 991, "select": 992, "repeat": 993, "wife": 994,
+     "term": 995, "camp": 996, "quart": 997, "shine": 998, "salt": 999, "molecule": 1000,
+     # 1001-2000 - Less common but still frequent
+     "temperature": 1001, "finger": 1002, "industry": 1003, "value": 1004, "fight": 1005,
+     "lie": 1006, "beat": 1007, "excite": 1008, "natural": 1009, "view": 1010,
+     "sense": 1011, "capital": 1012, "chair": 1013, "danger": 1014, "fruit": 1015,
+     "rich": 1016, "thick": 1017, "soldier": 1018, "operate": 1019, "practice": 1020,
+     "difficult": 1021, "doctor": 1022, "please": 1023, "protect": 1024, "noon": 1025,
+     "crop": 1026, "modern": 1027, "element": 1028, "hit": 1029, "student": 1030,
+     "corner": 1031, "party": 1032, "supply": 1033, "bone": 1034, "tube": 1035,
+     "famous": 1036, "dollar": 1037, "stream": 1038, "fear": 1039, "sight": 1040,
+     "thin": 1041, "triangle": 1042, "planet": 1043, "hurry": 1044, "chief": 1045,
+     "colony": 1046, "clock": 1047, "mine": 1048, "tie": 1049, "enter": 1050,
+     "major": 1051, "fresh": 1052, "search": 1053, "send": 1054, "yellow": 1055,
+     "gun": 1056, "allow": 1057, "print": 1058, "dead": 1059, "spot": 1060,
+     "desert": 1061, "suit": 1062, "current": 1063, "lift": 1064, "rose": 1065,
+     "arrive": 1066, "master": 1067, "track": 1068, "locate": 1069, "ring": 1070,
+     "believe": 1071, "gentle": 1072, "woman": 1073, "captain": 1074, "guess": 1075,
+     "necessary": 1076, "sharp": 1077, "wing": 1078, "create": 1079, "neighbor": 1080,
+     "wash": 1081, "bat": 1082, "rather": 1083, "crowd": 1084, "corn": 1085,
+     "compare": 1086, "poem": 1087, "string": 1088, "bell": 1089, "depend": 1090,
+     "meat": 1091, "rub": 1092, "tube": 1093, "famous": 1094, "dollar": 1095,
+     "indicate": 1096, "metal": 1097, "whether": 1098, "push": 1099, "seven": 1100,
+     # Additional common words 1101-5000
+     "village": 1101, "meet": 1102, "root": 1103, "buy": 1104, "raise": 1105,
+     "solve": 1106, "understand": 1107, "member": 1108, "syllable": 1109, "second": 1110,
+     "blue": 1111, "describe": 1112, "develop": 1113, "ocean": 1114, "electric": 1115,
+     "expect": 1116, "bone": 1117, "rail": 1118, "imagine": 1119, "provide": 1120,
+     "agree": 1121, "thus": 1122, "capital": 1123, "chair": 1124, "danger": 1125,
+     "fruit": 1126, "thick": 1127, "soldier": 1128, "process": 1129, "operate": 1130,
+     "difficult": 1131, "visit": 1132, "separate": 1133, "particular": 1134, "catch": 1135,
+     "square": 1136, "reason": 1137, "length": 1138, "represent": 1139, "art": 1140,
+     # Continue with progressively less common words...
+     # For brevity, jumping to approximate ranks for less common words
+     "political": 1500, "social": 1501, "business": 1502, "service": 1503,
+     "attention": 1504, "international": 1505, "various": 1506, "community": 1507,
+     "national": 1508, "american": 1509, "president": 1510, "available": 1511,
+     "information": 1512, "development": 1513, "question": 1514, "different": 1515,
+     "important": 1516, "education": 1517, "director": 1518, "economic": 1519,
+     "evidence": 1520, "management": 1521, "hospital": 1522, "personal": 1523,
+     "understand": 1524, "director": 1525, "professional": 1526, "performance": 1527,
+     "individual": 1528, "organization": 1529, "structure": 1530, "responsibility": 1531,
+     "technology": 1532, "democratic": 1533, "relationship": 1534, "environmental": 1535,
+     "significantly": 1536, "particularly": 1537, "approximately": 1538, "ultimately": 1539,
+     "comprehensive": 1540, "substantial": 1541, "fundamental": 1542, "analysis": 1543,
+     "investigation": 1544, "comprehensive": 1545, "demonstrate": 1546, "theoretical": 1547,
+     "significant": 1548, "hypothesis": 1549, "empirical": 1550, "methodology": 1551,
+     "framework": 1552, "implications": 1553, "phenomena": 1554, "parameters": 1555,
+     "correlation": 1556, "variables": 1557, "statistical": 1558, "preliminary": 1559,
+ }
+
+
+ def _tokenize_for_frequency_analysis(text: str) -> list[str]:
+     """Tokenize text for frequency analysis.
+
+     Args:
+         text: Input text to tokenize
+
+     Returns:
+         List of clean, lowercase tokens
+
+     Process:
+         - Lowercase entire text
+         - Split on whitespace
+         - Strip punctuation from each token
+         - Filter out empty tokens
+     """
+     if not text or not text.strip():
+         return []
+
+     text_lower = text.lower()
+     raw_tokens = text_lower.split()
+
+     # Comprehensive punctuation set
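+     # strip() only trims leading/trailing characters, so internal hyphens and
+     # apostrophes are preserved (e.g., "so-called" and "won't" stay single tokens).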
+     PUNCTUATION = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„\"\"''‚'")
+
+     tokens = []
+     for token in raw_tokens:
+         clean_token = token.strip("".join(PUNCTUATION))
+         if clean_token:
+             tokens.append(clean_token)
+
+     return tokens
+
+
+ def _get_frequency_rank(word: str, frequency_dict: dict[str, int], max_rank: int) -> int:
+     """Get frequency rank for a word, or 50000 if unknown.
+
+     Args:
+         word: Word to look up (should be lowercase)
+         frequency_dict: Dictionary mapping words to frequency ranks
+         max_rank: Maximum rank in the frequency dictionary
+
+     Returns:
+         Frequency rank (1 = most common), or 50000 if word not found
+         (Unknown words are treated as very rare)
+     """
+     return frequency_dict.get(word, 50000)
+
+
+ def compute_word_frequency_sophistication(
+     text: str,
+     frequency_corpus: str = "coca",
+     rare_threshold: int = 10000,
+     common_threshold: int = 1000,
+ ) -> WordFrequencySophisticationResult:
+     """
+     Compute word frequency sophistication metrics.
+
+     Analyzes vocabulary sophistication by comparing text words against
+     reference frequency lists from large corpora. Words are classified
+     as common, rare, or academic based on their frequency ranks in the
+     reference corpus.
+
+     Related GitHub Issue:
+         #15 - Word Frequency Sophistication Metrics
+         https://github.com/craigtrim/pystylometry/issues/15
+
+     Sophistication is a key indicator of writing quality and expertise:
+         - Academic writing uses more low-frequency, technical words
+         - Fiction uses moderate-frequency, descriptive words
+         - Journalism uses high-frequency, accessible words
+         - Authors with larger vocabularies use rarer words
+         - Native speakers use different frequency profiles than learners
+
+     Applications:
+         - Assessing vocabulary richness beyond simple TTR
+         - Comparing writing sophistication across authors or genres
+         - Tracking vocabulary development over time
+         - Identifying register (formal vs. informal)
+         - Detecting text difficulty level
+
+     Frequency bands (example for 100,000-word corpus):
+         - Very common: Rank 1-1,000 (top 1%)
+         - Common: Rank 1,001-5,000 (top 5%)
+         - Moderate: Rank 5,001-10,000 (top 10%)
+         - Rare: Rank 10,001-20,000 (top 20%)
+         - Very rare: Rank 20,001+ (bottom 80%)
+
+     Args:
+         text: Input text to analyze. Should contain at least 50+ words
+             for meaningful statistics. Shorter texts may have unreliable
+             sophistication metrics.
+         frequency_corpus: Reference corpus to use for frequency data.
+             Options: "coca", "bnc", "google_ngrams", "subtlex"
+             Default is "coca" (Corpus of Contemporary American English).
+         rare_threshold: Frequency rank threshold for "rare" words. Words with
+             rank > rare_threshold are considered rare. Default 10,000.
+         common_threshold: Frequency rank threshold for "common" words. Words with
+             rank <= common_threshold are considered common. Default 1,000.
+
+     Returns:
+         WordFrequencySophisticationResult containing:
+             - mean_frequency_rank: Average frequency rank (lower = more common)
+             - median_frequency_rank: Median frequency rank
+             - rare_word_ratio: Proportion of words beyond rare_threshold
+             - common_word_ratio: Proportion of words within common_threshold
+             - academic_word_ratio: Proportion of Academic Word List words
+             - advanced_word_ratio: Proportion of sophisticated vocabulary
+             - frequency_band_distribution: Distribution across frequency bands
+             - rarest_words: Least frequent words with their ranks
+             - most_common_words: Most frequent words with their ranks
+             - metadata: Corpus info, thresholds, counts, etc.
+
+     Example:
+         >>> result = compute_word_frequency_sophistication("Sample academic text...")
+         >>> print(f"Mean frequency rank: {result.mean_frequency_rank:.1f}")
+         Mean frequency rank: 4523.7
+         >>> print(f"Rare word ratio: {result.rare_word_ratio:.3f}")
+         Rare word ratio: 0.234
+         >>> print(f"Academic words: {result.academic_word_ratio:.3f}")
+         Academic words: 0.156
+
+         >>> # Compare authors
+         >>> author1 = compute_word_frequency_sophistication("Text by author 1...")
+         >>> author2 = compute_word_frequency_sophistication("Text by author 2...")
+         >>> print(f"Author 1 mean rank: {author1.mean_frequency_rank:.1f}")
+         >>> print(f"Author 2 mean rank: {author2.mean_frequency_rank:.1f}")
+         >>> # Lower rank = uses more common words
+
+     Note:
+         - Frequency ranks are corpus-specific (COCA ranks differ from BNC ranks)
+         - Words not in the reference corpus are assigned a fixed rank of 50,000 (very rare)
+         - Case-insensitive matching (all words lowercased)
+         - Lemmatization recommended but not required
+         - Function words (the, of, and) dominate high-frequency ranks
+         - Stopword removal can provide cleaner sophistication metrics
+         - Academic Word List is field-independent academic vocabulary
+     """
+     # Validate corpus parameter
+     if frequency_corpus != "coca":
+         raise ValueError(
+             f"Only 'coca' corpus is currently supported, got '{frequency_corpus}'"
+         )
+
+     # Load frequency dictionary
+     frequency_dict = COCA_FREQUENCY_RANKS
+     max_rank = max(frequency_dict.values())
+     # Unknown words are assigned rank 50000 (treated as very rare)
+     unknown_rank = 50000
+
+     # Tokenize text
+     tokens = _tokenize_for_frequency_analysis(text)
+     total_words = len(tokens)
+
+     if total_words == 0:
+         raise ValueError("Text contains no valid tokens")
+
+     # Look up frequency rank for each word
+     word_ranks = [_get_frequency_rank(word, frequency_dict, max_rank) for word in tokens]
+
+     # Calculate mean and median frequency ranks
+     mean_rank = sum(word_ranks) / len(word_ranks)
+     sorted_ranks = sorted(word_ranks)
+     n = len(sorted_ranks)
+     if n % 2 == 0:
+         median_rank = (sorted_ranks[n // 2 - 1] + sorted_ranks[n // 2]) / 2.0
+     else:
+         median_rank = float(sorted_ranks[n // 2])
+
+     # Count words in different categories
+     rare_count = sum(1 for rank in word_ranks if rank > rare_threshold)
+     common_count = sum(1 for rank in word_ranks if rank <= common_threshold)
+     academic_count = sum(1 for word in tokens if word in ACADEMIC_WORD_LIST)
+     unknown_count = sum(1 for rank in word_ranks if rank == unknown_rank)
+
+     # Calculate ratios
+     rare_word_ratio = rare_count / total_words
+     common_word_ratio = common_count / total_words
+     academic_word_ratio = academic_count / total_words
+
+     # Advanced words = words that are either rare OR academic (union)
+     advanced_words = set()
+     for i, word in enumerate(tokens):
+         if word_ranks[i] > rare_threshold or word in ACADEMIC_WORD_LIST:
+             advanced_words.add(word)
+     # Count token occurrences of advanced words
+     advanced_count = sum(1 for w in tokens if w in advanced_words)
+     advanced_word_ratio = advanced_count / total_words
+
+     # Frequency band distribution
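+     # Band boundaries are fixed at 1,000 / 5,000 / 10,000 / 20,000 (mirroring the
+     # docstring's example bands) and are independent of rare_threshold/common_threshold.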
+     band_counts = {
+         "very_common": sum(1 for r in word_ranks if r <= 1000),
+         "common": sum(1 for r in word_ranks if 1000 < r <= 5000),
+         "moderate": sum(1 for r in word_ranks if 5000 < r <= 10000),
+         "rare": sum(1 for r in word_ranks if 10000 < r <= 20000),
+         "very_rare": sum(1 for r in word_ranks if r > 20000),
+     }
+     frequency_band_distribution = {
+         band: count / total_words for band, count in band_counts.items()
+     }
+
+     # Find rarest and most common words (top 10 each, deduplicated)
+     word_rank_pairs = list(zip(tokens, word_ranks))
+     # Create unique word-rank mapping (keeps the first occurrence's rank if a word repeats)
+     unique_pairs: dict[str, int] = {}
+     for word, rank in word_rank_pairs:
+         if word not in unique_pairs:
+             unique_pairs[word] = rank
+
+     # Rarest: highest ranks
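+     # All out-of-vocabulary words share the 50000 sentinel rank, so ties at the top
+     # of rarest_words keep their first-occurrence order in the text (sorted() is stable).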
+     sorted_by_rarest = sorted(unique_pairs.items(), key=lambda x: x[1], reverse=True)
+     rarest_words = [(word, float(rank)) for word, rank in sorted_by_rarest[:10]]
+
+     # Most common: lowest ranks
+     sorted_by_common = sorted(unique_pairs.items(), key=lambda x: x[1])
+     most_common_words = [(word, float(rank)) for word, rank in sorted_by_common[:10]]
+
+     # Metadata
+     metadata = {
+         "frequency_corpus": frequency_corpus,
+         "rare_threshold": rare_threshold,
+         "common_threshold": common_threshold,
+         "total_words": total_words,
+         "unique_words": len(set(tokens)),
+         "unknown_words": unknown_count,
+         "unknown_word_ratio": unknown_count / total_words,
+         "frequency_list_size": len(frequency_dict),
+         "max_frequency_rank": max_rank,
+     }
+
+     return WordFrequencySophisticationResult(
+         mean_frequency_rank=mean_rank,
+         median_frequency_rank=median_rank,
+         rare_word_ratio=rare_word_ratio,
+         common_word_ratio=common_word_ratio,
+         academic_word_ratio=academic_word_ratio,
+         advanced_word_ratio=advanced_word_ratio,
+         frequency_band_distribution=frequency_band_distribution,
+         rarest_words=rarest_words,
+         most_common_words=most_common_words,
+         metadata=metadata,
+     )