pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +1 -2
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1224 -2
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +4 -0
- pystylometry/authorship/additional_methods.py +100 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +301 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +641 -0
- pystylometry/lexical/function_words.py +391 -0
- pystylometry/lexical/hapax.py +154 -7
- pystylometry/lexical/mtld.py +83 -7
- pystylometry/lexical/ttr.py +83 -0
- pystylometry/lexical/word_frequency_sophistication.py +581 -0
- pystylometry/lexical/yule.py +34 -7
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +985 -0
- pystylometry/readability/ari.py +93 -17
- pystylometry/readability/coleman_liau.py +102 -9
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +59 -14
- pystylometry/readability/gunning_fog.py +194 -25
- pystylometry/readability/smog.py +31 -14
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +432 -0
- pystylometry/syntactic/pos_ratios.py +104 -13
- pystylometry/syntactic/sentence_stats.py +57 -13
- pystylometry/syntactic/sentence_types.py +470 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
- pystylometry-1.0.0.dist-info/RECORD +46 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
- pystylometry-0.1.0.dist-info/RECORD +0 -26
|
@@ -0,0 +1,985 @@
|
|
|
1
|
+
"""Additional readability formulas.
|
|
2
|
+
|
|
3
|
+
This module provides additional readability metrics beyond the core formulas
|
|
4
|
+
(Flesch, SMOG, Gunning Fog, Coleman-Liau, ARI). These formulas offer alternative
|
|
5
|
+
approaches to measuring text difficulty and are valuable for cross-validation
|
|
6
|
+
and comprehensive readability assessment.
|
|
7
|
+
|
|
8
|
+
Related GitHub Issue:
|
|
9
|
+
#16 - Additional Readability Formulas
|
|
10
|
+
https://github.com/craigtrim/pystylometry/issues/16
|
|
11
|
+
|
|
12
|
+
Formulas implemented:
|
|
13
|
+
- Dale-Chall: Based on list of 3000 familiar words
|
|
14
|
+
- Linsear Write: Developed for technical writing assessment
|
|
15
|
+
- Fry Readability Graph: Visual graph-based assessment
|
|
16
|
+
- FORCAST: Military formula using only single-syllable words
|
|
17
|
+
- Powers-Sumner-Kearl: Recalibrated Flesch for primary grades
|
|
18
|
+
|
|
19
|
+
References:
|
|
20
|
+
Dale, E., & Chall, J. S. (1948). A formula for predicting readability.
|
|
21
|
+
Chall, J. S., & Dale, E. (1995). Readability revisited: The new Dale-Chall
|
|
22
|
+
readability formula. Brookline Books.
|
|
23
|
+
Klare, G. R. (1974-1975). Assessing readability. Reading Research Quarterly.
|
|
24
|
+
Fry, E. (1968). A readability formula that saves time. Journal of Reading.
|
|
25
|
+
Caylor, J. S., et al. (1973). Methodologies for determining reading requirements
|
|
26
|
+
of military occupational specialties. Human Resources Research Organization.
|
|
27
|
+
Powers, R. D., Sumner, W. A., & Kearl, B. E. (1958). A recalculation of four
|
|
28
|
+
adult readability formulas. Journal of Educational Psychology.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from .._normalize import normalize_for_readability
|
|
32
|
+
from .._types import (
|
|
33
|
+
DaleChallResult,
|
|
34
|
+
FORCASTResult,
|
|
35
|
+
FryResult,
|
|
36
|
+
LinsearWriteResult,
|
|
37
|
+
PowersSumnerKearlResult,
|
|
38
|
+
)
|
|
39
|
+
from .._utils import split_sentences, tokenize
|
|
40
|
+
from .syllables import count_syllables
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Dale-Chall List of Familiar Words (subset of ~1200 words)
|
|
44
|
+
# GitHub Issue #16: https://github.com/craigtrim/pystylometry/issues/16
|
|
45
|
+
# Full Dale-Chall list has 3000 words that 80% of 4th graders understand.
|
|
46
|
+
# This is a representative subset covering most common everyday words.
|
|
47
|
+
DALE_CHALL_FAMILIAR_WORDS = {
    # NOTE: this is a set literal, so words repeated across the category
    # sections below (e.g. "grow", "second", "kind") collapse harmlessly.
    # Articles, pronouns, determiners
    "a", "an", "the", "this", "that", "these", "those", "some", "any", "all",
    "each", "every", "both", "few", "many", "much", "more", "most", "other",
    "another", "such", "what", "which", "who", "whom", "whose", "whoever",
    "i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "one", "ones", "someone", "somebody", "something", "anyone", "anybody", "anything",
    "everyone", "everybody", "everything", "no", "none", "nobody", "nothing",

    # Conjunctions and prepositions
    "and", "or", "but", "if", "when", "where", "why", "how", "because", "so",
    "for", "nor", "yet", "after", "before", "while", "since", "until", "unless",
    "though", "although", "whether", "than", "as", "like",
    "of", "to", "in", "on", "at", "by", "with", "from", "about", "into",
    "through", "over", "under", "above", "below", "between", "among", "against",
    "during", "without", "within", "along", "across", "behind", "beside", "near",
    "off", "out", "up", "down", "around", "past", "toward", "upon",

    # Common verbs (base, past, -ing, -ed forms included)
    "be", "am", "is", "are", "was", "were", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "done",
    "will", "would", "shall", "should", "may", "might", "must", "can", "could",
    "go", "goes", "went", "gone", "going", "come", "comes", "came", "coming",
    "make", "makes", "made", "making", "get", "gets", "got", "getting", "gotten",
    "know", "knows", "knew", "known", "knowing",
    "think", "thinks", "thought", "thinking",
    "see", "sees", "saw", "seen", "seeing", "look", "looks", "looked", "looking",
    "take", "takes", "took", "taken", "taking", "give", "gives", "gave", "given", "giving",
    "find", "finds", "found", "finding", "tell", "tells", "told", "telling",
    "ask", "asks", "asked", "asking", "work", "works", "worked", "working",
    "seem", "seems", "seemed", "seeming", "feel", "feels", "felt", "feeling",
    "try", "tries", "tried", "trying", "leave", "leaves", "left", "leaving",
    "call", "calls", "called", "calling", "use", "uses", "used", "using",
    "want", "wants", "wanted", "wanting", "need", "needs", "needed", "needing",
    "say", "says", "said", "saying", "talk", "talks", "talked", "talking",
    "turn", "turns", "turned", "turning", "run", "runs", "ran", "running",
    "move", "moves", "moved", "moving", "live", "lives", "lived", "living",
    "believe", "believes", "believed", "believing",
    "hold", "holds", "held", "holding", "bring", "brings", "brought", "bringing",
    "happen", "happens", "happened", "happening",
    "write", "writes", "wrote", "written", "writing",
    "sit", "sits", "sat", "sitting", "stand", "stands", "stood", "standing",
    "hear", "hears", "heard", "hearing", "let", "lets", "letting",
    "help", "helps", "helped", "helping", "show", "shows", "showed", "shown", "showing",
    "play", "plays", "played", "playing", "read", "reads", "reading",
    "change", "changes", "changed", "changing", "keep", "keeps", "kept", "keeping",
    "start", "starts", "started", "starting", "stop", "stops", "stopped", "stopping",
    "learn", "learns", "learned", "learning", "grow", "grows", "grew", "grown", "growing",
    "open", "opens", "opened", "opening", "close", "closes", "closed", "closing",
    "walk", "walks", "walked", "walking", "win", "wins", "won", "winning",
    "begin", "begins", "began", "begun", "beginning", "end", "ends", "ended", "ending",
    "lose", "loses", "lost", "losing", "send", "sends", "sent", "sending",
    "buy", "buys", "bought", "buying", "pay", "pays", "paid", "paying",
    "eat", "eats", "ate", "eaten", "eating", "drink", "drinks", "drank", "drinking",
    "sleep", "sleeps", "slept", "sleeping", "wake", "wakes", "woke", "waking",
    "sing", "sings", "sang", "sung", "singing", "dance", "dances", "danced", "dancing",
    "wait", "waits", "waited", "waiting", "stay", "stays", "stayed", "staying",
    "fly", "flies", "flew", "flown", "flying", "fall", "falls", "fell", "fallen", "falling",
    "cut", "cuts", "cutting", "break", "breaks", "broke", "broken", "breaking",
    "watch", "watches", "watched", "watching", "listen", "listens", "listened", "listening",
    "remember", "remembers", "remembered", "remembering",
    "forget", "forgets", "forgot", "forgotten", "forgetting",
    "meet", "meets", "met", "meeting", "follow", "follows", "followed", "following",
    "carry", "carries", "carried", "carrying", "catch", "catches", "caught", "catching",
    "draw", "draws", "drew", "drawn", "drawing", "drive", "drives", "drove", "driven", "driving",
    "ride", "rides", "rode", "ridden", "riding", "wear", "wears", "wore", "worn", "wearing",
    "pull", "pulls", "pulled", "pulling", "push", "pushes", "pushed", "pushing",
    "throw", "throws", "threw", "thrown", "throwing",
    "reach", "reaches", "reached", "reaching", "pass", "passes", "passed", "passing",
    "shoot", "shoots", "shot", "shooting", "rise", "rises", "rose", "risen", "rising",
    "blow", "blows", "blew", "blown", "blowing", "grow", "grows", "grew", "grown", "growing",
    "hit", "hits", "hitting", "fight", "fights", "fought", "fighting",
    "die", "dies", "died", "dying", "kill", "kills", "killed", "killing",
    "speak", "speaks", "spoke", "spoken", "speaking",

    # Common nouns
    "time", "times", "year", "years", "day", "days", "week", "weeks",
    "month", "months", "hour", "hours", "minute", "minutes", "second", "seconds",
    "morning", "afternoon", "evening", "night", "today", "yesterday", "tomorrow",
    "people", "person", "man", "men", "woman", "women", "child", "children",
    "boy", "boys", "girl", "girls", "baby", "babies", "friend", "friends",
    "family", "families", "mother", "father", "parent", "parents",
    "brother", "brothers", "sister", "sisters", "son", "daughter",
    "place", "places", "home", "house", "houses", "room", "rooms",
    "school", "schools", "class", "classes", "student", "students", "teacher", "teachers",
    "way", "ways", "thing", "things", "part", "parts", "group", "groups",
    "number", "numbers", "side", "sides", "kind", "kinds", "head", "heads",
    "hand", "hands", "eye", "eyes", "face", "faces", "body", "bodies",
    "foot", "feet", "arm", "arms", "leg", "legs", "ear", "ears", "mouth",
    "water", "food", "air", "land", "earth", "ground", "world",
    "country", "countries", "state", "states", "city", "cities", "town", "towns",
    "name", "names", "word", "words", "line", "lines", "page", "pages",
    "book", "books", "story", "stories", "letter", "letters", "paper", "papers",
    "point", "points", "end", "ends", "top", "bottom", "front", "back",
    "life", "lives", "problem", "problems", "question", "questions", "answer", "answers",
    "work", "works", "job", "jobs", "money", "door", "doors", "window", "windows",
    "car", "cars", "road", "roads", "street", "streets", "tree", "trees",
    "animal", "animals", "bird", "birds", "fish", "dog", "dogs", "cat", "cats",
    "horse", "horses", "sea", "mountain", "mountains", "river", "rivers",
    "sun", "moon", "star", "stars", "sky", "cloud", "clouds", "rain", "snow",
    "wind", "fire", "light", "dark", "sound", "sounds", "color", "colors",
    "white", "black", "red", "blue", "green", "yellow", "brown", "orange",
    "game", "games", "ball", "music", "song", "songs", "picture", "pictures",
    "table", "tables", "chair", "chairs", "bed", "beds", "floor", "wall", "walls",
    "minute", "power", "war", "force", "age", "care", "order", "case",

    # Common adjectives
    "good", "better", "best", "bad", "worse", "worst",
    "big", "bigger", "biggest", "small", "smaller", "smallest",
    "large", "larger", "largest", "little", "less", "least",
    "long", "longer", "longest", "short", "shorter", "shortest",
    "high", "higher", "highest", "low", "lower", "lowest",
    "old", "older", "oldest", "young", "younger", "youngest", "new", "newer", "newest",
    "great", "greater", "greatest", "important", "right", "left", "own",
    "other", "different", "same", "next", "last", "first", "second", "third",
    "early", "earlier", "earliest", "late", "later", "latest",
    "easy", "easier", "easiest", "hard", "harder", "hardest",
    "hot", "hotter", "hottest", "cold", "colder", "coldest",
    "warm", "warmer", "warmest", "cool", "cooler", "coolest",
    "fast", "faster", "fastest", "slow", "slower", "slowest",
    "strong", "stronger", "strongest", "weak", "weaker", "weakest",
    "happy", "happier", "happiest", "sad", "sadder", "saddest",
    "nice", "nicer", "nicest", "kind", "kinder", "kindest",
    "sure", "free", "full", "whole", "ready", "simple", "clear",
    "real", "true", "certain", "public", "able", "several",
    "open", "closed", "deep", "wide", "bright", "dark", "heavy", "light",
    "clean", "dirty", "wet", "dry", "soft", "hard", "quiet", "loud",
    "quick", "slow", "rich", "poor", "sick", "well", "dead", "alive",
    "empty", "busy", "pretty", "beautiful", "ugly",

    # Common adverbs
    "very", "too", "so", "more", "most", "less", "least",
    "well", "better", "best", "just", "only", "even", "still",
    "also", "just", "now", "then", "here", "there", "where",
    "how", "when", "why", "not", "never", "always", "often",
    "sometimes", "usually", "ever", "again", "back", "away",
    "together", "once", "twice", "soon", "today", "yesterday", "tomorrow",
    "already", "almost", "enough", "quite", "rather", "really",
    "perhaps", "maybe", "probably", "certainly", "surely",
    "yes", "no", "please", "thank", "sorry",

    # Numbers
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
    "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    "hundred", "thousand", "million",
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth",

    # Additional common words
    "able", "accept", "across", "act", "add", "afraid", "against", "agree",
    "allow", "alone", "appear", "apple", "area", "arm", "arrive", "art",
    "aunt", "ball", "become", "believe", "belong", "boat", "build",
    "burn", "business", "chair", "chance", "church", "clear", "climb",
    "clothe", "clothes", "company", "contain", "continue", "control",
    "cook", "corner", "cost", "count", "course", "cover", "create",
    "cross", "crowd", "cry", "decide", "depend", "describe", "develop",
    "die", "direction", "discover", "doctor", "double", "drop", "during",
    "edge", "effect", "eight", "either", "else", "enjoy", "enough",
    "enter", "example", "except", "excite", "expect", "explain", "express",
    "fact", "fair", "farm", "fear", "field", "fill", "final", "fine",
    "finger", "finish", "flower", "force", "foreign", "forest", "form",
    "fresh", "front", "garden", "general", "glass", "god", "gold",
    "hang", "hat", "hope", "hot", "idea", "include", "increase",
    "instead", "interest", "island", "join", "laugh", "law", "lead",
    "lie", "lift", "list", "lock", "love", "machine", "mark",
    "matter", "mean", "measure", "member", "mention", "middle", "mile",
    "mind", "miss", "moment", "nation", "natural", "nature", "necessary",
    "neighbor", "notice", "object", "ocean", "offer", "office", "opinion",
    "paint", "pair", "party", "pattern", "period", "pick", "plan",
    "plant", "position", "possible", "pound", "prepare", "present", "president",
    "press", "prince", "print", "probable", "produce", "promise", "proper",
    "protect", "prove", "purpose", "quarter", "queen", "question", "quick",
    "quiet", "race", "raise", "range", "rate", "reason", "receive",
    "record", "region", "remain", "reply", "report", "represent", "require",
    "rest", "result", "return", "roll", "rule", "sail", "salt",
    "save", "science", "season", "seat", "seem", "sell", "sense",
    "sentence", "separate", "serve", "set", "settle", "seven", "shape",
    "share", "ship", "shore", "sign", "silver", "single", "sir",
    "six", "size", "skin", "soldier", "solve", "south", "space",
    "special", "speed", "spell", "spend", "spread", "spring", "square",
    "step", "stone", "straight", "strange", "stream", "strength", "strike",
    "subject", "success", "sudden", "suffer", "suggest", "suit", "summer",
    "supply", "support", "suppose", "surface", "surprise", "sweet", "swim",
    "system", "tail", "taste", "teach", "team", "telephone", "television",
    "temperature", "ten", "test", "thick", "thin", "though", "thousand",
    "three", "tire", "total", "touch", "track", "train", "travel",
    "trip", "trouble", "type", "uncle", "understand", "unit", "universe",
    "value", "various", "view", "village", "visit", "voice", "vote",
    "wagon", "wander", "warm", "wash", "wave", "wealth", "weather",
    "weight", "welcome", "west", "wheel", "wild", "wind", "winter",
    "wish", "wonder", "wood", "yard", "yellow",
}
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def compute_dale_chall(text: str) -> DaleChallResult:
    """
    Compute the Dale-Chall Readability Formula.

    The Dale-Chall formula estimates reading difficulty based on the
    percentage of words that are NOT on a list of familiar words (words
    understood by 80% of 4th graders). It also considers average sentence
    length.

    Related GitHub Issue:
        #16 - Additional Readability Formulas
        https://github.com/craigtrim/pystylometry/issues/16

    Formula:
        Raw Score = 0.1579 * (difficult_words_pct) + 0.0496 * (avg_sentence_length)

        If difficult_words_pct > 5%:
            Adjusted Score = Raw Score + 3.6365

    Grade Level Correspondence:
        4.9 or lower: Grade 4 and below
        5.0-5.9: Grades 5-6
        6.0-6.9: Grades 7-8
        7.0-7.9: Grades 9-10
        8.0-8.9: Grades 11-12
        9.0-9.9: Grades 13-15 (College)
        10.0+: Grade 16+ (College Graduate)

    Args:
        text: Input text to analyze. Should contain at least one complete
            sentence. Empty text returns NaN values.

    Returns:
        DaleChallResult containing:
            - dale_chall_score: The Dale-Chall readability score
            - grade_level: Grade range (e.g., "7-8", "College")
            - difficult_word_count: Words not on familiar list
            - difficult_word_ratio: Difficult words / total words
            - avg_sentence_length: Average words per sentence
            - total_words: Total word count
            - metadata: Sample of difficult words, adjusted score flag, etc.

    Note:
        - Case-insensitive word matching; punctuation stripped upstream
          by the tokenizer/normalizer.
        - Proper nouns may be flagged as difficult even if well-known.
        - The bundled familiar-word list is a subset of the full 3000-word
          Dale-Chall list, so difficult-word counts skew slightly high.
    """
    # Tokenize and segment
    sentences = split_sentences(text)
    tokens = tokenize(text)
    word_tokens = normalize_for_readability(tokens)

    # Degenerate input (no sentences or no words): NaN sentinel result.
    if not sentences or not word_tokens:
        return DaleChallResult(
            dale_chall_score=float("nan"),
            grade_level="Unknown",
            difficult_word_count=0,
            difficult_word_ratio=float("nan"),
            avg_sentence_length=float("nan"),
            total_words=0,
            metadata={
                "sentence_count": 0,
                "raw_score": float("nan"),
                "adjusted": False,
                "difficult_words_sample": [],
            },
        )

    # A word is "difficult" when its lowercase form is not on the familiar list.
    difficult_words = [
        word for word in word_tokens
        if word.lower() not in DALE_CHALL_FAMILIAR_WORDS
    ]

    difficult_word_count = len(difficult_words)
    difficult_word_ratio = difficult_word_count / len(word_tokens)
    difficult_word_pct = difficult_word_ratio * 100

    # Average words per sentence.
    avg_sentence_length = len(word_tokens) / len(sentences)

    # Dale-Chall (1948) coefficients.
    raw_score = 0.1579 * difficult_word_pct + 0.0496 * avg_sentence_length

    # The constant adjustment applies only when >5% of words are difficult.
    adjusted = difficult_word_pct > 5.0
    dale_chall_score = raw_score + 3.6365 if adjusted else raw_score

    # Map the (possibly adjusted) score onto the published grade bands.
    grade_bands = (
        (5.0, "4 and below"),
        (6.0, "5-6"),
        (7.0, "7-8"),
        (8.0, "9-10"),
        (9.0, "11-12"),
        (10.0, "College"),
    )
    grade_level = "College Graduate"
    for upper_bound, label in grade_bands:
        if dale_chall_score < upper_bound:
            grade_level = label
            break

    # Deterministic sample of up to 20 unique difficult words, in first-seen
    # order. (The previous list(set(...))[:20] varied run-to-run under
    # string hash randomization, making metadata non-reproducible.)
    difficult_words_sample = list(dict.fromkeys(difficult_words))[:20]

    metadata = {
        "sentence_count": len(sentences),
        "raw_score": raw_score,
        "adjusted": adjusted,
        "difficult_word_pct": difficult_word_pct,
        "difficult_words_sample": difficult_words_sample,
        "familiar_word_list_size": len(DALE_CHALL_FAMILIAR_WORDS),
    }

    return DaleChallResult(
        dale_chall_score=dale_chall_score,
        grade_level=grade_level,
        difficult_word_count=difficult_word_count,
        difficult_word_ratio=difficult_word_ratio,
        avg_sentence_length=avg_sentence_length,
        total_words=len(word_tokens),
        metadata=metadata,
    )
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def compute_linsear_write(text: str) -> LinsearWriteResult:
    """
    Compute the Linsear Write Readability Formula.

    Developed for the U.S. Air Force to assess technical writing. Each word
    is classified as "easy" (1-2 syllables, weight 1) or "hard" (3+
    syllables, weight 3); the weighted total per sentence is then converted
    to a U.S. grade level.

    Related GitHub Issue:
        #16 - Additional Readability Formulas
        https://github.com/craigtrim/pystylometry/issues/16

    Formula:
        raw = (easy_count * 1 + hard_count * 3) / sentence_count
        grade = raw / 2          if raw > 20
        grade = (raw - 2) / 2    otherwise
        (rounded to the nearest integer, floored at 0)

    Args:
        text: Input text to analyze. Works best with ~100-word samples.
            Empty text returns NaN values.

    Returns:
        LinsearWriteResult containing:
            - linsear_score: The raw Linsear Write score
            - grade_level: Corresponding U.S. grade level (integer)
            - easy_word_count: Words with 1-2 syllables
            - hard_word_count: Words with 3+ syllables
            - avg_sentence_length: Average words per sentence
            - metadata: Total words, sentence count, raw score, weighted sum
    """
    sentences = split_sentences(text)
    word_tokens = normalize_for_readability(tokenize(text))

    # Degenerate input: nothing to score.
    if not sentences or not word_tokens:
        return LinsearWriteResult(
            linsear_score=float("nan"),
            grade_level=0,
            easy_word_count=0,
            hard_word_count=0,
            avg_sentence_length=float("nan"),
            metadata={"sentence_count": 0, "total_words": 0, "raw_score": float("nan")},
        )

    # Binary classification: 3+ syllables is "hard", everything else "easy".
    hard_total = sum(1 for token in word_tokens if count_syllables(token) > 2)
    easy_total = len(word_tokens) - hard_total

    # Easy words weigh 1, hard words weigh 3; normalize by sentence count.
    weighted = easy_total + 3 * hard_total
    provisional = weighted / len(sentences)

    # Convert the provisional score to a grade level, clamped at zero.
    if provisional > 20:
        grade = round(provisional / 2)
    else:
        grade = round((provisional - 2) / 2)
    grade = max(0, grade)

    return LinsearWriteResult(
        linsear_score=provisional,
        grade_level=grade,
        easy_word_count=easy_total,
        hard_word_count=hard_total,
        avg_sentence_length=len(word_tokens) / len(sentences),
        metadata={
            "total_words": len(word_tokens),
            "sentence_count": len(sentences),
            "raw_score": provisional,
            "weighted_sum": weighted,
        },
    )
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def compute_fry(text: str) -> FryResult:
    """
    Compute Fry Readability Graph metrics.

    The Fry graph locates a text on two axes -- average sentence length and
    average syllables per 100 words -- and reads the grade level off the
    zone the point falls into. This implementation approximates the graph
    numerically with a lookup table of zone boundaries instead of a plot.

    Related GitHub Issue:
        #16 - Additional Readability Formulas
        https://github.com/craigtrim/pystylometry/issues/16

    Method:
        1. Take the first 100 words of the text as the sample
           (the whole text when it is shorter than 100 words).
        2. Count syllables in the sample and scale to a per-100-word basis.
        3. Count how many sentences the sample spans.
        4. Map the (sentence length, syllables/100) point to a grade zone.
        5. Flag points that fall outside the plotted region of the graph.

    Args:
        text: Input text to analyze. Should contain at least 100 words;
            shorter texts are scaled proportionally. Empty input yields
            NaN metrics and the "invalid" graph zone.

    Returns:
        FryResult containing:
            - avg_sentence_length: Average words per sentence in the sample
            - avg_syllables_per_100: Average syllables per 100 words
            - grade_level: Estimated grade level (e.g., "5", "7", "College")
            - graph_zone: "valid", "above_graph", "below_graph", or "invalid"
            - metadata: Sample details, total sentences, syllables, etc.

    Example:
        >>> result = compute_fry("Educational text for grade assessment...")
        >>> print(f"Grade level: {result.grade_level}")
        Grade level: 6

    Note:
        - The original method averages three 100-word samples; this
          implementation uses a single leading sample.
        - Zone boundaries are a simplified numerical approximation of the
          published graph, so results are estimates.
    """
    sentences = split_sentences(text)
    word_tokens = normalize_for_readability(tokenize(text))

    # Degenerate input: nothing to measure.
    if not sentences or not word_tokens:
        return FryResult(
            avg_sentence_length=float("nan"),
            avg_syllables_per_100=float("nan"),
            grade_level="Unknown",
            graph_zone="invalid",
            metadata={
                "total_sentences": 0,
                "total_syllables": 0,
                "total_words": 0,
                "sample_size": 0,
            },
        )

    # Sample the first 100 words (or the whole text when shorter).
    sample_size = min(100, len(word_tokens))
    sample_tokens = word_tokens[:sample_size]
    syllables_in_sample = sum(count_syllables(w) for w in sample_tokens)

    # Determine how many sentences the sample spans; a partial trailing
    # sentence still counts as one.
    words_seen = 0
    sentences_in_sample = 0
    for sentence in sentences:
        sentence_words = normalize_for_readability(tokenize(sentence))
        if words_seen + len(sentence_words) <= sample_size:
            sentences_in_sample += 1
            words_seen += len(sentence_words)
        else:
            if words_seen < sample_size:
                sentences_in_sample += 1
            break
    sentences_in_sample = max(1, sentences_in_sample)  # guard the division

    avg_sentence_length = sample_size / sentences_in_sample
    avg_syllables_per_100 = (syllables_in_sample / sample_size) * 100

    # Numerical approximation of the Fry graph zones. Each row is
    # (syllable ceiling, [(sentence-length ceiling, grade), ...], fallback
    # grade). Rows are scanned in order; the first matching syllable band
    # wins, then the first matching sentence-length band within it.
    zone_table = [
        (125, [(7, "1"), (11, "2")], "3"),
        (135, [(8, "2"), (12, "3")], "4"),
        (145, [(9, "3"), (13, "5")], "6"),
        (155, [(10, "4"), (14, "7")], "8"),
        (165, [(12, "6"), (16, "9")], "10"),
        (175, [(14, "8"), (18, "11")], "12"),
        (float("inf"), [(16, "10"), (20, "College")], "College+"),
    ]
    grade_level = "Unknown"
    for syllable_ceiling, length_bands, fallback_grade in zone_table:
        if avg_syllables_per_100 < syllable_ceiling:
            grade_level = fallback_grade
            for length_ceiling, grade in length_bands:
                if avg_sentence_length < length_ceiling:
                    grade_level = grade
                    break
            break
    graph_zone = "valid"

    # Points outside the plotted region of the graph are flagged so the
    # caller can treat the grade estimate as unreliable.
    if avg_syllables_per_100 > 185 or avg_sentence_length > 25:
        graph_zone = "above_graph"
    elif avg_syllables_per_100 < 110:
        graph_zone = "below_graph"

    metadata = {
        "total_sentences": len(sentences),
        "total_syllables": sum(count_syllables(w) for w in word_tokens),
        "total_words": len(word_tokens),
        "sample_size": sample_size,
        "sentences_in_sample": sentences_in_sample,
        "syllables_in_sample": syllables_in_sample,
    }

    return FryResult(
        avg_sentence_length=avg_sentence_length,
        avg_syllables_per_100=avg_syllables_per_100,
        grade_level=grade_level,
        graph_zone=graph_zone,
        metadata=metadata,
    )
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def compute_forcast(text: str) -> FORCASTResult:
    """
    Compute the FORCAST readability formula.

    FORCAST (FORmula for CASTing readability) was developed by the U.S.
    military to assess readability without sentence segmentation. It
    estimates grade level from the number of single-syllable words (N)
    in a 150-word sample:

        Grade Level = 20 - (N / 10)

    Because it ignores sentence structure entirely, it suits technical
    material, forms, and lists where sentence boundaries are unreliable.

    Related GitHub Issue:
        #16 - Additional Readability Formulas
        https://github.com/craigtrim/pystylometry/issues/16

    Args:
        text: Input text to analyze. Works best with at least 150 words;
            shorter samples have N scaled up to the 150-word basis.
            Empty input yields NaN scores.

    Returns:
        FORCASTResult containing:
            - forcast_score: The FORCAST readability score
            - grade_level: Corresponding U.S. grade level (clamped to 0-20)
            - single_syllable_ratio: Single-syllable words / sample words
            - single_syllable_count: Count of single-syllable words
            - total_words: Number of words in the analyzed sample
            - metadata: Sample size, scaled N, and full-text word count

    Example:
        >>> result = compute_forcast("Military technical document...")
        >>> print(f"Grade level: {result.grade_level}")
        Grade level: 10

    Note:
        - Syllable counting is only needed to identify 1-syllable words.
        - Grade levels typically range from 5-12 on real prose.
    """
    word_tokens = normalize_for_readability(tokenize(text))

    if not word_tokens:
        return FORCASTResult(
            forcast_score=float("nan"),
            grade_level=0,
            single_syllable_ratio=float("nan"),
            single_syllable_count=0,
            total_words=0,
            metadata={"sample_size": 0, "scaled_n": float("nan")},
        )

    # Analyze the first 150 words (or the whole text when shorter).
    sample_size = min(150, len(word_tokens))
    sample_tokens = word_tokens[:sample_size]

    single_syllable_count = sum(
        1 for word in sample_tokens if count_syllables(word) == 1
    )

    # Project N onto the standard 150-word basis when the sample is short.
    if sample_size < 150:
        scaled_n = single_syllable_count * (150 / sample_size)
    else:
        scaled_n = single_syllable_count

    # FORCAST: Grade Level = 20 - (N / 10), clamped to a sane 0-20 range.
    forcast_score = 20 - (scaled_n / 10)
    grade_level = min(20, max(0, round(forcast_score)))

    single_syllable_ratio = single_syllable_count / sample_size

    metadata = {
        "sample_size": sample_size,
        "scaled_n": scaled_n,
        "total_words_in_text": len(word_tokens),
    }

    return FORCASTResult(
        forcast_score=forcast_score,
        grade_level=grade_level,
        single_syllable_ratio=single_syllable_ratio,
        single_syllable_count=single_syllable_count,
        total_words=sample_size,
        metadata=metadata,
    )
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def compute_powers_sumner_kearl(text: str) -> PowersSumnerKearlResult:
    """
    Compute the Powers-Sumner-Kearl (PSK) readability formula.

    PSK (Powers, Sumner & Kearl, 1958) is a recalculation of the Flesch
    Reading Ease formula, optimized for primary-grade texts (grades 1-4):

        Grade = 0.0778 * (words / sentences)
              + 0.0455 * (syllables per 100 words)
              - 2.2029

    Bug fix: the syllable term uses syllables per *100 words* (the same
    basis as Flesch), not syllables per word. The previous implementation
    applied the 0.0455 coefficient to the per-word average (~1.3-1.7),
    making the term negligible and producing systematically negative
    grades for ordinary prose.

    Related GitHub Issue:
        #16 - Additional Readability Formulas
        https://github.com/craigtrim/pystylometry/issues/16

    Args:
        text: Input text to analyze. Optimized for children's literature
            and primary-grade educational materials. Empty text returns
            NaN values.

    Returns:
        PowersSumnerKearlResult containing:
            - psk_score: The Powers-Sumner-Kearl score
            - grade_level: Corresponding grade (decimal, e.g., 2.5)
            - avg_sentence_length: Average words per sentence
            - avg_syllables_per_word: Average syllables per word
            - total_sentences: Total sentence count
            - total_words: Total word count
            - total_syllables: Total syllable count
            - metadata: Flesch comparison values, calculation details

    Example:
        >>> result = compute_powers_sumner_kearl("Children's book text...")
        >>> print(f"Grade level: {result.grade_level:.1f}")
        Grade level: 2.3

    Note:
        - Most accurate for grades 1-4; less reliable above that range.
        - Grade level is continuous (can be decimal).
        - Flesch scores are included in metadata for cross-validation.
    """
    # Tokenize and segment
    sentences = split_sentences(text)
    tokens = tokenize(text)
    word_tokens = normalize_for_readability(tokens)

    # Degenerate input: nothing to measure.
    if len(sentences) == 0 or len(word_tokens) == 0:
        return PowersSumnerKearlResult(
            psk_score=float("nan"),
            grade_level=float("nan"),
            avg_sentence_length=float("nan"),
            avg_syllables_per_word=float("nan"),
            total_sentences=0,
            total_words=0,
            total_syllables=0,
            metadata={
                "flesch_reading_ease": float("nan"),
                "flesch_kincaid_grade": float("nan"),
            },
        )

    # Count syllables
    total_syllables = sum(count_syllables(word) for word in word_tokens)

    # Calculate metrics
    avg_sentence_length = len(word_tokens) / len(sentences)
    avg_syllables_per_word = total_syllables / len(word_tokens)
    # The PSK syllable term is defined on a per-100-word basis.
    syllables_per_100_words = avg_syllables_per_word * 100

    # Apply the Powers-Sumner-Kearl formula (1958 recalculation of Flesch):
    # Grade = 0.0778 * ASL + 0.0455 * (syllables / 100 words) - 2.2029
    psk_score = (
        0.0778 * avg_sentence_length + 0.0455 * syllables_per_100_words - 2.2029
    )
    grade_level = round(psk_score, 1)  # Round to 1 decimal place

    # Flesch scores for comparison (these use the per-word syllable rate).
    flesch_reading_ease = (
        206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables_per_word
    )
    flesch_kincaid_grade = (
        0.39 * avg_sentence_length + 11.8 * avg_syllables_per_word - 15.59
    )

    # Build metadata
    metadata = {
        "flesch_reading_ease": flesch_reading_ease,
        "flesch_kincaid_grade": flesch_kincaid_grade,
        "difference_from_flesch": psk_score - flesch_kincaid_grade,
        "words_per_sentence": avg_sentence_length,
        "syllables_per_word": avg_syllables_per_word,
        "syllables_per_100_words": syllables_per_100_words,
    }

    return PowersSumnerKearlResult(
        psk_score=psk_score,
        grade_level=grade_level,
        avg_sentence_length=avg_sentence_length,
        avg_syllables_per_word=avg_syllables_per_word,
        total_sentences=len(sentences),
        total_words=len(word_tokens),
        total_syllables=total_syllables,
        metadata=metadata,
    )
|