pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
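The most consequential change visible in the diff below is the move to native chunked analysis (issue #27): the readability functions now take a `chunk_size` argument and return per-chunk `Distribution` summaries alongside the scalar scores. A minimal usage sketch based on the docstring examples shown further down; the import path and the sample file are illustrative assumptions, not verified against the installed package.

```python
# Sketch only: assumes compute_dale_chall is importable from the module shown in
# this wheel's layout; adjust the import to match your install.
from pystylometry.readability.additional_formulas import compute_dale_chall

text = open("sample.txt", encoding="utf-8").read()  # any long document

result = compute_dale_chall(text, chunk_size=1000)   # words per chunk
print(result.dale_chall_score)             # mean score across chunks
print(result.dale_chall_score_dist.std)    # per-chunk spread ("fingerprint")
print(result.chunk_count, result.total_words)
```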
@@ -5,9 +5,9 @@ This module provides additional readability metrics beyond the core formulas
     approaches to measuring text difficulty and are valuable for cross-validation
     and comprehensive readability assessment.
 
-    Related GitHub
+    Related GitHub Issues:
     #16 - Additional Readability Formulas
-
+    #27 - Native chunked analysis with Distribution dataclass
 
     Formulas implemented:
     - Dale-Chall: Based on list of 3000 familiar words
@@ -28,230 +28,1425 @@ References:
         adult readability formulas. Journal of Educational Psychology.
 """
 
+import math
+
 from .._normalize import normalize_for_readability
 from .._types import (
     DaleChallResult,
+    Distribution,
     FORCASTResult,
     FryResult,
     LinsearWriteResult,
     PowersSumnerKearlResult,
+    chunk_text,
+    make_distribution,
 )
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 # Dale-Chall List of Familiar Words (subset of ~1200 words)
 # GitHub Issue #16: https://github.com/craigtrim/pystylometry/issues/16
 # Full Dale-Chall list has 3000 words that 80% of 4th graders understand.
 # This is a representative subset covering most common everyday words.
 DALE_CHALL_FAMILIAR_WORDS = {
     # Articles, pronouns, determiners
-    "a",
+    "a", "an", "the", "this", "that", "these", "those", "some", "any", "all", "each", "every",
+    "both", "few", "many", "much", "more", "most", "other", "another", "such", "what", "which",
+    "who", "whom", "whose", "whoever", "i", "me", "my", "mine", "myself", "we", "us", "our",
+    "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his",
+    "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
+    "theirs", "themselves", "one", "ones", "someone", "somebody", "something", "anyone",
+    "anybody", "anything", "everyone", "everybody", "everything", "no", "none", "nobody",
+    "nothing",
     # Conjunctions and prepositions
-    "and",
+    "and", "or", "but", "if", "when", "where", "why", "how", "because", "so", "for", "nor",
+    "yet", "after", "before", "while", "since", "until", "unless", "though", "although",
+    "whether", "than", "as", "like", "of", "to", "in", "on", "at", "by", "with", "from",
+    "about", "into", "through", "over", "under", "above", "below", "between", "among",
+    "against", "during", "without", "within", "along", "across", "behind", "beside", "near",
+    "off", "out", "up", "down", "around", "past", "toward", "upon",
     # Common verbs (base, past, -ing, -ed forms included)
-    "be",
+    "be", "am", "is", "are", "was", "were", "been", "being", "have", "has", "had", "having",
+    "do", "does", "did", "doing", "done", "will", "would", "shall", "should", "may", "might",
+    "must", "can", "could", "go", "goes", "went", "gone", "going", "come", "comes", "came",
+    "coming", "make", "makes", "made", "making", "get", "gets", "got", "getting", "gotten",
+    "know", "knows", "knew", "known", "knowing", "think", "thinks", "thought", "thinking",
+    "see", "sees", "saw", "seen", "seeing", "look", "looks", "looked", "looking", "take",
+    "takes", "took", "taken", "taking", "give", "gives", "gave", "given", "giving", "find",
+    "finds", "found", "finding", "tell", "tells", "told", "telling", "ask", "asks", "asked",
+    "asking", "work", "works", "worked", "working", "seem", "seems", "seemed", "seeming",
+    "feel", "feels", "felt", "feeling", "try", "tries", "tried", "trying", "leave", "leaves",
+    "left", "leaving", "call", "calls", "called", "calling", "use", "uses", "used", "using",
+    "want", "wants", "wanted", "wanting", "need", "needs", "needed", "needing", "say", "says",
+    "said", "saying", "talk", "talks", "talked", "talking", "turn", "turns", "turned",
+    "turning", "run", "runs", "ran", "running", "move", "moves", "moved", "moving", "live",
+    "lives", "lived", "living", "believe", "believes", "believed", "believing", "hold",
+    "holds", "held", "holding", "bring", "brings", "brought", "bringing", "happen", "happens",
+    "happened", "happening", "write", "writes", "wrote", "written", "writing", "sit", "sits",
+    "sat", "sitting", "stand", "stands", "stood", "standing", "hear", "hears", "heard",
+    "hearing", "let", "lets", "letting", "help", "helps", "helped", "helping", "show",
+    "shows", "showed", "shown", "showing", "play", "plays", "played", "playing", "read",
+    "reads", "reading", "change", "changes", "changed", "changing", "keep", "keeps", "kept",
+    "keeping", "start", "starts", "started", "starting", "stop", "stops", "stopped",
+    "stopping", "learn", "learns", "learned", "learning", "grow", "grows", "grew", "grown",
+    "growing", "open", "opens", "opened", "opening", "close", "closes", "closed", "closing",
+    "walk", "walks", "walked", "walking", "win", "wins", "won", "winning", "begin", "begins",
+    "began", "begun", "beginning", "end", "ends", "ended", "ending", "lose", "loses", "lost",
+    "losing", "send", "sends", "sent", "sending", "buy", "buys", "bought", "buying", "pay",
+    "pays", "paid", "paying", "eat", "eats", "ate", "eaten", "eating", "drink", "drinks",
+    "drank", "drinking", "sleep", "sleeps", "slept", "sleeping", "wake", "wakes", "woke",
+    "waking", "sing", "sings", "sang", "sung", "singing", "dance", "dances", "danced",
+    "dancing", "wait", "waits", "waited", "waiting", "stay", "stays", "stayed", "staying",
+    "fly", "flies", "flew", "flown", "flying", "fall", "falls", "fell", "fallen", "falling",
+    "cut", "cuts", "cutting", "break", "breaks", "broke", "broken", "breaking", "watch",
+    "watches", "watched", "watching", "listen", "listens", "listened", "listening",
+    "remember", "remembers", "remembered", "remembering", "forget", "forgets", "forgot",
+    "forgotten", "forgetting", "meet", "meets", "met", "meeting", "follow", "follows",
+    "followed", "following", "carry", "carries", "carried", "carrying", "catch", "catches",
+    "caught", "catching", "draw", "draws", "drew", "drawn", "drawing", "drive", "drives",
+    "drove", "driven", "driving", "ride", "rides", "rode", "ridden", "riding", "wear",
+    "wears", "wore", "worn", "wearing", "pull", "pulls", "pulled", "pulling", "push",
+    "pushes", "pushed", "pushing", "throw", "throws", "threw", "thrown", "throwing", "reach",
+    "reaches", "reached", "reaching", "pass", "passes", "passed", "passing", "shoot",
+    "shoots", "shot", "shooting", "rise", "rises", "rose", "risen", "rising", "blow",
+    "blows", "blew", "blown", "blowing", "hit", "hits", "hitting", "fight", "fights",
+    "fought", "fighting", "die", "dies", "died", "dying", "kill", "kills", "killed",
+    "killing", "speak", "speaks", "spoke", "spoken", "speaking",
     # Common nouns
-    "time",
+    "time", "times", "year", "years", "day", "days", "week", "weeks", "month", "months",
+    "hour", "hours", "minute", "minutes", "second", "seconds", "morning", "afternoon",
+    "evening", "night", "today", "yesterday", "tomorrow", "people", "person", "man", "men",
+    "woman", "women", "child", "children", "boy", "boys", "girl", "girls", "baby", "babies",
+    "friend", "friends", "family", "families", "mother", "father", "parent", "parents",
+    "brother", "brothers", "sister", "sisters", "son", "daughter", "place", "places", "home",
+    "house", "houses", "room", "rooms", "school", "schools", "class", "classes", "student",
+    "students", "teacher", "teachers", "way", "ways", "thing", "things", "part", "parts",
+    "group", "groups", "number", "numbers", "side", "sides", "kind", "kinds", "head",
+    "heads", "hand", "hands", "eye", "eyes", "face", "faces", "body", "bodies", "foot",
+    "feet", "arm", "arms", "leg", "legs", "ear", "ears", "mouth", "water", "food", "air",
+    "land", "earth", "ground", "world", "country", "countries", "state", "states", "city",
+    "cities", "town", "towns", "name", "names", "word", "words", "line", "lines", "page",
+    "pages", "book", "books", "story", "stories", "letter", "letters", "paper", "papers",
+    "point", "points", "end", "ends", "top", "bottom", "front", "back", "life", "lives",
+    "problem", "problems", "question", "questions", "answer", "answers", "work", "works",
+    "job", "jobs", "money", "door", "doors", "window", "windows", "car", "cars", "road",
+    "roads", "street", "streets", "tree", "trees", "animal", "animals", "bird", "birds",
+    "fish", "dog", "dogs", "cat", "cats", "horse", "horses", "sea", "mountain", "mountains",
+    "river", "rivers", "sun", "moon", "star", "stars", "sky", "cloud", "clouds", "rain",
+    "snow", "wind", "fire", "light", "dark", "sound", "sounds", "color", "colors", "white",
+    "black", "red", "blue", "green", "yellow", "brown", "orange", "game", "games", "ball",
+    "music", "song", "songs", "picture", "pictures", "table", "tables", "chair", "chairs",
+    "bed", "beds", "floor", "wall", "walls", "power", "war", "force", "age", "care",
+    "order", "case",
     # Common adjectives
-    "good",
+    "good", "better", "best", "bad", "worse", "worst", "big", "bigger", "biggest", "small",
+    "smaller", "smallest", "large", "larger", "largest", "little", "less", "least", "long",
+    "longer", "longest", "short", "shorter", "shortest", "high", "higher", "highest", "low",
+    "lower", "lowest", "old", "older", "oldest", "young", "younger", "youngest", "new",
+    "newer", "newest", "great", "greater", "greatest", "important", "right", "left", "own",
+    "other", "different", "same", "next", "last", "first", "second", "third", "early",
+    "earlier", "earliest", "late", "later", "latest", "easy", "easier", "easiest", "hard",
+    "harder", "hardest", "hot", "hotter", "hottest", "cold", "colder", "coldest", "warm",
+    "warmer", "warmest", "cool", "cooler", "coolest", "fast", "faster", "fastest", "slow",
+    "slower", "slowest", "strong", "stronger", "strongest", "weak", "weaker", "weakest",
+    "happy", "happier", "happiest", "sad", "sadder", "saddest", "nice", "nicer", "nicest",
+    "kind", "kinder", "kindest", "sure", "free", "full", "whole", "ready", "simple",
+    "clear", "real", "true", "certain", "public", "able", "several", "open", "closed",
+    "deep", "wide", "bright", "dark", "heavy", "light", "clean", "dirty", "wet", "dry",
+    "soft", "quiet", "loud", "quick", "rich", "poor", "sick", "well", "dead", "alive",
+    "empty", "busy", "pretty", "beautiful", "ugly",
     # Common adverbs
-    "very",
+    "very", "too", "so", "more", "most", "less", "least", "well", "better", "best", "just",
+    "only", "even", "still", "also", "now", "then", "here", "there", "where", "how", "when",
+    "why", "not", "never", "always", "often", "sometimes", "usually", "ever", "again",
+    "back", "away", "together", "once", "twice", "soon", "today", "yesterday", "tomorrow",
+    "already", "almost", "enough", "quite", "rather", "really", "perhaps", "maybe",
+    "probably", "certainly", "surely", "yes", "no", "please", "thank", "sorry",
     # Numbers
-    "zero",
+    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
+    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
+    "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy",
+    "eighty", "ninety", "hundred", "thousand", "million", "first", "second", "third",
+    "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth",
     # Additional common words
-    "able",
+    "able", "accept", "across", "act", "add", "afraid", "against", "agree", "allow", "alone",
+    "appear", "apple", "area", "arm", "arrive", "art", "aunt", "ball", "become", "believe",
+    "belong", "boat", "build", "burn", "business", "chair", "chance", "church", "clear",
+    "climb", "clothe", "clothes", "company", "contain", "continue", "control", "cook",
+    "corner", "cost", "count", "course", "cover", "create", "cross", "crowd", "cry",
+    "decide", "depend", "describe", "develop", "die", "direction", "discover", "doctor",
+    "double", "drop", "during", "edge", "effect", "eight", "either", "else", "enjoy",
+    "enough", "enter", "example", "except", "excite", "expect", "explain", "express",
+    "fact", "fair", "farm", "fear", "field", "fill", "final", "fine", "finger", "finish",
+    "flower", "force", "foreign", "forest", "form", "fresh", "front", "garden", "general",
+    "glass", "god", "gold", "hang", "hat", "hope", "hot", "idea", "include", "increase",
+    "instead", "interest", "island", "join", "laugh", "law", "lead", "lie", "lift", "list",
+    "lock", "love", "machine", "mark", "matter", "mean", "measure", "member", "mention",
+    "middle", "mile", "mind", "miss", "moment", "nation", "natural", "nature", "necessary",
+    "neighbor", "notice", "object", "ocean", "offer", "office", "opinion", "paint", "pair",
+    "party", "pattern", "period", "pick", "plan", "plant", "position", "possible", "pound",
+    "prepare", "present", "president", "press", "prince", "print", "probable", "produce",
+    "promise", "proper", "protect", "prove", "purpose", "quarter", "queen", "question",
+    "quick", "quiet", "race", "raise", "range", "rate", "reason", "receive", "record",
+    "region", "remain", "reply", "report", "represent", "require", "rest", "result",
+    "return", "roll", "rule", "sail", "salt", "save", "science", "season", "seat", "seem",
+    "sell", "sense", "sentence", "separate", "serve", "set", "settle", "seven", "shape",
+    "share", "ship", "shore", "sign", "silver", "single", "sir", "six", "size", "skin",
+    "soldier", "solve", "south", "space", "special", "speed", "spell", "spend", "spread",
+    "spring", "square", "step", "stone", "straight", "strange", "stream", "strength",
+    "strike", "subject", "success", "sudden", "suffer", "suggest", "suit", "summer",
+    "supply", "support", "suppose", "surface", "surprise", "sweet", "swim", "system",
+    "tail", "taste", "teach", "team", "telephone", "television", "temperature", "ten",
+    "test", "thick", "thin", "though", "thousand", "three", "tire", "total", "touch",
+    "track", "train", "travel", "trip", "trouble", "type", "uncle", "understand", "unit",
+    "universe", "value", "various", "view", "village", "visit", "voice", "vote", "wagon",
+    "wander", "warm", "wash", "wave", "wealth", "weather", "weight", "welcome", "west",
+    "wheel", "wild", "wind", "winter", "wish", "wonder", "wood", "yard", "yellow",
 }
 
 
+def _compute_dale_chall_single(text: str) -> tuple[float, int, float, float, dict]:
+    """Compute Dale-Chall for a single chunk."""
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (float("nan"), 0, float("nan"), float("nan"), {"sentence_count": 0, "word_count": 0})
+
+    difficult_words = [w for w in word_tokens if w.lower() not in DALE_CHALL_FAMILIAR_WORDS]
+    difficult_word_count = len(difficult_words)
+    difficult_word_ratio = difficult_word_count / len(word_tokens)
+    difficult_word_pct = difficult_word_ratio * 100
+    avg_sentence_length = len(word_tokens) / len(sentences)
+    raw_score = 0.1579 * difficult_word_pct + 0.0496 * avg_sentence_length
+    adjusted = difficult_word_pct > 5.0
+    dale_chall_score = raw_score + 3.6365 if adjusted else raw_score
+
+    return (
+        dale_chall_score,
+        difficult_word_count,
+        difficult_word_ratio,
+        avg_sentence_length,
+        {
+            "sentence_count": len(sentences),
+            "word_count": len(word_tokens),
+            "adjusted": adjusted,
+            "raw_score": raw_score,
+            "difficult_word_pct": difficult_word_pct,
+        },
+    )
+
+
+def _get_dale_chall_grade_level(score: float) -> str:
+    """Map Dale-Chall score to grade level."""
+    if math.isnan(score):
+        return "Unknown"
+    if score < 5.0:
+        return "4 and below"
+    elif score < 6.0:
+        return "5-6"
+    elif score < 7.0:
+        return "7-8"
+    elif score < 8.0:
+        return "9-10"
+    elif score < 9.0:
+        return "11-12"
+    elif score < 10.0:
+        return "College"
+    else:
+        return "College Graduate"
+
+
-def compute_dale_chall(text: str) -> DaleChallResult:
+def compute_dale_chall(text: str, chunk_size: int = 1000) -> DaleChallResult:
     """
     Compute Dale-Chall Readability Formula.
 
-    by 80% of 4th graders). It also considers average sentence length.
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
 
-    Related GitHub
+    Related GitHub Issues:
     #16 - Additional Readability Formulas
-
+    #27 - Native chunked analysis with Distribution dataclass
 
     Formula:
         Raw Score = 0.1579 * (difficult_words_pct) + 0.0496 * (avg_sentence_length)
@@ -259,62 +1454,42 @@ def compute_dale_chall(text: str) -> DaleChallResult:
     If difficult_words_pct > 5%:
         Adjusted Score = Raw Score + 3.6365
 
-    Grade Level Correspondence:
-        4.9 or lower: Grade 4 and below
-        5.0-5.9: Grades 5-6
-        6.0-6.9: Grades 7-8
-        7.0-7.9: Grades 9-10
-        8.0-8.9: Grades 11-12
-        9.0-9.9: Grades 13-15 (College)
-        10.0+: Grade 16+ (College Graduate)
-
-    Advantages:
-        - Based on empirical word familiarity data
-        - Works well for educational materials
-        - Well-validated across grade levels
-        - Considers both vocabulary and syntax
-
-    Disadvantages:
-        - Requires maintaining 3000-word familiar list
-        - List is dated (1948, updated 1995)
-        - May not reflect modern vocabulary
-        - Doesn't account for concept difficulty
-
     Args:
-        text: Input text to analyze
+        text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        DaleChallResult
-        - dale_chall_score: The Dale-Chall readability score
-        - grade_level: Grade range (e.g., "7-8", "College")
-        - difficult_word_count: Words not on familiar list
-        - difficult_word_ratio: Difficult words / total words
-        - avg_sentence_length: Average words per sentence
-        - total_words: Total word count
-        - metadata: List of difficult words, adjusted score flag, etc.
+        DaleChallResult with dale_chall_score, grade_level, distributions, and metadata
 
     Example:
-        >>> print(f"Difficult words: {result.difficult_word_ratio * 100:.1f}%")
-        Difficult words: 12.4%
-
-    Note:
-        - Case-insensitive word matching
-        - Punctuation stripped before word lookup
-        - Proper nouns may be flagged as difficult even if well-known
-        - Technical/specialized texts score higher than general texts
+        >>> result = compute_dale_chall("Long text here...", chunk_size=1000)
+        >>> result.dale_chall_score  # Mean across chunks
+        7.3
+        >>> result.dale_chall_score_dist.std  # Variance reveals fingerprint
+        0.5
     """
+    chunks = chunk_text(text, chunk_size)
+    score_values = []
+    ratio_values = []
+    sent_len_values = []
+    total_difficult = 0
+    total_words = 0
+    total_sentences = 0
+
+    for chunk in chunks:
+        sc, diff_cnt, diff_rat, sent_len, meta = _compute_dale_chall_single(chunk)
+        if not math.isnan(sc):
+            score_values.append(sc)
+            ratio_values.append(diff_rat)
+            sent_len_values.append(sent_len)
+            total_difficult += diff_cnt
+            total_words += meta.get("word_count", 0)
+            total_sentences += meta.get("sentence_count", 0)
+
+    if not score_values:
+        empty_dist = Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
         return DaleChallResult(
             dale_chall_score=float("nan"),
             grade_level="Unknown",
@@ -322,612 +1497,562 @@ def compute_dale_chall(text: str) -> DaleChallResult:
             difficult_word_ratio=float("nan"),
             avg_sentence_length=float("nan"),
             total_words=0,
+            dale_chall_score_dist=empty_dist,
+            difficult_word_ratio_dist=empty_dist,
+            avg_sentence_length_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
                 "sentence_count": 0,
                 "raw_score": float("nan"),
                 "adjusted": False,
+                "difficult_word_pct": float("nan"),
+                "reliable": False,
             },
         )
 
-    word_lower = word.lower()
-    if word_lower not in DALE_CHALL_FAMILIAR_WORDS:
-        difficult_words.append(word)
+    score_dist = make_distribution(score_values)
+    ratio_dist = make_distribution(ratio_values)
+    sent_len_dist = make_distribution(sent_len_values)
 
+    # Calculate overall raw score and adjusted status for metadata
+    overall_difficult_pct = (total_difficult / total_words * 100) if total_words > 0 else 0.0
+    overall_raw_score = 0.1579 * overall_difficult_pct + 0.0496 * sent_len_dist.mean
+    overall_adjusted = overall_difficult_pct > 5.0
 
+    return DaleChallResult(
+        dale_chall_score=score_dist.mean,
+        grade_level=_get_dale_chall_grade_level(score_dist.mean),
+        difficult_word_count=total_difficult,
+        difficult_word_ratio=ratio_dist.mean,
+        avg_sentence_length=sent_len_dist.mean,
+        total_words=total_words,
+        dale_chall_score_dist=score_dist,
+        difficult_word_ratio_dist=ratio_dist,
+        avg_sentence_length_dist=sent_len_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "sentence_count": total_sentences,
+            "raw_score": overall_raw_score,
+            "adjusted": overall_adjusted,
+            "difficult_word_pct": overall_difficult_pct,
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_difficult_word_count": total_difficult,
+            "reliable": total_words >= 100,
+        },
+    )
 
-    # Calculate raw score
-    raw_score = 0.1579 * difficult_word_pct + 0.0496 * avg_sentence_length
-
-    dale_chall_score = raw_score
-
-    # Map score to grade level
-    if dale_chall_score < 5.0:
-        grade_level = "4 and below"
-    elif dale_chall_score < 6.0:
-        grade_level = "5-6"
-    elif dale_chall_score < 7.0:
-        grade_level = "7-8"
-    elif dale_chall_score < 8.0:
-        grade_level = "9-10"
-    elif dale_chall_score < 9.0:
-        grade_level = "11-12"
-    elif dale_chall_score < 10.0:
-        grade_level = "College"
-    else:
-        grade_level = "College Graduate"
 
|
|
1549
|
+
"""Compute Linsear Write for a single chunk."""
|
|
1550
|
+
sentences = split_sentences(text)
|
|
1551
|
+
tokens = tokenize(text)
|
|
1552
|
+
word_tokens = normalize_for_readability(tokens)
|
|
372
1553
|
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
1554
|
+
if len(sentences) == 0 or len(word_tokens) == 0:
|
|
1555
|
+
return (
|
|
1556
|
+
float("nan"),
|
|
1557
|
+
float("nan"),
|
|
1558
|
+
0,
|
|
1559
|
+
0,
|
|
1560
|
+
float("nan"),
|
|
1561
|
+
{"sentence_count": 0, "word_count": 0},
|
|
1562
|
+
)
|
|
376
1563
|
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
}
|
|
1564
|
+
easy_word_count = sum(1 for w in word_tokens if count_syllables(w) <= 2)
|
|
1565
|
+
hard_word_count = len(word_tokens) - easy_word_count
|
|
1566
|
+
weighted_sum = easy_word_count + hard_word_count * 3
|
|
1567
|
+
raw_score = weighted_sum / len(sentences)
|
|
1568
|
+
grade_level_raw = round(raw_score / 2) if raw_score > 20 else round((raw_score - 2) / 2)
|
|
1569
|
+
grade_level = max(0.0, float(grade_level_raw))
|
|
1570
|
+
avg_sentence_length = len(word_tokens) / len(sentences)
|
|
385
1571
|
|
|
386
|
-
return
|
|
387
|
-
|
|
388
|
-
grade_level
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
avg_sentence_length
|
|
392
|
-
|
|
393
|
-
metadata=metadata,
|
|
1572
|
+
return (
|
|
1573
|
+
raw_score,
|
|
1574
|
+
grade_level,
|
|
1575
|
+
easy_word_count,
|
|
1576
|
+
hard_word_count,
|
|
1577
|
+
avg_sentence_length,
|
|
1578
|
+
{"sentence_count": len(sentences), "word_count": len(word_tokens)},
|
|
394
1579
|
)
|
|
395
1580
|
|
|
396
1581
|
|
|
397
|
-
def compute_linsear_write(text: str) -> LinsearWriteResult:
|
|
1582
|
+
def compute_linsear_write(text: str, chunk_size: int = 1000) -> LinsearWriteResult:
|
|
398
1583
|
"""
|
|
399
1584
|
Compute Linsear Write Readability Formula.
|
|
400
1585
|
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
syllables) and uses sentence length to estimate grade level.
|
|
1586
|
+
This function uses native chunked analysis to capture variance and patterns
|
|
1587
|
+
across the text, which is essential for stylometric fingerprinting.
|
|
404
1588
|
|
|
405
|
-
Related GitHub
|
|
1589
|
+
Related GitHub Issues:
|
|
406
1590
|
#16 - Additional Readability Formulas
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
Formula:
|
|
410
|
-
1. Count "easy" words (1-2 syllables): multiply count by 1
|
|
411
|
-
2. Count "hard" words (3+ syllables): multiply count by 3
|
|
412
|
-
3. Divide sum by number of sentences
|
|
413
|
-
4. If result > 20, divide by 2 to get grade level
|
|
414
|
-
5. If result <= 20, subtract 2, then divide by 2
|
|
415
|
-
|
|
416
|
-
The formula is optimized for technical writing and works best with
|
|
417
|
-
passages of about 100 words.
|
|
418
|
-
|
|
419
|
-
Advantages:
|
|
420
|
-
- Simple binary classification (easy/hard)
|
|
421
|
-
- Effective for technical documents
|
|
422
|
-
- Fast computation
|
|
423
|
-
- Developed specifically for instructional materials
|
|
424
|
-
|
|
425
|
-
Disadvantages:
|
|
426
|
-
- Less well-known than other formulas
|
|
427
|
-
- Binary word classification is crude
|
|
428
|
-
- May overestimate difficulty of technical terms
|
|
429
|
-
- Limited validation compared to Flesch or Dale-Chall
|
|
1591
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
430
1592
|
|
|
431
1593
|
Args:
|
|
432
|
-
text: Input text to analyze
|
|
433
|
-
|
|
1594
|
+
text: Input text to analyze
|
|
1595
|
+
chunk_size: Number of words per chunk (default: 1000)
|
|
434
1596
|
|
|
435
1597
|
Returns:
|
|
436
|
-
LinsearWriteResult
|
|
437
|
-
- linsear_score: The Linsear Write score
|
|
438
|
-
- grade_level: Corresponding U.S. grade level (integer)
|
|
439
|
-
- easy_word_count: Words with 1-2 syllables
|
|
440
|
-
- hard_word_count: Words with 3+ syllables
|
|
441
|
-
- avg_sentence_length: Average words per sentence
|
|
442
|
-
- metadata: Calculation details, sentence count, etc.
|
|
1598
|
+
LinsearWriteResult with score, grade_level, distributions, and metadata
|
|
443
1599
|
|
|
444
1600
|
Example:
|
|
445
|
-
>>> result = compute_linsear_write("
|
|
446
|
-
>>>
|
|
447
|
-
|
|
448
|
-
>>> print(f"Grade level: {result.grade_level}")
|
|
449
|
-
Grade level: 11
|
|
450
|
-
>>> print(f"Easy words: {result.easy_word_count}")
|
|
451
|
-
Easy words: 78
|
|
452
|
-
>>> print(f"Hard words: {result.hard_word_count}")
|
|
453
|
-
Hard words: 22
|
|
454
|
-
|
|
455
|
-
Note:
|
|
456
|
-
- Syllable counting required (use existing syllable module)
|
|
457
|
-
- Punctuation and numbers typically excluded
|
|
458
|
-
- Most accurate with 100-word samples
|
|
459
|
-
- Grade level is rounded to nearest integer
|
|
1601
|
+
>>> result = compute_linsear_write("Long text here...", chunk_size=1000)
|
|
1602
|
+
>>> result.linsear_score # Mean across chunks
|
|
1603
|
+
11.3
|
|
460
1604
|
"""
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
1605
|
+
chunks = chunk_text(text, chunk_size)
|
|
1606
|
+
score_values = []
|
|
1607
|
+
grade_values = []
|
|
1608
|
+
sent_len_values = []
|
|
1609
|
+
total_easy = 0
|
|
1610
|
+
total_hard = 0
|
|
1611
|
+
total_words = 0
|
|
1612
|
+
|
|
1613
|
+
for chunk in chunks:
|
|
1614
|
+
sc, gr, easy, hard, sent_len, meta = _compute_linsear_single(chunk)
|
|
1615
|
+
if not math.isnan(sc):
|
|
1616
|
+
score_values.append(sc)
|
|
1617
|
+
grade_values.append(gr)
|
|
1618
|
+
sent_len_values.append(sent_len)
|
|
1619
|
+
total_easy += easy
|
|
1620
|
+
total_hard += hard
|
|
1621
|
+
total_words += meta.get("word_count", 0)
|
|
1622
|
+
|
|
1623
|
+
if not score_values:
|
|
1624
|
+
empty_dist = Distribution(
|
|
1625
|
+
values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
|
|
1626
|
+
)
|
|
467
1627
|
return LinsearWriteResult(
|
|
468
1628
|
linsear_score=float("nan"),
|
|
469
|
-
grade_level=
|
|
1629
|
+
grade_level=float("nan"),
|
|
470
1630
|
easy_word_count=0,
|
|
471
1631
|
hard_word_count=0,
|
|
472
1632
|
avg_sentence_length=float("nan"),
|
|
473
|
-
|
|
1633
|
+
linsear_score_dist=empty_dist,
|
|
1634
|
+
grade_level_dist=empty_dist,
|
|
1635
|
+
avg_sentence_length_dist=empty_dist,
|
|
1636
|
+
chunk_size=chunk_size,
|
|
1637
|
+
chunk_count=len(chunks),
|
|
1638
|
+
metadata={"total_words": 0, "reliable": False},
|
|
474
1639
|
)
|
|
475
1640
|
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
1641
|
+
score_dist = make_distribution(score_values)
|
|
1642
|
+
grade_dist = make_distribution(grade_values)
|
|
1643
|
+
sent_len_dist = make_distribution(sent_len_values)
|
|
479
1644
|
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
1645
|
+
return LinsearWriteResult(
|
|
1646
|
+
linsear_score=score_dist.mean,
|
|
1647
|
+
grade_level=grade_dist.mean,
|
|
1648
|
+
easy_word_count=total_easy,
|
|
1649
|
+
hard_word_count=total_hard,
|
|
1650
|
+
avg_sentence_length=sent_len_dist.mean,
|
|
1651
|
+
linsear_score_dist=score_dist,
|
|
1652
|
+
grade_level_dist=grade_dist,
|
|
1653
|
+
avg_sentence_length_dist=sent_len_dist,
|
|
1654
|
+
chunk_size=chunk_size,
|
|
1655
|
+
chunk_count=len(chunks),
|
|
1656
|
+
metadata={"total_words": total_words, "reliable": total_words >= 100},
|
|
1657
|
+
)
|
|
486
1658
|
|
|
487
|
-
# Calculate weighted sum
|
|
488
|
-
weighted_sum = (easy_word_count * 1) + (hard_word_count * 3)
|
|
489
1659
|
|
|
490
|
-
|
|
491
|
-
|
|
1660
|
+
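The Linsear Write steps above reduce to simple arithmetic; a worked example with invented counts for a roughly 100-word sample:

```python
# Hypothetical sample: 82 easy words (1-2 syllables), 18 hard words (3+), 6 sentences.
easy, hard, sentences = 82, 18, 6

weighted = easy * 1 + hard * 3          # 136
raw = weighted / sentences              # ~22.7
grade = raw / 2 if raw > 20 else (raw - 2) / 2
print(round(raw, 1), round(grade))      # 22.7 -> grade level 11
```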
+def _get_fry_grade_level(avg_sent_len: float, avg_syl_100: float) -> tuple[str, str]:
+    """Get Fry grade level and zone from coordinates."""
+    if math.isnan(avg_sent_len) or math.isnan(avg_syl_100):
+        return ("Unknown", "invalid")
+
+    if avg_syl_100 < 125:
+        if avg_sent_len < 7:
+            grade, zone = "1", "valid"
+        elif avg_sent_len < 11:
+            grade, zone = "2", "valid"
+        else:
+            grade, zone = "3", "valid"
+    elif avg_syl_100 < 135:
+        if avg_sent_len < 8:
+            grade, zone = "2", "valid"
+        elif avg_sent_len < 12:
+            grade, zone = "3", "valid"
+        else:
+            grade, zone = "4", "valid"
+    elif avg_syl_100 < 145:
+        if avg_sent_len < 9:
+            grade, zone = "3", "valid"
+        elif avg_sent_len < 13:
+            grade, zone = "5", "valid"
+        else:
+            grade, zone = "6", "valid"
+    elif avg_syl_100 < 155:
+        if avg_sent_len < 10:
+            grade, zone = "4", "valid"
+        elif avg_sent_len < 14:
+            grade, zone = "7", "valid"
+        else:
+            grade, zone = "8", "valid"
+    elif avg_syl_100 < 165:
+        if avg_sent_len < 12:
+            grade, zone = "6", "valid"
+        elif avg_sent_len < 16:
+            grade, zone = "9", "valid"
+        else:
+            grade, zone = "10", "valid"
+    elif avg_syl_100 < 175:
+        if avg_sent_len < 14:
+            grade, zone = "8", "valid"
+        elif avg_sent_len < 18:
+            grade, zone = "11", "valid"
+        else:
+            grade, zone = "12", "valid"
+    else:
+        if avg_sent_len < 16:
+            grade, zone = "10", "valid"
+        elif avg_sent_len < 20:
+            grade, zone = "College", "valid"
+        else:
+            grade, zone = "College+", "valid"
 
+    if avg_syl_100 > 185 or avg_sent_len > 25:
+        zone = "above_graph"
+    elif avg_syl_100 < 110:
+        zone = "below_graph"
 
-    avg_sentence_length = len(word_tokens) / len(sentences)
+    return (grade, zone)
 
-    # Build metadata
-    metadata = {
-        "total_words": len(word_tokens),
-        "sentence_count": len(sentences),
-        "raw_score": raw_score,
-        "weighted_sum": weighted_sum,
-    }
 
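The Fry lookup above only needs the two graph coordinates: average sentence length and syllables per 100 words. A self-contained sketch of computing them for a short passage, using a naive vowel-group counter as a simplified stand-in for the package's `count_syllables`:

```python
import re

def rough_syllables(word: str) -> int:
    # Simplified stand-in for count_syllables(): count vowel groups, minimum 1.
    return max(1, len(re.findall(r"[aeiouy]+", word.lower())))

passage = ("The cat sat on the mat. It was warm in the sun. "
           "The dog ran past the old gate and barked loudly.")
sentences = [s for s in re.split(r"[.!?]+", passage) if s.strip()]
words = re.findall(r"[A-Za-z']+", passage)

avg_sentence_length = len(words) / len(sentences)
syllables_per_100 = sum(rough_syllables(w) for w in words) / len(words) * 100
print(round(avg_sentence_length, 1), round(syllables_per_100, 1))
# These two numbers are what _get_fry_grade_level() maps to a grade and a zone.
```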
def _compute_fry_single(text: str) -> tuple[float, float, dict]:
|
|
1724
|
+
"""Compute Fry for a single chunk. Returns (avg_sent_len, avg_syl_100, meta)."""
|
|
1725
|
+
sentences = split_sentences(text)
|
|
1726
|
+
tokens = tokenize(text)
|
|
1727
|
+
word_tokens = normalize_for_readability(tokens)
|
|
1728
|
+
|
|
1729
|
+
if len(sentences) == 0 or len(word_tokens) == 0:
|
|
1730
|
+
return (
|
|
1731
|
+
float("nan"),
|
|
1732
|
+
float("nan"),
|
|
1733
|
+
{"sentence_count": 0, "word_count": 0, "syllable_count": 0, "sample_size": 0},
|
|
1734
|
+
)
|
|
1735
|
+
|
|
1736
|
+
sample_size = min(100, len(word_tokens))
|
|
1737
|
+
sample_tokens = word_tokens[:sample_size]
|
|
1738
|
+
total_syllables = sum(count_syllables(w) for w in sample_tokens)
|
|
1739
|
+
|
|
1740
|
+
word_count_so_far = 0
|
|
1741
|
+
sentences_in_sample = 0
|
|
1742
|
+
for sent in sentences:
|
|
1743
|
+
sent_tokens = normalize_for_readability(tokenize(sent))
|
|
1744
|
+
if word_count_so_far + len(sent_tokens) <= sample_size:
|
|
1745
|
+
sentences_in_sample += 1
|
|
1746
|
+
word_count_so_far += len(sent_tokens)
|
|
1747
|
+
else:
|
|
1748
|
+
if word_count_so_far < sample_size:
|
|
1749
|
+
sentences_in_sample += 1
|
|
1750
|
+
break
|
|
1751
|
+
|
|
1752
|
+
sentences_in_sample = max(1, sentences_in_sample)
|
|
1753
|
+
avg_sentence_length = sample_size / sentences_in_sample
|
|
1754
|
+
avg_syllables_per_100 = (total_syllables / sample_size) * 100
|
|
1755
|
+
|
|
1756
|
+
return (
|
|
1757
|
+
avg_sentence_length,
|
|
1758
|
+
avg_syllables_per_100,
|
|
1759
|
+
{
|
|
1760
|
+
"sentence_count": len(sentences),
|
|
1761
|
+
"word_count": len(word_tokens),
|
|
1762
|
+
"syllable_count": total_syllables,
|
|
1763
|
+
"sample_size": sample_size,
|
|
1764
|
+
},
|
|
520
1765
|
)
|
|
521
1766
|
|
|
522
1767
|
|
|
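The coordinate lookup added in `_get_fry_grade_level` above is easiest to sanity-check with concrete numbers. Below is a minimal standalone sketch of just one band of that lookup (135–145 syllables per 100 words); the coordinates are hypothetical and this is not the packaged helper itself.

```python
# Sketch of a single band of the Fry lookup added above (135 <= syl/100 < 145).
# Hypothetical coordinates; mirrors the thresholds in _get_fry_grade_level.
def fry_band_135_145(avg_sent_len: float) -> tuple[str, str]:
    if avg_sent_len < 9:
        return ("3", "valid")
    elif avg_sent_len < 13:
        return ("5", "valid")
    return ("6", "valid")

print(fry_band_135_145(10.2))  # ('5', 'valid') -> roughly 5th-grade material
```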
-def compute_fry(text: str) -> FryResult:
+def compute_fry(text: str, chunk_size: int = 1000) -> FryResult:
     """
     Compute Fry Readability Graph metrics.

-
-
-    provides the numerical coordinates and estimated grade level.
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.

-    Related GitHub
+    Related GitHub Issues:
         #16 - Additional Readability Formulas
-
-
-    Method:
-        1. Select three 100-word samples from text
-        2. Count average sentence length across samples
-        3. Count average syllables per 100 words across samples
-        4. Plot coordinates on Fry graph (or use numerical approximation)
-        5. Determine grade level from graph zone
-
-    The original Fry graph has zones corresponding to grade levels 1-17+.
-    This implementation uses numerical approximation to estimate grade level.
-
-    Advantages:
-        - Visual/graphical approach (intuitive)
-        - Uses two independent dimensions (length & syllables)
-        - Well-validated for educational materials
-        - Covers wide range of grade levels (1-17+)
-
-    Disadvantages:
-        - Requires exactly 100-word samples (padding/truncation needed)
-        - Graph reading can be subjective
-        - Less precise than formula-based methods
-        - Multiple samples needed for reliability
+        #27 - Native chunked analysis with Distribution dataclass

     Args:
-        text: Input text to analyze
-
+        text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)

     Returns:
-        FryResult
-            - avg_sentence_length: Average words per sentence
-            - avg_syllables_per_100: Average syllables per 100 words
-            - grade_level: Estimated grade level (e.g., "5", "7", "College")
-            - graph_zone: Which zone of Fry graph (for validity checking)
-            - metadata: Sample details, total sentences, syllables, etc.
+        FryResult with avg_sentence_length, avg_syllables_per_100, distributions, and metadata

     Example:
-        >>> result = compute_fry("
-        >>>
-
-        >>> print(f"Syllables/100 words: {result.avg_syllables_per_100:.1f}")
-        Syllables/100 words: 142.7
-        >>> print(f"Grade level: {result.grade_level}")
-        Grade level: 6
-
-    Note:
-        - Original method uses three 100-word samples
-        - Implementation may use single sample or whole text
-        - Syllable counting required
-        - Grade level estimation uses zone boundaries
-        - Some texts fall outside graph zones (marked as invalid)
+        >>> result = compute_fry("Long text here...", chunk_size=1000)
+        >>> result.avg_sentence_length  # Mean across chunks
+        14.3
     """
-
-
-
-
-
-
+    chunks = chunk_text(text, chunk_size)
+    sent_len_values = []
+    syl_100_values = []
+    total_words = 0
+    total_sentences = 0
+    total_syllables = 0
+
+    for chunk in chunks:
+        sent_len, syl_100, meta = _compute_fry_single(chunk)
+        if not math.isnan(sent_len):
+            sent_len_values.append(sent_len)
+            syl_100_values.append(syl_100)
+            total_words += meta.get("word_count", 0)
+            total_sentences += meta.get("sentence_count", 0)
+            total_syllables += meta.get("syllable_count", 0)
+
+    if not sent_len_values:
+        empty_dist = Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
         return FryResult(
             avg_sentence_length=float("nan"),
             avg_syllables_per_100=float("nan"),
             grade_level="Unknown",
             graph_zone="invalid",
-
-
-
-
-
-            },
+            avg_sentence_length_dist=empty_dist,
+            avg_syllables_per_100_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={"total_sentences": 0, "total_words": 0, "sample_size": 0, "reliable": False},
         )

-
-
-
+    sent_len_dist = make_distribution(sent_len_values)
+    syl_100_dist = make_distribution(syl_100_values)
+    grade_level, graph_zone = _get_fry_grade_level(sent_len_dist.mean, syl_100_dist.mean)

-    #
-
+    # Calculate sample size (min of 100 or total_words for overall)
+    sample_size = min(100, total_words)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    return FryResult(
+        avg_sentence_length=sent_len_dist.mean,
+        avg_syllables_per_100=syl_100_dist.mean,
+        grade_level=grade_level,
+        graph_zone=graph_zone,
+        avg_sentence_length_dist=sent_len_dist,
+        avg_syllables_per_100_dist=syl_100_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "total_sentences": total_sentences,
+            "total_words": total_words,
+            "total_syllables": total_syllables,
+            "sample_size": sample_size,
+            "reliable": total_words >= 100,
+        },
+    )

-    # Ensure at least 1 sentence for division
-    sentences_in_sample = max(1, sentences_in_sample)

-
-
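A short usage sketch for the chunked `compute_fry` shown in this hunk. The import path follows the file's location in the wheel, the input string is a placeholder, and the attribute names are the ones constructed above; with real multi-hundred-word input the distributions carry per-chunk values rather than a single sample.

```python
# Usage sketch, assuming the package from this wheel is installed.
# "Long text here..." is a placeholder; supply a few hundred words of real text.
from pystylometry.readability.additional_formulas import compute_fry

result = compute_fry("Long text here...", chunk_size=1000)

print(result.grade_level, result.graph_zone)      # e.g. "6", "valid"
print(result.avg_sentence_length_dist.mean,       # per-chunk Distribution fields
      result.avg_sentence_length_dist.std)
print(result.metadata["reliable"])                # False for texts under 100 words
```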
+def _compute_forcast_single(text: str) -> tuple[float, float, int, float, dict]:
+    """Compute FORCAST for a single chunk."""
+    tokens = tokenize(text)
+    word_tokens = normalize_for_readability(tokens)

-
-
+    if len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            0,
+            float("nan"),
+            {"word_count": 0, "sample_size": 0, "scaled_n": 0.0},
+        )

-
-
-
-
-
-
-
-
-
-    # Simplified zone mapping:
-    if avg_syllables_per_100 < 125:
-        if avg_sentence_length < 7:
-            grade_level = "1"
-            graph_zone = "valid"
-        elif avg_sentence_length < 11:
-            grade_level = "2"
-            graph_zone = "valid"
-        else:
-            grade_level = "3"
-            graph_zone = "valid"
-    elif avg_syllables_per_100 < 135:
-        if avg_sentence_length < 8:
-            grade_level = "2"
-            graph_zone = "valid"
-        elif avg_sentence_length < 12:
-            grade_level = "3"
-            graph_zone = "valid"
-        else:
-            grade_level = "4"
-            graph_zone = "valid"
-    elif avg_syllables_per_100 < 145:
-        if avg_sentence_length < 9:
-            grade_level = "3"
-            graph_zone = "valid"
-        elif avg_sentence_length < 13:
-            grade_level = "5"
-            graph_zone = "valid"
-        else:
-            grade_level = "6"
-            graph_zone = "valid"
-    elif avg_syllables_per_100 < 155:
-        if avg_sentence_length < 10:
-            grade_level = "4"
-            graph_zone = "valid"
-        elif avg_sentence_length < 14:
-            grade_level = "7"
-            graph_zone = "valid"
-        else:
-            grade_level = "8"
-            graph_zone = "valid"
-    elif avg_syllables_per_100 < 165:
-        if avg_sentence_length < 12:
-            grade_level = "6"
-            graph_zone = "valid"
-        elif avg_sentence_length < 16:
-            grade_level = "9"
-            graph_zone = "valid"
-        else:
-            grade_level = "10"
-            graph_zone = "valid"
-    elif avg_syllables_per_100 < 175:
-        if avg_sentence_length < 14:
-            grade_level = "8"
-            graph_zone = "valid"
-        elif avg_sentence_length < 18:
-            grade_level = "11"
-            graph_zone = "valid"
-        else:
-            grade_level = "12"
-            graph_zone = "valid"
-    else:  # avg_syllables_per_100 >= 175
-        if avg_sentence_length < 16:
-            grade_level = "10"
-            graph_zone = "valid"
-        elif avg_sentence_length < 20:
-            grade_level = "College"
-            graph_zone = "valid"
-        else:
-            grade_level = "College+"
-            graph_zone = "valid"
-
-    # Check if outside typical graph bounds
-    if avg_syllables_per_100 > 185 or avg_sentence_length > 25:
-        graph_zone = "above_graph"
-    elif avg_syllables_per_100 < 110:
-        graph_zone = "below_graph"
-
-    # Build metadata
-    metadata = {
-        "total_sentences": len(sentences),
-        "total_syllables": sum(count_syllables(w) for w in word_tokens),
-        "total_words": len(word_tokens),
-        "sample_size": sample_size,
-        "sentences_in_sample": sentences_in_sample,
-        "syllables_in_sample": total_syllables,
-    }
+    sample_size = min(150, len(word_tokens))
+    sample_tokens = word_tokens[:sample_size]
+    single_syllable_count = sum(1 for w in sample_tokens if count_syllables(w) == 1)
+    scaled_n = (
+        single_syllable_count * (150 / sample_size) if sample_size < 150 else single_syllable_count
+    )
+    forcast_score = 20 - (scaled_n / 10)
+    grade_level = float(max(0, min(20, round(forcast_score))))
+    single_syllable_ratio = single_syllable_count / sample_size

-    return
-
-
-
-
-
+    return (
+        forcast_score,
+        grade_level,
+        single_syllable_count,
+        single_syllable_ratio,
+        {"word_count": len(word_tokens), "sample_size": sample_size, "scaled_n": scaled_n},
     )


-def compute_forcast(text: str) -> FORCASTResult:
+def compute_forcast(text: str, chunk_size: int = 1000) -> FORCASTResult:
     """
     Compute FORCAST Readability Formula.

-
-
-    single-syllable words as its metric, making it fast and simple.
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.

-    Related GitHub
+    Related GitHub Issues:
         #16 - Additional Readability Formulas
-
+        #27 - Native chunked analysis with Distribution dataclass

     Formula:
         Grade Level = 20 - (N / 10)
-
     Where N is the number of single-syllable words in a 150-word sample.

-    The formula is optimized for technical and military documents and works
-    best with standardized 150-word samples.
-
-    Advantages:
-        - Extremely simple (only counts single-syllable words)
-        - No sentence segmentation required
-        - Fast computation
-        - Developed specifically for military/technical texts
-
-    Disadvantages:
-        - Less well-known and validated than other formulas
-        - Requires exactly 150-word samples
-        - Single dimension (doesn't consider sentence length)
-        - May not generalize well beyond military context
-
     Args:
-        text: Input text to analyze
-
-        Longer texts use first 150 words or multiple samples.
+        text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)

     Returns:
-        FORCASTResult
-            - forcast_score: The FORCAST readability score
-            - grade_level: Corresponding U.S. grade level (integer)
-            - single_syllable_ratio: Single-syllable words / total words
-            - single_syllable_count: Count of single-syllable words
-            - total_words: Total word count analyzed
-            - metadata: Sample details, calculation specifics, etc.
+        FORCASTResult with score, grade_level, distributions, and metadata

     Example:
-        >>> result = compute_forcast("
-        >>>
-
-        >>> print(f"Grade level: {result.grade_level}")
-        Grade level: 10
-        >>> print(f"Single-syllable ratio: {result.single_syllable_ratio:.3f}")
-        Single-syllable ratio: 0.687
-
-    Note:
-        - Syllable counting required (but only to identify 1-syllable words)
-        - Recommended sample size is 150 words
-        - Multiple samples can be averaged for longer texts
-        - Simpler than most readability formulas
-        - Grade levels typically range from 5-12
+        >>> result = compute_forcast("Long text here...", chunk_size=1000)
+        >>> result.forcast_score  # Mean across chunks
+        9.7
     """
-
-
-
-
-
+    chunks = chunk_text(text, chunk_size)
+    score_values = []
+    grade_values = []
+    ratio_values = []
+    total_single = 0
+    total_words = 0
+
+    for chunk in chunks:
+        sc, gr, single_cnt, single_rat, meta = _compute_forcast_single(chunk)
+        if not math.isnan(sc):
+            score_values.append(sc)
+            grade_values.append(gr)
+            ratio_values.append(single_rat)
+            total_single += single_cnt
+            total_words += meta.get("word_count", 0)
+
+    if not score_values:
+        empty_dist = Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
         return FORCASTResult(
             forcast_score=float("nan"),
-            grade_level=
+            grade_level=float("nan"),
             single_syllable_ratio=float("nan"),
             single_syllable_count=0,
             total_words=0,
-
+            forcast_score_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            single_syllable_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={"sample_size": 0, "scaled_n": 0.0, "reliable": False},
         )

-
-
-
-
-    # Count single-syllable words in sample
-    single_syllable_count = 0
-    for word in sample_tokens:
-        if count_syllables(word) == 1:
-            single_syllable_count += 1
+    score_dist = make_distribution(score_values)
+    grade_dist = make_distribution(grade_values)
+    ratio_dist = make_distribution(ratio_values)

-    #
-
-
-
-
+    # Calculate overall sample_size and scaled_n for metadata
+    overall_sample_size = min(150, total_words)
+    overall_scaled_n = (
+        total_single * (150 / overall_sample_size)
+        if overall_sample_size < 150
+        else float(total_single)
+    )

-
-
-
+    return FORCASTResult(
+        forcast_score=score_dist.mean,
+        grade_level=grade_dist.mean,
+        single_syllable_ratio=ratio_dist.mean,
+        single_syllable_count=total_single,
+        total_words=total_words,
+        forcast_score_dist=score_dist,
+        grade_level_dist=grade_dist,
+        single_syllable_ratio_dist=ratio_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "sample_size": overall_sample_size,
+            "scaled_n": overall_scaled_n,
+            "reliable": total_words >= 100,
+        },
+    )

-    # Ensure grade level is in reasonable range (0-20)
-    grade_level = max(0, min(20, grade_level))

-
-
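The per-chunk FORCAST arithmetic above is small enough to verify by hand. A worked sketch with hypothetical counts (a 120-word chunk containing 78 one-syllable words, so the count is scaled up to the canonical 150-word sample):

```python
# Worked FORCAST arithmetic mirroring _compute_forcast_single above.
# Hypothetical counts: 120-word sample, 78 single-syllable words.
sample_size = 120
single_syllable_count = 78

scaled_n = single_syllable_count * (150 / sample_size)        # 97.5
forcast_score = 20 - (scaled_n / 10)                          # 10.25
grade_level = float(max(0, min(20, round(forcast_score))))    # 10.0
single_syllable_ratio = single_syllable_count / sample_size   # 0.65

print(forcast_score, grade_level, single_syllable_ratio)
```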
+def _compute_psk_single(text: str) -> tuple[float, float, float, float, int, dict]:
+    """Compute PSK for a single chunk."""
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+    word_tokens = normalize_for_readability(tokens)

-
-
-
-
-
-
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            float("nan"),
+            float("nan"),
+            0,
+            {"sentence_count": 0, "word_count": 0},
+        )

-
-
-
-
-
-
-
+    total_syllables = sum(count_syllables(w) for w in word_tokens)
+    avg_sentence_length = len(word_tokens) / len(sentences)
+    avg_syllables_per_word = total_syllables / len(word_tokens)
+    psk_score = 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
+    grade_level = round(psk_score, 1)
+
+    return (
+        psk_score,
+        grade_level,
+        avg_sentence_length,
+        avg_syllables_per_word,
+        total_syllables,
+        {"sentence_count": len(sentences), "word_count": len(word_tokens)},
     )

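The per-chunk Powers-Sumner-Kearl computation above likewise reduces to two averages and a weighted sum. A worked sketch with hypothetical counts; note that, with the coefficients applied to syllables per word as written above, the score stays low and can go negative for ordinary prose.

```python
# Worked Powers-Sumner-Kearl arithmetic mirroring _compute_psk_single above.
# Hypothetical counts: 400 words, 25 sentences, 560 syllables.
words, sentence_count, syllables = 400, 25, 560

avg_sentence_length = words / sentence_count        # 16.0
avg_syllables_per_word = syllables / words          # 1.4
psk_score = 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029
grade_level = round(psk_score, 1)

print(psk_score, grade_level)  # about -0.894 and -0.9 with these inputs
```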
-def compute_powers_sumner_kearl(text: str) -> PowersSumnerKearlResult:
+def compute_powers_sumner_kearl(text: str, chunk_size: int = 1000) -> PowersSumnerKearlResult:
     """
     Compute Powers-Sumner-Kearl Readability Formula.

-
-
-    It uses the same inputs (sentence length, syllables per word) but with
-    different coefficients.
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.

-    Related GitHub
+    Related GitHub Issues:
         #16 - Additional Readability Formulas
-
+        #27 - Native chunked analysis with Distribution dataclass

     Formula:
         Grade Level = 0.0778 * avg_sentence_length + 0.0455 * avg_syllables_per_word - 2.2029

-    The formula was derived from analysis of primary-grade texts and provides
-    more accurate grade-level estimates for beginning readers than the original
-    Flesch formula.
-
-    Advantages:
-        - Optimized for primary grades (1-4)
-        - More accurate than Flesch for young readers
-        - Uses same inputs as Flesch (easy to compare)
-        - Well-validated on educational materials
-
-    Disadvantages:
-        - Less accurate for higher grade levels
-        - Less well-known than Flesch
-        - Limited range (not suitable for college-level texts)
-        - Requires syllable counting
-
     Args:
-        text: Input text to analyze
-
-        NaN values.
+        text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)

     Returns:
-        PowersSumnerKearlResult
-            - psk_score: The Powers-Sumner-Kearl score
-            - grade_level: Corresponding grade (decimal, e.g., 2.5 = mid-2nd grade)
-            - avg_sentence_length: Average words per sentence
-            - avg_syllables_per_word: Average syllables per word
-            - total_sentences: Total sentence count
-            - total_words: Total word count
-            - total_syllables: Total syllable count
-            - metadata: Comparison to Flesch, calculation details, etc.
+        PowersSumnerKearlResult with score, grade_level, distributions, and metadata

     Example:
-        >>> result = compute_powers_sumner_kearl("
-        >>>
-
-        >>> print(f"Grade level: {result.grade_level:.1f}")
-        Grade level: 2.3
-        >>> print(f"Avg sentence length: {result.avg_sentence_length:.1f}")
-        Avg sentence length: 8.5
-
-    Note:
-        - Most accurate for grades 1-4
-        - Can produce negative scores for very simple texts
-        - Grade level is continuous (can be decimal)
-        - Syllable counting required (same as Flesch)
-        - Compare to Flesch results for validation
+        >>> result = compute_powers_sumner_kearl("Long text here...", chunk_size=1000)
+        >>> result.psk_score  # Mean across chunks
+        2.3
     """
-
-
-
-
-
-
+    chunks = chunk_text(text, chunk_size)
+    score_values = []
+    grade_values = []
+    sent_len_values = []
+    syl_per_word_values = []
+    total_sentences = 0
+    total_words = 0
+    total_syllables = 0
+
+    for chunk in chunks:
+        sc, gr, sent_len, syl_word, syls, meta = _compute_psk_single(chunk)
+        if not math.isnan(sc):
+            score_values.append(sc)
+            grade_values.append(gr)
+            sent_len_values.append(sent_len)
+            syl_per_word_values.append(syl_word)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_syllables += syls
+
+    if not score_values:
+        empty_dist = Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
         return PowersSumnerKearlResult(
             psk_score=float("nan"),
             grade_level=float("nan"),
@@ -936,50 +2061,50 @@ def compute_powers_sumner_kearl(text: str) -> PowersSumnerKearlResult:
             total_sentences=0,
             total_words=0,
             total_syllables=0,
+            psk_score_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            avg_sentence_length_dist=empty_dist,
+            avg_syllables_per_word_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
                 "flesch_reading_ease": float("nan"),
                 "flesch_kincaid_grade": float("nan"),
+                "difference_from_flesch": float("nan"),
+                "reliable": False,
             },
         )

-
-
-
-
-    avg_sentence_length = len(word_tokens) / len(sentences)
-    avg_syllables_per_word = total_syllables / len(word_tokens)
+    score_dist = make_distribution(score_values)
+    grade_dist = make_distribution(grade_values)
+    sent_len_dist = make_distribution(sent_len_values)
+    syl_word_dist = make_distribution(syl_per_word_values)

-    #
-    #
-
-
-
-
-
-    # Optional: Calculate Flesch scores for comparison
-    flesch_reading_ease = (
-        206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables_per_word
-    )
-    flesch_kincaid_grade = (
-        0.39 * avg_sentence_length + 11.8 * avg_syllables_per_word - 15.59
-    )
-
-    # Build metadata
-    metadata = {
-        "flesch_reading_ease": flesch_reading_ease,
-        "flesch_kincaid_grade": flesch_kincaid_grade,
-        "difference_from_flesch": psk_score - flesch_kincaid_grade,
-        "words_per_sentence": avg_sentence_length,
-        "syllables_per_word": avg_syllables_per_word,
-    }
+    # Compute Flesch metrics for comparison (using the same avg values)
+    # Flesch Reading Ease: 206.835 - 1.015 * ASL - 84.6 * ASW
+    # Flesch-Kincaid Grade: 0.39 * ASL + 11.8 * ASW - 15.59
+    flesch_reading_ease = 206.835 - 1.015 * sent_len_dist.mean - 84.6 * syl_word_dist.mean
+    flesch_kincaid_grade = 0.39 * sent_len_dist.mean + 11.8 * syl_word_dist.mean - 15.59
+    difference_from_flesch = grade_dist.mean - flesch_kincaid_grade

     return PowersSumnerKearlResult(
-        psk_score=
-        grade_level=
-        avg_sentence_length=
-        avg_syllables_per_word=
-        total_sentences=
-        total_words=
+        psk_score=score_dist.mean,
+        grade_level=grade_dist.mean,
+        avg_sentence_length=sent_len_dist.mean,
+        avg_syllables_per_word=syl_word_dist.mean,
+        total_sentences=total_sentences,
+        total_words=total_words,
         total_syllables=total_syllables,
-
+        psk_score_dist=score_dist,
+        grade_level_dist=grade_dist,
+        avg_sentence_length_dist=sent_len_dist,
+        avg_syllables_per_word_dist=syl_word_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "flesch_reading_ease": flesch_reading_ease,
+            "flesch_kincaid_grade": flesch_kincaid_grade,
+            "difference_from_flesch": difference_from_flesch,
+            "reliable": total_words >= 100,
+        },
     )