textstat 0.1.9 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/counter.rb +6 -6
- data/lib/dictionaries/da.txt +2000 -0
- data/lib/dictionaries/de.txt +2000 -0
- data/lib/dictionaries/en_uk.txt +2945 -0
- data/lib/dictionaries/es.txt +2000 -0
- data/lib/dictionaries/et.txt +2000 -0
- data/lib/dictionaries/fi.txt +2000 -0
- data/lib/dictionaries/fr.txt +2000 -0
- data/lib/dictionaries/hr.txt +1980 -0
- data/lib/dictionaries/hu.txt +2000 -0
- data/lib/dictionaries/id.txt +2000 -0
- data/lib/dictionaries/is.txt +2000 -0
- data/lib/dictionaries/it.txt +2000 -0
- data/lib/dictionaries/la.txt +2000 -0
- data/lib/dictionaries/no2.txt +2000 -0
- data/lib/dictionaries/pl.txt +2000 -0
- data/lib/dictionaries/pt.txt +2000 -0
- data/lib/dictionaries/ru.txt +2000 -0
- data/lib/dictionaries/sv.txt +2000 -0
- data/lib/textstat/basic_stats.rb +156 -0
- data/lib/textstat/dictionary_manager.rb +156 -0
- data/lib/textstat/main.rb +137 -0
- data/lib/textstat/readability_formulas.rb +363 -0
- data/lib/textstat/version.rb +21 -2
- data/lib/textstat.rb +36 -313
- metadata +217 -21
- data/spec/textstat_spec.rb +0 -197
@@ -0,0 +1,363 @@
|
|
1
|
+
module TextStat
|
2
|
+
# Readability formulas and text difficulty calculations
|
3
|
+
#
|
4
|
+
# This module implements various readability formulas used to determine
|
5
|
+
# the reading level and complexity of text. Each formula uses different
|
6
|
+
# metrics and is suitable for different types of content and audiences.
|
7
|
+
#
|
8
|
+
# @author Jakub Polak
|
9
|
+
# @since 1.0.0
|
10
|
+
# @example Basic readability analysis
|
11
|
+
# text = "This is a sample text for readability analysis."
|
12
|
+
# TextStat.flesch_reading_ease(text) # => 83.32
|
13
|
+
# TextStat.flesch_kincaid_grade(text) # => 3.7
|
14
|
+
# TextStat.text_standard(text) # => "3rd and 4th grade"
|
15
|
+
#
|
16
|
+
# @example Multi-language support
|
17
|
+
# TextStat.flesch_reading_ease(spanish_text, 'es')
|
18
|
+
# TextStat.smog_index(french_text, 'fr')
|
19
|
+
# TextStat.gunning_fog(german_text, 'de')
|
20
|
+
module ReadabilityFormulas
|
21
|
+
# Compute the Flesch Reading Ease score for a text.
#
# The score is
#   206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables_per_word
# rounded to two decimals. Higher scores indicate easier text. The value is
# not clamped here, so it can exceed 100 or drop below 0.
#
# Conventional interpretation bands:
# - 90-100: Very Easy
# - 80-89: Easy
# - 70-79: Fairly Easy
# - 60-69: Standard
# - 50-59: Fairly Difficult
# - 30-49: Difficult
# - 0-29: Very Difficult
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to the syllable counter
# @return [Float] Flesch Reading Ease score rounded to 2 decimal places
# @example
#   TextStat.flesch_reading_ease("The cat sat on the mat.")
#   TextStat.flesch_reading_ease(spanish_text, 'es')
def flesch_reading_ease(text, language = 'en_us')
  words_per_sentence = avg_sentence_length(text)
  syllables_each = avg_syllables_per_word(text, language)
  (206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_each)).round(2)
end
|
47
|
+
|
48
|
+
# Compute the Flesch-Kincaid Grade Level for a text.
#
# Uses the same two inputs as Flesch Reading Ease (average sentence length
# and average syllables per word) with different weights:
#   0.39 * avg_sentence_length + 11.8 * avg_syllables_per_word - 15.59
# The result is rounded to one decimal and is meant to be read as a
# U.S. school grade.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to the syllable counter
# @return [Float] grade level rounded to 1 decimal place
# @example
#   TextStat.flesch_kincaid_grade("Simple text.")
def flesch_kincaid_grade(text, language = 'en_us')
  words_per_sentence = avg_sentence_length(text)
  syllables_each = avg_syllables_per_word(text, language)
  ((0.39 * words_per_sentence) + (11.8 * syllables_each) - 15.59).round(1)
end
|
65
|
+
|
66
|
+
# Compute the SMOG Index (Simple Measure of Gobbledygook).
#
# The grade is
#   1.043 * sqrt(30 * polysyllable_count / sentence_count) + 3.1291
# rounded to one decimal. Texts for which sentence_count reports fewer than
# three sentences are not scored and yield 0.0.
#
# @param text [String] the text to analyze (at least 3 sentences)
# @param language [String] language code forwarded to the syllable counter
# @return [Float] SMOG grade level, or 0.0 for texts under 3 sentences
# @example
#   TextStat.smog_index("The quick brown fox jumps. It is fast. Very agile.")
def smog_index(text, language = 'en_us')
  total_sentences = sentence_count(text)
  return 0.0 if total_sentences < 3

  hard_words = polysyllab_count(text, language)
  ((1.043 * Math.sqrt((30.0 * hard_words) / total_sentences)) + 3.1291).round(1)
rescue ZeroDivisionError
  0.0
end
|
90
|
+
|
91
|
+
# Compute the Coleman-Liau Index.
#
# Works from letter and sentence densities rather than syllables:
#   0.0588 * L - 0.296 * S - 15.8
# where L is avg_letter_per_word * 100 and S is avg_sentence_per_word * 100,
# each rounded to two decimals before being combined. The final value is
# rounded to two decimals as well.
#
# @param text [String] the text to analyze
# @return [Float] Coleman-Liau grade level rounded to 2 decimal places
# @example
#   TextStat.coleman_liau_index("Short words are easy to read.")
def coleman_liau_index(text)
  letters_per_100 = (avg_letter_per_word(text) * 100).round(2)
  sentences_per_100 = (avg_sentence_per_word(text) * 100).round(2)
  ((0.0588 * letters_per_100) - (0.296 * sentences_per_100) - 15.8).round(2)
end
|
107
|
+
|
108
|
+
# Compute the Automated Readability Index (ARI).
#
# The grade is
#   4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
# rounded to one decimal.
#
# @param text [String] the text to analyze
# @return [Float] ARI grade level, or 0.0 when the text has no words or
#   no sentences
# @example
#   TextStat.automated_readability_index("This text is easy to read.")
def automated_readability_index(text)
  chars = char_count(text)
  words = lexicon_count(text)
  sentences = sentence_count(text)

  # Float division by zero yields NaN/Infinity instead of raising, so the
  # empty-input case must be checked explicitly to get the 0.0 fallback.
  return 0.0 if words.zero? || sentences.zero?

  chars_per_word = chars.to_f / words
  words_per_sentence = words.to_f / sentences
  ((4.71 * chars_per_word) + (0.5 * words_per_sentence) - 21.43).round(1)
rescue ZeroDivisionError
  # Kept for backward compatibility in case a counting helper raises.
  0.0
end
|
129
|
+
|
130
|
+
# Compute the Linsear Write grade level.
#
# Works on a sample of the first 100 whitespace-separated words. Words with
# fewer than three syllables score 1 point, all others score 3 points. The
# point total is divided by the number of sentences in the sample; if that
# quotient is 20 or less, 2 is subtracted, and the result is halved.
# The value is returned unrounded.
#
# @param text [String] the text to analyze (only the first 100 words are used)
# @param language [String] language code forwarded to the syllable counter
# @return [Float] Linsear Write grade level, or 0.0 when the sample contains
#   no sentences according to sentence_count
# @example
#   TextStat.linsear_write_formula("Technical documentation analysis.")
def linsear_write_formula(text, language = 'en_us')
  # first(100) takes exactly 100 words; a 0..100 range would take 101.
  sample = text.split.first(100)
  hard, easy = sample.partition { |word| syllable_count(word, language) >= 3 }

  sentences = sentence_count(sample.join(' '))
  return 0.0 if sentences.zero?

  number = (easy.size + (hard.size * 3)).to_f / sentences
  number -= 2 if number <= 20
  number / 2
end
|
158
|
+
|
159
|
+
# Compute the Dale-Chall readability score.
#
# The score is
#   0.1579 * (percentage of difficult words) + 0.0496 * avg_sentence_length
# plus an adjustment of 3.6365 when more than 5% of the words are difficult.
# The number of difficult words comes from difficult_words for the given
# language. The result is rounded to two decimals.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to difficult_words
# @return [Float] Dale-Chall score, or 0.0 when the text has no words
# @example
#   TextStat.dale_chall_readability_score("Simple story for children.")
def dale_chall_readability_score(text, language = 'en_us')
  word_count = lexicon_count(text)

  # Float division by zero produces NaN rather than raising, so an empty
  # text has to be handled before the percentage is computed.
  return 0.0 if word_count.zero?

  familiar_count = word_count - difficult_words(text, language)
  familiar_percentage = (100.0 * familiar_count) / word_count
  difficult_words_percentage = 100 - familiar_percentage

  score = (0.1579 * difficult_words_percentage) + (0.0496 * avg_sentence_length(text))
  score += 3.6365 if difficult_words_percentage > 5
  score.round(2)
rescue ZeroDivisionError
  # Kept for backward compatibility in case a counting helper raises.
  0.0
end
|
182
|
+
|
183
|
+
# Compute the Gunning Fog index.
#
# The grade is
#   0.4 * (avg_sentence_length + percentage of difficult words + 5)
# rounded to two decimals. The difficult-word count comes from
# difficult_words for the given language.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to difficult_words
# @return [Float] Gunning Fog grade level, or 0.0 when the text has no words
# @example
#   TextStat.gunning_fog("Business communication analysis.")
def gunning_fog(text, language = 'en_us')
  total_words = lexicon_count(text)

  # Float division by zero produces NaN rather than raising, so an empty
  # text has to be handled explicitly to get the 0.0 fallback.
  return 0.0 if total_words.zero?

  per_diff_words = ((100.0 * difficult_words(text, language)) / total_words) + 5
  (0.4 * (avg_sentence_length(text) + per_diff_words)).round(2)
rescue ZeroDivisionError
  # Kept for backward compatibility in case a counting helper raises.
  0.0
end
|
200
|
+
|
201
|
+
# Compute the LIX readability score.
#
# LIX (Läsbarhetsindex) is the average sentence length plus the percentage
# of long words, where a long word is a whitespace-separated token of more
# than six characters. The result is rounded to two decimals.
#
# @param text [String] the text to analyze
# @return [Float] LIX score, or 0.0 when the text contains no words
# @example
#   TextStat.lix("International readability measurement.")
def lix(text)
  words = text.split

  # Without words the long-word percentage would be 0.0 / 0 (NaN).
  return 0.0 if words.empty?

  long_word_percentage = (100.0 * words.count { |word| word.length > 6 }) / words.length
  (avg_sentence_length(text) + long_word_percentage).round(2)
end
|
220
|
+
|
221
|
+
# Compute the FORCAST grade level.
#
# Counts the single-syllable words among the first 150 whitespace-separated
# words and returns 20 minus one tenth of that count. The division is an
# integer division, so the result is always a whole number.
#
# @param text [String] the text to analyze (only the first 150 words are used)
# @param language [String] language code forwarded to the syllable counter
# @return [Integer] FORCAST grade level
# @example
#   TextStat.forcast("Technical manual instructions.")
def forcast(text, language = 'en_us')
  sample = text.split.first(150)
  monosyllabic = sample.count { |word| syllable_count(word, language) == 1 }
  20 - (monosyllabic / 10)
end
|
238
|
+
|
239
|
+
# Compute the Powers-Sumner-Kearl grade level.
#
# The grade is
#   0.0778 * avg_sentence_length + 0.0455 * syllable_count - 2.2029
# rounded to two decimals. The syllable term uses the syllable count of the
# whole text as returned by syllable_count.
# NOTE(review): the published formula is usually stated per 100-word
# sample; confirm whether the total count is intended for longer texts.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to the syllable counter
# @return [Float] Powers-Sumner-Kearl grade level rounded to 2 decimal places
# @example
#   TextStat.powers_sumner_kearl("Elementary school reading material.")
def powers_sumner_kearl(text, language = 'en_us')
  sentence_term = 0.0778 * avg_sentence_length(text)
  syllable_term = 0.0455 * syllable_count(text, language)
  (sentence_term + syllable_term - 2.2029).round(2)
end
|
253
|
+
|
254
|
+
# Compute the SPACHE grade level.
#
# The grade is
#   0.141 * avg_sentence_length + 0.086 * (percentage of unfamiliar words) + 0.839
# rounded to two decimals. Unfamiliar words are counted by difficult_words
# for the given language and expressed as a percentage of all
# whitespace-separated words.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to difficult_words
# @return [Float] SPACHE grade level, or 0.0 when the text contains no words
# @example
#   TextStat.spache("Primary school reading text.")
def spache(text, language = 'en_us')
  total_words = text.split.count

  # Avoid dividing by zero on empty or whitespace-only input.
  return 0.0 if total_words.zero?

  # Use float arithmetic: Integer / Integer would truncate the ratio to 0 or 1.
  unfamiliar_percentage = (100.0 * difficult_words(text, language)) / total_words
  ((0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_percentage) + 0.839).round(2)
end
|
270
|
+
|
271
|
+
# Estimate a consensus grade level from several readability formulas.
#
# Collects integer grade votes from the Flesch-Kincaid grade, the Flesch
# Reading Ease band and the other grade-level formulas, picks the most
# frequent vote, and formats it.
#
# @param text [String] the text to analyze
# @param float_output [Boolean, nil] when truthy the grade is returned as a
#   number, otherwise as a descriptive string
# @return [String, Float] grade level description or numeric value
# @example
#   TextStat.text_standard("Sample text for analysis.")
#   TextStat.text_standard("Sample text for analysis.", true)
def text_standard(text, float_output = nil)
  votes = []

  # Each helper appends its grade votes to the shared array.
  add_flesch_kincaid_grades(text, votes)
  add_flesch_reading_ease_grade(text, votes)
  add_other_readability_grades(text, votes)

  format_grade_output(calculate_consensus_grade(votes), float_output)
end
|
296
|
+
|
297
|
+
private
|
298
|
+
|
299
|
+
# Append the Flesch-Kincaid votes to the grade array.
#
# Two votes are added: the grade rounded to the nearest integer and the
# grade rounded up.
#
# @param text [String] the text to analyze
# @param grade [Array<Integer>] vote accumulator, mutated in place
# @return [Array<Integer>] the same accumulator
def add_flesch_kincaid_grades(text, grade)
  kincaid = flesch_kincaid_grade(text)
  grade.push(kincaid.round.to_i, kincaid.ceil.to_i)
end
|
305
|
+
|
306
|
+
# Append the grade vote(s) implied by the Flesch Reading Ease score.
#
# Score bands map to grades as follows: 90 and above -> 5, 80s -> 6,
# 70s -> 7, 60s -> 8 and 9, 50s -> 10, 40s -> 11, 30s -> 12, and anything
# below 30 -> 13.
#
# @param text [String] the text to analyze
# @param grade [Array<Integer>] vote accumulator, mutated in place
# @return [Array<Integer>] the same accumulator
def add_flesch_reading_ease_grade(text, grade)
  score = flesch_reading_ease(text)
  case score
  # The score is not capped at 100; very easy text must still vote for the
  # lowest grade instead of falling through to the hardest one.
  when 90..Float::INFINITY
    grade.append(5)
  when 80...90
    grade.append(6)
  when 70...80
    grade.append(7)
  when 60...70
    grade.append(8, 9)
  when 50...60
    grade.append(10)
  when 40...50
    grade.append(11)
  when 30...40
    grade.append(12)
  else
    grade.append(13)
  end
end
|
328
|
+
|
329
|
+
# Append votes from the remaining grade-level formulas.
#
# For SMOG, Coleman-Liau, ARI, Dale-Chall, Linsear Write and Gunning Fog,
# two votes are added per formula: the score rounded to the nearest integer
# and the score rounded up. Scores that are NaN or infinite are skipped
# because they cannot be converted to an integer grade.
#
# @param text [String] the text to analyze
# @param grade [Array<Integer>] vote accumulator, mutated in place
# @return [Array<Numeric>] the raw scores that were considered
def add_other_readability_grades(text, grade)
  readability_scores = [
    smog_index(text),
    coleman_liau_index(text),
    automated_readability_index(text),
    dale_chall_readability_score(text),
    linsear_write_formula(text),
    gunning_fog(text)
  ]

  readability_scores.each do |score|
    # Float#round and Float#ceil raise FloatDomainError on NaN/Infinity.
    next unless score.finite?

    grade.append(score.round.to_i, score.ceil.to_i)
  end
end
|
345
|
+
|
346
|
+
# Pick the most frequent grade among the collected votes.
#
# The votes are tallied with the project's Counter class, which is loaded
# on first use from ../counter.
# NOTE(review): how ties are broken depends on Counter#most_common; confirm
# against that class.
#
# @param grade [Array<Integer>] collected grade votes
# @return [Integer] the grade with the highest vote count
def calculate_consensus_grade(grade)
  require_relative '../counter'
  Counter.new(grade).most_common(1)[0][0]
end
|
353
|
+
|
354
|
+
# Format the consensus grade for output.
#
# @param grade [Numeric] consensus grade
# @param float_output [Boolean, nil] when truthy return the grade as a Float
# @return [String, Float] the numeric grade, or a description such as
#   "3rd and 4th grade" covering the grade below and the grade itself
def format_grade_output(grade, float_output)
  return grade.to_f if float_output

  upper = grade.to_i
  "#{grade_with_ordinal_suffix(upper - 1)} and #{grade_with_ordinal_suffix(upper)} grade"
end

# Render an integer with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th).
#
# @param number [Integer] the number to render
# @return [String] the number followed by its ordinal suffix
def grade_with_ordinal_suffix(number)
  magnitude = number.abs
  suffix =
    if (11..13).cover?(magnitude % 100)
      # 11, 12 and 13 are irregular: they always take "th".
      'th'
    else
      { 1 => 'st', 2 => 'nd', 3 => 'rd' }.fetch(magnitude % 10, 'th')
    end
  "#{number}#{suffix}"
end
|
362
|
+
end
|
363
|
+
end
|
data/lib/textstat/version.rb
CHANGED
@@ -1,3 +1,22 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# Version metadata for the TextStat gem.
#
# Holds the gem's version string, which follows Semantic Versioning
# (semver.org).
#
# @author Jakub Polak
# @since 0.1.0
module TextStat
  # Version string of the currently installed TextStat gem.
  #
  # According to the author's release notes, 1.0.0 is the first stable
  # release and brings:
  # - 36x performance improvement through dictionary caching
  # - a modular architecture with separate modules per area of functionality
  # - comprehensive test coverage (199 tests)
  # - support for 22 languages
  # - full backward compatibility with the 0.1.x series
  #
  # @return [String] current version string (frozen)
  # @example
  #   TextStat::VERSION # => "1.0.0"
  VERSION = '1.0.0'.freeze
end
|