textstat 0.1.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,363 @@
1
+ module TextStat
2
+ # Readability formulas and text difficulty calculations
3
+ #
4
+ # This module implements various readability formulas used to determine
5
+ # the reading level and complexity of text. Each formula uses different
6
+ # metrics and is suitable for different types of content and audiences.
7
+ #
8
+ # @author Jakub Polak
9
+ # @since 1.0.0
10
+ # @example Basic readability analysis
11
+ # text = "This is a sample text for readability analysis."
12
+ # TextStat.flesch_reading_ease(text) # => 83.32
13
+ # TextStat.flesch_kincaid_grade(text) # => 3.7
14
+ # TextStat.text_standard(text) # => "3rd and 4th grade"
15
+ #
16
+ # @example Multi-language support
17
+ # TextStat.flesch_reading_ease(spanish_text, 'es')
18
+ # TextStat.smog_index(french_text, 'fr')
19
+ # TextStat.gunning_fog(german_text, 'de')
20
+ module ReadabilityFormulas
21
# Compute the Flesch Reading Ease score of a text.
#
# Higher scores indicate easier text. Conventional interpretation bands:
# - 90-100: Very Easy
# - 80-89: Easy
# - 70-79: Fairly Easy
# - 60-69: Standard
# - 50-59: Fairly Difficult
# - 30-49: Difficult
# - 0-29: Very Difficult
#
# The result is not clamped, so it can fall above 100 or below 0.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to the syllable counter
# @return [Float] Flesch Reading Ease score, rounded to two decimals
def flesch_reading_ease(text, language = 'en_us')
  # Both averages come from helpers defined elsewhere in the library.
  sentence_term = 1.015 * avg_sentence_length(text)
  syllable_term = 84.6 * avg_syllables_per_word(text, language)
  (206.835 - sentence_term - syllable_term).round(2)
end
47
+
48
# Compute the Flesch-Kincaid Grade Level of a text.
#
# Built from the same two inputs as the Flesch Reading Ease score
# (average sentence length and average syllables per word), but weighted
# so the result reads as a U.S. school grade.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to the syllable counter
# @return [Float] grade level rounded to one decimal (e.g. 8.5)
def flesch_kincaid_grade(text, language = 'en_us')
  sentence_term = 0.39 * avg_sentence_length(text)
  syllable_term = 11.8 * avg_syllables_per_word(text, language)
  (sentence_term + syllable_term - 15.59).round(1)
end
65
+
66
# Compute the SMOG Index (Simple Measure of Gobbledygook).
#
# The score is derived from the number of polysyllabic words relative to
# the number of sentences. Texts with fewer than three sentences are not
# scored and yield 0.0.
#
# @param text [String] the text to analyze (at least 3 sentences)
# @param language [String] language code forwarded to the syllable counter
# @return [Float] SMOG grade rounded to one decimal, or 0.0 when the text
#   is too short or a ZeroDivisionError is raised while computing
def smog_index(text, language = 'en_us')
  total_sentences = sentence_count(text)
  return 0.0 unless total_sentences >= 3

  polysyllables = polysyllab_count(text, language)
  polysyllables_per_30 = (30.0 * polysyllables) / total_sentences
  ((1.043 * Math.sqrt(polysyllables_per_30)) + 3.1291).round(1)
rescue ZeroDivisionError
  0.0
end
90
+
91
# Compute the Coleman-Liau Index of a text.
#
# Unlike the syllable-based formulas, this one needs only per-word letter
# and sentence averages, each scaled by 100 before being weighted.
#
# @param text [String] the text to analyze
# @return [Float] Coleman-Liau grade level, rounded to two decimals
def coleman_liau_index(text)
  # Each average is scaled to "per 100 words" and rounded before weighting.
  letters_per_100 = (avg_letter_per_word(text) * 100).round(2)
  sentences_per_100 = (avg_sentence_per_word(text) * 100).round(2)
  ((0.0588 * letters_per_100) - (0.296 * sentences_per_100) - 15.8).round(2)
end
107
+
108
# Compute the Automated Readability Index (ARI).
#
# ARI combines characters per word and words per sentence into an
# approximate grade level.
#
# @param text [String] the text to analyze
# @return [Float] ARI grade level rounded to one decimal, or 0.0 when the
#   text contains no words or no sentences
def automated_readability_index(text)
  chars = char_count(text)
  words = lexicon_count(text)
  sentences = sentence_count(text)

  # The divisions below are Float divisions, which produce NaN/Infinity
  # instead of raising ZeroDivisionError, so empty input must be caught here.
  return 0.0 if words.zero? || sentences.zero?

  chars_per_word = chars.to_f / words
  words_per_sentence = words.to_f / sentences
  ((4.71 * chars_per_word) + (0.5 * words_per_sentence) - 21.43).round(1)
rescue ZeroDivisionError
  # Kept for backward compatibility in case a counting helper raises.
  0.0
end
129
+
130
# Compute the Linsear Write readability grade.
#
# Only the first 100 whitespace-separated words are sampled. Words with
# fewer than three syllables score 1 point, all others score 3 points; the
# total is divided by the number of sentences in the sample and then
# adjusted (minus 2 when the quotient is 20 or less) and halved.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to the syllable counter
# @return [Float] Linsear Write grade level (unrounded), or 0.0 when the
#   sample contains no sentences
def linsear_write_formula(text, language = 'en_us')
  # first(100) takes exactly 100 words; the former [0..100] slice took 101.
  sample_words = text.split.first(100)
  sentences = sentence_count(sample_words.join(' '))

  # Float division by zero would yield NaN/Infinity rather than raise.
  return 0.0 if sentences.zero?

  difficult = sample_words.count { |word| syllable_count(word, language) >= 3 }
  easy = sample_words.length - difficult

  score = (easy + (difficult * 3)).to_f / sentences
  score -= 2 if score <= 20
  score / 2
end
158
+
159
# Compute the Dale-Chall readability score.
#
# The score weights the percentage of difficult words (as reported by
# difficult_words) together with the average sentence length. A fixed
# adjustment of 3.6365 is added when more than 5% of the words are
# difficult.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to difficult_words
# @return [Float] Dale-Chall score rounded to two decimals, or 0.0 when the
#   text contains no words
def dale_chall_readability_score(text, language = 'en_us')
  word_count = lexicon_count(text)

  # The percentage below uses Float division, which returns NaN for 0/0
  # instead of raising, so the empty case is handled explicitly.
  return 0.0 if word_count.zero?

  count = word_count - difficult_words(text, language)

  per = (100.0 * count) / word_count
  difficult_words_percentage = 100 - per
  score = (0.1579 * difficult_words_percentage) + (0.0496 * avg_sentence_length(text))
  score += 3.6365 if difficult_words_percentage > 5

  score.round(2)
rescue ZeroDivisionError
  # Kept for backward compatibility in case a helper raises.
  0.0
end
182
+
183
# Compute the Gunning Fog Index.
#
# Combines the average sentence length with the percentage of difficult
# words (as reported by difficult_words).
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to difficult_words
# @return [Float] Gunning Fog grade rounded to two decimals, or 0.0 when
#   the text contains no words
def gunning_fog(text, language = 'en_us')
  word_count = lexicon_count(text)

  # Float division returns NaN/Infinity for a zero divisor instead of
  # raising, so the empty case is handled explicitly.
  return 0.0 if word_count.zero?

  # NOTE(review): the "+ 5" offset is not part of the textbook Fog formula;
  # it is preserved here so existing scores do not change - confirm intent.
  per_diff_words = ((100.0 * difficult_words(text, language)) / word_count) + 5
  (0.4 * (avg_sentence_length(text) + per_diff_words)).round(2)
rescue ZeroDivisionError
  # Kept for backward compatibility in case a helper raises.
  0.0
end
200
+
201
# Compute the LIX readability score.
#
# LIX is the average sentence length plus the percentage of long words,
# where a long word is any whitespace-separated token longer than six
# characters (attached punctuation counts towards the length).
#
# @param text [String] the text to analyze
# @return [Float] LIX score rounded to two decimals, or 0.0 when the text
#   contains no words
def lix(text)
  words = text.split

  # Without this guard the percentage would be 0.0 / 0, i.e. NaN.
  return 0.0 if words.empty?

  long_words = words.count { |word| word.length > 6 }
  per_long_words = (100.0 * long_words) / words.length
  (avg_sentence_length(text) + per_long_words).round(2)
end
220
+
221
# Compute the FORCAST readability grade.
#
# Counts the single-syllable words among the first 150 whitespace-separated
# words and subtracts one tenth of that count from 20. The division is an
# integer division, so the result is always a whole number; shorter texts
# are not scaled up to 150 words.
#
# @param text [String] the text to analyze (only the first 150 words are used)
# @param language [String] language code forwarded to the syllable counter
# @return [Integer] FORCAST grade level
def forcast(text, language = 'en_us')
  sample = text.split.first(150)
  monosyllabic = sample.count { |word| syllable_count(word, language) == 1 }
  20 - (monosyllabic / 10)
end
238
+
239
# Compute the Powers-Sumner-Kearl readability grade.
#
# The formula combines the average sentence length with the number of
# syllables per 100 words. The syllable total is therefore normalised by
# the word count, so the grade does not grow with the length of the text.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to the syllable counter
# @return [Float] Powers-Sumner-Kearl grade rounded to two decimals, or 0.0
#   when the text contains no words
def powers_sumner_kearl(text, language = 'en_us')
  words = lexicon_count(text)
  return 0.0 if words.zero?

  # The raw syllable total is only correct for a sample of exactly 100
  # words; scale it so any text length yields "syllables per 100 words".
  syllables_per_100_words = (100.0 * syllable_count(text, language)) / words
  ((0.0778 * avg_sentence_length(text)) + (0.0455 * syllables_per_100_words) - 2.2029).round(2)
end
253
+
254
# Compute the Spache readability grade.
#
# The formula combines the average sentence length with the percentage of
# unfamiliar words. The unfamiliar-word count is taken from difficult_words.
#
# @param text [String] the text to analyze
# @param language [String] language code forwarded to difficult_words
# @return [Float] Spache grade rounded to two decimals, or 0.0 when the text
#   contains no words
def spache(text, language = 'en_us')
  words = text.split.count
  return 0.0 if words.zero?

  # Multiplying by 100.0 forces Float division and yields a percentage; a
  # plain Integer division would truncate the ratio to 0 (or 1).
  unfamiliar_percentage = (100.0 * difficult_words(text, language)) / words
  ((0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_percentage) + 0.839).round(2)
end
270
+
271
# Estimate a consensus grade level from several readability formulas.
#
# Every formula contributes one or more integer grade "votes"; the most
# common vote wins and is then formatted for output.
#
# @param text [String] the text to analyze
# @param float_output [Boolean] when truthy the grade is returned as a
#   number, otherwise as a descriptive string
# @return [String, Float] grade level description or numeric value
def text_standard(text, float_output = nil)
  votes = []

  # Each collector appends its grade votes to the shared array.
  add_flesch_kincaid_grades(text, votes)
  add_flesch_reading_ease_grade(text, votes)
  add_other_readability_grades(text, votes)

  format_grade_output(calculate_consensus_grade(votes), float_output)
end
296
+
297
+ private
298
+
299
# Append the Flesch-Kincaid grade to the vote list twice: once rounded to
# the nearest integer and once rounded up.
#
# @param text [String] the text to analyze
# @param grade [Array<Integer>] vote list, mutated in place
# @return [Array<Integer>] the same vote list
def add_flesch_kincaid_grades(text, grade)
  kincaid = flesch_kincaid_grade(text)
  grade.push(kincaid.round.to_i, kincaid.ceil.to_i)
end
305
+
306
# Translate the Flesch Reading Ease score into grade vote(s) and append
# them to the vote list.
#
# Bands (lower bound inclusive): >= 90 -> 5, 80 -> 6, 70 -> 7,
# 60 -> 8 and 9, 50 -> 10, 40 -> 11, 30 -> 12, anything lower -> 13.
# The top band is open-ended: the score is not capped at 100, and a very
# easy text must not fall through to the hardest grade.
#
# @param text [String] the text to analyze
# @param grade [Array<Integer>] vote list, mutated in place
# @return [Array<Integer>] the same vote list
def add_flesch_reading_ease_grade(text, grade)
  score = flesch_reading_ease(text)

  # Ordered from easiest to hardest; the first band whose floor is reached wins.
  bands = [
    [90, [5]],
    [80, [6]],
    [70, [7]],
    [60, [8, 9]],
    [50, [10]],
    [40, [11]],
    [30, [12]]
  ]
  _floor, votes = bands.find { |floor, _| score >= floor }

  # No band matched: the score is below 30 (or not comparable).
  grade.push(*(votes || [13]))
end
328
+
329
# Append votes from the remaining formulas (SMOG, Coleman-Liau, ARI,
# Dale-Chall, Linsear Write and Gunning Fog) to the vote list.
#
# Every score contributes two votes: its nearest integer and its ceiling.
#
# @param text [String] the text to analyze
# @param grade [Array<Integer>] vote list, mutated in place
# @return [Array<Float>] the scores that were evaluated
def add_other_readability_grades(text, grade)
  # All scores are computed up front, before any vote is appended.
  [
    smog_index(text),
    coleman_liau_index(text),
    automated_readability_index(text),
    dale_chall_readability_score(text),
    linsear_write_formula(text),
    gunning_fog(text)
  ].each { |value| grade.push(value.round.to_i, value.ceil.to_i) }
end
345
+
346
# Pick the most frequent grade among all collected votes.
#
# Uses the project's Counter class, loaded on first use; how it breaks ties
# depends on Counter#most_common.
#
# @param grade [Array<Integer>] collected grade votes
# @return [Integer] the first element of the top most_common entry
def calculate_consensus_grade(grade)
  require_relative '../counter'
  Counter.new(grade).most_common(1)[0][0]
end
353
+
354
# Format the consensus grade for output.
#
# @param grade [Numeric] consensus grade level
# @param float_output [Boolean] when truthy, return the grade as a Float
# @return [Float, String] numeric grade, or a description such as
#   "3rd and 4th grade" spanning the grade below and the grade itself
def format_grade_output(grade, float_output)
  return grade.to_f if float_output

  upper = grade.to_i
  lower = upper - 1
  "#{lower}#{grade_ordinal_suffix(lower)} and #{upper}#{grade_ordinal_suffix(upper)} grade"
end

# Return the English ordinal suffix ("st", "nd", "rd" or "th") for an integer.
def grade_ordinal_suffix(number)
  magnitude = number.abs
  # 11, 12 and 13 are irregular: "11th", not "11st".
  return 'th' if (11..13).cover?(magnitude % 100)

  { 1 => 'st', 2 => 'nd', 3 => 'rd' }.fetch(magnitude % 10, 'th')
end
362
+ end
363
+ end
@@ -1,3 +1,22 @@
1
- class TextStat
2
- VERSION = "0.1.9"
1
# Version metadata for the TextStat gem.
#
# The version number follows Semantic Versioning (semver.org).
#
# @author Jakub Polak
# @since 0.1.0
module TextStat
  # Version string of the currently installed TextStat gem.
  #
  # Highlights of the 1.0.0 release, as stated in the release notes:
  # - 36x performance improvement through dictionary caching
  # - Modular architecture with separate modules for different functionality
  # - Comprehensive test coverage (199 tests)
  # - Support for 22 languages
  # - Full backward compatibility with 0.1.x series
  #
  # @return [String] current version string
  # @example
  #   TextStat::VERSION # => "1.0.0"
  VERSION = '1.0.0'.freeze
end