textstat 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/textstat.rb CHANGED
@@ -1,303 +1,309 @@
1
- require 'text-hyphen'
2
-
3
- class TextStat
4
- GEM_PATH = File.dirname(File.dirname(__FILE__))
5
-
6
- def self.char_count(text, ignore_spaces = true)
7
- text = text.delete(' ') if ignore_spaces
8
- text.length
9
- end
10
-
11
- def self.lexicon_count(text, remove_punctuation = true)
12
- text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
13
- count = text.split(' ').count
14
- count
15
- end
16
-
17
- def self.syllable_count(text, language = 'en_us')
18
- return 0 if text.empty?
19
-
20
- text = text.downcase
21
- text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
22
- dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
23
- count = 0
24
- text.split(' ').each do |word|
25
- word_hyphenated = dictionary.visualise(word)
26
- count += [1, word_hyphenated.count('-') + 1].max
27
- end
28
- count
29
- end
30
-
31
- def self.sentence_count(text)
32
- text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
33
- end
34
-
35
- def self.avg_sentence_length(text)
36
- asl = lexicon_count(text).to_f / sentence_count(text).to_f
37
- asl.round(1)
38
- rescue ZeroDivisionError
39
- 0.0
40
- end
41
-
42
- def self.avg_syllables_per_word(text)
43
- syllable = syllable_count(text)
44
- words = lexicon_count(text)
45
- begin
46
- syllables_per_word = syllable.to_f / words.to_f
47
- return syllables_per_word.round(1)
48
- rescue ZeroDivisionError
49
- return 0.0
50
- end
51
- end
52
-
53
- def self.avg_letter_per_word(text)
54
- letters_per_word = char_count(text).to_f / lexicon_count(text).to_f
55
- letters_per_word.round(2)
56
- rescue ZeroDivisionError
57
- 0.0
58
- end
59
-
60
- def self.avg_sentence_per_word(text)
61
- sentence_per_word = sentence_count(text).to_f / lexicon_count(text).to_f
62
- sentence_per_word.round(2)
63
- rescue ZeroDivisionError
64
- 0.0
65
- end
66
-
67
- def self.flesch_reading_ease(text)
68
- sentence_length = avg_sentence_length(text)
69
- syllables_per_word = avg_syllables_per_word(text)
70
- flesch = (
71
- 206.835 - (1.015 * sentence_length).to_f - (84.6 * syllables_per_word).to_f
72
- )
73
- flesch.round(2)
74
- end
75
-
76
- def self.flesch_kincaid_grade(text)
77
- sentence_length = avg_sentence_length(text)
78
- syllables_per_word = avg_syllables_per_word(text)
79
- flesch = (0.39 * sentence_length.to_f) + (11.8 * syllables_per_word.to_f) - 15.59
80
- flesch.round(1)
81
- end
82
-
83
- def self.polysyllab_count(text)
84
- count = 0
85
- text.split(' ').each do |word|
86
- w = syllable_count(word)
87
- count += 1 if w >= 3
88
- end
89
- count
90
- end
91
-
92
- def self.smog_index(text)
93
- sentences = sentence_count(text)
94
-
95
- if sentences >= 3
96
- begin
97
- polysyllab = polysyllab_count(text)
98
- smog = (
99
- (1.043 * (30 * (polysyllab / sentences))**0.5) + 3.1291)
100
- return smog.round(1)
101
- rescue ZeroDivisionError
102
- return 0.0
103
- end
104
- else
105
- return 0.0
106
- end
107
- end
108
-
109
- def self.coleman_liau_index(text)
110
- letters = (avg_letter_per_word(text) * 100).round(2)
111
- sentences = (avg_sentence_per_word(text) * 100).round(2)
112
- coleman = ((0.058 * letters) - (0.296 * sentences) - 15.8).to_f
113
- coleman.round(2)
114
- end
115
-
116
- def self.automated_readability_index(text)
117
- chars = char_count(text)
118
- words = lexicon_count(text)
119
- sentences = sentence_count(text)
120
- begin
121
- a = chars.to_f / words.to_f
122
- b = words.to_f / sentences.to_f
123
-
124
- readability = (
125
- (4.71 * a.round(2) + (0.5 * b.round(2))) - 21.43)
126
- return readability.round(1)
127
- rescue ZeroDivisionError
128
- return 0.0
129
- end
130
- end
131
-
132
- def self.linsear_write_formula(text)
133
- easy_word = 0
134
- difficult_word = 0
135
- text_list = text.split(' ')[0..100]
136
-
137
- text_list.each do |word|
138
- if syllable_count(word) < 3
139
- easy_word += 1
140
- else
141
- difficult_word += 1
142
- end
143
- end
144
-
145
- text = text_list.join(' ')
146
-
147
- number = ((easy_word * 1 + difficult_word * 3) / sentence_count(text)).to_f
148
- if number <= 20
149
- number -= 2
150
- end
151
- return number / 2
152
- end
153
-
154
- def self.difficult_words(text, language = 'en_us')
155
- require 'set'
156
- easy_words = Set.new
157
- File.read(File.join(dictionary_path, "#{language}.txt")).each_line do |line|
158
- easy_words << line.chop
159
- end
160
-
161
- text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
162
- diff_words_set = Set.new
163
- text_list.each do |value|
164
- unless easy_words.include? value
165
- if syllable_count(value) > 1
166
- diff_words_set.add(value)
167
- end
168
- end
169
- end
170
- return diff_words_set.length
171
- end
172
-
173
- def self.dale_chall_readability_score(text)
174
- word_count = lexicon_count(text)
175
- count = word_count - difficult_words(text)
176
-
177
- begin
178
- per = count.to_f / word_count.to_f * 100
179
- rescue ZeroDivisionError
180
- return 0.0
181
- end
182
-
183
- difficult_words = 100 - per
184
- score = (
185
- (0.1579 * difficult_words)
186
- + (0.0496 * avg_sentence_length(text)))
187
-
188
- if difficult_words > 5
189
- score += 3.6365
190
- end
191
- return score.round(2)
192
- end
193
-
194
- def self.gunning_fog(text)
195
- begin
196
- per_diff_words = (
197
- (difficult_words(text) / lexicon_count(text) * 100) + 5)
198
-
199
- grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
200
- return grade.round(2)
201
- rescue ZeroDivisionError
202
- return 0.0
203
- end
204
- end
205
-
206
- def self.lix(text)
207
- words = text.split(' ')
208
- words_length = words.length
209
- long_words = words.select { |word| word.length > 6 }.count
210
-
211
- per_long_words = (long_words * 100).to_f / words_length
212
- asl = avg_sentence_length(text)
213
- lix = asl + per_long_words
214
-
215
- return lix.round(2)
216
- end
217
-
218
- def self.text_standard(text, float_output=nil)
219
- grade = []
220
-
221
- lower = flesch_kincaid_grade(text).round
222
- upper = flesch_kincaid_grade(text).ceil
223
- grade.append(lower.to_i)
224
- grade.append(upper.to_i)
225
-
226
- # Appending Flesch Reading Easy
227
- score = flesch_reading_ease(text)
228
- if score < 100 && score >= 90
229
- grade.append(5)
230
- elsif score < 90 && score >= 80
231
- grade.append(6)
232
- elsif score < 80 && score >= 70
233
- grade.append(7)
234
- elsif score < 70 && score >= 60
235
- grade.append(8)
236
- grade.append(9)
237
- elsif score < 60 && score >= 50
238
- grade.append(10)
239
- elsif score < 50 && score >= 40
240
- grade.append(11)
241
- elsif score < 40 && score >= 30
242
- grade.append(12)
243
- else
244
- grade.append(13)
245
- end
246
-
247
- # Appending SMOG Index
248
- lower = smog_index(text).round
249
- upper = smog_index(text).ceil
250
- grade.append(lower.to_i)
251
- grade.append(upper.to_i)
252
-
253
- # Appending Coleman_Liau_Index
254
- lower = coleman_liau_index(text).round
255
- upper = coleman_liau_index(text).ceil
256
- grade.append(lower.to_i)
257
- grade.append(upper.to_i)
258
-
259
- # Appending Automated_Readability_Index
260
- lower = automated_readability_index(text).round
261
- upper = automated_readability_index(text).ceil
262
- grade.append(lower.to_i)
263
- grade.append(upper.to_i)
264
-
265
- # Appending Dale_Chall_Readability_Score
266
- lower = dale_chall_readability_score(text).round
267
- upper = dale_chall_readability_score(text).ceil
268
- grade.append(lower.to_i)
269
- grade.append(upper.to_i)
270
-
271
- # Appending Linsear_Write_Formula
272
- lower = linsear_write_formula(text).round
273
- upper = linsear_write_formula(text).ceil
274
- grade.append(lower.to_i)
275
- grade.append(upper.to_i)
276
-
277
- # Appending Gunning Fog Index
278
- lower = gunning_fog(text).round
279
- upper = gunning_fog(text).ceil
280
- grade.append(lower.to_i)
281
- grade.append(upper.to_i)
282
-
283
- # Finding the Readability Consensus based upon all the above tests
284
- require 'counter'
285
- d = Counter.new(grade)
286
- final_grade = d.most_common(1)
287
- score = final_grade[0][0]
288
-
289
- if float_output
290
- return score.to_f
291
- else
292
- return "#{score.to_i - 1}th and #{score.to_i}th grade"
293
- end
294
- end
295
-
296
- def self.dictionary_path=(path)
297
- @dictionary_path = path
298
- end
299
-
300
- def self.dictionary_path
301
- @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
302
- end
303
- end
1
+ require 'text-hyphen'
2
+
3
# Readability metrics and basic text statistics, exposed as class-level
# methods (e.g. +TextStat.flesch_reading_ease(text)+).
#
# Syllable counting is delegated to Text::Hyphen; the "easy word" lists used
# by the Dale-Chall style metrics are read from
# "<dictionary_path>/<language>.txt".
class TextStat
  GEM_PATH = File.dirname(File.dirname(__FILE__))

  # Sentence terminator, optional closing quote/bracket, one separator
  # character, then the capital letter that opens the next sentence.
  SENTENCE_BOUNDARY = /[.?!]['"\\)\]]*[ |\n][A-Z]/.freeze

  # Counts the characters in +text+.
  #
  # @param text [String]
  # @param ignore_spaces [Boolean] when true, space characters (' ' only, not
  #   tabs or newlines) are not counted
  # @return [Integer]
  def self.char_count(text, ignore_spaces = true)
    text = text.delete(' ') if ignore_spaces
    text.length
  end

  # Counts whitespace-separated words.
  #
  # @param text [String]
  # @param remove_punctuation [Boolean] when true, every character that is
  #   not an ASCII letter or whitespace is deleted first, so tokens made only
  #   of digits or punctuation are not counted
  # @return [Integer]
  def self.lexicon_count(text, remove_punctuation = true)
    text = text.gsub(/[^a-zA-Z\s]/, '') if remove_punctuation
    text.split(' ').count
  end

  # Counts syllables as "hyphenation points + 1" for every word in +text+.
  #
  # Digits and punctuation are stripped before hyphenation; letters outside
  # ASCII are kept so non-English dictionaries still see whole words.
  #
  # @param text [String]
  # @param language [String] Text::Hyphen language identifier
  # @return [Integer] 0 for an empty string
  def self.syllable_count(text, language = 'en_us')
    return 0 if text.empty?

    words = text.downcase.gsub(/[^[:alpha:]\s]/, '').split(' ')
    dictionary = hyphenator(language)
    words.sum { |word| dictionary.visualise(word).count('-') + 1 }
  end

  # Estimates the number of sentences: one more than the number of
  # SENTENCE_BOUNDARY matches, so the result is always at least 1.
  #
  # @param text [String]
  # @return [Integer]
  def self.sentence_count(text)
    text.scan(SENTENCE_BOUNDARY).count + 1
  end

  # Average number of words per sentence, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.avg_sentence_length(text)
    # sentence_count is never below 1, so this division is always defined.
    (lexicon_count(text).to_f / sentence_count(text)).round(1)
  end

  # Average number of syllables per word, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_syllables_per_word(text)
    words = lexicon_count(text)
    # Float division never raises ZeroDivisionError; guard explicitly.
    return 0.0 if words.zero?

    (syllable_count(text).to_f / words).round(1)
  end

  # Average number of non-space characters per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_letter_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (char_count(text).to_f / words).round(2)
  end

  # Average number of sentences per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_sentence_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (sentence_count(text).to_f / words).round(2)
  end

  # Flesch Reading Ease score (higher means easier), rounded to two decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.flesch_reading_ease(text)
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text)
    (206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word).round(2)
  end

  # Flesch-Kincaid grade level, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.flesch_kincaid_grade(text)
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text)
    (0.39 * sentence_length + 11.8 * syllables_per_word - 15.59).round(1)
  end

  # Number of whitespace-separated tokens with three or more syllables.
  #
  # @param text [String]
  # @return [Integer]
  def self.polysyllab_count(text)
    text.split(' ').count { |word| syllable_count(word) >= 3 }
  end

  # SMOG index, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when fewer than three sentences are detected
  def self.smog_index(text)
    sentences = sentence_count(text)
    return 0.0 if sentences < 3

    (1.043 * Math.sqrt(30.0 * polysyllab_count(text) / sentences) + 3.1291).round(1)
  end

  # Coleman-Liau index, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.coleman_liau_index(text)
    letters = (avg_letter_per_word(text) * 100).round(2)
    sentences = (avg_sentence_per_word(text) * 100).round(2)
    (0.0588 * letters - 0.296 * sentences - 15.8).round(2)
  end

  # Automated Readability Index, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.automated_readability_index(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    chars_per_word = char_count(text).to_f / words
    words_per_sentence = words.to_f / sentence_count(text)
    (4.71 * chars_per_word + 0.5 * words_per_sentence - 21.43).round(1)
  end

  # Linsear Write formula computed over the first 100 words of +text+.
  #
  # Words with fewer than three syllables score 1, all others score 3.
  #
  # @param text [String]
  # @return [Float]
  def self.linsear_write_formula(text)
    sample = text.split(' ').first(100)
    difficult = sample.count { |word| syllable_count(word) >= 3 }
    easy = sample.size - difficult

    number = (easy + 3 * difficult).to_f / sentence_count(sample.join(' '))
    number -= 2 if number <= 20
    number / 2
  end

  # Number of distinct words that are absent from the easy-word list for
  # +language+ and have more than one syllable.
  #
  # @param text [String]
  # @param language [String] selects both the word list file and the
  #   hyphenation dictionary
  # @return [Integer]
  # @raise [SystemCallError] if the word list file cannot be read
  def self.difficult_words(text, language = 'en_us')
    easy_words = easy_words_for(language)
    # Keep all whitespace (not just ' ') so words on adjacent lines are not
    # glued together when punctuation is stripped.
    candidates = text.downcase.gsub(/[^0-9a-z\s]/i, '').split(' ').uniq
    candidates.count do |word|
      !easy_words.include?(word) && syllable_count(word, language) > 1
    end
  end

  # Dale-Chall readability score, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.dale_chall_readability_score(text)
    word_count = lexicon_count(text)
    return 0.0 if word_count.zero?

    familiar = word_count - difficult_words(text)
    pct_difficult = 100 - 100.0 * familiar / word_count

    score = 0.1579 * pct_difficult + 0.0496 * avg_sentence_length(text)
    score += 3.6365 if pct_difficult > 5
    score.round(2)
  end

  # Gunning Fog index, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.gunning_fog(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    per_diff_words = 100.0 * difficult_words(text) / words + 5
    (0.4 * (avg_sentence_length(text) + per_diff_words)).round(2)
  end

  # LIX score: average sentence length plus the percentage of tokens longer
  # than six characters, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no tokens
  def self.lix(text)
    words = text.split(' ')
    return 0.0 if words.empty?

    per_long_words = 100.0 * words.count { |word| word.length > 6 } / words.length
    (avg_sentence_length(text) + per_long_words).round(2)
  end

  # FORCAST grade: 20 - N / 10, where N is the number of one-syllable words
  # among the first 150 words of +text+.
  #
  # @param text [String]
  # @param language [String] hyphenation dictionary to use
  # @return [Float]
  def self.forcast(text, language = 'en_us')
    sample = text.split(' ').first(150)
    monosyllabic = sample.count { |word| syllable_count(word, language) == 1 }
    # Float division: integer division would truncate N / 10.
    (20 - monosyllabic / 10.0).round(1)
  end

  # Powers-Sumner-Kearl grade, rounded to two decimals. The syllable term is
  # expressed per 100 words.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.powers_sumner_kearl(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    syllables_per_100_words = 100.0 * syllable_count(text) / words
    (0.0778 * avg_sentence_length(text) + 0.0455 * syllables_per_100_words - 2.2029).round(2)
  end

  # Spache grade, rounded to two decimals. The unfamiliar-word term is the
  # percentage of tokens reported by difficult_words.
  #
  # NOTE(review): unfamiliar words come from the same word list as
  # difficult_words, not a dedicated Spache list - confirm this is intended.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float] 0.0 when the text contains no tokens
  def self.spache(text, language = 'en_us')
    words = text.split(' ').count
    return 0.0 if words.zero?

    pct_unfamiliar = 100.0 * difficult_words(text, language) / words
    (0.141 * avg_sentence_length(text) + 0.086 * pct_unfamiliar + 0.839).round(2)
  end

  # Consensus grade: collects the rounded and ceiled value of each grade-level
  # metric (plus the grade(s) mapped from the Flesch Reading Ease score) and
  # returns the most common one.
  #
  # @param text [String]
  # @param float_output [Boolean, nil] truthy to get the grade as a Float
  # @return [Float, String] the grade, or "<n-1>th and <n>th grade"
  def self.text_standard(text, float_output = nil)
    grade = grade_bounds(flesch_kincaid_grade(text))
    grade.concat(flesch_ease_grades(flesch_reading_ease(text)))

    # Order is kept as-is in case the counter breaks ties by insertion order.
    %i[
      smog_index
      coleman_liau_index
      automated_readability_index
      dale_chall_readability_score
      linsear_write_formula
      gunning_fog
    ].each do |metric|
      grade.concat(grade_bounds(public_send(metric, text)))
    end

    # Finding the Readability Consensus based upon all the above tests
    require 'counter'
    d = Counter.new(grade)
    final_grade = d.most_common(1)
    score = final_grade[0][0]

    if float_output
      score.to_f
    else
      "#{score.to_i - 1}th and #{score.to_i}th grade"
    end
  end

  # Overrides the directory that holds the "<language>.txt" word lists.
  #
  # @param path [String]
  def self.dictionary_path=(path)
    @dictionary_path = path
  end

  # Directory holding the word lists; defaults to lib/dictionaries in the gem.
  #
  # @return [String]
  def self.dictionary_path
    @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
  end

  # Returns one Text::Hyphen instance per language, built on first use, so
  # per-word callers do not construct a new hyphenator each time.
  def self.hyphenator(language)
    @hyphenators ||= {}
    @hyphenators[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
  end

  # Loads (once per file path) the easy-word list for +language+ as a Set.
  # Lines are chomped, so a final line without a newline keeps its last letter.
  def self.easy_words_for(language)
    require 'set'
    path = File.join(dictionary_path, "#{language}.txt")
    @easy_words ||= {}
    @easy_words[path] ||= File.foreach(path).map(&:chomp).to_set
  end

  # The two integer grade candidates contributed by a metric value.
  def self.grade_bounds(value)
    [value.round.to_i, value.ceil.to_i]
  end

  # Maps a Flesch Reading Ease score to grade candidates; anything outside
  # 30...100 (including scores of 100 and above) maps to grade 13.
  def self.flesch_ease_grades(score)
    case score
    when 90...100 then [5]
    when 80...90 then [6]
    when 70...80 then [7]
    when 60...70 then [8, 9]
    when 50...60 then [10]
    when 40...50 then [11]
    when 30...40 then [12]
    else [13]
    end
  end

  private_class_method :hyphenator, :easy_words_for, :grade_bounds, :flesch_ease_grades
end