textstat 0.1.6 → 0.1.7

data/lib/textstat.rb CHANGED
@@ -1,303 +1,309 @@
- require 'text-hyphen'
-
- class TextStat
-   GEM_PATH = File.dirname(File.dirname(__FILE__))
-
-   def self.char_count(text, ignore_spaces = true)
-     text = text.delete(' ') if ignore_spaces
-     text.length
-   end
-
-   def self.lexicon_count(text, remove_punctuation = true)
-     text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
-     count = text.split(' ').count
-     count
-   end
-
-   def self.syllable_count(text, language = 'en_us')
-     return 0 if text.empty?
-
-     text = text.downcase
-     text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
-     dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
-     count = 0
-     text.split(' ').each do |word|
-       word_hyphenated = dictionary.visualise(word)
-       count += [1, word_hyphenated.count('-') + 1].max
-     end
-     count
-   end
-
-   def self.sentence_count(text)
-     text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
-   end
-
-   def self.avg_sentence_length(text)
-     asl = lexicon_count(text).to_f / sentence_count(text).to_f
-     asl.round(1)
-   rescue ZeroDivisionError
-     0.0
-   end
-
-   def self.avg_syllables_per_word(text)
-     syllable = syllable_count(text)
-     words = lexicon_count(text)
-     begin
-       syllables_per_word = syllable.to_f / words.to_f
-       return syllables_per_word.round(1)
-     rescue ZeroDivisionError
-       return 0.0
-     end
-   end
-
-   def self.avg_letter_per_word(text)
-     letters_per_word = char_count(text).to_f / lexicon_count(text).to_f
-     letters_per_word.round(2)
-   rescue ZeroDivisionError
-     0.0
-   end
-
-   def self.avg_sentence_per_word(text)
-     sentence_per_word = sentence_count(text).to_f / lexicon_count(text).to_f
-     sentence_per_word.round(2)
-   rescue ZeroDivisionError
-     0.0
-   end
-
-   def self.flesch_reading_ease(text)
-     sentence_length = avg_sentence_length(text)
-     syllables_per_word = avg_syllables_per_word(text)
-     flesch = (
-       206.835 - (1.015 * sentence_length).to_f - (84.6 * syllables_per_word).to_f
-     )
-     flesch.round(2)
-   end
-
-   def self.flesch_kincaid_grade(text)
-     sentence_length = avg_sentence_length(text)
-     syllables_per_word = avg_syllables_per_word(text)
-     flesch = (0.39 * sentence_length.to_f) + (11.8 * syllables_per_word.to_f) - 15.59
-     flesch.round(1)
-   end
-
-   def self.polysyllab_count(text)
-     count = 0
-     text.split(' ').each do |word|
-       w = syllable_count(word)
-       count += 1 if w >= 3
-     end
-     count
-   end
-
-   def self.smog_index(text)
-     sentences = sentence_count(text)
-
-     if sentences >= 3
-       begin
-         polysyllab = polysyllab_count(text)
-         smog = (
-           (1.043 * (30 * (polysyllab / sentences))**0.5) + 3.1291)
-         return smog.round(1)
-       rescue ZeroDivisionError
-         return 0.0
-       end
-     else
-       return 0.0
-     end
-   end
-
-   def self.coleman_liau_index(text)
-     letters = (avg_letter_per_word(text) * 100).round(2)
-     sentences = (avg_sentence_per_word(text) * 100).round(2)
-     coleman = ((0.058 * letters) - (0.296 * sentences) - 15.8).to_f
-     coleman.round(2)
-   end
-
-   def self.automated_readability_index(text)
-     chars = char_count(text)
-     words = lexicon_count(text)
-     sentences = sentence_count(text)
-     begin
-       a = chars.to_f / words.to_f
-       b = words.to_f / sentences.to_f
-
-       readability = (
-         (4.71 * a.round(2) + (0.5 * b.round(2))) - 21.43)
-       return readability.round(1)
-     rescue ZeroDivisionError
-       return 0.0
-     end
-   end
-
-   def self.linsear_write_formula(text)
-     easy_word = 0
-     difficult_word = 0
-     text_list = text.split(' ')[0..100]
-
-     text_list.each do |word|
-       if syllable_count(word) < 3
-         easy_word += 1
-       else
-         difficult_word += 1
-       end
-     end
-
-     text = text_list.join(' ')
-
-     number = ((easy_word * 1 + difficult_word * 3) / sentence_count(text)).to_f
-     if number <= 20
-       number -= 2
-     end
-     return number / 2
-   end
-
-   def self.difficult_words(text, language = 'en_us')
-     require 'set'
-     easy_words = Set.new
-     File.read(File.join(dictionary_path, "#{language}.txt")).each_line do |line|
-       easy_words << line.chop
-     end
-
-     text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
-     diff_words_set = Set.new
-     text_list.each do |value|
-       unless easy_words.include? value
-         if syllable_count(value) > 1
-           diff_words_set.add(value)
-         end
-       end
-     end
-     return diff_words_set.length
-   end
-
-   def self.dale_chall_readability_score(text)
-     word_count = lexicon_count(text)
-     count = word_count - difficult_words(text)
-
-     begin
-       per = count.to_f / word_count.to_f * 100
-     rescue ZeroDivisionError
-       return 0.0
-     end
-
-     difficult_words = 100 - per
-     score = (
-       (0.1579 * difficult_words)
-       + (0.0496 * avg_sentence_length(text)))
-
-     if difficult_words > 5
-       score += 3.6365
-     end
-     return score.round(2)
-   end
-
-   def self.gunning_fog(text)
-     begin
-       per_diff_words = (
-         (difficult_words(text) / lexicon_count(text) * 100) + 5)
-
-       grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
-       return grade.round(2)
-     rescue ZeroDivisionError
-       return 0.0
-     end
-   end
-
-   def self.lix(text)
-     words = text.split(' ')
-     words_length = words.length
-     long_words = words.select { |word| word.length > 6 }.count
-
-     per_long_words = (long_words * 100).to_f / words_length
-     asl = avg_sentence_length(text)
-     lix = asl + per_long_words
-
-     return lix.round(2)
-   end
-
-   def self.text_standard(text, float_output=nil)
-     grade = []
-
-     lower = flesch_kincaid_grade(text).round
-     upper = flesch_kincaid_grade(text).ceil
-     grade.append(lower.to_i)
-     grade.append(upper.to_i)
-
-     # Appending Flesch Reading Easy
-     score = flesch_reading_ease(text)
-     if score < 100 && score >= 90
-       grade.append(5)
-     elsif score < 90 && score >= 80
-       grade.append(6)
-     elsif score < 80 && score >= 70
-       grade.append(7)
-     elsif score < 70 && score >= 60
-       grade.append(8)
-       grade.append(9)
-     elsif score < 60 && score >= 50
-       grade.append(10)
-     elsif score < 50 && score >= 40
-       grade.append(11)
-     elsif score < 40 && score >= 30
-       grade.append(12)
-     else
-       grade.append(13)
-     end
-
-     # Appending SMOG Index
-     lower = smog_index(text).round
-     upper = smog_index(text).ceil
-     grade.append(lower.to_i)
-     grade.append(upper.to_i)
-
-     # Appending Coleman_Liau_Index
-     lower = coleman_liau_index(text).round
-     upper = coleman_liau_index(text).ceil
-     grade.append(lower.to_i)
-     grade.append(upper.to_i)
-
-     # Appending Automated_Readability_Index
-     lower = automated_readability_index(text).round
-     upper = automated_readability_index(text).ceil
-     grade.append(lower.to_i)
-     grade.append(upper.to_i)
-
-     # Appending Dale_Chall_Readability_Score
-     lower = dale_chall_readability_score(text).round
-     upper = dale_chall_readability_score(text).ceil
-     grade.append(lower.to_i)
-     grade.append(upper.to_i)
-
-     # Appending Linsear_Write_Formula
-     lower = linsear_write_formula(text).round
-     upper = linsear_write_formula(text).ceil
-     grade.append(lower.to_i)
-     grade.append(upper.to_i)
-
-     # Appending Gunning Fog Index
-     lower = gunning_fog(text).round
-     upper = gunning_fog(text).ceil
-     grade.append(lower.to_i)
-     grade.append(upper.to_i)
-
-     # Finding the Readability Consensus based upon all the above tests
-     require 'counter'
-     d = Counter.new(grade)
-     final_grade = d.most_common(1)
-     score = final_grade[0][0]
-
-     if float_output
-       return score.to_f
-     else
-       return "#{score.to_i - 1}th and #{score.to_i}th grade"
-     end
-   end
-
-   def self.dictionary_path=(path)
-     @dictionary_path = path
-   end
-
-   def self.dictionary_path
-     @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
-   end
- end
+ require 'text-hyphen'
+
+ class TextStat
+   GEM_PATH = File.dirname(File.dirname(__FILE__))
+
+   def self.char_count(text, ignore_spaces = true)
+     text = text.delete(' ') if ignore_spaces
+     text.length
+   end
+
+   def self.lexicon_count(text, remove_punctuation = true)
+     text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
+     count = text.split(' ').count
+     count
+   end
+
+   def self.syllable_count(text, language = 'en_us')
+     return 0 if text.empty?
+
+     text = text.downcase
+     text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
+     dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
+     count = 0
+     text.split(' ').each do |word|
+       word_hyphenated = dictionary.visualise(word)
+       count += word_hyphenated.count('-') + 1
+     end
+     count
+   end
+
+   def self.sentence_count(text)
+     text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
+   end
+
+   def self.avg_sentence_length(text)
+     asl = lexicon_count(text).to_f / sentence_count(text)
+     asl.round(1)
+   rescue ZeroDivisionError
+     0.0
+   end
+
+   def self.avg_syllables_per_word(text)
+     syllable = syllable_count(text)
+     words = lexicon_count(text)
+     begin
+       syllables_per_word = syllable.to_f / words
+       syllables_per_word.round(1)
+     rescue ZeroDivisionError
+       0.0
+     end
+   end
+
+   def self.avg_letter_per_word(text)
+     letters_per_word = char_count(text).to_f / lexicon_count(text)
+     letters_per_word.round(2)
+   rescue ZeroDivisionError
+     0.0
+   end
+
+   def self.avg_sentence_per_word(text)
+     sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
+     sentence_per_word.round(2)
+   rescue ZeroDivisionError
+     0.0
+   end
+
+   def self.flesch_reading_ease(text)
+     sentence_length = avg_sentence_length(text)
+     syllables_per_word = avg_syllables_per_word(text)
+     flesch = 206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word
+     flesch.round(2)
+   end
+
+   def self.flesch_kincaid_grade(text)
+     sentence_length = avg_sentence_length(text)
+     syllables_per_word = avg_syllables_per_word(text)
+     flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
+     flesch.round(1)
+   end
+
+   def self.polysyllab_count(text)
+     count = 0
+     text.split(' ').each do |word|
+       w = syllable_count(word)
+       count += 1 if w >= 3
+     end
+     count
+   end
+
+   def self.smog_index(text)
+     sentences = sentence_count(text)
+
+     if sentences >= 3
+       begin
+         polysyllab = polysyllab_count(text)
+         smog = 1.043 * Math.sqrt(30.0 * polysyllab / sentences) + 3.1291
+         smog.round(1)
+       rescue ZeroDivisionError
+         0.0
+       end
+     else
+       0.0
+     end
+   end
+
+   def self.coleman_liau_index(text)
+     letters = (avg_letter_per_word(text) * 100).round(2)
+     sentences = (avg_sentence_per_word(text) * 100).round(2)
+     coleman = 0.0588 * letters - 0.296 * sentences - 15.8
+     coleman.round(2)
+   end
+
+   def self.automated_readability_index(text)
+     chars = char_count(text)
+     words = lexicon_count(text)
+     sentences = sentence_count(text)
+     begin
+       a = chars.to_f / words
+       b = words.to_f / sentences
+
+       readability = 4.71 * a + 0.5 * b - 21.43
+       readability.round(1)
+     rescue ZeroDivisionError
+       0.0
+     end
+   end
+
+   def self.linsear_write_formula(text)
+     easy_word = 0
+     difficult_word = 0
+     text_list = text.split(' ')[0..100]
+
+     text_list.each do |word|
+       if syllable_count(word) < 3
+         easy_word += 1
+       else
+         difficult_word += 1
+       end
+     end
+
+     text = text_list.join(' ')
+
+     number = (easy_word * 1 + difficult_word * 3).to_f / sentence_count(text)
+     number -= 2 if number <= 20
+     number / 2
+   end
+
+   def self.difficult_words(text, language = 'en_us')
+     require 'set'
+     easy_words = Set.new
+     File.read(File.join(dictionary_path, "#{language}.txt")).each_line do |line|
+       easy_words << line.chop
+     end
+
+     text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
+     diff_words_set = Set.new
+     text_list.each do |value|
+       next if easy_words.include? value
+
+       diff_words_set.add(value) if syllable_count(value) > 1
+     end
+     diff_words_set.length
+   end
+
+   def self.dale_chall_readability_score(text)
+     word_count = lexicon_count(text)
+     count = word_count - difficult_words(text)
+
+     begin
+       per = 100.0 * count / word_count
+     rescue ZeroDivisionError
+       return 0.0
+     end
+
+     difficult_words = 100 - per
+     score = 0.1579 * difficult_words + 0.0496 * avg_sentence_length(text)
+     score += 3.6365 if difficult_words > 5
+
+     score.round(2)
+   end
+
+   def self.gunning_fog(text)
+     per_diff_words = 100.0 * difficult_words(text) / lexicon_count(text) + 5
+     grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
+
+     grade.round(2)
+   rescue ZeroDivisionError
+     0.0
+   end
+
+   def self.lix(text)
+     words = text.split(' ')
+     words_length = words.length
+     long_words = words.count { |word| word.length > 6 }
+
+     per_long_words = 100.0 * long_words / words_length
+     asl = avg_sentence_length(text)
+     lix = asl + per_long_words
+
+     lix.round(2)
+   end
+
+   def self.forcast(text, language = 'en_us')
+     words = text.split(' ')[0..149]
+     words_with_one_syllabe = words.count {
+       |word| syllable_count(word, language) == 1
+     }
+     forcast = 20 - (words_with_one_syllabe / 10)
+     forcast
+   end
+
+   def self.powers_sumner_kearl(text)
+     grade = 0.0778 * avg_sentence_length(text) + 0.0455 * syllable_count(text) - 2.2029
+     grade.round(2)
+   end
+
+   def self.spache(text, language = 'en_us')
+     words = text.split(' ').count
+     unfamiliar_words = difficult_words(text, language) / words
+     grade = (0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_words) + 0.839
+     grade.round(2)
+   end
+
+   def self.text_standard(text, float_output=nil)
+     grade = []
+
+     lower = flesch_kincaid_grade(text).round
+     upper = flesch_kincaid_grade(text).ceil
+     grade.append(lower.to_i)
+     grade.append(upper.to_i)
+
+     # Appending Flesch Reading Easy
+     score = flesch_reading_ease(text)
+     if score < 100 && score >= 90
+       grade.append(5)
+     elsif score < 90 && score >= 80
+       grade.append(6)
+     elsif score < 80 && score >= 70
+       grade.append(7)
+     elsif score < 70 && score >= 60
+       grade.append(8)
+       grade.append(9)
+     elsif score < 60 && score >= 50
+       grade.append(10)
+     elsif score < 50 && score >= 40
+       grade.append(11)
+     elsif score < 40 && score >= 30
+       grade.append(12)
+     else
+       grade.append(13)
+     end
+
+     # Appending SMOG Index
+     lower = smog_index(text).round
+     upper = smog_index(text).ceil
+     grade.append(lower.to_i)
+     grade.append(upper.to_i)
+
+     # Appending Coleman_Liau_Index
+     lower = coleman_liau_index(text).round
+     upper = coleman_liau_index(text).ceil
+     grade.append(lower.to_i)
+     grade.append(upper.to_i)
+
+     # Appending Automated_Readability_Index
+     lower = automated_readability_index(text).round
+     upper = automated_readability_index(text).ceil
+     grade.append(lower.to_i)
+     grade.append(upper.to_i)
+
+     # Appending Dale_Chall_Readability_Score
+     lower = dale_chall_readability_score(text).round
+     upper = dale_chall_readability_score(text).ceil
+     grade.append(lower.to_i)
+     grade.append(upper.to_i)
+
+     # Appending Linsear_Write_Formula
+     lower = linsear_write_formula(text).round
+     upper = linsear_write_formula(text).ceil
+     grade.append(lower.to_i)
+     grade.append(upper.to_i)
+
+     # Appending Gunning Fog Index
+     lower = gunning_fog(text).round
+     upper = gunning_fog(text).ceil
+     grade.append(lower.to_i)
+     grade.append(upper.to_i)
+
+     # Finding the Readability Consensus based upon all the above tests
+     require 'counter'
+     d = Counter.new(grade)
+     final_grade = d.most_common(1)
+     score = final_grade[0][0]
+
+     if float_output
+       score.to_f
+     else
+       "#{score.to_i - 1}th and #{score.to_i}th grade"
+     end
+   end
+
+   def self.dictionary_path=(path)
+     @dictionary_path = path
+   end
+
+   def self.dictionary_path
+     @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
+   end
+ end
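
For reference, a minimal usage sketch of the three formulas added in 0.1.7 (forcast, powers_sumner_kearl and spache). The sample text and printed results are hypothetical; the only assumptions are that the gem is installed and loaded via require 'textstat'.

  require 'textstat'

  # Hypothetical sample text; any English string works.
  text = 'Playing games has always been thought to be important to ' \
         'the development of well-balanced and creative children.'

  # FORCAST: based on the one-syllable words among the first 150 words.
  puts TextStat.forcast(text)

  # Powers-Sumner-Kearl: combines average sentence length and syllable count.
  puts TextStat.powers_sumner_kearl(text)

  # Spache: relies on the easy-word list read from TextStat.dictionary_path,
  # which defaults to the dictionaries bundled with the gem and can be
  # overridden with TextStat.dictionary_path=.
  puts TextStat.spache(text, 'en_us')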