textstat 0.1.1 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
data/lib/textstat.rb CHANGED
@@ -1,293 +1,309 @@
1
- require 'text-hyphen'
2
-
3
- class TextStat
4
- def self.char_count(text, ignore_spaces = true)
5
- text = text.delete(' ') if ignore_spaces
6
- text.length
7
- end
8
-
9
- def self.lexicon_count(text, remove_punctuation = true)
10
- text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
11
- count = text.split(' ').count
12
- count
13
- end
14
-
15
- def self.syllable_count(text, language = 'en_us')
16
- return 0 if text.empty?
17
-
18
- text = text.downcase
19
- text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
20
- dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
21
- count = 0
22
- text.split(' ').each do |word|
23
- word_hyphenated = dictionary.visualise(word)
24
- count += [1, word_hyphenated.count('-') + 1].max
25
- end
26
- count
27
- end
28
-
29
- def self.sentence_count(text)
30
- text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
31
- end
32
-
33
- def self.avg_sentence_length(text)
34
- asl = lexicon_count(text).to_f / sentence_count(text).to_f
35
- asl.round(1)
36
- rescue ZeroDivisionError
37
- 0.0
38
- end
39
-
40
- def self.avg_syllables_per_word(text)
41
- syllable = syllable_count(text)
42
- words = lexicon_count(text)
43
- begin
44
- syllables_per_word = syllable.to_f / words.to_f
45
- return syllables_per_word.round(1)
46
- rescue ZeroDivisionError
47
- return 0.0
48
- end
49
- end
50
-
51
- def self.avg_letter_per_word(text)
52
- letters_per_word = char_count(text).to_f / lexicon_count(text).to_f
53
- letters_per_word.round(2)
54
- rescue ZeroDivisionError
55
- 0.0
56
- end
57
-
58
- def self.avg_sentence_per_word(text)
59
- sentence_per_word = sentence_count(text).to_f / lexicon_count(text).to_f
60
- sentence_per_word.round(2)
61
- rescue ZeroDivisionError
62
- 0.0
63
- end
64
-
65
- def self.flesch_reading_ease(text)
66
- sentence_length = avg_sentence_length(text)
67
- syllables_per_word = avg_syllables_per_word(text)
68
- flesch = (
69
- 206.835 - (1.015 * sentence_length).to_f - (84.6 * syllables_per_word).to_f
70
- )
71
- flesch.round(2)
72
- end
73
-
74
- def self.flesch_kincaid_grade(text)
75
- sentence_length = avg_sentence_length(text)
76
- syllables_per_word = avg_syllables_per_word(text)
77
- flesch = (0.39 * sentence_length.to_f) + (11.8 * syllables_per_word.to_f) - 15.59
78
- flesch.round(1)
79
- end
80
-
81
- def self.polysyllab_count(text)
82
- count = 0
83
- text.split(' ').each do |word|
84
- w = syllable_count(word)
85
- count += 1 if w >= 3
86
- end
87
- count
88
- end
89
-
90
- def self.smog_index(text)
91
- sentences = sentence_count(text)
92
-
93
- if sentences >= 3
94
- begin
95
- polysyllab = polysyllab_count(text)
96
- smog = (
97
- (1.043 * (30 * (polysyllab / sentences))**0.5) + 3.1291)
98
- return smog.round(1)
99
- rescue ZeroDivisionError
100
- return 0.0
101
- end
102
- else
103
- return 0.0
104
- end
105
- end
106
-
107
- def self.coleman_liau_index(text)
108
- letters = (avg_letter_per_word(text) * 100).round(2)
109
- sentences = (avg_sentence_per_word(text) * 100).round(2)
110
- coleman = ((0.058 * letters) - (0.296 * sentences) - 15.8).to_f
111
- coleman.round(2)
112
- end
113
-
114
- def self.automated_readability_index(text)
115
- chars = char_count(text)
116
- words = lexicon_count(text)
117
- sentences = sentence_count(text)
118
- begin
119
- a = chars.to_f / words.to_f
120
- b = words.to_f / sentences.to_f
121
-
122
- readability = (
123
- (4.71 * a.round(2) + (0.5 * b.round(2))) - 21.43)
124
- return readability.round(1)
125
- rescue ZeroDivisionError
126
- return 0.0
127
- end
128
- end
129
-
130
- def self.linsear_write_formula(text)
131
- easy_word = 0
132
- difficult_word = 0
133
- text_list = text.split(' ')[0..100]
134
-
135
- text_list.each do |word|
136
- if syllable_count(word) < 3
137
- easy_word += 1
138
- else
139
- difficult_word += 1
140
- end
141
- end
142
-
143
- text = text_list.join(' ')
144
-
145
- number = ((easy_word * 1 + difficult_word * 3) / sentence_count(text)).to_f
146
- if number <= 20
147
- number -= 2
148
- end
149
- return number / 2
150
- end
151
-
152
- def self.difficult_words(text, language = 'en_us')
153
- require 'set'
154
- easy_words = Set.new
155
- File.read("lib/dictionaries/#{language}.txt").each_line do |line|
156
- easy_words << line.chop
157
- end
158
-
159
- text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
160
- diff_words_set = Set.new
161
- text_list.each do |value|
162
- unless easy_words.include? value
163
- if syllable_count(value) > 1
164
- diff_words_set.add(value)
165
- end
166
- end
167
- end
168
- return diff_words_set.length
169
- end
170
-
171
- def self.dale_chall_readability_score(text)
172
- word_count = lexicon_count(text)
173
- count = word_count - difficult_words(text)
174
-
175
- begin
176
- per = count.to_f / word_count.to_f * 100
177
- rescue ZeroDivisionError
178
- return 0.0
179
- end
180
-
181
- difficult_words = 100 - per
182
- score = (
183
- (0.1579 * difficult_words)
184
- + (0.0496 * avg_sentence_length(text)))
185
-
186
- if difficult_words > 5
187
- score += 3.6365
188
- end
189
- return score.round(2)
190
- end
191
-
192
- def self.gunning_fog(text)
193
- begin
194
- per_diff_words = (
195
- (difficult_words(text) / lexicon_count(text) * 100) + 5)
196
-
197
- grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
198
- return grade.round(2)
199
- rescue ZeroDivisionError
200
- return 0.0
201
- end
202
- end
203
-
204
- def self.lix(text)
205
- words = text.split(' ')
206
- words_length = words.length
207
- long_words = words.select { |word| word.length > 6 }.count
208
-
209
- per_long_words = (long_words * 100).to_f / words_length
210
- asl = avg_sentence_length(text)
211
- lix = asl + per_long_words
212
-
213
- return lix.round(2)
214
- end
215
-
216
- def self.text_standard(text, float_output=nil)
217
- grade = []
218
-
219
- lower = flesch_kincaid_grade(text).round
220
- upper = flesch_kincaid_grade(text).ceil
221
- grade.append(lower.to_i)
222
- grade.append(upper.to_i)
223
-
224
- # Appending Flesch Reading Easy
225
- score = flesch_reading_ease(text)
226
- if score < 100 && score >= 90
227
- grade.append(5)
228
- elsif score < 90 && score >= 80
229
- grade.append(6)
230
- elsif score < 80 && score >= 70
231
- grade.append(7)
232
- elsif score < 70 && score >= 60
233
- grade.append(8)
234
- grade.append(9)
235
- elsif score < 60 && score >= 50
236
- grade.append(10)
237
- elsif score < 50 && score >= 40
238
- grade.append(11)
239
- elsif score < 40 && score >= 30
240
- grade.append(12)
241
- else
242
- grade.append(13)
243
- end
244
-
245
- # Appending SMOG Index
246
- lower = smog_index(text).round
247
- upper = smog_index(text).ceil
248
- grade.append(lower.to_i)
249
- grade.append(upper.to_i)
250
-
251
- # Appending Coleman_Liau_Index
252
- lower = coleman_liau_index(text).round
253
- upper = coleman_liau_index(text).ceil
254
- grade.append(lower.to_i)
255
- grade.append(upper.to_i)
256
-
257
- # Appending Automated_Readability_Index
258
- lower = automated_readability_index(text).round
259
- upper = automated_readability_index(text).ceil
260
- grade.append(lower.to_i)
261
- grade.append(upper.to_i)
262
-
263
- # Appending Dale_Chall_Readability_Score
264
- lower = dale_chall_readability_score(text).round
265
- upper = dale_chall_readability_score(text).ceil
266
- grade.append(lower.to_i)
267
- grade.append(upper.to_i)
268
-
269
- # Appending Linsear_Write_Formula
270
- lower = linsear_write_formula(text).round
271
- upper = linsear_write_formula(text).ceil
272
- grade.append(lower.to_i)
273
- grade.append(upper.to_i)
274
-
275
- # Appending Gunning Fog Index
276
- lower = gunning_fog(text).round
277
- upper = gunning_fog(text).ceil
278
- grade.append(lower.to_i)
279
- grade.append(upper.to_i)
280
-
281
- # Finding the Readability Consensus based upon all the above tests
282
- require 'counter'
283
- d = Counter.new(grade)
284
- final_grade = d.most_common(1)
285
- score = final_grade[0][0]
286
-
287
- if float_output
288
- return score.to_f
289
- else
290
- return "#{score.to_i - 1}th and #{score.to_i}th grade"
291
- end
292
- end
293
- end
1
+ require 'text-hyphen'
2
+
3
# Readability statistics for English-like text.
#
# All metrics are exposed as class methods taking the text as a String.
# Syllable counting relies on the Text::Hyphen hyphenation dictionaries
# (loaded by the file-level `require 'text-hyphen'`), and the "difficult
# word" metrics rely on a plain-text list of easy words, one per line,
# located in TextStat.dictionary_path.
class TextStat
  GEM_PATH = File.dirname(File.dirname(__FILE__))

  # Lower bounds of Flesch reading-ease bands and the grade(s) each band
  # votes for in TextStat.text_standard. Ordered from easiest to hardest.
  READING_EASE_GRADES = [
    [90, [5]],
    [80, [6]],
    [70, [7]],
    [60, [8, 9]],
    [50, [10]],
    [40, [11]],
    [30, [12]]
  ].freeze

  # Number of characters in +text+.
  #
  # @param text [String]
  # @param ignore_spaces [Boolean] when true, ' ' characters are not counted
  #   (other whitespace such as newlines still is)
  # @return [Integer]
  def self.char_count(text, ignore_spaces = true)
    text = text.delete(' ') if ignore_spaces
    text.length
  end

  # Number of whitespace-separated words in +text+.
  #
  # @param text [String]
  # @param remove_punctuation [Boolean] when true, every character that is
  #   not an ASCII letter or whitespace is deleted before counting
  # @return [Integer]
  def self.lexicon_count(text, remove_punctuation = true)
    text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
    text.split(' ').size
  end

  # Total number of syllables in +text+, estimated from hyphenation points.
  #
  # Punctuation and digits are stripped first so that tokens such as "-" or
  # "3.5" do not contribute phantom syllables (a literal '-' in a word would
  # otherwise be counted as a hyphenation point).
  #
  # @param text [String]
  # @param language [String] Text::Hyphen language code
  # @return [Integer]
  def self.syllable_count(text, language = 'en_us')
    return 0 if text.empty?

    words = text.downcase.gsub(/[^a-z\s]/, '').split(' ')
    dictionary = hyphenator(language)
    words.sum { |word| dictionary.visualise(word).count('-') + 1 }
  end

  # Estimated number of sentences: one more than the number of sentence
  # boundaries found. A boundary is '.', '?' or '!', optionally followed by
  # closing quotes/brackets, then a space, '|' or newline, then a capital
  # ASCII letter. Note that this returns 1 even for empty text.
  #
  # @param text [String]
  # @return [Integer]
  def self.sentence_count(text)
    text.scan(/[.?!]['"\\)\]]*[ |\n][A-Z]/).size + 1
  end

  # Average number of words per sentence, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.avg_sentence_length(text)
    # sentence_count is always >= 1, so this division is safe.
    (lexicon_count(text).to_f / sentence_count(text)).round(1)
  end

  # Average number of syllables per word, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_syllables_per_word(text)
    words = lexicon_count(text)
    # Float division by zero yields NaN instead of raising, so guard explicitly.
    return 0.0 if words.zero?

    (syllable_count(text).to_f / words).round(1)
  end

  # Average number of non-space characters per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_letter_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (char_count(text).to_f / words).round(2)
  end

  # Average number of sentences per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_sentence_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (sentence_count(text).to_f / words).round(2)
  end

  # Flesch reading-ease score (higher means easier), rounded to two decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.flesch_reading_ease(text)
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text)
    (206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word).round(2)
  end

  # Flesch-Kincaid grade level, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.flesch_kincaid_grade(text)
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text)
    (0.39 * sentence_length + 11.8 * syllables_per_word - 15.59).round(1)
  end

  # Number of whitespace-separated words with three or more syllables.
  #
  # @param text [String]
  # @return [Integer]
  def self.polysyllab_count(text)
    text.split(' ').count { |word| syllable_count(word) >= 3 }
  end

  # SMOG index, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when fewer than three sentences are detected
  def self.smog_index(text)
    sentences = sentence_count(text)
    return 0.0 if sentences < 3

    (1.043 * Math.sqrt(30.0 * polysyllab_count(text) / sentences) + 3.1291).round(1)
  end

  # Coleman-Liau index, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.coleman_liau_index(text)
    letters = (avg_letter_per_word(text) * 100).round(2)
    sentences = (avg_sentence_per_word(text) * 100).round(2)
    (0.0588 * letters - 0.296 * sentences - 15.8).round(2)
  end

  # Automated readability index, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.automated_readability_index(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    chars_per_word = char_count(text).to_f / words
    words_per_sentence = words.to_f / sentence_count(text)
    (4.71 * chars_per_word + 0.5 * words_per_sentence - 21.43).round(1)
  end

  # Linsear Write grade, computed on the first 100 words of +text+.
  # Words under three syllables score 1 point, all others 3 points.
  #
  # @param text [String]
  # @return [Float]
  def self.linsear_write_formula(text)
    sample = text.split(' ').first(100)
    difficult_word = sample.count { |word| syllable_count(word) >= 3 }
    easy_word = sample.size - difficult_word

    number = (easy_word + difficult_word * 3).to_f / sentence_count(sample.join(' '))
    number -= 2 if number <= 20
    number / 2
  end

  # Number of distinct words that are not in the easy-word list for
  # +language+ and have more than one syllable.
  #
  # @param text [String]
  # @param language [String] base name of the "<language>.txt" word list
  #   inside TextStat.dictionary_path
  # @return [Integer]
  # @raise [SystemCallError] when the word list cannot be read
  def self.difficult_words(text, language = 'en_us')
    known = easy_words(language)
    # Keep all whitespace (not just ' ') so words on adjacent lines stay separate.
    candidates = text.downcase.gsub(/[^0-9a-z\s]/, '').split(' ').uniq
    candidates.count { |word| !known.include?(word) && syllable_count(word) > 1 }
  end

  # Dale-Chall style readability score, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.dale_chall_readability_score(text)
    word_count = lexicon_count(text)
    return 0.0 if word_count.zero?

    difficult_pct = 100.0 * difficult_words(text) / word_count
    score = 0.1579 * difficult_pct + 0.0496 * avg_sentence_length(text)
    score += 3.6365 if difficult_pct > 5

    score.round(2)
  end

  # Gunning fog grade, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.gunning_fog(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    per_diff_words = 100.0 * difficult_words(text) / words + 5
    (0.4 * (avg_sentence_length(text) + per_diff_words)).round(2)
  end

  # LIX score: average sentence length plus the percentage of tokens longer
  # than six characters, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no tokens
  def self.lix(text)
    words = text.split(' ')
    return 0.0 if words.empty?

    long_words = words.count { |word| word.length > 6 }
    (avg_sentence_length(text) + 100.0 * long_words / words.size).round(2)
  end

  # FORCAST grade, computed on the first 150 words of +text+:
  # 20 minus one tenth of the number of single-syllable words.
  #
  # @param text [String]
  # @param language [String] Text::Hyphen language code
  # @return [Float]
  def self.forcast(text, language = 'en_us')
    sample = text.split(' ').first(150)
    single_syllable = sample.count { |word| syllable_count(word, language) == 1 }
    # Divide by a Float: integer division would truncate the grade.
    20 - single_syllable / 10.0
  end

  # Powers-Sumner-Kearl grade, rounded to two decimals. The syllable term is
  # normalised to syllables per 100 words, so the result does not depend on
  # the text being exactly 100 words long.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.powers_sumner_kearl(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    syllables_per_100_words = 100.0 * syllable_count(text) / words
    (0.0778 * avg_sentence_length(text) + 0.0455 * syllables_per_100_words - 2.2029).round(2)
  end

  # Spache grade, rounded to two decimals, using the percentage of distinct
  # unfamiliar words (see TextStat.difficult_words).
  #
  # @param text [String]
  # @param language [String] word-list language, see TextStat.difficult_words
  # @return [Float] 0.0 when the text contains no tokens
  def self.spache(text, language = 'en_us')
    words = text.split(' ').size
    return 0.0 if words.zero?

    unfamiliar_pct = 100.0 * difficult_words(text, language) / words
    (0.141 * avg_sentence_length(text) + 0.086 * unfamiliar_pct + 0.839).round(2)
  end

  # Consensus grade level: every metric votes with its rounded and
  # rounded-up grade, and the most common vote wins.
  #
  # @param text [String]
  # @param float_output [Boolean, nil] when truthy return the grade as a
  #   Float, otherwise a "<n-1>th and <n>th grade" String
  # @return [Float, String]
  def self.text_standard(text, float_output=nil)
    grade = []
    # Each metric is evaluated once and contributes its lower and upper bound.
    add_bounds = ->(value) { grade.append(value.round.to_i, value.ceil.to_i) }

    add_bounds.call(flesch_kincaid_grade(text))

    # Flesch reading ease: scores of 90 and above (including >= 100) are the
    # easiest band; anything below 30 is treated as grade 13.
    ease = flesch_reading_ease(text)
    band = READING_EASE_GRADES.find { |floor, _grades| ease >= floor }
    grade.concat(band ? band.last : [13])

    [
      smog_index(text),
      coleman_liau_index(text),
      automated_readability_index(text),
      dale_chall_readability_score(text),
      linsear_write_formula(text),
      gunning_fog(text)
    ].each(&add_bounds)

    # Finding the Readability Consensus based upon all the above tests
    require 'counter'
    d = Counter.new(grade)
    final_grade = d.most_common(1)
    score = final_grade[0][0]

    if float_output
      score.to_f
    else
      "#{score.to_i - 1}th and #{score.to_i}th grade"
    end
  end

  # Overrides the directory that holds the "<language>.txt" easy-word lists.
  #
  # @param path [String]
  def self.dictionary_path=(path)
    @dictionary_path = path
  end

  # Directory that holds the easy-word lists; defaults to
  # "<GEM_PATH>/lib/dictionaries".
  #
  # @return [String]
  def self.dictionary_path
    @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
  end

  # Memoised Text::Hyphen instance per language; building one per call is
  # wasteful because several metrics call syllable_count once per word.
  def self.hyphenator(language)
    @hyphenators ||= {}
    @hyphenators[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
  end
  private_class_method :hyphenator

  # Memoised Set of easy words, keyed by the word-list file path. Lines are
  # chomped (not chopped) so a final line without a newline keeps its last
  # letter.
  def self.easy_words(language)
    require 'set'
    path = File.join(dictionary_path, "#{language}.txt")
    @easy_words ||= {}
    @easy_words[path] ||= File.readlines(path).map(&:chomp).to_set
  end
  private_class_method :easy_words
end