textstat 0.1.7 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,3 @@
1
- class TextStat
2
- VERSION = "0.1.7"
3
- end
1
class TextStat
  # Release number of the textstat gem. Frozen so the shared constant cannot
  # be mutated in place by callers.
  VERSION = "0.1.9".freeze
end
data/lib/textstat.rb CHANGED
@@ -1,309 +1,313 @@
1
- require 'text-hyphen'
2
-
3
- class TextStat
4
- GEM_PATH = File.dirname(File.dirname(__FILE__))
5
-
6
- def self.char_count(text, ignore_spaces = true)
7
- text = text.delete(' ') if ignore_spaces
8
- text.length
9
- end
10
-
11
- def self.lexicon_count(text, remove_punctuation = true)
12
- text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
13
- count = text.split(' ').count
14
- count
15
- end
16
-
17
- def self.syllable_count(text, language = 'en_us')
18
- return 0 if text.empty?
19
-
20
- text = text.downcase
21
- text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
22
- dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
23
- count = 0
24
- text.split(' ').each do |word|
25
- word_hyphenated = dictionary.visualise(word)
26
- count += word_hyphenated.count('-') + 1
27
- end
28
- count
29
- end
30
-
31
- def self.sentence_count(text)
32
- text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
33
- end
34
-
35
- def self.avg_sentence_length(text)
36
- asl = lexicon_count(text).to_f / sentence_count(text)
37
- asl.round(1)
38
- rescue ZeroDivisionError
39
- 0.0
40
- end
41
-
42
- def self.avg_syllables_per_word(text)
43
- syllable = syllable_count(text)
44
- words = lexicon_count(text)
45
- begin
46
- syllables_per_word = syllable.to_f / words
47
- syllables_per_word.round(1)
48
- rescue ZeroDivisionError
49
- 0.0
50
- end
51
- end
52
-
53
- def self.avg_letter_per_word(text)
54
- letters_per_word = char_count(text).to_f / lexicon_count(text)
55
- letters_per_word.round(2)
56
- rescue ZeroDivisionError
57
- 0.0
58
- end
59
-
60
- def self.avg_sentence_per_word(text)
61
- sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
62
- sentence_per_word.round(2)
63
- rescue ZeroDivisionError
64
- 0.0
65
- end
66
-
67
- def self.flesch_reading_ease(text)
68
- sentence_length = avg_sentence_length(text)
69
- syllables_per_word = avg_syllables_per_word(text)
70
- flesch = 206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word
71
- flesch.round(2)
72
- end
73
-
74
- def self.flesch_kincaid_grade(text)
75
- sentence_length = avg_sentence_length(text)
76
- syllables_per_word = avg_syllables_per_word(text)
77
- flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
78
- flesch.round(1)
79
- end
80
-
81
- def self.polysyllab_count(text)
82
- count = 0
83
- text.split(' ').each do |word|
84
- w = syllable_count(word)
85
- count += 1 if w >= 3
86
- end
87
- count
88
- end
89
-
90
- def self.smog_index(text)
91
- sentences = sentence_count(text)
92
-
93
- if sentences >= 3
94
- begin
95
- polysyllab = polysyllab_count(text)
96
- smog = 1.043 * Math.sqrt(30.0 * polysyllab / sentences) + 3.1291
97
- smog.round(1)
98
- rescue ZeroDivisionError
99
- 0.0
100
- end
101
- else
102
- 0.0
103
- end
104
- end
105
-
106
- def self.coleman_liau_index(text)
107
- letters = (avg_letter_per_word(text) * 100).round(2)
108
- sentences = (avg_sentence_per_word(text) * 100).round(2)
109
- coleman = 0.0588 * letters - 0.296 * sentences - 15.8
110
- coleman.round(2)
111
- end
112
-
113
- def self.automated_readability_index(text)
114
- chars = char_count(text)
115
- words = lexicon_count(text)
116
- sentences = sentence_count(text)
117
- begin
118
- a = chars.to_f / words
119
- b = words.to_f / sentences
120
-
121
- readability = 4.71 * a + 0.5 * b - 21.43
122
- readability.round(1)
123
- rescue ZeroDivisionError
124
- 0.0
125
- end
126
- end
127
-
128
- def self.linsear_write_formula(text)
129
- easy_word = 0
130
- difficult_word = 0
131
- text_list = text.split(' ')[0..100]
132
-
133
- text_list.each do |word|
134
- if syllable_count(word) < 3
135
- easy_word += 1
136
- else
137
- difficult_word += 1
138
- end
139
- end
140
-
141
- text = text_list.join(' ')
142
-
143
- number = (easy_word * 1 + difficult_word * 3).to_f / sentence_count(text)
144
- number -= 2 if number <= 20
145
- number / 2
146
- end
147
-
148
- def self.difficult_words(text, language = 'en_us')
149
- require 'set'
150
- easy_words = Set.new
151
- File.read(File.join(dictionary_path, "#{language}.txt")).each_line do |line|
152
- easy_words << line.chop
153
- end
154
-
155
- text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
156
- diff_words_set = Set.new
157
- text_list.each do |value|
158
- next if easy_words.include? value
159
-
160
- diff_words_set.add(value) if syllable_count(value) > 1
161
- end
162
- diff_words_set.length
163
- end
164
-
165
- def self.dale_chall_readability_score(text)
166
- word_count = lexicon_count(text)
167
- count = word_count - difficult_words(text)
168
-
169
- begin
170
- per = 100.0 * count / word_count
171
- rescue ZeroDivisionError
172
- return 0.0
173
- end
174
-
175
- difficult_words = 100 - per
176
- score = 0.1579 * difficult_words + 0.0496 * avg_sentence_length(text)
177
- score += 3.6365 if difficult_words > 5
178
-
179
- score.round(2)
180
- end
181
-
182
- def self.gunning_fog(text)
183
- per_diff_words = 100.0 * difficult_words(text) / lexicon_count(text) + 5
184
- grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
185
-
186
- grade.round(2)
187
- rescue ZeroDivisionError
188
- 0.0
189
- end
190
-
191
- def self.lix(text)
192
- words = text.split(' ')
193
- words_length = words.length
194
- long_words = words.count { |word| word.length > 6 }
195
-
196
- per_long_words = 100.0 * long_words / words_length
197
- asl = avg_sentence_length(text)
198
- lix = asl + per_long_words
199
-
200
- lix.round(2)
201
- end
202
-
203
- def self.forcast(text, language = 'en_us')
204
- words = text.split(' ')[0..149]
205
- words_with_one_syllabe = words.count {
206
- |word| syllable_count(word, language) == 1
207
- }
208
- forcast = 20 - (words_with_one_syllabe / 10)
209
- forcast
210
- end
211
-
212
- def self.powers_sumner_kearl(text)
213
- grade = 0.0778 * avg_sentence_length(text) + 0.0455 * syllable_count(text) - 2.2029
214
- grade.round(2)
215
- end
216
-
217
- def self.spache(text, language = 'en_us')
218
- words = text.split(' ').count
219
- unfamiliar_words = difficult_words(text, language) / words
220
- grade = (0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_words) + 0.839
221
- grade.round(2)
222
- end
223
-
224
- def self.text_standard(text, float_output=nil)
225
- grade = []
226
-
227
- lower = flesch_kincaid_grade(text).round
228
- upper = flesch_kincaid_grade(text).ceil
229
- grade.append(lower.to_i)
230
- grade.append(upper.to_i)
231
-
232
- # Appending Flesch Reading Easy
233
- score = flesch_reading_ease(text)
234
- if score < 100 && score >= 90
235
- grade.append(5)
236
- elsif score < 90 && score >= 80
237
- grade.append(6)
238
- elsif score < 80 && score >= 70
239
- grade.append(7)
240
- elsif score < 70 && score >= 60
241
- grade.append(8)
242
- grade.append(9)
243
- elsif score < 60 && score >= 50
244
- grade.append(10)
245
- elsif score < 50 && score >= 40
246
- grade.append(11)
247
- elsif score < 40 && score >= 30
248
- grade.append(12)
249
- else
250
- grade.append(13)
251
- end
252
-
253
- # Appending SMOG Index
254
- lower = smog_index(text).round
255
- upper = smog_index(text).ceil
256
- grade.append(lower.to_i)
257
- grade.append(upper.to_i)
258
-
259
- # Appending Coleman_Liau_Index
260
- lower = coleman_liau_index(text).round
261
- upper = coleman_liau_index(text).ceil
262
- grade.append(lower.to_i)
263
- grade.append(upper.to_i)
264
-
265
- # Appending Automated_Readability_Index
266
- lower = automated_readability_index(text).round
267
- upper = automated_readability_index(text).ceil
268
- grade.append(lower.to_i)
269
- grade.append(upper.to_i)
270
-
271
- # Appending Dale_Chall_Readability_Score
272
- lower = dale_chall_readability_score(text).round
273
- upper = dale_chall_readability_score(text).ceil
274
- grade.append(lower.to_i)
275
- grade.append(upper.to_i)
276
-
277
- # Appending Linsear_Write_Formula
278
- lower = linsear_write_formula(text).round
279
- upper = linsear_write_formula(text).ceil
280
- grade.append(lower.to_i)
281
- grade.append(upper.to_i)
282
-
283
- # Appending Gunning Fog Index
284
- lower = gunning_fog(text).round
285
- upper = gunning_fog(text).ceil
286
- grade.append(lower.to_i)
287
- grade.append(upper.to_i)
288
-
289
- # Finding the Readability Consensus based upon all the above tests
290
- require 'counter'
291
- d = Counter.new(grade)
292
- final_grade = d.most_common(1)
293
- score = final_grade[0][0]
294
-
295
- if float_output
296
- score.to_f
297
- else
298
- "#{score.to_i - 1}th and #{score.to_i}th grade"
299
- end
300
- end
301
-
302
- def self.dictionary_path=(path)
303
- @dictionary_path = path
304
- end
305
-
306
- def self.dictionary_path
307
- @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
308
- end
309
- end
1
+ require 'text-hyphen'
2
+
3
require 'set'

# Text statistics and readability scores, exposed as class methods that take
# the text to analyse as a String.
#
# Methods that depend on syllable counts accept a +language+ argument. It is
# handed to Text::Hyphen and, for the word-list based scores, also selects the
# "<language>.txt" file under +dictionary_path+.
class TextStat
  GEM_PATH = File.dirname(File.dirname(__FILE__))

  # Counts the characters in +text+.
  #
  # @param text [String]
  # @param ignore_spaces [Boolean] when true, space characters (" ") are not
  #   counted; tabs and newlines still are
  # @return [Integer]
  def self.char_count(text, ignore_spaces = true)
    text = text.delete(' ') if ignore_spaces
    text.length
  end

  # Counts whitespace-separated words.
  #
  # @param text [String]
  # @param remove_punctuation [Boolean] when true, every character that is
  #   not an ASCII letter or whitespace is removed before splitting, so a
  #   token made only of digits/punctuation is not counted
  # @return [Integer]
  def self.lexicon_count(text, remove_punctuation = true)
    text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
    text.split(' ').count
  end

  # Counts syllables as (hyphenation points + 1) for every word in +text+.
  #
  # @param text [String]
  # @param language [String] language code passed to Text::Hyphen
  # @return [Integer] 0 for empty text
  def self.syllable_count(text, language = 'en_us')
    return 0 if text.empty?

    # Strip everything that is not a letter before hyphenating: a literal "-"
    # or other punctuation would otherwise be counted with the hyphenation
    # marks. [[:alpha:]] (rather than a-z) keeps accented letters, which
    # matters for the non-English values of +language+.
    words = text.downcase.gsub(/[^[:alpha:]\s]/, '').split(' ')
    dictionary = hyphenator(language)
    words.sum { |word| dictionary.visualise(word).count('-') + 1 }
  end

  # Estimates the number of sentences: one more than the number of sentence
  # boundaries found. A boundary is ".", "?" or "!", optionally followed by
  # closing quote/bracket characters, then a space, "|" or newline, then an
  # uppercase ASCII letter.
  #
  # @param text [String]
  # @return [Integer] always at least 1, even for empty text
  def self.sentence_count(text)
    text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).count + 1
  end

  # Average number of words per sentence, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.avg_sentence_length(text)
    safe_ratio(lexicon_count(text), sentence_count(text)).round(1)
  end

  # Average number of syllables per word, rounded to one decimal.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_syllables_per_word(text, language = 'en_us')
    words = lexicon_count(text)
    # Float division never raises ZeroDivisionError, so guard explicitly
    # instead of rescuing.
    return 0.0 if words.zero?

    (syllable_count(text, language).to_f / words).round(1)
  end

  # Average number of non-space characters per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_letter_per_word(text)
    safe_ratio(char_count(text), lexicon_count(text)).round(2)
  end

  # Average number of sentences per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_sentence_per_word(text)
    safe_ratio(sentence_count(text), lexicon_count(text)).round(2)
  end

  # Flesch Reading Ease score, rounded to two decimals.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float]
  def self.flesch_reading_ease(text, language = 'en_us')
    words_per_sentence = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text, language)
    (206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word).round(2)
  end

  # Flesch-Kincaid grade level, rounded to one decimal.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float]
  def self.flesch_kincaid_grade(text, language = 'en_us')
    words_per_sentence = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text, language)
    (0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59).round(1)
  end

  # Counts the whitespace-separated words that have three or more syllables.
  #
  # @param text [String]
  # @param language [String]
  # @return [Integer]
  def self.polysyllab_count(text, language = 'en_us')
    text.split(' ').count { |word| syllable_count(word, language) >= 3 }
  end

  # SMOG index, rounded to one decimal.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float] 0.0 when fewer than three sentences are detected
  def self.smog_index(text, language = 'en_us')
    sentences = sentence_count(text)
    return 0.0 if sentences < 3

    polysyllables = polysyllab_count(text, language)
    (1.043 * Math.sqrt(30.0 * polysyllables / sentences) + 3.1291).round(1)
  end

  # Coleman-Liau index, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.coleman_liau_index(text)
    letters = (avg_letter_per_word(text) * 100).round(2)
    sentences = (avg_sentence_per_word(text) * 100).round(2)
    (0.0588 * letters - 0.296 * sentences - 15.8).round(2)
  end

  # Automated Readability Index, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.automated_readability_index(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    chars_per_word = char_count(text).to_f / words
    words_per_sentence = words.to_f / sentence_count(text)
    (4.71 * chars_per_word + 0.5 * words_per_sentence - 21.43).round(1)
  end

  # Linsear Write score computed on the first 100 words of +text+. Words with
  # fewer than three syllables weigh 1, all others weigh 3.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float] not rounded
  def self.linsear_write_formula(text, language = 'en_us')
    # first(100) takes exactly 100 words; the inclusive range 0..100 took 101.
    sample = text.split(' ').first(100)
    hard_words = sample.count { |word| syllable_count(word, language) >= 3 }
    easy_words = sample.length - hard_words

    # Sentences are counted on the sample only, not on the whole text.
    number = (easy_words + hard_words * 3).to_f / sentence_count(sample.join(' '))
    number -= 2 if number <= 20
    number / 2
  end

  # Finds the distinct words that are not in the easy-word list for
  # +language+ and have more than one syllable.
  #
  # @param text [String]
  # @param language [String] selects "<language>.txt" under +dictionary_path+
  # @param return_words [Boolean] when true, return the words themselves
  # @return [Integer, Set<String>] the number of difficult words, or the Set
  #   of them when +return_words+ is true
  def self.difficult_words(text, language = 'en_us', return_words = false)
    known = easy_words(language)

    # Keep all whitespace (\s), not just spaces, so words separated by a
    # newline or tab are not fused together when punctuation is removed.
    candidates = text.downcase.gsub(/[^0-9a-z\s]/i, '').split(' ')
    hard = Set.new
    candidates.each do |word|
      next if known.include?(word)

      hard.add(word) if syllable_count(word, language) > 1
    end

    return_words ? hard : hard.length
  end

  # Dale-Chall readability score, rounded to two decimals.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float] 0.0 when the text contains no words
  def self.dale_chall_readability_score(text, language = 'en_us')
    word_count = lexicon_count(text)
    return 0.0 if word_count.zero?

    familiar = word_count - difficult_words(text, language)
    pct_difficult = 100 - 100.0 * familiar / word_count

    score = 0.1579 * pct_difficult + 0.0496 * avg_sentence_length(text)
    score += 3.6365 if pct_difficult > 5
    score.round(2)
  end

  # Gunning Fog index, rounded to two decimals.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float] 0.0 when the text contains no words
  def self.gunning_fog(text, language = 'en_us')
    words = lexicon_count(text)
    return 0.0 if words.zero?

    per_diff_words = 100.0 * difficult_words(text, language) / words + 5
    (0.4 * (avg_sentence_length(text) + per_diff_words)).round(2)
  end

  # LIX score: average sentence length plus the percentage of
  # whitespace-separated tokens longer than six characters, rounded to two
  # decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.lix(text)
    words = text.split(' ')
    return 0.0 if words.empty?

    per_long_words = 100.0 * words.count { |word| word.length > 6 } / words.length
    (avg_sentence_length(text) + per_long_words).round(2)
  end

  # FORCAST grade computed on the first 150 words of +text+:
  # 20 - (one-syllable words / 10).
  #
  # @param text [String]
  # @param language [String]
  # @return [Float]
  def self.forcast(text, language = 'en_us')
    sample = text.split(' ').first(150)
    monosyllables = sample.count { |word| syllable_count(word, language) == 1 }
    # Divide by a Float so the tenths are kept instead of being truncated.
    20 - monosyllables / 10.0
  end

  # Powers-Sumner-Kearl grade, rounded to two decimals.
  #
  # NOTE(review): the syllable term here is the total syllable count of the
  # whole text. The formula is commonly stated per 100 words, so this is
  # presumably only exact for a 100-word sample; confirm before relying on it
  # for other lengths.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float]
  def self.powers_sumner_kearl(text, language = 'en_us')
    grade = 0.0778 * avg_sentence_length(text) + 0.0455 * syllable_count(text, language) - 2.2029
    grade.round(2)
  end

  # Spache grade, rounded to two decimals. The unfamiliar-word term is the
  # percentage of whitespace-separated tokens counted by +difficult_words+.
  #
  # @param text [String]
  # @param language [String]
  # @return [Float] 0.0 when the text contains no words
  def self.spache(text, language = 'en_us')
    words = text.split(' ').count
    return 0.0 if words.zero?

    # Float arithmetic: Integer division rounded this term down to 0 for
    # every text that is not made up entirely of unfamiliar words.
    pct_unfamiliar = 100.0 * difficult_words(text, language) / words
    grade = (0.141 * avg_sentence_length(text)) + (0.086 * pct_unfamiliar) + 0.839
    grade.round(2)
  end

  # Consensus grade: collects the grade estimates of the individual scores and
  # returns the most common one.
  #
  # @param text [String]
  # @param float_output [Boolean, nil] when truthy, return the grade as a
  #   Float instead of a description
  # @param language [String] forwarded to every score that accepts it
  # @return [Float, String] e.g. "9th and 10th grade" when +float_output+ is
  #   falsy
  def self.text_standard(text, float_output = nil, language = 'en_us')
    # Each score is computed once; the order of insertion is kept as-is in
    # case the counter breaks ties by first occurrence.
    grade = []
    grade.concat(grade_bounds(flesch_kincaid_grade(text, language)))
    grade.concat(reading_ease_grades(flesch_reading_ease(text, language)))
    grade.concat(grade_bounds(smog_index(text, language)))
    grade.concat(grade_bounds(coleman_liau_index(text)))
    grade.concat(grade_bounds(automated_readability_index(text)))
    grade.concat(grade_bounds(dale_chall_readability_score(text, language)))
    grade.concat(grade_bounds(linsear_write_formula(text, language)))
    grade.concat(grade_bounds(gunning_fog(text, language)))

    # Finding the Readability Consensus based upon all the above tests.
    # Loaded lazily so the gem is only needed by this method.
    require 'counter'
    score = Counter.new(grade).most_common(1)[0][0]

    if float_output
      score.to_f
    else
      "#{score.to_i - 1}th and #{score.to_i}th grade"
    end
  end

  # Overrides the directory that holds the "<language>.txt" easy-word lists.
  #
  # @param path [String]
  def self.dictionary_path=(path)
    @dictionary_path = path
  end

  # Directory that holds the easy-word lists; defaults to lib/dictionaries
  # under GEM_PATH.
  #
  # @return [String]
  def self.dictionary_path
    @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
  end

  # Float quotient that is 0.0 for a zero divisor (plain Float division would
  # give NaN or Infinity there).
  def self.safe_ratio(numerator, denominator)
    denominator.zero? ? 0.0 : numerator.to_f / denominator
  end

  # Returns the Text::Hyphen instance for +language+, building it on first use
  # so per-word callers do not construct a new one for every word.
  def self.hyphenator(language)
    @hyphenators ||= {}
    @hyphenators[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
  end

  # Loads (once per file path) the easy-word list for +language+ as a Set.
  # chomp removes only the line terminator, so a final line without a trailing
  # newline keeps its last letter.
  def self.easy_words(language)
    path = File.join(dictionary_path, "#{language}.txt")
    @easy_words ||= {}
    @easy_words[path] ||= File.foreach(path).map(&:chomp).to_set
  end

  # The two integer grade estimates derived from a score: rounded and ceiling.
  def self.grade_bounds(value)
    [value.round.to_i, value.ceil.to_i]
  end

  # Maps a Flesch Reading Ease score to grade estimates. Anything outside
  # 30...100 (including scores of 100 or more) maps to 13.
  def self.reading_ease_grades(score)
    case score
    when 90...100 then [5]
    when 80...90 then [6]
    when 70...80 then [7]
    when 60...70 then [8, 9]
    when 50...60 then [10]
    when 40...50 then [11]
    when 30...40 then [12]
    else [13]
    end
  end

  private_class_method :safe_ratio, :hyphenator, :easy_words,
                       :grade_bounds, :reading_ease_grades
end