textstat 0.1.7 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
- class TextStat
2
- VERSION = "0.1.7"
3
- end
1
class TextStat
  # Release version of the gem. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = "0.1.9".freeze
end
data/lib/textstat.rb CHANGED
@@ -1,309 +1,313 @@
1
- require 'text-hyphen'
2
-
3
- class TextStat
4
- GEM_PATH = File.dirname(File.dirname(__FILE__))
5
-
6
- def self.char_count(text, ignore_spaces = true)
7
- text = text.delete(' ') if ignore_spaces
8
- text.length
9
- end
10
-
11
- def self.lexicon_count(text, remove_punctuation = true)
12
- text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
13
- count = text.split(' ').count
14
- count
15
- end
16
-
17
- def self.syllable_count(text, language = 'en_us')
18
- return 0 if text.empty?
19
-
20
- text = text.downcase
21
- text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
22
- dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
23
- count = 0
24
- text.split(' ').each do |word|
25
- word_hyphenated = dictionary.visualise(word)
26
- count += word_hyphenated.count('-') + 1
27
- end
28
- count
29
- end
30
-
31
- def self.sentence_count(text)
32
- text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
33
- end
34
-
35
- def self.avg_sentence_length(text)
36
- asl = lexicon_count(text).to_f / sentence_count(text)
37
- asl.round(1)
38
- rescue ZeroDivisionError
39
- 0.0
40
- end
41
-
42
- def self.avg_syllables_per_word(text)
43
- syllable = syllable_count(text)
44
- words = lexicon_count(text)
45
- begin
46
- syllables_per_word = syllable.to_f / words
47
- syllables_per_word.round(1)
48
- rescue ZeroDivisionError
49
- 0.0
50
- end
51
- end
52
-
53
- def self.avg_letter_per_word(text)
54
- letters_per_word = char_count(text).to_f / lexicon_count(text)
55
- letters_per_word.round(2)
56
- rescue ZeroDivisionError
57
- 0.0
58
- end
59
-
60
- def self.avg_sentence_per_word(text)
61
- sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
62
- sentence_per_word.round(2)
63
- rescue ZeroDivisionError
64
- 0.0
65
- end
66
-
67
- def self.flesch_reading_ease(text)
68
- sentence_length = avg_sentence_length(text)
69
- syllables_per_word = avg_syllables_per_word(text)
70
- flesch = 206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word
71
- flesch.round(2)
72
- end
73
-
74
- def self.flesch_kincaid_grade(text)
75
- sentence_length = avg_sentence_length(text)
76
- syllables_per_word = avg_syllables_per_word(text)
77
- flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
78
- flesch.round(1)
79
- end
80
-
81
- def self.polysyllab_count(text)
82
- count = 0
83
- text.split(' ').each do |word|
84
- w = syllable_count(word)
85
- count += 1 if w >= 3
86
- end
87
- count
88
- end
89
-
90
- def self.smog_index(text)
91
- sentences = sentence_count(text)
92
-
93
- if sentences >= 3
94
- begin
95
- polysyllab = polysyllab_count(text)
96
- smog = 1.043 * Math.sqrt(30.0 * polysyllab / sentences) + 3.1291
97
- smog.round(1)
98
- rescue ZeroDivisionError
99
- 0.0
100
- end
101
- else
102
- 0.0
103
- end
104
- end
105
-
106
- def self.coleman_liau_index(text)
107
- letters = (avg_letter_per_word(text) * 100).round(2)
108
- sentences = (avg_sentence_per_word(text) * 100).round(2)
109
- coleman = 0.0588 * letters - 0.296 * sentences - 15.8
110
- coleman.round(2)
111
- end
112
-
113
- def self.automated_readability_index(text)
114
- chars = char_count(text)
115
- words = lexicon_count(text)
116
- sentences = sentence_count(text)
117
- begin
118
- a = chars.to_f / words
119
- b = words.to_f / sentences
120
-
121
- readability = 4.71 * a + 0.5 * b - 21.43
122
- readability.round(1)
123
- rescue ZeroDivisionError
124
- 0.0
125
- end
126
- end
127
-
128
- def self.linsear_write_formula(text)
129
- easy_word = 0
130
- difficult_word = 0
131
- text_list = text.split(' ')[0..100]
132
-
133
- text_list.each do |word|
134
- if syllable_count(word) < 3
135
- easy_word += 1
136
- else
137
- difficult_word += 1
138
- end
139
- end
140
-
141
- text = text_list.join(' ')
142
-
143
- number = (easy_word * 1 + difficult_word * 3).to_f / sentence_count(text)
144
- number -= 2 if number <= 20
145
- number / 2
146
- end
147
-
148
- def self.difficult_words(text, language = 'en_us')
149
- require 'set'
150
- easy_words = Set.new
151
- File.read(File.join(dictionary_path, "#{language}.txt")).each_line do |line|
152
- easy_words << line.chop
153
- end
154
-
155
- text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
156
- diff_words_set = Set.new
157
- text_list.each do |value|
158
- next if easy_words.include? value
159
-
160
- diff_words_set.add(value) if syllable_count(value) > 1
161
- end
162
- diff_words_set.length
163
- end
164
-
165
- def self.dale_chall_readability_score(text)
166
- word_count = lexicon_count(text)
167
- count = word_count - difficult_words(text)
168
-
169
- begin
170
- per = 100.0 * count / word_count
171
- rescue ZeroDivisionError
172
- return 0.0
173
- end
174
-
175
- difficult_words = 100 - per
176
- score = 0.1579 * difficult_words + 0.0496 * avg_sentence_length(text)
177
- score += 3.6365 if difficult_words > 5
178
-
179
- score.round(2)
180
- end
181
-
182
- def self.gunning_fog(text)
183
- per_diff_words = 100.0 * difficult_words(text) / lexicon_count(text) + 5
184
- grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
185
-
186
- grade.round(2)
187
- rescue ZeroDivisionError
188
- 0.0
189
- end
190
-
191
- def self.lix(text)
192
- words = text.split(' ')
193
- words_length = words.length
194
- long_words = words.count { |word| word.length > 6 }
195
-
196
- per_long_words = 100.0 * long_words / words_length
197
- asl = avg_sentence_length(text)
198
- lix = asl + per_long_words
199
-
200
- lix.round(2)
201
- end
202
-
203
- def self.forcast(text, language = 'en_us')
204
- words = text.split(' ')[0..149]
205
- words_with_one_syllabe = words.count {
206
- |word| syllable_count(word, language) == 1
207
- }
208
- forcast = 20 - (words_with_one_syllabe / 10)
209
- forcast
210
- end
211
-
212
- def self.powers_sumner_kearl(text)
213
- grade = 0.0778 * avg_sentence_length(text) + 0.0455 * syllable_count(text) - 2.2029
214
- grade.round(2)
215
- end
216
-
217
- def self.spache(text, language = 'en_us')
218
- words = text.split(' ').count
219
- unfamiliar_words = difficult_words(text, language) / words
220
- grade = (0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_words) + 0.839
221
- grade.round(2)
222
- end
223
-
224
- def self.text_standard(text, float_output=nil)
225
- grade = []
226
-
227
- lower = flesch_kincaid_grade(text).round
228
- upper = flesch_kincaid_grade(text).ceil
229
- grade.append(lower.to_i)
230
- grade.append(upper.to_i)
231
-
232
- # Appending Flesch Reading Easy
233
- score = flesch_reading_ease(text)
234
- if score < 100 && score >= 90
235
- grade.append(5)
236
- elsif score < 90 && score >= 80
237
- grade.append(6)
238
- elsif score < 80 && score >= 70
239
- grade.append(7)
240
- elsif score < 70 && score >= 60
241
- grade.append(8)
242
- grade.append(9)
243
- elsif score < 60 && score >= 50
244
- grade.append(10)
245
- elsif score < 50 && score >= 40
246
- grade.append(11)
247
- elsif score < 40 && score >= 30
248
- grade.append(12)
249
- else
250
- grade.append(13)
251
- end
252
-
253
- # Appending SMOG Index
254
- lower = smog_index(text).round
255
- upper = smog_index(text).ceil
256
- grade.append(lower.to_i)
257
- grade.append(upper.to_i)
258
-
259
- # Appending Coleman_Liau_Index
260
- lower = coleman_liau_index(text).round
261
- upper = coleman_liau_index(text).ceil
262
- grade.append(lower.to_i)
263
- grade.append(upper.to_i)
264
-
265
- # Appending Automated_Readability_Index
266
- lower = automated_readability_index(text).round
267
- upper = automated_readability_index(text).ceil
268
- grade.append(lower.to_i)
269
- grade.append(upper.to_i)
270
-
271
- # Appending Dale_Chall_Readability_Score
272
- lower = dale_chall_readability_score(text).round
273
- upper = dale_chall_readability_score(text).ceil
274
- grade.append(lower.to_i)
275
- grade.append(upper.to_i)
276
-
277
- # Appending Linsear_Write_Formula
278
- lower = linsear_write_formula(text).round
279
- upper = linsear_write_formula(text).ceil
280
- grade.append(lower.to_i)
281
- grade.append(upper.to_i)
282
-
283
- # Appending Gunning Fog Index
284
- lower = gunning_fog(text).round
285
- upper = gunning_fog(text).ceil
286
- grade.append(lower.to_i)
287
- grade.append(upper.to_i)
288
-
289
- # Finding the Readability Consensus based upon all the above tests
290
- require 'counter'
291
- d = Counter.new(grade)
292
- final_grade = d.most_common(1)
293
- score = final_grade[0][0]
294
-
295
- if float_output
296
- score.to_f
297
- else
298
- "#{score.to_i - 1}th and #{score.to_i}th grade"
299
- end
300
- end
301
-
302
- def self.dictionary_path=(path)
303
- @dictionary_path = path
304
- end
305
-
306
- def self.dictionary_path
307
- @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
308
- end
309
- end
1
+ require 'text-hyphen'
2
+
3
# Text statistics and readability formulas. All methods are class methods
# that take the text to analyse as a String; methods that count syllables or
# look up dictionary words also accept a +language+ code (default 'en_us').
class TextStat
  # Parent of the directory that contains this file; base of the default
  # dictionary location (see .dictionary_path).
  GEM_PATH = File.dirname(File.dirname(__FILE__))

  # Anything that is neither a letter nor whitespace. POSIX classes are used
  # instead of a-z so that accented / non-Latin words are not erased when a
  # non-English +language+ is requested.
  NON_LETTERS = /[^[:alpha:]\s]/.freeze
  # Anything that is neither a letter, a digit nor whitespace.
  NON_ALPHANUMERICS = /[^[:alnum:]\s]/.freeze
  # Sentence terminator, optional closing quote/bracket characters, whitespace,
  # then an upper-case letter starting the next sentence.
  SENTENCE_BREAK = /[.?!]['"\\)\]]*\s+[[:upper:]]/.freeze
  private_constant :NON_LETTERS, :NON_ALPHANUMERICS, :SENTENCE_BREAK

  # Number of characters in +text+.
  #
  # @param text [String]
  # @param ignore_spaces [Boolean] when true, space characters are not counted
  # @return [Integer]
  def self.char_count(text, ignore_spaces = true)
    text = text.delete(' ') if ignore_spaces
    text.length
  end

  # Number of whitespace-separated words in +text+.
  #
  # @param text [String]
  # @param remove_punctuation [Boolean] when true, every non-letter character
  #   is removed first, so tokens made only of digits/punctuation do not count
  # @return [Integer]
  def self.lexicon_count(text, remove_punctuation = true)
    text = text.gsub(NON_LETTERS, '') if remove_punctuation
    text.split(' ').size
  end

  # Total syllables in +text+, estimated as (hyphenation points + 1) per word
  # using Text::Hyphen.
  #
  # Non-letter characters are stripped before hyphenating; otherwise literal
  # hyphens and punctuation would be handed to the hyphenator and every '-'
  # already present in the text would be counted as a syllable break.
  #
  # @param text [String]
  # @param language [String] language code passed to Text::Hyphen
  # @return [Integer] 0 for an empty string
  def self.syllable_count(text, language = 'en_us')
    return 0 if text.empty?

    dictionary = hyphenator(language)
    words = text.downcase.gsub(NON_LETTERS, '').split(' ')
    words.sum { |word| dictionary.visualise(word).count('-') + 1 }
  end

  # Number of sentences: one more than the number of sentence breaks found
  # (terminator, optional closing quote/bracket, whitespace, capital letter).
  # Always at least 1, even for empty text.
  #
  # @param text [String]
  # @return [Integer]
  def self.sentence_count(text)
    text.scan(SENTENCE_BREAK).size + 1
  end

  # Average number of words per sentence, rounded to one decimal.
  # sentence_count is never 0, so no zero guard is needed.
  #
  # @return [Float]
  def self.avg_sentence_length(text)
    (lexicon_count(text).to_f / sentence_count(text)).round(1)
  end

  # Average syllables per word, rounded to one decimal.
  #
  # @return [Float] 0.0 when the text contains no words (Float division by
  #   zero yields NaN/Infinity rather than raising, so it is guarded here)
  def self.avg_syllables_per_word(text, language = 'en_us')
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (syllable_count(text, language).to_f / words).round(1)
  end

  # Average non-space characters per word, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.avg_letter_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (char_count(text).to_f / words).round(2)
  end

  # Average sentences per word, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.avg_sentence_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (sentence_count(text).to_f / words).round(2)
  end

  # Flesch Reading Ease score, rounded to two decimals.
  #
  # @return [Float]
  def self.flesch_reading_ease(text, language = 'en_us')
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text, language)
    (206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word).round(2)
  end

  # Flesch-Kincaid grade level, rounded to one decimal.
  #
  # @return [Float]
  def self.flesch_kincaid_grade(text, language = 'en_us')
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text, language)
    (0.39 * sentence_length + 11.8 * syllables_per_word - 15.59).round(1)
  end

  # Number of whitespace-separated words with three or more syllables.
  #
  # @return [Integer]
  def self.polysyllab_count(text, language = 'en_us')
    text.split(' ').count { |word| syllable_count(word, language) >= 3 }
  end

  # SMOG index, rounded to one decimal.
  #
  # @return [Float] 0.0 when the text has fewer than three sentences
  def self.smog_index(text, language = 'en_us')
    sentences = sentence_count(text)
    return 0.0 if sentences < 3

    polysyllables = polysyllab_count(text, language)
    (1.043 * Math.sqrt(30.0 * polysyllables / sentences) + 3.1291).round(1)
  end

  # Coleman-Liau index, rounded to two decimals.
  #
  # @return [Float]
  def self.coleman_liau_index(text)
    letters = (avg_letter_per_word(text) * 100).round(2)
    sentences = (avg_sentence_per_word(text) * 100).round(2)
    (0.0588 * letters - 0.296 * sentences - 15.8).round(2)
  end

  # Automated Readability Index, rounded to one decimal.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.automated_readability_index(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    chars_per_word = char_count(text).to_f / words
    words_per_sentence = words.to_f / sentence_count(text)
    (4.71 * chars_per_word + 0.5 * words_per_sentence - 21.43).round(1)
  end

  # Linsear Write score computed on the first 100 words of +text+
  # (words under three syllables score 1 point, the others 3).
  #
  # @return [Float] not rounded
  def self.linsear_write_formula(text, language = 'en_us')
    # first(100), not [0..100]: the inclusive range selected 101 words.
    sample = text.split(' ').first(100)
    hard, easy = sample.partition { |word| syllable_count(word, language) >= 3 }

    number = (easy.size + hard.size * 3).to_f / sentence_count(sample.join(' '))
    number -= 2 if number <= 20
    number / 2
  end

  # Distinct words of +text+ that are absent from the language's easy-word
  # dictionary and have more than one syllable.
  #
  # @param text [String]
  # @param language [String] selects "<language>.txt" inside .dictionary_path
  # @param return_words [Boolean] return the words themselves instead of a count
  # @return [Integer, Set<String>]
  # @raise [SystemCallError] if the dictionary file cannot be read
  def self.difficult_words(text, language = 'en_us', return_words = false)
    require 'set'
    known = easy_words(language)

    # Whitespace (not only ' ') is kept so that words separated by newlines
    # or tabs are not glued together before splitting.
    tokens = text.downcase.gsub(NON_ALPHANUMERICS, '').split(' ')
    hard = tokens.each_with_object(Set.new) do |word, found|
      next if known.include?(word)

      found << word if syllable_count(word, language) > 1
    end

    return_words ? hard : hard.length
  end

  # Dale-Chall readability score, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.dale_chall_readability_score(text, language = 'en_us')
    word_count = lexicon_count(text)
    return 0.0 if word_count.zero?

    familiar = word_count - difficult_words(text, language)
    pct_difficult = 100 - 100.0 * familiar / word_count

    score = 0.1579 * pct_difficult + 0.0496 * avg_sentence_length(text)
    score += 3.6365 if pct_difficult > 5
    score.round(2)
  end

  # Gunning Fog grade, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.gunning_fog(text, language = 'en_us')
    words = lexicon_count(text)
    return 0.0 if words.zero?

    per_diff_words = 100.0 * difficult_words(text, language) / words + 5
    (0.4 * (avg_sentence_length(text) + per_diff_words)).round(2)
  end

  # LIX: average sentence length plus the percentage of words longer than
  # six characters, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.lix(text)
    words = text.split(' ')
    return 0.0 if words.empty?

    per_long_words = 100.0 * words.count { |word| word.length > 6 } / words.length
    (avg_sentence_length(text) + per_long_words).round(2)
  end

  # FORCAST grade: 20 - N/10, where N is the number of one-syllable words
  # among the first 150 words.
  #
  # @return [Float] rounded to one decimal (Float division; integer division
  #   would silently truncate N/10)
  def self.forcast(text, language = 'en_us')
    sample = text.split(' ').first(150)
    monosyllables = sample.count { |word| syllable_count(word, language) == 1 }
    (20 - monosyllables / 10.0).round(1)
  end

  # Powers-Sumner-Kearl grade, rounded to two decimals. The syllable term of
  # the formula is "syllables per 100 words", so the raw syllable total is
  # normalised by the word count instead of growing with text length.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.powers_sumner_kearl(text, language = 'en_us')
    words = lexicon_count(text)
    return 0.0 if words.zero?

    syllables_per_100_words = 100.0 * syllable_count(text, language) / words
    (0.0778 * avg_sentence_length(text) + 0.0455 * syllables_per_100_words - 2.2029).round(2)
  end

  # Spache grade, rounded to two decimals. The unfamiliar-word term is a
  # percentage of all whitespace-separated words, computed in Float
  # arithmetic (integer division truncated it to 0 for nearly every text).
  #
  # @return [Float] 0.0 when the text contains no words
  def self.spache(text, language = 'en_us')
    words = text.split(' ').count
    return 0.0 if words.zero?

    pct_unfamiliar = 100.0 * difficult_words(text, language) / words
    (0.141 * avg_sentence_length(text) + 0.086 * pct_unfamiliar + 0.839).round(2)
  end

  # Consensus grade: every formula votes with its rounded and ceiled value
  # and the most common grade wins.
  #
  # @param text [String]
  # @param float_output [Boolean, nil] truthy to get the grade as a Float
  # @param language [String] forwarded to the syllable/dictionary based formulas
  # @return [Float, String] the grade, or "<n-1>th and <n>th grade"
  # @raise [LoadError] if the 'counter' library required below is unavailable
  def self.text_standard(text, float_output = nil, language = 'en_us')
    grades = []
    # Evaluate each formula once; it contributes both bounds.
    vote = ->(value) { grades.push(value.round, value.ceil) }

    vote.call(flesch_kincaid_grade(text, language))
    grades.concat(flesch_ease_grades(flesch_reading_ease(text, language)))
    vote.call(smog_index(text, language))
    vote.call(coleman_liau_index(text))
    vote.call(automated_readability_index(text))
    vote.call(dale_chall_readability_score(text, language))
    vote.call(linsear_write_formula(text, language))
    vote.call(gunning_fog(text, language))

    # Finding the readability consensus based upon all the above tests
    require 'counter'
    score = Counter.new(grades).most_common(1)[0][0]

    if float_output
      score.to_f
    else
      "#{score.to_i - 1}th and #{score.to_i}th grade"
    end
  end

  # Overrides the directory in which "<language>.txt" dictionaries are looked up.
  def self.dictionary_path=(path)
    @dictionary_path = path
  end

  # Directory holding the easy-word dictionaries; defaults to
  # GEM_PATH/lib/dictionaries.
  def self.dictionary_path
    @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
  end

  # One Text::Hyphen instance per language, built on first use so it is not
  # re-created for every single word.
  def self.hyphenator(language)
    @hyphenators ||= {}
    @hyphenators[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
  end

  # Set of easy words for +language+, read once per dictionary file.
  # chomp (not chop) so a final line without a newline keeps its last letter.
  def self.easy_words(language)
    require 'set'
    path = File.join(dictionary_path, "#{language}.txt")
    @easy_words ||= {}
    @easy_words[path] ||= Set.new(File.readlines(path).map(&:chomp))
  end

  # Grade(s) voted for by a Flesch Reading Ease score. Scores of 90 and above
  # (including those over 100) map to grade 5; scores below 30 map to 13.
  def self.flesch_ease_grades(score)
    if score >= 90 then [5]
    elsif score >= 80 then [6]
    elsif score >= 70 then [7]
    elsif score >= 60 then [8, 9]
    elsif score >= 50 then [10]
    elsif score >= 40 then [11]
    elsif score >= 30 then [12]
    else [13]
    end
  end

  private_class_method :hyphenator, :easy_words, :flesch_ease_grades
end