textstat 0.1.1 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/textstat.rb CHANGED
@@ -1,293 +1,309 @@
1
- require 'text-hyphen'
2
-
3
- class TextStat
4
- def self.char_count(text, ignore_spaces = true)
5
- text = text.delete(' ') if ignore_spaces
6
- text.length
7
- end
8
-
9
- def self.lexicon_count(text, remove_punctuation = true)
10
- text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
11
- count = text.split(' ').count
12
- count
13
- end
14
-
15
- def self.syllable_count(text, language = 'en_us')
16
- return 0 if text.empty?
17
-
18
- text = text.downcase
19
- text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
20
- dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
21
- count = 0
22
- text.split(' ').each do |word|
23
- word_hyphenated = dictionary.visualise(word)
24
- count += [1, word_hyphenated.count('-') + 1].max
25
- end
26
- count
27
- end
28
-
29
- def self.sentence_count(text)
30
- text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
31
- end
32
-
33
- def self.avg_sentence_length(text)
34
- asl = lexicon_count(text).to_f / sentence_count(text).to_f
35
- asl.round(1)
36
- rescue ZeroDivisionError
37
- 0.0
38
- end
39
-
40
- def self.avg_syllables_per_word(text)
41
- syllable = syllable_count(text)
42
- words = lexicon_count(text)
43
- begin
44
- syllables_per_word = syllable.to_f / words.to_f
45
- return syllables_per_word.round(1)
46
- rescue ZeroDivisionError
47
- return 0.0
48
- end
49
- end
50
-
51
- def self.avg_letter_per_word(text)
52
- letters_per_word = char_count(text).to_f / lexicon_count(text).to_f
53
- letters_per_word.round(2)
54
- rescue ZeroDivisionError
55
- 0.0
56
- end
57
-
58
- def self.avg_sentence_per_word(text)
59
- sentence_per_word = sentence_count(text).to_f / lexicon_count(text).to_f
60
- sentence_per_word.round(2)
61
- rescue ZeroDivisionError
62
- 0.0
63
- end
64
-
65
- def self.flesch_reading_ease(text)
66
- sentence_length = avg_sentence_length(text)
67
- syllables_per_word = avg_syllables_per_word(text)
68
- flesch = (
69
- 206.835 - (1.015 * sentence_length).to_f - (84.6 * syllables_per_word).to_f
70
- )
71
- flesch.round(2)
72
- end
73
-
74
- def self.flesch_kincaid_grade(text)
75
- sentence_length = avg_sentence_length(text)
76
- syllables_per_word = avg_syllables_per_word(text)
77
- flesch = (0.39 * sentence_length.to_f) + (11.8 * syllables_per_word.to_f) - 15.59
78
- flesch.round(1)
79
- end
80
-
81
- def self.polysyllab_count(text)
82
- count = 0
83
- text.split(' ').each do |word|
84
- w = syllable_count(word)
85
- count += 1 if w >= 3
86
- end
87
- count
88
- end
89
-
90
- def self.smog_index(text)
91
- sentences = sentence_count(text)
92
-
93
- if sentences >= 3
94
- begin
95
- polysyllab = polysyllab_count(text)
96
- smog = (
97
- (1.043 * (30 * (polysyllab / sentences))**0.5) + 3.1291)
98
- return smog.round(1)
99
- rescue ZeroDivisionError
100
- return 0.0
101
- end
102
- else
103
- return 0.0
104
- end
105
- end
106
-
107
- def self.coleman_liau_index(text)
108
- letters = (avg_letter_per_word(text) * 100).round(2)
109
- sentences = (avg_sentence_per_word(text) * 100).round(2)
110
- coleman = ((0.058 * letters) - (0.296 * sentences) - 15.8).to_f
111
- coleman.round(2)
112
- end
113
-
114
- def self.automated_readability_index(text)
115
- chars = char_count(text)
116
- words = lexicon_count(text)
117
- sentences = sentence_count(text)
118
- begin
119
- a = chars.to_f / words.to_f
120
- b = words.to_f / sentences.to_f
121
-
122
- readability = (
123
- (4.71 * a.round(2) + (0.5 * b.round(2))) - 21.43)
124
- return readability.round(1)
125
- rescue ZeroDivisionError
126
- return 0.0
127
- end
128
- end
129
-
130
- def self.linsear_write_formula(text)
131
- easy_word = 0
132
- difficult_word = 0
133
- text_list = text.split(' ')[0..100]
134
-
135
- text_list.each do |word|
136
- if syllable_count(word) < 3
137
- easy_word += 1
138
- else
139
- difficult_word += 1
140
- end
141
- end
142
-
143
- text = text_list.join(' ')
144
-
145
- number = ((easy_word * 1 + difficult_word * 3) / sentence_count(text)).to_f
146
- if number <= 20
147
- number -= 2
148
- end
149
- return number / 2
150
- end
151
-
152
- def self.difficult_words(text, language = 'en_us')
153
- require 'set'
154
- easy_words = Set.new
155
- File.read("lib/dictionaries/#{language}.txt").each_line do |line|
156
- easy_words << line.chop
157
- end
158
-
159
- text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
160
- diff_words_set = Set.new
161
- text_list.each do |value|
162
- unless easy_words.include? value
163
- if syllable_count(value) > 1
164
- diff_words_set.add(value)
165
- end
166
- end
167
- end
168
- return diff_words_set.length
169
- end
170
-
171
- def self.dale_chall_readability_score(text)
172
- word_count = lexicon_count(text)
173
- count = word_count - difficult_words(text)
174
-
175
- begin
176
- per = count.to_f / word_count.to_f * 100
177
- rescue ZeroDivisionError
178
- return 0.0
179
- end
180
-
181
- difficult_words = 100 - per
182
- score = (
183
- (0.1579 * difficult_words)
184
- + (0.0496 * avg_sentence_length(text)))
185
-
186
- if difficult_words > 5
187
- score += 3.6365
188
- end
189
- return score.round(2)
190
- end
191
-
192
- def self.gunning_fog(text)
193
- begin
194
- per_diff_words = (
195
- (difficult_words(text) / lexicon_count(text) * 100) + 5)
196
-
197
- grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
198
- return grade.round(2)
199
- rescue ZeroDivisionError
200
- return 0.0
201
- end
202
- end
203
-
204
- def self.lix(text)
205
- words = text.split(' ')
206
- words_length = words.length
207
- long_words = words.select { |word| word.length > 6 }.count
208
-
209
- per_long_words = (long_words * 100).to_f / words_length
210
- asl = avg_sentence_length(text)
211
- lix = asl + per_long_words
212
-
213
- return lix.round(2)
214
- end
215
-
216
- def self.text_standard(text, float_output=nil)
217
- grade = []
218
-
219
- lower = flesch_kincaid_grade(text).round
220
- upper = flesch_kincaid_grade(text).ceil
221
- grade.append(lower.to_i)
222
- grade.append(upper.to_i)
223
-
224
- # Appending Flesch Reading Easy
225
- score = flesch_reading_ease(text)
226
- if score < 100 && score >= 90
227
- grade.append(5)
228
- elsif score < 90 && score >= 80
229
- grade.append(6)
230
- elsif score < 80 && score >= 70
231
- grade.append(7)
232
- elsif score < 70 && score >= 60
233
- grade.append(8)
234
- grade.append(9)
235
- elsif score < 60 && score >= 50
236
- grade.append(10)
237
- elsif score < 50 && score >= 40
238
- grade.append(11)
239
- elsif score < 40 && score >= 30
240
- grade.append(12)
241
- else
242
- grade.append(13)
243
- end
244
-
245
- # Appending SMOG Index
246
- lower = smog_index(text).round
247
- upper = smog_index(text).ceil
248
- grade.append(lower.to_i)
249
- grade.append(upper.to_i)
250
-
251
- # Appending Coleman_Liau_Index
252
- lower = coleman_liau_index(text).round
253
- upper = coleman_liau_index(text).ceil
254
- grade.append(lower.to_i)
255
- grade.append(upper.to_i)
256
-
257
- # Appending Automated_Readability_Index
258
- lower = automated_readability_index(text).round
259
- upper = automated_readability_index(text).ceil
260
- grade.append(lower.to_i)
261
- grade.append(upper.to_i)
262
-
263
- # Appending Dale_Chall_Readability_Score
264
- lower = dale_chall_readability_score(text).round
265
- upper = dale_chall_readability_score(text).ceil
266
- grade.append(lower.to_i)
267
- grade.append(upper.to_i)
268
-
269
- # Appending Linsear_Write_Formula
270
- lower = linsear_write_formula(text).round
271
- upper = linsear_write_formula(text).ceil
272
- grade.append(lower.to_i)
273
- grade.append(upper.to_i)
274
-
275
- # Appending Gunning Fog Index
276
- lower = gunning_fog(text).round
277
- upper = gunning_fog(text).ceil
278
- grade.append(lower.to_i)
279
- grade.append(upper.to_i)
280
-
281
- # Finding the Readability Consensus based upon all the above tests
282
- require 'counter'
283
- d = Counter.new(grade)
284
- final_grade = d.most_common(1)
285
- score = final_grade[0][0]
286
-
287
- if float_output
288
- return score.to_f
289
- else
290
- return "#{score.to_i - 1}th and #{score.to_i}th grade"
291
- end
292
- end
293
- end
1
+ require 'text-hyphen'
2
+
3
# Readability statistics and grade-level formulas for plain text.
#
# Every method is a class method that takes the text as a String. Word and
# letter handling is ASCII-only: characters outside a-z / A-Z are treated as
# punctuation. Syllables are estimated from the hyphenation points reported by
# the text-hyphen gem (Text::Hyphen), which the file is expected to have
# required before any syllable-based method is called.
class TextStat
  # Root directory of the gem (parent of the directory holding this file).
  GEM_PATH = File.dirname(File.dirname(__FILE__))

  # Metrics whose rounded and ceiled values vote in .text_standard, in the
  # order they are appended after the two Flesch scores.
  CONSENSUS_METRICS = %i[
    smog_index
    coleman_liau_index
    automated_readability_index
    dale_chall_readability_score
    linsear_write_formula
    gunning_fog
  ].freeze

  # Counts the characters in +text+.
  #
  # @param text [String]
  # @param ignore_spaces [Boolean] when true, ' ' characters are not counted
  #   (other whitespace still is)
  # @return [Integer]
  def self.char_count(text, ignore_spaces = true)
    text = text.delete(' ') if ignore_spaces
    text.length
  end

  # Counts whitespace-separated words.
  #
  # @param text [String]
  # @param remove_punctuation [Boolean] when true, every character that is
  #   not an ASCII letter or whitespace is removed first, so tokens made only
  #   of digits/punctuation do not count as words
  # @return [Integer]
  def self.lexicon_count(text, remove_punctuation = true)
    text = text.gsub(/[^a-zA-Z\s]/, '') if remove_punctuation
    text.split(' ').size
  end

  # Estimates the number of syllables in +text+ as, per word, one more than
  # the number of hyphenation points Text::Hyphen reports.
  #
  # @param text [String]
  # @param language [String] Text::Hyphen language code
  # @return [Integer] 0 for empty text or text without any letters
  def self.syllable_count(text, language = 'en_us')
    return 0 if text.empty?

    hyphenator = hyphenator_for(language)
    # The cleaned string must actually be used: the previous code discarded
    # the gsub result, so punctuation reached the hyphenator and tokens such
    # as "..." were counted as one syllable each.
    cleaned = text.downcase.gsub(/[^a-z\s]/, '')
    cleaned.split(' ').sum { |word| hyphenator.visualise(word).count('-') + 1 }
  end

  # Counts sentences as one more than the number of sentence boundaries: a
  # terminator (. ? !), optional closing quote/bracket characters, a space or
  # newline, then an ASCII capital letter.
  #
  # @param text [String]
  # @return [Integer] always at least 1, even for empty text
  def self.sentence_count(text)
    # The separator class is space/newline only; the earlier `[ |\n]` also
    # matched a literal '|'.
    text.scan(/[.?!]['\\)\]]*[ \n][A-Z]/).size + 1
  end

  # Average number of words per sentence, rounded to one decimal.
  #
  # @return [Float]
  def self.avg_sentence_length(text)
    # sentence_count is never 0, so no zero guard is needed here.
    (lexicon_count(text).to_f / sentence_count(text)).round(1)
  end

  # Average number of syllables per word, rounded to one decimal.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.avg_syllables_per_word(text)
    words = lexicon_count(text)
    # Float division never raises ZeroDivisionError, so guard explicitly.
    return 0.0 if words.zero?

    (syllable_count(text).to_f / words).round(1)
  end

  # Average number of non-space characters per word, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.avg_letter_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (char_count(text).to_f / words).round(2)
  end

  # Average number of sentences per word, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.avg_sentence_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (sentence_count(text).to_f / words).round(2)
  end

  # Flesch Reading Ease score (higher means easier), rounded to two decimals.
  #
  # @return [Float]
  def self.flesch_reading_ease(text)
    score = 206.835 - 1.015 * avg_sentence_length(text) - 84.6 * avg_syllables_per_word(text)
    score.round(2)
  end

  # Flesch-Kincaid grade level, rounded to one decimal.
  #
  # @return [Float]
  def self.flesch_kincaid_grade(text)
    grade = 0.39 * avg_sentence_length(text) + 11.8 * avg_syllables_per_word(text) - 15.59
    grade.round(1)
  end

  # Number of whitespace-separated tokens with three or more syllables.
  #
  # @return [Integer]
  def self.polysyllab_count(text)
    text.split(' ').count { |word| syllable_count(word) >= 3 }
  end

  # SMOG index, rounded to one decimal.
  #
  # @return [Float] 0.0 when the text has fewer than three sentences
  def self.smog_index(text)
    sentences = sentence_count(text)
    return 0.0 if sentences < 3

    (1.043 * Math.sqrt(30.0 * polysyllab_count(text) / sentences) + 3.1291).round(1)
  end

  # Coleman-Liau index, rounded to two decimals.
  #
  # @return [Float]
  def self.coleman_liau_index(text)
    letters = (avg_letter_per_word(text) * 100).round(2)
    sentences = (avg_sentence_per_word(text) * 100).round(2)
    (0.0588 * letters - 0.296 * sentences - 15.8).round(2)
  end

  # Automated Readability Index, rounded to one decimal.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.automated_readability_index(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    chars_per_word = char_count(text).to_f / words
    words_per_sentence = words.to_f / sentence_count(text)
    (4.71 * chars_per_word + 0.5 * words_per_sentence - 21.43).round(1)
  end

  # Linsear Write grade, computed on the first 100 words of +text+.
  #
  # @return [Float]
  def self.linsear_write_formula(text)
    # The formula is defined on a 100-word sample; `[0..100]` took 101 words.
    sample = text.split(' ').first(100)
    hard, easy = sample.partition { |word| syllable_count(word) >= 3 }

    score = (easy.size + hard.size * 3).to_f / sentence_count(sample.join(' '))
    score -= 2 if score <= 20
    score / 2
  end

  # Number of distinct words that are not in the easy-word dictionary for
  # +language+ and have more than one syllable.
  #
  # @param text [String]
  # @param language [String] selects "<dictionary_path>/<language>.txt" and
  #   the hyphenation language
  # @return [Integer]
  # @raise [SystemCallError] if the dictionary file cannot be read
  def self.difficult_words(text, language = 'en_us')
    easy_words = easy_words_for(language)
    # Keep all whitespace (not only ' ') so words separated by newlines or
    # tabs are not glued together when punctuation is stripped.
    candidates = text.downcase.gsub(/[^0-9a-z\s]/, '').split(' ').uniq
    candidates.count do |word|
      !easy_words.include?(word) && syllable_count(word, language) > 1
    end
  end

  # Dale-Chall readability score, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.dale_chall_readability_score(text)
    word_count = lexicon_count(text)
    return 0.0 if word_count.zero?

    pct_familiar = 100.0 * (word_count - difficult_words(text)) / word_count
    pct_difficult = 100 - pct_familiar

    score = 0.1579 * pct_difficult + 0.0496 * avg_sentence_length(text)
    score += 3.6365 if pct_difficult > 5
    score.round(2)
  end

  # Gunning Fog index, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.gunning_fog(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    pct_difficult = 100.0 * difficult_words(text) / words + 5
    (0.4 * (avg_sentence_length(text) + pct_difficult)).round(2)
  end

  # LIX score: average sentence length plus the percentage of tokens longer
  # than six characters, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no tokens
  def self.lix(text)
    words = text.split(' ')
    return 0.0 if words.empty?

    pct_long_words = 100.0 * words.count { |word| word.length > 6 } / words.length
    (avg_sentence_length(text) + pct_long_words).round(2)
  end

  # FORCAST grade: 20 - N / 10, where N is the number of one-syllable words
  # among the first 150 words.
  #
  # @param text [String]
  # @param language [String] hyphenation language
  # @return [Float]
  def self.forcast(text, language = 'en_us')
    sample = text.split(' ').first(150)
    monosyllables = sample.count { |word| syllable_count(word, language) == 1 }
    # Float division: integer division truncated the grade to a whole number.
    20 - monosyllables / 10.0
  end

  # Powers-Sumner-Kearl grade, rounded to two decimals.
  #
  # @return [Float] 0.0 when the text contains no words
  def self.powers_sumner_kearl(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    # The formula's syllable term is syllables per 100 words, not the total
    # syllable count of the whole text.
    syllables_per_100_words = 100.0 * syllable_count(text) / words
    (0.0778 * avg_sentence_length(text) + 0.0455 * syllables_per_100_words - 2.2029).round(2)
  end

  # Spache grade, rounded to two decimals.
  #
  # @param text [String]
  # @param language [String] dictionary / hyphenation language
  # @return [Float] 0.0 when the text contains no tokens
  def self.spache(text, language = 'en_us')
    words = text.split(' ').count
    return 0.0 if words.zero?

    # Percentage of unfamiliar words; the earlier integer division made this
    # term 0 for almost every input.
    pct_unfamiliar = 100.0 * difficult_words(text, language) / words
    (0.141 * avg_sentence_length(text) + 0.086 * pct_unfamiliar + 0.839).round(2)
  end

  # Consensus grade level: each formula votes with its rounded and ceiled
  # value and the most common grade wins.
  #
  # Requires the third-party `counter` gem (loaded lazily on first call).
  #
  # @param text [String]
  # @param float_output [Boolean, nil] when truthy, return the grade as a
  #   Float instead of a description
  # @return [Float, String] e.g. 8.0 or "7th and 8th grade"
  def self.text_standard(text, float_output = nil)
    grade = grade_bounds(flesch_kincaid_grade(text))
    grade.concat(flesch_ease_grades(flesch_reading_ease(text)))

    # Each metric is evaluated once; it used to be computed twice (for the
    # rounded and for the ceiled vote).
    CONSENSUS_METRICS.each do |metric|
      grade.concat(grade_bounds(public_send(metric, text)))
    end

    # Finding the readability consensus based upon all the above tests.
    require 'counter'
    d = Counter.new(grade)
    final_grade = d.most_common(1)
    score = final_grade[0][0]

    return score.to_f if float_output

    # NOTE(review): the suffix is always "th" ("1th", "2th", "3th"); the
    # string is left unchanged because callers may compare against it.
    "#{score.to_i - 1}th and #{score.to_i}th grade"
  end

  # Overrides the directory that holds the "<language>.txt" easy-word lists.
  #
  # @param path [String]
  def self.dictionary_path=(path)
    @dictionary_path = path
  end

  # Directory that holds the easy-word lists; defaults to
  # "<GEM_PATH>/lib/dictionaries".
  #
  # @return [String]
  def self.dictionary_path
    @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
  end

  # Returns a cached Text::Hyphen instance for +language+, so hyphenation
  # patterns are not reloaded for every word.
  def self.hyphenator_for(language)
    @hyphenators ||= {}
    @hyphenators[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
  end

  # Loads the easy-word list for +language+ as a Set, cached per file path.
  def self.easy_words_for(language)
    require 'set'
    path = File.join(dictionary_path, "#{language}.txt")
    @easy_words ||= {}
    # chomp (not chop) so a final line without a trailing newline keeps its
    # last character.
    @easy_words[path] ||= Set.new(File.foreach(path).map(&:chomp))
  end

  # Maps a Flesch Reading Ease score to the grade(s) it votes for.
  def self.flesch_ease_grades(score)
    # Scores of 100 and above are the easiest texts; they used to fall
    # through to the hardest grade (13).
    if score >= 90 then [5]
    elsif score >= 80 then [6]
    elsif score >= 70 then [7]
    elsif score >= 60 then [8, 9]
    elsif score >= 50 then [10]
    elsif score >= 40 then [11]
    elsif score >= 30 then [12]
    else [13]
    end
  end

  # Returns the two integer votes (rounded, ceiled) for a metric value.
  def self.grade_bounds(value)
    [value.round.to_i, value.ceil.to_i]
  end

  private_class_method :hyphenator_for, :easy_words_for, :flesch_ease_grades, :grade_bounds
end