textstat 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/counter.rb +37 -37
- data/lib/dictionaries/ca.txt +2487 -2487
- data/lib/dictionaries/cs.txt +2499 -2499
- data/lib/dictionaries/en_us.txt +2944 -2944
- data/lib/dictionaries/nl.txt +2549 -2549
- data/lib/textstat.rb +309 -303
- data/lib/textstat/version.rb +3 -3
- data/spec/textstat_spec.rb +191 -176
- metadata +5 -5
data/lib/textstat.rb
CHANGED
@@ -1,303 +1,309 @@
|
|
1
|
-
require 'text-hyphen'
|
2
|
-
|
3
|
-
class TextStat
|
4
|
-
GEM_PATH = File.dirname(File.dirname(__FILE__))
|
5
|
-
|
6
|
-
def self.char_count(text, ignore_spaces = true)
|
7
|
-
text = text.delete(' ') if ignore_spaces
|
8
|
-
text.length
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.lexicon_count(text, remove_punctuation = true)
|
12
|
-
text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
|
13
|
-
count = text.split(' ').count
|
14
|
-
count
|
15
|
-
end
|
16
|
-
|
17
|
-
def self.syllable_count(text, language = 'en_us')
|
18
|
-
return 0 if text.empty?
|
19
|
-
|
20
|
-
text = text.downcase
|
21
|
-
text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
|
22
|
-
dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
|
23
|
-
count = 0
|
24
|
-
text.split(' ').each do |word|
|
25
|
-
word_hyphenated = dictionary.visualise(word)
|
26
|
-
count +=
|
27
|
-
end
|
28
|
-
count
|
29
|
-
end
|
30
|
-
|
31
|
-
def self.sentence_count(text)
|
32
|
-
text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
|
33
|
-
end
|
34
|
-
|
35
|
-
def self.avg_sentence_length(text)
|
36
|
-
asl = lexicon_count(text).to_f / sentence_count(text)
|
37
|
-
asl.round(1)
|
38
|
-
rescue ZeroDivisionError
|
39
|
-
0.0
|
40
|
-
end
|
41
|
-
|
42
|
-
def self.avg_syllables_per_word(text)
|
43
|
-
syllable = syllable_count(text)
|
44
|
-
words = lexicon_count(text)
|
45
|
-
begin
|
46
|
-
syllables_per_word = syllable.to_f / words
|
47
|
-
|
48
|
-
rescue ZeroDivisionError
|
49
|
-
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
def self.avg_letter_per_word(text)
|
54
|
-
letters_per_word = char_count(text).to_f / lexicon_count(text)
|
55
|
-
letters_per_word.round(2)
|
56
|
-
rescue ZeroDivisionError
|
57
|
-
0.0
|
58
|
-
end
|
59
|
-
|
60
|
-
def self.avg_sentence_per_word(text)
|
61
|
-
sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
|
62
|
-
sentence_per_word.round(2)
|
63
|
-
rescue ZeroDivisionError
|
64
|
-
0.0
|
65
|
-
end
|
66
|
-
|
67
|
-
def self.flesch_reading_ease(text)
|
68
|
-
sentence_length = avg_sentence_length(text)
|
69
|
-
syllables_per_word = avg_syllables_per_word(text)
|
70
|
-
flesch =
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
sentences
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
diff_words_set
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
grade.append(
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
grade.append(
|
243
|
-
|
244
|
-
grade.append(
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
# Appending
|
254
|
-
lower =
|
255
|
-
upper =
|
256
|
-
grade.append(lower.to_i)
|
257
|
-
grade.append(upper.to_i)
|
258
|
-
|
259
|
-
# Appending
|
260
|
-
lower =
|
261
|
-
upper =
|
262
|
-
grade.append(lower.to_i)
|
263
|
-
grade.append(upper.to_i)
|
264
|
-
|
265
|
-
# Appending
|
266
|
-
lower =
|
267
|
-
upper =
|
268
|
-
grade.append(lower.to_i)
|
269
|
-
grade.append(upper.to_i)
|
270
|
-
|
271
|
-
# Appending
|
272
|
-
lower =
|
273
|
-
upper =
|
274
|
-
grade.append(lower.to_i)
|
275
|
-
grade.append(upper.to_i)
|
276
|
-
|
277
|
-
# Appending
|
278
|
-
lower =
|
279
|
-
upper =
|
280
|
-
grade.append(lower.to_i)
|
281
|
-
grade.append(upper.to_i)
|
282
|
-
|
283
|
-
#
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
1
|
+
# frozen_string_literal: true

require 'text-hyphen'

# TextStat — text readability statistics.
#
# Implements classic readability formulas (Flesch Reading Ease,
# Flesch-Kincaid, SMOG, Coleman-Liau, Automated Readability Index,
# Linsear Write, Dale-Chall, Gunning Fog, LIX, FORCAST,
# Powers-Sumner-Kearl, Spache) on top of simple character / word /
# sentence / syllable counters. Syllable counting is delegated to the
# text-hyphen gem's hyphenation dictionaries.
class TextStat
  # Gem root (two directory levels above this file); the bundled
  # easy-word dictionaries live under <GEM_PATH>/lib/dictionaries.
  GEM_PATH = File.dirname(File.dirname(__FILE__))

  # Number of characters in +text+; spaces are stripped first unless
  # +ignore_spaces+ is false.
  def self.char_count(text, ignore_spaces = true)
    text = text.delete(' ') if ignore_spaces
    text.length
  end

  # Number of words in +text+. By default punctuation is removed and
  # runs of spaces collapsed before splitting.
  def self.lexicon_count(text, remove_punctuation = true)
    text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
    text.split(' ').count
  end

  # Total syllable count of +text+, using the hyphenation dictionary for
  # +language+ (a text-hyphen language code such as 'en_us' or 'nl').
  # Returns 0 for empty input.
  def self.syllable_count(text, language = 'en_us')
    return 0 if text.empty?

    text = text.downcase
    # FIX: previously the gsub/squeeze result was discarded (String#gsub
    # does not mutate its receiver), so punctuation was never actually
    # removed before hyphenation. Assign the cleaned string back.
    text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
    dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
    count = 0
    text.split(' ').each do |word|
      word_hyphenated = dictionary.visualise(word)
      # Each hyphen marks one syllable boundary, so syllables = hyphens + 1.
      count += word_hyphenated.count('-') + 1
    end
    count
  end

  # Approximate sentence count: matches sentence-final punctuation
  # followed by whitespace and a capital letter, plus one for the
  # final (unterminated-by-a-new-sentence) sentence.
  def self.sentence_count(text)
    text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
  end

  # Average number of words per sentence, rounded to 1 decimal place.
  def self.avg_sentence_length(text)
    asl = lexicon_count(text).to_f / sentence_count(text)
    asl.round(1)
  rescue ZeroDivisionError
    0.0
  end

  # Average number of syllables per word, rounded to 1 decimal place.
  def self.avg_syllables_per_word(text)
    syllable = syllable_count(text)
    words = lexicon_count(text)
    begin
      syllables_per_word = syllable.to_f / words
      syllables_per_word.round(1)
    rescue ZeroDivisionError
      0.0
    end
  end

  # Average number of (non-space) characters per word, rounded to
  # 2 decimal places.
  def self.avg_letter_per_word(text)
    letters_per_word = char_count(text).to_f / lexicon_count(text)
    letters_per_word.round(2)
  rescue ZeroDivisionError
    0.0
  end

  # Average number of sentences per word, rounded to 2 decimal places.
  def self.avg_sentence_per_word(text)
    sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
    sentence_per_word.round(2)
  rescue ZeroDivisionError
    0.0
  end

  # Flesch Reading Ease score (higher = easier), rounded to 2 decimals.
  def self.flesch_reading_ease(text)
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text)
    flesch = 206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word
    flesch.round(2)
  end

  # Flesch-Kincaid grade level, rounded to 1 decimal place.
  def self.flesch_kincaid_grade(text)
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text)
    flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
    flesch.round(1)
  end

  # Number of words in +text+ with three or more syllables.
  def self.polysyllab_count(text)
    count = 0
    text.split(' ').each do |word|
      w = syllable_count(word)
      count += 1 if w >= 3
    end
    count
  end

  # SMOG index. Defined only for texts of at least 3 sentences;
  # returns 0.0 otherwise.
  def self.smog_index(text)
    sentences = sentence_count(text)

    if sentences >= 3
      begin
        polysyllab = polysyllab_count(text)
        smog = 1.043 * Math.sqrt(30.0 * polysyllab / sentences) + 3.1291
        smog.round(1)
      rescue ZeroDivisionError
        0.0
      end
    else
      0.0
    end
  end

  # Coleman-Liau index, computed from average letters and sentences
  # per 100 words, rounded to 2 decimal places.
  def self.coleman_liau_index(text)
    letters = (avg_letter_per_word(text) * 100).round(2)
    sentences = (avg_sentence_per_word(text) * 100).round(2)
    coleman = 0.0588 * letters - 0.296 * sentences - 15.8
    coleman.round(2)
  end

  # Automated Readability Index, rounded to 1 decimal place.
  def self.automated_readability_index(text)
    chars = char_count(text)
    words = lexicon_count(text)
    sentences = sentence_count(text)
    begin
      a = chars.to_f / words
      b = words.to_f / sentences

      readability = 4.71 * a + 0.5 * b - 21.43
      readability.round(1)
    rescue ZeroDivisionError
      0.0
    end
  end

  # Linsear Write Formula, evaluated on (at most) the first 101 words.
  # Words under 3 syllables count 1 point, others 3 points.
  def self.linsear_write_formula(text)
    easy_word = 0
    difficult_word = 0
    text_list = text.split(' ')[0..100]

    text_list.each do |word|
      if syllable_count(word) < 3
        easy_word += 1
      else
        difficult_word += 1
      end
    end

    text = text_list.join(' ')

    number = (easy_word * 1 + difficult_word * 3).to_f / sentence_count(text)
    number -= 2 if number <= 20
    number / 2
  end

  # Number of distinct multi-syllable words in +text+ that are not in
  # the easy-word dictionary for +language+.
  def self.difficult_words(text, language = 'en_us')
    require 'set'
    easy_words = Set.new
    File.read(File.join(dictionary_path, "#{language}.txt")).each_line do |line|
      easy_words << line.chop
    end

    text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
    diff_words_set = Set.new
    text_list.each do |value|
      next if easy_words.include? value

      diff_words_set.add(value) if syllable_count(value) > 1
    end
    diff_words_set.length
  end

  # Dale-Chall readability score, rounded to 2 decimal places.
  def self.dale_chall_readability_score(text)
    word_count = lexicon_count(text)
    count = word_count - difficult_words(text)

    begin
      per = 100.0 * count / word_count
    rescue ZeroDivisionError
      return 0.0
    end

    difficult_words = 100 - per
    score = 0.1579 * difficult_words + 0.0496 * avg_sentence_length(text)
    # Adjustment applied when more than 5% of words are difficult.
    score += 3.6365 if difficult_words > 5

    score.round(2)
  end

  # Gunning Fog index, rounded to 2 decimal places.
  def self.gunning_fog(text)
    per_diff_words = 100.0 * difficult_words(text) / lexicon_count(text) + 5
    grade = 0.4 * (avg_sentence_length(text) + per_diff_words)

    grade.round(2)
  rescue ZeroDivisionError
    0.0
  end

  # LIX readability measure: average sentence length plus the
  # percentage of words longer than 6 characters.
  def self.lix(text)
    words = text.split(' ')
    words_length = words.length
    long_words = words.count { |word| word.length > 6 }

    per_long_words = 100.0 * long_words / words_length
    asl = avg_sentence_length(text)
    lix = asl + per_long_words

    lix.round(2)
  end

  # FORCAST grade level, computed over (at most) the first 150 words.
  # NOTE(review): integer division by 10 is preserved from the original
  # formula implementation — the result is always a whole grade.
  def self.forcast(text, language = 'en_us')
    words = text.split(' ')[0..149]
    words_with_one_syllable = words.count do |word|
      syllable_count(word, language) == 1
    end
    20 - (words_with_one_syllable / 10)
  end

  # Powers-Sumner-Kearl grade level, rounded to 2 decimal places.
  def self.powers_sumner_kearl(text)
    grade = 0.0778 * avg_sentence_length(text) + 0.0455 * syllable_count(text) - 2.2029
    grade.round(2)
  end

  # Spache readability formula, rounded to 2 decimal places.
  # NOTE(review): `difficult_words(...) / words` is Integer division and
  # truncates to 0 for most texts, nullifying the 0.086 term — presumably
  # a float ratio was intended; preserved as-is to keep published
  # behaviour. Verify against the spec before changing.
  def self.spache(text, language = 'en_us')
    words = text.split(' ').count
    unfamiliar_words = difficult_words(text, language) / words
    grade = (0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_words) + 0.839
    grade.round(2)
  end

  # Readability consensus: runs the individual tests, collects the
  # candidate grade levels and returns the most common one — as a Float
  # when +float_output+ is truthy, otherwise as a "Nth and Mth grade"
  # string.
  def self.text_standard(text, float_output = nil)
    grade = []

    # Appending Flesch-Kincaid grade
    lower = flesch_kincaid_grade(text).round
    upper = flesch_kincaid_grade(text).ceil
    grade.append(lower.to_i)
    grade.append(upper.to_i)

    # Appending Flesch Reading Easy
    score = flesch_reading_ease(text)
    if score < 100 && score >= 90
      grade.append(5)
    elsif score < 90 && score >= 80
      grade.append(6)
    elsif score < 80 && score >= 70
      grade.append(7)
    elsif score < 70 && score >= 60
      grade.append(8)
      grade.append(9)
    elsif score < 60 && score >= 50
      grade.append(10)
    elsif score < 50 && score >= 40
      grade.append(11)
    elsif score < 40 && score >= 30
      grade.append(12)
    else
      grade.append(13)
    end

    # Appending SMOG Index
    lower = smog_index(text).round
    upper = smog_index(text).ceil
    grade.append(lower.to_i)
    grade.append(upper.to_i)

    # Appending Coleman_Liau_Index
    lower = coleman_liau_index(text).round
    upper = coleman_liau_index(text).ceil
    grade.append(lower.to_i)
    grade.append(upper.to_i)

    # Appending Automated_Readability_Index
    lower = automated_readability_index(text).round
    upper = automated_readability_index(text).ceil
    grade.append(lower.to_i)
    grade.append(upper.to_i)

    # Appending Dale_Chall_Readability_Score
    lower = dale_chall_readability_score(text).round
    upper = dale_chall_readability_score(text).ceil
    grade.append(lower.to_i)
    grade.append(upper.to_i)

    # Appending Linsear_Write_Formula
    lower = linsear_write_formula(text).round
    upper = linsear_write_formula(text).ceil
    grade.append(lower.to_i)
    grade.append(upper.to_i)

    # Appending Gunning Fog Index
    lower = gunning_fog(text).round
    upper = gunning_fog(text).ceil
    grade.append(lower.to_i)
    grade.append(upper.to_i)

    # Finding the Readability Consensus based upon all the above tests.
    # Counter is a project-local helper (lib/counter.rb), loaded lazily.
    require 'counter'
    d = Counter.new(grade)
    final_grade = d.most_common(1)
    score = final_grade[0][0]

    if float_output
      score.to_f
    else
      "#{score.to_i - 1}th and #{score.to_i}th grade"
    end
  end

  # Override the directory the easy-word dictionaries are read from.
  def self.dictionary_path=(path)
    @dictionary_path = path
  end

  # Directory containing the easy-word dictionaries; defaults to the
  # gem's bundled lib/dictionaries. (Class instance variable — not
  # shared across subclasses.)
  def self.dictionary_path
    @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
  end
end
|