textstat 0.1.4 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/counter.rb +0 -0
- data/lib/dictionaries/ca.txt +0 -0
- data/lib/dictionaries/cs.txt +0 -0
- data/lib/dictionaries/nl.txt +0 -0
- data/lib/textstat/version.rb +1 -1
- data/lib/textstat.rb +79 -73
- data/spec/textstat_spec.rb +21 -6
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 61ef6dcf0e938af4c3c30ad4b45c3241f10ffc01e4d63e566a74d993b57309ac
|
4
|
+
data.tar.gz: 20f6412df8d5a8658d4113ddb48e808697b7e1489ba5acb4fae9d97346393acd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ef7dee598e3db4c26e2e305f464fbebf6b3757429a72ae66b887ab7f068a34a0d2b6c2e5c52771b2c342e8c06d2ec9a69fa1d60065eaa37b3ccf76eb232f647d
|
7
|
+
data.tar.gz: 5ca1f7c6dcb11a81457b87339ab153ac95125542378875fd451c9ba58659925fc018d38af4bbc61149cb5f8f53765f67363029d8eb698ad80effa969d7ecbbeb
|
data/lib/counter.rb
CHANGED
File without changes
|
data/lib/dictionaries/ca.txt
CHANGED
File without changes
|
data/lib/dictionaries/cs.txt
CHANGED
File without changes
|
data/lib/dictionaries/nl.txt
CHANGED
File without changes
|
data/lib/textstat/version.rb
CHANGED
data/lib/textstat.rb
CHANGED
@@ -23,7 +23,7 @@ class TextStat
|
|
23
23
|
count = 0
|
24
24
|
text.split(' ').each do |word|
|
25
25
|
word_hyphenated = dictionary.visualise(word)
|
26
|
-
count +=
|
26
|
+
count += word_hyphenated.count('-') + 1
|
27
27
|
end
|
28
28
|
count
|
29
29
|
end
|
@@ -33,109 +33,105 @@ class TextStat
|
|
33
33
|
end
|
34
34
|
|
35
35
|
def self.avg_sentence_length(text)
|
36
|
-
asl = lexicon_count(text).to_f / sentence_count(text)
|
36
|
+
asl = lexicon_count(text).to_f / sentence_count(text)
|
37
37
|
asl.round(1)
|
38
38
|
rescue ZeroDivisionError
|
39
39
|
0.0
|
40
40
|
end
|
41
41
|
|
42
|
-
def self.avg_syllables_per_word(text)
|
43
|
-
syllable = syllable_count(text)
|
42
|
+
def self.avg_syllables_per_word(text, language = 'en_us')
|
43
|
+
syllable = syllable_count(text, language)
|
44
44
|
words = lexicon_count(text)
|
45
45
|
begin
|
46
|
-
syllables_per_word = syllable.to_f / words
|
47
|
-
|
46
|
+
syllables_per_word = syllable.to_f / words
|
47
|
+
syllables_per_word.round(1)
|
48
48
|
rescue ZeroDivisionError
|
49
|
-
|
49
|
+
0.0
|
50
50
|
end
|
51
51
|
end
|
52
52
|
|
53
53
|
def self.avg_letter_per_word(text)
|
54
|
-
letters_per_word = char_count(text).to_f / lexicon_count(text)
|
54
|
+
letters_per_word = char_count(text).to_f / lexicon_count(text)
|
55
55
|
letters_per_word.round(2)
|
56
56
|
rescue ZeroDivisionError
|
57
57
|
0.0
|
58
58
|
end
|
59
59
|
|
60
60
|
def self.avg_sentence_per_word(text)
|
61
|
-
sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
|
61
|
+
sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
|
62
62
|
sentence_per_word.round(2)
|
63
63
|
rescue ZeroDivisionError
|
64
64
|
0.0
|
65
65
|
end
|
66
66
|
|
67
|
-
def self.flesch_reading_ease(text)
|
67
|
+
def self.flesch_reading_ease(text, language = 'en_us')
|
68
68
|
sentence_length = avg_sentence_length(text)
|
69
|
-
syllables_per_word = avg_syllables_per_word(text)
|
70
|
-
flesch =
|
71
|
-
206.835 - (1.015 * sentence_length).to_f - (84.6 * syllables_per_word).to_f
|
72
|
-
)
|
69
|
+
syllables_per_word = avg_syllables_per_word(text, language)
|
70
|
+
flesch = 206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word
|
73
71
|
flesch.round(2)
|
74
72
|
end
|
75
73
|
|
76
|
-
def self.flesch_kincaid_grade(text)
|
74
|
+
def self.flesch_kincaid_grade(text, language = 'en_us')
|
77
75
|
sentence_length = avg_sentence_length(text)
|
78
|
-
syllables_per_word = avg_syllables_per_word(text)
|
79
|
-
flesch =
|
76
|
+
syllables_per_word = avg_syllables_per_word(text, language)
|
77
|
+
flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
|
80
78
|
flesch.round(1)
|
81
79
|
end
|
82
80
|
|
83
|
-
def self.polysyllab_count(text)
|
81
|
+
def self.polysyllab_count(text, language = 'en_us')
|
84
82
|
count = 0
|
85
83
|
text.split(' ').each do |word|
|
86
|
-
w = syllable_count(word)
|
84
|
+
w = syllable_count(word, language)
|
87
85
|
count += 1 if w >= 3
|
88
86
|
end
|
89
87
|
count
|
90
88
|
end
|
91
89
|
|
92
|
-
def self.smog_index(text)
|
90
|
+
def self.smog_index(text, language = 'en_us')
|
93
91
|
sentences = sentence_count(text)
|
94
92
|
|
95
93
|
if sentences >= 3
|
96
94
|
begin
|
97
|
-
polysyllab = polysyllab_count(text)
|
98
|
-
smog = (
|
99
|
-
(1
|
100
|
-
return smog.round(1)
|
95
|
+
polysyllab = polysyllab_count(text, language)
|
96
|
+
smog = 1.043 * Math.sqrt(30.0 * polysyllab / sentences) + 3.1291
|
97
|
+
smog.round(1)
|
101
98
|
rescue ZeroDivisionError
|
102
|
-
|
99
|
+
0.0
|
103
100
|
end
|
104
101
|
else
|
105
|
-
|
102
|
+
0.0
|
106
103
|
end
|
107
104
|
end
|
108
105
|
|
109
106
|
def self.coleman_liau_index(text)
|
110
|
-
letters
|
107
|
+
letters = (avg_letter_per_word(text) * 100).round(2)
|
111
108
|
sentences = (avg_sentence_per_word(text) * 100).round(2)
|
112
|
-
coleman
|
109
|
+
coleman = 0.0588 * letters - 0.296 * sentences - 15.8
|
113
110
|
coleman.round(2)
|
114
111
|
end
|
115
112
|
|
116
113
|
def self.automated_readability_index(text)
|
117
|
-
chars
|
118
|
-
words
|
114
|
+
chars = char_count(text)
|
115
|
+
words = lexicon_count(text)
|
119
116
|
sentences = sentence_count(text)
|
120
117
|
begin
|
121
|
-
a = chars.to_f / words
|
122
|
-
b = words.to_f / sentences
|
118
|
+
a = chars.to_f / words
|
119
|
+
b = words.to_f / sentences
|
123
120
|
|
124
|
-
readability =
|
125
|
-
|
126
|
-
return readability.round(1)
|
121
|
+
readability = 4.71 * a + 0.5 * b - 21.43
|
122
|
+
readability.round(1)
|
127
123
|
rescue ZeroDivisionError
|
128
|
-
|
124
|
+
0.0
|
129
125
|
end
|
130
126
|
end
|
131
127
|
|
132
|
-
def self.linsear_write_formula(text)
|
128
|
+
def self.linsear_write_formula(text, language = 'en_us')
|
133
129
|
easy_word = 0
|
134
130
|
difficult_word = 0
|
135
131
|
text_list = text.split(' ')[0..100]
|
136
132
|
|
137
133
|
text_list.each do |word|
|
138
|
-
if syllable_count(word) < 3
|
134
|
+
if syllable_count(word, language) < 3
|
139
135
|
easy_word += 1
|
140
136
|
else
|
141
137
|
difficult_word += 1
|
@@ -144,11 +140,9 @@ class TextStat
|
|
144
140
|
|
145
141
|
text = text_list.join(' ')
|
146
142
|
|
147
|
-
number = (
|
148
|
-
if number <= 20
|
149
|
-
|
150
|
-
end
|
151
|
-
return number / 2
|
143
|
+
number = (easy_word * 1 + difficult_word * 3).to_f / sentence_count(text)
|
144
|
+
number -= 2 if number <= 20
|
145
|
+
number / 2
|
152
146
|
end
|
153
147
|
|
154
148
|
def self.difficult_words(text, language = 'en_us')
|
@@ -161,58 +155,70 @@ class TextStat
|
|
161
155
|
text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
|
162
156
|
diff_words_set = Set.new
|
163
157
|
text_list.each do |value|
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
end
|
168
|
-
end
|
158
|
+
next if easy_words.include? value
|
159
|
+
|
160
|
+
diff_words_set.add(value) if syllable_count(value, language) > 1
|
169
161
|
end
|
170
|
-
|
162
|
+
diff_words_set.length
|
171
163
|
end
|
172
164
|
|
173
|
-
def self.dale_chall_readability_score(text)
|
165
|
+
def self.dale_chall_readability_score(text, language = 'en_us')
|
174
166
|
word_count = lexicon_count(text)
|
175
|
-
count = word_count - difficult_words(text)
|
167
|
+
count = word_count - difficult_words(text, language)
|
176
168
|
|
177
169
|
begin
|
178
|
-
per =
|
170
|
+
per = 100.0 * count / word_count
|
179
171
|
rescue ZeroDivisionError
|
180
172
|
return 0.0
|
181
173
|
end
|
182
174
|
|
183
175
|
difficult_words = 100 - per
|
184
|
-
score = (
|
185
|
-
|
186
|
-
+ (0.0496 * avg_sentence_length(text)))
|
176
|
+
score = 0.1579 * difficult_words + 0.0496 * avg_sentence_length(text)
|
177
|
+
score += 3.6365 if difficult_words > 5
|
187
178
|
|
188
|
-
|
189
|
-
score += 3.6365
|
190
|
-
end
|
191
|
-
return score.round(2)
|
179
|
+
score.round(2)
|
192
180
|
end
|
193
181
|
|
194
|
-
def self.gunning_fog(text)
|
195
|
-
|
196
|
-
|
197
|
-
(difficult_words(text) / lexicon_count(text) * 100) + 5)
|
182
|
+
def self.gunning_fog(text, language = 'en_us')
|
183
|
+
per_diff_words = 100.0 * difficult_words(text, language) / lexicon_count(text) + 5
|
184
|
+
grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
|
198
185
|
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
return 0.0
|
203
|
-
end
|
186
|
+
grade.round(2)
|
187
|
+
rescue ZeroDivisionError
|
188
|
+
0.0
|
204
189
|
end
|
205
190
|
|
206
191
|
def self.lix(text)
|
207
192
|
words = text.split(' ')
|
208
193
|
words_length = words.length
|
209
|
-
long_words = words.
|
194
|
+
long_words = words.count { |word| word.length > 6 }
|
210
195
|
|
211
|
-
per_long_words =
|
196
|
+
per_long_words = 100.0 * long_words / words_length
|
212
197
|
asl = avg_sentence_length(text)
|
213
198
|
lix = asl + per_long_words
|
214
199
|
|
215
|
-
|
200
|
+
lix.round(2)
|
201
|
+
end
|
202
|
+
|
203
|
+
def self.forcast(text, language = 'en_us')
|
204
|
+
words = text.split(' ')[0..149]
|
205
|
+
words_with_one_syllabe = words.count {
|
206
|
+
|word| syllable_count(word, language) == 1
|
207
|
+
}
|
208
|
+
forcast = 20 - (words_with_one_syllabe / 10)
|
209
|
+
forcast
|
210
|
+
end
|
211
|
+
|
212
|
+
def self.powers_sumner_kearl(text, language = 'en_us')
|
213
|
+
grade = 0.0778 * avg_sentence_length(text) + 0.0455 * syllable_count(text, language) - 2.2029
|
214
|
+
grade.round(2)
|
215
|
+
end
|
216
|
+
|
217
|
+
def self.spache(text, language = 'en_us')
|
218
|
+
words = text.split(' ').count
|
219
|
+
unfamiliar_words = difficult_words(text, language) / words
|
220
|
+
grade = (0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_words) + 0.839
|
221
|
+
grade.round(2)
|
216
222
|
end
|
217
223
|
|
218
224
|
def self.text_standard(text, float_output=nil)
|
@@ -287,9 +293,9 @@ class TextStat
|
|
287
293
|
score = final_grade[0][0]
|
288
294
|
|
289
295
|
if float_output
|
290
|
-
|
296
|
+
score.to_f
|
291
297
|
else
|
292
|
-
|
298
|
+
"#{score.to_i - 1}th and #{score.to_i}th grade"
|
293
299
|
end
|
294
300
|
end
|
295
301
|
|
data/spec/textstat_spec.rb
CHANGED
@@ -116,22 +116,22 @@ describe TextStat do
|
|
116
116
|
|
117
117
|
it 'should return the correct smog index' do
|
118
118
|
index = TextStat.smog_index(@long_test)
|
119
|
-
expect(index).to eql
|
119
|
+
expect(index).to eql 12.5
|
120
120
|
end
|
121
121
|
|
122
122
|
it 'should return the correct Coleman–Liau index' do
|
123
123
|
index = TextStat.coleman_liau_index(@long_test)
|
124
|
-
expect(index).to eql 10.
|
124
|
+
expect(index).to eql 10.65
|
125
125
|
end
|
126
126
|
|
127
127
|
it 'should return the correct automated readability index' do
|
128
128
|
index = TextStat.automated_readability_index(@long_test)
|
129
|
-
expect(index).to eql 12.
|
129
|
+
expect(index).to eql 12.4
|
130
130
|
end
|
131
131
|
|
132
132
|
it 'should return the correct linsear write formula result' do
|
133
133
|
result = TextStat.linsear_write_formula(@long_test)
|
134
|
-
expect(result).to eql 14.
|
134
|
+
expect(result).to eql 14.875
|
135
135
|
end
|
136
136
|
|
137
137
|
it 'should return the correct difficult words result' do
|
@@ -141,12 +141,12 @@ describe TextStat do
|
|
141
141
|
|
142
142
|
it 'should return the correct Dale–Chall readability score' do
|
143
143
|
score = TextStat.dale_chall_readability_score(@long_test)
|
144
|
-
expect(score).to eql
|
144
|
+
expect(score).to eql 7.25
|
145
145
|
end
|
146
146
|
|
147
147
|
it 'should return the correct Gunning fog score' do
|
148
148
|
score = TextStat.gunning_fog(@long_test)
|
149
|
-
expect(score).to eql
|
149
|
+
expect(score).to eql 17.56
|
150
150
|
end
|
151
151
|
|
152
152
|
it 'should return the correct Lix readability test score' do
|
@@ -154,6 +154,21 @@ describe TextStat do
|
|
154
154
|
expect(score).to eql 45.11
|
155
155
|
end
|
156
156
|
|
157
|
+
it 'should return the correct FORCAST readability test score' do
|
158
|
+
score = TextStat.forcast(@long_test)
|
159
|
+
expect(score).to eql 10
|
160
|
+
end
|
161
|
+
|
162
|
+
it 'should return the correct Powers Sumner Kearl readability test score' do
|
163
|
+
score = TextStat.powers_sumner_kearl(@long_test)
|
164
|
+
expect(score).to eql 25.04
|
165
|
+
end
|
166
|
+
|
167
|
+
it 'should return the correct SPACHE readability test score' do
|
168
|
+
score = TextStat.spache(@long_test)
|
169
|
+
expect(score).to eql 4.12
|
170
|
+
end
|
171
|
+
|
157
172
|
it 'should return the readability consensus score' do
|
158
173
|
standard = TextStat.text_standard(@long_test)
|
159
174
|
expect(standard).to eql '10th and 11th grade'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textstat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jakub Polak
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-05-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: text-hyphen
|
@@ -50,14 +50,14 @@ dependencies:
|
|
50
50
|
requirements:
|
51
51
|
- - "~>"
|
52
52
|
- !ruby/object:Gem::Version
|
53
|
-
version: '
|
53
|
+
version: '13.0'
|
54
54
|
type: :development
|
55
55
|
prerelease: false
|
56
56
|
version_requirements: !ruby/object:Gem::Requirement
|
57
57
|
requirements:
|
58
58
|
- - "~>"
|
59
59
|
- !ruby/object:Gem::Version
|
60
|
-
version: '
|
60
|
+
version: '13.0'
|
61
61
|
- !ruby/object:Gem::Dependency
|
62
62
|
name: rspec
|
63
63
|
requirement: !ruby/object:Gem::Requirement
|
@@ -108,7 +108,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
108
108
|
- !ruby/object:Gem::Version
|
109
109
|
version: '0'
|
110
110
|
requirements: []
|
111
|
-
rubygems_version: 3.
|
111
|
+
rubygems_version: 3.2.17
|
112
112
|
signing_key:
|
113
113
|
specification_version: 4
|
114
114
|
summary: Ruby gem to calculate readability statistics of a text object - paragraphs,
|