textstat 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
class TextStat
  # Gem version string; frozen so the shared constant cannot be mutated in place.
  VERSION = '0.1.0'.freeze
end
data/lib/textstat.rb ADDED
@@ -0,0 +1,293 @@
1
+ require 'text-hyphen'
2
+
3
# Readability statistics for a piece of text. Every metric is exposed as a
# class method taking the text as a String.
#
# Syllables are estimated by hyphenating each word with Text::Hyphen and
# counting the break points, so syllable-based metrics depend on the
# hyphenation dictionary selected by the `language` argument.
class TextStat
  # Word list consulted by .difficult_words, resolved relative to this source
  # file so the lookup does not depend on the process working directory.
  EASY_WORDS_PATH = File.expand_path('easy_words.txt', __dir__).freeze

  # Counts characters in +text+.
  #
  # @param text [String]
  # @param ignore_spaces [Boolean] when true, ' ' characters are not counted
  #   (other whitespace such as newlines still is)
  # @return [Integer]
  def self.char_count(text, ignore_spaces = true)
    text = text.delete(' ') if ignore_spaces
    text.length
  end

  # Counts whitespace-separated words in +text+.
  #
  # @param text [String]
  # @param remove_punctuation [Boolean] when true, every character that is
  #   not an ASCII letter or whitespace is dropped first, so tokens made only
  #   of digits or punctuation do not count as words
  # @return [Integer]
  def self.lexicon_count(text, remove_punctuation = true)
    text = text.gsub(/[^a-zA-Z\s]/, '') if remove_punctuation
    text.split(' ').length
  end

  # Estimates the number of syllables in +text+.
  #
  # The text is lower-cased and stripped of everything except ASCII letters
  # and whitespace; each remaining word contributes one syllable plus one per
  # hyphenation point reported by Text::Hyphen#visualise.
  #
  # @param text [String]
  # @param language [String] Text::Hyphen language identifier
  # @return [Integer] 0 for empty text
  def self.syllable_count(text, language = 'en_us')
    return 0 if text.empty?

    # The cleaned string must be kept: String#gsub returns a copy, so calling
    # it without using the result leaves punctuation attached to the words.
    words = text.downcase.gsub(/[^a-z\s]/, '').split(' ')
    dictionary = hyphenator(language)
    words.sum { |word| dictionary.visualise(word).count('-') + 1 }
  end

  # Counts sentences as "number of sentence boundaries + 1".
  #
  # A boundary is one of . ? ! optionally followed by closing characters
  # (' \ ) ]), then a space, a literal | or a newline, then an uppercase
  # ASCII letter. Because of the "+ 1" the result is never below 1, even for
  # empty text.
  # NOTE(review): the literal | and \ accepted by the pattern look accidental
  # (likely meant as alternation and a double quote) - confirm before changing.
  #
  # @param text [String]
  # @return [Integer]
  def self.sentence_count(text)
    text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).length + 1
  end

  # Average number of words per sentence, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.avg_sentence_length(text)
    # sentence_count is always >= 1, so the division is safe.
    (lexicon_count(text).to_f / sentence_count(text)).round(1)
  end

  # Average number of syllables per word, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_syllables_per_word(text)
    words = lexicon_count(text)
    # Float division by zero yields NaN/Infinity instead of raising, so the
    # empty case has to be checked explicitly.
    return 0.0 if words.zero?

    (syllable_count(text).to_f / words).round(1)
  end

  # Average number of non-space characters per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_letter_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (char_count(text).to_f / words).round(2)
  end

  # Average number of sentences per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_sentence_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (sentence_count(text).to_f / words).round(2)
  end

  # Flesch reading-ease score: 206.835 - 1.015 * ASL - 84.6 * ASW, computed
  # from the already-rounded averages and rounded to two decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.flesch_reading_ease(text)
    score = 206.835 - (1.015 * avg_sentence_length(text)) - (84.6 * avg_syllables_per_word(text))
    score.round(2)
  end

  # Flesch-Kincaid grade level: 0.39 * ASL + 11.8 * ASW - 15.59, rounded to
  # one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.flesch_kincaid_grade(text)
    grade = (0.39 * avg_sentence_length(text)) + (11.8 * avg_syllables_per_word(text)) - 15.59
    grade.round(1)
  end

  # Number of whitespace-separated words with three or more syllables.
  #
  # @param text [String]
  # @return [Integer]
  def self.polysyllab_count(text)
    text.split(' ').count { |word| syllable_count(word) >= 3 }
  end

  # SMOG index: 1.043 * sqrt(30 * polysyllables / sentences) + 3.1291,
  # rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when fewer than three sentences are detected
  def self.smog_index(text)
    sentences = sentence_count(text)
    return 0.0 if sentences < 3

    # 30.0 forces Float arithmetic; Integer division would truncate the
    # polysyllables-per-sentence ratio before the square root.
    (1.043 * Math.sqrt(30.0 * polysyllab_count(text) / sentences) + 3.1291).round(1)
  end

  # Coleman-Liau index: 0.058 * L - 0.296 * S - 15.8, where L and S are the
  # per-word letter and sentence averages scaled by 100; rounded to two
  # decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.coleman_liau_index(text)
    letters = (avg_letter_per_word(text) * 100).round(2)
    sentences = (avg_sentence_per_word(text) * 100).round(2)
    ((0.058 * letters) - (0.296 * sentences) - 15.8).round(2)
  end

  # Automated readability index:
  # 4.71 * chars/word + 0.5 * words/sentence - 21.43, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.automated_readability_index(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    chars_per_word = (char_count(text).to_f / words).round(2)
    words_per_sentence = (words.to_f / sentence_count(text)).round(2)
    ((4.71 * chars_per_word) + (0.5 * words_per_sentence) - 21.43).round(1)
  end

  # Linsear Write formula over the first 100 words of +text+: words with
  # fewer than three syllables score 1, the others 3; the total is divided by
  # the number of sentences in the sample, reduced by 2 when it is 20 or
  # less, and finally halved. The result is not rounded.
  #
  # @param text [String]
  # @return [Float]
  def self.linsear_write_formula(text)
    sample = text.split(' ').first(100) # exactly 100 words, not 101
    difficult = sample.count { |word| syllable_count(word) >= 3 }
    easy = sample.length - difficult

    # Convert before dividing so the per-sentence score is not truncated.
    score = (easy + (difficult * 3)).to_f / sentence_count(sample.join(' '))
    score -= 2 if score <= 20
    score / 2
  end

  # Number of distinct "difficult" words: words absent from the easy-word
  # list that have more than one syllable. Matching is done on the
  # lower-cased text with everything but ASCII letters, digits and whitespace
  # removed.
  #
  # @param text [String]
  # @return [Integer]
  # @raise [SystemCallError] if the easy-word list cannot be read
  def self.difficult_words(text)
    known = easy_words
    # Whitespace (not just ' ') is preserved so that words separated by a
    # newline are not glued together.
    candidates = text.downcase.gsub(/[^0-9a-z\s]/, '').split(' ').uniq
    candidates.count { |word| !known.include?(word) && syllable_count(word) > 1 }
  end

  # Dale-Chall readability score:
  # 0.1579 * (% difficult words) + 0.0496 * ASL, plus 3.6365 when more than
  # 5% of the words are difficult; rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.dale_chall_readability_score(text)
    word_count = lexicon_count(text)
    return 0.0 if word_count.zero?

    pct_easy = (word_count - difficult_words(text)).to_f / word_count * 100
    pct_difficult = 100 - pct_easy
    score = (0.1579 * pct_difficult) + (0.0496 * avg_sentence_length(text))
    score += 3.6365 if pct_difficult > 5
    score.round(2)
  end

  # Gunning fog score: 0.4 * (ASL + % difficult words + 5), rounded to two
  # decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.gunning_fog(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    # 100.0 keeps the percentage in Float arithmetic; with Integers the ratio
    # truncates to 0 for any text that is not entirely difficult words.
    pct_difficult = (100.0 * difficult_words(text) / words) + 5
    (0.4 * (avg_sentence_length(text) + pct_difficult)).round(2)
  end

  # LIX score: ASL + percentage of whitespace-separated tokens longer than
  # six characters (punctuation included in the length); rounded to two
  # decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no tokens
  def self.lix(text)
    words = text.split(' ')
    return 0.0 if words.empty?

    long_words = words.count { |word| word.length > 6 }
    pct_long = (long_words * 100).to_f / words.length
    (avg_sentence_length(text) + pct_long).round(2)
  end

  # Readability consensus: collects the rounded and ceiled grade of every
  # metric above and returns the most frequent one.
  #
  # @param text [String]
  # @param float_output [Boolean, nil] when truthy the grade is returned as a
  #   Float instead of a descriptive String
  # @return [Float, String] e.g. 11.0 or "10th and 11th grade"
  def self.text_standard(text, float_output = nil)
    grades = []

    grades.concat(grade_bounds(flesch_kincaid_grade(text)))
    grades.concat(flesch_ease_grades(flesch_reading_ease(text)))

    # Each metric is evaluated once; its rounded and ceiled values are both
    # recorded as votes. The order is kept stable because it may influence
    # how ties are resolved by Counter.
    [
      smog_index(text),
      coleman_liau_index(text),
      automated_readability_index(text),
      dale_chall_readability_score(text),
      linsear_write_formula(text),
      gunning_fog(text)
    ].each { |value| grades.concat(grade_bounds(value)) }

    # Counter is loaded lazily from the load path (it is not defined here).
    require 'counter'
    consensus = Counter.new(grades).most_common(1)[0][0]

    return consensus.to_f if float_output

    "#{consensus.to_i - 1}th and #{consensus.to_i}th grade"
  end

  # Returns a hyphenation dictionary for +language+, building it on first use
  # and reusing it afterwards so per-word callers do not rebuild it each time.
  def self.hyphenator(language)
    @hyphenators ||= {}
    @hyphenators[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
  end

  # Loads the easy-word list once (one word per line, blank lines skipped)
  # and returns it as a Set.
  def self.easy_words
    @easy_words ||= begin
      require 'set'
      File.foreach(EASY_WORDS_PATH).map(&:strip).reject(&:empty?).to_set
    end
  end

  # Rounded and ceiled Integer forms of a metric value, in that order.
  def self.grade_bounds(value)
    [value.round.to_i, value.ceil.to_i]
  end

  # Maps a Flesch reading-ease score to the grade level(s) it votes for.
  # Scores of 90 and above (including those over 100, which very simple text
  # can reach) map to grade 5 rather than falling through to the hardest
  # bucket.
  def self.flesch_ease_grades(score)
    if score >= 90 then [5]
    elsif score >= 80 then [6]
    elsif score >= 70 then [7]
    elsif score >= 60 then [8, 9]
    elsif score >= 50 then [10]
    elsif score >= 40 then [11]
    elsif score >= 30 then [12]
    else [13]
    end
  end

  private_class_method :hyphenator, :easy_words, :grade_bounds, :flesch_ease_grades
end
@@ -0,0 +1,162 @@
1
+ require 'rspec'
2
+ require_relative '../lib/textstat.rb'
3
+
4
describe TextStat do
  # Multi-paragraph sample essay shared by every example; evaluated lazily
  # and memoized for the duration of a single example.
  let(:long_test) do
    'Playing ... games has always been thought to be ' \
    'important to the development of well-balanced and ' \
    'creative children; however, what part, if any, ' \
    'they should play in the lives of adults has never ' \
    'been researched that deeply. I believe that ' \
    'playing games is every bit as important for adults ' \
    'as for children. Not only is taking time out to ' \
    'play games with our children and other adults ' \
    'valuable to building interpersonal relationships ' \
    'but is also a wonderful way to release built up ' \
    "tension.\n" \
    "There's nothing my husband enjoys more after a " \
    'hard day of work than to come home and play a game ' \
    'of Chess with someone. This enables him to unwind ' \
    "from the day's activities and to discuss the highs " \
    'and lows of the day in a non-threatening, kick back ' \
    'environment. One of my most memorable wedding ' \
    'gifts, a Backgammon set, was received by a close ' \
    'friend. I asked him why in the world he had given ' \
    'us such a gift. He replied that he felt that an ' \
    'important aspect of marriage was for a couple to ' \
    'never quit playing games together. Over the years, ' \
    'as I have come to purchase and play, with other ' \
    'couples & coworkers, many games like: Monopoly, ' \
    'Chutes & Ladders, Mastermind, Dweebs, Geeks, & ' \
    'Weirdos, etc. I can reflect on the integral part ' \
    'they have played in our weekends and our ' \
    '"shut-off the T.V. and do something more ' \
    'stimulating" weeks. They have enriched my life and ' \
    'made it more interesting. Sadly, many adults ' \
    'forget that games even exist and have put them ' \
    'away in the cupboards, forgotten until the ' \
    "grandchildren come over.\n" \
    'All too often, adults get so caught up in working ' \
    'to pay the bills and keeping up with the ' \
    "\"Joneses'\" that they neglect to harness the fun " \
    'in life; the fun that can be the reward of ' \
    'enjoying a relaxing game with another person. It ' \
    'has been said that "man is that he might have ' \
    'joy" but all too often we skate through life ' \
    'without much of it. Playing games allows us to: ' \
    'relax, learn something new and stimulating, ' \
    'interact with people on a different more ' \
    'comfortable level, and to enjoy non-threatening ' \
    'competition. For these reasons, adults should ' \
    'place a higher priority on playing games in their ' \
    'lives'
  end

  context 'When testing the TextStat class' do
    # --- basic counts -----------------------------------------------------
    it 'should return the correct number of chars' do
      without_spaces = described_class.char_count(long_test)
      with_spaces = described_class.char_count(long_test, false)

      expect(without_spaces).to eql 1750
      expect(with_spaces).to eql 2123
    end

    it 'should return the correct number of lexicons' do
      stripped = described_class.lexicon_count(long_test)
      raw = described_class.lexicon_count(long_test, false)

      expect(stripped).to eql 372
      expect(raw).to eql 376
    end

    it 'should return the correct number of syllables' do
      expect(described_class.syllable_count(long_test)).to eql 559
    end

    it 'should return the correct number of sentences' do
      expect(described_class.sentence_count(long_test)).to eql 16
    end

    # --- averages ---------------------------------------------------------
    it 'should return the correct average sentence length' do
      expect(described_class.avg_sentence_length(long_test)).to eql 23.3
    end

    it 'should return the correct average syllables per word' do
      expect(described_class.avg_syllables_per_word(long_test)).to eql 1.5
    end

    it 'should return the correct average letters per word' do
      expect(described_class.avg_letter_per_word(long_test)).to eql 4.7
    end

    it 'should return the correct average sentence per word' do
      expect(described_class.avg_sentence_per_word(long_test)).to eql 0.04
    end

    # --- readability formulas ----------------------------------------------
    it 'should return the correct Flesch reading-ease test score' do
      expect(described_class.flesch_reading_ease(long_test)).to eql 56.29
    end

    it 'should return the correct Flesch–Kincaid grade' do
      expect(described_class.flesch_kincaid_grade(long_test)).to eql 11.2
    end

    it 'should return the correct number of polysyllab' do
      expect(described_class.polysyllab_count(long_test)).to eql 43
    end

    it 'should return the correct smog index' do
      expect(described_class.smog_index(long_test)).to eql 11.2
    end

    it 'should return the correct Coleman–Liau index' do
      expect(described_class.coleman_liau_index(long_test)).to eql 10.28
    end

    it 'should return the correct automated readability index' do
      expect(described_class.automated_readability_index(long_test)).to eql 12.3
    end

    it 'should return the correct linsear write formula result' do
      expect(described_class.linsear_write_formula(long_test)).to eql 14.5
    end

    it 'should return the correct difficult words result' do
      expect(described_class.difficult_words(long_test)).to eql 58
    end

    it 'should return the correct Dale–Chall readability score' do
      expect(described_class.dale_chall_readability_score(long_test)).to eql 4.79
    end

    it 'should return the correct Gunning fog score' do
      expect(described_class.gunning_fog(long_test)).to eql 11.32
    end

    it 'should return the correct Lix readability test score' do
      expect(described_class.lix(long_test)).to eql 45.11
    end

    # --- consensus ----------------------------------------------------------
    it 'should return the readability consensus score' do
      expect(described_class.text_standard(long_test)).to eql '10th and 11th grade'
    end
  end
end
+ end
metadata ADDED
@@ -0,0 +1,116 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: textstat
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jakub Polak
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-11-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: text-hyphen
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.4'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.4.1
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.4'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.4.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: bundler
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 2.0.a
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: 2.0.a
47
+ - !ruby/object:Gem::Dependency
48
+ name: rake
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '10.0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '10.0'
61
+ - !ruby/object:Gem::Dependency
62
+ name: rspec
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.0'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '3.0'
75
+ description:
76
+ email:
77
+ - jakub.polak.vz@gmail.com
78
+ executables: []
79
+ extensions: []
80
+ extra_rdoc_files: []
81
+ files:
82
+ - lib/counter.rb
83
+ - lib/easy_words.txt
84
+ - lib/textstat.rb
85
+ - lib/textstat/version.rb
86
+ - spec/textstat_spec.rb
87
+ homepage: https://github.com/kupolak/textstat
88
+ licenses:
89
+ - MIT
90
+ metadata:
91
+ homepage_uri: https://github.com/kupolak/textstat
92
+ source_code_uri: https://github.com/kupolak/textstat
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.7.8
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Ruby gem to calculate readability statistics of a text object - paragraphs,
113
+ sentences, articles
114
+ test_files:
115
+ - spec/textstat_spec.rb
116
+ - lib/easy_words.txt