textstat 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
# Namespace for the textstat gem; this file only contributes the version.
class TextStat
  # Gem version string. Frozen so callers cannot mutate the shared constant.
  VERSION = '0.1.0'.freeze
end
data/lib/textstat.rb ADDED
@@ -0,0 +1,293 @@
1
+ require 'text-hyphen'
2
+
3
# Readability statistics for plain-text passages.
#
# Every public method is a class method that takes the text to analyse as a
# String. Syllable counts are derived from hyphenation points reported by
# Text::Hyphen (text-hyphen gem).
class TextStat
  # Counts the characters in +text+.
  #
  # @param text [String]
  # @param ignore_spaces [Boolean] when truthy, space characters (' ') are
  #   not counted; other whitespace such as newlines still is
  # @return [Integer]
  def self.char_count(text, ignore_spaces = true)
    text = text.delete(' ') if ignore_spaces
    text.length
  end

  # Counts whitespace-separated words in +text+.
  #
  # @param text [String]
  # @param remove_punctuation [Boolean] when truthy, every character that is
  #   not an ASCII letter or whitespace is removed first, so tokens made only
  #   of digits or punctuation do not count as words
  # @return [Integer]
  def self.lexicon_count(text, remove_punctuation = true)
    text = text.gsub(/[^a-zA-Z\s]/, '') if remove_punctuation
    text.split(' ').size
  end

  # Counts syllables as (hyphenation points + 1) for every word.
  #
  # Punctuation and digits are stripped before hyphenation so that "games,"
  # and "games" give the same result and punctuation-only tokens contribute
  # nothing. Letters outside ASCII are kept so non-English dictionaries still
  # receive whole words.
  #
  # @param text [String]
  # @param language [String] dictionary name passed to Text::Hyphen
  # @return [Integer]
  def self.syllable_count(text, language = 'en_us')
    return 0 if text.empty?

    words = text.downcase.gsub(/[^[:alpha:]\s]/, '').split(' ')
    dictionary = hyphenator_for(language)
    words.sum { |word| dictionary.visualise(word).count('-') + 1 }
  end

  # Estimates the number of sentences.
  #
  # A boundary is terminal punctuation (. ? !), optionally followed by closing
  # quotes or brackets, then a space, pipe or newline, then a capital letter.
  # The result is boundaries + 1, so it is never less than 1 (even for "").
  #
  # @param text [String]
  # @return [Integer]
  def self.sentence_count(text)
    text.scan(/[.?!]['"\\)\]]*[ |\n][A-Z]/).size + 1
  end

  # Average number of words per sentence, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.avg_sentence_length(text)
    # sentence_count is always >= 1, so no zero guard is needed here.
    (lexicon_count(text).to_f / sentence_count(text)).round(1)
  end

  # Average number of syllables per word, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_syllables_per_word(text)
    words = lexicon_count(text)
    # Float division never raises ZeroDivisionError; guard explicitly so the
    # caller gets 0.0 instead of NaN/Infinity.
    return 0.0 if words.zero?

    (syllable_count(text).to_f / words).round(1)
  end

  # Average number of non-space characters per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_letter_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (char_count(text).to_f / words).round(2)
  end

  # Average number of sentences per word, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.avg_sentence_per_word(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    (sentence_count(text).to_f / words).round(2)
  end

  # Flesch reading-ease score, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.flesch_reading_ease(text)
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text)
    (206.835 - (1.015 * sentence_length) - (84.6 * syllables_per_word)).round(2)
  end

  # Flesch-Kincaid grade level, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float]
  def self.flesch_kincaid_grade(text)
    sentence_length = avg_sentence_length(text)
    syllables_per_word = avg_syllables_per_word(text)
    ((0.39 * sentence_length) + (11.8 * syllables_per_word) - 15.59).round(1)
  end

  # Number of whitespace-separated words with three or more syllables.
  #
  # @param text [String]
  # @return [Integer]
  def self.polysyllab_count(text)
    text.split(' ').count { |word| syllable_count(word) >= 3 }
  end

  # SMOG index, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when fewer than three sentences are detected
  def self.smog_index(text)
    sentences = sentence_count(text)
    return 0.0 if sentences < 3

    polysyllables = polysyllab_count(text)
    # 30.0 forces float division; integer division would truncate the
    # polysyllables-per-sentence ratio before the square root.
    (1.043 * Math.sqrt(30.0 * polysyllables / sentences) + 3.1291).round(1)
  end

  # Coleman-Liau index, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float]
  def self.coleman_liau_index(text)
    letters = (avg_letter_per_word(text) * 100).round(2)
    sentences = (avg_sentence_per_word(text) * 100).round(2)
    # NOTE(review): the published formula uses 0.0588 for the letters term;
    # 0.058 is kept so existing results do not shift - confirm intent.
    ((0.058 * letters) - (0.296 * sentences) - 15.8).round(2)
  end

  # Automated readability index, rounded to one decimal.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.automated_readability_index(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    chars_per_word = char_count(text).to_f / words
    words_per_sentence = words.to_f / sentence_count(text)
    ((4.71 * chars_per_word.round(2) + (0.5 * words_per_sentence.round(2))) - 21.43).round(1)
  end

  # Linsear Write formula computed over the first 100 words of +text+.
  #
  # Words with fewer than three syllables score 1, all others score 3; the
  # total is divided by the number of sentences in the sample.
  #
  # @param text [String]
  # @return [Float]
  def self.linsear_write_formula(text)
    sample = text.split(' ').first(100)
    difficult = sample.count { |word| syllable_count(word) >= 3 }
    easy = sample.size - difficult

    # to_f before dividing: integer division would drop the fractional part.
    number = (easy + difficult * 3).to_f / sentence_count(sample.join(' '))
    number -= 2 if number <= 20
    number / 2
  end

  # Number of distinct "difficult" words: words that are not in the bundled
  # easy-word list and have more than one syllable.
  #
  # @param text [String]
  # @return [Integer]
  def self.difficult_words(text)
    # \s (not just ' ') is preserved so words separated only by a newline are
    # not fused into one token.
    words = text.downcase.gsub(/[^0-9a-z\s]/, '').split(' ')
    known = easy_words
    words.uniq.count { |word| !known.include?(word) && syllable_count(word) > 1 }
  end

  # Dale-Chall readability score, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.dale_chall_readability_score(text)
    word_count = lexicon_count(text)
    return 0.0 if word_count.zero?

    easy_pct = (word_count - difficult_words(text)).to_f / word_count * 100
    difficult_pct = 100 - easy_pct
    score = (0.1579 * difficult_pct) + (0.0496 * avg_sentence_length(text))
    score += 3.6365 if difficult_pct > 5
    score.round(2)
  end

  # Gunning fog score, rounded to two decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no words
  def self.gunning_fog(text)
    words = lexicon_count(text)
    return 0.0 if words.zero?

    # to_f before dividing: with integers the ratio truncates to 0 whenever
    # there are fewer difficult words than words.
    per_diff_words = (difficult_words(text).to_f / words * 100) + 5
    (0.4 * (avg_sentence_length(text) + per_diff_words)).round(2)
  end

  # LIX score: average sentence length plus the percentage of
  # whitespace-separated tokens longer than six characters, rounded to two
  # decimals.
  #
  # @param text [String]
  # @return [Float] 0.0 when the text contains no tokens
  def self.lix(text)
    words = text.split(' ')
    return 0.0 if words.empty?

    long_words = words.count { |word| word.length > 6 }
    per_long_words = (long_words * 100).to_f / words.length
    (avg_sentence_length(text) + per_long_words).round(2)
  end

  # Readability consensus: the most common grade among the rounded and
  # ceiled results of the individual tests.
  #
  # @param text [String]
  # @param float_output [Boolean, nil] when truthy, return the grade as Float
  # @return [Float, String] the grade, or a "Nth and Mth grade" description
  def self.text_standard(text, float_output = nil)
    # Each metric is evaluated once; the append order matches the historical
    # implementation in case Counter breaks ties by insertion order.
    grade = grade_bounds(flesch_kincaid_grade(text))
    grade.concat(flesch_ease_grades(flesch_reading_ease(text)))
    [
      smog_index(text),
      coleman_liau_index(text),
      automated_readability_index(text),
      dale_chall_readability_score(text),
      linsear_write_formula(text),
      gunning_fog(text)
    ].each { |value| grade.concat(grade_bounds(value)) }

    # Counter is loaded from the load path (the gem metadata lists
    # lib/counter.rb); required lazily, as before.
    require 'counter'
    score = Counter.new(grade).most_common(1)[0][0]

    return score.to_f if float_output

    "#{score.to_i - 1}th and #{score.to_i}th grade"
  end

  # Returns a memoised Text::Hyphen instance for +language+ so the
  # dictionary is not rebuilt for every word.
  def self.hyphenator_for(language)
    @hyphenators ||= {}
    @hyphenators[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
  end

  # Loads (once) the easy-word list stored next to this source file.
  def self.easy_words
    @easy_words ||= begin
      require 'set'
      # Resolve relative to this file, not the process working directory.
      path = File.join(__dir__, 'easy_words.txt')
      # chomp (not chop) so a final line without a newline keeps its last letter.
      Set.new(File.foreach(path).map(&:chomp))
    end
  end

  # Rounded and ceiled integer grades for one metric value.
  def self.grade_bounds(value)
    [value.round, value.ceil]
  end

  # Maps a Flesch reading-ease score to the grade(s) it votes for.
  # NOTE(review): scores of 100 or more fall through to 13, as before.
  def self.flesch_ease_grades(score)
    case score
    when 90...100 then [5]
    when 80...90 then [6]
    when 70...80 then [7]
    when 60...70 then [8, 9]
    when 50...60 then [10]
    when 40...50 then [11]
    when 30...40 then [12]
    else [13]
    end
  end

  private_class_method :hyphenator_for, :easy_words, :grade_bounds, :flesch_ease_grades
end
@@ -0,0 +1,162 @@
1
+ require 'rspec'
2
+ require_relative '../lib/textstat.rb'
3
+
4
# Specs for the TextStat readability helpers. Every example runs against the
# same multi-paragraph essay and pins the value the library returns for it.
describe TextStat do
  # Shared fixture: three paragraphs separated by "\n".
  let(:long_test) do
    'Playing ... games has always been thought to be ' \
    'important to the development of well-balanced and ' \
    'creative children; however, what part, if any, ' \
    'they should play in the lives of adults has never ' \
    'been researched that deeply. I believe that ' \
    'playing games is every bit as important for adults ' \
    'as for children. Not only is taking time out to ' \
    'play games with our children and other adults ' \
    'valuable to building interpersonal relationships ' \
    'but is also a wonderful way to release built up ' \
    "tension.\n" \
    "There's nothing my husband enjoys more after a " \
    'hard day of work than to come home and play a game ' \
    'of Chess with someone. This enables him to unwind ' \
    "from the day's activities and to discuss the highs " \
    'and lows of the day in a non-threatening, kick back ' \
    'environment. One of my most memorable wedding ' \
    'gifts, a Backgammon set, was received by a close ' \
    'friend. I asked him why in the world he had given ' \
    'us such a gift. He replied that he felt that an ' \
    'important aspect of marriage was for a couple to ' \
    'never quit playing games together. Over the years, ' \
    'as I have come to purchase and play, with other ' \
    'couples & coworkers, many games like: Monopoly, ' \
    'Chutes & Ladders, Mastermind, Dweebs, Geeks, & ' \
    'Weirdos, etc. I can reflect on the integral part ' \
    'they have played in our weekends and our ' \
    '"shut-off the T.V. and do something more ' \
    'stimulating" weeks. They have enriched my life and ' \
    'made it more interesting. Sadly, many adults ' \
    'forget that games even exist and have put them ' \
    'away in the cupboards, forgotten until the ' \
    "grandchildren come over.\n" \
    'All too often, adults get so caught up in working ' \
    'to pay the bills and keeping up with the ' \
    "\"Joneses'\" that they neglect to harness the fun " \
    'in life; the fun that can be the reward of ' \
    'enjoying a relaxing game with another person. It ' \
    'has been said that "man is that he might have ' \
    'joy" but all too often we skate through life ' \
    'without much of it. Playing games allows us to: ' \
    'relax, learn something new and stimulating, ' \
    'interact with people on a different more ' \
    'comfortable level, and to enjoy non-threatening ' \
    'competition. For these reasons, adults should ' \
    'place a higher priority on playing games in their ' \
    'lives'
  end

  context 'When testing the TextStat class' do
    it 'should return the correct number of chars' do
      expect(described_class.char_count(long_test)).to eql 1750
      expect(described_class.char_count(long_test, false)).to eql 2123
    end

    it 'should return the correct number of lexicons' do
      expect(described_class.lexicon_count(long_test)).to eql 372
      expect(described_class.lexicon_count(long_test, false)).to eql 376
    end

    it 'should return the correct number of syllables' do
      expect(described_class.syllable_count(long_test)).to eql 559
    end

    it 'should return the correct number of sentences' do
      expect(described_class.sentence_count(long_test)).to eql 16
    end

    it 'should return the correct average sentence length' do
      expect(described_class.avg_sentence_length(long_test)).to eql 23.3
    end

    it 'should return the correct average syllables per word' do
      expect(described_class.avg_syllables_per_word(long_test)).to eql 1.5
    end

    it 'should return the correct average letters per word' do
      expect(described_class.avg_letter_per_word(long_test)).to eql 4.7
    end

    it 'should return the correct average sentence per word' do
      expect(described_class.avg_sentence_per_word(long_test)).to eql 0.04
    end

    it 'should return the correct Flesch reading-ease test score' do
      expect(described_class.flesch_reading_ease(long_test)).to eql 56.29
    end

    it 'should return the correct Flesch–Kincaid grade' do
      expect(described_class.flesch_kincaid_grade(long_test)).to eql 11.2
    end

    it 'should return the correct number of polysyllab' do
      expect(described_class.polysyllab_count(long_test)).to eql 43
    end

    it 'should return the correct smog index' do
      expect(described_class.smog_index(long_test)).to eql 11.2
    end

    it 'should return the correct Coleman–Liau index' do
      expect(described_class.coleman_liau_index(long_test)).to eql 10.28
    end

    it 'should return the correct automated readability index' do
      expect(described_class.automated_readability_index(long_test)).to eql 12.3
    end

    it 'should return the correct linsear write formula result' do
      expect(described_class.linsear_write_formula(long_test)).to eql 14.5
    end

    it 'should return the correct difficult words result' do
      expect(described_class.difficult_words(long_test)).to eql 58
    end

    it 'should return the correct Dale–Chall readability score' do
      expect(described_class.dale_chall_readability_score(long_test)).to eql 4.79
    end

    it 'should return the correct Gunning fog score' do
      expect(described_class.gunning_fog(long_test)).to eql 11.32
    end

    it 'should return the correct Lix readability test score' do
      expect(described_class.lix(long_test)).to eql 45.11
    end

    it 'should return the readability consensus score' do
      expect(described_class.text_standard(long_test)).to eql '10th and 11th grade'
    end
  end
end
metadata ADDED
@@ -0,0 +1,116 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: textstat
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jakub Polak
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-11-12 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: text-hyphen
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.4'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.4.1
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.4'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.4.1
33
+ - !ruby/object:Gem::Dependency
34
+ name: bundler
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 2.0.a
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: 2.0.a
47
+ - !ruby/object:Gem::Dependency
48
+ name: rake
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '10.0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '10.0'
61
+ - !ruby/object:Gem::Dependency
62
+ name: rspec
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '3.0'
68
+ type: :development
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '3.0'
75
+ description:
76
+ email:
77
+ - jakub.polak.vz@gmail.com
78
+ executables: []
79
+ extensions: []
80
+ extra_rdoc_files: []
81
+ files:
82
+ - lib/counter.rb
83
+ - lib/easy_words.txt
84
+ - lib/textstat.rb
85
+ - lib/textstat/version.rb
86
+ - spec/textstat_spec.rb
87
+ homepage: https://github.com/kupolak/textstat
88
+ licenses:
89
+ - MIT
90
+ metadata:
91
+ homepage_uri: https://github.com/kupolak/textstat
92
+ source_code_uri: https://github.com/kupolak/textstat
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.7.8
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Ruby gem to calculate readability statistics of a text object - paragraphs,
113
+ sentences, articles
114
+ test_files:
115
+ - spec/textstat_spec.rb
116
+ - lib/easy_words.txt