textstat 0.1.8 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/textstat.rb CHANGED
@@ -1,309 +1,36 @@
1
- require 'text-hyphen'
2
-
3
- class TextStat
4
- GEM_PATH = File.dirname(File.dirname(__FILE__))
5
-
6
- def self.char_count(text, ignore_spaces = true)
7
- text = text.delete(' ') if ignore_spaces
8
- text.length
9
- end
10
-
11
- def self.lexicon_count(text, remove_punctuation = true)
12
- text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
13
- count = text.split(' ').count
14
- count
15
- end
16
-
17
- def self.syllable_count(text, language = 'en_us')
18
- return 0 if text.empty?
19
-
20
- text = text.downcase
21
- text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
22
- dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
23
- count = 0
24
- text.split(' ').each do |word|
25
- word_hyphenated = dictionary.visualise(word)
26
- count += word_hyphenated.count('-') + 1
27
- end
28
- count
29
- end
30
-
31
- def self.sentence_count(text)
32
- text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
33
- end
34
-
35
- def self.avg_sentence_length(text)
36
- asl = lexicon_count(text).to_f / sentence_count(text)
37
- asl.round(1)
38
- rescue ZeroDivisionError
39
- 0.0
40
- end
41
-
42
- def self.avg_syllables_per_word(text, language = 'en_us')
43
- syllable = syllable_count(text, language)
44
- words = lexicon_count(text)
45
- begin
46
- syllables_per_word = syllable.to_f / words
47
- syllables_per_word.round(1)
48
- rescue ZeroDivisionError
49
- 0.0
50
- end
51
- end
52
-
53
- def self.avg_letter_per_word(text)
54
- letters_per_word = char_count(text).to_f / lexicon_count(text)
55
- letters_per_word.round(2)
56
- rescue ZeroDivisionError
57
- 0.0
58
- end
59
-
60
- def self.avg_sentence_per_word(text)
61
- sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
62
- sentence_per_word.round(2)
63
- rescue ZeroDivisionError
64
- 0.0
65
- end
66
-
67
- def self.flesch_reading_ease(text, language = 'en_us')
68
- sentence_length = avg_sentence_length(text)
69
- syllables_per_word = avg_syllables_per_word(text, language)
70
- flesch = 206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word
71
- flesch.round(2)
72
- end
73
-
74
- def self.flesch_kincaid_grade(text, language = 'en_us')
75
- sentence_length = avg_sentence_length(text)
76
- syllables_per_word = avg_syllables_per_word(text, language)
77
- flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
78
- flesch.round(1)
79
- end
80
-
81
- def self.polysyllab_count(text, language = 'en_us')
82
- count = 0
83
- text.split(' ').each do |word|
84
- w = syllable_count(word, language)
85
- count += 1 if w >= 3
86
- end
87
- count
88
- end
89
-
90
- def self.smog_index(text, language = 'en_us')
91
- sentences = sentence_count(text)
92
-
93
- if sentences >= 3
94
- begin
95
- polysyllab = polysyllab_count(text, language)
96
- smog = 1.043 * Math.sqrt(30.0 * polysyllab / sentences) + 3.1291
97
- smog.round(1)
98
- rescue ZeroDivisionError
99
- 0.0
100
- end
101
- else
102
- 0.0
103
- end
104
- end
105
-
106
- def self.coleman_liau_index(text)
107
- letters = (avg_letter_per_word(text) * 100).round(2)
108
- sentences = (avg_sentence_per_word(text) * 100).round(2)
109
- coleman = 0.0588 * letters - 0.296 * sentences - 15.8
110
- coleman.round(2)
111
- end
112
-
113
- def self.automated_readability_index(text)
114
- chars = char_count(text)
115
- words = lexicon_count(text)
116
- sentences = sentence_count(text)
117
- begin
118
- a = chars.to_f / words
119
- b = words.to_f / sentences
120
-
121
- readability = 4.71 * a + 0.5 * b - 21.43
122
- readability.round(1)
123
- rescue ZeroDivisionError
124
- 0.0
125
- end
126
- end
127
-
128
- def self.linsear_write_formula(text, language = 'en_us')
129
- easy_word = 0
130
- difficult_word = 0
131
- text_list = text.split(' ')[0..100]
132
-
133
- text_list.each do |word|
134
- if syllable_count(word, language) < 3
135
- easy_word += 1
136
- else
137
- difficult_word += 1
138
- end
139
- end
140
-
141
- text = text_list.join(' ')
142
-
143
- number = (easy_word * 1 + difficult_word * 3).to_f / sentence_count(text)
144
- number -= 2 if number <= 20
145
- number / 2
146
- end
147
-
148
- def self.difficult_words(text, language = 'en_us')
149
- require 'set'
150
- easy_words = Set.new
151
- File.read(File.join(dictionary_path, "#{language}.txt")).each_line do |line|
152
- easy_words << line.chop
153
- end
154
-
155
- text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
156
- diff_words_set = Set.new
157
- text_list.each do |value|
158
- next if easy_words.include? value
159
-
160
- diff_words_set.add(value) if syllable_count(value, language) > 1
161
- end
162
- diff_words_set.length
163
- end
164
-
165
- def self.dale_chall_readability_score(text, language = 'en_us')
166
- word_count = lexicon_count(text)
167
- count = word_count - difficult_words(text, language)
168
-
169
- begin
170
- per = 100.0 * count / word_count
171
- rescue ZeroDivisionError
172
- return 0.0
173
- end
174
-
175
- difficult_words = 100 - per
176
- score = 0.1579 * difficult_words + 0.0496 * avg_sentence_length(text)
177
- score += 3.6365 if difficult_words > 5
178
-
179
- score.round(2)
180
- end
181
-
182
- def self.gunning_fog(text, language = 'en_us')
183
- per_diff_words = 100.0 * difficult_words(text, language) / lexicon_count(text) + 5
184
- grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
185
-
186
- grade.round(2)
187
- rescue ZeroDivisionError
188
- 0.0
189
- end
190
-
191
- def self.lix(text)
192
- words = text.split(' ')
193
- words_length = words.length
194
- long_words = words.count { |word| word.length > 6 }
195
-
196
- per_long_words = 100.0 * long_words / words_length
197
- asl = avg_sentence_length(text)
198
- lix = asl + per_long_words
199
-
200
- lix.round(2)
201
- end
202
-
203
- def self.forcast(text, language = 'en_us')
204
- words = text.split(' ')[0..149]
205
- words_with_one_syllabe = words.count {
206
- |word| syllable_count(word, language) == 1
207
- }
208
- forcast = 20 - (words_with_one_syllabe / 10)
209
- forcast
210
- end
211
-
212
- def self.powers_sumner_kearl(text, language = 'en_us')
213
- grade = 0.0778 * avg_sentence_length(text) + 0.0455 * syllable_count(text, language) - 2.2029
214
- grade.round(2)
215
- end
216
-
217
- def self.spache(text, language = 'en_us')
218
- words = text.split(' ').count
219
- unfamiliar_words = difficult_words(text, language) / words
220
- grade = (0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_words) + 0.839
221
- grade.round(2)
222
- end
223
-
224
- def self.text_standard(text, float_output=nil)
225
- grade = []
226
-
227
- lower = flesch_kincaid_grade(text).round
228
- upper = flesch_kincaid_grade(text).ceil
229
- grade.append(lower.to_i)
230
- grade.append(upper.to_i)
231
-
232
- # Appending Flesch Reading Easy
233
- score = flesch_reading_ease(text)
234
- if score < 100 && score >= 90
235
- grade.append(5)
236
- elsif score < 90 && score >= 80
237
- grade.append(6)
238
- elsif score < 80 && score >= 70
239
- grade.append(7)
240
- elsif score < 70 && score >= 60
241
- grade.append(8)
242
- grade.append(9)
243
- elsif score < 60 && score >= 50
244
- grade.append(10)
245
- elsif score < 50 && score >= 40
246
- grade.append(11)
247
- elsif score < 40 && score >= 30
248
- grade.append(12)
249
- else
250
- grade.append(13)
251
- end
252
-
253
- # Appending SMOG Index
254
- lower = smog_index(text).round
255
- upper = smog_index(text).ceil
256
- grade.append(lower.to_i)
257
- grade.append(upper.to_i)
258
-
259
- # Appending Coleman_Liau_Index
260
- lower = coleman_liau_index(text).round
261
- upper = coleman_liau_index(text).ceil
262
- grade.append(lower.to_i)
263
- grade.append(upper.to_i)
264
-
265
- # Appending Automated_Readability_Index
266
- lower = automated_readability_index(text).round
267
- upper = automated_readability_index(text).ceil
268
- grade.append(lower.to_i)
269
- grade.append(upper.to_i)
270
-
271
- # Appending Dale_Chall_Readability_Score
272
- lower = dale_chall_readability_score(text).round
273
- upper = dale_chall_readability_score(text).ceil
274
- grade.append(lower.to_i)
275
- grade.append(upper.to_i)
276
-
277
- # Appending Linsear_Write_Formula
278
- lower = linsear_write_formula(text).round
279
- upper = linsear_write_formula(text).ceil
280
- grade.append(lower.to_i)
281
- grade.append(upper.to_i)
282
-
283
- # Appending Gunning Fog Index
284
- lower = gunning_fog(text).round
285
- upper = gunning_fog(text).ceil
286
- grade.append(lower.to_i)
287
- grade.append(upper.to_i)
288
-
289
- # Finding the Readability Consensus based upon all the above tests
290
- require 'counter'
291
- d = Counter.new(grade)
292
- final_grade = d.most_common(1)
293
- score = final_grade[0][0]
294
-
295
- if float_output
296
- score.to_f
297
- else
298
- "#{score.to_i - 1}th and #{score.to_i}th grade"
299
- end
300
- end
301
-
302
- def self.dictionary_path=(path)
303
- @dictionary_path = path
304
- end
305
-
306
- def self.dictionary_path
307
- @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
308
- end
309
- end
1
+ # TextStat - Ruby gem for text readability analysis
2
+ #
3
+ # @author Jakub Polak
4
+ # @version 1.0.0
5
+ # @since 0.1.0
6
+ #
7
+ # TextStat is a Ruby gem that calculates statistics from text to determine
8
+ # readability, complexity and grade level of a particular corpus.
9
+ #
10
+ # @example Basic usage
11
+ # require 'textstat'
12
+ #
13
+ # text = "This is a sample text for analysis."
14
+ # TextStat.flesch_reading_ease(text) # => 83.32
15
+ # TextStat.difficult_words(text) # => 1
16
+ # TextStat.text_standard(text) # => "6th and 7th grade"
17
+ #
18
+ # @example Performance optimization with caching
19
+ # # Dictionary caching provides 36x performance improvement
20
+ # TextStat.difficult_words(text, 'en_us') # First call loads dictionary
21
+ # TextStat.difficult_words(text, 'en_us') # Subsequent calls use cache
22
+ #
23
+ # # Check cache status
24
+ # TextStat::DictionaryManager.cache_size # => 1
25
+ # TextStat::DictionaryManager.cached_languages # => ['en_us']
26
+ #
27
+ # @see https://github.com/kupolak/textstat
28
+ # @see CHANGELOG.md
29
+
30
+ require_relative 'textstat/main'
31
+
32
+ # For backward compatibility, this file now just loads the new modular structure
33
+ # All functionality has been moved to separate modules:
34
+ # - TextStat::BasicStats - basic text statistics
35
+ # - TextStat::DictionaryManager - dictionary management with caching
36
+ # - TextStat::ReadabilityFormulas - readability calculation formulas
metadata CHANGED
@@ -1,23 +1,20 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textstat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jakub Polak
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-05-15 00:00:00.000000000 Z
11
+ date: 2025-07-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: text-hyphen
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '1.4'
20
- - - ">="
21
18
  - !ruby/object:Gem::Version
22
19
  version: 1.4.1
23
20
  type: :runtime
@@ -25,54 +22,233 @@ dependencies:
25
22
  version_requirements: !ruby/object:Gem::Requirement
26
23
  requirements:
27
24
  - - "~>"
28
- - !ruby/object:Gem::Version
29
- version: '1.4'
30
- - - ">="
31
25
  - !ruby/object:Gem::Version
32
26
  version: 1.4.1
33
27
  - !ruby/object:Gem::Dependency
34
28
  name: bundler
35
29
  requirement: !ruby/object:Gem::Requirement
36
30
  requirements:
37
- - - "~>"
31
+ - - ">="
38
32
  - !ruby/object:Gem::Version
39
- version: 2.0.a
33
+ version: '2.0'
40
34
  type: :development
41
35
  prerelease: false
42
36
  version_requirements: !ruby/object:Gem::Requirement
43
37
  requirements:
44
- - - "~>"
38
+ - - ">="
45
39
  - !ruby/object:Gem::Version
46
- version: 2.0.a
40
+ version: '2.0'
47
41
  - !ruby/object:Gem::Dependency
48
42
  name: rake
49
43
  requirement: !ruby/object:Gem::Requirement
50
44
  requirements:
51
45
  - - "~>"
52
46
  - !ruby/object:Gem::Version
53
- version: '13.0'
47
+ version: '13.3'
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
51
  requirements:
58
52
  - - "~>"
59
53
  - !ruby/object:Gem::Version
60
- version: '13.0'
54
+ version: '13.3'
61
55
  - !ruby/object:Gem::Dependency
62
56
  name: rspec
63
57
  requirement: !ruby/object:Gem::Requirement
64
58
  requirements:
65
59
  - - "~>"
66
60
  - !ruby/object:Gem::Version
67
- version: '3.0'
61
+ version: '3.13'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.13'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.22'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.22'
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov-lcov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.8'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.8'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rubocop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.69'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.69'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rubocop-performance
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.23'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.23'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rubocop-rake
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.6'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.6'
139
+ - !ruby/object:Gem::Dependency
140
+ name: rubocop-rspec
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '2.31'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '2.31'
153
+ - !ruby/object:Gem::Dependency
154
+ name: rubocop-thread_safety
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '0.6'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '0.6'
167
+ - !ruby/object:Gem::Dependency
168
+ name: redcarpet
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - "~>"
172
+ - !ruby/object:Gem::Version
173
+ version: '3.6'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '3.6'
181
+ - !ruby/object:Gem::Dependency
182
+ name: yard
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - "~>"
186
+ - !ruby/object:Gem::Version
187
+ version: '0.9'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - "~>"
193
+ - !ruby/object:Gem::Version
194
+ version: '0.9'
195
+ - !ruby/object:Gem::Dependency
196
+ name: benchmark-ips
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - "~>"
200
+ - !ruby/object:Gem::Version
201
+ version: '2.14'
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - "~>"
207
+ - !ruby/object:Gem::Version
208
+ version: '2.14'
209
+ - !ruby/object:Gem::Dependency
210
+ name: memory_profiler
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '1.1'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '1.1'
223
+ - !ruby/object:Gem::Dependency
224
+ name: bundler-audit
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - "~>"
228
+ - !ruby/object:Gem::Version
229
+ version: '0.9'
68
230
  type: :development
69
231
  prerelease: false
70
232
  version_requirements: !ruby/object:Gem::Requirement
71
233
  requirements:
72
234
  - - "~>"
73
235
  - !ruby/object:Gem::Version
74
- version: '3.0'
75
- description:
236
+ version: '0.9'
237
+ - !ruby/object:Gem::Dependency
238
+ name: brakeman
239
+ requirement: !ruby/object:Gem::Requirement
240
+ requirements:
241
+ - - "~>"
242
+ - !ruby/object:Gem::Version
243
+ version: '6.2'
244
+ type: :development
245
+ prerelease: false
246
+ version_requirements: !ruby/object:Gem::Requirement
247
+ requirements:
248
+ - - "~>"
249
+ - !ruby/object:Gem::Version
250
+ version: '6.2'
251
+ description:
76
252
  email:
77
253
  - jakub.polak.vz@gmail.com
78
254
  executables: []
@@ -82,18 +258,40 @@ files:
82
258
  - lib/counter.rb
83
259
  - lib/dictionaries/ca.txt
84
260
  - lib/dictionaries/cs.txt
261
+ - lib/dictionaries/da.txt
262
+ - lib/dictionaries/de.txt
263
+ - lib/dictionaries/en_uk.txt
85
264
  - lib/dictionaries/en_us.txt
265
+ - lib/dictionaries/es.txt
266
+ - lib/dictionaries/et.txt
267
+ - lib/dictionaries/fi.txt
268
+ - lib/dictionaries/fr.txt
269
+ - lib/dictionaries/hr.txt
270
+ - lib/dictionaries/hu.txt
271
+ - lib/dictionaries/id.txt
272
+ - lib/dictionaries/is.txt
273
+ - lib/dictionaries/it.txt
274
+ - lib/dictionaries/la.txt
86
275
  - lib/dictionaries/nl.txt
276
+ - lib/dictionaries/no2.txt
277
+ - lib/dictionaries/pl.txt
278
+ - lib/dictionaries/pt.txt
279
+ - lib/dictionaries/ru.txt
280
+ - lib/dictionaries/sv.txt
87
281
  - lib/textstat.rb
282
+ - lib/textstat/basic_stats.rb
283
+ - lib/textstat/dictionary_manager.rb
284
+ - lib/textstat/main.rb
285
+ - lib/textstat/readability_formulas.rb
88
286
  - lib/textstat/version.rb
89
- - spec/textstat_spec.rb
90
287
  homepage: https://github.com/kupolak/textstat
91
288
  licenses:
92
289
  - MIT
93
290
  metadata:
94
291
  homepage_uri: https://github.com/kupolak/textstat
95
292
  source_code_uri: https://github.com/kupolak/textstat
96
- post_install_message:
293
+ rubygems_mfa_required: 'true'
294
+ post_install_message:
97
295
  rdoc_options: []
98
296
  require_paths:
99
297
  - lib
@@ -108,11 +306,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
108
306
  - !ruby/object:Gem::Version
109
307
  version: '0'
110
308
  requirements: []
111
- rubygems_version: 3.2.17
112
- signing_key:
309
+ rubygems_version: 3.5.22
310
+ signing_key:
113
311
  specification_version: 4
114
312
  summary: Ruby gem to calculate readability statistics of a text object - paragraphs,
115
313
  sentences, articles
116
- test_files:
117
- - spec/textstat_spec.rb
118
- - lib/dictionaries/en_us.txt
314
+ test_files: []