textstat 0.1.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/textstat.rb CHANGED
@@ -1,313 +1,36 @@
1
- require 'text-hyphen'
2
-
3
- class TextStat
4
- GEM_PATH = File.dirname(File.dirname(__FILE__))
5
-
6
- def self.char_count(text, ignore_spaces = true)
7
- text = text.delete(' ') if ignore_spaces
8
- text.length
9
- end
10
-
11
- def self.lexicon_count(text, remove_punctuation = true)
12
- text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
13
- count = text.split(' ').count
14
- count
15
- end
16
-
17
- def self.syllable_count(text, language = 'en_us')
18
- return 0 if text.empty?
19
-
20
- text = text.downcase
21
- text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
22
- dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
23
- count = 0
24
- text.split(' ').each do |word|
25
- word_hyphenated = dictionary.visualise(word)
26
- count += word_hyphenated.count('-') + 1
27
- end
28
- count
29
- end
30
-
31
- def self.sentence_count(text)
32
- text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
33
- end
34
-
35
- def self.avg_sentence_length(text)
36
- asl = lexicon_count(text).to_f / sentence_count(text)
37
- asl.round(1)
38
- rescue ZeroDivisionError
39
- 0.0
40
- end
41
-
42
- def self.avg_syllables_per_word(text, language = 'en_us')
43
- syllable = syllable_count(text, language)
44
- words = lexicon_count(text)
45
- begin
46
- syllables_per_word = syllable.to_f / words
47
- syllables_per_word.round(1)
48
- rescue ZeroDivisionError
49
- 0.0
50
- end
51
- end
52
-
53
- def self.avg_letter_per_word(text)
54
- letters_per_word = char_count(text).to_f / lexicon_count(text)
55
- letters_per_word.round(2)
56
- rescue ZeroDivisionError
57
- 0.0
58
- end
59
-
60
- def self.avg_sentence_per_word(text)
61
- sentence_per_word = sentence_count(text).to_f / lexicon_count(text)
62
- sentence_per_word.round(2)
63
- rescue ZeroDivisionError
64
- 0.0
65
- end
66
-
67
- def self.flesch_reading_ease(text, language = 'en_us')
68
- sentence_length = avg_sentence_length(text)
69
- syllables_per_word = avg_syllables_per_word(text, language)
70
- flesch = 206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word
71
- flesch.round(2)
72
- end
73
-
74
- def self.flesch_kincaid_grade(text, language = 'en_us')
75
- sentence_length = avg_sentence_length(text)
76
- syllables_per_word = avg_syllables_per_word(text, language)
77
- flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
78
- flesch.round(1)
79
- end
80
-
81
- def self.polysyllab_count(text, language = 'en_us')
82
- count = 0
83
- text.split(' ').each do |word|
84
- w = syllable_count(word, language)
85
- count += 1 if w >= 3
86
- end
87
- count
88
- end
89
-
90
- def self.smog_index(text, language = 'en_us')
91
- sentences = sentence_count(text)
92
-
93
- if sentences >= 3
94
- begin
95
- polysyllab = polysyllab_count(text, language)
96
- smog = 1.043 * Math.sqrt(30.0 * polysyllab / sentences) + 3.1291
97
- smog.round(1)
98
- rescue ZeroDivisionError
99
- 0.0
100
- end
101
- else
102
- 0.0
103
- end
104
- end
105
-
106
- def self.coleman_liau_index(text)
107
- letters = (avg_letter_per_word(text) * 100).round(2)
108
- sentences = (avg_sentence_per_word(text) * 100).round(2)
109
- coleman = 0.0588 * letters - 0.296 * sentences - 15.8
110
- coleman.round(2)
111
- end
112
-
113
- def self.automated_readability_index(text)
114
- chars = char_count(text)
115
- words = lexicon_count(text)
116
- sentences = sentence_count(text)
117
- begin
118
- a = chars.to_f / words
119
- b = words.to_f / sentences
120
-
121
- readability = 4.71 * a + 0.5 * b - 21.43
122
- readability.round(1)
123
- rescue ZeroDivisionError
124
- 0.0
125
- end
126
- end
127
-
128
- def self.linsear_write_formula(text, language = 'en_us')
129
- easy_word = 0
130
- difficult_word = 0
131
- text_list = text.split(' ')[0..100]
132
-
133
- text_list.each do |word|
134
- if syllable_count(word, language) < 3
135
- easy_word += 1
136
- else
137
- difficult_word += 1
138
- end
139
- end
140
-
141
- text = text_list.join(' ')
142
-
143
- number = (easy_word * 1 + difficult_word * 3).to_f / sentence_count(text)
144
- number -= 2 if number <= 20
145
- number / 2
146
- end
147
-
148
- def self.difficult_words(text, language = 'en_us', return_words = false)
149
- require 'set'
150
- easy_words = Set.new
151
- File.read(File.join(dictionary_path, "#{language}.txt")).each_line do |line|
152
- easy_words << line.chop
153
- end
154
-
155
- text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split(' ')
156
- diff_words_set = Set.new
157
- text_list.each do |value|
158
- next if easy_words.include? value
159
-
160
- diff_words_set.add(value) if syllable_count(value, language) > 1
161
- end
162
- if return_words
163
- diff_words_set
164
- else
165
- diff_words_set.length
166
- end
167
- end
168
-
169
- def self.dale_chall_readability_score(text, language = 'en_us')
170
- word_count = lexicon_count(text)
171
- count = word_count - difficult_words(text, language)
172
-
173
- begin
174
- per = 100.0 * count / word_count
175
- rescue ZeroDivisionError
176
- return 0.0
177
- end
178
-
179
- difficult_words = 100 - per
180
- score = 0.1579 * difficult_words + 0.0496 * avg_sentence_length(text)
181
- score += 3.6365 if difficult_words > 5
182
-
183
- score.round(2)
184
- end
185
-
186
- def self.gunning_fog(text, language = 'en_us')
187
- per_diff_words = 100.0 * difficult_words(text, language) / lexicon_count(text) + 5
188
- grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
189
-
190
- grade.round(2)
191
- rescue ZeroDivisionError
192
- 0.0
193
- end
194
-
195
- def self.lix(text)
196
- words = text.split(' ')
197
- words_length = words.length
198
- long_words = words.count { |word| word.length > 6 }
199
-
200
- per_long_words = 100.0 * long_words / words_length
201
- asl = avg_sentence_length(text)
202
- lix = asl + per_long_words
203
-
204
- lix.round(2)
205
- end
206
-
207
- def self.forcast(text, language = 'en_us')
208
- words = text.split(' ')[0..149]
209
- words_with_one_syllabe = words.count {
210
- |word| syllable_count(word, language) == 1
211
- }
212
- forcast = 20 - (words_with_one_syllabe / 10)
213
- forcast
214
- end
215
-
216
- def self.powers_sumner_kearl(text, language = 'en_us')
217
- grade = 0.0778 * avg_sentence_length(text) + 0.0455 * syllable_count(text, language) - 2.2029
218
- grade.round(2)
219
- end
220
-
221
- def self.spache(text, language = 'en_us')
222
- words = text.split(' ').count
223
- unfamiliar_words = difficult_words(text, language) / words
224
- grade = (0.141 * avg_sentence_length(text)) + (0.086 * unfamiliar_words) + 0.839
225
- grade.round(2)
226
- end
227
-
228
- def self.text_standard(text, float_output=nil)
229
- grade = []
230
-
231
- lower = flesch_kincaid_grade(text).round
232
- upper = flesch_kincaid_grade(text).ceil
233
- grade.append(lower.to_i)
234
- grade.append(upper.to_i)
235
-
236
- # Appending Flesch Reading Easy
237
- score = flesch_reading_ease(text)
238
- if score < 100 && score >= 90
239
- grade.append(5)
240
- elsif score < 90 && score >= 80
241
- grade.append(6)
242
- elsif score < 80 && score >= 70
243
- grade.append(7)
244
- elsif score < 70 && score >= 60
245
- grade.append(8)
246
- grade.append(9)
247
- elsif score < 60 && score >= 50
248
- grade.append(10)
249
- elsif score < 50 && score >= 40
250
- grade.append(11)
251
- elsif score < 40 && score >= 30
252
- grade.append(12)
253
- else
254
- grade.append(13)
255
- end
256
-
257
- # Appending SMOG Index
258
- lower = smog_index(text).round
259
- upper = smog_index(text).ceil
260
- grade.append(lower.to_i)
261
- grade.append(upper.to_i)
262
-
263
- # Appending Coleman_Liau_Index
264
- lower = coleman_liau_index(text).round
265
- upper = coleman_liau_index(text).ceil
266
- grade.append(lower.to_i)
267
- grade.append(upper.to_i)
268
-
269
- # Appending Automated_Readability_Index
270
- lower = automated_readability_index(text).round
271
- upper = automated_readability_index(text).ceil
272
- grade.append(lower.to_i)
273
- grade.append(upper.to_i)
274
-
275
- # Appending Dale_Chall_Readability_Score
276
- lower = dale_chall_readability_score(text).round
277
- upper = dale_chall_readability_score(text).ceil
278
- grade.append(lower.to_i)
279
- grade.append(upper.to_i)
280
-
281
- # Appending Linsear_Write_Formula
282
- lower = linsear_write_formula(text).round
283
- upper = linsear_write_formula(text).ceil
284
- grade.append(lower.to_i)
285
- grade.append(upper.to_i)
286
-
287
- # Appending Gunning Fog Index
288
- lower = gunning_fog(text).round
289
- upper = gunning_fog(text).ceil
290
- grade.append(lower.to_i)
291
- grade.append(upper.to_i)
292
-
293
- # Finding the Readability Consensus based upon all the above tests
294
- require 'counter'
295
- d = Counter.new(grade)
296
- final_grade = d.most_common(1)
297
- score = final_grade[0][0]
298
-
299
- if float_output
300
- score.to_f
301
- else
302
- "#{score.to_i - 1}th and #{score.to_i}th grade"
303
- end
304
- end
305
-
306
- def self.dictionary_path=(path)
307
- @dictionary_path = path
308
- end
309
-
310
- def self.dictionary_path
311
- @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
312
- end
313
- end
1
+ # TextStat - Ruby gem for text readability analysis
2
+ #
3
+ # @author Jakub Polak
4
+ # @version 1.0.0
5
+ # @since 0.1.0
6
+ #
7
+ # TextStat is a Ruby gem that calculates statistics from text to determine
8
+ # readability, complexity and grade level of a particular corpus.
9
+ #
10
+ # @example Basic usage
11
+ # require 'textstat'
12
+ #
13
+ # text = "This is a sample text for analysis."
14
+ # TextStat.flesch_reading_ease(text) # => 83.32
15
+ # TextStat.difficult_words(text) # => 1
16
+ # TextStat.text_standard(text) # => "6th and 7th grade"
17
+ #
18
+ # @example Performance optimization with caching
19
+ # # Dictionary caching provides 36x performance improvement
20
+ # TextStat.difficult_words(text, 'en_us') # First call loads dictionary
21
+ # TextStat.difficult_words(text, 'en_us') # Subsequent calls use cache
22
+ #
23
+ # # Check cache status
24
+ # TextStat::DictionaryManager.cache_size # => 1
25
+ # TextStat::DictionaryManager.cached_languages # => ['en_us']
26
+ #
27
+ # @see https://github.com/kupolak/textstat
28
+ # @see CHANGELOG.md
29
+
30
+ require_relative 'textstat/main'
31
+
32
+ # For backward compatibility, this file now just loads the new modular structure
33
+ # All functionality has been moved to separate modules:
34
+ # - TextStat::BasicStats - basic text statistics
35
+ # - TextStat::DictionaryManager - dictionary management with caching
36
+ # - TextStat::ReadabilityFormulas - readability calculation formulas
metadata CHANGED
@@ -1,23 +1,20 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textstat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jakub Polak
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-05-21 00:00:00.000000000 Z
11
+ date: 2025-07-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: text-hyphen
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '1.4'
20
- - - ">="
21
18
  - !ruby/object:Gem::Version
22
19
  version: 1.4.1
23
20
  type: :runtime
@@ -25,53 +22,232 @@ dependencies:
25
22
  version_requirements: !ruby/object:Gem::Requirement
26
23
  requirements:
27
24
  - - "~>"
28
- - !ruby/object:Gem::Version
29
- version: '1.4'
30
- - - ">="
31
25
  - !ruby/object:Gem::Version
32
26
  version: 1.4.1
33
27
  - !ruby/object:Gem::Dependency
34
28
  name: bundler
35
29
  requirement: !ruby/object:Gem::Requirement
36
30
  requirements:
37
- - - "~>"
31
+ - - ">="
38
32
  - !ruby/object:Gem::Version
39
- version: 2.0.a
33
+ version: '2.0'
40
34
  type: :development
41
35
  prerelease: false
42
36
  version_requirements: !ruby/object:Gem::Requirement
43
37
  requirements:
44
- - - "~>"
38
+ - - ">="
45
39
  - !ruby/object:Gem::Version
46
- version: 2.0.a
40
+ version: '2.0'
47
41
  - !ruby/object:Gem::Dependency
48
42
  name: rake
49
43
  requirement: !ruby/object:Gem::Requirement
50
44
  requirements:
51
45
  - - "~>"
52
46
  - !ruby/object:Gem::Version
53
- version: '13.0'
47
+ version: '13.3'
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
51
  requirements:
58
52
  - - "~>"
59
53
  - !ruby/object:Gem::Version
60
- version: '13.0'
54
+ version: '13.3'
61
55
  - !ruby/object:Gem::Dependency
62
56
  name: rspec
63
57
  requirement: !ruby/object:Gem::Requirement
64
58
  requirements:
65
59
  - - "~>"
66
60
  - !ruby/object:Gem::Version
67
- version: '3.0'
61
+ version: '3.13'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.13'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simplecov
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.22'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.22'
83
+ - !ruby/object:Gem::Dependency
84
+ name: simplecov-lcov
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.8'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.8'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rubocop
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.69'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.69'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rubocop-performance
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.23'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.23'
125
+ - !ruby/object:Gem::Dependency
126
+ name: rubocop-rake
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '0.6'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '0.6'
139
+ - !ruby/object:Gem::Dependency
140
+ name: rubocop-rspec
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '2.31'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '2.31'
153
+ - !ruby/object:Gem::Dependency
154
+ name: rubocop-thread_safety
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '0.6'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '0.6'
167
+ - !ruby/object:Gem::Dependency
168
+ name: redcarpet
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - "~>"
172
+ - !ruby/object:Gem::Version
173
+ version: '3.6'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '3.6'
181
+ - !ruby/object:Gem::Dependency
182
+ name: yard
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - "~>"
186
+ - !ruby/object:Gem::Version
187
+ version: '0.9'
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - "~>"
193
+ - !ruby/object:Gem::Version
194
+ version: '0.9'
195
+ - !ruby/object:Gem::Dependency
196
+ name: benchmark-ips
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - "~>"
200
+ - !ruby/object:Gem::Version
201
+ version: '2.14'
202
+ type: :development
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - "~>"
207
+ - !ruby/object:Gem::Version
208
+ version: '2.14'
209
+ - !ruby/object:Gem::Dependency
210
+ name: memory_profiler
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '1.1'
216
+ type: :development
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '1.1'
223
+ - !ruby/object:Gem::Dependency
224
+ name: bundler-audit
225
+ requirement: !ruby/object:Gem::Requirement
226
+ requirements:
227
+ - - "~>"
228
+ - !ruby/object:Gem::Version
229
+ version: '0.9'
68
230
  type: :development
69
231
  prerelease: false
70
232
  version_requirements: !ruby/object:Gem::Requirement
71
233
  requirements:
72
234
  - - "~>"
73
235
  - !ruby/object:Gem::Version
74
- version: '3.0'
236
+ version: '0.9'
237
+ - !ruby/object:Gem::Dependency
238
+ name: brakeman
239
+ requirement: !ruby/object:Gem::Requirement
240
+ requirements:
241
+ - - "~>"
242
+ - !ruby/object:Gem::Version
243
+ version: '6.2'
244
+ type: :development
245
+ prerelease: false
246
+ version_requirements: !ruby/object:Gem::Requirement
247
+ requirements:
248
+ - - "~>"
249
+ - !ruby/object:Gem::Version
250
+ version: '6.2'
75
251
  description:
76
252
  email:
77
253
  - jakub.polak.vz@gmail.com
@@ -82,17 +258,39 @@ files:
82
258
  - lib/counter.rb
83
259
  - lib/dictionaries/ca.txt
84
260
  - lib/dictionaries/cs.txt
261
+ - lib/dictionaries/da.txt
262
+ - lib/dictionaries/de.txt
263
+ - lib/dictionaries/en_uk.txt
85
264
  - lib/dictionaries/en_us.txt
265
+ - lib/dictionaries/es.txt
266
+ - lib/dictionaries/et.txt
267
+ - lib/dictionaries/fi.txt
268
+ - lib/dictionaries/fr.txt
269
+ - lib/dictionaries/hr.txt
270
+ - lib/dictionaries/hu.txt
271
+ - lib/dictionaries/id.txt
272
+ - lib/dictionaries/is.txt
273
+ - lib/dictionaries/it.txt
274
+ - lib/dictionaries/la.txt
86
275
  - lib/dictionaries/nl.txt
276
+ - lib/dictionaries/no2.txt
277
+ - lib/dictionaries/pl.txt
278
+ - lib/dictionaries/pt.txt
279
+ - lib/dictionaries/ru.txt
280
+ - lib/dictionaries/sv.txt
87
281
  - lib/textstat.rb
282
+ - lib/textstat/basic_stats.rb
283
+ - lib/textstat/dictionary_manager.rb
284
+ - lib/textstat/main.rb
285
+ - lib/textstat/readability_formulas.rb
88
286
  - lib/textstat/version.rb
89
- - spec/textstat_spec.rb
90
287
  homepage: https://github.com/kupolak/textstat
91
288
  licenses:
92
289
  - MIT
93
290
  metadata:
94
291
  homepage_uri: https://github.com/kupolak/textstat
95
292
  source_code_uri: https://github.com/kupolak/textstat
293
+ rubygems_mfa_required: 'true'
96
294
  post_install_message:
97
295
  rdoc_options: []
98
296
  require_paths:
@@ -108,11 +306,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
108
306
  - !ruby/object:Gem::Version
109
307
  version: '0'
110
308
  requirements: []
111
- rubygems_version: 3.4.16
309
+ rubygems_version: 3.5.22
112
310
  signing_key:
113
311
  specification_version: 4
114
312
  summary: Ruby gem to calculate readability statistics of a text object - paragraphs,
115
313
  sentences, articles
116
- test_files:
117
- - spec/textstat_spec.rb
118
- - lib/dictionaries/en_us.txt
314
+ test_files: []