textstat 1.0.0 → 1.0.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dictionaries/eu.txt +2200 -0
- data/lib/dictionaries/ga.txt +2200 -0
- data/lib/textstat/basic_stats.rb +50 -9
- data/lib/textstat/dictionary_manager.rb +19 -7
- data/lib/textstat/readability_formulas.rb +4 -4
- data/lib/textstat/version.rb +7 -8
- metadata +10 -9
- data/lib/counter.rb +0 -37
data/lib/textstat/basic_stats.rb
CHANGED
|
@@ -14,6 +14,33 @@ module TextStat
|
|
|
14
14
|
# TextStat.syllable_count(text) # => 6
|
|
15
15
|
# TextStat.sentence_count(text) # => 2
|
|
16
16
|
module BasicStats
|
|
17
|
+
# Frozen regex constants to avoid recompilation overhead
|
|
18
|
+
NON_ALPHA_REGEX = /[^a-zA-Z\s]/.freeze
|
|
19
|
+
SENTENCE_BOUNDARY_REGEX = /[.?!]['\\)\]]*[ |\n][A-Z]/.freeze
|
|
20
|
+
|
|
21
|
+
# Cache for Text::Hyphen instances to avoid recreating them for each call
|
|
22
|
+
@hyphenator_cache = {}
|
|
23
|
+
|
|
24
|
+
class << self
|
|
25
|
+
attr_accessor :hyphenator_cache
|
|
26
|
+
|
|
27
|
+
# Get or create a cached Text::Hyphen instance for the specified language
|
|
28
|
+
#
|
|
29
|
+
# @param language [String] language code
|
|
30
|
+
# @return [Text::Hyphen] cached hyphenator instance
|
|
31
|
+
# @private
|
|
32
|
+
def get_hyphenator(language)
|
|
33
|
+
@hyphenator_cache[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# Clear all cached hyphenators
|
|
37
|
+
#
|
|
38
|
+
# @return [Hash] empty cache
|
|
39
|
+
# @private
|
|
40
|
+
def clear_hyphenator_cache
|
|
41
|
+
@hyphenator_cache.clear
|
|
42
|
+
end
|
|
43
|
+
end
|
|
17
44
|
# Count characters in text
|
|
18
45
|
#
|
|
19
46
|
# @param text [String] the text to analyze
|
|
@@ -36,7 +63,7 @@ module TextStat
|
|
|
36
63
|
# TextStat.lexicon_count("Hello, world!") # => 2
|
|
37
64
|
# TextStat.lexicon_count("Hello, world!", false) # => 2
|
|
38
65
|
def lexicon_count(text, remove_punctuation = true)
|
|
39
|
-
text = text.gsub(
|
|
66
|
+
text = text.gsub(NON_ALPHA_REGEX, '').squeeze(' ') if remove_punctuation
|
|
40
67
|
text.split.count
|
|
41
68
|
end
|
|
42
69
|
|
|
@@ -44,7 +71,7 @@ module TextStat
|
|
|
44
71
|
#
|
|
45
72
|
# Uses the text-hyphen library for accurate syllable counting across
|
|
46
73
|
# different languages. Supports 22 languages including English, Spanish,
|
|
47
|
-
# French, German, and more.
|
|
74
|
+
# French, German, and more. Hyphenator instances are cached for performance.
|
|
48
75
|
#
|
|
49
76
|
# @param text [String] the text to analyze
|
|
50
77
|
# @param language [String] language code for hyphenation dictionary
|
|
@@ -58,11 +85,11 @@ module TextStat
|
|
|
58
85
|
return 0 if text.empty?
|
|
59
86
|
|
|
60
87
|
text = text.downcase
|
|
61
|
-
text.gsub(
|
|
62
|
-
|
|
88
|
+
text.gsub(NON_ALPHA_REGEX, '').squeeze(' ') # NOTE: not assigned back (matches original behavior)
|
|
89
|
+
hyphenator = BasicStats.get_hyphenator(language)
|
|
63
90
|
count = 0
|
|
64
91
|
text.split.each do |word|
|
|
65
|
-
word_hyphenated =
|
|
92
|
+
word_hyphenated = hyphenator.visualise(word)
|
|
66
93
|
count += word_hyphenated.count('-') + 1
|
|
67
94
|
end
|
|
68
95
|
count
|
|
@@ -79,7 +106,7 @@ module TextStat
|
|
|
79
106
|
# TextStat.sentence_count("Hello world! How are you?") # => 2
|
|
80
107
|
# TextStat.sentence_count("Dr. Smith went to the U.S.A.") # => 1
|
|
81
108
|
def sentence_count(text)
|
|
82
|
-
text.scan(
|
|
109
|
+
text.scan(SENTENCE_BOUNDARY_REGEX).map(&:strip).count + 1
|
|
83
110
|
end
|
|
84
111
|
|
|
85
112
|
# Calculate average sentence length
|
|
@@ -139,16 +166,30 @@ module TextStat
|
|
|
139
166
|
|
|
140
167
|
# Count polysyllabic words (3+ syllables)
|
|
141
168
|
#
|
|
169
|
+
# Optimized to count syllables for all words in one pass using a cached hyphenator.
|
|
170
|
+
#
|
|
142
171
|
# @param text [String] the text to analyze
|
|
143
172
|
# @param language [String] language code for hyphenation dictionary
|
|
144
173
|
# @return [Integer] number of polysyllabic words
|
|
145
174
|
# @example
|
|
146
175
|
# TextStat.polysyllab_count("beautiful complicated") # => 2
|
|
147
176
|
def polysyllab_count(text, language = 'en_us')
|
|
177
|
+
return 0 if text.empty?
|
|
178
|
+
|
|
179
|
+
# Clean and split text once
|
|
180
|
+
cleaned_text = text.downcase.gsub(NON_ALPHA_REGEX, '').squeeze(' ')
|
|
181
|
+
words = cleaned_text.split
|
|
182
|
+
return 0 if words.empty?
|
|
183
|
+
|
|
184
|
+
# Use cached hyphenator for better performance
|
|
185
|
+
hyphenator = BasicStats.get_hyphenator(language)
|
|
148
186
|
count = 0
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
187
|
+
words.each do |word|
|
|
188
|
+
next if word.empty?
|
|
189
|
+
|
|
190
|
+
word_hyphenated = hyphenator.visualise(word)
|
|
191
|
+
syllables = word_hyphenated.count('-') + 1
|
|
192
|
+
count += 1 if syllables >= 3
|
|
152
193
|
end
|
|
153
194
|
count
|
|
154
195
|
end
|
|
@@ -45,7 +45,8 @@ module TextStat
|
|
|
45
45
|
#
|
|
46
46
|
# Loads a language-specific dictionary from disk and caches it in memory
|
|
47
47
|
# for subsequent calls. This provides significant performance improvements
|
|
48
|
-
# for repeated operations.
|
|
48
|
+
# for repeated operations. Uses optimized file reading with streaming for
|
|
49
|
+
# better performance and memory efficiency.
|
|
49
50
|
#
|
|
50
51
|
# @param language [String] language code (e.g., 'en_us', 'es', 'fr')
|
|
51
52
|
# @return [Set] set of easy words for the specified language
|
|
@@ -63,8 +64,9 @@ module TextStat
|
|
|
63
64
|
easy_words = Set.new
|
|
64
65
|
|
|
65
66
|
if File.exist?(dictionary_file)
|
|
66
|
-
|
|
67
|
-
|
|
67
|
+
# Use foreach for streaming - efficient and memory-friendly for large files
|
|
68
|
+
File.foreach(dictionary_file, chomp: true) do |line|
|
|
69
|
+
easy_words << line
|
|
68
70
|
end
|
|
69
71
|
end
|
|
70
72
|
|
|
@@ -123,7 +125,7 @@ module TextStat
|
|
|
123
125
|
# 1. Not being in the language's easy words dictionary
|
|
124
126
|
# 2. Having more than one syllable
|
|
125
127
|
#
|
|
126
|
-
# This method uses the cached dictionary
|
|
128
|
+
# This method uses the cached dictionary and hyphenator systems for optimal performance.
|
|
127
129
|
#
|
|
128
130
|
# @param text [String] the text to analyze
|
|
129
131
|
# @param language [String] language code for dictionary selection
|
|
@@ -142,12 +144,22 @@ module TextStat
|
|
|
142
144
|
def difficult_words(text, language = 'en_us', return_words = false)
|
|
143
145
|
easy_words = DictionaryManager.load_dictionary(language)
|
|
144
146
|
|
|
147
|
+
# Clean and split text once
|
|
145
148
|
text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split
|
|
149
|
+
return return_words ? Set.new : 0 if text_list.empty?
|
|
150
|
+
|
|
151
|
+
# Get cached hyphenator for syllable counting
|
|
152
|
+
hyphenator = BasicStats.get_hyphenator(language)
|
|
146
153
|
diff_words_set = Set.new
|
|
147
|
-
text_list.each do |value|
|
|
148
|
-
next if easy_words.include? value
|
|
149
154
|
|
|
150
|
-
|
|
155
|
+
# Process each word once
|
|
156
|
+
text_list.each do |word|
|
|
157
|
+
next if easy_words.include?(word)
|
|
158
|
+
|
|
159
|
+
# Count syllables inline using cached hyphenator
|
|
160
|
+
word_hyphenated = hyphenator.visualise(word)
|
|
161
|
+
syllables = word_hyphenated.count('-') + 1
|
|
162
|
+
diff_words_set.add(word) if syllables > 1
|
|
151
163
|
end
|
|
152
164
|
|
|
153
165
|
return_words ? diff_words_set : diff_words_set.length
|
|
@@ -344,11 +344,11 @@ module TextStat
|
|
|
344
344
|
end
|
|
345
345
|
|
|
346
346
|
# Calculate consensus grade from all collected grades
|
|
347
|
+
# Uses Ruby's built-in tally method for better performance
|
|
348
|
+
# Note: Requires Ruby 2.7+, which matches the gem's minimum requirement
|
|
347
349
|
def calculate_consensus_grade(grade)
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
most_common = counter.most_common(1)
|
|
351
|
-
most_common[0][0]
|
|
350
|
+
tallied = grade.tally
|
|
351
|
+
tallied.max_by { |_grade, count| count }[0]
|
|
352
352
|
end
|
|
353
353
|
|
|
354
354
|
# Format grade output based on float_output parameter
|
data/lib/textstat/version.rb
CHANGED
|
@@ -8,15 +8,14 @@
|
|
|
8
8
|
module TextStat
|
|
9
9
|
# Current version of the TextStat gem
|
|
10
10
|
#
|
|
11
|
-
# Version 1.0.
|
|
12
|
-
# -
|
|
13
|
-
# -
|
|
14
|
-
# -
|
|
15
|
-
# -
|
|
16
|
-
# - Full backward compatibility with 0.1.x series
|
|
11
|
+
# Version 1.0.1 includes performance optimizations and bug fixes
|
|
12
|
+
# - Optimized dictionary caching with lazy loading
|
|
13
|
+
# - Improved text_standard performance
|
|
14
|
+
# - Reduced memory allocations
|
|
15
|
+
# - Code quality improvements (Rubocop compliance)
|
|
17
16
|
#
|
|
18
17
|
# @return [String] current version string
|
|
19
18
|
# @example
|
|
20
|
-
# TextStat::VERSION # =>
|
|
21
|
-
VERSION = '1.0.
|
|
19
|
+
# TextStat::VERSION # => "1.0.1"
|
|
20
|
+
VERSION = '1.0.1'.freeze
|
|
22
21
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: textstat
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.0.
|
|
4
|
+
version: 1.0.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Jakub Polak
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-
|
|
11
|
+
date: 2025-12-10 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: text-hyphen
|
|
@@ -16,14 +16,14 @@ dependencies:
|
|
|
16
16
|
requirements:
|
|
17
17
|
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 1.
|
|
19
|
+
version: 1.5.0
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 1.
|
|
26
|
+
version: 1.5.0
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: bundler
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -142,14 +142,14 @@ dependencies:
|
|
|
142
142
|
requirements:
|
|
143
143
|
- - "~>"
|
|
144
144
|
- !ruby/object:Gem::Version
|
|
145
|
-
version: '
|
|
145
|
+
version: '3.8'
|
|
146
146
|
type: :development
|
|
147
147
|
prerelease: false
|
|
148
148
|
version_requirements: !ruby/object:Gem::Requirement
|
|
149
149
|
requirements:
|
|
150
150
|
- - "~>"
|
|
151
151
|
- !ruby/object:Gem::Version
|
|
152
|
-
version: '
|
|
152
|
+
version: '3.8'
|
|
153
153
|
- !ruby/object:Gem::Dependency
|
|
154
154
|
name: rubocop-thread_safety
|
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -240,14 +240,14 @@ dependencies:
|
|
|
240
240
|
requirements:
|
|
241
241
|
- - "~>"
|
|
242
242
|
- !ruby/object:Gem::Version
|
|
243
|
-
version: '
|
|
243
|
+
version: '7.1'
|
|
244
244
|
type: :development
|
|
245
245
|
prerelease: false
|
|
246
246
|
version_requirements: !ruby/object:Gem::Requirement
|
|
247
247
|
requirements:
|
|
248
248
|
- - "~>"
|
|
249
249
|
- !ruby/object:Gem::Version
|
|
250
|
-
version: '
|
|
250
|
+
version: '7.1'
|
|
251
251
|
description:
|
|
252
252
|
email:
|
|
253
253
|
- jakub.polak.vz@gmail.com
|
|
@@ -255,7 +255,6 @@ executables: []
|
|
|
255
255
|
extensions: []
|
|
256
256
|
extra_rdoc_files: []
|
|
257
257
|
files:
|
|
258
|
-
- lib/counter.rb
|
|
259
258
|
- lib/dictionaries/ca.txt
|
|
260
259
|
- lib/dictionaries/cs.txt
|
|
261
260
|
- lib/dictionaries/da.txt
|
|
@@ -264,8 +263,10 @@ files:
|
|
|
264
263
|
- lib/dictionaries/en_us.txt
|
|
265
264
|
- lib/dictionaries/es.txt
|
|
266
265
|
- lib/dictionaries/et.txt
|
|
266
|
+
- lib/dictionaries/eu.txt
|
|
267
267
|
- lib/dictionaries/fi.txt
|
|
268
268
|
- lib/dictionaries/fr.txt
|
|
269
|
+
- lib/dictionaries/ga.txt
|
|
269
270
|
- lib/dictionaries/hr.txt
|
|
270
271
|
- lib/dictionaries/hu.txt
|
|
271
272
|
- lib/dictionaries/id.txt
|
data/lib/counter.rb
DELETED
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
class Counter < Hash
|
|
2
|
-
def initialize(other = nil)
|
|
3
|
-
super(0)
|
|
4
|
-
other.each { |e| self[e] += 1 } if other.is_a? Array
|
|
5
|
-
other.each { |k, v| self[k] = v } if other.is_a? Hash
|
|
6
|
-
other.each_char { |e| self[e] += 1 } if other.is_a? String
|
|
7
|
-
end
|
|
8
|
-
|
|
9
|
-
def +(other)
|
|
10
|
-
raise TypeError, "cannot add #{other.class} to a Counter" unless other.is_a? Counter
|
|
11
|
-
|
|
12
|
-
result = Counter.new(self)
|
|
13
|
-
other.each { |k, v| result[k] += v }
|
|
14
|
-
result
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
def -(other)
|
|
18
|
-
raise TypeError, "cannot subtract #{other.class} to a Counter" unless other.is_a? Counter
|
|
19
|
-
|
|
20
|
-
result = Counter.new(self)
|
|
21
|
-
other.each { |k, v| result[k] -= v }
|
|
22
|
-
result
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
def most_common(n = nil)
|
|
26
|
-
s = sort_by { |_k, v| -v }
|
|
27
|
-
n ? s.take(n) : s
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
def to_s
|
|
31
|
-
"Counter(#{super})"
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
def inspect
|
|
35
|
-
to_s
|
|
36
|
-
end
|
|
37
|
-
end
|