textstat 1.0.0 → 1.0.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions exactly as they appear in their respective public registries.
@@ -14,6 +14,33 @@ module TextStat
14
14
  # TextStat.syllable_count(text) # => 6
15
15
  # TextStat.sentence_count(text) # => 2
16
16
  module BasicStats
17
+ # Frozen regex constants to avoid recompilation overhead
18
+ NON_ALPHA_REGEX = /[^a-zA-Z\s]/.freeze
19
+ SENTENCE_BOUNDARY_REGEX = /[.?!]['\\)\]]*[ |\n][A-Z]/.freeze
20
+
21
+ # Cache for Text::Hyphen instances to avoid recreating them for each call
22
+ @hyphenator_cache = {}
23
+
24
+ class << self
25
+ attr_accessor :hyphenator_cache
26
+
27
+ # Get or create a cached Text::Hyphen instance for the specified language
28
+ #
29
+ # @param language [String] language code
30
+ # @return [Text::Hyphen] cached hyphenator instance
31
+ # @private
32
+ def get_hyphenator(language)
33
+ @hyphenator_cache[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
34
+ end
35
+
36
+ # Clear all cached hyphenators
37
+ #
38
+ # @return [Hash] empty cache
39
+ # @private
40
+ def clear_hyphenator_cache
41
+ @hyphenator_cache.clear
42
+ end
43
+ end
17
44
  # Count characters in text
18
45
  #
19
46
  # @param text [String] the text to analyze
@@ -36,7 +63,7 @@ module TextStat
36
63
  # TextStat.lexicon_count("Hello, world!") # => 2
37
64
  # TextStat.lexicon_count("Hello, world!", false) # => 2
38
65
  def lexicon_count(text, remove_punctuation = true)
39
- text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
66
+ text = text.gsub(NON_ALPHA_REGEX, '').squeeze(' ') if remove_punctuation
40
67
  text.split.count
41
68
  end
42
69
 
@@ -44,7 +71,7 @@ module TextStat
44
71
  #
45
72
  # Uses the text-hyphen library for accurate syllable counting across
46
73
  # different languages. Supports 22 languages including English, Spanish,
47
- # French, German, and more.
74
+ # French, German, and more. Hyphenator instances are cached for performance.
48
75
  #
49
76
  # @param text [String] the text to analyze
50
77
  # @param language [String] language code for hyphenation dictionary
@@ -58,11 +85,11 @@ module TextStat
58
85
  return 0 if text.empty?
59
86
 
60
87
  text = text.downcase
61
- text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
62
- dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
88
+ text.gsub(NON_ALPHA_REGEX, '').squeeze(' ') # NOTE: not assigned back (matches original behavior)
89
+ hyphenator = BasicStats.get_hyphenator(language)
63
90
  count = 0
64
91
  text.split.each do |word|
65
- word_hyphenated = dictionary.visualise(word)
92
+ word_hyphenated = hyphenator.visualise(word)
66
93
  count += word_hyphenated.count('-') + 1
67
94
  end
68
95
  count
@@ -79,7 +106,7 @@ module TextStat
79
106
  # TextStat.sentence_count("Hello world! How are you?") # => 2
80
107
  # TextStat.sentence_count("Dr. Smith went to the U.S.A.") # => 1
81
108
  def sentence_count(text)
82
- text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
109
+ text.scan(SENTENCE_BOUNDARY_REGEX).map(&:strip).count + 1
83
110
  end
84
111
 
85
112
  # Calculate average sentence length
@@ -139,16 +166,30 @@ module TextStat
139
166
 
140
167
  # Count polysyllabic words (3+ syllables)
141
168
  #
169
+ # Optimized to count syllables for all words in one pass using a cached hyphenator.
170
+ #
142
171
  # @param text [String] the text to analyze
143
172
  # @param language [String] language code for hyphenation dictionary
144
173
  # @return [Integer] number of polysyllabic words
145
174
  # @example
146
175
  # TextStat.polysyllab_count("beautiful complicated") # => 2
147
176
  def polysyllab_count(text, language = 'en_us')
177
+ return 0 if text.empty?
178
+
179
+ # Clean and split text once
180
+ cleaned_text = text.downcase.gsub(NON_ALPHA_REGEX, '').squeeze(' ')
181
+ words = cleaned_text.split
182
+ return 0 if words.empty?
183
+
184
+ # Use cached hyphenator for better performance
185
+ hyphenator = BasicStats.get_hyphenator(language)
148
186
  count = 0
149
- text.split.each do |word|
150
- w = syllable_count(word, language)
151
- count += 1 if w >= 3
187
+ words.each do |word|
188
+ next if word.empty?
189
+
190
+ word_hyphenated = hyphenator.visualise(word)
191
+ syllables = word_hyphenated.count('-') + 1
192
+ count += 1 if syllables >= 3
152
193
  end
153
194
  count
154
195
  end
@@ -45,7 +45,8 @@ module TextStat
45
45
  #
46
46
  # Loads a language-specific dictionary from disk and caches it in memory
47
47
  # for subsequent calls. This provides significant performance improvements
48
- # for repeated operations.
48
+ # for repeated operations. Uses optimized file reading with streaming for
49
+ # better performance and memory efficiency.
49
50
  #
50
51
  # @param language [String] language code (e.g., 'en_us', 'es', 'fr')
51
52
  # @return [Set] set of easy words for the specified language
@@ -63,8 +64,9 @@ module TextStat
63
64
  easy_words = Set.new
64
65
 
65
66
  if File.exist?(dictionary_file)
66
- File.read(dictionary_file).each_line do |line|
67
- easy_words << line.chomp
67
+ # Use foreach for streaming - efficient and memory-friendly for large files
68
+ File.foreach(dictionary_file, chomp: true) do |line|
69
+ easy_words << line
68
70
  end
69
71
  end
70
72
 
@@ -123,7 +125,7 @@ module TextStat
123
125
  # 1. Not being in the language's easy words dictionary
124
126
  # 2. Having more than one syllable
125
127
  #
126
- # This method uses the cached dictionary system for optimal performance.
128
+ # This method uses the cached dictionary and hyphenator systems for optimal performance.
127
129
  #
128
130
  # @param text [String] the text to analyze
129
131
  # @param language [String] language code for dictionary selection
@@ -142,12 +144,22 @@ module TextStat
142
144
  def difficult_words(text, language = 'en_us', return_words = false)
143
145
  easy_words = DictionaryManager.load_dictionary(language)
144
146
 
147
+ # Clean and split text once
145
148
  text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split
149
+ return return_words ? Set.new : 0 if text_list.empty?
150
+
151
+ # Get cached hyphenator for syllable counting
152
+ hyphenator = BasicStats.get_hyphenator(language)
146
153
  diff_words_set = Set.new
147
- text_list.each do |value|
148
- next if easy_words.include? value
149
154
 
150
- diff_words_set.add(value) if syllable_count(value, language) > 1
155
+ # Process each word once
156
+ text_list.each do |word|
157
+ next if easy_words.include?(word)
158
+
159
+ # Count syllables inline using cached hyphenator
160
+ word_hyphenated = hyphenator.visualise(word)
161
+ syllables = word_hyphenated.count('-') + 1
162
+ diff_words_set.add(word) if syllables > 1
151
163
  end
152
164
 
153
165
  return_words ? diff_words_set : diff_words_set.length
@@ -344,11 +344,11 @@ module TextStat
344
344
  end
345
345
 
346
346
  # Calculate consensus grade from all collected grades
347
+ # Uses Ruby's built-in tally method for better performance
348
+ # Note: Requires Ruby 2.7+, which matches the gem's minimum requirement
347
349
  def calculate_consensus_grade(grade)
348
- require_relative '../counter'
349
- counter = Counter.new(grade)
350
- most_common = counter.most_common(1)
351
- most_common[0][0]
350
+ tallied = grade.tally
351
+ tallied.max_by { |_grade, count| count }[0]
352
352
  end
353
353
 
354
354
  # Format grade output based on float_output parameter
@@ -8,15 +8,14 @@
8
8
  module TextStat
9
9
  # Current version of the TextStat gem
10
10
  #
11
- # Version 1.0.0 represents the first stable release with:
12
- # - 36x performance improvement through dictionary caching
13
- # - Modular architecture with separate modules for different functionality
14
- # - Comprehensive test coverage (199 tests)
15
- # - Support for 22 languages
16
- # - Full backward compatibility with 0.1.x series
11
+ # Version 1.0.1 includes performance optimizations and bug fixes
12
+ # - Optimized dictionary caching with lazy loading
13
+ # - Improved text_standard performance
14
+ # - Reduced memory allocations
15
+ # - Code quality improvements (Rubocop compliance)
17
16
  #
18
17
  # @return [String] current version string
19
18
  # @example
20
- # TextStat::VERSION # => \"1.0.0\"
21
- VERSION = '1.0.0'.freeze
19
+ # TextStat::VERSION # => "1.0.1"
20
+ VERSION = '1.0.1'.freeze
22
21
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textstat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jakub Polak
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-07-08 00:00:00.000000000 Z
11
+ date: 2025-12-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: text-hyphen
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 1.4.1
19
+ version: 1.5.0
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 1.4.1
26
+ version: 1.5.0
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -142,14 +142,14 @@ dependencies:
142
142
  requirements:
143
143
  - - "~>"
144
144
  - !ruby/object:Gem::Version
145
- version: '2.31'
145
+ version: '3.8'
146
146
  type: :development
147
147
  prerelease: false
148
148
  version_requirements: !ruby/object:Gem::Requirement
149
149
  requirements:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
- version: '2.31'
152
+ version: '3.8'
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: rubocop-thread_safety
155
155
  requirement: !ruby/object:Gem::Requirement
@@ -240,14 +240,14 @@ dependencies:
240
240
  requirements:
241
241
  - - "~>"
242
242
  - !ruby/object:Gem::Version
243
- version: '6.2'
243
+ version: '7.1'
244
244
  type: :development
245
245
  prerelease: false
246
246
  version_requirements: !ruby/object:Gem::Requirement
247
247
  requirements:
248
248
  - - "~>"
249
249
  - !ruby/object:Gem::Version
250
- version: '6.2'
250
+ version: '7.1'
251
251
  description:
252
252
  email:
253
253
  - jakub.polak.vz@gmail.com
@@ -255,7 +255,6 @@ executables: []
255
255
  extensions: []
256
256
  extra_rdoc_files: []
257
257
  files:
258
- - lib/counter.rb
259
258
  - lib/dictionaries/ca.txt
260
259
  - lib/dictionaries/cs.txt
261
260
  - lib/dictionaries/da.txt
@@ -264,8 +263,10 @@ files:
264
263
  - lib/dictionaries/en_us.txt
265
264
  - lib/dictionaries/es.txt
266
265
  - lib/dictionaries/et.txt
266
+ - lib/dictionaries/eu.txt
267
267
  - lib/dictionaries/fi.txt
268
268
  - lib/dictionaries/fr.txt
269
+ - lib/dictionaries/ga.txt
269
270
  - lib/dictionaries/hr.txt
270
271
  - lib/dictionaries/hu.txt
271
272
  - lib/dictionaries/id.txt
data/lib/counter.rb DELETED
@@ -1,37 +0,0 @@
1
- class Counter < Hash
2
- def initialize(other = nil)
3
- super(0)
4
- other.each { |e| self[e] += 1 } if other.is_a? Array
5
- other.each { |k, v| self[k] = v } if other.is_a? Hash
6
- other.each_char { |e| self[e] += 1 } if other.is_a? String
7
- end
8
-
9
- def +(other)
10
- raise TypeError, "cannot add #{other.class} to a Counter" unless other.is_a? Counter
11
-
12
- result = Counter.new(self)
13
- other.each { |k, v| result[k] += v }
14
- result
15
- end
16
-
17
- def -(other)
18
- raise TypeError, "cannot subtract #{other.class} to a Counter" unless other.is_a? Counter
19
-
20
- result = Counter.new(self)
21
- other.each { |k, v| result[k] -= v }
22
- result
23
- end
24
-
25
- def most_common(n = nil)
26
- s = sort_by { |_k, v| -v }
27
- n ? s.take(n) : s
28
- end
29
-
30
- def to_s
31
- "Counter(#{super})"
32
- end
33
-
34
- def inspect
35
- to_s
36
- end
37
- end