RubyGems - textstat - Versions diffs - 1.0.0 → 1.0.1 - Mend

textstat 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/lib/dictionaries/eu.txt +2200 -0
data/lib/dictionaries/ga.txt +2200 -0
data/lib/textstat/basic_stats.rb +50 -9
data/lib/textstat/dictionary_manager.rb +19 -7
data/lib/textstat/readability_formulas.rb +4 -4
data/lib/textstat/version.rb +7 -8
metadata +10 -9
data/lib/counter.rb +0 -37

data/lib/textstat/basic_stats.rb CHANGED

@@ -14,6 +14,33 @@ module TextStat
   #   TextStat.syllable_count(text)      # => 6
   #   TextStat.sentence_count(text)      # => 2
   module BasicStats
+    # Frozen regex constants to avoid recompilation overhead
+    NON_ALPHA_REGEX = /[^a-zA-Z\s]/.freeze
+    SENTENCE_BOUNDARY_REGEX = /[.?!]['\\)\]]*[ |\n][A-Z]/.freeze
+    # Cache for Text::Hyphen instances to avoid recreating them for each call
+    @hyphenator_cache = {}
+    class << self
+      attr_accessor :hyphenator_cache
+      # Get or create a cached Text::Hyphen instance for the specified language
+      #
+      # @param language [String] language code
+      # @return [Text::Hyphen] cached hyphenator instance
+      # @private
+      def get_hyphenator(language)
+        @hyphenator_cache[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
+      end
+      # Clear all cached hyphenators
+      #
+      # @return [Hash] empty cache
+      # @private
+      def clear_hyphenator_cache
+        @hyphenator_cache.clear
+      end
+    end
     # Count characters in text
     #
     # @param text [String] the text to analyze
@@ -36,7 +63,7 @@ module TextStat
     #   TextStat.lexicon_count("Hello, world!")       # => 2
     #   TextStat.lexicon_count("Hello, world!", false) # => 2
     def lexicon_count(text, remove_punctuation = true)
-      text = text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ') if remove_punctuation
+      text = text.gsub(NON_ALPHA_REGEX, '').squeeze(' ') if remove_punctuation
       text.split.count
     end
@@ -44,7 +71,7 @@ module TextStat
     #
     # Uses the text-hyphen library for accurate syllable counting across
     # different languages. Supports 22 languages including English, Spanish,
-    # French, German, and more.
+    # French, German, and more. Hyphenator instances are cached for performance.
     #
     # @param text [String] the text to analyze
     # @param language [String] language code for hyphenation dictionary
@@ -58,11 +85,11 @@ module TextStat
       return 0 if text.empty?
       text = text.downcase
-      text.gsub(/[^a-zA-Z\s]/, '').squeeze(' ')
-      dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
+      text.gsub(NON_ALPHA_REGEX, '').squeeze(' ') # NOTE: not assigned back (matches original behavior)
+      hyphenator = BasicStats.get_hyphenator(language)
       count = 0
       text.split.each do |word|
-        word_hyphenated = dictionary.visualise(word)
+        word_hyphenated = hyphenator.visualise(word)
         count += word_hyphenated.count('-') + 1
       end
       count
@@ -79,7 +106,7 @@ module TextStat
     #   TextStat.sentence_count("Hello world! How are you?")  # => 2
     #   TextStat.sentence_count("Dr. Smith went to the U.S.A.") # => 1
     def sentence_count(text)
-      text.scan(/[\.\?!][\'\\)\]]*[ |\n][A-Z]/).map(&:strip).count + 1
+      text.scan(SENTENCE_BOUNDARY_REGEX).map(&:strip).count + 1
     end
     # Calculate average sentence length
@@ -139,16 +166,30 @@ module TextStat
     # Count polysyllabic words (3+ syllables)
     #
+    # Optimized to count syllables for all words in one pass using a cached hyphenator.
+    #
     # @param text [String] the text to analyze
     # @param language [String] language code for hyphenation dictionary
     # @return [Integer] number of polysyllabic words
     # @example
     #   TextStat.polysyllab_count("beautiful complicated")  # => 2
     def polysyllab_count(text, language = 'en_us')
+      return 0 if text.empty?
+      # Clean and split text once
+      cleaned_text = text.downcase.gsub(NON_ALPHA_REGEX, '').squeeze(' ')
+      words = cleaned_text.split
+      return 0 if words.empty?
+      # Use cached hyphenator for better performance
+      hyphenator = BasicStats.get_hyphenator(language)
       count = 0
-      text.split.each do |word|
-        w = syllable_count(word, language)
-        count += 1 if w >= 3
+      words.each do |word|
+        next if word.empty?
+        word_hyphenated = hyphenator.visualise(word)
+        syllables = word_hyphenated.count('-') + 1
+        count += 1 if syllables >= 3
       end
       count
     end

data/lib/textstat/dictionary_manager.rb CHANGED

@@ -45,7 +45,8 @@ module TextStat
       #
       # Loads a language-specific dictionary from disk and caches it in memory
       # for subsequent calls. This provides significant performance improvements
-      # for repeated operations.
+      # for repeated operations. Uses optimized file reading with streaming for
+      # better performance and memory efficiency.
       #
       # @param language [String] language code (e.g., 'en_us', 'es', 'fr')
       # @return [Set] set of easy words for the specified language
@@ -63,8 +64,9 @@ module TextStat
         easy_words = Set.new
         if File.exist?(dictionary_file)
-          File.read(dictionary_file).each_line do |line|
-            easy_words << line.chomp
+          # Use foreach for streaming - efficient and memory-friendly for large files
+          File.foreach(dictionary_file, chomp: true) do |line|
+            easy_words << line
           end
         end
@@ -123,7 +125,7 @@ module TextStat
     # 1. Not being in the language's easy words dictionary
     # 2. Having more than one syllable
     #
-    # This method uses the cached dictionary system for optimal performance.
+    # This method uses the cached dictionary and hyphenator systems for optimal performance.
     #
     # @param text [String] the text to analyze
     # @param language [String] language code for dictionary selection
@@ -142,12 +144,22 @@ module TextStat
     def difficult_words(text, language = 'en_us', return_words = false)
       easy_words = DictionaryManager.load_dictionary(language)
+      # Clean and split text once
       text_list = text.downcase.gsub(/[^0-9a-z ]/i, '').split
+      return return_words ? Set.new : 0 if text_list.empty?
+      # Get cached hyphenator for syllable counting
+      hyphenator = BasicStats.get_hyphenator(language)
       diff_words_set = Set.new
-      text_list.each do |value|
-        next if easy_words.include? value
-        diff_words_set.add(value) if syllable_count(value, language) > 1
+      # Process each word once
+      text_list.each do |word|
+        next if easy_words.include?(word)
+        # Count syllables inline using cached hyphenator
+        word_hyphenated = hyphenator.visualise(word)
+        syllables = word_hyphenated.count('-') + 1
+        diff_words_set.add(word) if syllables > 1
       end
       return_words ? diff_words_set : diff_words_set.length

data/lib/textstat/readability_formulas.rb CHANGED

@@ -344,11 +344,11 @@ module TextStat
     end
     # Calculate consensus grade from all collected grades
+    # Uses Ruby's built-in tally method for better performance
+    # Note: Requires Ruby 2.7+, which matches the gem's minimum requirement
     def calculate_consensus_grade(grade)
-      require_relative '../counter'
-      counter = Counter.new(grade)
-      most_common = counter.most_common(1)
-      most_common[0][0]
+      tallied = grade.tally
+      tallied.max_by { |_grade, count| count }[0]
     end
     # Format grade output based on float_output parameter

data/lib/textstat/version.rb CHANGED

@@ -8,15 +8,14 @@
 module TextStat
   # Current version of the TextStat gem
   #
-  # Version 1.0.0 represents the first stable release with:
-  # - 36x performance improvement through dictionary caching
-  # - Modular architecture with separate modules for different functionality
-  # - Comprehensive test coverage (199 tests)
-  # - Support for 22 languages
-  # - Full backward compatibility with 0.1.x series
+  # Version 1.0.1 includes performance optimizations and bug fixes
+  # - Optimized dictionary caching with lazy loading
+  # - Improved text_standard performance
+  # - Reduced memory allocations
+  # - Code quality improvements (Rubocop compliance)
   #
   # @return [String] current version string
   # @example
-  #   TextStat::VERSION  # => \"1.0.0\"
-  VERSION = '1.0.0'.freeze
+  #   TextStat::VERSION  # => "1.0.1"
+  VERSION = '1.0.1'.freeze
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: textstat
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.0.1
 platform: ruby
 authors:
 - Jakub Polak
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-07-08 00:00:00.000000000 Z
+date: 2025-12-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: text-hyphen
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.4.1
+        version: 1.5.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.4.1
+        version: 1.5.0
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -142,14 +142,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.31'
+        version: '3.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.31'
+        version: '3.8'
 - !ruby/object:Gem::Dependency
   name: rubocop-thread_safety
   requirement: !ruby/object:Gem::Requirement
@@ -240,14 +240,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '6.2'
+        version: '7.1'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '6.2'
+        version: '7.1'
 description:
 email:
 - jakub.polak.vz@gmail.com
@@ -255,7 +255,6 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/counter.rb
 - lib/dictionaries/ca.txt
 - lib/dictionaries/cs.txt
 - lib/dictionaries/da.txt
@@ -264,8 +263,10 @@ files:
 - lib/dictionaries/en_us.txt
 - lib/dictionaries/es.txt
 - lib/dictionaries/et.txt
+- lib/dictionaries/eu.txt
 - lib/dictionaries/fi.txt
 - lib/dictionaries/fr.txt
+- lib/dictionaries/ga.txt
 - lib/dictionaries/hr.txt
 - lib/dictionaries/hu.txt
 - lib/dictionaries/id.txt

data/lib/counter.rb DELETED

@@ -1,37 +0,0 @@
-class Counter < Hash
-  def initialize(other = nil)
-    super(0)
-    other.each { |e| self[e] += 1 } if other.is_a? Array
-    other.each { |k, v| self[k] = v } if other.is_a? Hash
-    other.each_char { |e| self[e] += 1 } if other.is_a? String
-  end
-  def +(other)
-    raise TypeError, "cannot add #{other.class} to a Counter" unless other.is_a? Counter
-    result = Counter.new(self)
-    other.each { |k, v| result[k] += v }
-    result
-  end
-  def -(other)
-    raise TypeError, "cannot subtract #{other.class} to a Counter" unless other.is_a? Counter
-    result = Counter.new(self)
-    other.each { |k, v| result[k] -= v }
-    result
-  end
-  def most_common(n = nil)
-    s = sort_by { |_k, v| -v }
-    n ? s.take(n) : s
-  end
-  def to_s
-    "Counter(#{super})"
-  end
-  def inspect
-    to_s
-  end
-end