RubyGems - text-metrics - Versions diffs - 0.0.1 → 1.0.0.beta2 - Mend

text-metrics 0.0.1 → 1.0.0.beta2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/CHANGELOG.md +38 -1
data/README.md +70 -32
data/UPGRADING.md +73 -0
data/lib/text-metrics.rb +6 -0
data/lib/text_metrics/dictionaries/english_word_syllable_database.txt +126052 -0
data/lib/text_metrics/levenshtein.rb +46 -0
data/lib/text_metrics/processors/american_english.rb +38 -10
data/lib/text_metrics/processors/base.rb +117 -126
data/lib/text_metrics/processors/french.rb +32 -14
data/lib/text_metrics/version.rb +1 -1
data/lib/text_metrics.rb +28 -25
metadata +12 -14
data/lib/text_metrics/dictionnaries/en_us.txt +0 -2945
data/lib/text_metrics/dictionnaries/fr.txt +0 -1462
data/lib/text_metrics/dictionnaries/french_word_syllable_database.yml +0 -125345
data/lib/text_metrics/dictionnaries/lexique-383.csv +0 -142695
/data/lib/text_metrics/{dictionnaries → dictionaries}/french_word_syllable_exceptions.yml +0 -0

data/lib/text_metrics/levenshtein.rb ADDED Viewed

@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+module TextMetrics
+  # Levenshtein edit distance between two strings, plus a normalized similarity score.
+  # Comparison is a property of two texts, so it lives here rather than on a single-text analyzer.
+  module Levenshtein
+    module_function
+    # Raw edit distance: the number of single-character insertions, deletions or
+    # substitutions needed to turn +first+ into +second+. Case-sensitive.
+    def distance(first, second)
+      first = first.to_s
+      second = second.to_s
+      m = first.length
+      n = second.length
+      return n if m.zero?
+      return m if n.zero?
+      matrix = Array.new(m + 1) { Array.new(n + 1, 0) }
+      (0..m).each { |i| matrix[i][0] = i }
+      (0..n).each { |j| matrix[0][j] = j }
+      (1..m).each do |i|
+        (1..n).each do |j|
+          cost = (first[i - 1] == second[j - 1]) ? 0 : 1
+          matrix[i][j] = [
+            matrix[i - 1][j] + 1,        # deletion
+            matrix[i][j - 1] + 1,        # insertion
+            matrix[i - 1][j - 1] + cost  # substitution
+          ].min
+        end
+      end
+      matrix[m][n]
+    end
+    # Similarity as a 0–100 score: 100.0 means identical, 0.0 means nothing in common.
+    def similarity(first, second)
+      max_length = [first.to_s.length, second.to_s.length].max
+      return 100.0 if max_length.zero?
+      ((max_length - distance(first, second)).to_f / max_length * 100).round(2)
+    end
+  end
+end

data/lib/text_metrics/processors/american_english.rb CHANGED Viewed

@@ -5,25 +5,53 @@ require "text_metrics/processors/base"
 module TextMetrics
   module Processors
     class AmericanEnglish < TextMetrics::Processors::Base
-      def flesch_reading_ease
-        sentence_length = words_per_sentence_average
-        syllables_per_word = syllables_per_word_average
-        flesch = 206.835 - 1.015 * sentence_length - 84.6 * syllables_per_word
+      SYLLABLE_DATABASE_PATH = File.join(GEM_PATH, "dictionaries/english_word_syllable_database.txt").freeze
+      DATABASE_LOAD_MUTEX = Mutex.new
+      class << self
+        # CMU Pronouncing Dictionary syllable counts, loaded once and shared across all
+        # instances and threads. Lazy so requiring the gem (or using only
+        # French/Levenshtein) doesn't pay the load cost; the mutex guarantees the file is
+        # parsed exactly once under concurrent first use, and the double check keeps the
+        # common path lock-free. The result is frozen, so concurrent reads are safe.
+        def syllable_database
+          return @syllable_database if @syllable_database
+          DATABASE_LOAD_MUTEX.synchronize do
+            @syllable_database ||= load_syllable_database
+          end
+        end
+        private
+        def load_syllable_database
+          database = {}
+          File.foreach(SYLLABLE_DATABASE_PATH) do |line|
+            word, count = line.split(" ", 2)
+            database[word] = count.to_i
+          end
+          database.freeze
+        end
+      end
-        flesch.round(2).clamp(0.0, 100.0)
+      def initialize(text, language: :en_us)
+        super
       end
-      def flesch_kincaid_grade
-        sentence_length = words_per_sentence_average
-        syllables_per_word = syllables_per_word_average
-        flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
+      def flesch_reading_ease
+        return 0.0 if words_count.zero?
-        flesch.round(1).clamp(0.0, 18.0)
+        (206.835 - 1.015 * average_words_per_sentence - 84.6 * average_syllables_per_word).round(2)
       end
       private
+      # CMUdict is the source of truth; fall back to hyphenation for out-of-vocabulary words.
       def count_syllables_in_word(word)
+        self.class.syllable_database.fetch(word.downcase) { hyphenated_syllable_count(word) }
+      end
+      def hyphenated_syllable_count(word)
         hyphen_dictionary.visualise(word).count("-") + 1
       end

data/lib/text_metrics/processors/base.rb CHANGED Viewed

@@ -5,43 +5,52 @@ require "text/hyphen"
 module TextMetrics
   module Processors
     class Base
-      attr_reader :text, :with_syllable_exceptions
-      def initialize(text:, with_syllable_exceptions: true)
-        @text = text&.squeeze(" ") || ""
-        @with_syllable_exceptions = with_syllable_exceptions
-      end
-      def all
-        @all ||= {
-          words_count: words_count,
-          characters_count: characters_count,
-          sentences_count: sentences_count,
-          syllables_count: syllables_count,
-          punctuations_count: punctuations_count,
-          syllables_per_word_average: syllables_per_word_average,
-          letters_per_word_average: letters_per_word_average,
-          words_per_sentence_average: words_per_sentence_average,
-          words_per_punctuations_average: words_per_punctuations_average,
-          characters_per_sentence_average: characters_per_sentence_average,
-          punctuations_per_sentence_average: punctuations_per_sentence_average,
-          flesch_reading_ease: flesch_reading_ease,
-          flesch_kincaid_grade: flesch_kincaid_grade,
-          lix: lix,
-          smog_index: smog_index,
-          coleman_liau_index: coleman_liau_index
-        }
-      end
-      # _count methods
-      def characters_count(ignore_spaces: true)
-        ignore_spaces ? text.delete(" ").length : text.length
-      end
+      GEM_PATH = File.dirname(__FILE__, 2).freeze
+      # The public metric surface. #to_h and the individual readers are both derived
+      # from this list, so they can never drift apart.
+      METRICS = %i[
+        words_count
+        characters_count
+        sentences_count
+        syllables_count
+        punctuation_count
+        syllables_per_word_average
+        letters_per_word_average
+        words_per_sentence_average
+        characters_per_sentence_average
+        words_per_punctuation_average
+        punctuation_per_sentence_average
+        flesch_reading_ease
+        flesch_kincaid_grade
+        lix
+        smog_index
+        gunning_fog_index
+        coleman_liau_index
+      ].freeze
+      attr_reader :text, :language
+      def initialize(text, language: nil)
+        @text = (text || "").squeeze(" ")
+        @language = language
+      end
+      # Every metric in one hash. Single source of truth for the public surface.
+      # Memoized — the analyzer is immutable once built.
+      def to_h
+        @to_h ||= METRICS.to_h { |metric| [metric, public_send(metric)] }
+      end
+      # counts
       def words_count
         words.size
       end
+      def characters_count(ignore_spaces: true)
+        ignore_spaces ? text.delete(" ").length : text.length
+      end
       def sentences_count
         return 0 if words_count.zero?
@@ -52,81 +61,79 @@ module TextMetrics
         words.sum { |word| count_syllables_in_word(word) }
       end
-      def poly_syllabes_count
-        words.count { |word| count_syllables_in_word(word) >= 3 }
-      end
-      def punctuations_count
-        punctuations.size
+      def punctuation_count
+        punctuation_marks.size
       end
-      # _average methods
+      # averages — rounded for display only. The readability scores below are computed
+      # from the full-precision ratios (#average_*), not from these rounded values.
       def syllables_per_word_average
-        return 0.0 if words_count.zero? || syllables_count.zero?
-        (syllables_count.to_f / words_count).round(1)
+        average_syllables_per_word.round(1)
       end
       def letters_per_word_average
-        return 0.0 if words_count.zero? || characters_count.zero?
-        (characters_count.to_f / words_count).round(2)
+        average_letters_per_word.round(2)
       end
       def words_per_sentence_average
-        return 0.0 if words_count.zero? || sentences_count.zero?
-        (words_count.to_f / sentences_count).round(2)
+        average_words_per_sentence.round(2)
       end
       def characters_per_sentence_average
-        return 0.0 if characters_count.zero? || sentences_count.zero?
+        return 0.0 if sentences_count.zero?
         (characters_count.to_f / sentences_count).round(2)
       end
-      def punctuations_per_sentence_average
-        return 0.0 if punctuations_count.zero? || sentences_count.zero?
+      def words_per_punctuation_average
+        return 0.0 if words_count.zero? || punctuation_count.zero?
-        (punctuations_count.to_f / sentences_count).round(2)
+        (words_count.to_f / punctuation_count).round(2)
       end
-      def words_per_punctuations_average
-        return 0.0 if words_count.zero? || punctuations_count.zero?
+      def punctuation_per_sentence_average
+        return 0.0 if punctuation_count.zero? || sentences_count.zero?
-        (words_count.to_f / punctuations_count).round(2)
+        (punctuation_count.to_f / sentences_count).round(2)
       end
-      # readability scores
+      # readability scores — computed from full-precision ratios, rounded only at the end,
+      # and returned unclamped (a Flesch score can legitimately exceed 100 or go negative).
+      # Language-specific; subclasses supply the constants.
       def flesch_reading_ease
         raise NotImplementedError
       end
+      # Flesch-Kincaid Grade Level (US school grade). The same formula is used for every
+      # language — there is no validated non-English adaptation.
       def flesch_kincaid_grade
-        raise NotImplementedError
+        return 0.0 if words_count.zero?
+        (0.39 * average_words_per_sentence + 11.8 * average_syllables_per_word - 15.59).round(1)
       end
       def smog_index
-        if sentences_count >= 3
-          begin
-            smog = 1.043 * Math.sqrt(30.0 * poly_syllabes_count / sentences_count) + 3.1291
-            smog.round(1)
-          rescue ZeroDivisionError
-            0.0
-          end
-        else
-          0.0
-        end
+        return 0.0 if sentences_count < 3
+        (1.043 * Math.sqrt(30.0 * count_polysyllabic_words / sentences_count) + 3.1291).round(1)
+      rescue ZeroDivisionError
+        0.0
+      end
+      def gunning_fog_index
+        return 0.0 if words_count.zero?
+        (0.4 * (average_words_per_sentence + 100.0 * count_polysyllabic_words / words_count)).round(1)
       end
       def coleman_liau_index
-        return 0.0 if words_per_sentence_average.zero? || letters_per_word_average.zero?
+        return 0.0 if words_count.zero?
+        letters_per_100_words = average_letters_per_word * 100
+        sentences_per_100_words = sentences_count.to_f / words_count * 100
-        letters = (letters_per_word_average * 100).round(2)
-        sentences = (1.to_f / words_per_sentence_average * 100).round(2)
-        coleman = 0.0588 * letters - 0.296 * sentences - 15.8
-        coleman.round(2).clamp(0.0, 20.0)
+        (0.0588 * letters_per_100_words - 0.296 * sentences_per_100_words - 15.8).round(2)
       end
       def lix
@@ -134,39 +141,56 @@ module TextMetrics
         long_words = words.count { |word| word.length > 6 }
-        per_long_words = 100.0 * long_words / words_count
-        lix = words_per_sentence_average + per_long_words
+        (average_words_per_sentence + 100.0 * long_words / words_count).round(2)
+      end
+      private
+      # full-precision ratios feeding the readability formulas
+      def average_syllables_per_word
+        return 0.0 if words_count.zero?
-        lix.round(2).clamp(0.0, 100.0)
+        syllables_count.to_f / words_count
       end
-      # similarity
-      def levenshtein_distance_from(other_text, normalize: true)
-        distance = levenshtein_distance(@text, other_text)
-        return distance unless normalize
+      def average_letters_per_word
+        return 0.0 if words_count.zero?
-        # Normalize to a score out of 100
-        max_length = [@text.length, other_text.length].max
-        normalized_score = if max_length.zero?
-          100
-        else
-          ((max_length - distance).to_f / max_length) * 100
-        end
+        letters_count.to_f / words_count
+      end
+      def average_words_per_sentence
+        return 0.0 if sentences_count.zero?
+        words_count.to_f / sentences_count
+      end
-        normalized_score.round(2)
+      # Count of alphabetic characters only (letters), as required by Coleman-Liau and the
+      # letters-per-word metric — distinct from #characters_count, which includes digits
+      # and punctuation.
+      def letters_count
+        @letters_count ||= text.scan(/[[:alpha:]]/).size
+      end
+      # Subclasses provide the language-specific syllable counting.
+      def count_syllables_in_word(word)
+        raise NotImplementedError
+      end
+      def count_polysyllabic_words
+        words.count { |word| count_syllables_in_word(word) >= 3 }
       end
       # tokenizers
-      #
-      def punctuations
-        @punctuations ||= text.scan(/[.,!?;:]/)
+      def punctuation_marks
+        @punctuation_marks ||= text.scan(/[.,!?;:]/)
       end
       def words
         @words ||= begin
           normalized_text = text.downcase.strip
-          # Split the sentence into words, including hyphenated words, and excluding numbers
+          # Split into words, including hyphenated words, and excluding numbers
           normalized_text.scan(/\b[A-Za-zÀ-ÖØ-öø-ÿ'-]+\b/)
         end
       end
@@ -174,39 +198,6 @@ module TextMetrics
       def sentences
         @sentences ||= text.scan(/(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|$)/)
       end
-      private
-      def levenshtein_distance(s1, s2)
-        m = s1.length
-        n = s2.length
-        # Return if one of the strings is empty
-        return n if m == 0
-        return m if n == 0
-        # Create a matrix
-        matrix = Array.new(m + 1) { Array.new(n + 1) }
-        # Initialize the first row and column
-        (0..m).each { |i| matrix[i][0] = i }
-        (0..n).each { |j| matrix[0][j] = j }
-        # Fill in the matrix
-        (1..m).each do |i|
-          (1..n).each do |j|
-            cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1
-            matrix[i][j] = [
-              matrix[i - 1][j] + 1,      # Deletion
-              matrix[i][j - 1] + 1,      # Insertion
-              matrix[i - 1][j - 1] + cost  # Substitution
-            ].min
-          end
-        end
-        # Return the Levenshtein distance
-        matrix[m][n]
-      end
     end
   end
 end

data/lib/text_metrics/processors/french.rb CHANGED Viewed

@@ -6,30 +6,48 @@ require "yaml"
 module TextMetrics
   module Processors
     class French < TextMetrics::Processors::Base
-      GEM_PATH = File.dirname(__FILE__, 2).freeze
-      SYLLABLE_EXCEPTIONS_PATH = File.join(GEM_PATH, "dictionnaries/french_word_syllable_exceptions.yml").freeze
-      SYLLABLE_EXCEPTIONS = YAML.load_file(SYLLABLE_EXCEPTIONS_PATH).freeze
+      SYLLABLE_EXCEPTIONS_PATH = File.join(GEM_PATH, "dictionaries/french_word_syllable_exceptions.yml").freeze
+      EXCEPTIONS_LOAD_MUTEX = Mutex.new
-      def flesch_reading_ease
-        sentence_length = words_per_sentence_average
-        syllables_per_word = syllables_per_word_average
-        flesch = 206.835 - 1.015 * sentence_length - 73.6 * syllables_per_word
+      class << self
+        # Syllable counts for the words the heuristic gets wrong (derived from Lexique),
+        # loaded once and shared across all instances and threads. Lazy so requiring the gem
+        # (or using only English/Levenshtein) doesn't pay the YAML load; the mutex guarantees
+        # the file is parsed exactly once under concurrent first use, and the double check keeps
+        # the common path lock-free. The result is frozen, so concurrent reads are safe.
+        def syllable_exceptions
+          return @syllable_exceptions if @syllable_exceptions
-        flesch.round(2).clamp(0.0, 100.0)
+          EXCEPTIONS_LOAD_MUTEX.synchronize do
+            @syllable_exceptions ||= YAML.load_file(SYLLABLE_EXCEPTIONS_PATH).freeze
+          end
+        end
       end
-      def flesch_kincaid_grade
-        sentence_length = words_per_sentence_average
-        syllables_per_word = syllables_per_word_average
-        flesch = (0.55 * sentence_length) + (11.76 * syllables_per_word) - 15.79
+      # +with_syllable_exceptions+ is an internal toggle used by the dictionary-generation
+      # scripts to run the bare heuristic; the public API always leaves it on.
+      def initialize(text, language: :fr, with_syllable_exceptions: true)
+        super(text, language: language)
+        @with_syllable_exceptions = with_syllable_exceptions
+      end
-        flesch.round(1).clamp(0.0, 18.0)
+      # French Flesch Reading Ease — the Kandel-Moles (1958) adaptation:
+      # 207 - 1.015 * (words / sentences) - 73.6 * (syllables / words).
+      def flesch_reading_ease
+        return 0.0 if words_count.zero?
+        (207 - 1.015 * average_words_per_sentence - 73.6 * average_syllables_per_word).round(2)
       end
       private
+      attr_reader :with_syllable_exceptions
       def count_syllables_in_word(word)
-        return SYLLABLE_EXCEPTIONS[word].to_i if with_syllable_exceptions && SYLLABLE_EXCEPTIONS.key?(word)
+        if with_syllable_exceptions
+          exceptions = self.class.syllable_exceptions
+          return exceptions[word].to_i if exceptions.key?(word)
+        end
         word = word.downcase.gsub(/[^a-zàâäéèêëîïôöùûüç]/, "")

data/lib/text_metrics/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module TextMetrics # :nodoc:
-  VERSION = "0.0.1"
+  VERSION = "1.0.0.beta2"
 end

data/lib/text_metrics.rb CHANGED Viewed

@@ -1,41 +1,44 @@
 # frozen_string_literal: true
 require "text_metrics/version"
-require "text_metrics/processors/french"
+require "text_metrics/levenshtein"
 require "text_metrics/processors/american_english"
-require "forwardable"
+require "text_metrics/processors/french"
 module TextMetrics
   class Error < StandardError; end
-  class TextMetrics
-    extend Forwardable
-    def_delegators :text_metrics_processor, :words_count, :characters_count, :syllables_count,
-      :sentences_count, :words_per_sentence_average, :syllables_per_word_average,
-      :letters_per_word_average, :words_per_sentence_average, :characters_per_sentence_average,
-      :flesch_reading_ease, :flesch_kincaid_grade, :all, :levenshtein_distance_from
-    PROCESSORS = {
-      "fr" => Processors::French,
-      "en_us" => Processors::AmericanEnglish
-    }
+  DEFAULT_LANGUAGE = :en_us
-    attr_reader :text, :language, :text_metrics_processor
+  PROCESSORS = {
+    en_us: Processors::AmericanEnglish,
+    fr: Processors::French
+  }.freeze
-    def initialize(text:, language: "en_us")
-      @text = text
-      @language = language
-      @text_metrics_processor = PROCESSORS[language].new(text: text)
-    end
+  # Build an analyzer for +text+ in the given +language+ (:en_us or :fr).
+  # Returns the language-specific processor, which exposes every metric and #to_h.
+  def self.new(text, language: DEFAULT_LANGUAGE)
+    language = resolve_language(language)
+    PROCESSORS.fetch(language).new(text, language: language)
+  end
-    private
+  # Raw Levenshtein edit distance between two texts.
+  def self.distance(text, other)
+    Levenshtein.distance(text, other)
+  end
-    def processor_for(language)
-      PROCESSORS[language] || raise("Unknown language: #{language}, available languages: #{PROCESSORS.keys}")
-    end
+  # Levenshtein similarity between two texts, as a 0–100 score (100.0 == identical).
+  def self.similarity(text, other)
+    Levenshtein.similarity(text, other)
   end
-  def self.new(text:, language: "en_us")
-    TextMetrics.new(text: text, language: language)
+  # Coerce to a known language symbol, or raise a helpful error.
+  # Handles nil, strings and symbols without leaking a NoMethodError.
+  def self.resolve_language(language)
+    resolved = language.to_s.to_sym
+    return resolved if PROCESSORS.key?(resolved)
+    raise Error, "Unknown language #{language.inspect}. Available languages: #{PROCESSORS.keys.join(", ")}"
   end
+  private_class_method :resolve_language
 end

metadata CHANGED Viewed

@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: text-metrics
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 1.0.0.beta2
 platform: ruby
 authors:
 - Adrien POLY
-autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-06-18 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: text-hyphen
@@ -108,8 +107,9 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-description: A Ruby gem to compute various metrics for text, Currently focusing on
-  English and French
+description: Text Metrics computes readability scores (Flesch Reading Ease, Flesch-Kincaid
+  Grade, SMOG, Coleman-Liau, LIX) along with word, sentence, syllable and character
+  statistics, plus Levenshtein distance and similarity. English and French are supported.
 email:
 - adrienpoly@gmail.com
 executables: []
@@ -119,12 +119,12 @@ files:
 - CHANGELOG.md
 - LICENSE.txt
 - README.md
+- UPGRADING.md
+- lib/text-metrics.rb
 - lib/text_metrics.rb
-- lib/text_metrics/dictionnaries/en_us.txt
-- lib/text_metrics/dictionnaries/fr.txt
-- lib/text_metrics/dictionnaries/french_word_syllable_database.yml
-- lib/text_metrics/dictionnaries/french_word_syllable_exceptions.yml
-- lib/text_metrics/dictionnaries/lexique-383.csv
+- lib/text_metrics/dictionaries/english_word_syllable_database.txt
+- lib/text_metrics/dictionaries/french_word_syllable_exceptions.yml
+- lib/text_metrics/levenshtein.rb
 - lib/text_metrics/processors/american_english.rb
 - lib/text_metrics/processors/base.rb
 - lib/text_metrics/processors/french.rb
@@ -138,7 +138,6 @@ metadata:
   documentation_uri: https://github.com/plume-app/text-metrics
   homepage_uri: https://github.com/plume-app/text-metrics
   source_code_uri: https://github.com/plume-app/text-metrics
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -153,8 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.19
-signing_key:
+rubygems_version: 3.6.9
 specification_version: 4
-summary: A Ruby gem to compute various metrics for text
+summary: Readability scores and text statistics for English and French
 test_files: []