RubyGems - phonetics - Versions diffs - 1.1.1 → 1.5.0 - Mend

phonetics 1.1.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +4 -4
data/.github/workflows/gempush.yml +32 -0
data/.gitignore +2 -0
data/Gemfile.lock +22 -1
data/README.md +1 -1
data/Rakefile +21 -2
data/VERSION +1 -1
data/ext/c_levenshtein/extconf.rb +10 -0
data/ext/c_levenshtein/levenshtein.c +197 -0
data/ext/c_levenshtein/phonetic_cost.c +53075 -0
data/ext/c_levenshtein/phonetic_cost.h +1 -0
data/lib/phonetics.rb +86 -9
data/lib/phonetics/c_levenshtein.bundle +0 -0
data/lib/phonetics/levenshtein.rb +16 -101
data/lib/phonetics/ruby_levenshtein.rb +171 -0
data/phonetics.gemspec +4 -1
metadata +61 -13

data/ext/c_levenshtein/phonetic_cost.h ADDED

	@@ -0,0 +1 @@
1	+ float phonetic_cost(long, long);

data/lib/phonetics.rb CHANGED

@@ -223,14 +223,8 @@ module Phonetics
   )
   def distance(phoneme1, phoneme2)
-    types = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
-    if types == [:consonant, :vowel]
-      1.0
-    elsif types == [:vowel, :vowel]
-      Vowels.distance(phoneme1, phoneme2)
-    elsif types == [:consonant, :consonant]
-      Consonants.distance(phoneme1, phoneme2)
-    end
+    return 0 if phoneme1 == phoneme2
+    distance_map.fetch(phoneme1).fetch(phoneme2)
   end
   def distance_map
@@ -238,9 +232,92 @@ module Phonetics
       Vowels.phonemes + Consonants.phonemes
     ).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} } ) do |pair, scores|
       p1, p2 = *pair
-      score = distance(p1, p2)
+      score = _distance(p1, p2)
       scores[p1][p2] = score
       scores[p2][p1] = score
     end
   end
+  # Converts a string into an array of integers, one integer per grapheme
+  # cluster, by passing each cluster through grapheme_as_utf_8_long.
+  #
+  # as_utf_8_long("aɰ̊ h")
+  # => [97, 8404, 32, 104]
+  def as_utf_8_long(string)
+    string.each_grapheme_cluster.map do |cluster|
+      grapheme_as_utf_8_long(cluster)
+    end
+  end
+  # Encode a single grapheme (which may consist of several Unicode
+  # codepoints, e.g. a base letter plus a combining diacritic) as one integer.
+  #
+  # Each codepoint is multiplied by 10 raised to its zero-based position in
+  # the grapheme and the products are summed. An empty string yields 0.
+  #
+  # "ɰ̊".unpack('U*')
+  # => [624, 778]
+  #
+  # grapheme_as_utf_8_long("ɰ̊")
+  # => 8404 (624 + (10 * 778))
+  #
+  # NOTE(review): this weighting is not collision-free for arbitrary codepoint
+  # sequences (e.g. [20] and [10, 1] both give 20); it presumably is for the
+  # phoneme set used here -- confirm before extending the alphabet.
+  def grapheme_as_utf_8_long(grapheme)
+    grapheme.unpack('U*').each_with_index.sum do |codepoint, position|
+      (10**position) * codepoint
+    end
+  end
+  # This will print a C code file with a function that implements a two-level C
+  # switch like the following:
+  #
+  #    switch (a) {
+  #      case 100: // 'd'
+  #        switch (b) {
+  #          case 618: // 'ɪ'
+  #            return (float) 0.73827;
+  #            break;
+  #        }
+  #    }
+  #
+  # writer - any object responding to #puts; defaults to STDOUT.
+  #
+  # The emitted function takes two `long` arguments so that its definition
+  # agrees with the prototype shipped in ext/c_levenshtein/phonetic_cost.h
+  # (`float phonetic_cost(long, long);`).
+  def generate_phonetic_cost_c_code(writer = STDOUT)
+    # First, flatten each phoneme's Unicode codepoints into a single integer
+    # (see grapheme_as_utf_8_long): the codepoints are summed, each one
+    # multiplied by 10 raised to its position within the grapheme. The
+    # specific encoding doesn't matter so long as it's:
+    #   * consistent
+    #   * has no collisions
+    #   * produces a value that's a valid C case conditional
+    #   * can be applied to runes of input strings later
+    #
+    # Keys of both levels are [phoneme, integer] pairs so the phoneme itself
+    # can be printed in the C comments below.
+    integer_distance_map = distance_map.each_with_object({}) do |(a, distances), acc_a|
+      acc_a[[a, grapheme_as_utf_8_long(a)]] = distances.each_with_object({}) do |(b, distance), acc_b|
+        acc_b[[b, grapheme_as_utf_8_long(b)]] = distance
+      end
+    end
+    # Then we print out C code full of switches
+    writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
+    float phonetic_cost(long a, long b) {
+      // This is compiled from Ruby, using `String#unpack("U")` on each character
+      // to retrieve the UTF-8 codepoint as a C long value.
+      if (a == b) { return 0.0; };
+    FUNC
+    writer.puts '  switch (a) {'
+    integer_distance_map.each do |(a, a_i), distances|
+      writer.puts "    case #{a_i}: // #{a}"
+      writer.puts '      switch (b) {'
+      distances.each do |(b, b_i), distance|
+        writer.puts "        case #{b_i}: // #{a}->#{b}"
+        writer.puts "          return (float) #{distance};"
+        writer.puts "          break;"
+      end
+      writer.puts '      }'
+    end
+    writer.puts '  }'
+    # Any pair not present in the table costs a full edit.
+    writer.puts '  return 1.0;'
+    writer.puts '}'
+  end
+  private
+  # Computes the raw distance between two phonemes based on their kinds, as
+  # looked up in Symbols: a consonant/vowel pair scores 1.0, two vowels are
+  # delegated to Vowels.distance and two consonants to Consonants.distance.
+  # Returns nil for any other combination of kinds.
+  def _distance(phoneme1, phoneme2)
+    kinds = [Symbols.fetch(phoneme1), Symbols.fetch(phoneme2)].sort
+    case kinds
+    when [:consonant, :vowel] then 1.0
+    when [:vowel, :vowel] then Vowels.distance(phoneme1, phoneme2)
+    when [:consonant, :consonant] then Consonants.distance(phoneme1, phoneme2)
+    end
+  end
 end

data/lib/phonetics/c_levenshtein.bundle ADDED

Binary file

data/lib/phonetics/levenshtein.rb CHANGED

@@ -1,5 +1,4 @@
-require_relative '../phonetics'
+require_relative 'c_levenshtein'
 # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
 # count used instead of a binary edit distance calculation
 #
@@ -11,110 +10,26 @@ require_relative '../phonetics'
 # Aumont, 2016
 # https://hal.archives-ouvertes.fr/hal-01474904/document
 module Phonetics
-  class Levenshtein
-    def initialize(ipa_str1, ipa_str2)
-      @str1 = ipa_str1
-      @str2 = ipa_str2
-      @len1 = ipa_str1.size
-      @len2 = ipa_str2.size
-      prepare_matrix
-      set_edit_distances(ipa_str1, ipa_str2)
-    end
-    def distance
-      return 0 if walk.empty?
-      walk.last[:distance]
-    end
+  module Levenshtein
+    extend ::PhoneticsLevenshteinCBinding
     def self.distance(str1, str2)
-      new(str1, str2).distance
-    end
-    private
-    def walk
-      res = []
-      cell = [@len2, @len1]
-      while cell != [0, 0]
-        cell, char = char_data(cell)
-        res.unshift char
-      end
-      res
-    end
-    def set_edit_distances(str1, str2)
-      (1..@len2).each do |i|
-        (1..@len1).each do |j|
-          no_change(i, j) && next if str2[i - 1] == str1[j - 1]
-          @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
+      ensure_is_phonetic!(str1, str2)
+      internal_phonetic_distance(
+        Phonetics.as_utf_8_long(str1),
+        Phonetics.as_utf_8_long(str2),
+      )
+    end
+    # Raises ArgumentError unless every character of both strings is included
+    # in Phonetics.phonemes.
+    #
+    # The message uses `name` (this module's own name); `self.class.name`
+    # would evaluate to "Module" inside a module-level method.
+    def self.ensure_is_phonetic!(str1, str2)
+      [str1, str2].each do |string|
+        string.chars.each do |char|
+          unless Phonetics.phonemes.include?(char)
+            raise ArgumentError, "#{char.inspect} is not a character in the International Phonetic Alphabet. #{name} only works with IPA-transcribed strings"
+          end
         end
       end
     end
-    def char_data(cell)
-      char = { distance: @matrix[cell[0]][cell[1]] }
-      val = find_previous(cell)
-      previous_value = val[0][0]
-      char[:type] = previous_value == char[:distance] ? :same : val[1]
-      cell = val.pop
-      [cell, char]
-    end
-    def find_previous(cell)
-      candidates = [
-        [
-          [ins(*cell), 1],
-          :ins,
-          [cell[0], cell[1] - 1],
-        ],
-        [
-          [del(*cell), 2],
-          :del,
-          [cell[0] - 1, cell[1]],
-        ],
-        [
-          [subst(*cell), 0],
-          :subst,
-          [cell[0] - 1, cell[1] - 1],
-        ],
-      ]
-      select_cell(candidates)
-    end
-    def select_cell(candidates)
-      candidates.select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
-        sort_by(&:first).first
-    end
-    # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
-    def del(i, j)
-      @matrix[i - 1][j]
-    end
-    def ins(i, j)
-      @matrix[i][j - 1]
-    end
-    # This is where we implement the modifications to Damerau-Levenshtein according to
-    # https://hal.archives-ouvertes.fr/hal-01474904/document
-    def subst(i, j)
-      map = Phonetics.distance_map[@str1[j]]
-      score = map[@str2[i]] if map
-      score ||= 1.0
-      @matrix[i - 1][j - 1] + score
-    end
-    def no_change(i, j)
-      @matrix[i][j] = @matrix[i - 1][j - 1]
-    end
-    def prepare_matrix
-      @matrix = []
-      @matrix << (0..@len1).to_a
-      @len2.times do |i|
-        ary = [i + 1] + (1..@len1).map { nil }
-        @matrix << ary
-      end
-    end
   end
 end

data/lib/phonetics/ruby_levenshtein.rb ADDED

@@ -0,0 +1,171 @@
+require_relative '../phonetics'
+# Using the Damerau version of the Levenshtein algorithm, with phonetic feature
+# count used instead of a binary edit distance calculation
+#
+# This implementation is almost entirely taken from the damerau-levenshtein gem
+# (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
+# The implementation is modified based on "Using Phonologically Weighted
+# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
+# Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
+# Aumont, 2016
+# https://hal.archives-ouvertes.fr/hal-01474904/document
+module Phonetics
+  # Pure-Ruby implementation of the phonetically weighted edit distance
+  # between two IPA-transcribed strings. The full cost matrix is built in the
+  # constructor; #distance reads the result out of it.
+  class RubyLevenshtein
+    attr_reader :str1, :str2, :len1, :len2, :matrix
+    # ipa_str1, ipa_str2 - the strings to compare; every character must be
+    #                      included in Phonetics.phonemes.
+    # verbose            - when truthy, the matrix is printed to STDOUT while
+    #                      it is being filled in and when #distance is called.
+    #
+    # Raises ArgumentError if either string contains a non-IPA character.
+    def initialize(ipa_str1, ipa_str2, verbose = false)
+      @str1 = ipa_str1
+      @str2 = ipa_str2
+      @len1 = ipa_str1.size
+      @len2 = ipa_str2.size
+      @verbose = verbose
+      ensure_is_phonetic!
+      prepare_matrix
+      set_edit_distances(ipa_str1, ipa_str2)
+    end
+    # Returns the accumulated cost stored in the bottom-right cell of the
+    # matrix, or 0 when both strings are empty.
+    def distance
+      steps = walk
+      return 0 if steps.empty?
+      print_matrix if @verbose
+      steps.last[:distance]
+    end
+    # Convenience wrapper: builds an instance and returns its #distance.
+    def self.distance(str1, str2)
+      new(str1, str2).distance
+    end
+    private
+    # Raises ArgumentError on the first character of either string that is
+    # not included in Phonetics.phonemes.
+    def ensure_is_phonetic!
+      [str1, str2].each do |string|
+        string.chars.each do |char|
+          unless Phonetics.phonemes.include?(char)
+            raise ArgumentError, "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
+          end
+        end
+      end
+    end
+    # Traces back from the bottom-right cell, collecting one
+    # { distance:, type: } hash per visited cell. Steps are prepended, so the
+    # last element always describes the bottom-right cell. The trace stops as
+    # soon as either index reaches zero.
+    def walk
+      res = []
+      i = len2
+      j = len1
+      return res if i.zero? && j.zero?
+      loop do
+        i, j, char = char_data(i, j)
+        res.unshift char
+        break unless i > 0 && j > 0
+      end
+      res
+    end
+    # Fills every cell outside the first row and column with the cheapest
+    # neighbouring cost plus the phonetic distance between the two phonemes
+    # that meet at that cell.
+    def set_edit_distances(str1, str2)
+      (1..len2).each do |i|
+        (1..len1).each do |j|
+          options = [
+            ins(i, j),
+            del(i, j),
+            subst(i, j),
+          ]
+          # This is where we implement the modifications to Damerau-Levenshtein
+          # according to https://hal.archives-ouvertes.fr/hal-01474904/document
+          phonetic_cost = Phonetics.distance(str1[j - 1], str2[i - 1])
+          matrix[i][j] = options.min + phonetic_cost
+          puts "------- #{j}/#{i} #{j + (i*(len1+1))}" if @verbose
+          print_matrix if @verbose
+        end
+      end
+    end
+    # Describes cell (i, j) and returns [next_i, next_j, description], where
+    # the next cell is the cheapest in-bounds predecessor. The step is typed
+    # :same when the predecessor's cost equals this cell's cost.
+    def char_data(i, j)
+      char = { distance: matrix[i][j] }
+      operation, move = find_previous(i, j)
+      char[:type] = move[:cost] == char[:distance] ? :same : operation
+      i, j = move[:move_to]
+      [i, j, char]
+    end
+    # Returns [operation, { cost:, move_to: }] for the cheapest predecessor of
+    # cell (i, j). Each operation moves to the cell its cost was read from:
+    # insert -> left, delete -> up, substitute -> diagonal. On equal cost the
+    # earlier entry in the list wins.
+    def find_previous(i, j)
+      candidates = [
+        [:insert, { cost: ins(i, j), move_to: [i, j - 1] }],
+        [:delete, { cost: del(i, j), move_to: [i - 1, j] }],
+        [:substitute, { cost: subst(i, j), move_to: [i - 1, j - 1] }],
+      ]
+      # Don't send us out of bounds
+      in_bounds = candidates.select { |_operation, data| data[:move_to].none?(&:negative?) }
+      # pick the cheapest one
+      in_bounds.min_by { |_operation, data| data[:cost] }
+    end
+    # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
+    # Cost stored in the cell directly above (i, j).
+    def del(i, j)
+      matrix[i - 1][j]
+    end
+    # Cost stored in the cell directly left of (i, j).
+    def ins(i, j)
+      matrix[i][j - 1]
+    end
+    # Cost stored in the cell diagonally up-left of (i, j).
+    def subst(i, j)
+      matrix[i - 1][j - 1]
+    end
+    # Set the minimum scores equal to the distance between each phoneme,
+    # sequentially.
+    #
+    # The first value is always zero.
+    # The second value is always the phonetic distance between the first
+    # phonemes of each string (zero if either string is empty).
+    # Subsequent values are the cumulative phonetic distance between each
+    # phoneme within the same string.
+    # "aek" -> [0, 1, 1.61, 2.61]
+    #
+    # Returns [distances_for_str1, distances_for_str2].
+    def initial_distances(str1, str2)
+      starting_distance =
+        if len1 == 0 || len2 == 0
+          0
+        else
+          Phonetics.distance(str1[0], str2[0])
+        end
+      distances1 = (1..(str1.length - 1)).reduce([0, starting_distance]) do |acc, i|
+        acc << acc.last + Phonetics.distance(str1[i - 1], str1[i])
+      end
+      distances2 = (1..(str2.length - 1)).reduce([0, starting_distance]) do |acc, i|
+        acc << acc.last + Phonetics.distance(str2[i - 1], str2[i])
+      end
+      [distances1, distances2]
+    end
+    # Allocates the (len2 + 1) x (len1 + 1) matrix and seeds its first row and
+    # first column from initial_distances.
+    def prepare_matrix
+      str1_initial, str2_initial = initial_distances(str1, str2)
+      @matrix = Array.new(len2 + 1) { Array.new(len1 + 1) { nil } }
+      # The first row holds the initial values computed from str1
+      @matrix[0] = str1_initial
+      # The first column holds the initial values computed from str2
+      (len2 + 1).times { |n| @matrix[n][0] = str2_initial[n] }
+    end
+    # This is a helper method for developers to use when exploring this
+    # algorithm. Prints the matrix to STDOUT, with str1 across the top and
+    # str2 down the side; unset cells are shown as zero.
+    def print_matrix
+      puts "           #{str1.chars.map {|c| c.ljust(9, " ") }.join}"
+      matrix.each_with_index do |row, ridx|
+        print '  ' if ridx == 0
+        print "#{str2[ridx - 1]} " if ridx > 0
+        row.each_with_index do |cell, cidx|
+          cell ||= 0.0
+          print cell.to_s[0, 8].ljust(8, '0')
+          print ' '
+        end
+        puts ''
+      end
+      ''
+    end
+  end
+end

data/phonetics.gemspec CHANGED

@@ -16,8 +16,11 @@ Gem::Specification.new do |spec|
   end
   spec.require_paths = ["lib"]
+  spec.add_development_dependency "pry-byebug"
+  spec.add_development_dependency "rake-compiler", "~> 1.0"
+  spec.add_development_dependency "rubocop", "~> 0.52"
+  spec.add_development_dependency "ruby-prof", "~> 0.17"
   spec.add_development_dependency 'bundler', '~> 1.16'
   spec.add_development_dependency 'rake'
-  spec.add_development_dependency "pry-byebug"
   spec.add_development_dependency 'rspec', '~> 3.0'
 end

metadata CHANGED

@@ -1,45 +1,87 @@
 --- !ruby/object:Gem::Specification
 name: phonetics
 version: !ruby/object:Gem::Version
-  version: 1.1.1
+  version: 1.5.0
 platform: ruby
 authors:
 - Jack Danger
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-04-13 00:00:00.000000000 Z
+date: 2019-08-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: bundler
+  name: pry-byebug
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake-compiler
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.16'
+        version: '1.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.16'
+        version: '1.0'
 - !ruby/object:Gem::Dependency
-  name: rake
+  name: rubocop
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.52'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.52'
 - !ruby/object:Gem::Dependency
-  name: pry-byebug
+  name: ruby-prof
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.17'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.17'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.16'
+- !ruby/object:Gem::Dependency
+  name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -73,6 +115,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".github/workflows/gempush.yml"
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
@@ -83,8 +126,14 @@ files:
 - README.md
 - Rakefile
 - VERSION
+- ext/c_levenshtein/extconf.rb
+- ext/c_levenshtein/levenshtein.c
+- ext/c_levenshtein/phonetic_cost.c
+- ext/c_levenshtein/phonetic_cost.h
 - lib/phonetics.rb
+- lib/phonetics/c_levenshtein.bundle
 - lib/phonetics/levenshtein.rb
+- lib/phonetics/ruby_levenshtein.rb
 - lib/phonetics/version.rb
 - phonetics.gemspec
 homepage: https://github.com/JackDanger/phonetics
@@ -106,8 +155,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.6
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: tools for linguistic code using the International Phonetic Alphabet