RubyGems - phonetics - Versions diffs - 1.5.4 → 1.8.0 - Mend

phonetics 1.5.4 → 1.8.0

Files changed (16) hide show

checksums.yaml +4 -4
data/Rakefile +22 -11
data/VERSION +1 -1
data/bin/console +13 -0
data/ext/c_levenshtein/levenshtein.c +104 -76
data/ext/c_levenshtein/next_phoneme_length.c +1364 -0
data/ext/c_levenshtein/next_phoneme_length.h +1 -0
data/ext/c_levenshtein/phonemes.c +33 -0
data/ext/c_levenshtein/phonemes.h +2 -0
data/ext/c_levenshtein/phonetic_cost.c +134245 -42305
data/ext/c_levenshtein/phonetic_cost.h +1 -1
data/lib/phonetics.rb +2 -90
data/lib/phonetics/code_generator.rb +285 -0
data/lib/phonetics/levenshtein.rb +12 -21
data/lib/phonetics/ruby_levenshtein.rb +5 -14
metadata +8 -2

data/ext/c_levenshtein/phonetic_cost.h CHANGED

	@@ -1 +1 @@
1	- float phonetic_cost(~~long~~, ~~long~~);
1	+ float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length);

data/lib/phonetics.rb CHANGED

@@ -112,22 +112,6 @@ module Phonetics
   module Consonants
     extend self
-    # Plosives and fricatives are less similar than trills and flaps, or
-    # sibilant fricatives and non-sibilant fricatives
-    # TODO: this is unfinished and possibly a bad idea
-    MannerDistances = {
-                       'Nasal' => %w[continuant],
-                        'Stop' => %w[],
-          'Sibilant fricative' => %w[continuant fricative],
-      'Non-sibilant fricative' => %w[continuant non_sibilant fricative],
-                 'Approximant' => %w[],
-                    'Tap/Flap' => %w[],
-                       'Trill' => %w[],
-           'Lateral fricative' => %w[continuant fricative],
-         'Lateral approximant' => %w[],
-            'Lateral tap/flap' => %w[],
-    }.freeze
     # This chart (columns 2 through the end, anyway) is a direct port of
     # https://en.wikipedia.org/wiki/International_Phonetic_Alphabet#Letters
     # We store the consonant table in this format to make updating it easier.
@@ -218,7 +202,7 @@ module Phonetics
   end
   def phonemes
-    Consonants.phonemes + Vowels.phonemes
+    Vowels.phonemes + Consonants.phonemes
   end
   Symbols = Consonants.phonemes.reduce({}) { |acc, p| acc.update p => :consonant }.merge(
@@ -232,9 +216,7 @@ module Phonetics
   end
   def distance_map
-    @distance_map ||= (
-      Vowels.phonemes + Consonants.phonemes
-    ).permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
+    @distance_map ||= phonemes.permutation(2).each_with_object(Hash.new { |h, k| h[k] = {} }) do |pair, scores|
       p1, p2 = *pair
       score = _distance(p1, p2)
       scores[p1][p2] = score
@@ -242,76 +224,6 @@ module Phonetics
     end
   end
-  # as_utf_8_long("aɰ̊ h")
-  # => [97, 8404, 32, 104]
-  def as_utf_8_long(string)
-    string.each_grapheme_cluster.map { |grapheme| grapheme_as_utf_8_long(grapheme) }
-  end
-  # Encode individual multi-byte strings as a single integer.
-  #
-  # "ɰ̊".unpack('U*')
-  # => [624, 778]
-  #
-  # grapheme_as_utf_8_long("ɰ̊")
-  # => 1413 (624 + (10 * 778))
-  def grapheme_as_utf_8_long(grapheme)
-    grapheme.unpack('U*').each_with_index.reduce(0) do |total, (byte, i)|
-      total + (10**i) * byte
-    end
-  end
-  # This will print a C code file with a function that implements a two-level C
-  # switch like the following:
-  #
-  #    switch (a) {
-  #      case 100: // 'd'
-  #        switch (b) {
-  #          case 618: // 'ɪ'
-  #            return (float) 0.73827;
-  #            break;
-  #        }
-  #    }
-  #
-  def generate_phonetic_cost_c_code(writer = STDOUT)
-    # First, flatten the bytes of the runes (unicode codepoints encoded via
-    # UTF-8) into single integers. We do this by adding the utf-8 values, each
-    # multiplied by 10 * their byte number. The specific encoding doesn't
-    # matter so long as it's:
-    #   * consistent
-    #   * has no collisions
-    #   * produces a value that's a valid C case conditional
-    #   * can be applied to runes of input strings later
-    integer_distance_map = distance_map.reduce({}) do |acc_a, (a, distances)|
-      acc_a.update [a, grapheme_as_utf_8_long(a)] => (distances.reduce({}) do |acc_b, (b, distance)|
-        acc_b.update [b, grapheme_as_utf_8_long(b)] => distance
-      end)
-    end
-    # Then we print out C code full of switches
-    writer.puts(<<-FUNC.gsub(/^ {4}/, ''))
-    float phonetic_cost(int a, int b) {
-      // This is compiled from Ruby, using `String#unpack("U")` on each character
-      // to retrieve the UTF-8 codepoint as a C long value.
-      if (a == b) { return 0.0; };
-    FUNC
-    writer.puts '  switch (a) {'
-    integer_distance_map.each do |(a, a_i), distances|
-      writer.puts "    case #{a_i}: // #{a}"
-      writer.puts '      switch (b) {'
-      distances.each do |(b, b_i), distance|
-        writer.puts "        case #{b_i}: // #{a}->#{b}"
-        writer.puts "          return (float) #{distance};"
-        writer.puts '          break;'
-      end
-      writer.puts '      }'
-    end
-    writer.puts '  }'
-    writer.puts '  return 1.0;'
-    writer.puts '}'
-  end
   private
   def _distance(phoneme1, phoneme2)

data/lib/phonetics/code_generator.rb ADDED

@@ -0,0 +1,285 @@
+# frozen_string_literal: true
+require_relative '../phonetics'
+require 'json'
+module Phonetics
+  class CodeGenerator
+    attr_reader :writer
+    def initialize(writer = STDOUT)
+      @writer = writer
+    end
+    def generate_phonetic_cost_c_code
+      generator = PhoneticCost.new(writer)
+      generator.generate
+      writer.flush
+    end
+    def generate_next_phoneme_length_c_code
+      generator = NextPhonemeLength.new(writer)
+      generator.generate
+      writer.flush
+    end
+    private
+    # Turn the bytes of all phonemes into a lookup trie where a sequence of
+    # bytes can find a phoneme in linear time.
+    def phoneme_byte_trie
+      phoneme_byte_trie_for(Phonetics.phonemes)
+    end
+    def phoneme_byte_trie_for(phonemes)
+      phonemes.each_with_object({}) do |phoneme, trie|
+        phoneme.bytes.each_with_index.reduce(trie) do |subtrie, (byte, idx)|
+          subtrie[byte] ||= {}
+          # If we've reached the end of the byte string
+          if phoneme.bytes.length - 1 == idx
+            # Check if this is a duplicate lookup path. If there's a collision
+            # then this whole approach makes no sense.
+            if subtrie[byte].key?(:source)
+              source = subtrie[byte][:source]
+              raise "Duplicate byte sequence on #{phoneme.inspect} & #{source.inspect} (#{phoneme.bytes.inspect})"
+            else
+              subtrie[byte][:source] = phoneme
+            end
+          end
+          subtrie[byte]
+        end
+      end
+    end
+    def ruby_source
+      location = caller_locations.first
+      "#{location.path.split('/')[-4..-1].join('/')}:#{location.lineno}"
+    end
+    def describe(phoneme, depth)
+      indent depth, "// Phoneme: #{phoneme.inspect}, bytes: #{phoneme.bytes.inspect}"
+      if Phonetics::Consonants.features.key?(phoneme)
+        indent depth, "// consonant features: #{Phonetics::Consonants.features[phoneme].to_json}"
+      else
+        indent depth, "// vowel features: #{Phonetics::Vowels::FormantFrequencies[phoneme].to_json}"
+      end
+    end
+    def indent(depth, line)
+      write "    #{'  ' * depth}#{line}"
+    end
+    def write(line)
+      writer.puts line
+    end
+  end
+  class PhoneticCost < CodeGenerator
+    # We find the phonetic distance between two phonemes using a compiled
+    # lookup table. This is implemented as a set of nested switch statements.
+    # Hard to read when compiled, but simple to generate and fast at runtime.
+    #
+    # We generate a `phonetic_cost` function that takes four arguments: Two
+    # strings, and the lengths of those strings. Each string should be exactly
+    # one valid phoneme, which is possible thanks to the (also generated)
+    # next_phoneme_length() function.
+    #
+    # This will print a C code file with a function that implements a multil-level C
+    # switch like the following:
+    #
+    #    switch (phoneme1_length) {
+    #      case 2:
+    #        switch(string1[1]) {
+    #          case 201: // first byte of "ɪ"
+    #            switch(string1[3]) {
+    #              case 170: // second and final byte of "ɪ"
+    #                // Phoneme: "ɪ", bytes: [201, 170]
+    #                // vowel features: {"F1":300,"F2":2100,"rounded":false}
+    #                switch(string2[6]) {
+    #                  case 105: // first and only byte of "i"
+    #                    // Phoneme: "i", bytes: [105]
+    #                    // vowel features: {"F1":240,"F2":2400,"rounded":false}
+    #                    return (float) 0.14355381904337383;
+    #                    break;
+    #
+    #  the distance of ("ɪ", "i")2 is therefore 0.14355
+    #
+    def generate
+      write(<<-HEADER.gsub(/^ {6}/, ''))
+      // This is compiled from Ruby, in #{ruby_source}
+      #include <stdbool.h>
+      #include <stdio.h>
+      #include "./phonemes.h"
+      float phonetic_cost(int *string1, int string1_offset, int phoneme1_length, int *string2, int string2_offset, int phoneme2_length) {
+      HEADER
+      write '  switch (phoneme1_length) {'
+      by_byte_length.each do |length, phonemes|
+        write "    case #{length}:"
+        switch_phoneme1(phoneme_byte_trie_for(phonemes), 0)
+        write '    break;'
+      end
+      write '  }'
+      write '  return (float) 1.0;'
+      write '};'
+      write ''
+    end
+    def switch_phoneme1(trie, depth = 0)
+      indent depth, "switch(string1[string1_offset + #{depth}]) {"
+      trie.each do |key, subtrie|
+        next if key == :source
+        next if subtrie.empty?
+        indent depth + 1, "case #{key}:"
+        phoneme1 = subtrie[:source]
+        # If this could be a match of a phoneme1 then find phoneme2
+        if phoneme1
+          # Add a comment to help understand the dataset
+          describe(phoneme1, depth + 2) if phoneme1
+          by_byte_length.each do |_, phonemes|
+            byte_trie = phoneme_byte_trie_for(phonemes)
+            next if byte_trie.empty?
+            switch_phoneme2(byte_trie, phoneme1, 0)
+          end
+        else
+          switch_phoneme1(subtrie, depth + 1)
+        end
+        indent depth + 2, 'break;'
+      end
+      indent depth, '}'
+    end
+    def switch_phoneme2(trie, previous_phoneme, depth = 0)
+      indent depth, "switch(string2[string2_offset + #{depth}]) {"
+      trie.each do |key, subtrie|
+        next if key == :source
+        next if subtrie.empty?
+        phoneme2 = subtrie[:source]
+        indent depth + 1, "case #{key}:"
+        if phoneme2
+          value = if previous_phoneme == phoneme2
+                    0.0
+                  else
+                    distance(previous_phoneme, phoneme2)
+                  end
+          # Add a comment to help understand the dataset
+          describe(phoneme2, depth + 2)
+          indent depth + 2, "return (float) #{value};"
+        else
+          switch_phoneme2(subtrie, previous_phoneme, depth + 1)
+        end
+        indent depth + 2, 'break;'
+      end
+      indent depth, '}'
+    end
+    def by_byte_length
+      Phonetics.phonemes.group_by do |phoneme|
+        phoneme.bytes.length
+      end.sort_by(&:first)
+    end
+    def distance(p1, p2)
+      Phonetics.distance_map[p1][p2]
+    end
+  end
+  class NextPhonemeLength < CodeGenerator
+    # There's no simple way to break a string of IPA characters into phonemes.
+    # We do it by generating a function that, given a string of IPA characters,
+    # the starting index in that string, and the length of the string, returns
+    # the length of the next phoneme, or zero if none is found.
+    #
+    # Pseudocode:
+    #   - return 0 if length - index == 0
+    #   - switch on first byte, matching on possible first bytes of phonemes
+    #     within the selected case statement:
+    #     - return 1 if length - index == 1
+    #     - switch on second byte, matching on possible second bytes of phonemes
+    #       within the selected case statement:
+    #       - return 2 if length - index == 1
+    #       ...
+    #       - default case: return 2 iff a phoneme terminates here
+    #     - default case: return 1 iff a phoneme terminates here
+    #   - return 0
+    #
+    def generate
+      write(<<-HEADER.gsub(/^ {6}/, ''))
+      // This is compiled from Ruby, in #{ruby_source}
+      int next_phoneme_length(int *string, int cursor, int length) {
+        int max_length;
+        max_length = length - cursor;
+      HEADER
+      next_phoneme_switch(phoneme_byte_trie, 0)
+      # If we fell through all the cases, return 0
+      write '  return 0;'
+      write '}'
+    end
+    private
+    # Recursively build switch statements for the body of next_phoneme_length
+    def next_phoneme_switch(trie, depth)
+      # switch (string[cursor + depth]) {
+      #   case N: // for N in subtrie.keys
+      #     // if a case statement matches the current byte AND there's chance
+      #     // that a longer string might match, recurse.
+      #     if (max_length >= depth) {
+      #       // recurse
+      #     }
+      #     break;
+      #   // if there's a :source key here then a phoneme terminates at this
+      #   // point and this depth is a valid return value.
+      #   default:
+      #     return depth;
+      #     break;
+      # }
+      indent depth, "switch(string[cursor + #{depth}]) {"
+      write ''
+      trie.each do |key, subtrie|
+        next if key == :source
+        next if subtrie.empty?
+        indent depth, "case #{key}:"
+        # Add a comment to help understand the dataset
+        describe(subtrie[:source], depth + 1) if subtrie[:source]
+        if subtrie.keys == [:source]
+          indent depth, " return #{depth + 1};"
+        else
+          indent depth, " if (max_length > #{depth + 1}) {"
+          next_phoneme_switch(subtrie, depth + 1)
+          indent depth, ' } else {'
+          indent depth, "   return #{depth + 1};"
+          indent depth, ' }'
+        end
+        indent depth, '    break;'
+      end
+      if trie.key?(:source)
+        indent depth, '  default:'
+        indent depth, "    return #{depth};"
+      end
+      indent depth, '}'
+    end
+  end
+end

data/lib/phonetics/levenshtein.rb CHANGED

@@ -1,38 +1,29 @@
 # frozen_string_literal: true
+require_relative '../phonetics'
 require_relative 'c_levenshtein'
 # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
 # count used instead of a binary edit distance calculation
 #
-# This implementation is almost entirely taken from the damerau-levenshtein gem
+# This implementation was dually inspired by the damerau-levenshtein gem
 # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
-# The implementation is modified based on "Using Phonologically Weighted
-# Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
-# Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
-# Aumont, 2016
+# and "Using Phonologically Weighted Levenshtein Distances for the Prediction
+# of Microscopic Intelligibility" by Lionel Fontan, Isabelle Ferrané, Jérôme
+# Farinas, Julien Pinquier, Xavier Aumont, 2016
 # https://hal.archives-ouvertes.fr/hal-01474904/document
 module Phonetics
   module Levenshtein
     extend ::PhoneticsLevenshteinCBinding
-    def self.distance(str1, str2, verbose = false)
-      ensure_is_phonetic!(str1, str2)
-      internal_phonetic_distance(
-        Phonetics.as_utf_8_long(str1),
-        Phonetics.as_utf_8_long(str2),
-        verbose
-      )
+    def inspect_bytes(str)
+      puts "Rubyland str: #{str.inspect}"
+      puts "Rubyland bytes: #{str.bytes.inspect}"
+      testing_codepoints(str)
     end
-    def self.ensure_is_phonetic!(str1, str2)
-      [str1, str2].each do |string|
-        string.chars.each do |char|
-          unless Phonetics.phonemes.include?(char)
-            msg = "#{char.inspect} is not a character in the International Phonetic Alphabet. #{self.class.name} only works with IPA-transcribed strings"
-            raise ArgumentError, msg
-          end
-        end
-      end
+    def self.distance(str1, str2, verbose = false)
+      internal_phonetic_distance(str1, str2, verbose)
     end
   end
 end