RubyGems - levenshtein_comparator - Versions diffs - 1.0.0 - Mend

levenshtein_comparator 1.0.0

Files changed (3) hide show

checksums.yaml +7 -0
data/lib/levenshtein_comparator.rb +159 -0
metadata +45 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: b9eb28e2308b2dfb0db9dc0fcff05fbed95d58e3
+  data.tar.gz: 3c767a18435998d0ee3337e9db42bcf6ab893d4c
+SHA512:
+  metadata.gz: 75b98f99eaf86a91a5a75fca343c9012254f04f3be0088933344071642796b9b5f0634c3b9d1c1bc7a485d6969d68cdfc8527deae15b4a24dce55281afe19f25
+  data.tar.gz: 25ab135e9ede8d6df2a6459e088896f8887f779f2b28e4c90ec2bbb1e83311e7ada4c04761341e1b602e49c59c90383e94f74b65dc5c490171257eb34ea1c402

data/lib/levenshtein_comparator.rb ADDED Viewed

@@ -0,0 +1,159 @@
+# coding: utf-8
+require 'levenshtein'
+require 'htmlentities'
+class LevenshteinComparator
+  attr_accessor :cleanified_strings
+  STOP_WORDS = [
+    "un",
+    "une",
+    "the",
+    "le",
+    "la",
+    "les",
+    "a",
+    "an",
+    "of",
+    "du",
+    "de",
+    "des",
+    "et",
+    "and",
+    "ne",
+    "en",
+    "au"
+  ]
+  ASCII_REGEXP_MAPPING = {
+    /[ÄÀÁÂÃÅĀĄĂ]/ => 'A',
+    /[âäàãáäåāăąǎǟǡǻȁȃȧẵặ]/ => 'a',
+    /[Æ]/ => 'Ae',
+    /[æ]/ => 'ae',
+    /[ÇĆČĈĊ]/ => 'C',
+    /[çćčĉċ]/ => 'c',
+    /[ĎĐ]/ => 'D',
+    /[ďđ]/ => 'd',
+    /[ÈÉÊËĒĘĚĔĖ]/ =>'E',
+    /[ëêéèẽēĕėẻȅȇẹȩęḙḛềếễểḕḗệḝ]/ => 'e',
+    /[ƒ]/ => 'f',
+    /[ĜĞĠĢ]/ => 'G',
+    /[ĝğġģ]/ => 'g',
+    /[ĤĦ]/ => 'H',
+    /[ĥħ]/ => 'h',
+    /[ÌÍÎÏĪĨĬĮİ]/ => 'I',
+    /[ìíîĩīĭïỉǐịįȉȋḭɨḯ]/ => 'i',
+    /[Ĳ]/ => 'IJ',
+    /[Ĵ]/ => 'J',
+    /[ĵ]/ => 'j',
+    /[Ķ]/ => 'K',
+    /[ķĸ]/ => 'k',
+    /[ŁĽĹĻĿ]/ => 'L',
+    /[łľĺļŀ]/ => 'l',
+    /[ÑŃŇŅŊ]/ => 'N',
+    /[ñńňņŉŋ]/ => 'n',
+    /[ÒÓÔÕØŌŐŎÖ]/ => 'O',
+    /[òóôõōŏȯöỏőǒȍȏơǫọɵøồốỗổȱȫȭṍṏṑṓờớỡởợǭộǿ]/ => 'o',
+    /[Œ]/ => 'OE',
+    /[œ]/ => 'oe',
+    /[ŔŘŖ]/ =>'R',
+    /[ŕřŗ]/ =>'r',
+    /[ŚŠŞŜȘ]/ => 'S',
+    /[śšşŝș]/ => 's',
+    /[ß]/ => 'ss',
+    /[ŤŢŦȚ]/ => 'T',
+    /[ťţŧț]/ => 't',
+    /[ÜÙÚÛŪŮŰŬŨŲ]/ => 'U',
+    /[ùúûũūŭüủůűǔȕȗưụṳųṷṵṹṻǖǜǘǖǚừứữửự]/ => 'u',
+    /[Ŵ]/ => 'W',
+    /[ŵ]/ => 'w',
+    /[ỳýŷỹȳẏÿỷẙƴỵ]/ => 'y',
+    /[ŹŽŻ]/ =>'Z',
+    /[žżź]/ =>'z'
+  }
+  def initialize(s)
+    self.cleanified_strings = self.class.to_array(s)
+  end
+  def self.remove_parenthesis(s)
+    res = s.gsub(/([\(\[].*[\)\]])/, '')
+    res.strip
+  end
+  def self.remove_featuring(s)
+    res = s.gsub(/([fF]eat(\.|uring) .*)/, '')
+    res.strip
+  end
+  def self.unaccent!(s)
+    ASCII_REGEXP_MAPPING.each do |key, value|
+      s.gsub! key, value
+    end
+    s
+  end
+  def self.unaccent(s)
+    self.unaccent!(s.dup)
+  end
+  def self.decode_html_entities(s)
+    HTMLEntities.new.decode(s)
+  end
+  def self.remove_stop_words(a)
+    a - STOP_WORDS
+  end
+  # Cut the string into an array of words
+  # Two words separated by a dash (-) should be considered as :
+  # 1 word if the first or the second word is only 1 character
+  # 2 words otherwise
+  def self.to_array(s)
+    s = self.clean(s)
+    arr = s.gsub(/\b(\w{2,})-(\w{2,})\b/, '\1 \2').split.map do |w|
+      w.gsub(/[^A-Za-z0-9]/, '').downcase
+    end.delete_if do |w|
+      w.length < 2 && w !~ /\d/
+    end
+    self.remove_stop_words(arr)
+  end
+  def self.clean(s)
+    self.unaccent(
+      self.remove_featuring(
+        self.remove_parenthesis(
+          self.decode_html_entities(s)
+        )
+      )
+    )
+  end
+  def compare(pattern)
+    pattern = self.class.to_array(pattern)
+    size = cleanified_strings.size
+    cleanified_strings.delete_if do |word|
+      matched_word = pattern.find do |guess|
+        if word =~ /\d+/
+          guess == word
+        else
+          if guess.length > 4 and word.length > 4
+            Levenshtein.distance(guess, word) <= 2
+          elsif guess.length > 2 and word.length > 2
+            Levenshtein.distance(guess, word) <= 1
+          else
+            guess == word
+          end
+        end
+      end
+      # only deleting one of the words
+      pattern.delete_at(pattern.index(matched_word)) if matched_word
+    end
+    size != cleanified_strings.size ? cleanified_strings.size == 0 ? :ok : :almost : :ko
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,45 @@
+--- !ruby/object:Gem::Specification
+name: levenshtein_comparator
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- Stéphane Akkaoui
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2017-11-03 00:00:00.000000000 Z
+dependencies: []
+description: Levenshtein Comparator allows you to compare two sentences and say if
+  there is a match, almost a match or nothing to compare.
+email: sakkaoui@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/levenshtein_comparator.rb
+homepage: https://github.com/meuble/levenshtein_comparator
+licenses:
+- WTFPL
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.5.1
+signing_key:
+specification_version: 4
+summary: A string comparator using Damerau-Levenshtein distance
+test_files: []