RubyGems - family-reunion - Versions diffs - 0.1.1 - Mend

family-reunion 0.1.1

Files changed (39) hide show

data/.document +5 -0
data/.rspec +1 -0
data/.rvmrc +1 -0
data/Gemfile +27 -0
data/Gemfile.lock +82 -0
data/LICENSE.txt +20 -0
data/README.rdoc +19 -0
data/Rakefile +52 -0
data/VERSION +1 -0
data/family-reunion.gemspec +108 -0
data/features/family-reunion.feature +9 -0
data/features/step_definitions/family-reunion_steps.rb +0 -0
data/features/support/env.rb +13 -0
data/lib/family-reunion.rb +49 -0
data/lib/family-reunion/cache.rb +13 -0
data/lib/family-reunion/exact_matcher.rb +72 -0
data/lib/family-reunion/fuzzy_matcher.rb +93 -0
data/lib/family-reunion/matcher_helper.rb +22 -0
data/lib/family-reunion/nomatch_organizer.rb +57 -0
data/lib/family-reunion/taxamatch_preprocessor.rb +103 -0
data/lib/family-reunion/taxamatch_wrapper.rb +54 -0
data/lib/family-reunion/top_node.rb +102 -0
data/scripts/dwca2fr.rb +84 -0
data/spec/family-reunion_spec.rb +20 -0
data/spec/fixtures/ants_primary.json +1 -0
data/spec/fixtures/ants_secondary.json +1 -0
data/spec/fixtures/matched_merges.json +1 -0
data/spec/fixtures/nodes_to_match.json +1 -0
data/spec/fixtures/synonyms_strings_primary.json +1 -0
data/spec/fixtures/synonyms_strings_secondary.json +1 -0
data/spec/fixtures/valid_names_strings_primary.json +1 -0
data/spec/fixtures/valid_names_strings_secondary.json +1 -0
data/spec/fuzzy_matcher_spec.rb +32 -0
data/spec/node_spec.rb +26 -0
data/spec/nomatch_organizer_spec.rb +23 -0
data/spec/spec_helper.rb +29 -0
data/spec/taxamatch_preprocessor_spec.rb +49 -0
data/spec/taxamatch_wrapper_spec.rb +21 -0
metadata +256 -0

data/lib/family-reunion/fuzzy_matcher.rb ADDED Viewed

@@ -0,0 +1,93 @@
+class FamilyReunion
+  class FuzzyMatcher
+    include MatcherHelper
+    def initialize(family_reunion)
+      @fr = family_reunion
+      @tw = FamilyReunion::TaxamatchWrapper.new
+    end
+    def merge
+      add_matches(get_valid_matches, :fuzzy_valid_to_valid)
+      add_matches(get_valid_to_synonym_matches, :fuzzy_valid_to_synonym)
+      add_matches(get_synonym_to_valid_matches, :fuzzy_synonym_to_valid)
+      add_matches(get_synonym_to_synonym_matches, :fuzzy_synonym_to_synonym)
+    end
+    def get_valid_matches
+      primary_names = @fr.primary_valid_names_set - @fr.secondary_valid_names_set
+      secondary_names = @fr.secondary_valid_names_set - @fr.primary_valid_names_set
+      make_match(primary_names, secondary_names, :valid_name, :valid_name)
+    end
+    def get_valid_to_synonym_matches
+      primary_names = @fr.primary_valid_names_set - @fr.secondary_synonyms_set
+      secondary_names = @fr.secondary_synonyms_set - @fr.primary_valid_names_set
+      make_match(primary_names, secondary_names, :valid_name, :synonym)
+    end
+    def get_synonym_to_valid_matches
+      primary_names = @fr.primary_synonyms_set - @fr.secondary_valid_names_set
+      secondary_names = @fr.secondary_valid_names_set - @fr.primary_synonyms_set
+      make_match(primary_names, secondary_names, :synonym, :valid_name)
+    end
+    def get_synonym_to_synonym_matches
+      primary_names = @fr.primary_synonyms_set - @fr.secondary_synonyms_set
+      secondary_names = @fr.secondary_synonyms_set - @fr.primary_synonyms_set
+      make_match(primary_names, secondary_names, :synonym, :synonym)
+    end
+    private
+    def add_matches(matched_nodes, match_type)
+      matched_nodes.each do |primary_node, secondary_nodes|
+        primary_id = primary_node[:id]
+        secondary_ids = secondary_nodes.map { |n| n[:id] }
+        secondary_id_matches = format_secondary_id_for_merge(secondary_ids, match_type)
+        add_record_to_merges(primary_id, secondary_id_matches)
+      end
+    end
+    def make_match(primary_names, secondary_names, primary_name_type, secondary_name_type)
+      canonical_matches = @tw.match_canonicals_lists(primary_names, secondary_names)
+      match_nodes_candidates = get_nodes_from_canonicals(canonical_matches, primary_name_type, secondary_name_type)
+      @tw.match_nodes(match_nodes_candidates)
+    end
+    def get_nodes_from_canonicals(canonical_matches, primary_name_type, secondary_name_type)
+      res = []
+      canonical_matches.each do |primary_name, secondary_names|
+        primary_nodes = self.send("get_#{primary_name_type}_node", @fr.primary_node, primary_name)
+        secondary_nodes = secondary_names.map do |secondary_name|
+          self.send("get_#{secondary_name_type}_node", @fr.secondary_node, secondary_name)
+        end
+        append_nodes(res, primary_nodes, secondary_nodes)
+      end
+      res
+    end
+    def append_nodes(nodes, primary_nodes, secondary_nodes)
+      secondary_nodes = secondary_nodes.flatten.uniq
+      primary_nodes.each do |primary_node|
+        nodes << [primary_node, secondary_nodes]
+      end
+    end
+    def get_valid_name_node(top_node, name)
+      node = top_node.valid_names_hash[name]
+      node.merge!({ :name_to_match => node[:valid_name][:name] })
+      [node]
+    end
+    def get_synonym_node(top_node, name)
+      nodes = top_node.synonyms_hash[name]
+      nodes.each do |n|
+        synonym_name = n[:synonyms].select { |s| s[:canonical_name] == name }.first[:name]
+        n.merge!({ :name_to_match => synonym_name })
+      end
+    end
+  end
+end

data/lib/family-reunion/matcher_helper.rb ADDED Viewed

@@ -0,0 +1,22 @@
+class FamilyReunion
+  module MatcherHelper
+    private
+    def format_secondary_id_for_merge(secondary_ids, match_type)
+      secondary_ids.inject({}) do |res, i|
+        i = i.to_s
+        res[i] = {:match_type => match_type} unless res.has_key?(i)
+        res
+      end
+    end
+    def add_record_to_merges(primary_id, secondary_id_matches)
+      if @fr.merges.has_key?(primary_id)
+        secondary_id_matches.each do |key, val|
+          @fr.merges[primary_id][:matches][key] = val unless @fr.merges[primary_id][:matches].has_key?(key)
+        end
+      else
+        @fr.merges[primary_id] = {:matches => secondary_id_matches, :nonmatches => []}
+      end
+    end
+  end
+end

data/lib/family-reunion/nomatch_organizer.rb ADDED Viewed

@@ -0,0 +1,57 @@
+class FamilyReunion
+  class NomatchOrganizer
+    def initialize(family_reunion)
+      @fr = family_reunion
+      @nomatch_secondary_ids = nil
+    end
+    def merge
+      organize_nonmatches(get_nomach_secondary_ids)
+    end
+    def get_nomach_secondary_ids
+      return @nomatch_secondary_ids if @nomatch_secondary_ids
+      match_ids = @fr.merges.map { |key, val| val[:matches].keys }.flatten.uniq
+      empty_nodes_ids =  @fr.secondary_node.data[:empty_nodes].map { |node| node[:id].to_sym }
+      valid_names_ids = @fr.secondary_node.ids_hash.keys.map { |k| k }
+      @nomatch_secondary_ids = valid_names_ids - match_ids
+    end
+    def organize_nonmatches(nomatch_secondary_ids)
+      ids_hash = @fr.secondary_node.ids_hash
+      paths_hash = @fr.primary_node.paths_hash
+      nomatch_secondary_ids.each do |i|
+        node = ids_hash[i]
+        merge_node(node, paths_hash)
+      end
+    end
+    private
+    def merge_node(node, paths_hash)
+      path = node[:path].dup
+      last_name = path.pop.to_sym
+      return if paths_hash.has_key?(last_name)
+      found_node = false
+      until path.empty?
+        name = path.pop.to_sym
+        if paths_hash.has_key?(name)
+          found_node = true
+          add_merged_node(paths_hash[name][1][-1], node[:id])
+          break
+        end
+      end
+      add_merged_node(@fr.primary_node.root_id, node[:id]) unless found_node
+    end
+    def add_merged_node(primary_node_id, secondary_node_id)
+      if @fr.merges.has_key?(primary_node_id)
+        @fr.merges[primary_node_id][:nonmatches] << secondary_node_id
+      else
+        @fr.merges[primary_node_id] = {:matches => [], :nonmatches => [secondary_node_id]}
+      end
+    end
+  end
+end

data/lib/family-reunion/taxamatch_preprocessor.rb ADDED Viewed

@@ -0,0 +1,103 @@
+class FamilyReunion
+  class TaxamatchPreprocessor
+    def initialize(cache)
+      @cache = cache
+    end
+    def get_match_candidates(list1, list2)
+      match_candidates = {:uninomials => {}, :binomials => {}, :trinomials => {}}
+      partitioned_names1 = partition_canonicals(list1)
+      partitioned_names2 = partition_canonicals(list2)
+      [:uninomials, :binomials, :trinomials].each do |bucket|
+        candidates = self.send("process_#{bucket}", partitioned_names1[bucket], partitioned_names2[bucket])
+        match_candidates[bucket].merge!(candidates)
+      end
+      match_candidates
+    end
+    def partition_canonicals(canonicals)
+      partitions = { :uninomials => [], :binomials => [], :trinomials => [], :multinomials => [] }
+      canonicals.each do |name|
+        words = name.split(' ')
+        key = case words.size
+              when 1
+                :uninomials
+              when 2
+                :binomials
+              when 3
+                :trinomials
+              else
+                :multinomials
+              end
+        partitions[key] << [name, words]
+      end
+      partitions
+    end
+    def process_uninomials(names1, names2)
+      names1.inject({}) do |res, n1|
+        names2.each do |n2|
+          if similar_words?(n1[1][0], n2[1][0])
+            res.has_key?(n1[0]) ? res[n1[0]][:candidates] << n2 : res[n1[0]] = { :words => n1[1], :candidates => [n2] }
+          end
+        end
+        res
+      end
+    end
+    def process_binomials(names1, names2)
+      names1.inject({}) do |res, n1|
+        names2.each do |n2|
+          if similar_words?(n1[1][0], n2[1][0]) && similar_words?(n1[1][1], n2[1][1])
+            res.has_key?(n1[0]) ? res[n1[0]][:candidates] << n2 : res[n1[0]] = { :words => n1[1], :candidates => [n2] }
+          end
+        end
+        res
+      end
+    end
+    def process_trinomials(names1, names2)
+      names1.inject({}) do |res, n1|
+        names2.each do |n2|
+          if similar_words?(n1[1][0], n2[1][0]) && similar_words?(n1[1][1], n2[1][1]) && similar_words?(n1[1][2], n2[1][2])
+            res.has_key?(n1[0]) ? res[n1[0]][:candidates] << n2 : res[n1[0]] = { :words => n1[1], :candidates => [n2] }
+          end
+        end
+        res
+      end
+    end
+    def similar_words?(word1, word2)
+      raise RuntimeError unless (word1.is_a?(String) && word2.is_a?(String))
+      key = [word1, word2].sort.join(':')
+      cached = @cache.similar_words[key]
+      return cached if cached != nil
+      are_similar = false
+      if word1 == word2
+        are_similar = true
+      else
+        letters1 = get_letters(word1)
+        letters2 = get_letters(word2)
+        symmertric_difference = (letters1 - letters2) + (letters2 - letters1)
+        similar_letters = symmertric_difference.size.to_f/(letters1.size + letters2.size) <= 0.3
+        similar_length = (word1.size - word2.size).abs.to_f/word1.size <= 0.2
+        are_similar = similar_letters && similar_length
+      end
+      @cache.similar_words[key] = are_similar
+      are_similar
+    end
+    def get_letters(word)
+      letters = @cache.word_letters[word]
+      if letters == nil
+        letters = word.split('').uniq
+        @cache.word_letters[word] = letters
+      end
+      letters
+    end
+  end
+end

data/lib/family-reunion/taxamatch_wrapper.rb ADDED Viewed

@@ -0,0 +1,54 @@
+class FamilyReunion
+  class TaxamatchWrapper
+    def initialize
+      @tm = Taxamatch::Base.new
+      @cache = FamilyReunion::Cache.new
+      @tp = FamilyReunion::TaxamatchPreprocessor.new(@cache)
+    end
+    def match_canonicals_lists(list1, list2)
+      matches = {}
+      match_candidates = @tp.get_match_candidates(list1, list2)
+      [:uninomials, :binomials, :trinomials].each do |bucket|
+        match_candidates[bucket].each do |name1, possible_matches|
+          possible_matches[:candidates].each do |name2|
+            if self.send("#{bucket}_match?", name1, name2[0])
+              matches.has_key?(name1) ? matches[name1] << name2[0] : matches[name1] = [name2[0]]
+            end
+          end
+        end
+      end
+      matches
+    end
+    def match_nodes(nodes)
+      res = []
+      nodes.each do |primary_node, secondary_nodes|
+        secondary_nodes.each do |secondary_node|
+          if @tm.taxamatch(primary_node[:name_to_match], secondary_node[:name_to_match])
+            if res.last && res.last[0][:id] == primary_node[:id]
+              res.last[1] << secondary_node
+            else
+              res << [primary_node, [secondary_node]]
+            end
+          end
+        end
+      end
+      res
+    end
+    def uninomials_match?(name1, name2)
+      @tm.taxamatch(name1, name2)
+    end
+    def binomials_match?(name1, name2)
+      uninomials_match?(name1, name2)
+    end
+    def trinomials_match?(name1, name2)
+      uninomials_match?(name1, name2)
+    end
+  end
+end

data/lib/family-reunion/top_node.rb ADDED Viewed

@@ -0,0 +1,102 @@
+class FamilyReunion
+  class TopNode
+    attr :data
+    def initialize(data)
+      @data = data
+      @valid_names_hash = nil
+      @valid_names_duplicates = nil
+      @synonyms_hash = nil
+      @ids_hash = nil
+      @paths_hash = nil
+    end
+    def valid_names_hash
+      return @valid_names_hash if @valid_names_hash #TODO: make it more robust for situations with exceptions etc.
+      @valid_names_duplicates = {}
+      @valid_names_hash = {}
+      @paths_hash = {}
+      @data[:leaves].each do |row|
+        canonical = row[:valid_name][:canonical_name]
+        update_paths_hash(row)
+        if @valid_names_hash.has_key?(canonical)
+          if @valid_names_duplicates.has_key?(canonical)
+            @valid_names_duplicates[canonical] << row
+          else
+            @valid_names_duplicates[canonical] = [row]
+          end
+        else
+          @valid_names_hash[canonical] = row
+        end
+      end
+      @valid_names_duplicates.keys.each do |k|
+        @valid_names_duplicates[k] << @valid_names_hash.delete(k)
+      end
+      data[:empty_nodes].each do |row|
+        update_paths_hash(row)
+      end
+      @valid_names_hash
+    end
+    def paths_hash
+      unless @paths_hash
+        valid_names_hash
+      end
+      @paths_hash
+    end
+    def ids_hash
+      return @ids_hash if @ids_hash
+      @ids_hash = valid_names_hash.inject({}) do |res, key_val|
+        res[key_val[1][:id].to_sym] = key_val[1]
+        res
+      end
+      data[:empty_nodes].each do |node|
+        @ids_hash[node[:id].to_sym] = node
+      end
+      @ids_hash
+    end
+    def synonyms_hash
+      return @synonyms_hash if @synonyms_hash
+      @synonyms_hash = {}
+      @valid_names_hash.keys.each do |name|
+        synonyms = @valid_names_hash[name][:synonyms]
+        synonyms.each do |syn|
+          @synonyms_hash.has_key?(syn[:canonical_name]) ? @synonyms_hash[syn[:canonical_name]] << @valid_names_hash[name] : @synonyms_hash[syn[:canonical_name]] = [@valid_names_hash[name]]
+        end
+      end
+      @synonyms_hash
+    end
+    def valid_names_duplicates
+      valid_names_hash unless @valid_names_duplicates
+      @valid_names_duplicates
+    end
+    def update_paths_hash(node)
+      path = node[:path].map { |n| n.to_sym }
+      path_ids = node[:path_ids].map { |i| i.to_sym }
+      until path.empty?
+        populate_paths_hash(path, path_ids)
+        path.pop
+        path_ids.pop
+      end
+    end
+    def populate_paths_hash(path, path_ids)
+      name = path[-1]
+      unless @paths_hash[name]
+        @paths_hash[name] = [path.dup, path_ids.dup]
+      end
+    end
+    def root_id
+      unless @root_id
+        @root_id = data[:leaves][0][:path_ids][0].to_sym
+      end
+      @root_id
+    end
+  end
+end

data/scripts/dwca2fr.rb ADDED Viewed

@@ -0,0 +1,84 @@
+#!/usr/bin/env ruby
+require "dwc-archive"
+require 'json'
+class Node
+  attr_reader :classification
+  def logger
+    @logger || Logger.new($stdout)
+  end
+  def initialize(dwca_file)
+    @dwca = DarwinCore.new(dwca_file)
+    DarwinCore.logger = logger
+    logger.info("Creating classification tree")
+    @classification = DarwinCore::ClassificationNormalizer.new(@dwca)
+    @classification.normalize
+    @leaves = []
+    @empty_nodes = []
+  end
+  def leaves(node_id)
+    node = @classification.normalized_data[node_id]
+    path = node.classification_path_id
+    @node_path_size = path.size - 1
+    current_node = @classification.tree
+    until path.empty? do
+      current_node = current_node[path.shift]
+    end
+    walk_tree(current_node)
+    [@leaves, @empty_nodes]
+  end
+  private
+  def walk_tree(current_node)
+    current_node.keys.each do |key|
+      get_data(key, current_node[key].empty?)
+      walk_tree(current_node[key])
+    end
+  end
+  def get_data(node_id, node_is_empty)
+    node = @classification.normalized_data[node_id]
+    if is_species?(node.current_name_canonical)
+      add_node(@leaves, node)
+    elsif node_is_empty
+      add_node(@empty_nodes, node)
+    end
+  end
+  def add_node(res, node)
+    range = @node_path_size..node.classification_path.size
+    valid_name = {:name => node.current_name, :canonical_name => node.current_name_canonical, :type => :valid, :status => node.status}
+    synonyms = node.synonyms.inject([]) do |res, syn|
+      res << {:name => syn.name, :canonical_name => syn.canonical_name, :type => :synonym, :status => syn.status}
+    end
+    res << {:id => node.classification_path_id.last, :path => node.classification_path[range], :path_ids => node.classification_path_id[range], :rank => node.rank, :valid_name => valid_name, :synonyms => synonyms}
+  end
+  def is_species?(name_string)
+    name_string.split(/\s+/).size >= 2
+  end
+end
+if __FILE__ == $0
+  unless ARGV[1]
+    puts "script creates a json file with data compatible for family-reunion from a darwin core archive"
+    puts "Usage #{$0} path_to_dwca_file node_id [output_file]"
+    puts "output_file is optional"
+    exit
+  end
+  dwca_file = ARGV[0]
+  node_id = ARGV[1]
+  paths_file = ARGV[2] ? ARGV[2] : "node_leaves.json"
+  node = Node.new(dwca_file)
+  leaves, empty_nodes = node.leaves(node_id)
+  f = open(paths_file,'w')
+  f.write JSON.dump({:empty_nodes => empty_nodes, :leaves => leaves})
+end