family-reunion 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/.rvmrc +1 -0
  4. data/Gemfile +27 -0
  5. data/Gemfile.lock +82 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.rdoc +19 -0
  8. data/Rakefile +52 -0
  9. data/VERSION +1 -0
  10. data/family-reunion.gemspec +108 -0
  11. data/features/family-reunion.feature +9 -0
  12. data/features/step_definitions/family-reunion_steps.rb +0 -0
  13. data/features/support/env.rb +13 -0
  14. data/lib/family-reunion.rb +49 -0
  15. data/lib/family-reunion/cache.rb +13 -0
  16. data/lib/family-reunion/exact_matcher.rb +72 -0
  17. data/lib/family-reunion/fuzzy_matcher.rb +93 -0
  18. data/lib/family-reunion/matcher_helper.rb +22 -0
  19. data/lib/family-reunion/nomatch_organizer.rb +57 -0
  20. data/lib/family-reunion/taxamatch_preprocessor.rb +103 -0
  21. data/lib/family-reunion/taxamatch_wrapper.rb +54 -0
  22. data/lib/family-reunion/top_node.rb +102 -0
  23. data/scripts/dwca2fr.rb +84 -0
  24. data/spec/family-reunion_spec.rb +20 -0
  25. data/spec/fixtures/ants_primary.json +1 -0
  26. data/spec/fixtures/ants_secondary.json +1 -0
  27. data/spec/fixtures/matched_merges.json +1 -0
  28. data/spec/fixtures/nodes_to_match.json +1 -0
  29. data/spec/fixtures/synonyms_strings_primary.json +1 -0
  30. data/spec/fixtures/synonyms_strings_secondary.json +1 -0
  31. data/spec/fixtures/valid_names_strings_primary.json +1 -0
  32. data/spec/fixtures/valid_names_strings_secondary.json +1 -0
  33. data/spec/fuzzy_matcher_spec.rb +32 -0
  34. data/spec/node_spec.rb +26 -0
  35. data/spec/nomatch_organizer_spec.rb +23 -0
  36. data/spec/spec_helper.rb +29 -0
  37. data/spec/taxamatch_preprocessor_spec.rb +49 -0
  38. data/spec/taxamatch_wrapper_spec.rb +21 -0
  39. metadata +256 -0
@@ -0,0 +1,93 @@
1
class FamilyReunion
  # Finds approximate name matches between the primary and the secondary
  # node of a FamilyReunion instance and records them in its merges
  # (through MatcherHelper#add_record_to_merges).
  class FuzzyMatcher
    include MatcherHelper

    # family_reunion - object that provides the name sets
    # (primary_valid_names_set, secondary_synonyms_set, ...), primary_node,
    # secondary_node and the merges hash used by the methods below.
    def initialize(family_reunion)
      @fr = family_reunion
      @tw = FamilyReunion::TaxamatchWrapper.new
    end

    # Runs the four kinds of fuzzy matching and stores every match found.
    def merge
      add_matches(get_valid_matches, :fuzzy_valid_to_valid)
      add_matches(get_valid_to_synonym_matches, :fuzzy_valid_to_synonym)
      add_matches(get_synonym_to_valid_matches, :fuzzy_synonym_to_valid)
      add_matches(get_synonym_to_synonym_matches, :fuzzy_synonym_to_synonym)
    end

    # Each get_*_matches method removes the names that both sets share
    # before fuzzy matching what is left.
    # Returns an array of [primary_node, [secondary_node, ...]] pairs.
    def get_valid_matches
      primary_names = @fr.primary_valid_names_set - @fr.secondary_valid_names_set
      secondary_names = @fr.secondary_valid_names_set - @fr.primary_valid_names_set
      make_match(primary_names, secondary_names, :valid_name, :valid_name)
    end

    def get_valid_to_synonym_matches
      primary_names = @fr.primary_valid_names_set - @fr.secondary_synonyms_set
      secondary_names = @fr.secondary_synonyms_set - @fr.primary_valid_names_set
      make_match(primary_names, secondary_names, :valid_name, :synonym)
    end

    def get_synonym_to_valid_matches
      primary_names = @fr.primary_synonyms_set - @fr.secondary_valid_names_set
      secondary_names = @fr.secondary_valid_names_set - @fr.primary_synonyms_set
      make_match(primary_names, secondary_names, :synonym, :valid_name)
    end

    def get_synonym_to_synonym_matches
      primary_names = @fr.primary_synonyms_set - @fr.secondary_synonyms_set
      secondary_names = @fr.secondary_synonyms_set - @fr.primary_synonyms_set
      make_match(primary_names, secondary_names, :synonym, :synonym)
    end

    private

    # Records the ids of the matched secondary nodes under the id of their
    # primary node.
    def add_matches(matched_nodes, match_type)
      matched_nodes.each do |primary_node, secondary_nodes|
        secondary_ids = secondary_nodes.map { |n| n[:id] }
        secondary_id_matches = format_secondary_id_for_merge(secondary_ids, match_type)
        add_record_to_merges(primary_node[:id], secondary_id_matches)
      end
    end

    # Matches canonical names first, then confirms the candidates by
    # matching the :name_to_match strings of the corresponding nodes.
    def make_match(primary_names, secondary_names, primary_name_type, secondary_name_type)
      canonical_matches = @tw.match_canonicals_lists(primary_names, secondary_names)
      match_nodes_candidates = get_nodes_from_canonicals(canonical_matches, primary_name_type, secondary_name_type)
      @tw.match_nodes(match_nodes_candidates)
    end

    # Turns { primary_canonical => [secondary_canonical, ...] } into
    # [[primary_node, [secondary_node, ...]], ...]. The name types
    # (:valid_name or :synonym) select the lookup method for each side.
    def get_nodes_from_canonicals(canonical_matches, primary_name_type, secondary_name_type)
      res = []
      canonical_matches.each do |primary_name, secondary_names|
        primary_nodes = send("get_#{primary_name_type}_node", @fr.primary_node, primary_name)
        secondary_nodes = secondary_names.map do |secondary_name|
          send("get_#{secondary_name_type}_node", @fr.secondary_node, secondary_name)
        end
        append_nodes(res, primary_nodes, secondary_nodes)
      end
      res
    end

    def append_nodes(nodes, primary_nodes, secondary_nodes)
      secondary_nodes = secondary_nodes.flatten.uniq
      primary_nodes.each do |primary_node|
        nodes << [primary_node, secondary_nodes]
      end
    end

    # Returns a one-element array with a copy of the node found under the
    # canonical name, extended with :name_to_match, or an empty array when
    # the name is not in valid_names_hash.
    #
    # A copy is returned (merge, not merge!) so that the shared node data is
    # not modified and a :name_to_match set for one candidate cannot be
    # overwritten by a later lookup of the same node.
    def get_valid_name_node(top_node, name)
      node = top_node.valid_names_hash[name]
      return [] unless node
      [node.merge(:name_to_match => node[:valid_name][:name])]
    end

    # Returns copies of all nodes that have a synonym with the given
    # canonical name, each extended with the :name_to_match of that synonym.
    # Returns an empty array when the name is not in synonyms_hash.
    def get_synonym_node(top_node, name)
      nodes = top_node.synonyms_hash[name] || []
      nodes.map do |n|
        synonym = n[:synonyms].find { |s| s[:canonical_name] == name }
        n.merge(:name_to_match => synonym[:name])
      end
    end

  end
end
@@ -0,0 +1,22 @@
1
class FamilyReunion
  # Private helpers for matchers that keep their FamilyReunion instance
  # in @fr and record matches in @fr.merges.
  module MatcherHelper
    private

    # Builds { "id" => { :match_type => match_type } } from a list of ids.
    # Ids are converted to strings; a repeated id keeps its first entry.
    def format_secondary_id_for_merge(secondary_ids, match_type)
      formatted = {}
      secondary_ids.each do |secondary_id|
        id_key = secondary_id.to_s
        next if formatted.has_key?(id_key)
        formatted[id_key] = { :match_type => match_type }
      end
      formatted
    end

    # Stores the matches under primary_id. When the primary id already has
    # a record, only the secondary ids that are not there yet are added, so
    # an earlier match is never replaced by a later one.
    def add_record_to_merges(primary_id, secondary_id_matches)
      merges = @fr.merges
      unless merges.has_key?(primary_id)
        return merges[primary_id] = { :matches => secondary_id_matches, :nonmatches => [] }
      end
      recorded = merges[primary_id][:matches]
      secondary_id_matches.each do |id_key, match_info|
        recorded[id_key] = match_info unless recorded.has_key?(id_key)
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
class FamilyReunion
  # Places secondary nodes that did not match anything in the primary
  # classification: each one is recorded as a nonmatch of the closest
  # ancestor (by name) that exists in the primary paths, or of the primary
  # root when no ancestor is shared.
  class NomatchOrganizer

    # family_reunion - object that provides merges, primary_node and
    # secondary_node.
    def initialize(family_reunion)
      @fr = family_reunion
      @nomatch_secondary_ids = nil
    end

    def merge
      organize_nonmatches(get_nomach_secondary_ids)
    end

    # Returns (and memoizes) the ids of the secondary nodes that are not
    # recorded as a match in any merge.
    #
    # Match ids are normalized to symbols before the subtraction: the keys
    # of ids_hash are symbols while matches may be keyed by strings, and
    # Array#- would otherwise never remove anything.
    def get_nomach_secondary_ids
      return @nomatch_secondary_ids if @nomatch_secondary_ids
      match_ids = @fr.merges.values.map { |val| val[:matches].keys }.flatten
      match_ids = match_ids.map { |id| id.to_s.to_sym }.uniq
      @nomatch_secondary_ids = @fr.secondary_node.ids_hash.keys - match_ids
    end

    # Correctly spelled alias; the original name is kept for existing callers.
    alias_method :get_nomatch_secondary_ids, :get_nomach_secondary_ids

    def organize_nonmatches(nomatch_secondary_ids)
      ids_hash = @fr.secondary_node.ids_hash
      paths_hash = @fr.primary_node.paths_hash
      nomatch_secondary_ids.each do |i|
        merge_node(ids_hash[i], paths_hash)
      end
    end

    private

    # Does nothing when the node's own name (last path element) exists in
    # the primary paths. Otherwise walks the path from the closest ancestor
    # upwards and attaches the node to the first ancestor name found in
    # paths_hash, falling back to the primary root.
    def merge_node(node, paths_hash)
      ancestors = node[:path].map { |name| name.to_sym }
      own_name = ancestors.pop
      return if paths_hash.has_key?(own_name)
      shared_name = ancestors.reverse.find { |name| paths_hash.has_key?(name) }
      # paths_hash values are [path, path_ids]; the last path id belongs to
      # the node that carries shared_name
      primary_id = shared_name ? paths_hash[shared_name][1][-1] : @fr.primary_node.root_id
      add_merged_node(primary_id, node[:id])
    end

    # Appends the secondary id to the nonmatches of the primary node.
    # A new record gets an empty Hash for :matches, the same shape
    # get_nomach_secondary_ids reads with `keys`.
    def add_merged_node(primary_node_id, secondary_node_id)
      if @fr.merges.has_key?(primary_node_id)
        @fr.merges[primary_node_id][:nonmatches] << secondary_node_id
      else
        @fr.merges[primary_node_id] = { :matches => {}, :nonmatches => [secondary_node_id] }
      end
    end

  end
end
@@ -0,0 +1,103 @@
1
class FamilyReunion
  # Cheap pre-filter that pairs canonical names from two lists when all of
  # their words look similar, so that only those pairs need a full match.
  class TaxamatchPreprocessor
    BUCKETS = [:uninomials, :binomials, :trinomials].freeze
    # Largest allowed share of letters that occur in only one of the words.
    MAX_LETTERS_DIFFERENCE = 0.3
    # Largest allowed length difference relative to the longer word.
    MAX_LENGTH_DIFFERENCE = 0.2

    # cache - object with hash-like similar_words and word_letters stores.
    def initialize(cache)
      @cache = cache
    end

    # Returns { :uninomials => {...}, :binomials => {...}, :trinomials => {...} }
    # where every inner hash maps a name from list1 to
    # { :words => [...], :candidates => [[name2, words2], ...] }.
    # Names with more than three words are not compared.
    def get_match_candidates(list1, list2)
      match_candidates = {}
      partitioned_names1 = partition_canonicals(list1)
      partitioned_names2 = partition_canonicals(list2)
      BUCKETS.each do |bucket|
        match_candidates[bucket] = send("process_#{bucket}", partitioned_names1[bucket], partitioned_names2[bucket])
      end
      match_candidates
    end

    # Groups names by the number of space-separated words. Every entry is
    # a [name, words] pair.
    def partition_canonicals(canonicals)
      partitions = { :uninomials => [], :binomials => [], :trinomials => [], :multinomials => [] }
      canonicals.each do |name|
        words = name.split(' ')
        partitions[bucket_for(words.size)] << [name, words]
      end
      partitions
    end

    def process_uninomials(names1, names2)
      collect_candidates(names1, names2, 1)
    end

    def process_binomials(names1, names2)
      collect_candidates(names1, names2, 2)
    end

    def process_trinomials(names1, names2)
      collect_candidates(names1, names2, 3)
    end

    # True when the words are equal, or when they share most of their
    # letters and have a similar length. Results are cached per unordered
    # pair of words.
    #
    # The length difference is measured against the longer word, so the
    # answer does not depend on the order of the arguments; this keeps the
    # order-independent cache key consistent with the value stored under it.
    #
    # Raises RuntimeError when either argument is not a String.
    def similar_words?(word1, word2)
      unless word1.is_a?(String) && word2.is_a?(String)
        raise RuntimeError, "similar_words? expects two strings, got #{word1.inspect} and #{word2.inspect}"
      end

      key = [word1, word2].sort.join(':')
      cached = @cache.similar_words[key]
      # false is a valid cached answer, so only nil counts as a miss
      return cached unless cached.nil?

      @cache.similar_words[key] = (word1 == word2) || words_look_alike?(word1, word2)
    end

    # Returns the unique characters of the word, cached per word.
    def get_letters(word)
      @cache.word_letters[word] ||= word.split('').uniq
    end

    private

    def bucket_for(words_count)
      case words_count
      when 1 then :uninomials
      when 2 then :binomials
      when 3 then :trinomials
      else :multinomials
      end
    end

    # Pairs every [name, words] entry of names1 with the entries of names2
    # whose first words_count words are all similar to its own.
    def collect_candidates(names1, names2, words_count)
      names1.inject({}) do |res, (name1, words1)|
        names2.each do |candidate|
          words2 = candidate[1]
          next unless (0...words_count).all? { |i| similar_words?(words1[i], words2[i]) }
          res[name1] ||= { :words => words1, :candidates => [] }
          res[name1][:candidates] << candidate
        end
        res
      end
    end

    # Compares two different words by their sets of letters and by length.
    def words_look_alike?(word1, word2)
      letters1 = get_letters(word1)
      letters2 = get_letters(word2)
      symmetric_difference = (letters1 - letters2) + (letters2 - letters1)
      letters_ratio = symmetric_difference.size.to_f / (letters1.size + letters2.size)
      longest = [word1.size, word2.size].max
      length_ratio = (word1.size - word2.size).abs.to_f / longest
      letters_ratio <= MAX_LETTERS_DIFFERENCE && length_ratio <= MAX_LENGTH_DIFFERENCE
    end

  end
end
@@ -0,0 +1,54 @@
1
class FamilyReunion
  # Thin layer over Taxamatch::Base: pre-filters name pairs with
  # TaxamatchPreprocessor and confirms them with Taxamatch.
  class TaxamatchWrapper

    def initialize
      @tm = Taxamatch::Base.new
      @cache = FamilyReunion::Cache.new
      @tp = FamilyReunion::TaxamatchPreprocessor.new(@cache)
    end

    # Returns { name_from_list1 => [matching names from list2] } for the
    # candidate pairs produced by the preprocessor that Taxamatch confirms.
    def match_canonicals_lists(list1, list2)
      candidates_by_bucket = @tp.get_match_candidates(list1, list2)
      confirmed = {}
      [:uninomials, :binomials, :trinomials].each do |bucket|
        matcher = "#{bucket}_match?"
        candidates_by_bucket[bucket].each do |name1, possible_matches|
          possible_matches[:candidates].each do |candidate|
            name2 = candidate[0]
            next unless self.send(matcher, name1, name2)
            confirmed[name1] = [] unless confirmed.has_key?(name1)
            confirmed[name1] << name2
          end
        end
      end
      confirmed
    end

    # Takes [[primary_node, [secondary_node, ...]], ...] and keeps the
    # secondary nodes whose :name_to_match matches the one of their primary
    # node. Matches are appended to the previous result entry when it
    # belongs to a primary node with the same :id.
    def match_nodes(nodes)
      matched = []
      nodes.each do |primary_node, secondary_nodes|
        secondary_nodes.each do |secondary_node|
          next unless @tm.taxamatch(primary_node[:name_to_match], secondary_node[:name_to_match])
          previous = matched.last
          if previous && previous[0][:id] == primary_node[:id]
            previous[1] << secondary_node
          else
            matched << [primary_node, [secondary_node]]
          end
        end
      end
      matched
    end

    # All three buckets are compared the same way at the moment.
    def uninomials_match?(name1, name2)
      @tm.taxamatch(name1, name2)
    end

    def binomials_match?(name1, name2)
      uninomials_match?(name1, name2)
    end

    def trinomials_match?(name1, name2)
      uninomials_match?(name1, name2)
    end

  end
end
@@ -0,0 +1,102 @@
1
class FamilyReunion
  # Wraps the data of one classification ({ :leaves => [...],
  # :empty_nodes => [...] }) and builds lookup hashes from it on demand.
  class TopNode
    attr_reader :data

    def initialize(data)
      @data = data
      @valid_names_hash = nil
      @valid_names_duplicates = nil
      @synonyms_hash = nil
      @ids_hash = nil
      @paths_hash = nil
      @root_id = nil
    end

    # Returns { canonical_name => leaf } for the leaves whose canonical
    # valid name is unique. Leaves that share a canonical name are moved to
    # valid_names_duplicates. Building this hash also fills paths_hash.
    #
    # When building fails, the partially filled caches are dropped before
    # the error is re-raised, so that a later call starts from scratch
    # instead of returning incomplete data.
    def valid_names_hash
      return @valid_names_hash if @valid_names_hash
      @valid_names_duplicates = {}
      @valid_names_hash = {}
      @paths_hash = {}
      @data[:leaves].each do |row|
        canonical = row[:valid_name][:canonical_name]
        update_paths_hash(row)
        if @valid_names_hash.has_key?(canonical)
          (@valid_names_duplicates[canonical] ||= []) << row
        else
          @valid_names_hash[canonical] = row
        end
      end
      # the first leaf seen for a duplicated name is still in the main hash
      @valid_names_duplicates.each do |canonical, rows|
        rows << @valid_names_hash.delete(canonical)
      end
      data[:empty_nodes].each do |row|
        update_paths_hash(row)
      end
      @valid_names_hash
    rescue StandardError
      @valid_names_hash = @valid_names_duplicates = @paths_hash = nil
      raise
    end

    # Returns { name_symbol => [path, path_ids] } for every name that
    # occurs in the path of a leaf or of an empty node.
    def paths_hash
      valid_names_hash unless @paths_hash
      @paths_hash
    end

    # Returns { id_symbol => node } for the leaves of valid_names_hash
    # (leaves with duplicated names are therefore not included) and for the
    # empty nodes.
    def ids_hash
      return @ids_hash if @ids_hash
      @ids_hash = valid_names_hash.inject({}) do |res, key_val|
        res[key_val[1][:id].to_sym] = key_val[1]
        res
      end
      data[:empty_nodes].each do |node|
        @ids_hash[node[:id].to_sym] = node
      end
      @ids_hash
    end

    # Returns { synonym_canonical_name => [leaf, ...] } for the leaves of
    # valid_names_hash. The hash is requested through the accessor so that
    # this method also works when valid_names_hash was not called before.
    def synonyms_hash
      return @synonyms_hash if @synonyms_hash
      synonyms = {}
      valid_names_hash.each_value do |leaf|
        (leaf[:synonyms] || []).each do |syn|
          (synonyms[syn[:canonical_name]] ||= []) << leaf
        end
      end
      @synonyms_hash = synonyms
    end

    # Returns { canonical_name => [leaf, ...] } for canonical valid names
    # shared by more than one leaf.
    def valid_names_duplicates
      valid_names_hash unless @valid_names_duplicates
      @valid_names_duplicates
    end

    # Registers the path of the node and every prefix of that path.
    def update_paths_hash(node)
      path = node[:path].map { |n| n.to_sym }
      path_ids = node[:path_ids].map { |i| i.to_sym }
      until path.empty?
        populate_paths_hash(path, path_ids)
        path.pop
        path_ids.pop
      end
    end

    # Stores the path under its last name unless the name is already known.
    def populate_paths_hash(path, path_ids)
      name = path[-1]
      @paths_hash[name] ||= [path.dup, path_ids.dup]
    end

    # Returns the first path id of the first leaf (or of the first empty
    # node when there are no leaves) as a symbol; nil when the data has no
    # nodes at all.
    def root_id
      unless @root_id
        first_node = data[:leaves].first || data[:empty_nodes].first
        @root_id = first_node[:path_ids].first.to_sym if first_node
      end
      @root_id
    end

  end
end
@@ -0,0 +1,84 @@
1
+ #!/usr/bin/env ruby
2
require "dwc-archive"
require 'json'
require 'logger'
4
+
5
# Reads a Darwin Core Archive and collects, for a given node of its
# classification, the species below it (leaves) and the non-species nodes
# without children (empty nodes).
class Node
  attr_reader :classification

  # Returns the logger, creating it on first use so that every call
  # returns the same instance.
  def logger
    @logger ||= Logger.new($stdout)
  end

  def initialize(dwca_file)
    @dwca = DarwinCore.new(dwca_file)
    DarwinCore.logger = logger
    logger.info("Creating classification tree")
    @classification = DarwinCore::ClassificationNormalizer.new(@dwca)
    @classification.normalize
    @leaves = []
    @empty_nodes = []
  end

  # Returns [leaves, empty_nodes] found below the node with the given id.
  # Paths of the returned nodes start at the given node.
  #
  # Results are collected from scratch on every call, and the
  # classification path of the node is only read, never modified.
  #
  # Raises ArgumentError when the id is not in the normalized data.
  def leaves(node_id)
    node = @classification.normalized_data[node_id]
    raise ArgumentError, "Node '#{node_id}' is not found in the classification" unless node
    path = node.classification_path_id
    @node_path_size = path.size - 1
    @leaves = []
    @empty_nodes = []
    # descend the tree along the ids of the classification path
    subtree = path.inject(@classification.tree) { |current_node, id| current_node[id] }
    walk_tree(subtree)
    [@leaves, @empty_nodes]
  end

  private

  # Visits every descendant of current_node depth-first.
  def walk_tree(current_node)
    current_node.each do |key, children|
      get_data(key, children.empty?)
      walk_tree(children)
    end
  end

  # Species are always collected as leaves; other nodes are collected only
  # when they have no children.
  def get_data(node_id, node_is_empty)
    node = @classification.normalized_data[node_id]
    if is_species?(node.current_name_canonical)
      add_node(@leaves, node)
    elsif node_is_empty
      add_node(@empty_nodes, node)
    end
  end

  # Appends a hash describing the node to res. Path and path ids are cut
  # so that they start at the node that was passed to #leaves.
  def add_node(res, node)
    range = @node_path_size..node.classification_path.size
    valid_name = {:name => node.current_name, :canonical_name => node.current_name_canonical, :type => :valid, :status => node.status}
    synonyms = node.synonyms.map do |syn|
      {:name => syn.name, :canonical_name => syn.canonical_name, :type => :synonym, :status => syn.status}
    end
    res << {:id => node.classification_path_id.last, :path => node.classification_path[range], :path_ids => node.classification_path_id[range], :rank => node.rank, :valid_name => valid_name, :synonyms => synonyms}
  end

  # A canonical name with two or more words is treated as a species.
  def is_species?(name_string)
    name_string.split(/\s+/).size >= 2
  end

end
66
+
67
# Command line entry point: dumps the leaves and empty nodes found under
# node_id of the given Darwin Core Archive into a JSON file.
if __FILE__ == $0

  unless ARGV[1]
    puts "script creates a json file with data compatible for family-reunion from a darwin core archive"
    puts "Usage #{$0} path_to_dwca_file node_id [output_file]"
    puts "output_file is optional"
    exit
  end

  dwca_file = ARGV[0]
  node_id = ARGV[1]
  paths_file = ARGV[2] || "node_leaves.json"

  node = Node.new(dwca_file)
  leaves, empty_nodes = node.leaves(node_id)
  # the block form closes the file even when writing fails
  File.open(paths_file, 'w') do |f|
    f.write JSON.dump({:empty_nodes => empty_nodes, :leaves => leaves})
  end
end