family-reunion 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.rvmrc +1 -0
- data/Gemfile +27 -0
- data/Gemfile.lock +82 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/family-reunion.gemspec +108 -0
- data/features/family-reunion.feature +9 -0
- data/features/step_definitions/family-reunion_steps.rb +0 -0
- data/features/support/env.rb +13 -0
- data/lib/family-reunion.rb +49 -0
- data/lib/family-reunion/cache.rb +13 -0
- data/lib/family-reunion/exact_matcher.rb +72 -0
- data/lib/family-reunion/fuzzy_matcher.rb +93 -0
- data/lib/family-reunion/matcher_helper.rb +22 -0
- data/lib/family-reunion/nomatch_organizer.rb +57 -0
- data/lib/family-reunion/taxamatch_preprocessor.rb +103 -0
- data/lib/family-reunion/taxamatch_wrapper.rb +54 -0
- data/lib/family-reunion/top_node.rb +102 -0
- data/scripts/dwca2fr.rb +84 -0
- data/spec/family-reunion_spec.rb +20 -0
- data/spec/fixtures/ants_primary.json +1 -0
- data/spec/fixtures/ants_secondary.json +1 -0
- data/spec/fixtures/matched_merges.json +1 -0
- data/spec/fixtures/nodes_to_match.json +1 -0
- data/spec/fixtures/synonyms_strings_primary.json +1 -0
- data/spec/fixtures/synonyms_strings_secondary.json +1 -0
- data/spec/fixtures/valid_names_strings_primary.json +1 -0
- data/spec/fixtures/valid_names_strings_secondary.json +1 -0
- data/spec/fuzzy_matcher_spec.rb +32 -0
- data/spec/node_spec.rb +26 -0
- data/spec/nomatch_organizer_spec.rb +23 -0
- data/spec/spec_helper.rb +29 -0
- data/spec/taxamatch_preprocessor_spec.rb +49 -0
- data/spec/taxamatch_wrapper_spec.rb +21 -0
- metadata +256 -0
@@ -0,0 +1,93 @@
|
|
1
|
+
class FamilyReunion
  # Looks for approximate (fuzzy) name matches between the primary and the
  # secondary classification of a FamilyReunion object and records them in
  # its merges through the MatcherHelper mixin.
  class FuzzyMatcher
    include MatcherHelper

    # family_reunion - object exposing the primary/secondary name sets,
    #                  the primary/secondary top nodes and the merges hash.
    def initialize(family_reunion)
      @fr = family_reunion
      @tw = FamilyReunion::TaxamatchWrapper.new
    end

    # Runs the four fuzzy passes in order and records every match found,
    # tagging each record with the pass that produced it.
    def merge
      add_matches(get_valid_matches, :fuzzy_valid_to_valid)
      add_matches(get_valid_to_synonym_matches, :fuzzy_valid_to_synonym)
      add_matches(get_synonym_to_valid_matches, :fuzzy_synonym_to_valid)
      add_matches(get_synonym_to_synonym_matches, :fuzzy_synonym_to_synonym)
    end

    # Primary valid names against secondary valid names. Names present in
    # both sets are removed from each side before matching.
    def get_valid_matches
      primary = @fr.primary_valid_names_set
      secondary = @fr.secondary_valid_names_set
      make_match(primary - secondary, secondary - primary, :valid_name, :valid_name)
    end

    # Primary valid names against secondary synonyms.
    def get_valid_to_synonym_matches
      primary = @fr.primary_valid_names_set
      secondary = @fr.secondary_synonyms_set
      make_match(primary - secondary, secondary - primary, :valid_name, :synonym)
    end

    # Primary synonyms against secondary valid names.
    def get_synonym_to_valid_matches
      primary = @fr.primary_synonyms_set
      secondary = @fr.secondary_valid_names_set
      make_match(primary - secondary, secondary - primary, :synonym, :valid_name)
    end

    # Primary synonyms against secondary synonyms.
    def get_synonym_to_synonym_matches
      primary = @fr.primary_synonyms_set
      secondary = @fr.secondary_synonyms_set
      make_match(primary - secondary, secondary - primary, :synonym, :synonym)
    end

    private

    # Stores each [primary_node, secondary_nodes] pair in the merges,
    # labelling every secondary id with match_type.
    def add_matches(matched_nodes, match_type)
      matched_nodes.each do |primary_node, secondary_nodes|
        ids = secondary_nodes.map { |node| node[:id] }
        formatted = format_secondary_id_for_merge(ids, match_type)
        add_record_to_merges(primary_node[:id], formatted)
      end
    end

    # Matches canonical forms first, resolves the surviving canonicals to
    # nodes, then lets the wrapper compare the nodes' :name_to_match values.
    def make_match(primary_names, secondary_names, primary_name_type, secondary_name_type)
      canonicals = @tw.match_canonicals_lists(primary_names, secondary_names)
      candidates = get_nodes_from_canonicals(canonicals, primary_name_type, secondary_name_type)
      @tw.match_nodes(candidates)
    end

    # Turns { primary_canonical => [secondary_canonical, ...] } into a list
    # of [primary_node, secondary_nodes] pairs. The name types select the
    # lookup method: get_valid_name_node or get_synonym_node.
    def get_nodes_from_canonicals(canonical_matches, primary_name_type, secondary_name_type)
      primary_lookup = "get_#{primary_name_type}_node"
      secondary_lookup = "get_#{secondary_name_type}_node"
      canonical_matches.inject([]) do |collected, (primary_name, secondary_names)|
        primary_nodes = send(primary_lookup, @fr.primary_node, primary_name)
        secondary_nodes = secondary_names.map do |secondary_name|
          send(secondary_lookup, @fr.secondary_node, secondary_name)
        end
        append_nodes(collected, primary_nodes, secondary_nodes)
        collected
      end
    end

    # Appends one [primary_node, secondary_nodes] pair per primary node;
    # the secondary lookups are flattened and de-duplicated first.
    def append_nodes(nodes, primary_nodes, secondary_nodes)
      candidates = secondary_nodes.flatten.uniq
      primary_nodes.each { |primary_node| nodes << [primary_node, candidates] }
    end

    # Returns a one-element array holding the node whose valid canonical
    # name is `name`. The node hash itself is updated with :name_to_match
    # set to its full valid name.
    def get_valid_name_node(top_node, name)
      node = top_node.valid_names_hash[name]
      node[:name_to_match] = node[:valid_name][:name]
      [node]
    end

    # Returns the nodes that list `name` as a synonym canonical form. Each
    # node hash is updated with :name_to_match set to the full name of its
    # first synonym having that canonical form.
    def get_synonym_node(top_node, name)
      top_node.synonyms_hash[name].each do |node|
        synonym = node[:synonyms].find { |candidate| candidate[:canonical_name] == name }
        node[:name_to_match] = synonym[:name]
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
class FamilyReunion
  # Mixin for matcher classes. The including class must keep the
  # FamilyReunion object in @fr; its merges hash is what gets updated.
  module MatcherHelper
    private

    # Builds { "id" => { :match_type => match_type } } from a list of ids.
    # Ids are converted to strings; repeated ids keep their first entry.
    def format_secondary_id_for_merge(secondary_ids, match_type)
      formatted = {}
      secondary_ids.each do |id|
        key = id.to_s
        formatted[key] ||= { :match_type => match_type }
      end
      formatted
    end

    # Registers matches for primary_id. A new record is created when the
    # primary id is unknown; otherwise only secondary ids that are not yet
    # present are added, so earlier match types are never overwritten.
    def add_record_to_merges(primary_id, secondary_id_matches)
      merges = @fr.merges
      unless merges.has_key?(primary_id)
        return merges[primary_id] = { :matches => secondary_id_matches, :nonmatches => [] }
      end

      known = merges[primary_id][:matches]
      secondary_id_matches.each do |secondary_id, details|
        known[secondary_id] = details unless known.has_key?(secondary_id)
      end
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
class FamilyReunion
  # Places secondary nodes that were not matched to any primary node into
  # the :nonmatches list of a primary node, chosen by walking up the
  # secondary node's path until a name known to the primary tree is found.
  class NomatchOrganizer

    # family_reunion - object exposing merges, primary_node and
    #                  secondary_node.
    def initialize(family_reunion)
      @fr = family_reunion
      @nomatch_secondary_ids = nil
    end

    # Computes the unmatched secondary ids and files each of them under a
    # primary node in the merges.
    def merge
      organize_nonmatches(get_nomach_secondary_ids)
    end

    # Returns (and memoizes) the keys of the secondary ids_hash that do not
    # appear among the matches recorded in the merges.
    #
    # Ids are compared by their string form: matchers store match ids as
    # String keys while ids_hash is keyed by Symbols, so a plain Array
    # difference between the two would never remove anything.
    def get_nomach_secondary_ids
      return @nomatch_secondary_ids if @nomatch_secondary_ids

      matched = {}
      @fr.merges.each_value do |record|
        matches = record[:matches]
        # Records created by add_merged_node hold an Array here, records
        # created by the matchers hold a Hash keyed by secondary id.
        ids = matches.respond_to?(:keys) ? matches.keys : matches
        ids.each { |id| matched[id.to_s] = true }
      end

      secondary_ids = @fr.secondary_node.ids_hash.keys
      @nomatch_secondary_ids = secondary_ids.reject { |id| matched.has_key?(id.to_s) }
    end

    # Correctly spelled alias; the original name is kept for existing callers.
    alias_method :get_nomatch_secondary_ids, :get_nomach_secondary_ids

    # Files every given secondary id (a key of the secondary ids_hash)
    # under a primary node.
    def organize_nonmatches(nomatch_secondary_ids)
      ids_hash = @fr.secondary_node.ids_hash
      paths_hash = @fr.primary_node.paths_hash
      nomatch_secondary_ids.each do |id|
        merge_node(ids_hash[id], paths_hash)
      end
    end

    private

    # Skips the node when its own name (last path element) is already a key
    # of paths_hash. Otherwise attaches it to the closest ancestor name
    # found in paths_hash, or to the primary root when none is found.
    def merge_node(node, paths_hash)
      ancestors = node[:path].dup
      own_name = ancestors.pop.to_sym
      return if paths_hash.has_key?(own_name)

      until ancestors.empty?
        name = ancestors.pop.to_sym
        next unless paths_hash.has_key?(name)
        # paths_hash[name] is [path, path_ids]; the last path id is the id
        # of the primary node carrying that name.
        return add_merged_node(paths_hash[name][1][-1], node[:id])
      end

      add_merged_node(@fr.primary_node.root_id, node[:id])
    end

    # Appends secondary_node_id to the :nonmatches of primary_node_id,
    # creating the merge record when needed.
    def add_merged_node(primary_node_id, secondary_node_id)
      if @fr.merges.has_key?(primary_node_id)
        @fr.merges[primary_node_id][:nonmatches] << secondary_node_id
      else
        # NOTE(review): :matches is an Array here but a Hash in records
        # built by MatcherHelper; kept as is to preserve the output shape.
        @fr.merges[primary_node_id] = {:matches => [], :nonmatches => [secondary_node_id]}
      end
    end

  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
class FamilyReunion
  # Cheap pre-filter used before running taxamatch: pairs canonical names
  # from two lists whose corresponding words look alike (similar letter
  # sets and similar lengths), so only those pairs are compared in full.
  class TaxamatchPreprocessor
    # Buckets for which candidates are produced.
    BUCKETS = [:uninomials, :binomials, :trinomials].freeze
    # Bucket chosen by the number of words in a canonical name; any other
    # word count goes to :multinomials.
    BUCKET_BY_WORD_COUNT = { 1 => :uninomials, 2 => :binomials, 3 => :trinomials }.freeze

    # cache - object exposing two hash-like stores: similar_words and
    #         word_letters.
    def initialize(cache)
      @cache = cache
    end

    # Returns { :uninomials => {...}, :binomials => {...}, :trinomials => {...} }
    # where each inner hash maps a name from list1 to
    # { :words => [...], :candidates => [[name2, words2], ...] }.
    # Names of four or more words are not considered.
    def get_match_candidates(list1, list2)
      names1 = partition_canonicals(list1)
      names2 = partition_canonicals(list2)
      BUCKETS.inject({}) do |res, bucket|
        res[bucket] = send("process_#{bucket}", names1[bucket], names2[bucket])
        res
      end
    end

    # Splits every name on spaces and groups [name, words] pairs by word
    # count into :uninomials, :binomials, :trinomials and :multinomials.
    def partition_canonicals(canonicals)
      partitions = { :uninomials => [], :binomials => [], :trinomials => [], :multinomials => [] }
      canonicals.each do |name|
        words = name.split(' ')
        partitions[BUCKET_BY_WORD_COUNT.fetch(words.size, :multinomials)] << [name, words]
      end
      partitions
    end

    # Candidates for one-word names; see collect_candidates.
    def process_uninomials(names1, names2)
      collect_candidates(names1, names2, 1)
    end

    # Candidates for two-word names; see collect_candidates.
    def process_binomials(names1, names2)
      collect_candidates(names1, names2, 2)
    end

    # Candidates for three-word names; see collect_candidates.
    def process_trinomials(names1, names2)
      collect_candidates(names1, names2, 3)
    end

    # Returns true when the words are equal, or when both of these hold:
    # the symmetric difference of their distinct letters is at most 30% of
    # the summed distinct-letter counts, and their lengths differ by at
    # most 20% of the longer word.
    #
    # The length ratio uses the longer word so the answer does not depend
    # on argument order; results are cached under an order-independent key,
    # so an order-dependent formula would make the cached value depend on
    # which order happened to be seen first.
    #
    # Raises RuntimeError unless both arguments are Strings.
    def similar_words?(word1, word2)
      unless word1.is_a?(String) && word2.is_a?(String)
        raise RuntimeError, "similar_words? expects two strings"
      end

      key = [word1, word2].sort.join(':')
      cached = @cache.similar_words[key]
      return cached unless cached.nil?

      are_similar = word1 == word2 ||
        (similar_letters?(word1, word2) && similar_length?(word1, word2))
      @cache.similar_words[key] = are_similar
    end

    # Returns the distinct characters of word, memoized in the cache.
    def get_letters(word)
      letters = @cache.word_letters[word]
      if letters.nil?
        letters = word.split('').uniq
        @cache.word_letters[word] = letters
      end
      letters
    end

    private

    # Shared implementation of the process_* methods: for each
    # [name, words] pair of names1 gathers the pairs of names2 whose first
    # word_count words are all similar to the corresponding words.
    def collect_candidates(names1, names2, word_count)
      names1.inject({}) do |res, (name1, words1)|
        names2.each do |candidate|
          words2 = candidate[1]
          next unless (0...word_count).all? { |i| similar_words?(words1[i], words2[i]) }
          (res[name1] ||= { :words => words1, :candidates => [] })[:candidates] << candidate
        end
        res
      end
    end

    # Letter-set part of the similarity test.
    def similar_letters?(word1, word2)
      letters1 = get_letters(word1)
      letters2 = get_letters(word2)
      symmetric_difference = (letters1 - letters2) + (letters2 - letters1)
      symmetric_difference.size.to_f / (letters1.size + letters2.size) <= 0.3
    end

    # Length part of the similarity test, relative to the longer word.
    # Only reached for unequal words, so the longer word is never empty.
    def similar_length?(word1, word2)
      longest = [word1.size, word2.size].max
      (word1.size - word2.size).abs.to_f / longest <= 0.2
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
class FamilyReunion
  # Thin layer over Taxamatch::Base: pre-filters name pairs with
  # TaxamatchPreprocessor and confirms the survivors with taxamatch.
  class TaxamatchWrapper

    def initialize
      @tm = Taxamatch::Base.new
      @cache = FamilyReunion::Cache.new
      @tp = FamilyReunion::TaxamatchPreprocessor.new(@cache)
    end

    # Returns { name1 => [name2, ...] } for every pre-filtered candidate
    # pair that the bucket's *_match? method accepts. Buckets are handled
    # in the order uninomials, binomials, trinomials.
    def match_canonicals_lists(list1, list2)
      candidates_by_bucket = @tp.get_match_candidates(list1, list2)
      [:uninomials, :binomials, :trinomials].inject({}) do |matches, bucket|
        matcher = "#{bucket}_match?"
        candidates_by_bucket[bucket].each do |name1, possible_matches|
          possible_matches[:candidates].each do |candidate|
            name2 = candidate[0]
            next unless send(matcher, name1, name2)
            (matches[name1] ||= []) << name2
          end
        end
        matches
      end
    end

    # Takes [primary_node, secondary_nodes] pairs and keeps the secondary
    # nodes whose :name_to_match taxamatches the primary's. A match is
    # added to the previous result entry when that entry has the same
    # primary :id; otherwise a new entry is started.
    def match_nodes(nodes)
      nodes.inject([]) do |res, (primary_node, secondary_nodes)|
        secondary_nodes.each do |secondary_node|
          next unless @tm.taxamatch(primary_node[:name_to_match], secondary_node[:name_to_match])
          previous = res.last
          if previous && previous[0][:id] == primary_node[:id]
            previous[1] << secondary_node
          else
            res << [primary_node, [secondary_node]]
          end
        end
        res
      end
    end

    # Result of taxamatch for the two names.
    def uninomials_match?(name1, name2)
      @tm.taxamatch(name1, name2)
    end

    # Same check as uninomials_match?.
    def binomials_match?(name1, name2)
      uninomials_match?(name1, name2)
    end

    # Same check as uninomials_match?.
    def trinomials_match?(name1, name2)
      uninomials_match?(name1, name2)
    end

  end
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
class FamilyReunion
  # Wraps the data of one classification ({ :leaves => [...],
  # :empty_nodes => [...] }) and builds lookup indexes over it on demand.
  class TopNode
    attr_reader :data

    # data - hash with :leaves and :empty_nodes arrays of node hashes.
    def initialize(data)
      @data = data
      @valid_names_hash = nil
      @valid_names_duplicates = nil
      @synonyms_hash = nil
      @ids_hash = nil
      @paths_hash = nil
      @root_id = nil
    end

    # Returns { canonical_name => leaf } for leaves whose valid canonical
    # name is unique. Leaves sharing a canonical name are moved to
    # valid_names_duplicates instead. Building this index also builds
    # paths_hash. The result is memoized; if building fails, the partial
    # indexes are discarded so that a later call starts from scratch.
    def valid_names_hash
      return @valid_names_hash if @valid_names_hash
      @valid_names_duplicates = {}
      @valid_names_hash = {}
      @paths_hash = {}
      @data[:leaves].each do |row|
        canonical = row[:valid_name][:canonical_name]
        update_paths_hash(row)
        if @valid_names_hash.has_key?(canonical)
          (@valid_names_duplicates[canonical] ||= []) << row
        else
          @valid_names_hash[canonical] = row
        end
      end
      # The first leaf seen for a duplicated name is still in the main
      # index; move it next to the other duplicates.
      @valid_names_duplicates.keys.each do |k|
        @valid_names_duplicates[k] << @valid_names_hash.delete(k)
      end
      data[:empty_nodes].each do |row|
        update_paths_hash(row)
      end
      @valid_names_hash
    rescue StandardError
      # Never leave half-built indexes memoized.
      @valid_names_hash = nil
      @valid_names_duplicates = nil
      @paths_hash = nil
      raise
    end

    # Returns { name_symbol => [path, path_ids] } for every name that
    # occurs in the path of a leaf or an empty node.
    def paths_hash
      valid_names_hash unless @paths_hash
      @paths_hash
    end

    # Returns { id_symbol => node } covering the leaves kept in
    # valid_names_hash plus all empty nodes. Memoized.
    def ids_hash
      return @ids_hash if @ids_hash
      res = {}
      valid_names_hash.each_value do |row|
        res[row[:id].to_sym] = row
      end
      data[:empty_nodes].each do |node|
        res[node[:id].to_sym] = node
      end
      @ids_hash = res
    end

    # Returns { synonym_canonical_name => [leaf, ...] } for the leaves kept
    # in valid_names_hash. Memoized. Goes through the valid_names_hash
    # accessor, so it works regardless of which index is requested first.
    def synonyms_hash
      return @synonyms_hash if @synonyms_hash
      res = {}
      valid_names_hash.each_value do |row|
        row[:synonyms].each do |syn|
          (res[syn[:canonical_name]] ||= []) << row
        end
      end
      @synonyms_hash = res
    end

    # Returns { canonical_name => [leaf, ...] } for canonical names shared
    # by more than one leaf.
    def valid_names_duplicates
      valid_names_hash unless @valid_names_duplicates
      @valid_names_duplicates
    end

    # Registers the node's path and each of its leading sub-paths in
    # @paths_hash. Expects @paths_hash to be initialised already, which
    # valid_names_hash does.
    def update_paths_hash(node)
      path = node[:path].map { |n| n.to_sym }
      path_ids = node[:path_ids].map { |i| i.to_sym }
      until path.empty?
        populate_paths_hash(path, path_ids)
        path.pop
        path_ids.pop
      end
    end

    # Stores copies of path and path_ids under the last name of the path,
    # unless that name is already registered (first occurrence wins).
    def populate_paths_hash(path, path_ids)
      name = path[-1]
      unless @paths_hash[name]
        @paths_hash[name] = [path.dup, path_ids.dup]
      end
    end

    # Returns, as a Symbol, the first path id of the first leaf. Memoized.
    def root_id
      @root_id ||= data[:leaves][0][:path_ids][0].to_sym
    end

  end
end
|
data/scripts/dwca2fr.rb
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "dwc-archive"
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Reads a Darwin Core Archive, normalizes its classification and collects
# the nodes found below a chosen node, split into species-level "leaves"
# and childless non-species "empty nodes".
class Node
  attr_reader :classification

  # Logger shared with DarwinCore; created once on first use.
  def logger
    @logger ||= Logger.new($stdout)
  end

  # dwca_file - path to the Darwin Core Archive file.
  def initialize(dwca_file)
    @dwca = DarwinCore.new(dwca_file)
    DarwinCore.logger = logger
    logger.info("Creating classification tree")
    @classification = DarwinCore::ClassificationNormalizer.new(@dwca)
    @classification.normalize
    @leaves = []
    @empty_nodes = []
  end

  # Walks the subtree below node_id and returns [leaves, empty_nodes].
  # Paths of the collected nodes start at the node identified by node_id.
  # Both arrays are instance-level accumulators, so repeated calls keep
  # appending to them.
  #
  # Raises ArgumentError when node_id is not in the normalized data.
  def leaves(node_id)
    node = @classification.normalized_data[node_id]
    raise ArgumentError, "Unknown node id: #{node_id.inspect}" unless node

    path = node.classification_path_id
    @node_path_size = path.size - 1
    # Descend the tree along the path without modifying the path array,
    # which belongs to the normalized data.
    subtree = path.inject(@classification.tree) { |tree, id| tree[id] }
    walk_tree(subtree)
    [@leaves, @empty_nodes]
  end

  private

  # Depth-first traversal of the nested { id => children } tree.
  def walk_tree(current_node)
    current_node.each do |id, children|
      get_data(id, children.empty?)
      walk_tree(children)
    end
  end

  # Files the node under leaves when its canonical name has two or more
  # words, otherwise under empty nodes when it has no children.
  def get_data(node_id, node_is_empty)
    node = @classification.normalized_data[node_id]
    if is_species?(node.current_name_canonical)
      add_node(@leaves, node)
    elsif node_is_empty
      add_node(@empty_nodes, node)
    end
  end

  # Appends a hash describing node to res. :path and :path_ids are cut so
  # that they start at the node passed to #leaves.
  def add_node(res, node)
    range = @node_path_size..node.classification_path.size
    valid_name = {:name => node.current_name, :canonical_name => node.current_name_canonical, :type => :valid, :status => node.status}
    synonyms = node.synonyms.map do |syn|
      {:name => syn.name, :canonical_name => syn.canonical_name, :type => :synonym, :status => syn.status}
    end
    res << {:id => node.classification_path_id.last, :path => node.classification_path[range], :path_ids => node.classification_path_id[range], :rank => node.rank, :valid_name => valid_name, :synonyms => synonyms}
  end

  # True when the name consists of at least two whitespace-separated words.
  def is_species?(name_string)
    name_string.split(/\s+/).size >= 2
  end

end
|
66
|
+
|
67
|
+
# Command-line entry point: dumps the leaves and empty nodes found below
# node_id of a Darwin Core Archive as JSON for family-reunion.
if __FILE__ == $0

  # Both the archive path and the node id are required.
  unless ARGV[1]
    puts "script creates a json file with data compatible for family-reunion from a darwin core archive"
    puts "Usage #{$0} path_to_dwca_file node_id [output_file]"
    puts "output_file is optional"
    exit
  end

  dwca_file = ARGV[0]
  node_id = ARGV[1]
  paths_file = ARGV[2] || "node_leaves.json"

  node = Node.new(dwca_file)
  leaves, empty_nodes = node.leaves(node_id)
  # Block form closes (and flushes) the file even if writing fails.
  File.open(paths_file, 'w') do |f|
    f.write JSON.dump({:empty_nodes => empty_nodes, :leaves => leaves})
  end
end
|