family-reunion 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/family-reunion.gemspec +2 -2
- data/lib/family-reunion.rb +24 -1
- data/lib/family-reunion/exact_matcher.rb +10 -6
- data/lib/family-reunion/fuzzy_matcher.rb +6 -2
- data/lib/family-reunion/matcher_helper.rb +2 -1
- data/lib/family-reunion/nomatch_organizer.rb +1 -0
- data/lib/family-reunion/top_node.rb +1 -1
- data/spec/family-reunion_spec.rb +1 -0
- metadata +4 -4
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.2
|
1
|
+
0.1.3
|
data/family-reunion.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{family-reunion}
|
8
|
-
s.version = "0.1.2"
|
8
|
+
s.version = "0.1.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Dmitry Mozzherin", "David Shorthouse"]
|
12
|
-
s.date = %q{2011-06-
|
12
|
+
s.date = %q{2011-06-14}
|
13
13
|
s.description = %q{An algorithm to merge related nodes of two taxonomic hierarchies with synonym information}
|
14
14
|
s.email = %q{dmozzherin@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
data/lib/family-reunion.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'logger'
|
1
2
|
require 'json'
|
2
3
|
require 'taxamatch_rb'
|
3
4
|
require 'family-reunion/cache'
|
@@ -15,6 +16,24 @@ class FamilyReunion
|
|
15
16
|
attr :primary_valid_names_set, :secondary_valid_names_set
|
16
17
|
attr :primary_synonyms_set, :secondary_synonyms_set
|
17
18
|
|
19
|
+
VERSION = open(File.join(File.dirname(__FILE__), '..', 'VERSION')).readline.strip
|
20
|
+
|
21
|
+
def self.logger
|
22
|
+
@@logger ||= Logger.new(nil)
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.logger=(logger)
|
26
|
+
@@logger = logger
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.logger_reset
|
30
|
+
self.logger = Logger.new(nil)
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.logger_write(obj_id, message, method = :info)
|
34
|
+
self.logger.send(method, "|%s|%s|" % [obj_id, message])
|
35
|
+
end
|
36
|
+
|
18
37
|
def initialize(primary_node, secondary_node)
|
19
38
|
@primary_node = FamilyReunion::TopNode.new(primary_node)
|
20
39
|
@secondary_node = FamilyReunion::TopNode.new(secondary_node)
|
@@ -29,20 +48,24 @@ class FamilyReunion
|
|
29
48
|
merge_exact_matches
|
30
49
|
merge_fuzzy_matches if with_fuzzy_matching
|
31
50
|
merge_no_matches
|
51
|
+
FamilyReunion.logger_write(self.object_id, "Merging is complete")
|
32
52
|
@merges
|
33
53
|
end
|
34
54
|
|
35
55
|
private
|
36
56
|
|
37
57
|
def merge_exact_matches
|
58
|
+
FamilyReunion.logger_write(self.object_id, "Started merging of exact matches")
|
38
59
|
ExactMatcher.new(self).merge
|
39
60
|
end
|
40
61
|
|
41
62
|
def merge_fuzzy_matches
|
63
|
+
FamilyReunion.logger_write(self.object_id, "Started merging of fuzzy matches")
|
42
64
|
FuzzyMatcher.new(self).merge
|
43
65
|
end
|
44
|
-
|
66
|
+
|
45
67
|
def merge_no_matches
|
68
|
+
FamilyReunion.logger_write(self.object_id, "Started gap filling, adding new species and uninomials")
|
46
69
|
NomatchOrganizer.new(self).merge
|
47
70
|
end
|
48
71
|
|
data/lib/family-reunion/exact_matcher.rb
CHANGED
@@ -7,9 +7,13 @@ class FamilyReunion
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def merge
|
10
|
+
FamilyReunion.logger_write(@fr.object_id, "Merging exact matches of accepted names")
|
10
11
|
add_valid_matches(get_valid_matches)
|
12
|
+
FamilyReunion.logger_write(@fr.object_id, "Merging exact matches of accepted names to synonyms")
|
11
13
|
add_synonym_matches(get_valid_to_synonym_matches, :valid_to_synonym)
|
14
|
+
FamilyReunion.logger_write(@fr.object_id, "Merging exact matches of synonyms to accepted names")
|
12
15
|
add_synonym_matches(get_synonym_to_valid_matches, :synonym_to_valid)
|
16
|
+
FamilyReunion.logger_write(@fr.object_id, "Merging exact matches of synonyms")
|
13
17
|
add_synonym_matches(get_synonym_to_synonym_matches, :synonym_to_synonym)
|
14
18
|
end
|
15
19
|
|
@@ -23,9 +27,9 @@ class FamilyReunion
|
|
23
27
|
# Homonyms are treated separately, and are not matched by the algorithm,
|
24
28
|
# they are excluded from valid_matches
|
25
29
|
valid_matches.each do |name|
|
26
|
-
primary_id = @fr.primary_node.valid_names_hash[name][:id]
|
27
|
-
secondary_id = @fr.secondary_node.valid_names_hash[name][:id]
|
28
|
-
@fr.merges[primary_id] = {:matches => {secondary_id
|
30
|
+
primary_id = @fr.primary_node.valid_names_hash[name][:id].to_s.to_sym
|
31
|
+
secondary_id = @fr.secondary_node.valid_names_hash[name][:id].to_s.to_sym
|
32
|
+
@fr.merges[primary_id] = {:matches => {secondary_id => {:match_type => :valid_to_valid}}, :nonmatches => []}
|
29
33
|
end
|
30
34
|
end
|
31
35
|
|
@@ -36,7 +40,7 @@ class FamilyReunion
|
|
36
40
|
def get_synonym_to_valid_matches
|
37
41
|
@fr.primary_synonyms_set & @fr.secondary_valid_names_set
|
38
42
|
end
|
39
|
-
|
43
|
+
|
40
44
|
def get_synonym_to_synonym_matches
|
41
45
|
@fr.primary_synonyms_set & @fr.secondary_synonyms_set
|
42
46
|
end
|
@@ -61,9 +65,9 @@ class FamilyReunion
|
|
61
65
|
valid_names = node.valid_names_hash
|
62
66
|
synonyms = node.synonyms_hash
|
63
67
|
if valid_names.has_key?(name)
|
64
|
-
return [valid_names[name][:id]]
|
68
|
+
return [valid_names[name][:id].to_s.to_sym]
|
65
69
|
else
|
66
|
-
return synonyms[name].map {|n| n[:id]}
|
70
|
+
return synonyms[name].map {|n| n[:id].to_s.to_sym}
|
67
71
|
end
|
68
72
|
end
|
69
73
|
|
data/lib/family-reunion/fuzzy_matcher.rb
CHANGED
@@ -8,9 +8,13 @@ class FamilyReunion
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def merge
|
11
|
+
FamilyReunion.logger_write(@fr.object_id, "Merging fuzzy matches of accepted names")
|
11
12
|
add_matches(get_valid_matches, :fuzzy_valid_to_valid)
|
13
|
+
FamilyReunion.logger_write(@fr.object_id, "Merging fuzzy matches of accepted names to synonyms")
|
12
14
|
add_matches(get_valid_to_synonym_matches, :fuzzy_valid_to_synonym)
|
15
|
+
FamilyReunion.logger_write(@fr.object_id, "Merging fuzzy matches of synonyms to accepted names")
|
13
16
|
add_matches(get_synonym_to_valid_matches, :fuzzy_synonym_to_valid)
|
17
|
+
FamilyReunion.logger_write(@fr.object_id, "Merging fuzzy matches of synonyms")
|
14
18
|
add_matches(get_synonym_to_synonym_matches, :fuzzy_synonym_to_synonym)
|
15
19
|
end
|
16
20
|
|
@@ -42,8 +46,8 @@ class FamilyReunion
|
|
42
46
|
|
43
47
|
def add_matches(matched_nodes, match_type)
|
44
48
|
matched_nodes.each do |primary_node, secondary_nodes|
|
45
|
-
primary_id = primary_node[:id]
|
46
|
-
secondary_ids = secondary_nodes.map { |n| n[:id] }
|
49
|
+
primary_id = primary_node[:id].to_s.to_sym
|
50
|
+
secondary_ids = secondary_nodes.map { |n| n[:id].to_s.to_sym }
|
47
51
|
secondary_id_matches = format_secondary_id_for_merge(secondary_ids, match_type)
|
48
52
|
add_record_to_merges(primary_id, secondary_id_matches)
|
49
53
|
end
|
data/lib/family-reunion/matcher_helper.rb
CHANGED
@@ -3,13 +3,14 @@ class FamilyReunion
|
|
3
3
|
private
|
4
4
|
def format_secondary_id_for_merge(secondary_ids, match_type)
|
5
5
|
secondary_ids.inject({}) do |res, i|
|
6
|
-
|
6
|
+
raise "Secondary id is not a symbol" unless i.is_a?(Symbol)
|
7
7
|
res[i] = {:match_type => match_type} unless res.has_key?(i)
|
8
8
|
res
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
12
12
|
def add_record_to_merges(primary_id, secondary_id_matches)
|
13
|
+
raise "Primary id is not a symbol" unless primary_id.is_a?(Symbol)
|
13
14
|
if @fr.merges.has_key?(primary_id)
|
14
15
|
secondary_id_matches.each do |key, val|
|
15
16
|
@fr.merges[primary_id][:matches][key] = val unless @fr.merges[primary_id][:matches].has_key?(key)
|
data/lib/family-reunion/top_node.rb
CHANGED
@@ -76,7 +76,7 @@ class FamilyReunion
|
|
76
76
|
|
77
77
|
def update_paths_hash(node)
|
78
78
|
path = node[:path].map { |n| n.to_sym }
|
79
|
-
path_ids = node[:path_ids].map { |i| i.to_sym }
|
79
|
+
path_ids = node[:path_ids].map { |i| i.to_s.to_sym }
|
80
80
|
until path.empty?
|
81
81
|
populate_paths_hash(path, path_ids)
|
82
82
|
path.pop
|
data/spec/family-reunion_spec.rb
CHANGED
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 2
|
9
|
-
version: 0.1.2
|
8
|
+
- 3
|
9
|
+
version: 0.1.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Dmitry Mozzherin
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-06-
|
18
|
+
date: 2011-06-14 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -233,7 +233,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
233
233
|
requirements:
|
234
234
|
- - ">="
|
235
235
|
- !ruby/object:Gem::Version
|
236
|
-
hash:
|
236
|
+
hash: 4425929328463574267
|
237
237
|
segments:
|
238
238
|
- 0
|
239
239
|
version: "0"
|