family-reunion 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/.rvmrc +1 -0
  4. data/Gemfile +27 -0
  5. data/Gemfile.lock +82 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.rdoc +19 -0
  8. data/Rakefile +52 -0
  9. data/VERSION +1 -0
  10. data/family-reunion.gemspec +108 -0
  11. data/features/family-reunion.feature +9 -0
  12. data/features/step_definitions/family-reunion_steps.rb +0 -0
  13. data/features/support/env.rb +13 -0
  14. data/lib/family-reunion.rb +49 -0
  15. data/lib/family-reunion/cache.rb +13 -0
  16. data/lib/family-reunion/exact_matcher.rb +72 -0
  17. data/lib/family-reunion/fuzzy_matcher.rb +93 -0
  18. data/lib/family-reunion/matcher_helper.rb +22 -0
  19. data/lib/family-reunion/nomatch_organizer.rb +57 -0
  20. data/lib/family-reunion/taxamatch_preprocessor.rb +103 -0
  21. data/lib/family-reunion/taxamatch_wrapper.rb +54 -0
  22. data/lib/family-reunion/top_node.rb +102 -0
  23. data/scripts/dwca2fr.rb +84 -0
  24. data/spec/family-reunion_spec.rb +20 -0
  25. data/spec/fixtures/ants_primary.json +1 -0
  26. data/spec/fixtures/ants_secondary.json +1 -0
  27. data/spec/fixtures/matched_merges.json +1 -0
  28. data/spec/fixtures/nodes_to_match.json +1 -0
  29. data/spec/fixtures/synonyms_strings_primary.json +1 -0
  30. data/spec/fixtures/synonyms_strings_secondary.json +1 -0
  31. data/spec/fixtures/valid_names_strings_primary.json +1 -0
  32. data/spec/fixtures/valid_names_strings_secondary.json +1 -0
  33. data/spec/fuzzy_matcher_spec.rb +32 -0
  34. data/spec/node_spec.rb +26 -0
  35. data/spec/nomatch_organizer_spec.rb +23 -0
  36. data/spec/spec_helper.rb +29 -0
  37. data/spec/taxamatch_preprocessor_spec.rb +49 -0
  38. data/spec/taxamatch_wrapper_spec.rb +21 -0
  39. metadata +256 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use ruby-1.9.2-p0
data/Gemfile ADDED
@@ -0,0 +1,27 @@
1
+ source "http://rubygems.org"
2
+ require 'yaml'
3
+ YAML::ENGINE.yamler= 'syck'
4
+
5
+
6
+ # Add dependencies required to use your gem here.
7
+ # Example:
8
+ # gem "activesupport", ">= 2.3.5"
9
+
10
+ # Add dependencies to develop your gem here.
11
+ # Include everything needed to run rake, tests, features, etc.
12
+
13
+ gem "dwc-archive", ">= 0.5.13"
14
+ gem "taxamatch_rb", ">= 0.6.5"
15
+
16
+
17
+ group :development do
18
+ gem "rspec", "~> 2.3.0"
19
+ gem "cucumber", ">= 0"
20
+ gem "bundler", "~> 1.0.0"
21
+ gem "jeweler", "~> 1.6.0"
22
+ gem "rcov", ">= 0"
23
+ gem "ruby-debug19"
24
+ gem "ruby-prof"
25
+ gem "shoulda"
26
+ gem "mocha"
27
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,82 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ RubyInline (3.9.0)
5
+ ZenTest (~> 4.3)
6
+ ZenTest (4.5.0)
7
+ archive-tar-minitar (0.5.2)
8
+ biodiversity (0.7.3)
9
+ json
10
+ treetop
11
+ builder (3.0.0)
12
+ columnize (0.3.2)
13
+ cucumber (0.10.2)
14
+ builder (>= 2.1.2)
15
+ diff-lcs (>= 1.1.2)
16
+ gherkin (>= 2.3.5)
17
+ json (>= 1.4.6)
18
+ term-ansicolor (>= 1.0.5)
19
+ diff-lcs (1.1.2)
20
+ dwc-archive (0.5.13)
21
+ parsley-store (>= 0.2.0)
22
+ gherkin (2.3.7)
23
+ json (>= 1.4.6)
24
+ git (1.2.5)
25
+ jeweler (1.6.0)
26
+ bundler (~> 1.0.0)
27
+ git (>= 1.2.5)
28
+ rake
29
+ json (1.5.1)
30
+ linecache19 (0.5.11)
31
+ ruby_core_source (>= 0.1.4)
32
+ mocha (0.9.8)
33
+ rake
34
+ parsley-store (0.2.2)
35
+ biodiversity
36
+ redis
37
+ polyglot (0.3.1)
38
+ rake (0.8.7)
39
+ rcov (0.9.9)
40
+ redis (2.2.0)
41
+ rspec (2.3.0)
42
+ rspec-core (~> 2.3.0)
43
+ rspec-expectations (~> 2.3.0)
44
+ rspec-mocks (~> 2.3.0)
45
+ rspec-core (2.3.1)
46
+ rspec-expectations (2.3.0)
47
+ diff-lcs (~> 1.1.2)
48
+ rspec-mocks (2.3.0)
49
+ ruby-debug-base19 (0.11.24)
50
+ columnize (>= 0.3.1)
51
+ linecache19 (>= 0.5.11)
52
+ ruby_core_source (>= 0.1.4)
53
+ ruby-debug19 (0.11.6)
54
+ columnize (>= 0.3.1)
55
+ linecache19 (>= 0.5.11)
56
+ ruby-debug-base19 (>= 0.11.19)
57
+ ruby-prof (0.10.7)
58
+ ruby_core_source (0.1.4)
59
+ archive-tar-minitar (>= 0.5.2)
60
+ shoulda (2.11.3)
61
+ taxamatch_rb (0.6.5)
62
+ RubyInline
63
+ biodiversity (>= 0.5.13)
64
+ term-ansicolor (1.0.5)
65
+ treetop (1.4.9)
66
+ polyglot (>= 0.3.1)
67
+
68
+ PLATFORMS
69
+ ruby
70
+
71
+ DEPENDENCIES
72
+ bundler (~> 1.0.0)
73
+ cucumber
74
+ dwc-archive (>= 0.5.13)
75
+ jeweler (~> 1.6.0)
76
+ mocha
77
+ rcov
78
+ rspec (~> 2.3.0)
79
+ ruby-debug19
80
+ ruby-prof
81
+ shoulda
82
+ taxamatch_rb (>= 0.6.5)
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Dmitry Mozzherin
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = family-reunion
2
+
3
+ An algorithm to merge related nodes of two taxonomic hierarchies with synonym information.
4
+
5
+ == Contributing to family-reunion
6
+
7
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
8
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
9
+ * Fork the project
10
+ * Start a feature/bugfix branch
11
+ * Commit and push until you are happy with your contribution
12
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or changing them is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2011 Marine Biological Laboratory. See LICENSE.txt for
18
+ further details.
19
+
data/Rakefile ADDED
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "family-reunion"
18
+ gem.homepage = "http://github.com/dimus/family-reunion"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{An algorithm to merge related nodes of two taxonomic hierarchies}
21
+ gem.description = %Q{An algorithm to merge related nodes of two taxonomic hierarchies with synonym information}
22
+ gem.email = "dmozzherin@gmail.com"
23
+ gem.authors = ["Dmitry Mozzherin", "David Shorthouse"]
24
+ # dependencies defined in Gemfile
25
+ end
26
+ Jeweler::RubygemsDotOrgTasks.new
27
+
28
+ require 'rspec/core'
29
+ require 'rspec/core/rake_task'
30
+ RSpec::Core::RakeTask.new(:spec) do |spec|
31
+ spec.pattern = FileList['spec/**/*_spec.rb']
32
+ end
33
+
34
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
35
+ spec.pattern = 'spec/**/*_spec.rb'
36
+ spec.rcov = true
37
+ end
38
+
39
+ require 'cucumber/rake/task'
40
+ Cucumber::Rake::Task.new(:features)
41
+
42
+ task :default => :spec
43
+
44
+ require 'rake/rdoctask'
45
+ Rake::RDocTask.new do |rdoc|
46
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
47
+
48
+ rdoc.rdoc_dir = 'rdoc'
49
+ rdoc.title = "family-reunion #{version}"
50
+ rdoc.rdoc_files.include('README*')
51
+ rdoc.rdoc_files.include('lib/**/*.rb')
52
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.1
@@ -0,0 +1,108 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{family-reunion}
8
+ s.version = "0.1.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Dmitry Mozzherin", "David Shorthouse"]
12
+ s.date = %q{2011-06-07}
13
+ s.description = %q{An algorithm to merge related nodes of two taxonomic hierarchies with synonym information}
14
+ s.email = %q{dmozzherin@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ ".rvmrc",
23
+ "Gemfile",
24
+ "Gemfile.lock",
25
+ "LICENSE.txt",
26
+ "README.rdoc",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "family-reunion.gemspec",
30
+ "features/family-reunion.feature",
31
+ "features/step_definitions/family-reunion_steps.rb",
32
+ "features/support/env.rb",
33
+ "lib/family-reunion.rb",
34
+ "lib/family-reunion/cache.rb",
35
+ "lib/family-reunion/exact_matcher.rb",
36
+ "lib/family-reunion/fuzzy_matcher.rb",
37
+ "lib/family-reunion/matcher_helper.rb",
38
+ "lib/family-reunion/nomatch_organizer.rb",
39
+ "lib/family-reunion/taxamatch_preprocessor.rb",
40
+ "lib/family-reunion/taxamatch_wrapper.rb",
41
+ "lib/family-reunion/top_node.rb",
42
+ "scripts/dwca2fr.rb",
43
+ "spec/family-reunion_spec.rb",
44
+ "spec/fixtures/ants_primary.json",
45
+ "spec/fixtures/ants_secondary.json",
46
+ "spec/fixtures/matched_merges.json",
47
+ "spec/fixtures/nodes_to_match.json",
48
+ "spec/fixtures/synonyms_strings_primary.json",
49
+ "spec/fixtures/synonyms_strings_secondary.json",
50
+ "spec/fixtures/valid_names_strings_primary.json",
51
+ "spec/fixtures/valid_names_strings_secondary.json",
52
+ "spec/fuzzy_matcher_spec.rb",
53
+ "spec/node_spec.rb",
54
+ "spec/nomatch_organizer_spec.rb",
55
+ "spec/spec_helper.rb",
56
+ "spec/taxamatch_preprocessor_spec.rb",
57
+ "spec/taxamatch_wrapper_spec.rb"
58
+ ]
59
+ s.homepage = %q{http://github.com/dimus/family-reunion}
60
+ s.licenses = ["MIT"]
61
+ s.require_paths = ["lib"]
62
+ s.rubygems_version = %q{1.3.7}
63
+ s.summary = %q{An algorithm to merge related nodes of two taxonomic hierarchies}
64
+
65
+ if s.respond_to? :specification_version then
66
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
67
+ s.specification_version = 3
68
+
69
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
70
+ s.add_runtime_dependency(%q<dwc-archive>, [">= 0.5.13"])
71
+ s.add_runtime_dependency(%q<taxamatch_rb>, [">= 0.6.5"])
72
+ s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
73
+ s.add_development_dependency(%q<cucumber>, [">= 0"])
74
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
75
+ s.add_development_dependency(%q<jeweler>, ["~> 1.6.0"])
76
+ s.add_development_dependency(%q<rcov>, [">= 0"])
77
+ s.add_development_dependency(%q<ruby-debug19>, [">= 0"])
78
+ s.add_development_dependency(%q<ruby-prof>, [">= 0"])
79
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
80
+ s.add_development_dependency(%q<mocha>, [">= 0"])
81
+ else
82
+ s.add_dependency(%q<dwc-archive>, [">= 0.5.13"])
83
+ s.add_dependency(%q<taxamatch_rb>, [">= 0.6.5"])
84
+ s.add_dependency(%q<rspec>, ["~> 2.3.0"])
85
+ s.add_dependency(%q<cucumber>, [">= 0"])
86
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
87
+ s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
88
+ s.add_dependency(%q<rcov>, [">= 0"])
89
+ s.add_dependency(%q<ruby-debug19>, [">= 0"])
90
+ s.add_dependency(%q<ruby-prof>, [">= 0"])
91
+ s.add_dependency(%q<shoulda>, [">= 0"])
92
+ s.add_dependency(%q<mocha>, [">= 0"])
93
+ end
94
+ else
95
+ s.add_dependency(%q<dwc-archive>, [">= 0.5.13"])
96
+ s.add_dependency(%q<taxamatch_rb>, [">= 0.6.5"])
97
+ s.add_dependency(%q<rspec>, ["~> 2.3.0"])
98
+ s.add_dependency(%q<cucumber>, [">= 0"])
99
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
100
+ s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
101
+ s.add_dependency(%q<rcov>, [">= 0"])
102
+ s.add_dependency(%q<ruby-debug19>, [">= 0"])
103
+ s.add_dependency(%q<ruby-prof>, [">= 0"])
104
+ s.add_dependency(%q<shoulda>, [">= 0"])
105
+ s.add_dependency(%q<mocha>, [">= 0"])
106
+ end
107
+ end
108
+
@@ -0,0 +1,9 @@
1
+ Feature: something something
2
+ In order to something something
3
+ A user something something
4
+ something something something
5
+
6
+ Scenario: something something
7
+ Given inspiration
8
+ When I create a sweet new gem
9
+ Then everyone should see how awesome I am
File without changes
@@ -0,0 +1,13 @@
1
+ require 'bundler'
2
+ begin
3
+ Bundler.setup(:default, :development)
4
+ rescue Bundler::BundlerError => e
5
+ $stderr.puts e.message
6
+ $stderr.puts "Run `bundle install` to install missing gems"
7
+ exit e.status_code
8
+ end
9
+
10
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
11
+ require 'family-reunion'
12
+
13
+ require 'rspec/expectations'
@@ -0,0 +1,49 @@
1
+ require 'json'
2
+ require 'taxamatch_rb'
3
+ require 'family-reunion/cache'
4
+ require 'family-reunion/top_node'
5
+ require 'family-reunion/matcher_helper'
6
+ require 'family-reunion/exact_matcher'
7
+ require 'family-reunion/fuzzy_matcher'
8
+ require 'family-reunion/taxamatch_wrapper'
9
+ require 'family-reunion/taxamatch_preprocessor'
10
+ require 'family-reunion/nomatch_organizer'
11
+
12
+
13
# Set is used in #initialize; require it explicitly instead of relying on a
# dependency happening to load it first.
require 'set'

# Merges two related taxonomic hierarchies (a primary and a secondary node).
#
# Both inputs are wrapped in FamilyReunion::TopNode. The name strings found in
# each wrapper's valid_names_hash and synonyms_hash are cached as Sets so the
# matchers can intersect them cheaply.
class FamilyReunion
  # Wrapped input nodes and the accumulated merge results.
  attr_reader :primary_node, :secondary_node, :merges
  # Keys of each node's valid_names_hash.
  attr_reader :primary_valid_names_set, :secondary_valid_names_set
  # Keys of each node's synonyms_hash.
  attr_reader :primary_synonyms_set, :secondary_synonyms_set

  # primary_node   - raw data for the primary hierarchy, passed to TopNode.new
  # secondary_node - raw data for the secondary hierarchy, passed to TopNode.new
  def initialize(primary_node, secondary_node)
    @primary_node = FamilyReunion::TopNode.new(primary_node)
    @secondary_node = FamilyReunion::TopNode.new(secondary_node)
    @primary_valid_names_set = Set.new(@primary_node.valid_names_hash.keys)
    @secondary_valid_names_set = Set.new(@secondary_node.valid_names_hash.keys)
    @primary_synonyms_set = Set.new(@primary_node.synonyms_hash.keys)
    @secondary_synonyms_set = Set.new(@secondary_node.synonyms_hash.keys)
    @merges = {}
  end

  # Runs the matching stages in order: exact matches, then (optionally) fuzzy
  # matches, then the organizer for names that found no match.
  #
  # with_fuzzy_matching - pass false to skip the fuzzy stage (default: true)
  #
  # Returns the merges Hash, which the stages fill in place.
  def merge(with_fuzzy_matching = true)
    merge_exact_matches
    merge_fuzzy_matches if with_fuzzy_matching
    merge_no_matches
    @merges
  end

  private

  def merge_exact_matches
    ExactMatcher.new(self).merge
  end

  def merge_fuzzy_matches
    FuzzyMatcher.new(self).merge
  end

  def merge_no_matches
    NomatchOrganizer.new(self).merge
  end
end
@@ -0,0 +1,13 @@
1
class FamilyReunion
  # In-memory container for four lookup tables, each starting as an empty
  # Hash. The key/value layout of each table is decided by the code that fills
  # it, which is not visible in this file.
  class Cache
    # All four tables are exposed read-only; callers mutate the returned Hash
    # in place. The taxamatch tables previously had no reader, unlike their
    # two siblings.
    attr_reader :word_letters, :similar_words,
                :taxamatch_genus, :taxamatch_species

    def initialize
      @word_letters = {}
      @similar_words = {}
      @taxamatch_genus = {}
      @taxamatch_species = {}
    end
  end
end
@@ -0,0 +1,72 @@
1
class FamilyReunion
  # Records merges for names that are string-identical in both hierarchies.
  #
  # Four kinds of exact match are handled, in this order: valid-to-valid,
  # valid-to-synonym, synonym-to-valid and synonym-to-synonym. Results are
  # written into the merges Hash of the FamilyReunion passed to the
  # constructor.
  class ExactMatcher
    # Supplies format_secondary_id_for_merge and add_record_to_merges
    # (defined outside this file).
    include MatcherHelper

    # family_reunion - the FamilyReunion whose name sets are read and whose
    #                  merges Hash is updated
    def initialize(family_reunion)
      @fr = family_reunion
    end

    # Runs all four exact-match passes against the FamilyReunion's merges.
    def merge
      add_valid_matches(get_valid_matches)
      add_synonym_matches(get_valid_to_synonym_matches, :valid_to_synonym)
      add_synonym_matches(get_synonym_to_valid_matches, :synonym_to_valid)
      add_synonym_matches(get_synonym_to_synonym_matches, :synonym_to_synonym)
    end

    private

    # Names that are valid in both hierarchies.
    def get_valid_matches
      @fr.primary_valid_names_set & @fr.secondary_valid_names_set
    end

    # Creates one merges entry per shared valid name, keyed by the primary id.
    # Any existing entry for that primary id is replaced.
    #
    # Homonyms are treated separately and are not matched by the algorithm;
    # they are expected to be absent from valid_matches already (presumably
    # filtered when valid_names_hash is built -- not visible here).
    def add_valid_matches(valid_matches)
      valid_matches.each do |name|
        primary_id = @fr.primary_node.valid_names_hash[name][:id]
        secondary_id = @fr.secondary_node.valid_names_hash[name][:id]
        # Only the secondary id is stringified; the primary id is used as-is.
        @fr.merges[primary_id] = {
          :matches => { secondary_id.to_s => { :match_type => :valid_to_valid } },
          :nonmatches => []
        }
      end
    end

    # Names valid in the primary hierarchy that are synonyms in the secondary.
    def get_valid_to_synonym_matches
      @fr.primary_valid_names_set & @fr.secondary_synonyms_set
    end

    # Names that are synonyms in the primary hierarchy and valid in the
    # secondary.
    def get_synonym_to_valid_matches
      @fr.primary_synonyms_set & @fr.secondary_valid_names_set
    end

    # Names that are synonyms in both hierarchies.
    def get_synonym_to_synonym_matches
      @fr.primary_synonyms_set & @fr.secondary_synonyms_set
    end

    # For every name in match_set, resolves the ids it refers to on each side
    # and records the formatted secondary ids against each primary id.
    #
    # match_set  - enumerable of name strings present in both hierarchies
    # match_type - Symbol describing the kind of match being recorded
    def add_synonym_matches(match_set, match_type)
      match_set.each do |name|
        primary_ids, secondary_ids = get_valid_name_ids(name)
        secondary_id_matches = format_secondary_id_for_merge(secondary_ids, match_type)
        primary_ids.each do |primary_id|
          add_record_to_merges(primary_id, secondary_id_matches)
        end
      end
    end

    # Returns [primary_ids, secondary_ids] for the given name.
    def get_valid_name_ids(name)
      [
        get_ids_from_node(name, @fr.primary_node),
        get_ids_from_node(name, @fr.secondary_node)
      ]
    end

    # Returns an Array of ids for name within node: the single id of the
    # valid-name entry when the name is valid there, otherwise the :id of
    # every entry listed under the name in synonyms_hash.
    #
    # Callers only pass names taken from the node's own key sets, so the name
    # is expected to be present in one of the two hashes.
    def get_ids_from_node(name, node)
      valid_names = node.valid_names_hash
      if valid_names.key?(name)
        [valid_names[name][:id]]
      else
        node.synonyms_hash[name].map { |entry| entry[:id] }
      end
    end
  end
end
72
+