synonym-finder 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use ruby-1.9.2-p290@synfnd --create
data/Gemfile ADDED
@@ -0,0 +1,19 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+ gem 'sqlite3'
6
+ gem 'taxamatch_rb'
7
+ gem 'biodiversity19'
8
+ gem 'ruby-stemmer'
9
+
10
+ # Add dependencies to develop your gem here.
11
+ # Include everything needed to run rake, tests, features, etc.
12
+ group :development do
13
+ gem "ruby-debug19"
14
+ gem "rspec", "~> 2.3.0"
15
+ gem "cucumber", ">= 0"
16
+ gem "bundler", "~> 1.0.0"
17
+ gem "jeweler", "~> 1.6.0"
18
+ gem "rcov", ">= 0"
19
+ end
@@ -0,0 +1,72 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ archive-tar-minitar (0.5.2)
5
+ biodiversity (0.7.3)
6
+ json
7
+ treetop
8
+ biodiversity19 (0.7.3)
9
+ treetop
10
+ builder (3.0.0)
11
+ columnize (0.3.3)
12
+ cucumber (1.0.2)
13
+ builder (>= 2.1.2)
14
+ diff-lcs (>= 1.1.2)
15
+ gherkin (~> 2.4.5)
16
+ json (>= 1.4.6)
17
+ term-ansicolor (>= 1.0.5)
18
+ diff-lcs (1.1.2)
19
+ gherkin (2.4.5)
20
+ json (>= 1.4.6)
21
+ git (1.2.5)
22
+ jeweler (1.6.4)
23
+ bundler (~> 1.0)
24
+ git (>= 1.2.5)
25
+ rake
26
+ json (1.5.3)
27
+ linecache19 (0.5.12)
28
+ ruby_core_source (>= 0.1.4)
29
+ polyglot (0.3.1)
30
+ rake (0.9.2)
31
+ rcov (0.9.9)
32
+ rspec (2.3.0)
33
+ rspec-core (~> 2.3.0)
34
+ rspec-expectations (~> 2.3.0)
35
+ rspec-mocks (~> 2.3.0)
36
+ rspec-core (2.3.1)
37
+ rspec-expectations (2.3.0)
38
+ diff-lcs (~> 1.1.2)
39
+ rspec-mocks (2.3.0)
40
+ ruby-debug-base19 (0.11.25)
41
+ columnize (>= 0.3.1)
42
+ linecache19 (>= 0.5.11)
43
+ ruby_core_source (>= 0.1.4)
44
+ ruby-debug19 (0.11.6)
45
+ columnize (>= 0.3.1)
46
+ linecache19 (>= 0.5.11)
47
+ ruby-debug-base19 (>= 0.11.19)
48
+ ruby-stemmer (0.9.1)
49
+ ruby_core_source (0.1.5)
50
+ archive-tar-minitar (>= 0.5.2)
51
+ sqlite3 (1.3.3)
52
+ taxamatch_rb (0.7.6)
53
+ biodiversity (~> 0.7.3)
54
+ biodiversity19 (~> 0.7.3)
55
+ term-ansicolor (1.0.5)
56
+ treetop (1.4.9)
57
+ polyglot (>= 0.3.1)
58
+
59
+ PLATFORMS
60
+ ruby
61
+
62
+ DEPENDENCIES
63
+ biodiversity19
64
+ bundler (~> 1.0.0)
65
+ cucumber
66
+ jeweler (~> 1.6.0)
67
+ rcov
68
+ rspec (~> 2.3.0)
69
+ ruby-debug19
70
+ ruby-stemmer
71
+ sqlite3
72
+ taxamatch_rb
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Dmitry Mozzherin
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,64 @@
1
+ = synonym-finder
2
+
3
+ Synonym finder is a biodiversity tool for finding homotypic nomenclatural synonyms in taxonomic hierarchies.
4
+
5
+ == Installation
6
+
7
+ gem install synonym-finder
8
+
9
+ == Usage
10
+
11
+ # Prepare input for the gem. It understands the following array of hashes as input:
12
+
13
+ input = [
14
+ {id: 001, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Gnamptogenys", name: "Gnamptogenys porcata (Emery, 1896)"},
15
+ {id: 003, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Gnamptogenys", name: "Gnamptogenys triangularis (Mayr, 1887)"},
16
+ {id: 004, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Gnamptogenys", name: "Gnamptogenys triangularis var. alba Brown 1992"},
17
+ {id: 005, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Gnamptogenys", name: "Gnamptogenys triangularis var. borealis Brown 1992"},
18
+ {id: 100, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Nylanderia", name: "Nylanderia porcata"}, #match 001, no authorship
19
+ {id: 101, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Nylanderia", name: "Nylanderia porcatum Emery, 1896"},
20
+ {id: 102, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Nylanderia", name: "Nylanderia porcatum"}...]
21
+
22
+ # please note that id can be a number or a string
23
+
24
+ require 'synonym-finder'
25
+
26
+ sf = SynonymFinder.new(input)
27
+ output = sf.find_matches
28
+
29
+ # output should be in the following format:
30
+ # matched putative synonyms are collected into groups of different types
31
+ #
32
+ # [ {:type=>"chresonym", :name_ids=>[203, 204]},
33
+ # {:type=>"alt_placement", :name_ids=>[400, 600]},
34
+ # {:type=>"chresonym", :name_ids=>[101, 102]},
35
+ # {:type=>"homotypic", :name_ids=>[203, 303]},
36
+ # {:type=>"lexical_variant", :name_ids=>[800, 803]},
37
+ # {:type=>"lexical_variant", :name_ids=>[801, 802]},
38
+ # {:type=>"homotypic", :name_ids=>[202, 302]},
39
+ # {:type=>"homotypic", :name_ids=>[1, 101]},
40
+ # {:type=>"misplaced_synonym", :name_ids=>[801, 803, 802, 800]}]
41
+
42
+ == Synonym types
43
+
44
+ * homotypic -- possible placement of species to a different genus
45
+ * alt_placement -- possibly the same name (i.e. genus moved to a different family)
46
+ * chresonym -- different authorship for the same canonical form, both having the same parent
47
+ * lexical_variant -- The same parent and genus, but species epithet suffix is different (for example change of the epithet gender)
48
+ * misplaced_synonym -- The same parent, matching species epithet, but genus varies; usually happens if a synonym is located at the same level as the species.
49
+
50
+ == Contributing to synonym-finder
51
+
52
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
53
+ * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
54
+ * Fork the project
55
+ * Start a feature/bugfix branch
56
+ * Commit and push until you are happy with your contribution
57
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
58
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
59
+
60
+ == Copyright
61
+
62
+ Copyright (c) 2011 Dmitry Mozzherin. See LICENSE.txt for
63
+ further details.
64
+
@@ -0,0 +1,52 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "synonym-finder"
18
+ gem.homepage = "http://github.com/dimus/synonym-finder"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{Synonym finder is a biodiversity tool for finding homotypic nomenclatural synonyms in taxonomic hierarchies.}
21
+ gem.description = %Q{Synonym finder is a biodiversity tool for finding homotypic nomenclatural synonyms in taxonomic hierarchies.}
22
+ gem.email = "dmozzherin@gmail.com"
23
+ gem.authors = ["Dmitry Mozzherin"]
24
+ # dependencies defined in Gemfile
25
+ end
26
+ Jeweler::RubygemsDotOrgTasks.new
27
+
28
+ require 'rspec/core'
29
+ require 'rspec/core/rake_task'
30
+ RSpec::Core::RakeTask.new(:spec) do |spec|
31
+ spec.pattern = FileList['spec/**/*_spec.rb']
32
+ end
33
+
34
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
35
+ spec.pattern = 'spec/**/*_spec.rb'
36
+ spec.rcov = true
37
+ end
38
+
39
+ require 'cucumber/rake/task'
40
+ Cucumber::Rake::Task.new(:features)
41
+
42
+ task :default => :spec
43
+
44
+ require 'rake/rdoctask'
45
+ Rake::RDocTask.new do |rdoc|
46
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
47
+
48
+ rdoc.rdoc_dir = 'rdoc'
49
+ rdoc.title = "synonym-finder #{version}"
50
+ rdoc.rdoc_files.include('README*')
51
+ rdoc.rdoc_files.include('lib/**/*.rb')
52
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.2
@@ -0,0 +1,13 @@
1
+ require 'bundler'
2
+ begin
3
+ Bundler.setup(:default, :development)
4
+ rescue Bundler::BundlerError => e
5
+ $stderr.puts e.message
6
+ $stderr.puts "Run `bundle install` to install missing gems"
7
+ exit e.status_code
8
+ end
9
+
10
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
11
+ require 'synonym-finder'
12
+
13
+ require 'rspec/expectations'
@@ -0,0 +1,9 @@
1
+ Feature: something something
2
+ In order to something something
3
+ A user something something
4
+ something something something
5
+
6
+ Scenario: something something
7
+ Given inspiration
8
+ When I create a sweet new gem
9
+ Then everyone should see how awesome I am
@@ -0,0 +1,148 @@
1
+ require 'logger'
2
+ require 'json'
3
+ require 'sqlite3'
4
+ require 'taxamatch_rb'
5
+ require 'lingua/stemmer'
6
+
7
+ Dir["#{File.dirname(__FILE__)}/synonym-finder/**/*.rb"].each {|f| require f}
8
+
9
+ class SynonymFinder
10
+ NO_AUTH_INFO = 10
11
+ PARTIAL_AUTH_INFO = 20
12
+ AUTH_MATCH = 100
13
+ AUTH_NO_MATCH = 0
14
+
15
+ attr :input, :db, :matches, :part_matches
16
+
17
+ def self.logger
18
+ @@logger ||= Logger.new(nil)
19
+ end
20
+
21
+ def self.logger=(logger)
22
+ @@logger = logger
23
+ end
24
+
25
+ def self.logger_reset
26
+ self.logger = Logger.new(nil)
27
+ end
28
+
29
+ def self.logger_write(obj_id, message, method = :info)
30
+ self.logger.send(method, "|%s|%s|" % [obj_id, message])
31
+ end
32
+
33
+
34
+ def initialize(input, in_memory = true)
35
+ @input = input
36
+ @atomizer = Taxamatch::Atomizer.new
37
+ @tm = Taxamatch::Base.new
38
+ @stemmer = Lingua::Stemmer.new(:language => "latin")
39
+ @db = init_db(in_memory)
40
+ #tmp_populate
41
+ build_tree unless @db.execute("select count(*) from names")[0][0].to_i > 0
42
+ @matches = {}
43
+ @part_matches = {}
44
+ @duplicate_finder = DuplicateFinder.new(self)
45
+ @group_organizer = GroupOrganizer.new(self)
46
+ end
47
+
48
+ def find_matches(threshold = 5)
49
+ @duplicate_finder.canonical_duplicates
50
+ matches = @duplicate_finder.species_epithet_duplicates(threshold)
51
+ matches = compare_authorship(matches)
52
+ clean_up(matches)
53
+ @group_organizer.organize
54
+ end
55
+
56
+ private
57
+
58
+ def clean_up(matches)
59
+ matches.each do |key, value|
60
+ next if value[:type] != :chresonym && value[:auth_match] < 20
61
+ value[:auth_match] == 100 || value[:type] == :chresonym ? @matches[key] = value : @part_matches[key] = value
62
+ end
63
+ end
64
+
65
+ def compare_authorship(matches)
66
+ SynonymFinder.logger_write(self.object_id, "Matching authorship")
67
+ count = 0
68
+ matches.each do |key, value|
69
+ count += 1
70
+ SynonymFinder.logger_write(self.object_id, "Matching authors %s" % count) if count % 1000 == 0
71
+ ids = key.join(",")
72
+ res = @db.execute("select authors, years from names where id in (#{ids})")
73
+ data1 = {:all_authors => Marshal.load(res[0][0]), :all_years =>Marshal.load(res[0][1])}
74
+ data2 = {:all_authors => Marshal.load(res[1][0]), :all_years =>Marshal.load(res[1][1])}
75
+ if (data1[:all_authors] + data1[:all_years] + data2[:all_authors] + data2 [:all_years]) == []
76
+ value[:auth_match] = NO_AUTH_INFO
77
+ elsif (data1[:all_authors] + data1[:all_years]).empty? || (data2[:all_authors] + data2[:all_years]).empty?
78
+ value[:auth_match] = PARTIAL_AUTH_INFO
79
+ else
80
+ value[:auth_match] = @tm.match_authors(data1, data2) == 0 ? AUTH_NO_MATCH : AUTH_MATCH
81
+ end
82
+ end
83
+ matches
84
+ end
85
+
86
+ def build_tree
87
+ SynonymFinder.logger_write(self.object_id, "Ingesting data")
88
+ @input.each_with_index do |row, i|
89
+ i += 1
90
+ SynonymFinder.logger_write(self.object_id, "Ingesting record %s" % i) if i % 1000 == 0
91
+ atomized_name = @atomizer.parse row[:name] rescue nil
92
+ next unless atomized_name && atomized_name[:species]
93
+ species_string = get_species(atomized_name)
94
+ canonical_name = atomized_name[:genus][:string] + " " + species_string
95
+ @db.execute("insert into names (id, name, authors, years) values (?, ?, ?, ?)", [row[:id], row[:name], Marshal.dump(atomized_name[:all_authors]), Marshal.dump(atomized_name[:all_years])])
96
+ @db.execute("insert into name_parts (name_id, path, canonical, epithet, epithet_stem) values (?, ?, ?, ?, ?)", [row[:id], row[:path], canonical_name, species_string, stem_epithet(species_string)])
97
+ end
98
+ end
99
+
100
+ def init_db(in_memory)
101
+ if in_memory == true
102
+ db = SQLite3::Database.new( ":memory:" )
103
+ create_tables(db)
104
+ else
105
+ db_file = "/tmp/syn_finder.sql"
106
+ db_exist = File.exist?(db_file)
107
+ db = SQLite3::Database.new("/tmp/syn_finder.sql")
108
+ unless db_exist
109
+ create_tables(db)
110
+ end
111
+ end
112
+ db
113
+ end
114
+
115
+ def create_tables(db)
116
+ db.execute("create table names (id string primary key, name string, authors, years)")
117
+ # db.execute("create table paths (id integer primary key autoincrement, path)")
118
+ # db.execute("create table paths_names (path_id integer, name_id string, level integer, primary key (path_id, name_id))")
119
+ db.execute("create table name_parts (name_id string, path string, canonical string, epithet string, epithet_stem string)")
120
+ db.execute("create index idx_name_parts_1 on name_parts (canonical)")
121
+ db.execute("create index idx_name_parts_2 on name_parts (epithet_stem)")
122
+ db.execute("create table groups (id integer primary key, type)")
123
+ db.execute("create table names_groups (name_id integer, group_id integer, score_max integer, score_sum integer, score_num integer, primary key (name_id, group_id))")
124
+ db.execute("create index idx_names_groups_2 on names_groups (group_id)")
125
+ end
126
+
127
+ def get_species(atomized_name)
128
+ species = [atomized_name[:species][:string]]
129
+ species += atomized_name[:infraspecies].map {|i| i[:string]} if atomized_name[:infraspecies]
130
+ species.join(" ")
131
+ end
132
+
133
+ def stem_epithet(epithet)
134
+ epithet.split(" ").map { |e| @stemmer.stem(e) }.join(" ")
135
+ end
136
+
137
+ def tmp_populate
138
+ f = open("/tmp/dump.sql")
139
+ f.each_with_index do |line, i|
140
+ i += 1
141
+ puts "loading from dump line %s" % i if i % 10000 == 0
142
+ if line.match /INSERT/
143
+ @db.execute(line.strip)
144
+ end
145
+ end
146
+ end
147
+
148
+ end
@@ -0,0 +1,87 @@
1
+ class SynonymFinder
2
+ class DuplicateFinder
3
+
4
+ def initialize(synonym_finder)
5
+ @synonym_finder = synonym_finder
6
+ @db = @synonym_finder.db
7
+ @matches = {}
8
+ end
9
+
10
+ def canonical_duplicates
11
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Processing canonical forms")
12
+ @db.execute("select canonical from name_parts group by canonical having count(*) > 1").each_with_index do |canonical, i|
13
+ i = i + 1
14
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Processing canonical form candidate %s" % i) if i % 100 == 0
15
+ names = @db.execute("select name_id, path from name_parts where canonical = ?", canonical)
16
+ find_pairs(names)
17
+ end
18
+ @matches.each do |key, value|
19
+ if value[:total_distance] == 0
20
+ value[:type] = :chresonym
21
+ else
22
+ value[:type] = :alt_placement
23
+ end
24
+ end
25
+ @matches
26
+ end
27
+
28
+ def find_pairs(names, threshold = 0)
29
+ pairs = get_pairs(names)
30
+ pairs.each do |pair|
31
+ key = [pair[0][0], pair[1][0]]
32
+ total_distance = get_total_distance(pair[0][1], pair[1][1])
33
+ value = {:total_distance => total_distance}
34
+ @matches[key] = value if !@matches.has_key?(key) && (threshold == 0 || total_distance <= threshold)
35
+ end
36
+ end
37
+
38
+ def get_total_distance(path1, path2)
39
+ total_distance = path1.size + path2.size
40
+ count = 0
41
+ path1.zip(path2).each do |pair|
42
+ break if pair[0] != pair[1]
43
+ count += 1
44
+ end
45
+ total_distance - count * 2
46
+ end
47
+
48
+ def get_pairs(names)
49
+ names = names.map { |n| [n[0], n[1].to_s.split("|")] }
50
+ pairs = []
51
+ until names.empty?
52
+ name = names.pop
53
+ names.each {|n| pairs << [name, n].sort}
54
+ end
55
+ pairs
56
+ end
57
+
58
+ def species_epithet_duplicates(threshold_distance)
59
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Processing species epithets")
60
+ @db.execute("select epithet_stem from name_parts group by epithet_stem having count(*) > 1").each_with_index do |stem, i|
61
+ i = i + 1
62
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Processing species epithet candidate %s" % i) if i % 100 == 0
63
+ names = @db.execute("select name_id, path from name_parts where epithet_stem = ?", stem)
64
+ find_pairs(names, threshold_distance)
65
+ end
66
+ count = 0
67
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Assigning type to found matches")
68
+ @matches.each do |key, value|
69
+ next if value.has_key?(:type)
70
+ count += 1
71
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Processing match %s" % count) if count % 10000 == 0
72
+ if value[:total_distance] == 0
73
+ epithets = @db.execute("select distinct epithet from name_parts where name_id in (#{key.join(",")})")
74
+ if epithets.size == 1
75
+ value[:type] = :misplaced_synonym
76
+ else
77
+ genera = @db.execute("select canonical from name_parts where name_id in (#{key.join(",")})").map { |c| c[0].split(" ")[0] }.uniq
78
+ value[:type] = genera.size == 1 ? :lexical_variant : :misplaced_synonym
79
+ end
80
+ else
81
+ value[:type] = :homotypic
82
+ end
83
+ end
84
+ @matches
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,121 @@
1
+ class SynonymFinder
2
+ class GroupOrganizer
3
+
4
+ def initialize(synonym_finder)
5
+ @synonym_finder = synonym_finder
6
+ @db = @synonym_finder.db
7
+ @groups = {}
8
+ end
9
+
10
+ # Finds duplication groups for a name. A name can be in one or more duplication groups: chresonym, lexical variant, homotypic, alt placement
11
+ def organize
12
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Grouping results")
13
+ organize_matches
14
+ #organize_partial_matches
15
+ get_output
16
+ end
17
+
18
+ private
19
+
20
+ def organize_matches
21
+ @last_id = 1
22
+ count = 0
23
+ @synonym_finder.matches.each do |key, value|
24
+ count += 1
25
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Grouping match %s" % count) if count % 10000 == 0
26
+ gr1 = get_group(key[0], value[:type])
27
+ gr2 = get_group(key[1], value[:type])
28
+ if gr1 && gr2
29
+ update_group(gr1, gr2) if gr1 != gr2
30
+ key.each { |name_id| update_score(name_id, value) }
31
+ elsif !gr1 && !gr2
32
+ create_group(key, value)
33
+ else
34
+ add_to_group(key, value)
35
+ end
36
+ end
37
+ end
38
+
39
+ def organize_partial_matches
40
+ added = {}
41
+ count = 0
42
+ @synonym_finder.part_matches.each do |key, value|
43
+ count += 1
44
+ SynonymFinder.logger_write(@synonym_finder.object_id, "Adding partial matches %s" % count) if count % 10000 == 0
45
+ gr1 = get_group(key[0], value[:type])
46
+ gr2 = get_group(key[1], value[:type])
47
+ if gr1 || gr2
48
+ group_id, name_id, name_id_db = gr1 ? [gr1, key[1], key[0]] : [gr2, key[0], key[1]] #name without authorship
49
+ unless added[name_id] && added[name_id][name_id_db]
50
+ score = get_score(value)
51
+ @db.execute("insert into names_groups (name_id, group_id, score_max, score_sum, score_num) values (?, ?, ?, ?, 1)", [name_id, group_id, score, score])
52
+ added[name_id] = { name_id_db => 1 }
53
+ end
54
+ else
55
+ create_group(key, value)
56
+ end
57
+ end
58
+ end
59
+
60
+ def get_group(name_id, type)
61
+ return nil unless @groups[name_id]
62
+ @groups[name_id][type]
63
+ end
64
+
65
+ def create_group(key, value)
66
+ @db.execute("insert into groups (id, type) values (?, ?)", [@last_id, value[:type].to_s])
67
+ key.each {|i| @groups[i] = {} unless @groups.has_key?(i) }
68
+ score = get_score(value)
69
+ @groups[key[0]][value[:type]] = @groups[key[1]][value[:type]] = @last_id
70
+ @db.execute("insert into names_groups (name_id, group_id, score_max, score_sum, score_num) values (?, ?, ?, ?, 1)", [key[0], @last_id, score, score])
71
+ @db.execute("insert into names_groups (name_id, group_id, score_max, score_sum, score_num) values (?, ?, ?, ?, 1)", [key[1], @last_id, score, score])
72
+ @last_id += 1
73
+ end
74
+
75
+ def update_group(gr1, gr2)
76
+ @db.execute("update names_groups set group_id = ? where group_id = ?", [gr1, gr2])
77
+ @db.execute("delete from groups where id = ?", gr2)
78
+ end
79
+
80
+ def add_to_group(key, value)
81
+ gr1 = get_group(key[0], value[:type])
82
+ gr2 = get_group(key[1], value[:type])
83
+ name_id1, name_id2, group_id = gr1 ? [key[1], key[0], gr1] : [key[0], key[1], gr2]
84
+ update_score(name_id2, value)
85
+ score = get_score(value)
86
+ @groups[name_id1] = {} unless @groups.has_key?(name_id1)
87
+ @groups[name_id1][value[:type]] = group_id
88
+ @db.execute("insert into names_groups (name_id, group_id, score_max, score_sum, score_num) values (?, ?, ?, ?, 1)", [name_id1, group_id, score, score])
89
+ end
90
+
91
+ def update_score(name_id, value)
92
+ score = get_score(value)
93
+ group_id = get_group(name_id, value[:type])
94
+ @db.execute("update names_groups set score_max = max(score_max, ?), score_sum = score_sum + ?, score_num = score_num + 1 where name_id = ? and group_id = ?", [score, score, name_id, group_id])
95
+ end
96
+
97
+ def get_score(value)
98
+ return 100 if value[:type] == :chresonym
99
+ return 10 if value[:alt_placement] && value[:total_length] > 8
100
+ score = value[:auth_match]
101
+ end
102
+
103
+ def get_output
104
+ data = @db.execute("select x.group_id, g.type, ng.name_id from (select group_id from names_groups group by group_id order by count(*), group_id) x join names_groups ng on x.group_id = ng.group_id join names n on n.id = ng.name_id join groups g on g.id = ng.group_id")
105
+ group = 0
106
+ res = []
107
+ current_group = nil
108
+ data.each do |group_id, type, name_id|
109
+ if group_id != group
110
+ res << current_group if current_group
111
+ group = group_id
112
+ current_group = { :type => type, :name_ids => [name_id] }
113
+ else
114
+ current_group[:name_ids] << name_id
115
+ end
116
+ end
117
+ res
118
+ end
119
+
120
+ end
121
+ end
@@ -0,0 +1,19 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ require 'rspec'
4
+ require 'ostruct'
5
+ require 'synonym-finder'
6
+
7
+ # Requires supporting files with custom matchers and macros, etc,
8
+ # in ./support/ and its subdirectories.
9
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
10
+
11
+ RSpec.configure do |config|
12
+
13
+ end
14
+
15
+ class SynonymFinder::Spec
16
+ Config = OpenStruct.new(
17
+ :input => INPUT
18
+ )
19
+ end
@@ -0,0 +1,32 @@
1
+ class SynonymFinder::Spec
2
+
3
+ INPUT = [
4
+ {id: 001, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Gnamptogenys", name: "Gnamptogenys porcata (Emery, 1896)"},
5
+ {id: 003, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Gnamptogenys", name: "Gnamptogenys triangularis (Mayr, 1887)"},
6
+ {id: 004, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Gnamptogenys", name: "Gnamptogenys triangularis var. alba Brown 1992"},
7
+ {id: 005, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Gnamptogenys", name: "Gnamptogenys triangularis var. borealis Brown 1992"},
8
+ {id: 100, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Nylanderia", name: "Nylanderia porcata"}, #match 001, no authorhsip
9
+ {id: 101, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Nylanderia", name: "Nylanderia porcatum Emery, 1896"}, #match 001 by stem
10
+ {id: 102, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Nylanderia", name: "Nylanderia porcatum"}, #match 001 by stem
11
+ {id: 200, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Brachymyrmex", name: "Brachymyrmex obscurior Forel, 1893"},
12
+ {id: 201, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Brachymyrmex", name: "Brachymyrmex brevicornis Emery, 1906"},
13
+ {id: 202, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Brachymyrmex", name: "Brachymyrmex patagonicus Mayr, 1868"},
14
+ {id: 203, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Brachymyrmex", name: "Brachymyrmex minutus Forel, 1893"},
15
+ {id: 204, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Brachymyrmex", name: "Brachymyrmex minutus Brown, 2010"}, #chresonym match with 203
16
+ {id: 205, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Brachymyrmex", name: "Brachymyrmex micropeda Forel, 1893"},
17
+ {id: 300, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Neobrachymyrmex", name: "Neobrachymyrmex obscurior"}, #match 200 no auth
18
+ {id: 301, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Neobrachymyrmex", name: "Neobrachymyrmex brevicornis"}, #match 201 no auth
19
+ {id: 302, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Neobrachymyrmex", name: "Neobrachymyrmex patagonicus Mayr, 1868"}, #match 203 auth
20
+ {id: 303, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Neobrachymyrmex", name: "Neobrachymyrmex minutus Forel"}, #match 204 no auth (part)
21
+ {id: 304, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Neobrachymyrmex", name: "Neobrachymyrmex micropeda Brown 1995"}, #match 205 no auth
22
+ {id: 400, path: "Animalia|Athropoda|Insecta|Hymenoptera|Formicidae|Crematogaster", name: "Crematogaster obscurata Emery, 1895"},
23
+ {id: 500, path: "Animalia|Athropoda|Insecta|Hymenoptera|Tiphiidae|Diamma", name: "Diamma obscurata (Emery, 1895)"}, #match 2 degrees 400 auth
24
+ {id: 600, path: "Animalia|Athropoda|Insecta|Hymenoptera|Tiphiidae|Crematogaster", name: "Crematogaster obscurata Em. 1895"}, #full name match
25
+ {id: 700, path: "Animalia|Something1|Something2|Something3|Something4|Somename", name: "Somename obscurata Emery"}, #distance over threshold
26
+ {id: 800, path: "Animalia|Athropoda|Insecta|Hymenoptera|Tiphiidae|Neobrachymyrmex", name: "Neobrachymyrmex obscurata (Emery, 1895)"}, #match 2 degrees 400 auth
27
+ {id: 801, path: "Animalia|Athropoda|Insecta|Hymenoptera|Tiphiidae|Neobrachymyrmex", name: "Brachymyrmex obscuratum (Emery, 1895)"}, #misplaced synonym by stem 800
28
+ {id: 802, path: "Animalia|Athropoda|Insecta|Hymenoptera|Tiphiidae|Neobrachymyrmex", name: "Brachymyrmex obscurata (Emery, 1895)"}, #misplaced synonym by epithet 800
29
+ {id: 803, path: "Animalia|Athropoda|Insecta|Hymenoptera|Tiphiidae|Neobrachymyrmex", name: "Neobrachymyrmex obscuratum (Emery, 1895)"}, #lex var by epithet 800
30
+ ]
31
+
32
+ end
@@ -0,0 +1,32 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
+ describe "SynonymFinder" do
4
+ before(:all) do
5
+ SynonymFinder.logger = Logger.new($stdout)
6
+ @sf = SynonymFinder.new(SynonymFinder::Spec::Config.input)
7
+ # @sf = SynonymFinder.new(open(File.dirname(__FILE__) + "/support/union_data.txt").read)
8
+ end
9
+
10
+ it "should able to ingest input in correct format" do
11
+ @sf.input.is_a?(Array).should be_true
12
+ @sf.input[0].keys.should == [:id, :path, :name]
13
+ end
14
+
15
+ it "should be able to find species epithet duplications" do
16
+ output = @sf.find_matches
17
+ m = @sf.matches
18
+ m[[1, 100]].should be_nil # 1 name has no auth
19
+ m[[1, 101]].should == {:total_distance=>2, :type=>:homotypic, :auth_match=>100}
20
+ m[[1, 102]].should be_nil # 1 name has no auth
21
+ m[[203, 204]].should == {:total_distance=>0, :type=>:chresonym, :auth_match=>0}
22
+ m[[202, 302]].should == {:total_distance=>2, :type=>:homotypic, :auth_match=>100}
23
+ m[[400, 500]].should == {:total_distance=>4, :type=>:homotypic, :auth_match=>100}
24
+ m[[400, 600]].should == {:total_distance=>4, :type=>:alt_placement, :auth_match=>100}
25
+ m[[400, 700]].should be_nil
26
+ m[[400, 800]].should == {:total_distance=>4, :type=>:homotypic, :auth_match=>100}
27
+ m[[800, 801]].should == {:total_distance=>0, :type=>:misplaced_synonym, :auth_match=>100}
28
+ m[[800, 802]].should == {:total_distance=>0, :type=>:misplaced_synonym, :auth_match=>100}
29
+ m[[800, 803]].should == {:total_distance=>0, :type=>:lexical_variant, :auth_match=>100}
30
+ output.should == [{:type=>"chresonym", :name_ids=>[203, 204]}, {:type=>"alt_placement", :name_ids=>[400, 600]}, {:type=>"chresonym", :name_ids=>[101, 102]}, {:type=>"homotypic", :name_ids=>[203, 303]}, {:type=>"lexical_variant", :name_ids=>[800, 803]}, {:type=>"lexical_variant", :name_ids=>[801, 802]}, {:type=>"homotypic", :name_ids=>[202, 302]}, {:type=>"homotypic", :name_ids=>[1, 101]}, {:type=>"misplaced_synonym", :name_ids=>[801, 803, 802, 800]}]
31
+ end
32
+ end
@@ -0,0 +1,85 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{synonym-finder}
8
+ s.version = "0.2.2"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = [%q{Dmitry Mozzherin}]
12
+ s.date = %q{2011-08-12}
13
+ s.description = %q{Synonym finder is a biodiversity tool for finding homotypic nomenclatural synonyms in taxonomic hierarchies.}
14
+ s.email = %q{dmozzherin@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ ".rvmrc",
23
+ "Gemfile",
24
+ "Gemfile.lock",
25
+ "LICENSE.txt",
26
+ "README.rdoc",
27
+ "Rakefile",
28
+ "VERSION",
29
+ "features/step_definitions/synonym-finder_steps.rb",
30
+ "features/support/env.rb",
31
+ "features/synonym-finder.feature",
32
+ "lib/synonym-finder.rb",
33
+ "lib/synonym-finder/duplicate_finder.rb",
34
+ "lib/synonym-finder/group_organizer.rb",
35
+ "spec/spec_helper.rb",
36
+ "spec/support/input.rb",
37
+ "spec/synonym-finder_spec.rb",
38
+ "synonym-finder.gemspec"
39
+ ]
40
+ s.homepage = %q{http://github.com/dimus/synonym-finder}
41
+ s.licenses = [%q{MIT}]
42
+ s.require_paths = [%q{lib}]
43
+ s.rubygems_version = %q{1.8.6}
44
+ s.summary = %q{Synonym finder is a biodiversity tool for finding homotypic nomenclatural synonyms in taxonomic hierarchies.}
45
+
46
+ if s.respond_to? :specification_version then
47
+ s.specification_version = 3
48
+
49
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
50
+ s.add_runtime_dependency(%q<sqlite3>, [">= 0"])
51
+ s.add_runtime_dependency(%q<taxamatch_rb>, [">= 0"])
52
+ s.add_runtime_dependency(%q<biodiversity19>, [">= 0"])
53
+ s.add_runtime_dependency(%q<ruby-stemmer>, [">= 0"])
54
+ s.add_development_dependency(%q<ruby-debug19>, [">= 0"])
55
+ s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
56
+ s.add_development_dependency(%q<cucumber>, [">= 0"])
57
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
58
+ s.add_development_dependency(%q<jeweler>, ["~> 1.6.0"])
59
+ s.add_development_dependency(%q<rcov>, [">= 0"])
60
+ else
61
+ s.add_dependency(%q<sqlite3>, [">= 0"])
62
+ s.add_dependency(%q<taxamatch_rb>, [">= 0"])
63
+ s.add_dependency(%q<biodiversity19>, [">= 0"])
64
+ s.add_dependency(%q<ruby-stemmer>, [">= 0"])
65
+ s.add_dependency(%q<ruby-debug19>, [">= 0"])
66
+ s.add_dependency(%q<rspec>, ["~> 2.3.0"])
67
+ s.add_dependency(%q<cucumber>, [">= 0"])
68
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
69
+ s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
70
+ s.add_dependency(%q<rcov>, [">= 0"])
71
+ end
72
+ else
73
+ s.add_dependency(%q<sqlite3>, [">= 0"])
74
+ s.add_dependency(%q<taxamatch_rb>, [">= 0"])
75
+ s.add_dependency(%q<biodiversity19>, [">= 0"])
76
+ s.add_dependency(%q<ruby-stemmer>, [">= 0"])
77
+ s.add_dependency(%q<ruby-debug19>, [">= 0"])
78
+ s.add_dependency(%q<rspec>, ["~> 2.3.0"])
79
+ s.add_dependency(%q<cucumber>, [">= 0"])
80
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
81
+ s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
82
+ s.add_dependency(%q<rcov>, [">= 0"])
83
+ end
84
+ end
85
+
metadata ADDED
@@ -0,0 +1,181 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: synonym-finder
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Dmitry Mozzherin
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-08-12 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: sqlite3
16
+ requirement: &70213009815740 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70213009815740
25
+ - !ruby/object:Gem::Dependency
26
+ name: taxamatch_rb
27
+ requirement: &70213009815220 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *70213009815220
36
+ - !ruby/object:Gem::Dependency
37
+ name: biodiversity19
38
+ requirement: &70213009814740 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *70213009814740
47
+ - !ruby/object:Gem::Dependency
48
+ name: ruby-stemmer
49
+ requirement: &70213009814220 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *70213009814220
58
+ - !ruby/object:Gem::Dependency
59
+ name: ruby-debug19
60
+ requirement: &70213009813740 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :development
67
+ prerelease: false
68
+ version_requirements: *70213009813740
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: &70213009813260 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ~>
75
+ - !ruby/object:Gem::Version
76
+ version: 2.3.0
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: *70213009813260
80
+ - !ruby/object:Gem::Dependency
81
+ name: cucumber
82
+ requirement: &70213009812780 !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ! '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ type: :development
89
+ prerelease: false
90
+ version_requirements: *70213009812780
91
+ - !ruby/object:Gem::Dependency
92
+ name: bundler
93
+ requirement: &70213009812300 !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ~>
97
+ - !ruby/object:Gem::Version
98
+ version: 1.0.0
99
+ type: :development
100
+ prerelease: false
101
+ version_requirements: *70213009812300
102
+ - !ruby/object:Gem::Dependency
103
+ name: jeweler
104
+ requirement: &70213009811820 !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: 1.6.0
110
+ type: :development
111
+ prerelease: false
112
+ version_requirements: *70213009811820
113
+ - !ruby/object:Gem::Dependency
114
+ name: rcov
115
+ requirement: &70213009811340 !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ! '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ type: :development
122
+ prerelease: false
123
+ version_requirements: *70213009811340
124
+ description: Synonym finder is a biodiversity tool for finding homotypic nomenclatural
125
+ synonyms in taxonomic hierarchies.
126
+ email: dmozzherin@gmail.com
127
+ executables: []
128
+ extensions: []
129
+ extra_rdoc_files:
130
+ - LICENSE.txt
131
+ - README.rdoc
132
+ files:
133
+ - .document
134
+ - .rspec
135
+ - .rvmrc
136
+ - Gemfile
137
+ - Gemfile.lock
138
+ - LICENSE.txt
139
+ - README.rdoc
140
+ - Rakefile
141
+ - VERSION
142
+ - features/step_definitions/synonym-finder_steps.rb
143
+ - features/support/env.rb
144
+ - features/synonym-finder.feature
145
+ - lib/synonym-finder.rb
146
+ - lib/synonym-finder/duplicate_finder.rb
147
+ - lib/synonym-finder/group_organizer.rb
148
+ - spec/spec_helper.rb
149
+ - spec/support/input.rb
150
+ - spec/synonym-finder_spec.rb
151
+ - synonym-finder.gemspec
152
+ homepage: http://github.com/dimus/synonym-finder
153
+ licenses:
154
+ - MIT
155
+ post_install_message:
156
+ rdoc_options: []
157
+ require_paths:
158
+ - lib
159
+ required_ruby_version: !ruby/object:Gem::Requirement
160
+ none: false
161
+ requirements:
162
+ - - ! '>='
163
+ - !ruby/object:Gem::Version
164
+ version: '0'
165
+ segments:
166
+ - 0
167
+ hash: 2631280896359598637
168
+ required_rubygems_version: !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ! '>='
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
174
+ requirements: []
175
+ rubyforge_project:
176
+ rubygems_version: 1.8.6
177
+ signing_key:
178
+ specification_version: 3
179
+ summary: Synonym finder is a biodiversity tool for finding homotypic nomenclatural
180
+ synonyms in taxonomic hierarchies.
181
+ test_files: []