taxamatch_rb 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +3 -0
- data/.rubocop.yml +9 -0
- data/.travis.yml +10 -0
- data/CHANGELOG +3 -0
- data/CODE_OF_CONDUCT.md +31 -0
- data/Gemfile +2 -17
- data/LICENSE.txt +21 -0
- data/README.md +53 -26
- data/Rakefile +11 -40
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/taxamatch_rb/base.rb +154 -0
- data/lib/taxamatch_rb/version.rb +7 -0
- data/lib/taxamatch_rb.rb +12 -172
- data/taxamatch_rb.gemspec +30 -57
- metadata +124 -48
- data/Gemfile.lock +0 -86
- data/LICENSE +0 -20
- data/Makefile +0 -157
- data/VERSION +0 -1
- data/spec/spec.opts +0 -1
- data/spec/spec_helper.rb +0 -22
- data/spec/taxamatch_rb_spec.rb +0 -406
- data/spec/taxamatch_test.txt +0 -46
data/Makefile
DELETED
@@ -1,157 +0,0 @@
|
|
1
|
-
|
2
|
-
SHELL = /bin/sh
|
3
|
-
|
4
|
-
#### Start of system configuration section. ####
|
5
|
-
|
6
|
-
srcdir = /Users/dimus/.rvm/gems/ruby-1.9.2-p0/bin
|
7
|
-
topdir = /Users/dimus/.rvm/rubies/ruby-1.9.2-p0/include/ruby-1.9.1
|
8
|
-
hdrdir = /Users/dimus/.rvm/rubies/ruby-1.9.2-p0/include/ruby-1.9.1
|
9
|
-
arch_hdrdir = /Users/dimus/.rvm/rubies/ruby-1.9.2-p0/include/ruby-1.9.1/$(arch)
|
10
|
-
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
11
|
-
prefix = $(DESTDIR)/Users/dimus/.rvm/rubies/ruby-1.9.2-p0
|
12
|
-
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
13
|
-
exec_prefix = $(prefix)
|
14
|
-
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
15
|
-
sitehdrdir = $(rubyhdrdir)/site_ruby
|
16
|
-
rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
|
17
|
-
vendordir = $(rubylibprefix)/vendor_ruby
|
18
|
-
sitedir = $(rubylibprefix)/site_ruby
|
19
|
-
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
20
|
-
mandir = $(datarootdir)/man
|
21
|
-
localedir = $(datarootdir)/locale
|
22
|
-
libdir = $(exec_prefix)/lib
|
23
|
-
psdir = $(docdir)
|
24
|
-
pdfdir = $(docdir)
|
25
|
-
dvidir = $(docdir)
|
26
|
-
htmldir = $(docdir)
|
27
|
-
infodir = $(datarootdir)/info
|
28
|
-
docdir = $(datarootdir)/doc/$(PACKAGE)
|
29
|
-
oldincludedir = $(DESTDIR)/usr/include
|
30
|
-
includedir = $(prefix)/include
|
31
|
-
localstatedir = $(prefix)/var
|
32
|
-
sharedstatedir = $(prefix)/com
|
33
|
-
sysconfdir = $(prefix)/etc
|
34
|
-
datadir = $(datarootdir)
|
35
|
-
datarootdir = $(prefix)/share
|
36
|
-
libexecdir = $(exec_prefix)/libexec
|
37
|
-
sbindir = $(exec_prefix)/sbin
|
38
|
-
bindir = $(exec_prefix)/bin
|
39
|
-
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
40
|
-
archdir = $(rubylibdir)/$(arch)
|
41
|
-
sitelibdir = $(sitedir)/$(ruby_version)
|
42
|
-
sitearchdir = $(sitelibdir)/$(sitearch)
|
43
|
-
vendorlibdir = $(vendordir)/$(ruby_version)
|
44
|
-
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
45
|
-
|
46
|
-
CC = gcc
|
47
|
-
CXX = g++
|
48
|
-
LIBRUBY = $(LIBRUBY_SO)
|
49
|
-
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
50
|
-
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
|
51
|
-
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
|
52
|
-
OUTFLAG = -o
|
53
|
-
COUTFLAG = -o
|
54
|
-
|
55
|
-
RUBY_EXTCONF_H =
|
56
|
-
cflags = $(optflags) $(debugflags) $(warnflags)
|
57
|
-
optflags = -O3
|
58
|
-
debugflags = -ggdb
|
59
|
-
warnflags = -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long
|
60
|
-
CFLAGS = -fno-common $(cflags) -fno-common -pipe
|
61
|
-
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
62
|
-
DEFS =
|
63
|
-
CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags)
|
64
|
-
CXXFLAGS = $(CFLAGS) $(cxxflags)
|
65
|
-
ldflags = -L.
|
66
|
-
dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
|
67
|
-
ARCH_FLAG =
|
68
|
-
DLDFLAGS = $(ldflags) $(dldflags)
|
69
|
-
LDSHARED = $(CC) -dynamic -bundle
|
70
|
-
LDSHAREDXX = $(CXX) -dynamic -bundle
|
71
|
-
AR = ar
|
72
|
-
EXEEXT =
|
73
|
-
|
74
|
-
RUBY_BASE_NAME = ruby
|
75
|
-
RUBY_INSTALL_NAME = ruby
|
76
|
-
RUBY_SO_NAME = ruby.1.9.1
|
77
|
-
arch = x86_64-darwin10.3.1
|
78
|
-
sitearch = $(arch)
|
79
|
-
ruby_version = 1.9.1
|
80
|
-
ruby = /Users/dimus/.rvm/rubies/ruby-1.9.2-p0/bin/ruby
|
81
|
-
RUBY = $(ruby)
|
82
|
-
RM = rm -f
|
83
|
-
RM_RF = $(RUBY) -run -e rm -- -rf
|
84
|
-
RMDIRS = $(RUBY) -run -e rmdir -- -p
|
85
|
-
MAKEDIRS = mkdir -p
|
86
|
-
INSTALL = /usr/bin/install -c
|
87
|
-
INSTALL_PROG = $(INSTALL) -m 0755
|
88
|
-
INSTALL_DATA = $(INSTALL) -m 644
|
89
|
-
COPY = cp
|
90
|
-
|
91
|
-
#### End of system configuration section. ####
|
92
|
-
|
93
|
-
preload =
|
94
|
-
|
95
|
-
libpath = . $(libdir)
|
96
|
-
LIBPATH = -L. -L$(libdir)
|
97
|
-
DEFFILE =
|
98
|
-
|
99
|
-
CLEANFILES = mkmf.log
|
100
|
-
DISTCLEANFILES =
|
101
|
-
DISTCLEANDIRS =
|
102
|
-
|
103
|
-
extout =
|
104
|
-
extout_prefix =
|
105
|
-
target_prefix =
|
106
|
-
LOCAL_LIBS =
|
107
|
-
LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lobjc
|
108
|
-
SRCS =
|
109
|
-
OBJS =
|
110
|
-
TARGET =
|
111
|
-
DLLIB =
|
112
|
-
EXTSTATIC =
|
113
|
-
STATIC_LIB =
|
114
|
-
|
115
|
-
BINDIR = $(bindir)
|
116
|
-
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
117
|
-
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
118
|
-
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
119
|
-
HDRDIR = $(rubyhdrdir)/ruby$(target_prefix)
|
120
|
-
ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
|
121
|
-
|
122
|
-
TARGET_SO = $(DLLIB)
|
123
|
-
CLEANLIBS = $(TARGET).bundle
|
124
|
-
CLEANOBJS = *.o *.bak
|
125
|
-
|
126
|
-
all: Makefile
|
127
|
-
static: $(STATIC_LIB)
|
128
|
-
.PHONY: all install static install-so install-rb
|
129
|
-
.PHONY: clean clean-so clean-rb
|
130
|
-
|
131
|
-
clean-rb-default::
|
132
|
-
clean-rb::
|
133
|
-
clean-so::
|
134
|
-
clean: clean-so clean-rb-default clean-rb
|
135
|
-
@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
|
136
|
-
|
137
|
-
distclean-rb-default::
|
138
|
-
distclean-rb::
|
139
|
-
distclean-so::
|
140
|
-
distclean: clean distclean-so distclean-rb-default distclean-rb
|
141
|
-
@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
142
|
-
@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
143
|
-
@-$(RMDIRS) $(DISTCLEANDIRS)
|
144
|
-
|
145
|
-
realclean: distclean
|
146
|
-
install: install-so install-rb
|
147
|
-
|
148
|
-
install-so: Makefile
|
149
|
-
install-rb: pre-install-rb install-rb-default
|
150
|
-
install-rb-default: pre-install-rb-default
|
151
|
-
pre-install-rb: Makefile
|
152
|
-
pre-install-rb-default: Makefile
|
153
|
-
|
154
|
-
site-install: site-install-so site-install-rb
|
155
|
-
site-install-so: install-so
|
156
|
-
site-install-rb: install-rb
|
157
|
-
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
1.0.1
|
data/spec/spec.opts
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
--colour
|
data/spec/spec_helper.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
require 'rspec'
|
2
|
-
|
3
|
-
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
4
|
-
require 'taxamatch_rb'
|
5
|
-
|
6
|
-
def read_test_file(file, fields_num)
|
7
|
-
f = open(file)
|
8
|
-
f.each do |line|
|
9
|
-
fields = line.split("|")
|
10
|
-
if line.match(/^\s*#/) == nil && fields.size == fields_num
|
11
|
-
fields[-1] = fields[-1].split('#')[0].strip
|
12
|
-
yield(fields)
|
13
|
-
else
|
14
|
-
yield(nil)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
def make_taxamatch_hash(string)
|
20
|
-
normalized = Taxamatch::Normalizer.normalize(string)
|
21
|
-
{:string => string, :normalized => normalized, :phonetized => Taxamatch::Phonetizer.near_match(normalized)}
|
22
|
-
end
|
data/spec/taxamatch_rb_spec.rb
DELETED
@@ -1,406 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
require 'spec_helper'
|
3
|
-
|
4
|
-
describe 'Atomizer' do
|
5
|
-
before(:all) do
|
6
|
-
@parser = Taxamatch::Atomizer.new
|
7
|
-
end
|
8
|
-
|
9
|
-
it 'should parse uninomials' do
|
10
|
-
@parser.parse('Betula').should == { :all_authors => [], :all_years => [],
|
11
|
-
:canonical_form => "Betula", :uninomial => { :string => "Betula",
|
12
|
-
:normalized => 'BETULA', :phonetized => "BITILA", :authors => [],
|
13
|
-
:years => [], :normalized_authors => [] } }
|
14
|
-
@parser.parse('Ærenea Lacordaire, 1872').should == {
|
15
|
-
:all_authors => ["LACORDAIRE"], :all_years => [1872],
|
16
|
-
:canonical_form => "Aerenea", :uninomial => { :string => "Aerenea",
|
17
|
-
:normalized => "AERENEA", :phonetized => "ERINIA",
|
18
|
-
:authors => ["Lacordaire"], :years => [1872],
|
19
|
-
:normalized_authors => ["LACORDAIRE"] } }
|
20
|
-
end
|
21
|
-
|
22
|
-
it 'should parse binomials' do
|
23
|
-
@parser.parse('Leœptura laetifica Dow, 1913').should == {
|
24
|
-
:all_authors => ["DOW"], :all_years => [1913],
|
25
|
-
:canonical_form => "Leoeptura laetifica", :genus => {
|
26
|
-
:string => "Leoeptura", :normalized => "LEOEPTURA",
|
27
|
-
:phonetized => "LIPTIRA", :authors => [], :years => [],
|
28
|
-
:normalized_authors => []}, :species => {
|
29
|
-
:string => "laetifica", :normalized => "LAETIFICA",
|
30
|
-
:phonetized => "LITIFICA", :authors => ["Dow"],
|
31
|
-
:years => [1913], :normalized_authors => ["DOW"] } }
|
32
|
-
end
|
33
|
-
|
34
|
-
it 'should parse trinomials' do
|
35
|
-
@parser.parse('Hydnellum scrobiculatum zonatum ' +
|
36
|
-
'(Banker) D. Hall et D.E. Stuntz 1972').should == {
|
37
|
-
:all_authors => ["BANKER", "D HALL", "D E STUNTZ"], :all_years => [1972],
|
38
|
-
:canonical_form => "Hydnellum scrobiculatum zonatum", :genus=>{
|
39
|
-
:string => "Hydnellum", :normalized => "HYDNELLUM",
|
40
|
-
:phonetized => "HIDNILIM", :authors => [], :years => [],
|
41
|
-
:normalized_authors => [] }, :species => { :string => "scrobiculatum",
|
42
|
-
:normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
|
43
|
-
:authors => [], :years => [], :normalized_authors => [] },
|
44
|
-
:infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
|
45
|
-
:phonetized => "ZANATA", :authors => ["Banker", "D. Hall", "D.E. Stuntz"],
|
46
|
-
:years => [1972], :normalized_authors => ["BANKER", "D HALL",
|
47
|
-
"D E STUNTZ"] }] }
|
48
|
-
end
|
49
|
-
|
50
|
-
it 'should normalize years to integers' do
|
51
|
-
future_year = Time.now.year + 10
|
52
|
-
@parser.parse("Hydnellum scrobiculatum Kern #{future_year} " +
|
53
|
-
"zonatum (Banker) D. Hall et D.E. Stuntz 1972?").should == {
|
54
|
-
:all_authors => ["KERN", "BANKER", "D HALL", "D E STUNTZ"],
|
55
|
-
:all_years => [1972],
|
56
|
-
:canonical_form => "Hydnellum scrobiculatum zonatum", :genus => {
|
57
|
-
:string => "Hydnellum", :normalized => "HYDNELLUM",
|
58
|
-
:phonetized => "HIDNILIM", :authors => [], :years => [],
|
59
|
-
:normalized_authors => [] }, :species => { :string => "scrobiculatum",
|
60
|
-
:normalized => "SCROBICULATUM", :phonetized => "SCRABICILATA",
|
61
|
-
:authors => ["Kern"], :years => [], :normalized_authors => ["KERN"] },
|
62
|
-
:infraspecies => [{ :string => "zonatum", :normalized => "ZONATUM",
|
63
|
-
:phonetized => "ZANATA", :authors =>
|
64
|
-
["Banker", "D. Hall", "D.E. Stuntz"], :years => [1972],
|
65
|
-
:normalized_authors => ["BANKER", "D HALL", "D E STUNTZ"] }] }
|
66
|
-
end
|
67
|
-
|
68
|
-
it 'should normalize names with abbreviated genus after cf.' do
|
69
|
-
@parser.parse('Unio cf. U. alba').should == { :all_authors => [],
|
70
|
-
:all_years => [], :canonical_form => "Unio",
|
71
|
-
:genus => { :string => "Unio", :normalized => "UNIO",
|
72
|
-
:phonetized => "UNIA", :authors => [], :years => [],
|
73
|
-
:normalized_authors => [] } }
|
74
|
-
end
|
75
|
-
|
76
|
-
it 'should parse names which broke it before' do
|
77
|
-
['Parus caeruleus species complex',
|
78
|
-
'Euxoa nr. idahoensis sp. 1clay',
|
79
|
-
'Cetraria islandica ? islandica',
|
80
|
-
'Buteo borealis ? ventralis'].each do |n|
|
81
|
-
res = @parser.parse(n)
|
82
|
-
res.class.should == Hash
|
83
|
-
res.empty?.should be_false
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
describe 'Taxamatch::Normalizer' do
|
90
|
-
it 'should normalize strings' do
|
91
|
-
Taxamatch::Normalizer.normalize('abcd').should == 'ABCD'
|
92
|
-
Taxamatch::Normalizer.normalize('Leœptura').should == 'LEOEPTURA'
|
93
|
-
Taxamatch::Normalizer.normalize('Ærenea').should == 'AERENEA'
|
94
|
-
Taxamatch::Normalizer.normalize('Fallén').should == 'FALLEN'
|
95
|
-
Taxamatch::Normalizer.normalize('Fallé€n').should == 'FALLE?N'
|
96
|
-
Taxamatch::Normalizer.normalize('Fallén привет').should == 'FALLEN ??????'
|
97
|
-
Taxamatch::Normalizer.normalize('Choriozopella trägårdhi').should ==
|
98
|
-
'CHORIOZOPELLA TRAGARDHI'
|
99
|
-
Taxamatch::Normalizer.normalize('×Zygomena').should == 'xZYGOMENA'
|
100
|
-
end
|
101
|
-
|
102
|
-
it 'should normalize words' do
|
103
|
-
Taxamatch::Normalizer.normalize_word('L-3eœ|pt[ura$').should ==
|
104
|
-
'L-3EOEPTURA'
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
describe 'Taxamatch::Base' do
|
109
|
-
before(:all) do
|
110
|
-
@tm = Taxamatch::Base.new
|
111
|
-
end
|
112
|
-
|
113
|
-
it 'should get txt tests' do
|
114
|
-
test_file = File.expand_path(File.dirname(__FILE__)) + '/taxamatch_test.txt'
|
115
|
-
read_test_file(test_file, 4) do |y|
|
116
|
-
if y
|
117
|
-
y[2] = y[2] == 'true' ? true : false
|
118
|
-
res = @tm.taxamatch(y[0], y[1], false)
|
119
|
-
# puts "%s, %s, %s, %s" % [y[0], y[1], y[2], y[3]]
|
120
|
-
res['match'].should == y[2]
|
121
|
-
res['edit_distance'].should == y[3].to_i
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
|
126
|
-
it 'should work with names that cannot be parsed' do
|
127
|
-
res = @tm.taxamatch('Quadraspidiotus ostreaeformis MacGillivray, 1921',
|
128
|
-
'Quadraspidiotus ostreaeformis Curtis)')
|
129
|
-
res = false
|
130
|
-
end
|
131
|
-
|
132
|
-
it 'should compare genera' do
|
133
|
-
# edit distance 1 always match
|
134
|
-
g1 = make_taxamatch_hash 'Plantago'
|
135
|
-
g2 = make_taxamatch_hash 'Plantagon'
|
136
|
-
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
137
|
-
'edit_distance' => 1, 'match' => true }
|
138
|
-
# edit_distance above threshold does not match
|
139
|
-
g1 = make_taxamatch_hash 'Plantago'
|
140
|
-
g2 = make_taxamatch_hash 'This shouldnt match'
|
141
|
-
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
142
|
-
'match' => false, 'edit_distance' => 4 }
|
143
|
-
# phonetic_match matches
|
144
|
-
g1 = make_taxamatch_hash 'Plantagi'
|
145
|
-
g2 = make_taxamatch_hash 'Plantagy'
|
146
|
-
@tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
|
147
|
-
'edit_distance' => 1, 'match' => true }
|
148
|
-
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
|
149
|
-
'phonetic_match' => false, 'edit_distance' => 1, 'match' => true }
|
150
|
-
# distance 1 in first letter also matches
|
151
|
-
g1 = make_taxamatch_hash 'Xantheri'
|
152
|
-
g2 = make_taxamatch_hash 'Pantheri'
|
153
|
-
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
154
|
-
'edit_distance' => 1, 'match' => true }
|
155
|
-
# phonetic match trumps everything
|
156
|
-
g1 = make_taxamatch_hash 'Xaaaaantheriiiiiiiiiiiiiii'
|
157
|
-
g2 = make_taxamatch_hash 'Zaaaaaaaaaaaantheryyyyyyyy'
|
158
|
-
@tm.match_genera(g1, g2).should == { 'phonetic_match' => true,
|
159
|
-
'edit_distance' => 4, 'match' => true }
|
160
|
-
@tm.match_genera(g1, g2, :with_phonetic_match => false).should == {
|
161
|
-
'phonetic_match' => false, 'edit_distance' => 4, 'match' => false }
|
162
|
-
# same first letter and distance 2 should match
|
163
|
-
g1 = make_taxamatch_hash 'Xaaaantherii'
|
164
|
-
g2 = make_taxamatch_hash 'Xaaaantherrr'
|
165
|
-
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
166
|
-
'match' => true, 'edit_distance' => 2 }
|
167
|
-
# First letter is the same and distance is 3 should match, no phonetic match
|
168
|
-
g1 = make_taxamatch_hash 'Xaaaaaaaaaaantheriii'
|
169
|
-
g2 = make_taxamatch_hash 'Xaaaaaaaaaaantherrrr'
|
170
|
-
@tm.match_genera(g1, g2).should ==
|
171
|
-
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 3 }
|
172
|
-
# Should not match if one of words is shorter than 2x edit
|
173
|
-
# distance and distance is 2 or 3
|
174
|
-
g1 = make_taxamatch_hash 'Xant'
|
175
|
-
g2 = make_taxamatch_hash 'Xanthe'
|
176
|
-
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
177
|
-
'match' => false, 'edit_distance' => 2 }
|
178
|
-
# Should not match if edit distance > 3 and no phonetic match
|
179
|
-
g1 = make_taxamatch_hash 'Xantheriiii'
|
180
|
-
g2 = make_taxamatch_hash 'Xantherrrrr'
|
181
|
-
@tm.match_genera(g1, g2).should == { 'phonetic_match' => false,
|
182
|
-
'match' => false, 'edit_distance' => 4 }
|
183
|
-
end
|
184
|
-
|
185
|
-
it 'should compare species' do
|
186
|
-
# Exact match
|
187
|
-
s1 = make_taxamatch_hash 'major'
|
188
|
-
s2 = make_taxamatch_hash 'major'
|
189
|
-
@tm.match_species(s1, s2).should == { 'phonetic_match' => true,
|
190
|
-
'match' => true, 'edit_distance' => 0 }
|
191
|
-
@tm.match_species(s1, s2, :with_phonetic_match => false).should == {
|
192
|
-
'phonetic_match' => false, 'match' => true, 'edit_distance' => 0 }
|
193
|
-
# Phonetic match always works
|
194
|
-
s1 = make_taxamatch_hash 'xanteriiieeeeeeeeeeeee'
|
195
|
-
s2 = make_taxamatch_hash 'zantereeeeeeeeeeeeeeee'
|
196
|
-
@tm.match_species(s1, s2).should == { 'phonetic_match' => true,
|
197
|
-
'match' => true, 'edit_distance' => 4 }
|
198
|
-
@tm.match_species(s1, s2, :with_phonetic_match => false).should ==
|
199
|
-
{ 'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
|
200
|
-
# Phonetic match works with different endings
|
201
|
-
s1 = make_taxamatch_hash 'majorum'
|
202
|
-
s2 = make_taxamatch_hash 'majoris'
|
203
|
-
@tm.match_species(s1, s2).should == {
|
204
|
-
'phonetic_match' => true, 'match' => true, 'edit_distance' => 2 }
|
205
|
-
@tm.match_species(s1, s2, :with_phonetic_match => false).should ==
|
206
|
-
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 2 }
|
207
|
-
# Distance 4 matches if first 3 chars are the same
|
208
|
-
s1 = make_taxamatch_hash 'majjjjorrrrr'
|
209
|
-
s2 = make_taxamatch_hash 'majjjjoraaaa'
|
210
|
-
@tm.match_species(s1, s2).should ==
|
211
|
-
{ 'phonetic_match' => false, 'match' => true, 'edit_distance' => 4 }
|
212
|
-
# Should not match if Distance 4 matches and first 3 chars are not the same
|
213
|
-
s1 = make_taxamatch_hash 'majorrrrr'
|
214
|
-
s2 = make_taxamatch_hash 'marorraaa'
|
215
|
-
@tm.match_species(s1, s2).should == {
|
216
|
-
'phonetic_match' => false, 'match' => false, 'edit_distance' => 4 }
|
217
|
-
# Distance 2 or 3 matches if first 1 char is the same
|
218
|
-
s1 = make_taxamatch_hash 'moooorrrr'
|
219
|
-
s2 = make_taxamatch_hash 'mooooraaa'
|
220
|
-
@tm.match_species(s1, s2).should == { 'phonetic_match' => false,
|
221
|
-
'match' => true, 'edit_distance' => 3 }
|
222
|
-
# Should not match if Distance 2 or 3 and first 1 char is not the same
|
223
|
-
s1 = make_taxamatch_hash 'morrrr'
|
224
|
-
s2 = make_taxamatch_hash 'torraa'
|
225
|
-
@tm.match_species(s1, s2).should == {
|
226
|
-
'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
|
227
|
-
# Distance 1 will match anywhere
|
228
|
-
s1 = make_taxamatch_hash 'major'
|
229
|
-
s2 = make_taxamatch_hash 'rajor'
|
230
|
-
@tm.match_species(s1, s2).should == {
|
231
|
-
'phonetic_match' => false, 'match' => true, 'edit_distance' => 1 }
|
232
|
-
# Will not match if distance 3 and length is less than twice
|
233
|
-
# of the edit distance
|
234
|
-
s1 = make_taxamatch_hash 'marrr'
|
235
|
-
s2 = make_taxamatch_hash 'maaaa'
|
236
|
-
@tm.match_species(s1, s2).should == {
|
237
|
-
'phonetic_match' => false, 'match' => false, 'edit_distance' => 3 }
|
238
|
-
end
|
239
|
-
|
240
|
-
it 'should match matches' do
|
241
|
-
# No trouble case
|
242
|
-
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
243
|
-
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
244
|
-
@tm.match_matches(gmatch, smatch).should ==
|
245
|
-
{ 'phonetic_match' => true, 'edit_distance' => 2, 'match' => true }
|
246
|
-
# Will not match if either genus or sp. epithet don't match
|
247
|
-
gmatch = { 'match' => false,
|
248
|
-
'phonetic_match' => false, 'edit_distance' => 1 }
|
249
|
-
smatch = { 'match' => true,
|
250
|
-
'phonetic_match' => true, 'edit_distance' => 1 }
|
251
|
-
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
252
|
-
'edit_distance' => 2, 'match' => false }
|
253
|
-
gmatch = { 'match' => true, 'phonetic_match' => true,
|
254
|
-
'edit_distance' => 1 }
|
255
|
-
smatch = { 'match' => false, 'phonetic_match' => false,
|
256
|
-
'edit_distance' => 1 }
|
257
|
-
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
258
|
-
'edit_distance' => 2, 'match' => false }
|
259
|
-
# Should not match if binomial edit distance > 4
|
260
|
-
# NOTE: EVEN with full phonetic match
|
261
|
-
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 3 }
|
262
|
-
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
263
|
-
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => true,
|
264
|
-
'edit_distance' => 5, 'match' => false }
|
265
|
-
# Should not have phonetic match if one of the components
|
266
|
-
# does not match phonetically
|
267
|
-
gmatch = { 'match' => true,
|
268
|
-
'phonetic_match' => false, 'edit_distance' => 1 }
|
269
|
-
smatch = { 'match' => true,
|
270
|
-
'phonetic_match' => true, 'edit_distance' => 1 }
|
271
|
-
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
272
|
-
'edit_distance' => 2, 'match' => true }
|
273
|
-
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 1 }
|
274
|
-
smatch = { 'match' => true,
|
275
|
-
'phonetic_match' => false, 'edit_distance' => 1 }
|
276
|
-
@tm.match_matches(gmatch, smatch).should == { 'phonetic_match' => false,
|
277
|
-
'edit_distance' => 2, 'match' => true }
|
278
|
-
# edit distance should be equal to the sum of the edit distances
|
279
|
-
gmatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
280
|
-
smatch = { 'match' => true, 'phonetic_match' => true, 'edit_distance' => 2 }
|
281
|
-
@tm.match_matches(gmatch, smatch).should == {
|
282
|
-
'phonetic_match'=>true, 'edit_distance'=>4, 'match'=>true }
|
283
|
-
end
|
284
|
-
|
285
|
-
it 'should return only boolean values' do
|
286
|
-
@tm.taxamatch("AJLJljljlj", "sls").should_not be_nil
|
287
|
-
@tm.taxamatch('Olsl','a')
|
288
|
-
end
|
289
|
-
|
290
|
-
it "should not match authors from different parts of name" do
|
291
|
-
parser = Taxamatch::Atomizer.new
|
292
|
-
t = Taxamatch::Base.new
|
293
|
-
n1 = parser.parse "Betula Linnaeus"
|
294
|
-
n2 = parser.parse "Betula alba Linnaeus"
|
295
|
-
n3 = parser.parse "Betula alba alba Linnaeus"
|
296
|
-
n4 = parser.parse "Betula alba L."
|
297
|
-
n5 = parser.parse "Betula alba"
|
298
|
-
n6 = parser.parse "Betula olba"
|
299
|
-
n7 = parser.parse "Betula alba Linnaeus alba"
|
300
|
-
n8 = parser.parse "Betula alba Linnaeus alba Smith"
|
301
|
-
n9 = parser.parse "Betula alba Smith alba L."
|
302
|
-
n10 = parser.parse "Betula Linn."
|
303
|
-
# if one authorship is empty, return 0
|
304
|
-
t.match_authors(n1, n5).should == 0
|
305
|
-
t.match_authors(n5, n1).should == 0
|
306
|
-
t.match_authors(n5, n6).should == 0
|
307
|
-
# if authorship matches on different levels ignore
|
308
|
-
t.match_authors(n7, n3).should == 0
|
309
|
-
t.match_authors(n8, n3).should == -1
|
310
|
-
t.match_authors(n2, n8).should == 0
|
311
|
-
t.match_authors(n1, n2).should == 0
|
312
|
-
# match on infraspecies level
|
313
|
-
t.match_authors(n9, n3).should == 1
|
314
|
-
# match on species level
|
315
|
-
t.match_authors(n2, n4).should == 1
|
316
|
-
# match on uninomial level
|
317
|
-
t.match_authors(n1, n10).should == 1
|
318
|
-
end
|
319
|
-
|
320
|
-
|
321
|
-
describe 'Taxamatch::Authmatch' do
|
322
|
-
before(:all) do
|
323
|
-
@am = Taxamatch::Authmatch
|
324
|
-
end
|
325
|
-
|
326
|
-
it 'should calculate score' do
|
327
|
-
res = @am.authmatch(['Linnaeus', 'Muller'], ['L'], [], [1788])
|
328
|
-
res.should == 90
|
329
|
-
res = @am.authmatch(['Linnaeus'],['Kurtz'], [], [])
|
330
|
-
res.should == 0
|
331
|
-
# found all authors, same year
|
332
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
333
|
-
['Muller', 'Linnaeus'], [1766], [1766])
|
334
|
-
res.should == 100
|
335
|
-
# all authors, 1 year diff
|
336
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
337
|
-
['Muller', 'Linnaeus'], [1767], [1766])
|
338
|
-
res.should == 54
|
339
|
-
# year is not counted in
|
340
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
341
|
-
['Muller', 'Linnaeus'], [1767], [])
|
342
|
-
res.should == 94
|
343
|
-
# found all authors on one side, same year
|
344
|
-
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
345
|
-
['Muller', 'Linnaeus'], [1767], [1767])
|
346
|
-
res.should == 91
|
347
|
-
# found all authors on one side, 1 year diff
|
348
|
-
res = @am.authmatch(['Linnaeus', 'Muller', 'Kurtz'],
|
349
|
-
['Muller', 'Linnaeus'], [1766], [1767])
|
350
|
-
res.should == 51
|
351
|
-
# found all authors on one side, year does not count
|
352
|
-
res = @am.authmatch(['Linnaeus', 'Muller'],
|
353
|
-
['Muller', 'Linnaeus', 'Kurtz'], [1766], [])
|
354
|
-
res.should == 90
|
355
|
-
# found some authors
|
356
|
-
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
357
|
-
['Muller', 'Kurtz', 'Stepanov'], [1766], [])
|
358
|
-
res.should == 67
|
359
|
-
# if year does not match or not present no match for previous case
|
360
|
-
res = @am.authmatch(['Stepanov', 'Linnaeus', 'Muller'],
|
361
|
-
['Muller', 'Kurtz', 'Stepanov'], [1766], [1765])
|
362
|
-
res.should == 0
|
363
|
-
end
|
364
|
-
|
365
|
-
it 'should compare years' do
|
366
|
-
@am.compare_years([1882],[1880]).should == 2
|
367
|
-
@am.compare_years([1882],[]).should == nil
|
368
|
-
@am.compare_years([],[]).should == 0
|
369
|
-
@am.compare_years([1788,1798], [1788,1798]).should be_nil
|
370
|
-
end
|
371
|
-
|
372
|
-
it 'should remove duplicate authors' do
|
373
|
-
# Lin submatches Linnaeus and its size 3 is big enough to remove
|
374
|
-
# Linnaeus Muller is identical
|
375
|
-
res = @am.remove_duplicate_authors(['Lin', 'Muller'],
|
376
|
-
['Linnaeus', 'Muller'])
|
377
|
-
res.should == [[], []]
|
378
|
-
# same in different order
|
379
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
380
|
-
['Linn', 'Muller'])
|
381
|
-
res.should == [[], []]
|
382
|
-
# auth Li submatches Linnaeus, but Li size less than 3
|
383
|
-
# required to remove Linnaeus
|
384
|
-
res = @am.remove_duplicate_authors(['Dem', 'Li'],
|
385
|
-
['Linnaeus', 'Stepanov'])
|
386
|
-
res.should == [["Dem"], ["Linnaeus", "Stepanov"]]
|
387
|
-
# fuzzy match
|
388
|
-
res = @am.remove_duplicate_authors(['Dem', 'Lennaeus'],
|
389
|
-
['Linnaeus', 'Stepanov'])
|
390
|
-
res.should == [["Dem"], ["Stepanov"]]
|
391
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
392
|
-
['L', 'Kenn'])
|
393
|
-
res.should == [['Linnaeus', 'Muller'], ['Kenn']]
|
394
|
-
res = @am.remove_duplicate_authors(['Linnaeus', 'Muller'],
|
395
|
-
['Muller', 'Linnaeus', 'Kurtz'])
|
396
|
-
res.should == [[],['Kurtz']]
|
397
|
-
end
|
398
|
-
|
399
|
-
it 'should fuzzy match authors' do
|
400
|
-
res = @am.fuzzy_match_authors('L', 'Muller')
|
401
|
-
res.should be_false
|
402
|
-
end
|
403
|
-
|
404
|
-
end
|
405
|
-
|
406
|
-
end
|
data/spec/taxamatch_test.txt
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
###
|
2
|
-
#
|
3
|
-
# Tests for string comparison by taxamatch algorithm
|
4
|
-
# name1|name2|match|edit_distance
|
5
|
-
#
|
6
|
-
##
|
7
|
-
# Comparing uninomials
|
8
|
-
Pomatomus|Pomatomas|true|1
|
9
|
-
Pomatomus L.|Pomatomas Linn.|true|1
|
10
|
-
Pomatomus Ber|Pomatomas Linn|false|1
|
11
|
-
Pomatomus L. 1758|Pomatomus Linn. 1800|false|0
|
12
|
-
Patella|Abbella|false|3
|
13
|
-
|
14
|
-
## additional authorship should match
|
15
|
-
Puma concolor|Puma concolor L.|true|0
|
16
|
-
#
|
17
|
-
## one-letter misspelling in species epithet should match
|
18
|
-
Puma concolor|Puma cancolor|true|1
|
19
|
-
#
|
20
|
-
Pomatomus saltatrix|Pomatomus saltratix|true|2
|
21
|
-
Pomatomus saltator|Pomatomus saltatrix|false|3 #!!!
|
22
|
-
#
|
23
|
-
Loligo pealeii|Loligo plei|false|3
|
24
|
-
#
|
25
|
-
## different authors should not match
|
26
|
-
Puma concolor Linnaeus|Puma concolor Kurtz|false|0
|
27
|
-
#
|
28
|
-
##real life examples
|
29
|
-
Biatora borealis|Bactra borealis Diakonoff 1964|false|3
|
30
|
-
#
|
31
|
-
Homo sapien|Homo sapiens|true|1
|
32
|
-
Homo sapiens Linnaeus|Homo sapens (Linn. 1758) |true|1
|
33
|
-
Homo sapiens Mozzherin|Homo sapiens Linneaus|false|0
|
34
|
-
#
|
35
|
-
Quinqueloculina punctata|Quinqueloculina punctata d'Orbigny 1905|true|0
|
36
|
-
Pomatomus saltator (Linnaeus, 1766)|Pomatomus saltatrix (Linnaeus, 1766)|true|0|3
|
37
|
-
#
|
38
|
-
#Trinomial names
|
39
|
-
Homo sapiens stupidus|Homo spiens stupidus|true|1
|
40
|
-
Pomatomus saltator saltator L. 1758|Pomatomus saltator var. saltatror L. 1758|true|1
|
41
|
-
Pomatomus saltator L. 1758|Pomatomus saltator var. saltatror L. 1758|false|5
|
42
|
-
Pomatomus saltator saltator saltatorische|Pomatomus saltator soltator|true|1
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|