name-spotter 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.byebug_history +44 -0
- data/.gitignore +51 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +22 -0
- data/CHANGELOG +2 -0
- data/Gemfile +2 -22
- data/README.md +116 -0
- data/Rakefile +2 -19
- data/lib/name-spotter.rb +3 -1
- data/lib/name-spotter/monkey_patches.rb +4 -2
- data/lib/name-spotter/neti_neti_client.rb +13 -6
- data/lib/name-spotter/scientific_name.rb +3 -3
- data/lib/name-spotter/taxon_finder_client.rb +35 -24
- data/lib/name-spotter/version.rb +8 -0
- data/name-spotter.gemspec +26 -98
- data/spec/name-spotter_spec.rb +334 -131
- data/spec/scientific_name_spec.rb +14 -19
- data/spec/spec_helper.rb +2 -12
- data/tf_logic.txt +3 -3
- metadata +69 -142
- data/.rvmrc +0 -1
- data/Gemfile.lock +0 -84
- data/README.rdoc +0 -95
- data/VERSION +0 -1
- data/features/name-spotter.feature +0 -9
- data/features/step_definitions/name-spotter_steps.rb +0 -0
- data/features/support/env.rb +0 -13
data/name-spotter.gemspec
CHANGED
@@ -1,103 +1,31 @@
|
|
1
|
-
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
-
# -*- encoding: utf-8 -*-
|
1
|
+
$:.push File.expand_path("../lib", __FILE__)
|
5
2
|
|
6
|
-
|
7
|
-
s.name = "name-spotter"
|
8
|
-
s.version = "0.2.4"
|
3
|
+
require "name-spotter/version"
|
9
4
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
".rvmrc",
|
23
|
-
"CHANGELOG",
|
24
|
-
"Gemfile",
|
25
|
-
"Gemfile.lock",
|
26
|
-
"LICENSE.txt",
|
27
|
-
"README.rdoc",
|
28
|
-
"Rakefile",
|
29
|
-
"VERSION",
|
30
|
-
"features/name-spotter.feature",
|
31
|
-
"features/step_definitions/name-spotter_steps.rb",
|
32
|
-
"features/support/env.rb",
|
33
|
-
"lib/name-spotter.rb",
|
34
|
-
"lib/name-spotter/client.rb",
|
35
|
-
"lib/name-spotter/monkey_patches.rb",
|
36
|
-
"lib/name-spotter/neti_neti_client.rb",
|
37
|
-
"lib/name-spotter/scientific_name.rb",
|
38
|
-
"lib/name-spotter/taxon_finder_client.rb",
|
39
|
-
"name-spotter.gemspec",
|
40
|
-
"spec/files/english.txt",
|
41
|
-
"spec/files/journalofentomol13pomo_0018.txt",
|
42
|
-
"spec/files/journalofentomol13pomo_0063.txt",
|
43
|
-
"spec/files/not_english.txt",
|
44
|
-
"spec/name-spotter_spec.rb",
|
45
|
-
"spec/scientific_name_spec.rb",
|
46
|
-
"spec/spec_helper.rb",
|
47
|
-
"tf_logic.txt"
|
48
|
-
]
|
49
|
-
s.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
|
50
|
-
s.licenses = ["MIT"]
|
51
|
-
s.require_paths = ["lib"]
|
52
|
-
s.rubygems_version = "1.8.24"
|
53
|
-
s.summary = "Scientific names finder"
|
5
|
+
Gem::Specification.new do |gem|
|
6
|
+
gem.name = "name-spotter"
|
7
|
+
gem.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
|
8
|
+
gem.version = NameSpotter::VERSION
|
9
|
+
gem.authors = ["Anthony Goddard", "Chuck Ha",
|
10
|
+
"Dmitry Mozzherin", "David Shorthouse"]
|
11
|
+
gem.license = "MIT"
|
12
|
+
gem.summary = "Scientific names finder"
|
13
|
+
gem.description = %q|The gem searches for scientific names in texts using
|
14
|
+
socket servers running TaxonFinder (by Patrick Leary)
|
15
|
+
and NetiNeti (by Lakshmi Manohar Akella)|
|
16
|
+
gem.email = "dmozzherin@gmail.com"
|
54
17
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
68
|
-
s.add_development_dependency(%q<capybara>, [">= 0"])
|
69
|
-
s.add_development_dependency(%q<bundler>, [">= 0"])
|
70
|
-
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
71
|
-
s.add_development_dependency(%q<debugger>, [">= 0"])
|
72
|
-
else
|
73
|
-
s.add_dependency(%q<rake>, [">= 0"])
|
74
|
-
s.add_dependency(%q<rest-client>, [">= 0"])
|
75
|
-
s.add_dependency(%q<builder>, [">= 0"])
|
76
|
-
s.add_dependency(%q<json>, [">= 0"])
|
77
|
-
s.add_dependency(%q<unicode_utils>, [">= 0"])
|
78
|
-
s.add_dependency(%q<unsupervised-language-detection>, [">= 0"])
|
79
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
80
|
-
s.add_dependency(%q<rspec-expectations>, [">= 0"])
|
81
|
-
s.add_dependency(%q<cucumber>, [">= 0"])
|
82
|
-
s.add_dependency(%q<capybara>, [">= 0"])
|
83
|
-
s.add_dependency(%q<bundler>, [">= 0"])
|
84
|
-
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
85
|
-
s.add_dependency(%q<debugger>, [">= 0"])
|
86
|
-
end
|
87
|
-
else
|
88
|
-
s.add_dependency(%q<rake>, [">= 0"])
|
89
|
-
s.add_dependency(%q<rest-client>, [">= 0"])
|
90
|
-
s.add_dependency(%q<builder>, [">= 0"])
|
91
|
-
s.add_dependency(%q<json>, [">= 0"])
|
92
|
-
s.add_dependency(%q<unicode_utils>, [">= 0"])
|
93
|
-
s.add_dependency(%q<unsupervised-language-detection>, [">= 0"])
|
94
|
-
s.add_dependency(%q<rspec>, [">= 0"])
|
95
|
-
s.add_dependency(%q<rspec-expectations>, [">= 0"])
|
96
|
-
s.add_dependency(%q<cucumber>, [">= 0"])
|
97
|
-
s.add_dependency(%q<capybara>, [">= 0"])
|
98
|
-
s.add_dependency(%q<bundler>, [">= 0"])
|
99
|
-
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
100
|
-
s.add_dependency(%q<debugger>, [">= 0"])
|
101
|
-
end
|
18
|
+
gem.files = `git ls-files`.split("\n")
|
19
|
+
gem.require_paths = ["lib"]
|
20
|
+
gem.add_runtime_dependency "rake", "~> 10.5"
|
21
|
+
gem.add_runtime_dependency "rest-client", "~> 1.8"
|
22
|
+
gem.add_runtime_dependency "nokogiri", "~> 1.6"
|
23
|
+
gem.add_runtime_dependency "builder", "~> 3.2"
|
24
|
+
gem.add_runtime_dependency "json", "~> 1.8"
|
25
|
+
gem.add_runtime_dependency "unicode_utils", "~> 1.4"
|
26
|
+
gem.add_runtime_dependency "unsupervised-language-detection", "~> 0.0.6"
|
27
|
+
gem.add_development_dependency "rspec", "~> 3.1"
|
28
|
+
gem.add_development_dependency "bundler", "~> 1.10"
|
29
|
+
gem.add_development_dependency "byebug", "~> 8.2"
|
102
30
|
end
|
103
31
|
|
data/spec/name-spotter_spec.rb
CHANGED
@@ -1,163 +1,366 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
3
|
-
|
4
1
|
describe "NameSpotter" do
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
@tf = NameSpotter.new(taxon_finder)
|
10
|
-
@clients = [@neti, @tf]
|
11
|
-
end
|
2
|
+
subject { NameSpotter }
|
3
|
+
let(:neti) { subject.new(subject::NetiNetiClient.new()) }
|
4
|
+
let(:tf) { subject.new(subject::TaxonFinderClient.new()) }
|
5
|
+
let(:clients) { [neti, tf] }
|
12
6
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
eng3 = open(File.join(File.dirname(__FILE__), 'files', 'journalofentomol13pomo_0018.txt'), 'r:utf-8').read
|
17
|
-
eng3 = open(File.join(File.dirname(__FILE__), 'files', 'journalofentomol13pomo_0063.txt'), 'r:utf-8').read
|
18
|
-
|
19
|
-
not_eng = open(File.join(File.dirname(__FILE__), 'files', 'not_english.txt'), 'r:utf-8').read
|
20
|
-
100.times do
|
21
|
-
NameSpotter.english?(eng).should be_true
|
22
|
-
NameSpotter.english?(eng2).should be_true
|
23
|
-
NameSpotter.english?(eng3).should be_false
|
24
|
-
NameSpotter.english?(not_eng).should be_false
|
7
|
+
describe ".version" do
|
8
|
+
it "returns version" do
|
9
|
+
expect(subject.version).to match /\d+\.\d+\.\d+/
|
25
10
|
end
|
26
11
|
end
|
27
12
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
13
|
+
describe ".english?" do
|
14
|
+
let(:eng) { read("english.txt") }
|
15
|
+
let(:eng2) { read("journalofentomol13pomo_0018.txt") }
|
16
|
+
let(:eng3) { read("journalofentomol13pomo_0063.txt") }
|
32
17
|
|
33
|
-
|
34
|
-
|
35
|
-
|
18
|
+
it "detects english" do
|
19
|
+
100.times do
|
20
|
+
expect(subject.english? eng).to be true
|
21
|
+
expect(subject.english? eng2).to be true
|
22
|
+
expect(subject.english? eng3).to be false
|
23
|
+
end
|
36
24
|
end
|
37
25
|
end
|
38
26
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
c.find('', 'json').should == "{\"names\":[]}"
|
44
|
-
c.find('', "xml").should == "<?xml version=\"1.0\"?>\n<names/>\n"
|
27
|
+
describe ".new" do
|
28
|
+
it "works" do
|
29
|
+
expect(neti).to be_kind_of NameSpotter
|
30
|
+
expect(tf).to be_kind_of NameSpotter
|
45
31
|
end
|
46
32
|
end
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
33
|
+
|
34
|
+
describe "#find" do
|
35
|
+
context "empty text" do
|
36
|
+
it "returns empty list" do
|
37
|
+
clients.each do |c|
|
38
|
+
expect(c.find(nil)).to eq({ names: [] })
|
39
|
+
expect(c.find(nil, 'json')).to eq "{\"names\":[]}"
|
40
|
+
expect(c.find(nil, "xml"))
|
41
|
+
.to eq "<?xml version=\"1.0\"?>\n<names/>\n"
|
42
|
+
expect(c.find('', 'json')).to eq "{\"names\":[]}"
|
43
|
+
expect(c.find('', "xml"))
|
44
|
+
.to eq "<?xml version=\"1.0\"?>\n<names/>\n"
|
45
|
+
end
|
46
|
+
end
|
53
47
|
end
|
54
|
-
end
|
55
48
|
|
56
|
-
|
57
|
-
|
58
|
-
res = @neti.find(text)[:names].map { |n| n[:scientificName] }
|
59
|
-
res.should == ["Betula alba", "Mus musculus", "B. alba", "Aranea röselii", "Varanus bitatawa"]
|
60
|
-
tf_res = @tf.find(text)
|
61
|
-
res = tf_res[:names].map { |n| n[:scientificName] }
|
62
|
-
res.should == ["Betula alba", "Mus musculus", "B[etula] alba", "Aranea röselii", "Varanus"]
|
63
|
-
end
|
49
|
+
context "text without sci names" do
|
50
|
+
let(:text) { "one two three, no scientific names" }
|
64
51
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
res.should == ['Xysticus canadensis', 'Pardosa moesta', 'X. canadensis']
|
75
|
-
res = @tf.find(text)[:names].map { |n| n[:scientificName] }
|
76
|
-
res.should == ['Xysticus canadensis', 'Pardosa moesta', 'X[ysticus] canadensis']
|
77
|
-
end
|
52
|
+
it "returns empty list" do
|
53
|
+
clients.each do |c|
|
54
|
+
expect(c.find(text)).to eq({ names: [] })
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
context "text with one sci name" do
|
60
|
+
let(:text) { "Pardosa moesta" }
|
78
61
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
res.map do |name|
|
85
|
-
verbatim = name[:verbatim]
|
86
|
-
found_name = text[name[:offsetStart]..name[:offsetEnd]]
|
87
|
-
found_name.should == verbatim
|
62
|
+
it "returns one name" do
|
63
|
+
clients.each do |c|
|
64
|
+
expect(c.find(text)[:names].size).to eq 1
|
65
|
+
end
|
66
|
+
end
|
88
67
|
end
|
89
|
-
end
|
90
68
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
69
|
+
context "text with several names" do
|
70
|
+
let(:text) do
|
71
|
+
"Some text that has Betula\n alba and Mus musculus " \
|
72
|
+
"and \neven B. alba and even M. mus-\nculus and " \
|
73
|
+
"unicoded name Aranea röselii. Also it has name " \
|
74
|
+
"unknown before: Varanus bitatawa"
|
75
|
+
end
|
76
|
+
let(:text2) do
|
77
|
+
"Some another text that has Xysticus \ncanadensis and " \
|
78
|
+
"Pardosa moesta and \neven X. canadensis and even " \
|
79
|
+
"P. mo-\nesta."
|
80
|
+
end
|
81
|
+
|
82
|
+
it "returns names" do
|
83
|
+
res = neti.find(text)[:names].map { |n| n[:scientificName] }
|
84
|
+
expect(res).to eq ["Betula alba", "Mus musculus",
|
85
|
+
"B. alba", "Aranea röselii", "Varanus bitatawa"]
|
86
|
+
res = tf.find(text)[:names].map { |n| n[:scientificName] }
|
87
|
+
expect(res).to eq ["Betula alba", "Mus musculus",
|
88
|
+
"B[etula] alba", "Aranea röselii",
|
89
|
+
"Varanus"]
|
90
|
+
end
|
91
|
+
|
92
|
+
it "forgets previous searches" do
|
93
|
+
res = neti.find(text)[:names].map { |n| n[:scientificName] }
|
94
|
+
expect(res).to eq ["Betula alba", "Mus musculus",
|
95
|
+
"B. alba", "Aranea röselii", "Varanus bitatawa"]
|
96
|
+
res = tf.find(text)[:names].map { |n| n[:scientificName] }
|
97
|
+
expect(res).to eq ["Betula alba", "Mus musculus",
|
98
|
+
"B[etula] alba", "Aranea röselii",
|
99
|
+
"Varanus"]
|
100
|
+
res = neti.find(text2)[:names].map { |n| n[:scientificName] }
|
101
|
+
expect(res).to eq ['Xysticus canadensis', 'Pardosa moesta',
|
102
|
+
'X. canadensis']
|
103
|
+
res = tf.find(text2)[:names].map { |n| n[:scientificName] }
|
104
|
+
expect(res).to eq ['Xysticus canadensis', 'Pardosa moesta',
|
105
|
+
'X[ysticus] canadensis']
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
context "offsets" do
|
110
|
+
let(:text3) do
|
111
|
+
"\r\r\n>':¥/. \r\nA text with multibyte characters " \
|
112
|
+
"नेति नेति: Some text that has Betula\n alba and " \
|
113
|
+
"Mus musculus and \neven B. alba and even M. " \
|
114
|
+
"mus-\nculus. Also it has name " \
|
115
|
+
"unknown before: Varanus bitatawa species"
|
116
|
+
end
|
117
|
+
let(:text4) do
|
118
|
+
"We have to be sure that Betula\n alba and " \
|
119
|
+
"PSEUDOSCORPIONIDA and ×Inkea which is not " \
|
120
|
+
"Passeriformes. We also have another hybrid Passiflora " \
|
121
|
+
"×rosea and Aranea röselii and capitalized ARANEA " \
|
122
|
+
"RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 " \
|
123
|
+
"all get their offsets"
|
124
|
+
end
|
125
|
+
let(:text5) { read "journalofentomol13pomo_0063.txt" }
|
126
|
+
|
127
|
+
it "returns correct names with multibyte chars" do
|
128
|
+
# this test depends on netineti tornado server, not on
|
129
|
+
# namespotter itself. Go and fix that!
|
130
|
+
# the issue and the fix: https://github.com/mbl-cli/NetiNeti/pull/1
|
131
|
+
res = neti.find(text3)[:names]
|
132
|
+
res.map do |name|
|
133
|
+
verbatim = name[:verbatim]
|
134
|
+
found_name = text3[name[:offsetStart]..name[:offsetEnd]]
|
135
|
+
expect(found_name).to eq verbatim
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
it "returns offset for all names" do
|
140
|
+
res = neti.find(text4)
|
141
|
+
tf_res = tf.find(text4)
|
142
|
+
expect(res).to eq({names: [
|
143
|
+
{verbatim: "Betula\n alba", scientificName: "Betula alba",
|
144
|
+
offsetStart: 24, offsetEnd: 35},
|
145
|
+
{verbatim: "Passiflora ×rosea", scientificName: "Passiflora ×rosea",
|
146
|
+
offsetStart: 126, offsetEnd: 142},
|
147
|
+
{verbatim: "Aranea röselii", scientificName: "Aranea röselii",
|
148
|
+
offsetStart: 148, offsetEnd: 161},
|
149
|
+
{verbatim: "Pardosa\n moesta", scientificName: "Pardosa moesta",
|
150
|
+
offsetStart: 198, offsetEnd: 212}
|
151
|
+
]})
|
152
|
+
expect(tf_res).to eq({names: [
|
153
|
+
{verbatim: "Betula alba", scientificName: "Betula alba",
|
154
|
+
offsetStart: 24, offsetEnd: 35},
|
155
|
+
{verbatim: "PSEUDOSCORPIONIDA",
|
156
|
+
scientificName: "Pseudoscorpionida", offsetStart: 41,
|
157
|
+
offsetEnd: 57},
|
158
|
+
{verbatim: "Passeriformes.", scientificName: "Passeriformes",
|
159
|
+
offsetStart: 83, offsetEnd: 96},
|
160
|
+
{verbatim: "Passiflora ×rosea", scientificName: "Passiflora rosea",
|
161
|
+
offsetStart: 126, offsetEnd: 142},
|
162
|
+
{verbatim: "Aranea röselii", scientificName: "Aranea röselii",
|
163
|
+
offsetStart: 148, offsetEnd: 161},
|
164
|
+
{verbatim: "ARANEA", scientificName: "Aranea", offsetStart: 179,
|
165
|
+
offsetEnd: 184},
|
166
|
+
{verbatim: "Pardosa moesta f. moesta", scientificName:
|
167
|
+
"Pardosa moesta f. moesta", offsetStart: 198, offsetEnd: 222}
|
168
|
+
]})
|
169
|
+
end
|
170
|
+
|
171
|
+
it "makes offsets in order with netineti" do
|
172
|
+
res = neti.find(text5)
|
173
|
+
offsets = res[:names].map { |n| n[:offsetStart] }
|
174
|
+
expect(offsets).to eq offsets
|
175
|
+
expect(offsets[0]).to eq 67
|
176
|
+
end
|
177
|
+
end
|
103
178
|
end
|
104
179
|
|
105
|
-
|
106
|
-
text
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
180
|
+
context "abbreviations" do
|
181
|
+
let(:text) do
|
182
|
+
"Pardosa moesta Banks, 1892 is one spider, Schizocosa " \
|
183
|
+
"ocreata Keyserling, 1887 is a second and a third is " \
|
184
|
+
"Schizocosa saltatrix borealis. The abbreviations are P. " \
|
185
|
+
"moesta, S. ocreata, and S. saltatrix borealis is the third."
|
186
|
+
end
|
187
|
+
let(:text2) do
|
188
|
+
"Pardosa moesta! If we encounter Pardosa moesta and then P.modica " \
|
189
|
+
"another name I know is Xenopus laevis and also P.moesta. Again " \
|
190
|
+
"without space TaxonFinder should find both. And Plantago major foreva"
|
191
|
+
end
|
192
|
+
let(:text3) do
|
193
|
+
"What happens another called P. (LYCOSIDAE) is the species?"
|
194
|
+
end
|
195
|
+
|
196
|
+
it "ignores abbreviated genus before family for TaxonFinder" do
|
197
|
+
res = tf.find(text3)
|
198
|
+
expect(res[:names].size).to be 1
|
199
|
+
expect(res).to eq(
|
200
|
+
{names: [{verbatim: "(LYCOSIDAE)", scientificName: "Lycosidae",
|
201
|
+
offsetStart: 32, offsetEnd: 42}]}
|
202
|
+
)
|
203
|
+
end
|
204
|
+
|
205
|
+
it "preserves TaxonFinder expansions" do
|
206
|
+
tf_res = tf.find(text)
|
207
|
+
expect(tf_res).to eq(
|
208
|
+
{names: [
|
209
|
+
{verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
|
210
|
+
offsetStart: 0, offsetEnd: 13},
|
211
|
+
{verbatim: "Schizocosa ocreata",
|
212
|
+
scientificName: "Schizocosa ocreata", offsetStart: 42,
|
213
|
+
offsetEnd: 59},
|
214
|
+
{verbatim: "Schizocosa saltatrix borealis",
|
215
|
+
scientificName: "Schizocosa saltatrix borealis",
|
216
|
+
offsetStart: 105, offsetEnd: 133},
|
217
|
+
{verbatim: "P. moesta", scientificName: "P[ardosa] moesta",
|
218
|
+
offsetStart: 158, offsetEnd: 166},
|
219
|
+
{verbatim: "S. ocreata", scientificName: "S[chizocosa] ocreata",
|
220
|
+
offsetStart: 169, offsetEnd: 178},
|
221
|
+
{verbatim: "S. saltatrix borealis",
|
222
|
+
scientificName: "S[chizocosa] saltatrix borealis",
|
223
|
+
offsetStart: 185, offsetEnd: 205}]}
|
224
|
+
)
|
225
|
+
end
|
226
|
+
|
227
|
+
|
228
|
+
it "recognizes abbreviations no space (TF)" do
|
229
|
+
res = tf.find(text2)
|
230
|
+
expect(res).to eq(
|
231
|
+
{names: [
|
232
|
+
{verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
|
233
|
+
offsetStart: 0, offsetEnd: 13},
|
234
|
+
{verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
|
235
|
+
offsetStart: 32, offsetEnd: 45},
|
236
|
+
{verbatim: "P.modica", scientificName: "P[ardosa] modica",
|
237
|
+
offsetStart: 56, offsetEnd: 63},
|
238
|
+
{verbatim: "Xenopus laevis", scientificName: "Xenopus laevis",
|
239
|
+
offsetStart: 88, offsetEnd: 101},
|
240
|
+
{verbatim: "P.moesta", scientificName: "P[ardosa] moesta",
|
241
|
+
offsetStart: 112, offsetEnd: 119},
|
242
|
+
{verbatim: "Plantago major", scientificName: "Plantago major",
|
243
|
+
offsetStart: 176, offsetEnd: 189}]}
|
244
|
+
)
|
245
|
+
res[:names].map do |name|
|
246
|
+
verbatim = name[:verbatim]
|
247
|
+
found_name = text2[name[:offsetStart]..name[:offsetEnd]]
|
248
|
+
expect(found_name).to eq verbatim
|
249
|
+
end
|
250
|
+
end
|
111
251
|
end
|
112
252
|
|
113
|
-
|
253
|
+
context "capitalization" do
|
114
254
|
#this is a problem we are aware of
|
115
|
-
text
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
255
|
+
let(:text) do
|
256
|
+
"We need to make sure that Ophioihrix nidis and " \
|
257
|
+
"OPHTOMVXIDAE and also Ophiocynodus and especially " \
|
258
|
+
"ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be " \
|
259
|
+
"Asleronyx excavata should all be capitalized correctly"
|
260
|
+
end
|
261
|
+
|
262
|
+
it "does not change capitalization" do
|
263
|
+
res = neti.find(text)
|
264
|
+
expect(res).to eq(
|
265
|
+
{names: [
|
266
|
+
{verbatim: "Ophioihrix nidis", scientificName: "Ophioihrix nidis",
|
267
|
+
offsetStart: 26, offsetEnd: 41},
|
268
|
+
{verbatim: "OPHTOMVXIDAE", scientificName: "OPHTOMVXIDAE",
|
269
|
+
offsetStart: 47, offsetEnd: 58},
|
270
|
+
{verbatim: "Ophiocynodus", scientificName: "Ophiocynodus",
|
271
|
+
offsetStart: 70, offsetEnd: 81},
|
272
|
+
{verbatim: "ASTÉROCHEMIDAE", scientificName: "ASTÉROCHEMIDAE",
|
273
|
+
offsetStart: 98, offsetEnd: 111},
|
274
|
+
{verbatim: "STFROPHVTIDAE", scientificName: "STFROPHVTIDAE",
|
275
|
+
offsetStart: 128, offsetEnd: 140},
|
276
|
+
{verbatim: "Asleronyx excavata", scientificName: "Asleronyx excavata",
|
277
|
+
offsetStart: 153, offsetEnd: 170}
|
278
|
+
]}
|
279
|
+
)
|
280
|
+
end
|
124
281
|
end
|
125
282
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
283
|
+
context "OCR errors" do
|
284
|
+
let(:pipe) do
|
285
|
+
"We need to make sure that Oph|oihrix nidis and " \
|
286
|
+
"OPHTOMVX|DAE will not break results"
|
287
|
+
end
|
288
|
+
|
289
|
+
it "substitutes | with l" do
|
290
|
+
res = neti.find(pipe)
|
291
|
+
expect(res).to eq(
|
292
|
+
{ names: [{ verbatim: "Ophloihrix nidis",
|
293
|
+
scientificName: "Ophloihrix nidis",
|
294
|
+
offsetStart: 26, offsetEnd: 41 }] }
|
295
|
+
)
|
296
|
+
end
|
130
297
|
end
|
131
298
|
|
132
|
-
|
133
|
-
text
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
299
|
+
context "extremely nested infraspecies" do
|
300
|
+
let(:text) do
|
301
|
+
"If we encounter Plantago major it is ok, but if it is " \
|
302
|
+
"Plantago quercus quercus quercus quercus quercus quercus " \
|
303
|
+
"quercus quercus quercus quercus quercus quercus quercus " \
|
304
|
+
"quercus, something is probably not right. However we take " \
|
305
|
+
"Plantago quercus quercus quercus quercus quercus by some " \
|
306
|
+
"strange reason. Well, the reason is this kind of thing -- " \
|
307
|
+
"Pardosa moesta var. moesta f. moesta or something like that"
|
308
|
+
end
|
309
|
+
|
310
|
+
it "stops at five infraspecies levels" do
|
311
|
+
res = tf.find(text)
|
312
|
+
expect(res).to eq(
|
313
|
+
{names: [
|
314
|
+
{verbatim: "Plantago major", scientificName: "Plantago major",
|
315
|
+
offsetStart: 16, offsetEnd: 29},
|
316
|
+
{verbatim: "Plantago quercus quercus quercus quercus quercus",
|
317
|
+
scientificName: "Plantago quercus quercus quercus quercus quercus",
|
318
|
+
offsetStart: 225, offsetEnd: 272},
|
319
|
+
{verbatim: "Pardosa moesta var. moesta f. moesta",
|
320
|
+
scientificName: "Pardosa moesta var. moesta f. moesta",
|
321
|
+
offsetStart: 340, offsetEnd: 375}]}
|
322
|
+
)
|
323
|
+
end
|
147
324
|
end
|
148
325
|
|
149
|
-
|
150
|
-
text
|
151
|
-
|
152
|
-
|
153
|
-
|
326
|
+
context "nested names" do
|
327
|
+
let(:text) do
|
328
|
+
"What happens another called Pardosa moesta (Araneae: Lycosidae) is " \
|
329
|
+
"the species?"
|
330
|
+
end
|
331
|
+
|
332
|
+
it "(TF) handles nested names in one cycle" do
|
333
|
+
res = tf.find(text)
|
334
|
+
expect(res).to eq (
|
335
|
+
{names: [
|
336
|
+
{verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
|
337
|
+
offsetStart: 29, offsetEnd: 42},
|
338
|
+
{verbatim: "(Araneae:", scientificName: "Araneae",
|
339
|
+
offsetStart: 44, offsetEnd: 52},
|
340
|
+
{verbatim: "Lycosidae)", scientificName: "Lycosidae",
|
341
|
+
offsetStart: 54, offsetEnd: 63}]}
|
342
|
+
)
|
343
|
+
end
|
154
344
|
end
|
155
|
-
|
156
|
-
|
157
|
-
text
|
158
|
-
|
159
|
-
|
160
|
-
|
345
|
+
|
346
|
+
context "diacritics" do
|
347
|
+
let(:text) { "Mactra triangula Renieri. Fissurella nubécula Linnó." }
|
348
|
+
|
349
|
+
it "finds names with diacritics" do
|
350
|
+
res = tf.find(text)
|
351
|
+
expect(res[:names].size).to be 2
|
352
|
+
expect(res).to eq(
|
353
|
+
{names: [
|
354
|
+
{verbatim: "Mactra triangula", scientificName: "Mactra triangula",
|
355
|
+
offsetStart: 0, offsetEnd: 15},
|
356
|
+
{verbatim: "Fissurella nubécula",
|
357
|
+
scientificName: "Fissurella nubécula",
|
358
|
+
offsetStart: 26, offsetEnd: 44}]}
|
359
|
+
)
|
360
|
+
end
|
161
361
|
end
|
162
362
|
|
363
|
+
def read(file)
|
364
|
+
File.read(File.join(__dir__, "files", file))
|
365
|
+
end
|
163
366
|
end
|