name-spotter 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
1
+ class NameSpotter # Holds the gem's version information.
2
+ VERSION = "0.3.0" # Version string; the gemspec reads it as NameSpotter::VERSION.
3
+
4
+ def self.version # Returns the VERSION string.
5
+ VERSION
6
+ end
7
+ end
8
+
@@ -1,103 +1,31 @@
1
- # Generated by jeweler
2
- # DO NOT EDIT THIS FILE DIRECTLY
3
- # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
- # -*- encoding: utf-8 -*-
1
+ $:.push File.expand_path("../lib", __FILE__)
5
2
 
6
- Gem::Specification.new do |s|
7
- s.name = "name-spotter"
8
- s.version = "0.2.4"
3
+ require "name-spotter/version"
9
4
 
10
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["Anthony Goddard", "Chuck Ha", "Dmitry Mozzherin"]
12
- s.date = "2012-08-20"
13
- s.description = "The gem searches for scientific names in texts using socket servers running TaxonFinder (by Patrick Leary) and NetiNeti (by Lakshmi Manohar Akella)"
14
- s.email = "dmozzherin@gmail.com"
15
- s.extra_rdoc_files = [
16
- "LICENSE.txt",
17
- "README.rdoc"
18
- ]
19
- s.files = [
20
- ".document",
21
- ".rspec",
22
- ".rvmrc",
23
- "CHANGELOG",
24
- "Gemfile",
25
- "Gemfile.lock",
26
- "LICENSE.txt",
27
- "README.rdoc",
28
- "Rakefile",
29
- "VERSION",
30
- "features/name-spotter.feature",
31
- "features/step_definitions/name-spotter_steps.rb",
32
- "features/support/env.rb",
33
- "lib/name-spotter.rb",
34
- "lib/name-spotter/client.rb",
35
- "lib/name-spotter/monkey_patches.rb",
36
- "lib/name-spotter/neti_neti_client.rb",
37
- "lib/name-spotter/scientific_name.rb",
38
- "lib/name-spotter/taxon_finder_client.rb",
39
- "name-spotter.gemspec",
40
- "spec/files/english.txt",
41
- "spec/files/journalofentomol13pomo_0018.txt",
42
- "spec/files/journalofentomol13pomo_0063.txt",
43
- "spec/files/not_english.txt",
44
- "spec/name-spotter_spec.rb",
45
- "spec/scientific_name_spec.rb",
46
- "spec/spec_helper.rb",
47
- "tf_logic.txt"
48
- ]
49
- s.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
50
- s.licenses = ["MIT"]
51
- s.require_paths = ["lib"]
52
- s.rubygems_version = "1.8.24"
53
- s.summary = "Scientific names finder"
5
+ Gem::Specification.new do |gem|
6
+ gem.name = "name-spotter"
7
+ gem.homepage = "http://github.com/GlobalNamesArchitecture/name-spotter"
8
+ gem.version = NameSpotter::VERSION
9
+ gem.authors = ["Anthony Goddard", "Chuck Ha",
10
+ "Dmitry Mozzherin", "David Shorthouse"]
11
+ gem.license = "MIT"
12
+ gem.summary = "Scientific names finder"
13
+ gem.description = %q|The gem searches for scientific names in texts using
14
+ socket servers running TaxonFinder (by Patrick Leary)
15
+ and NetiNeti (by Lakshmi Manohar Akella)|
16
+ gem.email = "dmozzherin@gmail.com"
54
17
 
55
- if s.respond_to? :specification_version then
56
- s.specification_version = 3
57
-
58
- if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
59
- s.add_runtime_dependency(%q<rake>, [">= 0"])
60
- s.add_runtime_dependency(%q<rest-client>, [">= 0"])
61
- s.add_runtime_dependency(%q<builder>, [">= 0"])
62
- s.add_runtime_dependency(%q<json>, [">= 0"])
63
- s.add_runtime_dependency(%q<unicode_utils>, [">= 0"])
64
- s.add_runtime_dependency(%q<unsupervised-language-detection>, [">= 0"])
65
- s.add_development_dependency(%q<rspec>, [">= 0"])
66
- s.add_development_dependency(%q<rspec-expectations>, [">= 0"])
67
- s.add_development_dependency(%q<cucumber>, [">= 0"])
68
- s.add_development_dependency(%q<capybara>, [">= 0"])
69
- s.add_development_dependency(%q<bundler>, [">= 0"])
70
- s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
71
- s.add_development_dependency(%q<debugger>, [">= 0"])
72
- else
73
- s.add_dependency(%q<rake>, [">= 0"])
74
- s.add_dependency(%q<rest-client>, [">= 0"])
75
- s.add_dependency(%q<builder>, [">= 0"])
76
- s.add_dependency(%q<json>, [">= 0"])
77
- s.add_dependency(%q<unicode_utils>, [">= 0"])
78
- s.add_dependency(%q<unsupervised-language-detection>, [">= 0"])
79
- s.add_dependency(%q<rspec>, [">= 0"])
80
- s.add_dependency(%q<rspec-expectations>, [">= 0"])
81
- s.add_dependency(%q<cucumber>, [">= 0"])
82
- s.add_dependency(%q<capybara>, [">= 0"])
83
- s.add_dependency(%q<bundler>, [">= 0"])
84
- s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
85
- s.add_dependency(%q<debugger>, [">= 0"])
86
- end
87
- else
88
- s.add_dependency(%q<rake>, [">= 0"])
89
- s.add_dependency(%q<rest-client>, [">= 0"])
90
- s.add_dependency(%q<builder>, [">= 0"])
91
- s.add_dependency(%q<json>, [">= 0"])
92
- s.add_dependency(%q<unicode_utils>, [">= 0"])
93
- s.add_dependency(%q<unsupervised-language-detection>, [">= 0"])
94
- s.add_dependency(%q<rspec>, [">= 0"])
95
- s.add_dependency(%q<rspec-expectations>, [">= 0"])
96
- s.add_dependency(%q<cucumber>, [">= 0"])
97
- s.add_dependency(%q<capybara>, [">= 0"])
98
- s.add_dependency(%q<bundler>, [">= 0"])
99
- s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
100
- s.add_dependency(%q<debugger>, [">= 0"])
101
- end
18
+ gem.files = `git ls-files`.split("\n")
19
+ gem.require_paths = ["lib"]
20
+ gem.add_runtime_dependency "rake", "~> 10.5"
21
+ gem.add_runtime_dependency "rest-client", "~> 1.8"
22
+ gem.add_runtime_dependency "nokogiri", "~> 1.6"
23
+ gem.add_runtime_dependency "builder", "~> 3.2"
24
+ gem.add_runtime_dependency "json", "~> 1.8"
25
+ gem.add_runtime_dependency "unicode_utils", "~> 1.4"
26
+ gem.add_runtime_dependency "unsupervised-language-detection", "~> 0.0.6"
27
+ gem.add_development_dependency "rspec", "~> 3.1"
28
+ gem.add_development_dependency "bundler", "~> 1.10"
29
+ gem.add_development_dependency "byebug", "~> 8.2"
102
30
  end
103
31
 
@@ -1,163 +1,366 @@
1
- # encoding: utf-8
2
- require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
3
-
4
1
  describe "NameSpotter" do
5
- before(:all) do
6
- neti_neti = NameSpotter::NetiNetiClient.new()
7
- taxon_finder = NameSpotter::TaxonFinderClient.new()
8
- @neti = NameSpotter.new(neti_neti)
9
- @tf = NameSpotter.new(taxon_finder)
10
- @clients = [@neti, @tf]
11
- end
2
+ subject { NameSpotter }
3
+ let(:neti) { subject.new(subject::NetiNetiClient.new()) }
4
+ let(:tf) { subject.new(subject::TaxonFinderClient.new()) }
5
+ let(:clients) { [neti, tf] }
12
6
 
13
- it "should find if a text is in english" do
14
- eng = open(File.join(File.dirname(__FILE__), 'files', 'english.txt'), 'r:utf-8').read
15
- eng2 = open(File.join(File.dirname(__FILE__), 'files', 'english.txt'), 'r:utf-8').read
16
- eng3 = open(File.join(File.dirname(__FILE__), 'files', 'journalofentomol13pomo_0018.txt'), 'r:utf-8').read
17
- eng3 = open(File.join(File.dirname(__FILE__), 'files', 'journalofentomol13pomo_0063.txt'), 'r:utf-8').read
18
-
19
- not_eng = open(File.join(File.dirname(__FILE__), 'files', 'not_english.txt'), 'r:utf-8').read
20
- 100.times do
21
- NameSpotter.english?(eng).should be_true
22
- NameSpotter.english?(eng2).should be_true
23
- NameSpotter.english?(eng3).should be_false
24
- NameSpotter.english?(not_eng).should be_false
7
+ describe ".version" do
8
+ it "returns version" do
9
+ expect(subject.version).to match /\d+\.\d+\.\d+/
25
10
  end
26
11
  end
27
12
 
28
- it "should exist" do
29
- @neti.is_a?(NameSpotter).should be_true
30
- @tf.is_a?(NameSpotter).should be_true
31
- end
13
+ describe ".english?" do
14
+ let(:eng) { read("english.txt") }
15
+ let(:eng2) { read("journalofentomol13pomo_0018.txt") }
16
+ let(:eng3) { read("journalofentomol13pomo_0063.txt") }
32
17
 
33
- it "should use ruby as default format" do
34
- @clients.each do |c|
35
- c.find(nil).should == {names: []}
18
+ it "detects english" do
19
+ 100.times do
20
+ expect(subject.english? eng).to be true
21
+ expect(subject.english? eng2).to be true
22
+ expect(subject.english? eng3).to be false
23
+ end
36
24
  end
37
25
  end
38
26
 
39
- it "should return empty result if input is empty" do
40
- @clients.each do |c|
41
- c.find(nil, 'json').should == "{\"names\":[]}"
42
- c.find(nil, "xml").should == "<?xml version=\"1.0\"?>\n<names/>\n"
43
- c.find('', 'json').should == "{\"names\":[]}"
44
- c.find('', "xml").should == "<?xml version=\"1.0\"?>\n<names/>\n"
27
+ describe ".new" do
28
+ it "works" do
29
+ expect(neti).to be_kind_of NameSpotter
30
+ expect(tf).to be_kind_of NameSpotter
45
31
  end
46
32
  end
47
-
48
- it "should return empty result if no names are found" do
49
- text = "one two three, no scientific names"
50
- @clients.each do |c|
51
- c.find(text, "json").should == "{\"names\":[]}"
52
- c.find(text, "xml").should == "<?xml version=\"1.0\"?>\n<names/>\n"
33
+
34
+ describe "#find" do
35
+ context "empty text" do
36
+ it "returns empty list" do
37
+ clients.each do |c|
38
+ expect(c.find(nil)).to eq({ names: [] })
39
+ expect(c.find(nil, 'json')).to eq "{\"names\":[]}"
40
+ expect(c.find(nil, "xml"))
41
+ .to eq "<?xml version=\"1.0\"?>\n<names/>\n"
42
+ expect(c.find('', 'json')).to eq "{\"names\":[]}"
43
+ expect(c.find('', "xml"))
44
+ .to eq "<?xml version=\"1.0\"?>\n<names/>\n"
45
+ end
46
+ end
53
47
  end
54
- end
55
48
 
56
- it "should be able to find scientific names in text" do
57
- text = "Some text that has Betula\n alba and Mus musculus and \neven B. alba and even M. mus-\nculus and unicoded name Aranea röselii. Also it has name unknown before: Varanus bitatawa species"
58
- res = @neti.find(text)[:names].map { |n| n[:scientificName] }
59
- res.should == ["Betula alba", "Mus musculus", "B. alba", "Aranea röselii", "Varanus bitatawa"]
60
- tf_res = @tf.find(text)
61
- res = tf_res[:names].map { |n| n[:scientificName] }
62
- res.should == ["Betula alba", "Mus musculus", "B[etula] alba", "Aranea röselii", "Varanus"]
63
- end
49
+ context "text without sci names" do
50
+ let(:text) { "one two three, no scientific names" }
64
51
 
65
-
66
- it "should not remember previous search results" do
67
- text = "Some text that has Betula\n alba and Mus musculus and \neven B. alba and even M. mus-\nculus. Also it has name unknown before: Varanus bitatawa species"
68
- res = @neti.find(text)[:names].map { |n| n[:scientificName] }
69
- res.should == ["Betula alba", "Mus musculus", "B. alba", "Varanus bitatawa"]
70
- res = @tf.find(text)[:names].map { |n| n[:scientificName] }
71
- res.should == ["Betula alba", "Mus musculus", "B[etula] alba", "Varanus"]
72
- text = "Some another text that has Xysticus \ncanadensis and Pardosa moesta and \neven X. canadensis and even P. mo-\nesta."
73
- res = @neti.find(text)[:names].map { |n| n[:scientificName] }
74
- res.should == ['Xysticus canadensis', 'Pardosa moesta', 'X. canadensis']
75
- res = @tf.find(text)[:names].map { |n| n[:scientificName] }
76
- res.should == ['Xysticus canadensis', 'Pardosa moesta', 'X[ysticus] canadensis']
77
- end
52
+ it "returns empty list" do
53
+ clients.each do |c|
54
+ expect(c.find(text)).to eq({ names: [] })
55
+ end
56
+ end
57
+ end
58
+
59
+ context "text with one sci name" do
60
+ let(:text) { "Pardosa moesta" }
78
61
 
79
- it "should get back correct names using offsets in utf-8 based text" do
80
- # this test depends on netineti tornado server, not on namespotter itself. Go and fix that!
81
- # the issue and the fix: https://github.com/mbl-cli/NetiNeti/pull/1
82
- text = "\r\r\n>':¥/. \r\nA text with multibyte characters नेति नेति: Some text that has Betula\n alba and Mus musculus and \neven B. alba and even M. mus-\nculus. Also it has name unknown before: Varanus bitatawa species"
83
- res = @neti.find(text)[:names]
84
- res.map do |name|
85
- verbatim = name[:verbatim]
86
- found_name = text[name[:offsetStart]..name[:offsetEnd]]
87
- found_name.should == verbatim
62
+ it "returns empty list" do
63
+ clients.each do |c|
64
+ expect(c.find(text)[:names].size).to eq 1
65
+ end
66
+ end
88
67
  end
89
- end
90
68
 
91
- it "should be able to return offsets for all names" do
92
- text = "We have to be sure that Betula\n alba and PSEUDOSCORPIONIDA and ×Inkea which is not Passeriformes. We also have another hybrid Passiflora ×rosea and Aranea röselii and capitalized ARANEA RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 all get their offsets"
93
- res = @neti.find(text)
94
- res.should == {:names=>[{:verbatim=>"Betula\n alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"Passiflora ×rosea", :scientificName=>"Passiflora ×rosea", :offsetStart=>126, :offsetEnd=>142}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>148, :offsetEnd=>161}, {:verbatim=>"Pardosa\n moesta", :scientificName=>"Pardosa moesta", :offsetStart=>198, :offsetEnd=>212}]}
95
- tf_res = @tf.find(text)
96
- tf_res.should == {:names=>[{:verbatim=>"Betula alba", :scientificName=>"Betula alba", :offsetStart=>24, :offsetEnd=>35}, {:verbatim=>"PSEUDOSCORPIONIDA", :scientificName=>"Pseudoscorpionida", :offsetStart=>41, :offsetEnd=>57}, {:verbatim=>"Passeriformes.", :scientificName=>"Passeriformes", :offsetStart=>83, :offsetEnd=>96}, {:verbatim=>"Passiflora ×rosea", :scientificName=>"Passiflora rosea", :offsetStart=>126, :offsetEnd=>142}, {:verbatim=>"Aranea röselii", :scientificName=>"Aranea röselii", :offsetStart=>148, :offsetEnd=>161}, {:verbatim=>"ARANEA", :scientificName=>"Aranea", :offsetStart=>179, :offsetEnd=>184}, {:verbatim=>"Pardosa moesta f. moesta", :scientificName=>"Pardosa moesta f. moesta", :offsetStart=>198, :offsetEnd=>222}]}
97
- end
98
-
99
- it "should properly handle abbreviated names found by taxonfinder" do
100
- text = "Pardosa moesta Banks, 1892 is one spider, Schizocosa ocreata Keyserling, 1887 is a second and a third is Schizocosa saltatrix borealis. The abbreviations are P. moesta, S. ocreata, and S. saltatrix borealis is the third."
101
- tf_res = @tf.find(text)
102
- tf_res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Schizocosa ocreata", :scientificName=>"Schizocosa ocreata", :offsetStart=>42, :offsetEnd=>59}, {:verbatim=>"Schizocosa saltatrix borealis", :scientificName=>"Schizocosa saltatrix borealis", :offsetStart=>105, :offsetEnd=>133}, {:verbatim=>"P. moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>158, :offsetEnd=>166}, {:verbatim=>"S. ocreata", :scientificName=>"S[chizocosa] ocreata", :offsetStart=>169, :offsetEnd=>178}, {:verbatim=>"S. saltatrix borealis", :scientificName=>"S[chizocosa] saltatrix borealis", :offsetStart=>185, :offsetEnd=>205}]}
69
+ context "text with several names" do
70
+ let(:text) do
71
+ "Some text that has Betula\n alba and Mus musculus " \
72
+ "and \neven B. alba and even M. mus-\nculus and " \
73
+ "unicoded name Aranea röselii. Also it has name " \
74
+ "unknown before: Varanus bitatawa"
75
+ end
76
+ let(:text2) do
77
+ "Some another text that has Xysticus \ncanadensis and " \
78
+ "Pardosa moesta and \neven X. canadensis and even " \
79
+ "P. mo-\nesta."
80
+ end
81
+
82
+ it "returns names" do
83
+ res = neti.find(text)[:names].map { |n| n[:scientificName] }
84
+ expect(res).to eq ["Betula alba", "Mus musculus",
85
+ "B. alba", "Aranea röselii", "Varanus bitatawa"]
86
+ res = tf.find(text)[:names].map { |n| n[:scientificName] }
87
+ expect(res).to eq ["Betula alba", "Mus musculus",
88
+ "B[etula] alba", "Aranea röselii",
89
+ "Varanus"]
90
+ end
91
+
92
+ it "forgets previous searches" do
93
+ res = neti.find(text)[:names].map { |n| n[:scientificName] }
94
+ expect(res).to eq ["Betula alba", "Mus musculus",
95
+ "B. alba", "Aranea röselii", "Varanus bitatawa"]
96
+ res = tf.find(text)[:names].map { |n| n[:scientificName] }
97
+ expect(res).to eq ["Betula alba", "Mus musculus",
98
+ "B[etula] alba", "Aranea röselii",
99
+ "Varanus"]
100
+ res = neti.find(text2)[:names].map { |n| n[:scientificName] }
101
+ expect(res).to eq ['Xysticus canadensis', 'Pardosa moesta',
102
+ 'X. canadensis']
103
+ res = tf.find(text2)[:names].map { |n| n[:scientificName] }
104
+ expect(res).to eq ['Xysticus canadensis', 'Pardosa moesta',
105
+ 'X[ysticus] canadensis']
106
+ end
107
+ end
108
+
109
+ context "offsets" do
110
+ let(:text3) do
111
+ "\r\r\n>':¥/. \r\nA text with multibyte characters " \
112
+ "नेति नेति: Some text that has Betula\n alba and " \
113
+ "Mus musculus and \neven B. alba and even M. " \
114
+ "mus-\nculus. Also it has name " \
115
+ "unknown before: Varanus bitatawa species"
116
+ end
117
+ let(:text4) do
118
+ "We have to be sure that Betula\n alba and " \
119
+ "PSEUDOSCORPIONIDA and ×Inkea which is not " \
120
+ "Passeriformes. We also have another hybrid Passiflora " \
121
+ "×rosea and Aranea röselii and capitalized ARANEA " \
122
+ "RÖSELII and Pardosa\n moesta f. moesta Banks, 1892 " \
123
+ "all get their offsets"
124
+ end
125
+ let(:text5) { read "journalofentomol13pomo_0063.txt" }
126
+
127
+ it "return correct names with multibyte chars" do
128
+ # this test depends on netineti tornado server, not on
129
+ # namespotter itself. Go and fix that!
130
+ # the issue and the fix: https://github.com/mbl-cli/NetiNeti/pull/1
131
+ res = neti.find(text3)[:names]
132
+ res.map do |name|
133
+ verbatim = name[:verbatim]
134
+ found_name = text3[name[:offsetStart]..name[:offsetEnd]]
135
+ expect(found_name).to eq verbatim
136
+ end
137
+ end
138
+
139
+ it "returns offset for all names" do
140
+ res = neti.find(text4)
141
+ tf_res = tf.find(text4)
142
+ expect(res).to eq({names: [
143
+ {verbatim: "Betula\n alba", scientificName: "Betula alba",
144
+ offsetStart: 24, offsetEnd: 35},
145
+ {verbatim: "Passiflora ×rosea", scientificName: "Passiflora ×rosea",
146
+ offsetStart: 126, offsetEnd: 142},
147
+ {verbatim: "Aranea röselii", scientificName: "Aranea röselii",
148
+ offsetStart: 148, offsetEnd: 161},
149
+ {verbatim: "Pardosa\n moesta", scientificName: "Pardosa moesta",
150
+ offsetStart: 198, offsetEnd: 212}
151
+ ]})
152
+ expect(tf_res).to eq({names: [
153
+ {verbatim: "Betula alba", scientificName: "Betula alba",
154
+ offsetStart: 24, offsetEnd: 35},
155
+ {verbatim: "PSEUDOSCORPIONIDA",
156
+ scientificName: "Pseudoscorpionida", offsetStart: 41,
157
+ offsetEnd: 57},
158
+ {verbatim: "Passeriformes.", scientificName: "Passeriformes",
159
+ offsetStart: 83, offsetEnd: 96},
160
+ {verbatim: "Passiflora ×rosea", scientificName: "Passiflora rosea",
161
+ offsetStart: 126, offsetEnd: 142},
162
+ {verbatim: "Aranea röselii", scientificName: "Aranea röselii",
163
+ offsetStart: 148, offsetEnd: 161},
164
+ {verbatim: "ARANEA", scientificName: "Aranea", offsetStart: 179,
165
+ offsetEnd: 184},
166
+ {verbatim: "Pardosa moesta f. moesta", scientificName:
167
+ "Pardosa moesta f. moesta", offsetStart: 198, offsetEnd: 222}
168
+ ]})
169
+ end
170
+
171
+ it "makes offsets in order with netineti" do
172
+ res = neti.find(text5)
173
+ offsets = res[:names].map { |n| n[:offsetStart] }
174
+ expect(offsets).to eq offsets
175
+ expect(offsets[0]).to eq 67
176
+ end
177
+ end
103
178
  end
104
179
 
105
- it "should not make unsequential offsets on a page when using NetiNeti" do
106
- text = open(File.join(File.dirname(__FILE__), 'files', 'journalofentomol13pomo_0063.txt'), 'r:utf-8').read
107
- res = @neti.find(text)
108
- offsets = res[:names].map {|n| n[:offsetStart]}
109
- offsets.sort.should == offsets
110
- offsets[0].should == 67
180
+ context "abbreviations" do
181
+ let(:text) do
182
+ "Pardosa moesta Banks, 1892 is one spider, Schizocosa " \
183
+ "ocreata Keyserling, 1887 is a second and a third is " \
184
+ "Schizocosa saltatrix borealis. The abbreviations are P. " \
185
+ "moesta, S. ocreata, and S. saltatrix borealis is the third."
186
+ end
187
+ let(:text2) do
188
+ "Pardosa moesta! If we encounter Pardosa moesta and then P.modica " \
189
+ "another name I know is Xenopus laevis and also P.moesta. Again " \
190
+ "without space TaxonFinder should find both. And Plantago major foreva"
191
+ end
192
+ let(:text3) do
193
+ "What happens another called P. (LYCOSIDAE) is the species?"
194
+ end
195
+
196
+ it "ignores abbreviated genus before family for TaxonFinder" do
197
+ res = tf.find(text3)
198
+ expect(res[:names].size).to be 1
199
+ expect(res).to eq(
200
+ {names: [{verbatim: "(LYCOSIDAE)", scientificName: "Lycosidae",
201
+ offsetStart: 32, offsetEnd: 42}]}
202
+ )
203
+ end
204
+
205
+ it "preserves TaxonFinder expansions" do
206
+ tf_res = tf.find(text)
207
+ expect(tf_res).to eq(
208
+ {names: [
209
+ {verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
210
+ offsetStart: 0, offsetEnd: 13},
211
+ {verbatim: "Schizocosa ocreata",
212
+ scientificName: "Schizocosa ocreata", offsetStart: 42,
213
+ offsetEnd: 59},
214
+ {verbatim: "Schizocosa saltatrix borealis",
215
+ scientificName: "Schizocosa saltatrix borealis",
216
+ offsetStart: 105, offsetEnd: 133},
217
+ {verbatim: "P. moesta", scientificName: "P[ardosa] moesta",
218
+ offsetStart: 158, offsetEnd: 166},
219
+ {verbatim: "S. ocreata", scientificName: "S[chizocosa] ocreata",
220
+ offsetStart: 169, offsetEnd: 178},
221
+ {verbatim: "S. saltatrix borealis",
222
+ scientificName: "S[chizocosa] saltatrix borealis",
223
+ offsetStart: 185, offsetEnd: 205}]}
224
+ )
225
+ end
226
+
227
+
228
+ it "recognizes abbreviations no space (TF)" do
229
+ res = tf.find(text2)
230
+ expect(res).to eq(
231
+ {names: [
232
+ {verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
233
+ offsetStart: 0, offsetEnd: 13},
234
+ {verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
235
+ offsetStart: 32, offsetEnd: 45},
236
+ {verbatim: "P.modica", scientificName: "P[ardosa] modica",
237
+ offsetStart: 56, offsetEnd: 63},
238
+ {verbatim: "Xenopus laevis", scientificName: "Xenopus laevis",
239
+ offsetStart: 88, offsetEnd: 101},
240
+ {verbatim: "P.moesta", scientificName: "P[ardosa] moesta",
241
+ offsetStart: 112, offsetEnd: 119},
242
+ {verbatim: "Plantago major", scientificName: "Plantago major",
243
+ offsetStart: 176, offsetEnd: 189}]}
244
+ )
245
+ res[:names].map do |name|
246
+ verbatim = name[:verbatim]
247
+ found_name = text2[name[:offsetStart]..name[:offsetEnd]]
248
+ expect(found_name).to eq verbatim
249
+ end
250
+ end
111
251
  end
112
252
 
113
- it "should not normalize capitalization of found names" do
253
+ context "capitalization" do
114
254
  #this is a problem we are aware of
115
- text = "We need to make sure that Ophioihrix nidis and OPHTOMVXIDAE and also Ophiocynodus and especially ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be Asleronyx excavata should all be capitalized correctly"
116
- res = @neti.find(text)
117
- res.should == {:names=>[{:verbatim=>"Ophioihrix nidis", :scientificName=>"Ophioihrix nidis", :offsetStart=>26, :offsetEnd=>41}, {:verbatim=>"OPHTOMVXIDAE", :scientificName=>"OPHTOMVXIDAE", :offsetStart=>47, :offsetEnd=>58}, {:verbatim=>"Ophiocynodus", :scientificName=>"Ophiocynodus", :offsetStart=>70, :offsetEnd=>81}, {:verbatim=>"ASTÉROCHEMIDAE", :scientificName=>"ASTÉROCHEMIDAE", :offsetStart=>98, :offsetEnd=>111}, {:verbatim=>"STFROPHVTIDAE", :scientificName=>"STFROPHVTIDAE", :offsetStart=>128, :offsetEnd=>140}, {:verbatim=>"Asleronyx excavata", :scientificName=>"Asleronyx excavata", :offsetStart=>153, :offsetEnd=>170}]}
118
- end
119
-
120
- it "should not break NetiNeti results from processing OCR with | character in it" do
121
- text = "We need to make sure that Oph|oihrix nidis and OPHTOMVX|DAE will not break results"
122
- res = @neti.find(text)
123
- res.should == {:names=>[{:verbatim=>"Ophloihrix nidis", :scientificName=>"Ophloihrix nidis", :offsetStart=>26, :offsetEnd=>41}]}
255
+ let(:text) do
256
+ "We need to make sure that Ophioihrix nidis and " \
257
+ "OPHTOMVXIDAE and also Ophiocynodus and especially " \
258
+ "ASTÉROCHEMIDAE and definitely STFROPHVTIDAE and may be " \
259
+ "Asleronyx excavata should all be capitalized correctly"
260
+ end
261
+
262
+ it "does not change capitalization" do
263
+ res = neti.find(text)
264
+ expect(res).to eq(
265
+ {names: [
266
+ {verbatim: "Ophioihrix nidis", scientificName: "Ophioihrix nidis",
267
+ offsetStart: 26, offsetEnd: 41},
268
+ {verbatim: "OPHTOMVXIDAE", scientificName: "OPHTOMVXIDAE",
269
+ offsetStart: 47, offsetEnd: 58},
270
+ {verbatim: "Ophiocynodus", scientificName: "Ophiocynodus",
271
+ offsetStart: 70, offsetEnd: 81},
272
+ {verbatim: "ASTÉROCHEMIDAE", scientificName: "ASTÉROCHEMIDAE",
273
+ offsetStart: 98, offsetEnd: 111},
274
+ {verbatim: "STFROPHVTIDAE", scientificName: "STFROPHVTIDAE",
275
+ offsetStart: 128, offsetEnd: 140},
276
+ {verbatim: "Asleronyx excavata", scientificName: "Asleronyx excavata",
277
+ offsetStart: 153, offsetEnd: 170}
278
+ ]}
279
+ )
280
+ end
124
281
  end
125
282
 
126
- it "should not parse ridiculously long infraspecies names by taxon finder" do
127
- text = "If we encounter Plantago major it is ok, but if it is Plantago quercus quercus quercus quercus quercus quercus quercus quercus quercus quercus quercus quercus quercus quercus, something is probably not right. However we take Plantago quercus quercus quercus quercus quercus by some strange reason. Well, the reason is this kind of thing -- Pardosa moesta var. moesta f. moesta or something like that"
128
- res = @tf.find(text)
129
- res.should == {:names=>[{:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>16, :offsetEnd=>29}, {:verbatim=>"Plantago quercus quercus quercus quercus quercus", :scientificName=>"Plantago quercus quercus quercus quercus quercus", :offsetStart=>225, :offsetEnd=>272}, {:verbatim=>"Pardosa moesta var. moesta f. moesta", :scientificName=>"Pardosa moesta var. moesta f. moesta", :offsetStart=>340, :offsetEnd=>375}]}
283
+ context "OCR errors" do
284
+ let(:pipe) do
285
+ "We need to make sure that Oph|oihrix nidis and " \
286
+ "OPHTOMVX|DAE will not break results"
287
+ end
288
+
289
+ it "substitutes | with l" do
290
+ res = neti.find(pipe)
291
+ expect(res).to eq(
292
+ { names: [{ verbatim: "Ophloihrix nidis",
293
+ scientificName: "Ophloihrix nidis",
294
+ offsetStart: 26, offsetEnd: 41 }] }
295
+ )
296
+ end
130
297
  end
131
298
 
132
- it "should be able to recognize names like P.moesta by TaxonFinder" do
133
- text = "Pardosa moesta! If we encounter Pardosa moesta and then P.modica another name I know is Xenopus laevis and also P.moesta. Again without space TaxonFinder should find both. And Plantago major foreva"
134
- res = @tf.find(text)
135
- res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>0, :offsetEnd=>13}, {:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>32, :offsetEnd=>45}, {:verbatim=>"P.modica", :scientificName=>"P[ardosa] modica", :offsetStart=>56, :offsetEnd=>63}, {:verbatim=>"Xenopus laevis", :scientificName=>"Xenopus laevis", :offsetStart=>88, :offsetEnd=>101}, {:verbatim=>"P.moesta", :scientificName=>"P[ardosa] moesta", :offsetStart=>112, :offsetEnd=>119}, {:verbatim=>"Plantago major", :scientificName=>"Plantago major", :offsetStart=>176, :offsetEnd=>189}]}
136
- res[:names].map do |name|
137
- verbatim = name[:verbatim]
138
- found_name = text[name[:offsetStart]..name[:offsetEnd]]
139
- found_name.should == verbatim
140
- end
141
- end
142
-
143
- it "should register situations where new name started and prev name is finished in the same cycle in TF" do
144
- text = "What happens another called Pardosa moesta (Araneae: Lycosidae) is the species?"
145
- res = @tf.find(text)
146
- res.should == {:names=>[{:verbatim=>"Pardosa moesta", :scientificName=>"Pardosa moesta", :offsetStart=>29, :offsetEnd=>42}, {:verbatim=>"(Araneae:", :scientificName=>"Araneae", :offsetStart=>44, :offsetEnd=>52}, {:verbatim=>"Lycosidae)", :scientificName=>"Lycosidae", :offsetStart=>54, :offsetEnd=>63}]}
299
+ context "extremely nexted infraspecies" do
300
+ let(:text) do
301
+ "If we encounter Plantago major it is ok, but if it is " \
302
+ "Plantago quercus quercus quercus quercus quercus quercus " \
303
+ "quercus quercus quercus quercus quercus quercus quercus " \
304
+ "quercus, something is probably not right. However we take " \
305
+ "Plantago quercus quercus quercus quercus quercus by some " \
306
+ "strange reason. Well, the reason is this kind of thing -- " \
307
+ "Pardosa moesta var. moesta f. moesta or something like that"
308
+ end
309
+
310
+ it "stops at five infraspecies levels" do
311
+ res = tf.find(text)
312
+ expect(res).to eq(
313
+ {names: [
314
+ {verbatim: "Plantago major", scientificName: "Plantago major",
315
+ offsetStart: 16, offsetEnd: 29},
316
+ {verbatim: "Plantago quercus quercus quercus quercus quercus",
317
+ scientificName: "Plantago quercus quercus quercus quercus quercus",
318
+ offsetStart: 225, offsetEnd: 272},
319
+ {verbatim: "Pardosa moesta var. moesta f. moesta",
320
+ scientificName: "Pardosa moesta var. moesta f. moesta",
321
+ offsetStart: 340, offsetEnd: 375}]}
322
+ )
323
+ end
147
324
  end
148
325
 
149
- it "should ignore abbreviated genus before family for TaxonFinder" do
150
- text = "What happens another called P. (LYCOSIDAE) is the species?"
151
- res = @tf.find(text)
152
- res[:names].size.should == 1
153
- res.should == {:names=>[{:verbatim=>"(LYCOSIDAE)", :scientificName=>"Lycosidae", :offsetStart=>32, :offsetEnd=>42}]}
326
+ context "nested names" do
327
+ let(:text) do
328
+ "What happens another called Pardosa moesta (Araneae: Lycosidae) is " \
329
+ "the species?"
330
+ end
331
+
332
+ it "(TF) handles nested names in one cycle" do
333
+ res = tf.find(text)
334
+ expect(res).to eq (
335
+ {names: [
336
+ {verbatim: "Pardosa moesta", scientificName: "Pardosa moesta",
337
+ offsetStart: 29, offsetEnd: 42},
338
+ {verbatim: "(Araneae:", scientificName: "Araneae",
339
+ offsetStart: 44, offsetEnd: 52},
340
+ {verbatim: "Lycosidae)", scientificName: "Lycosidae",
341
+ offsetStart: 54, offsetEnd: 63}]}
342
+ )
343
+ end
154
344
  end
155
-
156
- it "should find names with diacrictics" do
157
- text = 'Mactra triangula Renieri. Fissurella nubécula Linnó.'
158
- res = @tf.find(text)
159
- res[:names].size.should == 2
160
- res.should == {:names=>[{:verbatim=>"Mactra triangula", :scientificName=>"Mactra triangula", :offsetStart=>0, :offsetEnd=>15}, {:verbatim=>"Fissurella nubécula", :scientificName=>"Fissurella nubécula", :offsetStart=>26, :offsetEnd=>44}]}
345
+
346
+ context "diacritics" do
347
+ let(:text) { "Mactra triangula Renieri. Fissurella nubécula Linnó." }
348
+
349
+ it "finds names with diacrictics" do
350
+ res = tf.find(text)
351
+ expect(res[:names].size).to be 2
352
+ expect(res).to eq(
353
+ {names: [
354
+ {verbatim: "Mactra triangula", scientificName: "Mactra triangula",
355
+ offsetStart: 0, offsetEnd: 15},
356
+ {verbatim: "Fissurella nubécula",
357
+ scientificName: "Fissurella nubécula",
358
+ offsetStart: 26, offsetEnd: 44}]}
359
+ )
360
+ end
161
361
  end
162
362
 
363
+ def read(file)
364
+ File.read(File.join(__dir__, "files", file))
365
+ end
163
366
  end