biodiversity 3.5.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +9 -6
- data/.ruby-version +1 -1
- data/.travis.yml +1 -6
- data/CHANGELOG +3 -0
- data/Gemfile +2 -0
- data/README.md +37 -178
- data/Rakefile +15 -48
- data/biodiversity.gemspec +18 -21
- data/clib/linux/libgnparser.h +93 -0
- data/clib/linux/libgnparser.so +0 -0
- data/clib/mac/libgnparser.h +93 -0
- data/clib/mac/libgnparser.so +0 -0
- data/lib/biodiversity.rb +4 -9
- data/lib/biodiversity/parser.rb +65 -281
- data/lib/biodiversity/version.rb +8 -1
- data/spec/lib/biodiversity_spec.rb +9 -0
- data/spec/lib/parser_spec.rb +38 -0
- data/spec/spec_helper.rb +4 -81
- metadata +27 -102
- data/.byebug_history +0 -18
- data/.document +0 -5
- data/examples/socket_client.rb +0 -25
- data/lib/biodiversity/guid.rb +0 -1
- data/lib/biodiversity/guid/lsid.rb +0 -16
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
- data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
- data/spec/biodiversity_spec.rb +0 -11
- data/spec/files/test_data.txt +0 -490
- data/spec/files/todo.txt +0 -55
- data/spec/guid/lsid.spec.rb +0 -15
- data/spec/parser/scientific_name_canonical_spec.rb +0 -36
- data/spec/parser/scientific_name_clean_spec.rb +0 -1137
- data/spec/parser/scientific_name_dirty_spec.rb +0 -165
- data/spec/parser/scientific_name_spec.rb +0 -193
@@ -1,165 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
describe ScientificNameDirty do
|
4
|
-
before(:all) do
|
5
|
-
set_parser(ScientificNameDirtyParser.new)
|
6
|
-
end
|
7
|
-
|
8
|
-
it 'parses clean names' do
|
9
|
-
expect(parse('Betula verucosa (L.) Bar. 1899')).to_not be_nil
|
10
|
-
end
|
11
|
-
|
12
|
-
it 'parses double parenthesis' do
|
13
|
-
sn = 'Eichornia crassipes ( (Martius) ) Solms-Laub.'
|
14
|
-
expect(parse(sn)).to_not be_nil
|
15
|
-
expect(value(sn)).to eq 'Eichornia crassipes (Martius) Solms-Laub.'
|
16
|
-
expect(details(sn)).to eq [{ genus: { string: 'Eichornia' },
|
17
|
-
species: { string: 'crassipes',
|
18
|
-
authorship: '( (Martius) ) Solms-Laub.',
|
19
|
-
combinationAuthorTeam: { authorTeam: 'Solms-Laub.',
|
20
|
-
author: ['Solms-Laub.'] },
|
21
|
-
basionymAuthorTeam: { authorTeam: 'Martius',
|
22
|
-
author: ['Martius'] } } }]
|
23
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 9], 10 => ['species', 19],
|
24
|
-
23 => ['author_word', 30],
|
25
|
-
34 => ['author_word', 45] })
|
26
|
-
end
|
27
|
-
|
28
|
-
it 'parses year without author' do
|
29
|
-
sn = 'Acarospora cratericola 1929'
|
30
|
-
expect(parse(sn)).to_not be_nil
|
31
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 10],
|
32
|
-
11 => ['species', 22], 23 => ['year', 27] })
|
33
|
-
expect(details(sn)).to eq [{ genus: { string: 'Acarospora' },
|
34
|
-
species: { string: 'cratericola',
|
35
|
-
year: '1929' } }]
|
36
|
-
end
|
37
|
-
|
38
|
-
it 'parses double years' do
|
39
|
-
sn = 'Tridentella tangeroae Bruce, 1987-92'
|
40
|
-
expect(parse(sn)).to_not be_nil
|
41
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 11], 12 => ['species', 21],
|
42
|
-
22 => ['author_word', 27], 29 => ['year', 36] })
|
43
|
-
expect(details(sn)).to eq [{ genus: { string: 'Tridentella' },
|
44
|
-
species: { string: 'tangeroae', authorship: 'Bruce, 1987-92',
|
45
|
-
basionymAuthorTeam: { authorTeam: 'Bruce', author: ['Bruce'],
|
46
|
-
year: '1987-92' } } }]
|
47
|
-
end
|
48
|
-
|
49
|
-
it 'parses dirty years' do
|
50
|
-
expect(parse('Tridentella tangeroae Bruce, 1988B')).to_not be_nil
|
51
|
-
expect(parse('Tridentella tangeroae Bruce, 1988b')).to_not be_nil
|
52
|
-
expect(parse('Tridentella tangeroae Bruce, 1988d')).to_not be_nil
|
53
|
-
sn = 'Tridentella tangeroae Bruce, 198?'
|
54
|
-
expect(parse(sn)).to_not be_nil
|
55
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 11], 12 => ['species', 21],
|
56
|
-
22 => ['author_word', 27], 29 => ['year', 33] })
|
57
|
-
end
|
58
|
-
|
59
|
-
it 'parses year with page number' do
|
60
|
-
sn = 'Gymnodactylus irregularis WERMUTH 1965: 54'
|
61
|
-
expect(parse(sn)).to_not be_nil
|
62
|
-
expect(value(sn)).to eq 'Gymnodactylus irregularis Wermuth 1965'
|
63
|
-
expect(details(sn)).to eq [{ genus: { string: 'Gymnodactylus' },
|
64
|
-
species: { string: 'irregularis', authorship: 'WERMUTH 1965: 54',
|
65
|
-
basionymAuthorTeam: { authorTeam: 'WERMUTH', author: ['Wermuth'],
|
66
|
-
year: '1965' } } }]
|
67
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 13], 14 => ['species', 25],
|
68
|
-
26 => ['author_word', 33], 34 => ['year', 38] })
|
69
|
-
end
|
70
|
-
|
71
|
-
it 'parses year with []' do
|
72
|
-
expect(parse('Anthoscopus Cabanis [1851]')).to_not be_nil
|
73
|
-
expect(value('Anthoscopus Cabanis [185?]')).
|
74
|
-
to eq 'Anthoscopus Cabanis (185?)'
|
75
|
-
expect(parse('Anthoscopus Cabanis [1851?]')).to_not be_nil
|
76
|
-
expect(value('Anthoscopus Cabanis [1851]')).
|
77
|
-
to eq 'Anthoscopus Cabanis (1851)'
|
78
|
-
sn = 'Anthoscopus Cabanis [1851?]'
|
79
|
-
expect(value(sn)).to eq 'Anthoscopus Cabanis (1851?)'
|
80
|
-
expect(details(sn)).to eq [{ uninomial: { string: 'Anthoscopus',
|
81
|
-
authorship: 'Cabanis [1851?]', basionymAuthorTeam:
|
82
|
-
{ authorTeam: 'Cabanis', author: ['Cabanis'],
|
83
|
-
approximate_year: '(1851?)' } } }]
|
84
|
-
expect(pos(sn)).to eq({ 0 => ['uninomial', 11],
|
85
|
-
12 => ['author_word', 19], 21 => ['year', 26] })
|
86
|
-
sn = 'Trismegistia monodii Ando, 1973 [1974]'
|
87
|
-
expect(parse(sn)).to_not be_nil
|
88
|
-
|
89
|
-
#should it be 'Trismegistia monodii Ando 1973 (1974)' instead?
|
90
|
-
expect(value(sn)).to eq 'Trismegistia monodii Ando 1973 (1974)'
|
91
|
-
|
92
|
-
expect(details(sn)).to eq [{ genus: { string: 'Trismegistia' },
|
93
|
-
species: { string: 'monodii', authorship: 'Ando, 1973 [1974]',
|
94
|
-
basionymAuthorTeam: { authorTeam: 'Ando', author: ['Ando'],
|
95
|
-
year: '1973', approximate_year: '(1974)' } } }]
|
96
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 12], 13 => ['species', 20],
|
97
|
-
21 => ['author_word', 25], 27 => ['year', 31], 33 => ['year', 37] })
|
98
|
-
expect(parse('Zygaena witti Wiegel [1973]')).to_not be_nil
|
99
|
-
sn = 'Deyeuxia coarctata Kunth, 1815 [1816]'
|
100
|
-
expect(parse(sn)).to_not be_nil
|
101
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 8], 9 => ['species', 18],
|
102
|
-
19 => ['author_word', 24], 26 => ['year', 30],
|
103
|
-
32 => ['year', 36] })
|
104
|
-
end
|
105
|
-
|
106
|
-
it 'parses new stuff' do
|
107
|
-
sn = 'Zoropsis (TaKeoa) nishimurai Yaginuma, 1971' #skipping for now
|
108
|
-
sn = 'Campylobacter pyloridis Marshall et al.1985.'
|
109
|
-
expect(details(sn)).to eq [{ genus: { string: 'Campylobacter' },
|
110
|
-
species: { string: 'pyloridis', authorship: 'Marshall et al.1985.',
|
111
|
-
basionymAuthorTeam: { authorTeam: 'Marshall et al.',
|
112
|
-
author: ['Marshall et al.'], year: '1985' } } }]
|
113
|
-
sn = 'Beijerinckia derxii venezuelae corrig. Thompson and Skerman, 1981'
|
114
|
-
expect(details(sn)).to eq [{ genus: { string: 'Beijerinckia' },
|
115
|
-
species: { string: 'derxii' }, infraspecies: [{ string: 'venezuelae',
|
116
|
-
rank: 'n/a', authorship: 'Thompson and Skerman, 1981',
|
117
|
-
basionymAuthorTeam: { authorTeam: 'Thompson and Skerman',
|
118
|
-
author: ['Thompson', 'Skerman'], year: '1981' } }] }]
|
119
|
-
expect(details('Streptomyces parvisporogenes ignotus 1960')).
|
120
|
-
to eq [{ genus: { string: 'Streptomyces' },
|
121
|
-
species: { string: 'parvisporogenes' },
|
122
|
-
infraspecies: [{ string: 'ignotus', rank: 'n/a', year: '1960' }] }]
|
123
|
-
expect(details('Oscillaria caviae Simons 1920, according to Simons 1922')).
|
124
|
-
to eq [{ genus: { string: 'Oscillaria' }, species: { string: 'caviae',
|
125
|
-
authorship: 'Simons 1920', basionymAuthorTeam:
|
126
|
-
{ authorTeam: 'Simons', author: ['Simons'], year: '1920' } } }]
|
127
|
-
sn = 'Bacterium monocytogenes hominis\'\' Nyfeldt 1932'
|
128
|
-
expect(details(sn)).to eq [{ genus: { string: 'Bacterium' },
|
129
|
-
species: { string: 'monocytogenes' },
|
130
|
-
infraspecies: [{ string: 'hominis', rank: 'n/a' }] }]
|
131
|
-
sn = 'Choriozopella trägårdhi Lawrence, 1947'
|
132
|
-
expect(details(sn)).to eq [{ genus: { string: 'Choriozopella' },
|
133
|
-
species: { string: 'tragardhi', authorship: 'Lawrence, 1947',
|
134
|
-
basionymAuthorTeam: { authorTeam: 'Lawrence', author: ['Lawrence'],
|
135
|
-
year: '1947' } } }]
|
136
|
-
sn = 'Sparassus françoisi Simon, 1898'
|
137
|
-
expect(details(sn)).to eq [{ genus: { string: 'Sparassus' },
|
138
|
-
species: { string: 'francoisi', authorship: 'Simon, 1898',
|
139
|
-
basionymAuthorTeam: { authorTeam: 'Simon', author: ['Simon'],
|
140
|
-
year: '1898' } } }]
|
141
|
-
sn = 'Dyarcyops birói Kulczynski, 1908'
|
142
|
-
expect(details(sn)).to eq [{ genus: { string: 'Dyarcyops' },
|
143
|
-
species: { string: 'biroi', authorship: 'Kulczynski, 1908',
|
144
|
-
basionymAuthorTeam: { authorTeam: 'Kulczynski',
|
145
|
-
author: ['Kulczynski'], year: '1908' } } }]
|
146
|
-
end
|
147
|
-
|
148
|
-
it 'parses names with common utf-8 charactes' do
|
149
|
-
names = ['Rühlella','Sténométope laevissimus Bibron 1855',
|
150
|
-
'Döringina Ihering 1929'].each do |name|
|
151
|
-
expect(parse(name)).to_not be_nil
|
152
|
-
end
|
153
|
-
expect(details('Hirsutëlla mâle')).to eq [{ genus: { string: 'Hirsutella' },
|
154
|
-
species: { string: 'male' } }]
|
155
|
-
expect(details('Triticum repens vulgäre')).
|
156
|
-
to eq [{ genus: { string: 'Triticum' }, species: { string: 'repens' },
|
157
|
-
infraspecies: [{ string: 'vulgare', rank: 'n/a' }] }]
|
158
|
-
end
|
159
|
-
|
160
|
-
# AsterophUa japonica
|
161
|
-
# AsyTuktus ridiculw Parent 1931
|
162
|
-
# AtremOEa Staud 1870
|
163
|
-
|
164
|
-
|
165
|
-
end
|
@@ -1,193 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
#NOTE: this spec needs compiled treetop files.
|
4
|
-
|
5
|
-
describe ScientificNameParser do
|
6
|
-
before(:all) do
|
7
|
-
set_parser(ScientificNameParser.new)
|
8
|
-
end
|
9
|
-
|
10
|
-
it "returns version number" do
|
11
|
-
expect(ScientificNameParser.version).to match /^\d+\.\d+\.\d+/
|
12
|
-
end
|
13
|
-
|
14
|
-
it "fixes cases" do
|
15
|
-
names = [
|
16
|
-
["QUERCUS ALBA", "Quercus alba"],
|
17
|
-
["QUERCUS (QUERCUS) ALBA", "Quercus (Quercus) alba"],
|
18
|
-
["QÜERCUS", "Qüercus"],
|
19
|
-
["PARDOSA MOéSTA", "Pardosa moésta"],
|
20
|
-
]
|
21
|
-
names.each do |name, capitalization|
|
22
|
-
expect(ScientificNameParser::fix_case(name)).to eq capitalization
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
it "generates standardized json" do
|
27
|
-
read_test_file do |y|
|
28
|
-
expect(JSON.load(json(y[:name]))).to eq JSON.
|
29
|
-
load(y[:jsn]) unless y[:comment]
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
# it "generates new test_file" do
|
35
|
-
# new_test = open(File.expand_path(dir +
|
36
|
-
# "../../spec/parser/test_data_new.txt"),"w")
|
37
|
-
# read_test_file do |y|
|
38
|
-
# if y[:comment]
|
39
|
-
# new_test.write y[:comment]
|
40
|
-
# else
|
41
|
-
# name = y[:name]
|
42
|
-
# jsn = json(y[:name])# rescue puts(y[:name])
|
43
|
-
# new_test.write("#{name}|#{jsn}\n")
|
44
|
-
# end
|
45
|
-
# end
|
46
|
-
# end
|
47
|
-
|
48
|
-
it "generates reasonable output if parser failed" do
|
49
|
-
sn = "ddd sljlkj 3223452432"
|
50
|
-
expect(json(sn)).to eq "{\"scientificName\":" \
|
51
|
-
"{\"id\":\"3ebf93d9-b62a-5198-8715-4c8302f0a5d7\",\"parsed\":false," \
|
52
|
-
"\"parser_version\":\"test_version\"," \
|
53
|
-
"\"verbatim\":\"ddd sljlkj 3223452432\"}}"
|
54
|
-
end
|
55
|
-
|
56
|
-
it "shows version when the flag :show_version set to true" do
|
57
|
-
expect(parse("Homo sapiens")[:scientificName][:parser_version]).
|
58
|
-
to_not be_nil
|
59
|
-
end
|
60
|
-
|
61
|
-
it "shows version for not spelled names" do
|
62
|
-
expect(parse("not_a_name")[:scientificName][:parser_version]).to_not be_nil
|
63
|
-
end
|
64
|
-
|
65
|
-
it "generates version for viruses" do
|
66
|
-
expect(parse("Nile virus")[:scientificName][:parser_version]).to_not be_nil
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
describe "ScientificNameParser with ranked canonicals" do
|
71
|
-
before(:all) do
|
72
|
-
@parser = ScientificNameParser.new(canonical_with_rank: true)
|
73
|
-
end
|
74
|
-
|
75
|
-
it "does not influence output for uninomials and binomials" do
|
76
|
-
data = [
|
77
|
-
["Ekbainacanthus Yakowlew 1902","Ekbainacanthus"],
|
78
|
-
["Ekboarmia sagnesi herrerai Exposito 2007",
|
79
|
-
"Ekboarmia sagnesi herrerai"],
|
80
|
-
["Ekboarmia holli Oberthür", "Ekboarmia holli"]]
|
81
|
-
|
82
|
-
data.each do |d|
|
83
|
-
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
84
|
-
expect(parsed).to eq d[1]
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
it "preserves rank for ranked multinomials" do
|
89
|
-
data = [
|
90
|
-
["Cola cordifolia var. puberula A. Chev.",
|
91
|
-
"Cola cordifolia var. puberula"],
|
92
|
-
["Abies homolepis forma umbilicata (Mayr) Schelle",
|
93
|
-
"Abies homolepis forma umbilicata"],
|
94
|
-
["Quercus ilex ssp. ballota (Desf.) Samp",
|
95
|
-
"Quercus ilex ssp. ballota"],
|
96
|
-
["Physarum globuliferum forma. flavum Leontyev & Dudka",
|
97
|
-
"Physarum globuliferum forma. flavum"]
|
98
|
-
]
|
99
|
-
data.each do |d|
|
100
|
-
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
101
|
-
expect(parsed).to eq d[1]
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
describe ".add_rank_to_canonical" do
|
107
|
-
subject(:parser) { ScientificNameParser.new }
|
108
|
-
|
109
|
-
it "adds rank to infraspecies with rank" do
|
110
|
-
data = [
|
111
|
-
["Cola cordifolia var. puberula A. Chev.",
|
112
|
-
"Cola cordifolia puberula",
|
113
|
-
"Cola cordifolia var. puberula"],
|
114
|
-
["Abies homolepis forma umbilicata (Mayr) Schelle",
|
115
|
-
"Abies homolepis umbilicata",
|
116
|
-
"Abies homolepis forma umbilicata"],
|
117
|
-
["Quercus ilex ssp. ballota (Desf.) Samp",
|
118
|
-
"Quercus ilex ballota",
|
119
|
-
"Quercus ilex ssp. ballota"],
|
120
|
-
["Physarum globuliferum forma. flavum Leontyev & Dudka",
|
121
|
-
"Physarum globuliferum flavum",
|
122
|
-
"Physarum globuliferum forma. flavum"]
|
123
|
-
]
|
124
|
-
data.each do |d|
|
125
|
-
parsed = parser.parse(d[0])
|
126
|
-
canonical1 = parsed[:scientificName][:canonical]
|
127
|
-
expect(canonical1).to eq d[1]
|
128
|
-
ScientificNameParser.add_rank_to_canonical(parsed)
|
129
|
-
canonical2 = parsed[:scientificName][:canonical]
|
130
|
-
expect(canonical2).to eq d[2]
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
it "returns preprocessing tail if it exists" do
|
135
|
-
sn = "Stenometope laevissimus sensu Eschmeyer 2004"
|
136
|
-
res = parser.parse(sn)
|
137
|
-
expect(res).to_not be_nil
|
138
|
-
expect(res[:scientificName][:tail]).to eq "sensu Eschmeyer 2004"
|
139
|
-
end
|
140
|
-
|
141
|
-
it "does not work for hybrids yet" do
|
142
|
-
data = [["Corda X cordiflora var. puberula",
|
143
|
-
"Corda cordiflora puberula"]]
|
144
|
-
data.each do |d|
|
145
|
-
parsed = parser.parse(d[0])
|
146
|
-
canonical1 = parsed[:scientificName][:canonical]
|
147
|
-
expect(canonical1).to eq d[1]
|
148
|
-
ScientificNameParser.add_rank_to_canonical(parsed)
|
149
|
-
canonical2 = parsed[:scientificName][:canonical]
|
150
|
-
expect(canonical2).to eq d[1]
|
151
|
-
end
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
describe ParallelParser do
|
156
|
-
it "finds number of cpus" do
|
157
|
-
pparser = ParallelParser.new
|
158
|
-
expect(pparser.cpu_num).to be > 0
|
159
|
-
end
|
160
|
-
|
161
|
-
it "parses several names in parallel" do
|
162
|
-
names = []
|
163
|
-
read_test_file { |n| names << (n[:name]) if n[:name] }
|
164
|
-
names.uniq!
|
165
|
-
pparser = ParallelParser.new
|
166
|
-
res = pparser.parse(names)
|
167
|
-
expect(names.size).to be > 100
|
168
|
-
expect(res.keys.size).to eq names.size
|
169
|
-
end
|
170
|
-
|
171
|
-
it "parses several names in parallel with given num of processes" do
|
172
|
-
names = []
|
173
|
-
read_test_file { |n| names << (n[:name]) if n[:name] }
|
174
|
-
names.uniq!
|
175
|
-
pparser = ParallelParser.new(4)
|
176
|
-
res = pparser.parse(names)
|
177
|
-
expect(names.size).to be > 100
|
178
|
-
expect(res.keys.size).to eq names.size
|
179
|
-
end
|
180
|
-
|
181
|
-
it "has parsed name in native ruby format and in returned as \
|
182
|
-
a hash with name as a key and parsed data as value" do
|
183
|
-
names = []
|
184
|
-
read_test_file { |n| names << (n[:name]) if n[:name] }
|
185
|
-
names.uniq!
|
186
|
-
pparser = ParallelParser.new(4)
|
187
|
-
res = pparser.parse(names)
|
188
|
-
names.each_with_index do |name, i|
|
189
|
-
expect(res[name].is_a?(Hash)).to be true
|
190
|
-
expect(res[name][:scientificName][:verbatim]).to eq name
|
191
|
-
end
|
192
|
-
end
|
193
|
-
end
|