biodiversity 3.5.1 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +9 -6
- data/.ruby-version +1 -1
- data/.travis.yml +1 -6
- data/CHANGELOG +3 -0
- data/Gemfile +2 -0
- data/README.md +37 -178
- data/Rakefile +15 -48
- data/biodiversity.gemspec +18 -21
- data/clib/linux/libgnparser.h +93 -0
- data/clib/linux/libgnparser.so +0 -0
- data/clib/mac/libgnparser.h +93 -0
- data/clib/mac/libgnparser.so +0 -0
- data/lib/biodiversity.rb +4 -9
- data/lib/biodiversity/parser.rb +65 -281
- data/lib/biodiversity/version.rb +8 -1
- data/spec/lib/biodiversity_spec.rb +9 -0
- data/spec/lib/parser_spec.rb +38 -0
- data/spec/spec_helper.rb +4 -81
- metadata +27 -102
- data/.byebug_history +0 -18
- data/.document +0 -5
- data/examples/socket_client.rb +0 -25
- data/lib/biodiversity/guid.rb +0 -1
- data/lib/biodiversity/guid/lsid.rb +0 -16
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
- data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
- data/spec/biodiversity_spec.rb +0 -11
- data/spec/files/test_data.txt +0 -490
- data/spec/files/todo.txt +0 -55
- data/spec/guid/lsid.spec.rb +0 -15
- data/spec/parser/scientific_name_canonical_spec.rb +0 -36
- data/spec/parser/scientific_name_clean_spec.rb +0 -1137
- data/spec/parser/scientific_name_dirty_spec.rb +0 -165
- data/spec/parser/scientific_name_spec.rb +0 -193
@@ -1,165 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
describe ScientificNameDirty do
|
4
|
-
before(:all) do
|
5
|
-
set_parser(ScientificNameDirtyParser.new)
|
6
|
-
end
|
7
|
-
|
8
|
-
it 'parses clean names' do
|
9
|
-
expect(parse('Betula verucosa (L.) Bar. 1899')).to_not be_nil
|
10
|
-
end
|
11
|
-
|
12
|
-
it 'parses double parenthesis' do
|
13
|
-
sn = 'Eichornia crassipes ( (Martius) ) Solms-Laub.'
|
14
|
-
expect(parse(sn)).to_not be_nil
|
15
|
-
expect(value(sn)).to eq 'Eichornia crassipes (Martius) Solms-Laub.'
|
16
|
-
expect(details(sn)).to eq [{ genus: { string: 'Eichornia' },
|
17
|
-
species: { string: 'crassipes',
|
18
|
-
authorship: '( (Martius) ) Solms-Laub.',
|
19
|
-
combinationAuthorTeam: { authorTeam: 'Solms-Laub.',
|
20
|
-
author: ['Solms-Laub.'] },
|
21
|
-
basionymAuthorTeam: { authorTeam: 'Martius',
|
22
|
-
author: ['Martius'] } } }]
|
23
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 9], 10 => ['species', 19],
|
24
|
-
23 => ['author_word', 30],
|
25
|
-
34 => ['author_word', 45] })
|
26
|
-
end
|
27
|
-
|
28
|
-
it 'parses year without author' do
|
29
|
-
sn = 'Acarospora cratericola 1929'
|
30
|
-
expect(parse(sn)).to_not be_nil
|
31
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 10],
|
32
|
-
11 => ['species', 22], 23 => ['year', 27] })
|
33
|
-
expect(details(sn)).to eq [{ genus: { string: 'Acarospora' },
|
34
|
-
species: { string: 'cratericola',
|
35
|
-
year: '1929' } }]
|
36
|
-
end
|
37
|
-
|
38
|
-
it 'parses double years' do
|
39
|
-
sn = 'Tridentella tangeroae Bruce, 1987-92'
|
40
|
-
expect(parse(sn)).to_not be_nil
|
41
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 11], 12 => ['species', 21],
|
42
|
-
22 => ['author_word', 27], 29 => ['year', 36] })
|
43
|
-
expect(details(sn)).to eq [{ genus: { string: 'Tridentella' },
|
44
|
-
species: { string: 'tangeroae', authorship: 'Bruce, 1987-92',
|
45
|
-
basionymAuthorTeam: { authorTeam: 'Bruce', author: ['Bruce'],
|
46
|
-
year: '1987-92' } } }]
|
47
|
-
end
|
48
|
-
|
49
|
-
it 'parses dirty years' do
|
50
|
-
expect(parse('Tridentella tangeroae Bruce, 1988B')).to_not be_nil
|
51
|
-
expect(parse('Tridentella tangeroae Bruce, 1988b')).to_not be_nil
|
52
|
-
expect(parse('Tridentella tangeroae Bruce, 1988d')).to_not be_nil
|
53
|
-
sn = 'Tridentella tangeroae Bruce, 198?'
|
54
|
-
expect(parse(sn)).to_not be_nil
|
55
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 11], 12 => ['species', 21],
|
56
|
-
22 => ['author_word', 27], 29 => ['year', 33] })
|
57
|
-
end
|
58
|
-
|
59
|
-
it 'parses year with page number' do
|
60
|
-
sn = 'Gymnodactylus irregularis WERMUTH 1965: 54'
|
61
|
-
expect(parse(sn)).to_not be_nil
|
62
|
-
expect(value(sn)).to eq 'Gymnodactylus irregularis Wermuth 1965'
|
63
|
-
expect(details(sn)).to eq [{ genus: { string: 'Gymnodactylus' },
|
64
|
-
species: { string: 'irregularis', authorship: 'WERMUTH 1965: 54',
|
65
|
-
basionymAuthorTeam: { authorTeam: 'WERMUTH', author: ['Wermuth'],
|
66
|
-
year: '1965' } } }]
|
67
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 13], 14 => ['species', 25],
|
68
|
-
26 => ['author_word', 33], 34 => ['year', 38] })
|
69
|
-
end
|
70
|
-
|
71
|
-
it 'parses year with []' do
|
72
|
-
expect(parse('Anthoscopus Cabanis [1851]')).to_not be_nil
|
73
|
-
expect(value('Anthoscopus Cabanis [185?]')).
|
74
|
-
to eq 'Anthoscopus Cabanis (185?)'
|
75
|
-
expect(parse('Anthoscopus Cabanis [1851?]')).to_not be_nil
|
76
|
-
expect(value('Anthoscopus Cabanis [1851]')).
|
77
|
-
to eq 'Anthoscopus Cabanis (1851)'
|
78
|
-
sn = 'Anthoscopus Cabanis [1851?]'
|
79
|
-
expect(value(sn)).to eq 'Anthoscopus Cabanis (1851?)'
|
80
|
-
expect(details(sn)).to eq [{ uninomial: { string: 'Anthoscopus',
|
81
|
-
authorship: 'Cabanis [1851?]', basionymAuthorTeam:
|
82
|
-
{ authorTeam: 'Cabanis', author: ['Cabanis'],
|
83
|
-
approximate_year: '(1851?)' } } }]
|
84
|
-
expect(pos(sn)).to eq({ 0 => ['uninomial', 11],
|
85
|
-
12 => ['author_word', 19], 21 => ['year', 26] })
|
86
|
-
sn = 'Trismegistia monodii Ando, 1973 [1974]'
|
87
|
-
expect(parse(sn)).to_not be_nil
|
88
|
-
|
89
|
-
#should it be 'Trismegistia monodii Ando 1973 (1974)' instead?
|
90
|
-
expect(value(sn)).to eq 'Trismegistia monodii Ando 1973 (1974)'
|
91
|
-
|
92
|
-
expect(details(sn)).to eq [{ genus: { string: 'Trismegistia' },
|
93
|
-
species: { string: 'monodii', authorship: 'Ando, 1973 [1974]',
|
94
|
-
basionymAuthorTeam: { authorTeam: 'Ando', author: ['Ando'],
|
95
|
-
year: '1973', approximate_year: '(1974)' } } }]
|
96
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 12], 13 => ['species', 20],
|
97
|
-
21 => ['author_word', 25], 27 => ['year', 31], 33 => ['year', 37] })
|
98
|
-
expect(parse('Zygaena witti Wiegel [1973]')).to_not be_nil
|
99
|
-
sn = 'Deyeuxia coarctata Kunth, 1815 [1816]'
|
100
|
-
expect(parse(sn)).to_not be_nil
|
101
|
-
expect(pos(sn)).to eq({ 0 => ['genus', 8], 9 => ['species', 18],
|
102
|
-
19 => ['author_word', 24], 26 => ['year', 30],
|
103
|
-
32 => ['year', 36] })
|
104
|
-
end
|
105
|
-
|
106
|
-
it 'parses new stuff' do
|
107
|
-
sn = 'Zoropsis (TaKeoa) nishimurai Yaginuma, 1971' #skipping for now
|
108
|
-
sn = 'Campylobacter pyloridis Marshall et al.1985.'
|
109
|
-
expect(details(sn)).to eq [{ genus: { string: 'Campylobacter' },
|
110
|
-
species: { string: 'pyloridis', authorship: 'Marshall et al.1985.',
|
111
|
-
basionymAuthorTeam: { authorTeam: 'Marshall et al.',
|
112
|
-
author: ['Marshall et al.'], year: '1985' } } }]
|
113
|
-
sn = 'Beijerinckia derxii venezuelae corrig. Thompson and Skerman, 1981'
|
114
|
-
expect(details(sn)).to eq [{ genus: { string: 'Beijerinckia' },
|
115
|
-
species: { string: 'derxii' }, infraspecies: [{ string: 'venezuelae',
|
116
|
-
rank: 'n/a', authorship: 'Thompson and Skerman, 1981',
|
117
|
-
basionymAuthorTeam: { authorTeam: 'Thompson and Skerman',
|
118
|
-
author: ['Thompson', 'Skerman'], year: '1981' } }] }]
|
119
|
-
expect(details('Streptomyces parvisporogenes ignotus 1960')).
|
120
|
-
to eq [{ genus: { string: 'Streptomyces' },
|
121
|
-
species: { string: 'parvisporogenes' },
|
122
|
-
infraspecies: [{ string: 'ignotus', rank: 'n/a', year: '1960' }] }]
|
123
|
-
expect(details('Oscillaria caviae Simons 1920, according to Simons 1922')).
|
124
|
-
to eq [{ genus: { string: 'Oscillaria' }, species: { string: 'caviae',
|
125
|
-
authorship: 'Simons 1920', basionymAuthorTeam:
|
126
|
-
{ authorTeam: 'Simons', author: ['Simons'], year: '1920' } } }]
|
127
|
-
sn = 'Bacterium monocytogenes hominis\'\' Nyfeldt 1932'
|
128
|
-
expect(details(sn)).to eq [{ genus: { string: 'Bacterium' },
|
129
|
-
species: { string: 'monocytogenes' },
|
130
|
-
infraspecies: [{ string: 'hominis', rank: 'n/a' }] }]
|
131
|
-
sn = 'Choriozopella trägårdhi Lawrence, 1947'
|
132
|
-
expect(details(sn)).to eq [{ genus: { string: 'Choriozopella' },
|
133
|
-
species: { string: 'tragardhi', authorship: 'Lawrence, 1947',
|
134
|
-
basionymAuthorTeam: { authorTeam: 'Lawrence', author: ['Lawrence'],
|
135
|
-
year: '1947' } } }]
|
136
|
-
sn = 'Sparassus françoisi Simon, 1898'
|
137
|
-
expect(details(sn)).to eq [{ genus: { string: 'Sparassus' },
|
138
|
-
species: { string: 'francoisi', authorship: 'Simon, 1898',
|
139
|
-
basionymAuthorTeam: { authorTeam: 'Simon', author: ['Simon'],
|
140
|
-
year: '1898' } } }]
|
141
|
-
sn = 'Dyarcyops birói Kulczynski, 1908'
|
142
|
-
expect(details(sn)).to eq [{ genus: { string: 'Dyarcyops' },
|
143
|
-
species: { string: 'biroi', authorship: 'Kulczynski, 1908',
|
144
|
-
basionymAuthorTeam: { authorTeam: 'Kulczynski',
|
145
|
-
author: ['Kulczynski'], year: '1908' } } }]
|
146
|
-
end
|
147
|
-
|
148
|
-
it 'parses names with common utf-8 charactes' do
|
149
|
-
names = ['Rühlella','Sténométope laevissimus Bibron 1855',
|
150
|
-
'Döringina Ihering 1929'].each do |name|
|
151
|
-
expect(parse(name)).to_not be_nil
|
152
|
-
end
|
153
|
-
expect(details('Hirsutëlla mâle')).to eq [{ genus: { string: 'Hirsutella' },
|
154
|
-
species: { string: 'male' } }]
|
155
|
-
expect(details('Triticum repens vulgäre')).
|
156
|
-
to eq [{ genus: { string: 'Triticum' }, species: { string: 'repens' },
|
157
|
-
infraspecies: [{ string: 'vulgare', rank: 'n/a' }] }]
|
158
|
-
end
|
159
|
-
|
160
|
-
# AsterophUa japonica
|
161
|
-
# AsyTuktus ridiculw Parent 1931
|
162
|
-
# AtremOEa Staud 1870
|
163
|
-
|
164
|
-
|
165
|
-
end
|
@@ -1,193 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
#NOTE: this spec needs compiled treetop files.
|
4
|
-
|
5
|
-
describe ScientificNameParser do
|
6
|
-
before(:all) do
|
7
|
-
set_parser(ScientificNameParser.new)
|
8
|
-
end
|
9
|
-
|
10
|
-
it "returns version number" do
|
11
|
-
expect(ScientificNameParser.version).to match /^\d+\.\d+\.\d+/
|
12
|
-
end
|
13
|
-
|
14
|
-
it "fixes cases" do
|
15
|
-
names = [
|
16
|
-
["QUERCUS ALBA", "Quercus alba"],
|
17
|
-
["QUERCUS (QUERCUS) ALBA", "Quercus (Quercus) alba"],
|
18
|
-
["QÜERCUS", "Qüercus"],
|
19
|
-
["PARDOSA MOéSTA", "Pardosa moésta"],
|
20
|
-
]
|
21
|
-
names.each do |name, capitalization|
|
22
|
-
expect(ScientificNameParser::fix_case(name)).to eq capitalization
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
it "generates standardized json" do
|
27
|
-
read_test_file do |y|
|
28
|
-
expect(JSON.load(json(y[:name]))).to eq JSON.
|
29
|
-
load(y[:jsn]) unless y[:comment]
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
# it "generates new test_file" do
|
35
|
-
# new_test = open(File.expand_path(dir +
|
36
|
-
# "../../spec/parser/test_data_new.txt"),"w")
|
37
|
-
# read_test_file do |y|
|
38
|
-
# if y[:comment]
|
39
|
-
# new_test.write y[:comment]
|
40
|
-
# else
|
41
|
-
# name = y[:name]
|
42
|
-
# jsn = json(y[:name])# rescue puts(y[:name])
|
43
|
-
# new_test.write("#{name}|#{jsn}\n")
|
44
|
-
# end
|
45
|
-
# end
|
46
|
-
# end
|
47
|
-
|
48
|
-
it "generates reasonable output if parser failed" do
|
49
|
-
sn = "ddd sljlkj 3223452432"
|
50
|
-
expect(json(sn)).to eq "{\"scientificName\":" \
|
51
|
-
"{\"id\":\"3ebf93d9-b62a-5198-8715-4c8302f0a5d7\",\"parsed\":false," \
|
52
|
-
"\"parser_version\":\"test_version\"," \
|
53
|
-
"\"verbatim\":\"ddd sljlkj 3223452432\"}}"
|
54
|
-
end
|
55
|
-
|
56
|
-
it "shows version when the flag :show_version set to true" do
|
57
|
-
expect(parse("Homo sapiens")[:scientificName][:parser_version]).
|
58
|
-
to_not be_nil
|
59
|
-
end
|
60
|
-
|
61
|
-
it "shows version for not spelled names" do
|
62
|
-
expect(parse("not_a_name")[:scientificName][:parser_version]).to_not be_nil
|
63
|
-
end
|
64
|
-
|
65
|
-
it "generates version for viruses" do
|
66
|
-
expect(parse("Nile virus")[:scientificName][:parser_version]).to_not be_nil
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
describe "ScientificNameParser with ranked canonicals" do
|
71
|
-
before(:all) do
|
72
|
-
@parser = ScientificNameParser.new(canonical_with_rank: true)
|
73
|
-
end
|
74
|
-
|
75
|
-
it "does not influence output for uninomials and binomials" do
|
76
|
-
data = [
|
77
|
-
["Ekbainacanthus Yakowlew 1902","Ekbainacanthus"],
|
78
|
-
["Ekboarmia sagnesi herrerai Exposito 2007",
|
79
|
-
"Ekboarmia sagnesi herrerai"],
|
80
|
-
["Ekboarmia holli Oberthür", "Ekboarmia holli"]]
|
81
|
-
|
82
|
-
data.each do |d|
|
83
|
-
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
84
|
-
expect(parsed).to eq d[1]
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
it "preserves rank for ranked multinomials" do
|
89
|
-
data = [
|
90
|
-
["Cola cordifolia var. puberula A. Chev.",
|
91
|
-
"Cola cordifolia var. puberula"],
|
92
|
-
["Abies homolepis forma umbilicata (Mayr) Schelle",
|
93
|
-
"Abies homolepis forma umbilicata"],
|
94
|
-
["Quercus ilex ssp. ballota (Desf.) Samp",
|
95
|
-
"Quercus ilex ssp. ballota"],
|
96
|
-
["Physarum globuliferum forma. flavum Leontyev & Dudka",
|
97
|
-
"Physarum globuliferum forma. flavum"]
|
98
|
-
]
|
99
|
-
data.each do |d|
|
100
|
-
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
101
|
-
expect(parsed).to eq d[1]
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
describe ".add_rank_to_canonical" do
|
107
|
-
subject(:parser) { ScientificNameParser.new }
|
108
|
-
|
109
|
-
it "adds rank to infraspecies with rank" do
|
110
|
-
data = [
|
111
|
-
["Cola cordifolia var. puberula A. Chev.",
|
112
|
-
"Cola cordifolia puberula",
|
113
|
-
"Cola cordifolia var. puberula"],
|
114
|
-
["Abies homolepis forma umbilicata (Mayr) Schelle",
|
115
|
-
"Abies homolepis umbilicata",
|
116
|
-
"Abies homolepis forma umbilicata"],
|
117
|
-
["Quercus ilex ssp. ballota (Desf.) Samp",
|
118
|
-
"Quercus ilex ballota",
|
119
|
-
"Quercus ilex ssp. ballota"],
|
120
|
-
["Physarum globuliferum forma. flavum Leontyev & Dudka",
|
121
|
-
"Physarum globuliferum flavum",
|
122
|
-
"Physarum globuliferum forma. flavum"]
|
123
|
-
]
|
124
|
-
data.each do |d|
|
125
|
-
parsed = parser.parse(d[0])
|
126
|
-
canonical1 = parsed[:scientificName][:canonical]
|
127
|
-
expect(canonical1).to eq d[1]
|
128
|
-
ScientificNameParser.add_rank_to_canonical(parsed)
|
129
|
-
canonical2 = parsed[:scientificName][:canonical]
|
130
|
-
expect(canonical2).to eq d[2]
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
it "returns preprocessing tail if it exists" do
|
135
|
-
sn = "Stenometope laevissimus sensu Eschmeyer 2004"
|
136
|
-
res = parser.parse(sn)
|
137
|
-
expect(res).to_not be_nil
|
138
|
-
expect(res[:scientificName][:tail]).to eq "sensu Eschmeyer 2004"
|
139
|
-
end
|
140
|
-
|
141
|
-
it "does not work for hybrids yet" do
|
142
|
-
data = [["Corda X cordiflora var. puberula",
|
143
|
-
"Corda cordiflora puberula"]]
|
144
|
-
data.each do |d|
|
145
|
-
parsed = parser.parse(d[0])
|
146
|
-
canonical1 = parsed[:scientificName][:canonical]
|
147
|
-
expect(canonical1).to eq d[1]
|
148
|
-
ScientificNameParser.add_rank_to_canonical(parsed)
|
149
|
-
canonical2 = parsed[:scientificName][:canonical]
|
150
|
-
expect(canonical2).to eq d[1]
|
151
|
-
end
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
describe ParallelParser do
|
156
|
-
it "finds number of cpus" do
|
157
|
-
pparser = ParallelParser.new
|
158
|
-
expect(pparser.cpu_num).to be > 0
|
159
|
-
end
|
160
|
-
|
161
|
-
it "parses several names in parallel" do
|
162
|
-
names = []
|
163
|
-
read_test_file { |n| names << (n[:name]) if n[:name] }
|
164
|
-
names.uniq!
|
165
|
-
pparser = ParallelParser.new
|
166
|
-
res = pparser.parse(names)
|
167
|
-
expect(names.size).to be > 100
|
168
|
-
expect(res.keys.size).to eq names.size
|
169
|
-
end
|
170
|
-
|
171
|
-
it "parses several names in parallel with given num of processes" do
|
172
|
-
names = []
|
173
|
-
read_test_file { |n| names << (n[:name]) if n[:name] }
|
174
|
-
names.uniq!
|
175
|
-
pparser = ParallelParser.new(4)
|
176
|
-
res = pparser.parse(names)
|
177
|
-
expect(names.size).to be > 100
|
178
|
-
expect(res.keys.size).to eq names.size
|
179
|
-
end
|
180
|
-
|
181
|
-
it "has parsed name in native ruby format and in returned as \
|
182
|
-
a hash with name as a key and parsed data as value" do
|
183
|
-
names = []
|
184
|
-
read_test_file { |n| names << (n[:name]) if n[:name] }
|
185
|
-
names.uniq!
|
186
|
-
pparser = ParallelParser.new(4)
|
187
|
-
res = pparser.parse(names)
|
188
|
-
names.each_with_index do |name, i|
|
189
|
-
expect(res[name].is_a?(Hash)).to be true
|
190
|
-
expect(res[name][:scientificName][:verbatim]).to eq name
|
191
|
-
end
|
192
|
-
end
|
193
|
-
end
|