dimus-biodiversity 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/biodiversity/parser.rb +6 -5
- data/lib/biodiversity/parser/scientific_name_canonical.rb +142 -91
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +8 -6
- data/lib/biodiversity/parser/scientific_name_clean.rb +864 -375
- data/lib/biodiversity/parser/scientific_name_clean.treetop +47 -26
- data/lib/biodiversity/parser/scientific_name_dirty.rb +421 -5
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +90 -2
- data/spec/parser/scientific_name.spec.rb +7 -30
- data/spec/parser/scientific_name_canonical.spec.rb +4 -29
- data/spec/parser/scientific_name_clean.spec.rb +31 -27
- data/spec/parser/scientific_name_dirty.spec.rb +19 -45
- metadata +1 -1
@@ -5,6 +5,68 @@ grammar ScientificNameDirty
|
|
5
5
|
# Entry rule of the dirty grammar. It adds nothing of its own: `super` hands
# parsing to the same-named rule of the grammar this one extends.
rule root
  super
end
|
8
|
+
|
9
|
+
# Accepts a `scientific_name_4` match followed by `garbage` and reports only
# the name part: every accessor below forwards to the labelled name node `a`,
# so the garbage tail is consumed but contributes nothing to the result.
# When that sequence does not match, the inherited rule is tried via `super`.
rule scientific_name_5
  a:scientific_name_4 garbage {
    # Structured breakdown of the name node alone.
    def details
      a.details
    end

    # Position map of the name node alone.
    def pos
      a.pos
    end

    # Canonical form of the name node alone.
    def canonical
      a.canonical
    end

    # Normalised value of the name node alone.
    def value
      a.value
    end
  }
  /
  super
end
|
30
|
+
|
31
|
+
# Dirty-input variants of `infraspecies`, tried in order before the inherited
# rule:
#   1. epithet followed directly by a year
#      (e.g. "Streptomyces parvisporogenes ignotus 1960");
#   2. epithet, the `epitheton_authorship_inconsistencies` marker, then an
#      authorship
#      (e.g. "Beijerinckia derxii venezuelae corrig. Thompson and Skerman, 1981").
# In both variants the canonical form is the epithet's own, and the details of
# `b` are merged into the epithet's :infraspecies hash.
rule infraspecies
  a:infraspecies_epitheton space b:year {
    def canonical
      a.canonical
    end

    # Epithet value and year value separated by a single space.
    def value
      epithet = a.value
      epithet + " " + b.value
    end

    def details
      merged = a.details[:infraspecies].merge(b.details)
      {:infraspecies => merged}
    end

    def pos
      epithet_pos = a.pos
      epithet_pos.merge(b.pos)
    end
  }
  /
  a:infraspecies_epitheton space epitheton_authorship_inconsistencies space b:authorship {
    def canonical
      a.canonical
    end

    # Epithet value and authorship value separated by a single space; the
    # marker matched between them is not part of the value.
    def value
      epithet = a.value
      epithet + " " + b.value
    end

    def details
      merged = a.details[:infraspecies].merge(b.details)
      {:infraspecies => merged}
    end

    def pos
      epithet_pos = a.pos
      epithet_pos.merge(b.pos)
    end
  }
  /
  super
end
|
8
70
|
|
9
71
|
rule species
|
10
72
|
a:species_epitheton space b:year {
|
@@ -33,8 +95,6 @@ grammar ScientificNameDirty
|
|
33
95
|
/
|
34
96
|
super
|
35
97
|
end
|
36
|
-
|
37
|
-
|
38
98
|
|
39
99
|
rule left_paren
|
40
100
|
"(" space "("
|
@@ -71,6 +131,8 @@ grammar ScientificNameDirty
|
|
71
131
|
end
|
72
132
|
}
|
73
133
|
/
|
134
|
+
year_number_with_punctuation
|
135
|
+
/
|
74
136
|
approximate_year
|
75
137
|
/
|
76
138
|
double_year
|
@@ -111,6 +173,23 @@ grammar ScientificNameDirty
|
|
111
173
|
}
|
112
174
|
end
|
113
175
|
|
176
|
+
# A `year_number` immediately followed by a literal "." (as in "et al.1985.").
# The reported value is the text of the year node only, without the dot.
rule year_number_with_punctuation
  a:year_number "." {
    def details
      {:year => value}
    end

    # NOTE(review): `interval` is taken from this whole node, not from `a`, so
    # the reported end presumably lies past the trailing "." - confirm that
    # this is intended.
    def pos
      span = interval
      {span.begin => ['year', span.end]}
    end

    def value
      a.text_value
    end
  }
end
|
191
|
+
|
192
|
+
|
114
193
|
rule page_number
|
115
194
|
":" space [\d]+
|
116
195
|
{
|
@@ -119,5 +198,14 @@ grammar ScientificNameDirty
|
|
119
198
|
}
|
120
199
|
end
|
121
200
|
|
201
|
+
# Literal marker that the `infraspecies` rule allows between an epithet and
# its authorship. Only "corrig." is recognised.
rule epitheton_authorship_inconsistencies
  "corrig."
end
|
204
|
+
|
205
|
+
# Trailing text that `scientific_name_5` consumes after a recognised name.
# Two shapes, tried in order:
#   * `space`, one of the characters " ' , . then `space`, then any run
#     (possibly empty) of characters other than щ;
#   * `space_hard` followed by at least one character other than ш.
# NOTE(review): the negated classes presumably stand in for "any character";
# the two alternatives exclude different Cyrillic letters (щ vs ш) - confirm
# whether that difference is intentional.
rule garbage
  space ["',.] space [^щ]*
  /
  space_hard [^ш]+
end
|
122
210
|
|
123
211
|
end
|
@@ -1,39 +1,11 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'spec'
|
3
|
-
require 'treetop'
|
4
|
-
require 'yaml'
|
5
|
-
|
6
1
|
#NOTE: this spec needs compiled treetop files.
|
7
2
|
dir = File.dirname("__FILE__")
|
3
|
+
# Resolve both files relative to this spec file itself. The previous form
# built the path from File.dirname("__FILE__") - the dirname of a string
# literal, i.e. "." - so it only worked when the process was started from the
# project root.
require File.expand_path('../spec_helper', __FILE__)
require File.expand_path('../../../lib/biodiversity/parser', __FILE__)
|
9
5
|
|
10
6
|
describe ScientificNameClean do
|
11
7
|
before(:all) do
|
12
|
-
|
13
|
-
end
|
14
|
-
|
15
|
-
def parse(input)
|
16
|
-
@parser.parse(input)
|
17
|
-
end
|
18
|
-
|
19
|
-
def value(input)
|
20
|
-
parse(input).value
|
21
|
-
end
|
22
|
-
|
23
|
-
def canonical(input)
|
24
|
-
parse(input).canonical
|
25
|
-
end
|
26
|
-
|
27
|
-
def details(input)
|
28
|
-
parse(input).details
|
29
|
-
end
|
30
|
-
|
31
|
-
def pos(input)
|
32
|
-
parse(input).pos
|
33
|
-
end
|
34
|
-
|
35
|
-
def json(input)
|
36
|
-
parse(input).to_json
|
8
|
+
set_parser(ScientificNameParser.new)
|
37
9
|
end
|
38
10
|
|
39
11
|
it 'should generate standardized json' do
|
@@ -44,5 +16,10 @@ describe ScientificNameClean do
|
|
44
16
|
JSON.load(json(name)).should == JSON.load(jsn)
|
45
17
|
end
|
46
18
|
end
|
19
|
+
|
20
|
+
# An unparseable string must still produce a JSON document that flags the
# failure and echoes the input verbatim.
it 'should generate reasonable output if parser failed' do
  sn = 'ddd sljlkj 3223452432'
  expected = '{"scientificName":{"parsed":false,"verbatim":"ddd sljlkj 3223452432"}}'
  # Compare decoded structures rather than raw text, as the other JSON example
  # in this spec does, so serializer key order cannot cause a false failure.
  JSON.load(json(sn)).should == JSON.load(expected)
end
|
47
24
|
|
48
25
|
end
|
@@ -1,38 +1,12 @@
|
|
1
|
+
# encoding: UTF-8
|
1
2
|
dir = File.dirname("__FILE__")
|
2
|
-
require '
|
3
|
-
require 'spec'
|
4
|
-
require 'treetop'
|
5
|
-
require 'yaml'
|
6
|
-
|
7
|
-
Treetop.load(File.expand_path(dir + '../../lib/biodiversity/parser/scientific_name_clean'))
|
8
|
-
Treetop.load(File.expand_path(dir + '../../lib/biodiversity/parser/scientific_name_dirty'))
|
9
|
-
Treetop.load(File.expand_path(dir + '../../lib/biodiversity/parser/scientific_name_canonical'))
|
10
|
-
|
3
|
+
# Resolve the helper relative to this spec file; the dir-based form relied on
# File.dirname("__FILE__") (a string literal, so always ".") and therefore on
# the process being started from the project root.
require File.expand_path('../spec_helper', __FILE__)
|
11
4
|
|
12
5
|
describe ScientificNameCanonical do
|
13
6
|
before(:all) do
|
14
|
-
|
15
|
-
end
|
16
|
-
|
17
|
-
def parse(input)
|
18
|
-
@parser.parse(input)
|
19
|
-
end
|
20
|
-
|
21
|
-
def value(input)
|
22
|
-
parse(input).value
|
7
|
+
set_parser(ScientificNameCanonicalParser.new)
|
23
8
|
end
|
24
9
|
|
25
|
-
def canonical(input)
|
26
|
-
parse(input).canonical
|
27
|
-
end
|
28
|
-
|
29
|
-
def details(input)
|
30
|
-
parse(input).details
|
31
|
-
end
|
32
|
-
|
33
|
-
def pos(input)
|
34
|
-
parse(input).pos
|
35
|
-
end
|
36
10
|
|
37
11
|
it 'should parse names with valid name part and unparseable rest' do
|
38
12
|
[
|
@@ -48,4 +22,5 @@ describe ScientificNameCanonical do
|
|
48
22
|
pos(n[0]).should == n[3]
|
49
23
|
end
|
50
24
|
end
|
25
|
+
|
51
26
|
end
|
@@ -1,35 +1,11 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
dir = File.dirname("__FILE__")
|
3
|
-
require '
|
4
|
-
require 'spec'
|
5
|
-
require 'yaml'
|
6
|
-
require 'treetop'
|
3
|
+
# Resolve the helper relative to this spec file; the dir-based form relied on
# File.dirname("__FILE__") (a string literal, so always ".") and therefore on
# the process being started from the project root.
require File.expand_path('../spec_helper', __FILE__)
|
7
4
|
|
8
|
-
Treetop.load(File.expand_path(dir + '../../lib/biodiversity/parser/scientific_name_clean'))
|
9
5
|
|
10
6
|
describe ScientificNameClean do
|
11
7
|
before(:all) do
|
12
|
-
|
13
|
-
end
|
14
|
-
|
15
|
-
def parse(input)
|
16
|
-
@parser.parse(input)
|
17
|
-
end
|
18
|
-
|
19
|
-
def value(input)
|
20
|
-
parse(input).value
|
21
|
-
end
|
22
|
-
|
23
|
-
def canonical(input)
|
24
|
-
parse(input).canonical
|
25
|
-
end
|
26
|
-
|
27
|
-
def details(input)
|
28
|
-
parse(input).details
|
29
|
-
end
|
30
|
-
|
31
|
-
def pos(input)
|
32
|
-
parse(input).pos
|
8
|
+
set_parser(ScientificNameCleanParser.new)
|
33
9
|
end
|
34
10
|
|
35
11
|
it 'should parse uninomial' do
|
@@ -312,7 +288,7 @@ describe ScientificNameClean do
|
|
312
288
|
sn = "Gastrosericus eremorum von Beaumont 1955"
|
313
289
|
canonical(sn).should == 'Gastrosericus eremorum'
|
314
290
|
sn = "Cypraeovula (Luponia) amphithales perdentata"
|
315
|
-
canonical(sn).should == 'Cypraeovula amphithales perdentata'
|
291
|
+
canonical(sn).should == 'Cypraeovula Luponia amphithales perdentata'
|
316
292
|
details(sn).should == {:genus=>{:epitheton=>"Cypraeovula"}, :subgenus=>{:epitheton=>"Luponia"}, :species=>{:epitheton=>"amphithales"}, :infraspecies=>{:epitheton=>"perdentata", :rank=>"n/a"}}
|
317
293
|
sn = "Polyrhachis orsyllus nat musculus Forel 1901"
|
318
294
|
canonical(sn).should == "Polyrhachis orsyllus musculus"
|
@@ -490,5 +466,33 @@ describe ScientificNameClean do
|
|
490
466
|
parse("Trematоsphaeria phaeáapora").should be_nil #cyrillic o
|
491
467
|
end
|
492
468
|
|
469
|
+
# Regression examples for names that earlier grammar versions could not
# handle: "in" citations, diacritics in epithets, "aff.", lower-case author
# prefixes (de, van, ab, di) and trailing "non ..." clauses.
it "should parse new stuff" do
  sn = 'Nesticus quelpartensis Paik & Namkung, in Paik, Yaginuma & Namkung, 1969'
  details(sn).should == {:genus=>{:epitheton=>"Nesticus"}, :species=>{:epitheton=>"quelpartensis", :authorship=>"Paik & Namkung, in Paik, Yaginuma & Namkung, 1969", :basionymAuthorTeam=>{:authorTeam=>"Paik & Namkung", :author=>["Paik", "Namkung"], :exAuthorTeam=>{:authorTeam=>"Paik, Yaginuma & Namkung", :author=>["Paik", "Yaginuma", "Namkung"], :year=>"1969"}}}}
  parse('Dipoena yoshidai Ono, in Ono et al., 1991').should_not be_nil
  sn = 'Choriozopella trägårdhi Lawrence, 1947'
  details(sn).should == {:genus=>{:epitheton=>"Choriozopella"}, :species=>{:epitheton=>"trägårdhi", :authorship=>"Lawrence, 1947", :basionymAuthorTeam=>{:authorTeam=>"Lawrence", :author=>["Lawrence"], :year=>"1947"}}}
  sn = 'Latrodectus mactans bishopi Kaston, 1938'
  details(sn).should == {:genus=>{:epitheton=>"Latrodectus"}, :species=>{:epitheton=>"mactans"}, :infraspecies=>{:epitheton=>"bishopi", :rank=>"n/a", :authorship=>"Kaston, 1938", :basionymAuthorTeam=>{:authorTeam=>"Kaston", :author=>["Kaston"], :year=>"1938"}}}
  sn = 'Diplocephalus aff. procerus Thaler, 1972'
  details(sn).should == {:genus=>{:epitheton=>"Diplocephalus"}, :species=>{:epitheton=>"procerus", :authorship=>"Thaler, 1972", :basionymAuthorTeam=>{:authorTeam=>"Thaler", :author=>["Thaler"], :year=>"1972"}}}
  sn = 'Dyarcyops birói Kulczynski, 1908'
  details(sn).should == {:genus=>{:epitheton=>"Dyarcyops"}, :species=>{:epitheton=>"birói", :authorship=>"Kulczynski, 1908", :basionymAuthorTeam=>{:authorTeam=>"Kulczynski", :author=>["Kulczynski"], :year=>"1908"}}}
  sn = 'Sparassus françoisi Simon, 1898'
  details(sn).should == {:genus=>{:epitheton=>"Sparassus"}, :species=>{:epitheton=>"françoisi", :authorship=>"Simon, 1898", :basionymAuthorTeam=>{:authorTeam=>"Simon", :author=>["Simon"], :year=>"1898"}}}
  # TODO: 'Thiobacillus x Parker and Prisk 1953' is not asserted yet - have to
  # figure out black lists for this one.
  sn = 'Bacille de Plaut, Kritchevsky and Séguin 1921'
  details(sn).should == {:uninomial=>{:epitheton=>"Bacille", :authorship=>"de Plaut, Kritchevsky and Séguin 1921", :basionymAuthorTeam=>{:authorTeam=>"de Plaut, Kritchevsky and Séguin", :author=>["de Plaut", "Kritchevsky", "Séguin"], :year=>"1921"}}}
  sn = 'Araneus van bruysseli Petrunkevitch, 1911'
  details(sn).should == {:genus=>{:epitheton=>"Araneus"}, :species=>{:epitheton=>"van"}, :infraspecies=>{:epitheton=>"bruysseli", :rank=>"n/a", :authorship=>"Petrunkevitch, 1911", :basionymAuthorTeam=>{:authorTeam=>"Petrunkevitch", :author=>["Petrunkevitch"], :year=>"1911"}}}
  sn = 'Sapromyces laidlawi ab Sabin 1941'
  details(sn).should == {:genus=>{:epitheton=>"Sapromyces"}, :species=>{:epitheton=>"laidlawi", :authorship=>"ab Sabin 1941", :basionymAuthorTeam=>{:authorTeam=>"ab Sabin", :author=>["ab Sabin"], :year=>"1941"}}}
  sn = 'Nocardia rugosa di Marco and Spalla 1957'
  details(sn).should == {:genus=>{:epitheton=>"Nocardia"}, :species=>{:epitheton=>"rugosa", :authorship=>"di Marco and Spalla 1957", :basionymAuthorTeam=>{:authorTeam=>"di Marco and Spalla", :author=>["di Marco", "Spalla"], :year=>"1957"}}}
  sn = 'Flexibacter elegans Lewin 1969 non Soriano 1945'
  details(sn).should == {:genus=>{:epitheton=>"Flexibacter"}, :species=>{:epitheton=>"elegans", :authorship=>"Lewin 1969 non Soriano 1945", :basionymAuthorTeam=>{:authorTeam=>"Lewin", :author=>["Lewin"], :year=>"1969"}}}
  sn = 'Flexibacter elegans Soriano 1945, non Lewin 1969'
  details(sn).should == {:genus=>{:epitheton=>"Flexibacter"}, :species=>{:epitheton=>"elegans", :authorship=>"Soriano 1945, non Lewin 1969", :basionymAuthorTeam=>{:authorTeam=>"Soriano", :author=>["Soriano"], :year=>"1945"}}}
end
|
493
497
|
|
494
498
|
end
|
@@ -1,55 +1,13 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
dir = File.dirname("__FILE__")
|
3
|
-
require '
|
4
|
-
require 'spec'
|
5
|
-
require 'yaml'
|
6
|
-
require 'treetop'
|
7
|
-
|
8
|
-
Treetop.load(File.expand_path(dir + '../../lib/biodiversity/parser/scientific_name_clean'))
|
9
|
-
Treetop.load(File.expand_path(dir + '../../lib/biodiversity/parser/scientific_name_dirty'))
|
3
|
+
# Resolve the helper relative to this spec file; the dir-based form relied on
# File.dirname("__FILE__") (a string literal, so always ".") and therefore on
# the process being started from the project root.
require File.expand_path('../spec_helper', __FILE__)
|
10
4
|
|
11
5
|
describe ScientificNameDirty do
|
12
6
|
before(:all) do
|
13
|
-
|
14
|
-
end
|
15
|
-
|
16
|
-
def parse(input)
|
17
|
-
@parser.parse(input)
|
18
|
-
end
|
19
|
-
|
20
|
-
def value(input)
|
21
|
-
parse(input).value
|
22
|
-
end
|
23
|
-
|
24
|
-
def canonical(input)
|
25
|
-
parse(input).canonical
|
26
|
-
end
|
27
|
-
|
28
|
-
def details(input)
|
29
|
-
parse(input).details
|
30
|
-
end
|
31
|
-
|
32
|
-
def pos(input)
|
33
|
-
parse(input).pos
|
34
|
-
end
|
35
|
-
|
36
|
-
def debug(input)
|
37
|
-
res = parse(input)
|
38
|
-
puts "<pre>"
|
39
|
-
if res
|
40
|
-
puts 'success!'
|
41
|
-
puts res.inspect
|
42
|
-
else
|
43
|
-
puts input
|
44
|
-
val = @parser.failure_reason.to_s.match(/column [0-9]*/).to_s.gsub(/column /,'').to_i
|
45
|
-
print ("-" * (val - 1))
|
46
|
-
print "^ Computer says 'no'!\n"
|
47
|
-
puts @parser.failure_reason
|
48
|
-
puts @parser.to_yaml
|
49
|
-
end
|
50
|
-
puts "</pre>"
|
7
|
+
set_parser(ScientificNameDirtyParser.new)
|
51
8
|
end
|
52
9
|
|
10
|
+
|
53
11
|
# The dirty parser must still accept input that is already well formed.
it 'should parse clean names' do
  parsed = parse("Betula verucosa (L.) Bar. 1899")
  parsed.should_not be_nil
end
|
@@ -113,4 +71,20 @@ describe ScientificNameDirty do
|
|
113
71
|
pos(sn).should == {0=>["genus", 8], 9=>["species", 18], 19=>["author_word", 24], 26=>["year", 30], 32=>["year", 36]}
|
114
72
|
end
|
115
73
|
|
74
|
+
# Regression examples for dirty input: a year glued to "et al." with a
# trailing dot, parenthesised remarks after the authorship, the "corrig."
# marker, a bare year after an infraspecific epithet, and trailing text that
# is consumed as garbage.
it "should parse new stuff" do
  # TODO: 'Zoropsis (TaKeoa) nishimurai Yaginuma, 1971' is not asserted yet -
  # skipping for now.
  sn = 'Campylobacter pyloridis Marshall et al.1985.'
  details(sn).should == {:genus=>{:epitheton=>"Campylobacter"}, :species=>{:epitheton=>"pyloridis", :authorship=>"Marshall et al.1985.", :basionymAuthorTeam=>{:authorTeam=>"Marshall et al.", :author=>["Marshall et al."], :year=>"1985"}}}
  sn = 'Staphylococcus hyicus chromogenes Devriese et al. 1978 (Approved Lists 1980).'
  details(sn).should == {:genus=>{:epitheton=>"Staphylococcus"}, :species=>{:epitheton=>"hyicus"}, :infraspecies=>{:epitheton=>"chromogenes", :rank=>"n/a", :authorship=>"Devriese et al. 1978", :basionymAuthorTeam=>{:authorTeam=>"Devriese et al.", :author=>["Devriese et al."], :year=>"1978"}}}
  sn = 'Kitasatospora corrig. griseola Takahashi et al. 1985.'
  details(sn).should == {:genus=>{:epitheton=>"Kitasatospora"}, :species=>{:epitheton=>"griseola", :authorship=>"Takahashi et al. 1985.", :basionymAuthorTeam=>{:authorTeam=>"Takahashi et al.", :author=>["Takahashi et al."], :year=>"1985"}}}
  sn = 'Beijerinckia derxii venezuelae corrig. Thompson and Skerman, 1981'
  details(sn).should == {:genus=>{:epitheton=>"Beijerinckia"}, :species=>{:epitheton=>"derxii"}, :infraspecies=>{:epitheton=>"venezuelae", :rank=>"n/a", :authorship=>"Thompson and Skerman, 1981", :basionymAuthorTeam=>{:authorTeam=>"Thompson and Skerman", :author=>["Thompson", "Skerman"], :year=>"1981"}}}
  details('Streptomyces parvisporogenes ignotus 1960').should == {:genus=>{:epitheton=>"Streptomyces"}, :species=>{:epitheton=>"parvisporogenes"}, :infraspecies=>{:epitheton=>"ignotus", :rank=>"n/a", :year=>"1960"}}
  details('Oscillaria caviae Simons 1920, according to Simons 1922').should == {:genus=>{:epitheton=>"Oscillaria"}, :species=>{:epitheton=>"caviae", :authorship=>"Simons 1920", :basionymAuthorTeam=>{:authorTeam=>"Simons", :author=>["Simons"], :year=>"1920"}}}
  sn = 'Bacterium monocytogenes hominis"" Nyfeldt 1932'
  details(sn).should == {:genus=>{:epitheton=>"Bacterium"}, :species=>{:epitheton=>"monocytogenes"}, :infraspecies=>{:epitheton=>"hominis", :rank=>"n/a"}}
end
|
89
|
+
|
116
90
|
end
|