biodiversity19 0.5.15
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +13 -0
- data/LICENSE +20 -0
- data/README.rdoc +44 -0
- data/Rakefile +43 -0
- data/VERSION +1 -0
- data/bin/nnparse +43 -0
- data/bin/parserver +14 -0
- data/biodiversity.gemspec +85 -0
- data/conf/environment.rb +3 -0
- data/lib/biodiversity/guid/lsid.rb +18 -0
- data/lib/biodiversity/guid.rb +2 -0
- data/lib/biodiversity/parser/scientific_name_canonical.rb +475 -0
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +111 -0
- data/lib/biodiversity/parser/scientific_name_clean.rb +6142 -0
- data/lib/biodiversity/parser/scientific_name_clean.treetop +1195 -0
- data/lib/biodiversity/parser/scientific_name_dirty.rb +1096 -0
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +211 -0
- data/lib/biodiversity/parser.rb +57 -0
- data/lib/biodiversity.rb +9 -0
- data/pkg/.gitignore +0 -0
- data/spec/biodiversity_spec.rb +0 -0
- data/spec/guid/lsid.spec.rb +12 -0
- data/spec/parser/scientific_name.spec.rb +35 -0
- data/spec/parser/scientific_name_canonical.spec.rb +27 -0
- data/spec/parser/scientific_name_clean.spec.rb +504 -0
- data/spec/parser/scientific_name_dirty.spec.rb +90 -0
- data/spec/parser/spec_helper.rb +69 -0
- data/spec/parser/test_data.txt +235 -0
- data/spec/spec_helper.rb +0 -0
- metadata +122 -0
@@ -0,0 +1,211 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# Lenient grammar layered on top of ScientificNameClean. Most rules below add
# extra alternatives in front of a final `super` alternative, which falls back
# to the rule of the same name in the included grammar.
grammar ScientificNameDirty
  include ScientificNameClean

  # Entry point: fully delegated to the included grammar.
  rule root
    super
  end

  # A scientific_name_4 followed by `garbage`. Every reported field comes from
  # the name part alone, so the trailing garbage is parsed but never reported.
  rule scientific_name_5
    a:scientific_name_4 garbage {
      def value
        a.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos
      end

      def details
        a.details
      end
    }
    /
    super
  end

  # Infraspecies string followed either by a year, or by the
  # string_authorship_inconsistencies marker and an authorship. The marker is
  # consumed but is not part of value, pos or details.
  rule infraspecies
    a:infraspecies_string space b:year {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        # Year details are folded into the infraspecies entry.
        merged = a.details[:infraspecies].merge(b.details)
        {:infraspecies => merged}
      end
    }
    /
    a:infraspecies_string space string_authorship_inconsistencies space b:authorship {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        # Authorship details are folded into the infraspecies entry.
        merged = a.details[:infraspecies].merge(b.details)
        {:infraspecies => merged}
      end
    }
    /
    super
  end

  # Species string directly followed by a year.
  rule species
    a:species_string space b:year {
      def value
        a.value + " " + b.value
      end

      def canonical
        a.canonical
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        # Year details are folded into the species entry.
        merged = a.details[:species].merge(b.details)
        {:species => merged}
      end
    }
    /
    super
  end

  # Two closing parentheses with `space` between them are accepted as one.
  rule right_paren
    ")" space ")"
    /
    super
  end

  # Two opening parentheses with `space` between them are accepted as one.
  rule left_paren
    "(" space "("
    /
    super
  end

  # Year variants, tried in this order:
  #   1. year_number followed by an approximate_year
  #   2. year_number followed by a page_number (the page is not reported)
  #   3. year_number_with_punctuation
  #   4. approximate_year
  #   5. double_year
  #   6. whatever the included grammar accepts
  rule year
    a:year_number space b:approximate_year {
      def value
        a.value + " " + b.value
      end

      def pos
        a.pos.merge(b.pos)
      end

      def details
        {:year => a.value, :approximate_year => b.value}
      end
    }
    /
    a:year_number space page_number {
      def value
        a.text_value
      end

      def pos
        # Only the year itself is reported, not the page number.
        year_span = a.interval
        {year_span.begin => ['year', year_span.end]}
      end

      def details
        {:year => value}
      end
    }
    /
    year_number_with_punctuation
    /
    approximate_year
    /
    double_year
    /
    super
  end

  # A year inside square brackets; one or more closing brackets are accepted.
  # The value is the year wrapped in round parentheses.
  rule approximate_year
    "[" space a:year_number space "]"+ {
      def value
        "(#{a.text_value})"
      end

      def pos
        # Position covers the year only, without the brackets.
        year_span = a.interval
        {year_span.begin => ['year', year_span.end]}
      end

      def details
        {:approximate_year => value}
      end
    }
  end

  # year_number, a dash, more digits, then an optional letter and an optional
  # question mark. The whole matched text is the value.
  rule double_year
    year_number "-" [0-9]+ [A-Za-z]? [\?]? {
      def value
        text_value
      end

      def pos
        whole = interval
        {whole.begin => ['year', whole.end]}
      end

      def details
        {:year => value}
      end
    }
  end

  # year_number followed by a period. The value drops the period, while pos
  # uses the interval of the whole match and therefore includes it.
  rule year_number_with_punctuation
    a:year_number "." {
      def value
        a.text_value
      end

      def pos
        whole = interval
        {whole.begin => ['year', whole.end]}
      end

      def details
        {:year => value}
      end
    }
  end

  # A colon, `space`, and one or more digits. Its value is always nil.
  rule page_number
    ":" space [\d]+ {
      def value
        nil
      end
    }
  end

  # Literal marker that may sit between an infraspecies string and authorship.
  rule string_authorship_inconsistencies
    ("corrig.")
  end

  # Unparseable tail of a name string.
  # NOTE(review): each character class excludes a single Cyrillic letter,
  # presumably as a stand-in for "any character" -- confirm before changing.
  rule garbage
    space (["',.]) space [^щ]*
    /
    space_hard [^ш]+
  end

end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
dir = File.dirname(__FILE__)
|
3
|
+
require File.join(dir, *%w[parser scientific_name_clean])
|
4
|
+
require File.join(dir, *%w[parser scientific_name_dirty])
|
5
|
+
require File.join(dir, *%w[parser scientific_name_canonical])
|
6
|
+
require 'rubygems'
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
# Front end that runs a name string through the clean, dirty and canonical
# parsers (in that order) and packages the outcome as a Hash or JSON.
class ScientificNameParser
  # Reporting helpers attached to whatever #parse produced: the object
  # returned by the first parser that succeeded, or the plain Hash fallback
  # holding only :verbatim when every parser returned a falsy value.
  #
  # Defined once here and applied with #extend, instead of re-defining
  # singleton methods on every call to #parse.
  module ResultMethods
    # Builds the summary Hash, wrapped under the :scientificName key.
    # :parsed is false only for the plain-Hash fallback, whose contents are
    # then copied into the summary as they are.
    def all
      parsed = !instance_of?(Hash)
      res = {:parsed => parsed}
      if parsed
        # Results that cannot report hybrid status are treated as non-hybrid.
        hybrid_flag = begin
          hybrid
        rescue StandardError
          false
        end
        res.merge!({
          :verbatim => text_value,
          :normalized => value,
          :canonical => canonical,
          :hybrid => hybrid_flag,
          :details => details,
          :positions => pos
        })
      else
        res.merge!(self)
      end
      {:scientificName => res}
    end

    # JSON for the position map, or an empty string when it cannot be
    # produced (for example on the unparsed fallback, which has no #pos).
    def pos_json
      pos.to_json
    rescue StandardError
      ''
    end

    # JSON for #all, or an empty string when serialization fails.
    def all_json
      all.to_json
    rescue StandardError
      ''
    end
  end

  # Raw result of the most recent #parse call (nil before the first call).
  attr_reader :parsed

  def initialize
    @verbatim = ''
    @clean = ScientificNameCleanParser.new
    @dirty = ScientificNameDirtyParser.new
    @canonical = ScientificNameCanonicalParser.new
    @parsed = nil
  end

  # Parses a_string with the first parser that accepts it and returns the
  # summary Hash built by ResultMethods#all. The raw result stays available
  # through #parsed and responds to #all, #pos_json and #all_json.
  def parse(a_string)
    @verbatim = a_string
    @parsed = @clean.parse(a_string) ||
              @dirty.parse(a_string) ||
              @canonical.parse(a_string) ||
              {:verbatim => a_string}
    @parsed.extend(ResultMethods)
    @parsed.all
  end
end
|
57
|
+
|
data/lib/biodiversity.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'treetop'
|
3
|
+
|
4
|
+
# Directory containing this file; every path below is built from it.
lib_dir = File.dirname(__FILE__)

# Path of the `biodiversity` subdirectory that sits next to this file.
BIODIVERSITY_ROOT = File.join(lib_dir, 'biodiversity')

# The environment file is loaded before the library components.
require File.join(lib_dir, "/../conf/environment")

%w[parser guid].each do |component|
  require File.join(BIODIVERSITY_ROOT, component)
end
|
data/pkg/.gitignore
ADDED
File without changes
|
File without changes
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# Spec for LsidResolver.
#
# Required files are located relative to this spec file (two directories up),
# so the spec does not depend on the current working directory.
dir = File.dirname(__FILE__)
require 'rubygems'
require 'spec'
require File.expand_path('../../conf/environment', dir)
require File.expand_path('../../lib/biodiversity/guid', dir)

describe LsidResolver do
  # Only the type of the result is checked: resolving an LSID must yield a
  # String.
  it "should return RDF document from lsid" do
    lsid = "urn:lsid:ubio.org:classificationbank:2232671"
    LsidResolver.resolve(lsid).class.should == "".class
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# NOTE: this spec needs compiled treetop files.
#
# Required files are located relative to this spec file (two directories up),
# so the spec does not depend on the current working directory.
dir = File.dirname(__FILE__)
require File.expand_path('../../spec/parser/spec_helper', dir)
require File.expand_path('../../lib/biodiversity/parser', dir)

describe ScientificNameClean do
  before(:all) do
    set_parser(ScientificNameParser.new)
  end

  # Every non-comment entry yielded by read_test_file pairs a name with the
  # JSON it is expected to produce; both sides are compared as parsed JSON.
  it 'should generate standardized json' do
    read_test_file do |y|
      JSON.load(json(y[:name])).should == JSON.load(y[:jsn]) unless y[:comment]
    end
  end

  # Disabled helper that regenerates the test data file from current output.
  # it 'should generate new test_file' do
  #   new_test = open(File.expand_path("../../spec/parser/test_data_new.txt", dir),'w')
  #   read_test_file do |y|
  #     if y[:comment]
  #       new_test.write y[:comment]
  #     else
  #       name = y[:name]
  #       jsn = json(y[:name])# rescue puts(y[:name])
  #       new_test.write("#{name}|#{jsn}\n")
  #     end
  #   end
  # end

  it 'should generate reasonable output if parser failed' do
    sn = 'ddd sljlkj 3223452432'
    json(sn).should == '{"scientificName":{"parsed":false,"verbatim":"ddd sljlkj 3223452432"}}'
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
# Spec for the canonical-name parser.
#
# The helper is located relative to this spec file (two directories up), so
# the spec does not depend on the current working directory.
dir = File.dirname(__FILE__)
require File.expand_path('../../spec/parser/spec_helper', dir)

describe ScientificNameCanonical do
  before(:all) do
    set_parser(ScientificNameCanonicalParser.new)
  end


  # Each row is: input string, expected value, expected details, expected
  # positions.
  it 'should parse names with valid name part and unparseable rest' do
    [
      ['Morea ssjjlajajaj324$33 234243242','Morea', [{:uninomial=>{:string=>"Morea"}}], {0=>["uninomial", 5]}],
      ['Morea (Morea) Burt 2342343242 23424322342 23424234', 'Morea (Morea)', [{:genus=>{:string=>"Morea"}, :infragenus=>{:string=>"Morea"}}], {0=>["genus", 5], 7=>["infragenus", 12]}],
      ['Morea (Morea) burtius 2342343242 23424322342 23424234', 'Morea (Morea) burtius', [{:genus=>{:string=>"Morea"}, :infragenus=>{:string=>"Morea"}, :species=>{:string=>"burtius"}}], {0=>["genus", 5], 7=>["infragenus", 12], 14=>["species", 21]}],
      ['Moraea spathulata ( (L. f. Klatt','Moraea spathulata',[{:genus=>{:string=>"Moraea"}, :species=>{:string=>"spathulata"}}], {0=>["genus", 6], 7=>["species", 17]} ],
      ['Verpericola megasoma ""Dall" Pils.','Verpericola megasoma',[{:genus=>{:string=>"Verpericola"}, :species=>{:string=>"megasoma"}}], {0=>["genus", 11], 12=>["species", 20]}]
    ].each do |n|
      parse(n[0]).should_not be_nil
      value(n[0]).should == n[1]
      details(n[0]).should == n[2]
      pos(n[0]).should == n[3]
      parse(n[0]).hybrid.should be_false
    end
  end

end
|