biodiversity19 1.2.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rvmrc +1 -1
- data/CHANGELOG +8 -0
- data/README.rdoc +4 -4
- data/Rakefile +0 -1
- data/VERSION +1 -1
- data/bin/parserver +32 -43
- data/lib/biodiversity/parser.rb +40 -28
- data/spec/parser/scientific_name.spec.rb +1 -1
- data/spec/parser/test_data.txt +2 -2
- metadata +3 -3
data/.rvmrc
CHANGED
@@ -1 +1 @@
|
|
1
|
-
rvm use ruby-1.9.3-
|
1
|
+
rvm use ruby-1.9.3-p392@biodiversity --create
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
2.0.0 -- backward incompatible change in parserver, therefore new major number.
|
2
|
+
In parserver removed option --output=canonical_with_rank, instead added -r
|
3
|
+
option, which allows canonical with rank in either json or canonical
|
4
|
+
outputs
|
5
|
+
|
6
|
+
1.2.0 -- changed method invocation signature ScientificNameParser.new
|
7
|
+
Now it can take options
|
8
|
+
|
1
9
|
1.1.3 -- added 'fo' as rank
|
2
10
|
|
3
11
|
1.1.2 -- static method for fixing all-caps canonical names, fixing caps
|
data/README.rdoc
CHANGED
@@ -31,14 +31,14 @@ options:
|
|
31
31
|
|
32
32
|
to return a canonical form of the name string
|
33
33
|
|
34
|
-
parserver --output=canonical_with_rank
|
35
|
-
|
36
|
-
the same as above, but infraspecies' rank is shown if available
|
37
|
-
|
38
34
|
parserver --port 5555
|
39
35
|
|
40
36
|
run socket server on a different port
|
41
37
|
|
38
|
+
parserver --canonical_with_rank
|
39
|
+
|
40
|
+
to add rank to canonical forms with infraspecific epithet, if it is given
|
41
|
+
|
42
42
|
Then you can access it via 4334 port using a socket client library of your programming language. You can find socket client script example in the examples directory of the gem.
|
43
43
|
|
44
44
|
If you want to check if socket server works for you:
|
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
2.0.0
|
data/bin/parserver
CHANGED
@@ -5,10 +5,11 @@ require 'socket'
|
|
5
5
|
require 'biodiversity' # Get sockets from stdlib
|
6
6
|
|
7
7
|
DEFAULT_PORT = 4334
|
8
|
-
RUBY_VERSION_INT = RUBY_VERSION.split(
|
8
|
+
RUBY_VERSION_INT = RUBY_VERSION.split('.')[0..1].join('').to_i
|
9
9
|
OPTIONS = {
|
10
|
-
:
|
11
|
-
:
|
10
|
+
output: 'json',
|
11
|
+
canonical_with_rank: false,
|
12
|
+
port: DEFAULT_PORT
|
12
13
|
}
|
13
14
|
|
14
15
|
options = {}
|
@@ -16,67 +17,55 @@ ARGV.options do |opts|
|
|
16
17
|
script_name = File.basename($0)
|
17
18
|
opts.banner = "Usage: ruby #{script_name} [options]"
|
18
19
|
|
19
|
-
opts.separator
|
20
|
+
opts.separator ''
|
20
21
|
|
21
|
-
opts.on(
|
22
|
-
|
22
|
+
opts.on('-r',
|
23
|
+
'--canonical_with_rank',
|
24
|
+
'Adds infraspecies rank to canonical forms'
|
25
|
+
) { |rank| options[:canonical_with_rank] = rank }
|
26
|
+
|
27
|
+
opts.separator ''
|
28
|
+
|
29
|
+
opts.on('-o', '--output=output', String,
|
30
|
+
'Specifies the type of the output:
|
23
31
|
json - parsed results in json
|
24
|
-
canonical - canonical
|
25
|
-
|
26
|
-
"Default: json") { |output| options[:output] = output }
|
32
|
+
canonical - canonical form only',
|
33
|
+
'Default: json') { |output| options[:output] = output }
|
27
34
|
|
28
|
-
opts.separator
|
35
|
+
opts.separator ''
|
29
36
|
|
30
|
-
opts.on(
|
31
|
-
|
37
|
+
opts.on('-p', '--port=port', String,
|
38
|
+
'Specifies the port number',
|
32
39
|
"Default: #{DEFAULT_PORT}") { |port| options[:port] = port }
|
33
40
|
|
34
|
-
opts.separator
|
41
|
+
opts.separator ''
|
35
42
|
|
36
|
-
opts.on(
|
37
|
-
|
43
|
+
opts.on('-h', '--help',
|
44
|
+
'Show this help message.') { puts opts; exit }
|
38
45
|
|
39
46
|
opts.parse!
|
40
47
|
end
|
41
48
|
|
42
|
-
OPTIONS[:output] = options[:output] if ['canonical'
|
49
|
+
OPTIONS[:output] = options[:output] if ['canonical'].include?(options[:output])
|
43
50
|
OPTIONS[:port] = options[:port].to_i if options[:port].to_i > 0
|
44
|
-
|
45
|
-
def parser_error(name_string)
|
46
|
-
{:scientificName => {:parsed => false, :verbatim => name_string, :error => 'Parser error'}}
|
47
|
-
end
|
51
|
+
OPTIONS[:canonical_with_rank] = !!options[:canonical_with_rank]
|
48
52
|
|
49
53
|
def get_output(name_string, parser)
|
50
54
|
begin
|
51
|
-
if RUBY_VERSION_INT < 19
|
52
|
-
old_kcode = $KCODE
|
53
|
-
$KCODE = 'NONE'
|
54
|
-
end
|
55
55
|
parsed = parser.parse(name_string)
|
56
|
-
if RUBY_VERSION_INT < 19
|
57
|
-
$KCODE = old_kcode
|
58
|
-
end
|
59
56
|
rescue
|
60
|
-
parsed =
|
57
|
+
parsed = ScientificNameParser::FAILED_RESULT.(name_string)
|
61
58
|
end
|
62
59
|
output = OPTIONS[:output]
|
63
60
|
return parsed.to_json if output == 'json'
|
64
|
-
|
65
|
-
return canonical.to_s if output == 'canonical' || canonical == nil || parsed[:scientificName][:hybrid] || !parsed[:scientificName][:parsed]
|
66
|
-
parts = parsed[:scientificName][:canonical].split(" ")
|
67
|
-
|
68
|
-
if parts.size > 2 && parsed[:scientificName][:details][0][:infraspecies]
|
69
|
-
name_ary = parts[0..1]
|
70
|
-
parsed[:scientificName][:details][0][:infraspecies].each do |data|
|
71
|
-
name_ary << (data[:rank] && data[:rank] != 'n/a'? "#{data[:rank]} #{data[:string]}" : data[:string])
|
72
|
-
end
|
73
|
-
canonical = name_ary.join(" ")
|
74
|
-
end
|
75
|
-
canonical
|
61
|
+
parsed[:scientificName][:canonical].to_s
|
76
62
|
end
|
77
63
|
|
78
|
-
puts "Running parser service on port
|
79
|
-
|
64
|
+
puts "Running parser service on port %s, output type is '%s'" %
|
65
|
+
[OPTIONS[:port], OPTIONS[:output]]
|
66
|
+
opts = {}
|
67
|
+
opts = {canonical_with_rank: true} if OPTIONS[:canonical_with_rank]
|
68
|
+
parser = ScientificNameParser.new(opts)
|
80
69
|
server = TCPServer.open(OPTIONS[:port]) # Socket to listen on a port
|
81
70
|
loop do # Servers run forever
|
82
71
|
Thread.start(server.accept) do |client|
|
@@ -85,7 +74,7 @@ loop do # Servers run forever
|
|
85
74
|
while a = client.readline rescue nil
|
86
75
|
count += 1
|
87
76
|
puts "parsed %s'th name" % count if count % 1000 == 0
|
88
|
-
a.force_encoding(
|
77
|
+
a.force_encoding('utf-8') if a && RUBY_VERSION_INT >= 19
|
89
78
|
if ['end','exit','q', '.'].include? a.strip
|
90
79
|
client.close
|
91
80
|
break
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -15,7 +15,8 @@ module PreProcessor
|
|
15
15
|
LAST_WORD_JUNK = /(,\s*|\s+)(spp\.|spp|var\.|var|von|van|ined\.|ined|sensu|new|non|nec|nudum|cf\.|cf|sp\.|sp|ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/i
|
16
16
|
|
17
17
|
def self.clean(a_string)
|
18
|
-
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
18
|
+
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
19
|
+
TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
19
20
|
a_string = a_string.gsub(i, '')
|
20
21
|
end
|
21
22
|
a_string = a_string.tr('ſ','s') #old 's'
|
@@ -49,10 +50,7 @@ class ParallelParser
|
|
49
50
|
private
|
50
51
|
def parse_process(name)
|
51
52
|
p = ScientificNameParser.new
|
52
|
-
|
53
|
-
verbatim: name,
|
54
|
-
error: 'Parser error' } }
|
55
|
-
p.parse(name) rescue failed_res
|
53
|
+
p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
|
56
54
|
end
|
57
55
|
end
|
58
56
|
|
@@ -73,6 +71,12 @@ class ScientificNameParser
|
|
73
71
|
'..',
|
74
72
|
'..',
|
75
73
|
'VERSION')).readline.strip
|
74
|
+
|
75
|
+
FAILED_RESULT = ->(name) do
|
76
|
+
{ scientificName:
|
77
|
+
{ parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
|
78
|
+
}
|
79
|
+
end
|
76
80
|
|
77
81
|
def self.fix_case(name_string)
|
78
82
|
name_ary = name_string.split(/\s+/)
|
@@ -87,17 +91,21 @@ class ScientificNameParser
|
|
87
91
|
end
|
88
92
|
else
|
89
93
|
if name_ary[0].size > 1
|
90
|
-
word1 = UnicodeUtils.upcase(name_ary[0][0]) +
|
94
|
+
word1 = UnicodeUtils.upcase(name_ary[0][0]) +
|
95
|
+
UnicodeUtils.downcase(name_ary[0][1..-1])
|
91
96
|
else
|
92
97
|
word1 = name_ary[0]
|
93
98
|
end
|
94
99
|
if name_ary[1].match(/^\(/)
|
95
|
-
word2 = name_ary[1].gsub(/\)$/, '') +
|
96
|
-
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
100
|
+
word2 = name_ary[1].gsub(/\)$/, '') + ')'
|
101
|
+
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
102
|
+
UnicodeUtils.downcase(word2[2..-1])
|
97
103
|
else
|
98
104
|
word2 = UnicodeUtils.downcase(name_ary[1])
|
99
105
|
end
|
100
|
-
res = word1 +
|
106
|
+
res = word1 + ' ' +
|
107
|
+
word2 + ' ' +
|
108
|
+
name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
|
101
109
|
res.strip!
|
102
110
|
end
|
103
111
|
res
|
@@ -114,7 +122,9 @@ class ScientificNameParser
|
|
114
122
|
end
|
115
123
|
|
116
124
|
def virus?(a_string)
|
117
|
-
!!(a_string.match(/\sICTV\s*$/) ||
|
125
|
+
!!(a_string.match(/\sICTV\s*$/) ||
|
126
|
+
a_string.match(/\b(virus|viruses|phage|phages|viroid|viroids|satellite|satellites|prion|prions)\b/i) ||
|
127
|
+
a_string.match(/[A-Z]?[a-z]+virus\b/))
|
118
128
|
end
|
119
129
|
|
120
130
|
def unknown_placement?(a_string)
|
@@ -126,13 +136,13 @@ class ScientificNameParser
|
|
126
136
|
end
|
127
137
|
|
128
138
|
def parse(a_string)
|
129
|
-
@verbatim = a_string
|
139
|
+
@verbatim = a_string.strip
|
130
140
|
a_string = PreProcessor::clean(a_string)
|
131
141
|
|
132
142
|
if virus?(a_string)
|
133
|
-
@parsed = { :
|
143
|
+
@parsed = { verbatim: a_string, virus: true }
|
134
144
|
elsif unknown_placement?(a_string)
|
135
|
-
@parsed = { :
|
145
|
+
@parsed = { verbatim: a_string }
|
136
146
|
else
|
137
147
|
begin
|
138
148
|
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
|
@@ -140,12 +150,12 @@ class ScientificNameParser
|
|
140
150
|
index = @dirty.index || @clean.index
|
141
151
|
salvage_match = a_string[0..index].split(/\s+/)[0..-2]
|
142
152
|
salvage_string = salvage_match ? salvage_match.join(' ') : a_string
|
143
|
-
@parsed = @dirty.parse(salvage_string) ||
|
153
|
+
@parsed = @dirty.parse(salvage_string) ||
|
154
|
+
@canonical.parse(a_string) ||
|
155
|
+
{ verbatim: a_string }
|
144
156
|
end
|
145
157
|
rescue
|
146
|
-
@parsed =
|
147
|
-
verbatim: name,
|
148
|
-
error: 'Parser error' } }
|
158
|
+
@parsed = FAILED_RESULT.(@verbatim)
|
149
159
|
end
|
150
160
|
end
|
151
161
|
|
@@ -156,22 +166,24 @@ class ScientificNameParser
|
|
156
166
|
def @parsed.all(opts = {})
|
157
167
|
canonical_with_rank = !!opts[:canonical_with_rank]
|
158
168
|
parsed = self.class != Hash
|
159
|
-
res = { :
|
169
|
+
res = { parsed: parsed, parser_version: ScientificNameParser::VERSION}
|
160
170
|
if parsed
|
161
171
|
hybrid = self.hybrid rescue false
|
162
172
|
res.merge!({
|
163
|
-
:
|
164
|
-
:
|
165
|
-
:
|
166
|
-
:
|
167
|
-
:
|
168
|
-
:
|
169
|
-
:
|
173
|
+
verbatim: @verbatim,
|
174
|
+
normalized: self.value,
|
175
|
+
canonical: self.canonical,
|
176
|
+
hybrid: hybrid,
|
177
|
+
details: self.details,
|
178
|
+
parser_run: self.parser_run,
|
179
|
+
positions: self.pos
|
170
180
|
})
|
171
181
|
else
|
172
182
|
res.merge!(self)
|
173
183
|
end
|
174
|
-
if canonical_with_rank &&
|
184
|
+
if (canonical_with_rank &&
|
185
|
+
canonical.count(' ') > 1 &&
|
186
|
+
res[:details][0][:infraspecies])
|
175
187
|
ScientificNameParser.add_rank_to_canonical(res)
|
176
188
|
end
|
177
189
|
res = {:scientificName => res}
|
@@ -192,14 +204,14 @@ class ScientificNameParser
|
|
192
204
|
private
|
193
205
|
|
194
206
|
def self.add_rank_to_canonical(parsed)
|
195
|
-
parts = parsed[:canonical].split(
|
207
|
+
parts = parsed[:canonical].split(' ')
|
196
208
|
name_ary = parts[0..1]
|
197
209
|
parsed[:details][0][:infraspecies].each do |data|
|
198
210
|
infrasp = data[:string]
|
199
211
|
rank = data[:rank]
|
200
212
|
name_ary << (rank && rank != 'n/a' ? "#{rank} #{infrasp}" : infrasp)
|
201
213
|
end
|
202
|
-
parsed[:canonical] = name_ary.join(
|
214
|
+
parsed[:canonical] = name_ary.join(' ')
|
203
215
|
end
|
204
216
|
|
205
217
|
end
|
@@ -123,7 +123,7 @@ describe ParallelParser do
|
|
123
123
|
res = pparser.parse(names)
|
124
124
|
names.each_with_index do |name, i|
|
125
125
|
res[name].is_a?(Hash).should be_true
|
126
|
-
res[name][:scientificName][:verbatim].should == name
|
126
|
+
res[name][:scientificName][:verbatim].should == name.strip
|
127
127
|
end
|
128
128
|
end
|
129
129
|
end
|
data/spec/parser/test_data.txt
CHANGED
@@ -276,8 +276,8 @@ Coeloglossum viride (L.) Hartman x Dactylorhiza majalis (Rchb. f.) P.F. Hunt & S
|
|
276
276
|
Polypodium x vulgare nothosubsp. mantoniae (Rothm.) Schidlay|{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Polypodium x vulgare nothosubsp. mantoniae (Rothm.) Schidlay", "normalized":"Polypodium × vulgare nothosubsp. mantoniae (Rothm.) Schidlay", "canonical":"Polypodium vulgare mantoniae", "hybrid":true, "details":[{"genus":{"string":"Polypodium"}, "species":{"string":"vulgare"}, "infraspecies":[{"string":"mantoniae", "rank":"nothosubsp.", "authorship":"(Rothm.) Schidlay", "combinationAuthorTeam":{"authorTeam":"Schidlay", "author":["Schidlay"]}, "basionymAuthorTeam":{"authorTeam":"Rothm.", "author":["Rothm."]}}]}], "parser_run":1, "positions":{"0":["genus", 10], "14":["species", 21], "22":["infraspecific_type", 33], "34":["infraspecies", 43], "45":["author_word", 51], "53":["author_word", 61]}}}
|
277
277
|
|
278
278
|
#empty spaces
|
279
|
-
Asplenium Xinexpectatum(E. L. Braun ex Friesner )Morton |{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"
|
280
|
-
|
279
|
+
Asplenium Xinexpectatum(E. L. Braun ex Friesner )Morton |{"scientificName":{"parsed":true, "parser_version":"test_version", "verbatim":"Asplenium Xinexpectatum(E. L. Braun ex Friesner )Morton", "normalized":"Asplenium × inexpectatum (E. L. Braun ex Friesner) Morton", "canonical":"Asplenium inexpectatum", "hybrid":true, "details":[{"genus":{"string":"Asplenium"}, "species":{"string":"inexpectatum", "authorship":"(E. L. Braun ex Friesner )Morton", "combinationAuthorTeam":{"authorTeam":"Morton", "author":["Morton"]}, "basionymAuthorTeam":{"authorTeam":"E. L. Braun", "author":["E. L. Braun"], "exAuthorTeam":{"authorTeam":"Friesner", "author":["Friesner"]}}}}], "parser_run":1, "positions":{"4":["genus", 13], "21":["species", 33], "34":["author_word", 36], "37":["author_word", 39], "40":["author_word", 45], "49":["author_word", 57], "64":["author_word", 70]}}}
|
280
|
+
|
281
281
|
####
|
282
282
|
#
|
283
283
|
# Names with problems
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biodiversity19
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: treetop
|
@@ -178,7 +178,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
178
178
|
version: '0'
|
179
179
|
segments:
|
180
180
|
- 0
|
181
|
-
hash:
|
181
|
+
hash: -2248676907984019850
|
182
182
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
183
183
|
none: false
|
184
184
|
requirements:
|