biodiversity 3.1.10 → 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rspec +3 -0
- data/.ruby-version +1 -1
- data/CHANGELOG +5 -0
- data/README.md +95 -71
- data/biodiversity.gemspec +1 -0
- data/lib/biodiversity/parser.rb +33 -30
- data/lib/biodiversity/parser/scientific_name_clean.rb +45 -36
- data/lib/biodiversity/parser/scientific_name_clean.treetop +1 -1
- data/lib/biodiversity/version.rb +1 -1
- data/spec/biodiversity_spec.rb +0 -2
- data/spec/files/t.rb +15 -0
- data/spec/files/test_data.txt +345 -335
- data/spec/files/test_data.txt.new +463 -0
- data/spec/guid/lsid.spec.rb +0 -2
- data/spec/parser/scientific_name_canonical_spec.rb +0 -1
- data/spec/parser/scientific_name_clean_spec.rb +0 -2
- data/spec/parser/scientific_name_dirty_spec.rb +0 -1
- data/spec/parser/scientific_name_spec.rb +5 -4
- metadata +20 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d7bb0304f5e151933f5350780677b9a47a099716
|
4
|
+
data.tar.gz: 7adaf2c1bfce44db79bc2c04d75c584d53957fd0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a450a93fb07f985b5f1e7761e669ed772c8add2c01955e4a8e70a575f3b5f0bc86b1b215507260c48a1604b386b60543f27048c7a65119f71a4b0ddfd7bcefe
|
7
|
+
data.tar.gz: 19090297f99d64580b4b6012a06729ede74fd3ccf7fb9ffdfeecd6480d2a1a0bd0ce8dc8654747f51a60301464cbaebd1f9693ceb54f2827140163d72314f62b
|
data/.rspec
ADDED
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.1.
|
1
|
+
2.1.6
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
3.2.0 -- added UUID version 5 identifiers for every name string, better
|
2
|
+
normalizing for the names with apostrophes, underscore-formatted names are
|
3
|
+
supported. Minor version increase because of change in the output format ("id"
|
4
|
+
field)
|
5
|
+
|
1
6
|
3.1.10 -- NPV viruses added
|
2
7
|
|
3
8
|
3.1.9 -- more virus keywords, better handling of apostrophes in
|
data/README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
Biodiversity
|
2
2
|
============
|
3
3
|
|
4
|
-
[![Gem Version][
|
5
|
-
[![Continuous Integration Status][
|
6
|
-
[![CodePolice][
|
7
|
-
[![Dependency Status][
|
4
|
+
[![Gem Version][gem_svg]][gem_link]
|
5
|
+
[![Continuous Integration Status][ci_svg]][ci_link]
|
6
|
+
[![CodePolice][cc_svg]][cc_link]
|
7
|
+
[![Dependency Status][deps_svg]][deps_link]
|
8
8
|
|
9
9
|
Parses taxonomic scientific name and breaks it into semantic elements.
|
10
10
|
|
@@ -12,10 +12,12 @@ Parses taxonomic scientific name and breaks it into semantic elements.
|
|
12
12
|
Support for Ruby 1.8.7 IS DROPPED. Both biodiversity and
|
13
13
|
biodiversity19 will be for Ruby > 1.9.1 and will be identical gems.
|
14
14
|
|
15
|
-
biodiversity19 is now deprecated and will be
|
15
|
+
biodiversity19 is now deprecated and will not be updated anymore.
|
16
16
|
You are strongly encouraged to change your dependencies from
|
17
17
|
biodiversity19 to biodiversity
|
18
18
|
|
19
|
+
Follow [biodiversity issues][waffle] on waffle.io
|
20
|
+
|
19
21
|
Installation
|
20
22
|
------------
|
21
23
|
|
@@ -46,7 +48,7 @@ you can use a socket server
|
|
46
48
|
parserver -h
|
47
49
|
Usage: parserver [options]
|
48
50
|
|
49
|
-
-r, --canonical_with_rank Adds infraspecies rank
|
51
|
+
-r, --canonical_with_rank Adds infraspecies rank
|
50
52
|
to canonical forms
|
51
53
|
|
52
54
|
-o, --output=output Specifies the type of the output:
|
@@ -65,7 +67,7 @@ you can use a socket server
|
|
65
67
|
|
66
68
|
With default settings you can access parserver via port 4334 using a
|
67
69
|
socket client library of your programming language. You can find
|
68
|
-
[socket client script example][
|
70
|
+
[socket client script example][socket_example] in the examples directory of the gem.
|
69
71
|
|
70
72
|
If you want to check if socket server works for you:
|
71
73
|
|
@@ -93,76 +95,94 @@ of scientific name
|
|
93
95
|
|
94
96
|
You can use it as a library in Ruby, JRuby etc.
|
95
97
|
|
96
|
-
require 'biodiversity'
|
97
98
|
|
98
|
-
|
99
|
+
```ruby
|
100
|
+
require 'biodiversity'
|
101
|
+
|
102
|
+
parser = ScientificNameParser.new
|
99
103
|
|
100
|
-
|
101
|
-
|
104
|
+
#to find version number
|
105
|
+
ScientificNameParser.version
|
102
106
|
|
103
|
-
|
104
|
-
|
105
|
-
|
107
|
+
# to fix capitalization in canonicals
|
108
|
+
ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA")
|
109
|
+
# Output: Quercus (Quercus) alba
|
106
110
|
|
107
|
-
|
108
|
-
|
111
|
+
# to parse a scientific name into a ruby hash
|
112
|
+
parser.parse("Plantago major")
|
109
113
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
114
|
+
#to get json representation
|
115
|
+
parser.parse("Plantago").to_json
|
116
|
+
#or
|
117
|
+
parser.parse("Plantago")
|
118
|
+
parser.all_json
|
115
119
|
|
116
|
-
|
117
|
-
|
120
|
+
# to clean name up
|
121
|
+
parser.parse(" Plantago major ")[:scientificName][:normalized]
|
118
122
|
|
119
|
-
|
120
|
-
|
121
|
-
|
123
|
+
# to get only cleaned up latin part of the name
|
124
|
+
parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. \
|
125
|
+
Braun & Crous 2003")[:scientificName][:canonical]
|
122
126
|
|
123
|
-
|
124
|
-
|
125
|
-
|
127
|
+
# to get detailed information about elements of the name
|
128
|
+
parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. \
|
129
|
+
Braun & Crous 2003")[:scientificName][:details]
|
130
|
+
```
|
126
131
|
|
127
132
|
The returned result is not always linear if the name is complex. To get a simple linear
|
128
133
|
representation of the name you can use:
|
129
134
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
135
|
+
|
136
|
+
```ruby
|
137
|
+
parser.parse("Pseudocercospora dendrobii (H.C. Burnett) \
|
138
|
+
U. Braun & Crous 2003")[:scientificName][:position]
|
139
|
+
# returns {0=>["genus", 16], 17=>["species", 26],
|
140
|
+
# 28=>["author_word", 32], 33=>["author_word", 40],
|
141
|
+
# 42=>["author_word", 44], 45=>["author_word", 50],
|
142
|
+
# 53=>["author_word", 58], 59=>["year", 63]}
|
143
|
+
# where the key is the char index of the start of
|
144
|
+
# a word, first element of the value is a semantic meaning
|
145
|
+
# of the word, second element of the value is the character index
|
146
|
+
# of end of the word
|
147
|
+
```
|
140
148
|
|
141
149
|
'Surrogate' is a broad group which includes 'Barcode of Life' names, and various
|
142
150
|
undetermined names with cf. sp. spp. nr. in them:
|
143
|
-
|
144
|
-
parser.parse("Coleoptera BOLD:1234567")[:scientificName][:surrogate]
|
145
151
|
|
146
|
-
|
152
|
+
```ruby
|
153
|
+
parser.parse("Coleoptera BOLD:1234567")[:scientificName][:surrogate]
|
154
|
+
```
|
155
|
+
### What is "id" in the parsed results?
|
156
|
+
|
157
|
+
The ID field contains a UUID v5 hexadecimal string. The ID is generated out of bytes
|
158
|
+
from the name string itself, and an identical id can be generated using [any
|
159
|
+
popular programming language][uuid_examples]. You can read more about UUID
|
160
|
+
version 5 in a [blog post][uuid_blog].
|
161
|
+
|
162
|
+
### Parse using several CPUs (4 threads seem to be optimal)
|
147
163
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
164
|
+
```ruby
|
165
|
+
parser = ParallelParser.new
|
166
|
+
# ParallelParser.new(4) will try to run 4 processes if hardware allows
|
167
|
+
array_of_names = ["Betula alba", "Homo sapiens"....]
|
168
|
+
parser.parse(array_of_names)
|
169
|
+
# Output: {"Betula alba" => {:scientificName...},
|
170
|
+
# "Homo sapiens" => {:scientificName...}, ...}
|
171
|
+
```
|
154
172
|
|
155
|
-
parallel parser takes list of names and returns back a hash with names as
|
173
|
+
parallel parser takes list of names and returns back a hash with names as
|
156
174
|
keys and parsed data as values
|
157
175
|
|
158
|
-
|
176
|
+
### Canonicals with ranks for infraspecific epithets:
|
159
177
|
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
178
|
+
```ruby
|
179
|
+
parser = ScientificNameParser.new(canonical_with_rank: true)
|
180
|
+
parser.parse('Cola cordifolia var. puberula \
|
181
|
+
A. Chev.')[:scientificName][:canonical]
|
182
|
+
# Output: Cola cordifolia var. puberula
|
183
|
+
```
|
164
184
|
|
165
|
-
|
185
|
+
### Resolving lsid and getting back RDF file
|
166
186
|
|
167
187
|
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
168
188
|
|
@@ -174,7 +194,7 @@ If nnparse or parserver do not start -- try to run
|
|
174
194
|
gem uninstall biodiversity
|
175
195
|
gem uninstall biodiversity19
|
176
196
|
|
177
|
-
and make sure you remove all versions and all nnparse and parserver scripts.
|
197
|
+
and make sure you remove all versions and all nnparse and parserver scripts.
|
178
198
|
Then install biodiversity again
|
179
199
|
|
180
200
|
gem install biodiversity
|
@@ -184,18 +204,22 @@ It should fix the problem.
|
|
184
204
|
Copyright
|
185
205
|
---------
|
186
206
|
|
187
|
-
Authors: [Dmitry Mozzherin][
|
188
|
-
|
189
|
-
Copyright (c) 2008-2015 Marine Biological Laboratory. See LICENSE
|
190
|
-
further details.
|
191
|
-
|
192
|
-
[
|
193
|
-
[
|
194
|
-
[
|
195
|
-
[
|
196
|
-
[
|
197
|
-
[
|
198
|
-
[
|
199
|
-
[
|
200
|
-
[
|
201
|
-
[
|
207
|
+
Authors: [Dmitry Mozzherin][dimus]
|
208
|
+
|
209
|
+
Copyright (c) 2008-2015 Marine Biological Laboratory. See [LICENSE][license]
|
210
|
+
for further details.
|
211
|
+
|
212
|
+
[gem_svg]: https://badge.fury.io/rb/biodiversity.svg
|
213
|
+
[gem_link]: http://badge.fury.io/rb/biodiversity
|
214
|
+
[ci_svg]: https://secure.travis-ci.org/GlobalNamesArchitecture/biodiversity.svg
|
215
|
+
[ci_link]: http://travis-ci.org/GlobalNamesArchitecture/biodiversity
|
216
|
+
[cc_svg]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity.svg
|
217
|
+
[cc_link]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity
|
218
|
+
[deps_svg]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity.svg
|
219
|
+
[deps_link]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity
|
220
|
+
[socket_example]: http://bit.ly/149iLm5
|
221
|
+
[dimus]: https://github.com/dimus
|
222
|
+
[license]: https://github.com/GlobalNamesArchitecture/biodiversity/blob/master/LICENSE
|
223
|
+
[waffle]: https://waffle.io/GlobalNamesArchitecture/biodiversity
|
224
|
+
[uuid_examples]: https://github.com/GlobalNamesArchitecture/gn_uuid_examples
|
225
|
+
[uuid_blog]: http://globalnamesarchitecture.github.io/crossmap/gna/2015/05/31/gn-uuid-0-5-0.html
|
data/biodiversity.gemspec
CHANGED
@@ -19,6 +19,7 @@ Gem::Specification.new do |gem|
|
|
19
19
|
gem.add_runtime_dependency "treetop", "~> 1.4.1"
|
20
20
|
gem.add_runtime_dependency "parallel", "~> 1.4"
|
21
21
|
gem.add_runtime_dependency "unicode_utils", "~> 1.4"
|
22
|
+
gem.add_runtime_dependency "gn_uuid", "~> 0.5"
|
22
23
|
|
23
24
|
gem.add_development_dependency "bundler", "~> 1.6"
|
24
25
|
gem.add_development_dependency "rake", "~> 10.4"
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
# encoding: UTF-8
|
2
|
-
|
3
|
-
require_relative
|
4
|
-
require_relative
|
2
|
+
require "gn_uuid"
|
3
|
+
require_relative "parser/scientific_name_clean"
|
4
|
+
require_relative "parser/scientific_name_dirty"
|
5
|
+
require_relative "parser/scientific_name_canonical"
|
5
6
|
|
6
7
|
module PreProcessor
|
7
8
|
NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
|
@@ -24,9 +25,10 @@ module PreProcessor
|
|
24
25
|
def self.clean(a_string)
|
25
26
|
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
26
27
|
TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
27
|
-
a_string = a_string.gsub(i,
|
28
|
+
a_string = a_string.gsub(i, "")
|
28
29
|
end
|
29
|
-
a_string = a_string.tr(
|
30
|
+
a_string = a_string.tr("ſ","s") #old "s"
|
31
|
+
a_string = a_string.tr("_", " ") if a_string.strip.match(/\s/).nil?
|
30
32
|
a_string
|
31
33
|
end
|
32
34
|
end
|
@@ -36,7 +38,7 @@ end
|
|
36
38
|
# Examples
|
37
39
|
#
|
38
40
|
# parser = ParallelParser.new(4)
|
39
|
-
# parser.parse([
|
41
|
+
# parser.parse(["Betula L.", "Pardosa moesta"])
|
40
42
|
class ParallelParser
|
41
43
|
|
42
44
|
# Public: Initialize ParallelParser.
|
@@ -45,7 +47,7 @@ class ParallelParser
|
|
45
47
|
# If processes number is not set it will be determined
|
46
48
|
# automatically.
|
47
49
|
def initialize(processes_num = nil)
|
48
|
-
require
|
50
|
+
require "parallel"
|
49
51
|
cpu_num
|
50
52
|
if processes_num.to_i > 0
|
51
53
|
@processes_num = [processes_num, cpu_num - 1].min
|
@@ -66,7 +68,7 @@ class ParallelParser
|
|
66
68
|
# Examples
|
67
69
|
#
|
68
70
|
# parser = ParallelParser.new(4)
|
69
|
-
# parser.parse([
|
71
|
+
# parser.parse(["Homo sapiens L.", "Quercus quercus"])
|
70
72
|
#
|
71
73
|
# Returns a Hash with scientific names as a key, and parsing results as
|
72
74
|
# a value.
|
@@ -108,7 +110,8 @@ class ScientificNameParser
|
|
108
110
|
|
109
111
|
FAILED_RESULT = ->(name) do
|
110
112
|
{ scientificName:
|
111
|
-
{ parsed: false, verbatim: name
|
113
|
+
{ id: GnUUID.uuid(name), parsed: false, verbatim: name,
|
114
|
+
error: "Parser internal error" }
|
112
115
|
}
|
113
116
|
end
|
114
117
|
|
@@ -121,7 +124,7 @@ class ScientificNameParser
|
|
121
124
|
words_num = name_ary.size
|
122
125
|
res = nil
|
123
126
|
if words_num == 1
|
124
|
-
res = name_ary[0].gsub(/[\(\)\{\}]/,
|
127
|
+
res = name_ary[0].gsub(/[\(\)\{\}]/, "")
|
125
128
|
if res.size > 1
|
126
129
|
res = UnicodeUtils.upcase(res[0]) + UnicodeUtils.downcase(res[1..-1])
|
127
130
|
else
|
@@ -135,15 +138,15 @@ class ScientificNameParser
|
|
135
138
|
word1 = name_ary[0]
|
136
139
|
end
|
137
140
|
if name_ary[1].match(/^\(/)
|
138
|
-
word2 = name_ary[1].gsub(/\)$/,
|
141
|
+
word2 = name_ary[1].gsub(/\)$/, "") + ")"
|
139
142
|
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
140
143
|
UnicodeUtils.downcase(word2[2..-1])
|
141
144
|
else
|
142
145
|
word2 = UnicodeUtils.downcase(name_ary[1])
|
143
146
|
end
|
144
|
-
res = word1 +
|
145
|
-
word2 +
|
146
|
-
name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(
|
147
|
+
res = word1 + " " +
|
148
|
+
word2 + " " +
|
149
|
+
name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(" ")
|
147
150
|
res.strip!
|
148
151
|
end
|
149
152
|
res
|
@@ -152,7 +155,7 @@ class ScientificNameParser
|
|
152
155
|
|
153
156
|
def initialize(opts = {})
|
154
157
|
@canonical_with_rank = !!opts[:canonical_with_rank]
|
155
|
-
@verbatim =
|
158
|
+
@verbatim = ""
|
156
159
|
@clean = ScientificNameCleanParser.new
|
157
160
|
@dirty = ScientificNameDirtyParser.new
|
158
161
|
@canonical = ScientificNameCanonicalParser.new
|
@@ -180,23 +183,23 @@ class ScientificNameParser
|
|
180
183
|
end
|
181
184
|
|
182
185
|
def parse(a_string)
|
183
|
-
@verbatim = a_string
|
186
|
+
@verbatim = a_string
|
184
187
|
a_string = PreProcessor::clean(a_string)
|
185
188
|
|
186
189
|
if virus?(a_string)
|
187
|
-
@parsed = { verbatim:
|
190
|
+
@parsed = { verbatim: @verbatim, virus: true }
|
188
191
|
elsif noparse?(a_string)
|
189
|
-
@parsed = { verbatim:
|
192
|
+
@parsed = { verbatim: @verbatim }
|
190
193
|
else
|
191
194
|
begin
|
192
195
|
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
|
193
196
|
unless @parsed
|
194
197
|
index = @dirty.index || @clean.index
|
195
198
|
salvage_match = a_string[0..index].split(/\s+/)[0..-2]
|
196
|
-
salvage_string = salvage_match ? salvage_match.join(
|
199
|
+
salvage_string = salvage_match ? salvage_match.join(" ") : a_string
|
197
200
|
@parsed = @dirty.parse(salvage_string) ||
|
198
201
|
@canonical.parse(a_string) ||
|
199
|
-
{ verbatim:
|
202
|
+
{ verbatim: @verbatim }
|
200
203
|
end
|
201
204
|
rescue
|
202
205
|
@parsed = FAILED_RESULT.(@verbatim)
|
@@ -205,12 +208,14 @@ class ScientificNameParser
|
|
205
208
|
|
206
209
|
def @parsed.verbatim=(a_string)
|
207
210
|
@verbatim = a_string
|
211
|
+
@id = GnUUID.uuid(@verbatim)
|
208
212
|
end
|
209
213
|
|
210
214
|
def @parsed.all(opts = {})
|
211
215
|
canonical_with_rank = !!opts[:canonical_with_rank]
|
212
216
|
parsed = self.class != Hash
|
213
|
-
res = {
|
217
|
+
res = { id: @id, parsed: parsed,
|
218
|
+
parser_version: ScientificNameParser::version}
|
214
219
|
if parsed
|
215
220
|
hybrid = self.hybrid rescue false
|
216
221
|
res.merge!({
|
@@ -226,7 +231,7 @@ class ScientificNameParser
|
|
226
231
|
res.merge!(self)
|
227
232
|
end
|
228
233
|
if (canonical_with_rank &&
|
229
|
-
canonical.count(
|
234
|
+
canonical.count(" ") > 1 &&
|
230
235
|
res[:details][0][:infraspecies])
|
231
236
|
ScientificNameParser.add_rank_to_canonical(res)
|
232
237
|
end
|
@@ -235,11 +240,11 @@ class ScientificNameParser
|
|
235
240
|
end
|
236
241
|
|
237
242
|
def @parsed.pos_json
|
238
|
-
self.pos.to_json rescue
|
243
|
+
self.pos.to_json rescue ""
|
239
244
|
end
|
240
245
|
|
241
246
|
def @parsed.all_json
|
242
|
-
self.all.to_json rescue
|
247
|
+
self.all.to_json rescue ""
|
243
248
|
end
|
244
249
|
|
245
250
|
@parsed.verbatim = @verbatim
|
@@ -256,7 +261,7 @@ class ScientificNameParser
|
|
256
261
|
surrogate2 = /\b(spp|sp|nr|cf)[\.]?[\s]*$/i
|
257
262
|
is_surrogate = false
|
258
263
|
|
259
|
-
ai_index = pos.index(
|
264
|
+
ai_index = pos.index("annotation_identification")
|
260
265
|
if ai_index
|
261
266
|
ai = name[pos[ai_index - 1]..pos[ai_index + 1]]
|
262
267
|
is_surrogate = true if ai.match(/^(spp|cf|sp|nr)/)
|
@@ -267,15 +272,13 @@ class ScientificNameParser
|
|
267
272
|
end
|
268
273
|
|
269
274
|
def self.add_rank_to_canonical(parsed)
|
270
|
-
parts = parsed[:canonical].split(
|
275
|
+
parts = parsed[:canonical].split(" ")
|
271
276
|
name_ary = parts[0..1]
|
272
277
|
parsed[:details][0][:infraspecies].each do |data|
|
273
278
|
infrasp = data[:string]
|
274
279
|
rank = data[:rank]
|
275
|
-
name_ary << (rank && rank !=
|
280
|
+
name_ary << (rank && rank != "n/a" ? "#{rank} #{infrasp}" : infrasp)
|
276
281
|
end
|
277
|
-
parsed[:canonical] = name_ary.join(
|
282
|
+
parsed[:canonical] = name_ary.join(" ")
|
278
283
|
end
|
279
|
-
|
280
284
|
end
|
281
|
-
|