biodiversity19 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +6 -0
- data/CHANGELOG +9 -7
- data/Gemfile +8 -7
- data/Gemfile.lock +27 -27
- data/README.md +146 -0
- data/VERSION +1 -1
- data/bin/parserver +1 -1
- data/lib/biodiversity/parser.rb +77 -30
- data/spec/parser/scientific_name.spec.rb +28 -16
- metadata +55 -22
- data/README.rdoc +0 -109
data/.travis.yml
ADDED
data/CHANGELOG
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
2.1.0 -- added ScientificNameParser.version method
|
2
|
+
|
1
3
|
2.0.0 -- backward incompatible change in parserver, therefore new major number.
|
2
4
|
In parserver removed option --output=canonical_with_rank, instead added -r
|
3
|
-
option which allows to have canonical with rank with either json or canonical
|
5
|
+
option which allows to have canonical with rank with either json or canonical
|
4
6
|
outputs
|
5
7
|
|
6
|
-
1.2.0 -- changed method invocation signature ScientificNameParser.new
|
8
|
+
1.2.0 -- changed method invocation signature ScientificNameParser.new
|
7
9
|
Now it can take options
|
8
10
|
|
9
11
|
1.1.3 -- added 'fo' as rank
|
10
12
|
|
11
|
-
1.1.2 -- static method for fixins all-caps canonical names, fixing caps
|
13
|
+
1.1.2 -- static method for fixins all-caps canonical names, fixing caps
|
12
14
|
for authors words, ampersand instead of 'et' in normalization
|
13
15
|
|
14
16
|
1.1.1 -- more multi-uninomials cases, expanded viruses detection, added
|
@@ -23,14 +25,14 @@ cf., sp. etc), bug fixes, more robust salvage mode
|
|
23
25
|
|
24
26
|
1.0.15 -- additional rules added for names ending with ssp. sp sp. and cf.
|
25
27
|
|
26
|
-
1.0.14 -- canonical forms had allowed ë as a character until now. After this
|
27
|
-
version the only utf-8 character allowed in canonical forms should be the
|
28
|
+
1.0.14 -- canonical forms had allowed ë as a character until now. After this
|
29
|
+
version the only utf-8 character allowed in canonical forms should be the
|
28
30
|
multiplication sign for hybrids.
|
29
31
|
|
30
|
-
1.0.13 -- canonical forms for cf. aff. qualifiers are modified: canonical for
|
32
|
+
1.0.13 -- canonical forms for cf. aff. qualifiers are modified: canonical for
|
31
33
|
'Aus cf. bus' is now 'Aus bus'; canonical for 'Aus aff. bus' is now 'Aus'.
|
32
34
|
Ranks at the end of the name like 'var', 'ssp', 'spp' are considered junk and
|
33
35
|
are ignored
|
34
36
|
|
35
|
-
1.0.12 -- bug is fixed which prevented 'Cucurbita pepo' be parsed correctly,
|
37
|
+
1.0.12 -- bug is fixed which prevented 'Cucurbita pepo' be parsed correctly,
|
36
38
|
f., forma, fr. are now treated as any other ranks.
|
data/Gemfile
CHANGED
@@ -1,14 +1,15 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
|
-
gem
|
4
|
-
gem
|
5
|
-
gem
|
3
|
+
gem 'rake', '~> 10.0'
|
4
|
+
gem 'treetop', '~> 1.4'
|
5
|
+
gem 'parallel', '~> 0.6'
|
6
|
+
gem 'unicode_utils', '~> 1.4'
|
6
7
|
|
7
8
|
group :development do
|
8
|
-
gem
|
9
|
+
gem 'debugger', '~> 1.5'
|
10
|
+
gem 'jeweler', '~> 1.8'
|
9
11
|
end
|
10
12
|
|
11
13
|
group :test do
|
12
|
-
gem
|
13
|
-
gem "rspec"
|
14
|
+
gem 'rspec', '~> 2.13'
|
14
15
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,36 +1,35 @@
|
|
1
1
|
GEM
|
2
|
-
remote:
|
2
|
+
remote: https://rubygems.org/
|
3
3
|
specs:
|
4
4
|
columnize (0.3.6)
|
5
|
-
debugger (1.
|
5
|
+
debugger (1.5.0)
|
6
6
|
columnize (>= 0.3.1)
|
7
|
-
debugger-linecache (~> 1.
|
8
|
-
debugger-ruby_core_source (~> 1.
|
9
|
-
debugger-linecache (1.
|
10
|
-
|
11
|
-
|
12
|
-
diff-lcs (1.1.3)
|
7
|
+
debugger-linecache (~> 1.2.0)
|
8
|
+
debugger-ruby_core_source (~> 1.2.0)
|
9
|
+
debugger-linecache (1.2.0)
|
10
|
+
debugger-ruby_core_source (1.2.0)
|
11
|
+
diff-lcs (1.2.1)
|
13
12
|
git (1.2.5)
|
14
13
|
jeweler (1.8.4)
|
15
14
|
bundler (~> 1.0)
|
16
15
|
git (>= 1.2.5)
|
17
16
|
rake
|
18
17
|
rdoc
|
19
|
-
json (1.7.
|
20
|
-
parallel (0.
|
18
|
+
json (1.7.7)
|
19
|
+
parallel (0.6.2)
|
21
20
|
polyglot (0.3.3)
|
22
|
-
rake (0.
|
23
|
-
rdoc (
|
21
|
+
rake (10.0.3)
|
22
|
+
rdoc (4.0.0)
|
24
23
|
json (~> 1.4)
|
25
|
-
rspec (2.
|
26
|
-
rspec-core (~> 2.
|
27
|
-
rspec-expectations (~> 2.
|
28
|
-
rspec-mocks (~> 2.
|
29
|
-
rspec-core (2.
|
30
|
-
rspec-expectations (2.
|
31
|
-
diff-lcs (
|
32
|
-
rspec-mocks (2.
|
33
|
-
treetop (1.4.
|
24
|
+
rspec (2.13.0)
|
25
|
+
rspec-core (~> 2.13.0)
|
26
|
+
rspec-expectations (~> 2.13.0)
|
27
|
+
rspec-mocks (~> 2.13.0)
|
28
|
+
rspec-core (2.13.1)
|
29
|
+
rspec-expectations (2.13.0)
|
30
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
31
|
+
rspec-mocks (2.13.0)
|
32
|
+
treetop (1.4.12)
|
34
33
|
polyglot
|
35
34
|
polyglot (>= 0.3.1)
|
36
35
|
unicode_utils (1.4.0)
|
@@ -39,9 +38,10 @@ PLATFORMS
|
|
39
38
|
ruby
|
40
39
|
|
41
40
|
DEPENDENCIES
|
42
|
-
debugger
|
43
|
-
jeweler
|
44
|
-
parallel
|
45
|
-
|
46
|
-
|
47
|
-
|
41
|
+
debugger (~> 1.5)
|
42
|
+
jeweler (~> 1.8)
|
43
|
+
parallel (~> 0.6)
|
44
|
+
rake (~> 10.0)
|
45
|
+
rspec (~> 2.13)
|
46
|
+
treetop (~> 1.4)
|
47
|
+
unicode_utils (~> 1.4)
|
data/README.md
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
Biodiversity
|
2
|
+
============
|
3
|
+
|
4
|
+
[![Gem Version][1]][2]
|
5
|
+
[![Continuous Integration Status][3]][4]
|
6
|
+
[![CodePolice][5]][6]
|
7
|
+
[![Dependency Status][7]][8]
|
8
|
+
|
9
|
+
Parses taxonomic scientific name and breaks it into semantic elements.
|
10
|
+
|
11
|
+
Installation
|
12
|
+
------------
|
13
|
+
|
14
|
+
*WARNING:* Do not use Ruby 1.8.7 -- it is outdated. The
|
15
|
+
biodiversity gem for Ruby 1.8.7 is not getting updated anymore
|
16
|
+
|
17
|
+
sudo gem install biodiversity19 #for ruby 1.9.x
|
18
|
+
sudo gem install biodiversity #for ruby 1.8.x
|
19
|
+
|
20
|
+
Example usage
|
21
|
+
-------------
|
22
|
+
|
23
|
+
### As a command line script
|
24
|
+
|
25
|
+
You can parse a file with taxonomic names from the command line.
|
26
|
+
The file should contain one scientific name per line.
|
27
|
+
|
28
|
+
nnparse file_with_names
|
29
|
+
|
30
|
+
### As a socket server
|
31
|
+
|
32
|
+
If you do not use Ruby and need fast access to the parser functionality
|
33
|
+
you can use a socket server
|
34
|
+
|
35
|
+
parserver
|
36
|
+
|
37
|
+
parserver -h
|
38
|
+
Usage: parserver [options]
|
39
|
+
|
40
|
+
-r, --canonical_with_rank Adds infraspecies rank to canonical forms
|
41
|
+
|
42
|
+
-o, --output=output Specifies the type of the output:
|
43
|
+
json - parsed results in json
|
44
|
+
canonical - canonical form only
|
45
|
+
Default: json
|
46
|
+
|
47
|
+
-p, --port=port Specifies the port number
|
48
|
+
Default: 4334
|
49
|
+
|
50
|
+
-h, --help Show this help message.
|
51
|
+
|
52
|
+
parserver --output=canonical
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
With default settings you can access parserver via port 4334 using a
|
57
|
+
socket client library of your programming language. You can find
|
58
|
+
[socket client script example][9] in the examples directory of the gem.
|
59
|
+
|
60
|
+
If you want to check if socket server works for you:
|
61
|
+
|
62
|
+
#run server in one terminal
|
63
|
+
parserver
|
64
|
+
|
65
|
+
#in another terminal window type
|
66
|
+
telnet localhost 4334
|
67
|
+
|
68
|
+
If you enter a line with a scientific name, the server will send you back
|
69
|
+
parsed information in json format.
|
70
|
+
|
71
|
+
To stop the telnet client, type any of `end`, `exit`, `q`, `.` instead
|
72
|
+
of scientific name
|
73
|
+
|
74
|
+
$ telnet localhost 4334
|
75
|
+
Trying ::1...
|
76
|
+
Connected to localhost.
|
77
|
+
Escape character is '^]'.
|
78
|
+
Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
|
79
|
+
{"scientificName":{"canonical":"Acacia abyssinica calophylla"...}}
|
80
|
+
end
|
81
|
+
|
82
|
+
### As a library
|
83
|
+
|
84
|
+
You can use it as a library in Ruby, JRuby etc.
|
85
|
+
|
86
|
+
require 'biodiversity'
|
87
|
+
|
88
|
+
parser = ScientificNameParser.new
|
89
|
+
|
90
|
+
#to find version number
|
91
|
+
ScientificNameParser.version
|
92
|
+
|
93
|
+
# to fix capitalization in canonicals
|
94
|
+
ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA")
|
95
|
+
# Output: Quercus (Quercus) alba
|
96
|
+
|
97
|
+
# to parse a scientific name into a ruby hash
|
98
|
+
parser.parse("Plantago major")
|
99
|
+
|
100
|
+
#to get json representation
|
101
|
+
parser.parse("Plantago").to_json
|
102
|
+
#or
|
103
|
+
parser.parse("Plantago")
|
104
|
+
parser.all_json
|
105
|
+
|
106
|
+
# to clean name up
|
107
|
+
parser.parse(" Plantago major ")[:scientificName][:normalized]
|
108
|
+
|
109
|
+
# to get only cleaned up latin part of the name
|
110
|
+
parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:canonical]
|
111
|
+
|
112
|
+
# to get detailed information about elements of the name
|
113
|
+
parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
|
114
|
+
|
115
|
+
|
116
|
+
To parse using several CPUs (4 seem to be optimal)
|
117
|
+
|
118
|
+
parser = ParallelParser.new
|
119
|
+
# ParallelParser.new(4) will try to run 4 processes if hardware allows
|
120
|
+
array_of_names = ["Betula alba", "Homo sapiens"....]
|
121
|
+
parser.parse(array_of_names)
|
122
|
+
# Output: {"Betula alba" => {:scientificName...}, "Homo sapiens" => {:scientificName...}, ...}
|
123
|
+
|
124
|
+
The parallel parser takes a list of names and returns a hash with names as keys and parsed data as values.
|
125
|
+
|
126
|
+
To get canonicals with ranks for infraspecific epithets:
|
127
|
+
|
128
|
+
parser = ScientificNameParser.new(canonical_with_rank: true)
|
129
|
+
parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
|
130
|
+
# Output: Cola cordifolia var. puberula
|
131
|
+
|
132
|
+
To resolve an LSID and get back an RDF file
|
133
|
+
|
134
|
+
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
135
|
+
|
136
|
+
|
137
|
+
|
138
|
+
[1]: https://badge.fury.io/rb/biodiversity19.png
|
139
|
+
[2]: http://badge.fury.io/rb/biodiversity19
|
140
|
+
[3]: https://secure.travis-ci.org/GlobalNamesArchitecture/biodiversity.png
|
141
|
+
[4]: http://travis-ci.org/GlobalNamesArchitecture/biodiversity
|
142
|
+
[5]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity.png
|
143
|
+
[6]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity
|
144
|
+
[7]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity.png
|
145
|
+
[8]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity
|
146
|
+
[9]: https://github.com/GlobalNamesArchitecture/biodiversity/blob/master/examples/socket_client.rb
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.1.0
|
data/bin/parserver
CHANGED
data/lib/biodiversity/parser.rb
CHANGED
@@ -9,23 +9,44 @@ require 'json'
|
|
9
9
|
module PreProcessor
|
10
10
|
NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
|
11
11
|
TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
|
12
|
-
TAXON_CONCEPTS2 = /\s+
|
13
|
-
|
12
|
+
TAXON_CONCEPTS2 = /\s+
|
13
|
+
(\(?s\.\s?s\.|
|
14
|
+
\(?s\.\s?l\.|
|
15
|
+
\(?s\.\s?str\.|
|
16
|
+
\(?s\.\s?lat\.|
|
17
|
+
sec\.|sec|near)\b.*$/x
|
18
|
+
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
|
14
19
|
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
15
|
-
LAST_WORD_JUNK = /(,\s*|\s+)
|
16
|
-
|
20
|
+
LAST_WORD_JUNK = /(,\s*|\s+)
|
21
|
+
(spp\.|spp|var\.|
|
22
|
+
var|von|van|ined\.|
|
23
|
+
ined|sensu|new|non|nec|
|
24
|
+
nudum|cf\.|cf|sp\.|sp|
|
25
|
+
ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
|
26
|
+
|
17
27
|
def self.clean(a_string)
|
18
|
-
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
28
|
+
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
19
29
|
TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
20
30
|
a_string = a_string.gsub(i, '')
|
21
31
|
end
|
22
32
|
a_string = a_string.tr('ſ','s') #old 's'
|
23
33
|
a_string
|
24
|
-
end
|
34
|
+
end
|
25
35
|
end
|
26
36
|
|
37
|
+
# Public: Parser which runs in parallel.
|
38
|
+
#
|
39
|
+
# Examples
|
40
|
+
#
|
41
|
+
# parser = ParallelParser.new(4)
|
42
|
+
# parser.parse(['Betula L.', 'Pardosa moesta'])
|
27
43
|
class ParallelParser
|
28
44
|
|
45
|
+
# Public: Initialize ParallelParser.
|
46
|
+
#
|
47
|
+
# processes_num - an Integer to setup the number of processes (default: nil).
|
48
|
+
# If processes number is not set it will be determined
|
49
|
+
# automatically.
|
29
50
|
def initialize(processes_num = nil)
|
30
51
|
require 'parallel'
|
31
52
|
cpu_num
|
@@ -36,13 +57,32 @@ class ParallelParser
|
|
36
57
|
end
|
37
58
|
end
|
38
59
|
|
60
|
+
# Public: Parses an array of scientific names using several processes
|
61
|
+
# in parallel.
|
62
|
+
#
|
63
|
+
# Scientific names are deduplicated in the process, so every string is
|
64
|
+
# parsed only once.
|
65
|
+
#
|
66
|
+
# names_list - takes an Array of scientific names,
|
67
|
+
# each element should be a String.
|
68
|
+
#
|
69
|
+
# Examples
|
70
|
+
#
|
71
|
+
# parser = ParallelParser.new(4)
|
72
|
+
# parser.parse(['Homo sapiens L.', 'Quercus quercus'])
|
73
|
+
#
|
74
|
+
# Returns a Hash with scientific names as a key, and parsing results as
|
75
|
+
# a value.
|
39
76
|
def parse(names_list)
|
40
|
-
parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
|
77
|
+
parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
|
41
78
|
[n, parse_process(n)]
|
42
79
|
end
|
43
80
|
parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
|
44
81
|
end
|
45
82
|
|
83
|
+
# Public: Returns the number of cores/CPUs.
|
84
|
+
#
|
85
|
+
# Returns Integer of cores/CPUs.
|
46
86
|
def cpu_num
|
47
87
|
@cpu_num ||= Parallel.processor_count
|
48
88
|
end
|
@@ -61,23 +101,28 @@ end
|
|
61
101
|
# @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
|
62
102
|
# @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
|
63
103
|
# @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
|
64
|
-
# @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
|
104
|
+
# @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
|
105
|
+
# viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
|
65
106
|
# @parsed = nil
|
66
107
|
# end
|
67
108
|
# end
|
68
109
|
|
69
110
|
class ScientificNameParser
|
70
|
-
VERSION = open(File.join(File.dirname(__FILE__),
|
111
|
+
VERSION = open(File.join(File.dirname(__FILE__),
|
71
112
|
'..',
|
72
113
|
'..',
|
73
114
|
'VERSION')).readline.strip
|
74
115
|
|
75
116
|
FAILED_RESULT = ->(name) do
|
76
|
-
{ scientificName:
|
77
|
-
{ parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
|
117
|
+
{ scientificName:
|
118
|
+
{ parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
|
78
119
|
}
|
79
120
|
end
|
80
|
-
|
121
|
+
|
122
|
+
def self.version
|
123
|
+
VERSION
|
124
|
+
end
|
125
|
+
|
81
126
|
def self.fix_case(name_string)
|
82
127
|
name_ary = name_string.split(/\s+/)
|
83
128
|
words_num = name_ary.size
|
@@ -91,27 +136,27 @@ class ScientificNameParser
|
|
91
136
|
end
|
92
137
|
else
|
93
138
|
if name_ary[0].size > 1
|
94
|
-
word1 = UnicodeUtils.upcase(name_ary[0][0]) +
|
139
|
+
word1 = UnicodeUtils.upcase(name_ary[0][0]) +
|
95
140
|
UnicodeUtils.downcase(name_ary[0][1..-1])
|
96
141
|
else
|
97
142
|
word1 = name_ary[0]
|
98
143
|
end
|
99
144
|
if name_ary[1].match(/^\(/)
|
100
145
|
word2 = name_ary[1].gsub(/\)$/, '') + ')'
|
101
|
-
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
146
|
+
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
102
147
|
UnicodeUtils.downcase(word2[2..-1])
|
103
148
|
else
|
104
149
|
word2 = UnicodeUtils.downcase(name_ary[1])
|
105
150
|
end
|
106
|
-
res = word1 + ' ' +
|
107
|
-
word2 + ' ' +
|
151
|
+
res = word1 + ' ' +
|
152
|
+
word2 + ' ' +
|
108
153
|
name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
|
109
154
|
res.strip!
|
110
155
|
end
|
111
156
|
res
|
112
157
|
end
|
113
158
|
|
114
|
-
|
159
|
+
|
115
160
|
def initialize(opts = {})
|
116
161
|
@canonical_with_rank = !!opts[:canonical_with_rank]
|
117
162
|
@verbatim = ''
|
@@ -122,8 +167,10 @@ class ScientificNameParser
|
|
122
167
|
end
|
123
168
|
|
124
169
|
def virus?(a_string)
|
125
|
-
!!(a_string.match(/\sICTV\s*$/) ||
|
126
|
-
a_string.match(/\b(virus|viruses|
|
170
|
+
!!(a_string.match(/\sICTV\s*$/) ||
|
171
|
+
a_string.match(/\b(virus|viruses|
|
172
|
+
phage|phages|viroid|viroids|
|
173
|
+
satellite|satellites|prion|prions)\b/ix) ||
|
127
174
|
a_string.match(/[A-Z]?[a-z]+virus\b/))
|
128
175
|
end
|
129
176
|
|
@@ -134,24 +181,24 @@ class ScientificNameParser
|
|
134
181
|
def parsed
|
135
182
|
@parsed
|
136
183
|
end
|
137
|
-
|
184
|
+
|
138
185
|
def parse(a_string)
|
139
186
|
@verbatim = a_string.strip
|
140
187
|
a_string = PreProcessor::clean(a_string)
|
141
|
-
|
188
|
+
|
142
189
|
if virus?(a_string)
|
143
190
|
@parsed = { verbatim: a_string, virus: true }
|
144
191
|
elsif unknown_placement?(a_string)
|
145
192
|
@parsed = { verbatim: a_string }
|
146
193
|
else
|
147
194
|
begin
|
148
|
-
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
|
195
|
+
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
|
149
196
|
unless @parsed
|
150
197
|
index = @dirty.index || @clean.index
|
151
198
|
salvage_match = a_string[0..index].split(/\s+/)[0..-2]
|
152
199
|
salvage_string = salvage_match ? salvage_match.join(' ') : a_string
|
153
|
-
@parsed = @dirty.parse(salvage_string) ||
|
154
|
-
@canonical.parse(a_string) ||
|
200
|
+
@parsed = @dirty.parse(salvage_string) ||
|
201
|
+
@canonical.parse(a_string) ||
|
155
202
|
{ verbatim: a_string }
|
156
203
|
end
|
157
204
|
rescue
|
@@ -181,18 +228,18 @@ class ScientificNameParser
|
|
181
228
|
else
|
182
229
|
res.merge!(self)
|
183
230
|
end
|
184
|
-
if (canonical_with_rank &&
|
185
|
-
canonical.count(' ') > 1 &&
|
231
|
+
if (canonical_with_rank &&
|
232
|
+
canonical.count(' ') > 1 &&
|
186
233
|
res[:details][0][:infraspecies])
|
187
234
|
ScientificNameParser.add_rank_to_canonical(res)
|
188
235
|
end
|
189
236
|
res = {:scientificName => res}
|
190
237
|
end
|
191
|
-
|
238
|
+
|
192
239
|
def @parsed.pos_json
|
193
240
|
self.pos.to_json rescue ''
|
194
241
|
end
|
195
|
-
|
242
|
+
|
196
243
|
def @parsed.all_json
|
197
244
|
self.all.to_json rescue ''
|
198
245
|
end
|
@@ -200,7 +247,7 @@ class ScientificNameParser
|
|
200
247
|
@parsed.verbatim = @verbatim
|
201
248
|
@parsed.all(canonical_with_rank: @canonical_with_rank)
|
202
249
|
end
|
203
|
-
|
250
|
+
|
204
251
|
private
|
205
252
|
|
206
253
|
def self.add_rank_to_canonical(parsed)
|
@@ -213,6 +260,6 @@ class ScientificNameParser
|
|
213
260
|
end
|
214
261
|
parsed[:canonical] = name_ary.join(' ')
|
215
262
|
end
|
216
|
-
|
263
|
+
|
217
264
|
end
|
218
265
|
|
@@ -9,10 +9,14 @@ describe ScientificNameParser do
|
|
9
9
|
set_parser(ScientificNameParser.new)
|
10
10
|
end
|
11
11
|
|
12
|
+
it 'should return version number' do
|
13
|
+
ScientificNameParser.version.should =~ /^\d+\.\d+\.\d+/
|
14
|
+
end
|
15
|
+
|
12
16
|
it 'should ScientificNameParser::fix_case' do
|
13
|
-
names = [
|
14
|
-
["QUERCUS ALBA", "Quercus alba"],
|
15
|
-
["QUERCUS (QUERCUS) ALBA", "Quercus (Quercus) alba"],
|
17
|
+
names = [
|
18
|
+
["QUERCUS ALBA", "Quercus alba"],
|
19
|
+
["QUERCUS (QUERCUS) ALBA", "Quercus (Quercus) alba"],
|
16
20
|
["QÜERCUS", "Qüercus"],
|
17
21
|
["PARDOSA MOéSTA", "Pardosa moésta"],
|
18
22
|
]
|
@@ -20,16 +24,17 @@ describe ScientificNameParser do
|
|
20
24
|
ScientificNameParser::fix_case(name).should == capitalization
|
21
25
|
end
|
22
26
|
end
|
23
|
-
|
27
|
+
|
24
28
|
it 'should generate standardized json' do
|
25
29
|
read_test_file do |y|
|
26
30
|
JSON.load(json(y[:name])).should == JSON.load(y[:jsn]) unless y[:comment]
|
27
31
|
end
|
28
32
|
end
|
29
33
|
|
30
|
-
|
34
|
+
|
31
35
|
# it 'should generate new test_file' do
|
32
|
-
# new_test = open(File.expand_path(dir +
|
36
|
+
# new_test = open(File.expand_path(dir +
|
37
|
+
# "../../spec/parser/test_data_new.txt"),'w')
|
33
38
|
# read_test_file do |y|
|
34
39
|
# if y[:comment]
|
35
40
|
# new_test.write y[:comment]
|
@@ -40,10 +45,12 @@ describe ScientificNameParser do
|
|
40
45
|
# end
|
41
46
|
# end
|
42
47
|
# end
|
43
|
-
|
48
|
+
|
44
49
|
it 'should generate reasonable output if parser failed' do
|
45
50
|
sn = 'ddd sljlkj 3223452432'
|
46
|
-
json(sn).should == '{"scientificName":{"parsed":false,
|
51
|
+
json(sn).should == '{"scientificName":{"parsed":false,' +
|
52
|
+
'"parser_version":"test_version","verbatim":"ddd sljlkj 3223452432"}}'
|
53
|
+
end
|
47
54
|
|
48
55
|
it "should show version when the flag :show_version set to true" do
|
49
56
|
parse('Homo sapiens')[:scientificName][:parser_version].should_not be_nil
|
@@ -58,7 +65,7 @@ describe ScientificNameParser do
|
|
58
65
|
end
|
59
66
|
end
|
60
67
|
|
61
|
-
describe "ScientificNameParser with ranked canonicals" do
|
68
|
+
describe "ScientificNameParser with ranked canonicals" do
|
62
69
|
before(:all) do
|
63
70
|
@parser = ScientificNameParser.new(canonical_with_rank: true)
|
64
71
|
end
|
@@ -66,10 +73,11 @@ describe "ScientificNameParser with ranked canonicals" do
|
|
66
73
|
it 'should not influence output for uninomials and binomials' do
|
67
74
|
data = [
|
68
75
|
['Ekbainacanthus Yakowlew 1902','Ekbainacanthus'],
|
69
|
-
['Ekboarmia sagnesi herrerai Exposito 2007',
|
76
|
+
['Ekboarmia sagnesi herrerai Exposito 2007',
|
77
|
+
'Ekboarmia sagnesi herrerai'],
|
70
78
|
['Ekboarmia holli Oberthür', 'Ekboarmia holli']]
|
71
79
|
|
72
|
-
data.each do |d|
|
80
|
+
data.each do |d|
|
73
81
|
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
74
82
|
parsed.should == d[1]
|
75
83
|
end
|
@@ -77,13 +85,16 @@ describe "ScientificNameParser with ranked canonicals" do
|
|
77
85
|
|
78
86
|
it 'should preserve rank for ranked multinomials' do
|
79
87
|
data = [
|
80
|
-
['Cola cordifolia var. puberula A. Chev.',
|
81
|
-
|
82
|
-
['
|
88
|
+
['Cola cordifolia var. puberula A. Chev.',
|
89
|
+
'Cola cordifolia var. puberula'],
|
90
|
+
['Abies homolepis forma umbilicata (Mayr) Schelle',
|
91
|
+
'Abies homolepis forma umbilicata'],
|
92
|
+
['Quercus ilex ssp. ballota (Desf.) Samp',
|
93
|
+
'Quercus ilex ssp. ballota']
|
83
94
|
]
|
84
95
|
data.each do |d|
|
85
96
|
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
86
|
-
parsed.should == d[1]
|
97
|
+
parsed.should == d[1]
|
87
98
|
end
|
88
99
|
end
|
89
100
|
|
@@ -115,7 +126,8 @@ describe ParallelParser do
|
|
115
126
|
res.keys.size.should == names.size
|
116
127
|
end
|
117
128
|
|
118
|
-
it "should have parsed name in native ruby format and in returned as
|
129
|
+
it "should have parsed name in native ruby format and in returned as \
|
130
|
+
a hash with name as a key and parsed data as value" do
|
119
131
|
names = []
|
120
132
|
read_test_file { |n| names << (n[:name]) if n[:name] }
|
121
133
|
names.uniq!
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biodiversity19
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,72 +9,104 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '10.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '10.0'
|
14
30
|
- !ruby/object:Gem::Dependency
|
15
31
|
name: treetop
|
16
32
|
requirement: !ruby/object:Gem::Requirement
|
17
33
|
none: false
|
18
34
|
requirements:
|
19
|
-
- -
|
35
|
+
- - ~>
|
20
36
|
- !ruby/object:Gem::Version
|
21
|
-
version: '
|
37
|
+
version: '1.4'
|
22
38
|
type: :runtime
|
23
39
|
prerelease: false
|
24
40
|
version_requirements: !ruby/object:Gem::Requirement
|
25
41
|
none: false
|
26
42
|
requirements:
|
27
|
-
- -
|
43
|
+
- - ~>
|
28
44
|
- !ruby/object:Gem::Version
|
29
|
-
version: '
|
45
|
+
version: '1.4'
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: parallel
|
32
48
|
requirement: !ruby/object:Gem::Requirement
|
33
49
|
none: false
|
34
50
|
requirements:
|
35
|
-
- -
|
51
|
+
- - ~>
|
36
52
|
- !ruby/object:Gem::Version
|
37
|
-
version: '0'
|
53
|
+
version: '0.6'
|
38
54
|
type: :runtime
|
39
55
|
prerelease: false
|
40
56
|
version_requirements: !ruby/object:Gem::Requirement
|
41
57
|
none: false
|
42
58
|
requirements:
|
43
|
-
- -
|
59
|
+
- - ~>
|
44
60
|
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
61
|
+
version: '0.6'
|
46
62
|
- !ruby/object:Gem::Dependency
|
47
63
|
name: unicode_utils
|
48
64
|
requirement: !ruby/object:Gem::Requirement
|
49
65
|
none: false
|
50
66
|
requirements:
|
51
|
-
- -
|
67
|
+
- - ~>
|
52
68
|
- !ruby/object:Gem::Version
|
53
|
-
version: '
|
69
|
+
version: '1.4'
|
54
70
|
type: :runtime
|
55
71
|
prerelease: false
|
56
72
|
version_requirements: !ruby/object:Gem::Requirement
|
57
73
|
none: false
|
58
74
|
requirements:
|
59
|
-
- -
|
75
|
+
- - ~>
|
60
76
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
77
|
+
version: '1.4'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: debugger
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '1.5'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '1.5'
|
62
94
|
- !ruby/object:Gem::Dependency
|
63
95
|
name: jeweler
|
64
96
|
requirement: !ruby/object:Gem::Requirement
|
65
97
|
none: false
|
66
98
|
requirements:
|
67
|
-
- -
|
99
|
+
- - ~>
|
68
100
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
101
|
+
version: '1.8'
|
70
102
|
type: :development
|
71
103
|
prerelease: false
|
72
104
|
version_requirements: !ruby/object:Gem::Requirement
|
73
105
|
none: false
|
74
106
|
requirements:
|
75
|
-
- -
|
107
|
+
- - ~>
|
76
108
|
- !ruby/object:Gem::Version
|
77
|
-
version: '
|
109
|
+
version: '1.8'
|
78
110
|
- !ruby/object:Gem::Dependency
|
79
111
|
name: treetop
|
80
112
|
requirement: !ruby/object:Gem::Requirement
|
@@ -131,15 +163,16 @@ executables:
|
|
131
163
|
extensions: []
|
132
164
|
extra_rdoc_files:
|
133
165
|
- LICENSE
|
134
|
-
- README.
|
166
|
+
- README.md
|
135
167
|
files:
|
136
168
|
- .document
|
137
169
|
- .rvmrc
|
170
|
+
- .travis.yml
|
138
171
|
- CHANGELOG
|
139
172
|
- Gemfile
|
140
173
|
- Gemfile.lock
|
141
174
|
- LICENSE
|
142
|
-
- README.
|
175
|
+
- README.md
|
143
176
|
- Rakefile
|
144
177
|
- VERSION
|
145
178
|
- bin/nnparse
|
@@ -178,7 +211,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
178
211
|
version: '0'
|
179
212
|
segments:
|
180
213
|
- 0
|
181
|
-
hash:
|
214
|
+
hash: 1065169335698854656
|
182
215
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
183
216
|
none: false
|
184
217
|
requirements:
|
@@ -187,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
187
220
|
version: '0'
|
188
221
|
requirements: []
|
189
222
|
rubyforge_project:
|
190
|
-
rubygems_version: 1.8.
|
223
|
+
rubygems_version: 1.8.25
|
191
224
|
signing_key:
|
192
225
|
specification_version: 3
|
193
226
|
summary: Parser of scientific names
|
data/README.rdoc
DELETED
@@ -1,109 +0,0 @@
|
|
1
|
-
= Biodiversity
|
2
|
-
|
3
|
-
{<img src="https://codeclimate.com/badge.png" />}[https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity]
|
4
|
-
|
5
|
-
Parses taxonomic scientific name and breaks it into semantic elements.
|
6
|
-
|
7
|
-
== Installation
|
8
|
-
|
9
|
-
To install gem you need RubyGems >= 1.3.6
|
10
|
-
|
11
|
-
$ sudo gem install biodiversity #for ruby 1.8.x
|
12
|
-
$ sudo gem install biodiversity19 #for ruby 1.9.x
|
13
|
-
|
14
|
-
== Example usage
|
15
|
-
|
16
|
-
=== As a command line script
|
17
|
-
|
18
|
-
You can parse file with taxonomic names from command line. File should contain one scientific name per line
|
19
|
-
|
20
|
-
nnparser file_with_names
|
21
|
-
|
22
|
-
=== As a socket server
|
23
|
-
|
24
|
-
If you do not use ruby and need a fast access to the parser functionality you can use a socket server
|
25
|
-
|
26
|
-
parserver
|
27
|
-
|
28
|
-
options:
|
29
|
-
|
30
|
-
parserver --output=canonical
|
31
|
-
|
32
|
-
to return a canonical form of the name string
|
33
|
-
|
34
|
-
parserver --port 5555
|
35
|
-
|
36
|
-
run socket server on a different port
|
37
|
-
|
38
|
-
parserver --canonical_with_rank
|
39
|
-
|
40
|
-
to add rank to canonical forms with infraspecific epithet, if it is given
|
41
|
-
|
42
|
-
Then you can access it via 4334 port using a socket client library of your programming language. You can find socket client script example in the examples directory of the gem.
|
43
|
-
|
44
|
-
If you want to check if socket server works for you:
|
45
|
-
|
46
|
-
#run server in one terminal
|
47
|
-
parserver
|
48
|
-
|
49
|
-
#in another terminal window type
|
50
|
-
telnet localhost 4334
|
51
|
-
|
52
|
-
If you enter a line with a scientific name server will send you back parsed information in json format.
|
53
|
-
|
54
|
-
To stop telnet client type any of 'end','exit','q', '.' (without quotes) instead of scientific name
|
55
|
-
|
56
|
-
$ telnet localhost 4334
|
57
|
-
Trying ::1...
|
58
|
-
Connected to localhost.
|
59
|
-
Escape character is '^]'.
|
60
|
-
Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
|
61
|
-
{"scientificName":{"canonical":"Acacia abyssinica calophylla","parsed":true,"parser_run":1,"verbatim":"Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan\r\n","positions":{"0":["genus",6],"18":["author_word",25],"29":["author_word",35],"7":["species",17],"41":["infraspecies",51],"52":["author_word",58]},"hybrid":false,"normalized":"Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan","details":[{"species":{"basionymAuthorTeam":{"exAuthorTeam":{"author":["Benth."],"authorTeam":"Benth."},"author":["Hochst."],"authorTeam":"Hochst."},"string":"abyssinica","authorship":"Hochst. ex Benth."},"infraspecies":[{"basionymAuthorTeam":{"author":["Brenan"],"authorTeam":"Brenan"},"string":"calophylla","rank":"ssp.","authorship":"Brenan"}],"genus":{"string":"Acacia"}}]}}
|
62
|
-
|
63
|
-
|
64
|
-
=== As a library
|
65
|
-
|
66
|
-
You can use it as a library
|
67
|
-
|
68
|
-
require 'biodiversity'
|
69
|
-
|
70
|
-
parser = ScientificNameParser.new
|
71
|
-
|
72
|
-
# to fix capitalization in canonicals
|
73
|
-
ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA") # Quercus (Quercus) alba
|
74
|
-
|
75
|
-
# to parse a scientific name into a ruby hash
|
76
|
-
parser.parse("Plantago major")
|
77
|
-
|
78
|
-
#to get json representation
|
79
|
-
parser.parse("Plantago").to_json
|
80
|
-
#or
|
81
|
-
parser.parse("Plantago")
|
82
|
-
parser.all_json
|
83
|
-
|
84
|
-
# to clean name up
|
85
|
-
parser.parse(" Plantago major ")[:scientificName][:normalized]
|
86
|
-
|
87
|
-
# to get only cleaned up latin part of the name
|
88
|
-
parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:canonical]
|
89
|
-
|
90
|
-
# to get detailed information about elements of the name
|
91
|
-
parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
|
92
|
-
|
93
|
-
# to parse using several CPUs (4 seem to be optimal)
|
94
|
-
parser = ParallelParser.new # ParallelParser.new(4) will try to run 4 processes if hardware allows
|
95
|
-
array_of_names = ["Betula alba", "Homo sapiens"....]
|
96
|
-
parser.parse(array_of_names) # -> {"Betula alba" => {:scientificName...}, "Homo sapiens" => {:scientificName...}, ...}
|
97
|
-
|
98
|
-
parallel parser takes list of names and returns back a hash with names as keys and parsed data as values
|
99
|
-
|
100
|
-
# to resolve lsid and get back RDF file
|
101
|
-
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
102
|
-
|
103
|
-
# to get canonicals with ranks for infraspecific epithets:
|
104
|
-
parser = ScientificNameParser.new(canonical_with_rank: true)
|
105
|
-
parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
|
106
|
-
# should get 'Cola cordifolia var. puberula'
|
107
|
-
|
108
|
-
Copyright (c) 2009-2011 Marine Biological Laboratory. See LICENSE.txt for
|
109
|
-
further details.
|