biodiversity19 2.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +6 -0
- data/CHANGELOG +9 -7
- data/Gemfile +8 -7
- data/Gemfile.lock +27 -27
- data/README.md +146 -0
- data/VERSION +1 -1
- data/bin/parserver +1 -1
- data/lib/biodiversity/parser.rb +77 -30
- data/spec/parser/scientific_name.spec.rb +28 -16
- metadata +55 -22
- data/README.rdoc +0 -109
data/.travis.yml
ADDED
data/CHANGELOG
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
+
2.1.0 -- added ScientificNameParser.version method
|
2
|
+
|
1
3
|
2.0.0 -- backward incompatible change in parserver, therefore new major number.
|
2
4
|
In parserver removed option --output=canonical_with_rank, instead added -r
|
3
|
-
option which allows to have canonical with rank with either json or canonical
|
5
|
+
option which allows to have canonical with rank with either json or canonical
|
4
6
|
outputs
|
5
7
|
|
6
|
-
1.2.0 -- changed method invocation signature ScientificNameParser.new
|
8
|
+
1.2.0 -- changed method invocation signature ScientificNameParser.new
|
7
9
|
Now it can take options
|
8
10
|
|
9
11
|
1.1.3 -- added 'fo' as rank
|
10
12
|
|
11
|
-
1.1.2 -- static method for fixins all-caps canonical names, fixing caps
|
13
|
+
1.1.2 -- static method for fixing all-caps canonical names, fixing caps
|
12
14
|
for authors words, ampersand instead of 'et' in normalization
|
13
15
|
|
14
16
|
1.1.1 -- more multi-uninomials cases, expanded viruses detection, added
|
@@ -23,14 +25,14 @@ cf., sp. etc), bug fixes, more robust salvage mode
|
|
23
25
|
|
24
26
|
1.0.15 -- additional rules added for names ending with ssp. sp sp. and cf.
|
25
27
|
|
26
|
-
1.0.14 -- canonical forms had allowed ë as a character until now. After this
|
27
|
-
version the only utf-8 character allowed in canonical forms should be the
|
28
|
+
1.0.14 -- canonical forms had allowed ë as a character until now. After this
|
29
|
+
version the only utf-8 character allowed in canonical forms should be the
|
28
30
|
multiplication sign for hybrids.
|
29
31
|
|
30
|
-
1.0.13 -- canonical forms for cf. aff. qualifiers are modified: canonical for
|
32
|
+
1.0.13 -- canonical forms for cf. aff. qualifiers are modified: canonical for
|
31
33
|
'Aus cf. bus' is now 'Aus bus'; canonical for 'Aus aff. bus' is now 'Aus'.
|
32
34
|
Ranks at the end of the name like 'var', 'ssp', 'spp' are considered junk and
|
33
35
|
are ignored
|
34
36
|
|
35
|
-
1.0.12 -- bug is fixed which prevented 'Cucurbita pepo' be parsed correctly,
|
37
|
+
1.0.12 -- bug is fixed which prevented 'Cucurbita pepo' be parsed correctly,
|
36
38
|
f., forma, fr. are now treated as any other ranks.
|
data/Gemfile
CHANGED
@@ -1,14 +1,15 @@
|
|
1
|
-
source
|
1
|
+
source 'https://rubygems.org'
|
2
2
|
|
3
|
-
gem
|
4
|
-
gem
|
5
|
-
gem
|
3
|
+
gem 'rake', '~> 10.0'
|
4
|
+
gem 'treetop', '~> 1.4'
|
5
|
+
gem 'parallel', '~> 0.6'
|
6
|
+
gem 'unicode_utils', '~> 1.4'
|
6
7
|
|
7
8
|
group :development do
|
8
|
-
gem
|
9
|
+
gem 'debugger', '~> 1.5'
|
10
|
+
gem 'jeweler', '~> 1.8'
|
9
11
|
end
|
10
12
|
|
11
13
|
group :test do
|
12
|
-
gem
|
13
|
-
gem "rspec"
|
14
|
+
gem 'rspec', '~> 2.13'
|
14
15
|
end
|
data/Gemfile.lock
CHANGED
@@ -1,36 +1,35 @@
|
|
1
1
|
GEM
|
2
|
-
remote:
|
2
|
+
remote: https://rubygems.org/
|
3
3
|
specs:
|
4
4
|
columnize (0.3.6)
|
5
|
-
debugger (1.
|
5
|
+
debugger (1.5.0)
|
6
6
|
columnize (>= 0.3.1)
|
7
|
-
debugger-linecache (~> 1.
|
8
|
-
debugger-ruby_core_source (~> 1.
|
9
|
-
debugger-linecache (1.
|
10
|
-
|
11
|
-
|
12
|
-
diff-lcs (1.1.3)
|
7
|
+
debugger-linecache (~> 1.2.0)
|
8
|
+
debugger-ruby_core_source (~> 1.2.0)
|
9
|
+
debugger-linecache (1.2.0)
|
10
|
+
debugger-ruby_core_source (1.2.0)
|
11
|
+
diff-lcs (1.2.1)
|
13
12
|
git (1.2.5)
|
14
13
|
jeweler (1.8.4)
|
15
14
|
bundler (~> 1.0)
|
16
15
|
git (>= 1.2.5)
|
17
16
|
rake
|
18
17
|
rdoc
|
19
|
-
json (1.7.
|
20
|
-
parallel (0.
|
18
|
+
json (1.7.7)
|
19
|
+
parallel (0.6.2)
|
21
20
|
polyglot (0.3.3)
|
22
|
-
rake (0.
|
23
|
-
rdoc (
|
21
|
+
rake (10.0.3)
|
22
|
+
rdoc (4.0.0)
|
24
23
|
json (~> 1.4)
|
25
|
-
rspec (2.
|
26
|
-
rspec-core (~> 2.
|
27
|
-
rspec-expectations (~> 2.
|
28
|
-
rspec-mocks (~> 2.
|
29
|
-
rspec-core (2.
|
30
|
-
rspec-expectations (2.
|
31
|
-
diff-lcs (
|
32
|
-
rspec-mocks (2.
|
33
|
-
treetop (1.4.
|
24
|
+
rspec (2.13.0)
|
25
|
+
rspec-core (~> 2.13.0)
|
26
|
+
rspec-expectations (~> 2.13.0)
|
27
|
+
rspec-mocks (~> 2.13.0)
|
28
|
+
rspec-core (2.13.1)
|
29
|
+
rspec-expectations (2.13.0)
|
30
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
31
|
+
rspec-mocks (2.13.0)
|
32
|
+
treetop (1.4.12)
|
34
33
|
polyglot
|
35
34
|
polyglot (>= 0.3.1)
|
36
35
|
unicode_utils (1.4.0)
|
@@ -39,9 +38,10 @@ PLATFORMS
|
|
39
38
|
ruby
|
40
39
|
|
41
40
|
DEPENDENCIES
|
42
|
-
debugger
|
43
|
-
jeweler
|
44
|
-
parallel
|
45
|
-
|
46
|
-
|
47
|
-
|
41
|
+
debugger (~> 1.5)
|
42
|
+
jeweler (~> 1.8)
|
43
|
+
parallel (~> 0.6)
|
44
|
+
rake (~> 10.0)
|
45
|
+
rspec (~> 2.13)
|
46
|
+
treetop (~> 1.4)
|
47
|
+
unicode_utils (~> 1.4)
|
data/README.md
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
Biodiversity
|
2
|
+
============
|
3
|
+
|
4
|
+
[![Gem Version][1]][2]
|
5
|
+
[![Continuous Integration Status][3]][4]
|
6
|
+
[![CodePolice][5]][6]
|
7
|
+
[![Dependency Status][7]][8]
|
8
|
+
|
9
|
+
Parses taxonomic scientific name and breaks it into semantic elements.
|
10
|
+
|
11
|
+
Installation
|
12
|
+
------------
|
13
|
+
|
14
|
+
*WARNING:* Do not use Ruby 1.8.7 -- it is outdated. The
|
15
|
+
biodiversity gem for Ruby 1.8.7 is no longer being updated.
|
16
|
+
|
17
|
+
sudo gem install biodiversity19 #for ruby 1.9.x
|
18
|
+
sudo gem install biodiversity #for ruby 1.8.x
|
19
|
+
|
20
|
+
Example usage
|
21
|
+
-------------
|
22
|
+
|
23
|
+
### As a command line script
|
24
|
+
|
25
|
+
You can parse a file of taxonomic names from the command line.
|
26
|
+
The file should contain one scientific name per line.
|
27
|
+
|
28
|
+
nnparse file_with_names
|
29
|
+
|
30
|
+
### As a socket server
|
31
|
+
|
32
|
+
If you do not use Ruby and need fast access to the parser functionality
|
33
|
+
you can use a socket server
|
34
|
+
|
35
|
+
parserver
|
36
|
+
|
37
|
+
parserver -h
|
38
|
+
Usage: parserver [options]
|
39
|
+
|
40
|
+
-r, --canonical_with_rank Adds infraspecies rank to canonical forms
|
41
|
+
|
42
|
+
-o, --output=output Specifies the type of the output:
|
43
|
+
json - parsed results in json
|
44
|
+
canonical - canonical form only
|
45
|
+
Default: json
|
46
|
+
|
47
|
+
-p, --port=port Specifies the port number
|
48
|
+
Default: 4334
|
49
|
+
|
50
|
+
-h, --help Show this help message.
|
51
|
+
|
52
|
+
parserver --output=canonical
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
With default settings you can access parserver on port 4334 using a
|
57
|
+
socket client library of your programming language. You can find
|
58
|
+
[socket client script example][9] in the examples directory of the gem.
|
59
|
+
|
60
|
+
If you want to check if socket server works for you:
|
61
|
+
|
62
|
+
#run server in one terminal
|
63
|
+
parserver
|
64
|
+
|
65
|
+
#in another terminal window type
|
66
|
+
telnet localhost 4334
|
67
|
+
|
68
|
+
If you enter a line with a scientific name, the server will send you back
|
69
|
+
parsed information in JSON format.
|
70
|
+
|
71
|
+
To stop the telnet client, type any of `end`, `exit`, `q`, `.` instead
|
72
|
+
of a scientific name.
|
73
|
+
|
74
|
+
$ telnet localhost 4334
|
75
|
+
Trying ::1...
|
76
|
+
Connected to localhost.
|
77
|
+
Escape character is '^]'.
|
78
|
+
Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
|
79
|
+
{"scientificName":{"canonical":"Acacia abyssinica calophylla"...}}
|
80
|
+
end
|
81
|
+
|
82
|
+
### As a library
|
83
|
+
|
84
|
+
You can use it as a library in Ruby, JRuby etc.
|
85
|
+
|
86
|
+
require 'biodiversity'
|
87
|
+
|
88
|
+
parser = ScientificNameParser.new
|
89
|
+
|
90
|
+
#to find version number
|
91
|
+
ScientificNameParser.version
|
92
|
+
|
93
|
+
# to fix capitalization in canonicals
|
94
|
+
ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA")
|
95
|
+
# Output: Quercus (Quercus) alba
|
96
|
+
|
97
|
+
# to parse a scientific name into a ruby hash
|
98
|
+
parser.parse("Plantago major")
|
99
|
+
|
100
|
+
#to get json representation
|
101
|
+
parser.parse("Plantago").to_json
|
102
|
+
#or
|
103
|
+
parser.parse("Plantago")
|
104
|
+
parser.all_json
|
105
|
+
|
106
|
+
# to clean name up
|
107
|
+
parser.parse(" Plantago major ")[:scientificName][:normalized]
|
108
|
+
|
109
|
+
# to get only cleaned up latin part of the name
|
110
|
+
parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:canonical]
|
111
|
+
|
112
|
+
# to get detailed information about elements of the name
|
113
|
+
parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
|
114
|
+
|
115
|
+
|
116
|
+
To parse using several CPUs (4 seems to be optimal):
|
117
|
+
|
118
|
+
parser = ParallelParser.new
|
119
|
+
# ParallelParser.new(4) will try to run 4 processes if hardware allows
|
120
|
+
array_of_names = ["Betula alba", "Homo sapiens"....]
|
121
|
+
parser.parse(array_of_names)
|
122
|
+
# Output: {"Betula alba" => {:scientificName...}, "Homo sapiens" => {:scientificName...}, ...}
|
123
|
+
|
124
|
+
The parallel parser takes a list of names and returns a hash with names as keys and parsed data as values.
|
125
|
+
|
126
|
+
To get canonicals with ranks for infraspecific epithets:
|
127
|
+
|
128
|
+
parser = ScientificNameParser.new(canonical_with_rank: true)
|
129
|
+
parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
|
130
|
+
# Output: Cola cordifolia var. puberula
|
131
|
+
|
132
|
+
To resolve an LSID and get back an RDF file:
|
133
|
+
|
134
|
+
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
135
|
+
|
136
|
+
|
137
|
+
|
138
|
+
[1]: https://badge.fury.io/rb/biodiversity19.png
|
139
|
+
[2]: http://badge.fury.io/rb/biodiversity19
|
140
|
+
[3]: https://secure.travis-ci.org/GlobalNamesArchitecture/biodiversity.png
|
141
|
+
[4]: http://travis-ci.org/GlobalNamesArchitecture/biodiversity
|
142
|
+
[5]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity.png
|
143
|
+
[6]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity
|
144
|
+
[7]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity.png
|
145
|
+
[8]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity
|
146
|
+
[9]: https://github.com/GlobalNamesArchitecture/biodiversity/blob/master/examples/socket_client.rb
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.1.0
|
data/bin/parserver
CHANGED
data/lib/biodiversity/parser.rb
CHANGED
@@ -9,23 +9,44 @@ require 'json'
|
|
9
9
|
module PreProcessor
|
10
10
|
NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
|
11
11
|
TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
|
12
|
-
TAXON_CONCEPTS2 = /\s+
|
13
|
-
|
12
|
+
TAXON_CONCEPTS2 = /\s+
|
13
|
+
(\(?s\.\s?s\.|
|
14
|
+
\(?s\.\s?l\.|
|
15
|
+
\(?s\.\s?str\.|
|
16
|
+
\(?s\.\s?lat\.|
|
17
|
+
sec\.|sec|near)\b.*$/x
|
18
|
+
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
|
14
19
|
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen|\(?nom\.|\(?comb\.).*$/i
|
15
|
-
LAST_WORD_JUNK = /(,\s*|\s+)
|
16
|
-
|
20
|
+
LAST_WORD_JUNK = /(,\s*|\s+)
|
21
|
+
(spp\.|spp|var\.|
|
22
|
+
var|von|van|ined\.|
|
23
|
+
ined|sensu|new|non|nec|
|
24
|
+
nudum|cf\.|cf|sp\.|sp|
|
25
|
+
ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
|
26
|
+
|
17
27
|
def self.clean(a_string)
|
18
|
-
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
28
|
+
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
19
29
|
TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
20
30
|
a_string = a_string.gsub(i, '')
|
21
31
|
end
|
22
32
|
a_string = a_string.tr('ſ','s') #old 's'
|
23
33
|
a_string
|
24
|
-
end
|
34
|
+
end
|
25
35
|
end
|
26
36
|
|
37
|
+
# Public: Parser which runs in parallel.
|
38
|
+
#
|
39
|
+
# Examples
|
40
|
+
#
|
41
|
+
# parser = ParallelParser.new(4)
|
42
|
+
# parser.parse(['Betula L.', 'Pardosa moesta'])
|
27
43
|
class ParallelParser
|
28
44
|
|
45
|
+
# Public: Initialize ParallelParser.
|
46
|
+
#
|
47
|
+
# processes_num - an Integer to setup the number of processes (default: nil).
|
48
|
+
# If processes number is not set it will be determined
|
49
|
+
# automatically.
|
29
50
|
def initialize(processes_num = nil)
|
30
51
|
require 'parallel'
|
31
52
|
cpu_num
|
@@ -36,13 +57,32 @@ class ParallelParser
|
|
36
57
|
end
|
37
58
|
end
|
38
59
|
|
60
|
+
# Public: Parses an array of scientific names using several processes
|
61
|
+
# in parallel.
|
62
|
+
#
|
63
|
+
# Scientific names are deduplicated in the process, so every string is
|
64
|
+
# parsed only once.
|
65
|
+
#
|
66
|
+
# names_list - takes an Array of scientific names,
|
67
|
+
# each element should be a String.
|
68
|
+
#
|
69
|
+
# Examples
|
70
|
+
#
|
71
|
+
# parser = ParallelParser.new(4)
|
72
|
+
# parser.parse(['Homo sapiens L.', 'Quercus quercus'])
|
73
|
+
#
|
74
|
+
# Returns a Hash with scientific names as a key, and parsing results as
|
75
|
+
# a value.
|
39
76
|
def parse(names_list)
|
40
|
-
parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
|
77
|
+
parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
|
41
78
|
[n, parse_process(n)]
|
42
79
|
end
|
43
80
|
parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
|
44
81
|
end
|
45
82
|
|
83
|
+
# Public: Returns the number of cores/CPUs.
|
84
|
+
#
|
85
|
+
# Returns Integer of cores/CPUs.
|
46
86
|
def cpu_num
|
47
87
|
@cpu_num ||= Parallel.processor_count
|
48
88
|
end
|
@@ -61,23 +101,28 @@ end
|
|
61
101
|
# @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
|
62
102
|
# @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
|
63
103
|
# @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
|
64
|
-
# @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
|
104
|
+
# @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
|
105
|
+
# viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
|
65
106
|
# @parsed = nil
|
66
107
|
# end
|
67
108
|
# end
|
68
109
|
|
69
110
|
class ScientificNameParser
|
70
|
-
VERSION = open(File.join(File.dirname(__FILE__),
|
111
|
+
VERSION = open(File.join(File.dirname(__FILE__),
|
71
112
|
'..',
|
72
113
|
'..',
|
73
114
|
'VERSION')).readline.strip
|
74
115
|
|
75
116
|
FAILED_RESULT = ->(name) do
|
76
|
-
{ scientificName:
|
77
|
-
{ parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
|
117
|
+
{ scientificName:
|
118
|
+
{ parsed: false, verbatim: name.to_s.strip, error: 'Parser error' }
|
78
119
|
}
|
79
120
|
end
|
80
|
-
|
121
|
+
|
122
|
+
def self.version
|
123
|
+
VERSION
|
124
|
+
end
|
125
|
+
|
81
126
|
def self.fix_case(name_string)
|
82
127
|
name_ary = name_string.split(/\s+/)
|
83
128
|
words_num = name_ary.size
|
@@ -91,27 +136,27 @@ class ScientificNameParser
|
|
91
136
|
end
|
92
137
|
else
|
93
138
|
if name_ary[0].size > 1
|
94
|
-
word1 = UnicodeUtils.upcase(name_ary[0][0]) +
|
139
|
+
word1 = UnicodeUtils.upcase(name_ary[0][0]) +
|
95
140
|
UnicodeUtils.downcase(name_ary[0][1..-1])
|
96
141
|
else
|
97
142
|
word1 = name_ary[0]
|
98
143
|
end
|
99
144
|
if name_ary[1].match(/^\(/)
|
100
145
|
word2 = name_ary[1].gsub(/\)$/, '') + ')'
|
101
|
-
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
146
|
+
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
102
147
|
UnicodeUtils.downcase(word2[2..-1])
|
103
148
|
else
|
104
149
|
word2 = UnicodeUtils.downcase(name_ary[1])
|
105
150
|
end
|
106
|
-
res = word1 + ' ' +
|
107
|
-
word2 + ' ' +
|
151
|
+
res = word1 + ' ' +
|
152
|
+
word2 + ' ' +
|
108
153
|
name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(' ')
|
109
154
|
res.strip!
|
110
155
|
end
|
111
156
|
res
|
112
157
|
end
|
113
158
|
|
114
|
-
|
159
|
+
|
115
160
|
def initialize(opts = {})
|
116
161
|
@canonical_with_rank = !!opts[:canonical_with_rank]
|
117
162
|
@verbatim = ''
|
@@ -122,8 +167,10 @@ class ScientificNameParser
|
|
122
167
|
end
|
123
168
|
|
124
169
|
def virus?(a_string)
|
125
|
-
!!(a_string.match(/\sICTV\s*$/) ||
|
126
|
-
a_string.match(/\b(virus|viruses|
|
170
|
+
!!(a_string.match(/\sICTV\s*$/) ||
|
171
|
+
a_string.match(/\b(virus|viruses|
|
172
|
+
phage|phages|viroid|viroids|
|
173
|
+
satellite|satellites|prion|prions)\b/ix) ||
|
127
174
|
a_string.match(/[A-Z]?[a-z]+virus\b/))
|
128
175
|
end
|
129
176
|
|
@@ -134,24 +181,24 @@ class ScientificNameParser
|
|
134
181
|
def parsed
|
135
182
|
@parsed
|
136
183
|
end
|
137
|
-
|
184
|
+
|
138
185
|
def parse(a_string)
|
139
186
|
@verbatim = a_string.strip
|
140
187
|
a_string = PreProcessor::clean(a_string)
|
141
|
-
|
188
|
+
|
142
189
|
if virus?(a_string)
|
143
190
|
@parsed = { verbatim: a_string, virus: true }
|
144
191
|
elsif unknown_placement?(a_string)
|
145
192
|
@parsed = { verbatim: a_string }
|
146
193
|
else
|
147
194
|
begin
|
148
|
-
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
|
195
|
+
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
|
149
196
|
unless @parsed
|
150
197
|
index = @dirty.index || @clean.index
|
151
198
|
salvage_match = a_string[0..index].split(/\s+/)[0..-2]
|
152
199
|
salvage_string = salvage_match ? salvage_match.join(' ') : a_string
|
153
|
-
@parsed = @dirty.parse(salvage_string) ||
|
154
|
-
@canonical.parse(a_string) ||
|
200
|
+
@parsed = @dirty.parse(salvage_string) ||
|
201
|
+
@canonical.parse(a_string) ||
|
155
202
|
{ verbatim: a_string }
|
156
203
|
end
|
157
204
|
rescue
|
@@ -181,18 +228,18 @@ class ScientificNameParser
|
|
181
228
|
else
|
182
229
|
res.merge!(self)
|
183
230
|
end
|
184
|
-
if (canonical_with_rank &&
|
185
|
-
canonical.count(' ') > 1 &&
|
231
|
+
if (canonical_with_rank &&
|
232
|
+
canonical.count(' ') > 1 &&
|
186
233
|
res[:details][0][:infraspecies])
|
187
234
|
ScientificNameParser.add_rank_to_canonical(res)
|
188
235
|
end
|
189
236
|
res = {:scientificName => res}
|
190
237
|
end
|
191
|
-
|
238
|
+
|
192
239
|
def @parsed.pos_json
|
193
240
|
self.pos.to_json rescue ''
|
194
241
|
end
|
195
|
-
|
242
|
+
|
196
243
|
def @parsed.all_json
|
197
244
|
self.all.to_json rescue ''
|
198
245
|
end
|
@@ -200,7 +247,7 @@ class ScientificNameParser
|
|
200
247
|
@parsed.verbatim = @verbatim
|
201
248
|
@parsed.all(canonical_with_rank: @canonical_with_rank)
|
202
249
|
end
|
203
|
-
|
250
|
+
|
204
251
|
private
|
205
252
|
|
206
253
|
def self.add_rank_to_canonical(parsed)
|
@@ -213,6 +260,6 @@ class ScientificNameParser
|
|
213
260
|
end
|
214
261
|
parsed[:canonical] = name_ary.join(' ')
|
215
262
|
end
|
216
|
-
|
263
|
+
|
217
264
|
end
|
218
265
|
|
@@ -9,10 +9,14 @@ describe ScientificNameParser do
|
|
9
9
|
set_parser(ScientificNameParser.new)
|
10
10
|
end
|
11
11
|
|
12
|
+
it 'should return version number' do
|
13
|
+
ScientificNameParser.version.should =~ /^\d+\.\d+\.\d+/
|
14
|
+
end
|
15
|
+
|
12
16
|
it 'should ScientificNameParser::fix_case' do
|
13
|
-
names = [
|
14
|
-
["QUERCUS ALBA", "Quercus alba"],
|
15
|
-
["QUERCUS (QUERCUS) ALBA", "Quercus (Quercus) alba"],
|
17
|
+
names = [
|
18
|
+
["QUERCUS ALBA", "Quercus alba"],
|
19
|
+
["QUERCUS (QUERCUS) ALBA", "Quercus (Quercus) alba"],
|
16
20
|
["QÜERCUS", "Qüercus"],
|
17
21
|
["PARDOSA MOéSTA", "Pardosa moésta"],
|
18
22
|
]
|
@@ -20,16 +24,17 @@ describe ScientificNameParser do
|
|
20
24
|
ScientificNameParser::fix_case(name).should == capitalization
|
21
25
|
end
|
22
26
|
end
|
23
|
-
|
27
|
+
|
24
28
|
it 'should generate standardized json' do
|
25
29
|
read_test_file do |y|
|
26
30
|
JSON.load(json(y[:name])).should == JSON.load(y[:jsn]) unless y[:comment]
|
27
31
|
end
|
28
32
|
end
|
29
33
|
|
30
|
-
|
34
|
+
|
31
35
|
# it 'should generate new test_file' do
|
32
|
-
# new_test = open(File.expand_path(dir +
|
36
|
+
# new_test = open(File.expand_path(dir +
|
37
|
+
# "../../spec/parser/test_data_new.txt"),'w')
|
33
38
|
# read_test_file do |y|
|
34
39
|
# if y[:comment]
|
35
40
|
# new_test.write y[:comment]
|
@@ -40,10 +45,12 @@ describe ScientificNameParser do
|
|
40
45
|
# end
|
41
46
|
# end
|
42
47
|
# end
|
43
|
-
|
48
|
+
|
44
49
|
it 'should generate reasonable output if parser failed' do
|
45
50
|
sn = 'ddd sljlkj 3223452432'
|
46
|
-
json(sn).should == '{"scientificName":{"parsed":false,
|
51
|
+
json(sn).should == '{"scientificName":{"parsed":false,' +
|
52
|
+
'"parser_version":"test_version","verbatim":"ddd sljlkj 3223452432"}}'
|
53
|
+
end
|
47
54
|
|
48
55
|
it "should show version when the flag :show_version set to true" do
|
49
56
|
parse('Homo sapiens')[:scientificName][:parser_version].should_not be_nil
|
@@ -58,7 +65,7 @@ describe ScientificNameParser do
|
|
58
65
|
end
|
59
66
|
end
|
60
67
|
|
61
|
-
describe "ScientificNameParser with ranked canonicals" do
|
68
|
+
describe "ScientificNameParser with ranked canonicals" do
|
62
69
|
before(:all) do
|
63
70
|
@parser = ScientificNameParser.new(canonical_with_rank: true)
|
64
71
|
end
|
@@ -66,10 +73,11 @@ describe "ScientificNameParser with ranked canonicals" do
|
|
66
73
|
it 'should not influence output for uninomials and binomials' do
|
67
74
|
data = [
|
68
75
|
['Ekbainacanthus Yakowlew 1902','Ekbainacanthus'],
|
69
|
-
['Ekboarmia sagnesi herrerai Exposito 2007',
|
76
|
+
['Ekboarmia sagnesi herrerai Exposito 2007',
|
77
|
+
'Ekboarmia sagnesi herrerai'],
|
70
78
|
['Ekboarmia holli Oberthür', 'Ekboarmia holli']]
|
71
79
|
|
72
|
-
data.each do |d|
|
80
|
+
data.each do |d|
|
73
81
|
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
74
82
|
parsed.should == d[1]
|
75
83
|
end
|
@@ -77,13 +85,16 @@ describe "ScientificNameParser with ranked canonicals" do
|
|
77
85
|
|
78
86
|
it 'should preserve rank for ranked multinomials' do
|
79
87
|
data = [
|
80
|
-
['Cola cordifolia var. puberula A. Chev.',
|
81
|
-
|
82
|
-
['
|
88
|
+
['Cola cordifolia var. puberula A. Chev.',
|
89
|
+
'Cola cordifolia var. puberula'],
|
90
|
+
['Abies homolepis forma umbilicata (Mayr) Schelle',
|
91
|
+
'Abies homolepis forma umbilicata'],
|
92
|
+
['Quercus ilex ssp. ballota (Desf.) Samp',
|
93
|
+
'Quercus ilex ssp. ballota']
|
83
94
|
]
|
84
95
|
data.each do |d|
|
85
96
|
parsed = @parser.parse(d[0])[:scientificName][:canonical]
|
86
|
-
parsed.should == d[1]
|
97
|
+
parsed.should == d[1]
|
87
98
|
end
|
88
99
|
end
|
89
100
|
|
@@ -115,7 +126,8 @@ describe ParallelParser do
|
|
115
126
|
res.keys.size.should == names.size
|
116
127
|
end
|
117
128
|
|
118
|
-
it "should have parsed name in native ruby format and in returned as
|
129
|
+
it "should have parsed name in native ruby format and in returned as \
|
130
|
+
a hash with name as a key and parsed data as value" do
|
119
131
|
names = []
|
120
132
|
read_test_file { |n| names << (n[:name]) if n[:name] }
|
121
133
|
names.uniq!
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: biodiversity19
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,72 +9,104 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '10.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '10.0'
|
14
30
|
- !ruby/object:Gem::Dependency
|
15
31
|
name: treetop
|
16
32
|
requirement: !ruby/object:Gem::Requirement
|
17
33
|
none: false
|
18
34
|
requirements:
|
19
|
-
- -
|
35
|
+
- - ~>
|
20
36
|
- !ruby/object:Gem::Version
|
21
|
-
version: '
|
37
|
+
version: '1.4'
|
22
38
|
type: :runtime
|
23
39
|
prerelease: false
|
24
40
|
version_requirements: !ruby/object:Gem::Requirement
|
25
41
|
none: false
|
26
42
|
requirements:
|
27
|
-
- -
|
43
|
+
- - ~>
|
28
44
|
- !ruby/object:Gem::Version
|
29
|
-
version: '
|
45
|
+
version: '1.4'
|
30
46
|
- !ruby/object:Gem::Dependency
|
31
47
|
name: parallel
|
32
48
|
requirement: !ruby/object:Gem::Requirement
|
33
49
|
none: false
|
34
50
|
requirements:
|
35
|
-
- -
|
51
|
+
- - ~>
|
36
52
|
- !ruby/object:Gem::Version
|
37
|
-
version: '0'
|
53
|
+
version: '0.6'
|
38
54
|
type: :runtime
|
39
55
|
prerelease: false
|
40
56
|
version_requirements: !ruby/object:Gem::Requirement
|
41
57
|
none: false
|
42
58
|
requirements:
|
43
|
-
- -
|
59
|
+
- - ~>
|
44
60
|
- !ruby/object:Gem::Version
|
45
|
-
version: '0'
|
61
|
+
version: '0.6'
|
46
62
|
- !ruby/object:Gem::Dependency
|
47
63
|
name: unicode_utils
|
48
64
|
requirement: !ruby/object:Gem::Requirement
|
49
65
|
none: false
|
50
66
|
requirements:
|
51
|
-
- -
|
67
|
+
- - ~>
|
52
68
|
- !ruby/object:Gem::Version
|
53
|
-
version: '
|
69
|
+
version: '1.4'
|
54
70
|
type: :runtime
|
55
71
|
prerelease: false
|
56
72
|
version_requirements: !ruby/object:Gem::Requirement
|
57
73
|
none: false
|
58
74
|
requirements:
|
59
|
-
- -
|
75
|
+
- - ~>
|
60
76
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
77
|
+
version: '1.4'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: debugger
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '1.5'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '1.5'
|
62
94
|
- !ruby/object:Gem::Dependency
|
63
95
|
name: jeweler
|
64
96
|
requirement: !ruby/object:Gem::Requirement
|
65
97
|
none: false
|
66
98
|
requirements:
|
67
|
-
- -
|
99
|
+
- - ~>
|
68
100
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
101
|
+
version: '1.8'
|
70
102
|
type: :development
|
71
103
|
prerelease: false
|
72
104
|
version_requirements: !ruby/object:Gem::Requirement
|
73
105
|
none: false
|
74
106
|
requirements:
|
75
|
-
- -
|
107
|
+
- - ~>
|
76
108
|
- !ruby/object:Gem::Version
|
77
|
-
version: '
|
109
|
+
version: '1.8'
|
78
110
|
- !ruby/object:Gem::Dependency
|
79
111
|
name: treetop
|
80
112
|
requirement: !ruby/object:Gem::Requirement
|
@@ -131,15 +163,16 @@ executables:
|
|
131
163
|
extensions: []
|
132
164
|
extra_rdoc_files:
|
133
165
|
- LICENSE
|
134
|
-
- README.
|
166
|
+
- README.md
|
135
167
|
files:
|
136
168
|
- .document
|
137
169
|
- .rvmrc
|
170
|
+
- .travis.yml
|
138
171
|
- CHANGELOG
|
139
172
|
- Gemfile
|
140
173
|
- Gemfile.lock
|
141
174
|
- LICENSE
|
142
|
-
- README.
|
175
|
+
- README.md
|
143
176
|
- Rakefile
|
144
177
|
- VERSION
|
145
178
|
- bin/nnparse
|
@@ -178,7 +211,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
178
211
|
version: '0'
|
179
212
|
segments:
|
180
213
|
- 0
|
181
|
-
hash:
|
214
|
+
hash: 1065169335698854656
|
182
215
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
183
216
|
none: false
|
184
217
|
requirements:
|
@@ -187,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
187
220
|
version: '0'
|
188
221
|
requirements: []
|
189
222
|
rubyforge_project:
|
190
|
-
rubygems_version: 1.8.
|
223
|
+
rubygems_version: 1.8.25
|
191
224
|
signing_key:
|
192
225
|
specification_version: 3
|
193
226
|
summary: Parser of scientific names
|
data/README.rdoc
DELETED
@@ -1,109 +0,0 @@
|
|
1
|
-
= Biodiversity
|
2
|
-
|
3
|
-
{<img src="https://codeclimate.com/badge.png" />}[https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity]
|
4
|
-
|
5
|
-
Parses taxonomic scientific name and breaks it into semantic elements.
|
6
|
-
|
7
|
-
== Installation
|
8
|
-
|
9
|
-
To install gem you need RubyGems >= 1.3.6
|
10
|
-
|
11
|
-
$ sudo gem install biodiversity #for ruby 1.8.x
|
12
|
-
$ sudo gem install biodiversity19 #for ruby 1.9.x
|
13
|
-
|
14
|
-
== Example usage
|
15
|
-
|
16
|
-
=== As a command line script
|
17
|
-
|
18
|
-
You can parse file with taxonomic names from command line. File should contain one scientific name per line
|
19
|
-
|
20
|
-
nnparser file_with_names
|
21
|
-
|
22
|
-
=== As a socket server
|
23
|
-
|
24
|
-
If you do not use ruby and need a fast access to the parser functionality you can use a socket server
|
25
|
-
|
26
|
-
parserver
|
27
|
-
|
28
|
-
options:
|
29
|
-
|
30
|
-
parserver --output=canonical
|
31
|
-
|
32
|
-
to return a canonical form of the name string
|
33
|
-
|
34
|
-
parserver --port 5555
|
35
|
-
|
36
|
-
run socket server on a different port
|
37
|
-
|
38
|
-
parserver --canonical_with_rank
|
39
|
-
|
40
|
-
to add rank to canonical forms with infraspecific epithet, if it is given
|
41
|
-
|
42
|
-
Then you can access it via 4334 port using a socket client library of your programming language. You can find socket client script example in the examples directory of the gem.
|
43
|
-
|
44
|
-
If you want to check if socket server works for you:
|
45
|
-
|
46
|
-
#run server in one terminal
|
47
|
-
parserver
|
48
|
-
|
49
|
-
#in another terminal window type
|
50
|
-
telnet localhost 4334
|
51
|
-
|
52
|
-
If you enter a line with a scientific name server will send you back parsed information in json format.
|
53
|
-
|
54
|
-
To stop telnet client type any of 'end','exit','q', '.' (without quotes) instead of scientific name
|
55
|
-
|
56
|
-
$ telnet localhost 4334
|
57
|
-
Trying ::1...
|
58
|
-
Connected to localhost.
|
59
|
-
Escape character is '^]'.
|
60
|
-
Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
|
61
|
-
{"scientificName":{"canonical":"Acacia abyssinica calophylla","parsed":true,"parser_run":1,"verbatim":"Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan\r\n","positions":{"0":["genus",6],"18":["author_word",25],"29":["author_word",35],"7":["species",17],"41":["infraspecies",51],"52":["author_word",58]},"hybrid":false,"normalized":"Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan","details":[{"species":{"basionymAuthorTeam":{"exAuthorTeam":{"author":["Benth."],"authorTeam":"Benth."},"author":["Hochst."],"authorTeam":"Hochst."},"string":"abyssinica","authorship":"Hochst. ex Benth."},"infraspecies":[{"basionymAuthorTeam":{"author":["Brenan"],"authorTeam":"Brenan"},"string":"calophylla","rank":"ssp.","authorship":"Brenan"}],"genus":{"string":"Acacia"}}]}}
|
62
|
-
|
63
|
-
|
64
|
-
=== As a library
|
65
|
-
|
66
|
-
You can use it as a library
|
67
|
-
|
68
|
-
require 'biodiversity'
|
69
|
-
|
70
|
-
parser = ScientificNameParser.new
|
71
|
-
|
72
|
-
# to fix capitalization in canonicals
|
73
|
-
ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA") # Quercus (Quercus) alba
|
74
|
-
|
75
|
-
# to parse a scientific name into a ruby hash
|
76
|
-
parser.parse("Plantago major")
|
77
|
-
|
78
|
-
#to get json representation
|
79
|
-
parser.parse("Plantago").to_json
|
80
|
-
#or
|
81
|
-
parser.parse("Plantago")
|
82
|
-
parser.all_json
|
83
|
-
|
84
|
-
# to clean name up
|
85
|
-
parser.parse(" Plantago major ")[:scientificName][:normalized]
|
86
|
-
|
87
|
-
# to get only cleaned up latin part of the name
|
88
|
-
parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. Braun & Crous 2003")[:scientificName][:canonical]
|
89
|
-
|
90
|
-
# to get detailed information about elements of the name
|
91
|
-
parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. Braun & Crous 2003")[:scientificName][:details]
|
92
|
-
|
93
|
-
# to parse using several CPUs (4 seem to be optimal)
|
94
|
-
parser = ParallelParser.new # ParallelParser.new(4) will try to run 4 processes if hardware allows
|
95
|
-
array_of_names = ["Betula alba", "Homo sapiens"....]
|
96
|
-
parser.parse(array_of_names) # -> {"Betula alba" => {:scientificName...}, "Homo sapiens" => {:scientificName...}, ...}
|
97
|
-
|
98
|
-
parallel parser takes list of names and returns back a hash with names as keys and parsed data as values
|
99
|
-
|
100
|
-
# to resolve lsid and get back RDF file
|
101
|
-
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
102
|
-
|
103
|
-
# to get canonicals with ranks for infraspecific epithets:
|
104
|
-
parser = ScientificNameParser.new(canonical_with_rank: true)
|
105
|
-
parser.parse('Cola cordifolia var. puberula A. Chev.')[:scientificName][:canonical]
|
106
|
-
# should get 'Cola cordifolia var. puberula'
|
107
|
-
|
108
|
-
Copyright (c) 2009-2011 Marine Biological Laboratory. See LICENSE.txt for
|
109
|
-
further details.
|