biodiversity 3.5.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +9 -6
- data/.ruby-version +1 -1
- data/.travis.yml +1 -6
- data/CHANGELOG +3 -0
- data/Gemfile +2 -0
- data/README.md +37 -178
- data/Rakefile +15 -48
- data/biodiversity.gemspec +18 -21
- data/clib/linux/libgnparser.h +93 -0
- data/clib/linux/libgnparser.so +0 -0
- data/clib/mac/libgnparser.h +93 -0
- data/clib/mac/libgnparser.so +0 -0
- data/lib/biodiversity.rb +4 -9
- data/lib/biodiversity/parser.rb +65 -281
- data/lib/biodiversity/version.rb +8 -1
- data/spec/lib/biodiversity_spec.rb +9 -0
- data/spec/lib/parser_spec.rb +38 -0
- data/spec/spec_helper.rb +4 -81
- metadata +27 -102
- data/.byebug_history +0 -18
- data/.document +0 -5
- data/examples/socket_client.rb +0 -25
- data/lib/biodiversity/guid.rb +0 -1
- data/lib/biodiversity/guid/lsid.rb +0 -16
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
- data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
- data/spec/biodiversity_spec.rb +0 -11
- data/spec/files/test_data.txt +0 -490
- data/spec/files/todo.txt +0 -55
- data/spec/guid/lsid.spec.rb +0 -15
- data/spec/parser/scientific_name_canonical_spec.rb +0 -36
- data/spec/parser/scientific_name_clean_spec.rb +0 -1137
- data/spec/parser/scientific_name_dirty_spec.rb +0 -165
- data/spec/parser/scientific_name_spec.rb +0 -193
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: be6873b8833844649f8f58641c1cb12250d6b539cdaa12c7e4657433c17ea2c1
|
4
|
+
data.tar.gz: 70ff3729b66bb1f8ae6eb7f553c1fe48d98cd191020f5a6ccf03b2f509843865
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b4bd5630b6e1401c98c0ed9fefdd945d7dd21f52ec911a6ca235163f6497c72ce94ad0dc957919601f87cdd73ce67c3f7274e910e7af1497d672eefdfa2a9d2b
|
7
|
+
data.tar.gz: 14a5920a83bb40a7d2f51d6bf633d1dc6415e596013a3a54ab1c27d07fdd0d64a5c2fb50fb4a47c430552d6a7c5f5c0e2a5ede9cc9890f295b3dcc1cec0e1fa8
|
data/.gitignore
CHANGED
data/.rubocop.yml
CHANGED
@@ -1,10 +1,13 @@
|
|
1
1
|
AllCops:
|
2
2
|
Exclude:
|
3
|
-
- features/**/*
|
4
3
|
- .bundle/**/*
|
5
4
|
- bundle_bin/**/*
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
Metrics/AbcSize:
|
6
|
+
Exclude:
|
7
|
+
- lib/**/*
|
8
|
+
Metrics/MethodLength:
|
9
|
+
Exclude:
|
10
|
+
- lib/**/*
|
11
|
+
Metrics/BlockLength:
|
12
|
+
Exclude:
|
13
|
+
- spec/**/*
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.5.7
|
data/.travis.yml
CHANGED
data/CHANGELOG
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
4.0.0 -- migrate code to gnparser C-shared library. This change breaks
|
2
|
+
backward compatibility, and makes parser dramatically faster.
|
3
|
+
|
1
4
|
3.5.1 -- allow comma before 'and' or '&' in authorship
|
2
5
|
|
3
6
|
3.5.0 -- add the tail cut by preprocessing to the results. The tail usually
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -4,169 +4,76 @@ Biodiversity
|
|
4
4
|
[](https://zenodo.org/badge/latestdoi/19435/GlobalNamesArchitecture/biodiversity)
|
5
5
|
[![Gem Version][gem_svg]][gem_link]
|
6
6
|
[![Continuous Integration Status][ci_svg]][ci_link]
|
7
|
-
[![CodePolice][cc_svg]][cc_link]
|
8
|
-
[![Dependency Status][deps_svg]][deps_link]
|
9
7
|
|
10
8
|
Parses taxonomic scientific name and breaks it into semantic elements.
|
11
9
|
|
12
|
-
|
13
|
-
|
14
|
-
|
10
|
+
**Important**: Biodiversity parser >= 4.0.0 uses binding to
|
11
|
+
https://gitlab.com/gogna/gnparser and
|
12
|
+
is not backward compatible with older versions. However it is much much faster
|
13
|
+
and better than previous versions.
|
15
14
|
|
16
|
-
|
17
|
-
|
18
|
-
biodiversity19 to biodiversity
|
15
|
+
This gem does not have a remote server or a command line executable anymore.
|
16
|
+
For such features use https://gitlab.com/gogna/gnparser.
|
19
17
|
|
20
|
-
Follow [biodiversity issues][waffle] on waffle.io
|
21
18
|
|
22
|
-
Installation
|
23
|
-
------------
|
19
|
+
## Installation
|
24
20
|
|
25
21
|
sudo gem install biodiversity
|
26
22
|
|
27
|
-
|
28
|
-
-------------
|
23
|
+
The gem should work on Linux, Mac and Windows (64bit) machines
|
29
24
|
|
30
|
-
|
25
|
+
## Example usage
|
31
26
|
|
32
|
-
You can
|
33
|
-
File should contain one scientific name per line
|
34
|
-
|
35
|
-
nnparse file_with_names
|
36
|
-
|
37
|
-
The resuls will be put into parsed.json file in the current directory.
|
38
|
-
To save results into a different file:
|
39
|
-
|
40
|
-
nnparse file_with_names output_file
|
41
|
-
|
42
|
-
### As a socket server
|
43
|
-
|
44
|
-
If you do not use Ruby and need a fast access to the parser functionality
|
45
|
-
you can use a socket server
|
46
|
-
|
47
|
-
parserver
|
48
|
-
|
49
|
-
parserver -h
|
50
|
-
Usage: parserver [options]
|
51
|
-
|
52
|
-
-r, --canonical_with_rank Adds infraspecies rank
|
53
|
-
to canonical forms
|
54
|
-
|
55
|
-
-o, --output=output Specifies the type of the output:
|
56
|
-
json - parsed results in json
|
57
|
-
canonical - canonical form only
|
58
|
-
Default: json
|
59
|
-
|
60
|
-
-H, --host=host Specifies host as "127.0.0.1",
|
61
|
-
"localhost" etc.
|
62
|
-
Default: 127.0.0.1
|
63
|
-
|
64
|
-
-p, --port=port Specifies the port number
|
65
|
-
Default: 4334
|
66
|
-
|
67
|
-
-h, --help Show this help message.
|
68
|
-
|
69
|
-
parserver --output=canonical
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
With default settings you can access parserserver via 4334 port using a
|
74
|
-
socket client library of your programming language. You can find
|
75
|
-
[socket client script example][socket_example] in the examples directory of the gem.
|
76
|
-
|
77
|
-
If you want to check if socket server works for you:
|
78
|
-
|
79
|
-
#run server in one terminal
|
80
|
-
parserver
|
81
|
-
|
82
|
-
#in another terminal window type
|
83
|
-
telnet localhost 4334
|
84
|
-
|
85
|
-
If you enter a line with a scientific name -- server will send you back
|
86
|
-
parsed information in json format.
|
87
|
-
|
88
|
-
To stop telnet client type any of `end`,`exit`,`q`, `.` instead
|
89
|
-
of scientific name
|
90
|
-
|
91
|
-
$ telnet localhost 4334
|
92
|
-
Trying ::1...
|
93
|
-
Connected to localhost.
|
94
|
-
Escape character is '^]'.
|
95
|
-
Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
|
96
|
-
{"scientificName":{"canonical":"Acacia abyssinica calophylla"...}}
|
97
|
-
end
|
98
|
-
|
99
|
-
### As a library
|
100
|
-
|
101
|
-
You can use it as a library in Ruby, JRuby etc.
|
27
|
+
You can use it as a library in Ruby:
|
102
28
|
|
103
29
|
|
104
30
|
```ruby
|
105
31
|
require 'biodiversity'
|
106
32
|
|
107
|
-
|
33
|
+
#to find the gem version number
|
34
|
+
Biodiversity.version
|
108
35
|
|
109
|
-
#to
|
110
|
-
|
36
|
+
# Note that the version in parsed output will correspond to the version of
|
37
|
+
# gnparser.
|
111
38
|
|
112
|
-
# to
|
113
|
-
|
114
|
-
# Output: Quercus (Quercus) alba
|
39
|
+
# to parse a scientific name into a simple Ruby hash
|
40
|
+
Biodiversity::Parser.parse("Plantago major", simple = true)
|
115
41
|
|
116
|
-
# to parse
|
117
|
-
|
42
|
+
# to parse many scientific names using all computer CPUs
|
43
|
+
Biodiversity::Parser.parse(["Plantago major", ... ], simple = true)
|
44
|
+
|
45
|
+
# to parse a scientific name into a very detailed Ruby hash
|
46
|
+
Biodiversity::Parser.parse("Plantago major")
|
47
|
+
|
48
|
+
# to parse many scientific names with all details using all computer CPUs
|
49
|
+
Biodiversity::Parser.parse(["Plantago major", ... ])
|
118
50
|
|
119
51
|
#to get json representation
|
120
|
-
|
121
|
-
#or
|
122
|
-
parser.parse("Plantago")
|
123
|
-
parser.all_json
|
52
|
+
Biodiversity::Parser.parse("Plantago").to_json
|
124
53
|
|
125
54
|
# to clean name up
|
126
|
-
|
55
|
+
Biodiversity::Parser.parse(" Plantago major ")[:normalized]
|
127
56
|
|
128
|
-
# to get only cleaned up latin part of the name
|
129
|
-
parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. \
|
130
|
-
Braun & Crous 2003")[:scientificName][:canonical]
|
131
57
|
|
132
|
-
# to get canonical form with infraspecies ranks
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
ranked = parser.parse("Seddera latifolia Hochst. & Steud. var. latifolia")
|
139
|
-
ranked[:scientificName][:canonical]
|
58
|
+
# to get canonical form with or without infraspecies ranks, as well as
|
59
|
+
# stemmed version.
|
60
|
+
parsed = Biodiversity::Parser.parse("Seddera latifolia H. & S. var. latifolia")
|
61
|
+
parsed[:canonicalName][:full]
|
62
|
+
parsed[:canonicalName][:simple]
|
63
|
+
parsed[:canonicalName][:stem]
|
140
64
|
|
141
65
|
# to get detailed information about elements of the name
|
142
|
-
|
143
|
-
Braun & Crous 2003")[:
|
144
|
-
```
|
145
|
-
|
146
|
-
Returned result is not always linear, if name is complex. To get simple linear
|
147
|
-
representation of the name you can use:
|
148
|
-
|
149
|
-
|
150
|
-
```ruby
|
151
|
-
parser.parse("Pseudocercospora dendrobii (H.C. Burnett) \
|
152
|
-
U. Braun & Crous 2003")[:scientificName][:position]
|
153
|
-
# returns {0=>["genus", 16], 17=>["species", 26],
|
154
|
-
# 28=>["author_word", 32], 33=>["author_word", 40],
|
155
|
-
# 42=>["author_word", 44], 45=>["author_word", 50],
|
156
|
-
# 53=>["author_word", 58], 59=>["year", 63]}
|
157
|
-
# where the key is the char index of the start of
|
158
|
-
# a word, first element of the value is a semantic meaning
|
159
|
-
# of the word, second element of the value is the character index
|
160
|
-
# of end of the word
|
66
|
+
Biodiversity::Parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. \
|
67
|
+
Braun & Crous 2003")[:details]
|
161
68
|
```
|
162
69
|
|
163
70
|
'Surrogate' is a broad group which includes 'Barcode of Life' names, and various
|
164
71
|
undetermined names with cf. sp. spp. nr. in them:
|
165
72
|
|
166
73
|
```ruby
|
167
|
-
parser.parse("Coleoptera BOLD:1234567")[:
|
74
|
+
parser.parse("Coleoptera BOLD:1234567")[:surrogate]
|
168
75
|
```
|
169
|
-
### What is "
|
76
|
+
### What is "nameStringID" in the parsed results?
|
170
77
|
|
171
78
|
ID field contains UUID v5 hexadecimal string. ID is generated out of bytes
|
172
79
|
from the name string itself, and identical id can be generated using [any
|
@@ -175,67 +82,19 @@ version 5 in a [blog post][uuid_blog]
|
|
175
82
|
|
176
83
|
For example "Homo sapiens" should generate "16f235a0-e4a3-529c-9b83-bd15fe722110" UUID
|
177
84
|
|
178
|
-
### Parse using several CPUs (4 threads seem to be optimal)
|
179
|
-
|
180
|
-
```ruby
|
181
|
-
parser = ParallelParser.new
|
182
|
-
# ParallelParser.new(4) will try to run 4 processes if hardware allows
|
183
|
-
array_of_names = ["Betula alba", "Homo sapiens"....]
|
184
|
-
parser.parse(array_of_names)
|
185
|
-
# Output: {"Betula alba" => {:scientificName...},
|
186
|
-
# "Homo sapiens" => {:scientificName...}, ...}
|
187
|
-
```
|
188
|
-
|
189
|
-
parallel parser takes list of names and returns back a hash with names as
|
190
|
-
keys and parsed data as values
|
191
|
-
|
192
|
-
### Canonicals with ranks for infraspecific epithets:
|
193
|
-
|
194
|
-
```ruby
|
195
|
-
parser = ScientificNameParser.new(canonical_with_rank: true)
|
196
|
-
parser.parse('Cola cordifolia var. puberula \
|
197
|
-
A. Chev.')[:scientificName][:canonical]
|
198
|
-
# Output: Cola cordifolia var. puberula
|
199
|
-
```
|
200
|
-
|
201
|
-
### Resolving lsid and geting back RDF file
|
202
|
-
|
203
|
-
LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
|
204
|
-
|
205
|
-
Troubleshooting
|
206
|
-
---------------
|
207
|
-
|
208
|
-
If nnparse or parserver do not start -- try to run
|
209
|
-
|
210
|
-
gem uninstall biodiversity
|
211
|
-
gem uninstall biodiversity19
|
212
|
-
|
213
|
-
and make sure you remove all versions and all nnparse and parserver scripts.
|
214
|
-
Then install biodiversity again
|
215
|
-
|
216
|
-
gem install biodiversity
|
217
|
-
|
218
|
-
It should fix the problem.
|
219
|
-
|
220
85
|
Copyright
|
221
86
|
---------
|
222
87
|
|
223
88
|
Authors: [Dmitry Mozzherin][dimus]
|
224
89
|
|
225
|
-
Copyright (c) 2008-
|
90
|
+
Copyright (c) 2008-2019 Dmitry Mozzherin. See [LICENSE][license]
|
226
91
|
for further details.
|
227
92
|
|
228
93
|
[gem_svg]: https://badge.fury.io/rb/biodiversity.svg
|
229
94
|
[gem_link]: http://badge.fury.io/rb/biodiversity
|
230
95
|
[ci_svg]: https://secure.travis-ci.org/GlobalNamesArchitecture/biodiversity.svg
|
231
96
|
[ci_link]: http://travis-ci.org/GlobalNamesArchitecture/biodiversity
|
232
|
-
[cc_svg]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity.svg
|
233
|
-
[cc_link]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity
|
234
|
-
[deps_svg]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity.svg
|
235
|
-
[deps_link]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity
|
236
|
-
[socket_example]: http://bit.ly/149iLm5
|
237
97
|
[dimus]: https://github.com/dimus
|
238
98
|
[license]: https://github.com/GlobalNamesArchitecture/biodiversity/blob/master/LICENSE
|
239
|
-
[waffle]: https://waffle.io/GlobalNamesArchitecture/biodiversity
|
240
99
|
[uuid_examples]: https://github.com/GlobalNamesArchitecture/gn_uuid_examples
|
241
100
|
[uuid_blog]: http://globalnamesarchitecture.github.io/gna/uuid/2015/05/31/gn-uuid-0-5-0.html
|
data/Rakefile
CHANGED
@@ -1,53 +1,20 @@
|
|
1
|
-
|
2
|
-
Bundler::GemHelper.install_tasks
|
3
|
-
|
4
|
-
begin
|
5
|
-
Bundler.setup(:default, :development)
|
6
|
-
rescue Bundler::BundlerError => e
|
7
|
-
$stderr.puts e.message
|
8
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
9
|
-
exit e.status_code
|
10
|
-
end
|
11
|
-
|
12
|
-
require "rspec/core"
|
13
|
-
require "rspec/core/rake_task"
|
14
|
-
require "rake/dsl_definition"
|
15
|
-
require "rake"
|
16
|
-
require "rspec"
|
17
|
-
require "rspec/core/rake_task"
|
1
|
+
# frozen_string_literal: true
|
18
2
|
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
require 'rubocop/rake_task'
|
6
|
+
require 'rake/dsl_definition'
|
7
|
+
require 'rake'
|
8
|
+
require 'rspec'
|
19
9
|
|
20
|
-
|
21
|
-
|
22
|
-
RSpec::Core::RakeTask.new do |t|
|
23
|
-
t.pattern = "spec/**/*spec.rb"
|
10
|
+
RSpec::Core::RakeTask.new(:spec) do |rspec|
|
11
|
+
rspec.pattern = FileList['spec/**/*_spec.rb']
|
24
12
|
end
|
25
13
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
|
33
|
-
system("tt #{file}.treetop")
|
34
|
-
rf = "#{file}.rb"
|
35
|
-
rfn = open(rf + ".tmp", "w")
|
36
|
-
skip_head = false
|
37
|
-
f = open(rf)
|
38
|
-
# getting around a bug in treetop which prevents setting
|
39
|
-
# UTF-8 encoding in ruby19
|
40
|
-
f.each_with_index do |l, i|
|
41
|
-
skip_head = l.match(/^# Autogenerated/) if i == 0
|
42
|
-
if skip_head && (l.strip == "" || l.match(/^# Autogenerated/))
|
43
|
-
next
|
44
|
-
else
|
45
|
-
skip_head = false
|
46
|
-
rfn.write(l)
|
47
|
-
end
|
48
|
-
end
|
49
|
-
rfn.close
|
50
|
-
f.close
|
51
|
-
`mv #{rf}.tmp #{rf}`
|
52
|
-
end
|
14
|
+
RuboCop::RakeTask.new
|
15
|
+
task default: %i[rubocop spec]
|
16
|
+
|
17
|
+
desc 'open an irb session preloaded with this gem'
|
18
|
+
task :console do
|
19
|
+
sh 'irb -r pp -r ./lib/biodiversity.rb'
|
53
20
|
end
|
data/biodiversity.gemspec
CHANGED
@@ -1,30 +1,27 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
3
|
+
$LOAD_PATH.push File.expand_path('lib', __dir__)
|
4
|
+
|
5
|
+
require 'biodiversity/version'
|
4
6
|
|
5
7
|
Gem::Specification.new do |gem|
|
6
|
-
gem.name =
|
8
|
+
gem.name = 'biodiversity'
|
7
9
|
gem.version = Biodiversity::VERSION
|
8
|
-
gem.homepage =
|
9
|
-
gem.license =
|
10
|
-
gem.summary =
|
11
|
-
gem.description =
|
12
|
-
gem.authors = [
|
13
|
-
gem.email =
|
10
|
+
gem.homepage = 'https://github.com/GlobalNamesArchitecture/biodiversity'
|
11
|
+
gem.license = 'MIT'
|
12
|
+
gem.summary = 'Parser of scientific names'
|
13
|
+
gem.description = 'Parsing tool for biodiversity informatics'
|
14
|
+
gem.authors = ['Dmitry Mozzherin']
|
15
|
+
gem.email = 'dmozzherin@gmail.com'
|
14
16
|
|
15
17
|
gem.files = `git ls-files`.split("\n")
|
16
|
-
gem.
|
17
|
-
gem.require_paths = ["lib"]
|
18
|
+
gem.require_paths = ['lib']
|
18
19
|
|
19
|
-
gem.add_runtime_dependency
|
20
|
-
gem.add_runtime_dependency "parallel", "~> 1.12"
|
21
|
-
gem.add_runtime_dependency "unicode_utils", "~> 1.4"
|
22
|
-
gem.add_runtime_dependency "gn_uuid", "~> 0.5"
|
20
|
+
gem.add_runtime_dependency 'ffi', '~> 1.11'
|
23
21
|
|
24
|
-
gem.add_development_dependency
|
25
|
-
gem.add_development_dependency
|
26
|
-
gem.add_development_dependency
|
27
|
-
gem.add_development_dependency
|
28
|
-
gem.add_development_dependency
|
29
|
-
gem.add_development_dependency "rubocop", "~> 0.52"
|
22
|
+
gem.add_development_dependency 'bundler', '~> 2.0'
|
23
|
+
gem.add_development_dependency 'byebug', '~> 11.0'
|
24
|
+
gem.add_development_dependency 'rake', '~> 13.0'
|
25
|
+
gem.add_development_dependency 'rspec', '~> 3.9'
|
26
|
+
gem.add_development_dependency 'rubocop', '~> 0.76'
|
30
27
|
end
|