biodiversity 3.5.1 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rubocop.yml +9 -6
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +1 -6
  6. data/CHANGELOG +3 -0
  7. data/Gemfile +2 -0
  8. data/README.md +37 -178
  9. data/Rakefile +15 -48
  10. data/biodiversity.gemspec +18 -21
  11. data/clib/linux/libgnparser.h +93 -0
  12. data/clib/linux/libgnparser.so +0 -0
  13. data/clib/mac/libgnparser.h +93 -0
  14. data/clib/mac/libgnparser.so +0 -0
  15. data/lib/biodiversity.rb +4 -9
  16. data/lib/biodiversity/parser.rb +65 -281
  17. data/lib/biodiversity/version.rb +8 -1
  18. data/spec/lib/biodiversity_spec.rb +9 -0
  19. data/spec/lib/parser_spec.rb +38 -0
  20. data/spec/spec_helper.rb +4 -81
  21. metadata +27 -102
  22. data/.byebug_history +0 -18
  23. data/.document +0 -5
  24. data/examples/socket_client.rb +0 -25
  25. data/lib/biodiversity/guid.rb +0 -1
  26. data/lib/biodiversity/guid/lsid.rb +0 -16
  27. data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
  28. data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
  29. data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
  30. data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
  31. data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
  32. data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
  33. data/spec/biodiversity_spec.rb +0 -11
  34. data/spec/files/test_data.txt +0 -490
  35. data/spec/files/todo.txt +0 -55
  36. data/spec/guid/lsid.spec.rb +0 -15
  37. data/spec/parser/scientific_name_canonical_spec.rb +0 -36
  38. data/spec/parser/scientific_name_clean_spec.rb +0 -1137
  39. data/spec/parser/scientific_name_dirty_spec.rb +0 -165
  40. data/spec/parser/scientific_name_spec.rb +0 -193
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 832687083196a98a5923318b0d778fde31c852d0
4
- data.tar.gz: 2dc45e6f8d201ee8c9abcd5c2cbb41ad3e0c20fa
2
+ SHA256:
3
+ metadata.gz: be6873b8833844649f8f58641c1cb12250d6b539cdaa12c7e4657433c17ea2c1
4
+ data.tar.gz: 70ff3729b66bb1f8ae6eb7f553c1fe48d98cd191020f5a6ccf03b2f509843865
5
5
  SHA512:
6
- metadata.gz: af478fe58904ec048ef4b67f0671a1c7c8dc59144303af60129b117ce7ec775a1fa73ceb154f3e10f2a2d05cac173b72ff539231acb0b6cc7dab2a4baacc230f
7
- data.tar.gz: e6494923af734096b435e27b50ef8239477d58110c6b9f59a0f36324cc7110b6461eb81f0218c00766d6919a3d7eaa7f03dfc688195c0456ea50a5141e5d44ac
6
+ metadata.gz: b4bd5630b6e1401c98c0ed9fefdd945d7dd21f52ec911a6ca235163f6497c72ce94ad0dc957919601f87cdd73ce67c3f7274e910e7af1497d672eefdfa2a9d2b
7
+ data.tar.gz: 14a5920a83bb40a7d2f51d6bf633d1dc6415e596013a3a54ab1c27d07fdd0d64a5c2fb50fb4a47c430552d6a7c5f5c0e2a5ede9cc9890f295b3dcc1cec0e1fa8
data/.gitignore CHANGED
@@ -16,3 +16,4 @@ bin
16
16
  .bundle
17
17
  bundle_bin
18
18
  Gemfile.lock
19
+ .byebug_history
data/.rubocop.yml CHANGED
@@ -1,10 +1,13 @@
1
1
  AllCops:
2
2
  Exclude:
3
- - features/**/*
4
3
  - .bundle/**/*
5
4
  - bundle_bin/**/*
6
- Style/StringLiterals:
7
- EnforcedStyle: double_quotes
8
- Style/DotPosition:
9
- EnforcedStyle: trailing
10
-
5
+ Metrics/AbcSize:
6
+ Exclude:
7
+ - lib/**/*
8
+ Metrics/MethodLength:
9
+ Exclude:
10
+ - lib/**/*
11
+ Metrics/BlockLength:
12
+ Exclude:
13
+ - spec/**/*
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.4.4
1
+ 2.5.7
data/.travis.yml CHANGED
@@ -1,11 +1,6 @@
1
1
  rvm:
2
- - 2.1
3
- - 2.2
4
- - 2.3
5
- - 2.4
6
2
  - 2.5
3
+ - 2.6
7
4
  branches:
8
5
  only:
9
6
  - master
10
- before_script:
11
- - bundle exec rake tt
data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
1
+ 4.0.0 -- migrate code to gnparser C-shared library. This change breaks
2
+ backward compatibility, and makes parser dramatically faster.
3
+
1
4
  3.5.1 -- allow comma before 'and' or '&' in authorship
2
5
 
3
6
  3.5.0 -- add the tail cut by preprocessing to the results. The tail usually
data/Gemfile CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  gemspec
data/README.md CHANGED
@@ -4,169 +4,76 @@ Biodiversity
4
4
  [![DOI](https://zenodo.org/badge/19435/GlobalNamesArchitecture/biodiversity.svg)](https://zenodo.org/badge/latestdoi/19435/GlobalNamesArchitecture/biodiversity)
5
5
  [![Gem Version][gem_svg]][gem_link]
6
6
  [![Continuous Integration Status][ci_svg]][ci_link]
7
- [![CodePolice][cc_svg]][cc_link]
8
- [![Dependency Status][deps_svg]][deps_link]
9
7
 
10
8
  Parses taxonomic scientific name and breaks it into semantic elements.
11
9
 
12
- *WARNING, IMPORTANT!:*
13
- Support for Ruby 1.8.7 IS DROPPED. Both biodiversity and
14
- biodiversity19 will be for Ruby > 1.9.1 and will be identical gems.
10
+ **Important**: Biodiversity parser >= 4.0.0 uses a binding to
11
+ https://gitlab.com/gogna/gnparser and
12
+ is not backward compatible with older versions. However, it is much faster
13
+ and better than previous versions.
15
14
 
16
- biodiversity19 is now deprecated and will not be updated anymore.
17
- You are strongly encouraged to change your dependencies from
18
- biodiversity19 to biodiversity
15
+ This gem does not have a remote server or a command line executable anymore.
16
+ For such features use https://gitlab.com/gogna/gnparser.
19
17
 
20
- Follow [biodiversity issues][waffle] on waffle.io
21
18
 
22
- Installation
23
- ------------
19
+ ## Installation
24
20
 
25
21
  sudo gem install biodiversity
26
22
 
27
- Example usage
28
- -------------
23
+ The gem should work on Linux, Mac and Windows (64-bit) machines.
29
24
 
30
- ### As a command line script
25
+ ## Example usage
31
26
 
32
- You can parse file with taxonomic names from command line.
33
- File should contain one scientific name per line
34
-
35
- nnparse file_with_names
36
-
37
- The resuls will be put into parsed.json file in the current directory.
38
- To save results into a different file:
39
-
40
- nnparse file_with_names output_file
41
-
42
- ### As a socket server
43
-
44
- If you do not use Ruby and need a fast access to the parser functionality
45
- you can use a socket server
46
-
47
- parserver
48
-
49
- parserver -h
50
- Usage: parserver [options]
51
-
52
- -r, --canonical_with_rank Adds infraspecies rank
53
- to canonical forms
54
-
55
- -o, --output=output Specifies the type of the output:
56
- json - parsed results in json
57
- canonical - canonical form only
58
- Default: json
59
-
60
- -H, --host=host Specifies host as "127.0.0.1",
61
- "localhost" etc.
62
- Default: 127.0.0.1
63
-
64
- -p, --port=port Specifies the port number
65
- Default: 4334
66
-
67
- -h, --help Show this help message.
68
-
69
- parserver --output=canonical
70
-
71
-
72
-
73
- With default settings you can access parserserver via 4334 port using a
74
- socket client library of your programming language. You can find
75
- [socket client script example][socket_example] in the examples directory of the gem.
76
-
77
- If you want to check if socket server works for you:
78
-
79
- #run server in one terminal
80
- parserver
81
-
82
- #in another terminal window type
83
- telnet localhost 4334
84
-
85
- If you enter a line with a scientific name -- server will send you back
86
- parsed information in json format.
87
-
88
- To stop telnet client type any of `end`,`exit`,`q`, `.` instead
89
- of scientific name
90
-
91
- $ telnet localhost 4334
92
- Trying ::1...
93
- Connected to localhost.
94
- Escape character is '^]'.
95
- Acacia abyssinica Hochst. ex Benth. ssp. calophylla Brenan
96
- {"scientificName":{"canonical":"Acacia abyssinica calophylla"...}}
97
- end
98
-
99
- ### As a library
100
-
101
- You can use it as a library in Ruby, JRuby etc.
27
+ You can use it as a library in Ruby:
102
28
 
103
29
 
104
30
  ```ruby
105
31
  require 'biodiversity'
106
32
 
107
- parser = ScientificNameParser.new
33
+ # to find the gem version number
34
+ Biodiversity.version
108
35
 
109
- #to find version number
110
- ScientificNameParser.version
36
+ # Note that the version in parsed output will correspond to the version of
37
+ # gnparser.
111
38
 
112
- # to fix capitalization in canonicals
113
- ScientificNameParser.fix_case("QUERCUS (QUERCUS) ALBA")
114
- # Output: Quercus (Quercus) alba
39
+ # to parse a scientific name into a simple Ruby hash
40
+ Biodiversity::Parser.parse("Plantago major", simple = true)
115
41
 
116
- # to parse a scientific name into a ruby hash
117
- parser.parse("Plantago major")
42
+ # to parse many scientific names using all computer CPUs
43
+ Biodiversity::Parser.parse(["Plantago major", ... ], simple = true)
44
+
45
+ # to parse a scientific name into a very detailed Ruby hash
46
+ Biodiversity::Parser.parse("Plantago major")
47
+
48
+ # to parse many scientific names with all details using all computer CPUs
49
+ Biodiversity::Parser.parse(["Plantago major", ... ])
118
50
 
119
51
  #to get json representation
120
- parser.parse("Plantago").to_json
121
- #or
122
- parser.parse("Plantago")
123
- parser.all_json
52
+ Biodiversity::Parser.parse("Plantago").to_json
124
53
 
125
54
  # to clean name up
126
- parser.parse(" Plantago major ")[:scientificName][:normalized]
55
+ Biodiversity::Parser.parse(" Plantago major ")[:normalized]
127
56
 
128
- # to get only cleaned up latin part of the name
129
- parser.parse("Pseudocercospora dendrobii (H.C. Burnett) U. \
130
- Braun & Crous 2003")[:scientificName][:canonical]
131
57
 
132
- # to get canonical form with infraspecies ranks
133
- parsed = parser.parse("Seddera latifolia Hochst. & Steud. var. latifolia")
134
- ranked = ScientificNameParser.add_rank_to_canonical(parsed)
135
- ranked[:scientificName][:canonical]
136
- #or
137
- parser = ScientificNameParser.new(canonical_with_rank: true)
138
- ranked = parser.parse("Seddera latifolia Hochst. & Steud. var. latifolia")
139
- ranked[:scientificName][:canonical]
58
+ # to get canonical form with or without infraspecies ranks, as well as
59
+ # stemmed version.
60
+ parsed = Biodiversity::Parser.parse("Seddera latifolia H. & S. var. latifolia")
61
+ parsed[:canonicalName][:full]
62
+ parsed[:canonicalName][:simple]
63
+ parsed[:canonicalName][:stem]
140
64
 
141
65
  # to get detailed information about elements of the name
142
- parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. \
143
- Braun & Crous 2003")[:scientificName][:details]
144
- ```
145
-
146
- Returned result is not always linear, if name is complex. To get simple linear
147
- representation of the name you can use:
148
-
149
-
150
- ```ruby
151
- parser.parse("Pseudocercospora dendrobii (H.C. Burnett) \
152
- U. Braun & Crous 2003")[:scientificName][:position]
153
- # returns {0=>["genus", 16], 17=>["species", 26],
154
- # 28=>["author_word", 32], 33=>["author_word", 40],
155
- # 42=>["author_word", 44], 45=>["author_word", 50],
156
- # 53=>["author_word", 58], 59=>["year", 63]}
157
- # where the key is the char index of the start of
158
- # a word, first element of the value is a semantic meaning
159
- # of the word, second element of the value is the character index
160
- # of end of the word
66
+ Biodiversity::Parser.parse("Pseudocercospora dendrobii (H.C. Burnett 1883) U. \
67
+ Braun & Crous 2003")[:details]
161
68
  ```
162
69
 
163
70
  'Surrogate' is a broad group which includes 'Barcode of Life' names, and various
164
71
  undetermined names with cf. sp. spp. nr. in them:
165
72
 
166
73
  ```ruby
167
- parser.parse("Coleoptera BOLD:1234567")[:scientificName][:surrogate]
74
+ Biodiversity::Parser.parse("Coleoptera BOLD:1234567")[:surrogate]
168
75
  ```
169
- ### What is "id" in the parsed results?
76
+ ### What is "nameStringID" in the parsed results?
170
77
 
171
78
  ID field contains UUID v5 hexadecimal string. ID is generated out of bytes
172
79
  from the name string itself, and identical id can be generated using [any
@@ -175,67 +82,19 @@ version 5 in a [blog post][uuid_blog]
175
82
 
176
83
  For example "Homo sapiens" should generate "16f235a0-e4a3-529c-9b83-bd15fe722110" UUID
177
84
 
178
- ### Parse using several CPUs (4 threads seem to be optimal)
179
-
180
- ```ruby
181
- parser = ParallelParser.new
182
- # ParallelParser.new(4) will try to run 4 processes if hardware allows
183
- array_of_names = ["Betula alba", "Homo sapiens"....]
184
- parser.parse(array_of_names)
185
- # Output: {"Betula alba" => {:scientificName...},
186
- # "Homo sapiens" => {:scientificName...}, ...}
187
- ```
188
-
189
- parallel parser takes list of names and returns back a hash with names as
190
- keys and parsed data as values
191
-
192
- ### Canonicals with ranks for infraspecific epithets:
193
-
194
- ```ruby
195
- parser = ScientificNameParser.new(canonical_with_rank: true)
196
- parser.parse('Cola cordifolia var. puberula \
197
- A. Chev.')[:scientificName][:canonical]
198
- # Output: Cola cordifolia var. puberula
199
- ```
200
-
201
- ### Resolving lsid and geting back RDF file
202
-
203
- LsidResolver.resolve("urn:lsid:ubio.org:classificationbank:2232671")
204
-
205
- Troubleshooting
206
- ---------------
207
-
208
- If nnparse or parserver do not start -- try to run
209
-
210
- gem uninstall biodiversity
211
- gem uninstall biodiversity19
212
-
213
- and make sure you remove all versions and all nnparse and parserver scripts.
214
- Then install biodiversity again
215
-
216
- gem install biodiversity
217
-
218
- It should fix the problem.
219
-
220
85
  Copyright
221
86
  ---------
222
87
 
223
88
  Authors: [Dmitry Mozzherin][dimus]
224
89
 
225
- Copyright (c) 2008-2018 Dmitry Mozzherin. See [LICENSE][license]
90
+ Copyright (c) 2008-2019 Dmitry Mozzherin. See [LICENSE][license]
226
91
  for further details.
227
92
 
228
93
  [gem_svg]: https://badge.fury.io/rb/biodiversity.svg
229
94
  [gem_link]: http://badge.fury.io/rb/biodiversity
230
95
  [ci_svg]: https://secure.travis-ci.org/GlobalNamesArchitecture/biodiversity.svg
231
96
  [ci_link]: http://travis-ci.org/GlobalNamesArchitecture/biodiversity
232
- [cc_svg]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity.svg
233
- [cc_link]: https://codeclimate.com/github/GlobalNamesArchitecture/biodiversity
234
- [deps_svg]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity.svg
235
- [deps_link]: https://gemnasium.com/GlobalNamesArchitecture/biodiversity
236
- [socket_example]: http://bit.ly/149iLm5
237
97
  [dimus]: https://github.com/dimus
238
98
  [license]: https://github.com/GlobalNamesArchitecture/biodiversity/blob/master/LICENSE
239
- [waffle]: https://waffle.io/GlobalNamesArchitecture/biodiversity
240
99
  [uuid_examples]: https://github.com/GlobalNamesArchitecture/gn_uuid_examples
241
100
  [uuid_blog]: http://globalnamesarchitecture.github.io/gna/uuid/2015/05/31/gn-uuid-0-5-0.html
data/Rakefile CHANGED
@@ -1,53 +1,20 @@
1
- require "bundler"
2
- Bundler::GemHelper.install_tasks
3
-
4
- begin
5
- Bundler.setup(:default, :development)
6
- rescue Bundler::BundlerError => e
7
- $stderr.puts e.message
8
- $stderr.puts "Run `bundle install` to install missing gems"
9
- exit e.status_code
10
- end
11
-
12
- require "rspec/core"
13
- require "rspec/core/rake_task"
14
- require "rake/dsl_definition"
15
- require "rake"
16
- require "rspec"
17
- require "rspec/core/rake_task"
1
+ # frozen_string_literal: true
18
2
 
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+ require 'rubocop/rake_task'
6
+ require 'rake/dsl_definition'
7
+ require 'rake'
8
+ require 'rspec'
19
9
 
20
- task :default => :spec
21
-
22
- RSpec::Core::RakeTask.new do |t|
23
- t.pattern = "spec/**/*spec.rb"
10
+ RSpec::Core::RakeTask.new(:spec) do |rspec|
11
+ rspec.pattern = FileList['spec/**/*_spec.rb']
24
12
  end
25
13
 
26
- task :tt do
27
- dir = File.dirname(__FILE__)
28
- ["scientific_name_clean",
29
- "scientific_name_dirty",
30
- "scientific_name_canonical"].each do |f|
31
- file = "#{dir}/lib/biodiversity/parser/#{f}"
32
- FileUtils.rm("#{file}.rb") if FileTest.exist?("#{file}.rb")
33
- system("tt #{file}.treetop")
34
- rf = "#{file}.rb"
35
- rfn = open(rf + ".tmp", "w")
36
- skip_head = false
37
- f = open(rf)
38
- # getting around a bug in treetop which prevents setting
39
- # UTF-8 encoding in ruby19
40
- f.each_with_index do |l, i|
41
- skip_head = l.match(/^# Autogenerated/) if i == 0
42
- if skip_head && (l.strip == "" || l.match(/^# Autogenerated/))
43
- next
44
- else
45
- skip_head = false
46
- rfn.write(l)
47
- end
48
- end
49
- rfn.close
50
- f.close
51
- `mv #{rf}.tmp #{rf}`
52
- end
14
+ RuboCop::RakeTask.new
15
+ task default: %i[rubocop spec]
16
+
17
+ desc 'open an irb session preloaded with this gem'
18
+ task :console do
19
+ sh 'irb -r pp -r ./lib/biodiversity.rb'
53
20
  end
data/biodiversity.gemspec CHANGED
@@ -1,30 +1,27 @@
1
- $:.push File.expand_path("../lib", __FILE__)
1
+ # frozen_string_literal: true
2
2
 
3
- require "biodiversity/version"
3
+ $LOAD_PATH.push File.expand_path('lib', __dir__)
4
+
5
+ require 'biodiversity/version'
4
6
 
5
7
  Gem::Specification.new do |gem|
6
- gem.name = "biodiversity"
8
+ gem.name = 'biodiversity'
7
9
  gem.version = Biodiversity::VERSION
8
- gem.homepage = "https://github.com/GlobalNamesArchitecture/biodiversity"
9
- gem.license = "MIT"
10
- gem.summary = "Parser of scientific names"
11
- gem.description = "Tools for biodiversity informatics"
12
- gem.authors = ["Dmitry Mozzherin"]
13
- gem.email = "dmozzherin@gmail.com"
10
+ gem.homepage = 'https://github.com/GlobalNamesArchitecture/biodiversity'
11
+ gem.license = 'MIT'
12
+ gem.summary = 'Parser of scientific names'
13
+ gem.description = 'Parsing tool for biodiversity informatics'
14
+ gem.authors = ['Dmitry Mozzherin']
15
+ gem.email = 'dmozzherin@gmail.com'
14
16
 
15
17
  gem.files = `git ls-files`.split("\n")
16
- gem.executables = ["nnparse", "parserver"]
17
- gem.require_paths = ["lib"]
18
+ gem.require_paths = ['lib']
18
19
 
19
- gem.add_runtime_dependency "treetop", "~> 1.6"
20
- gem.add_runtime_dependency "parallel", "~> 1.12"
21
- gem.add_runtime_dependency "unicode_utils", "~> 1.4"
22
- gem.add_runtime_dependency "gn_uuid", "~> 0.5"
20
+ gem.add_runtime_dependency 'ffi', '~> 1.11'
23
21
 
24
- gem.add_development_dependency "bundler", "~> 1.16"
25
- gem.add_development_dependency "rake", "~> 12.3"
26
- gem.add_development_dependency "rspec", "~> 3.7"
27
- gem.add_development_dependency "webmock", "~> 3.3"
28
- gem.add_development_dependency "rr", "~> 1.2"
29
- gem.add_development_dependency "rubocop", "~> 0.52"
22
+ gem.add_development_dependency 'bundler', '~> 2.0'
23
+ gem.add_development_dependency 'byebug', '~> 11.0'
24
+ gem.add_development_dependency 'rake', '~> 13.0'
25
+ gem.add_development_dependency 'rspec', '~> 3.9'
26
+ gem.add_development_dependency 'rubocop', '~> 0.76'
30
27
  end