biodiversity 3.5.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rubocop.yml +9 -6
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +1 -6
  6. data/CHANGELOG +3 -0
  7. data/Gemfile +2 -0
  8. data/README.md +37 -178
  9. data/Rakefile +15 -48
  10. data/biodiversity.gemspec +18 -21
  11. data/clib/linux/libgnparser.h +93 -0
  12. data/clib/linux/libgnparser.so +0 -0
  13. data/clib/mac/libgnparser.h +93 -0
  14. data/clib/mac/libgnparser.so +0 -0
  15. data/lib/biodiversity.rb +4 -9
  16. data/lib/biodiversity/parser.rb +65 -281
  17. data/lib/biodiversity/version.rb +8 -1
  18. data/spec/lib/biodiversity_spec.rb +9 -0
  19. data/spec/lib/parser_spec.rb +38 -0
  20. data/spec/spec_helper.rb +4 -81
  21. metadata +27 -102
  22. data/.byebug_history +0 -18
  23. data/.document +0 -5
  24. data/examples/socket_client.rb +0 -25
  25. data/lib/biodiversity/guid.rb +0 -1
  26. data/lib/biodiversity/guid/lsid.rb +0 -16
  27. data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
  28. data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
  29. data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
  30. data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
  31. data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
  32. data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
  33. data/spec/biodiversity_spec.rb +0 -11
  34. data/spec/files/test_data.txt +0 -490
  35. data/spec/files/todo.txt +0 -55
  36. data/spec/guid/lsid.spec.rb +0 -15
  37. data/spec/parser/scientific_name_canonical_spec.rb +0 -36
  38. data/spec/parser/scientific_name_clean_spec.rb +0 -1137
  39. data/spec/parser/scientific_name_dirty_spec.rb +0 -165
  40. data/spec/parser/scientific_name_spec.rb +0 -193
@@ -0,0 +1,93 @@
1
+ /* Code generated by cmd/cgo; DO NOT EDIT. */
2
+
3
+ /* package gitlab.com/gogna/gnparser/binding */
4
+
5
+
6
+ #line 1 "cgo-builtin-export-prolog"
7
+
8
+ #include <stddef.h> /* for ptrdiff_t below */
9
+
10
+ #ifndef GO_CGO_EXPORT_PROLOGUE_H
11
+ #define GO_CGO_EXPORT_PROLOGUE_H
12
+
13
+ #ifndef GO_CGO_GOSTRING_TYPEDEF
14
+ typedef struct { const char *p; ptrdiff_t n; } _GoString_;
15
+ #endif
16
+
17
+ #endif
18
+
19
+ /* Start of preamble from import "C" comments. */
20
+
21
+
22
+ #line 3 "main.go"
23
+
24
+ #include "stdlib.h"
25
+
26
+ #line 1 "cgo-generated-wrapper"
27
+
28
+
29
+ /* End of preamble from import "C" comments. */
30
+
31
+
32
+ /* Start of boilerplate cgo prologue. */
33
+ #line 1 "cgo-gcc-export-header-prolog"
34
+
35
+ #ifndef GO_CGO_PROLOGUE_H
36
+ #define GO_CGO_PROLOGUE_H
37
+
38
+ typedef signed char GoInt8;
39
+ typedef unsigned char GoUint8;
40
+ typedef short GoInt16;
41
+ typedef unsigned short GoUint16;
42
+ typedef int GoInt32;
43
+ typedef unsigned int GoUint32;
44
+ typedef long long GoInt64;
45
+ typedef unsigned long long GoUint64;
46
+ typedef GoInt64 GoInt;
47
+ typedef GoUint64 GoUint;
48
+ typedef __SIZE_TYPE__ GoUintptr;
49
+ typedef float GoFloat32;
50
+ typedef double GoFloat64;
51
+ typedef float _Complex GoComplex64;
52
+ typedef double _Complex GoComplex128;
53
+
54
+ /*
55
+ static assertion to make sure the file is being used on architecture
56
+ at least with matching size of GoInt.
57
+ */
58
+ typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
59
+
60
+ #ifndef GO_CGO_GOSTRING_TYPEDEF
61
+ typedef _GoString_ GoString;
62
+ #endif
63
+ typedef void *GoMap;
64
+ typedef void *GoChan;
65
+ typedef struct { void *t; void *v; } GoInterface;
66
+ typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
67
+
68
+ #endif
69
+
70
+ /* End of boilerplate cgo prologue. */
71
+
72
+ #ifdef __cplusplus
73
+ extern "C" {
74
+ #endif
75
+
76
+
77
+ // ParseToString function takes a name-string, desired format, and parses
78
+ // the name-string to either JSON, or pipe-separated values, depending on
79
+ // the desired format. Format can take values of 'simple', 'compact', 'pretty'.
80
+
81
+ extern char* ParseToString(char* p0, char* p1);
82
+
83
+ // ParseAryToStrings function takes an array of names, parsing format and a
84
+ // reference to an output: an empty array of strings to return the the data
85
+ // back. It populates the output array with raw strings of either JSON or
86
+ // pipe-separated parsed values (depending on a given format). Format can take
87
+ // values of 'simple', 'compact', or 'pretty'.
88
+
89
+ extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
90
+
91
+ #ifdef __cplusplus
92
+ }
93
+ #endif
Binary file
@@ -0,0 +1,93 @@
1
+ /* Code generated by cmd/cgo; DO NOT EDIT. */
2
+
3
+ /* package gitlab.com/gogna/gnparser/binding */
4
+
5
+
6
+ #line 1 "cgo-builtin-export-prolog"
7
+
8
+ #include <stddef.h> /* for ptrdiff_t below */
9
+
10
+ #ifndef GO_CGO_EXPORT_PROLOGUE_H
11
+ #define GO_CGO_EXPORT_PROLOGUE_H
12
+
13
+ #ifndef GO_CGO_GOSTRING_TYPEDEF
14
+ typedef struct { const char *p; ptrdiff_t n; } _GoString_;
15
+ #endif
16
+
17
+ #endif
18
+
19
+ /* Start of preamble from import "C" comments. */
20
+
21
+
22
+ #line 3 "main.go"
23
+
24
+ #include "stdlib.h"
25
+
26
+ #line 1 "cgo-generated-wrapper"
27
+
28
+
29
+ /* End of preamble from import "C" comments. */
30
+
31
+
32
+ /* Start of boilerplate cgo prologue. */
33
+ #line 1 "cgo-gcc-export-header-prolog"
34
+
35
+ #ifndef GO_CGO_PROLOGUE_H
36
+ #define GO_CGO_PROLOGUE_H
37
+
38
+ typedef signed char GoInt8;
39
+ typedef unsigned char GoUint8;
40
+ typedef short GoInt16;
41
+ typedef unsigned short GoUint16;
42
+ typedef int GoInt32;
43
+ typedef unsigned int GoUint32;
44
+ typedef long long GoInt64;
45
+ typedef unsigned long long GoUint64;
46
+ typedef GoInt64 GoInt;
47
+ typedef GoUint64 GoUint;
48
+ typedef __SIZE_TYPE__ GoUintptr;
49
+ typedef float GoFloat32;
50
+ typedef double GoFloat64;
51
+ typedef float _Complex GoComplex64;
52
+ typedef double _Complex GoComplex128;
53
+
54
+ /*
55
+ static assertion to make sure the file is being used on architecture
56
+ at least with matching size of GoInt.
57
+ */
58
+ typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
59
+
60
+ #ifndef GO_CGO_GOSTRING_TYPEDEF
61
+ typedef _GoString_ GoString;
62
+ #endif
63
+ typedef void *GoMap;
64
+ typedef void *GoChan;
65
+ typedef struct { void *t; void *v; } GoInterface;
66
+ typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
67
+
68
+ #endif
69
+
70
+ /* End of boilerplate cgo prologue. */
71
+
72
+ #ifdef __cplusplus
73
+ extern "C" {
74
+ #endif
75
+
76
+
77
+ // ParseToString function takes a name-string, desired format, and parses
78
+ // the name-string to either JSON, or pipe-separated values, depending on
79
+ // the desired format. Format can take values of 'simple', 'compact', 'pretty'.
80
+
81
+ extern char* ParseToString(char* p0, char* p1);
82
+
83
+ // ParseAryToStrings function takes an array of names, parsing format and a
84
+ // reference to an output: an empty array of strings to return the the data
85
+ // back. It populates the output array with raw strings of either JSON or
86
+ // pipe-separated parsed values (depending on a given format). Format can take
87
+ // values of 'simple', 'compact', or 'pretty'.
88
+
89
+ extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
90
+
91
+ #ifdef __cplusplus
92
+ }
93
+ #endif
Binary file
data/lib/biodiversity.rb CHANGED
@@ -1,15 +1,10 @@
1
- require 'treetop'
1
+ # frozen_string_literal: true
2
+
3
+ require 'ffi'
2
4
  require 'json'
3
- require 'open-uri'
4
5
  require_relative 'biodiversity/version'
5
6
  require_relative 'biodiversity/parser'
6
- require_relative 'biodiversity/guid'
7
7
 
8
+ # Biodiversity module provides a namespace for scientific name parser.
8
9
  module Biodiversity
9
- LSID_RESOLVER_URL = 'http://lsid.tdwg.org/'
10
-
11
- def self.version
12
- VERSION
13
- end
14
10
  end
15
-
data/lib/biodiversity/parser.rb CHANGED
@@ -1,294 +1,78 @@
1
- # encoding: UTF-8
2
- require "gn_uuid"
3
- require_relative "parser/scientific_name_clean"
4
- require_relative "parser/scientific_name_dirty"
5
- require_relative "parser/scientific_name_canonical"
1
+ # frozen_string_literal: true
6
2
 
7
- module PreProcessor
8
- NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
9
- TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
10
- TAXON_CONCEPTS2 = /\s+
11
- (\(?s\.\s?s\.|
12
- \(?s\.\s?l\.|
13
- \(?s\.\s?str\.|
14
- \(?s\.\s?lat\.|
15
- sec\.|sec|near)\b.*$/x
16
- TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
17
- NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen\b|\(?nom\.|\(?comb\.).*$/i
18
- LAST_WORD_JUNK = /(,\s*|\s+)
19
- (spp\.|spp|var\.|
20
- var|von|van|ined\.|
21
- ined|sensu|new|non|nec|
22
- nudum|cf\.|cf|sp\.|sp|
23
- ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
24
-
25
- def self.clean(a_string)
26
- orig = a_string
27
- [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
28
- TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
29
- a_string = a_string.gsub(i, "")
30
- end
31
- tail = orig[a_string.size..-1]
32
- a_string = a_string.tr("ſ","s") #old "s"
33
- a_string = a_string.tr("_", " ") if a_string.strip.match(/\s/).nil?
34
- [a_string, tail.strip]
35
- end
36
- end
37
-
38
- # Public: Parser which runs in parallel.
39
- #
40
- # Examples
41
- #
42
- # parser = ParallelParser.new(4)
43
- # parser.parse(["Betula L.", "Pardosa moesta"])
44
- class ParallelParser
45
-
46
- # Public: Initialize ParallelParser.
47
- #
48
- # processes_num - an Integer to setup the number of processes (default: nil).
49
- # If processes number is not set it will be determined
50
- # automatically.
51
- def initialize(processes_num = nil)
52
- require "parallel"
53
- cpu_num
54
- if processes_num.to_i > 0
55
- @processes_num = [processes_num, cpu_num - 1].min
56
- else
57
- @processes_num = cpu_num > 3 ? cpu_num - 2 : 1
58
- end
59
- end
60
-
61
- # Public: Parses an array of scientific names using several processes
62
- # in parallel.
63
- #
64
- # Scientific names are deduplicated in the process, so every string is
65
- # parsed only once.
66
- #
67
- # names_list - takes an Array of scientific names,
68
- # each element should be a String.
69
- #
70
- # Examples
71
- #
72
- # parser = ParallelParser.new(4)
73
- # parser.parse(["Homo sapiens L.", "Quercus quercus"])
74
- #
75
- # Returns a Hash with scientific names as a key, and parsing results as
76
- # a value.
77
- def parse(names_list)
78
- parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
79
- [n, parse_process(n)]
80
- end
81
- parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
82
- end
83
-
84
- # Public: Returns the number of cores/CPUs.
85
- #
86
- # Returns Integer of cores/CPUs.
87
- def cpu_num
88
- @cpu_num ||= Parallel.processor_count
89
- end
90
-
91
- private
92
- def parse_process(name)
93
- p = ScientificNameParser.new
94
- p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
95
- end
3
+ # CLib is required to free memory after it is used by C
4
+ module CLib
5
+ extend FFI::Library
6
+ ffi_lib FFI::Library::LIBC
7
+ attach_function :free, [:pointer], :void
96
8
  end
97
9
 
98
- # we can use these expressions when we are ready to parse virus names
99
- # class VirusParser
100
- # def initialize
101
- # @order = /^\s*[A-Z][a-z]\+virales/i
102
- # @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
103
- # @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
104
- # @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
105
- # @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
106
- # viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
107
- # @parsed = nil
108
- # end
109
- # end
110
-
111
- class ScientificNameParser
112
-
113
- FAILED_RESULT = ->(name) do
114
- { scientificName:
115
- { id: GnUUID.uuid(name), parsed: false, verbatim: name,
116
- error: "Parser internal error" }
117
- }
118
- end
119
-
120
- def self.add_rank_to_canonical(parsed)
121
- return parsed if parsed[:scientificName][:hybrid]
122
- name = parsed[:scientificName]
123
- parts = name[:canonical].split(" ")
124
- name_ary = parts[0..1]
125
- name[:details][0][:infraspecies].each do |data|
126
- infrasp = data[:string]
127
- rank = data[:rank]
128
- name_ary << (rank && rank != "n/a" ? "#{rank} #{infrasp}" : infrasp)
10
+ module Biodiversity
11
+ # Parser provides a namespace for functions to parse scientific names.
12
+ module Parser
13
+ extend FFI::Library
14
+
15
+ platform = case Gem.platforms[1].os
16
+ when 'linux'
17
+ 'linux'
18
+ when 'darwin'
19
+ 'mac'
20
+ when 'mswin64'
21
+ 'win'
22
+ else
23
+ raise "Unsupported platform: #{Gem.platforms[1].os}"
24
+ end
25
+ ffi_lib File.join(__dir__, '..', '..', 'clib', platform, 'libgnparser.so')
26
+ POINTER_SIZE = FFI.type_size(:pointer)
27
+
28
+ attach_function(:parse_go, :ParseToString, %i[string string], :string)
29
+ attach_function(:parse_ary_go, :ParseAryToStrings,
30
+ %i[pointer int string pointer], :void)
31
+
32
+ def self.parse(name, simple = false)
33
+ format = simple ? 'simple' : 'compact'
34
+ parsed = parse_go(name, format)
35
+ output(parsed, simple)
129
36
  end
130
- parsed[:scientificName][:canonical] = name_ary.join(" ")
131
- parsed
132
- end
133
-
134
- def self.version
135
- Biodiversity::VERSION
136
- end
137
37
 
138
- def self.fix_case(name_string)
139
- name_ary = name_string.split(/\s+/)
140
- words_num = name_ary.size
141
- res = nil
142
- if words_num == 1
143
- res = name_ary[0].gsub(/[\(\)\{\}]/, "")
144
- if res.size > 1
145
- res = UnicodeUtils.upcase(res[0]) + UnicodeUtils.downcase(res[1..-1])
146
- else
147
- res = nil
38
+ def self.parse_ary(ary, simple = false)
39
+ format = simple ? 'simple' : 'compact'
40
+ in_ptr = FFI::MemoryPointer.new(:pointer, ary.length)
41
+ in_ptr.write_array_of_pointer(
42
+ ary.map { |s| FFI::MemoryPointer.from_string(s) }
43
+ )
44
+ out_var = FFI::MemoryPointer.new(:pointer)
45
+ parse_ary_go(in_ptr, ary.length, format, out_var)
46
+
47
+ out_var.read_pointer
48
+ .get_array_of_string(0, ary.length)
49
+ .each_with_object([]) do |prsd, a|
50
+ a << output(prsd, simple)
148
51
  end
149
- else
150
- if name_ary[0].size > 1
151
- word1 = UnicodeUtils.upcase(name_ary[0][0]) +
152
- UnicodeUtils.downcase(name_ary[0][1..-1])
153
- else
154
- word1 = name_ary[0]
52
+ ensure
53
+ out_var.read_pointer.get_array_of_pointer(0, ary.length).each do |p|
54
+ CLib.free(p)
155
55
  end
156
- if name_ary[1].match(/^\(/)
157
- word2 = name_ary[1].gsub(/\)$/, "") + ")"
158
- word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
159
- UnicodeUtils.downcase(word2[2..-1])
160
- else
161
- word2 = UnicodeUtils.downcase(name_ary[1])
162
- end
163
- res = word1 + " " +
164
- word2 + " " +
165
- name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(" ")
166
- res.strip!
56
+ CLib.free(out_var.read_pointer)
167
57
  end
168
- res
169
- end
170
-
171
58
 
172
- def initialize(opts = {})
173
- @canonical_with_rank = !!opts[:canonical_with_rank]
174
- @verbatim = ""
175
- @clean = ScientificNameCleanParser.new
176
- @dirty = ScientificNameDirtyParser.new
177
- @canonical = ScientificNameCanonicalParser.new
178
- @parsed = nil
179
- @tail = nil
180
- end
181
-
182
- def virus?(a_string)
183
- !!(a_string.match(/\sICTV\s*$/) ||
184
- a_string.match(/\b(virus|viruses|particle|particles|
185
- phage|phages|viroid|viroids|virophage|
186
- prion|prions|NPV)\b/ix) ||
187
- a_string.match(/[A-Z]?[a-z]+virus\b/) ||
188
- a_string.match(/\b[A-Za-z]*(satellite[s]?|NPV)\b/))
189
- end
190
-
191
- def noparse?(a_string)
192
- incertae_sedis = a_string.match(/incertae\s+sedis/i) ||
193
- a_string.match(/inc\.\s*sed\./i)
194
- rna = a_string.match(/[^A-Z]RNA[^A-Z]*/)
195
- incertae_sedis || rna
196
- end
197
-
198
- def parsed
199
- @parsed
200
- end
201
-
202
- def parse(a_string)
203
- @verbatim = a_string
204
- a_string, @tail = PreProcessor::clean(a_string)
205
-
206
- if virus?(a_string)
207
- @parsed = { verbatim: @verbatim, virus: true }
208
- elsif noparse?(a_string)
209
- @parsed = { verbatim: @verbatim }
210
- else
211
- begin
212
- @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
213
- unless @parsed
214
- index = @dirty.index || @clean.index
215
- salvage_match = a_string[0..index].split(/\s+/)[0..-2]
216
- salvage_string = salvage_match ? salvage_match.join(" ") : a_string
217
- @parsed = @dirty.parse(salvage_string) ||
218
- @canonical.parse(a_string) ||
219
- { verbatim: @verbatim }
220
- end
221
- rescue
222
- @parsed = FAILED_RESULT.(@verbatim)
223
- end
224
- end
225
-
226
- def @parsed.verbatim=(a_string)
227
- @verbatim = a_string
228
- @id = GnUUID.uuid(@verbatim)
229
- end
230
-
231
- def @parsed.all(opts = {})
232
- canonical_with_rank = !!opts[:canonical_with_rank]
233
- parsed = self.class != Hash
234
- res = { id: @id, parsed: parsed,
235
- parser_version: ScientificNameParser::version}
236
-
237
- if parsed
238
- hybrid = self.hybrid rescue false
239
- res.merge!({
240
- verbatim: @verbatim,
241
- normalized: self.value,
242
- canonical: self.canonical,
243
- hybrid: hybrid,
244
- details: self.details,
245
- parser_run: self.parser_run,
246
- positions: self.pos
247
- })
59
+ def self.output(parsed, simple)
60
+ if simple
61
+ parsed = parsed.split('|')
62
+ {
63
+ id: parsed[0],
64
+ verbatim: parsed[1],
65
+ canonicalName: {
66
+ full: parsed[2],
67
+ simple: parsed[3],
68
+ stem: parsed[4]
69
+ },
70
+ authorship: parsed[5],
71
+ quality: parsed[6]
72
+ }
248
73
  else
249
- res.merge!(self)
250
- end
251
- res[:surrogate] = true if ScientificNameParser.surrogate?(res)
252
- res = {:scientificName => res}
253
- if (canonical_with_rank &&
254
- canonical.count(" ") > 1 &&
255
- res[:scientificName][:details][0][:infraspecies])
256
- ScientificNameParser.add_rank_to_canonical(res)
74
+ JSON.parse(parsed, symbolize_names: true)
257
75
  end
258
- res
259
- end
260
-
261
- def @parsed.pos_json
262
- self.pos.to_json rescue ""
263
76
  end
264
-
265
- def @parsed.all_json
266
- self.all.to_json rescue ""
267
- end
268
-
269
- @parsed.verbatim = @verbatim
270
- res = @parsed.all(canonical_with_rank: @canonical_with_rank)
271
- res[:scientificName].merge!(tail: @tail) if @tail && @tail != ""
272
- res
273
77
  end
274
-
275
- private
276
-
277
- def self.surrogate?(parsed_data)
278
- return false unless parsed_data[:parsed]
279
- name = parsed_data[:verbatim]
280
- pos = parsed_data[:positions].to_a.flatten
281
- surrogate1 = /BOLD:|[\d]{5,}/i
282
- surrogate2 = /\b(spp|sp|nr|cf)[\.]?[\s]*$/i
283
- is_surrogate = false
284
-
285
- ai_index = pos.index("annotation_identification")
286
- if ai_index
287
- ai = name[pos[ai_index - 1]..pos[ai_index + 1]]
288
- is_surrogate = true if ai.match(/^(spp|cf|sp|nr)/)
289
- end
290
- is_surrogate = true if !is_surrogate && (name.match(surrogate1) ||
291
- name.match(surrogate2))
292
- is_surrogate
293
- end
294
- end
78
+ end