biodiversity 3.5.1 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rubocop.yml +9 -6
  4. data/.ruby-version +1 -1
  5. data/.travis.yml +1 -6
  6. data/CHANGELOG +3 -0
  7. data/Gemfile +2 -0
  8. data/README.md +37 -178
  9. data/Rakefile +15 -48
  10. data/biodiversity.gemspec +18 -21
  11. data/clib/linux/libgnparser.h +93 -0
  12. data/clib/linux/libgnparser.so +0 -0
  13. data/clib/mac/libgnparser.h +93 -0
  14. data/clib/mac/libgnparser.so +0 -0
  15. data/lib/biodiversity.rb +4 -9
  16. data/lib/biodiversity/parser.rb +65 -281
  17. data/lib/biodiversity/version.rb +8 -1
  18. data/spec/lib/biodiversity_spec.rb +9 -0
  19. data/spec/lib/parser_spec.rb +38 -0
  20. data/spec/spec_helper.rb +4 -81
  21. metadata +27 -102
  22. data/.byebug_history +0 -18
  23. data/.document +0 -5
  24. data/examples/socket_client.rb +0 -25
  25. data/lib/biodiversity/guid.rb +0 -1
  26. data/lib/biodiversity/guid/lsid.rb +0 -16
  27. data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
  28. data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
  29. data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
  30. data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
  31. data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
  32. data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
  33. data/spec/biodiversity_spec.rb +0 -11
  34. data/spec/files/test_data.txt +0 -490
  35. data/spec/files/todo.txt +0 -55
  36. data/spec/guid/lsid.spec.rb +0 -15
  37. data/spec/parser/scientific_name_canonical_spec.rb +0 -36
  38. data/spec/parser/scientific_name_clean_spec.rb +0 -1137
  39. data/spec/parser/scientific_name_dirty_spec.rb +0 -165
  40. data/spec/parser/scientific_name_spec.rb +0 -193
@@ -0,0 +1,93 @@
1
+ /* Code generated by cmd/cgo; DO NOT EDIT. */
2
+
3
+ /* package gitlab.com/gogna/gnparser/binding */
4
+
5
+
6
+ #line 1 "cgo-builtin-export-prolog"
7
+
8
+ #include <stddef.h> /* for ptrdiff_t below */
9
+
10
+ #ifndef GO_CGO_EXPORT_PROLOGUE_H
11
+ #define GO_CGO_EXPORT_PROLOGUE_H
12
+
13
+ #ifndef GO_CGO_GOSTRING_TYPEDEF
14
+ typedef struct { const char *p; ptrdiff_t n; } _GoString_;
15
+ #endif
16
+
17
+ #endif
18
+
19
+ /* Start of preamble from import "C" comments. */
20
+
21
+
22
+ #line 3 "main.go"
23
+
24
+ #include "stdlib.h"
25
+
26
+ #line 1 "cgo-generated-wrapper"
27
+
28
+
29
+ /* End of preamble from import "C" comments. */
30
+
31
+
32
+ /* Start of boilerplate cgo prologue. */
33
+ #line 1 "cgo-gcc-export-header-prolog"
34
+
35
+ #ifndef GO_CGO_PROLOGUE_H
36
+ #define GO_CGO_PROLOGUE_H
37
+
38
+ typedef signed char GoInt8;
39
+ typedef unsigned char GoUint8;
40
+ typedef short GoInt16;
41
+ typedef unsigned short GoUint16;
42
+ typedef int GoInt32;
43
+ typedef unsigned int GoUint32;
44
+ typedef long long GoInt64;
45
+ typedef unsigned long long GoUint64;
46
+ typedef GoInt64 GoInt;
47
+ typedef GoUint64 GoUint;
48
+ typedef __SIZE_TYPE__ GoUintptr;
49
+ typedef float GoFloat32;
50
+ typedef double GoFloat64;
51
+ typedef float _Complex GoComplex64;
52
+ typedef double _Complex GoComplex128;
53
+
54
+ /*
55
+ static assertion to make sure the file is being used on architecture
56
+ at least with matching size of GoInt.
57
+ */
58
+ typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
59
+
60
+ #ifndef GO_CGO_GOSTRING_TYPEDEF
61
+ typedef _GoString_ GoString;
62
+ #endif
63
+ typedef void *GoMap;
64
+ typedef void *GoChan;
65
+ typedef struct { void *t; void *v; } GoInterface;
66
+ typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
67
+
68
+ #endif
69
+
70
+ /* End of boilerplate cgo prologue. */
71
+
72
+ #ifdef __cplusplus
73
+ extern "C" {
74
+ #endif
75
+
76
+
77
+ // ParseToString function takes a name-string, desired format, and parses
78
+ // the name-string to either JSON, or pipe-separated values, depending on
79
+ // the desired format. Format can take values of 'simple', 'compact', 'pretty'.
80
+
81
+ extern char* ParseToString(char* p0, char* p1);
82
+
83
+ // ParseAryToStrings function takes an array of names, parsing format and a
84
+ // reference to an output: an empty array of strings to return the data
85
+ // back. It populates the output array with raw strings of either JSON or
86
+ // pipe-separated parsed values (depending on a given format). Format can take
87
+ // values of 'simple', 'compact', or 'pretty'.
88
+
89
+ extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
90
+
91
+ #ifdef __cplusplus
92
+ }
93
+ #endif
Binary file
@@ -0,0 +1,93 @@
1
+ /* Code generated by cmd/cgo; DO NOT EDIT. */
2
+
3
+ /* package gitlab.com/gogna/gnparser/binding */
4
+
5
+
6
+ #line 1 "cgo-builtin-export-prolog"
7
+
8
+ #include <stddef.h> /* for ptrdiff_t below */
9
+
10
+ #ifndef GO_CGO_EXPORT_PROLOGUE_H
11
+ #define GO_CGO_EXPORT_PROLOGUE_H
12
+
13
+ #ifndef GO_CGO_GOSTRING_TYPEDEF
14
+ typedef struct { const char *p; ptrdiff_t n; } _GoString_;
15
+ #endif
16
+
17
+ #endif
18
+
19
+ /* Start of preamble from import "C" comments. */
20
+
21
+
22
+ #line 3 "main.go"
23
+
24
+ #include "stdlib.h"
25
+
26
+ #line 1 "cgo-generated-wrapper"
27
+
28
+
29
+ /* End of preamble from import "C" comments. */
30
+
31
+
32
+ /* Start of boilerplate cgo prologue. */
33
+ #line 1 "cgo-gcc-export-header-prolog"
34
+
35
+ #ifndef GO_CGO_PROLOGUE_H
36
+ #define GO_CGO_PROLOGUE_H
37
+
38
+ typedef signed char GoInt8;
39
+ typedef unsigned char GoUint8;
40
+ typedef short GoInt16;
41
+ typedef unsigned short GoUint16;
42
+ typedef int GoInt32;
43
+ typedef unsigned int GoUint32;
44
+ typedef long long GoInt64;
45
+ typedef unsigned long long GoUint64;
46
+ typedef GoInt64 GoInt;
47
+ typedef GoUint64 GoUint;
48
+ typedef __SIZE_TYPE__ GoUintptr;
49
+ typedef float GoFloat32;
50
+ typedef double GoFloat64;
51
+ typedef float _Complex GoComplex64;
52
+ typedef double _Complex GoComplex128;
53
+
54
+ /*
55
+ static assertion to make sure the file is being used on architecture
56
+ at least with matching size of GoInt.
57
+ */
58
+ typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
59
+
60
+ #ifndef GO_CGO_GOSTRING_TYPEDEF
61
+ typedef _GoString_ GoString;
62
+ #endif
63
+ typedef void *GoMap;
64
+ typedef void *GoChan;
65
+ typedef struct { void *t; void *v; } GoInterface;
66
+ typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
67
+
68
+ #endif
69
+
70
+ /* End of boilerplate cgo prologue. */
71
+
72
+ #ifdef __cplusplus
73
+ extern "C" {
74
+ #endif
75
+
76
+
77
+ // ParseToString function takes a name-string, desired format, and parses
78
+ // the name-string to either JSON, or pipe-separated values, depending on
79
+ // the desired format. Format can take values of 'simple', 'compact', 'pretty'.
80
+
81
+ extern char* ParseToString(char* p0, char* p1);
82
+
83
+ // ParseAryToStrings function takes an array of names, parsing format and a
84
+ // reference to an output: an empty array of strings to return the data
85
+ // back. It populates the output array with raw strings of either JSON or
86
+ // pipe-separated parsed values (depending on a given format). Format can take
87
+ // values of 'simple', 'compact', or 'pretty'.
88
+
89
+ extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
90
+
91
+ #ifdef __cplusplus
92
+ }
93
+ #endif
Binary file
data/lib/biodiversity.rb CHANGED
@@ -1,15 +1,10 @@
1
- require 'treetop'
1
+ # frozen_string_literal: true
2
+
3
+ require 'ffi'
2
4
  require 'json'
3
- require 'open-uri'
4
5
  require_relative 'biodiversity/version'
5
6
  require_relative 'biodiversity/parser'
6
- require_relative 'biodiversity/guid'
7
7
 
8
+ # Biodiversity module provides a namespace for scientific name parser.
8
9
  module Biodiversity
9
- LSID_RESOLVER_URL = 'http://lsid.tdwg.org/'
10
-
11
- def self.version
12
- VERSION
13
- end
14
10
  end
15
-
@@ -1,294 +1,78 @@
1
- # encoding: UTF-8
2
- require "gn_uuid"
3
- require_relative "parser/scientific_name_clean"
4
- require_relative "parser/scientific_name_dirty"
5
- require_relative "parser/scientific_name_canonical"
1
+ # frozen_string_literal: true
6
2
 
7
- module PreProcessor
8
- NOTES = /\s+(species\s+group|species\s+complex|group|author)\b.*$/i
9
- TAXON_CONCEPTS1 = /\s+(sensu\.|sensu|auct\.|auct)\b.*$/i
10
- TAXON_CONCEPTS2 = /\s+
11
- (\(?s\.\s?s\.|
12
- \(?s\.\s?l\.|
13
- \(?s\.\s?str\.|
14
- \(?s\.\s?lat\.|
15
- sec\.|sec|near)\b.*$/x
16
- TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
17
- NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen\b|\(?nom\.|\(?comb\.).*$/i
18
- LAST_WORD_JUNK = /(,\s*|\s+)
19
- (spp\.|spp|var\.|
20
- var|von|van|ined\.|
21
- ined|sensu|new|non|nec|
22
- nudum|cf\.|cf|sp\.|sp|
23
- ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
24
-
25
- def self.clean(a_string)
26
- orig = a_string
27
- [NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
28
- TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
29
- a_string = a_string.gsub(i, "")
30
- end
31
- tail = orig[a_string.size..-1]
32
- a_string = a_string.tr("ſ","s") #old "s"
33
- a_string = a_string.tr("_", " ") if a_string.strip.match(/\s/).nil?
34
- [a_string, tail.strip]
35
- end
36
- end
37
-
38
- # Public: Parser which runs in parallel.
39
- #
40
- # Examples
41
- #
42
- # parser = ParallelParser.new(4)
43
- # parser.parse(["Betula L.", "Pardosa moesta"])
44
- class ParallelParser
45
-
46
- # Public: Initialize ParallelParser.
47
- #
48
- # processes_num - an Integer to setup the number of processes (default: nil).
49
- # If processes number is not set it will be determined
50
- # automatically.
51
- def initialize(processes_num = nil)
52
- require "parallel"
53
- cpu_num
54
- if processes_num.to_i > 0
55
- @processes_num = [processes_num, cpu_num - 1].min
56
- else
57
- @processes_num = cpu_num > 3 ? cpu_num - 2 : 1
58
- end
59
- end
60
-
61
- # Public: Parses an array of scientific names using several processes
62
- # in parallel.
63
- #
64
- # Scientific names are deduplicated in the process, so every string is
65
- # parsed only once.
66
- #
67
- # names_list - takes an Array of scientific names,
68
- # each element should be a String.
69
- #
70
- # Examples
71
- #
72
- # parser = ParallelParser.new(4)
73
- # parser.parse(["Homo sapiens L.", "Quercus quercus"])
74
- #
75
- # Returns a Hash with scientific names as a key, and parsing results as
76
- # a value.
77
- def parse(names_list)
78
- parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
79
- [n, parse_process(n)]
80
- end
81
- parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
82
- end
83
-
84
- # Public: Returns the number of cores/CPUs.
85
- #
86
- # Returns Integer of cores/CPUs.
87
- def cpu_num
88
- @cpu_num ||= Parallel.processor_count
89
- end
90
-
91
- private
92
- def parse_process(name)
93
- p = ScientificNameParser.new
94
- p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
95
- end
3
+ # CLib is required to free memory after it is used by C
4
+ module CLib
5
+ extend FFI::Library
6
+ ffi_lib FFI::Library::LIBC
7
+ attach_function :free, [:pointer], :void
96
8
  end
97
9
 
98
- # we can use these expressions when we are ready to parse virus names
99
- # class VirusParser
100
- # def initialize
101
- # @order = /^\s*[A-Z][a-z]\+virales/i
102
- # @family = /^\s*[A-Z][a-z]\+viridae|viroidae/i
103
- # @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
104
- # @genus = /^\s*[A-Z][a-z]\+virus|viroid/i
105
- # @species = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
106
- # viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
107
- # @parsed = nil
108
- # end
109
- # end
110
-
111
- class ScientificNameParser
112
-
113
- FAILED_RESULT = ->(name) do
114
- { scientificName:
115
- { id: GnUUID.uuid(name), parsed: false, verbatim: name,
116
- error: "Parser internal error" }
117
- }
118
- end
119
-
120
- def self.add_rank_to_canonical(parsed)
121
- return parsed if parsed[:scientificName][:hybrid]
122
- name = parsed[:scientificName]
123
- parts = name[:canonical].split(" ")
124
- name_ary = parts[0..1]
125
- name[:details][0][:infraspecies].each do |data|
126
- infrasp = data[:string]
127
- rank = data[:rank]
128
- name_ary << (rank && rank != "n/a" ? "#{rank} #{infrasp}" : infrasp)
10
+ module Biodiversity
11
+ # Parser provides a namespace for functions to parse scientific names.
12
+ module Parser
13
+ extend FFI::Library
14
+
15
+ platform = case Gem.platforms[1].os
16
+ when 'linux'
17
+ 'linux'
18
+ when 'darwin'
19
+ 'mac'
20
+ when 'mswin64'
21
+ 'win'
22
+ else
23
+ raise "Unsupported platform: #{Gem.platforms[1].os}"
24
+ end
25
+ ffi_lib File.join(__dir__, '..', '..', 'clib', platform, 'libgnparser.so')
26
+ POINTER_SIZE = FFI.type_size(:pointer)
27
+
28
+ attach_function(:parse_go, :ParseToString, %i[string string], :string)
29
+ attach_function(:parse_ary_go, :ParseAryToStrings,
30
+ %i[pointer int string pointer], :void)
31
+
32
+ def self.parse(name, simple = false)
33
+ format = simple ? 'simple' : 'compact'
34
+ parsed = parse_go(name, format)
35
+ output(parsed, simple)
129
36
  end
130
- parsed[:scientificName][:canonical] = name_ary.join(" ")
131
- parsed
132
- end
133
-
134
- def self.version
135
- Biodiversity::VERSION
136
- end
137
37
 
138
- def self.fix_case(name_string)
139
- name_ary = name_string.split(/\s+/)
140
- words_num = name_ary.size
141
- res = nil
142
- if words_num == 1
143
- res = name_ary[0].gsub(/[\(\)\{\}]/, "")
144
- if res.size > 1
145
- res = UnicodeUtils.upcase(res[0]) + UnicodeUtils.downcase(res[1..-1])
146
- else
147
- res = nil
38
+ def self.parse_ary(ary, simple = false)
39
+ format = simple ? 'simple' : 'compact'
40
+ in_ptr = FFI::MemoryPointer.new(:pointer, ary.length)
41
+ in_ptr.write_array_of_pointer(
42
+ ary.map { |s| FFI::MemoryPointer.from_string(s) }
43
+ )
44
+ out_var = FFI::MemoryPointer.new(:pointer)
45
+ parse_ary_go(in_ptr, ary.length, format, out_var)
46
+
47
+ out_var.read_pointer
48
+ .get_array_of_string(0, ary.length)
49
+ .each_with_object([]) do |prsd, a|
50
+ a << output(prsd, simple)
148
51
  end
149
- else
150
- if name_ary[0].size > 1
151
- word1 = UnicodeUtils.upcase(name_ary[0][0]) +
152
- UnicodeUtils.downcase(name_ary[0][1..-1])
153
- else
154
- word1 = name_ary[0]
52
+ ensure
53
+ out_var.read_pointer.get_array_of_pointer(0, ary.length).each do |p|
54
+ CLib.free(p)
155
55
  end
156
- if name_ary[1].match(/^\(/)
157
- word2 = name_ary[1].gsub(/\)$/, "") + ")"
158
- word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
159
- UnicodeUtils.downcase(word2[2..-1])
160
- else
161
- word2 = UnicodeUtils.downcase(name_ary[1])
162
- end
163
- res = word1 + " " +
164
- word2 + " " +
165
- name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(" ")
166
- res.strip!
56
+ CLib.free(out_var.read_pointer)
167
57
  end
168
- res
169
- end
170
-
171
58
 
172
- def initialize(opts = {})
173
- @canonical_with_rank = !!opts[:canonical_with_rank]
174
- @verbatim = ""
175
- @clean = ScientificNameCleanParser.new
176
- @dirty = ScientificNameDirtyParser.new
177
- @canonical = ScientificNameCanonicalParser.new
178
- @parsed = nil
179
- @tail = nil
180
- end
181
-
182
- def virus?(a_string)
183
- !!(a_string.match(/\sICTV\s*$/) ||
184
- a_string.match(/\b(virus|viruses|particle|particles|
185
- phage|phages|viroid|viroids|virophage|
186
- prion|prions|NPV)\b/ix) ||
187
- a_string.match(/[A-Z]?[a-z]+virus\b/) ||
188
- a_string.match(/\b[A-Za-z]*(satellite[s]?|NPV)\b/))
189
- end
190
-
191
- def noparse?(a_string)
192
- incertae_sedis = a_string.match(/incertae\s+sedis/i) ||
193
- a_string.match(/inc\.\s*sed\./i)
194
- rna = a_string.match(/[^A-Z]RNA[^A-Z]*/)
195
- incertae_sedis || rna
196
- end
197
-
198
- def parsed
199
- @parsed
200
- end
201
-
202
- def parse(a_string)
203
- @verbatim = a_string
204
- a_string, @tail = PreProcessor::clean(a_string)
205
-
206
- if virus?(a_string)
207
- @parsed = { verbatim: @verbatim, virus: true }
208
- elsif noparse?(a_string)
209
- @parsed = { verbatim: @verbatim }
210
- else
211
- begin
212
- @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
213
- unless @parsed
214
- index = @dirty.index || @clean.index
215
- salvage_match = a_string[0..index].split(/\s+/)[0..-2]
216
- salvage_string = salvage_match ? salvage_match.join(" ") : a_string
217
- @parsed = @dirty.parse(salvage_string) ||
218
- @canonical.parse(a_string) ||
219
- { verbatim: @verbatim }
220
- end
221
- rescue
222
- @parsed = FAILED_RESULT.(@verbatim)
223
- end
224
- end
225
-
226
- def @parsed.verbatim=(a_string)
227
- @verbatim = a_string
228
- @id = GnUUID.uuid(@verbatim)
229
- end
230
-
231
- def @parsed.all(opts = {})
232
- canonical_with_rank = !!opts[:canonical_with_rank]
233
- parsed = self.class != Hash
234
- res = { id: @id, parsed: parsed,
235
- parser_version: ScientificNameParser::version}
236
-
237
- if parsed
238
- hybrid = self.hybrid rescue false
239
- res.merge!({
240
- verbatim: @verbatim,
241
- normalized: self.value,
242
- canonical: self.canonical,
243
- hybrid: hybrid,
244
- details: self.details,
245
- parser_run: self.parser_run,
246
- positions: self.pos
247
- })
59
+ def self.output(parsed, simple)
60
+ if simple
61
+ parsed = parsed.split('|')
62
+ {
63
+ id: parsed[0],
64
+ verbatim: parsed[1],
65
+ canonicalName: {
66
+ full: parsed[2],
67
+ simple: parsed[3],
68
+ stem: parsed[4]
69
+ },
70
+ authorship: parsed[5],
71
+ quality: parsed[6]
72
+ }
248
73
  else
249
- res.merge!(self)
250
- end
251
- res[:surrogate] = true if ScientificNameParser.surrogate?(res)
252
- res = {:scientificName => res}
253
- if (canonical_with_rank &&
254
- canonical.count(" ") > 1 &&
255
- res[:scientificName][:details][0][:infraspecies])
256
- ScientificNameParser.add_rank_to_canonical(res)
74
+ JSON.parse(parsed, symbolize_names: true)
257
75
  end
258
- res
259
- end
260
-
261
- def @parsed.pos_json
262
- self.pos.to_json rescue ""
263
76
  end
264
-
265
- def @parsed.all_json
266
- self.all.to_json rescue ""
267
- end
268
-
269
- @parsed.verbatim = @verbatim
270
- res = @parsed.all(canonical_with_rank: @canonical_with_rank)
271
- res[:scientificName].merge!(tail: @tail) if @tail && @tail != ""
272
- res
273
77
  end
274
-
275
- private
276
-
277
- def self.surrogate?(parsed_data)
278
- return false unless parsed_data[:parsed]
279
- name = parsed_data[:verbatim]
280
- pos = parsed_data[:positions].to_a.flatten
281
- surrogate1 = /BOLD:|[\d]{5,}/i
282
- surrogate2 = /\b(spp|sp|nr|cf)[\.]?[\s]*$/i
283
- is_surrogate = false
284
-
285
- ai_index = pos.index("annotation_identification")
286
- if ai_index
287
- ai = name[pos[ai_index - 1]..pos[ai_index + 1]]
288
- is_surrogate = true if ai.match(/^(spp|cf|sp|nr)/)
289
- end
290
- is_surrogate = true if !is_surrogate && (name.match(surrogate1) ||
291
- name.match(surrogate2))
292
- is_surrogate
293
- end
294
- end
78
+ end