biodiversity 3.5.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +9 -6
- data/.ruby-version +1 -1
- data/.travis.yml +1 -6
- data/CHANGELOG +3 -0
- data/Gemfile +2 -0
- data/README.md +37 -178
- data/Rakefile +15 -48
- data/biodiversity.gemspec +18 -21
- data/clib/linux/libgnparser.h +93 -0
- data/clib/linux/libgnparser.so +0 -0
- data/clib/mac/libgnparser.h +93 -0
- data/clib/mac/libgnparser.so +0 -0
- data/lib/biodiversity.rb +4 -9
- data/lib/biodiversity/parser.rb +65 -281
- data/lib/biodiversity/version.rb +8 -1
- data/spec/lib/biodiversity_spec.rb +9 -0
- data/spec/lib/parser_spec.rb +38 -0
- data/spec/spec_helper.rb +4 -81
- metadata +27 -102
- data/.byebug_history +0 -18
- data/.document +0 -5
- data/examples/socket_client.rb +0 -25
- data/lib/biodiversity/guid.rb +0 -1
- data/lib/biodiversity/guid/lsid.rb +0 -16
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
- data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
- data/spec/biodiversity_spec.rb +0 -11
- data/spec/files/test_data.txt +0 -490
- data/spec/files/todo.txt +0 -55
- data/spec/guid/lsid.spec.rb +0 -15
- data/spec/parser/scientific_name_canonical_spec.rb +0 -36
- data/spec/parser/scientific_name_clean_spec.rb +0 -1137
- data/spec/parser/scientific_name_dirty_spec.rb +0 -165
- data/spec/parser/scientific_name_spec.rb +0 -193
@@ -0,0 +1,93 @@
|
|
1
|
+
/* Code generated by cmd/cgo; DO NOT EDIT. */
|
2
|
+
|
3
|
+
/* package gitlab.com/gogna/gnparser/binding */
|
4
|
+
|
5
|
+
|
6
|
+
#line 1 "cgo-builtin-export-prolog"
|
7
|
+
|
8
|
+
#include <stddef.h> /* for ptrdiff_t below */
|
9
|
+
|
10
|
+
#ifndef GO_CGO_EXPORT_PROLOGUE_H
|
11
|
+
#define GO_CGO_EXPORT_PROLOGUE_H
|
12
|
+
|
13
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
14
|
+
typedef struct { const char *p; ptrdiff_t n; } _GoString_;
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#endif
|
18
|
+
|
19
|
+
/* Start of preamble from import "C" comments. */
|
20
|
+
|
21
|
+
|
22
|
+
#line 3 "main.go"
|
23
|
+
|
24
|
+
#include "stdlib.h"
|
25
|
+
|
26
|
+
#line 1 "cgo-generated-wrapper"
|
27
|
+
|
28
|
+
|
29
|
+
/* End of preamble from import "C" comments. */
|
30
|
+
|
31
|
+
|
32
|
+
/* Start of boilerplate cgo prologue. */
|
33
|
+
#line 1 "cgo-gcc-export-header-prolog"
|
34
|
+
|
35
|
+
#ifndef GO_CGO_PROLOGUE_H
|
36
|
+
#define GO_CGO_PROLOGUE_H
|
37
|
+
|
38
|
+
typedef signed char GoInt8;
|
39
|
+
typedef unsigned char GoUint8;
|
40
|
+
typedef short GoInt16;
|
41
|
+
typedef unsigned short GoUint16;
|
42
|
+
typedef int GoInt32;
|
43
|
+
typedef unsigned int GoUint32;
|
44
|
+
typedef long long GoInt64;
|
45
|
+
typedef unsigned long long GoUint64;
|
46
|
+
typedef GoInt64 GoInt;
|
47
|
+
typedef GoUint64 GoUint;
|
48
|
+
typedef __SIZE_TYPE__ GoUintptr;
|
49
|
+
typedef float GoFloat32;
|
50
|
+
typedef double GoFloat64;
|
51
|
+
typedef float _Complex GoComplex64;
|
52
|
+
typedef double _Complex GoComplex128;
|
53
|
+
|
54
|
+
/*
|
55
|
+
static assertion to make sure the file is being used on architecture
|
56
|
+
at least with matching size of GoInt.
|
57
|
+
*/
|
58
|
+
typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
|
59
|
+
|
60
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
61
|
+
typedef _GoString_ GoString;
|
62
|
+
#endif
|
63
|
+
typedef void *GoMap;
|
64
|
+
typedef void *GoChan;
|
65
|
+
typedef struct { void *t; void *v; } GoInterface;
|
66
|
+
typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
|
67
|
+
|
68
|
+
#endif
|
69
|
+
|
70
|
+
/* End of boilerplate cgo prologue. */
|
71
|
+
|
72
|
+
#ifdef __cplusplus
|
73
|
+
extern "C" {
|
74
|
+
#endif
|
75
|
+
|
76
|
+
|
77
|
+
// ParseToString function takes a name-string, desired format, and parses
|
78
|
+
// the name-string to either JSON, or pipe-separated values, depending on
|
79
|
+
// the desired format. Format can take values of 'simple', 'compact', 'pretty'.
|
80
|
+
|
81
|
+
extern char* ParseToString(char* p0, char* p1);
|
82
|
+
|
83
|
+
// ParseAryToStrings function takes an array of names, parsing format and a
|
84
|
+
// reference to an output: an empty array of strings to return the data
|
85
|
+
// back. It populates the output array with raw strings of either JSON or
|
86
|
+
// pipe-separated parsed values (depending on a given format). Format can take
|
87
|
+
// values of 'simple', 'compact', or 'pretty'.
|
88
|
+
|
89
|
+
extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
|
90
|
+
|
91
|
+
#ifdef __cplusplus
|
92
|
+
}
|
93
|
+
#endif
|
Binary file
|
@@ -0,0 +1,93 @@
|
|
1
|
+
/* Code generated by cmd/cgo; DO NOT EDIT. */
|
2
|
+
|
3
|
+
/* package gitlab.com/gogna/gnparser/binding */
|
4
|
+
|
5
|
+
|
6
|
+
#line 1 "cgo-builtin-export-prolog"
|
7
|
+
|
8
|
+
#include <stddef.h> /* for ptrdiff_t below */
|
9
|
+
|
10
|
+
#ifndef GO_CGO_EXPORT_PROLOGUE_H
|
11
|
+
#define GO_CGO_EXPORT_PROLOGUE_H
|
12
|
+
|
13
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
14
|
+
typedef struct { const char *p; ptrdiff_t n; } _GoString_;
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#endif
|
18
|
+
|
19
|
+
/* Start of preamble from import "C" comments. */
|
20
|
+
|
21
|
+
|
22
|
+
#line 3 "main.go"
|
23
|
+
|
24
|
+
#include "stdlib.h"
|
25
|
+
|
26
|
+
#line 1 "cgo-generated-wrapper"
|
27
|
+
|
28
|
+
|
29
|
+
/* End of preamble from import "C" comments. */
|
30
|
+
|
31
|
+
|
32
|
+
/* Start of boilerplate cgo prologue. */
|
33
|
+
#line 1 "cgo-gcc-export-header-prolog"
|
34
|
+
|
35
|
+
#ifndef GO_CGO_PROLOGUE_H
|
36
|
+
#define GO_CGO_PROLOGUE_H
|
37
|
+
|
38
|
+
typedef signed char GoInt8;
|
39
|
+
typedef unsigned char GoUint8;
|
40
|
+
typedef short GoInt16;
|
41
|
+
typedef unsigned short GoUint16;
|
42
|
+
typedef int GoInt32;
|
43
|
+
typedef unsigned int GoUint32;
|
44
|
+
typedef long long GoInt64;
|
45
|
+
typedef unsigned long long GoUint64;
|
46
|
+
typedef GoInt64 GoInt;
|
47
|
+
typedef GoUint64 GoUint;
|
48
|
+
typedef __SIZE_TYPE__ GoUintptr;
|
49
|
+
typedef float GoFloat32;
|
50
|
+
typedef double GoFloat64;
|
51
|
+
typedef float _Complex GoComplex64;
|
52
|
+
typedef double _Complex GoComplex128;
|
53
|
+
|
54
|
+
/*
|
55
|
+
static assertion to make sure the file is being used on architecture
|
56
|
+
at least with matching size of GoInt.
|
57
|
+
*/
|
58
|
+
typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
|
59
|
+
|
60
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
61
|
+
typedef _GoString_ GoString;
|
62
|
+
#endif
|
63
|
+
typedef void *GoMap;
|
64
|
+
typedef void *GoChan;
|
65
|
+
typedef struct { void *t; void *v; } GoInterface;
|
66
|
+
typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
|
67
|
+
|
68
|
+
#endif
|
69
|
+
|
70
|
+
/* End of boilerplate cgo prologue. */
|
71
|
+
|
72
|
+
#ifdef __cplusplus
|
73
|
+
extern "C" {
|
74
|
+
#endif
|
75
|
+
|
76
|
+
|
77
|
+
// ParseToString function takes a name-string, desired format, and parses
|
78
|
+
// the name-string to either JSON, or pipe-separated values, depending on
|
79
|
+
// the desired format. Format can take values of 'simple', 'compact', 'pretty'.
|
80
|
+
|
81
|
+
extern char* ParseToString(char* p0, char* p1);
|
82
|
+
|
83
|
+
// ParseAryToStrings function takes an array of names, parsing format and a
|
84
|
+
// reference to an output: an empty array of strings to return the data
|
85
|
+
// back. It populates the output array with raw strings of either JSON or
|
86
|
+
// pipe-separated parsed values (depending on a given format). Format can take
|
87
|
+
// values of 'simple', 'compact', or 'pretty'.
|
88
|
+
|
89
|
+
extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
|
90
|
+
|
91
|
+
#ifdef __cplusplus
|
92
|
+
}
|
93
|
+
#endif
|
Binary file
|
data/lib/biodiversity.rb
CHANGED
@@ -1,15 +1,10 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'ffi'
|
2
4
|
require 'json'
|
3
|
-
require 'open-uri'
|
4
5
|
require_relative 'biodiversity/version'
|
5
6
|
require_relative 'biodiversity/parser'
|
6
|
-
require_relative 'biodiversity/guid'
|
7
7
|
|
8
|
+
# Biodiversity module provides a namespace for scientific name parser.
|
8
9
|
module Biodiversity
|
9
|
-
LSID_RESOLVER_URL = 'http://lsid.tdwg.org/'
|
10
|
-
|
11
|
-
def self.version
|
12
|
-
VERSION
|
13
|
-
end
|
14
10
|
end
|
15
|
-
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -1,294 +1,78 @@
|
|
1
|
-
#
|
2
|
-
require "gn_uuid"
|
3
|
-
require_relative "parser/scientific_name_clean"
|
4
|
-
require_relative "parser/scientific_name_dirty"
|
5
|
-
require_relative "parser/scientific_name_canonical"
|
1
|
+
# frozen_string_literal: true
|
6
2
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
\(?s\.\s?l\.|
|
13
|
-
\(?s\.\s?str\.|
|
14
|
-
\(?s\.\s?lat\.|
|
15
|
-
sec\.|sec|near)\b.*$/x
|
16
|
-
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
|
17
|
-
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen\b|\(?nom\.|\(?comb\.).*$/i
|
18
|
-
LAST_WORD_JUNK = /(,\s*|\s+)
|
19
|
-
(spp\.|spp|var\.|
|
20
|
-
var|von|van|ined\.|
|
21
|
-
ined|sensu|new|non|nec|
|
22
|
-
nudum|cf\.|cf|sp\.|sp|
|
23
|
-
ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
|
24
|
-
|
25
|
-
def self.clean(a_string)
|
26
|
-
orig = a_string
|
27
|
-
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
28
|
-
TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
29
|
-
a_string = a_string.gsub(i, "")
|
30
|
-
end
|
31
|
-
tail = orig[a_string.size..-1]
|
32
|
-
a_string = a_string.tr("ſ","s") #old "s"
|
33
|
-
a_string = a_string.tr("_", " ") if a_string.strip.match(/\s/).nil?
|
34
|
-
[a_string, tail.strip]
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
# Public: Parser which runs in parallel.
|
39
|
-
#
|
40
|
-
# Examples
|
41
|
-
#
|
42
|
-
# parser = ParallelParser.new(4)
|
43
|
-
# parser.parse(["Betula L.", "Pardosa moesta"])
|
44
|
-
class ParallelParser
|
45
|
-
|
46
|
-
# Public: Initialize ParallelParser.
|
47
|
-
#
|
48
|
-
# processes_num - an Integer to setup the number of processes (default: nil).
|
49
|
-
# If processes number is not set it will be determined
|
50
|
-
# automatically.
|
51
|
-
def initialize(processes_num = nil)
|
52
|
-
require "parallel"
|
53
|
-
cpu_num
|
54
|
-
if processes_num.to_i > 0
|
55
|
-
@processes_num = [processes_num, cpu_num - 1].min
|
56
|
-
else
|
57
|
-
@processes_num = cpu_num > 3 ? cpu_num - 2 : 1
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
# Public: Parses an array of scientific names using several processes
|
62
|
-
# in parallel.
|
63
|
-
#
|
64
|
-
# Scientific names are deduplicated in the process, so every string is
|
65
|
-
# parsed only once.
|
66
|
-
#
|
67
|
-
# names_list - takes an Array of scientific names,
|
68
|
-
# each element should be a String.
|
69
|
-
#
|
70
|
-
# Examples
|
71
|
-
#
|
72
|
-
# parser = ParallelParser.new(4)
|
73
|
-
# parser.parse(["Homo sapiens L.", "Quercus quercus"])
|
74
|
-
#
|
75
|
-
# Returns a Hash with scientific names as a key, and parsing results as
|
76
|
-
# a value.
|
77
|
-
def parse(names_list)
|
78
|
-
parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
|
79
|
-
[n, parse_process(n)]
|
80
|
-
end
|
81
|
-
parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
|
82
|
-
end
|
83
|
-
|
84
|
-
# Public: Returns the number of cores/CPUs.
|
85
|
-
#
|
86
|
-
# Returns Integer of cores/CPUs.
|
87
|
-
def cpu_num
|
88
|
-
@cpu_num ||= Parallel.processor_count
|
89
|
-
end
|
90
|
-
|
91
|
-
private
|
92
|
-
def parse_process(name)
|
93
|
-
p = ScientificNameParser.new
|
94
|
-
p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
|
95
|
-
end
|
3
|
+
# CLib is required to free memory after it is used by C
|
4
|
+
module CLib
|
5
|
+
extend FFI::Library
|
6
|
+
ffi_lib FFI::Library::LIBC
|
7
|
+
attach_function :free, [:pointer], :void
|
96
8
|
end
|
97
9
|
|
98
|
-
|
99
|
-
#
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
name_ary = parts[0..1]
|
125
|
-
name[:details][0][:infraspecies].each do |data|
|
126
|
-
infrasp = data[:string]
|
127
|
-
rank = data[:rank]
|
128
|
-
name_ary << (rank && rank != "n/a" ? "#{rank} #{infrasp}" : infrasp)
|
10
|
+
module Biodiversity
|
11
|
+
# Parser provides a namespace for functions to parse scientific names.
|
12
|
+
module Parser
|
13
|
+
extend FFI::Library
|
14
|
+
|
15
|
+
platform = case Gem.platforms[1].os
|
16
|
+
when 'linux'
|
17
|
+
'linux'
|
18
|
+
when 'darwin'
|
19
|
+
'mac'
|
20
|
+
when 'mswin64'
|
21
|
+
'win'
|
22
|
+
else
|
23
|
+
raise "Unsupported platform: #{Gem.platforms[1].os}"
|
24
|
+
end
|
25
|
+
ffi_lib File.join(__dir__, '..', '..', 'clib', platform, 'libgnparser.so')
|
26
|
+
POINTER_SIZE = FFI.type_size(:pointer)
|
27
|
+
|
28
|
+
attach_function(:parse_go, :ParseToString, %i[string string], :string)
|
29
|
+
attach_function(:parse_ary_go, :ParseAryToStrings,
|
30
|
+
%i[pointer int string pointer], :void)
|
31
|
+
|
32
|
+
def self.parse(name, simple = false)
|
33
|
+
format = simple ? 'simple' : 'compact'
|
34
|
+
parsed = parse_go(name, format)
|
35
|
+
output(parsed, simple)
|
129
36
|
end
|
130
|
-
parsed[:scientificName][:canonical] = name_ary.join(" ")
|
131
|
-
parsed
|
132
|
-
end
|
133
|
-
|
134
|
-
def self.version
|
135
|
-
Biodiversity::VERSION
|
136
|
-
end
|
137
37
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
38
|
+
def self.parse_ary(ary, simple = false)
|
39
|
+
format = simple ? 'simple' : 'compact'
|
40
|
+
in_ptr = FFI::MemoryPointer.new(:pointer, ary.length)
|
41
|
+
in_ptr.write_array_of_pointer(
|
42
|
+
ary.map { |s| FFI::MemoryPointer.from_string(s) }
|
43
|
+
)
|
44
|
+
out_var = FFI::MemoryPointer.new(:pointer)
|
45
|
+
parse_ary_go(in_ptr, ary.length, format, out_var)
|
46
|
+
|
47
|
+
out_var.read_pointer
|
48
|
+
.get_array_of_string(0, ary.length)
|
49
|
+
.each_with_object([]) do |prsd, a|
|
50
|
+
a << output(prsd, simple)
|
148
51
|
end
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
UnicodeUtils.downcase(name_ary[0][1..-1])
|
153
|
-
else
|
154
|
-
word1 = name_ary[0]
|
52
|
+
ensure
|
53
|
+
out_var.read_pointer.get_array_of_pointer(0, ary.length).each do |p|
|
54
|
+
CLib.free(p)
|
155
55
|
end
|
156
|
-
|
157
|
-
word2 = name_ary[1].gsub(/\)$/, "") + ")"
|
158
|
-
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
159
|
-
UnicodeUtils.downcase(word2[2..-1])
|
160
|
-
else
|
161
|
-
word2 = UnicodeUtils.downcase(name_ary[1])
|
162
|
-
end
|
163
|
-
res = word1 + " " +
|
164
|
-
word2 + " " +
|
165
|
-
name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(" ")
|
166
|
-
res.strip!
|
56
|
+
CLib.free(out_var.read_pointer)
|
167
57
|
end
|
168
|
-
res
|
169
|
-
end
|
170
|
-
|
171
58
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
prion|prions|NPV)\b/ix) ||
|
187
|
-
a_string.match(/[A-Z]?[a-z]+virus\b/) ||
|
188
|
-
a_string.match(/\b[A-Za-z]*(satellite[s]?|NPV)\b/))
|
189
|
-
end
|
190
|
-
|
191
|
-
def noparse?(a_string)
|
192
|
-
incertae_sedis = a_string.match(/incertae\s+sedis/i) ||
|
193
|
-
a_string.match(/inc\.\s*sed\./i)
|
194
|
-
rna = a_string.match(/[^A-Z]RNA[^A-Z]*/)
|
195
|
-
incertae_sedis || rna
|
196
|
-
end
|
197
|
-
|
198
|
-
def parsed
|
199
|
-
@parsed
|
200
|
-
end
|
201
|
-
|
202
|
-
def parse(a_string)
|
203
|
-
@verbatim = a_string
|
204
|
-
a_string, @tail = PreProcessor::clean(a_string)
|
205
|
-
|
206
|
-
if virus?(a_string)
|
207
|
-
@parsed = { verbatim: @verbatim, virus: true }
|
208
|
-
elsif noparse?(a_string)
|
209
|
-
@parsed = { verbatim: @verbatim }
|
210
|
-
else
|
211
|
-
begin
|
212
|
-
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
|
213
|
-
unless @parsed
|
214
|
-
index = @dirty.index || @clean.index
|
215
|
-
salvage_match = a_string[0..index].split(/\s+/)[0..-2]
|
216
|
-
salvage_string = salvage_match ? salvage_match.join(" ") : a_string
|
217
|
-
@parsed = @dirty.parse(salvage_string) ||
|
218
|
-
@canonical.parse(a_string) ||
|
219
|
-
{ verbatim: @verbatim }
|
220
|
-
end
|
221
|
-
rescue
|
222
|
-
@parsed = FAILED_RESULT.(@verbatim)
|
223
|
-
end
|
224
|
-
end
|
225
|
-
|
226
|
-
def @parsed.verbatim=(a_string)
|
227
|
-
@verbatim = a_string
|
228
|
-
@id = GnUUID.uuid(@verbatim)
|
229
|
-
end
|
230
|
-
|
231
|
-
def @parsed.all(opts = {})
|
232
|
-
canonical_with_rank = !!opts[:canonical_with_rank]
|
233
|
-
parsed = self.class != Hash
|
234
|
-
res = { id: @id, parsed: parsed,
|
235
|
-
parser_version: ScientificNameParser::version}
|
236
|
-
|
237
|
-
if parsed
|
238
|
-
hybrid = self.hybrid rescue false
|
239
|
-
res.merge!({
|
240
|
-
verbatim: @verbatim,
|
241
|
-
normalized: self.value,
|
242
|
-
canonical: self.canonical,
|
243
|
-
hybrid: hybrid,
|
244
|
-
details: self.details,
|
245
|
-
parser_run: self.parser_run,
|
246
|
-
positions: self.pos
|
247
|
-
})
|
59
|
+
def self.output(parsed, simple)
|
60
|
+
if simple
|
61
|
+
parsed = parsed.split('|')
|
62
|
+
{
|
63
|
+
id: parsed[0],
|
64
|
+
verbatim: parsed[1],
|
65
|
+
canonicalName: {
|
66
|
+
full: parsed[2],
|
67
|
+
simple: parsed[3],
|
68
|
+
stem: parsed[4]
|
69
|
+
},
|
70
|
+
authorship: parsed[5],
|
71
|
+
quality: parsed[6]
|
72
|
+
}
|
248
73
|
else
|
249
|
-
|
250
|
-
end
|
251
|
-
res[:surrogate] = true if ScientificNameParser.surrogate?(res)
|
252
|
-
res = {:scientificName => res}
|
253
|
-
if (canonical_with_rank &&
|
254
|
-
canonical.count(" ") > 1 &&
|
255
|
-
res[:scientificName][:details][0][:infraspecies])
|
256
|
-
ScientificNameParser.add_rank_to_canonical(res)
|
74
|
+
JSON.parse(parsed, symbolize_names: true)
|
257
75
|
end
|
258
|
-
res
|
259
|
-
end
|
260
|
-
|
261
|
-
def @parsed.pos_json
|
262
|
-
self.pos.to_json rescue ""
|
263
76
|
end
|
264
|
-
|
265
|
-
def @parsed.all_json
|
266
|
-
self.all.to_json rescue ""
|
267
|
-
end
|
268
|
-
|
269
|
-
@parsed.verbatim = @verbatim
|
270
|
-
res = @parsed.all(canonical_with_rank: @canonical_with_rank)
|
271
|
-
res[:scientificName].merge!(tail: @tail) if @tail && @tail != ""
|
272
|
-
res
|
273
77
|
end
|
274
|
-
|
275
|
-
private
|
276
|
-
|
277
|
-
def self.surrogate?(parsed_data)
|
278
|
-
return false unless parsed_data[:parsed]
|
279
|
-
name = parsed_data[:verbatim]
|
280
|
-
pos = parsed_data[:positions].to_a.flatten
|
281
|
-
surrogate1 = /BOLD:|[\d]{5,}/i
|
282
|
-
surrogate2 = /\b(spp|sp|nr|cf)[\.]?[\s]*$/i
|
283
|
-
is_surrogate = false
|
284
|
-
|
285
|
-
ai_index = pos.index("annotation_identification")
|
286
|
-
if ai_index
|
287
|
-
ai = name[pos[ai_index - 1]..pos[ai_index + 1]]
|
288
|
-
is_surrogate = true if ai.match(/^(spp|cf|sp|nr)/)
|
289
|
-
end
|
290
|
-
is_surrogate = true if !is_surrogate && (name.match(surrogate1) ||
|
291
|
-
name.match(surrogate2))
|
292
|
-
is_surrogate
|
293
|
-
end
|
294
|
-
end
|
78
|
+
end
|