biodiversity 3.5.1 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +9 -6
- data/.ruby-version +1 -1
- data/.travis.yml +1 -6
- data/CHANGELOG +3 -0
- data/Gemfile +2 -0
- data/README.md +37 -178
- data/Rakefile +15 -48
- data/biodiversity.gemspec +18 -21
- data/clib/linux/libgnparser.h +93 -0
- data/clib/linux/libgnparser.so +0 -0
- data/clib/mac/libgnparser.h +93 -0
- data/clib/mac/libgnparser.so +0 -0
- data/lib/biodiversity.rb +4 -9
- data/lib/biodiversity/parser.rb +65 -281
- data/lib/biodiversity/version.rb +8 -1
- data/spec/lib/biodiversity_spec.rb +9 -0
- data/spec/lib/parser_spec.rb +38 -0
- data/spec/spec_helper.rb +4 -81
- metadata +27 -102
- data/.byebug_history +0 -18
- data/.document +0 -5
- data/examples/socket_client.rb +0 -25
- data/lib/biodiversity/guid.rb +0 -1
- data/lib/biodiversity/guid/lsid.rb +0 -16
- data/lib/biodiversity/parser/scientific_name_canonical.rb +0 -528
- data/lib/biodiversity/parser/scientific_name_canonical.treetop +0 -120
- data/lib/biodiversity/parser/scientific_name_clean.rb +0 -8991
- data/lib/biodiversity/parser/scientific_name_clean.treetop +0 -1632
- data/lib/biodiversity/parser/scientific_name_dirty.rb +0 -1298
- data/lib/biodiversity/parser/scientific_name_dirty.treetop +0 -264
- data/spec/biodiversity_spec.rb +0 -11
- data/spec/files/test_data.txt +0 -490
- data/spec/files/todo.txt +0 -55
- data/spec/guid/lsid.spec.rb +0 -15
- data/spec/parser/scientific_name_canonical_spec.rb +0 -36
- data/spec/parser/scientific_name_clean_spec.rb +0 -1137
- data/spec/parser/scientific_name_dirty_spec.rb +0 -165
- data/spec/parser/scientific_name_spec.rb +0 -193
@@ -0,0 +1,93 @@
|
|
1
|
+
/* Code generated by cmd/cgo; DO NOT EDIT. */
|
2
|
+
|
3
|
+
/* package gitlab.com/gogna/gnparser/binding */
|
4
|
+
|
5
|
+
|
6
|
+
#line 1 "cgo-builtin-export-prolog"
|
7
|
+
|
8
|
+
#include <stddef.h> /* for ptrdiff_t below */
|
9
|
+
|
10
|
+
#ifndef GO_CGO_EXPORT_PROLOGUE_H
|
11
|
+
#define GO_CGO_EXPORT_PROLOGUE_H
|
12
|
+
|
13
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
14
|
+
typedef struct { const char *p; ptrdiff_t n; } _GoString_;
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#endif
|
18
|
+
|
19
|
+
/* Start of preamble from import "C" comments. */
|
20
|
+
|
21
|
+
|
22
|
+
#line 3 "main.go"
|
23
|
+
|
24
|
+
#include "stdlib.h"
|
25
|
+
|
26
|
+
#line 1 "cgo-generated-wrapper"
|
27
|
+
|
28
|
+
|
29
|
+
/* End of preamble from import "C" comments. */
|
30
|
+
|
31
|
+
|
32
|
+
/* Start of boilerplate cgo prologue. */
|
33
|
+
#line 1 "cgo-gcc-export-header-prolog"
|
34
|
+
|
35
|
+
#ifndef GO_CGO_PROLOGUE_H
|
36
|
+
#define GO_CGO_PROLOGUE_H
|
37
|
+
|
38
|
+
typedef signed char GoInt8;
|
39
|
+
typedef unsigned char GoUint8;
|
40
|
+
typedef short GoInt16;
|
41
|
+
typedef unsigned short GoUint16;
|
42
|
+
typedef int GoInt32;
|
43
|
+
typedef unsigned int GoUint32;
|
44
|
+
typedef long long GoInt64;
|
45
|
+
typedef unsigned long long GoUint64;
|
46
|
+
typedef GoInt64 GoInt;
|
47
|
+
typedef GoUint64 GoUint;
|
48
|
+
typedef __SIZE_TYPE__ GoUintptr;
|
49
|
+
typedef float GoFloat32;
|
50
|
+
typedef double GoFloat64;
|
51
|
+
typedef float _Complex GoComplex64;
|
52
|
+
typedef double _Complex GoComplex128;
|
53
|
+
|
54
|
+
/*
|
55
|
+
static assertion to make sure the file is being used on architecture
|
56
|
+
at least with matching size of GoInt.
|
57
|
+
*/
|
58
|
+
typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
|
59
|
+
|
60
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
61
|
+
typedef _GoString_ GoString;
|
62
|
+
#endif
|
63
|
+
typedef void *GoMap;
|
64
|
+
typedef void *GoChan;
|
65
|
+
typedef struct { void *t; void *v; } GoInterface;
|
66
|
+
typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
|
67
|
+
|
68
|
+
#endif
|
69
|
+
|
70
|
+
/* End of boilerplate cgo prologue. */
|
71
|
+
|
72
|
+
#ifdef __cplusplus
|
73
|
+
extern "C" {
|
74
|
+
#endif
|
75
|
+
|
76
|
+
|
77
|
+
// ParseToString function takes a name-string, desired format, and parses
|
78
|
+
// the name-string to either JSON, or pipe-separated values, depending on
|
79
|
+
// the desired format. Format can take values of 'simple', 'compact', 'pretty'.
|
80
|
+
|
81
|
+
extern char* ParseToString(char* p0, char* p1);
|
82
|
+
|
83
|
+
// ParseAryToStrings function takes an array of names, parsing format and a
|
84
|
+
// reference to an output: an empty array of strings to return the data
|
85
|
+
// back. It populates the output array with raw strings of either JSON or
|
86
|
+
// pipe-separated parsed values (depending on a given format). Format can take
|
87
|
+
// values of 'simple', 'compact', or 'pretty'.
|
88
|
+
|
89
|
+
extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
|
90
|
+
|
91
|
+
#ifdef __cplusplus
|
92
|
+
}
|
93
|
+
#endif
|
Binary file
|
@@ -0,0 +1,93 @@
|
|
1
|
+
/* Code generated by cmd/cgo; DO NOT EDIT. */
|
2
|
+
|
3
|
+
/* package gitlab.com/gogna/gnparser/binding */
|
4
|
+
|
5
|
+
|
6
|
+
#line 1 "cgo-builtin-export-prolog"
|
7
|
+
|
8
|
+
#include <stddef.h> /* for ptrdiff_t below */
|
9
|
+
|
10
|
+
#ifndef GO_CGO_EXPORT_PROLOGUE_H
|
11
|
+
#define GO_CGO_EXPORT_PROLOGUE_H
|
12
|
+
|
13
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
14
|
+
typedef struct { const char *p; ptrdiff_t n; } _GoString_;
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#endif
|
18
|
+
|
19
|
+
/* Start of preamble from import "C" comments. */
|
20
|
+
|
21
|
+
|
22
|
+
#line 3 "main.go"
|
23
|
+
|
24
|
+
#include "stdlib.h"
|
25
|
+
|
26
|
+
#line 1 "cgo-generated-wrapper"
|
27
|
+
|
28
|
+
|
29
|
+
/* End of preamble from import "C" comments. */
|
30
|
+
|
31
|
+
|
32
|
+
/* Start of boilerplate cgo prologue. */
|
33
|
+
#line 1 "cgo-gcc-export-header-prolog"
|
34
|
+
|
35
|
+
#ifndef GO_CGO_PROLOGUE_H
|
36
|
+
#define GO_CGO_PROLOGUE_H
|
37
|
+
|
38
|
+
typedef signed char GoInt8;
|
39
|
+
typedef unsigned char GoUint8;
|
40
|
+
typedef short GoInt16;
|
41
|
+
typedef unsigned short GoUint16;
|
42
|
+
typedef int GoInt32;
|
43
|
+
typedef unsigned int GoUint32;
|
44
|
+
typedef long long GoInt64;
|
45
|
+
typedef unsigned long long GoUint64;
|
46
|
+
typedef GoInt64 GoInt;
|
47
|
+
typedef GoUint64 GoUint;
|
48
|
+
typedef __SIZE_TYPE__ GoUintptr;
|
49
|
+
typedef float GoFloat32;
|
50
|
+
typedef double GoFloat64;
|
51
|
+
typedef float _Complex GoComplex64;
|
52
|
+
typedef double _Complex GoComplex128;
|
53
|
+
|
54
|
+
/*
|
55
|
+
static assertion to make sure the file is being used on architecture
|
56
|
+
at least with matching size of GoInt.
|
57
|
+
*/
|
58
|
+
typedef char _check_for_64_bit_pointer_matching_GoInt[sizeof(void*)==64/8 ? 1:-1];
|
59
|
+
|
60
|
+
#ifndef GO_CGO_GOSTRING_TYPEDEF
|
61
|
+
typedef _GoString_ GoString;
|
62
|
+
#endif
|
63
|
+
typedef void *GoMap;
|
64
|
+
typedef void *GoChan;
|
65
|
+
typedef struct { void *t; void *v; } GoInterface;
|
66
|
+
typedef struct { void *data; GoInt len; GoInt cap; } GoSlice;
|
67
|
+
|
68
|
+
#endif
|
69
|
+
|
70
|
+
/* End of boilerplate cgo prologue. */
|
71
|
+
|
72
|
+
#ifdef __cplusplus
|
73
|
+
extern "C" {
|
74
|
+
#endif
|
75
|
+
|
76
|
+
|
77
|
+
// ParseToString function takes a name-string, desired format, and parses
|
78
|
+
// the name-string to either JSON, or pipe-separated values, depending on
|
79
|
+
// the desired format. Format can take values of 'simple', 'compact', 'pretty'.
|
80
|
+
|
81
|
+
extern char* ParseToString(char* p0, char* p1);
|
82
|
+
|
83
|
+
// ParseAryToStrings function takes an array of names, parsing format and a
|
84
|
+
// reference to an output: an empty array of strings to return the data
|
85
|
+
// back. It populates the output array with raw strings of either JSON or
|
86
|
+
// pipe-separated parsed values (depending on a given format). Format can take
|
87
|
+
// values of 'simple', 'compact', or 'pretty'.
|
88
|
+
|
89
|
+
extern void ParseAryToStrings(char** p0, int p1, char* p2, char*** p3);
|
90
|
+
|
91
|
+
#ifdef __cplusplus
|
92
|
+
}
|
93
|
+
#endif
|
Binary file
|
data/lib/biodiversity.rb
CHANGED
@@ -1,15 +1,10 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'ffi'
|
2
4
|
require 'json'
|
3
|
-
require 'open-uri'
|
4
5
|
require_relative 'biodiversity/version'
|
5
6
|
require_relative 'biodiversity/parser'
|
6
|
-
require_relative 'biodiversity/guid'
|
7
7
|
|
8
|
+
# Biodiversity module provides a namespace for scientific name parser.
|
8
9
|
module Biodiversity
|
9
|
-
LSID_RESOLVER_URL = 'http://lsid.tdwg.org/'
|
10
|
-
|
11
|
-
def self.version
|
12
|
-
VERSION
|
13
|
-
end
|
14
10
|
end
|
15
|
-
|
data/lib/biodiversity/parser.rb
CHANGED
@@ -1,294 +1,78 @@
|
|
1
|
-
#
|
2
|
-
require "gn_uuid"
|
3
|
-
require_relative "parser/scientific_name_clean"
|
4
|
-
require_relative "parser/scientific_name_dirty"
|
5
|
-
require_relative "parser/scientific_name_canonical"
|
1
|
+
# frozen_string_literal: true
|
6
2
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
\(?s\.\s?l\.|
|
13
|
-
\(?s\.\s?str\.|
|
14
|
-
\(?s\.\s?lat\.|
|
15
|
-
sec\.|sec|near)\b.*$/x
|
16
|
-
TAXON_CONCEPTS3 = /(,\s*|\s+)(pro parte|p\.\s?p\.)\s*$/i
|
17
|
-
NOMEN_CONCEPTS = /(,\s*|\s+)(\(?nomen\b|\(?nom\.|\(?comb\.).*$/i
|
18
|
-
LAST_WORD_JUNK = /(,\s*|\s+)
|
19
|
-
(spp\.|spp|var\.|
|
20
|
-
var|von|van|ined\.|
|
21
|
-
ined|sensu|new|non|nec|
|
22
|
-
nudum|cf\.|cf|sp\.|sp|
|
23
|
-
ssp\.|ssp|subsp|subgen|hybrid|hort\.|hort)\??\s*$/ix
|
24
|
-
|
25
|
-
def self.clean(a_string)
|
26
|
-
orig = a_string
|
27
|
-
[NOTES, TAXON_CONCEPTS1, TAXON_CONCEPTS2,
|
28
|
-
TAXON_CONCEPTS3, NOMEN_CONCEPTS, LAST_WORD_JUNK].each do |i|
|
29
|
-
a_string = a_string.gsub(i, "")
|
30
|
-
end
|
31
|
-
tail = orig[a_string.size..-1]
|
32
|
-
a_string = a_string.tr("ſ","s") #old "s"
|
33
|
-
a_string = a_string.tr("_", " ") if a_string.strip.match(/\s/).nil?
|
34
|
-
[a_string, tail.strip]
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
# Public: Parser which runs in parallel.
|
39
|
-
#
|
40
|
-
# Examples
|
41
|
-
#
|
42
|
-
# parser = ParallelParser.new(4)
|
43
|
-
# parser.parse(["Betula L.", "Pardosa moesta"])
|
44
|
-
class ParallelParser
|
45
|
-
|
46
|
-
# Public: Initialize ParallelParser.
|
47
|
-
#
|
48
|
-
# processes_num - an Integer to setup the number of processes (default: nil).
|
49
|
-
# If processes number is not set it will be determined
|
50
|
-
# automatically.
|
51
|
-
def initialize(processes_num = nil)
|
52
|
-
require "parallel"
|
53
|
-
cpu_num
|
54
|
-
if processes_num.to_i > 0
|
55
|
-
@processes_num = [processes_num, cpu_num - 1].min
|
56
|
-
else
|
57
|
-
@processes_num = cpu_num > 3 ? cpu_num - 2 : 1
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
# Public: Parses an array of scientific names using several processes
|
62
|
-
# in parallel.
|
63
|
-
#
|
64
|
-
# Scientific names are deduplicated in the process, so every string is
|
65
|
-
# parsed only once.
|
66
|
-
#
|
67
|
-
# names_list - takes an Array of scientific names,
|
68
|
-
# each element should be a String.
|
69
|
-
#
|
70
|
-
# Examples
|
71
|
-
#
|
72
|
-
# parser = ParallelParser.new(4)
|
73
|
-
# parser.parse(["Homo sapiens L.", "Quercus quercus"])
|
74
|
-
#
|
75
|
-
# Returns a Hash with scientific names as a key, and parsing results as
|
76
|
-
# a value.
|
77
|
-
def parse(names_list)
|
78
|
-
parsed = Parallel.map(names_list.uniq, in_processes: @processes_num) do |n|
|
79
|
-
[n, parse_process(n)]
|
80
|
-
end
|
81
|
-
parsed.inject({}) { |res, x| res[x[0]] = x[1]; res }
|
82
|
-
end
|
83
|
-
|
84
|
-
# Public: Returns the number of cores/CPUs.
|
85
|
-
#
|
86
|
-
# Returns Integer of cores/CPUs.
|
87
|
-
def cpu_num
|
88
|
-
@cpu_num ||= Parallel.processor_count
|
89
|
-
end
|
90
|
-
|
91
|
-
private
|
92
|
-
def parse_process(name)
|
93
|
-
p = ScientificNameParser.new
|
94
|
-
p.parse(name) rescue ScientificNameParser::FAILED_RESULT.(name)
|
95
|
-
end
|
3
|
+
# CLib is required to free memory after it is used by C
|
4
|
+
module CLib
|
5
|
+
extend FFI::Library
|
6
|
+
ffi_lib FFI::Library::LIBC
|
7
|
+
attach_function :free, [:pointer], :void
|
96
8
|
end
|
97
9
|
|
98
|
-
|
99
|
-
#
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
name_ary = parts[0..1]
|
125
|
-
name[:details][0][:infraspecies].each do |data|
|
126
|
-
infrasp = data[:string]
|
127
|
-
rank = data[:rank]
|
128
|
-
name_ary << (rank && rank != "n/a" ? "#{rank} #{infrasp}" : infrasp)
|
10
|
+
module Biodiversity
|
11
|
+
# Parser provides a namespace for functions to parse scientific names.
|
12
|
+
module Parser
|
13
|
+
extend FFI::Library
|
14
|
+
|
15
|
+
platform = case Gem.platforms[1].os
|
16
|
+
when 'linux'
|
17
|
+
'linux'
|
18
|
+
when 'darwin'
|
19
|
+
'mac'
|
20
|
+
when 'mswin64'
|
21
|
+
'win'
|
22
|
+
else
|
23
|
+
raise "Unsupported platform: #{Gem.platforms[1].os}"
|
24
|
+
end
|
25
|
+
ffi_lib File.join(__dir__, '..', '..', 'clib', platform, 'libgnparser.so')
|
26
|
+
POINTER_SIZE = FFI.type_size(:pointer)
|
27
|
+
|
28
|
+
attach_function(:parse_go, :ParseToString, %i[string string], :string)
|
29
|
+
attach_function(:parse_ary_go, :ParseAryToStrings,
|
30
|
+
%i[pointer int string pointer], :void)
|
31
|
+
|
32
|
+
def self.parse(name, simple = false)
|
33
|
+
format = simple ? 'simple' : 'compact'
|
34
|
+
parsed = parse_go(name, format)
|
35
|
+
output(parsed, simple)
|
129
36
|
end
|
130
|
-
parsed[:scientificName][:canonical] = name_ary.join(" ")
|
131
|
-
parsed
|
132
|
-
end
|
133
|
-
|
134
|
-
def self.version
|
135
|
-
Biodiversity::VERSION
|
136
|
-
end
|
137
37
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
38
|
+
def self.parse_ary(ary, simple = false)
|
39
|
+
format = simple ? 'simple' : 'compact'
|
40
|
+
in_ptr = FFI::MemoryPointer.new(:pointer, ary.length)
|
41
|
+
in_ptr.write_array_of_pointer(
|
42
|
+
ary.map { |s| FFI::MemoryPointer.from_string(s) }
|
43
|
+
)
|
44
|
+
out_var = FFI::MemoryPointer.new(:pointer)
|
45
|
+
parse_ary_go(in_ptr, ary.length, format, out_var)
|
46
|
+
|
47
|
+
out_var.read_pointer
|
48
|
+
.get_array_of_string(0, ary.length)
|
49
|
+
.each_with_object([]) do |prsd, a|
|
50
|
+
a << output(prsd, simple)
|
148
51
|
end
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
UnicodeUtils.downcase(name_ary[0][1..-1])
|
153
|
-
else
|
154
|
-
word1 = name_ary[0]
|
52
|
+
ensure
|
53
|
+
out_var.read_pointer.get_array_of_pointer(0, ary.length).each do |p|
|
54
|
+
CLib.free(p)
|
155
55
|
end
|
156
|
-
|
157
|
-
word2 = name_ary[1].gsub(/\)$/, "") + ")"
|
158
|
-
word2 = word2[0] + UnicodeUtils.upcase(word2[1]) +
|
159
|
-
UnicodeUtils.downcase(word2[2..-1])
|
160
|
-
else
|
161
|
-
word2 = UnicodeUtils.downcase(name_ary[1])
|
162
|
-
end
|
163
|
-
res = word1 + " " +
|
164
|
-
word2 + " " +
|
165
|
-
name_ary[2..-1].map { |w| UnicodeUtils.downcase(w) }.join(" ")
|
166
|
-
res.strip!
|
56
|
+
CLib.free(out_var.read_pointer)
|
167
57
|
end
|
168
|
-
res
|
169
|
-
end
|
170
|
-
|
171
58
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
prion|prions|NPV)\b/ix) ||
|
187
|
-
a_string.match(/[A-Z]?[a-z]+virus\b/) ||
|
188
|
-
a_string.match(/\b[A-Za-z]*(satellite[s]?|NPV)\b/))
|
189
|
-
end
|
190
|
-
|
191
|
-
def noparse?(a_string)
|
192
|
-
incertae_sedis = a_string.match(/incertae\s+sedis/i) ||
|
193
|
-
a_string.match(/inc\.\s*sed\./i)
|
194
|
-
rna = a_string.match(/[^A-Z]RNA[^A-Z]*/)
|
195
|
-
incertae_sedis || rna
|
196
|
-
end
|
197
|
-
|
198
|
-
def parsed
|
199
|
-
@parsed
|
200
|
-
end
|
201
|
-
|
202
|
-
def parse(a_string)
|
203
|
-
@verbatim = a_string
|
204
|
-
a_string, @tail = PreProcessor::clean(a_string)
|
205
|
-
|
206
|
-
if virus?(a_string)
|
207
|
-
@parsed = { verbatim: @verbatim, virus: true }
|
208
|
-
elsif noparse?(a_string)
|
209
|
-
@parsed = { verbatim: @verbatim }
|
210
|
-
else
|
211
|
-
begin
|
212
|
-
@parsed = @clean.parse(a_string) || @dirty.parse(a_string)
|
213
|
-
unless @parsed
|
214
|
-
index = @dirty.index || @clean.index
|
215
|
-
salvage_match = a_string[0..index].split(/\s+/)[0..-2]
|
216
|
-
salvage_string = salvage_match ? salvage_match.join(" ") : a_string
|
217
|
-
@parsed = @dirty.parse(salvage_string) ||
|
218
|
-
@canonical.parse(a_string) ||
|
219
|
-
{ verbatim: @verbatim }
|
220
|
-
end
|
221
|
-
rescue
|
222
|
-
@parsed = FAILED_RESULT.(@verbatim)
|
223
|
-
end
|
224
|
-
end
|
225
|
-
|
226
|
-
def @parsed.verbatim=(a_string)
|
227
|
-
@verbatim = a_string
|
228
|
-
@id = GnUUID.uuid(@verbatim)
|
229
|
-
end
|
230
|
-
|
231
|
-
def @parsed.all(opts = {})
|
232
|
-
canonical_with_rank = !!opts[:canonical_with_rank]
|
233
|
-
parsed = self.class != Hash
|
234
|
-
res = { id: @id, parsed: parsed,
|
235
|
-
parser_version: ScientificNameParser::version}
|
236
|
-
|
237
|
-
if parsed
|
238
|
-
hybrid = self.hybrid rescue false
|
239
|
-
res.merge!({
|
240
|
-
verbatim: @verbatim,
|
241
|
-
normalized: self.value,
|
242
|
-
canonical: self.canonical,
|
243
|
-
hybrid: hybrid,
|
244
|
-
details: self.details,
|
245
|
-
parser_run: self.parser_run,
|
246
|
-
positions: self.pos
|
247
|
-
})
|
59
|
+
def self.output(parsed, simple)
|
60
|
+
if simple
|
61
|
+
parsed = parsed.split('|')
|
62
|
+
{
|
63
|
+
id: parsed[0],
|
64
|
+
verbatim: parsed[1],
|
65
|
+
canonicalName: {
|
66
|
+
full: parsed[2],
|
67
|
+
simple: parsed[3],
|
68
|
+
stem: parsed[4]
|
69
|
+
},
|
70
|
+
authorship: parsed[5],
|
71
|
+
quality: parsed[6]
|
72
|
+
}
|
248
73
|
else
|
249
|
-
|
250
|
-
end
|
251
|
-
res[:surrogate] = true if ScientificNameParser.surrogate?(res)
|
252
|
-
res = {:scientificName => res}
|
253
|
-
if (canonical_with_rank &&
|
254
|
-
canonical.count(" ") > 1 &&
|
255
|
-
res[:scientificName][:details][0][:infraspecies])
|
256
|
-
ScientificNameParser.add_rank_to_canonical(res)
|
74
|
+
JSON.parse(parsed, symbolize_names: true)
|
257
75
|
end
|
258
|
-
res
|
259
|
-
end
|
260
|
-
|
261
|
-
def @parsed.pos_json
|
262
|
-
self.pos.to_json rescue ""
|
263
76
|
end
|
264
|
-
|
265
|
-
def @parsed.all_json
|
266
|
-
self.all.to_json rescue ""
|
267
|
-
end
|
268
|
-
|
269
|
-
@parsed.verbatim = @verbatim
|
270
|
-
res = @parsed.all(canonical_with_rank: @canonical_with_rank)
|
271
|
-
res[:scientificName].merge!(tail: @tail) if @tail && @tail != ""
|
272
|
-
res
|
273
77
|
end
|
274
|
-
|
275
|
-
private
|
276
|
-
|
277
|
-
def self.surrogate?(parsed_data)
|
278
|
-
return false unless parsed_data[:parsed]
|
279
|
-
name = parsed_data[:verbatim]
|
280
|
-
pos = parsed_data[:positions].to_a.flatten
|
281
|
-
surrogate1 = /BOLD:|[\d]{5,}/i
|
282
|
-
surrogate2 = /\b(spp|sp|nr|cf)[\.]?[\s]*$/i
|
283
|
-
is_surrogate = false
|
284
|
-
|
285
|
-
ai_index = pos.index("annotation_identification")
|
286
|
-
if ai_index
|
287
|
-
ai = name[pos[ai_index - 1]..pos[ai_index + 1]]
|
288
|
-
is_surrogate = true if ai.match(/^(spp|cf|sp|nr)/)
|
289
|
-
end
|
290
|
-
is_surrogate = true if !is_surrogate && (name.match(surrogate1) ||
|
291
|
-
name.match(surrogate2))
|
292
|
-
is_surrogate
|
293
|
-
end
|
294
|
-
end
|
78
|
+
end
|