dwc-archive 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/.travis.yml +3 -5
- data/LICENSE +1 -1
- data/README.md +4 -3
- data/dwc-archive.gemspec +1 -1
- data/lib/dwc_archive/classification_normalizer.rb +35 -47
- data/lib/dwc_archive/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a5d4e174cb4ec6df2328fe99abb52b13587c74f22f01efb2a9017db4d9f1d7ba
|
4
|
+
data.tar.gz: e341038db2a23282173cf2e4671750739464dca6076f7560694cc40251bdd036
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b51b82724e21acab76e1763dc2375658080a16dc440c9913fd632aff195a45c34e7bf9446cf5d4afc9fc397c8ee562763ab36532ab5992421e5ed3f2df64273
|
7
|
+
data.tar.gz: 04cbcc92c8b565b2c5f8e8bf082d284b65c55520d91f19c16bc63789519ea374f5ed0466c176ce69907a83566caa8efd2f834399f7b57a26801603455cf2b118
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.7.1
|
data/.travis.yml
CHANGED
@@ -1,15 +1,13 @@
|
|
1
1
|
rvm:
|
2
|
-
- 2.4
|
3
2
|
- 2.5
|
3
|
+
- 2.6
|
4
|
+
- 2.7
|
4
5
|
before_install:
|
5
6
|
- sudo apt-get update
|
6
|
-
|
7
|
-
services:
|
8
|
-
- redis-server
|
7
|
+
- gem install bundler
|
9
8
|
script:
|
10
9
|
- bundle exec cucumber
|
11
10
|
- bundle exec rake
|
12
11
|
branches:
|
13
12
|
only:
|
14
13
|
- master
|
15
|
-
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -179,11 +179,11 @@ Note on Patches/Pull Requests
|
|
179
179
|
Copyright
|
180
180
|
---------
|
181
181
|
|
182
|
-
Author -- [
|
182
|
+
Author -- [@dimus][13]
|
183
183
|
|
184
|
-
Contributors -- [
|
184
|
+
Contributors -- [@mjy][14], [@LocoDelAssembly][16]
|
185
185
|
|
186
|
-
Copyright (c) 2010-2014 [Marine Biological Laboratory][15]. See LICENSE for details.
|
186
|
+
Copyright (c) 2010-2020 [@dimus][15]. See LICENSE for details.
|
187
187
|
|
188
188
|
[1]: https://badge.fury.io/rb/dwc-archive.png
|
189
189
|
[2]: http://badge.fury.io/rb/dwc-archive
|
@@ -200,3 +200,4 @@ Copyright (c) 2010-2014 [Marine Biological Laboratory][15]. See LICENSE for deta
|
|
200
200
|
[13]: https://github.com/dimus
|
201
201
|
[14]: https://github.com/mjy
|
202
202
|
[15]: http://mbl.edu
|
203
|
+
[16]: https://github.com/LocoDelAssembly
|
data/dwc-archive.gemspec
CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.homepage = "http://github.com/GlobalNamesArchitecture/dwc-archive"
|
16
16
|
gem.license = "MIT"
|
17
17
|
|
18
|
-
gem.required_ruby_version = ">= 2.
|
18
|
+
gem.required_ruby_version = ">= 2.5.0"
|
19
19
|
gem.files = `git ls-files`.split("\n").map(&:strip)
|
20
20
|
gem.executables = gem.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
21
21
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
data/lib/dwc_archive/classification_normalizer.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
class DarwinCore
|
3
4
|
# Returns tree representation of Darwin Core file with vernacular and
|
4
5
|
# synonyms attached to the taxon nodes
|
5
6
|
class ClassificationNormalizer
|
6
7
|
attr_reader :error_names, :tree, :normalized_data, :dwc
|
7
|
-
|
8
|
+
alias darwin_core dwc
|
8
9
|
|
9
10
|
def initialize(dwc_instance)
|
10
11
|
@dwc = dwc_instance
|
@@ -12,7 +13,6 @@ class DarwinCore
|
|
12
13
|
@extensions = @dwc.extensions.map { |e| [e, find_fields(e)] }
|
13
14
|
@normalized_data = {}
|
14
15
|
@synonyms = {}
|
15
|
-
@parser = ::Biodiversity::Parser
|
16
16
|
@name_strings = {}
|
17
17
|
@vernacular_name_strings = {}
|
18
18
|
@error_names = []
|
@@ -25,6 +25,7 @@ class DarwinCore
|
|
25
25
|
|
26
26
|
def add_vernacular_name_string(name_string)
|
27
27
|
return if @vernacular_name_strings[name_string]
|
28
|
+
|
28
29
|
@vernacular_name_strings[name_string] = 1
|
29
30
|
end
|
30
31
|
|
@@ -70,9 +71,9 @@ class DarwinCore
|
|
70
71
|
|
71
72
|
def get_canonical_name(a_scientific_name)
|
72
73
|
return nil unless @with_canonical_names
|
73
|
-
|
74
|
-
|
75
|
-
|
74
|
+
|
75
|
+
canonical_name = Biodiversity::Parser.parse(a_scientific_name).
|
76
|
+
dig(:canonicalName, :simple)
|
76
77
|
canonical_name.to_s.empty? ? a_scientific_name : canonical_name
|
77
78
|
end
|
78
79
|
|
@@ -87,15 +88,13 @@ class DarwinCore
|
|
87
88
|
end
|
88
89
|
|
89
90
|
def status_synonym?(status)
|
90
|
-
status
|
91
|
+
status&.match(/^syn/)
|
91
92
|
end
|
92
93
|
|
93
94
|
def add_synonym_from_core(taxon_id, row)
|
94
95
|
cf = @core_fields
|
95
96
|
@synonyms[row[cf[:id]]] = taxon_id
|
96
|
-
unless @normalized_data[row[taxon_id]]
|
97
|
-
@normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
|
98
|
-
end
|
97
|
+
@normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new unless @normalized_data[row[taxon_id]]
|
99
98
|
|
100
99
|
taxon = @normalized_data[row[taxon_id]]
|
101
100
|
synonym = SynonymNormalized.new(
|
@@ -107,7 +106,7 @@ class DarwinCore
|
|
107
106
|
cf[:localid] ? row[cf[:localid]] : nil,
|
108
107
|
cf[:globalid] ? row[cf[:globalid]] : nil
|
109
108
|
)
|
110
|
-
taxon.synonyms <<
|
109
|
+
taxon.synonyms << synonym
|
111
110
|
add_name_string(synonym.name)
|
112
111
|
add_name_string(synonym.canonical_name)
|
113
112
|
end
|
@@ -117,14 +116,10 @@ class DarwinCore
|
|
117
116
|
canonical_name = nil
|
118
117
|
scientific_name = row[fields[:scientificname]].strip
|
119
118
|
if separate_canonical_and_authorship?(row, fields)
|
120
|
-
if @with_canonical_names
|
121
|
-
canonical_name = row[fields[:scientificname]].strip
|
122
|
-
end
|
119
|
+
canonical_name = row[fields[:scientificname]].strip if @with_canonical_names
|
123
120
|
scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
|
124
121
|
else
|
125
|
-
if @with_canonical_names
|
126
|
-
canonical_name = get_canonical_name(row[fields[:scientificname]])
|
127
|
-
end
|
122
|
+
canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
|
128
123
|
end
|
129
124
|
fields[:canonicalname] = row.size
|
130
125
|
row << canonical_name
|
@@ -133,18 +128,17 @@ class DarwinCore
|
|
133
128
|
|
134
129
|
def separate_canonical_and_authorship?(row, fields)
|
135
130
|
authorship = ""
|
136
|
-
if fields[:scientificnameauthorship]
|
137
|
-
authorship = row[fields[:scientificnameauthorship]].to_s.strip
|
138
|
-
end
|
131
|
+
authorship = row[fields[:scientificnameauthorship]].to_s.strip if fields[:scientificnameauthorship]
|
139
132
|
!(authorship.empty? || row[fields[:scientificname]].index(authorship))
|
140
133
|
end
|
141
134
|
|
142
135
|
def ingest_core
|
143
136
|
@normalized_data = {}
|
144
137
|
has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
|
145
|
-
|
146
|
-
|
147
|
-
|
138
|
+
unless has_name_and_id
|
139
|
+
raise(DarwinCore::CoreFileError,
|
140
|
+
"Darwin Core core fields must contain taxon id and scientific name")
|
141
|
+
end
|
148
142
|
@dwc.core.read do |rows|
|
149
143
|
rows[1].each do |error|
|
150
144
|
@error_names << { data: error,
|
@@ -163,32 +157,28 @@ class DarwinCore
|
|
163
157
|
add_synonym_from_core(parent_id, r) if parent_id?
|
164
158
|
else
|
165
159
|
unless @normalized_data[r[@core_fields[:id]]]
|
166
|
-
if gnub_archive?
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
160
|
+
new_taxon = if gnub_archive?
|
161
|
+
DarwinCore::GnubTaxon.new
|
162
|
+
else
|
163
|
+
DarwinCore::TaxonNormalized.new
|
164
|
+
end
|
171
165
|
@normalized_data[r[@core_fields[:id]]] = new_taxon
|
172
166
|
end
|
173
167
|
taxon = @normalized_data[r[@core_fields[:id]]]
|
174
168
|
if gnub_archive?
|
175
169
|
taxon.uuid = r[@core_fields[:originalnameusageid]]
|
176
170
|
taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
|
177
|
-
|
171
|
+
split("|")
|
178
172
|
end
|
179
173
|
taxon.id = r[@core_fields[:id]]
|
180
174
|
taxon.current_name = r[@core_fields[:scientificname]]
|
181
175
|
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
182
176
|
taxon.parent_id = parent_id? ? r[parent_id] : nil
|
183
177
|
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
184
|
-
if @core_fields[:taxonomicstatus]
|
185
|
-
taxon.status = r[@core_fields[:taxonomicstatus]]
|
186
|
-
end
|
178
|
+
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
187
179
|
taxon.source = r[@core_fields[:source]] if @core_fields[:source]
|
188
180
|
taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
|
189
|
-
if @core_fields[:globalid]
|
190
|
-
taxon.global_id = r[@core_fields[:globalid]]
|
191
|
-
end
|
181
|
+
taxon.global_id = r[@core_fields[:globalid]] if @core_fields[:globalid]
|
192
182
|
taxon.linnean_classification_path =
|
193
183
|
get_linnean_classification_path(r, taxon)
|
194
184
|
add_name_string(taxon.current_name)
|
@@ -213,6 +203,7 @@ class DarwinCore
|
|
213
203
|
@paths_num = 0
|
214
204
|
@normalized_data.each do |_taxon_id, taxon|
|
215
205
|
next unless taxon.classification_path_id.empty?
|
206
|
+
|
216
207
|
res = get_classification_path(taxon)
|
217
208
|
next if res == "error"
|
218
209
|
end
|
@@ -220,6 +211,7 @@ class DarwinCore
|
|
220
211
|
|
221
212
|
def get_classification_path(taxon)
|
222
213
|
return unless taxon.classification_path_id.empty?
|
214
|
+
|
223
215
|
@paths_num += 1
|
224
216
|
if @paths_num % 10_000 == 0
|
225
217
|
DarwinCore.logger_write(@dwc.object_id,
|
@@ -227,17 +219,13 @@ class DarwinCore
|
|
227
219
|
end
|
228
220
|
current_node = { taxon.id => {} }
|
229
221
|
if DarwinCore.nil_field?(taxon.parent_id)
|
230
|
-
if @with_canonical_names
|
231
|
-
taxon.classification_path << taxon.current_name_canonical
|
232
|
-
end
|
222
|
+
taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
|
233
223
|
taxon.classification_path_id << taxon.id
|
234
224
|
@tree.merge!(current_node)
|
235
225
|
else
|
236
226
|
parent_cp = parent_cpid = nil
|
237
227
|
if @normalized_data[taxon.parent_id]
|
238
|
-
if @with_canonical_names
|
239
|
-
parent_cp = @normalized_data[taxon.parent_id].classification_path
|
240
|
-
end
|
228
|
+
parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
|
241
229
|
parent_cpid = @normalized_data[taxon.parent_id].
|
242
230
|
classification_path_id
|
243
231
|
else
|
@@ -247,9 +235,7 @@ class DarwinCore
|
|
247
235
|
error: :deprecated_parent,
|
248
236
|
current_parent: current_parent }
|
249
237
|
|
250
|
-
if @with_canonical_names
|
251
|
-
parent_cp = current_parent.classification_path
|
252
|
-
end
|
238
|
+
parent_cp = current_parent.classification_path if @with_canonical_names
|
253
239
|
parent_cpid = current_parent.classification_path_id
|
254
240
|
else
|
255
241
|
@error_names << { data: taxon,
|
@@ -258,6 +244,7 @@ class DarwinCore
|
|
258
244
|
end
|
259
245
|
end
|
260
246
|
return "error" unless parent_cpid
|
247
|
+
|
261
248
|
if parent_cpid.empty?
|
262
249
|
res = "error"
|
263
250
|
begin
|
@@ -268,6 +255,7 @@ class DarwinCore
|
|
268
255
|
current_parent: nil }
|
269
256
|
end
|
270
257
|
return res if res == "error"
|
258
|
+
|
271
259
|
if @with_canonical_names
|
272
260
|
taxon.classification_path += @normalized_data[taxon.parent_id].
|
273
261
|
classification_path +
|
@@ -295,7 +283,7 @@ class DarwinCore
|
|
295
283
|
rescue NoMethodError => e
|
296
284
|
DarwinCore.logger_write(@dwc.object_id,
|
297
285
|
"Error '#{e.message}' taxon #{taxon.id}")
|
298
|
-
|
286
|
+
"error"
|
299
287
|
end
|
300
288
|
end
|
301
289
|
end
|
@@ -381,8 +369,8 @@ class DarwinCore
|
|
381
369
|
|
382
370
|
# Collect linnean classification path only on species level
|
383
371
|
def get_linnean_classification_path(row, _taxon)
|
384
|
-
[
|
385
|
-
|
372
|
+
%i[kingdom phylum class order family genus
|
373
|
+
subgenus].each_with_object([]) do |clade, res|
|
386
374
|
res << [row[@core_fields[clade]], clade] if @core_fields[clade]
|
387
375
|
end
|
388
376
|
end
|
data/lib/dwc_archive/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-06-
|
11
|
+
date: 2020-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -253,14 +253,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
253
253
|
requirements:
|
254
254
|
- - ">="
|
255
255
|
- !ruby/object:Gem::Version
|
256
|
-
version: 2.
|
256
|
+
version: 2.5.0
|
257
257
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
258
258
|
requirements:
|
259
259
|
- - ">="
|
260
260
|
- !ruby/object:Gem::Version
|
261
261
|
version: '0'
|
262
262
|
requirements: []
|
263
|
-
rubygems_version: 3.
|
263
|
+
rubygems_version: 3.1.2
|
264
264
|
signing_key:
|
265
265
|
specification_version: 4
|
266
266
|
summary: Handler of Darwin Core Archive files
|