dwc-archive 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/.travis.yml +3 -5
- data/LICENSE +1 -1
- data/README.md +4 -3
- data/dwc-archive.gemspec +1 -1
- data/lib/dwc_archive/classification_normalizer.rb +35 -47
- data/lib/dwc_archive/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a5d4e174cb4ec6df2328fe99abb52b13587c74f22f01efb2a9017db4d9f1d7ba
|
4
|
+
data.tar.gz: e341038db2a23282173cf2e4671750739464dca6076f7560694cc40251bdd036
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b51b82724e21acab76e1763dc2375658080a16dc440c9913fd632aff195a45c34e7bf9446cf5d4afc9fc397c8ee562763ab36532ab5992421e5ed3f2df64273
|
7
|
+
data.tar.gz: 04cbcc92c8b565b2c5f8e8bf082d284b65c55520d91f19c16bc63789519ea374f5ed0466c176ce69907a83566caa8efd2f834399f7b57a26801603455cf2b118
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.
|
1
|
+
2.7.1
|
data/.travis.yml
CHANGED
@@ -1,15 +1,13 @@
|
|
1
1
|
rvm:
|
2
|
-
- 2.4
|
3
2
|
- 2.5
|
3
|
+
- 2.6
|
4
|
+
- 2.7
|
4
5
|
before_install:
|
5
6
|
- sudo apt-get update
|
6
|
-
|
7
|
-
services:
|
8
|
-
- redis-server
|
7
|
+
- gem install bundler
|
9
8
|
script:
|
10
9
|
- bundle exec cucumber
|
11
10
|
- bundle exec rake
|
12
11
|
branches:
|
13
12
|
only:
|
14
13
|
- master
|
15
|
-
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -179,11 +179,11 @@ Note on Patches/Pull Requests
|
|
179
179
|
Copyright
|
180
180
|
---------
|
181
181
|
|
182
|
-
Author -- [
|
182
|
+
Author -- [@dimus][13]
|
183
183
|
|
184
|
-
Contributors -- [
|
184
|
+
Contributors -- [@mjy][14], [@LocoDelAssembly][16]
|
185
185
|
|
186
|
-
Copyright (c) 2010-2014 [Marine Biological Laboratory][15]. See LICENSE for details.
|
186
|
+
Copyright (c) 2010-2020 [@dimus][15]. See LICENSE for details.
|
187
187
|
|
188
188
|
[1]: https://badge.fury.io/rb/dwc-archive.png
|
189
189
|
[2]: http://badge.fury.io/rb/dwc-archive
|
@@ -200,3 +200,4 @@ Copyright (c) 2010-2014 [Marine Biological Laboratory][15]. See LICENSE for deta
|
|
200
200
|
[13]: https://github.com/dimus
|
201
201
|
[14]: https://github.com/mjy
|
202
202
|
[15]: http://mbl.edu
|
203
|
+
[16]: https://github.com/LocoDelAssembly
|
data/dwc-archive.gemspec
CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.homepage = "http://github.com/GlobalNamesArchitecture/dwc-archive"
|
16
16
|
gem.license = "MIT"
|
17
17
|
|
18
|
-
gem.required_ruby_version = ">= 2.
|
18
|
+
gem.required_ruby_version = ">= 2.5.0"
|
19
19
|
gem.files = `git ls-files`.split("\n").map(&:strip)
|
20
20
|
gem.executables = gem.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
21
21
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
data/lib/dwc_archive/classification_normalizer.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
class DarwinCore
|
3
4
|
# Returns tree representation of Darwin Core file with vernacular and
|
4
5
|
# and synonyms attached to the taxon nodes
|
5
6
|
class ClassificationNormalizer
|
6
7
|
attr_reader :error_names, :tree, :normalized_data, :dwc
|
7
|
-
|
8
|
+
alias darwin_core dwc
|
8
9
|
|
9
10
|
def initialize(dwc_instance)
|
10
11
|
@dwc = dwc_instance
|
@@ -12,7 +13,6 @@ class DarwinCore
|
|
12
13
|
@extensions = @dwc.extensions.map { |e| [e, find_fields(e)] }
|
13
14
|
@normalized_data = {}
|
14
15
|
@synonyms = {}
|
15
|
-
@parser = ::Biodiversity::Parser
|
16
16
|
@name_strings = {}
|
17
17
|
@vernacular_name_strings = {}
|
18
18
|
@error_names = []
|
@@ -25,6 +25,7 @@ class DarwinCore
|
|
25
25
|
|
26
26
|
def add_vernacular_name_string(name_string)
|
27
27
|
return if @vernacular_name_strings[name_string]
|
28
|
+
|
28
29
|
@vernacular_name_strings[name_string] = 1
|
29
30
|
end
|
30
31
|
|
@@ -70,9 +71,9 @@ class DarwinCore
|
|
70
71
|
|
71
72
|
def get_canonical_name(a_scientific_name)
|
72
73
|
return nil unless @with_canonical_names
|
73
|
-
|
74
|
-
|
75
|
-
|
74
|
+
|
75
|
+
canonical_name = Biodiversity::Parser.parse(a_scientific_name).
|
76
|
+
dig(:canonicalName, :simple)
|
76
77
|
canonical_name.to_s.empty? ? a_scientific_name : canonical_name
|
77
78
|
end
|
78
79
|
|
@@ -87,15 +88,13 @@ class DarwinCore
|
|
87
88
|
end
|
88
89
|
|
89
90
|
def status_synonym?(status)
|
90
|
-
status
|
91
|
+
status&.match(/^syn/)
|
91
92
|
end
|
92
93
|
|
93
94
|
def add_synonym_from_core(taxon_id, row)
|
94
95
|
cf = @core_fields
|
95
96
|
@synonyms[row[cf[:id]]] = taxon_id
|
96
|
-
unless @normalized_data[row[taxon_id]]
|
97
|
-
@normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
|
98
|
-
end
|
97
|
+
@normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new unless @normalized_data[row[taxon_id]]
|
99
98
|
|
100
99
|
taxon = @normalized_data[row[taxon_id]]
|
101
100
|
synonym = SynonymNormalized.new(
|
@@ -107,7 +106,7 @@ class DarwinCore
|
|
107
106
|
cf[:localid] ? row[cf[:localid]] : nil,
|
108
107
|
cf[:globalid] ? row[cf[:globalid]] : nil
|
109
108
|
)
|
110
|
-
taxon.synonyms <<
|
109
|
+
taxon.synonyms << synonym
|
111
110
|
add_name_string(synonym.name)
|
112
111
|
add_name_string(synonym.canonical_name)
|
113
112
|
end
|
@@ -117,14 +116,10 @@ class DarwinCore
|
|
117
116
|
canonical_name = nil
|
118
117
|
scientific_name = row[fields[:scientificname]].strip
|
119
118
|
if separate_canonical_and_authorship?(row, fields)
|
120
|
-
if @with_canonical_names
|
121
|
-
canonical_name = row[fields[:scientificname]].strip
|
122
|
-
end
|
119
|
+
canonical_name = row[fields[:scientificname]].strip if @with_canonical_names
|
123
120
|
scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
|
124
121
|
else
|
125
|
-
if @with_canonical_names
|
126
|
-
canonical_name = get_canonical_name(row[fields[:scientificname]])
|
127
|
-
end
|
122
|
+
canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
|
128
123
|
end
|
129
124
|
fields[:canonicalname] = row.size
|
130
125
|
row << canonical_name
|
@@ -133,18 +128,17 @@ class DarwinCore
|
|
133
128
|
|
134
129
|
def separate_canonical_and_authorship?(row, fields)
|
135
130
|
authorship = ""
|
136
|
-
if fields[:scientificnameauthorship]
|
137
|
-
authorship = row[fields[:scientificnameauthorship]].to_s.strip
|
138
|
-
end
|
131
|
+
authorship = row[fields[:scientificnameauthorship]].to_s.strip if fields[:scientificnameauthorship]
|
139
132
|
!(authorship.empty? || row[fields[:scientificname]].index(authorship))
|
140
133
|
end
|
141
134
|
|
142
135
|
def ingest_core
|
143
136
|
@normalized_data = {}
|
144
137
|
has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
|
145
|
-
|
146
|
-
|
147
|
-
|
138
|
+
unless has_name_and_id
|
139
|
+
raise(DarwinCore::CoreFileError,
|
140
|
+
"Darwin Core core fields must contain taxon id and scientific name")
|
141
|
+
end
|
148
142
|
@dwc.core.read do |rows|
|
149
143
|
rows[1].each do |error|
|
150
144
|
@error_names << { data: error,
|
@@ -163,32 +157,28 @@ class DarwinCore
|
|
163
157
|
add_synonym_from_core(parent_id, r) if parent_id?
|
164
158
|
else
|
165
159
|
unless @normalized_data[r[@core_fields[:id]]]
|
166
|
-
if gnub_archive?
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
160
|
+
new_taxon = if gnub_archive?
|
161
|
+
DarwinCore::GnubTaxon.new
|
162
|
+
else
|
163
|
+
DarwinCore::TaxonNormalized.new
|
164
|
+
end
|
171
165
|
@normalized_data[r[@core_fields[:id]]] = new_taxon
|
172
166
|
end
|
173
167
|
taxon = @normalized_data[r[@core_fields[:id]]]
|
174
168
|
if gnub_archive?
|
175
169
|
taxon.uuid = r[@core_fields[:originalnameusageid]]
|
176
170
|
taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
|
177
|
-
|
171
|
+
split("|")
|
178
172
|
end
|
179
173
|
taxon.id = r[@core_fields[:id]]
|
180
174
|
taxon.current_name = r[@core_fields[:scientificname]]
|
181
175
|
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
182
176
|
taxon.parent_id = parent_id? ? r[parent_id] : nil
|
183
177
|
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
184
|
-
if @core_fields[:taxonomicstatus]
|
185
|
-
taxon.status = r[@core_fields[:taxonomicstatus]]
|
186
|
-
end
|
178
|
+
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
187
179
|
taxon.source = r[@core_fields[:source]] if @core_fields[:source]
|
188
180
|
taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
|
189
|
-
if @core_fields[:globalid]
|
190
|
-
taxon.global_id = r[@core_fields[:globalid]]
|
191
|
-
end
|
181
|
+
taxon.global_id = r[@core_fields[:globalid]] if @core_fields[:globalid]
|
192
182
|
taxon.linnean_classification_path =
|
193
183
|
get_linnean_classification_path(r, taxon)
|
194
184
|
add_name_string(taxon.current_name)
|
@@ -213,6 +203,7 @@ class DarwinCore
|
|
213
203
|
@paths_num = 0
|
214
204
|
@normalized_data.each do |_taxon_id, taxon|
|
215
205
|
next unless taxon.classification_path_id.empty?
|
206
|
+
|
216
207
|
res = get_classification_path(taxon)
|
217
208
|
next if res == "error"
|
218
209
|
end
|
@@ -220,6 +211,7 @@ class DarwinCore
|
|
220
211
|
|
221
212
|
def get_classification_path(taxon)
|
222
213
|
return unless taxon.classification_path_id.empty?
|
214
|
+
|
223
215
|
@paths_num += 1
|
224
216
|
if @paths_num % 10_000 == 0
|
225
217
|
DarwinCore.logger_write(@dwc.object_id,
|
@@ -227,17 +219,13 @@ class DarwinCore
|
|
227
219
|
end
|
228
220
|
current_node = { taxon.id => {} }
|
229
221
|
if DarwinCore.nil_field?(taxon.parent_id)
|
230
|
-
if @with_canonical_names
|
231
|
-
taxon.classification_path << taxon.current_name_canonical
|
232
|
-
end
|
222
|
+
taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
|
233
223
|
taxon.classification_path_id << taxon.id
|
234
224
|
@tree.merge!(current_node)
|
235
225
|
else
|
236
226
|
parent_cp = parent_cpid = nil
|
237
227
|
if @normalized_data[taxon.parent_id]
|
238
|
-
if @with_canonical_names
|
239
|
-
parent_cp = @normalized_data[taxon.parent_id].classification_path
|
240
|
-
end
|
228
|
+
parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
|
241
229
|
parent_cpid = @normalized_data[taxon.parent_id].
|
242
230
|
classification_path_id
|
243
231
|
else
|
@@ -247,9 +235,7 @@ class DarwinCore
|
|
247
235
|
error: :deprecated_parent,
|
248
236
|
current_parent: current_parent }
|
249
237
|
|
250
|
-
if @with_canonical_names
|
251
|
-
parent_cp = current_parent.classification_path
|
252
|
-
end
|
238
|
+
parent_cp = current_parent.classification_path if @with_canonical_names
|
253
239
|
parent_cpid = current_parent.classification_path_id
|
254
240
|
else
|
255
241
|
@error_names << { data: taxon,
|
@@ -258,6 +244,7 @@ class DarwinCore
|
|
258
244
|
end
|
259
245
|
end
|
260
246
|
return "error" unless parent_cpid
|
247
|
+
|
261
248
|
if parent_cpid.empty?
|
262
249
|
res = "error"
|
263
250
|
begin
|
@@ -268,6 +255,7 @@ class DarwinCore
|
|
268
255
|
current_parent: nil }
|
269
256
|
end
|
270
257
|
return res if res == "error"
|
258
|
+
|
271
259
|
if @with_canonical_names
|
272
260
|
taxon.classification_path += @normalized_data[taxon.parent_id].
|
273
261
|
classification_path +
|
@@ -295,7 +283,7 @@ class DarwinCore
|
|
295
283
|
rescue NoMethodError => e
|
296
284
|
DarwinCore.logger_write(@dwc.object_id,
|
297
285
|
"Error '#{e.message}' taxon #{taxon.id}")
|
298
|
-
|
286
|
+
"error"
|
299
287
|
end
|
300
288
|
end
|
301
289
|
end
|
@@ -381,8 +369,8 @@ class DarwinCore
|
|
381
369
|
|
382
370
|
# Collect linnean classification path only on species level
|
383
371
|
def get_linnean_classification_path(row, _taxon)
|
384
|
-
[
|
385
|
-
|
372
|
+
%i[kingdom phylum class order family genus
|
373
|
+
subgenus].each_with_object([]) do |clade, res|
|
386
374
|
res << [row[@core_fields[clade]], clade] if @core_fields[clade]
|
387
375
|
end
|
388
376
|
end
|
data/lib/dwc_archive/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-06-
|
11
|
+
date: 2020-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -253,14 +253,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
253
253
|
requirements:
|
254
254
|
- - ">="
|
255
255
|
- !ruby/object:Gem::Version
|
256
|
-
version: 2.
|
256
|
+
version: 2.5.0
|
257
257
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
258
258
|
requirements:
|
259
259
|
- - ">="
|
260
260
|
- !ruby/object:Gem::Version
|
261
261
|
version: '0'
|
262
262
|
requirements: []
|
263
|
-
rubygems_version: 3.
|
263
|
+
rubygems_version: 3.1.2
|
264
264
|
signing_key:
|
265
265
|
specification_version: 4
|
266
266
|
summary: Handler of Darwin Core Archive files
|