dwc-archive 0.9.11 → 1.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rspec +2 -1
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -1
- data/.travis.yml +4 -7
- data/CHANGELOG +4 -0
- data/Gemfile +3 -1
- data/LICENSE +1 -1
- data/README.md +114 -109
- data/Rakefile +13 -36
- data/dwc-archive.gemspec +23 -19
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +124 -0
- data/lib/dwc_archive/archive.rb +60 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
- data/lib/dwc_archive/expander.rb +88 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +91 -0
- data/lib/{dwc-archive → dwc_archive}/generator_eml_xml.rb +40 -33
- data/lib/{dwc-archive → dwc_archive}/generator_meta_xml.rb +21 -20
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +57 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +90 -0
- data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +1 -1
- data/spec/lib/classification_normalizer_spec.rb +96 -105
- data/spec/lib/core_spec.rb +43 -41
- data/spec/lib/darwin_core_spec.rb +108 -138
- data/spec/lib/generator_eml_xml_spec.rb +12 -11
- data/spec/lib/generator_meta_xml_spec.rb +12 -11
- data/spec/lib/generator_spec.rb +73 -74
- data/spec/lib/gnub_taxon_spec.rb +15 -17
- data/spec/lib/metadata_spec.rb +50 -41
- data/spec/lib/taxon_normalized_spec.rb +62 -65
- data/spec/lib/xml_reader_spec.rb +9 -12
- data/spec/spec_helper.rb +54 -51
- metadata +101 -87
- data/.rvmrc +0 -1
- data/lib/dwc-archive.rb +0 -107
- data/lib/dwc-archive/archive.rb +0 -40
- data/lib/dwc-archive/classification_normalizer.rb +0 -427
- data/lib/dwc-archive/core.rb +0 -19
- data/lib/dwc-archive/expander.rb +0 -85
- data/lib/dwc-archive/generator.rb +0 -86
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -48
- data/lib/dwc-archive/version.rb +0 -3
- data/lib/dwc-archive/xml_reader.rb +0 -80
@@ -19,7 +19,7 @@ end
|
|
19
19
|
|
20
20
|
Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
|
21
21
|
file = File.join(@gen.path, file_name)
|
22
|
-
@gen.files.include?(file_name).should
|
22
|
+
@gen.files.include?(file_name).should be true
|
23
23
|
csv = CSV.open(file).count.should == 4
|
24
24
|
end
|
25
25
|
|
@@ -51,7 +51,7 @@ end
|
|
51
51
|
Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
|
52
52
|
[file_name_1, file_name_2].each do |file_name|
|
53
53
|
file = File.join(@gen.path, file_name)
|
54
|
-
@gen.files.include?(file_name).should
|
54
|
+
@gen.files.include?(file_name).should be true
|
55
55
|
csv = CSV.open(file).count.should > 1
|
56
56
|
end
|
57
57
|
end
|
@@ -86,7 +86,7 @@ end
|
|
86
86
|
|
87
87
|
Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
|
88
88
|
meta = File.join(@gen.path, file_name)
|
89
|
-
@gen.files.include?(file_name).should
|
89
|
+
@gen.files.include?(file_name).should be true
|
90
90
|
dom = Nokogiri::XML(open(File.join(@gen.path, file_name)))
|
91
91
|
dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
|
92
92
|
dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
|
@@ -94,7 +94,7 @@ end
|
|
94
94
|
|
95
95
|
Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
|
96
96
|
eml = File.join(@gen.path, file_name)
|
97
|
-
@gen.files.include?(file_name).should
|
97
|
+
@gen.files.include?(file_name).should be true
|
98
98
|
end
|
99
99
|
|
100
100
|
Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
|
@@ -107,6 +107,6 @@ end
|
|
107
107
|
|
108
108
|
Then /^there should be a valid new archive file$/ do
|
109
109
|
dwc = DarwinCore.new('/tmp/dwc.tar.gz')
|
110
|
-
dwc.archive.valid?.should
|
110
|
+
dwc.archive.valid?.should be true
|
111
111
|
end
|
112
112
|
|
@@ -1,5 +1,6 @@
|
|
1
1
|
Given /^path to a dwc file "([^\"]*)"$/ do |arg1|
|
2
|
-
@dwca_file = File.expand_path(File.dirname(__FILE__) +
|
2
|
+
@dwca_file = File.expand_path(File.dirname(__FILE__) +
|
3
|
+
"../../../spec/files/" + arg1)
|
3
4
|
@tmp_dir = "/tmp"
|
4
5
|
end
|
5
6
|
|
@@ -8,11 +9,12 @@ When /^I create a new DarwinCore::Archive instance$/ do
|
|
8
9
|
end
|
9
10
|
|
10
11
|
Then /^I should find that the archive is valid$/ do
|
11
|
-
@dwca.valid?.should
|
12
|
+
@dwca.valid?.should be true
|
12
13
|
end
|
13
14
|
|
14
15
|
Then /^I should see what files the archive has$/ do
|
15
|
-
@dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml",
|
16
|
+
@dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml",
|
17
|
+
"meta.xml", "metadata.txt"]
|
16
18
|
end
|
17
19
|
|
18
20
|
When /^I delete expanded files$/ do
|
@@ -20,7 +22,7 @@ When /^I delete expanded files$/ do
|
|
20
22
|
end
|
21
23
|
|
22
24
|
Then /^they should disappear$/ do
|
23
|
-
@dwca.files.should
|
25
|
+
@dwca.files.should be nil
|
24
26
|
end
|
25
27
|
|
26
28
|
When /^I create a new DarwinCore instance$/ do
|
@@ -37,7 +39,7 @@ When /^I create DarwinCore::ClassificationNormalizer instance$/ do
|
|
37
39
|
end
|
38
40
|
|
39
41
|
Then /^instance should have a valid archive$/ do
|
40
|
-
@dwc.archive.valid?.should
|
42
|
+
@dwc.archive.valid?.should be true
|
41
43
|
end
|
42
44
|
|
43
45
|
Then /^instance should have a core$/ do
|
@@ -45,7 +47,7 @@ Then /^instance should have a core$/ do
|
|
45
47
|
end
|
46
48
|
|
47
49
|
Then /^I should see checksum$/ do
|
48
|
-
@dwc.checksum.should ==
|
50
|
+
@dwc.checksum.should == "7d94fc28ffaf434b66fbc790aa5ef00d834057bf"
|
49
51
|
end
|
50
52
|
|
51
53
|
When /^I check core data$/ do
|
@@ -64,7 +66,8 @@ And /^core\.file_path$/ do
|
|
64
66
|
end
|
65
67
|
|
66
68
|
And /^core\.id$/ do
|
67
|
-
@core.id.should == {:
|
69
|
+
@core.id.should == {index: 0,
|
70
|
+
term: "http://rs.tdwg.org/dwc/terms/TaxonID"}
|
68
71
|
end
|
69
72
|
|
70
73
|
And /^core\.fields$/ do
|
@@ -80,14 +83,21 @@ Then /^DarwinCore instance should have dwc\.metadata object$/ do
|
|
80
83
|
end
|
81
84
|
|
82
85
|
And /^I should find id, title, creators, metadata provider$/ do
|
83
|
-
@dwc.metadata.id.should ==
|
84
|
-
@dwc.metadata.title.should ==
|
86
|
+
@dwc.metadata.id.should == "leptogastrinae:version:2.5"
|
87
|
+
@dwc.metadata.title.should ==
|
88
|
+
"Leptogastrinae (Diptera: Asilidae) Classification"
|
85
89
|
@dwc.metadata.authors.should == [
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
@dwc.metadata.
|
90
|
+
{ last_name: "Bayless", email: "keith.bayless@gmail.com",
|
91
|
+
first_name: "Keith" },
|
92
|
+
{ last_name: "Dikow", email: "dshorthouse@eol.org", first_name: "Torsten" }
|
93
|
+
]
|
94
|
+
@dwc.metadata.abstract.should ==
|
95
|
+
"These are all the names in the Leptogastrinae classification."
|
96
|
+
@dwc.metadata.citation.should ==
|
97
|
+
"Dikow, Torsten. 2010. The Leptogastrinae classification."
|
98
|
+
@dwc.metadata.url.should ==
|
99
|
+
"http://leptogastrinae.lifedesks.org/files/leptogastrinae/"\
|
100
|
+
"classification_export/shared/leptogastrinae.tar.gz"
|
91
101
|
end
|
92
102
|
|
93
103
|
Then /^DarwinCore instance should have an extensions array$/ do
|
@@ -103,11 +113,18 @@ end
|
|
103
113
|
|
104
114
|
Then /^extension should have properties, data, file_path, coreid, fields$/ do
|
105
115
|
ext = @dwc.extensions[0]
|
106
|
-
ext.properties.should == {
|
116
|
+
ext.properties.should == {
|
117
|
+
ignoreHeaderLines: 1, encoding: "UTF-8",
|
118
|
+
rowType: "http://rs.gbif.org/ipt/terms/1.0/VernacularName",
|
119
|
+
fieldsEnclosedBy: "", fieldsTerminatedBy: "\\t", linesTerminatedBy: "\\n"
|
120
|
+
}
|
107
121
|
ext.data.class.should == Hash
|
108
122
|
ext.file_path.should match(/\/tmp\/dwc_[\d]+\/VernacularName.txt/)
|
109
|
-
ext.coreid.should == {:
|
110
|
-
ext.fields.should == [
|
123
|
+
ext.coreid.should == { index: 0 }
|
124
|
+
ext.fields.should == [
|
125
|
+
{ term: "http://rs.gbif.org/ecat/terms/vernacularName", index: 1 },
|
126
|
+
{ term: "http://rs.gbif.org/thesaurus/languageCode", index: 2 }
|
127
|
+
]
|
111
128
|
end
|
112
129
|
|
113
130
|
Given /^acces to DarwinCore gem$/ do
|
@@ -153,7 +170,7 @@ end
|
|
153
170
|
|
154
171
|
Then /^I can read its core content using block$/ do
|
155
172
|
res = []
|
156
|
-
@dwc.core.ignore_headers.should
|
173
|
+
@dwc.core.ignore_headers.should be true
|
157
174
|
read_result = @dwc.core.read(200) do |r, err|
|
158
175
|
res << [r.size, err.size]
|
159
176
|
end
|
@@ -164,7 +181,7 @@ end
|
|
164
181
|
Then /^I can read extensions content using block$/ do
|
165
182
|
res = []
|
166
183
|
ext = @dwc.extensions[0]
|
167
|
-
ext.ignore_headers.should
|
184
|
+
ext.ignore_headers.should be true
|
168
185
|
ext.read(200) do |r, err|
|
169
186
|
res << [r.size, err.size]
|
170
187
|
end
|
@@ -196,22 +213,24 @@ Then /^there are paths, synonyms and vernacular names in normalized classificati
|
|
196
213
|
if v.vernacular_names.size > 0
|
197
214
|
@vernaculars_are_generated = true
|
198
215
|
vn = v.vernacular_names[0]
|
199
|
-
(vn.respond_to?(
|
216
|
+
(vn.respond_to?("locality") && vn.respond_to?("country_code") &&
|
217
|
+
vn.respond_to?("language")).should be true
|
200
218
|
end
|
201
219
|
break if (@vernaculars_are_generated && @paths_are_generated && @synonyms_are_generated)
|
202
220
|
end
|
203
|
-
@paths_are_generated.should
|
204
|
-
@vernaculars_are_generated.should
|
205
|
-
@synonyms_are_generated.should
|
221
|
+
@paths_are_generated.should be true
|
222
|
+
@vernaculars_are_generated.should be true
|
223
|
+
@synonyms_are_generated.should be true
|
206
224
|
end
|
207
225
|
|
208
226
|
Then /^there are local_id and global_id methods in taxons and synonyms$/ do
|
209
227
|
@normalized_classification.each do |k, v|
|
210
228
|
if v.synonyms.size > 0
|
211
|
-
v.local_id.should ==
|
229
|
+
v.local_id.should == "2"
|
212
230
|
v.global_id.should == "97498f29-2501-440d-9452-f3817da0d6c2"
|
213
|
-
v.synonyms.first.local_id.should ==
|
214
|
-
v.synonyms.first.global_id.should ==
|
231
|
+
v.synonyms.first.local_id.should == "1"
|
232
|
+
v.synonyms.first.global_id.should ==
|
233
|
+
"e017ed01-407d-4d09-82c5-8b3d9fa76e35"
|
215
234
|
break
|
216
235
|
end
|
217
236
|
end
|
@@ -229,8 +248,8 @@ Then /^there are id paths, no canonical names paths in normalized classification
|
|
229
248
|
id_paths_generated = true
|
230
249
|
end
|
231
250
|
end
|
232
|
-
id_paths_generated.should
|
233
|
-
canonical_paths_generated.should
|
251
|
+
id_paths_generated.should be true
|
252
|
+
canonical_paths_generated.should be false
|
234
253
|
end
|
235
254
|
|
236
255
|
Then /^names used in classification can be accessed by "([^"]*)" method$/ do |name_strings|
|
data/features/support/env.rb
CHANGED
data/lib/dwc_archive.rb
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "fileutils"
|
4
|
+
require "ostruct"
|
5
|
+
require "digest"
|
6
|
+
require "csv"
|
7
|
+
require "logger"
|
8
|
+
require "nokogiri"
|
9
|
+
require "biodiversity"
|
10
|
+
require_relative "dwc_archive/xml_reader"
|
11
|
+
require_relative "dwc_archive/ingester"
|
12
|
+
require_relative "dwc_archive/errors"
|
13
|
+
require_relative "dwc_archive/expander"
|
14
|
+
require_relative "dwc_archive/archive"
|
15
|
+
require_relative "dwc_archive/core"
|
16
|
+
require_relative "dwc_archive/extension"
|
17
|
+
require_relative "dwc_archive/metadata"
|
18
|
+
require_relative "dwc_archive/generator"
|
19
|
+
require_relative "dwc_archive/generator_meta_xml"
|
20
|
+
require_relative "dwc_archive/generator_eml_xml"
|
21
|
+
require_relative "dwc_archive/taxon_normalized"
|
22
|
+
require_relative "dwc_archive/gnub_taxon"
|
23
|
+
require_relative "dwc_archive/classification_normalizer"
|
24
|
+
require_relative "dwc_archive/version"
|
25
|
+
|
26
|
+
# Main class for handling Darwin Core Archives. An instance expands the
# archive through DarwinCore::Archive and exposes its core file, metadata and
# extensions; class-level helpers manage temporary directories and logging.
class DarwinCore
  DEFAULT_TMP_DIR = "/tmp"
  VernacularNormalized = Struct.new(:name, :language, :locality, :country_code)
  SynonymNormalized = Struct.new(:id, :name, :canonical_name, :status, :source,
                                 :local_id, :global_id)

  class << self
    attr_writer :logger

    # Recursively removes +path+ when it exists; does nothing otherwise.
    def clean(path)
      # File.exist? replaces FileTest.exists?, which was removed in Ruby 3.2.
      FileUtils.rm_rf(path) if File.exist?(path)
    end

    # Returns the sorted entry names found directly under +path+, or nil when
    # +path+ is nil or does not exist.
    def files(path)
      return nil unless path && File.exist?(path)

      # Only the "." and ".." pseudo-entries are dropped; a real file whose
      # name merely ends with a dot is kept.
      Dir.entries(path).reject { |e| %w[. ..].include?(e) }.sort
    end

    # Builds a path of the form "<tmp_dir>/dwc_<random integer>".
    def random_path(tmp_dir)
      File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
    end
  end

  attr_reader :archive, :core, :metadata, :classification_normalizer
  alias eml metadata

  # True when +field+ is nil, an empty string or the literal "/N".
  # NOTE(review): "/N" may have been meant as the "\N" null marker used in
  # database dumps -- confirm before changing it.
  def self.nil_field?(field)
    [nil, "", "/N"].include?(field)
  end

  # Removes every directory directly under +tmp_dir+ whose name has the shape
  # produced by random_path ("dwc_" followed by digits only).
  def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
    Dir.entries(tmp_dir).each do |entry|
      next unless entry.match(/\Adwc_\d+\z/)

      path = File.join(tmp_dir, entry)
      FileUtils.rm_rf(path) if File.directory?(path)
    end
  end

  # Returns the configured logger, defaulting to one created with
  # Logger.new(nil).
  def self.logger
    @logger ||= Logger.new(nil)
  end

  # Replaces the current logger with a fresh Logger.new(nil).
  def self.logger_reset
    self.logger = Logger.new(nil)
  end

  # Logs "|obj_id|message|" through the logger method named by +method+.
  def self.logger_write(obj_id, message, method = :info)
    logger.send(method, "|#{obj_id}|#{message}|")
  end

  # dwc_path - path to the archive file
  # tmp_dir  - directory handed to DarwinCore::Archive for expansion
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
    @dwc_path = dwc_path
    @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
    @core = DarwinCore::Core.new(self)
    @metadata = DarwinCore::Metadata.new(@archive)
    extensions
  end

  # Last path component of the archive path.
  def file_name
    File.split(@dwc_path).last
  end

  # Absolute path of the archive.
  def path
    File.expand_path(@dwc_path)
  end

  # generates a hash from a classification data with path to each node,
  # list of synonyms and vernacular names.
  # Returns nil when the core has no parent id field.
  def normalize_classification
    return nil unless parent_id?

    @classification_normalizer ||=
      DarwinCore::ClassificationNormalizer.new(self)
    @classification_normalizer.normalize
  end

  # True when the joined core field descriptions mention highertaxonid or
  # parentnameusageid (case-insensitive).
  def parent_id?
    terms = @core.fields.join("|").downcase
    !terms.match(/highertaxonid|parentnameusageid/).nil?
  end

  # SHA1 hex digest of the archive file. The file is streamed by Digest
  # rather than read into memory as one string.
  def checksum
    Digest::SHA1.file(@dwc_path).hexdigest
  end

  # Memoized list of DarwinCore::Extension objects built from the :extension
  # entry under the first root key of the archive meta; empty when there is
  # no such entry.
  def extensions
    return @extensions if @extensions

    root_key = @archive.meta.keys[0]
    ext = @archive.meta[root_key][:extension]
    return @extensions = [] unless ext

    # A single extension is not wrapped in an array by the meta reader.
    ext = [ext] unless ext.is_a?(Array)
    @extensions = ext.map { |e| DarwinCore::Extension.new(self, e) }
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Deals with handling DarwinCoreArchive file, and provides meta information
  # and files information about archive.
  #
  # On construction the archive is unpacked through DarwinCore::Expander and
  # meta.xml (plus eml.xml when present) is parsed with DarwinCore::XmlReader.
  class Archive
    attr_reader :meta, :eml

    # archive_path - path to the archive file
    # tmp_dir      - directory handed to the expander
    #
    # Raises InvalidArchiveError when the expanded archive is not valid
    # (see #valid?); the expanded files are cleaned up first.
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
      @expander.unpack
      prepare_metadata
    end

    # True when the expander's path exists and its file list contains
    # "meta.xml"; always returns a strict boolean.
    def valid?
      expanded = @expander.path
      # File.exist? replaces FileTest.exists?, which was removed in Ruby 3.2.
      return false unless expanded && File.exist?(expanded)

      entries = files
      entries ? entries.include?("meta.xml") : false
    end

    # File names reported by the expander.
    def files
      @expander.files
    end

    # Directory the expander unpacked the archive into.
    def files_path
      @expander.path
    end

    # Delegates removal of the expanded files to the expander.
    def clean
      @expander.clean
    end

    private

    # Parses the metadata files of a valid archive; otherwise cleans up and
    # raises InvalidArchiveError.
    def prepare_metadata
      unless valid?
        clean
        raise InvalidArchiveError
      end

      prepare_meta_file
      prepare_eml_file
    end

    def prepare_meta_file
      @meta = read_xml("meta.xml")
    end

    # eml.xml is optional: @eml stays nil when the archive does not ship it.
    def prepare_eml_file
      @eml = nil
      return unless files.include?("eml.xml")

      @eml = read_xml("eml.xml")
    end

    # Parses one expanded XML file. The block form of File.open closes the
    # handle as soon as parsing finishes.
    # NOTE(review): assumes XmlReader.from_xml reads the IO fully before it
    # returns -- confirm against xml_reader.rb.
    def read_xml(file_name)
      File.open(File.join(@expander.path, file_name)) do |io|
        DarwinCore::XmlReader.from_xml(io)
      end
    end
  end
end
|
@@ -0,0 +1,382 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
|
4
|
+
# Returns tree representation of Darwin Core file with vernacular and
|
5
|
+
# synonyms attached to the taxon nodes
|
6
|
+
class ClassificationNormalizer
|
7
|
+
attr_reader :error_names, :tree, :normalized_data, :dwc
|
8
|
+
alias darwin_core dwc
|
9
|
+
|
10
|
+
def initialize(dwc_instance)
|
11
|
+
@dwc = dwc_instance
|
12
|
+
@core_fields = find_fields(@dwc.core)
|
13
|
+
@extensions = @dwc.extensions.map { |e| [e, find_fields(e)] }
|
14
|
+
@normalized_data = {}
|
15
|
+
@synonyms = {}
|
16
|
+
@name_strings = {}
|
17
|
+
@vernacular_name_strings = {}
|
18
|
+
@error_names = []
|
19
|
+
@tree = {}
|
20
|
+
end
|
21
|
+
|
22
|
+
def add_name_string(name_string)
|
23
|
+
@name_strings[name_string] = 1 unless @name_strings[name_string]
|
24
|
+
end
|
25
|
+
|
26
|
+
def add_vernacular_name_string(name_string)
|
27
|
+
return if @vernacular_name_strings[name_string]
|
28
|
+
|
29
|
+
@vernacular_name_strings[name_string] = 1
|
30
|
+
end
|
31
|
+
|
32
|
+
def name_strings(opts = {})
|
33
|
+
process_strings(@name_strings, opts)
|
34
|
+
end
|
35
|
+
|
36
|
+
def vernacular_name_strings(opts = {})
|
37
|
+
process_strings(@vernacular_name_strings, opts)
|
38
|
+
end
|
39
|
+
|
40
|
+
def normalize(opts = {})
|
41
|
+
opts = { with_canonical_names: true,
|
42
|
+
with_extensions: true }.merge(opts)
|
43
|
+
@with_canonical_names = opts[:with_canonical_names]
|
44
|
+
DarwinCore.logger_write(@dwc.object_id,
|
45
|
+
"Started normalization of the classification")
|
46
|
+
ingest_core
|
47
|
+
DarwinCore.logger_write(
|
48
|
+
@dwc.object_id,
|
49
|
+
"Calculating the classification parent/child paths"
|
50
|
+
)
|
51
|
+
if parent_id?
|
52
|
+
calculate_classification_path
|
53
|
+
else
|
54
|
+
@normalized_data.keys.each { |id| @tree[id] = {} }
|
55
|
+
end
|
56
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
|
57
|
+
ingest_extensions if opts[:with_extensions]
|
58
|
+
@normalized_data
|
59
|
+
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
def process_strings(strings, opts)
|
64
|
+
opts = { with_hash: false }.merge(opts)
|
65
|
+
if opts[:with_hash]
|
66
|
+
strings
|
67
|
+
else
|
68
|
+
strings.keys
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
def get_canonical_name(a_scientific_name)
|
73
|
+
return nil unless @with_canonical_names
|
74
|
+
|
75
|
+
canonical_name = Biodiversity::Parser.parse(a_scientific_name).
|
76
|
+
dig(:canonical, :simple)
|
77
|
+
canonical_name.to_s.empty? ? a_scientific_name : canonical_name
|
78
|
+
end
|
79
|
+
|
80
|
+
def find_fields(element)
|
81
|
+
data = element.fields.each_with_object({}) do |f, h|
|
82
|
+
field = f[:term].split("/")[-1]
|
83
|
+
field = field ? field.downcase.to_sym : ""
|
84
|
+
h[field] = f[:index].to_i
|
85
|
+
end
|
86
|
+
data[:id] = element.id[:index] if element.id
|
87
|
+
data
|
88
|
+
end
|
89
|
+
|
90
|
+
def status_synonym?(status)
|
91
|
+
status&.match(/^syn/)
|
92
|
+
end
|
93
|
+
|
94
|
+
def add_synonym_from_core(taxon_id, row)
|
95
|
+
cf = @core_fields
|
96
|
+
@synonyms[row[cf[:id]]] = taxon_id
|
97
|
+
@normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new unless @normalized_data[row[taxon_id]]
|
98
|
+
|
99
|
+
taxon = @normalized_data[row[taxon_id]]
|
100
|
+
synonym = SynonymNormalized.new(
|
101
|
+
row[cf[:id]],
|
102
|
+
row[cf[:scientificname]],
|
103
|
+
row[cf[:canonicalname]],
|
104
|
+
cf[:taxonomicstatus] ? row[cf[:taxonomicstatus]] : nil,
|
105
|
+
cf[:source] ? row[cf[:source]] : nil,
|
106
|
+
cf[:localid] ? row[cf[:localid]] : nil,
|
107
|
+
cf[:globalid] ? row[cf[:globalid]] : nil
|
108
|
+
)
|
109
|
+
taxon.synonyms << synonym
|
110
|
+
add_name_string(synonym.name)
|
111
|
+
add_name_string(synonym.canonical_name)
|
112
|
+
end
|
113
|
+
|
114
|
+
def set_scientific_name(row, fields)
|
115
|
+
row[fields[:scientificname]] = "N/A" unless row[fields[:scientificname]]
|
116
|
+
canonical_name = nil
|
117
|
+
scientific_name = row[fields[:scientificname]].strip
|
118
|
+
if separate_canonical_and_authorship?(row, fields)
|
119
|
+
canonical_name = row[fields[:scientificname]].strip if @with_canonical_names
|
120
|
+
scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
|
121
|
+
else
|
122
|
+
canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
|
123
|
+
end
|
124
|
+
fields[:canonicalname] = row.size
|
125
|
+
row << canonical_name
|
126
|
+
row[fields[:scientificname]] = scientific_name
|
127
|
+
end
|
128
|
+
|
129
|
+
def separate_canonical_and_authorship?(row, fields)
|
130
|
+
authorship = ""
|
131
|
+
authorship = row[fields[:scientificnameauthorship]].to_s.strip if fields[:scientificnameauthorship]
|
132
|
+
!(authorship.empty? || row[fields[:scientificname]].index(authorship))
|
133
|
+
end
|
134
|
+
|
135
|
+
def ingest_core
|
136
|
+
@normalized_data = {}
|
137
|
+
has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
|
138
|
+
unless has_name_and_id
|
139
|
+
raise(DarwinCore::CoreFileError,
|
140
|
+
"Darwin Core core fields must contain taxon id and scientific name")
|
141
|
+
end
|
142
|
+
@dwc.core.read do |rows|
|
143
|
+
rows[1].each do |error|
|
144
|
+
@error_names << { data: error,
|
145
|
+
error: :reading_or_encoding_error }
|
146
|
+
end
|
147
|
+
rows[0].each do |r|
|
148
|
+
set_scientific_name(r, @core_fields)
|
149
|
+
# Core has AcceptedNameUsageId
|
150
|
+
if @core_fields[:acceptednameusageid] &&
|
151
|
+
r[@core_fields[:acceptednameusageid]] &&
|
152
|
+
r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
|
153
|
+
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
|
154
|
+
elsif !@core_fields[:acceptednameusageid] &&
|
155
|
+
@core_fields[:taxonomicstatus] &&
|
156
|
+
status_synonym?(r[@core_fields[:taxonomicstatus]])
|
157
|
+
add_synonym_from_core(parent_id, r) if parent_id?
|
158
|
+
else
|
159
|
+
unless @normalized_data[r[@core_fields[:id]]]
|
160
|
+
new_taxon = if gnub_archive?
|
161
|
+
DarwinCore::GnubTaxon.new
|
162
|
+
else
|
163
|
+
DarwinCore::TaxonNormalized.new
|
164
|
+
end
|
165
|
+
@normalized_data[r[@core_fields[:id]]] = new_taxon
|
166
|
+
end
|
167
|
+
taxon = @normalized_data[r[@core_fields[:id]]]
|
168
|
+
if gnub_archive?
|
169
|
+
taxon.uuid = r[@core_fields[:originalnameusageid]]
|
170
|
+
taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
|
171
|
+
split("|")
|
172
|
+
end
|
173
|
+
taxon.id = r[@core_fields[:id]]
|
174
|
+
taxon.current_name = r[@core_fields[:scientificname]]
|
175
|
+
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
176
|
+
taxon.parent_id = parent_id? ? r[parent_id] : nil
|
177
|
+
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
178
|
+
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
179
|
+
taxon.source = r[@core_fields[:source]] if @core_fields[:source]
|
180
|
+
taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
|
181
|
+
taxon.global_id = r[@core_fields[:globalid]] if @core_fields[:globalid]
|
182
|
+
taxon.linnean_classification_path =
|
183
|
+
get_linnean_classification_path(r, taxon)
|
184
|
+
add_name_string(taxon.current_name)
|
185
|
+
has_canonical = taxon.current_name_canonical &&
|
186
|
+
!taxon.current_name_canonical.empty?
|
187
|
+
add_name_string(taxon.current_name_canonical) if has_canonical
|
188
|
+
end
|
189
|
+
end
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
# Tells whether the core fields include a parent reference column
# (:highertaxonid or :parentnameusageid).
#
# The answer is cached in @has_parent_id. A nil check is used rather than
# `||=` so that a negative answer is cached as well instead of being
# recomputed on every call.
def parent_id?
  return @has_parent_id unless @has_parent_id.nil?

  @has_parent_id = @core_fields.key?(:highertaxonid) ||
                   @core_fields.key?(:parentnameusageid)
end
|
197
|
+
|
198
|
+
def parent_id
|
199
|
+
@core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
|
200
|
+
end
|
201
|
+
|
202
|
+
# Resets the path counter and asks get_classification_path to fill in the
# classification path of every taxon in @normalized_data whose id path is
# still empty. The result of get_classification_path is not used here.
def calculate_classification_path
  @paths_num = 0
  @normalized_data.each_value do |taxon|
    # Skip taxa whose id path is already filled in.
    next unless taxon.classification_path_id.empty?

    get_classification_path(taxon)
  end
end
|
211
|
+
|
212
|
+
def get_classification_path(taxon)
|
213
|
+
return unless taxon.classification_path_id.empty?
|
214
|
+
|
215
|
+
@paths_num += 1
|
216
|
+
if @paths_num % 10_000 == 0
|
217
|
+
DarwinCore.logger_write(@dwc.object_id,
|
218
|
+
"Calculated #{@paths_num} paths")
|
219
|
+
end
|
220
|
+
current_node = { taxon.id => {} }
|
221
|
+
if DarwinCore.nil_field?(taxon.parent_id)
|
222
|
+
taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
|
223
|
+
taxon.classification_path_id << taxon.id
|
224
|
+
@tree.merge!(current_node)
|
225
|
+
else
|
226
|
+
parent_cp = parent_cpid = nil
|
227
|
+
if @normalized_data[taxon.parent_id]
|
228
|
+
parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
|
229
|
+
parent_cpid = @normalized_data[taxon.parent_id].
|
230
|
+
classification_path_id
|
231
|
+
else
|
232
|
+
current_parent = @normalized_data[@synonyms[taxon.parent_id]]
|
233
|
+
if current_parent
|
234
|
+
@error_names << { data: taxon,
|
235
|
+
error: :deprecated_parent,
|
236
|
+
current_parent: current_parent }
|
237
|
+
|
238
|
+
parent_cp = current_parent.classification_path if @with_canonical_names
|
239
|
+
parent_cpid = current_parent.classification_path_id
|
240
|
+
else
|
241
|
+
@error_names << { data: taxon,
|
242
|
+
error: :deprecated_parent,
|
243
|
+
current_parent: nil }
|
244
|
+
end
|
245
|
+
end
|
246
|
+
return "error" unless parent_cpid
|
247
|
+
|
248
|
+
if parent_cpid.empty?
|
249
|
+
res = "error"
|
250
|
+
begin
|
251
|
+
res = get_classification_path(@normalized_data[taxon.parent_id])
|
252
|
+
rescue SystemStackError
|
253
|
+
@error_names << { data: taxon,
|
254
|
+
error: :too_deep_hierarchy,
|
255
|
+
current_parent: nil }
|
256
|
+
end
|
257
|
+
return res if res == "error"
|
258
|
+
|
259
|
+
if @with_canonical_names
|
260
|
+
taxon.classification_path += @normalized_data[taxon.parent_id].
|
261
|
+
classification_path +
|
262
|
+
[taxon.current_name_canonical]
|
263
|
+
end
|
264
|
+
taxon.classification_path_id += @normalized_data[taxon.parent_id].
|
265
|
+
classification_path_id + [taxon.id]
|
266
|
+
parent_node = @normalized_data[taxon.parent_id].
|
267
|
+
classification_path_id.inject(@tree) do |node, id|
|
268
|
+
node[id]
|
269
|
+
end
|
270
|
+
parent_node.merge!(current_node)
|
271
|
+
else
|
272
|
+
if @with_canonical_names
|
273
|
+
taxon.classification_path += parent_cp +
|
274
|
+
[taxon.current_name_canonical]
|
275
|
+
end
|
276
|
+
taxon.classification_path_id += parent_cpid + [taxon.id]
|
277
|
+
parent_node = @normalized_data[taxon.parent_id].
|
278
|
+
classification_path_id.inject(@tree) do |node, id|
|
279
|
+
node[id]
|
280
|
+
end
|
281
|
+
begin
|
282
|
+
parent_node.merge!(current_node)
|
283
|
+
rescue NoMethodError => e
|
284
|
+
DarwinCore.logger_write(@dwc.object_id,
|
285
|
+
"Error '#{e.message}' taxon #{taxon.id}")
|
286
|
+
"error"
|
287
|
+
end
|
288
|
+
end
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
# Walks the [extension, fields] pairs in @extensions and dispatches each one:
# * to ingest_synonyms when the extension's file name contains "synonym"
#   (case-insensitive) and it has a :scientificname field;
# * to ingest_vernaculars when it has a :vernacularname field.
# An extension may satisfy both conditions, or neither.
def ingest_extensions
  @extensions.each do |extension|
    ext, fields = *extension
    # Only the file name is inspected, not the directories leading to it.
    synonym_file = File.basename(ext.file_path).match(/synonym/i)
    ingest_synonyms(extension) if synonym_file && fields.key?(:scientificname)
    ingest_vernaculars(extension) if fields.key?(:vernacularname)
  end
end
|
301
|
+
|
302
|
+
def ingest_synonyms(extension)
|
303
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
|
304
|
+
ext, fields = *extension
|
305
|
+
ext.read do |rows|
|
306
|
+
rows[0].each do |r|
|
307
|
+
synonym = process_synonym(r, fields)
|
308
|
+
add_synonym(synonym, r, fields)
|
309
|
+
end
|
310
|
+
end
|
311
|
+
end
|
312
|
+
|
313
|
+
def add_synonym(synonym, record, fields)
|
314
|
+
if @normalized_data[record[fields[:id]]]
|
315
|
+
@normalized_data[record[fields[:id]]].synonyms << synonym
|
316
|
+
add_name_string(synonym.name)
|
317
|
+
add_name_string(synonym.canonical_name)
|
318
|
+
else
|
319
|
+
@error_names << { taxon: synonym,
|
320
|
+
error: :synonym_of_unknown_taxa }
|
321
|
+
end
|
322
|
+
end
|
323
|
+
|
324
|
+
def process_synonym(record, fields)
|
325
|
+
set_scientific_name(record, fields)
|
326
|
+
SynonymNormalized.new(
|
327
|
+
nil,
|
328
|
+
record[fields[:scientificname]],
|
329
|
+
record[fields[:canonicalname]],
|
330
|
+
fields[:taxonomicstatus] ? record[fields[:taxonomicstatus]] : nil,
|
331
|
+
fields[:source] ? record[fields[:source]] : nil,
|
332
|
+
fields[:localid] ? record[fields[:localid]] : nil,
|
333
|
+
fields[:globalid] ? record[fields[:globalid]] : nil
|
334
|
+
)
|
335
|
+
end
|
336
|
+
|
337
|
+
def ingest_vernaculars(extension)
|
338
|
+
DarwinCore.logger_write(@dwc.object_id,
|
339
|
+
"Ingesting vernacular names extension")
|
340
|
+
ext, fields = *extension
|
341
|
+
ext.read do |rows|
|
342
|
+
rows[0].each do |row|
|
343
|
+
extract_vernaculars_from_row(row, fields)
|
344
|
+
end
|
345
|
+
end
|
346
|
+
end
|
347
|
+
|
348
|
+
def extract_vernaculars_from_row(row, fields)
|
349
|
+
language = find_vernacular_language(row, fields)
|
350
|
+
locality = fields[:locality] ? row[fields[:locality]] : nil
|
351
|
+
country_code = fields[:countrycode] ? row[fields[:countrycode]] : nil
|
352
|
+
|
353
|
+
vernacular = VernacularNormalized.new(
|
354
|
+
row[fields[:vernacularname]], language, locality, country_code
|
355
|
+
)
|
356
|
+
if @normalized_data[row[fields[:id]]]
|
357
|
+
@normalized_data[row[fields[:id]]].vernacular_names << vernacular
|
358
|
+
add_vernacular_name_string(vernacular.name)
|
359
|
+
else
|
360
|
+
@error_names << { vernacular_name: vernacular,
|
361
|
+
error: :vernacular_of_unknown_taxa }
|
362
|
+
end
|
363
|
+
end
|
364
|
+
|
365
|
+
def find_vernacular_language(row, fields)
|
366
|
+
(fields[:language] && row[fields[:language]]) ||
|
367
|
+
(fields[:languagecode] && row[fields[:languagecode]]) || nil
|
368
|
+
end
|
369
|
+
|
370
|
+
# Collects the Linnean classification of a core row as [value, rank] pairs,
# in the order kingdom, phylum, class, order, family, genus, subgenus.
# Only ranks that have a column in @core_fields are included. The method
# itself applies no rank filter, and the second argument is unused.
def get_linnean_classification_path(row, _taxon)
  ranks = %i[kingdom phylum class order family genus subgenus]
  present = ranks.select { |rank| @core_fields[rank] }
  present.map { |rank| [row[@core_fields[rank]], rank] }
end
|
377
|
+
|
378
|
+
def gnub_archive?
|
379
|
+
@core_fields[:originalnameusageidpath]
|
380
|
+
end
|
381
|
+
end
|
382
|
+
end
|