dwc-archive 0.9.10 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rspec +2 -1
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -1
- data/.travis.yml +4 -7
- data/CHANGELOG +14 -8
- data/Gemfile +3 -1
- data/LICENSE +1 -1
- data/README.md +119 -107
- data/Rakefile +13 -36
- data/dwc-archive.gemspec +23 -19
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +124 -0
- data/lib/dwc_archive/archive.rb +60 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
- data/lib/dwc_archive/expander.rb +88 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +91 -0
- data/lib/dwc_archive/generator_eml_xml.rb +116 -0
- data/lib/dwc_archive/generator_meta_xml.rb +72 -0
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +57 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +90 -0
- data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +47 -0
- data/spec/files/generator_meta.xml +19 -0
- data/spec/lib/classification_normalizer_spec.rb +96 -105
- data/spec/lib/core_spec.rb +43 -41
- data/spec/lib/darwin_core_spec.rb +108 -138
- data/spec/lib/generator_eml_xml_spec.rb +12 -11
- data/spec/lib/generator_meta_xml_spec.rb +12 -11
- data/spec/lib/generator_spec.rb +77 -69
- data/spec/lib/gnub_taxon_spec.rb +15 -17
- data/spec/lib/metadata_spec.rb +50 -41
- data/spec/lib/taxon_normalized_spec.rb +62 -65
- data/spec/lib/xml_reader_spec.rb +9 -12
- data/spec/spec_helper.rb +54 -51
- metadata +105 -88
- data/.rvmrc +0 -1
- data/] +0 -40
- data/lib/dwc-archive.rb +0 -107
- data/lib/dwc-archive/archive.rb +0 -40
- data/lib/dwc-archive/classification_normalizer.rb +0 -428
- data/lib/dwc-archive/core.rb +0 -17
- data/lib/dwc-archive/expander.rb +0 -84
- data/lib/dwc-archive/generator.rb +0 -85
- data/lib/dwc-archive/generator_eml_xml.rb +0 -86
- data/lib/dwc-archive/generator_meta_xml.rb +0 -58
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -48
- data/lib/dwc-archive/version.rb +0 -3
- data/lib/dwc-archive/xml_reader.rb +0 -64
@@ -19,7 +19,7 @@ end
|
|
19
19
|
|
20
20
|
Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
|
21
21
|
file = File.join(@gen.path, file_name)
|
22
|
-
@gen.files.include?(file_name).should
|
22
|
+
@gen.files.include?(file_name).should be true
|
23
23
|
csv = CSV.open(file).count.should == 4
|
24
24
|
end
|
25
25
|
|
@@ -51,7 +51,7 @@ end
|
|
51
51
|
Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
|
52
52
|
[file_name_1, file_name_2].each do |file_name|
|
53
53
|
file = File.join(@gen.path, file_name)
|
54
|
-
@gen.files.include?(file_name).should
|
54
|
+
@gen.files.include?(file_name).should be true
|
55
55
|
csv = CSV.open(file).count.should > 1
|
56
56
|
end
|
57
57
|
end
|
@@ -86,7 +86,7 @@ end
|
|
86
86
|
|
87
87
|
Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
|
88
88
|
meta = File.join(@gen.path, file_name)
|
89
|
-
@gen.files.include?(file_name).should
|
89
|
+
@gen.files.include?(file_name).should be true
|
90
90
|
dom = Nokogiri::XML(open(File.join(@gen.path, file_name)))
|
91
91
|
dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
|
92
92
|
dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
|
@@ -94,7 +94,7 @@ end
|
|
94
94
|
|
95
95
|
Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
|
96
96
|
eml = File.join(@gen.path, file_name)
|
97
|
-
@gen.files.include?(file_name).should
|
97
|
+
@gen.files.include?(file_name).should be true
|
98
98
|
end
|
99
99
|
|
100
100
|
Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
|
@@ -107,6 +107,6 @@ end
|
|
107
107
|
|
108
108
|
Then /^there should be a valid new archive file$/ do
|
109
109
|
dwc = DarwinCore.new('/tmp/dwc.tar.gz')
|
110
|
-
dwc.archive.valid?.should
|
110
|
+
dwc.archive.valid?.should be true
|
111
111
|
end
|
112
112
|
|
@@ -1,5 +1,6 @@
|
|
1
1
|
Given /^path to a dwc file "([^\"]*)"$/ do |arg1|
|
2
|
-
@dwca_file = File.expand_path(File.dirname(__FILE__) +
|
2
|
+
@dwca_file = File.expand_path(File.dirname(__FILE__) +
|
3
|
+
"../../../spec/files/" + arg1)
|
3
4
|
@tmp_dir = "/tmp"
|
4
5
|
end
|
5
6
|
|
@@ -8,11 +9,12 @@ When /^I create a new DarwinCore::Archive instance$/ do
|
|
8
9
|
end
|
9
10
|
|
10
11
|
Then /^I should find that the archive is valid$/ do
|
11
|
-
@dwca.valid?.should
|
12
|
+
@dwca.valid?.should be true
|
12
13
|
end
|
13
14
|
|
14
15
|
Then /^I should see what files the archive has$/ do
|
15
|
-
@dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml",
|
16
|
+
@dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml",
|
17
|
+
"meta.xml", "metadata.txt"]
|
16
18
|
end
|
17
19
|
|
18
20
|
When /^I delete expanded files$/ do
|
@@ -20,7 +22,7 @@ When /^I delete expanded files$/ do
|
|
20
22
|
end
|
21
23
|
|
22
24
|
Then /^they should disappear$/ do
|
23
|
-
@dwca.files.should
|
25
|
+
@dwca.files.should be nil
|
24
26
|
end
|
25
27
|
|
26
28
|
When /^I create a new DarwinCore instance$/ do
|
@@ -37,7 +39,7 @@ When /^I create DarwinCore::ClassificationNormalizer instance$/ do
|
|
37
39
|
end
|
38
40
|
|
39
41
|
Then /^instance should have a valid archive$/ do
|
40
|
-
@dwc.archive.valid?.should
|
42
|
+
@dwc.archive.valid?.should be true
|
41
43
|
end
|
42
44
|
|
43
45
|
Then /^instance should have a core$/ do
|
@@ -45,7 +47,7 @@ Then /^instance should have a core$/ do
|
|
45
47
|
end
|
46
48
|
|
47
49
|
Then /^I should see checksum$/ do
|
48
|
-
@dwc.checksum.should ==
|
50
|
+
@dwc.checksum.should == "7d94fc28ffaf434b66fbc790aa5ef00d834057bf"
|
49
51
|
end
|
50
52
|
|
51
53
|
When /^I check core data$/ do
|
@@ -64,7 +66,8 @@ And /^core\.file_path$/ do
|
|
64
66
|
end
|
65
67
|
|
66
68
|
And /^core\.id$/ do
|
67
|
-
@core.id.should == {:
|
69
|
+
@core.id.should == {index: 0,
|
70
|
+
term: "http://rs.tdwg.org/dwc/terms/TaxonID"}
|
68
71
|
end
|
69
72
|
|
70
73
|
And /^core\.fields$/ do
|
@@ -80,14 +83,21 @@ Then /^DarwinCore instance should have dwc\.metadata object$/ do
|
|
80
83
|
end
|
81
84
|
|
82
85
|
And /^I should find id, title, creators, metadata provider$/ do
|
83
|
-
@dwc.metadata.id.should ==
|
84
|
-
@dwc.metadata.title.should ==
|
86
|
+
@dwc.metadata.id.should == "leptogastrinae:version:2.5"
|
87
|
+
@dwc.metadata.title.should ==
|
88
|
+
"Leptogastrinae (Diptera: Asilidae) Classification"
|
85
89
|
@dwc.metadata.authors.should == [
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
@dwc.metadata.
|
90
|
+
{ last_name: "Bayless", email: "keith.bayless@gmail.com",
|
91
|
+
first_name: "Keith" },
|
92
|
+
{ last_name: "Dikow", email: "dshorthouse@eol.org", first_name: "Torsten" }
|
93
|
+
]
|
94
|
+
@dwc.metadata.abstract.should ==
|
95
|
+
"These are all the names in the Leptogastrinae classification."
|
96
|
+
@dwc.metadata.citation.should ==
|
97
|
+
"Dikow, Torsten. 2010. The Leptogastrinae classification."
|
98
|
+
@dwc.metadata.url.should ==
|
99
|
+
"http://leptogastrinae.lifedesks.org/files/leptogastrinae/"\
|
100
|
+
"classification_export/shared/leptogastrinae.tar.gz"
|
91
101
|
end
|
92
102
|
|
93
103
|
Then /^DarwinCore instance should have an extensions array$/ do
|
@@ -103,11 +113,18 @@ end
|
|
103
113
|
|
104
114
|
Then /^extension should have properties, data, file_path, coreid, fields$/ do
|
105
115
|
ext = @dwc.extensions[0]
|
106
|
-
ext.properties.should == {
|
116
|
+
ext.properties.should == {
|
117
|
+
ignoreHeaderLines: 1, encoding: "UTF-8",
|
118
|
+
rowType: "http://rs.gbif.org/ipt/terms/1.0/VernacularName",
|
119
|
+
fieldsEnclosedBy: "", fieldsTerminatedBy: "\\t", linesTerminatedBy: "\\n"
|
120
|
+
}
|
107
121
|
ext.data.class.should == Hash
|
108
122
|
ext.file_path.should match(/\/tmp\/dwc_[\d]+\/VernacularName.txt/)
|
109
|
-
ext.coreid.should == {:
|
110
|
-
ext.fields.should == [
|
123
|
+
ext.coreid.should == { index: 0 }
|
124
|
+
ext.fields.should == [
|
125
|
+
{ term: "http://rs.gbif.org/ecat/terms/vernacularName", index: 1 },
|
126
|
+
{ term: "http://rs.gbif.org/thesaurus/languageCode", index: 2 }
|
127
|
+
]
|
111
128
|
end
|
112
129
|
|
113
130
|
Given /^acces to DarwinCore gem$/ do
|
@@ -153,7 +170,7 @@ end
|
|
153
170
|
|
154
171
|
Then /^I can read its core content using block$/ do
|
155
172
|
res = []
|
156
|
-
@dwc.core.ignore_headers.should
|
173
|
+
@dwc.core.ignore_headers.should be true
|
157
174
|
read_result = @dwc.core.read(200) do |r, err|
|
158
175
|
res << [r.size, err.size]
|
159
176
|
end
|
@@ -164,7 +181,7 @@ end
|
|
164
181
|
Then /^I can read extensions content using block$/ do
|
165
182
|
res = []
|
166
183
|
ext = @dwc.extensions[0]
|
167
|
-
ext.ignore_headers.should
|
184
|
+
ext.ignore_headers.should be true
|
168
185
|
ext.read(200) do |r, err|
|
169
186
|
res << [r.size, err.size]
|
170
187
|
end
|
@@ -196,22 +213,24 @@ Then /^there are paths, synonyms and vernacular names in normalized classificati
|
|
196
213
|
if v.vernacular_names.size > 0
|
197
214
|
@vernaculars_are_generated = true
|
198
215
|
vn = v.vernacular_names[0]
|
199
|
-
(vn.respond_to?(
|
216
|
+
(vn.respond_to?("locality") && vn.respond_to?("country_code") &&
|
217
|
+
vn.respond_to?("language")).should be true
|
200
218
|
end
|
201
219
|
break if (@vernaculars_are_generated && @paths_are_generated && @synonyms_are_generated)
|
202
220
|
end
|
203
|
-
@paths_are_generated.should
|
204
|
-
@vernaculars_are_generated.should
|
205
|
-
@synonyms_are_generated.should
|
221
|
+
@paths_are_generated.should be true
|
222
|
+
@vernaculars_are_generated.should be true
|
223
|
+
@synonyms_are_generated.should be true
|
206
224
|
end
|
207
225
|
|
208
226
|
Then /^there are local_id and global_id methods in taxons and synonyms$/ do
|
209
227
|
@normalized_classification.each do |k, v|
|
210
228
|
if v.synonyms.size > 0
|
211
|
-
v.local_id.should ==
|
229
|
+
v.local_id.should == "2"
|
212
230
|
v.global_id.should == "97498f29-2501-440d-9452-f3817da0d6c2"
|
213
|
-
v.synonyms.first.local_id.should ==
|
214
|
-
v.synonyms.first.global_id.should ==
|
231
|
+
v.synonyms.first.local_id.should == "1"
|
232
|
+
v.synonyms.first.global_id.should ==
|
233
|
+
"e017ed01-407d-4d09-82c5-8b3d9fa76e35"
|
215
234
|
break
|
216
235
|
end
|
217
236
|
end
|
@@ -229,8 +248,8 @@ Then /^there are id paths, no canonical names paths in normalized classification
|
|
229
248
|
id_paths_generated = true
|
230
249
|
end
|
231
250
|
end
|
232
|
-
id_paths_generated.should
|
233
|
-
canonical_paths_generated.should
|
251
|
+
id_paths_generated.should be true
|
252
|
+
canonical_paths_generated.should be false
|
234
253
|
end
|
235
254
|
|
236
255
|
Then /^names used in classification can be accessed by "([^"]*)" method$/ do |name_strings|
|
data/features/support/env.rb
CHANGED
data/lib/dwc_archive.rb
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "fileutils"
|
4
|
+
require "ostruct"
|
5
|
+
require "digest"
|
6
|
+
require "csv"
|
7
|
+
require "logger"
|
8
|
+
require "nokogiri"
|
9
|
+
require "biodiversity"
|
10
|
+
require_relative "dwc_archive/xml_reader"
|
11
|
+
require_relative "dwc_archive/ingester"
|
12
|
+
require_relative "dwc_archive/errors"
|
13
|
+
require_relative "dwc_archive/expander"
|
14
|
+
require_relative "dwc_archive/archive"
|
15
|
+
require_relative "dwc_archive/core"
|
16
|
+
require_relative "dwc_archive/extension"
|
17
|
+
require_relative "dwc_archive/metadata"
|
18
|
+
require_relative "dwc_archive/generator"
|
19
|
+
require_relative "dwc_archive/generator_meta_xml"
|
20
|
+
require_relative "dwc_archive/generator_eml_xml"
|
21
|
+
require_relative "dwc_archive/taxon_normalized"
|
22
|
+
require_relative "dwc_archive/gnub_taxon"
|
23
|
+
require_relative "dwc_archive/classification_normalizer"
|
24
|
+
require_relative "dwc_archive/version"
|
25
|
+
|
26
|
+
# Main class for handling Darwin Core Archive files.
#
# Creating an instance builds a DarwinCore::Archive for the given path (which
# unpacks the archive into a temporary directory) and wraps the core file, the
# metadata and every extension declared in the archive's meta data.
class DarwinCore
  DEFAULT_TMP_DIR = "/tmp"
  VernacularNormalized = Struct.new(:name, :language, :locality, :country_code)
  SynonymNormalized = Struct.new(:id, :name, :canonical_name, :status, :source,
                                 :local_id, :global_id)

  class << self
    attr_writer :logger

    # Recursively removes +path+ when it exists; a nil or missing path is a
    # no-op.
    def clean(path)
      # File.exist? replaces FileTest.exists?, which was removed in Ruby 3.2.
      FileUtils.rm_rf(path) if path && File.exist?(path)
    end

    # Returns the sorted entry names found in directory +path+, or nil when
    # +path+ is nil or does not exist.
    def files(path)
      return nil unless path && File.exist?(path)

      # Only the "." and ".." pseudo entries are dropped. The previous regexp
      # (/[.]{1,2}$/) also discarded every real entry whose name ends in a dot.
      Dir.entries(path).reject { |entry| %w[. ..].include?(entry) }.sort
    end

    # Builds a randomized "dwc_<number>" path inside +tmp_dir+.
    def random_path(tmp_dir)
      File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
    end

    # True when +field+ holds one of the values treated as "no value".
    # NOTE(review): "/N" may have been intended as the "\N" null marker -
    # confirm against the data producers before changing it.
    def nil_field?(field)
      [nil, "", "/N"].include?(field)
    end

    # Removes every "dwc_<digits>" directory found directly under +tmp_dir+.
    def clean_all(tmp_dir = DEFAULT_TMP_DIR)
      Dir.entries(tmp_dir).each do |entry|
        # \A and \z anchor the whole entry name; ^ and $ only anchor lines.
        next unless entry.match?(/\Adwc_\d+\z/)

        path = File.join(tmp_dir, entry)
        FileUtils.rm_rf(path) if File.directory?(path)
      end
    end

    # Logger shared by all instances; defaults to a logger that discards
    # everything.
    def logger
      @logger ||= Logger.new(nil)
    end

    # Restores the default (discarding) logger.
    def logger_reset
      self.logger = Logger.new(nil)
    end

    # Writes "|obj_id|message|" to the shared logger using the given logger
    # method (:info by default).
    def logger_write(obj_id, message, method = :info)
      logger.public_send(method, "|#{obj_id}|#{message}|")
    end
  end

  attr_reader :archive, :core, :metadata, :classification_normalizer
  alias eml metadata

  # dwc_path - path to the Darwin Core Archive file
  # tmp_dir  - directory handed to DarwinCore::Archive for unpacking
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
    @dwc_path = dwc_path
    @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
    @core = DarwinCore::Core.new(self)
    @metadata = DarwinCore::Metadata.new(@archive)
    extensions
  end

  # Name of the archive file without its directory.
  def file_name
    File.split(@dwc_path).last
  end

  # Absolute path of the archive file.
  def path
    File.expand_path(@dwc_path)
  end

  # Generates a hash from the classification data with a path to each node
  # and lists of synonyms and vernacular names. Returns nil when the core
  # has no parent id field.
  def normalize_classification
    return nil unless parent_id?

    @classification_normalizer ||=
      DarwinCore::ClassificationNormalizer.new(self)
    @classification_normalizer.normalize
  end

  # True when one of the core fields mentions a higherTaxonID or
  # parentNameUsageID term (case-insensitive).
  def parent_id?
    @core.fields.join("|").downcase.
      match?(/highertaxonid|parentnameusageid/)
  end

  # SHA1 hex digest of the archive file. The file is streamed in binary mode
  # instead of being loaded into memory as text.
  def checksum
    Digest::SHA1.file(@dwc_path).hexdigest
  end

  # Extension objects declared in the archive's meta data (memoized); an
  # empty array when the archive declares none.
  def extensions
    return @extensions if @extensions

    root_key = @archive.meta.keys[0]
    ext = @archive.meta[root_key][:extension]
    return @extensions = [] unless ext

    # A single extension is not wrapped in an array by the meta reader.
    # Kernel#Array is deliberately not used: it would split a Hash into pairs.
    ext = [ext] unless ext.is_a?(Array)
    @extensions = ext.map { |e| DarwinCore::Extension.new(self, e) }
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Deals with handling a DarwinCoreArchive file, and provides meta
  # information and files information about the archive.
  #
  # Construction unpacks the archive through DarwinCore::Expander and reads
  # meta.xml (required) and eml.xml (optional). An archive without meta.xml
  # is cleaned up and rejected with InvalidArchiveError.
  class Archive
    attr_reader :meta, :eml

    # archive_path - path to the archive file
    # tmp_dir      - directory the expander unpacks into
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
      @expander.unpack
      prepare_metadata
    end

    # True only when the unpacked directory exists and contains meta.xml.
    # Always returns a strict boolean (never nil).
    def valid?
      path = @expander.path
      # File.exist? replaces FileTest.exists?, which was removed in Ruby 3.2.
      return false unless path && File.exist?(path)

      (files || []).include?("meta.xml")
    end

    # Names of the unpacked files, as reported by the expander.
    def files
      @expander.files
    end

    # Directory the archive was unpacked into.
    def files_path
      @expander.path
    end

    # Removes the unpacked files.
    def clean
      @expander.clean
    end

    private

    # Reads meta.xml and eml.xml, or cleans up and raises InvalidArchiveError
    # when the archive is not valid.
    def prepare_metadata
      unless valid?
        clean
        raise InvalidArchiveError
      end

      prepare_meta_file
      prepare_eml_file
    end

    def prepare_meta_file
      @meta = read_xml("meta.xml")
    end

    def prepare_eml_file
      @eml = nil
      return unless (files || []).include?("eml.xml")

      @eml = read_xml("eml.xml")
    end

    # Parses one unpacked XML file with DarwinCore::XmlReader. The block form
    # of File.open closes the handle even when parsing raises; previously the
    # handles were never closed.
    # NOTE(review): assumes XmlReader.from_xml consumes the IO before it
    # returns - confirm against xml_reader.rb.
    def read_xml(file_name)
      File.open(File.join(@expander.path, file_name)) do |xml_file|
        DarwinCore::XmlReader.from_xml(xml_file)
      end
    end
  end
end
|
@@ -0,0 +1,382 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Returns a tree representation of a Darwin Core file with vernacular names
  # and synonyms attached to the taxon nodes.
  #
  # After #normalize:
  #   normalized_data - hash of taxon id => taxon object
  #   tree            - nested hash of taxon ids following parent/child links
  #   error_names     - array of hashes describing rows that could not be
  #                     placed (each has an :error symbol)
  class ClassificationNormalizer
    LINNEAN_RANKS = %i[kingdom phylum class order family genus
                       subgenus].freeze
    private_constant :LINNEAN_RANKS

    attr_reader :error_names, :tree, :normalized_data, :dwc
    alias darwin_core dwc

    # dwc_instance - object exposing #core and #extensions; each of them must
    #                respond to #fields, #id and #read.
    def initialize(dwc_instance)
      @dwc = dwc_instance
      @core_fields = find_fields(@dwc.core)
      @extensions = @dwc.extensions.map { |ext| [ext, find_fields(ext)] }
      reset_state
    end

    # Remembers a scientific/canonical name string (duplicates are ignored).
    def add_name_string(name_string)
      @name_strings[name_string] ||= 1
    end

    # Remembers a vernacular name string (duplicates are ignored).
    def add_vernacular_name_string(name_string)
      @vernacular_name_strings[name_string] ||= 1
    end

    # Collected name strings: an array by default, or the backing hash when
    # called with with_hash: true.
    def name_strings(opts = {})
      process_strings(@name_strings, opts)
    end

    # Collected vernacular name strings; same options as #name_strings.
    def vernacular_name_strings(opts = {})
      process_strings(@vernacular_name_strings, opts)
    end

    # Reads the core (and optionally the extensions) and returns the
    # normalized data hash.
    #
    # opts:
    #   with_canonical_names - compute canonical names and name paths
    #                          (default true)
    #   with_extensions      - ingest synonym/vernacular extensions
    #                          (default true)
    #
    # Raises DarwinCore::CoreFileError when the core lacks an id or a
    # scientific name field.
    def normalize(opts = {})
      opts = { with_canonical_names: true,
               with_extensions: true }.merge(opts)
      @with_canonical_names = opts[:with_canonical_names]
      # Start from a clean slate so that calling normalize again does not
      # accumulate duplicate errors or keep a stale tree.
      reset_state
      log("Started normalization of the classification")
      ingest_core
      log("Calculating the classification parent/child paths")
      if parent_id?
        calculate_classification_path
      else
        @normalized_data.each_key { |id| @tree[id] = {} }
      end
      if opts[:with_extensions]
        log("Ingesting data from extensions")
        ingest_extensions
      end
      @normalized_data
    end

    private

    def reset_state
      @normalized_data = {}
      @synonyms = {} # synonym id => id of its accepted taxon
      @name_strings = {}
      @vernacular_name_strings = {}
      @error_names = []
      @tree = {}
    end

    def log(message)
      DarwinCore.logger_write(@dwc.object_id, message)
    end

    def process_strings(strings, opts)
      opts = { with_hash: false }.merge(opts)
      opts[:with_hash] ? strings : strings.keys
    end

    # Canonical form of a scientific name, falling back to the name itself
    # when the parser yields nothing. Returns nil when canonical names are
    # switched off.
    def get_canonical_name(a_scientific_name)
      return nil unless @with_canonical_names

      canonical_name = Biodiversity::Parser.parse(a_scientific_name).
                       dig(:canonical, :simple)
      canonical_name.to_s.empty? ? a_scientific_name : canonical_name
    end

    # Maps downcased term names (last segment of the term URI) to column
    # indices; the element's id column is stored under :id.
    def find_fields(element)
      data = element.fields.each_with_object({}) do |f, h|
        field = f[:term].split("/")[-1]
        field = field ? field.downcase.to_sym : ""
        h[field] = f[:index].to_i
      end
      data[:id] = element.id[:index] if element.id
      data
    end

    def status_synonym?(status)
      status&.match?(/^syn/)
    end

    # Registers a core row as a synonym of the taxon whose id is found in
    # column +accepted_id_index+ of the row.
    def add_synonym_from_core(accepted_id_index, row)
      cf = @core_fields
      accepted_id = row[accepted_id_index]
      # Store the accepted taxon *id*. Previously the column index was stored,
      # so deprecated parents could never be resolved to a current taxon.
      @synonyms[row[cf[:id]]] = accepted_id
      taxon = (@normalized_data[accepted_id] ||=
                 DarwinCore::TaxonNormalized.new)
      synonym = SynonymNormalized.new(
        row[cf[:id]],
        row[cf[:scientificname]],
        row[cf[:canonicalname]],
        cf[:taxonomicstatus] ? row[cf[:taxonomicstatus]] : nil,
        cf[:source] ? row[cf[:source]] : nil,
        cf[:localid] ? row[cf[:localid]] : nil,
        cf[:globalid] ? row[cf[:globalid]] : nil
      )
      taxon.synonyms << synonym
      register_synonym_names(synonym)
    end

    # Adds the synonym's names to the name strings. The canonical name is nil
    # when canonical names are switched off and must not be recorded, which
    # mirrors how taxon names are handled when the core is read.
    def register_synonym_names(synonym)
      add_name_string(synonym.name)
      canonical = synonym.canonical_name
      add_name_string(canonical) unless canonical.nil? || canonical.empty?
    end

    # Normalizes the scientific name of +row+ in place and appends the
    # canonical name as a new last column (its index is recorded in
    # fields[:canonicalname]). A separate authorship column is merged into
    # the scientific name.
    def set_scientific_name(row, fields)
      name_index = fields[:scientificname]
      row[name_index] = "N/A" unless row[name_index]
      canonical_name = nil
      scientific_name = row[name_index].strip
      if separate_canonical_and_authorship?(row, fields)
        canonical_name = scientific_name if @with_canonical_names
        scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
      elsif @with_canonical_names
        canonical_name = get_canonical_name(row[name_index])
      end
      fields[:canonicalname] = row.size
      row << canonical_name
      row[name_index] = scientific_name
    end

    # True when the row carries a non-empty authorship that is not already
    # part of the scientific name.
    def separate_canonical_and_authorship?(row, fields)
      authorship = ""
      if fields[:scientificnameauthorship]
        authorship = row[fields[:scientificnameauthorship]].to_s.strip
      end
      !(authorship.empty? || row[fields[:scientificname]].index(authorship))
    end

    def ingest_core
      @normalized_data = {}
      unless @core_fields[:id] && @core_fields[:scientificname]
        raise(DarwinCore::CoreFileError,
              "Darwin Core core fields must contain taxon id and scientific name")
      end
      # Each batch is a pair: [data rows, rows that failed to be read].
      @dwc.core.read do |rows|
        rows[1].each do |error|
          @error_names << { data: error, error: :reading_or_encoding_error }
        end
        rows[0].each { |row| ingest_core_row(row) }
      end
    end

    # Routes one core row: synonym rows are attached to their accepted taxon,
    # everything else becomes (or updates) a taxon.
    def ingest_core_row(row)
      set_scientific_name(row, @core_fields)
      accepted_index = @core_fields[:acceptednameusageid]
      if accepted_index
        accepted_id = row[accepted_index]
        # A blank accepted id means "no accepted name given", not "synonym of
        # the taxon with an empty id".
        if !DarwinCore.nil_field?(accepted_id) &&
           accepted_id != row[@core_fields[:id]]
          return add_synonym_from_core(accepted_index, row)
        end
      elsif @core_fields[:taxonomicstatus] &&
            status_synonym?(row[@core_fields[:taxonomicstatus]])
        # Without an accepted-name column the parent column points to the
        # accepted taxon; with no parent column the synonym is dropped.
        add_synonym_from_core(parent_id, row) if parent_id?
        return
      end
      add_taxon_from_core(row)
    end

    def add_taxon_from_core(row)
      cf = @core_fields
      taxon = (@normalized_data[row[cf[:id]]] ||= new_taxon)
      if gnub_archive?
        taxon.uuid = row[cf[:originalnameusageid]]
        taxon.uuid_path = row[cf[:originalnameusageidpath]].split("|")
      end
      taxon.id = row[cf[:id]]
      taxon.current_name = row[cf[:scientificname]]
      taxon.current_name_canonical = row[cf[:canonicalname]]
      taxon.parent_id = parent_id? ? row[parent_id] : nil
      taxon.rank = row[cf[:taxonrank]] if cf[:taxonrank]
      taxon.status = row[cf[:taxonomicstatus]] if cf[:taxonomicstatus]
      taxon.source = row[cf[:source]] if cf[:source]
      taxon.local_id = row[cf[:localid]] if cf[:localid]
      taxon.global_id = row[cf[:globalid]] if cf[:globalid]
      taxon.linnean_classification_path =
        get_linnean_classification_path(row, taxon)
      add_name_string(taxon.current_name)
      canonical = taxon.current_name_canonical
      add_name_string(canonical) if canonical && !canonical.empty?
    end

    def new_taxon
      if gnub_archive?
        DarwinCore::GnubTaxon.new
      else
        DarwinCore::TaxonNormalized.new
      end
    end

    def parent_id?
      return @has_parent_id if defined?(@has_parent_id)

      @has_parent_id = @core_fields.key?(:highertaxonid) ||
                       @core_fields.key?(:parentnameusageid)
    end

    # Column index of the parent id.
    def parent_id
      @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
    end

    def calculate_classification_path
      @paths_num = 0
      @normalized_data.each_value do |taxon|
        get_classification_path(taxon) if taxon.classification_path_id.empty?
      end
    end

    # Computes the id path (and canonical name path) of +taxon+, computing
    # the paths of its ancestors first when necessary, and hooks the taxon
    # into the tree. Returns "error" when the taxon cannot be placed.
    def get_classification_path(taxon)
      return unless taxon.classification_path_id.empty?

      @paths_num += 1
      log("Calculated #{@paths_num} paths") if (@paths_num % 10_000).zero?
      current_node = { taxon.id => {} }
      if DarwinCore.nil_field?(taxon.parent_id)
        return add_root_path(taxon, current_node)
      end

      parent = find_parent(taxon)
      return "error" unless parent
      if parent.classification_path_id.empty? &&
         compute_parent_path(taxon, parent) == "error"
        return "error"
      end

      attach_to_parent(taxon, parent, current_node)
    end

    def add_root_path(taxon, current_node)
      if @with_canonical_names
        taxon.classification_path << taxon.current_name_canonical
      end
      taxon.classification_path_id << taxon.id
      @tree.merge!(current_node)
    end

    # Parent taxon of +taxon+. When the parent id belongs to a synonym, the
    # synonym's accepted taxon is used instead and a :deprecated_parent entry
    # is recorded; nil when no parent can be found.
    def find_parent(taxon)
      parent = @normalized_data[taxon.parent_id]
      return parent if parent

      accepted_id = @synonyms[taxon.parent_id]
      current_parent = accepted_id && @normalized_data[accepted_id]
      @error_names << { data: taxon,
                        error: :deprecated_parent,
                        current_parent: current_parent }
      current_parent
    end

    # Recursively computes the path of +parent+; very deep (or cyclic)
    # hierarchies exhaust the stack and are reported as :too_deep_hierarchy.
    def compute_parent_path(taxon, parent)
      get_classification_path(parent)
    rescue SystemStackError
      @error_names << { data: taxon,
                        error: :too_deep_hierarchy,
                        current_parent: nil }
      "error"
    end

    # Extends the parent's paths with +taxon+ and adds the taxon's node under
    # the parent's node in the tree.
    def attach_to_parent(taxon, parent, current_node)
      if @with_canonical_names
        taxon.classification_path += parent.classification_path +
                                     [taxon.current_name_canonical]
      end
      taxon.classification_path_id += parent.classification_path_id +
                                      [taxon.id]
      parent_node = parent.classification_path_id.
                    inject(@tree) { |node, id| node && node[id] }
      unless parent_node
        log("Error 'parent node is missing from the tree' taxon #{taxon.id}")
        return "error"
      end
      parent_node.merge!(current_node)
    end

    def ingest_extensions
      @extensions.each do |extension|
        ext, fields = extension
        synonym_file = File.split(ext.file_path).last.match?(/synonym/i)
        if synonym_file && fields.key?(:scientificname)
          ingest_synonyms(extension)
        end
        ingest_vernaculars(extension) if fields.key?(:vernacularname)
      end
    end

    def ingest_synonyms(extension)
      log("Ingesting synonyms extension")
      ext, fields = extension
      ext.read do |rows|
        rows[0].each do |row|
          add_synonym(process_synonym(row, fields), row, fields)
        end
      end
    end

    def add_synonym(synonym, record, fields)
      taxon = @normalized_data[record[fields[:id]]]
      if taxon
        taxon.synonyms << synonym
        register_synonym_names(synonym)
      else
        @error_names << { taxon: synonym,
                          error: :synonym_of_unknown_taxa }
      end
    end

    def process_synonym(record, fields)
      set_scientific_name(record, fields)
      SynonymNormalized.new(
        nil,
        record[fields[:scientificname]],
        record[fields[:canonicalname]],
        fields[:taxonomicstatus] ? record[fields[:taxonomicstatus]] : nil,
        fields[:source] ? record[fields[:source]] : nil,
        fields[:localid] ? record[fields[:localid]] : nil,
        fields[:globalid] ? record[fields[:globalid]] : nil
      )
    end

    def ingest_vernaculars(extension)
      log("Ingesting vernacular names extension")
      ext, fields = extension
      ext.read do |rows|
        rows[0].each { |row| extract_vernaculars_from_row(row, fields) }
      end
    end

    def extract_vernaculars_from_row(row, fields)
      language = find_vernacular_language(row, fields)
      locality = fields[:locality] ? row[fields[:locality]] : nil
      country_code = fields[:countrycode] ? row[fields[:countrycode]] : nil

      vernacular = VernacularNormalized.new(
        row[fields[:vernacularname]], language, locality, country_code
      )
      taxon = @normalized_data[row[fields[:id]]]
      if taxon
        taxon.vernacular_names << vernacular
        add_vernacular_name_string(vernacular.name)
      else
        @error_names << { vernacular_name: vernacular,
                          error: :vernacular_of_unknown_taxa }
      end
    end

    # Value of the language column, falling back to the languageCode column.
    def find_vernacular_language(row, fields)
      (fields[:language] && row[fields[:language]]) ||
        (fields[:languagecode] && row[fields[:languagecode]]) || nil
    end

    # Collects [value, rank] pairs for every Linnean rank column present in
    # the core.
    def get_linnean_classification_path(row, _taxon)
      LINNEAN_RANKS.each_with_object([]) do |clade, res|
        res << [row[@core_fields[clade]], clade] if @core_fields[clade]
      end
    end

    def gnub_archive?
      @core_fields.key?(:originalnameusageidpath)
    end
  end
end
|