dwc-archive 0.9.10 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -1
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +4 -7
  7. data/CHANGELOG +14 -8
  8. data/Gemfile +3 -1
  9. data/LICENSE +1 -1
  10. data/README.md +119 -107
  11. data/Rakefile +13 -36
  12. data/dwc-archive.gemspec +23 -19
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +124 -0
  17. data/lib/dwc_archive/archive.rb +60 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
  21. data/lib/dwc_archive/expander.rb +88 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +91 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +57 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +90 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +96 -105
  36. data/spec/lib/core_spec.rb +43 -41
  37. data/spec/lib/darwin_core_spec.rb +108 -138
  38. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  39. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  40. data/spec/lib/generator_spec.rb +77 -69
  41. data/spec/lib/gnub_taxon_spec.rb +15 -17
  42. data/spec/lib/metadata_spec.rb +50 -41
  43. data/spec/lib/taxon_normalized_spec.rb +62 -65
  44. data/spec/lib/xml_reader_spec.rb +9 -12
  45. data/spec/spec_helper.rb +54 -51
  46. metadata +105 -88
  47. data/.rvmrc +0 -1
  48. data/] +0 -40
  49. data/lib/dwc-archive.rb +0 -107
  50. data/lib/dwc-archive/archive.rb +0 -40
  51. data/lib/dwc-archive/classification_normalizer.rb +0 -428
  52. data/lib/dwc-archive/core.rb +0 -17
  53. data/lib/dwc-archive/expander.rb +0 -84
  54. data/lib/dwc-archive/generator.rb +0 -85
  55. data/lib/dwc-archive/generator_eml_xml.rb +0 -86
  56. data/lib/dwc-archive/generator_meta_xml.rb +0 -58
  57. data/lib/dwc-archive/ingester.rb +0 -101
  58. data/lib/dwc-archive/metadata.rb +0 -48
  59. data/lib/dwc-archive/version.rb +0 -3
  60. data/lib/dwc-archive/xml_reader.rb +0 -64
@@ -19,7 +19,7 @@ end
19
19
 
20
20
  Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
21
21
  file = File.join(@gen.path, file_name)
22
- @gen.files.include?(file_name).should be_true
22
+ @gen.files.include?(file_name).should be true
23
23
  csv = CSV.open(file).count.should == 4
24
24
  end
25
25
 
@@ -51,7 +51,7 @@ end
51
51
  Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
52
52
  [file_name_1, file_name_2].each do |file_name|
53
53
  file = File.join(@gen.path, file_name)
54
- @gen.files.include?(file_name).should be_true
54
+ @gen.files.include?(file_name).should be true
55
55
  csv = CSV.open(file).count.should > 1
56
56
  end
57
57
  end
@@ -86,7 +86,7 @@ end
86
86
 
87
87
  Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
88
88
  meta = File.join(@gen.path, file_name)
89
- @gen.files.include?(file_name).should be_true
89
+ @gen.files.include?(file_name).should be true
90
90
  dom = Nokogiri::XML(open(File.join(@gen.path, file_name)))
91
91
  dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
92
92
  dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
@@ -94,7 +94,7 @@ end
94
94
 
95
95
  Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
96
96
  eml = File.join(@gen.path, file_name)
97
- @gen.files.include?(file_name).should be_true
97
+ @gen.files.include?(file_name).should be true
98
98
  end
99
99
 
100
100
  Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
@@ -107,6 +107,6 @@ end
107
107
 
108
108
  Then /^there should be a valid new archive file$/ do
109
109
  dwc = DarwinCore.new('/tmp/dwc.tar.gz')
110
- dwc.archive.valid?.should be_true
110
+ dwc.archive.valid?.should be true
111
111
  end
112
112
 
@@ -1,5 +1,6 @@
1
1
  Given /^path to a dwc file "([^\"]*)"$/ do |arg1|
2
- @dwca_file = File.expand_path(File.dirname(__FILE__) + "../../../spec/files/" + arg1)
2
+ @dwca_file = File.expand_path(File.dirname(__FILE__) +
3
+ "../../../spec/files/" + arg1)
3
4
  @tmp_dir = "/tmp"
4
5
  end
5
6
 
@@ -8,11 +9,12 @@ When /^I create a new DarwinCore::Archive instance$/ do
8
9
  end
9
10
 
10
11
  Then /^I should find that the archive is valid$/ do
11
- @dwca.valid?.should be_true
12
+ @dwca.valid?.should be true
12
13
  end
13
14
 
14
15
  Then /^I should see what files the archive has$/ do
15
- @dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml", "meta.xml", "metadata.txt"]
16
+ @dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml",
17
+ "meta.xml", "metadata.txt"]
16
18
  end
17
19
 
18
20
  When /^I delete expanded files$/ do
@@ -20,7 +22,7 @@ When /^I delete expanded files$/ do
20
22
  end
21
23
 
22
24
  Then /^they should disappear$/ do
23
- @dwca.files.should be_nil
25
+ @dwca.files.should be nil
24
26
  end
25
27
 
26
28
  When /^I create a new DarwinCore instance$/ do
@@ -37,7 +39,7 @@ When /^I create DarwinCore::ClassificationNormalizer instance$/ do
37
39
  end
38
40
 
39
41
  Then /^instance should have a valid archive$/ do
40
- @dwc.archive.valid?.should be_true
42
+ @dwc.archive.valid?.should be true
41
43
  end
42
44
 
43
45
  Then /^instance should have a core$/ do
@@ -45,7 +47,7 @@ Then /^instance should have a core$/ do
45
47
  end
46
48
 
47
49
  Then /^I should see checksum$/ do
48
- @dwc.checksum.should == '7d94fc28ffaf434b66fbc790aa5ef00d834057bf'
50
+ @dwc.checksum.should == "7d94fc28ffaf434b66fbc790aa5ef00d834057bf"
49
51
  end
50
52
 
51
53
  When /^I check core data$/ do
@@ -64,7 +66,8 @@ And /^core\.file_path$/ do
64
66
  end
65
67
 
66
68
  And /^core\.id$/ do
67
- @core.id.should == {:index => 0, :term => 'http://rs.tdwg.org/dwc/terms/TaxonID'}
69
+ @core.id.should == {index: 0,
70
+ term: "http://rs.tdwg.org/dwc/terms/TaxonID"}
68
71
  end
69
72
 
70
73
  And /^core\.fields$/ do
@@ -80,14 +83,21 @@ Then /^DarwinCore instance should have dwc\.metadata object$/ do
80
83
  end
81
84
 
82
85
  And /^I should find id, title, creators, metadata provider$/ do
83
- @dwc.metadata.id.should == 'leptogastrinae:version:2.5'
84
- @dwc.metadata.title.should == 'Leptogastrinae (Diptera: Asilidae) Classification'
86
+ @dwc.metadata.id.should == "leptogastrinae:version:2.5"
87
+ @dwc.metadata.title.should ==
88
+ "Leptogastrinae (Diptera: Asilidae) Classification"
85
89
  @dwc.metadata.authors.should == [
86
- {:last_name=>"Bayless", :email=>"keith.bayless@gmail.com", :first_name=>"Keith"},
87
- {:last_name=>"Dikow", :email=>"dshorthouse@eol.org", :first_name=>"Torsten"}]
88
- @dwc.metadata.abstract.should == 'These are all the names in the Leptogastrinae classification.'
89
- @dwc.metadata.citation.should == 'Dikow, Torsten. 2010. The Leptogastrinae classification.'
90
- @dwc.metadata.url.should == 'http://leptogastrinae.lifedesks.org/files/leptogastrinae/classification_export/shared/leptogastrinae.tar.gz'
90
+ { last_name: "Bayless", email: "keith.bayless@gmail.com",
91
+ first_name: "Keith" },
92
+ { last_name: "Dikow", email: "dshorthouse@eol.org", first_name: "Torsten" }
93
+ ]
94
+ @dwc.metadata.abstract.should ==
95
+ "These are all the names in the Leptogastrinae classification."
96
+ @dwc.metadata.citation.should ==
97
+ "Dikow, Torsten. 2010. The Leptogastrinae classification."
98
+ @dwc.metadata.url.should ==
99
+ "http://leptogastrinae.lifedesks.org/files/leptogastrinae/"\
100
+ "classification_export/shared/leptogastrinae.tar.gz"
91
101
  end
92
102
 
93
103
  Then /^DarwinCore instance should have an extensions array$/ do
@@ -103,11 +113,18 @@ end
103
113
 
104
114
  Then /^extension should have properties, data, file_path, coreid, fields$/ do
105
115
  ext = @dwc.extensions[0]
106
- ext.properties.should == {:ignoreHeaderLines=>1, :encoding=>"UTF-8", :rowType=>"http://rs.gbif.org/ipt/terms/1.0/VernacularName", :fieldsEnclosedBy=>"", :fieldsTerminatedBy=>"\\t", :linesTerminatedBy=>"\\n"}
116
+ ext.properties.should == {
117
+ ignoreHeaderLines: 1, encoding: "UTF-8",
118
+ rowType: "http://rs.gbif.org/ipt/terms/1.0/VernacularName",
119
+ fieldsEnclosedBy: "", fieldsTerminatedBy: "\\t", linesTerminatedBy: "\\n"
120
+ }
107
121
  ext.data.class.should == Hash
108
122
  ext.file_path.should match(/\/tmp\/dwc_[\d]+\/VernacularName.txt/)
109
- ext.coreid.should == {:index=>0}
110
- ext.fields.should == [{:term=>"http://rs.gbif.org/ecat/terms/vernacularName", :index=>1}, {:term=>"http://rs.gbif.org/thesaurus/languageCode", :index=>2}]
123
+ ext.coreid.should == { index: 0 }
124
+ ext.fields.should == [
125
+ { term: "http://rs.gbif.org/ecat/terms/vernacularName", index: 1 },
126
+ { term: "http://rs.gbif.org/thesaurus/languageCode", index: 2 }
127
+ ]
111
128
  end
112
129
 
113
130
  Given /^acces to DarwinCore gem$/ do
@@ -153,7 +170,7 @@ end
153
170
 
154
171
  Then /^I can read its core content using block$/ do
155
172
  res = []
156
- @dwc.core.ignore_headers.should be_true
173
+ @dwc.core.ignore_headers.should be true
157
174
  read_result = @dwc.core.read(200) do |r, err|
158
175
  res << [r.size, err.size]
159
176
  end
@@ -164,7 +181,7 @@ end
164
181
  Then /^I can read extensions content using block$/ do
165
182
  res = []
166
183
  ext = @dwc.extensions[0]
167
- ext.ignore_headers.should be_true
184
+ ext.ignore_headers.should be true
168
185
  ext.read(200) do |r, err|
169
186
  res << [r.size, err.size]
170
187
  end
@@ -196,22 +213,24 @@ Then /^there are paths, synonyms and vernacular names in normalized classificati
196
213
  if v.vernacular_names.size > 0
197
214
  @vernaculars_are_generated = true
198
215
  vn = v.vernacular_names[0]
199
- (vn.respond_to?('locality') && vn.respond_to?('country_code') && vn.respond_to?('language')).should be_true
216
+ (vn.respond_to?("locality") && vn.respond_to?("country_code") &&
217
+ vn.respond_to?("language")).should be true
200
218
  end
201
219
  break if (@vernaculars_are_generated && @paths_are_generated && @synonyms_are_generated)
202
220
  end
203
- @paths_are_generated.should be_true
204
- @vernaculars_are_generated.should be_true
205
- @synonyms_are_generated.should be_true
221
+ @paths_are_generated.should be true
222
+ @vernaculars_are_generated.should be true
223
+ @synonyms_are_generated.should be true
206
224
  end
207
225
 
208
226
  Then /^there are local_id and global_id methods in taxons and synonyms$/ do
209
227
  @normalized_classification.each do |k, v|
210
228
  if v.synonyms.size > 0
211
- v.local_id.should == '2'
229
+ v.local_id.should == "2"
212
230
  v.global_id.should == "97498f29-2501-440d-9452-f3817da0d6c2"
213
- v.synonyms.first.local_id.should == '1'
214
- v.synonyms.first.global_id.should == "e017ed01-407d-4d09-82c5-8b3d9fa76e35"
231
+ v.synonyms.first.local_id.should == "1"
232
+ v.synonyms.first.global_id.should ==
233
+ "e017ed01-407d-4d09-82c5-8b3d9fa76e35"
215
234
  break
216
235
  end
217
236
  end
@@ -229,8 +248,8 @@ Then /^there are id paths, no canonical names paths in normalized classification
229
248
  id_paths_generated = true
230
249
  end
231
250
  end
232
- id_paths_generated.should be_true
233
- canonical_paths_generated.should be_false
251
+ id_paths_generated.should be true
252
+ canonical_paths_generated.should be false
234
253
  end
235
254
 
236
255
  Then /^names used in classification can be accessed by "([^"]*)" method$/ do |name_strings|
@@ -1,4 +1,4 @@
1
1
  $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
2
- require 'dwc-archive'
2
+ require 'dwc_archive'
3
3
 
4
4
  require 'rspec/expectations'
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "ostruct"
5
+ require "digest"
6
+ require "csv"
7
+ require "logger"
8
+ require "nokogiri"
9
+ require "biodiversity"
10
+ require_relative "dwc_archive/xml_reader"
11
+ require_relative "dwc_archive/ingester"
12
+ require_relative "dwc_archive/errors"
13
+ require_relative "dwc_archive/expander"
14
+ require_relative "dwc_archive/archive"
15
+ require_relative "dwc_archive/core"
16
+ require_relative "dwc_archive/extension"
17
+ require_relative "dwc_archive/metadata"
18
+ require_relative "dwc_archive/generator"
19
+ require_relative "dwc_archive/generator_meta_xml"
20
+ require_relative "dwc_archive/generator_eml_xml"
21
+ require_relative "dwc_archive/taxon_normalized"
22
+ require_relative "dwc_archive/gnub_taxon"
23
+ require_relative "dwc_archive/classification_normalizer"
24
+ require_relative "dwc_archive/version"
25
+
26
# Main entry point for working with a Darwin Core Archive file.
#
# An instance unpacks the archive (through DarwinCore::Archive), exposes its
# core file, metadata and extensions, and can build a normalized
# classification from the core data.
class DarwinCore
  DEFAULT_TMP_DIR = "/tmp"
  VernacularNormalized = Struct.new(:name, :language, :locality, :country_code)
  SynonymNormalized = Struct.new(:id, :name, :canonical_name, :status, :source,
                                 :local_id, :global_id)

  class << self
    attr_writer :logger

    # Recursively removes +path+ when it exists; a nil or missing path is a
    # no-op.
    def clean(path)
      # File.exist? replaces FileTest.exists?, which was removed in Ruby 3.2.
      FileUtils.rm_rf(path) if path && File.exist?(path)
    end

    # Returns the sorted entry names found in +path+, or nil when +path+ is
    # nil or does not exist.
    def files(path)
      return nil unless path && File.exist?(path)

      # Only the "." and ".." pseudo-entries are dropped; a real file whose
      # name happens to end with a dot is kept.
      Dir.entries(path).reject { |entry| %w[. ..].include?(entry) }.sort
    end

    # Builds a randomized "dwc_<number>" path inside +tmp_dir+.
    def random_path(tmp_dir)
      File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
    end
  end

  attr_reader :archive, :core, :metadata, :classification_normalizer
  alias eml metadata

  # True when +field+ is nil, an empty string or the "/N" marker.
  def self.nil_field?(field)
    [nil, "", "/N"].include?(field)
  end

  # Removes every directory named "dwc_<digits>" found directly in +tmp_dir+.
  def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
    Dir.entries(tmp_dir).each do |entry|
      next unless entry.match?(/\Adwc_\d+\z/)

      path = File.join(tmp_dir, entry)
      FileUtils.rm_rf(path) if File.directory?(path)
    end
  end

  # Returns the configured logger; defaults to a logger that discards output.
  def self.logger
    @logger ||= Logger.new(nil)
  end

  # Replaces the current logger with one that discards output.
  def self.logger_reset
    self.logger = Logger.new(nil)
  end

  # Writes "|obj_id|message|" to the logger using the given logger method
  # (:info by default).
  def self.logger_write(obj_id, message, method = :info)
    logger.public_send(method, "|#{obj_id}|#{message}|")
  end

  # @param dwc_path [String] path to the archive file
  # @param tmp_dir [String] directory handed to DarwinCore::Archive
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
    @dwc_path = dwc_path
    @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
    @core = DarwinCore::Core.new(self)
    @metadata = DarwinCore::Metadata.new(@archive)
    extensions
  end

  # Last component of the archive path.
  def file_name
    File.split(@dwc_path).last
  end

  # Absolute path of the archive file.
  def path
    File.expand_path(@dwc_path)
  end

  # Generates a hash from classification data with a path to each node,
  # a list of synonyms and vernacular names. Returns nil when the core file
  # has no parent id field.
  def normalize_classification
    return nil unless parent_id?

    @classification_normalizer ||=
      DarwinCore::ClassificationNormalizer.new(self)
    @classification_normalizer.normalize
  end

  # True when any core field mentions highertaxonid or parentnameusageid
  # (case-insensitive).
  def parent_id?
    @core.fields.join("|").downcase.
      match?(/highertaxonid|parentnameusageid/)
  end

  # SHA1 hex digest of the archive file. The file is streamed by Digest
  # instead of being loaded into memory as one string.
  def checksum
    Digest::SHA1.file(@dwc_path).hexdigest
  end

  # Memoized list of DarwinCore::Extension objects built from the :extension
  # entry found under the first key of the archive meta data; an empty array
  # when there is no such entry.
  def extensions
    return @extensions if @extensions

    root_key = @archive.meta.keys[0]
    ext = @archive.meta[root_key][:extension]
    return @extensions = [] unless ext

    # A single extension is not wrapped in an array by the meta data.
    ext = [ext] unless ext.is_a?(Array)
    @extensions = ext.map { |e| DarwinCore::Extension.new(self, e) }
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Handles a Darwin Core Archive file: unpacks it through
  # DarwinCore::Expander and provides the parsed meta.xml / eml.xml data
  # together with information about the files found in the archive.
  class Archive
    # meta - result of DarwinCore::XmlReader.from_xml for meta.xml
    # eml  - result of DarwinCore::XmlReader.from_xml for eml.xml, or nil
    #        when the archive has no eml.xml
    attr_reader :meta, :eml

    # Unpacks the archive and reads its metadata.
    #
    # Raises InvalidArchiveError (after cleaning the unpacked files) when the
    # unpacked archive is not valid.
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
      @expander.unpack
      prepare_metadata
    end

    # True when the unpacked directory exists and contains meta.xml;
    # false otherwise (always a strict boolean).
    def valid?
      dir = @expander.path
      # File.exist? replaces FileTest.exists?, which was removed in Ruby 3.2.
      return false unless dir && File.exist?(dir)

      listing = files
      !listing.nil? && listing.include?("meta.xml")
    end

    # Names of the unpacked files as reported by the expander.
    def files
      @expander.files
    end

    # Directory the archive was unpacked to.
    def files_path
      @expander.path
    end

    # Removes the unpacked files.
    def clean
      @expander.clean
    end

    private

    def prepare_metadata
      unless valid?
        clean
        raise InvalidArchiveError
      end

      prepare_meta_file
      prepare_eml_file
    end

    def prepare_meta_file
      @meta = read_xml("meta.xml")
    end

    def prepare_eml_file
      @eml = nil
      return unless files.include?("eml.xml")

      @eml = read_xml("eml.xml")
    end

    # Parses one of the unpacked XML files. The block form of File.open
    # closes the handle as soon as parsing is done, so no descriptor is
    # leaked.
    # NOTE(review): assumes XmlReader.from_xml consumes the IO before
    # returning - confirm against xml_reader.rb.
    def read_xml(file_name)
      File.open(File.join(@expander.path, file_name)) do |io|
        DarwinCore::XmlReader.from_xml(io)
      end
    end
  end
end
@@ -0,0 +1,382 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Returns a tree representation of a Darwin Core file with vernacular
  # names and synonyms attached to the taxon nodes.
  class ClassificationNormalizer
    # error_names     - array of hashes describing rows/taxa that could not be
    #                   processed (each has an :error key)
    # tree            - nested hash of taxon ids ({ id => { child_id => {} } })
    # normalized_data - hash of taxon id => taxon object
    # dwc             - the DarwinCore instance being normalized
    attr_reader :error_names, :tree, :normalized_data, :dwc
    alias darwin_core dwc

    def initialize(dwc_instance)
      @dwc = dwc_instance
      @core_fields = find_fields(@dwc.core)
      @extensions = @dwc.extensions.map { |e| [e, find_fields(e)] }
      @normalized_data = {}
      @synonyms = {}
      @name_strings = {}
      @vernacular_name_strings = {}
      @error_names = []
      @tree = {}
    end

    # Registers a scientific name string. A nil name is ignored: canonical
    # names are nil when normalization runs with with_canonical_names: false
    # and must not show up in #name_strings.
    def add_name_string(name_string)
      return if name_string.nil?

      @name_strings[name_string] = 1 unless @name_strings[name_string]
    end

    # Registers a vernacular name string.
    def add_vernacular_name_string(name_string)
      return if @vernacular_name_strings[name_string]

      @vernacular_name_strings[name_string] = 1
    end

    # Collected scientific name strings: an array by default, the underlying
    # hash when called with with_hash: true.
    def name_strings(opts = {})
      process_strings(@name_strings, opts)
    end

    # Collected vernacular name strings: an array by default, the underlying
    # hash when called with with_hash: true.
    def vernacular_name_strings(opts = {})
      process_strings(@vernacular_name_strings, opts)
    end

    # Reads the core file (and, unless disabled, the extensions) and returns
    # the hash of normalized taxa.
    #
    # opts:
    #   with_canonical_names - compute canonical names and name paths
    #                          (default true)
    #   with_extensions      - ingest synonym/vernacular extensions
    #                          (default true)
    def normalize(opts = {})
      opts = { with_canonical_names: true,
               with_extensions: true }.merge(opts)
      @with_canonical_names = opts[:with_canonical_names]
      log("Started normalization of the classification")
      ingest_core
      log("Calculating the classification parent/child paths")
      if parent_id?
        calculate_classification_path
      else
        @normalized_data.each_key { |id| @tree[id] = {} }
      end
      if opts[:with_extensions]
        # Logged only when extensions are really ingested.
        log("Ingesting data from extensions")
        ingest_extensions
      end
      @normalized_data
    end

    private

    def log(message)
      DarwinCore.logger_write(@dwc.object_id, message)
    end

    def process_strings(strings, opts)
      opts = { with_hash: false }.merge(opts)
      opts[:with_hash] ? strings : strings.keys
    end

    # Canonical form of a scientific name; falls back to the name itself when
    # the parser gives no canonical form. Nil when canonical names are off.
    def get_canonical_name(a_scientific_name)
      return nil unless @with_canonical_names

      canonical_name = Biodiversity::Parser.parse(a_scientific_name).
                       dig(:canonical, :simple)
      canonical_name.to_s.empty? ? a_scientific_name : canonical_name
    end

    # Maps the last URI segment of every field term (downcased symbol) to its
    # column index; :id maps to the index of the element id when present.
    def find_fields(element)
      data = element.fields.each_with_object({}) do |f, h|
        field = f[:term].split("/")[-1]
        field = field ? field.downcase.to_sym : ""
        h[field] = f[:index].to_i
      end
      data[:id] = element.id[:index] if element.id
      data
    end

    def status_synonym?(status)
      status&.match?(/^syn/)
    end

    # New empty taxon of the class that fits the archive flavour.
    def build_taxon
      gnub_archive? ? DarwinCore::GnubTaxon.new : DarwinCore::TaxonNormalized.new
    end

    # Registers a core row as a synonym.
    #
    # accepted_index is the COLUMN INDEX holding the id of the accepted
    # taxon. The synonym map has to store the accepted taxon id itself
    # (row[accepted_index]), not the column index, otherwise deprecated
    # parents can never be resolved to their current taxon.
    def add_synonym_from_core(accepted_index, row)
      cf = @core_fields
      accepted_id = row[accepted_index]
      @synonyms[row[cf[:id]]] = accepted_id
      # Placeholder for the accepted taxon; it is filled in when (if) the
      # accepted row itself is ingested, so it must be of the same class.
      taxon = (@normalized_data[accepted_id] ||= build_taxon)
      synonym = SynonymNormalized.new(
        row[cf[:id]],
        row[cf[:scientificname]],
        row[cf[:canonicalname]],
        cf[:taxonomicstatus] ? row[cf[:taxonomicstatus]] : nil,
        cf[:source] ? row[cf[:source]] : nil,
        cf[:localid] ? row[cf[:localid]] : nil,
        cf[:globalid] ? row[cf[:globalid]] : nil
      )
      taxon.synonyms << synonym
      add_name_string(synonym.name)
      add_name_string(synonym.canonical_name)
    end

    # Normalizes the scientific name of a row in place and appends the
    # canonical name as a new last column (its index is stored in
    # fields[:canonicalname]).
    def set_scientific_name(row, fields)
      name_index = fields[:scientificname]
      row[name_index] = "N/A" unless row[name_index]
      canonical_name = nil
      scientific_name = row[name_index].strip
      if separate_canonical_and_authorship?(row, fields)
        # The name column holds the canonical form; authorship is appended.
        canonical_name = scientific_name if @with_canonical_names
        scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
      elsif @with_canonical_names
        canonical_name = get_canonical_name(row[name_index])
      end
      fields[:canonicalname] = row.size
      row << canonical_name
      row[name_index] = scientific_name
    end

    # True when the row has a non-empty authorship column whose value is not
    # already a part of the scientific name.
    def separate_canonical_and_authorship?(row, fields)
      authorship = ""
      if fields[:scientificnameauthorship]
        authorship = row[fields[:scientificnameauthorship]].to_s.strip
      end
      !(authorship.empty? || row[fields[:scientificname]].index(authorship))
    end

    def ingest_core
      @normalized_data = {}
      unless @core_fields[:id] && @core_fields[:scientificname]
        raise(DarwinCore::CoreFileError,
              "Darwin Core core fields must contain taxon id and scientific name")
      end
      @dwc.core.read do |rows|
        rows[1].each do |error|
          @error_names << { data: error,
                            error: :reading_or_encoding_error }
        end
        rows[0].each { |row| ingest_core_row(row) }
      end
    end

    # Routes one core row to synonym or taxon handling.
    def ingest_core_row(row)
      set_scientific_name(row, @core_fields)
      cf = @core_fields
      accepted = cf[:acceptednameusageid]
      if accepted && row[accepted] && row[accepted] != row[cf[:id]]
        # Core has AcceptedNameUsageId pointing to another taxon
        add_synonym_from_core(accepted, row)
      elsif !accepted && cf[:taxonomicstatus] &&
            status_synonym?(row[cf[:taxonomicstatus]])
        add_synonym_from_core(parent_id, row) if parent_id?
      else
        ingest_taxon(row)
      end
    end

    # Fills a taxon object from an accepted-name core row.
    def ingest_taxon(row)
      cf = @core_fields
      taxon = (@normalized_data[row[cf[:id]]] ||= build_taxon)
      if gnub_archive?
        taxon.uuid = row[cf[:originalnameusageid]]
        taxon.uuid_path = row[cf[:originalnameusageidpath]].split("|")
      end
      taxon.id = row[cf[:id]]
      taxon.current_name = row[cf[:scientificname]]
      taxon.current_name_canonical = row[cf[:canonicalname]]
      taxon.parent_id = parent_id? ? row[parent_id] : nil
      taxon.rank = row[cf[:taxonrank]] if cf[:taxonrank]
      taxon.status = row[cf[:taxonomicstatus]] if cf[:taxonomicstatus]
      taxon.source = row[cf[:source]] if cf[:source]
      taxon.local_id = row[cf[:localid]] if cf[:localid]
      taxon.global_id = row[cf[:globalid]] if cf[:globalid]
      taxon.linnean_classification_path =
        get_linnean_classification_path(row, taxon)
      add_name_string(taxon.current_name)
      canonical = taxon.current_name_canonical
      add_name_string(canonical) if canonical && !canonical.empty?
    end

    def parent_id?
      # defined? (not ||=) so that a false answer is cached as well.
      return @has_parent_id if defined?(@has_parent_id)

      @has_parent_id = @core_fields.key?(:highertaxonid) ||
                       @core_fields.key?(:parentnameusageid)
    end

    # Column index of the parent id field.
    def parent_id
      @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
    end

    def calculate_classification_path
      @paths_num = 0
      @pending_paths = {}.compare_by_identity
      @normalized_data.each_value do |taxon|
        get_classification_path(taxon) if taxon.classification_path_id.empty?
      end
    end

    # Computes classification paths of a taxon (resolving its ancestors
    # first when needed) and attaches the taxon to the tree.
    #
    # Returns nil when the taxon already has a path, "error" when the path
    # cannot be built, the updated tree node otherwise.
    def get_classification_path(taxon)
      return unless taxon.classification_path_id.empty?

      @paths_num += 1
      log("Calculated #{@paths_num} paths") if (@paths_num % 10_000).zero?
      current_node = { taxon.id => {} }
      if DarwinCore.nil_field?(taxon.parent_id)
        return add_root_path(taxon, current_node)
      end

      parent = find_parent(taxon)
      return "error" unless parent
      return "error" if resolve_parent_path(taxon, parent) == "error"

      add_child_path(taxon, parent, current_node)
    end

    # Parent taxon of +taxon+. When the parent id is not a known taxon it is
    # looked up among synonyms; that case is always recorded as a
    # :deprecated_parent error (current_parent is nil when nothing is found).
    def find_parent(taxon)
      parent = @normalized_data[taxon.parent_id]
      return parent if parent

      current_parent = @normalized_data[@synonyms[taxon.parent_id]]
      @error_names << { data: taxon,
                        error: :deprecated_parent,
                        current_parent: current_parent }
      current_parent
    end

    # Makes sure +parent+ has its own path calculated. Returns "error" when
    # that is impossible (cyclic or too deep hierarchy, broken ancestors).
    def resolve_parent_path(taxon, parent)
      return unless parent.classification_path_id.empty?

      # A parent that is the taxon itself, or that is still waiting for its
      # own ancestors, means the hierarchy is cyclic.
      cyclic = parent.equal?(taxon) || (@pending_paths ||= {}.compare_by_identity)[parent]
      return too_deep_hierarchy(taxon) if cyclic

      @pending_paths[taxon] = true
      begin
        get_classification_path(parent)
      rescue SystemStackError
        too_deep_hierarchy(taxon)
      ensure
        @pending_paths.delete(taxon)
      end
    end

    def too_deep_hierarchy(taxon)
      @error_names << { data: taxon,
                        error: :too_deep_hierarchy,
                        current_parent: nil }
      "error"
    end

    def add_root_path(taxon, node)
      taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
      taxon.classification_path_id << taxon.id
      @tree.merge!(node)
    end

    def add_child_path(taxon, parent, node)
      if @with_canonical_names
        taxon.classification_path += parent.classification_path +
                                     [taxon.current_name_canonical]
      end
      taxon.classification_path_id += parent.classification_path_id + [taxon.id]
      # Walk the tree down to the parent node; nil when any step is missing.
      parent_node = parent.classification_path_id.inject(@tree) do |branch, id|
        branch && branch[id]
      end
      return parent_node.merge!(node) if parent_node

      log("Error 'parent node is missing in the tree' taxon #{taxon.id}")
      "error"
    end

    def ingest_extensions
      @extensions.each do |extension|
        ext, fields = extension
        synonym_file = File.split(ext.file_path).last.match?(/synonym/i)
        ingest_synonyms(extension) if synonym_file && fields.key?(:scientificname)
        ingest_vernaculars(extension) if fields.key?(:vernacularname)
      end
    end

    def ingest_synonyms(extension)
      log("Ingesting synonyms extension")
      ext, fields = extension
      ext.read do |rows|
        rows[0].each do |record|
          add_synonym(process_synonym(record, fields), record, fields)
        end
      end
    end

    # Attaches a synonym to its taxon, or records a
    # :synonym_of_unknown_taxa error when the taxon is unknown.
    def add_synonym(synonym, record, fields)
      taxon = @normalized_data[record[fields[:id]]]
      unless taxon
        @error_names << { taxon: synonym,
                          error: :synonym_of_unknown_taxa }
        return
      end

      taxon.synonyms << synonym
      add_name_string(synonym.name)
      add_name_string(synonym.canonical_name)
    end

    def process_synonym(record, fields)
      set_scientific_name(record, fields)
      SynonymNormalized.new(
        nil,
        record[fields[:scientificname]],
        record[fields[:canonicalname]],
        fields[:taxonomicstatus] ? record[fields[:taxonomicstatus]] : nil,
        fields[:source] ? record[fields[:source]] : nil,
        fields[:localid] ? record[fields[:localid]] : nil,
        fields[:globalid] ? record[fields[:globalid]] : nil
      )
    end

    def ingest_vernaculars(extension)
      log("Ingesting vernacular names extension")
      ext, fields = extension
      ext.read do |rows|
        rows[0].each { |row| extract_vernaculars_from_row(row, fields) }
      end
    end

    # Attaches a vernacular name to its taxon, or records a
    # :vernacular_of_unknown_taxa error when the taxon is unknown.
    def extract_vernaculars_from_row(row, fields)
      language = find_vernacular_language(row, fields)
      locality = fields[:locality] ? row[fields[:locality]] : nil
      country_code = fields[:countrycode] ? row[fields[:countrycode]] : nil

      vernacular = VernacularNormalized.new(
        row[fields[:vernacularname]], language, locality, country_code
      )
      taxon = @normalized_data[row[fields[:id]]]
      if taxon
        taxon.vernacular_names << vernacular
        add_vernacular_name_string(vernacular.name)
      else
        @error_names << { vernacular_name: vernacular,
                          error: :vernacular_of_unknown_taxa }
      end
    end

    # Value of the language column, falling back to the languagecode column.
    def find_vernacular_language(row, fields)
      (fields[:language] && row[fields[:language]]) ||
        (fields[:languagecode] && row[fields[:languagecode]]) || nil
    end

    # Collects [value, rank] pairs for every Linnean rank column that is
    # present in the core file.
    def get_linnean_classification_path(row, _taxon)
      %i[kingdom phylum class order family genus
         subgenus].each_with_object([]) do |clade, res|
        res << [row[@core_fields[clade]], clade] if @core_fields[clade]
      end
    end

    def gnub_archive?
      @core_fields[:originalnameusageidpath]
    end
  end
end