dwc-archive 0.9.10 → 1.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -0
  3. data/.rspec +2 -1
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -1
  6. data/.travis.yml +4 -7
  7. data/CHANGELOG +14 -8
  8. data/Gemfile +3 -1
  9. data/LICENSE +1 -1
  10. data/README.md +119 -107
  11. data/Rakefile +13 -36
  12. data/dwc-archive.gemspec +23 -19
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +124 -0
  17. data/lib/dwc_archive/archive.rb +60 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +10 -0
  21. data/lib/dwc_archive/expander.rb +88 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +91 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +57 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +90 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +96 -105
  36. data/spec/lib/core_spec.rb +43 -41
  37. data/spec/lib/darwin_core_spec.rb +108 -138
  38. data/spec/lib/generator_eml_xml_spec.rb +12 -11
  39. data/spec/lib/generator_meta_xml_spec.rb +12 -11
  40. data/spec/lib/generator_spec.rb +77 -69
  41. data/spec/lib/gnub_taxon_spec.rb +15 -17
  42. data/spec/lib/metadata_spec.rb +50 -41
  43. data/spec/lib/taxon_normalized_spec.rb +62 -65
  44. data/spec/lib/xml_reader_spec.rb +9 -12
  45. data/spec/spec_helper.rb +54 -51
  46. metadata +105 -88
  47. data/.rvmrc +0 -1
  48. data/] +0 -40
  49. data/lib/dwc-archive.rb +0 -107
  50. data/lib/dwc-archive/archive.rb +0 -40
  51. data/lib/dwc-archive/classification_normalizer.rb +0 -428
  52. data/lib/dwc-archive/core.rb +0 -17
  53. data/lib/dwc-archive/expander.rb +0 -84
  54. data/lib/dwc-archive/generator.rb +0 -85
  55. data/lib/dwc-archive/generator_eml_xml.rb +0 -86
  56. data/lib/dwc-archive/generator_meta_xml.rb +0 -58
  57. data/lib/dwc-archive/ingester.rb +0 -101
  58. data/lib/dwc-archive/metadata.rb +0 -48
  59. data/lib/dwc-archive/version.rb +0 -3
  60. data/lib/dwc-archive/xml_reader.rb +0 -64
@@ -19,7 +19,7 @@ end
19
19
 
20
20
  Then /^these data should be saved as "([^\"]*)" file$/ do |file_name|
21
21
  file = File.join(@gen.path, file_name)
22
- @gen.files.include?(file_name).should be_true
22
+ @gen.files.include?(file_name).should be true
23
23
  csv = CSV.open(file).count.should == 4
24
24
  end
25
25
 
@@ -51,7 +51,7 @@ end
51
51
  Then /^data are saved as "([^\"]*)" and "([^\"]*)"$/ do |file_name_1, file_name_2|
52
52
  [file_name_1, file_name_2].each do |file_name|
53
53
  file = File.join(@gen.path, file_name)
54
- @gen.files.include?(file_name).should be_true
54
+ @gen.files.include?(file_name).should be true
55
55
  csv = CSV.open(file).count.should > 1
56
56
  end
57
57
  end
@@ -86,7 +86,7 @@ end
86
86
 
87
87
  Then /^there should be "([^\"]*)" file with core and extensions informations$/ do |file_name|
88
88
  meta = File.join(@gen.path, file_name)
89
- @gen.files.include?(file_name).should be_true
89
+ @gen.files.include?(file_name).should be true
90
90
  dom = Nokogiri::XML(open(File.join(@gen.path, file_name)))
91
91
  dom.xpath('//xmlns:core//xmlns:location').text.should == 'darwin_core.txt'
92
92
  dom.xpath('//xmlns:extension[1]//xmlns:location').text.should == 'vernacular.txt'
@@ -94,7 +94,7 @@ end
94
94
 
95
95
  Then /^there should be "([^\"]*)" file with authoriship information$/ do |file_name|
96
96
  eml = File.join(@gen.path, file_name)
97
- @gen.files.include?(file_name).should be_true
97
+ @gen.files.include?(file_name).should be true
98
98
  end
99
99
 
100
100
  Given /^a path to a new file \- "([^\"]*)"$/ do |file_name|
@@ -107,6 +107,6 @@ end
107
107
 
108
108
  Then /^there should be a valid new archive file$/ do
109
109
  dwc = DarwinCore.new('/tmp/dwc.tar.gz')
110
- dwc.archive.valid?.should be_true
110
+ dwc.archive.valid?.should be true
111
111
  end
112
112
 
@@ -1,5 +1,6 @@
1
1
  Given /^path to a dwc file "([^\"]*)"$/ do |arg1|
2
- @dwca_file = File.expand_path(File.dirname(__FILE__) + "../../../spec/files/" + arg1)
2
+ @dwca_file = File.expand_path(File.dirname(__FILE__) +
3
+ "../../../spec/files/" + arg1)
3
4
  @tmp_dir = "/tmp"
4
5
  end
5
6
 
@@ -8,11 +9,12 @@ When /^I create a new DarwinCore::Archive instance$/ do
8
9
  end
9
10
 
10
11
  Then /^I should find that the archive is valid$/ do
11
- @dwca.valid?.should be_true
12
+ @dwca.valid?.should be true
12
13
  end
13
14
 
14
15
  Then /^I should see what files the archive has$/ do
15
- @dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml", "meta.xml", "metadata.txt"]
16
+ @dwca.files.should == ["DarwinCore.txt", "VernacularName.txt", "eml.xml",
17
+ "meta.xml", "metadata.txt"]
16
18
  end
17
19
 
18
20
  When /^I delete expanded files$/ do
@@ -20,7 +22,7 @@ When /^I delete expanded files$/ do
20
22
  end
21
23
 
22
24
  Then /^they should disappear$/ do
23
- @dwca.files.should be_nil
25
+ @dwca.files.should be nil
24
26
  end
25
27
 
26
28
  When /^I create a new DarwinCore instance$/ do
@@ -37,7 +39,7 @@ When /^I create DarwinCore::ClassificationNormalizer instance$/ do
37
39
  end
38
40
 
39
41
  Then /^instance should have a valid archive$/ do
40
- @dwc.archive.valid?.should be_true
42
+ @dwc.archive.valid?.should be true
41
43
  end
42
44
 
43
45
  Then /^instance should have a core$/ do
@@ -45,7 +47,7 @@ Then /^instance should have a core$/ do
45
47
  end
46
48
 
47
49
  Then /^I should see checksum$/ do
48
- @dwc.checksum.should == '7d94fc28ffaf434b66fbc790aa5ef00d834057bf'
50
+ @dwc.checksum.should == "7d94fc28ffaf434b66fbc790aa5ef00d834057bf"
49
51
  end
50
52
 
51
53
  When /^I check core data$/ do
@@ -64,7 +66,8 @@ And /^core\.file_path$/ do
64
66
  end
65
67
 
66
68
  And /^core\.id$/ do
67
- @core.id.should == {:index => 0, :term => 'http://rs.tdwg.org/dwc/terms/TaxonID'}
69
+ @core.id.should == {index: 0,
70
+ term: "http://rs.tdwg.org/dwc/terms/TaxonID"}
68
71
  end
69
72
 
70
73
  And /^core\.fields$/ do
@@ -80,14 +83,21 @@ Then /^DarwinCore instance should have dwc\.metadata object$/ do
80
83
  end
81
84
 
82
85
  And /^I should find id, title, creators, metadata provider$/ do
83
- @dwc.metadata.id.should == 'leptogastrinae:version:2.5'
84
- @dwc.metadata.title.should == 'Leptogastrinae (Diptera: Asilidae) Classification'
86
+ @dwc.metadata.id.should == "leptogastrinae:version:2.5"
87
+ @dwc.metadata.title.should ==
88
+ "Leptogastrinae (Diptera: Asilidae) Classification"
85
89
  @dwc.metadata.authors.should == [
86
- {:last_name=>"Bayless", :email=>"keith.bayless@gmail.com", :first_name=>"Keith"},
87
- {:last_name=>"Dikow", :email=>"dshorthouse@eol.org", :first_name=>"Torsten"}]
88
- @dwc.metadata.abstract.should == 'These are all the names in the Leptogastrinae classification.'
89
- @dwc.metadata.citation.should == 'Dikow, Torsten. 2010. The Leptogastrinae classification.'
90
- @dwc.metadata.url.should == 'http://leptogastrinae.lifedesks.org/files/leptogastrinae/classification_export/shared/leptogastrinae.tar.gz'
90
+ { last_name: "Bayless", email: "keith.bayless@gmail.com",
91
+ first_name: "Keith" },
92
+ { last_name: "Dikow", email: "dshorthouse@eol.org", first_name: "Torsten" }
93
+ ]
94
+ @dwc.metadata.abstract.should ==
95
+ "These are all the names in the Leptogastrinae classification."
96
+ @dwc.metadata.citation.should ==
97
+ "Dikow, Torsten. 2010. The Leptogastrinae classification."
98
+ @dwc.metadata.url.should ==
99
+ "http://leptogastrinae.lifedesks.org/files/leptogastrinae/"\
100
+ "classification_export/shared/leptogastrinae.tar.gz"
91
101
  end
92
102
 
93
103
  Then /^DarwinCore instance should have an extensions array$/ do
@@ -103,11 +113,18 @@ end
103
113
 
104
114
  Then /^extension should have properties, data, file_path, coreid, fields$/ do
105
115
  ext = @dwc.extensions[0]
106
- ext.properties.should == {:ignoreHeaderLines=>1, :encoding=>"UTF-8", :rowType=>"http://rs.gbif.org/ipt/terms/1.0/VernacularName", :fieldsEnclosedBy=>"", :fieldsTerminatedBy=>"\\t", :linesTerminatedBy=>"\\n"}
116
+ ext.properties.should == {
117
+ ignoreHeaderLines: 1, encoding: "UTF-8",
118
+ rowType: "http://rs.gbif.org/ipt/terms/1.0/VernacularName",
119
+ fieldsEnclosedBy: "", fieldsTerminatedBy: "\\t", linesTerminatedBy: "\\n"
120
+ }
107
121
  ext.data.class.should == Hash
108
122
  ext.file_path.should match(/\/tmp\/dwc_[\d]+\/VernacularName.txt/)
109
- ext.coreid.should == {:index=>0}
110
- ext.fields.should == [{:term=>"http://rs.gbif.org/ecat/terms/vernacularName", :index=>1}, {:term=>"http://rs.gbif.org/thesaurus/languageCode", :index=>2}]
123
+ ext.coreid.should == { index: 0 }
124
+ ext.fields.should == [
125
+ { term: "http://rs.gbif.org/ecat/terms/vernacularName", index: 1 },
126
+ { term: "http://rs.gbif.org/thesaurus/languageCode", index: 2 }
127
+ ]
111
128
  end
112
129
 
113
130
  Given /^acces to DarwinCore gem$/ do
@@ -153,7 +170,7 @@ end
153
170
 
154
171
  Then /^I can read its core content using block$/ do
155
172
  res = []
156
- @dwc.core.ignore_headers.should be_true
173
+ @dwc.core.ignore_headers.should be true
157
174
  read_result = @dwc.core.read(200) do |r, err|
158
175
  res << [r.size, err.size]
159
176
  end
@@ -164,7 +181,7 @@ end
164
181
  Then /^I can read extensions content using block$/ do
165
182
  res = []
166
183
  ext = @dwc.extensions[0]
167
- ext.ignore_headers.should be_true
184
+ ext.ignore_headers.should be true
168
185
  ext.read(200) do |r, err|
169
186
  res << [r.size, err.size]
170
187
  end
@@ -196,22 +213,24 @@ Then /^there are paths, synonyms and vernacular names in normalized classificati
196
213
  if v.vernacular_names.size > 0
197
214
  @vernaculars_are_generated = true
198
215
  vn = v.vernacular_names[0]
199
- (vn.respond_to?('locality') && vn.respond_to?('country_code') && vn.respond_to?('language')).should be_true
216
+ (vn.respond_to?("locality") && vn.respond_to?("country_code") &&
217
+ vn.respond_to?("language")).should be true
200
218
  end
201
219
  break if (@vernaculars_are_generated && @paths_are_generated && @synonyms_are_generated)
202
220
  end
203
- @paths_are_generated.should be_true
204
- @vernaculars_are_generated.should be_true
205
- @synonyms_are_generated.should be_true
221
+ @paths_are_generated.should be true
222
+ @vernaculars_are_generated.should be true
223
+ @synonyms_are_generated.should be true
206
224
  end
207
225
 
208
226
  Then /^there are local_id and global_id methods in taxons and synonyms$/ do
209
227
  @normalized_classification.each do |k, v|
210
228
  if v.synonyms.size > 0
211
- v.local_id.should == '2'
229
+ v.local_id.should == "2"
212
230
  v.global_id.should == "97498f29-2501-440d-9452-f3817da0d6c2"
213
- v.synonyms.first.local_id.should == '1'
214
- v.synonyms.first.global_id.should == "e017ed01-407d-4d09-82c5-8b3d9fa76e35"
231
+ v.synonyms.first.local_id.should == "1"
232
+ v.synonyms.first.global_id.should ==
233
+ "e017ed01-407d-4d09-82c5-8b3d9fa76e35"
215
234
  break
216
235
  end
217
236
  end
@@ -229,8 +248,8 @@ Then /^there are id paths, no canonical names paths in normalized classification
229
248
  id_paths_generated = true
230
249
  end
231
250
  end
232
- id_paths_generated.should be_true
233
- canonical_paths_generated.should be_false
251
+ id_paths_generated.should be true
252
+ canonical_paths_generated.should be false
234
253
  end
235
254
 
236
255
  Then /^names used in classification can be accessed by "([^"]*)" method$/ do |name_strings|
@@ -1,4 +1,4 @@
1
1
  $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
2
- require 'dwc-archive'
2
+ require 'dwc_archive'
3
3
 
4
4
  require 'rspec/expectations'
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "ostruct"
5
+ require "digest"
6
+ require "csv"
7
+ require "logger"
8
+ require "nokogiri"
9
+ require "biodiversity"
10
+ require_relative "dwc_archive/xml_reader"
11
+ require_relative "dwc_archive/ingester"
12
+ require_relative "dwc_archive/errors"
13
+ require_relative "dwc_archive/expander"
14
+ require_relative "dwc_archive/archive"
15
+ require_relative "dwc_archive/core"
16
+ require_relative "dwc_archive/extension"
17
+ require_relative "dwc_archive/metadata"
18
+ require_relative "dwc_archive/generator"
19
+ require_relative "dwc_archive/generator_meta_xml"
20
+ require_relative "dwc_archive/generator_eml_xml"
21
+ require_relative "dwc_archive/taxon_normalized"
22
+ require_relative "dwc_archive/gnub_taxon"
23
+ require_relative "dwc_archive/classification_normalizer"
24
+ require_relative "dwc_archive/version"
25
+
26
# Main class for handling Darwin Core Archives.
#
# Creating an instance builds a DarwinCore::Archive for the given file and
# exposes the archive's core file, metadata and extensions.
class DarwinCore
  DEFAULT_TMP_DIR = "/tmp"
  VernacularNormalized = Struct.new(:name, :language, :locality, :country_code)
  SynonymNormalized = Struct.new(:id, :name, :canonical_name, :status, :source,
                                 :local_id, :global_id)

  class << self
    attr_writer :logger

    # Recursively removes +path+ when it exists.
    def clean(path)
      # File.exist? replaces FileTest.exists?, which was removed in Ruby 3.2.
      FileUtils.rm_rf(path) if File.exist?(path)
    end

    # Returns the sorted entry names found in +path+, or nil when +path+ is
    # nil or does not exist. Only the "." and ".." pseudo-entries are
    # skipped; real files whose names merely end in a dot are kept.
    def files(path)
      return nil unless path && File.exist?(path)

      Dir.entries(path).reject { |entry| %w[. ..].include?(entry) }.sort
    end

    # Builds a randomly named "dwc_<number>" path inside +tmp_dir+.
    def random_path(tmp_dir)
      File.join(tmp_dir, "dwc_#{rand(10_000_000_000)}")
    end

    # True when +field+ is one of the values treated as "no data".
    # NOTE(review): "/N" looks like it may be meant as the "\\N" NULL marker
    # used by database dumps -- confirm before changing.
    def nil_field?(field)
      [nil, "", "/N"].include?(field)
    end

    # Removes every "dwc_<digits>" directory found directly under +tmp_dir+.
    def clean_all(tmp_dir = DEFAULT_TMP_DIR)
      Dir.entries(tmp_dir).each do |entry|
        next unless entry.match?(/\Adwc_\d+\z/)

        path = File.join(tmp_dir, entry)
        FileUtils.rm_rf(path) if File.directory?(path)
      end
    end

    # Logger used by the library; defaults to a logger that discards output.
    def logger
      @logger ||= Logger.new(nil)
    end

    # Replaces the current logger with one that discards output.
    def logger_reset
      self.logger = Logger.new(nil)
    end

    # Writes "|obj_id|message|" to the logger using the given severity
    # method (:info by default).
    def logger_write(obj_id, message, method = :info)
      logger.send(method, "|#{obj_id}|#{message}|")
    end
  end

  attr_reader :archive, :core, :metadata, :classification_normalizer
  alias eml metadata

  # dwc_path - path to the archive file
  # tmp_dir  - directory handed to DarwinCore::Archive
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
    @dwc_path = dwc_path
    @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
    @core = DarwinCore::Core.new(self)
    @metadata = DarwinCore::Metadata.new(@archive)
    extensions
  end

  # Last component of the archive path.
  def file_name
    File.split(@dwc_path).last
  end

  # Absolute path of the archive file.
  def path
    File.expand_path(@dwc_path)
  end

  # Generates a hash from classification data with a path to each node,
  # list of synonyms and vernacular names. Returns nil when the core has no
  # parent id field.
  def normalize_classification
    return nil unless parent_id?

    @classification_normalizer ||=
      DarwinCore::ClassificationNormalizer.new(self)
    @classification_normalizer.normalize
  end

  # True when the core fields mention a higherTaxonID or parentNameUsageID.
  def parent_id?
    @core.fields.join("|").downcase.
      match?(/highertaxonid|parentnameusageid/)
  end

  # SHA1 hex digest of the archive file. Digest::SHA1.file reads the file in
  # chunks instead of loading the whole archive into memory.
  def checksum
    Digest::SHA1.file(@dwc_path).hexdigest
  end

  # Memoized list of DarwinCore::Extension objects described by meta.xml;
  # an empty array when the archive declares none.
  def extensions
    return @extensions if @extensions

    root_key = @archive.meta.keys[0]
    ext = @archive.meta[root_key][:extension]
    return @extensions = [] unless ext

    # A single extension arrives as a bare element rather than an Array.
    # Kernel#Array is deliberately avoided: it would split a Hash into pairs.
    ext = [ext] unless ext.is_a?(Array)
    @extensions = ext.map { |e| DarwinCore::Extension.new(self, e) }
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Deals with handling a DarwinCoreArchive file, and provides meta
  # information and files information about the archive.
  class Archive
    attr_reader :meta, :eml

    # Unpacks the archive through DarwinCore::Expander and reads its
    # meta.xml (and eml.xml when present).
    #
    # Raises InvalidArchiveError (after cleaning the expanded files) when
    # the unpacked archive is not valid.
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
      @expander.unpack
      prepare_metadata
    end

    # True when the expanded directory exists and contains meta.xml.
    # Always returns a strict boolean.
    def valid?
      expanded_path = @expander.path
      # File.exist? replaces FileTest.exists?, which was removed in Ruby 3.2.
      return false unless expanded_path && File.exist?(expanded_path)

      entries = files
      entries ? entries.include?("meta.xml") : false
    end

    # Names of the files in the expanded archive, as reported by the expander.
    def files
      @expander.files
    end

    # Directory the archive was expanded into.
    def files_path
      @expander.path
    end

    # Asks the expander to remove the expanded files.
    def clean
      @expander.clean
    end

    private

    def prepare_metadata
      unless valid?
        clean
        raise InvalidArchiveError
      end

      prepare_meta_file
      prepare_eml_file
    end

    def prepare_meta_file
      @meta = read_xml("meta.xml")
    end

    # eml.xml is optional; @eml stays nil when the archive does not ship it.
    def prepare_eml_file
      @eml = files.include?("eml.xml") ? read_xml("eml.xml") : nil
    end

    # Parses a file from the expanded archive. The block form of File.open
    # closes the handle as soon as parsing is done, so it no longer leaks.
    # NOTE(review): assumes XmlReader.from_xml consumes the IO eagerly --
    # confirm against xml_reader.rb.
    def read_xml(file_name)
      File.open(File.join(@expander.path, file_name)) do |io|
        DarwinCore::XmlReader.from_xml(io)
      end
    end
  end
end
@@ -0,0 +1,382 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Returns tree representation of Darwin Core file with vernacular names
  # and synonyms attached to the taxon nodes
  class ClassificationNormalizer
    attr_reader :error_names, :tree, :normalized_data, :dwc
    alias darwin_core dwc

    # dwc_instance - object exposing #core and #extensions; each of them
    #                must respond to #fields, #id and #read.
    def initialize(dwc_instance)
      @dwc = dwc_instance
      @core_fields = find_fields(@dwc.core)
      @extensions = @dwc.extensions.map { |e| [e, find_fields(e)] }
      @normalized_data = {}
      @synonyms = {}
      @name_strings = {}
      @vernacular_name_strings = {}
      @error_names = []
      @tree = {}
    end

    # Registers a scientific name string (each string is stored once).
    def add_name_string(name_string)
      @name_strings[name_string] ||= 1
    end

    # Registers a vernacular name string (each string is stored once).
    def add_vernacular_name_string(name_string)
      return if @vernacular_name_strings[name_string]

      @vernacular_name_strings[name_string] = 1
    end

    # Collected scientific name strings; pass with_hash: true to get the
    # underlying hash instead of the array of keys.
    def name_strings(opts = {})
      process_strings(@name_strings, opts)
    end

    # Collected vernacular name strings; same options as #name_strings.
    def vernacular_name_strings(opts = {})
      process_strings(@vernacular_name_strings, opts)
    end

    # Ingests the core file, calculates classification paths and (unless
    # with_extensions: false) ingests extensions.
    #
    # opts - :with_canonical_names (default true), :with_extensions
    #        (default true)
    #
    # Returns the hash of normalized taxa keyed by taxon id.
    def normalize(opts = {})
      opts = { with_canonical_names: true,
               with_extensions: true }.merge(opts)
      @with_canonical_names = opts[:with_canonical_names]
      log("Started normalization of the classification")
      ingest_core
      log("Calculating the classification parent/child paths")
      if parent_id?
        calculate_classification_path
      else
        @normalized_data.each_key { |id| @tree[id] = {} }
      end
      log("Ingesting data from extensions")
      ingest_extensions if opts[:with_extensions]
      @normalized_data
    end

    private

    def log(message)
      DarwinCore.logger_write(@dwc.object_id, message)
    end

    def process_strings(strings, opts)
      opts = { with_hash: false }.merge(opts)
      opts[:with_hash] ? strings : strings.keys
    end

    # Canonical names are optional: they are nil when normalization runs
    # with with_canonical_names: false, and such values must not end up in
    # the list of name strings.
    def add_canonical_name_string(canonical_name)
      return if canonical_name.nil? || canonical_name.empty?

      add_name_string(canonical_name)
    end

    def get_canonical_name(a_scientific_name)
      return nil unless @with_canonical_names

      canonical_name = Biodiversity::Parser.parse(a_scientific_name).
                       dig(:canonical, :simple)
      canonical_name.to_s.empty? ? a_scientific_name : canonical_name
    end

    # Maps the lower-cased last segment of every field term to its column
    # index; the id column (when declared) is stored under :id.
    def find_fields(element)
      data = element.fields.each_with_object({}) do |f, h|
        field = f[:term].split("/")[-1]
        field = field ? field.downcase.to_sym : ""
        h[field] = f[:index].to_i
      end
      data[:id] = element.id[:index] if element.id
      data
    end

    def status_synonym?(status)
      status&.match?(/^syn/)
    end

    # Attaches a core row as a synonym of the taxon whose id is stored in
    # column +taxon_id+ of the row (note: +taxon_id+ is a column index).
    def add_synonym_from_core(taxon_id, row)
      cf = @core_fields
      # NOTE(review): this stores the column index, while
      # #find_parent looks the value up as a key of @normalized_data;
      # row[taxon_id] was probably intended. Left unchanged because fixing
      # it alters error_names and path output -- confirm with specs.
      @synonyms[row[cf[:id]]] = taxon_id
      @normalized_data[row[taxon_id]] ||= DarwinCore::TaxonNormalized.new

      taxon = @normalized_data[row[taxon_id]]
      synonym = SynonymNormalized.new(
        row[cf[:id]],
        row[cf[:scientificname]],
        row[cf[:canonicalname]],
        cf[:taxonomicstatus] ? row[cf[:taxonomicstatus]] : nil,
        cf[:source] ? row[cf[:source]] : nil,
        cf[:localid] ? row[cf[:localid]] : nil,
        cf[:globalid] ? row[cf[:globalid]] : nil
      )
      taxon.synonyms << synonym
      add_name_string(synonym.name)
      add_canonical_name_string(synonym.canonical_name)
    end

    # Normalizes the scientific name of +row+ in place and appends the
    # canonical name as a new last column, remembered in
    # fields[:canonicalname].
    def set_scientific_name(row, fields)
      name_idx = fields[:scientificname]
      row[name_idx] = "N/A" unless row[name_idx]
      canonical_name = nil
      scientific_name = row[name_idx].strip
      if separate_canonical_and_authorship?(row, fields)
        canonical_name = scientific_name if @with_canonical_names
        authorship = row[fields[:scientificnameauthorship]].strip
        scientific_name = "#{scientific_name} #{authorship}"
      elsif @with_canonical_names
        canonical_name = get_canonical_name(row[name_idx])
      end
      fields[:canonicalname] = row.size
      row << canonical_name
      row[name_idx] = scientific_name
    end

    # True when the row carries an authorship value that is not already
    # part of the scientific name.
    def separate_canonical_and_authorship?(row, fields)
      authorship = ""
      authorship = row[fields[:scientificnameauthorship]].to_s.strip if fields[:scientificnameauthorship]
      !(authorship.empty? || row[fields[:scientificname]].index(authorship))
    end

    def ingest_core
      @normalized_data = {}
      unless @core_fields[:id] && @core_fields[:scientificname]
        raise(DarwinCore::CoreFileError,
              "Darwin Core core fields must contain taxon id and scientific name")
      end

      @dwc.core.read do |rows|
        rows[1].each do |error|
          @error_names << { data: error, error: :reading_or_encoding_error }
        end
        rows[0].each { |row| ingest_core_row(row) }
      end
    end

    # Routes one core row: synonym by accepted id, synonym by status, or a
    # regular taxon.
    def ingest_core_row(row)
      set_scientific_name(row, @core_fields)
      if synonym_by_accepted_id?(row)
        add_synonym_from_core(@core_fields[:acceptednameusageid], row)
      elsif synonym_by_status?(row)
        add_synonym_from_core(parent_id, row) if parent_id?
      else
        add_taxon_from_core(row)
      end
    end

    # Core has AcceptedNameUsageId and it points to a different record.
    def synonym_by_accepted_id?(row)
      accepted_idx = @core_fields[:acceptednameusageid]
      accepted_idx && row[accepted_idx] &&
        row[accepted_idx] != row[@core_fields[:id]]
    end

    # Without AcceptedNameUsageId the taxonomic status decides.
    def synonym_by_status?(row)
      !@core_fields[:acceptednameusageid] &&
        @core_fields[:taxonomicstatus] &&
        status_synonym?(row[@core_fields[:taxonomicstatus]])
    end

    def new_taxon
      gnub_archive? ? DarwinCore::GnubTaxon.new : DarwinCore::TaxonNormalized.new
    end

    def add_taxon_from_core(row)
      cf = @core_fields
      taxon = (@normalized_data[row[cf[:id]]] ||= new_taxon)
      if gnub_archive?
        taxon.uuid = row[cf[:originalnameusageid]]
        taxon.uuid_path = row[cf[:originalnameusageidpath]].split("|")
      end
      taxon.id = row[cf[:id]]
      taxon.current_name = row[cf[:scientificname]]
      taxon.current_name_canonical = row[cf[:canonicalname]]
      taxon.parent_id = parent_id? ? row[parent_id] : nil
      taxon.rank = row[cf[:taxonrank]] if cf[:taxonrank]
      taxon.status = row[cf[:taxonomicstatus]] if cf[:taxonomicstatus]
      taxon.source = row[cf[:source]] if cf[:source]
      taxon.local_id = row[cf[:localid]] if cf[:localid]
      taxon.global_id = row[cf[:globalid]] if cf[:globalid]
      taxon.linnean_classification_path =
        get_linnean_classification_path(row, taxon)
      add_name_string(taxon.current_name)
      add_canonical_name_string(taxon.current_name_canonical)
    end

    # Memoized with defined? so that a false result is cached as well.
    def parent_id?
      return @has_parent_id if defined?(@has_parent_id)

      @has_parent_id = @core_fields.key?(:highertaxonid) ||
                       @core_fields.key?(:parentnameusageid)
    end

    def parent_id
      @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
    end

    def calculate_classification_path
      @paths_num = 0
      @normalized_data.each_value do |taxon|
        get_classification_path(taxon) if taxon.classification_path_id.empty?
      end
    end

    # Fills classification_path / classification_path_id of +taxon+
    # (calculating the parent's path first when needed) and attaches the
    # taxon to @tree. Returns "error" when the path cannot be built and nil
    # when the taxon already has a path.
    def get_classification_path(taxon)
      return unless taxon.classification_path_id.empty?

      @paths_num += 1
      log("Calculated #{@paths_num} paths") if (@paths_num % 10_000).zero?
      return add_root_taxon(taxon) if DarwinCore.nil_field?(taxon.parent_id)

      parent = find_parent(taxon)
      return "error" unless parent
      return "error" if parent.classification_path_id.empty? &&
                        calculate_parent_path(taxon, parent) == "error"

      append_to_parent_path(taxon, parent)
      attach_to_tree(taxon, parent)
    end

    def add_root_taxon(taxon)
      taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
      taxon.classification_path_id << taxon.id
      @tree.merge!(taxon.id => {})
    end

    # Returns the parent taxon. When the parent id is not a known taxon an
    # error is recorded and the taxon found through @synonyms (possibly
    # nil) is returned instead.
    def find_parent(taxon)
      parent = @normalized_data[taxon.parent_id]
      return parent if parent

      current_parent = @normalized_data[@synonyms[taxon.parent_id]]
      @error_names << { data: taxon,
                        error: :deprecated_parent,
                        current_parent: current_parent }
      current_parent
    end

    # Recursion depth follows the depth of the hierarchy; a stack overflow
    # is recorded as an error instead of aborting the normalization.
    def calculate_parent_path(taxon, parent)
      get_classification_path(parent)
    rescue SystemStackError
      @error_names << { data: taxon,
                        error: :too_deep_hierarchy,
                        current_parent: nil }
      "error"
    end

    def append_to_parent_path(taxon, parent)
      if @with_canonical_names
        taxon.classification_path += parent.classification_path +
                                     [taxon.current_name_canonical]
      end
      taxon.classification_path_id += parent.classification_path_id +
                                      [taxon.id]
    end

    # Walks @tree along the parent's id path and adds the taxon below it.
    # A missing branch is logged and reported as "error" instead of raising.
    def attach_to_tree(taxon, parent)
      parent_node = parent.classification_path_id.inject(@tree) do |node, id|
        node && node[id]
      end
      if parent_node
        parent_node.merge!(taxon.id => {})
      else
        log("Error 'parent node is missing in the tree' taxon #{taxon.id}")
        "error"
      end
    end

    def ingest_extensions
      @extensions.each do |extension|
        ext, fields = *extension
        synonyms_file = File.basename(ext.file_path).match?(/synonym/i)
        ingest_synonyms(extension) if synonyms_file && fields.key?(:scientificname)
        ingest_vernaculars(extension) if fields.key?(:vernacularname)
      end
    end

    def ingest_synonyms(extension)
      log("Ingesting synonyms extension")
      ext, fields = *extension
      ext.read do |rows|
        rows[0].each do |row|
          synonym = process_synonym(row, fields)
          add_synonym(synonym, row, fields)
        end
      end
    end

    def add_synonym(synonym, record, fields)
      taxon = @normalized_data[record[fields[:id]]]
      if taxon
        taxon.synonyms << synonym
        add_name_string(synonym.name)
        add_canonical_name_string(synonym.canonical_name)
      else
        @error_names << { taxon: synonym,
                          error: :synonym_of_unknown_taxa }
      end
    end

    def process_synonym(record, fields)
      set_scientific_name(record, fields)
      SynonymNormalized.new(
        nil,
        record[fields[:scientificname]],
        record[fields[:canonicalname]],
        fields[:taxonomicstatus] ? record[fields[:taxonomicstatus]] : nil,
        fields[:source] ? record[fields[:source]] : nil,
        fields[:localid] ? record[fields[:localid]] : nil,
        fields[:globalid] ? record[fields[:globalid]] : nil
      )
    end

    def ingest_vernaculars(extension)
      log("Ingesting vernacular names extension")
      ext, fields = *extension
      ext.read do |rows|
        rows[0].each { |row| extract_vernaculars_from_row(row, fields) }
      end
    end

    def extract_vernaculars_from_row(row, fields)
      language = find_vernacular_language(row, fields)
      locality = fields[:locality] ? row[fields[:locality]] : nil
      country_code = fields[:countrycode] ? row[fields[:countrycode]] : nil

      vernacular = VernacularNormalized.new(
        row[fields[:vernacularname]], language, locality, country_code
      )
      taxon = @normalized_data[row[fields[:id]]]
      if taxon
        taxon.vernacular_names << vernacular
        add_vernacular_name_string(vernacular.name)
      else
        @error_names << { vernacular_name: vernacular,
                          error: :vernacular_of_unknown_taxa }
      end
    end

    # Prefers the language column and falls back to the languageCode column.
    def find_vernacular_language(row, fields)
      (fields[:language] && row[fields[:language]]) ||
        (fields[:languagecode] && row[fields[:languagecode]]) || nil
    end

    # Collects [value, clade] pairs for every Linnean rank column present
    # in the core file.
    def get_linnean_classification_path(row, _taxon)
      %i[kingdom phylum class order family genus
         subgenus].each_with_object([]) do |clade, res|
        res << [row[@core_fields[clade]], clade] if @core_fields[clade]
      end
    end

    def gnub_archive?
      @core_fields[:originalnameusageidpath]
    end
  end
end