dwc-archive 0.9.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Represents data from EML (Ecological Metadata Language) file.
  #
  # Wraps the nested Hash returned by the archive's +eml+ method and exposes
  # nil-safe readers for the values most commonly needed. Every reader
  # returns +nil+ when the corresponding branch of the document is missing.
  class Metadata
    attr_reader :metadata
    alias data metadata

    # @param archive [#eml, nil] object whose +eml+ method returns the parsed
    #   EML document as a Hash (or nil). When +archive+ itself is nil the
    #   metadata is nil and every reader returns nil.
    def initialize(archive = nil)
      @archive = archive
      @metadata = @archive&.eml
    end

    # @return [Object, nil] the +id+ attribute of the dataset element
    def id
      fix_nil { @metadata[:eml][:dataset][:attributes][:id] }
    end

    # @return [Object, nil] the +packageId+ attribute of the root eml element
    def package_id
      # @metadata is the Hash itself; it has no #data method, so the lookup
      # has to start from the Hash directly.
      fix_nil { @metadata[:eml][:attributes][:packageId] }
    end

    # @return [Object, nil] the dataset title
    def title
      fix_nil { @metadata[:eml][:dataset][:title] }
    end

    # Lists the dataset creators.
    #
    # @return [Array<Hash>, nil] one Hash per creator with +:first_name+,
    #   +:last_name+ and +:email+ keys (values are nil when the creator does
    #   not provide them), or nil when the dataset has no creator entry.
    def authors
      creators = fix_nil { @metadata[:eml][:dataset][:creator] }
      return nil if creators.nil?

      # A single creator is a Hash, several creators are an Array of Hashes.
      # Array() would split a Hash into pairs, hence wrap-and-flatten.
      [creators].flatten.map do |au|
        name = au[:individualName] || {}
        { first_name: name[:givenName],
          last_name: name[:surName],
          email: au[:electronicMailAddress] }
      end
    end

    # @return [Object, nil] the dataset abstract
    def abstract
      fix_nil { @metadata[:eml][:dataset][:abstract] }
    end

    # @return [Object, nil] the citation stored in additionalMetadata
    def citation
      fix_nil { @metadata[:eml][:additionalMetadata][:metadata][:citation] }
    end

    # @return [Object, nil] the online distribution URL of the dataset
    def url
      fix_nil { @metadata[:eml][:dataset][:distribution][:online][:url] }
    end

    private

    # Runs the block and converts a lookup through a missing (nil) branch
    # into a nil result.
    def fix_nil
      yield
    rescue NoMethodError
      nil
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Describes normalized taxon: a plain record of read/write attributes.
  class TaxonNormalized
    attr_accessor :id, :local_id, :global_id, :source, :parent_id,
                  :classification_path_id, :classification_path,
                  :linnean_classification_path, :current_name,
                  :current_name_canonical, :synonyms, :vernacular_names,
                  :rank, :status

    # Starts with nil identifiers/rank/status, empty strings for names and
    # source ids, and a separate empty Array for every collection attribute.
    def initialize
      @status = nil
      @rank = nil
      @parent_id = nil
      @id = nil

      # These four attributes start out referring to one and the same string.
      blank = ""
      @local_id = blank
      @source = blank
      @current_name_canonical = blank
      @current_name = blank
      @global_id = ""

      @classification_path = []
      @classification_path_id = []
      @synonyms = []
      @vernacular_names = []
      @linnean_classification_path = []
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
# Reopens DarwinCore solely to attach the version constant.
class DarwinCore
  # Version string of this release.
  VERSION = "1.1.1"
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Converts an XML document into nested Hashes with Symbol keys.
  #
  # USAGE: DarwinCore::XmlReader.from_xml(YOUR_XML_STRING)
  #
  # modified from
  # http://stackoverflow.com/questions/1230741/
  # convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
  module XmlReader
    # Parses +xml_io+ with Nokogiri and returns a single-entry Hash mapping
    # the root element name (as a Symbol) to the converted root content.
    def self.from_xml(xml_io)
      root = Nokogiri::XML(xml_io).root
      { root.name.to_sym => Node.new(root).value }
    end

    # Node is a helper class to parse xml into hash
    class Node
      def initialize(node)
        @node = node
        @val = {}
      end

      # Converted representation of the wrapped node: a Hash for elements
      # (or a scalar when the element holds nothing but one text child), and
      # the coerced content for any non-element node.
      def value
        return coerce(@node.content.to_s) unless @node.element?

        collect_attributes
        @node.children.each { |child| absorb(child) }
        @val
      end

      private

      # Strings that round-trip through Integer conversion become Integers;
      # everything else is handed back untouched.
      def coerce(data)
        return data unless data.instance_of?(String)

        number = data.to_i
        number.to_s == data ? number : data
      end

      # Stores the element's attributes under :attributes, when it has any.
      def collect_attributes
        attrs = @node.attributes
        return if attrs.empty?

        @val[:attributes] = {}
        attrs.each_value do |attr|
          @val[:attributes][attr.name.to_sym] = coerce(attr.value)
        end
      end

      # Folds one child node into the value being built.
      def absorb(child)
        converted = Node.new(child).value
        if child.name == "text"
          # Text only counts when it is the element's sole child; otherwise
          # it is dropped.
          only_child = !(child.next_sibling || child.previous_sibling)
          @val = coerce(converted) if only_child
        else
          store(child.name.to_sym, coerce(converted))
        end
      end

      # Records +item+ under +key+; repeated keys accumulate into an Array.
      def store(key, item)
        current = @val[key]
        if !current
          @val[key] = item
        elsif current.is_a?(Array)
          current << item
        else
          @val[key] = [current, item]
        end
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ <?xml version="1.0"?>
2
+ <eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:md="eml://ecoinformatics.org/methods-2.1.1" xmlns:proj="eml://ecoinformatics.org/project-2.1.1" xmlns:d="eml://ecoinformatics.org/dataset-2.1.1" xmlns:res="eml://ecoinformatics.org/resource-2.1.1" xmlns:dc="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" packageId="1234/2013-12-30::19:45:33" system="http://globalnames.org" xml:lang="en" xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd">
3
+ <dataset id="1234">
4
+ <title>Test Classification</title>
5
+ <license>http://creativecommons.org/licenses/by-sa/3.0/</license>
6
+ <creator id="1" scope="document">
7
+ <individualName>
8
+ <givenName>John</givenName>
9
+ <surName>Doe</surName>
10
+ </individualName>
11
+ <organizationName>Example</organizationName>
12
+ <positionName>Assistant Professor</positionName>
13
+ <onlineUrl>http://example.org</onlineUrl>
14
+ <electronicMailAddress>jdoe@example.com</electronicMailAddress>
15
+ </creator>
16
+ <creator id="2" scope="document">
17
+ <individualName>
18
+ <givenName>Jane</givenName>
19
+ <surName>Doe</surName>
20
+ </individualName>
21
+ <electronicMailAddress>jane@example.com</electronicMailAddress>
22
+ </creator>
23
+ <metadataProvider>
24
+ <individualName>
25
+ <givenName>Jim</givenName>
26
+ <surName>Doe</surName>
27
+ </individualName>
28
+ <onlineUrl>http://aggregator.example.org</onlineUrl>
29
+ <electronicMailAddress>jimdoe@example.com</electronicMailAddress>
30
+ </metadataProvider>
31
+ <pubDate>2013-12-30 14:45:33 -0500</pubDate>
32
+ <abstract>
33
+ <para>test classification</para>
34
+ </abstract>
35
+ <contact>
36
+ <references>1</references>
37
+ </contact>
38
+ <contact>
39
+ <references>2</references>
40
+ </contact>
41
+ </dataset>
42
+ <additionalMetadata>
43
+ <metadata>
44
+ <citation>Test classification: Doe John, Doe Jane, Taxnonmy, 10, 1, 2010</citation>
45
+ </metadata>
46
+ </additionalMetadata>
47
+ </eml:eml>
@@ -0,0 +1,19 @@
1
+ <?xml version="1.0"?>
2
+ <archive xmlns="http://rs.tdwg.org/dwc/text/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd">
3
+ <core encoding="UTF-8" fieldsTerminatedBy="," fieldsEnclosedBy="&quot;" linesTerminatedBy="&#10;" rowType="http://rs.tdwg.org/dwc/terms/Taxon" ignoreHeaderLines="1">
4
+ <files>
5
+ <location>core.csv</location>
6
+ </files>
7
+ <id index="0"/>
8
+ <field term="http://rs.tdwg.org/dwc/terms/parentNameUsageID" index="1"/>
9
+ <field term="http://rs.tdwg.org/dwc/terms/scientificName" index="2"/>
10
+ <field term="http://rs.tdwg.org/dwc/terms/taxonRank" index="3"/>
11
+ </core>
12
+ <extension encoding="UTF-8" fieldsTerminatedBy="," fieldsEnclosedBy="&quot;" linesTerminatedBy="&#10;" rowType="http://rs.gbif.org/terms/1.0/VernacularName" ignoreHeaderLines="1">
13
+ <files>
14
+ <location>vern.csv</location>
15
+ </files>
16
+ <coreid index="0"/>
17
+ <field term="http://rs.tdwg.org/dwc/terms/vernacularName" index="1"/>
18
+ </extension>
19
+ </archive>
@@ -0,0 +1,214 @@
1
+ # frozen_string_literal: true
2
+
3
# Specs for DarwinCore::ClassificationNormalizer. Each example builds a
# DarwinCore archive from a fixture under ../files (relative to this spec's
# directory) and inspects what the normalizer produces from it.
describe DarwinCore::ClassificationNormalizer do
  # The archive is a collaborator, not the object under test, so it is a
  # plain `let`; only the normalizer is declared as the subject.
  let(:dwca) { DarwinCore.new(file_path) }
  subject(:normalizer) { DarwinCore::ClassificationNormalizer.new(dwca) }

  let(:file_dir) { File.expand_path("../files", __dir__) }
  # Contexts select a fixture by overriding either file_name or file_path.
  let(:file_path) { File.join(file_dir, file_name) }

  describe ".new" do
    let(:file_path) { File.join(file_dir, "data.tar.gz") }

    it do
      expect(normalizer).to be_a DarwinCore::ClassificationNormalizer
    end
  end

  describe "#normalize" do
    # Default fixture for every context below that does not override it.
    let(:file_name) { "data.tar.gz" }

    it "returns normalized data" do
      res = normalizer.normalize
      expect(res).to be normalizer.normalized_data
    end

    context "flat list" do
      let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

      it "returns flat list" do
        normalizer.normalize
        expect(normalizer.normalized_data).to be_kind_of Hash
        expect(normalizer.normalized_data.size).to be > 0
      end
    end

    context "synonyms from core" do
      let(:file_name) { "synonyms_in_core_accepted_name_field.tar.gz" }

      it "ingests synonyms using accepted_name field" do
        res = normalizer.normalize
        syn = res.reject { |_, v| v.synonyms.empty? }.values
        expect(syn.size).to be > 0
        expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
      end
    end

    context "synonyms from extension" do
      let(:file_name) { "synonyms_in_extension.tar.gz" }

      it "ingests synonyms from extension" do
        res = normalizer.normalize
        syn = res.reject { |_, v| v.synonyms.empty? }.values
        expect(syn.size).to be > 0
        expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
      end
    end

    context "synonyms are not extensions" do
      let(:file_name) { "not_synonym_in_extension.tar.gz" }

      it "does not ingest synonyms" do
        res = normalizer.normalize
        syn = res.reject { |_, v| v.synonyms.empty? }.values
        expect(syn).to be_empty
      end
    end

    context "with_extensions flag set on false" do
      let(:file_name) { "synonyms_in_extension.tar.gz" }

      it "should not harvest extensions" do
        res = normalizer.normalize(with_extensions: false)
        syn = res.reject { |_, v| v.synonyms.empty? }.values
        expect(syn).to be_empty
      end
    end

    context "linnean classification in file (class, order etc fields)" do
      let(:file_name) { "linnean.tar.gz" }

      it "assembles classification" do
        res = normalizer.normalize
        expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
        expect(res.first[1].linnean_classification_path).
          to eq [["Animalia", :kingdom],
                 ["Arthropoda", :phylum],
                 ["Insecta", :class],
                 ["Diptera", :order],
                 ["Cecidomyiidae", :family],
                 ["Resseliella", :genus]]
      end
    end

    # Uses the default file_name declared at the top of this describe.
    context "no linnean fields are given" do
      it "returns empty linnean classification" do
        res = normalizer.normalize
        expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
        expect(res.first[1].linnean_classification_path).to be_empty
      end
    end

    context "in the presence of scientificNameAuthorship field" do
      let(:file_name) { "sci_name_authorship.tar.gz" }

      it "returns normalized data" do
        normalizer.normalize
        expect(normalizer.darwin_core.file_name).
          to eq "sci_name_authorship.tar.gz"
        expect(normalizer.normalized_data).to be_kind_of Hash
        expect(normalizer.normalized_data.size).to be > 0
        tn = normalizer.normalized_data["leptogastrinae:tid:2688"]
        expect(tn.current_name).to eq "Leptogaster fornicata Martin, 1957"
        expect(tn.current_name_canonical).to eq "Leptogaster fornicata"
      end
    end

    context "when scientificNameAuthorship duplicates author info" do
      let(:file_name) { "sci_name_authorship_dup.tar.gz" }

      it "returns normalized data" do
        normalizer.normalize
        expect(normalizer.darwin_core.file_name).
          to eq "sci_name_authorship_dup.tar.gz"
        expect(normalizer.normalized_data).to be_kind_of Hash
        expect(normalizer.normalized_data.size).to be > 0
        tn = normalizer.normalized_data["leptogastrinae:tid:2688"]
        expect(tn.current_name).to eq "Leptogaster fornicata Martin, 1957"
        expect(tn.current_name_canonical).to eq "Leptogaster fornicata"
      end
    end

    context "coreid is empty" do
      let(:file_name) { "empty_coreid.tar.gz" }

      it "should ingest information" do
        res = normalizer.normalize
        expect(normalizer.darwin_core.file_name).
          to eq "empty_coreid.tar.gz"
        tn = res["Taxon9"]
        expect(tn.current_name).to eq "Amanita phalloides"
      end
    end

    context "vernacular locality info" do
      let(:file_name) { "language_locality.tar.gz" }

      it "should ingest locality and language" do
        res = normalizer.normalize
        tn = res["leptogastrinae:tid:42"]
        vn = tn.vernacular_names[0]
        expect(vn.language).to eq "en"
        expect(vn.locality).to eq "New England"
      end
    end
  end

  describe "#name_strings" do
    let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

    context "before running #normalize" do
      it "is empty" do
        expect(normalizer.name_strings).to be_empty
      end
    end

    context "after running #normalize" do
      # tap runs #normalize for its side effect and hands back the normalizer.
      let(:normalized) { normalizer.tap(&:normalize) }

      context "default attributes" do
        it "returns array" do
          expect(normalized.name_strings).to be_kind_of Array
          expect(normalized.name_strings.size).to be > 1
        end
      end

      context "with_hash attribute" do
        it "returns hash" do
          strings = normalized.name_strings(with_hash: true)
          expect(strings).to be_kind_of Hash
          expect(strings.size).to be > 1
          expect(strings.values.uniq).to eq [1]
        end
      end
    end
  end

  describe "#vernacular_name_strings" do
    let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

    context "before running #normalize" do
      subject(:vern) { normalizer.vernacular_name_strings }

      it "is empty" do
        expect(vern).to be_empty
      end
    end

    context "after running #normalize" do
      let(:normalized) { normalizer.tap(&:normalize) }
      # One subject per scope: the array form is a `let`, and the hash form
      # remains the subject, as it was the last one declared.
      let(:vern) { normalized.vernacular_name_strings }
      subject(:vern_w_hash) do
        normalized.vernacular_name_strings(with_hash: true)
      end

      context "default attributes" do
        it "returns array" do
          expect(vern).to be_kind_of Array
          expect(vern.size).to be > 0
        end
      end

      context "with_hash attribute" do
        it "returns hash" do
          expect(vern_w_hash).to be_kind_of Hash
          expect(vern_w_hash.size).to be > 0
          expect(vern_w_hash.values.uniq).to eq [1]
        end
      end
    end
  end
end