dwc-archive 0.9.5 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +5 -4
  7. data/CHANGELOG +17 -5
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +143 -111
  11. data/Rakefile +13 -49
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +394 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/files/gnub.tar.gz +0 -0
  36. data/spec/lib/classification_normalizer_spec.rb +214 -0
  37. data/spec/lib/core_spec.rb +100 -0
  38. data/spec/lib/darwin_core_spec.rb +249 -0
  39. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  40. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  41. data/spec/lib/generator_spec.rb +124 -0
  42. data/spec/lib/gnub_taxon_spec.rb +32 -0
  43. data/spec/lib/metadata_spec.rb +89 -0
  44. data/spec/lib/taxon_normalized_spec.rb +142 -0
  45. data/spec/lib/xml_reader_spec.rb +11 -11
  46. data/spec/spec_helper.rb +78 -7
  47. metadata +181 -92
  48. data/.rvmrc +0 -1
  49. data/Gemfile.lock +0 -155
  50. data/VERSION +0 -1
  51. data/lib/dwc-archive.rb +0 -95
  52. data/lib/dwc-archive/.expander.rb.swo +0 -0
  53. data/lib/dwc-archive/archive.rb +0 -37
  54. data/lib/dwc-archive/classification_normalizer.rb +0 -332
  55. data/lib/dwc-archive/core.rb +0 -17
  56. data/lib/dwc-archive/expander.rb +0 -80
  57. data/lib/dwc-archive/generator.rb +0 -75
  58. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  59. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  60. data/lib/dwc-archive/ingester.rb +0 -101
  61. data/lib/dwc-archive/metadata.rb +0 -42
  62. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  63. data/lib/dwc-archive/xml_reader.rb +0 -64
  64. data/spec/lib/dwc-archive_spec.rb +0 -236
  65. data/spec/spec.opts +0 -1
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Represents data from EML (Ecological Metadata Language) file.
  #
  # Every accessor walks the nested hash returned by `archive.eml` and
  # returns nil when any step of the path is missing.
  # NOTE(review): the hash is presumably the one built by
  # DarwinCore::XmlReader.from_xml (symbol keys, :attributes sub-hash) --
  # confirm against the archive class.
  class Metadata
    attr_reader :metadata
    alias data metadata

    # @param archive [#eml, nil] object exposing the parsed EML as `eml`;
    #   with nil (the default) every accessor simply returns nil
    def initialize(archive = nil)
      @archive = archive
      # Safe navigation keeps the documented nil default usable.
      @metadata = @archive&.eml
    end

    # @return [Object, nil] the `id` attribute of the dataset element
    def id
      fix_nil { @metadata[:eml][:dataset][:attributes][:id] }
    end

    # @return [Object, nil] the `packageId` attribute of the root eml element
    def package_id
      # @metadata is the hash itself; `data` is an alias defined on this
      # class, not on the hash, so it must not be called on @metadata.
      fix_nil { @metadata[:eml][:attributes][:packageId] }
    end

    # @return [Object, nil] dataset title
    def title
      fix_nil { @metadata[:eml][:dataset][:title] }
    end

    # Lists dataset creators.
    #
    # @return [Array<Hash>, nil] one hash per creator with :first_name,
    #   :last_name and :email keys (each nil when absent); nil when the
    #   dataset has no creator entry at all
    def authors
      creators = fix_nil { @metadata[:eml][:dataset][:creator] }
      return nil if creators.nil?

      # A single creator arrives as one hash, several as an array of hashes.
      [creators].flatten.map { |creator| author_from(creator) }
    end

    # @return [Object, nil] dataset abstract
    def abstract
      fix_nil { @metadata[:eml][:dataset][:abstract] }
    end

    # @return [Object, nil] citation stored under additionalMetadata
    def citation
      fix_nil { @metadata[:eml][:additionalMetadata][:metadata][:citation] }
    end

    # @return [Object, nil] online distribution url of the dataset
    def url
      fix_nil { @metadata[:eml][:dataset][:distribution][:online][:url] }
    end

    private

    # Builds the author hash for one creator entry, tolerating creators
    # that have no individualName (e.g. organization-only creators).
    def author_from(creator)
      { first_name: fix_nil { creator[:individualName][:givenName] },
        last_name: fix_nil { creator[:individualName][:surName] },
        email: fix_nil { creator[:electronicMailAddress] } }
    end

    # Runs the block and converts a broken lookup chain (indexing into nil)
    # into a plain nil result.
    def fix_nil
      yield
    rescue NoMethodError
      nil
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Plain data holder for one normalized taxon. All fields are readable
  # and writable; a new instance starts with nil identifiers/rank/status,
  # empty name strings and empty collections.
  class TaxonNormalized
    attr_accessor :id, :local_id, :global_id, :source, :parent_id
    attr_accessor :classification_path_id, :classification_path,
                  :linnean_classification_path
    attr_accessor :current_name, :current_name_canonical
    attr_accessor :synonyms, :vernacular_names
    attr_accessor :rank, :status

    def initialize
      @id = nil
      @parent_id = nil
      @rank = nil
      @status = nil

      # These four fields deliberately start out as the same string object.
      blank = ""
      @local_id = blank
      @source = blank
      @current_name_canonical = blank
      @current_name = blank
      @global_id = ""

      # Each collection gets its own array.
      @classification_path = []
      @classification_path_id = []
      @linnean_classification_path = []
      @synonyms = []
      @vernacular_names = []
    end
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
# Reopens DarwinCore to define the library's release version string.
class DarwinCore
  VERSION = "1.1.0"
end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
class DarwinCore
  # Converts an XML document into nested Ruby hashes keyed by symbols.
  #
  # USAGE: DarwinCore::XmlReader.from_xml(YOUR_XML_STRING)
  # modified from
  # http://stackoverflow.com/questions/1230741/
  # convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
  module XmlReader
    # Parses the XML with Nokogiri and wraps the converted root element
    # in a one-entry hash keyed by the root element name.
    #
    # @param xml_io [String, IO] XML source handed to Nokogiri::XML
    # @return [Hash] { root_name_symbol => converted root value }
    def self.from_xml(xml_io)
      root = Nokogiri::XML(xml_io).root
      { root.name.to_sym => self::Node.new(root).value }
    end

    # Node is a helper class to parse xml into hash.
    #
    # Conversion rules, as implemented below:
    # * attributes go under the :attributes key;
    # * each child element becomes a key named after the element;
    #   repeated names are collected into an array;
    # * an element whose only child is a text node collapses to that text;
    # * strings that are the canonical form of an integer become Integers.
    class Node
      # @param node [Object] a Nokogiri-style node (responds to element?,
      #   text?, content, attributes, children, name and sibling readers)
      def initialize(node)
        @node = node
        @val = {}
      end

      # @return [Hash, String, Integer] converted value of the wrapped node
      def value
        @node.element? ? prepare_node_element : prepare(@node.content.to_s)
      end

      private

      def prepare_node_element
        add_attributes
        # Iterating an empty child list is a no-op, so no size guard needed.
        add_children
        @val
      end

      # Turns "42" into 42 but leaves "042", "4.2" or non-strings untouched.
      def prepare(data)
        data.is_a?(String) && data.to_i.to_s == data ? data.to_i : data
      end

      def add_attributes
        # Fetch once: the attribute hash is only needed for this pass.
        attributes = @node.attributes
        return if attributes.empty?

        @val[:attributes] = {}
        attributes.each_value do |attribute|
          add_attribute(@val[:attributes], attribute)
        end
      end

      def add_attribute(attributes, attribute)
        attributes[attribute.name.to_sym] = prepare(attribute.value)
      end

      def add_children
        @node.children.each { |child| process_child(child) }
      end

      def process_child(child)
        value = DarwinCore::XmlReader::Node.new(child).value
        # Ask the node for its type instead of comparing its name with
        # "text": an element that happens to be called <text> must be
        # stored as a regular child, not mistaken for character data.
        if child.text?
          handle_text(child, value)
        else
          add_child_to_value(child, value)
        end
      end

      def add_child_to_value(child, value)
        key = child.name.to_sym
        if @val[key]
          handle_child_node(key, value)
        else
          @val[key] = prepare(value)
        end
      end

      # Second and later occurrences of the same element name are gathered
      # into an array, preserving document order.
      def handle_child_node(child, val)
        if @val[child].is_a?(Array)
          @val[child] << prepare(val)
        else
          @val[child] = [@val[child], prepare(val)]
        end
      end

      # Text is kept only when it is the sole child; text mixed between
      # sibling nodes (typically indentation whitespace) is ignored.
      def handle_text(child, val)
        @val = prepare(val) unless child.next_sibling || child.previous_sibling
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ <?xml version="1.0"?>
2
+ <eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:md="eml://ecoinformatics.org/methods-2.1.1" xmlns:proj="eml://ecoinformatics.org/project-2.1.1" xmlns:d="eml://ecoinformatics.org/dataset-2.1.1" xmlns:res="eml://ecoinformatics.org/resource-2.1.1" xmlns:dc="http://purl.org/dc/terms/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" packageId="1234/2013-12-30::19:45:33" system="http://globalnames.org" xml:lang="en" xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd">
3
+ <dataset id="1234">
4
+ <title>Test Classification</title>
5
+ <license>http://creativecommons.org/licenses/by-sa/3.0/</license>
6
+ <creator id="1" scope="document">
7
+ <individualName>
8
+ <givenName>John</givenName>
9
+ <surName>Doe</surName>
10
+ </individualName>
11
+ <organizationName>Example</organizationName>
12
+ <positionName>Assistant Professor</positionName>
13
+ <onlineUrl>http://example.org</onlineUrl>
14
+ <electronicMailAddress>jdoe@example.com</electronicMailAddress>
15
+ </creator>
16
+ <creator id="2" scope="document">
17
+ <individualName>
18
+ <givenName>Jane</givenName>
19
+ <surName>Doe</surName>
20
+ </individualName>
21
+ <electronicMailAddress>jane@example.com</electronicMailAddress>
22
+ </creator>
23
+ <metadataProvider>
24
+ <individualName>
25
+ <givenName>Jim</givenName>
26
+ <surName>Doe</surName>
27
+ </individualName>
28
+ <onlineUrl>http://aggregator.example.org</onlineUrl>
29
+ <electronicMailAddress>jimdoe@example.com</electronicMailAddress>
30
+ </metadataProvider>
31
+ <pubDate>2013-12-30 14:45:33 -0500</pubDate>
32
+ <abstract>
33
+ <para>test classification</para>
34
+ </abstract>
35
+ <contact>
36
+ <references>1</references>
37
+ </contact>
38
+ <contact>
39
+ <references>2</references>
40
+ </contact>
41
+ </dataset>
42
+ <additionalMetadata>
43
+ <metadata>
44
+ <citation>Test classification: Doe John, Doe Jane, Taxnonmy, 10, 1, 2010</citation>
45
+ </metadata>
46
+ </additionalMetadata>
47
+ </eml:eml>
@@ -0,0 +1,19 @@
1
+ <?xml version="1.0"?>
2
+ <archive xmlns="http://rs.tdwg.org/dwc/text/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd">
3
+ <core encoding="UTF-8" fieldsTerminatedBy="," fieldsEnclosedBy="&quot;" linesTerminatedBy="&#10;" rowType="http://rs.tdwg.org/dwc/terms/Taxon" ignoreHeaderLines="1">
4
+ <files>
5
+ <location>core.csv</location>
6
+ </files>
7
+ <id index="0"/>
8
+ <field term="http://rs.tdwg.org/dwc/terms/parentNameUsageID" index="1"/>
9
+ <field term="http://rs.tdwg.org/dwc/terms/scientificName" index="2"/>
10
+ <field term="http://rs.tdwg.org/dwc/terms/taxonRank" index="3"/>
11
+ </core>
12
+ <extension encoding="UTF-8" fieldsTerminatedBy="," fieldsEnclosedBy="&quot;" linesTerminatedBy="&#10;" rowType="http://rs.gbif.org/terms/1.0/VernacularName" ignoreHeaderLines="1">
13
+ <files>
14
+ <location>vern.csv</location>
15
+ </files>
16
+ <coreid index="0"/>
17
+ <field term="http://rs.tdwg.org/dwc/terms/vernacularName" index="1"/>
18
+ </extension>
19
+ </archive>
Binary file
@@ -0,0 +1,214 @@
1
+ # frozen_string_literal: true
2
+
3
describe DarwinCore::ClassificationNormalizer do
  subject(:dwca) { DarwinCore.new(file_path) }
  subject(:normalizer) { described_class.new(dwca) }

  # Archive fixtures live in spec/files; each group picks one by name
  # (file_name) or overrides the full path (file_path).
  let(:file_dir) { File.expand_path("../files", __dir__) }
  let(:file_path) { File.join(file_dir, file_name) }

  describe ".new" do
    let(:file_path) { File.join(file_dir, "data.tar.gz") }

    it do
      expect(normalizer.is_a?(described_class)).to be true
    end
  end

  describe "#normalize" do
    let(:file_name) { "data.tar.gz" }

    # Taxa from a normalization result that carry at least one synonym.
    def taxa_with_synonyms(normalized)
      normalized.values.reject { |taxon| taxon.synonyms.empty? }
    end

    it "returns normalized data" do
      expect(normalizer.normalize).to equal(normalizer.normalized_data)
    end

    context "flat list" do
      let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

      it "returns flat list" do
        normalizer.normalize
        data = normalizer.normalized_data
        expect(data).to be_kind_of Hash
        expect(data.size).to be > 0
      end
    end

    context "synonyms from core" do
      let(:file_name) { "synonyms_in_core_accepted_name_field.tar.gz" }

      it "ingests synonyms using accepted_name field" do
        taxa = taxa_with_synonyms(normalizer.normalize)
        expect(taxa.size).to be > 0
        expect(taxa[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
      end
    end

    context "synonyms from extension" do
      let(:file_name) { "synonyms_in_extension.tar.gz" }

      it "ingests synonyms from extension" do
        taxa = taxa_with_synonyms(normalizer.normalize)
        expect(taxa.size).to be > 0
        expect(taxa[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
      end
    end

    context "synonyms are not extensions" do
      let(:file_name) { "not_synonym_in_extension.tar.gz" }

      it "does not ingest synonyms" do
        expect(taxa_with_synonyms(normalizer.normalize)).to be_empty
      end
    end

    context "with_extensions flag set on false" do
      let(:file_name) { "synonyms_in_extension.tar.gz" }

      it "should not harvest extensions" do
        normalized = normalizer.normalize(with_extensions: false)
        expect(taxa_with_synonyms(normalized)).to be_empty
      end
    end

    context "linnean classification in file (class, order etc fields)" do
      let(:file_name) { "linnean.tar.gz" }
      let(:expected_path) do
        [["Animalia", :kingdom],
         ["Arthropoda", :phylum],
         ["Insecta", :class],
         ["Diptera", :order],
         ["Cecidomyiidae", :family],
         ["Resseliella", :genus]]
      end

      it "assembles classification" do
        first_taxon = normalizer.normalize.first[1]
        expect(first_taxon).to be_kind_of DarwinCore::TaxonNormalized
        expect(first_taxon.linnean_classification_path).to eq expected_path
      end
    end

    context "no linnean fields are given" do
      it "returns empty linnean classification" do
        first_taxon = normalizer.normalize.first[1]
        expect(first_taxon).to be_kind_of DarwinCore::TaxonNormalized
        expect(first_taxon.linnean_classification_path).to be_empty
      end
    end

    context "in the presence of scientificNameAuthorship field" do
      let(:file_name) { "sci_name_authorship.tar.gz" }

      it "returns normalized data" do
        normalizer.normalize
        expect(normalizer.darwin_core.file_name)
          .to eq "sci_name_authorship.tar.gz"
        expect(normalizer.normalized_data).to be_kind_of Hash
        expect(normalizer.normalized_data.size).to be > 0
        taxon = normalizer.normalized_data["leptogastrinae:tid:2688"]
        expect(taxon.current_name).to eq "Leptogaster fornicata Martin, 1957"
        expect(taxon.current_name_canonical).to eq "Leptogaster fornicata"
      end
    end

    context "when scientificNameAuthorship duplicates author info" do
      let(:file_name) { "sci_name_authorship_dup.tar.gz" }

      it "returns normalized data" do
        normalizer.normalize
        expect(normalizer.darwin_core.file_name)
          .to eq "sci_name_authorship_dup.tar.gz"
        expect(normalizer.normalized_data).to be_kind_of Hash
        expect(normalizer.normalized_data.size).to be > 0
        taxon = normalizer.normalized_data["leptogastrinae:tid:2688"]
        expect(taxon.current_name).to eq "Leptogaster fornicata Martin, 1957"
        expect(taxon.current_name_canonical).to eq "Leptogaster fornicata"
      end
    end

    context "coreid is empty" do
      let(:file_name) { "empty_coreid.tar.gz" }

      it "should ingest information" do
        normalized = normalizer.normalize
        expect(normalizer.darwin_core.file_name).to eq "empty_coreid.tar.gz"
        expect(normalized["Taxon9"].current_name).to eq "Amanita phalloides"
      end
    end

    context "vernacular locality info" do
      let(:file_name) { "language_locality.tar.gz" }

      it "should ingest locality and language" do
        normalized = normalizer.normalize
        vernacular = normalized["leptogastrinae:tid:42"].vernacular_names[0]
        expect(vernacular.language).to eq "en"
        expect(vernacular.locality).to eq "New England"
      end
    end
  end

  describe "#name_strings" do
    let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

    context "before running #normalize" do
      it "is empty" do
        expect(normalizer.name_strings).to be_empty
      end
    end

    context "after running #normalize" do
      # The normalizer itself, returned after #normalize has been run on it.
      let(:normalized) do
        normalizer.normalize
        normalizer
      end

      context "default attibutes" do
        it "returns array" do
          names = normalized.name_strings
          expect(names).to be_kind_of Array
          expect(normalized.name_strings.size).to be > 1
        end
      end

      context "with_hash attribute" do
        it "returns hash" do
          names = normalized.name_strings(with_hash: true)
          expect(names).to be_kind_of Hash
          expect(names.size).to be > 1
          expect(names.values.uniq).to eq [1]
        end
      end
    end
  end

  describe "#vernacular_name_strings" do
    let(:file_path) { File.join(file_dir, "flat_list.tar.gz") }

    context "before running #normalize" do
      subject(:vern) { normalizer.vernacular_name_strings }

      it "is empty" do
        expect(vern).to be_empty
      end
    end

    context "after running #normalize" do
      # The normalizer itself, returned after #normalize has been run on it.
      let(:normalized) do
        normalizer.normalize
        normalizer
      end

      subject(:vern) { normalized.vernacular_name_strings }
      subject(:vern_w_hash) do
        normalized.vernacular_name_strings(with_hash: true)
      end

      context "default attibutes" do
        it "returns array" do
          expect(vern).to be_kind_of Array
          expect(vern.size).to be > 0
        end
      end

      context "with_hash attribute" do
        it "returns hash" do
          expect(vern_w_hash).to be_kind_of Hash
          expect(vern_w_hash.size).to be > 0
          expect(vern_w_hash.values.uniq).to eq [1]
        end
      end
    end
  end
end