dwc-archive 0.9.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
@@ -0,0 +1,100 @@
# frozen_string_literal: true

require_relative "../spec_helper"

# Specs for DarwinCore::Core, run against archives found in ../files
# (relative to this spec's directory).
describe DarwinCore::Core do
  # `core` is the object under test. The archive is only a collaborator, so
  # it is declared with `let` instead of a second `subject` that would be
  # shadowed by this one.
  subject(:core) { DarwinCore::Core.new(dwca) }

  let(:dwca) { DarwinCore.new(file_path) }
  let(:file_path) do
    File.join(File.expand_path("../files", __dir__), file_name)
  end
  # Default archive; nested contexts override this to switch fixtures.
  let(:file_name) { "data.tar.gz" }

  describe ".new" do
    it "creates new core" do
      expect(core).to be_kind_of DarwinCore::Core
    end
  end

  describe "#id" do
    it "returns core id" do
      expect(core.id[:index]).to eq 0
      expect(core.id[:term]).to eq "http://rs.tdwg.org/dwc/terms/TaxonID"
    end

    context "no coreid" do
      let(:file_name) { "empty_coreid.tar.gz" }

      it "does not return coreid" do
        expect(core.id[:index]).to eq 0
        expect(core.id[:term]).to be_nil
      end
    end
  end

  describe "#data" do
    it "gets core data" do
      expect(core.data).to be_kind_of Hash
    end
  end

  describe "#properties" do
    it "gets core properties" do
      expect(core.properties).to be_kind_of Hash
      expect(core.properties.keys).to match_array %i[
        encoding fieldsTerminatedBy linesTerminatedBy fieldsEnclosedBy
        ignoreHeaderLines rowType
      ]
    end
  end

  describe "#encoding" do
    it "returns encoding of the data" do
      expect(core.encoding).to eq "UTF-8"
    end
  end

  describe "#fields_separator" do
    it "returns separator of fields for csv files" do
      expect(core.fields_separator).to be_nil
    end
  end

  describe "#size" do
    it "returns number of lines in the core" do
      expect(core.size).to eq 588
    end
  end

  describe "#file_path" do
    it "returns file path of core file" do
      expect(core.file_path).to match "DarwinCore.txt"
    end
  end

  describe "#fields" do
    it "returns fields of the core file" do
      expect(core.fields).to be_kind_of Array
      expect(core.fields.size).to eq 7
      expect(core.fields[0]).to be_kind_of Hash
    end
  end

  describe "#line_separator" do
    it "returns characters separating lines in csv file" do
      # The expected value is the two characters backslash + "n",
      # not a newline.
      expect(core.line_separator).to eq "\\n"
    end
  end

  describe "#quote_character" do
    it "returns quote character for the csv file" do
      expect(core.quote_character).to eq ""
    end
  end

  describe "#ignore_headers" do
    it "returns true if headers should not be included into data" do
      expect(core.ignore_headers).to be true
    end
  end
end
@@ -0,0 +1,249 @@
# frozen_string_literal: true

# Specs for the top-level DarwinCore class: class-level helpers, archive
# construction and error handling, accessors, and classification
# normalization.
describe DarwinCore do
  subject { DarwinCore }

  let(:file_dir) { File.expand_path("../files", __dir__) }
  # Default archive used by most groups; groups that need another file
  # override `file_path`.
  let(:file_path) { File.join(file_dir, "data.tar.gz") }
  # Built lazily, so examples that expect a constructor error can wrap the
  # call in `expect { dwca }`.
  let(:dwca) { DarwinCore.new(file_path) }

  it "has version" do
    expect(DarwinCore::VERSION).to match(/\A\d+\.\d+\.\d+/)
  end

  describe ".nil_field?" do
    it "is true for nil fields" do
      [nil, "/N", ""].each do |i|
        expect(subject.nil_field?(i)).to be true
      end
    end

    it "is false for non-nil fields" do
      [0, "0", "123", 123, "dsdfs434343/N"].each do |i|
        expect(subject.nil_field?(i)).to be false
      end
    end
  end

  describe ".clean_all" do
    let(:tmp_dir) { DarwinCore::DEFAULT_TMP_DIR }

    # Names of the `dwc_<digits>` entries currently present in tmp_dir.
    # A plain method (not `let`) so it is re-evaluated on every call.
    def dwca_dirs
      Dir.entries(tmp_dir).grep(/\Adwc_\d+\z/)
    end

    it "cleans dwca directories" do
      # Block form restores the previous working directory afterwards, so
      # the directory change does not leak into later examples.
      Dir.chdir(tmp_dir) do
        FileUtils.mkdir("dwc_123") unless File.exist?("dwc_123")
        expect(dwca_dirs).not_to be_empty
        subject.clean_all
        expect(dwca_dirs).to be_empty
      end
    end

    context "no dwc files exist" do
      it "does nothing" do
        # The second call runs with nothing left to remove.
        subject.clean_all
        subject.clean_all
        expect(dwca_dirs).to be_empty
      end
    end
  end

  describe ".logger" do
    it { expect(subject.logger).to be_kind_of Logger }
  end

  describe ".logger=" do
    # The logger is set on the class itself; restore it so examples that
    # run afterwards still see a real Logger regardless of ordering.
    after { subject.logger_reset }

    it "sets logger" do
      expect(subject.logger = "fake logger").to eq "fake logger"
      expect(subject.logger).to eq "fake logger"
    end
  end

  describe ".logger_reset" do
    it "resets logger" do
      subject.logger = "fake logger"
      expect(subject.logger).to eq "fake logger"
      subject.logger_reset
      expect(subject.logger).to be_kind_of Logger
    end
  end

  describe ".new" do
    context "tar.gz and zip files supplied" do
      %w[data.zip data.tar.gz minimal.tar.gz junk_dir_inside.zip].each do |file|
        # One group per archive: several `let(:file_path)` definitions in a
        # single group overwrite each other, which would make every example
        # run against the last file only.
        context "with #{file}" do
          let(:file_path) { File.join(file_dir, file) }

          it "creates archive from #{file}" do
            expect(dwca.archive.valid?).to be true
          end
        end
      end
    end

    context "when file does not exist" do
      let(:file_path) { File.join(file_dir, "no_file.gz") }

      it "raises not found" do
        expect { dwca }.to raise_error DarwinCore::FileNotFoundError
      end
    end

    context "archive cannot unpack" do
      let(:file_path) { File.join(file_dir, "broken.tar.gz") }

      it "raises unpacking error" do
        expect { dwca }.to raise_error DarwinCore::UnpackingError
      end
    end

    context "archive is broken" do
      let(:file_path) { File.join(file_dir, "invalid.tar.gz") }

      it "raises error of invalid archive" do
        expect { dwca }.to raise_error DarwinCore::InvalidArchiveError
      end
    end

    context "archive is not in utf-8 encoding" do
      let(:file_path) { File.join(file_dir, "latin1.tar.gz") }

      it "raises wrong encoding error" do
        expect { dwca }.to raise_error DarwinCore::EncodingError
      end
    end

    context "filename with spaces and non-alphanumeric chars" do
      let(:file_path) { File.join(file_dir, "file with characters(3).tar.gz") }

      it "creates archive" do
        expect(dwca.archive.valid?).to be true
      end
    end
  end

  describe "#file_name" do
    it "returns file name" do
      expect(dwca.file_name).to eq "data.tar.gz"
    end
  end

  describe "#path" do
    it "returns path of the archive" do
      expect(dwca.path).to match(/spec.files.data\.tar\.gz/)
    end
  end

  describe "#archive" do
    it "returns archive" do
      expect(dwca.archive).to be_kind_of DarwinCore::Archive
    end
  end

  describe "#core" do
    it "returns core" do
      expect(dwca.core).to be_kind_of DarwinCore::Core
    end
  end

  describe "#metadata" do
    it "returns eml" do
      expect(dwca.eml).to be_kind_of DarwinCore::Metadata
      expect(dwca.metadata).to be_kind_of DarwinCore::Metadata
    end
  end

  describe "#extensions" do
    it "returns extensions" do
      extensions = dwca.extensions
      expect(extensions).to be_kind_of Array
      expect(extensions[0]).to be_kind_of DarwinCore::Extension
    end
  end

  describe "#checksum" do
    it "creates checksum hash" do
      expect(dwca.checksum).to eq "7d94fc28ffaf434b66fbc790aa5ef00d834057bf"
    end
  end

  describe "#parent_id?" do
    context "has classification" do
      it "returns true" do
        expect(dwca.parent_id?).to be true
      end
    end

    context "does not have classification" do
      let(:file_path) { File.join(file_dir, "gnub.tar.gz") }

      it "returns false" do
        expect(dwca.parent_id?).to be false
      end
    end
  end

  describe "#classification_normalizer" do
    context "not initialized" do
      it "is nil" do
        expect(dwca.classification_normalizer).to be_nil
      end
    end

    context "initialized" do
      it "is DarwinCore::ClassificationNormalizer" do
        dwca.normalize_classification
        expect(dwca.classification_normalizer).
          to be_kind_of DarwinCore::ClassificationNormalizer
      end
    end
  end

  describe "#normalize_classification" do
    let(:normalized) { dwca.normalize_classification }
    # Distinct encodings seen across every classification path element of
    # every normalized taxon.
    let(:encodings) do
      normalized.values.flat_map do |taxon|
        taxon.classification_path.map(&:encoding)
      end.uniq
    end

    it "returns hash" do
      expect(normalized).to be_kind_of Hash
    end

    it "uses utf-8 encoding for classification paths" do
      expect(encodings.map(&:to_s)).to eq ["UTF-8"]
    end

    it "has elements of DarwinCore::TaxonNormalized type" do
      expect(normalized["leptogastrinae:tid:2857"]).
        to be_kind_of DarwinCore::TaxonNormalized
    end
  end
end
@@ -0,0 +1,22 @@
# frozen_string_literal: true

# Specs for DarwinCore::Generator::EmlXml built from the EML_DATA fixture.
describe DarwinCore::Generator::EmlXml do
  subject(:eml_xml) { described_class.new(data, path) }

  let(:data) { EML_DATA }
  let(:path) { DarwinCore::DEFAULT_TMP_DIR }

  describe ".new" do
    it { is_expected.to be_kind_of described_class }
  end

  describe "#create" do
    let(:eml_file) { File.join(path, "eml.xml") }
    # Run the generator first, then read back the file the example inspects.
    let(:content) do
      eml_xml.create
      File.read(eml_file)
    end

    # `path` is the shared default tmp dir, so remove the file this example
    # reads instead of leaving it behind for other runs.
    after { File.delete(eml_file) if File.exist?(eml_file) }

    it "creates eml xml" do
      expect(content).to match(/Test Classification/)
    end
  end
end
@@ -0,0 +1,22 @@
# frozen_string_literal: true

# Specs for DarwinCore::Generator::MetaXml built from the META_DATA fixture.
describe DarwinCore::Generator::MetaXml do
  subject(:meta_xml) { described_class.new(data, path) }

  let(:data) { META_DATA }
  let(:path) { DarwinCore::DEFAULT_TMP_DIR }

  describe ".new" do
    it { is_expected.to be_kind_of described_class }
  end

  describe "#create" do
    let(:meta_file) { File.join(path, "meta.xml") }
    # Run the generator first, then read back the file the example inspects.
    let(:content) do
      meta_xml.create
      File.read(meta_file)
    end

    # `path` is the shared default tmp dir, so remove the file this example
    # reads instead of leaving it behind for other runs.
    after { File.delete(meta_file) if File.exist?(meta_file) }

    it "creates metadata file" do
      expect(content).to match(%r{<location>core.csv</location>})
    end
  end
end
@@ -0,0 +1,124 @@
# frozen_string_literal: true

# Specs for DarwinCore::Generator: adding core/extension data and metadata
# files, packing the archive, and cleaning up the working directory.
describe DarwinCore::Generator do
  subject(:gen) { DarwinCore::Generator.new(dwc_path, tmp_dir) }

  let(:tmp_dir) { DarwinCore::DEFAULT_TMP_DIR }
  let(:dwc_path) { File.join(tmp_dir, "spec_dwca.tar.gz") }
  # Row type passed for the vernacular names extension in every example.
  let(:vernacular_row_type) { "http://rs.gbif.org/terms/1.0/VernacularName" }

  # Remove the generator's working directory after each example. Guarded
  # because the "#clean" example removes it itself.
  after { gen.clean if File.exist?(gen.path) }

  # Adds the vernacular names extension fixture to the given generator.
  def add_vernacular_extension(gen)
    gen.add_extension(EXTENSION_DATA.dup, "vern.csv", true,
                      vernacular_row_type)
  end

  # Populates the given generator with core, extension, meta.xml and eml.xml.
  def generate_dwca(gen)
    gen.add_core(CORE_DATA.dup, "core.csv", true)
    add_vernacular_extension(gen)
    gen.add_meta_xml
    gen.add_eml_xml(EML_DATA)
  end

  # Contents of a fixture file under ../files, without surrounding whitespace.
  def fixture(name)
    File.read(File.expand_path("../files/#{name}", __dir__)).strip
  end

  describe ".new" do
    it "initializes empty DwCA" do
      expect(gen).to be_kind_of DarwinCore::Generator
    end
  end

  describe "#add_core" do
    it "adds core to DwCA instance" do
      gen.add_core(CORE_DATA.dup, "core.csv", true)
      core = File.read(File.join(gen.path, "core.csv"))
      expect(core).to match(/taxonID,parentNameUsageID,scientificName/)
    end

    context "urls are not given in header" do
      it "raises error" do
        data = CORE_DATA.dup
        # Keep only the last path segment of each header term, so the
        # header no longer contains full URLs.
        data[0] = data[0].map { |f| f.split("/")[-1] }
        expect { gen.add_core(data, "core.csv", true) }.
          to raise_error DarwinCore::GeneratorError
      end
    end
  end

  describe "#add_extension" do
    it "adds extension to DwCA instance" do
      add_vernacular_extension(gen)
      extension = File.read(File.join(gen.path, "vern.csv"))

      expect(extension).to match(/Береза/)
    end
  end

  describe "#add_meta_xml" do
    it "creates metadata for DwCA" do
      gen.add_core(CORE_DATA.dup, "core.csv", true)
      add_vernacular_extension(gen)

      gen.add_meta_xml
      meta = File.read(File.join(gen.path, "meta.xml")).strip
      expect(meta).to eq fixture("generator_meta.xml")
    end
  end

  describe "#add_eml_xml" do
    it "adds eml data" do
      gen.add_eml_xml(EML_DATA)
      # pubDate and packageId differ between runs; pin them to the values
      # stored in the fixture before comparing.
      eml = File.read(File.join(gen.path, "eml.xml")).
            gsub(%r{<pubDate>.*?</pubDate>},
                 "<pubDate>2013-12-30 14:45:33 -0500</pubDate>").
            gsub(/packageId=".*?"/, 'packageId="1234/2013-12-30::19:45:33"').
            strip

      expect(eml).to eq fixture("generator_eml.xml")
    end
  end

  describe "#path" do
    it "returns temporary path for assembling DwCA" do
      expect(gen.path).to match(/dwc_\d+$/)
    end
  end

  describe "#files" do
    before { generate_dwca(gen) }

    it "returns created files" do
      expect(gen.files).
        to match_array ["core.csv", "eml.xml", "meta.xml", "vern.csv"]
    end
  end

  describe "#pack" do
    before do
      # Start from a clean slate so the existence check below proves that
      # this example produced the file.
      FileUtils.rm dwc_path if File.exist?(dwc_path)
      generate_dwca(gen)
    end

    it "creates final DwCA file" do
      gen.pack
      expect(File.exist?(dwc_path)).to be true
    end
  end

  describe "#clean" do
    before { gen.add_eml_xml(EML_DATA) }

    it "removes temporary directory for DwCA" do
      expect(File.exist?(gen.path)).to be true
      gen.clean
      expect(File.exist?(gen.path)).to be false
    end
  end

  describe "#eml_xml_data" do
    it "returns current eml data" do
      expect(gen.eml_xml_data).to be_kind_of Hash
    end
  end
end