dwc-archive 0.9.5 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +5 -4
  7. data/CHANGELOG +17 -5
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +143 -111
  11. data/Rakefile +13 -49
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +394 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/files/gnub.tar.gz +0 -0
  36. data/spec/lib/classification_normalizer_spec.rb +214 -0
  37. data/spec/lib/core_spec.rb +100 -0
  38. data/spec/lib/darwin_core_spec.rb +249 -0
  39. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  40. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  41. data/spec/lib/generator_spec.rb +124 -0
  42. data/spec/lib/gnub_taxon_spec.rb +32 -0
  43. data/spec/lib/metadata_spec.rb +89 -0
  44. data/spec/lib/taxon_normalized_spec.rb +142 -0
  45. data/spec/lib/xml_reader_spec.rb +11 -11
  46. data/spec/spec_helper.rb +78 -7
  47. metadata +181 -92
  48. data/.rvmrc +0 -1
  49. data/Gemfile.lock +0 -155
  50. data/VERSION +0 -1
  51. data/lib/dwc-archive.rb +0 -95
  52. data/lib/dwc-archive/.expander.rb.swo +0 -0
  53. data/lib/dwc-archive/archive.rb +0 -37
  54. data/lib/dwc-archive/classification_normalizer.rb +0 -332
  55. data/lib/dwc-archive/core.rb +0 -17
  56. data/lib/dwc-archive/expander.rb +0 -80
  57. data/lib/dwc-archive/generator.rb +0 -75
  58. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  59. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  60. data/lib/dwc-archive/ingester.rb +0 -101
  61. data/lib/dwc-archive/metadata.rb +0 -42
  62. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  63. data/lib/dwc-archive/xml_reader.rb +0 -64
  64. data/spec/lib/dwc-archive_spec.rb +0 -236
  65. data/spec/spec.opts +0 -1
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../spec_helper"
4
+
5
+ describe DarwinCore::Core do
6
+ let(:dwca) { DarwinCore.new(file_path) }
7
+ subject(:core) { DarwinCore::Core.new(dwca) }
8
+ let(:file_path) do
9
+ File.join(File.expand_path("../files", __dir__), file_name)
10
+ end
11
+ let(:file_name) { "data.tar.gz" }
12
+
13
+ describe ".new" do
14
+ it "creates new core" do
15
+ expect(core).to be_kind_of DarwinCore::Core
16
+ end
17
+ end
18
+
19
+ describe "#id" do
20
+ it "returns core id" do
21
+ expect(core.id[:index]).to eq 0
22
+ expect(core.id[:term]).to eq "http://rs.tdwg.org/dwc/terms/TaxonID"
23
+ end
24
+
25
+ context "no coreid" do
26
+ let(:file_name) { "empty_coreid.tar.gz" }
27
+
28
+ it "does not return coreid" do
29
+ expect(core.id[:index]).to eq 0
30
+ expect(core.id[:term]).to be_nil
31
+ end
32
+ end
33
+ end
34
+
35
+ describe "#data" do
36
+ it "gers core data" do
37
+ expect(core.data).to be_kind_of Hash
38
+ end
39
+ end
40
+
41
+ describe "#properties" do
42
+ it "gers core properties" do
43
+ expect(core.properties).to be_kind_of Hash
44
+ expect(core.properties.keys).to match_array %i[
45
+ encoding fieldsTerminatedBy linesTerminatedBy fieldsEnclosedBy
46
+ ignoreHeaderLines rowType
47
+ ]
48
+ end
49
+ end
50
+
51
+ describe "#encoding" do
52
+ it "returns encoding of the data" do
53
+ expect(core.encoding).to eq "UTF-8"
54
+ end
55
+ end
56
+
57
+ describe "#fields_separator" do
58
+ it "returns separator of fields for csv files" do
59
+ expect(core.fields_separator).to be_nil
60
+ end
61
+ end
62
+
63
+ describe "#size" do
64
+ it "returns number of lines in the core" do
65
+ expect(core.size).to eq 588
66
+ end
67
+ end
68
+
69
+ describe "#file_path" do
70
+ it "returns file path of core file" do
71
+ expect(core.file_path).to match "DarwinCore.txt"
72
+ end
73
+ end
74
+
75
+ describe "#fields" do
76
+ it "returns fields of the core file" do
77
+ expect(core.fields.size).to eq 7
78
+ expect(core.fields).to be_kind_of Array
79
+ expect(core.fields[0]).to be_kind_of Hash
80
+ end
81
+ end
82
+
83
+ describe "#line_separator" do
84
+ it "returns characters separating lines in csv file" do
85
+ expect(core.line_separator).to eq "\\n"
86
+ end
87
+ end
88
+
89
+ describe "#quote_character" do
90
+ it "returns quote character for the csv file" do
91
+ expect(core.quote_character).to eq ""
92
+ end
93
+ end
94
+
95
+ describe "#ignore headers" do
96
+ it "returns true if headers should not be included into data" do
97
+ expect(core.ignore_headers).to eq true
98
+ end
99
+ end
100
+ end
@@ -0,0 +1,249 @@
1
+ # frozen_string_literal: true
2
+
3
+ describe DarwinCore do
4
+ subject { DarwinCore }
5
+ let(:file_dir) { File.expand_path("../files", __dir__) }
6
+
7
+ it "has version" do
8
+ expect(DarwinCore::VERSION).to match(/\d+\.\d+\.\d/)
9
+ end
10
+
11
+ describe ".nil_field?" do
12
+ it "is true for nil fields" do
13
+ [nil, "/N", ""].each do |i|
14
+ expect(DarwinCore.nil_field?(i)).to be true
15
+ end
16
+ end
17
+
18
+ it "is false for non-nil fields" do
19
+ [0, "0", "123", 123, "dsdfs434343/N"].each do |i|
20
+ expect(subject.nil_field?(i)).to be false
21
+ end
22
+ end
23
+ end
24
+
25
+ describe ".clean_all" do
26
+ let(:tmp_dir) { DarwinCore::DEFAULT_TMP_DIR }
27
+
28
+ it "cleans dwca directories" do
29
+ Dir.chdir(tmp_dir)
30
+ FileUtils.mkdir("dwc_123") unless File.exist?("dwc_123")
31
+ dwca_dirs = Dir.entries(tmp_dir).select { |d| d.match(/^dwc_[\d]+$/) }
32
+ expect(dwca_dirs.size).to be > 0
33
+ subject.clean_all
34
+ dwca_dirs = Dir.entries(tmp_dir).select { |d| d.match(/^dwc_[\d]+$/) }
35
+ expect(dwca_dirs.size).to be 0
36
+ end
37
+
38
+ context "no dwc files exist" do
39
+ it "does nothing" do
40
+ subject.clean_all
41
+ subject.clean_all
42
+ dwca_dirs = Dir.entries(tmp_dir).select { |d| d.match(/^dwc_[\d]+$/) }
43
+ expect(dwca_dirs.size).to be 0
44
+ end
45
+ end
46
+ end
47
+
48
+ describe ".logger" do
49
+ it { expect(subject.logger).to be_kind_of Logger }
50
+ end
51
+
52
+ describe ".logger=" do
53
+ it "sets logger" do
54
+ expect(subject.logger = "fake logger").to eq "fake logger"
55
+ expect(subject.logger).to eq "fake logger"
56
+ end
57
+ end
58
+
59
+ describe ".logger_reset" do
60
+ it "resets logger" do
61
+ subject.logger = "fake logger"
62
+ expect(subject.logger).to eq "fake logger"
63
+ subject.logger_reset
64
+ expect(subject.logger).to be_kind_of Logger
65
+ end
66
+ end
67
+
68
+ describe ".new" do
69
+ subject(:dwca) { DarwinCore.new(file_path) }
70
+
71
+ context "tar.gz and zip files supplied" do
72
+ files = %w[data.zip data.tar.gz minimal.tar.gz junk_dir_inside.zip]
73
+ files.each do |file|
74
+ let(:file_path) { File.join(file_dir, file) }
75
+
76
+ it "creates archive from #{file}" do
77
+ expect(dwca.archive.valid?).to be true
78
+ end
79
+ end
80
+ end
81
+
82
+ context "when file does not exist" do
83
+ let(:file_path) { File.join(file_dir, "no_file.gz") }
84
+
85
+ it "raises not found" do
86
+ expect { dwca }.to raise_error DarwinCore::FileNotFoundError
87
+ end
88
+ end
89
+
90
+ context "archive cannot unpack" do
91
+ let(:file_path) { File.join(file_dir, "broken.tar.gz") }
92
+
93
+ it "raises unpacking error" do
94
+ expect { dwca }.to raise_error DarwinCore::UnpackingError
95
+ end
96
+ end
97
+
98
+ context "archive is broken" do
99
+ let(:file_path) { File.join(file_dir, "invalid.tar.gz") }
100
+
101
+ it "raises error of invalid archive" do
102
+ expect { dwca }.to raise_error DarwinCore::InvalidArchiveError
103
+ end
104
+ end
105
+
106
+ context "archive is not in utf-8 encoding" do
107
+ let(:file_path) { File.join(file_dir, "latin1.tar.gz") }
108
+
109
+ it "raises wrong encoding error" do
110
+ expect { dwca }.to raise_error DarwinCore::EncodingError
111
+ end
112
+ end
113
+
114
+ context "filename with spaces and non-alphanumeric chars" do
115
+ let(:file_path) { File.join(file_dir, "file with characters(3).tar.gz") }
116
+
117
+ it "creates archive" do
118
+ expect(dwca.archive.valid?).to be true
119
+ end
120
+ end
121
+ end
122
+
123
+ describe "file_name" do
124
+ subject(:dwca) { DarwinCore.new(file_path) }
125
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
126
+
127
+ it "returns file name" do
128
+ expect(dwca.file_name).to eq "data.tar.gz"
129
+ end
130
+ end
131
+
132
+ describe "path" do
133
+ subject(:dwca) { DarwinCore.new(file_path) }
134
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
135
+
136
+ it "returns path of the archive" do
137
+ expect(dwca.path).to match(/spec.files.data\.tar\.gz/)
138
+ end
139
+ end
140
+
141
+ describe "#archive" do
142
+ subject(:dwca) { DarwinCore.new(file_path) }
143
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
144
+
145
+ it "returns archive" do
146
+ expect(dwca.archive).to be_kind_of DarwinCore::Archive
147
+ end
148
+ end
149
+
150
+ describe "#core" do
151
+ subject(:dwca) { DarwinCore.new(file_path) }
152
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
153
+
154
+ it "returns core" do
155
+ expect(dwca.core).to be_kind_of DarwinCore::Core
156
+ end
157
+ end
158
+
159
+ describe "#metadata" do
160
+ subject(:dwca) { DarwinCore.new(file_path) }
161
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
162
+
163
+ it "returns eml" do
164
+ expect(dwca.eml).to be_kind_of DarwinCore::Metadata
165
+ expect(dwca.metadata).to be_kind_of DarwinCore::Metadata
166
+ end
167
+ end
168
+
169
+ describe "#extensions" do
170
+ subject(:dwca) { DarwinCore.new(file_path) }
171
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
172
+
173
+ it "returns extensions" do
174
+ extensions = dwca.extensions
175
+ expect(extensions).to be_kind_of Array
176
+ expect(extensions[0]).to be_kind_of DarwinCore::Extension
177
+ end
178
+ end
179
+
180
+ describe "#checksum" do
181
+ subject(:dwca) { DarwinCore.new(file_path) }
182
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
183
+
184
+ it "creates checksum hash" do
185
+ expect(dwca.checksum).to eq "7d94fc28ffaf434b66fbc790aa5ef00d834057bf"
186
+ end
187
+ end
188
+
189
+ describe "#parent_id?" do
190
+ subject(:dwca) { DarwinCore.new(file_path) }
191
+
192
+ context "has classification" do
193
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
194
+ it "returns true" do
195
+ expect(dwca.parent_id?).to be true
196
+ end
197
+ end
198
+
199
+ context "does not have classification" do
200
+ let(:file_path) { File.join(file_dir, "gnub.tar.gz") }
201
+ it "returns false" do
202
+ expect(dwca.parent_id?).to be false
203
+ end
204
+ end
205
+ end
206
+
207
+ describe "#classification_normalizer" do
208
+ subject(:dwca) { DarwinCore.new(file_path) }
209
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
210
+
211
+ context "not initialized" do
212
+ it "is nil" do
213
+ expect(dwca.classification_normalizer).to be_nil
214
+ end
215
+ end
216
+
217
+ context "initialized" do
218
+ it "is DarwinCore::ClassificationNormalizer" do
219
+ dwca.normalize_classification
220
+ expect(dwca.classification_normalizer).
221
+ to be_kind_of DarwinCore::ClassificationNormalizer
222
+ end
223
+ end
224
+ end
225
+
226
+ describe "#normalize_classification" do
227
+ subject(:dwca) { DarwinCore.new(file_path) }
228
+ let(:file_path) { File.join(file_dir, "data.tar.gz") }
229
+ let(:normalized) { dwca.normalize_classification }
230
+ let(:encodings) do
231
+ normalized.each_with_object(Set.new) do |taxon, e|
232
+ taxon[1].classification_path.each { |p| e << p.encoding }
233
+ end
234
+ end
235
+
236
+ it "returns hash" do
237
+ expect(normalized).to be_kind_of Hash
238
+ end
239
+
240
+ it "uses utf-8 encoding for classification paths" do
241
+ expect(encodings.map(&:to_s).to_a).to eq ["UTF-8"]
242
+ end
243
+
244
+ it "has elements of DarwinCore::TaxonNormalized type" do
245
+ expect(normalized["leptogastrinae:tid:2857"]).
246
+ to be_kind_of DarwinCore::TaxonNormalized
247
+ end
248
+ end
249
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ describe DarwinCore::Generator::EmlXml do
4
+ subject { DarwinCore::Generator::EmlXml.new(data, path) }
5
+ let(:data) { EML_DATA }
6
+ let(:path) { DarwinCore::DEFAULT_TMP_DIR }
7
+
8
+ describe ".new" do
9
+ it { is_expected.to be_kind_of DarwinCore::Generator::EmlXml }
10
+ end
11
+
12
+ describe "#create" do
13
+ let(:content) do
14
+ subject.create
15
+ File.read(File.join(path, "eml.xml"))
16
+ end
17
+
18
+ it "should create eml xml" do
19
+ expect(content).to match(/Test Classification/)
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
+ describe DarwinCore::Generator::MetaXml do
4
+ subject { DarwinCore::Generator::MetaXml.new(data, path) }
5
+ let(:data) { META_DATA }
6
+ let(:path) { DarwinCore::DEFAULT_TMP_DIR }
7
+
8
+ describe ".new" do
9
+ it { is_expected.to be_kind_of DarwinCore::Generator::MetaXml }
10
+ end
11
+
12
+ describe "#create" do
13
+ let(:content) do
14
+ subject.create
15
+ File.read(File.join(path, "meta.xml"))
16
+ end
17
+
18
+ it "creates metadata file" do
19
+ expect(content).to match(%r{<location>core.csv</location>})
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ describe DarwinCore::Generator do
4
+ subject(:gen) { DarwinCore::Generator.new(dwc_path, tmp_dir) }
5
+ let(:tmp_dir) { DarwinCore::DEFAULT_TMP_DIR }
6
+ let(:dwc_path) { File.join(tmp_dir, "spec_dwca.tar.gz") }
7
+
8
+ def generate_dwca(gen)
9
+ gen.add_core(CORE_DATA.dup, "core.csv", true)
10
+ gen.add_extension(EXTENSION_DATA.dup, "vern.csv", true,
11
+ "http://rs.gbif.org/terms/1.0/VernacularName")
12
+ gen.add_meta_xml
13
+ gen.add_eml_xml(EML_DATA)
14
+ end
15
+
16
+ describe ".new" do
17
+ it "initializes empty DwCA" do
18
+ expect(gen).to be_kind_of DarwinCore::Generator
19
+ end
20
+ end
21
+
22
+ describe "#add_core" do
23
+ it "adds core to DwCA instance" do
24
+ gen.add_core(CORE_DATA.dup, "core.csv", true)
25
+ core = File.read(File.join(gen.path, "core.csv"))
26
+ expect(core).to match(/taxonID,parentNameUsageID,scientificName/)
27
+ end
28
+
29
+ context "urls are not given in header" do
30
+ it "raises error" do
31
+ data = CORE_DATA.dup
32
+ data[0] = data[0].map { |f| f.split("/")[-1] }
33
+ expect { gen.add_core(data, "core.csv", true) }.
34
+ to raise_error DarwinCore::GeneratorError
35
+ end
36
+ end
37
+ end
38
+
39
+ describe "#add_extension" do
40
+ it "adds extension to DwCA instance" do
41
+ gen.add_extension(EXTENSION_DATA.dup,
42
+ "vern.csv",
43
+ true,
44
+ "http://rs.gbif.org/terms/1.0/VernacularName")
45
+ extension = File.read(File.join(gen.path, "vern.csv"))
46
+
47
+ expect(extension).to match(/Береза/)
48
+ end
49
+ end
50
+
51
+ describe "#add_meta_xml" do
52
+ it "creates metadata for DwCA" do
53
+ gen.add_core(CORE_DATA.dup, "core.csv", true)
54
+ gen.add_extension(EXTENSION_DATA.dup,
55
+ "vern.csv",
56
+ true,
57
+ "http://rs.gbif.org/terms/1.0/VernacularName")
58
+
59
+ gen.add_meta_xml
60
+ meta = File.read(File.join(gen.path, "meta.xml")).strip
61
+ meta_from_file = File.read(
62
+ File.expand_path("../files/generator_meta.xml", __dir__)
63
+ ).strip
64
+ expect(meta).to eq meta_from_file
65
+ end
66
+ end
67
+
68
+ describe "#add_eml_data" do
69
+ it "adds eml data" do
70
+ gen.add_eml_xml(EML_DATA)
71
+ eml = File.read(File.join(gen.path, "eml.xml")).strip
72
+ eml.gsub!(%r{(<pubDate>).*?(</pubDate>)}, '\12013-12-30 14:45:33 -0500\2')
73
+ eml.gsub!(/(packageId=").*?"/, '\11234/2013-12-30::19:45:33"')
74
+
75
+ eml_from_file = File.read(
76
+ File.expand_path("../files/generator_eml.xml", __dir__)
77
+ ).strip
78
+ expect(eml.strip).to eq eml_from_file.strip
79
+ end
80
+ end
81
+
82
+ describe "#path" do
83
+ it "returns temporary path for assembling DwCA" do
84
+ expect(gen.path).to match(/dwc_\d+$/)
85
+ end
86
+ end
87
+
88
+ describe "#files" do
89
+ before(:example) { generate_dwca(gen) }
90
+
91
+ it "returns created files" do
92
+ expect(gen.files).
93
+ to match_array ["core.csv", "eml.xml", "meta.xml", "vern.csv"]
94
+ end
95
+ end
96
+
97
+ describe "#pack" do
98
+ before(:example) do
99
+ FileUtils.rm dwc_path if File.exist?(dwc_path)
100
+ generate_dwca(gen)
101
+ end
102
+
103
+ it "creates final DwCA file" do
104
+ gen.pack
105
+ expect(File.exist?(dwc_path)).to be true
106
+ end
107
+ end
108
+
109
+ describe "#clean" do
110
+ before(:example) { gen.add_eml_xml(EML_DATA) }
111
+
112
+ it "removes temporary directory for DwCA" do
113
+ expect(File.exist?(gen.path)).to be true
114
+ gen.clean
115
+ expect(File.exist?(gen.path)).to be false
116
+ end
117
+ end
118
+
119
+ describe "#eml_xml_data" do
120
+ it "returns current eml data" do
121
+ expect(gen.eml_xml_data).to be_kind_of Hash
122
+ end
123
+ end
124
+ end