dwc-archive 0.9.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/{file with characters(3).gz → file with characters(3).tar.gz} +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
@@ -1,10 +0,0 @@
1
- UTF8RGX = /\A(
2
- [\x09\x0A\x0D\x20-\x7E] # ASCII
3
- | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
4
- | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
5
- | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
6
- | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
7
- | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
8
- | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
9
- | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
10
- )*\z/x
@@ -1,64 +0,0 @@
1
- # USAGE: Hash.from_xml:(YOUR_XML_STRING)
2
- require 'nokogiri'
3
- # modified from http://stackoverflow.com/questions/1230741/convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
4
- class DarwinCore
5
- module XmlReader
6
- class << self
7
- def from_xml(xml_io)
8
- begin
9
- result = Nokogiri::XML(xml_io)
10
- return { result.root.name.to_sym => xml_node_to_hash(result.root)}
11
- rescue Exception => e
12
- raise e
13
- end
14
- end
15
-
16
- private
17
-
18
- def xml_node_to_hash(node)
19
- # If we are at the root of the document, start the hash
20
- if node.element?
21
- result_hash = {}
22
- if node.attributes != {}
23
- result_hash[:attributes] = {}
24
- node.attributes.keys.each do |key|
25
- result_hash[:attributes][node.attributes[key].name.to_sym] = prepare(node.attributes[key].value)
26
- end
27
- end
28
- if node.children.size > 0
29
- node.children.each do |child|
30
- result = xml_node_to_hash(child)
31
-
32
- if child.name == "text"
33
- unless child.next_sibling || child.previous_sibling
34
- return prepare(result)
35
- end
36
- elsif result_hash[child.name.to_sym]
37
- if result_hash[child.name.to_sym].is_a?(Object::Array)
38
- result_hash[child.name.to_sym] << prepare(result)
39
- else
40
- result_hash[child.name.to_sym] = [result_hash[child.name.to_sym]] << prepare(result)
41
- end
42
- else
43
- result_hash[child.name.to_sym] = prepare(result)
44
- end
45
- end
46
-
47
- return result_hash
48
- else
49
- return result_hash
50
- end
51
- else
52
- return prepare(node.content.to_s)
53
- end
54
- end
55
-
56
- def prepare(data)
57
- return data if data.class != String
58
- return true if data.strip == "true"
59
- return false if data.strip == "false"
60
- data.to_i.to_s == data ? data.to_i : data
61
- end
62
- end
63
- end
64
- end
@@ -1,250 +0,0 @@
1
- # encoding: utf-8
2
- require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
3
-
4
- describe DarwinCore do
5
- before(:all) do
6
- @file_dir = File.join(File.dirname(__FILE__), '..', 'files')
7
- end
8
-
9
- describe "VERSION" do
10
- it "should return VERSION number" do
11
- DarwinCore::VERSION.split('.').join('').to_i.should > 41
12
- end
13
- end
14
-
15
- describe "::nil_field?" do
16
- it "should return true for entries which normally mean nil" do
17
- [nil, '/N', ''].each do |i|
18
- DarwinCore.nil_field?(i).should be_true
19
- end
20
- end
21
-
22
- it "should return false for fields that are not nil" do
23
- [0, '0', '123', 123, 'dsdfs434343/N'].each do |i|
24
- DarwinCore.nil_field?(i).should be_false
25
- end
26
- end
27
- end
28
-
29
- describe ".new" do
30
- it "should create DarwinCore instance out of archive file" do
31
- ['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
32
- file = File.join(@file_dir, file)
33
- dwc = DarwinCore.new(file)
34
- dwc.archive.valid?.should be_true
35
- end
36
- end
37
-
38
- it "should raise an error if archive file does not exist" do
39
- file = 'not_a_file'
40
- lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::FileNotFoundError)
41
- end
42
-
43
- it "should raise an error if archive is broken" do
44
- file = File.join(@file_dir, 'broken.tar.gz')
45
- lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::UnpackingError)
46
- end
47
-
48
- it "should raise an error if archive is invalid" do
49
- file = File.join(@file_dir, 'invalid.tar.gz')
50
- lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::InvalidArchiveError)
51
- end
52
-
53
- it "should raise an error if archive is not in utf-8" do
54
- file = File.join(@file_dir, 'latin1.tar.gz')
55
- lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::EncodingError)
56
- end
57
-
58
- it "should work with files that have non-alfanumeric characters and spaces" do
59
- file = File.join(@file_dir, 'file with characters(3).gz')
60
- dwc = DarwinCore.new(file)
61
- dwc.archive.valid?.should be_true
62
- end
63
- end
64
-
65
- describe ".normalize_classification" do
66
- it "should return flat list if file has no parent id information" do
67
- file = File.join(@file_dir, 'flat_list.tar.gz')
68
- dwc = DarwinCore.new(file)
69
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
70
- cn.normalize
71
- cn.normalized_data.should_not be_nil
72
- cn.normalized_data.size.should > 0
73
- end
74
-
75
- it "should return array or hash of name_strings back" do
76
- file = File.join(@file_dir, 'data.tar.gz')
77
- dwc = DarwinCore.new(file)
78
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
79
- cn.normalize
80
- name_strings = cn.name_strings
81
- name_strings.is_a?(Array).should be_true
82
- name_strings.size.should > 1
83
- name_strings = cn.name_strings(with_hash: true)
84
- name_strings.size.should > 1
85
- name_strings.is_a?(Hash).should be_true
86
- name_strings.is_a?(Hash).should be_true
87
- name_strings.values.uniq.should == [1]
88
- vernacular_name_strings = cn.vernacular_name_strings
89
- vernacular_name_strings.is_a?(Array).should be_true
90
- vernacular_name_strings.size.should > 0
91
- vernacular_name_strings = cn.vernacular_name_strings(with_hash: true)
92
- vernacular_name_strings.size.should > 0
93
- vernacular_name_strings.is_a?(Hash).should be_true
94
- vernacular_name_strings.values.uniq.should == [1]
95
- end
96
-
97
- it "should traverse DarwinCore files and assemble data for every node in memory" do
98
- file = File.join(@file_dir, 'data.tar.gz')
99
- dwc = DarwinCore.new(file)
100
- norm = dwc.normalize_classification
101
- norm.class.should == Hash
102
- path_encodings = []
103
- norm.each do |taxon_id, taxon|
104
- taxon.classification_path.each {|p| path_encodings << p.encoding}
105
- end
106
- path_encodings.uniq!
107
- path_encodings.size.should == 1
108
- path_encodings[0].to_s.should == "UTF-8"
109
- norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
110
- norm['leptogastrinae:tid:2857'].source.should == 'http://leptogastrinae.lifedesks.org/pages/2857'
111
- end
112
-
113
- it "should assemble synonyms from core" do
114
- file = File.join(@file_dir, 'data.tar.gz')
115
- dwc = DarwinCore.new(file)
116
- norm = dwc.normalize_classification
117
- syn = norm.values.select {|n| n.synonyms.size > 0}[0].synonyms[0]
118
- syn.id.should == 'leptogastrinae:tid:127'
119
- syn.name.should == "Leptogastridae"
120
- syn.source.should == 'http://leptogastrinae.lifedesks.org/pages/127'
121
- end
122
-
123
- it "should be able to assemble vernacular names from an extension" do
124
- file = File.join(@file_dir, 'data.tar.gz')
125
- dwc = DarwinCore.new(file)
126
- norm = dwc.normalize_classification
127
- norm.select { |k,v| !v.vernacular_names.empty? }.map { |k,v| v.vernacular_names }.size.should > 0
128
- end
129
-
130
- it "should be able to assemble synonyms from extension" do
131
- file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
132
- dwc = DarwinCore.new(file)
133
- norm = dwc.normalize_classification
134
- norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
135
- end
136
-
137
- it "should not assemble synonyms from extension with scientificName, and file name not matching 'synonym'" do
138
- file = File.join(@file_dir, 'not_synonym_in_extension.tar.gz')
139
- dwc = DarwinCore.new(file)
140
- norm = dwc.normalize_classification
141
- norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should == 0
142
- end
143
-
144
- it "should not attempt to assemble extensions with with_extensions opts set to false" do
145
- file = File.join(@file_dir, 'data.tar.gz')
146
- dwc = DarwinCore.new(file)
147
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
148
- norm = cn.normalize(:with_extensions => false)
149
- norm.select { |k,v| !v.vernacular_names.empty? }.size.should == 0
150
- norm = cn.normalize()
151
- norm.select { |k,v| !v.vernacular_names.empty? }.size.should > 0
152
- file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
153
- dwc = DarwinCore.new(file)
154
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
155
- norm = cn.normalize(:with_extensions => false)
156
- norm.select { |k,v| !v.synonyms.empty? }.size.should == 0
157
- norm = cn.normalize()
158
- norm.select { |k,v| !v.synonyms.empty? }.size.should > 0
159
- end
160
-
161
- it "should assemble linnean classification if terms for it exists" do
162
- file = File.join(@file_dir, 'linnean.tar.gz')
163
- dwc = DarwinCore.new(file)
164
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
165
- norm = cn.normalize
166
- cn.normalized_data.first.last.linnean_classification_path.should == [["Animalia", :kingdom], ["Arthropoda", :phylum], ["Insecta", :class], ["Diptera", :order], ["Cecidomyiidae", :family], ["Resseliella", :genus]]
167
- end
168
-
169
- it "should keep linnean classification empty if terms are not there" do
170
- file = File.join(@file_dir, 'data.tar.gz')
171
- dwc = DarwinCore.new(file)
172
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
173
- norm = cn.normalize
174
- cn.normalized_data.first.last.linnean_classification_path.should == []
175
- end
176
-
177
- it "should be able to assemble synonyms from core" do
178
- file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
179
- dwc = DarwinCore.new(file)
180
- norm = dwc.normalize_classification
181
- norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
182
- end
183
-
184
- it "should be able to assemble synonyms from extension" do
185
- file = File.join(@file_dir, 'data.tar.gz')
186
- dwc = DarwinCore.new(file)
187
- norm = dwc.normalize_classification
188
- nodes_with_syn = norm.select { |k,v| !v.synonyms.empty? }
189
- nodes_with_syn.map { |k,v| v.synonyms }.size.should > 0
190
- nodes_with_syn.first[1].synonyms.first.status.should == 'synonym'
191
- end
192
-
193
- it "should be able work with files which have scientificNameAuthorship" do
194
- file = File.join(@file_dir, 'sci_name_authorship.tar.gz')
195
- dwc = DarwinCore.new(file)
196
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
197
- norm = cn.normalize
198
- path_encodings = norm.map {|taxon_id, taxon| taxon.classification_path}.flatten.map { |name| name.encoding.to_s }.uniq
199
- path_encodings.size.should == 1
200
- path_encodings[0].should == "UTF-8"
201
- taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
202
- taxa.size.should == 507
203
- syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size > s.canonical_name.split(" ").size}
204
- syn.size.should == 50
205
- end
206
-
207
- it "should be able work with files which repeat scientificNameAuthorship value in scientificName field" do
208
- file = File.join(@file_dir, 'sci_name_authorship_dup.tar.gz')
209
- dwc = DarwinCore.new(file)
210
- norm = dwc.normalize_classification
211
- taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
212
- taxa.size.should == 507
213
- syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size > s.canonical_name.split(" ").size}
214
- syn.size.should == 50
215
- end
216
-
217
- it "should be able open files where coreid is empty" do
218
- file = File.join(@file_dir, 'empty_coreid.tar.gz')
219
- dwc = DarwinCore.new(file)
220
- norm = dwc.normalize_classification
221
- taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
222
- taxa.size.should == 2
223
- end
224
-
225
- it "should be able to get language and locality fields for vernacular names" do
226
- file = File.join(@file_dir, 'language_locality.tar.gz')
227
- dwc = DarwinCore.new(file)
228
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
229
- cn.normalize
230
- vn = cn.normalized_data['leptogastrinae:tid:42'].vernacular_names.first
231
- vn.language.should == 'en'
232
- vn.locality.should == 'New England'
233
- end
234
-
235
- it 'should be able to get uuids from gnub dataset' do
236
- file = File.join(@file_dir, 'gnub.tar.gz')
237
- dwc = DarwinCore.new(file)
238
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
239
- cn.normalize
240
- vn = cn.normalized_data['9c399f90-cfb8-5a7f-9a21-18285a473488']
241
- vn.class.should == DarwinCore::GnubTaxon
242
- vn.uuid.should == '8faa91f6-663f-4cfe-b785-0ab4e9415a51'
243
- vn.uuid_path.should == [
244
- "9a9f9eeb-d5f9-4ff6-b6cb-a5ad345e33c3",
245
- "bf4c91c0-3d1f-44c7-9d3b-249382182a26",
246
- "8faa91f6-663f-4cfe-b785-0ab4e9415a51"]
247
- end
248
- end
249
-
250
- end
@@ -1 +0,0 @@
1
- --color