dwc-archive 0.9.6 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
@@ -1,10 +0,0 @@
1
- UTF8RGX = /\A(
2
- [\x09\x0A\x0D\x20-\x7E] # ASCII
3
- | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
4
- | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
5
- | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
6
- | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
7
- | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
8
- | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
9
- | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
10
- )*\z/x
@@ -1,64 +0,0 @@
1
- # USAGE: Hash.from_xml:(YOUR_XML_STRING)
2
- require 'nokogiri'
3
- # modified from http://stackoverflow.com/questions/1230741/convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
4
- class DarwinCore
5
- module XmlReader
6
- class << self
7
- def from_xml(xml_io)
8
- begin
9
- result = Nokogiri::XML(xml_io)
10
- return { result.root.name.to_sym => xml_node_to_hash(result.root)}
11
- rescue Exception => e
12
- raise e
13
- end
14
- end
15
-
16
- private
17
-
18
- def xml_node_to_hash(node)
19
- # If we are at the root of the document, start the hash
20
- if node.element?
21
- result_hash = {}
22
- if node.attributes != {}
23
- result_hash[:attributes] = {}
24
- node.attributes.keys.each do |key|
25
- result_hash[:attributes][node.attributes[key].name.to_sym] = prepare(node.attributes[key].value)
26
- end
27
- end
28
- if node.children.size > 0
29
- node.children.each do |child|
30
- result = xml_node_to_hash(child)
31
-
32
- if child.name == "text"
33
- unless child.next_sibling || child.previous_sibling
34
- return prepare(result)
35
- end
36
- elsif result_hash[child.name.to_sym]
37
- if result_hash[child.name.to_sym].is_a?(Object::Array)
38
- result_hash[child.name.to_sym] << prepare(result)
39
- else
40
- result_hash[child.name.to_sym] = [result_hash[child.name.to_sym]] << prepare(result)
41
- end
42
- else
43
- result_hash[child.name.to_sym] = prepare(result)
44
- end
45
- end
46
-
47
- return result_hash
48
- else
49
- return result_hash
50
- end
51
- else
52
- return prepare(node.content.to_s)
53
- end
54
- end
55
-
56
- def prepare(data)
57
- return data if data.class != String
58
- return true if data.strip == "true"
59
- return false if data.strip == "false"
60
- data.to_i.to_s == data ? data.to_i : data
61
- end
62
- end
63
- end
64
- end
@@ -1,250 +0,0 @@
1
- # encoding: utf-8
2
- require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
3
-
4
- describe DarwinCore do
5
- before(:all) do
6
- @file_dir = File.join(File.dirname(__FILE__), '..', 'files')
7
- end
8
-
9
- describe "VERSION" do
10
- it "should return VERSION number" do
11
- DarwinCore::VERSION.split('.').join('').to_i.should > 41
12
- end
13
- end
14
-
15
- describe "::nil_field?" do
16
- it "should return true for entries which normally mean nil" do
17
- [nil, '/N', ''].each do |i|
18
- DarwinCore.nil_field?(i).should be_true
19
- end
20
- end
21
-
22
- it "should return false for fields that are not nil" do
23
- [0, '0', '123', 123, 'dsdfs434343/N'].each do |i|
24
- DarwinCore.nil_field?(i).should be_false
25
- end
26
- end
27
- end
28
-
29
- describe ".new" do
30
- it "should create DarwinCore instance out of archive file" do
31
- ['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
32
- file = File.join(@file_dir, file)
33
- dwc = DarwinCore.new(file)
34
- dwc.archive.valid?.should be_true
35
- end
36
- end
37
-
38
- it "should raise an error if archive file does not exist" do
39
- file = 'not_a_file'
40
- lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::FileNotFoundError)
41
- end
42
-
43
- it "should raise an error if archive is broken" do
44
- file = File.join(@file_dir, 'broken.tar.gz')
45
- lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::UnpackingError)
46
- end
47
-
48
- it "should raise an error if archive is invalid" do
49
- file = File.join(@file_dir, 'invalid.tar.gz')
50
- lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::InvalidArchiveError)
51
- end
52
-
53
- it "should raise an error if archive is not in utf-8" do
54
- file = File.join(@file_dir, 'latin1.tar.gz')
55
- lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::EncodingError)
56
- end
57
-
58
- it "should work with files that have non-alfanumeric characters and spaces" do
59
- file = File.join(@file_dir, 'file with characters(3).gz')
60
- dwc = DarwinCore.new(file)
61
- dwc.archive.valid?.should be_true
62
- end
63
- end
64
-
65
- describe ".normalize_classification" do
66
- it "should return flat list if file has no parent id information" do
67
- file = File.join(@file_dir, 'flat_list.tar.gz')
68
- dwc = DarwinCore.new(file)
69
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
70
- cn.normalize
71
- cn.normalized_data.should_not be_nil
72
- cn.normalized_data.size.should > 0
73
- end
74
-
75
- it "should return array or hash of name_strings back" do
76
- file = File.join(@file_dir, 'data.tar.gz')
77
- dwc = DarwinCore.new(file)
78
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
79
- cn.normalize
80
- name_strings = cn.name_strings
81
- name_strings.is_a?(Array).should be_true
82
- name_strings.size.should > 1
83
- name_strings = cn.name_strings(with_hash: true)
84
- name_strings.size.should > 1
85
- name_strings.is_a?(Hash).should be_true
86
- name_strings.is_a?(Hash).should be_true
87
- name_strings.values.uniq.should == [1]
88
- vernacular_name_strings = cn.vernacular_name_strings
89
- vernacular_name_strings.is_a?(Array).should be_true
90
- vernacular_name_strings.size.should > 0
91
- vernacular_name_strings = cn.vernacular_name_strings(with_hash: true)
92
- vernacular_name_strings.size.should > 0
93
- vernacular_name_strings.is_a?(Hash).should be_true
94
- vernacular_name_strings.values.uniq.should == [1]
95
- end
96
-
97
- it "should traverse DarwinCore files and assemble data for every node in memory" do
98
- file = File.join(@file_dir, 'data.tar.gz')
99
- dwc = DarwinCore.new(file)
100
- norm = dwc.normalize_classification
101
- norm.class.should == Hash
102
- path_encodings = []
103
- norm.each do |taxon_id, taxon|
104
- taxon.classification_path.each {|p| path_encodings << p.encoding}
105
- end
106
- path_encodings.uniq!
107
- path_encodings.size.should == 1
108
- path_encodings[0].to_s.should == "UTF-8"
109
- norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
110
- norm['leptogastrinae:tid:2857'].source.should == 'http://leptogastrinae.lifedesks.org/pages/2857'
111
- end
112
-
113
- it "should assemble synonyms from core" do
114
- file = File.join(@file_dir, 'data.tar.gz')
115
- dwc = DarwinCore.new(file)
116
- norm = dwc.normalize_classification
117
- syn = norm.values.select {|n| n.synonyms.size > 0}[0].synonyms[0]
118
- syn.id.should == 'leptogastrinae:tid:127'
119
- syn.name.should == "Leptogastridae"
120
- syn.source.should == 'http://leptogastrinae.lifedesks.org/pages/127'
121
- end
122
-
123
- it "should be able to assemble vernacular names from an extension" do
124
- file = File.join(@file_dir, 'data.tar.gz')
125
- dwc = DarwinCore.new(file)
126
- norm = dwc.normalize_classification
127
- norm.select { |k,v| !v.vernacular_names.empty? }.map { |k,v| v.vernacular_names }.size.should > 0
128
- end
129
-
130
- it "should be able to assemble synonyms from extension" do
131
- file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
132
- dwc = DarwinCore.new(file)
133
- norm = dwc.normalize_classification
134
- norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
135
- end
136
-
137
- it "should not assemble synonyms from extension with scientificName, and file name not matching 'synonym'" do
138
- file = File.join(@file_dir, 'not_synonym_in_extension.tar.gz')
139
- dwc = DarwinCore.new(file)
140
- norm = dwc.normalize_classification
141
- norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should == 0
142
- end
143
-
144
- it "should not attempt to assemble extensions with with_extensions opts set to false" do
145
- file = File.join(@file_dir, 'data.tar.gz')
146
- dwc = DarwinCore.new(file)
147
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
148
- norm = cn.normalize(:with_extensions => false)
149
- norm.select { |k,v| !v.vernacular_names.empty? }.size.should == 0
150
- norm = cn.normalize()
151
- norm.select { |k,v| !v.vernacular_names.empty? }.size.should > 0
152
- file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
153
- dwc = DarwinCore.new(file)
154
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
155
- norm = cn.normalize(:with_extensions => false)
156
- norm.select { |k,v| !v.synonyms.empty? }.size.should == 0
157
- norm = cn.normalize()
158
- norm.select { |k,v| !v.synonyms.empty? }.size.should > 0
159
- end
160
-
161
- it "should assemble linnean classification if terms for it exists" do
162
- file = File.join(@file_dir, 'linnean.tar.gz')
163
- dwc = DarwinCore.new(file)
164
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
165
- norm = cn.normalize
166
- cn.normalized_data.first.last.linnean_classification_path.should == [["Animalia", :kingdom], ["Arthropoda", :phylum], ["Insecta", :class], ["Diptera", :order], ["Cecidomyiidae", :family], ["Resseliella", :genus]]
167
- end
168
-
169
- it "should keep linnean classification empty if terms are not there" do
170
- file = File.join(@file_dir, 'data.tar.gz')
171
- dwc = DarwinCore.new(file)
172
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
173
- norm = cn.normalize
174
- cn.normalized_data.first.last.linnean_classification_path.should == []
175
- end
176
-
177
- it "should be able to assemble synonyms from core" do
178
- file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
179
- dwc = DarwinCore.new(file)
180
- norm = dwc.normalize_classification
181
- norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
182
- end
183
-
184
- it "should be able to assemble synonyms from extension" do
185
- file = File.join(@file_dir, 'data.tar.gz')
186
- dwc = DarwinCore.new(file)
187
- norm = dwc.normalize_classification
188
- nodes_with_syn = norm.select { |k,v| !v.synonyms.empty? }
189
- nodes_with_syn.map { |k,v| v.synonyms }.size.should > 0
190
- nodes_with_syn.first[1].synonyms.first.status.should == 'synonym'
191
- end
192
-
193
- it "should be able work with files which have scientificNameAuthorship" do
194
- file = File.join(@file_dir, 'sci_name_authorship.tar.gz')
195
- dwc = DarwinCore.new(file)
196
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
197
- norm = cn.normalize
198
- path_encodings = norm.map {|taxon_id, taxon| taxon.classification_path}.flatten.map { |name| name.encoding.to_s }.uniq
199
- path_encodings.size.should == 1
200
- path_encodings[0].should == "UTF-8"
201
- taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
202
- taxa.size.should == 507
203
- syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size > s.canonical_name.split(" ").size}
204
- syn.size.should == 50
205
- end
206
-
207
- it "should be able work with files which repeat scientificNameAuthorship value in scientificName field" do
208
- file = File.join(@file_dir, 'sci_name_authorship_dup.tar.gz')
209
- dwc = DarwinCore.new(file)
210
- norm = dwc.normalize_classification
211
- taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
212
- taxa.size.should == 507
213
- syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size > s.canonical_name.split(" ").size}
214
- syn.size.should == 50
215
- end
216
-
217
- it "should be able open files where coreid is empty" do
218
- file = File.join(@file_dir, 'empty_coreid.tar.gz')
219
- dwc = DarwinCore.new(file)
220
- norm = dwc.normalize_classification
221
- taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
222
- taxa.size.should == 2
223
- end
224
-
225
- it "should be able to get language and locality fields for vernacular names" do
226
- file = File.join(@file_dir, 'language_locality.tar.gz')
227
- dwc = DarwinCore.new(file)
228
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
229
- cn.normalize
230
- vn = cn.normalized_data['leptogastrinae:tid:42'].vernacular_names.first
231
- vn.language.should == 'en'
232
- vn.locality.should == 'New England'
233
- end
234
-
235
- it 'should be able to get uuids from gnub dataset' do
236
- file = File.join(@file_dir, 'gnub.tar.gz')
237
- dwc = DarwinCore.new(file)
238
- cn = DarwinCore::ClassificationNormalizer.new(dwc)
239
- cn.normalize
240
- vn = cn.normalized_data['9c399f90-cfb8-5a7f-9a21-18285a473488']
241
- vn.class.should == DarwinCore::GnubTaxon
242
- vn.uuid.should == '8faa91f6-663f-4cfe-b785-0ab4e9415a51'
243
- vn.uuid_path.should == [
244
- "9a9f9eeb-d5f9-4ff6-b6cb-a5ad345e33c3",
245
- "bf4c91c0-3d1f-44c7-9d3b-249382182a26",
246
- "8faa91f6-663f-4cfe-b785-0ab4e9415a51"]
247
- end
248
- end
249
-
250
- end
@@ -1 +0,0 @@
1
- --color