dwc-archive 0.9.6 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -5
- data/CHANGELOG +15 -7
- data/Gemfile +3 -15
- data/LICENSE +1 -1
- data/README.md +135 -111
- data/Rakefile +13 -54
- data/dwc-archive.gemspec +37 -0
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +121 -0
- data/lib/dwc_archive/archive.rb +59 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
- data/lib/dwc_archive/expander.rb +85 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +90 -0
- data/lib/dwc_archive/generator_eml_xml.rb +116 -0
- data/lib/dwc_archive/generator_meta_xml.rb +72 -0
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +56 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +89 -0
- data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +47 -0
- data/spec/files/generator_meta.xml +19 -0
- data/spec/lib/classification_normalizer_spec.rb +214 -0
- data/spec/lib/core_spec.rb +100 -0
- data/spec/lib/darwin_core_spec.rb +249 -0
- data/spec/lib/generator_eml_xml_spec.rb +22 -0
- data/spec/lib/generator_meta_xml_spec.rb +22 -0
- data/spec/lib/generator_spec.rb +124 -0
- data/spec/lib/gnub_taxon_spec.rb +32 -0
- data/spec/lib/metadata_spec.rb +89 -0
- data/spec/lib/taxon_normalized_spec.rb +142 -0
- data/spec/lib/xml_reader_spec.rb +11 -11
- data/spec/spec_helper.rb +78 -6
- metadata +180 -92
- data/.rvmrc +0 -1
- data/Gemfile.lock +0 -155
- data/VERSION +0 -1
- data/lib/dwc-archive.rb +0 -95
- data/lib/dwc-archive/.expander.rb.swo +0 -0
- data/lib/dwc-archive/archive.rb +0 -37
- data/lib/dwc-archive/classification_normalizer.rb +0 -424
- data/lib/dwc-archive/core.rb +0 -17
- data/lib/dwc-archive/expander.rb +0 -80
- data/lib/dwc-archive/generator.rb +0 -75
- data/lib/dwc-archive/generator_eml_xml.rb +0 -84
- data/lib/dwc-archive/generator_meta_xml.rb +0 -50
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -42
- data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
- data/lib/dwc-archive/xml_reader.rb +0 -64
- data/spec/lib/dwc-archive_spec.rb +0 -250
- data/spec/spec.opts +0 -1
@@ -1,10 +0,0 @@
|
|
1
|
-
UTF8RGX = /\A(
|
2
|
-
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
3
|
-
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
4
|
-
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
5
|
-
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
6
|
-
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
|
7
|
-
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
|
8
|
-
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
|
9
|
-
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
|
10
|
-
)*\z/x
|
@@ -1,64 +0,0 @@
|
|
1
|
-
# USAGE: Hash.from_xml:(YOUR_XML_STRING)
|
2
|
-
require 'nokogiri'
|
3
|
-
# modified from http://stackoverflow.com/questions/1230741/convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
|
4
|
-
class DarwinCore
|
5
|
-
module XmlReader
|
6
|
-
class << self
|
7
|
-
def from_xml(xml_io)
|
8
|
-
begin
|
9
|
-
result = Nokogiri::XML(xml_io)
|
10
|
-
return { result.root.name.to_sym => xml_node_to_hash(result.root)}
|
11
|
-
rescue Exception => e
|
12
|
-
raise e
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
private
|
17
|
-
|
18
|
-
def xml_node_to_hash(node)
|
19
|
-
# If we are at the root of the document, start the hash
|
20
|
-
if node.element?
|
21
|
-
result_hash = {}
|
22
|
-
if node.attributes != {}
|
23
|
-
result_hash[:attributes] = {}
|
24
|
-
node.attributes.keys.each do |key|
|
25
|
-
result_hash[:attributes][node.attributes[key].name.to_sym] = prepare(node.attributes[key].value)
|
26
|
-
end
|
27
|
-
end
|
28
|
-
if node.children.size > 0
|
29
|
-
node.children.each do |child|
|
30
|
-
result = xml_node_to_hash(child)
|
31
|
-
|
32
|
-
if child.name == "text"
|
33
|
-
unless child.next_sibling || child.previous_sibling
|
34
|
-
return prepare(result)
|
35
|
-
end
|
36
|
-
elsif result_hash[child.name.to_sym]
|
37
|
-
if result_hash[child.name.to_sym].is_a?(Object::Array)
|
38
|
-
result_hash[child.name.to_sym] << prepare(result)
|
39
|
-
else
|
40
|
-
result_hash[child.name.to_sym] = [result_hash[child.name.to_sym]] << prepare(result)
|
41
|
-
end
|
42
|
-
else
|
43
|
-
result_hash[child.name.to_sym] = prepare(result)
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
return result_hash
|
48
|
-
else
|
49
|
-
return result_hash
|
50
|
-
end
|
51
|
-
else
|
52
|
-
return prepare(node.content.to_s)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
def prepare(data)
|
57
|
-
return data if data.class != String
|
58
|
-
return true if data.strip == "true"
|
59
|
-
return false if data.strip == "false"
|
60
|
-
data.to_i.to_s == data ? data.to_i : data
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end
|
64
|
-
end
|
@@ -1,250 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
|
3
|
-
|
4
|
-
describe DarwinCore do
|
5
|
-
before(:all) do
|
6
|
-
@file_dir = File.join(File.dirname(__FILE__), '..', 'files')
|
7
|
-
end
|
8
|
-
|
9
|
-
describe "VERSION" do
|
10
|
-
it "should return VERSION number" do
|
11
|
-
DarwinCore::VERSION.split('.').join('').to_i.should > 41
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
describe "::nil_field?" do
|
16
|
-
it "should return true for entries which normally mean nil" do
|
17
|
-
[nil, '/N', ''].each do |i|
|
18
|
-
DarwinCore.nil_field?(i).should be_true
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
it "should return false for fields that are not nil" do
|
23
|
-
[0, '0', '123', 123, 'dsdfs434343/N'].each do |i|
|
24
|
-
DarwinCore.nil_field?(i).should be_false
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
describe ".new" do
|
30
|
-
it "should create DarwinCore instance out of archive file" do
|
31
|
-
['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
|
32
|
-
file = File.join(@file_dir, file)
|
33
|
-
dwc = DarwinCore.new(file)
|
34
|
-
dwc.archive.valid?.should be_true
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
it "should raise an error if archive file does not exist" do
|
39
|
-
file = 'not_a_file'
|
40
|
-
lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::FileNotFoundError)
|
41
|
-
end
|
42
|
-
|
43
|
-
it "should raise an error if archive is broken" do
|
44
|
-
file = File.join(@file_dir, 'broken.tar.gz')
|
45
|
-
lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::UnpackingError)
|
46
|
-
end
|
47
|
-
|
48
|
-
it "should raise an error if archive is invalid" do
|
49
|
-
file = File.join(@file_dir, 'invalid.tar.gz')
|
50
|
-
lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::InvalidArchiveError)
|
51
|
-
end
|
52
|
-
|
53
|
-
it "should raise an error if archive is not in utf-8" do
|
54
|
-
file = File.join(@file_dir, 'latin1.tar.gz')
|
55
|
-
lambda { DarwinCore.new(file) }.should raise_error(DarwinCore::EncodingError)
|
56
|
-
end
|
57
|
-
|
58
|
-
it "should work with files that have non-alfanumeric characters and spaces" do
|
59
|
-
file = File.join(@file_dir, 'file with characters(3).gz')
|
60
|
-
dwc = DarwinCore.new(file)
|
61
|
-
dwc.archive.valid?.should be_true
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
describe ".normalize_classification" do
|
66
|
-
it "should return flat list if file has no parent id information" do
|
67
|
-
file = File.join(@file_dir, 'flat_list.tar.gz')
|
68
|
-
dwc = DarwinCore.new(file)
|
69
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
70
|
-
cn.normalize
|
71
|
-
cn.normalized_data.should_not be_nil
|
72
|
-
cn.normalized_data.size.should > 0
|
73
|
-
end
|
74
|
-
|
75
|
-
it "should return array or hash of name_strings back" do
|
76
|
-
file = File.join(@file_dir, 'data.tar.gz')
|
77
|
-
dwc = DarwinCore.new(file)
|
78
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
79
|
-
cn.normalize
|
80
|
-
name_strings = cn.name_strings
|
81
|
-
name_strings.is_a?(Array).should be_true
|
82
|
-
name_strings.size.should > 1
|
83
|
-
name_strings = cn.name_strings(with_hash: true)
|
84
|
-
name_strings.size.should > 1
|
85
|
-
name_strings.is_a?(Hash).should be_true
|
86
|
-
name_strings.is_a?(Hash).should be_true
|
87
|
-
name_strings.values.uniq.should == [1]
|
88
|
-
vernacular_name_strings = cn.vernacular_name_strings
|
89
|
-
vernacular_name_strings.is_a?(Array).should be_true
|
90
|
-
vernacular_name_strings.size.should > 0
|
91
|
-
vernacular_name_strings = cn.vernacular_name_strings(with_hash: true)
|
92
|
-
vernacular_name_strings.size.should > 0
|
93
|
-
vernacular_name_strings.is_a?(Hash).should be_true
|
94
|
-
vernacular_name_strings.values.uniq.should == [1]
|
95
|
-
end
|
96
|
-
|
97
|
-
it "should traverse DarwinCore files and assemble data for every node in memory" do
|
98
|
-
file = File.join(@file_dir, 'data.tar.gz')
|
99
|
-
dwc = DarwinCore.new(file)
|
100
|
-
norm = dwc.normalize_classification
|
101
|
-
norm.class.should == Hash
|
102
|
-
path_encodings = []
|
103
|
-
norm.each do |taxon_id, taxon|
|
104
|
-
taxon.classification_path.each {|p| path_encodings << p.encoding}
|
105
|
-
end
|
106
|
-
path_encodings.uniq!
|
107
|
-
path_encodings.size.should == 1
|
108
|
-
path_encodings[0].to_s.should == "UTF-8"
|
109
|
-
norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
|
110
|
-
norm['leptogastrinae:tid:2857'].source.should == 'http://leptogastrinae.lifedesks.org/pages/2857'
|
111
|
-
end
|
112
|
-
|
113
|
-
it "should assemble synonyms from core" do
|
114
|
-
file = File.join(@file_dir, 'data.tar.gz')
|
115
|
-
dwc = DarwinCore.new(file)
|
116
|
-
norm = dwc.normalize_classification
|
117
|
-
syn = norm.values.select {|n| n.synonyms.size > 0}[0].synonyms[0]
|
118
|
-
syn.id.should == 'leptogastrinae:tid:127'
|
119
|
-
syn.name.should == "Leptogastridae"
|
120
|
-
syn.source.should == 'http://leptogastrinae.lifedesks.org/pages/127'
|
121
|
-
end
|
122
|
-
|
123
|
-
it "should be able to assemble vernacular names from an extension" do
|
124
|
-
file = File.join(@file_dir, 'data.tar.gz')
|
125
|
-
dwc = DarwinCore.new(file)
|
126
|
-
norm = dwc.normalize_classification
|
127
|
-
norm.select { |k,v| !v.vernacular_names.empty? }.map { |k,v| v.vernacular_names }.size.should > 0
|
128
|
-
end
|
129
|
-
|
130
|
-
it "should be able to assemble synonyms from extension" do
|
131
|
-
file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
|
132
|
-
dwc = DarwinCore.new(file)
|
133
|
-
norm = dwc.normalize_classification
|
134
|
-
norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
|
135
|
-
end
|
136
|
-
|
137
|
-
it "should not assemble synonyms from extension with scientificName, and file name not matching 'synonym'" do
|
138
|
-
file = File.join(@file_dir, 'not_synonym_in_extension.tar.gz')
|
139
|
-
dwc = DarwinCore.new(file)
|
140
|
-
norm = dwc.normalize_classification
|
141
|
-
norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should == 0
|
142
|
-
end
|
143
|
-
|
144
|
-
it "should not attempt to assemble extensions with with_extensions opts set to false" do
|
145
|
-
file = File.join(@file_dir, 'data.tar.gz')
|
146
|
-
dwc = DarwinCore.new(file)
|
147
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
148
|
-
norm = cn.normalize(:with_extensions => false)
|
149
|
-
norm.select { |k,v| !v.vernacular_names.empty? }.size.should == 0
|
150
|
-
norm = cn.normalize()
|
151
|
-
norm.select { |k,v| !v.vernacular_names.empty? }.size.should > 0
|
152
|
-
file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
|
153
|
-
dwc = DarwinCore.new(file)
|
154
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
155
|
-
norm = cn.normalize(:with_extensions => false)
|
156
|
-
norm.select { |k,v| !v.synonyms.empty? }.size.should == 0
|
157
|
-
norm = cn.normalize()
|
158
|
-
norm.select { |k,v| !v.synonyms.empty? }.size.should > 0
|
159
|
-
end
|
160
|
-
|
161
|
-
it "should assemble linnean classification if terms for it exists" do
|
162
|
-
file = File.join(@file_dir, 'linnean.tar.gz')
|
163
|
-
dwc = DarwinCore.new(file)
|
164
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
165
|
-
norm = cn.normalize
|
166
|
-
cn.normalized_data.first.last.linnean_classification_path.should == [["Animalia", :kingdom], ["Arthropoda", :phylum], ["Insecta", :class], ["Diptera", :order], ["Cecidomyiidae", :family], ["Resseliella", :genus]]
|
167
|
-
end
|
168
|
-
|
169
|
-
it "should keep linnean classification empty if terms are not there" do
|
170
|
-
file = File.join(@file_dir, 'data.tar.gz')
|
171
|
-
dwc = DarwinCore.new(file)
|
172
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
173
|
-
norm = cn.normalize
|
174
|
-
cn.normalized_data.first.last.linnean_classification_path.should == []
|
175
|
-
end
|
176
|
-
|
177
|
-
it "should be able to assemble synonyms from core" do
|
178
|
-
file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
|
179
|
-
dwc = DarwinCore.new(file)
|
180
|
-
norm = dwc.normalize_classification
|
181
|
-
norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
|
182
|
-
end
|
183
|
-
|
184
|
-
it "should be able to assemble synonyms from extension" do
|
185
|
-
file = File.join(@file_dir, 'data.tar.gz')
|
186
|
-
dwc = DarwinCore.new(file)
|
187
|
-
norm = dwc.normalize_classification
|
188
|
-
nodes_with_syn = norm.select { |k,v| !v.synonyms.empty? }
|
189
|
-
nodes_with_syn.map { |k,v| v.synonyms }.size.should > 0
|
190
|
-
nodes_with_syn.first[1].synonyms.first.status.should == 'synonym'
|
191
|
-
end
|
192
|
-
|
193
|
-
it "should be able work with files which have scientificNameAuthorship" do
|
194
|
-
file = File.join(@file_dir, 'sci_name_authorship.tar.gz')
|
195
|
-
dwc = DarwinCore.new(file)
|
196
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
197
|
-
norm = cn.normalize
|
198
|
-
path_encodings = norm.map {|taxon_id, taxon| taxon.classification_path}.flatten.map { |name| name.encoding.to_s }.uniq
|
199
|
-
path_encodings.size.should == 1
|
200
|
-
path_encodings[0].should == "UTF-8"
|
201
|
-
taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
|
202
|
-
taxa.size.should == 507
|
203
|
-
syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size > s.canonical_name.split(" ").size}
|
204
|
-
syn.size.should == 50
|
205
|
-
end
|
206
|
-
|
207
|
-
it "should be able work with files which repeat scientificNameAuthorship value in scientificName field" do
|
208
|
-
file = File.join(@file_dir, 'sci_name_authorship_dup.tar.gz')
|
209
|
-
dwc = DarwinCore.new(file)
|
210
|
-
norm = dwc.normalize_classification
|
211
|
-
taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
|
212
|
-
taxa.size.should == 507
|
213
|
-
syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size > s.canonical_name.split(" ").size}
|
214
|
-
syn.size.should == 50
|
215
|
-
end
|
216
|
-
|
217
|
-
it "should be able open files where coreid is empty" do
|
218
|
-
file = File.join(@file_dir, 'empty_coreid.tar.gz')
|
219
|
-
dwc = DarwinCore.new(file)
|
220
|
-
norm = dwc.normalize_classification
|
221
|
-
taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
|
222
|
-
taxa.size.should == 2
|
223
|
-
end
|
224
|
-
|
225
|
-
it "should be able to get language and locality fields for vernacular names" do
|
226
|
-
file = File.join(@file_dir, 'language_locality.tar.gz')
|
227
|
-
dwc = DarwinCore.new(file)
|
228
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
229
|
-
cn.normalize
|
230
|
-
vn = cn.normalized_data['leptogastrinae:tid:42'].vernacular_names.first
|
231
|
-
vn.language.should == 'en'
|
232
|
-
vn.locality.should == 'New England'
|
233
|
-
end
|
234
|
-
|
235
|
-
it 'should be able to get uuids from gnub dataset' do
|
236
|
-
file = File.join(@file_dir, 'gnub.tar.gz')
|
237
|
-
dwc = DarwinCore.new(file)
|
238
|
-
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
239
|
-
cn.normalize
|
240
|
-
vn = cn.normalized_data['9c399f90-cfb8-5a7f-9a21-18285a473488']
|
241
|
-
vn.class.should == DarwinCore::GnubTaxon
|
242
|
-
vn.uuid.should == '8faa91f6-663f-4cfe-b785-0ab4e9415a51'
|
243
|
-
vn.uuid_path.should == [
|
244
|
-
"9a9f9eeb-d5f9-4ff6-b6cb-a5ad345e33c3",
|
245
|
-
"bf4c91c0-3d1f-44c7-9d3b-249382182a26",
|
246
|
-
"8faa91f6-663f-4cfe-b785-0ab4e9415a51"]
|
247
|
-
end
|
248
|
-
end
|
249
|
-
|
250
|
-
end
|
data/spec/spec.opts
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
--color
|