dwc-archive 0.9.6 → 0.9.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +30 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +6 -4
- data/CHANGELOG +2 -0
- data/Gemfile +1 -15
- data/README.md +17 -5
- data/Rakefile +6 -24
- data/] +40 -0
- data/dwc-archive.gemspec +33 -0
- data/lib/dwc-archive.rb +33 -21
- data/lib/dwc-archive/archive.rb +5 -2
- data/lib/dwc-archive/classification_normalizer.rb +4 -0
- data/lib/dwc-archive/core.rb +2 -2
- data/lib/dwc-archive/expander.rb +6 -2
- data/lib/dwc-archive/generator.rb +18 -8
- data/lib/dwc-archive/generator_eml_xml.rb +16 -14
- data/lib/dwc-archive/generator_meta_xml.rb +19 -11
- data/lib/dwc-archive/ingester.rb +1 -1
- data/lib/dwc-archive/metadata.rb +8 -2
- data/lib/dwc-archive/version.rb +3 -0
- data/lib/dwc-archive/xml_reader.rb +9 -9
- data/spec/lib/classification_normalizer_spec.rb +223 -0
- data/spec/lib/core_spec.rb +98 -0
- data/spec/lib/darwin_core_spec.rb +279 -0
- data/spec/lib/generator_eml_xml_spec.rb +21 -0
- data/spec/lib/generator_meta_xml_spec.rb +21 -0
- data/spec/lib/generator_spec.rb +116 -0
- data/spec/lib/gnub_taxon_spec.rb +34 -0
- data/spec/lib/metadata_spec.rb +80 -0
- data/spec/lib/taxon_normalized_spec.rb +145 -0
- data/spec/lib/xml_reader_spec.rb +13 -10
- data/spec/spec_helper.rb +72 -3
- metadata +133 -62
- data/Gemfile.lock +0 -155
- data/VERSION +0 -1
- data/lib/dwc-archive/.expander.rb.swo +0 -0
- data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
- data/spec/lib/dwc-archive_spec.rb +0 -250
- data/spec/spec.opts +0 -1
@@ -8,23 +8,31 @@ class DarwinCore
|
|
8
8
|
end
|
9
9
|
|
10
10
|
def create
|
11
|
+
schema_uri = 'http://rs.tdwg.org/dwc/terms/xsd/archive/' +
|
12
|
+
' http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd'
|
11
13
|
builder = Nokogiri::XML::Builder.new do |xml|
|
12
|
-
opts = { :
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
14
|
+
opts = { encoding: 'UTF-8',
|
15
|
+
fieldsTerminatedBy: ',',
|
16
|
+
fieldsEnclosedBy: '"',
|
17
|
+
linesTerminatedBy: "\n",
|
18
|
+
rowType: 'http://rs.tdwg.org/dwc/terms/Taxon' }
|
19
|
+
xml.archive(xmlns: 'http://rs.tdwg.org/dwc/text/',
|
20
|
+
:'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
|
21
|
+
:'xsi:schemaLocation' => schema_uri) do
|
22
|
+
xml.core(opts.merge(ignoreHeaderLines:
|
23
|
+
@data[:core][:ignoreHeaderLines])) do
|
17
24
|
xml.files { xml.location(@data[:core][:location]) }
|
18
25
|
taxon_id, fields = find_taxon_id(@data[:core][:fields])
|
19
|
-
xml.id_(:
|
20
|
-
fields.each { |f| xml.field(:
|
26
|
+
xml.id_(index: taxon_id[1])
|
27
|
+
fields.each { |f| xml.field(term: f[0], index: f[1]) }
|
21
28
|
end
|
22
29
|
@data[:extensions].each do |e|
|
23
|
-
xml.extension(opts.merge(:
|
30
|
+
xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
|
31
|
+
rowType: e[:rowType])) do
|
24
32
|
xml.files { xml.location(e[:location]) }
|
25
33
|
taxon_id, fields = find_taxon_id(e[:fields])
|
26
|
-
xml.coreid(:
|
27
|
-
fields.each { |f| xml.field(:
|
34
|
+
xml.coreid(index: taxon_id[1])
|
35
|
+
fields.each { |f| xml.field(term: f[0], index: f[1]) }
|
28
36
|
end
|
29
37
|
end
|
30
38
|
end
|
@@ -39,7 +47,7 @@ class DarwinCore
|
|
39
47
|
def find_taxon_id(data)
|
40
48
|
fields = []
|
41
49
|
data.each_with_index { |f, i| fields << [f.strip, i] }
|
42
|
-
taxon_id, fields = fields.partition { |f| f[0].match(
|
50
|
+
taxon_id, fields = fields.partition { |f| f[0].match(%r|/taxonid$|i) }
|
43
51
|
raise DarwinCore::GeneratorError if taxon_id.size != 1
|
44
52
|
[taxon_id[0], fields]
|
45
53
|
end
|
data/lib/dwc-archive/ingester.rb
CHANGED
@@ -62,7 +62,7 @@ class DarwinCore
|
|
62
62
|
raise DarwinCore::EncodingError.new(err_msg)
|
63
63
|
end
|
64
64
|
@field_separator = get_field_separator
|
65
|
-
@quote_character = @properties[:fieldsEnclosedBy] ||
|
65
|
+
@quote_character = @properties[:fieldsEnclosedBy] || ''
|
66
66
|
@line_separator = @properties[:linesTerminatedBy] || '\n'
|
67
67
|
@ignore_headers = @properties[:ignoreHeaderLines] ?
|
68
68
|
[1, true].include?(@properties[:ignoreHeaderLines]) :
|
data/lib/dwc-archive/metadata.rb
CHANGED
@@ -23,8 +23,14 @@ class DarwinCore
|
|
23
23
|
|
24
24
|
def authors
|
25
25
|
return nil unless defined?(@metadata[:eml][:dataset][:creator])
|
26
|
-
@metadata[:eml][:dataset][:creator] =
|
27
|
-
|
26
|
+
@metadata[:eml][:dataset][:creator] =
|
27
|
+
[@metadata[:eml][:dataset][:creator]] unless
|
28
|
+
@metadata[:eml][:dataset][:creator].class == Array
|
29
|
+
@metadata[:eml][:dataset][:creator].map do |c|
|
30
|
+
{ first_name: c[:individualName][:givenName],
|
31
|
+
last_name: c[:individualName][:surName],
|
32
|
+
email: c[:electronicMailAddress] }
|
33
|
+
end
|
28
34
|
end
|
29
35
|
|
30
36
|
def abstract
|
@@ -1,16 +1,14 @@
|
|
1
1
|
# USAGE: DarwinCore::XmlReader.from_xml(YOUR_XML_STRING)
|
2
2
|
require 'nokogiri'
|
3
|
-
# modified from
|
3
|
+
# modified from
|
4
|
+
# http://stackoverflow.com/questions/1230741/
|
5
|
+
# convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
|
4
6
|
class DarwinCore
|
5
7
|
module XmlReader
|
6
8
|
class << self
|
7
9
|
def from_xml(xml_io)
|
8
|
-
|
9
|
-
|
10
|
-
return { result.root.name.to_sym => xml_node_to_hash(result.root)}
|
11
|
-
rescue Exception => e
|
12
|
-
raise e
|
13
|
-
end
|
10
|
+
result = Nokogiri::XML(xml_io)
|
11
|
+
return { result.root.name.to_sym => xml_node_to_hash(result.root)}
|
14
12
|
end
|
15
13
|
|
16
14
|
private
|
@@ -22,7 +20,8 @@ class DarwinCore
|
|
22
20
|
if node.attributes != {}
|
23
21
|
result_hash[:attributes] = {}
|
24
22
|
node.attributes.keys.each do |key|
|
25
|
-
result_hash[:attributes][node.attributes[key].
|
23
|
+
result_hash[:attributes][node.attributes[key].
|
24
|
+
name.to_sym] = prepare(node.attributes[key].value)
|
26
25
|
end
|
27
26
|
end
|
28
27
|
if node.children.size > 0
|
@@ -37,7 +36,8 @@ class DarwinCore
|
|
37
36
|
if result_hash[child.name.to_sym].is_a?(Object::Array)
|
38
37
|
result_hash[child.name.to_sym] << prepare(result)
|
39
38
|
else
|
40
|
-
result_hash[child.name.to_sym] =
|
39
|
+
result_hash[child.name.to_sym] =
|
40
|
+
[result_hash[child.name.to_sym]] << prepare(result)
|
41
41
|
end
|
42
42
|
else
|
43
43
|
result_hash[child.name.to_sym] = prepare(result)
|
@@ -0,0 +1,223 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
describe DarwinCore::ClassificationNormalizer do
|
5
|
+
|
6
|
+
subject(:dwca) { DarwinCore.new(file_path) }
|
7
|
+
subject(:normalizer) { DarwinCore::ClassificationNormalizer.new(dwca) }
|
8
|
+
|
9
|
+
let(:file_dir) { File.expand_path('../../files', __FILE__) }
|
10
|
+
let(:file_path) { File.join(file_dir, file_name) }
|
11
|
+
|
12
|
+
describe '.new' do
|
13
|
+
let(:file_path) { File.join(file_dir, 'data.tar.gz') }
|
14
|
+
it { expect(normalizer.is_a? DarwinCore::ClassificationNormalizer).
|
15
|
+
to be_true }
|
16
|
+
end
|
17
|
+
|
18
|
+
describe '#normalize' do
|
19
|
+
let(:file_name) { 'data.tar.gz' }
|
20
|
+
|
21
|
+
it 'returns normalized data' do
|
22
|
+
res = normalizer.normalize
|
23
|
+
expect(res).to be normalizer.normalized_data
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
context 'flat list' do
|
28
|
+
let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
|
29
|
+
|
30
|
+
it 'returns flat list' do
|
31
|
+
normalizer.normalize
|
32
|
+
expect(normalizer.normalized_data).to be_kind_of Hash
|
33
|
+
expect(normalizer.normalized_data.size).to be > 0
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
context 'synonyms from core' do
|
38
|
+
let(:file_name) { 'synonyms_in_core_accepted_name_field.tar.gz' }
|
39
|
+
|
40
|
+
it 'ingests synonyms using accepted_name field' do
|
41
|
+
res = normalizer.normalize
|
42
|
+
syn = res.select { |k,v| !v.synonyms.empty? }.
|
43
|
+
map { |k,v| v }
|
44
|
+
expect(syn.size).to be > 0
|
45
|
+
expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
context 'synonyms from extension' do
|
50
|
+
let(:file_name) { 'synonyms_in_extension.tar.gz' }
|
51
|
+
it 'ingests synonyms from extension' do
|
52
|
+
res = normalizer.normalize
|
53
|
+
syn = res.select { |k,v| !v.synonyms.empty? }.
|
54
|
+
map { |k,v| v }
|
55
|
+
expect(syn.size).to be > 0
|
56
|
+
expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
context 'synonyms are not extensions' do
|
61
|
+
let(:file_name) { 'not_synonym_in_extension.tar.gz' }
|
62
|
+
|
63
|
+
it 'does not ingest synonyms' do
|
64
|
+
res = normalizer.normalize
|
65
|
+
syn = res.select { |k,v| !v.synonyms.empty? }.
|
66
|
+
map { |k,v| v }
|
67
|
+
expect(syn).to be_empty
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
context 'with_extensions flag set to false' do
|
72
|
+
let(:file_name) { 'synonyms_in_extension.tar.gz' }
|
73
|
+
it 'should not harvest extensions' do
|
74
|
+
res = normalizer.normalize(with_extensions: false)
|
75
|
+
syn = res.select { |k,v| !v.synonyms.empty? }.
|
76
|
+
map { |k,v| v }
|
77
|
+
expect(syn).to be_empty
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
context 'linnean classification in file (class, order etc fields)' do
|
82
|
+
let(:file_name) { 'linnean.tar.gz' }
|
83
|
+
|
84
|
+
it 'assembles classification' do
|
85
|
+
res = normalizer.normalize
|
86
|
+
expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
|
87
|
+
expect(res.first[1].linnean_classification_path).
|
88
|
+
to eq [["Animalia", :kingdom],
|
89
|
+
["Arthropoda", :phylum],
|
90
|
+
["Insecta", :class],
|
91
|
+
["Diptera", :order],
|
92
|
+
["Cecidomyiidae", :family],
|
93
|
+
["Resseliella", :genus]]
|
94
|
+
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
context 'no linnean fields are given' do
|
99
|
+
it 'returns empty linnean classification' do
|
100
|
+
res = normalizer.normalize
|
101
|
+
expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
|
102
|
+
expect(res.first[1].linnean_classification_path).to be_empty
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
context 'in the presence of scientificNameAuthorship field' do
|
107
|
+
let(:file_name) { 'sci_name_authorship.tar.gz' }
|
108
|
+
it 'returns normalized data' do
|
109
|
+
normalizer.normalize
|
110
|
+
expect(normalizer.darwin_core.file_name).
|
111
|
+
to eq 'sci_name_authorship.tar.gz'
|
112
|
+
expect(normalizer.normalized_data).to be_kind_of Hash
|
113
|
+
expect(normalizer.normalized_data.size).to be > 0
|
114
|
+
tn = normalizer.normalized_data['leptogastrinae:tid:2688']
|
115
|
+
expect(tn.current_name).to eq 'Leptogaster fornicata Martin, 1957'
|
116
|
+
expect(tn.current_name_canonical).to eq 'Leptogaster fornicata'
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
context 'when scientificNameAuthorship duplicates author info' do
|
121
|
+
let(:file_name) { 'sci_name_authorship_dup.tar.gz' }
|
122
|
+
it 'returns normalized data' do
|
123
|
+
normalizer.normalize
|
124
|
+
expect(normalizer.darwin_core.file_name).
|
125
|
+
to eq 'sci_name_authorship_dup.tar.gz'
|
126
|
+
expect(normalizer.normalized_data).to be_kind_of Hash
|
127
|
+
expect(normalizer.normalized_data.size).to be > 0
|
128
|
+
tn = normalizer.normalized_data['leptogastrinae:tid:2688']
|
129
|
+
expect(tn.current_name).to eq 'Leptogaster fornicata Martin, 1957'
|
130
|
+
expect(tn.current_name_canonical).to eq 'Leptogaster fornicata'
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
context 'coreid is empty' do
|
135
|
+
let(:file_name) { 'empty_coreid.tar.gz' }
|
136
|
+
it 'should ingest information' do
|
137
|
+
res = normalizer.normalize
|
138
|
+
expect(normalizer.darwin_core.file_name).
|
139
|
+
to eq 'empty_coreid.tar.gz'
|
140
|
+
tn = res['Taxon9']
|
141
|
+
expect(tn.current_name).to eq 'Amanita phalloides'
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
context 'vernacular locality info' do
|
146
|
+
let(:file_name) { 'language_locality.tar.gz' }
|
147
|
+
it 'should ingest locality and language' do
|
148
|
+
res = normalizer.normalize
|
149
|
+
tn = res['leptogastrinae:tid:42']
|
150
|
+
vn = tn.vernacular_names[0]
|
151
|
+
expect(vn.language).to eq 'en'
|
152
|
+
expect(vn.locality).to eq 'New England'
|
153
|
+
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
describe '#name_strings' do
|
158
|
+
let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
|
159
|
+
|
160
|
+
context 'before running #normalize' do
|
161
|
+
it 'is empty' do
|
162
|
+
expect(normalizer.name_strings).to be_empty
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
context 'after running #normalize' do
|
167
|
+
let(:normalized) { normalizer.tap { |n| n.normalize } }
|
168
|
+
|
169
|
+
context 'default attributes' do
|
170
|
+
it 'returns array' do
|
171
|
+
expect(normalized.name_strings).to be_kind_of Array
|
172
|
+
expect(normalized.name_strings.size).to be > 1
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
context 'with_hash attribute' do
|
177
|
+
it 'returns hash' do
|
178
|
+
strings = normalized.name_strings(with_hash:true)
|
179
|
+
expect(strings).to be_kind_of Hash
|
180
|
+
expect(strings.size).to be > 1
|
181
|
+
expect(strings.values.uniq).to eq [1]
|
182
|
+
end
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
end
|
187
|
+
|
188
|
+
describe '#vernacular_name_strings' do
|
189
|
+
let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
|
190
|
+
|
191
|
+
context 'before running #normalize' do
|
192
|
+
subject(:vern) { normalizer.vernacular_name_strings }
|
193
|
+
it 'is empty' do
|
194
|
+
expect(vern).to be_empty
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
context 'after running #normalize' do
|
199
|
+
let(:normalized) { normalizer.tap { |n| n.normalize } }
|
200
|
+
subject(:vern) { normalized.vernacular_name_strings }
|
201
|
+
subject(:vern_w_hash) { normalized.
|
202
|
+
vernacular_name_strings(with_hash: true) }
|
203
|
+
|
204
|
+
context 'default attributes' do
|
205
|
+
it 'returns array' do
|
206
|
+
expect(vern).to be_kind_of Array
|
207
|
+
expect(vern.size).to be > 0
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
context 'with_hash attribute' do
|
212
|
+
it 'returns hash' do
|
213
|
+
expect(vern_w_hash).to be_kind_of Hash
|
214
|
+
expect(vern_w_hash.size).to be > 0
|
215
|
+
expect(vern_w_hash.values.uniq).to eq [1]
|
216
|
+
end
|
217
|
+
end
|
218
|
+
|
219
|
+
end
|
220
|
+
|
221
|
+
end
|
222
|
+
|
223
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
|
3
|
+
describe DarwinCore::Core do
|
4
|
+
subject(:dwca) { DarwinCore.new(file_path) }
|
5
|
+
subject(:core) { DarwinCore::Core.new(dwca) }
|
6
|
+
let(:file_path) { File.join(File.expand_path('../../files', __FILE__),
|
7
|
+
file_name) }
|
8
|
+
let(:file_name) { 'data.tar.gz' }
|
9
|
+
|
10
|
+
|
11
|
+
describe '.new' do
|
12
|
+
it 'creates new core' do
|
13
|
+
expect(core).to be_kind_of DarwinCore::Core
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
describe '#id' do
|
18
|
+
|
19
|
+
it 'returns core id' do
|
20
|
+
expect(core.id[:index]).to eq 0
|
21
|
+
expect(core.id[:term]).to eq 'http://rs.tdwg.org/dwc/terms/TaxonID'
|
22
|
+
end
|
23
|
+
|
24
|
+
context 'no coreid' do
|
25
|
+
let(:file_name) { 'empty_coreid.tar.gz' }
|
26
|
+
|
27
|
+
it 'does not return coreid' do
|
28
|
+
expect(core.id[:index]).to eq 0
|
29
|
+
expect(core.id[:term]).to be_nil
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
describe '#data' do
|
35
|
+
it 'gets core data' do
|
36
|
+
expect(core.data).to be_kind_of Hash
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
describe '#properties' do
|
41
|
+
it 'gets core properties' do
|
42
|
+
expect(core.properties).to be_kind_of Hash
|
43
|
+
expect(core.properties.keys).to match_array [:encoding,
|
44
|
+
:fieldsTerminatedBy, :linesTerminatedBy, :fieldsEnclosedBy,
|
45
|
+
:ignoreHeaderLines, :rowType ]
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
describe '#encoding' do
|
50
|
+
it 'returns encoding of the data' do
|
51
|
+
expect(core.encoding).to eq 'UTF-8'
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
describe '#fields_separator' do
|
56
|
+
it 'returns separator of fields for csv files' do
|
57
|
+
expect(core.fields_separator).to be_nil
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
describe '#size' do
|
62
|
+
it 'returns number of lines in the core' do
|
63
|
+
expect(core.size).to eq 588
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
describe '#file_path' do
|
68
|
+
it 'returns file path of core file' do
|
69
|
+
expect(core.file_path).to match 'DarwinCore.txt'
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
describe '#fields' do
|
74
|
+
it 'returns fields of the core file' do
|
75
|
+
expect(core.fields.size).to eq 7
|
76
|
+
expect(core.fields).to be_kind_of Array
|
77
|
+
expect(core.fields[0]).to be_kind_of Hash
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
describe '#line_separator' do
|
82
|
+
it 'returns characters separating lines in csv file' do
|
83
|
+
expect(core.line_separator).to eq "\\n"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe '#quote_character' do
|
88
|
+
it 'returns quote character for the csv file' do
|
89
|
+
expect(core.quote_character).to eq ''
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
describe '#ignore_headers' do
|
94
|
+
it 'returns true if headers should not be included into data' do
|
95
|
+
expect(core.ignore_headers).to eq true
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|