dwc-archive 0.9.6 → 0.9.10

This diff shows the changes between two publicly released versions of this package, as they appear in the package's public registry. It is provided for informational purposes only.
@@ -8,23 +8,31 @@ class DarwinCore
   end
 
   def create
+    schema_uri = 'http://rs.tdwg.org/dwc/terms/xsd/archive/' +
+      ' http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd'
     builder = Nokogiri::XML::Builder.new do |xml|
-      opts = { :encoding => "UTF-8", :fieldsTerminatedBy => ",", :fieldsEnclosedBy => '"', :linesTerminatedBy => "\n", :rowType => "http://rs.tdwg.org/dwc/terms/Taxon" }
-      xml.archive(:xmlns => "http://rs.tdwg.org/dwc/text/",
-                  "xmlns:xsi" =>"http://www.w3.org/2001/XMLSchema-instance",
-                  "xsi:schemaLocation" => "http://rs.tdwg.org/dwc/terms/xsd/archive/ http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd") do
-        xml.core(opts.merge(:ignoreHeaderLines => @data[:core][:ignoreHeaderLines])) do
+      opts = { encoding: 'UTF-8',
+               fieldsTerminatedBy: ',',
+               fieldsEnclosedBy: '"',
+               linesTerminatedBy: "\n",
+               rowType: 'http://rs.tdwg.org/dwc/terms/Taxon' }
+      xml.archive(xmlns: 'http://rs.tdwg.org/dwc/text/',
+                  :'xmlns:xsi' => 'http://www.w3.org/2001/XMLSchema-instance',
+                  :'xsi:schemaLocation' => schema_uri) do
+        xml.core(opts.merge(ignoreHeaderLines:
+                            @data[:core][:ignoreHeaderLines])) do
           xml.files { xml.location(@data[:core][:location]) }
           taxon_id, fields = find_taxon_id(@data[:core][:fields])
-          xml.id_(:index => taxon_id[1])
-          fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
+          xml.id_(index: taxon_id[1])
+          fields.each { |f| xml.field(term: f[0], index: f[1]) }
         end
         @data[:extensions].each do |e|
-          xml.extension(opts.merge(:ignoreHeaderLines => e[:ignoreHeaderLines], :rowType => e[:rowType])) do
+          xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
+                                   rowType: e[:rowType])) do
            xml.files { xml.location(e[:location]) }
            taxon_id, fields = find_taxon_id(e[:fields])
-           xml.coreid(:index => taxon_id[1])
-           fields.each { |f| xml.field(:term => f[0], :index => f[1]) }
+           xml.coreid(index: taxon_id[1])
+           fields.each { |f| xml.field(term: f[0], index: f[1]) }
          end
        end
      end
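Note: the rewritten `create` above emits the same meta.xml as before; only the Ruby hash syntax, quoting, and line wrapping change, and the long schemaLocation string moves into `schema_uri`. The shape of the `@data` hash the method consumes can be read off its calls; a minimal sketch, with illustrative file names and field lists (only the keys come from the code above):

    # Hypothetical input for the meta.xml builder shown above.
    # The keys (:core, :extensions, :location, :fields, :ignoreHeaderLines,
    # :rowType) appear in the diff; the values here are made up.
    data = {
      core: {
        location: 'core.csv',
        ignoreHeaderLines: 1,
        # exactly one field must end in '/TaxonID' (see find_taxon_id below)
        fields: ['http://rs.tdwg.org/dwc/terms/TaxonID',
                 'http://rs.tdwg.org/dwc/terms/scientificName']
      },
      extensions: [
        { location: 'vernacular_names.csv',
          ignoreHeaderLines: 1,
          rowType: 'http://rs.gbif.org/terms/1.0/VernacularName',
          fields: ['http://rs.tdwg.org/dwc/terms/TaxonID',
                   'http://rs.tdwg.org/dwc/terms/vernacularName'] }
      ]
    }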
@@ -39,7 +47,7 @@ class DarwinCore
   def find_taxon_id(data)
     fields = []
     data.each_with_index { |f, i| fields << [f.strip, i] }
-    taxon_id, fields = fields.partition { |f| f[0].match(/\/taxonid$/i) }
+    taxon_id, fields = fields.partition { |f| f[0].match(%r|/taxonid$|i) }
     raise DarwinCore::GeneratorError if taxon_id.size != 1
     [taxon_id[0], fields]
   end
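The switch to a `%r|...|` literal in `find_taxon_id` is cosmetic: it avoids escaping the slash and matches exactly the same strings as the old `/\/taxonid$/i`. A quick check with a hypothetical term URI:

    term = 'http://rs.tdwg.org/dwc/terms/TaxonID'
    term.match(/\/taxonid$/i)   # old literal  => matches "/TaxonID"
    term.match(%r|/taxonid$|i)  # new literal  => same match, only the delimiter changed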
@@ -62,7 +62,7 @@ class DarwinCore
       raise DarwinCore::EncodingError.new(err_msg)
     end
     @field_separator = get_field_separator
-    @quote_character = @properties[:fieldsEnclosedBy] || ""
+    @quote_character = @properties[:fieldsEnclosedBy] || ''
     @line_separator = @properties[:linesTerminatedBy] || '\n'
     @ignore_headers = @properties[:ignoreHeaderLines] ?
       [1, true].include?(@properties[:ignoreHeaderLines]) :
@@ -23,8 +23,14 @@ class DarwinCore
 
   def authors
     return nil unless defined?(@metadata[:eml][:dataset][:creator])
-    @metadata[:eml][:dataset][:creator] = [@metadata[:eml][:dataset][:creator]] unless @metadata[:eml][:dataset][:creator].class == Array
-    @metadata[:eml][:dataset][:creator].map {|c| {:first_name => c[:individualName][:givenName], :last_name => c[:individualName][:surName], :email => c[:electronicMailAddress]}}
+    @metadata[:eml][:dataset][:creator] =
+      [@metadata[:eml][:dataset][:creator]] unless
+      @metadata[:eml][:dataset][:creator].class == Array
+    @metadata[:eml][:dataset][:creator].map do |c|
+      { first_name: c[:individualName][:givenName],
+        last_name: c[:individualName][:surName],
+        email: c[:electronicMailAddress] }
+    end
   end
 
   def abstract
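The reworked `authors` still wraps a lone EML creator in an array and maps each creator to a flat hash; only the formatting and hash syntax change. A sketch of the transformation with a made-up creator (the EML keys are the ones referenced in the code):

    # Illustrative parsed EML fragment; the name and email are invented.
    @metadata = { eml: { dataset: { creator: {
      individualName: { givenName: 'Jane', surName: 'Doe' },
      electronicMailAddress: 'jane.doe@example.org' } } } }

    authors
    #=> [{ first_name: 'Jane', last_name: 'Doe', email: 'jane.doe@example.org' }]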
@@ -0,0 +1,3 @@
+class DarwinCore
+  VERSION = "0.9.10"
+end
@@ -1,16 +1,14 @@
 # USAGE: Hash.from_xml:(YOUR_XML_STRING)
 require 'nokogiri'
-# modified from http://stackoverflow.com/questions/1230741/convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
+# modified from
+# http://stackoverflow.com/questions/1230741/
+# convert-a-nokogiri-document-to-a-ruby-hash/1231297#1231297
 class DarwinCore
   module XmlReader
     class << self
       def from_xml(xml_io)
-        begin
-          result = Nokogiri::XML(xml_io)
-          return { result.root.name.to_sym => xml_node_to_hash(result.root)}
-        rescue Exception => e
-          raise e
-        end
+        result = Nokogiri::XML(xml_io)
+        return { result.root.name.to_sym => xml_node_to_hash(result.root)}
       end
 
       private
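Removing the `begin ... rescue Exception => e; raise e; end` wrapper does not change behavior: rescuing an exception only to re-raise it immediately is a no-op, so callers still see the original Nokogiri error. Call sites are unaffected; a hedged usage sketch (the require path is assumed, not shown in this diff):

    require 'dwc_archive'  # assumed gem entry point

    hash = DarwinCore::XmlReader.from_xml(File.read('meta.xml'))
    hash.keys  #=> [:archive] for a Darwin Core Archive meta file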
@@ -22,7 +20,8 @@ class DarwinCore
         if node.attributes != {}
           result_hash[:attributes] = {}
           node.attributes.keys.each do |key|
-            result_hash[:attributes][node.attributes[key].name.to_sym] = prepare(node.attributes[key].value)
+            result_hash[:attributes][node.attributes[key].
+              name.to_sym] = prepare(node.attributes[key].value)
           end
         end
         if node.children.size > 0
28
27
  if node.children.size > 0
@@ -37,7 +36,8 @@ class DarwinCore
37
36
  if result_hash[child.name.to_sym].is_a?(Object::Array)
38
37
  result_hash[child.name.to_sym] << prepare(result)
39
38
  else
40
- result_hash[child.name.to_sym] = [result_hash[child.name.to_sym]] << prepare(result)
39
+ result_hash[child.name.to_sym] =
40
+ [result_hash[child.name.to_sym]] << prepare(result)
41
41
  end
42
42
  else
43
43
  result_hash[child.name.to_sym] = prepare(result)
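The wrapped assignment keeps the existing merge rule in `xml_node_to_hash`: the first repeated child element promotes the stored value to an array, and later siblings are appended to it. The rule in isolation, on a bare hash (not the gem's API):

    result_hash = { field: { attributes: { index: '0' } } }  # one <field> seen so far

    # a second <field> arrives: promote the value to an array, then append
    result_hash[:field] = [result_hash[:field]] << { attributes: { index: '1' } }
    result_hash[:field].size  #=> 2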
@@ -0,0 +1,223 @@
+require_relative '../spec_helper'
+# encoding: utf-8
+
+describe DarwinCore::ClassificationNormalizer do
+
+  subject(:dwca) { DarwinCore.new(file_path) }
+  subject(:normalizer) { DarwinCore::ClassificationNormalizer.new(dwca) }
+
+  let(:file_dir) { File.expand_path('../../files', __FILE__) }
+  let(:file_path) { File.join(file_dir, file_name) }
+
+  describe '.new' do
+    let(:file_path) { File.join(file_dir, 'data.tar.gz') }
+    it { expect(normalizer.is_a? DarwinCore::ClassificationNormalizer).
+      to be_true }
+  end
+
+  describe '#normalize' do
+    let(:file_name) { 'data.tar.gz' }
+
+    it 'returns normalized data' do
+      res = normalizer.normalize
+      expect(res).to be normalizer.normalized_data
+    end
+
+
+    context 'flat list' do
+      let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
+
+      it 'returns flat list' do
+        normalizer.normalize
+        expect(normalizer.normalized_data).to be_kind_of Hash
+        expect(normalizer.normalized_data.size).to be > 0
+      end
+    end
+
+    context 'synonyms from core' do
+      let(:file_name) { 'synonyms_in_core_accepted_name_field.tar.gz' }
+
+      it 'ingests synonyms using accepted_name field' do
+        res = normalizer.normalize
+        syn = res.select { |k,v| !v.synonyms.empty? }.
+          map { |k,v| v }
+        expect(syn.size).to be > 0
+        expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
+      end
+    end
+
+    context 'synonyms from extension' do
+      let(:file_name) { 'synonyms_in_extension.tar.gz' }
+      it 'ingests synonyms from extension' do
+        res = normalizer.normalize
+        syn = res.select { |k,v| !v.synonyms.empty? }.
+          map { |k,v| v }
+        expect(syn.size).to be > 0
+        expect(syn[0].synonyms[0]).to be_kind_of DarwinCore::SynonymNormalized
+      end
+    end
+
+    context 'synonyms are not extensions' do
+      let(:file_name) { 'not_synonym_in_extension.tar.gz' }
+
+      it 'does not ingest synonyms' do
+        res = normalizer.normalize
+        syn = res.select { |k,v| !v.synonyms.empty? }.
+          map { |k,v| v }
+        expect(syn).to be_empty
+      end
+    end
+
+    context 'with_extensions flag set on false' do
+      let(:file_name) { 'synonyms_in_extension.tar.gz' }
+      it 'should not harvest extensions' do
+        res = normalizer.normalize(with_extensions: false)
+        syn = res.select { |k,v| !v.synonyms.empty? }.
+          map { |k,v| v }
+        expect(syn).to be_empty
+      end
+    end
+
+    context 'linnean classification in file (class, order etc fields)' do
+      let(:file_name) { 'linnean.tar.gz' }
+
+      it 'assembles classification' do
+        res = normalizer.normalize
+        expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
+        expect(res.first[1].linnean_classification_path).
+          to eq [["Animalia", :kingdom],
+                 ["Arthropoda", :phylum],
+                 ["Insecta", :class],
+                 ["Diptera", :order],
+                 ["Cecidomyiidae", :family],
+                 ["Resseliella", :genus]]
+
+      end
+    end
+
+    context 'no linnean fields are given' do
+      it 'returns empty linnean classification' do
+        res = normalizer.normalize
+        expect(res.first[1]).to be_kind_of DarwinCore::TaxonNormalized
+        expect(res.first[1].linnean_classification_path).to be_empty
+      end
+    end
+
+    context 'in the presence of scientificNameAuthorship field' do
+      let(:file_name) { 'sci_name_authorship.tar.gz' }
+      it 'returns normalized data' do
+        normalizer.normalize
+        expect(normalizer.darwin_core.file_name).
+          to eq 'sci_name_authorship.tar.gz'
+        expect(normalizer.normalized_data).to be_kind_of Hash
+        expect(normalizer.normalized_data.size).to be > 0
+        tn = normalizer.normalized_data['leptogastrinae:tid:2688']
+        expect(tn.current_name).to eq 'Leptogaster fornicata Martin, 1957'
+        expect(tn.current_name_canonical).to eq 'Leptogaster fornicata'
+      end
+    end
+
+    context 'when scientificNameAuthorship duplicates author info' do
+      let(:file_name) { 'sci_name_authorship_dup.tar.gz' }
+      it 'returns normalized data' do
+        normalizer.normalize
+        expect(normalizer.darwin_core.file_name).
+          to eq 'sci_name_authorship_dup.tar.gz'
+        expect(normalizer.normalized_data).to be_kind_of Hash
+        expect(normalizer.normalized_data.size).to be > 0
+        tn = normalizer.normalized_data['leptogastrinae:tid:2688']
+        expect(tn.current_name).to eq 'Leptogaster fornicata Martin, 1957'
+        expect(tn.current_name_canonical).to eq 'Leptogaster fornicata'
+      end
+    end
+
+    context 'coreid is empty' do
+      let(:file_name) { 'empty_coreid.tar.gz' }
+      it 'should ingest information' do
+        res = normalizer.normalize
+        expect(normalizer.darwin_core.file_name).
+          to eq 'empty_coreid.tar.gz'
+        tn = res['Taxon9']
+        expect(tn.current_name).to eq 'Amanita phalloides'
+      end
+    end
+
+    context 'vernacular locality info' do
+      let(:file_name) { 'language_locality.tar.gz' }
+      it 'should ingest locality and language' do
+        res = normalizer.normalize
+        tn = res['leptogastrinae:tid:42']
+        vn = tn.vernacular_names[0]
+        expect(vn.language).to eq 'en'
+        expect(vn.locality).to eq 'New England'
+      end
+    end
+  end
+
+  describe '#name_strings' do
+    let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
+
+    context 'before running #normalize' do
+      it 'is empty' do
+        expect(normalizer.name_strings).to be_empty
+      end
+    end
+
+    context 'after running #normalize' do
+      let(:normalized) { normalizer.tap { |n| n.normalize } }
+
+      context 'default attibutes' do
+        it 'returns array' do
+          expect(normalized.name_strings).to be_kind_of Array
+          expect(normalized.name_strings.size).to be > 1
+        end
+      end
+
+      context 'with_hash attribute' do
+        it 'returns hash' do
+          strings = normalized.name_strings(with_hash:true)
+          expect(strings).to be_kind_of Hash
+          expect(strings.size).to be > 1
+          expect(strings.values.uniq).to eq [1]
+        end
+      end
+    end
+
+  end
+
+  describe '#vernacular_name_strings' do
+    let(:file_path) { File.join(file_dir, 'flat_list.tar.gz') }
+
+    context 'before running #normalize' do
+      subject(:vern) { normalizer.vernacular_name_strings }
+      it 'is empty' do
+        expect(vern).to be_empty
+      end
+    end
+
+    context 'after running #normalize' do
+      let(:normalized) { normalizer.tap { |n| n.normalize } }
+      subject(:vern) { normalized.vernacular_name_strings }
+      subject(:vern_w_hash) { normalized.
+        vernacular_name_strings(with_hash: true) }
+
+      context 'default attibutes' do
+        it 'returns array' do
+          expect(vern).to be_kind_of Array
+          expect(vern.size).to be > 0
+        end
+      end
+
+      context 'with_hash attribute' do
+        it 'returns hash' do
+          expect(vern_w_hash).to be_kind_of Hash
+          expect(vern_w_hash.size).to be > 0
+          expect(vern_w_hash.values.uniq).to eq [1]
+        end
+      end
+
+    end
+
+  end
+
+end
@@ -0,0 +1,98 @@
+require_relative '../spec_helper'
+
+describe DarwinCore::Core do
+  subject(:dwca) { DarwinCore.new(file_path) }
+  subject(:core) { DarwinCore::Core.new(dwca) }
+  let(:file_path) { File.join(File.expand_path('../../files', __FILE__),
+                              file_name) }
+  let(:file_name) { 'data.tar.gz' }
+
+
+  describe '.new' do
+    it 'creates new core' do
+      expect(core).to be_kind_of DarwinCore::Core
+    end
+  end
+
+  describe '#id' do
+
+    it 'returns core id' do
+      expect(core.id[:index]).to eq 0
+      expect(core.id[:term]).to eq 'http://rs.tdwg.org/dwc/terms/TaxonID'
+    end
+
+    context 'no coreid' do
+      let(:file_name) { 'empty_coreid.tar.gz' }
+
+      it 'does not return coreid' do
+        expect(core.id[:index]).to eq 0
+        expect(core.id[:term]).to be_nil
+      end
+    end
+  end
+
+  describe '#data' do
+    it 'gers core data' do
+      expect(core.data).to be_kind_of Hash
+    end
+  end
+
+  describe '#properties' do
+    it 'gers core properties' do
+      expect(core.properties).to be_kind_of Hash
+      expect(core.properties.keys).to match_array [:encoding,
+        :fieldsTerminatedBy, :linesTerminatedBy, :fieldsEnclosedBy,
+        :ignoreHeaderLines, :rowType ]
+    end
+  end
+
+  describe '#encoding' do
+    it 'returns encoding of the data' do
+      expect(core.encoding).to eq 'UTF-8'
+    end
+  end
+
+  describe '#fields_separator' do
+    it 'returns separator of fields for csv files' do
+      expect(core.fields_separator).to be_nil
+    end
+  end
+
+  describe '#size' do
+    it 'returns number of lines in the core' do
+      expect(core.size).to eq 588
+    end
+  end
+
+  describe '#file_path' do
+    it 'returns file path of core file' do
+      expect(core.file_path).to match 'DarwinCore.txt'
+    end
+  end
+
+  describe '#fields' do
+    it 'returns fields of the core file' do
+      expect(core.fields.size).to eq 7
+      expect(core.fields).to be_kind_of Array
+      expect(core.fields[0]).to be_kind_of Hash
+    end
+  end
+
+  describe '#line_separator' do
+    it 'returns characters separating lines in csv file' do
+      expect(core.line_separator).to eq "\\n"
+    end
+  end
+
+  describe '#quote_character' do
+    it 'returns quote character for the csv file' do
+      expect(core.quote_character).to eq ''
+    end
+  end
+
+  describe '#ignore headers' do
+    it 'returns true if headers should not be included into data' do
+      expect(core.ignore_headers).to eq true
+    end
+  end
+end