dwc-archive 0.7.3 → 0.7.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +3 -0
- data/VERSION +1 -1
- data/features/dwca-reader.feature +8 -0
- data/features/step_definitions/dwc-reader_steps.rb +24 -1
- data/lib/dwc-archive/classification_normalizer.rb +41 -24
- data/spec/lib/dwc-archive_spec.rb +5 -2
- metadata +22 -22
data/README.rdoc
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
|
3
3
|
Darwin Core Archive format is a current standard for information exchange between Global Names Architecture modules. This gem makes it possible to work with Darwin Core Archive data compressed as either zip or tar.gz files. More information about Darwin Core Archive can be found on a GBIF page[http://www.gbif.org/informatics/standards-and-tools/publishing-data/data-standards/darwin-core-archives/]:
|
4
4
|
|
5
|
+
|
6
|
+
{<img src="http://travis-ci.org/dimus/dwc-archive.png" />}[http://travis-ci.org/dwc-archive/dwc-archive]
|
7
|
+
|
5
8
|
== Installation
|
6
9
|
|
7
10
|
Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.7.
|
1
|
+
0.7.4
|
@@ -70,3 +70,11 @@ Feature: Reading of a Darwin Core Archive
|
|
70
70
|
And there are paths, synonyms and vernacular names in normalized classification
|
71
71
|
And names used in classification can be accessed by "name_strings" method
|
72
72
|
And nodes_ids organized in trees can be accessed by "tree" method
|
73
|
+
|
74
|
+
Scenario: Normalizing classification without canonical names
|
75
|
+
Given path to a dwc file "data.tar.gz"
|
76
|
+
When I create a new DarwinCore instance
|
77
|
+
Then I am able to use DarwinCore#normalize_classification method
|
78
|
+
Then I am able to use normalize method without canonical names path
|
79
|
+
And get normalized classification in expected format
|
80
|
+
And there are id paths, no canonical names paths in normalized classification
|
@@ -34,6 +34,10 @@ When /^I create a new DarwinCore instance$/ do
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
When /^I create DarwinCore::ClassificationNormalizer instance$/ do
|
38
|
+
@cn = DarwinCore::ClassificationNormalizer.new(@dwc)
|
39
|
+
end
|
40
|
+
|
37
41
|
Then /^instance should have a valid archive$/ do
|
38
42
|
@dwc.archive.valid?.should be_true
|
39
43
|
end
|
@@ -201,6 +205,22 @@ Then /^there are paths, synonyms and vernacular names in normalized classificati
|
|
201
205
|
@synonyms_are_generated.should be_true
|
202
206
|
end
|
203
207
|
|
208
|
+
Then /^there are id paths, no canonical names paths in normalized classification$/ do
|
209
|
+
id_paths_generated = false
|
210
|
+
canonical_paths_generated = false
|
211
|
+
@cn.normalized_data.should_not be_empty
|
212
|
+
@cn.normalized_data.each do |k, v|
|
213
|
+
if v.classification_path.size > 0
|
214
|
+
canonical_paths_generated = true
|
215
|
+
end
|
216
|
+
if v.classification_path_id.size > 0
|
217
|
+
id_paths_generated = true
|
218
|
+
end
|
219
|
+
end
|
220
|
+
id_paths_generated.should be_true
|
221
|
+
canonical_paths_generated.should be_false
|
222
|
+
end
|
223
|
+
|
204
224
|
Then /^names used in classification can be accessed by "([^"]*)" method$/ do |name_strings|
|
205
225
|
names = @cn.send(name_strings.to_sym)
|
206
226
|
names.size.should > @normalized_classification.size
|
@@ -212,7 +232,6 @@ Then /^nodes_ids organized in trees can be accessed by "([^"]*)" method$/ do |tr
|
|
212
232
|
data.each do |k, v|
|
213
233
|
keys << k
|
214
234
|
if v != {}
|
215
|
-
debugger if v.class != Hash
|
216
235
|
flatten_tree(v, keys)
|
217
236
|
end
|
218
237
|
end
|
@@ -224,3 +243,7 @@ Then /^nodes_ids organized in trees can be accessed by "([^"]*)" method$/ do |tr
|
|
224
243
|
@normalized_classification.size.should == keys.size
|
225
244
|
end
|
226
245
|
|
246
|
+
Then /^I am able to use normalize method without canonical names path$/ do
|
247
|
+
@cn = DarwinCore::ClassificationNormalizer.new(@dwc)
|
248
|
+
@cn.normalize(:with_canonical_names => false)
|
249
|
+
end
|
@@ -44,12 +44,12 @@ class DarwinCore
|
|
44
44
|
@name_strings.keys
|
45
45
|
end
|
46
46
|
|
47
|
-
def normalize
|
47
|
+
def normalize(opts = {:with_canoical_names => true})
|
48
|
+
@with_canonical_names = opts[:with_canonical_names] != nil ? opts[:with_canonical_names] : true
|
48
49
|
DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
|
49
|
-
@normalized_data = {}
|
50
50
|
ingest_core
|
51
51
|
DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
|
52
|
-
calculate_classification_path
|
52
|
+
has_parent_id? ? calculate_classification_path : @normalized_data.keys.each { |id| @tree[id] = {} }
|
53
53
|
DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
|
54
54
|
ingest_extensions
|
55
55
|
@normalized_data
|
@@ -59,8 +59,12 @@ class DarwinCore
|
|
59
59
|
|
60
60
|
def get_canonical_name(a_scientific_name)
|
61
61
|
a_scientific_name.force_encoding('utf-8')
|
62
|
-
|
63
|
-
|
62
|
+
if @with_canonical_names
|
63
|
+
canonical_name = @parser.parse(a_scientific_name, :canonical_only => true)
|
64
|
+
canonical_name.to_s.empty? ? a_scientific_name : canonical_name
|
65
|
+
else
|
66
|
+
nil
|
67
|
+
end
|
64
68
|
end
|
65
69
|
|
66
70
|
def get_fields(element)
|
@@ -88,13 +92,13 @@ class DarwinCore
|
|
88
92
|
|
89
93
|
def set_scientific_name(row, fields)
|
90
94
|
row[fields[:scientificname]] = 'N/A' unless row[fields[:scientificname]]
|
91
|
-
canonical_name =
|
95
|
+
canonical_name = nil
|
92
96
|
scientific_name = row[fields[:scientificname]].strip.force_encoding('utf-8')
|
93
97
|
if separate_canonical_and_authorship?(row, fields)
|
94
|
-
canonical_name = row[fields[:scientificname]].strip.force_encoding('utf-8')
|
98
|
+
canonical_name = row[fields[:scientificname]].strip.force_encoding('utf-8') if @with_canonical_names
|
95
99
|
scientific_name += " #{row[fields[:scientificnameauthorship]].strip.force_encoding('utf-8')}"
|
96
100
|
else
|
97
|
-
canonical_name = get_canonical_name(row[fields[:scientificname]])
|
101
|
+
canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
|
98
102
|
end
|
99
103
|
fields[:canonicalname] = row.size
|
100
104
|
row << canonical_name
|
@@ -112,30 +116,38 @@ class DarwinCore
|
|
112
116
|
|
113
117
|
|
114
118
|
def ingest_core
|
119
|
+
@normalized_data = {}
|
115
120
|
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
|
116
121
|
@dwc.core.read do |rows|
|
122
|
+
rows[1].each do |error|
|
123
|
+
@error_names << { :data => error, :error => :reading_or_encoding_error }
|
124
|
+
end
|
117
125
|
rows[0].each do |r|
|
118
126
|
set_scientific_name(r, @core_fields)
|
119
127
|
#core has AcceptedNameUsageId
|
120
128
|
if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
|
121
129
|
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
|
122
130
|
elsif !@core_fields[:acceptednameusageid] && @core_fields[:taxonomicstatus] && status_synonym?(r[@core_fields[:taxonomicstatus]])
|
123
|
-
add_synonym_from_core(parent_id, r)
|
131
|
+
add_synonym_from_core(parent_id, r) if has_parent_id?
|
124
132
|
else
|
125
133
|
taxon = @normalized_data[r[@core_fields[:id]]] ? @normalized_data[r[@core_fields[:id]]] : @normalized_data[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
|
126
134
|
taxon.id = r[@core_fields[:id]]
|
127
135
|
taxon.current_name = r[@core_fields[:scientificname]]
|
128
136
|
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
129
|
-
taxon.parent_id = r[parent_id]
|
137
|
+
taxon.parent_id = has_parent_id? ? r[parent_id] : nil
|
130
138
|
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
131
139
|
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
132
140
|
add_name_string(taxon.current_name)
|
133
|
-
add_name_string(taxon.current_name_canonical)
|
141
|
+
add_name_string(taxon.current_name_canonical) if taxon.current_name_canonical && !taxon.current_name_canonical.empty?
|
134
142
|
end
|
135
143
|
end
|
136
144
|
end
|
137
145
|
end
|
138
146
|
|
147
|
+
def has_parent_id?
|
148
|
+
@has_parent_id ||= @core_fields.has_key?(:highertaxonid) || @core_fields.has_key?(:parentnameusageid)
|
149
|
+
end
|
150
|
+
|
139
151
|
def parent_id
|
140
152
|
parent_id_field = @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
|
141
153
|
end
|
@@ -143,47 +155,52 @@ class DarwinCore
|
|
143
155
|
def calculate_classification_path
|
144
156
|
@paths_num = 0
|
145
157
|
@normalized_data.each do |taxon_id, taxon|
|
146
|
-
next if !taxon.
|
158
|
+
next if !taxon.classification_path_id.empty?
|
147
159
|
res = get_classification_path(taxon)
|
148
160
|
next if res == 'error'
|
149
161
|
end
|
150
162
|
end
|
151
163
|
|
152
164
|
def get_classification_path(taxon)
|
153
|
-
return if !taxon.
|
165
|
+
return if !taxon.classification_path_id.empty?
|
154
166
|
@paths_num += 1
|
155
167
|
DarwinCore.logger_write(@dwc.object_id, "Calculated %s paths" % @paths_num) if @paths_num % 10000 == 0
|
156
168
|
current_node = {taxon.id => {}}
|
157
169
|
if DarwinCore.nil_field?(taxon.parent_id)
|
158
|
-
taxon.classification_path << taxon.current_name_canonical
|
170
|
+
taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
|
159
171
|
taxon.classification_path_id << taxon.id
|
160
172
|
@tree.merge!(current_node)
|
161
173
|
else
|
162
|
-
parent_cp = nil
|
174
|
+
parent_cp = parent_cpid = nil
|
163
175
|
if @normalized_data[taxon.parent_id]
|
164
|
-
parent_cp = @normalized_data[taxon.parent_id].classification_path
|
176
|
+
parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
|
177
|
+
parent_cpid = @normalized_data[taxon.parent_id].classification_path_id
|
165
178
|
else
|
166
179
|
current_parent = @normalized_data[@synonyms[taxon.parent_id]]
|
167
180
|
if current_parent
|
168
181
|
error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' is deprecated"
|
169
|
-
@error_names << {:
|
170
|
-
|
182
|
+
@error_names << {:data => taxon, :error => :deprecated_parent, :current_parent => current_parent }
|
183
|
+
|
184
|
+
parent_cp = current_parent.classification_path if @with_canonical_names
|
185
|
+
parent_cpid = current_parent.classification_path_id
|
171
186
|
else
|
172
187
|
error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' not found"
|
173
|
-
@error_names << {:
|
188
|
+
@error_names << {:data => taxon, :error => :deprecated_parent, :current_parent => nil}
|
174
189
|
end
|
175
190
|
end
|
176
|
-
return 'error' unless
|
177
|
-
if
|
191
|
+
return 'error' unless parent_cpid
|
192
|
+
if parent_cpid.empty?
|
178
193
|
res = get_classification_path(@normalized_data[taxon.parent_id])
|
179
194
|
return res if res == 'error'
|
180
|
-
|
195
|
+
if @with_canonical_names
|
196
|
+
taxon.classification_path += @normalized_data[taxon.parent_id].classification_path + [taxon.current_name_canonical]
|
197
|
+
end
|
181
198
|
taxon.classification_path_id += @normalized_data[taxon.parent_id].classification_path_id + [taxon.id]
|
182
199
|
parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
|
183
200
|
parent_node.merge!(current_node)
|
184
201
|
else
|
185
|
-
taxon.classification_path += parent_cp + [taxon.current_name_canonical]
|
186
|
-
taxon.classification_path_id +=
|
202
|
+
taxon.classification_path += parent_cp + [taxon.current_name_canonical] if @with_canonical_names
|
203
|
+
taxon.classification_path_id += parent_cpid + [taxon.id]
|
187
204
|
parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
|
188
205
|
begin
|
189
206
|
parent_node.merge!(current_node)
|
@@ -57,10 +57,13 @@ describe DarwinCore do
|
|
57
57
|
end
|
58
58
|
|
59
59
|
describe ".normalize_classification" do
|
60
|
-
it "should return
|
60
|
+
it "should return flat list if file has no parent id information" do
|
61
61
|
file = File.join(@file_dir, 'flat_list.tar.gz')
|
62
62
|
dwc = DarwinCore.new(file)
|
63
|
-
dwc
|
63
|
+
cn = DarwinCore::ClassificationNormalizer.new(dwc)
|
64
|
+
cn.normalize
|
65
|
+
cn.normalized_data.should_not be_nil
|
66
|
+
cn.normalized_data.size.should > 0
|
64
67
|
end
|
65
68
|
|
66
69
|
it "should traverse DarwinCore files and assemble data for every node in memory" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-11-
|
12
|
+
date: 2011-11-28 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: parsley-store
|
16
|
-
requirement: &
|
16
|
+
requirement: &70241095997220 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.3.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70241095997220
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &70241095996620 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.3.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70241095996620
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70241095995420 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70241095995420
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: cucumber
|
49
|
-
requirement: &
|
49
|
+
requirement: &70241095994720 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70241095994720
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: bundler
|
60
|
-
requirement: &
|
60
|
+
requirement: &70241095993980 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 1.0.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70241095993980
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70241095993280 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 1.6.4
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70241095993280
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: ruby-debug19
|
82
|
-
requirement: &
|
82
|
+
requirement: &70241095992540 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70241095992540
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: parsley-store
|
93
|
-
requirement: &
|
93
|
+
requirement: &70241095991840 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: 0.3.0
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70241095991840
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: rspec
|
104
|
-
requirement: &
|
104
|
+
requirement: &70241095991100 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: 1.2.9
|
110
110
|
type: :development
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70241095991100
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: cucumber
|
115
|
-
requirement: &
|
115
|
+
requirement: &70241095990400 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :development
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70241095990400
|
124
124
|
description: Darwin Core Archive is the current standard exchange format for Global
|
125
125
|
Names Architecture modules. This gem makes it easy to incorporate files in Darwin
|
126
126
|
Core Archive format into a ruby project.
|