dwc-archive 0.7.3 → 0.7.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -2,6 +2,9 @@
2
2
 
3
3
  Darwin Core Archive format is a current standard for information exchange between Global Names Architecture modules. This gem allows you to work with Darwin Core Archive data compressed to either zip or tar.gz files. More information about Darwin Core Archive can be found on a GBIF page[http://www.gbif.org/informatics/standards-and-tools/publishing-data/data-standards/darwin-core-archives/]:
4
4
 
5
+
6
+ {<img src="http://travis-ci.org/dimus/dwc-archive.png" />}[http://travis-ci.org/dwc-archive/dwc-archive]
7
+
5
8
  == Installation
6
9
 
7
10
  Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.7.3
1
+ 0.7.4
@@ -70,3 +70,11 @@ Feature: Reading of a Darwin Core Archive
70
70
  And there are paths, synonyms and vernacular names in normalized classification
71
71
  And names used in classification can be accessed by "name_strings" method
72
72
  And nodes_ids organized in trees can be accessed by "tree" method
73
+
74
+ Scenario: Normalizing classification without canonical names
75
+ Given path to a dwc file "data.tar.gz"
76
+ When I create a new DarwinCore instance
77
+ Then I am able to use DarwinCore#normalize_classification method
78
+ Then I am able to use normalize method without canonical names path
79
+ And get normalized classification in expected format
80
+ And there are id paths, no canonical names paths in normalized classification
@@ -34,6 +34,10 @@ When /^I create a new DarwinCore instance$/ do
34
34
  end
35
35
  end
36
36
 
37
+ When /^I create DarwinCore::ClassificationNormalizer instance$/ do
38
+ @cn = DarwinCore::ClassificationNormalizer.new(@dwc)
39
+ end
40
+
37
41
  Then /^instance should have a valid archive$/ do
38
42
  @dwc.archive.valid?.should be_true
39
43
  end
@@ -201,6 +205,22 @@ Then /^there are paths, synonyms and vernacular names in normalized classificati
201
205
  @synonyms_are_generated.should be_true
202
206
  end
203
207
 
208
+ Then /^there are id paths, no canonical names paths in normalized classification$/ do
209
+ id_paths_generated = false
210
+ canonical_paths_generated = false
211
+ @cn.normalized_data.should_not be_empty
212
+ @cn.normalized_data.each do |k, v|
213
+ if v.classification_path.size > 0
214
+ canonical_paths_generated = true
215
+ end
216
+ if v.classification_path_id.size > 0
217
+ id_paths_generated = true
218
+ end
219
+ end
220
+ id_paths_generated.should be_true
221
+ canonical_paths_generated.should be_false
222
+ end
223
+
204
224
  Then /^names used in classification can be accessed by "([^"]*)" method$/ do |name_strings|
205
225
  names = @cn.send(name_strings.to_sym)
206
226
  names.size.should > @normalized_classification.size
@@ -212,7 +232,6 @@ Then /^nodes_ids organized in trees can be accessed by "([^"]*)" method$/ do |tr
212
232
  data.each do |k, v|
213
233
  keys << k
214
234
  if v != {}
215
- debugger if v.class != Hash
216
235
  flatten_tree(v, keys)
217
236
  end
218
237
  end
@@ -224,3 +243,7 @@ Then /^nodes_ids organized in trees can be accessed by "([^"]*)" method$/ do |tr
224
243
  @normalized_classification.size.should == keys.size
225
244
  end
226
245
 
246
+ Then /^I am able to use normalize method without canonical names path$/ do
247
+ @cn = DarwinCore::ClassificationNormalizer.new(@dwc)
248
+ @cn.normalize(:with_canonical_names => false)
249
+ end
@@ -44,12 +44,12 @@ class DarwinCore
44
44
  @name_strings.keys
45
45
  end
46
46
 
47
- def normalize
47
+ def normalize(opts = {:with_canoical_names => true})
48
+ @with_canonical_names = opts[:with_canonical_names] != nil ? opts[:with_canonical_names] : true
48
49
  DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
49
- @normalized_data = {}
50
50
  ingest_core
51
51
  DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
52
- calculate_classification_path
52
+ has_parent_id? ? calculate_classification_path : @normalized_data.keys.each { |id| @tree[id] = {} }
53
53
  DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
54
54
  ingest_extensions
55
55
  @normalized_data
@@ -59,8 +59,12 @@ class DarwinCore
59
59
 
60
60
  def get_canonical_name(a_scientific_name)
61
61
  a_scientific_name.force_encoding('utf-8')
62
- canonical_name = @parser.parse(a_scientific_name, :canonical_only => true)
63
- canonical_name.to_s.empty? ? a_scientific_name : canonical_name
62
+ if @with_canonical_names
63
+ canonical_name = @parser.parse(a_scientific_name, :canonical_only => true)
64
+ canonical_name.to_s.empty? ? a_scientific_name : canonical_name
65
+ else
66
+ nil
67
+ end
64
68
  end
65
69
 
66
70
  def get_fields(element)
@@ -88,13 +92,13 @@ class DarwinCore
88
92
 
89
93
  def set_scientific_name(row, fields)
90
94
  row[fields[:scientificname]] = 'N/A' unless row[fields[:scientificname]]
91
- canonical_name = ''
95
+ canonical_name = nil
92
96
  scientific_name = row[fields[:scientificname]].strip.force_encoding('utf-8')
93
97
  if separate_canonical_and_authorship?(row, fields)
94
- canonical_name = row[fields[:scientificname]].strip.force_encoding('utf-8')
98
+ canonical_name = row[fields[:scientificname]].strip.force_encoding('utf-8') if @with_canonical_names
95
99
  scientific_name += " #{row[fields[:scientificnameauthorship]].strip.force_encoding('utf-8')}"
96
100
  else
97
- canonical_name = get_canonical_name(row[fields[:scientificname]])
101
+ canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
98
102
  end
99
103
  fields[:canonicalname] = row.size
100
104
  row << canonical_name
@@ -112,30 +116,38 @@ class DarwinCore
112
116
 
113
117
 
114
118
  def ingest_core
119
+ @normalized_data = {}
115
120
  raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
116
121
  @dwc.core.read do |rows|
122
+ rows[1].each do |error|
123
+ @error_names << { :data => error, :error => :reading_or_encoding_error }
124
+ end
117
125
  rows[0].each do |r|
118
126
  set_scientific_name(r, @core_fields)
119
127
  #core has AcceptedNameUsageId
120
128
  if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
121
129
  add_synonym_from_core(@core_fields[:acceptednameusageid], r)
122
130
  elsif !@core_fields[:acceptednameusageid] && @core_fields[:taxonomicstatus] && status_synonym?(r[@core_fields[:taxonomicstatus]])
123
- add_synonym_from_core(parent_id, r)
131
+ add_synonym_from_core(parent_id, r) if has_parent_id?
124
132
  else
125
133
  taxon = @normalized_data[r[@core_fields[:id]]] ? @normalized_data[r[@core_fields[:id]]] : @normalized_data[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
126
134
  taxon.id = r[@core_fields[:id]]
127
135
  taxon.current_name = r[@core_fields[:scientificname]]
128
136
  taxon.current_name_canonical = r[@core_fields[:canonicalname]]
129
- taxon.parent_id = r[parent_id]
137
+ taxon.parent_id = has_parent_id? ? r[parent_id] : nil
130
138
  taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
131
139
  taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
132
140
  add_name_string(taxon.current_name)
133
- add_name_string(taxon.current_name_canonical)
141
+ add_name_string(taxon.current_name_canonical) if taxon.current_name_canonical && !taxon.current_name_canonical.empty?
134
142
  end
135
143
  end
136
144
  end
137
145
  end
138
146
 
147
+ def has_parent_id?
148
+ @has_parent_id ||= @core_fields.has_key?(:highertaxonid) || @core_fields.has_key?(:parentnameusageid)
149
+ end
150
+
139
151
  def parent_id
140
152
  parent_id_field = @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
141
153
  end
@@ -143,47 +155,52 @@ class DarwinCore
143
155
  def calculate_classification_path
144
156
  @paths_num = 0
145
157
  @normalized_data.each do |taxon_id, taxon|
146
- next if !taxon.classification_path.empty?
158
+ next if !taxon.classification_path_id.empty?
147
159
  res = get_classification_path(taxon)
148
160
  next if res == 'error'
149
161
  end
150
162
  end
151
163
 
152
164
  def get_classification_path(taxon)
153
- return if !taxon.classification_path.empty?
165
+ return if !taxon.classification_path_id.empty?
154
166
  @paths_num += 1
155
167
  DarwinCore.logger_write(@dwc.object_id, "Calculated %s paths" % @paths_num) if @paths_num % 10000 == 0
156
168
  current_node = {taxon.id => {}}
157
169
  if DarwinCore.nil_field?(taxon.parent_id)
158
- taxon.classification_path << taxon.current_name_canonical
170
+ taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
159
171
  taxon.classification_path_id << taxon.id
160
172
  @tree.merge!(current_node)
161
173
  else
162
- parent_cp = nil
174
+ parent_cp = parent_cpid = nil
163
175
  if @normalized_data[taxon.parent_id]
164
- parent_cp = @normalized_data[taxon.parent_id].classification_path
176
+ parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
177
+ parent_cpid = @normalized_data[taxon.parent_id].classification_path_id
165
178
  else
166
179
  current_parent = @normalized_data[@synonyms[taxon.parent_id]]
167
180
  if current_parent
168
181
  error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' is deprecated"
169
- @error_names << {:name => taxon, :error => :deprecated_parent, :current_parent => current_parent }
170
- parent_cp = current_parent.classification_path
182
+ @error_names << {:data => taxon, :error => :deprecated_parent, :current_parent => current_parent }
183
+
184
+ parent_cp = current_parent.classification_path if @with_canonical_names
185
+ parent_cpid = current_parent.classification_path_id
171
186
  else
172
187
  error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' not found"
173
- @error_names << {:name => taxon, :error => :deprecated_parent, :current_parent => nil}
188
+ @error_names << {:data => taxon, :error => :deprecated_parent, :current_parent => nil}
174
189
  end
175
190
  end
176
- return 'error' unless parent_cp
177
- if parent_cp.empty?
191
+ return 'error' unless parent_cpid
192
+ if parent_cpid.empty?
178
193
  res = get_classification_path(@normalized_data[taxon.parent_id])
179
194
  return res if res == 'error'
180
- taxon.classification_path += @normalized_data[taxon.parent_id].classification_path + [taxon.current_name_canonical]
195
+ if @with_canonical_names
196
+ taxon.classification_path += @normalized_data[taxon.parent_id].classification_path + [taxon.current_name_canonical]
197
+ end
181
198
  taxon.classification_path_id += @normalized_data[taxon.parent_id].classification_path_id + [taxon.id]
182
199
  parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
183
200
  parent_node.merge!(current_node)
184
201
  else
185
- taxon.classification_path += parent_cp + [taxon.current_name_canonical]
186
- taxon.classification_path_id += @normalized_data[taxon.parent_id].classification_path_id + [taxon.id]
202
+ taxon.classification_path += parent_cp + [taxon.current_name_canonical] if @with_canonical_names
203
+ taxon.classification_path_id += parent_cpid + [taxon.id]
187
204
  parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
188
205
  begin
189
206
  parent_node.merge!(current_node)
@@ -57,10 +57,13 @@ describe DarwinCore do
57
57
  end
58
58
 
59
59
  describe ".normalize_classification" do
60
- it "should return nil if file has no parent id information" do
60
+ it "should return flat list if file has no parent id information" do
61
61
  file = File.join(@file_dir, 'flat_list.tar.gz')
62
62
  dwc = DarwinCore.new(file)
63
- dwc.normalize_classification.should be_nil
63
+ cn = DarwinCore::ClassificationNormalizer.new(dwc)
64
+ cn.normalize
65
+ cn.normalized_data.should_not be_nil
66
+ cn.normalized_data.size.should > 0
64
67
  end
65
68
 
66
69
  it "should traverse DarwinCore files and assemble data for every node in memory" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc-archive
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.7.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-16 00:00:00.000000000Z
12
+ date: 2011-11-28 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: parsley-store
16
- requirement: &70195181105880 !ruby/object:Gem::Requirement
16
+ requirement: &70241095997220 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.3.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70195181105880
24
+ version_requirements: *70241095997220
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rspec
27
- requirement: &70195181104820 !ruby/object:Gem::Requirement
27
+ requirement: &70241095996620 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 2.3.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70195181104820
35
+ version_requirements: *70241095996620
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70195181104200 !ruby/object:Gem::Requirement
38
+ requirement: &70241095995420 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *70195181104200
46
+ version_requirements: *70241095995420
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: cucumber
49
- requirement: &70195181103720 !ruby/object:Gem::Requirement
49
+ requirement: &70241095994720 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *70195181103720
57
+ version_requirements: *70241095994720
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: bundler
60
- requirement: &70195181103100 !ruby/object:Gem::Requirement
60
+ requirement: &70241095993980 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ~>
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: 1.0.0
66
66
  type: :development
67
67
  prerelease: false
68
- version_requirements: *70195181103100
68
+ version_requirements: *70241095993980
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: jeweler
71
- requirement: &70195181102480 !ruby/object:Gem::Requirement
71
+ requirement: &70241095993280 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ~>
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: 1.6.4
77
77
  type: :development
78
78
  prerelease: false
79
- version_requirements: *70195181102480
79
+ version_requirements: *70241095993280
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: ruby-debug19
82
- requirement: &70195181102000 !ruby/object:Gem::Requirement
82
+ requirement: &70241095992540 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :development
89
89
  prerelease: false
90
- version_requirements: *70195181102000
90
+ version_requirements: *70241095992540
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: parsley-store
93
- requirement: &70195181101520 !ruby/object:Gem::Requirement
93
+ requirement: &70241095991840 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: 0.3.0
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70195181101520
101
+ version_requirements: *70241095991840
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: rspec
104
- requirement: &70195181101020 !ruby/object:Gem::Requirement
104
+ requirement: &70241095991100 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: 1.2.9
110
110
  type: :development
111
111
  prerelease: false
112
- version_requirements: *70195181101020
112
+ version_requirements: *70241095991100
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: cucumber
115
- requirement: &70195181100540 !ruby/object:Gem::Requirement
115
+ requirement: &70241095990400 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :development
122
122
  prerelease: false
123
- version_requirements: *70195181100540
123
+ version_requirements: *70241095990400
124
124
  description: Darwin Core Archive is the current standard exchange format for Global
125
125
  Names Architecture modules. This gem makes it easy to incorporate files in Darwin
126
126
  Core Archive format into a ruby project.