dwc-archive 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +9 -0
- data/VERSION +1 -1
- data/features/dwca-reader.feature +6 -0
- data/features/step_definitions/dwc-reader_steps.rb +9 -3
- data/lib/dwc-archive.rb +11 -0
- data/lib/dwc-archive/classification_normalizer.rb +139 -0
- data/lib/dwc-archive/extension.rb +1 -0
- data/lib/dwc-archive/ingester.rb +5 -10
- data/spec/files/flat_list.tar.gz +0 -0
- data/spec/files/synonyms_in_core_accepted_name_field.tar.gz +0 -0
- data/spec/files/synonyms_in_extension.tar.gz +0 -0
- data/spec/lib/dwc-archive_spec.rb +60 -1
- data/spec/lib/ruby_extenstions_spec.rb +1 -1
- metadata +8 -9
data/README.rdoc
CHANGED
@@ -44,6 +44,15 @@ Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
|
|
44
44
|
end
|
45
45
|
results << [tail_data, tail_errors]
|
46
46
|
|
47
|
+
# normalize names in classification collecting together synonyms, canonical names,
|
48
|
+
# vernacular names, and associating paths to taxa in a classification
|
49
|
+
# distributed as DwCA file
|
50
|
+
# NOTE: this functionality requires the biodiversity gem for Ruby 1.8.x and
|
51
|
+
# biodiversity19 gem for ruby 1.9.x
|
52
|
+
|
53
|
+
result = dwc.normalize_classification
|
54
|
+
|
55
|
+
|
47
56
|
DarwinCore.clean_all # remove all expanded archives
|
48
57
|
|
49
58
|
== Note on Patches/Pull Requests
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.4.0
|
@@ -54,3 +54,9 @@ Feature: Reading of a Darwing Core Archive
|
|
54
54
|
When I create a new DarwinCore instance
|
55
55
|
Then I can read its core content using block
|
56
56
|
Then I can read extensions content using block
|
57
|
+
|
58
|
+
Scenario: Normalizing classification
|
59
|
+
Given path to a dwc file "data.tar.gz"
|
60
|
+
When I create a new DarwinCore instance
|
61
|
+
Then I am able to use DarwinCore#normalize_classification method
|
62
|
+
And get normalized classification in expected format
|
@@ -129,9 +129,6 @@ Then /^I can read its content into memory$/ do
|
|
129
129
|
core_data.class.should == Array
|
130
130
|
core_data.size.should == 584
|
131
131
|
core_errors.size.should == 3
|
132
|
-
core_data, core_errors = @dwc.core.read(5)
|
133
|
-
core_data.size.should == 5
|
134
|
-
core_errors.size.should == 0
|
135
132
|
end
|
136
133
|
|
137
134
|
Then /^I can read extensions content into memory$/ do
|
@@ -164,3 +161,12 @@ Then /^I can read extensions content using block$/ do
|
|
164
161
|
res.should == [[1,0]]
|
165
162
|
end
|
166
163
|
|
164
|
+
# Step: calls DarwinCore#normalize_classification on the instance stored in
# @dwc and keeps the result for the following step.
Then /^I am able to use DarwinCore\#normalize_classification method$/ do
  @normalized_classification = @dwc.normalize_classification
end

# Step: checks the shape of the stored result -- a Hash whose first value is a
# DarwinCore::TaxonNormalized.
Then /^get normalized classification in expected format$/ do
  normalized = @normalized_classification
  normalized.class.should == Hash

  first_key = normalized.keys[0]
  first_taxon = normalized[first_key]
  first_taxon.class.should == DarwinCore::TaxonNormalized
end
|
data/lib/dwc-archive.rb
CHANGED
@@ -22,6 +22,7 @@ require 'dwc-archive/metadata'
|
|
22
22
|
require 'dwc-archive/generator'
|
23
23
|
require 'dwc-archive/generator_meta_xml'
|
24
24
|
require 'dwc-archive/generator_eml_xml'
|
25
|
+
require 'dwc-archive/classification_normalizer'
|
25
26
|
|
26
27
|
class DarwinCore
|
27
28
|
attr_reader :archive, :core, :metadata, :extensions
|
@@ -29,6 +30,11 @@ class DarwinCore
|
|
29
30
|
|
30
31
|
DEFAULT_TMP_DIR = "/tmp"
|
31
32
|
|
33
|
+
# Tells whether a raw field value stands for "no value".
#
# field - the value to check (any object).
#
# Returns true when field is nil, an empty string or the literal string '/N';
# returns false for everything else (including 0 and '0').
#
# NOTE(review): '/N' may have been intended as the '\N' NULL marker used by
# some database dumps -- confirm before changing; the specs pin '/N'.
def self.nil_field?(field)
  [nil, '', '/N'].include?(field)
end
|
37
|
+
|
32
38
|
def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
|
33
39
|
@archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
|
34
40
|
@core = DarwinCore::Core.new(@archive)
|
@@ -36,6 +42,11 @@ class DarwinCore
|
|
36
42
|
@extensions = get_extensions
|
37
43
|
end
|
38
44
|
|
45
|
+
# Normalizes the classification contained in this archive.
#
# The core fields are scanned for a term whose last path segment is
# "higherTaxonID" (case-insensitive). Without such a field nothing is
# normalized.
#
# Returns nil when the core has no higherTaxonID field, otherwise whatever
# DarwinCore::ClassificationNormalizer#normalize returns for this instance.
def normalize_classification
  has_parent_ids = core.fields.any? do |f|
    # to_s guards against a field with a missing or empty term, which would
    # otherwise raise NoMethodError on nil.
    f[:term].to_s.split('/').last.to_s.downcase == 'highertaxonid'
  end
  return nil unless has_parent_ids
  DarwinCore::ClassificationNormalizer.new(self).normalize
end
|
49
|
+
|
39
50
|
def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
|
40
51
|
Dir.entries(tmp_dir).each do |entry|
|
41
52
|
path = File.join(tmp_dir, entry)
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'biodiversity'
|
3
|
+
|
4
|
+
class DarwinCore

  # Everything collected about one taxon of a classification: identifiers,
  # current name (verbatim and canonical), path of canonical names leading to
  # it, rank, status, synonyms and vernacular names.
  class TaxonNormalized
    attr_accessor :id, :parent_id, :classification_path, :current_name,
                  :current_name_canonical, :synonyms, :vernacular_names,
                  :rank, :status

    # Starts with all scalar attributes set to nil and empty name lists.
    def initialize
      @id = @parent_id = @classification_path = @current_name = nil
      @current_name_canonical = @rank = @status = nil
      @synonyms = []
      @vernacular_names = []
    end
  end

  # Synonym of a taxon: verbatim name, canonical name and status value.
  class SynonymNormalized < Struct.new(:name, :canonical_name, :status); end

  # Vernacular name of a taxon with its language value.
  class VernacularNormalized < Struct.new(:name, :language); end

  # Reads the core and the extensions of a DarwinCore instance and assembles a
  # Hash of taxon id => TaxonNormalized.
  class ClassificationNormalizer
    # dwc_instance - object responding to #core and #extensions; each of those
    #                elements must respond to #fields, #id and #read.
    def initialize(dwc_instance)
      @dwc = dwc_instance
      @core = get_fields(@dwc.core)
      @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
      @res = {}
      @parser = ScientificNameParser.new
    end

    # Runs the whole normalization: core rows first, then classification
    # paths, then extension rows.
    #
    # Returns the Hash of taxon id => TaxonNormalized.
    # Raises RuntimeError when the core lacks an id or a scientificName field.
    def normalize
      injest_core
      calculate_classification_path
      injest_extensions
      @res
    end

    private

    # Returns the canonical form of a_scientific_name as reported by the
    # parser, or the name itself when the parser could not parse it.
    # On R19 the string is re-tagged as utf-8 in place before parsing.
    def canonical_name(a_scientific_name)
      a_scientific_name.force_encoding('utf-8') if R19
      begin
        parsed_name = @parser.parse(a_scientific_name)[:scientificName]
      rescue
        # Retry once with a brand new parser; a second failure propagates.
        @parser = ScientificNameParser.new
        parsed_name = @parser.parse(a_scientific_name)[:scientificName]
      end
      parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name
    end

    # Maps the element's fields to column indices, keyed by the downcased last
    # segment of each term (as a Symbol); :id holds the index of the id column.
    def get_fields(element)
      data = element.fields.inject({}) do |res, f|
        res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i
        res
      end
      data[:id] = element.id[:index]
      data
    end

    # True-ish when status is present and starts with "syn".
    def status_synonym?(status)
      status && !!status.match(/^syn/)
    end

    # Reads a column that may be absent from the archive description.
    # Returns nil instead of indexing the row with nil (which raises TypeError
    # for Array rows).
    def value_at(row, index)
      index.nil? ? nil : row[index]
    end

    # Returns the taxon stored under taxon_id, creating an empty one if needed.
    def find_or_create_taxon(taxon_id)
      @res[taxon_id] ||= DarwinCore::TaxonNormalized.new
    end

    # Records the name of a core row as a synonym of the taxon whose id sits
    # in column taxon_id of that row.
    def add_synonym_from_core(taxon_id, row)
      taxon = find_or_create_taxon(row[taxon_id])
      name = row[@core[:scientificname]]
      taxon.synonyms << SynonymNormalized.new(
        name,
        canonical_name(name),
        value_at(row, @core[:taxonomicstatus]))
    end

    # Walks the core rows. A row becomes a synonym when it points to another
    # row through acceptedNameUsageID or, if the core has no such field, when
    # its taxonomicStatus starts with "syn"; any other row becomes a taxon.
    def injest_core
      raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname])
      accepted_index = @core[:acceptednameusageid]
      @dwc.core.read[0].each do |r|
        accepted_id = value_at(r, accepted_index)
        if accepted_index && accepted_id && accepted_id != r[@core[:id]]
          # core has AcceptedNameUsageId
          add_synonym_from_core(accepted_index, r)
        elsif !accepted_index && status_synonym?(value_at(r, @core[:taxonomicstatus]))
          add_synonym_from_core(@core[:highertaxonid], r)
        else
          taxon = find_or_create_taxon(r[@core[:id]])
          taxon.id = r[@core[:id]]
          taxon.current_name = r[@core[:scientificname]]
          taxon.current_name_canonical = canonical_name(r[@core[:scientificname]])
          # The columns below are optional, hence value_at.
          taxon.parent_id = value_at(r, @core[:highertaxonid])
          taxon.rank = value_at(r, @core[:taxonrank])
          taxon.status = value_at(r, @core[:taxonomicstatus])
        end
      end
    end

    # Fills classification_path for every collected taxon.
    def calculate_classification_path
      @res.each do |taxon_id, taxon|
        get_classification_path(taxon)
      end
    end

    # Sets classification_path of taxon and of every ancestor that has none
    # yet. A path is the parent's path plus the taxon's canonical name; a taxon
    # without a parent id starts a new path.
    #
    # The walk is iterative so that a parent id missing from the archive or a
    # cycle of parent ids cannot crash it: in both cases the last taxon reached
    # is treated as the start of the path.
    def get_classification_path(taxon)
      return if taxon.classification_path
      unresolved = []
      visited = {}
      node = taxon
      while node && node.classification_path.nil? && !visited[node.object_id]
        visited[node.object_id] = true
        unresolved << node
        node = DarwinCore.nil_field?(node.parent_id) ? nil : @res[node.parent_id]
      end
      path = (node && node.classification_path) || []
      # Assign from the topmost unresolved ancestor down to the taxon itself.
      unresolved.reverse.each do |t|
        path = path + [t.current_name_canonical]
        t.classification_path = path
      end
    end

    # Dispatches every extension by the fields it declares.
    def injest_extensions
      @extensions.each do |e|
        fields = e[1]
        injest_synonyms(e) if fields.key?(:scientificname)
        injest_vernaculars(e) if fields.key?(:vernacularname)
      end
    end

    # Adds synonyms found in an extension to the taxa they refer to.
    # Rows whose id matches no collected taxon are skipped.
    def injest_synonyms(extension)
      ext, fields = *extension
      ext.read[0].each do |r|
        taxon = @res[r[fields[:id]]]
        next unless taxon
        name = r[fields[:scientificname]]
        taxon.synonyms << SynonymNormalized.new(
          name,
          canonical_name(name),
          value_at(r, fields[:taxonomicstatus]))
      end
    end

    # Adds vernacular names found in an extension to the taxa they refer to.
    # Rows whose id matches no collected taxon are skipped.
    def injest_vernaculars(extension)
      ext, fields = *extension
      ext.read[0].each do |r|
        taxon = @res[r[fields[:id]]]
        next unless taxon
        taxon.vernacular_names << VernacularNormalized.new(
          r[fields[:vernacularname]],
          value_at(r, fields[:languagecode]))
      end
    end

  end
end
|
data/lib/dwc-archive/ingester.rb
CHANGED
@@ -2,8 +2,7 @@ class DarwinCore
|
|
2
2
|
module Ingester
|
3
3
|
attr_reader :data, :properties, :encoding, :fields_separator
|
4
4
|
attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
|
5
|
-
|
6
|
-
def read(batch_size = nil)
|
5
|
+
def read(batch_size = 10000)
|
7
6
|
res = []
|
8
7
|
errors = []
|
9
8
|
index_fix = 1
|
@@ -13,14 +12,10 @@ class DarwinCore
|
|
13
12
|
CSV.open(@file_path, args).each_with_index do |r, i|
|
14
13
|
index_fix = 0; next if @ignore_headers && i == 0
|
15
14
|
min_size > r.size ? errors << r : process_csv_row(res, errors, r)
|
16
|
-
if
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
errors = []
|
21
|
-
else
|
22
|
-
return [res, errors]
|
23
|
-
end
|
15
|
+
if block_given? && (i + index_fix) % batch_size == 0
|
16
|
+
yield [res, errors]
|
17
|
+
res = []
|
18
|
+
errors = []
|
24
19
|
end
|
25
20
|
end
|
26
21
|
[res, errors]
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,10 +1,24 @@
|
|
1
|
-
require File.dirname(__FILE__) + "/../spec_helper"
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
|
2
2
|
|
3
3
|
describe DarwinCore do
|
4
4
|
before(:all) do
|
5
5
|
@file_dir = File.join(File.dirname(__FILE__), '..', 'files')
|
6
6
|
end
|
7
7
|
|
8
|
+
# DarwinCore.nil_field? -- which raw values count as "no value".
describe "::nil_field?" do
  it "should return true for entries which normally mean nil" do
    nil_like = [nil, '/N', '']
    nil_like.each { |value| DarwinCore.nil_field?(value).should be_true }
  end

  it "should return false for fields that are not nil" do
    real_values = [0, '0', '123', 123, 'dsdfs434343/N']
    real_values.each { |value| DarwinCore.nil_field?(value).should be_false }
  end
end
|
21
|
+
|
8
22
|
describe ".new" do
|
9
23
|
it "should create DarwinCore instance out of archive file" do
|
10
24
|
['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
|
@@ -35,4 +49,49 @@ describe DarwinCore do
|
|
35
49
|
dwc.archive.valid?.should be_true
|
36
50
|
end
|
37
51
|
end
|
52
|
+
|
53
|
+
# DarwinCore#normalize_classification against the fixture archives in
# spec/files. Each example names the fixture it loads so that failures can be
# told apart (three of them used to share one description).
describe "#normalize_classification" do
  it "should return nil if file has no parent id information" do
    file = File.join(@file_dir, 'flat_list.tar.gz')
    dwc = DarwinCore.new(file)
    dwc.normalize_classification.should be_nil
  end

  it "should traverse DarwinCore files and assemble data for every node in memory" do
    file = File.join(@file_dir, 'data.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.class.should == Hash
    norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
  end

  it "should be able to assemble vernacular names from an extension" do
    file = File.join(@file_dir, 'data.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.select { |k,v| !v.vernacular_names.empty? }.map { |k,v| v.vernacular_names }.size.should > 0
  end

  it "should be able to assemble synonyms from extension" do
    file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
  end

  it "should be able to assemble synonyms from core using accepted name field" do
    file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
  end

  # NOTE(review): where data.tar.gz keeps its synonyms (core or extension) is
  # not visible from this spec -- refine the description once confirmed.
  it "should be able to assemble synonyms from data.tar.gz archive" do
    file = File.join(@file_dir, 'data.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
  end
end
|
96
|
+
|
38
97
|
end
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 17
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
|
-
-
|
9
|
-
-
|
10
|
-
version: 0.
|
7
|
+
- 4
|
8
|
+
- 0
|
9
|
+
version: 0.4.0
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Dmitry Mozzherin
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2010-
|
17
|
+
date: 2010-09-09 00:00:00 -04:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
@@ -26,7 +25,6 @@ dependencies:
|
|
26
25
|
requirements:
|
27
26
|
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 13
|
30
28
|
segments:
|
31
29
|
- 1
|
32
30
|
- 2
|
@@ -42,7 +40,6 @@ dependencies:
|
|
42
40
|
requirements:
|
43
41
|
- - ">="
|
44
42
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 3
|
46
43
|
segments:
|
47
44
|
- 0
|
48
45
|
version: "0"
|
@@ -72,6 +69,7 @@ files:
|
|
72
69
|
- lib/dwc-archive.rb
|
73
70
|
- lib/dwc-archive/.expander.rb.swo
|
74
71
|
- lib/dwc-archive/archive.rb
|
72
|
+
- lib/dwc-archive/classification_normalizer.rb
|
75
73
|
- lib/dwc-archive/core.rb
|
76
74
|
- lib/dwc-archive/errors.rb
|
77
75
|
- lib/dwc-archive/expander.rb
|
@@ -88,10 +86,13 @@ files:
|
|
88
86
|
- spec/files/data.zip
|
89
87
|
- spec/files/eml.xml
|
90
88
|
- spec/files/file with characters(3).gz
|
89
|
+
- spec/files/flat_list.tar.gz
|
91
90
|
- spec/files/invalid.tar.gz
|
92
91
|
- spec/files/junk_dir_inside.zip
|
93
92
|
- spec/files/meta.xml
|
94
93
|
- spec/files/minimal.tar.gz
|
94
|
+
- spec/files/synonyms_in_core_accepted_name_field.tar.gz
|
95
|
+
- spec/files/synonyms_in_extension.tar.gz
|
95
96
|
- spec/files/uncompressed
|
96
97
|
- spec/lib/dwc-archive_spec.rb
|
97
98
|
- spec/lib/ruby_extenstions_spec.rb
|
@@ -111,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
111
112
|
requirements:
|
112
113
|
- - ">="
|
113
114
|
- !ruby/object:Gem::Version
|
114
|
-
hash: 3
|
115
115
|
segments:
|
116
116
|
- 0
|
117
117
|
version: "0"
|
@@ -120,7 +120,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
120
120
|
requirements:
|
121
121
|
- - ">="
|
122
122
|
- !ruby/object:Gem::Version
|
123
|
-
hash: 3
|
124
123
|
segments:
|
125
124
|
- 0
|
126
125
|
version: "0"
|