dwc-archive 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +9 -0
- data/VERSION +1 -1
- data/features/dwca-reader.feature +6 -0
- data/features/step_definitions/dwc-reader_steps.rb +9 -3
- data/lib/dwc-archive.rb +11 -0
- data/lib/dwc-archive/classification_normalizer.rb +139 -0
- data/lib/dwc-archive/extension.rb +1 -0
- data/lib/dwc-archive/ingester.rb +5 -10
- data/spec/files/flat_list.tar.gz +0 -0
- data/spec/files/synonyms_in_core_accepted_name_field.tar.gz +0 -0
- data/spec/files/synonyms_in_extension.tar.gz +0 -0
- data/spec/lib/dwc-archive_spec.rb +60 -1
- data/spec/lib/ruby_extenstions_spec.rb +1 -1
- metadata +8 -9
data/README.rdoc
CHANGED
@@ -44,6 +44,15 @@ Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
|
|
44
44
|
end
|
45
45
|
results << [tail_data, tail_errors]
|
46
46
|
|
47
|
+
# normalize names in classification collecting together synonyms, canonical names,
|
48
|
+
# vernacular names and associating paths to taxons in a classification
|
49
|
+
# distributed as DwCA file
|
50
|
+
# NOTE: this functionality requires biodiversity gem for ruby 1.8.x and
|
51
|
+
# biodiversity19 gem for ruby 1.9.x
|
52
|
+
|
53
|
+
result = dwc.normalize_classification
|
54
|
+
|
55
|
+
|
47
56
|
DarwinCore.clean_all # remove all expanded archives
|
48
57
|
|
49
58
|
== Note on Patches/Pull Requests
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.3.1
|
1
|
+
0.4.0
|
@@ -54,3 +54,9 @@ Feature: Reading of a Darwing Core Archive
|
|
54
54
|
When I create a new DarwinCore instance
|
55
55
|
Then I can read its core content using block
|
56
56
|
Then I can read extensions content using block
|
57
|
+
|
58
|
+
Scenario: Normalizing classification
|
59
|
+
Given path to a dwc file "data.tar.gz"
|
60
|
+
When I create a new DarwinCore instance
|
61
|
+
Then I am able to use DarwinCore#normalize_classification method
|
62
|
+
And get normalized classification in expected format
|
@@ -129,9 +129,6 @@ Then /^I can read its content into memory$/ do
|
|
129
129
|
core_data.class.should == Array
|
130
130
|
core_data.size.should == 584
|
131
131
|
core_errors.size.should == 3
|
132
|
-
core_data, core_errors = @dwc.core.read(5)
|
133
|
-
core_data.size.should == 5
|
134
|
-
core_errors.size.should == 0
|
135
132
|
end
|
136
133
|
|
137
134
|
Then /^I can read extensions content into memory$/ do
|
@@ -164,3 +161,12 @@ Then /^I can read extensions content using block$/ do
|
|
164
161
|
res.should == [[1,0]]
|
165
162
|
end
|
166
163
|
|
164
|
+
Then /^I am able to use DarwinCore\#normalize_classification method$/ do
|
165
|
+
@normalized_classification = @dwc.normalize_classification
|
166
|
+
end
|
167
|
+
|
168
|
+
Then /^get normalized classification in expected format$/ do
|
169
|
+
@normalized_classification.class.should == Hash
|
170
|
+
key = @normalized_classification.keys[0]
|
171
|
+
@normalized_classification[key].class.should == DarwinCore::TaxonNormalized
|
172
|
+
end
|
data/lib/dwc-archive.rb
CHANGED
@@ -22,6 +22,7 @@ require 'dwc-archive/metadata'
|
|
22
22
|
require 'dwc-archive/generator'
|
23
23
|
require 'dwc-archive/generator_meta_xml'
|
24
24
|
require 'dwc-archive/generator_eml_xml'
|
25
|
+
require 'dwc-archive/classification_normalizer'
|
25
26
|
|
26
27
|
class DarwinCore
|
27
28
|
attr_reader :archive, :core, :metadata, :extensions
|
@@ -29,6 +30,11 @@ class DarwinCore
|
|
29
30
|
|
30
31
|
DEFAULT_TMP_DIR = "/tmp"
|
31
32
|
|
33
|
+
def self.nil_field?(field)
|
34
|
+
return true if [nil, '', '/N'].include?(field)
|
35
|
+
false
|
36
|
+
end
|
37
|
+
|
32
38
|
def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
|
33
39
|
@archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
|
34
40
|
@core = DarwinCore::Core.new(@archive)
|
@@ -36,6 +42,11 @@ class DarwinCore
|
|
36
42
|
@extensions = get_extensions
|
37
43
|
end
|
38
44
|
|
45
|
+
def normalize_classification
|
46
|
+
return nil unless core.fields.map { |f| f[:term].split('/')[-1].downcase }.include? 'highertaxonid'
|
47
|
+
DarwinCore::ClassificationNormalizer.new(self).normalize
|
48
|
+
end
|
49
|
+
|
39
50
|
def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
|
40
51
|
Dir.entries(tmp_dir).each do |entry|
|
41
52
|
path = File.join(tmp_dir, entry)
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'biodiversity'
|
3
|
+
|
4
|
+
class DarwinCore
|
5
|
+
|
6
|
+
class TaxonNormalized
|
7
|
+
attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@id = @parent_id = @classification_path = @current_name = @current_name_canonical = @rank = @status = nil
|
11
|
+
@synonyms = []
|
12
|
+
@vernacular_names = []
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
16
|
+
|
17
|
+
class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end
|
18
|
+
class VernacularNormalized < Struct.new(:name, :language);end
|
19
|
+
|
20
|
+
class ClassificationNormalizer
|
21
|
+
def initialize(dwc_instance)
|
22
|
+
@dwc = dwc_instance
|
23
|
+
@core = get_fields(@dwc.core)
|
24
|
+
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
|
25
|
+
@res = {}
|
26
|
+
@parser = ScientificNameParser.new
|
27
|
+
end
|
28
|
+
|
29
|
+
def normalize
|
30
|
+
injest_core
|
31
|
+
calculate_classification_path
|
32
|
+
injest_extensions
|
33
|
+
@res
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
def canonical_name(a_scientific_name)
|
39
|
+
if R19
|
40
|
+
a_scientific_name.force_encoding('utf-8')
|
41
|
+
end
|
42
|
+
begin
|
43
|
+
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
|
44
|
+
rescue
|
45
|
+
@parser = ScientificNameParser.new
|
46
|
+
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
|
47
|
+
end
|
48
|
+
parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name
|
49
|
+
end
|
50
|
+
|
51
|
+
def get_fields(element)
|
52
|
+
data = element.fields.inject({}) { |res, f| res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i; res }
|
53
|
+
data[:id] = element.id[:index]
|
54
|
+
data
|
55
|
+
end
|
56
|
+
|
57
|
+
def status_synonym?(status)
|
58
|
+
status && !!status.match(/^syn/)
|
59
|
+
end
|
60
|
+
|
61
|
+
def add_synonym_from_core(taxon_id, row)
|
62
|
+
taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new
|
63
|
+
taxon.synonyms << SynonymNormalized.new(
|
64
|
+
row[@core[:scientificname]],
|
65
|
+
canonical_name(row[@core[:scientificname]]),
|
66
|
+
row[@core[:taxonomicstatus]])
|
67
|
+
end
|
68
|
+
|
69
|
+
def injest_core
|
70
|
+
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname])
|
71
|
+
@dwc.core.read[0].each do |r|
|
72
|
+
#core has AcceptedNameUsageId
|
73
|
+
if @core[:acceptednameusageid] && r[@core[:acceptednameusageid]] && r[@core[:acceptednameusageid]] != r[@core[:id]]
|
74
|
+
add_synonym_from_core(@core[:acceptednameusageid], r)
|
75
|
+
elsif !@core[:acceptednameusageid] && status_synonym?(r[@core[:taxonomicstatus]])
|
76
|
+
add_synonym_from_core(@core[:highertaxonid], r)
|
77
|
+
else
|
78
|
+
taxon = @res[r[@core[:id]]] ? @res[r[@core[:id]]] : @res[r[@core[:id]]] = DarwinCore::TaxonNormalized.new
|
79
|
+
taxon.id = r[@core[:id]]
|
80
|
+
taxon.current_name = r[@core[:scientificname]]
|
81
|
+
taxon.current_name_canonical = canonical_name(r[@core[:scientificname]])
|
82
|
+
taxon.parent_id = r[@core[:highertaxonid]]
|
83
|
+
taxon.rank = r[@core[:taxonrank]]
|
84
|
+
taxon.status = r[@core[:taxonomicstatus]]
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
def calculate_classification_path
|
90
|
+
@res.each do |taxon_id, taxon|
|
91
|
+
next if taxon.classification_path
|
92
|
+
get_classification_path(taxon)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def get_classification_path(taxon)
|
97
|
+
return if taxon.classification_path
|
98
|
+
if DarwinCore.nil_field?(taxon.parent_id)
|
99
|
+
taxon.classification_path = [taxon.current_name_canonical]
|
100
|
+
else
|
101
|
+
parent_cp = @res[taxon.parent_id].classification_path
|
102
|
+
if parent_cp
|
103
|
+
taxon.classification_path = parent_cp + [taxon.current_name_canonical]
|
104
|
+
else
|
105
|
+
get_classification_path(@res[taxon.parent_id])
|
106
|
+
taxon.classification_path = @res[taxon.parent_id].classification_path + [taxon.current_name_canonical]
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def injest_extensions
|
112
|
+
@extensions.each do |e|
|
113
|
+
ext, fields = *e
|
114
|
+
injest_synonyms(e) if fields.keys.include? :scientificname
|
115
|
+
injest_vernaculars(e) if fields.keys.include? :vernacularname
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
def injest_synonyms(extension)
|
120
|
+
ext, fields = *extension
|
121
|
+
ext.read[0].each do |r|
|
122
|
+
@res[r[fields[:id]]].synonyms << SynonymNormalized.new(
|
123
|
+
r[fields[:scientificname]],
|
124
|
+
canonical_name(r[fields[:scientificname]]),
|
125
|
+
r[fields[:taxonomicstatus]])
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
def injest_vernaculars(extension)
|
130
|
+
ext, fields = *extension
|
131
|
+
ext.read[0].each do |r|
|
132
|
+
@res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
|
133
|
+
r[fields[:vernacularname]],
|
134
|
+
r[fields[:languagecode]])
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
end
|
139
|
+
end
|
data/lib/dwc-archive/ingester.rb
CHANGED
@@ -2,8 +2,7 @@ class DarwinCore
|
|
2
2
|
module Ingester
|
3
3
|
attr_reader :data, :properties, :encoding, :fields_separator
|
4
4
|
attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
|
5
|
-
|
6
|
-
def read(batch_size = nil)
|
5
|
+
def read(batch_size = 10000)
|
7
6
|
res = []
|
8
7
|
errors = []
|
9
8
|
index_fix = 1
|
@@ -13,14 +12,10 @@ class DarwinCore
|
|
13
12
|
CSV.open(@file_path, args).each_with_index do |r, i|
|
14
13
|
index_fix = 0; next if @ignore_headers && i == 0
|
15
14
|
min_size > r.size ? errors << r : process_csv_row(res, errors, r)
|
16
|
-
if
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
errors = []
|
21
|
-
else
|
22
|
-
return [res, errors]
|
23
|
-
end
|
15
|
+
if block_given? && (i + index_fix) % batch_size == 0
|
16
|
+
yield [res, errors]
|
17
|
+
res = []
|
18
|
+
errors = []
|
24
19
|
end
|
25
20
|
end
|
26
21
|
[res, errors]
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,10 +1,24 @@
|
|
1
|
-
require File.dirname(__FILE__) + "/../spec_helper"
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
|
2
2
|
|
3
3
|
describe DarwinCore do
|
4
4
|
before(:all) do
|
5
5
|
@file_dir = File.join(File.dirname(__FILE__), '..', 'files')
|
6
6
|
end
|
7
7
|
|
8
|
+
describe "::nil_field?" do
|
9
|
+
it "should return true for entries which normally mean nil" do
|
10
|
+
[nil, '/N', ''].each do |i|
|
11
|
+
DarwinCore.nil_field?(i).should be_true
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should return false for fields that are not nil" do
|
16
|
+
[0, '0', '123', 123, 'dsdfs434343/N'].each do |i|
|
17
|
+
DarwinCore.nil_field?(i).should be_false
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
8
22
|
describe ".new" do
|
9
23
|
it "should create DarwinCore instance out of archive file" do
|
10
24
|
['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
|
@@ -35,4 +49,49 @@ describe DarwinCore do
|
|
35
49
|
dwc.archive.valid?.should be_true
|
36
50
|
end
|
37
51
|
end
|
52
|
+
|
53
|
+
describe ".normalize_classification" do
|
54
|
+
it "should return nil if file has no parent id information" do
|
55
|
+
file = File.join(@file_dir, 'flat_list.tar.gz')
|
56
|
+
dwc = DarwinCore.new(file)
|
57
|
+
dwc.normalize_classification.should be_nil
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should traverse DarwinCore files and assemble data for every node in memory" do
|
61
|
+
file = File.join(@file_dir, 'data.tar.gz')
|
62
|
+
dwc = DarwinCore.new(file)
|
63
|
+
norm = dwc.normalize_classification
|
64
|
+
norm.class.should == Hash
|
65
|
+
norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
|
66
|
+
end
|
67
|
+
|
68
|
+
it "should be able to assemble vernacular names from an extension" do
|
69
|
+
file = File.join(@file_dir, 'data.tar.gz')
|
70
|
+
dwc = DarwinCore.new(file)
|
71
|
+
norm = dwc.normalize_classification
|
72
|
+
norm.select { |k,v| !v.vernacular_names.empty? }.map { |k,v| v.vernacular_names }.size.should > 0
|
73
|
+
end
|
74
|
+
|
75
|
+
it "should be able to assemble synonyms from extension" do
|
76
|
+
file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
|
77
|
+
dwc = DarwinCore.new(file)
|
78
|
+
norm = dwc.normalize_classification
|
79
|
+
norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should be able to assemble synonyms from extension" do
|
83
|
+
file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
|
84
|
+
dwc = DarwinCore.new(file)
|
85
|
+
norm = dwc.normalize_classification
|
86
|
+
norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
|
87
|
+
end
|
88
|
+
|
89
|
+
it "should be able to assemble synonyms from extension" do
|
90
|
+
file = File.join(@file_dir, 'data.tar.gz')
|
91
|
+
dwc = DarwinCore.new(file)
|
92
|
+
norm = dwc.normalize_classification
|
93
|
+
norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
38
97
|
end
|
metadata
CHANGED
@@ -1,13 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash: 17
|
5
4
|
prerelease: false
|
6
5
|
segments:
|
7
6
|
- 0
|
8
|
-
- 3
|
9
|
-
- 1
|
10
|
-
version: 0.3.1
|
7
|
+
- 4
|
8
|
+
- 0
|
9
|
+
version: 0.4.0
|
11
10
|
platform: ruby
|
12
11
|
authors:
|
13
12
|
- Dmitry Mozzherin
|
@@ -15,7 +14,7 @@ autorequire:
|
|
15
14
|
bindir: bin
|
16
15
|
cert_chain: []
|
17
16
|
|
18
|
-
date: 2010-
|
17
|
+
date: 2010-09-09 00:00:00 -04:00
|
19
18
|
default_executable:
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
@@ -26,7 +25,6 @@ dependencies:
|
|
26
25
|
requirements:
|
27
26
|
- - ">="
|
28
27
|
- !ruby/object:Gem::Version
|
29
|
-
hash: 13
|
30
28
|
segments:
|
31
29
|
- 1
|
32
30
|
- 2
|
@@ -42,7 +40,6 @@ dependencies:
|
|
42
40
|
requirements:
|
43
41
|
- - ">="
|
44
42
|
- !ruby/object:Gem::Version
|
45
|
-
hash: 3
|
46
43
|
segments:
|
47
44
|
- 0
|
48
45
|
version: "0"
|
@@ -72,6 +69,7 @@ files:
|
|
72
69
|
- lib/dwc-archive.rb
|
73
70
|
- lib/dwc-archive/.expander.rb.swo
|
74
71
|
- lib/dwc-archive/archive.rb
|
72
|
+
- lib/dwc-archive/classification_normalizer.rb
|
75
73
|
- lib/dwc-archive/core.rb
|
76
74
|
- lib/dwc-archive/errors.rb
|
77
75
|
- lib/dwc-archive/expander.rb
|
@@ -88,10 +86,13 @@ files:
|
|
88
86
|
- spec/files/data.zip
|
89
87
|
- spec/files/eml.xml
|
90
88
|
- spec/files/file with characters(3).gz
|
89
|
+
- spec/files/flat_list.tar.gz
|
91
90
|
- spec/files/invalid.tar.gz
|
92
91
|
- spec/files/junk_dir_inside.zip
|
93
92
|
- spec/files/meta.xml
|
94
93
|
- spec/files/minimal.tar.gz
|
94
|
+
- spec/files/synonyms_in_core_accepted_name_field.tar.gz
|
95
|
+
- spec/files/synonyms_in_extension.tar.gz
|
95
96
|
- spec/files/uncompressed
|
96
97
|
- spec/lib/dwc-archive_spec.rb
|
97
98
|
- spec/lib/ruby_extenstions_spec.rb
|
@@ -111,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
111
112
|
requirements:
|
112
113
|
- - ">="
|
113
114
|
- !ruby/object:Gem::Version
|
114
|
-
hash: 3
|
115
115
|
segments:
|
116
116
|
- 0
|
117
117
|
version: "0"
|
@@ -120,7 +120,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
120
120
|
requirements:
|
121
121
|
- - ">="
|
122
122
|
- !ruby/object:Gem::Version
|
123
|
-
hash: 3
|
124
123
|
segments:
|
125
124
|
- 0
|
126
125
|
version: "0"
|