dwc-archive 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -44,6 +44,15 @@ Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
44
44
  end
45
45
  results << [tail_data, tail_errors]
46
46
 
47
+ # Normalize names in a classification: collect synonyms, canonical names, and
48
+ # vernacular names, and associate a classification path with each taxon in a
49
+ # classification distributed as a DwCA file.
50
+ # NOTE: this functionality requires the biodiversity gem for Ruby 1.8.x and
51
+ # the biodiversity19 gem for Ruby 1.9.x.
52
+
53
+ result = dwc.normalize_classification
54
+
55
+
47
56
  DarwinCore.clean_all # remove all expanded archives
48
57
 
49
58
  == Note on Patches/Pull Requests
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.1
1
+ 0.4.0
@@ -54,3 +54,9 @@ Feature: Reading of a Darwing Core Archive
54
54
  When I create a new DarwinCore instance
55
55
  Then I can read its core content using block
56
56
  Then I can read extensions content using block
57
+
58
+ Scenario: Normalizing classification
59
+ Given path to a dwc file "data.tar.gz"
60
+ When I create a new DarwinCore instance
61
+ Then I am able to use DarwinCore#normalize_classification method
62
+ And get normalized classification in expected format
@@ -129,9 +129,6 @@ Then /^I can read its content into memory$/ do
129
129
  core_data.class.should == Array
130
130
  core_data.size.should == 584
131
131
  core_errors.size.should == 3
132
- core_data, core_errors = @dwc.core.read(5)
133
- core_data.size.should == 5
134
- core_errors.size.should == 0
135
132
  end
136
133
 
137
134
  Then /^I can read extensions content into memory$/ do
@@ -164,3 +161,12 @@ Then /^I can read extensions content using block$/ do
164
161
  res.should == [[1,0]]
165
162
  end
166
163
 
164
# Runs the normalizer on the archive opened by an earlier step and keeps the
# result in an instance variable for the steps that follow.
Then /^I am able to use DarwinCore\#normalize_classification method$/ do
  @normalized_classification = @dwc.normalize_classification
end

# Checks the shape of the stored result: a Hash whose first value is a
# DarwinCore::TaxonNormalized.
Then /^get normalized classification in expected format$/ do
  @normalized_classification.class.should == Hash
  first_key = @normalized_classification.keys.first
  first_taxon = @normalized_classification[first_key]
  first_taxon.class.should == DarwinCore::TaxonNormalized
end
data/lib/dwc-archive.rb CHANGED
@@ -22,6 +22,7 @@ require 'dwc-archive/metadata'
22
22
  require 'dwc-archive/generator'
23
23
  require 'dwc-archive/generator_meta_xml'
24
24
  require 'dwc-archive/generator_eml_xml'
25
+ require 'dwc-archive/classification_normalizer'
25
26
 
26
27
  class DarwinCore
27
28
  attr_reader :archive, :core, :metadata, :extensions
@@ -29,6 +30,11 @@ class DarwinCore
29
30
 
30
31
  DEFAULT_TMP_DIR = "/tmp"
31
32
 
33
# Reports whether a raw field value stands for "no value".
#
# field - the value to inspect.
#
# Returns true when field is nil, an empty string, or the string '/N';
# false for anything else.
def self.nil_field?(field)
  [nil, '', '/N'].include?(field)
end
37
+
32
38
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
33
39
  @archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
34
40
  @core = DarwinCore::Core.new(@archive)
@@ -36,6 +42,11 @@ class DarwinCore
36
42
  @extensions = get_extensions
37
43
  end
38
44
 
45
# Builds the normalized classification for this archive.
#
# The last path segment of every core field term is lower-cased and compared
# with 'highertaxonid'; without that field no normalization is attempted.
#
# Returns nil when the core has no 'highertaxonid' field, otherwise whatever
# DarwinCore::ClassificationNormalizer#normalize returns.
def normalize_classification
  term_names = core.fields.map { |field| field[:term].split('/').last.downcase }
  return nil unless term_names.include?('highertaxonid')
  DarwinCore::ClassificationNormalizer.new(self).normalize
end
49
+
39
50
  def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
40
51
  Dir.entries(tmp_dir).each do |entry|
41
52
  path = File.join(tmp_dir, entry)
@@ -0,0 +1,139 @@
1
+ # encoding: utf-8
2
+ require 'biodiversity'
3
+
4
class DarwinCore

  # Everything collected for one accepted taxon: its id, its parent's id, its
  # current name (verbatim and canonical), rank, status, the path of canonical
  # names leading to it, and the synonyms and vernacular names attached to it.
  class TaxonNormalized
    attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status

    def initialize
      @id = @parent_id = @classification_path = @current_name = @current_name_canonical = @rank = @status = nil
      @synonyms = []
      @vernacular_names = []
    end

  end

  # A synonym of a taxon: verbatim name, canonical name and taxonomic status.
  class SynonymNormalized < Struct.new(:name, :canonical_name, :status); end

  # A vernacular name of a taxon together with its language code.
  class VernacularNormalized < Struct.new(:name, :language); end

  # Reads the core and the extensions of a DarwinCore instance and assembles a
  # Hash that maps taxon ids to TaxonNormalized objects.
  class ClassificationNormalizer
    # dwc_instance - object exposing #core and #extensions; each of them must
    #                respond to #fields, #id and #read.
    def initialize(dwc_instance)
      @dwc = dwc_instance
      @core = get_fields(@dwc.core)
      @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
      @res = {}
      @parser = ScientificNameParser.new
    end

    # Ingests the core, computes classification paths, then ingests the
    # extensions.
    #
    # Returns the Hash of taxon id => TaxonNormalized.
    # Raises RuntimeError when the core lacks a taxon id or scientific name.
    def normalize
      injest_core
      calculate_classification_path
      injest_extensions
      @res
    end

    private

    # Returns the canonical form of a scientific name, or the name itself when
    # the parser reports that it could not be parsed.
    #
    # NOTE: when R19 is set the argument is re-tagged as UTF-8 in place, so the
    # caller's string object is affected as well.
    def canonical_name(a_scientific_name)
      a_scientific_name.force_encoding('utf-8') if R19
      begin
        parsed_name = @parser.parse(a_scientific_name)[:scientificName]
      rescue
        # Retry once with a fresh parser instance; a second failure propagates.
        @parser = ScientificNameParser.new
        parsed_name = @parser.parse(a_scientific_name)[:scientificName]
      end
      parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name
    end

    # Maps the lower-cased last path segment of every field term (as a Symbol)
    # to the field's column index, and stores the element's id index under :id.
    def get_fields(element)
      data = element.fields.inject({}) { |res, f| res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i; res }
      data[:id] = element.id[:index]
      data
    end

    # Reads a column that the archive may not declare. Indexing a row with a
    # nil index raises TypeError, so an undeclared column yields nil instead.
    def value_at(row, index)
      index.nil? ? nil : row[index]
    end

    # Fetches the taxon stored under the given id, creating an empty one first
    # when the id has not been seen yet.
    def taxon_for(id)
      @res[id] ||= DarwinCore::TaxonNormalized.new
    end

    # True when the status string starts with "syn"; false for nil.
    def status_synonym?(status)
      !!(status && status.match(/^syn/))
    end

    # Attaches the row's name as a synonym of the taxon whose id is found in
    # the row at column index +taxon_id+.
    def add_synonym_from_core(taxon_id, row)
      name = row[@core[:scientificname]]
      taxon_for(row[taxon_id]).synonyms << SynonymNormalized.new(
        name,
        canonical_name(name),
        value_at(row, @core[:taxonomicstatus]))
    end

    # Walks the core rows. A row becomes a synonym when it points at a
    # different accepted name id, or, in cores without that column, when its
    # status marks it as a synonym; every other row becomes a taxon.
    def injest_core
      raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname])
      accepted_index = @core[:acceptednameusageid]
      @dwc.core.read[0].each do |r|
        # An empty or '/N' accepted id means the row is itself the accepted
        # name, not a synonym of a taxon with a blank id.
        if accepted_index && !DarwinCore.nil_field?(r[accepted_index]) && r[accepted_index] != r[@core[:id]]
          add_synonym_from_core(accepted_index, r)
        elsif !accepted_index && status_synonym?(value_at(r, @core[:taxonomicstatus]))
          add_synonym_from_core(@core[:highertaxonid], r)
        else
          taxon = taxon_for(r[@core[:id]])
          taxon.id = r[@core[:id]]
          taxon.current_name = r[@core[:scientificname]]
          taxon.current_name_canonical = canonical_name(r[@core[:scientificname]])
          taxon.parent_id = value_at(r, @core[:highertaxonid])
          taxon.rank = value_at(r, @core[:taxonrank])
          taxon.status = value_at(r, @core[:taxonomicstatus])
        end
      end
    end

    # Fills in classification_path for every taxon that does not have one yet.
    def calculate_classification_path
      @res.each do |taxon_id, taxon|
        next if taxon.classification_path
        get_classification_path(taxon)
      end
    end

    # Computes and stores the list of canonical names from the root down to
    # +taxon+, resolving ancestors first.
    #
    # A taxon is treated as a root when its parent id is blank, when the parent
    # is not part of the result, or when following the parent would revisit a
    # taxon already on the current chain (a cycle in the data).
    #
    # seen - object ids of the taxa on the chain being resolved; internal.
    #
    # Returns the taxon's classification path.
    def get_classification_path(taxon, seen = {})
      return taxon.classification_path if taxon.classification_path
      seen[taxon.object_id] = true
      parent = DarwinCore.nil_field?(taxon.parent_id) ? nil : @res[taxon.parent_id]
      parent = nil if parent && seen[parent.object_id]
      parent_path = parent ? get_classification_path(parent, seen) : []
      taxon.classification_path = parent_path + [taxon.current_name_canonical]
    end

    # Dispatches every extension by the columns it declares: a scientific name
    # column means synonyms, a vernacular name column means vernacular names.
    def injest_extensions
      @extensions.each do |e|
        fields = e[1]
        injest_synonyms(e) if fields.has_key?(:scientificname)
        injest_vernaculars(e) if fields.has_key?(:vernacularname)
      end
    end

    # Adds the extension's rows as synonyms of the taxa they reference. Rows
    # that reference a taxon id absent from the result are skipped.
    def injest_synonyms(extension)
      ext, fields = *extension
      ext.read[0].each do |r|
        taxon = @res[r[fields[:id]]]
        next unless taxon
        taxon.synonyms << SynonymNormalized.new(
          r[fields[:scientificname]],
          canonical_name(r[fields[:scientificname]]),
          value_at(r, fields[:taxonomicstatus]))
      end
    end

    # Adds the extension's rows as vernacular names of the taxa they
    # reference. Rows that reference a taxon id absent from the result are
    # skipped.
    def injest_vernaculars(extension)
      ext, fields = *extension
      ext.read[0].each do |r|
        taxon = @res[r[fields[:id]]]
        next unless taxon
        taxon.vernacular_names << VernacularNormalized.new(
          r[fields[:vernacularname]],
          value_at(r, fields[:languagecode]))
      end
    end

  end
end
@@ -2,6 +2,7 @@ class DarwinCore
2
2
  class Extension
3
3
  include DarwinCore::Ingester
4
4
  attr_reader :coreid
5
+ alias :id :coreid
5
6
 
6
7
  def initialize(archive, data)
7
8
  @archive = archive
@@ -2,8 +2,7 @@ class DarwinCore
2
2
  module Ingester
3
3
  attr_reader :data, :properties, :encoding, :fields_separator
4
4
  attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
5
-
6
- def read(batch_size = nil)
5
+ def read(batch_size = 10000)
7
6
  res = []
8
7
  errors = []
9
8
  index_fix = 1
@@ -13,14 +12,10 @@ class DarwinCore
13
12
  CSV.open(@file_path, args).each_with_index do |r, i|
14
13
  index_fix = 0; next if @ignore_headers && i == 0
15
14
  min_size > r.size ? errors << r : process_csv_row(res, errors, r)
16
- if batch_size.to_i > 0 && (i + index_fix) % batch_size == 0
17
- if block_given?
18
- yield [res, errors]
19
- res = []
20
- errors = []
21
- else
22
- return [res, errors]
23
- end
15
+ if block_given? && (i + index_fix) % batch_size == 0
16
+ yield [res, errors]
17
+ res = []
18
+ errors = []
24
19
  end
25
20
  end
26
21
  [res, errors]
Binary file
@@ -1,10 +1,24 @@
1
- require File.dirname(__FILE__) + "/../spec_helper"
1
+ require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
2
2
 
3
3
  describe DarwinCore do
4
4
  before(:all) do
5
5
  @file_dir = File.join(File.dirname(__FILE__), '..', 'files')
6
6
  end
7
7
 
8
# DarwinCore.nil_field? must recognise the placeholder values for a missing
# field and nothing else.
describe "::nil_field?" do
  it "should return true for entries which normally mean nil" do
    blanks = [nil, '/N', '']
    blanks.each { |blank| DarwinCore.nil_field?(blank).should be_true }
  end

  it "should return false for fields that are not nil" do
    present_values = [0, '0', '123', 123, 'dsdfs434343/N']
    present_values.each { |value| DarwinCore.nil_field?(value).should be_false }
  end
end
21
+
8
22
  describe ".new" do
9
23
  it "should create DarwinCore instance out of archive file" do
10
24
  ['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
@@ -35,4 +49,49 @@ describe DarwinCore do
35
49
  dwc.archive.valid?.should be_true
36
50
  end
37
51
  end
52
+
53
# Specs for the instance method DarwinCore#normalize_classification. Each
# example opens one fixture archive from @file_dir and inspects the result.
describe "#normalize_classification" do
  it "should return nil if file has no parent id information" do
    file = File.join(@file_dir, 'flat_list.tar.gz')
    dwc = DarwinCore.new(file)
    dwc.normalize_classification.should be_nil
  end

  it "should traverse DarwinCore files and assemble data for every node in memory" do
    file = File.join(@file_dir, 'data.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.class.should == Hash
    norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
  end

  it "should be able to assemble vernacular names from an extension" do
    file = File.join(@file_dir, 'data.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.select { |k, v| !v.vernacular_names.empty? }.size.should > 0
  end

  it "should be able to assemble synonyms from extension" do
    file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.select { |k, v| !v.synonyms.empty? }.size.should > 0
  end

  it "should be able to assemble synonyms from the accepted name field of the core" do
    file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.select { |k, v| !v.synonyms.empty? }.size.should > 0
  end

  # NOTE(review): where data.tar.gz keeps its synonyms is not visible here;
  # the description names the fixture instead of guessing the mechanism.
  it "should be able to assemble synonyms from the data.tar.gz archive" do
    file = File.join(@file_dir, 'data.tar.gz')
    dwc = DarwinCore.new(file)
    norm = dwc.normalize_classification
    norm.select { |k, v| !v.synonyms.empty? }.size.should > 0
  end
end
96
+
38
97
  end
@@ -1,4 +1,4 @@
1
- require File.dirname(__FILE__) + "/../spec_helper"
1
+ require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
2
2
 
3
3
  describe "Hash" do
4
4
  it "should parse xml to hash" do
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc-archive
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
- - 3
9
- - 1
10
- version: 0.3.1
7
+ - 4
8
+ - 0
9
+ version: 0.4.0
11
10
  platform: ruby
12
11
  authors:
13
12
  - Dmitry Mozzherin
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2010-08-11 00:00:00 -04:00
17
+ date: 2010-09-09 00:00:00 -04:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
@@ -26,7 +25,6 @@ dependencies:
26
25
  requirements:
27
26
  - - ">="
28
27
  - !ruby/object:Gem::Version
29
- hash: 13
30
28
  segments:
31
29
  - 1
32
30
  - 2
@@ -42,7 +40,6 @@ dependencies:
42
40
  requirements:
43
41
  - - ">="
44
42
  - !ruby/object:Gem::Version
45
- hash: 3
46
43
  segments:
47
44
  - 0
48
45
  version: "0"
@@ -72,6 +69,7 @@ files:
72
69
  - lib/dwc-archive.rb
73
70
  - lib/dwc-archive/.expander.rb.swo
74
71
  - lib/dwc-archive/archive.rb
72
+ - lib/dwc-archive/classification_normalizer.rb
75
73
  - lib/dwc-archive/core.rb
76
74
  - lib/dwc-archive/errors.rb
77
75
  - lib/dwc-archive/expander.rb
@@ -88,10 +86,13 @@ files:
88
86
  - spec/files/data.zip
89
87
  - spec/files/eml.xml
90
88
  - spec/files/file with characters(3).gz
89
+ - spec/files/flat_list.tar.gz
91
90
  - spec/files/invalid.tar.gz
92
91
  - spec/files/junk_dir_inside.zip
93
92
  - spec/files/meta.xml
94
93
  - spec/files/minimal.tar.gz
94
+ - spec/files/synonyms_in_core_accepted_name_field.tar.gz
95
+ - spec/files/synonyms_in_extension.tar.gz
95
96
  - spec/files/uncompressed
96
97
  - spec/lib/dwc-archive_spec.rb
97
98
  - spec/lib/ruby_extenstions_spec.rb
@@ -111,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
111
112
  requirements:
112
113
  - - ">="
113
114
  - !ruby/object:Gem::Version
114
- hash: 3
115
115
  segments:
116
116
  - 0
117
117
  version: "0"
@@ -120,7 +120,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
120
120
  requirements:
121
121
  - - ">="
122
122
  - !ruby/object:Gem::Version
123
- hash: 3
124
123
  segments:
125
124
  - 0
126
125
  version: "0"