dwc-archive 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -44,6 +44,15 @@ Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
44
44
  end
45
45
  results << [tail_data, tail_errors]
46
46
 
47
+ # normalize names in classification collecting together synonyms, canonical names,
48
+ # vernacular names and associating paths to taxa in a classification
49
+ # distributed as a DwCA file
50
+ # NOTE: this functionality requires the biodiversity gem for ruby 1.8.x and
51
+ # biodiversity19 gem for ruby 1.9.x
52
+
53
+ result = dwc.normalize_classification
54
+
55
+
47
56
  DarwinCore.clean_all # remove all expanded archives
48
57
 
49
58
  == Note on Patches/Pull Requests
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.3.1
1
+ 0.4.0
@@ -54,3 +54,9 @@ Feature: Reading of a Darwin Core Archive
54
54
  When I create a new DarwinCore instance
55
55
  Then I can read its core content using block
56
56
  Then I can read extensions content using block
57
+
58
+ Scenario: Normalizing classification
59
+ Given path to a dwc file "data.tar.gz"
60
+ When I create a new DarwinCore instance
61
+ Then I am able to use DarwinCore#normalize_classification method
62
+ And get normalized classification in expected format
@@ -129,9 +129,6 @@ Then /^I can read its content into memory$/ do
129
129
  core_data.class.should == Array
130
130
  core_data.size.should == 584
131
131
  core_errors.size.should == 3
132
- core_data, core_errors = @dwc.core.read(5)
133
- core_data.size.should == 5
134
- core_errors.size.should == 0
135
132
  end
136
133
 
137
134
  Then /^I can read extensions content into memory$/ do
@@ -164,3 +161,12 @@ Then /^I can read extensions content using block$/ do
164
161
  res.should == [[1,0]]
165
162
  end
166
163
 
164
+ Then /^I am able to use DarwinCore\#normalize_classification method$/ do
165
+ @normalized_classification = @dwc.normalize_classification
166
+ end
167
+
168
+ Then /^get normalized classification in expected format$/ do
169
+ @normalized_classification.class.should == Hash
170
+ key = @normalized_classification.keys[0]
171
+ @normalized_classification[key].class.should == DarwinCore::TaxonNormalized
172
+ end
data/lib/dwc-archive.rb CHANGED
@@ -22,6 +22,7 @@ require 'dwc-archive/metadata'
22
22
  require 'dwc-archive/generator'
23
23
  require 'dwc-archive/generator_meta_xml'
24
24
  require 'dwc-archive/generator_eml_xml'
25
+ require 'dwc-archive/classification_normalizer'
25
26
 
26
27
  class DarwinCore
27
28
  attr_reader :archive, :core, :metadata, :extensions
@@ -29,6 +30,11 @@ class DarwinCore
29
30
 
30
31
  DEFAULT_TMP_DIR = "/tmp"
31
32
 
33
+ def self.nil_field?(field)
34
+ return true if [nil, '', '/N'].include?(field)
35
+ false
36
+ end
37
+
32
38
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
33
39
  @archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
34
40
  @core = DarwinCore::Core.new(@archive)
@@ -36,6 +42,11 @@ class DarwinCore
36
42
  @extensions = get_extensions
37
43
  end
38
44
 
45
+ def normalize_classification
46
+ return nil unless core.fields.map { |f| f[:term].split('/')[-1].downcase }.include? 'highertaxonid'
47
+ DarwinCore::ClassificationNormalizer.new(self).normalize
48
+ end
49
+
39
50
  def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
40
51
  Dir.entries(tmp_dir).each do |entry|
41
52
  path = File.join(tmp_dir, entry)
@@ -0,0 +1,139 @@
1
+ # encoding: utf-8
2
+ require 'biodiversity'
3
+
4
+ class DarwinCore
5
+
6
+ class TaxonNormalized
7
+ attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status
8
+
9
+ def initialize
10
+ @id = @parent_id = @classification_path = @current_name = @current_name_canonical = @rank = @status = nil
11
+ @synonyms = []
12
+ @vernacular_names = []
13
+ end
14
+
15
+ end
16
+
17
+ class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end
18
+ class VernacularNormalized < Struct.new(:name, :language);end
19
+
20
+ class ClassificationNormalizer
21
+ def initialize(dwc_instance)
22
+ @dwc = dwc_instance
23
+ @core = get_fields(@dwc.core)
24
+ @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
25
+ @res = {}
26
+ @parser = ScientificNameParser.new
27
+ end
28
+
29
+ def normalize
30
+ injest_core
31
+ calculate_classification_path
32
+ injest_extensions
33
+ @res
34
+ end
35
+
36
+ private
37
+
38
+ def canonical_name(a_scientific_name)
39
+ if R19
40
+ a_scientific_name.force_encoding('utf-8')
41
+ end
42
+ begin
43
+ parsed_name = @parser.parse(a_scientific_name)[:scientificName]
44
+ rescue
45
+ @parser = ScientificNameParser.new
46
+ parsed_name = @parser.parse(a_scientific_name)[:scientificName]
47
+ end
48
+ parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name
49
+ end
50
+
51
+ def get_fields(element)
52
+ data = element.fields.inject({}) { |res, f| res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i; res }
53
+ data[:id] = element.id[:index]
54
+ data
55
+ end
56
+
57
+ def status_synonym?(status)
58
+ status && !!status.match(/^syn/)
59
+ end
60
+
61
+ def add_synonym_from_core(taxon_id, row)
62
+ taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new
63
+ taxon.synonyms << SynonymNormalized.new(
64
+ row[@core[:scientificname]],
65
+ canonical_name(row[@core[:scientificname]]),
66
+ row[@core[:taxonomicstatus]])
67
+ end
68
+
69
+ def injest_core
70
+ raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname])
71
+ @dwc.core.read[0].each do |r|
72
+ #core has AcceptedNameUsageId
73
+ if @core[:acceptednameusageid] && r[@core[:acceptednameusageid]] && r[@core[:acceptednameusageid]] != r[@core[:id]]
74
+ add_synonym_from_core(@core[:acceptednameusageid], r)
75
+ elsif !@core[:acceptednameusageid] && status_synonym?(r[@core[:taxonomicstatus]])
76
+ add_synonym_from_core(@core[:highertaxonid], r)
77
+ else
78
+ taxon = @res[r[@core[:id]]] ? @res[r[@core[:id]]] : @res[r[@core[:id]]] = DarwinCore::TaxonNormalized.new
79
+ taxon.id = r[@core[:id]]
80
+ taxon.current_name = r[@core[:scientificname]]
81
+ taxon.current_name_canonical = canonical_name(r[@core[:scientificname]])
82
+ taxon.parent_id = r[@core[:highertaxonid]]
83
+ taxon.rank = r[@core[:taxonrank]]
84
+ taxon.status = r[@core[:taxonomicstatus]]
85
+ end
86
+ end
87
+ end
88
+
89
+ def calculate_classification_path
90
+ @res.each do |taxon_id, taxon|
91
+ next if taxon.classification_path
92
+ get_classification_path(taxon)
93
+ end
94
+ end
95
+
96
+ def get_classification_path(taxon)
97
+ return if taxon.classification_path
98
+ if DarwinCore.nil_field?(taxon.parent_id)
99
+ taxon.classification_path = [taxon.current_name_canonical]
100
+ else
101
+ parent_cp = @res[taxon.parent_id].classification_path
102
+ if parent_cp
103
+ taxon.classification_path = parent_cp + [taxon.current_name_canonical]
104
+ else
105
+ get_classification_path(@res[taxon.parent_id])
106
+ taxon.classification_path = @res[taxon.parent_id].classification_path + [taxon.current_name_canonical]
107
+ end
108
+ end
109
+ end
110
+
111
+ def injest_extensions
112
+ @extensions.each do |e|
113
+ ext, fields = *e
114
+ injest_synonyms(e) if fields.keys.include? :scientificname
115
+ injest_vernaculars(e) if fields.keys.include? :vernacularname
116
+ end
117
+ end
118
+
119
+ def injest_synonyms(extension)
120
+ ext, fields = *extension
121
+ ext.read[0].each do |r|
122
+ @res[r[fields[:id]]].synonyms << SynonymNormalized.new(
123
+ r[fields[:scientificname]],
124
+ canonical_name(r[fields[:scientificname]]),
125
+ r[fields[:taxonomicstatus]])
126
+ end
127
+ end
128
+
129
+ def injest_vernaculars(extension)
130
+ ext, fields = *extension
131
+ ext.read[0].each do |r|
132
+ @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
133
+ r[fields[:vernacularname]],
134
+ r[fields[:languagecode]])
135
+ end
136
+ end
137
+
138
+ end
139
+ end
@@ -2,6 +2,7 @@ class DarwinCore
2
2
  class Extension
3
3
  include DarwinCore::Ingester
4
4
  attr_reader :coreid
5
+ alias :id :coreid
5
6
 
6
7
  def initialize(archive, data)
7
8
  @archive = archive
@@ -2,8 +2,7 @@ class DarwinCore
2
2
  module Ingester
3
3
  attr_reader :data, :properties, :encoding, :fields_separator
4
4
  attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
5
-
6
- def read(batch_size = nil)
5
+ def read(batch_size = 10000)
7
6
  res = []
8
7
  errors = []
9
8
  index_fix = 1
@@ -13,14 +12,10 @@ class DarwinCore
13
12
  CSV.open(@file_path, args).each_with_index do |r, i|
14
13
  index_fix = 0; next if @ignore_headers && i == 0
15
14
  min_size > r.size ? errors << r : process_csv_row(res, errors, r)
16
- if batch_size.to_i > 0 && (i + index_fix) % batch_size == 0
17
- if block_given?
18
- yield [res, errors]
19
- res = []
20
- errors = []
21
- else
22
- return [res, errors]
23
- end
15
+ if block_given? && (i + index_fix) % batch_size == 0
16
+ yield [res, errors]
17
+ res = []
18
+ errors = []
24
19
  end
25
20
  end
26
21
  [res, errors]
Binary file
@@ -1,10 +1,24 @@
1
- require File.dirname(__FILE__) + "/../spec_helper"
1
+ require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
2
2
 
3
3
  describe DarwinCore do
4
4
  before(:all) do
5
5
  @file_dir = File.join(File.dirname(__FILE__), '..', 'files')
6
6
  end
7
7
 
8
+ describe "::nil_field?" do
9
+ it "should return true for entries which normally mean nil" do
10
+ [nil, '/N', ''].each do |i|
11
+ DarwinCore.nil_field?(i).should be_true
12
+ end
13
+ end
14
+
15
+ it "should return false for fields that are not nil" do
16
+ [0, '0', '123', 123, 'dsdfs434343/N'].each do |i|
17
+ DarwinCore.nil_field?(i).should be_false
18
+ end
19
+ end
20
+ end
21
+
8
22
  describe ".new" do
9
23
  it "should create DarwinCore instance out of archive file" do
10
24
  ['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
@@ -35,4 +49,49 @@ describe DarwinCore do
35
49
  dwc.archive.valid?.should be_true
36
50
  end
37
51
  end
52
+
53
+ describe ".normalize_classification" do
54
+ it "should return nil if file has no parent id information" do
55
+ file = File.join(@file_dir, 'flat_list.tar.gz')
56
+ dwc = DarwinCore.new(file)
57
+ dwc.normalize_classification.should be_nil
58
+ end
59
+
60
+ it "should traverse DarwinCore files and assemble data for every node in memory" do
61
+ file = File.join(@file_dir, 'data.tar.gz')
62
+ dwc = DarwinCore.new(file)
63
+ norm = dwc.normalize_classification
64
+ norm.class.should == Hash
65
+ norm['leptogastrinae:tid:2857'].class.should == DarwinCore::TaxonNormalized
66
+ end
67
+
68
+ it "should be able to assemble vernacular names from an extension" do
69
+ file = File.join(@file_dir, 'data.tar.gz')
70
+ dwc = DarwinCore.new(file)
71
+ norm = dwc.normalize_classification
72
+ norm.select { |k,v| !v.vernacular_names.empty? }.map { |k,v| v.vernacular_names }.size.should > 0
73
+ end
74
+
75
+ it "should be able to assemble synonyms from extension" do
76
+ file = File.join(@file_dir, 'synonyms_in_extension.tar.gz')
77
+ dwc = DarwinCore.new(file)
78
+ norm = dwc.normalize_classification
79
+ norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
80
+ end
81
+
82
+ it "should be able to assemble synonyms from extension" do
83
+ file = File.join(@file_dir, 'synonyms_in_core_accepted_name_field.tar.gz')
84
+ dwc = DarwinCore.new(file)
85
+ norm = dwc.normalize_classification
86
+ norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
87
+ end
88
+
89
+ it "should be able to assemble synonyms from extension" do
90
+ file = File.join(@file_dir, 'data.tar.gz')
91
+ dwc = DarwinCore.new(file)
92
+ norm = dwc.normalize_classification
93
+ norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
94
+ end
95
+ end
96
+
38
97
  end
@@ -1,4 +1,4 @@
1
- require File.dirname(__FILE__) + "/../spec_helper"
1
+ require File.expand_path(File.dirname(__FILE__) + "/../spec_helper")
2
2
 
3
3
  describe "Hash" do
4
4
  it "should parse xml to hash" do
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc-archive
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
- - 3
9
- - 1
10
- version: 0.3.1
7
+ - 4
8
+ - 0
9
+ version: 0.4.0
11
10
  platform: ruby
12
11
  authors:
13
12
  - Dmitry Mozzherin
@@ -15,7 +14,7 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2010-08-11 00:00:00 -04:00
17
+ date: 2010-09-09 00:00:00 -04:00
19
18
  default_executable:
20
19
  dependencies:
21
20
  - !ruby/object:Gem::Dependency
@@ -26,7 +25,6 @@ dependencies:
26
25
  requirements:
27
26
  - - ">="
28
27
  - !ruby/object:Gem::Version
29
- hash: 13
30
28
  segments:
31
29
  - 1
32
30
  - 2
@@ -42,7 +40,6 @@ dependencies:
42
40
  requirements:
43
41
  - - ">="
44
42
  - !ruby/object:Gem::Version
45
- hash: 3
46
43
  segments:
47
44
  - 0
48
45
  version: "0"
@@ -72,6 +69,7 @@ files:
72
69
  - lib/dwc-archive.rb
73
70
  - lib/dwc-archive/.expander.rb.swo
74
71
  - lib/dwc-archive/archive.rb
72
+ - lib/dwc-archive/classification_normalizer.rb
75
73
  - lib/dwc-archive/core.rb
76
74
  - lib/dwc-archive/errors.rb
77
75
  - lib/dwc-archive/expander.rb
@@ -88,10 +86,13 @@ files:
88
86
  - spec/files/data.zip
89
87
  - spec/files/eml.xml
90
88
  - spec/files/file with characters(3).gz
89
+ - spec/files/flat_list.tar.gz
91
90
  - spec/files/invalid.tar.gz
92
91
  - spec/files/junk_dir_inside.zip
93
92
  - spec/files/meta.xml
94
93
  - spec/files/minimal.tar.gz
94
+ - spec/files/synonyms_in_core_accepted_name_field.tar.gz
95
+ - spec/files/synonyms_in_extension.tar.gz
95
96
  - spec/files/uncompressed
96
97
  - spec/lib/dwc-archive_spec.rb
97
98
  - spec/lib/ruby_extenstions_spec.rb
@@ -111,7 +112,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
111
112
  requirements:
112
113
  - - ">="
113
114
  - !ruby/object:Gem::Version
114
- hash: 3
115
115
  segments:
116
116
  - 0
117
117
  version: "0"
@@ -120,7 +120,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
120
120
  requirements:
121
121
  - - ">="
122
122
  - !ruby/object:Gem::Version
123
- hash: 3
124
123
  segments:
125
124
  - 0
126
125
  version: "0"