dwc-archive 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.1
1
+ 0.5.2
data/lib/dwc-archive.rb CHANGED
@@ -52,10 +52,6 @@ class DarwinCore
52
52
  end
53
53
  end
54
54
 
55
- def self.logger
56
- @@logger ||= Logger.new(nil)
57
- end
58
-
59
55
  def self.logger=(logger)
60
56
  @@logger = logger
61
57
  end
@@ -64,17 +60,22 @@ class DarwinCore
64
60
  @@logger = Logger.new(nil)
65
61
  end
66
62
 
63
+ def self.logger_write(obj_id, message, method = :info)
64
+ @@logger.send(method, "|%s|%s|" % [obj_id, message])
65
+ end
66
+
67
67
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
68
+ @@logger ||= Logger.new(nil)
68
69
  @archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
69
- @core = DarwinCore::Core.new(@archive)
70
+ @core = DarwinCore::Core.new(self)
70
71
  @metadata = DarwinCore::Metadata.new(@archive)
71
72
  @extensions = get_extensions
72
73
  end
73
74
 
74
75
  # generates a hash from a classification data with path to each node, list of synonyms and vernacular names.
75
- def normalize_classification(verbose = false)
76
+ def normalize_classification
76
77
  return nil unless has_parent_id?
77
- @classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self, verbose)
78
+ @classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self)
78
79
  @classification_normalizer.normalize
79
80
  end
80
81
 
@@ -89,6 +90,6 @@ class DarwinCore
89
90
  ext = @archive.meta[root_key][:extension]
90
91
  return [] unless ext
91
92
  ext = [ext] if ext.class != Array
92
- ext.map { |e| DarwinCore::Extension.new(@archive, e) }
93
+ ext.map { |e| DarwinCore::Extension.new(self, e) }
93
94
  end
94
95
  end
@@ -22,17 +22,14 @@ class DarwinCore
22
22
  class VernacularNormalized < Struct.new(:name, :language);end
23
23
 
24
24
  class ClassificationNormalizer
25
- attr_accessor :verbose
26
25
  attr_reader :error_names, :tree
27
26
 
28
- def initialize(dwc_instance, verbose = false)
27
+ def initialize(dwc_instance)
29
28
  @dwc = dwc_instance
30
29
  @core_fields = get_fields(@dwc.core)
31
30
  @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
32
31
  @res = {}
33
32
  @parser = ParsleyStore.new(1,2)
34
- @verbose = verbose
35
- @verbose_count = 10000
36
33
  @name_strings = {}
37
34
  @error_names = []
38
35
  @tree = {}
@@ -47,9 +44,12 @@ class DarwinCore
47
44
  end
48
45
 
49
46
  def normalize
47
+ DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
50
48
  @res = {}
51
49
  ingest_core
50
+ DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
52
51
  calculate_classification_path
52
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
53
53
  ingest_extensions
54
54
  @res
55
55
  end
@@ -94,26 +94,23 @@ class DarwinCore
94
94
 
95
95
  def ingest_core
96
96
  raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
97
- puts "Reading core information" if @verbose
98
- rows = @dwc.core.read[0]
99
- puts "Ingesting information from the core" if @verbose
100
- rows.each_with_index do |r, i|
101
- count = i + 1
102
- set_scientific_name(r, @core_fields)
103
- puts "Ingesting %s'th record" % count if @verbose and count % @verbose_count == 0
104
- #core has AcceptedNameUsageId
105
- if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
106
- add_synonym_from_core(@core_fields[:acceptednameusageid], r)
107
- elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
108
- add_synonym_from_core(parent_id, r)
109
- else
110
- taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
111
- taxon.id = r[@core_fields[:id]]
112
- taxon.current_name = r[@core_fields[:scientificname]]
113
- taxon.current_name_canonical = r[@core_fields[:canonicalname]]
114
- taxon.parent_id = r[parent_id]
115
- taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
116
- taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
97
+ @dwc.core.read do |rows|
98
+ rows[0].each do |r|
99
+ set_scientific_name(r, @core_fields)
100
+ #core has AcceptedNameUsageId
101
+ if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
102
+ add_synonym_from_core(@core_fields[:acceptednameusageid], r)
103
+ elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
104
+ add_synonym_from_core(parent_id, r)
105
+ else
106
+ taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
107
+ taxon.id = r[@core_fields[:id]]
108
+ taxon.current_name = r[@core_fields[:scientificname]]
109
+ taxon.current_name_canonical = r[@core_fields[:canonicalname]]
110
+ taxon.parent_id = r[parent_id]
111
+ taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
112
+ taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
113
+ end
117
114
  end
118
115
  end
119
116
  end
@@ -172,33 +169,31 @@ class DarwinCore
172
169
  end
173
170
 
174
171
  def ingest_synonyms(extension)
175
- puts "Ingesting synonyms extension" if @verbose
172
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
176
173
  ext, fields = *extension
177
- ext.read[0].each_with_index do |r, i|
178
- count = i + 1
179
- set_scientific_name(r, fields)
180
- puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
181
- @res[r[fields[:id]]].synonyms << SynonymNormalized.new(
182
- r[fields[:scientificname]],
183
- r[fields[:canonicalname]],
184
- fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
174
+ ext.read do |rows|
175
+ rows[0].each do |r|
176
+ set_scientific_name(r, fields)
177
+ @res[r[fields[:id]]].synonyms << SynonymNormalized.new(
178
+ r[fields[:scientificname]],
179
+ r[fields[:canonicalname]],
180
+ fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
181
+ end
185
182
  end
186
183
  end
187
184
 
188
185
  def ingest_vernaculars(extension)
189
- puts "Ingesting vernacular names" if @verbose
186
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting vernacular names extension")
190
187
  ext, fields = *extension
191
- ext.read[0].each_with_index do |r, i|
192
- count = i + 1
193
- puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
194
- @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
195
- r[fields[:vernacularname]],
196
- fields[:languagecode] ? r[fields[:languagecode]] : nil)
197
- add_name_string(r[fields[:vernacularname]])
188
+ ext.read do |rows|
189
+ rows[0].each do |r|
190
+ @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
191
+ r[fields[:vernacularname]],
192
+ fields[:languagecode] ? r[fields[:languagecode]] : nil)
193
+ add_name_string(r[fields[:vernacularname]])
194
+ end
198
195
  end
199
196
  end
200
197
 
201
198
  end
202
199
  end
203
-
204
-
@@ -2,8 +2,9 @@ class DarwinCore
2
2
  class Core
3
3
  include DarwinCore::Ingester
4
4
  attr_reader :id
5
- def initialize(archive)
6
- @archive = archive
5
+ def initialize(dwc)
6
+ @dwc = dwc
7
+ @archive = @dwc.archive
7
8
  @path = @archive.files_path
8
9
  root_key = @archive.meta.keys[0]
9
10
  @data = @archive.meta[root_key][:core]
@@ -4,8 +4,9 @@ class DarwinCore
4
4
  attr_reader :coreid
5
5
  alias :id :coreid
6
6
 
7
- def initialize(archive, data)
8
- @archive = archive
7
+ def initialize(dwc, data)
8
+ @dwc = dwc
9
+ @archive = @dwc.archive
9
10
  @path = @archive.files_path
10
11
  @data = data
11
12
  @coreid = @data[:coreid][:attributes]
@@ -3,6 +3,7 @@ class DarwinCore
3
3
  attr_reader :data, :properties, :encoding, :fields_separator
4
4
  attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
5
5
  def read(batch_size = 10000)
6
+ DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
6
7
  res = []
7
8
  errors = []
8
9
  index_fix = 1
@@ -13,7 +14,7 @@ class DarwinCore
13
14
  index_fix = 0; next if @ignore_headers && i == 0
14
15
  min_size > r.size ? errors << r : process_csv_row(res, errors, r)
15
16
  if (i + index_fix) % batch_size == 0
16
- DarwinCore.logger.info("%s| Ingested %s records" % [self.object_id, (i + index_fix)])
17
+ DarwinCore.logger_write(@dwc.object_id, "Ingested %s records from %s" % [(i + index_fix), name])
17
18
  if block_given?
18
19
  yield [res, errors]
19
20
  res = []
@@ -26,6 +27,10 @@ class DarwinCore
26
27
  end
27
28
 
28
29
  private
30
+ def name
31
+ self.class.to_s.split('::')[-1].downcase
32
+ end
33
+
29
34
  def process_csv_row(result, errors, row)
30
35
  str = row.join('')
31
36
  if R19
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc-archive
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 5
9
- - 1
10
- version: 0.5.1
9
+ - 2
10
+ version: 0.5.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dmitry Mozzherin
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-11-05 00:00:00 -04:00
18
+ date: 2010-11-06 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -66,9 +66,8 @@ dependencies:
66
66
  version_requirements: *id003
67
67
  description: Darwin Core Archive is the current standard exchange format for GLobal Names Architecture modules. This gem makes it easy to incorporate files in Darwin Core Archive format into a ruby project.
68
68
  email: dmozzherin at gmail dot com
69
- executables:
70
- - preparse.rb
71
- - t
69
+ executables: []
70
+
72
71
  extensions: []
73
72
 
74
73
  extra_rdoc_files:
@@ -119,8 +118,6 @@ files:
119
118
  - spec/lib/ruby_extenstions_spec.rb
120
119
  - spec/spec.opts
121
120
  - spec/spec_helper.rb
122
- - bin/preparse.rb
123
- - bin/t
124
121
  has_rdoc: true
125
122
  homepage: http://github.com/GlobalNamesArchitecture/dwc-archive
126
123
  licenses: []