dwc-archive 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.1
1
+ 0.5.2
data/lib/dwc-archive.rb CHANGED
@@ -52,10 +52,6 @@ class DarwinCore
52
52
  end
53
53
  end
54
54
 
55
- def self.logger
56
- @@logger ||= Logger.new(nil)
57
- end
58
-
59
55
  def self.logger=(logger)
60
56
  @@logger = logger
61
57
  end
@@ -64,17 +60,22 @@ class DarwinCore
64
60
  @@logger = Logger.new(nil)
65
61
  end
66
62
 
63
+ def self.logger_write(obj_id, message, method = :info)
64
+ @@logger.send(method, "|%s|%s|" % [obj_id, message])
65
+ end
66
+
67
67
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
68
+ @@logger ||= Logger.new(nil)
68
69
  @archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
69
- @core = DarwinCore::Core.new(@archive)
70
+ @core = DarwinCore::Core.new(self)
70
71
  @metadata = DarwinCore::Metadata.new(@archive)
71
72
  @extensions = get_extensions
72
73
  end
73
74
 
74
75
  # generates a hash from classification data, with the path to each node, a list of synonyms, and vernacular names.
75
- def normalize_classification(verbose = false)
76
+ def normalize_classification
76
77
  return nil unless has_parent_id?
77
- @classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self, verbose)
78
+ @classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self)
78
79
  @classification_normalizer.normalize
79
80
  end
80
81
 
@@ -89,6 +90,6 @@ class DarwinCore
89
90
  ext = @archive.meta[root_key][:extension]
90
91
  return [] unless ext
91
92
  ext = [ext] if ext.class != Array
92
- ext.map { |e| DarwinCore::Extension.new(@archive, e) }
93
+ ext.map { |e| DarwinCore::Extension.new(self, e) }
93
94
  end
94
95
  end
@@ -22,17 +22,14 @@ class DarwinCore
22
22
  class VernacularNormalized < Struct.new(:name, :language);end
23
23
 
24
24
  class ClassificationNormalizer
25
- attr_accessor :verbose
26
25
  attr_reader :error_names, :tree
27
26
 
28
- def initialize(dwc_instance, verbose = false)
27
+ def initialize(dwc_instance)
29
28
  @dwc = dwc_instance
30
29
  @core_fields = get_fields(@dwc.core)
31
30
  @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
32
31
  @res = {}
33
32
  @parser = ParsleyStore.new(1,2)
34
- @verbose = verbose
35
- @verbose_count = 10000
36
33
  @name_strings = {}
37
34
  @error_names = []
38
35
  @tree = {}
@@ -47,9 +44,12 @@ class DarwinCore
47
44
  end
48
45
 
49
46
  def normalize
47
+ DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
50
48
  @res = {}
51
49
  ingest_core
50
+ DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
52
51
  calculate_classification_path
52
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
53
53
  ingest_extensions
54
54
  @res
55
55
  end
@@ -94,26 +94,23 @@ class DarwinCore
94
94
 
95
95
  def ingest_core
96
96
  raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
97
- puts "Reading core information" if @verbose
98
- rows = @dwc.core.read[0]
99
- puts "Ingesting information from the core" if @verbose
100
- rows.each_with_index do |r, i|
101
- count = i + 1
102
- set_scientific_name(r, @core_fields)
103
- puts "Ingesting %s'th record" % count if @verbose and count % @verbose_count == 0
104
- #core has AcceptedNameUsageId
105
- if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
106
- add_synonym_from_core(@core_fields[:acceptednameusageid], r)
107
- elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
108
- add_synonym_from_core(parent_id, r)
109
- else
110
- taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
111
- taxon.id = r[@core_fields[:id]]
112
- taxon.current_name = r[@core_fields[:scientificname]]
113
- taxon.current_name_canonical = r[@core_fields[:canonicalname]]
114
- taxon.parent_id = r[parent_id]
115
- taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
116
- taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
97
+ @dwc.core.read do |rows|
98
+ rows[0].each do |r|
99
+ set_scientific_name(r, @core_fields)
100
+ #core has AcceptedNameUsageId
101
+ if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
102
+ add_synonym_from_core(@core_fields[:acceptednameusageid], r)
103
+ elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
104
+ add_synonym_from_core(parent_id, r)
105
+ else
106
+ taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
107
+ taxon.id = r[@core_fields[:id]]
108
+ taxon.current_name = r[@core_fields[:scientificname]]
109
+ taxon.current_name_canonical = r[@core_fields[:canonicalname]]
110
+ taxon.parent_id = r[parent_id]
111
+ taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
112
+ taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
113
+ end
117
114
  end
118
115
  end
119
116
  end
@@ -172,33 +169,31 @@ class DarwinCore
172
169
  end
173
170
 
174
171
  def ingest_synonyms(extension)
175
- puts "Ingesting synonyms extension" if @verbose
172
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
176
173
  ext, fields = *extension
177
- ext.read[0].each_with_index do |r, i|
178
- count = i + 1
179
- set_scientific_name(r, fields)
180
- puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
181
- @res[r[fields[:id]]].synonyms << SynonymNormalized.new(
182
- r[fields[:scientificname]],
183
- r[fields[:canonicalname]],
184
- fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
174
+ ext.read do |rows|
175
+ rows[0].each do |r|
176
+ set_scientific_name(r, fields)
177
+ @res[r[fields[:id]]].synonyms << SynonymNormalized.new(
178
+ r[fields[:scientificname]],
179
+ r[fields[:canonicalname]],
180
+ fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
181
+ end
185
182
  end
186
183
  end
187
184
 
188
185
  def ingest_vernaculars(extension)
189
- puts "Ingesting vernacular names" if @verbose
186
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting vernacular names extension")
190
187
  ext, fields = *extension
191
- ext.read[0].each_with_index do |r, i|
192
- count = i + 1
193
- puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
194
- @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
195
- r[fields[:vernacularname]],
196
- fields[:languagecode] ? r[fields[:languagecode]] : nil)
197
- add_name_string(r[fields[:vernacularname]])
188
+ ext.read do |rows|
189
+ rows[0].each do |r|
190
+ @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
191
+ r[fields[:vernacularname]],
192
+ fields[:languagecode] ? r[fields[:languagecode]] : nil)
193
+ add_name_string(r[fields[:vernacularname]])
194
+ end
198
195
  end
199
196
  end
200
197
 
201
198
  end
202
199
  end
203
-
204
-
@@ -2,8 +2,9 @@ class DarwinCore
2
2
  class Core
3
3
  include DarwinCore::Ingester
4
4
  attr_reader :id
5
- def initialize(archive)
6
- @archive = archive
5
+ def initialize(dwc)
6
+ @dwc = dwc
7
+ @archive = @dwc.archive
7
8
  @path = @archive.files_path
8
9
  root_key = @archive.meta.keys[0]
9
10
  @data = @archive.meta[root_key][:core]
@@ -4,8 +4,9 @@ class DarwinCore
4
4
  attr_reader :coreid
5
5
  alias :id :coreid
6
6
 
7
- def initialize(archive, data)
8
- @archive = archive
7
+ def initialize(dwc, data)
8
+ @dwc = dwc
9
+ @archive = @dwc.archive
9
10
  @path = @archive.files_path
10
11
  @data = data
11
12
  @coreid = @data[:coreid][:attributes]
@@ -3,6 +3,7 @@ class DarwinCore
3
3
  attr_reader :data, :properties, :encoding, :fields_separator
4
4
  attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
5
5
  def read(batch_size = 10000)
6
+ DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
6
7
  res = []
7
8
  errors = []
8
9
  index_fix = 1
@@ -13,7 +14,7 @@ class DarwinCore
13
14
  index_fix = 0; next if @ignore_headers && i == 0
14
15
  min_size > r.size ? errors << r : process_csv_row(res, errors, r)
15
16
  if (i + index_fix) % batch_size == 0
16
- DarwinCore.logger.info("%s| Ingested %s records" % [self.object_id, (i + index_fix)])
17
+ DarwinCore.logger_write(@dwc.object_id, "Ingested %s records from %s" % [(i + index_fix), name])
17
18
  if block_given?
18
19
  yield [res, errors]
19
20
  res = []
@@ -26,6 +27,10 @@ class DarwinCore
26
27
  end
27
28
 
28
29
  private
30
+ def name
31
+ self.class.to_s.split('::')[-1].downcase
32
+ end
33
+
29
34
  def process_csv_row(result, errors, row)
30
35
  str = row.join('')
31
36
  if R19
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwc-archive
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 5
9
- - 1
10
- version: 0.5.1
9
+ - 2
10
+ version: 0.5.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dmitry Mozzherin
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-11-05 00:00:00 -04:00
18
+ date: 2010-11-06 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -66,9 +66,8 @@ dependencies:
66
66
  version_requirements: *id003
67
67
  description: Darwin Core Archive is the current standard exchange format for GLobal Names Architecture modules. This gem makes it easy to incorporate files in Darwin Core Archive format into a ruby project.
68
68
  email: dmozzherin at gmail dot com
69
- executables:
70
- - preparse.rb
71
- - t
69
+ executables: []
70
+
72
71
  extensions: []
73
72
 
74
73
  extra_rdoc_files:
@@ -119,8 +118,6 @@ files:
119
118
  - spec/lib/ruby_extenstions_spec.rb
120
119
  - spec/spec.opts
121
120
  - spec/spec_helper.rb
122
- - bin/preparse.rb
123
- - bin/t
124
121
  has_rdoc: true
125
122
  homepage: http://github.com/GlobalNamesArchitecture/dwc-archive
126
123
  licenses: []