dwc-archive 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/dwc-archive.rb +9 -8
- data/lib/dwc-archive/classification_normalizer.rb +38 -43
- data/lib/dwc-archive/core.rb +3 -2
- data/lib/dwc-archive/extension.rb +3 -2
- data/lib/dwc-archive/ingester.rb +6 -1
- metadata +6 -9
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.
|
1
|
+
0.5.2
|
data/lib/dwc-archive.rb
CHANGED
@@ -52,10 +52,6 @@ class DarwinCore
|
|
52
52
|
end
|
53
53
|
end
|
54
54
|
|
55
|
-
def self.logger
|
56
|
-
@@logger ||= Logger.new(nil)
|
57
|
-
end
|
58
|
-
|
59
55
|
def self.logger=(logger)
|
60
56
|
@@logger = logger
|
61
57
|
end
|
@@ -64,17 +60,22 @@ class DarwinCore
|
|
64
60
|
@@logger = Logger.new(nil)
|
65
61
|
end
|
66
62
|
|
63
|
+
def self.logger_write(obj_id, message, method = :info)
|
64
|
+
@@logger.send(method, "|%s|%s|" % [obj_id, message])
|
65
|
+
end
|
66
|
+
|
67
67
|
def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
|
68
|
+
@@logger ||= Logger.new(nil)
|
68
69
|
@archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
|
69
|
-
@core = DarwinCore::Core.new(
|
70
|
+
@core = DarwinCore::Core.new(self)
|
70
71
|
@metadata = DarwinCore::Metadata.new(@archive)
|
71
72
|
@extensions = get_extensions
|
72
73
|
end
|
73
74
|
|
74
75
|
# generates a hash from a classification data with path to each node, list of synonyms and vernacular names.
|
75
|
-
def normalize_classification
|
76
|
+
def normalize_classification
|
76
77
|
return nil unless has_parent_id?
|
77
|
-
@classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self
|
78
|
+
@classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self)
|
78
79
|
@classification_normalizer.normalize
|
79
80
|
end
|
80
81
|
|
@@ -89,6 +90,6 @@ class DarwinCore
|
|
89
90
|
ext = @archive.meta[root_key][:extension]
|
90
91
|
return [] unless ext
|
91
92
|
ext = [ext] if ext.class != Array
|
92
|
-
ext.map { |e| DarwinCore::Extension.new(
|
93
|
+
ext.map { |e| DarwinCore::Extension.new(self, e) }
|
93
94
|
end
|
94
95
|
end
|
@@ -22,17 +22,14 @@ class DarwinCore
|
|
22
22
|
class VernacularNormalized < Struct.new(:name, :language);end
|
23
23
|
|
24
24
|
class ClassificationNormalizer
|
25
|
-
attr_accessor :verbose
|
26
25
|
attr_reader :error_names, :tree
|
27
26
|
|
28
|
-
def initialize(dwc_instance
|
27
|
+
def initialize(dwc_instance)
|
29
28
|
@dwc = dwc_instance
|
30
29
|
@core_fields = get_fields(@dwc.core)
|
31
30
|
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
|
32
31
|
@res = {}
|
33
32
|
@parser = ParsleyStore.new(1,2)
|
34
|
-
@verbose = verbose
|
35
|
-
@verbose_count = 10000
|
36
33
|
@name_strings = {}
|
37
34
|
@error_names = []
|
38
35
|
@tree = {}
|
@@ -47,9 +44,12 @@ class DarwinCore
|
|
47
44
|
end
|
48
45
|
|
49
46
|
def normalize
|
47
|
+
DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
|
50
48
|
@res = {}
|
51
49
|
ingest_core
|
50
|
+
DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
|
52
51
|
calculate_classification_path
|
52
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
|
53
53
|
ingest_extensions
|
54
54
|
@res
|
55
55
|
end
|
@@ -94,26 +94,23 @@ class DarwinCore
|
|
94
94
|
|
95
95
|
def ingest_core
|
96
96
|
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
taxon.parent_id = r[parent_id]
|
115
|
-
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
116
|
-
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
97
|
+
@dwc.core.read do |rows|
|
98
|
+
rows[0].each do |r|
|
99
|
+
set_scientific_name(r, @core_fields)
|
100
|
+
#core has AcceptedNameUsageId
|
101
|
+
if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
|
102
|
+
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
|
103
|
+
elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
|
104
|
+
add_synonym_from_core(parent_id, r)
|
105
|
+
else
|
106
|
+
taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
|
107
|
+
taxon.id = r[@core_fields[:id]]
|
108
|
+
taxon.current_name = r[@core_fields[:scientificname]]
|
109
|
+
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
110
|
+
taxon.parent_id = r[parent_id]
|
111
|
+
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
112
|
+
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
113
|
+
end
|
117
114
|
end
|
118
115
|
end
|
119
116
|
end
|
@@ -172,33 +169,31 @@ class DarwinCore
|
|
172
169
|
end
|
173
170
|
|
174
171
|
def ingest_synonyms(extension)
|
175
|
-
|
172
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
|
176
173
|
ext, fields = *extension
|
177
|
-
ext.read
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
174
|
+
ext.read do |rows|
|
175
|
+
rows[0].each do |r|
|
176
|
+
set_scientific_name(r, fields)
|
177
|
+
@res[r[fields[:id]]].synonyms << SynonymNormalized.new(
|
178
|
+
r[fields[:scientificname]],
|
179
|
+
r[fields[:canonicalname]],
|
180
|
+
fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
|
181
|
+
end
|
185
182
|
end
|
186
183
|
end
|
187
184
|
|
188
185
|
def ingest_vernaculars(extension)
|
189
|
-
|
186
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingesting vernacular names extension")
|
190
187
|
ext, fields = *extension
|
191
|
-
ext.read
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
188
|
+
ext.read do |rows|
|
189
|
+
rows[0].each do |r|
|
190
|
+
@res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
|
191
|
+
r[fields[:vernacularname]],
|
192
|
+
fields[:languagecode] ? r[fields[:languagecode]] : nil)
|
193
|
+
add_name_string(r[fields[:vernacularname]])
|
194
|
+
end
|
198
195
|
end
|
199
196
|
end
|
200
197
|
|
201
198
|
end
|
202
199
|
end
|
203
|
-
|
204
|
-
|
data/lib/dwc-archive/core.rb
CHANGED
@@ -2,8 +2,9 @@ class DarwinCore
|
|
2
2
|
class Core
|
3
3
|
include DarwinCore::Ingester
|
4
4
|
attr_reader :id
|
5
|
-
def initialize(
|
6
|
-
@
|
5
|
+
def initialize(dwc)
|
6
|
+
@dwc = dwc
|
7
|
+
@archive = @dwc.archive
|
7
8
|
@path = @archive.files_path
|
8
9
|
root_key = @archive.meta.keys[0]
|
9
10
|
@data = @archive.meta[root_key][:core]
|
@@ -4,8 +4,9 @@ class DarwinCore
|
|
4
4
|
attr_reader :coreid
|
5
5
|
alias :id :coreid
|
6
6
|
|
7
|
-
def initialize(
|
8
|
-
@
|
7
|
+
def initialize(dwc, data)
|
8
|
+
@dwc = dwc
|
9
|
+
@archive = @dwc.archive
|
9
10
|
@path = @archive.files_path
|
10
11
|
@data = data
|
11
12
|
@coreid = @data[:coreid][:attributes]
|
data/lib/dwc-archive/ingester.rb
CHANGED
@@ -3,6 +3,7 @@ class DarwinCore
|
|
3
3
|
attr_reader :data, :properties, :encoding, :fields_separator
|
4
4
|
attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
|
5
5
|
def read(batch_size = 10000)
|
6
|
+
DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
|
6
7
|
res = []
|
7
8
|
errors = []
|
8
9
|
index_fix = 1
|
@@ -13,7 +14,7 @@ class DarwinCore
|
|
13
14
|
index_fix = 0; next if @ignore_headers && i == 0
|
14
15
|
min_size > r.size ? errors << r : process_csv_row(res, errors, r)
|
15
16
|
if (i + index_fix) % batch_size == 0
|
16
|
-
DarwinCore.
|
17
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingested %s records from %s" % [(i + index_fix), name])
|
17
18
|
if block_given?
|
18
19
|
yield [res, errors]
|
19
20
|
res = []
|
@@ -26,6 +27,10 @@ class DarwinCore
|
|
26
27
|
end
|
27
28
|
|
28
29
|
private
|
30
|
+
def name
|
31
|
+
self.class.to_s.split('::')[-1].downcase
|
32
|
+
end
|
33
|
+
|
29
34
|
def process_csv_row(result, errors, row)
|
30
35
|
str = row.join('')
|
31
36
|
if R19
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 5
|
9
|
-
-
|
10
|
-
version: 0.5.
|
9
|
+
- 2
|
10
|
+
version: 0.5.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dmitry Mozzherin
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-11-
|
18
|
+
date: 2010-11-06 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -66,9 +66,8 @@ dependencies:
|
|
66
66
|
version_requirements: *id003
|
67
67
|
description: Darwin Core Archive is the current standard exchange format for GLobal Names Architecture modules. This gem makes it easy to incorporate files in Darwin Core Archive format into a ruby project.
|
68
68
|
email: dmozzherin at gmail dot com
|
69
|
-
executables:
|
70
|
-
|
71
|
-
- t
|
69
|
+
executables: []
|
70
|
+
|
72
71
|
extensions: []
|
73
72
|
|
74
73
|
extra_rdoc_files:
|
@@ -119,8 +118,6 @@ files:
|
|
119
118
|
- spec/lib/ruby_extenstions_spec.rb
|
120
119
|
- spec/spec.opts
|
121
120
|
- spec/spec_helper.rb
|
122
|
-
- bin/preparse.rb
|
123
|
-
- bin/t
|
124
121
|
has_rdoc: true
|
125
122
|
homepage: http://github.com/GlobalNamesArchitecture/dwc-archive
|
126
123
|
licenses: []
|