dwc-archive 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/dwc-archive.rb +9 -8
- data/lib/dwc-archive/classification_normalizer.rb +38 -43
- data/lib/dwc-archive/core.rb +3 -2
- data/lib/dwc-archive/extension.rb +3 -2
- data/lib/dwc-archive/ingester.rb +6 -1
- metadata +6 -9
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.
|
1
|
+
0.5.2
|
data/lib/dwc-archive.rb
CHANGED
@@ -52,10 +52,6 @@ class DarwinCore
|
|
52
52
|
end
|
53
53
|
end
|
54
54
|
|
55
|
-
def self.logger
|
56
|
-
@@logger ||= Logger.new(nil)
|
57
|
-
end
|
58
|
-
|
59
55
|
def self.logger=(logger)
|
60
56
|
@@logger = logger
|
61
57
|
end
|
@@ -64,17 +60,22 @@ class DarwinCore
|
|
64
60
|
@@logger = Logger.new(nil)
|
65
61
|
end
|
66
62
|
|
63
|
+
def self.logger_write(obj_id, message, method = :info)
|
64
|
+
@@logger.send(method, "|%s|%s|" % [obj_id, message])
|
65
|
+
end
|
66
|
+
|
67
67
|
def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
|
68
|
+
@@logger ||= Logger.new(nil)
|
68
69
|
@archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
|
69
|
-
@core = DarwinCore::Core.new(
|
70
|
+
@core = DarwinCore::Core.new(self)
|
70
71
|
@metadata = DarwinCore::Metadata.new(@archive)
|
71
72
|
@extensions = get_extensions
|
72
73
|
end
|
73
74
|
|
74
75
|
# Generates a hash from classification data, containing the path to each node, a list of synonyms, and vernacular names.
|
75
|
-
def normalize_classification
|
76
|
+
def normalize_classification
|
76
77
|
return nil unless has_parent_id?
|
77
|
-
@classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self
|
78
|
+
@classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self)
|
78
79
|
@classification_normalizer.normalize
|
79
80
|
end
|
80
81
|
|
@@ -89,6 +90,6 @@ class DarwinCore
|
|
89
90
|
ext = @archive.meta[root_key][:extension]
|
90
91
|
return [] unless ext
|
91
92
|
ext = [ext] if ext.class != Array
|
92
|
-
ext.map { |e| DarwinCore::Extension.new(
|
93
|
+
ext.map { |e| DarwinCore::Extension.new(self, e) }
|
93
94
|
end
|
94
95
|
end
|
@@ -22,17 +22,14 @@ class DarwinCore
|
|
22
22
|
class VernacularNormalized < Struct.new(:name, :language);end
|
23
23
|
|
24
24
|
class ClassificationNormalizer
|
25
|
-
attr_accessor :verbose
|
26
25
|
attr_reader :error_names, :tree
|
27
26
|
|
28
|
-
def initialize(dwc_instance
|
27
|
+
def initialize(dwc_instance)
|
29
28
|
@dwc = dwc_instance
|
30
29
|
@core_fields = get_fields(@dwc.core)
|
31
30
|
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
|
32
31
|
@res = {}
|
33
32
|
@parser = ParsleyStore.new(1,2)
|
34
|
-
@verbose = verbose
|
35
|
-
@verbose_count = 10000
|
36
33
|
@name_strings = {}
|
37
34
|
@error_names = []
|
38
35
|
@tree = {}
|
@@ -47,9 +44,12 @@ class DarwinCore
|
|
47
44
|
end
|
48
45
|
|
49
46
|
def normalize
|
47
|
+
DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
|
50
48
|
@res = {}
|
51
49
|
ingest_core
|
50
|
+
DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
|
52
51
|
calculate_classification_path
|
52
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
|
53
53
|
ingest_extensions
|
54
54
|
@res
|
55
55
|
end
|
@@ -94,26 +94,23 @@ class DarwinCore
|
|
94
94
|
|
95
95
|
def ingest_core
|
96
96
|
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
taxon.parent_id = r[parent_id]
|
115
|
-
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
116
|
-
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
97
|
+
@dwc.core.read do |rows|
|
98
|
+
rows[0].each do |r|
|
99
|
+
set_scientific_name(r, @core_fields)
|
100
|
+
#core has AcceptedNameUsageId
|
101
|
+
if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
|
102
|
+
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
|
103
|
+
elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
|
104
|
+
add_synonym_from_core(parent_id, r)
|
105
|
+
else
|
106
|
+
taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
|
107
|
+
taxon.id = r[@core_fields[:id]]
|
108
|
+
taxon.current_name = r[@core_fields[:scientificname]]
|
109
|
+
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
110
|
+
taxon.parent_id = r[parent_id]
|
111
|
+
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
112
|
+
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
113
|
+
end
|
117
114
|
end
|
118
115
|
end
|
119
116
|
end
|
@@ -172,33 +169,31 @@ class DarwinCore
|
|
172
169
|
end
|
173
170
|
|
174
171
|
def ingest_synonyms(extension)
|
175
|
-
|
172
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
|
176
173
|
ext, fields = *extension
|
177
|
-
ext.read
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
174
|
+
ext.read do |rows|
|
175
|
+
rows[0].each do |r|
|
176
|
+
set_scientific_name(r, fields)
|
177
|
+
@res[r[fields[:id]]].synonyms << SynonymNormalized.new(
|
178
|
+
r[fields[:scientificname]],
|
179
|
+
r[fields[:canonicalname]],
|
180
|
+
fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
|
181
|
+
end
|
185
182
|
end
|
186
183
|
end
|
187
184
|
|
188
185
|
def ingest_vernaculars(extension)
|
189
|
-
|
186
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingesting vernacular names extension")
|
190
187
|
ext, fields = *extension
|
191
|
-
ext.read
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
188
|
+
ext.read do |rows|
|
189
|
+
rows[0].each do |r|
|
190
|
+
@res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
|
191
|
+
r[fields[:vernacularname]],
|
192
|
+
fields[:languagecode] ? r[fields[:languagecode]] : nil)
|
193
|
+
add_name_string(r[fields[:vernacularname]])
|
194
|
+
end
|
198
195
|
end
|
199
196
|
end
|
200
197
|
|
201
198
|
end
|
202
199
|
end
|
203
|
-
|
204
|
-
|
data/lib/dwc-archive/core.rb
CHANGED
@@ -2,8 +2,9 @@ class DarwinCore
|
|
2
2
|
class Core
|
3
3
|
include DarwinCore::Ingester
|
4
4
|
attr_reader :id
|
5
|
-
def initialize(
|
6
|
-
@
|
5
|
+
def initialize(dwc)
|
6
|
+
@dwc = dwc
|
7
|
+
@archive = @dwc.archive
|
7
8
|
@path = @archive.files_path
|
8
9
|
root_key = @archive.meta.keys[0]
|
9
10
|
@data = @archive.meta[root_key][:core]
|
@@ -4,8 +4,9 @@ class DarwinCore
|
|
4
4
|
attr_reader :coreid
|
5
5
|
alias :id :coreid
|
6
6
|
|
7
|
-
def initialize(
|
8
|
-
@
|
7
|
+
def initialize(dwc, data)
|
8
|
+
@dwc = dwc
|
9
|
+
@archive = @dwc.archive
|
9
10
|
@path = @archive.files_path
|
10
11
|
@data = data
|
11
12
|
@coreid = @data[:coreid][:attributes]
|
data/lib/dwc-archive/ingester.rb
CHANGED
@@ -3,6 +3,7 @@ class DarwinCore
|
|
3
3
|
attr_reader :data, :properties, :encoding, :fields_separator
|
4
4
|
attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
|
5
5
|
def read(batch_size = 10000)
|
6
|
+
DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
|
6
7
|
res = []
|
7
8
|
errors = []
|
8
9
|
index_fix = 1
|
@@ -13,7 +14,7 @@ class DarwinCore
|
|
13
14
|
index_fix = 0; next if @ignore_headers && i == 0
|
14
15
|
min_size > r.size ? errors << r : process_csv_row(res, errors, r)
|
15
16
|
if (i + index_fix) % batch_size == 0
|
16
|
-
DarwinCore.
|
17
|
+
DarwinCore.logger_write(@dwc.object_id, "Ingested %s records from %s" % [(i + index_fix), name])
|
17
18
|
if block_given?
|
18
19
|
yield [res, errors]
|
19
20
|
res = []
|
@@ -26,6 +27,10 @@ class DarwinCore
|
|
26
27
|
end
|
27
28
|
|
28
29
|
private
|
30
|
+
def name
|
31
|
+
self.class.to_s.split('::')[-1].downcase
|
32
|
+
end
|
33
|
+
|
29
34
|
def process_csv_row(result, errors, row)
|
30
35
|
str = row.join('')
|
31
36
|
if R19
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 5
|
9
|
-
-
|
10
|
-
version: 0.5.
|
9
|
+
- 2
|
10
|
+
version: 0.5.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dmitry Mozzherin
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-11-
|
18
|
+
date: 2010-11-06 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -66,9 +66,8 @@ dependencies:
|
|
66
66
|
version_requirements: *id003
|
67
67
|
description: Darwin Core Archive is the current standard exchange format for Global Names Architecture modules. This gem makes it easy to incorporate files in Darwin Core Archive format into a ruby project.
|
68
68
|
email: dmozzherin at gmail dot com
|
69
|
-
executables:
|
70
|
-
|
71
|
-
- t
|
69
|
+
executables: []
|
70
|
+
|
72
71
|
extensions: []
|
73
72
|
|
74
73
|
extra_rdoc_files:
|
@@ -119,8 +118,6 @@ files:
|
|
119
118
|
- spec/lib/ruby_extenstions_spec.rb
|
120
119
|
- spec/spec.opts
|
121
120
|
- spec/spec_helper.rb
|
122
|
-
- bin/preparse.rb
|
123
|
-
- bin/t
|
124
121
|
has_rdoc: true
|
125
122
|
homepage: http://github.com/GlobalNamesArchitecture/dwc-archive
|
126
123
|
licenses: []
|