dwc-archive 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/dwc-archive.rb +5 -2
- data/lib/dwc-archive/classification_normalizer.rb +25 -10
- data/spec/lib/dwc-archive_spec.rb +6 -0
- metadata +8 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.2
|
1
|
+
0.4.3
|
data/lib/dwc-archive.rb
CHANGED
@@ -25,6 +25,9 @@ require 'dwc-archive/generator_eml_xml'
|
|
25
25
|
require 'dwc-archive/classification_normalizer'
|
26
26
|
|
27
27
|
class DarwinCore
|
28
|
+
|
29
|
+
VERSION = open(File.join(File.dirname(__FILE__), '..', 'VERSION')).readline.strip
|
30
|
+
|
28
31
|
attr_reader :archive, :core, :metadata, :extensions
|
29
32
|
alias :eml :metadata
|
30
33
|
|
@@ -52,9 +55,9 @@ class DarwinCore
|
|
52
55
|
end
|
53
56
|
|
54
57
|
# generates a hash from a classification data with path to each node, list of synonyms and vernacular names.
|
55
|
-
def normalize_classification
|
58
|
+
def normalize_classification(verbose = false)
|
56
59
|
return nil unless has_parent_id?
|
57
|
-
DarwinCore::ClassificationNormalizer.new(self).normalize
|
60
|
+
DarwinCore::ClassificationNormalizer.new(self, verbose).normalize
|
58
61
|
end
|
59
62
|
|
60
63
|
def has_parent_id?
|
@@ -1,8 +1,8 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
require '
|
2
|
+
require 'parsley-store'
|
3
3
|
|
4
4
|
class DarwinCore
|
5
|
-
|
5
|
+
|
6
6
|
class TaxonNormalized
|
7
7
|
attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status
|
8
8
|
|
@@ -18,12 +18,15 @@ class DarwinCore
|
|
18
18
|
class VernacularNormalized < Struct.new(:name, :language);end
|
19
19
|
|
20
20
|
class ClassificationNormalizer
|
21
|
-
|
21
|
+
|
22
|
+
def initialize(dwc_instance, verbose = false)
|
22
23
|
@dwc = dwc_instance
|
23
24
|
@core = get_fields(@dwc.core)
|
24
25
|
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
|
25
26
|
@res = {}
|
26
|
-
@parser =
|
27
|
+
@parser = ParsleyStore.new(1,2)
|
28
|
+
@verbose = verbose
|
29
|
+
@verbose_count = 1000
|
27
30
|
end
|
28
31
|
|
29
32
|
def normalize
|
@@ -42,7 +45,7 @@ class DarwinCore
|
|
42
45
|
begin
|
43
46
|
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
|
44
47
|
rescue
|
45
|
-
@parser =
|
48
|
+
@parser = ParsleyStore.new(1,2)
|
46
49
|
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
|
47
50
|
end
|
48
51
|
parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name
|
@@ -63,12 +66,17 @@ class DarwinCore
|
|
63
66
|
taxon.synonyms << SynonymNormalized.new(
|
64
67
|
row[@core[:scientificname]],
|
65
68
|
canonical_name(row[@core[:scientificname]]),
|
66
|
-
row[@core[:taxonomicstatus]])
|
69
|
+
@core[:taxonomicstatus] ? row[@core[:taxonomicstatus]] : nil)
|
67
70
|
end
|
68
71
|
|
69
72
|
def injest_core
|
70
73
|
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname])
|
71
|
-
|
74
|
+
puts "Reading core information" if @verbose
|
75
|
+
rows = @dwc.core.read[0]
|
76
|
+
puts "Injesting information from the core" if @verbose
|
77
|
+
rows.each_with_index do |r, i|
|
78
|
+
count = i + 1
|
79
|
+
puts "Injesting %s'th record" % count if @verbose and count % @verbose_count == 0
|
72
80
|
#core has AcceptedNameUsageId
|
73
81
|
if @core[:acceptednameusageid] && r[@core[:acceptednameusageid]] && r[@core[:acceptednameusageid]] != r[@core[:id]]
|
74
82
|
add_synonym_from_core(@core[:acceptednameusageid], r)
|
@@ -102,7 +110,7 @@ class DarwinCore
|
|
102
110
|
if DarwinCore.nil_field?(taxon.parent_id)
|
103
111
|
taxon.classification_path = [taxon.current_name_canonical]
|
104
112
|
else
|
105
|
-
|
113
|
+
parent_cp = @res[taxon.parent_id].classification_path
|
106
114
|
if parent_cp
|
107
115
|
taxon.classification_path = parent_cp + [taxon.current_name_canonical]
|
108
116
|
else
|
@@ -121,8 +129,11 @@ class DarwinCore
|
|
121
129
|
end
|
122
130
|
|
123
131
|
def injest_synonyms(extension)
|
132
|
+
puts "Injesting synonyms extension" if @verbose
|
124
133
|
ext, fields = *extension
|
125
|
-
ext.read[0].
|
134
|
+
ext.read[0].each_with_index do |r, i|
|
135
|
+
count = i + 1
|
136
|
+
puts "Injesting %s'th record" % count if @verbose && count % @verbose_count == 0
|
126
137
|
@res[r[fields[:id]]].synonyms << SynonymNormalized.new(
|
127
138
|
r[fields[:scientificname]],
|
128
139
|
canonical_name(r[fields[:scientificname]]),
|
@@ -131,8 +142,11 @@ class DarwinCore
|
|
131
142
|
end
|
132
143
|
|
133
144
|
def injest_vernaculars(extension)
|
145
|
+
puts "Injesting vernacular names" if @verbose
|
134
146
|
ext, fields = *extension
|
135
|
-
ext.read[0].
|
147
|
+
ext.read[0].each_with_index do |r, i|
|
148
|
+
count = i + 1
|
149
|
+
puts "Injesting %s'th record" % count if @verbose && count % @verbose_count == 0
|
136
150
|
@res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
|
137
151
|
r[fields[:vernacularname]],
|
138
152
|
fields[:languagecode] ? r[fields[:languagecode]] : nil)
|
@@ -141,3 +155,4 @@ class DarwinCore
|
|
141
155
|
|
142
156
|
end
|
143
157
|
end
|
158
|
+
|
@@ -5,6 +5,12 @@ describe DarwinCore do
|
|
5
5
|
@file_dir = File.join(File.dirname(__FILE__), '..', 'files')
|
6
6
|
end
|
7
7
|
|
8
|
+
describe "VERSION" do
|
9
|
+
it "should return VERSION number" do
|
10
|
+
DarwinCore::VERSION.split('.').join('').to_i.should > 41
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
8
14
|
describe "::nil_field?" do
|
9
15
|
it "should return true for entries which normally mean nil" do
|
10
16
|
[nil, '/N', ''].each do |i|
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 9
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 4
|
8
|
-
- 2
|
9
|
-
version: 0.4.2
|
9
|
+
- 3
|
10
|
+
version: 0.4.3
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Dmitry Mozzherin
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-09-
|
18
|
+
date: 2010-09-13 00:00:00 -04:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -25,6 +26,7 @@ dependencies:
|
|
25
26
|
requirements:
|
26
27
|
- - ">="
|
27
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 13
|
28
30
|
segments:
|
29
31
|
- 1
|
30
32
|
- 2
|
@@ -40,6 +42,7 @@ dependencies:
|
|
40
42
|
requirements:
|
41
43
|
- - ">="
|
42
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 3
|
43
46
|
segments:
|
44
47
|
- 0
|
45
48
|
version: "0"
|
@@ -112,6 +115,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
112
115
|
requirements:
|
113
116
|
- - ">="
|
114
117
|
- !ruby/object:Gem::Version
|
118
|
+
hash: 3
|
115
119
|
segments:
|
116
120
|
- 0
|
117
121
|
version: "0"
|
@@ -120,6 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
120
124
|
requirements:
|
121
125
|
- - ">="
|
122
126
|
- !ruby/object:Gem::Version
|
127
|
+
hash: 3
|
123
128
|
segments:
|
124
129
|
- 0
|
125
130
|
version: "0"
|