dwc-archive 0.4.13 → 0.4.14
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/dwc-archive/classification_normalizer.rb +27 -17
- data/spec/files/sci_name_authorship.tar.gz +0 -0
- data/spec/lib/dwc-archive_spec.rb +11 -0
- metadata +10 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.13
|
1
|
+
0.4.14
|
@@ -27,7 +27,7 @@ class DarwinCore
|
|
27
27
|
|
28
28
|
def initialize(dwc_instance, verbose = false)
|
29
29
|
@dwc = dwc_instance
|
30
|
-
@
|
30
|
+
@core_fields = get_fields(@dwc.core)
|
31
31
|
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
|
32
32
|
@res = {}
|
33
33
|
@parser = ParsleyStore.new(1,2)
|
@@ -56,7 +56,7 @@ class DarwinCore
|
|
56
56
|
|
57
57
|
private
|
58
58
|
|
59
|
-
def
|
59
|
+
def get_canonical_name(a_scientific_name)
|
60
60
|
if R19
|
61
61
|
a_scientific_name.force_encoding('utf-8')
|
62
62
|
end
|
@@ -84,38 +84,47 @@ class DarwinCore
|
|
84
84
|
def add_synonym_from_core(taxon_id, row)
|
85
85
|
taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new
|
86
86
|
taxon.synonyms << SynonymNormalized.new(
|
87
|
-
row[@
|
88
|
-
|
89
|
-
@
|
87
|
+
row[@core_fields[:scientificname]],
|
88
|
+
row[@core_fields[:canonicalname]],
|
89
|
+
@core_fields[:taxonomicstatus] ? row[@core_fields[:taxonomicstatus]] : nil)
|
90
|
+
end
|
91
|
+
|
92
|
+
def set_scientific_name(row, fields)
|
93
|
+
canonical_name = fields[:scientificnameauthorship] ? row[fields[:scientificname]] : get_canonical_name(row[fields[:scientificname]])
|
94
|
+
fields[:canonicalname] = row.size
|
95
|
+
row << canonical_name
|
96
|
+
scientific_name = (fields[:scientificnameauthorship] && row[fields[:scientificnameauthorship]].to_s.strip != '') ? row[fields[:scientificname]].strip + ' ' + row[fields[:scientificnameauthorship]].strip : row[fields[:scientificname]].strip
|
97
|
+
row[fields[:scientificname]] = scientific_name
|
90
98
|
end
|
91
99
|
|
92
100
|
def ingest_core
|
93
|
-
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@
|
101
|
+
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
|
94
102
|
puts "Reading core information" if @verbose
|
95
103
|
rows = @dwc.core.read[0]
|
96
104
|
puts "Ingesting information from the core" if @verbose
|
97
105
|
rows.each_with_index do |r, i|
|
98
106
|
count = i + 1
|
107
|
+
set_scientific_name(r, @core_fields)
|
99
108
|
puts "Ingesting %s'th record" % count if @verbose and count % @verbose_count == 0
|
100
109
|
#core has AcceptedNameUsageId
|
101
|
-
if @
|
102
|
-
add_synonym_from_core(@
|
103
|
-
elsif !@
|
110
|
+
if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
|
111
|
+
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
|
112
|
+
elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
|
104
113
|
add_synonym_from_core(parent_id, r)
|
105
114
|
else
|
106
|
-
taxon = @res[r[@
|
107
|
-
taxon.id = r[@
|
108
|
-
taxon.current_name = r[@
|
109
|
-
taxon.current_name_canonical =
|
115
|
+
taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
|
116
|
+
taxon.id = r[@core_fields[:id]]
|
117
|
+
taxon.current_name = r[@core_fields[:scientificname]]
|
118
|
+
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
110
119
|
taxon.parent_id = r[parent_id]
|
111
|
-
taxon.rank = r[@
|
112
|
-
taxon.status = r[@
|
120
|
+
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
121
|
+
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
113
122
|
end
|
114
123
|
end
|
115
124
|
end
|
116
125
|
|
117
126
|
def parent_id
|
118
|
-
parent_id_field = @
|
127
|
+
parent_id_field = @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
|
119
128
|
end
|
120
129
|
|
121
130
|
def calculate_classification_path
|
@@ -172,10 +181,11 @@ class DarwinCore
|
|
172
181
|
ext, fields = *extension
|
173
182
|
ext.read[0].each_with_index do |r, i|
|
174
183
|
count = i + 1
|
184
|
+
set_scientific_name(r, fields)
|
175
185
|
puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
|
176
186
|
@res[r[fields[:id]]].synonyms << SynonymNormalized.new(
|
177
187
|
r[fields[:scientificname]],
|
178
|
-
|
188
|
+
r[fields[:canonicalname]],
|
179
189
|
fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
|
180
190
|
end
|
181
191
|
end
|
Binary file
|
@@ -98,6 +98,17 @@ describe DarwinCore do
|
|
98
98
|
norm = dwc.normalize_classification
|
99
99
|
norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
|
100
100
|
end
|
101
|
+
|
102
|
+
it "should be able work with files which have scientificNameAuthorship" do
|
103
|
+
file = File.join(@file_dir, 'sci_name_authorship.tar.gz')
|
104
|
+
dwc = DarwinCore.new(file)
|
105
|
+
$lala = 1
|
106
|
+
norm = dwc.normalize_classification
|
107
|
+
taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
|
108
|
+
taxa.size.should == 507
|
109
|
+
syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size > s.canonical_name.split(" ").size}
|
110
|
+
syn.size.should == 50
|
111
|
+
end
|
101
112
|
end
|
102
113
|
|
103
114
|
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 4
|
8
|
-
- 13
|
9
|
-
version: 0.4.13
|
9
|
+
- 14
|
10
|
+
version: 0.4.14
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Dmitry Mozzherin
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-10-
|
18
|
+
date: 2010-10-08 00:00:00 -04:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -25,6 +26,7 @@ dependencies:
|
|
25
26
|
requirements:
|
26
27
|
- - ">="
|
27
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
28
30
|
segments:
|
29
31
|
- 0
|
30
32
|
version: "0"
|
@@ -38,6 +40,7 @@ dependencies:
|
|
38
40
|
requirements:
|
39
41
|
- - ">="
|
40
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 13
|
41
44
|
segments:
|
42
45
|
- 1
|
43
46
|
- 2
|
@@ -53,6 +56,7 @@ dependencies:
|
|
53
56
|
requirements:
|
54
57
|
- - ">="
|
55
58
|
- !ruby/object:Gem::Version
|
59
|
+
hash: 3
|
56
60
|
segments:
|
57
61
|
- 0
|
58
62
|
version: "0"
|
@@ -104,6 +108,7 @@ files:
|
|
104
108
|
- spec/files/junk_dir_inside.zip
|
105
109
|
- spec/files/meta.xml
|
106
110
|
- spec/files/minimal.tar.gz
|
111
|
+
- spec/files/sci_name_authorship.tar.gz
|
107
112
|
- spec/files/synonyms_in_core_accepted_name_field.tar.gz
|
108
113
|
- spec/files/synonyms_in_extension.tar.gz
|
109
114
|
- spec/files/uncompressed
|
@@ -125,6 +130,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
125
130
|
requirements:
|
126
131
|
- - ">="
|
127
132
|
- !ruby/object:Gem::Version
|
133
|
+
hash: 3
|
128
134
|
segments:
|
129
135
|
- 0
|
130
136
|
version: "0"
|
@@ -133,6 +139,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
133
139
|
requirements:
|
134
140
|
- - ">="
|
135
141
|
- !ruby/object:Gem::Version
|
142
|
+
hash: 3
|
136
143
|
segments:
|
137
144
|
- 0
|
138
145
|
version: "0"
|