dwc-archive 0.4.13 → 0.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/dwc-archive/classification_normalizer.rb +27 -17
- data/spec/files/sci_name_authorship.tar.gz +0 -0
- data/spec/lib/dwc-archive_spec.rb +11 -0
- metadata +10 -3
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.4.13
|
1
|
+
0.4.14
|
@@ -27,7 +27,7 @@ class DarwinCore
|
|
27
27
|
|
28
28
|
def initialize(dwc_instance, verbose = false)
|
29
29
|
@dwc = dwc_instance
|
30
|
-
@
|
30
|
+
@core_fields = get_fields(@dwc.core)
|
31
31
|
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
|
32
32
|
@res = {}
|
33
33
|
@parser = ParsleyStore.new(1,2)
|
@@ -56,7 +56,7 @@ class DarwinCore
|
|
56
56
|
|
57
57
|
private
|
58
58
|
|
59
|
-
def
|
59
|
+
def get_canonical_name(a_scientific_name)
|
60
60
|
if R19
|
61
61
|
a_scientific_name.force_encoding('utf-8')
|
62
62
|
end
|
@@ -84,38 +84,47 @@ class DarwinCore
|
|
84
84
|
def add_synonym_from_core(taxon_id, row)
|
85
85
|
taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new
|
86
86
|
taxon.synonyms << SynonymNormalized.new(
|
87
|
-
row[@
|
88
|
-
|
89
|
-
@
|
87
|
+
row[@core_fields[:scientificname]],
|
88
|
+
row[@core_fields[:canonicalname]],
|
89
|
+
@core_fields[:taxonomicstatus] ? row[@core_fields[:taxonomicstatus]] : nil)
|
90
|
+
end
|
91
|
+
|
92
|
+
def set_scientific_name(row, fields)
|
93
|
+
canonical_name = fields[:scientificnameauthorship] ? row[fields[:scientificname]] : get_canonical_name(row[fields[:scientificname]])
|
94
|
+
fields[:canonicalname] = row.size
|
95
|
+
row << canonical_name
|
96
|
+
scientific_name = (fields[:scientificnameauthorship] && row[fields[:scientificnameauthorship]].to_s.strip != '') ? row[fields[:scientificname]].strip + ' ' + row[fields[:scientificnameauthorship]].strip : row[fields[:scientificname]].strip
|
97
|
+
row[fields[:scientificname]] = scientific_name
|
90
98
|
end
|
91
99
|
|
92
100
|
def ingest_core
|
93
|
-
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@
|
101
|
+
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
|
94
102
|
puts "Reading core information" if @verbose
|
95
103
|
rows = @dwc.core.read[0]
|
96
104
|
puts "Ingesting information from the core" if @verbose
|
97
105
|
rows.each_with_index do |r, i|
|
98
106
|
count = i + 1
|
107
|
+
set_scientific_name(r, @core_fields)
|
99
108
|
puts "Ingesting %s'th record" % count if @verbose and count % @verbose_count == 0
|
100
109
|
#core has AcceptedNameUsageId
|
101
|
-
if @
|
102
|
-
add_synonym_from_core(@
|
103
|
-
elsif !@
|
110
|
+
if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
|
111
|
+
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
|
112
|
+
elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
|
104
113
|
add_synonym_from_core(parent_id, r)
|
105
114
|
else
|
106
|
-
taxon = @res[r[@
|
107
|
-
taxon.id = r[@
|
108
|
-
taxon.current_name = r[@
|
109
|
-
taxon.current_name_canonical =
|
115
|
+
taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
|
116
|
+
taxon.id = r[@core_fields[:id]]
|
117
|
+
taxon.current_name = r[@core_fields[:scientificname]]
|
118
|
+
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
110
119
|
taxon.parent_id = r[parent_id]
|
111
|
-
taxon.rank = r[@
|
112
|
-
taxon.status = r[@
|
120
|
+
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
121
|
+
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
|
113
122
|
end
|
114
123
|
end
|
115
124
|
end
|
116
125
|
|
117
126
|
def parent_id
|
118
|
-
parent_id_field = @
|
127
|
+
parent_id_field = @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
|
119
128
|
end
|
120
129
|
|
121
130
|
def calculate_classification_path
|
@@ -172,10 +181,11 @@ class DarwinCore
|
|
172
181
|
ext, fields = *extension
|
173
182
|
ext.read[0].each_with_index do |r, i|
|
174
183
|
count = i + 1
|
184
|
+
set_scientific_name(r, fields)
|
175
185
|
puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
|
176
186
|
@res[r[fields[:id]]].synonyms << SynonymNormalized.new(
|
177
187
|
r[fields[:scientificname]],
|
178
|
-
|
188
|
+
r[fields[:canonicalname]],
|
179
189
|
fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
|
180
190
|
end
|
181
191
|
end
|
Binary file
|
@@ -98,6 +98,17 @@ describe DarwinCore do
|
|
98
98
|
norm = dwc.normalize_classification
|
99
99
|
norm.select { |k,v| !v.synonyms.empty? }.map { |k,v| v.synonyms }.size.should > 0
|
100
100
|
end
|
101
|
+
|
102
|
+
it "should be able work with files which have scientificNameAuthorship" do
|
103
|
+
file = File.join(@file_dir, 'sci_name_authorship.tar.gz')
|
104
|
+
dwc = DarwinCore.new(file)
|
105
|
+
$lala = 1
|
106
|
+
norm = dwc.normalize_classification
|
107
|
+
taxa = norm.select{|k,v| v.current_name_canonical.match " "}.select{|k,v| [v.current_name.split(" ").size > v.current_name_canonical.split(" ").size]}
|
108
|
+
taxa.size.should == 507
|
109
|
+
syn = norm.select{|k,v| v.synonyms.size > 0}.map {|k,v| v.synonyms}.flatten.select {|s| s.name.split(" ").size > s.canonical_name.split(" ").size}
|
110
|
+
syn.size.should == 50
|
111
|
+
end
|
101
112
|
end
|
102
113
|
|
103
114
|
end
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
+
hash: 19
|
4
5
|
prerelease: false
|
5
6
|
segments:
|
6
7
|
- 0
|
7
8
|
- 4
|
8
|
-
- 13
|
9
|
-
version: 0.4.13
|
9
|
+
- 14
|
10
|
+
version: 0.4.14
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Dmitry Mozzherin
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2010-10-
|
18
|
+
date: 2010-10-08 00:00:00 -04:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -25,6 +26,7 @@ dependencies:
|
|
25
26
|
requirements:
|
26
27
|
- - ">="
|
27
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
28
30
|
segments:
|
29
31
|
- 0
|
30
32
|
version: "0"
|
@@ -38,6 +40,7 @@ dependencies:
|
|
38
40
|
requirements:
|
39
41
|
- - ">="
|
40
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 13
|
41
44
|
segments:
|
42
45
|
- 1
|
43
46
|
- 2
|
@@ -53,6 +56,7 @@ dependencies:
|
|
53
56
|
requirements:
|
54
57
|
- - ">="
|
55
58
|
- !ruby/object:Gem::Version
|
59
|
+
hash: 3
|
56
60
|
segments:
|
57
61
|
- 0
|
58
62
|
version: "0"
|
@@ -104,6 +108,7 @@ files:
|
|
104
108
|
- spec/files/junk_dir_inside.zip
|
105
109
|
- spec/files/meta.xml
|
106
110
|
- spec/files/minimal.tar.gz
|
111
|
+
- spec/files/sci_name_authorship.tar.gz
|
107
112
|
- spec/files/synonyms_in_core_accepted_name_field.tar.gz
|
108
113
|
- spec/files/synonyms_in_extension.tar.gz
|
109
114
|
- spec/files/uncompressed
|
@@ -125,6 +130,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
125
130
|
requirements:
|
126
131
|
- - ">="
|
127
132
|
- !ruby/object:Gem::Version
|
133
|
+
hash: 3
|
128
134
|
segments:
|
129
135
|
- 0
|
130
136
|
version: "0"
|
@@ -133,6 +139,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
133
139
|
requirements:
|
134
140
|
- - ">="
|
135
141
|
- !ruby/object:Gem::Version
|
142
|
+
hash: 3
|
136
143
|
segments:
|
137
144
|
- 0
|
138
145
|
version: "0"
|