dwca_hunter 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.byebug_history +31 -0
- data/.document +5 -0
- data/.gitignore +58 -0
- data/.rspec +3 -0
- data/.rubocop.yml +33 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +133 -0
- data/LICENSE.txt +20 -0
- data/README.md +39 -0
- data/Rakefile +11 -0
- data/dwca_hunter.gemspec +42 -0
- data/exe/dwcahunter +77 -0
- data/files/birdlife_7.csv +11862 -0
- data/files/fishbase_taxon_cache.tsv +81000 -0
- data/files/reptile_checklist_2014_12.csv +15158 -0
- data/lib/dwca_hunter/downloader.rb +60 -0
- data/lib/dwca_hunter/encoding.rb +17 -0
- data/lib/dwca_hunter/resource.rb +101 -0
- data/lib/dwca_hunter/resources/arctos.rb +222 -0
- data/lib/dwca_hunter/resources/birdlife.rb +160 -0
- data/lib/dwca_hunter/resources/fishbase.rb +99 -0
- data/lib/dwca_hunter/resources/freebase.rb +152 -0
- data/lib/dwca_hunter/resources/gnub.rb +101 -0
- data/lib/dwca_hunter/resources/itis.rb +271 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
- data/lib/dwca_hunter/resources/ncbi.rb +174 -0
- data/lib/dwca_hunter/resources/opentree.rb +121 -0
- data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
- data/lib/dwca_hunter/resources/worms.rb +176 -0
- data/lib/dwca_hunter/url.rb +33 -0
- data/lib/dwca_hunter/version.rb +7 -0
- data/lib/dwca_hunter/xml.rb +33 -0
- data/lib/dwca_hunter.rb +53 -0
- metadata +250 -0
@@ -0,0 +1,152 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
class ResourceFreebase < DwcaHunter::Resource
|
5
|
+
# Sets the Freebase resource identity (command, title, UUID), the location
# of the downloaded JSON file and the empty working containers, then passes
# +opts+ on to the parent initializer through a bare +super+.
#
# @param opts [Hash] options forwarded unchanged to the superclass
def initialize(opts = {})
  path_parts = [Dir.tmpdir, 'dwca_hunter', 'freebase', 'data.json']

  @command = "freebase"
  @title = 'Freebase'
  @uuid = 'bacd21f0-44e0-43e2-914c-70929916f257'
  @download_path = File.join(*path_parts)

  # Working state filled in by download / organize_data / generate_dwca.
  @data = []
  @extensions = []
  @all_taxa = {}
  @cleaned_taxa = {}
  super
end
|
19
|
+
|
20
|
+
# The Freebase download is written as a plain JSON file by #download,
# so there is no archive to unpack.
#
# @return [Boolean] always false
def needs_unpack?
  false
end
|
23
|
+
|
24
|
+
# Builds the archive in two steps: load and clean the downloaded records,
# then emit the DarwinCore Archive.
#
# @return the value returned by generate_dwca
def make_dwca
  organize_data
  generate_dwca
end
|
28
|
+
|
29
|
+
# Queries Freebase for every organism classification record and stores the
# accumulated results as pretty-printed JSON at @download_path.
#
# The actual paging is done by run_query, which appends records to @data.
# The file is written through the block form of File.open so the handle is
# closed even if serialising or writing raises.
def download
  DwcaHunter::logger_write(self.object_id,
                           'Querying freebase for species information...')
  q = {
    query: [{
      type: '/biology/organism_classification',
      id: nil,
      guid: nil,
      name: nil,
      scientific_name: nil,
      synonym_scientific_name: [],
      higher_classification: {
        id: nil,
        guid: nil,
        scientific_name: nil,
        optional: true
      }
    }],
    cursor: true
  }

  run_query(q)

  File.open(@download_path, 'w:utf-8') do |f|
    f.write(JSON.pretty_generate(@data))
  end
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
# Pages through the Freebase MQL read service, appending every returned
# record to @data.
#
# The query envelope +q+ is mutated: after each page its :cursor is set to
# the cursor returned by the service so the next request continues from it.
# Paging stops when a page has no results or when the response carries no
# cursor (nil/false), so the same page is never requested twice.
#
# @param q [Hash] MQL envelope with :query and :cursor keys
def run_query(q)
  count = 0
  requests_num = 0
  loop do
    # URI.encode was removed in Ruby 3.0; form-component encoding also
    # escapes '&' and '=' which may occur inside the JSON payload.
    freebase_url = "http://api.freebase.com/api/service/mqlread?query=%s" %
                   URI.encode_www_form_component(q.to_json)
    # JSON.parse (not JSON.load) because the payload comes from a remote host.
    res = JSON.parse(RestClient.get(freebase_url).to_s)
    requests_num += 1
    results = res['result']
    break if results.nil? || results.empty?

    @data.concat(results)
    count += results.size
    if requests_num % 10 == 0
      # Logged after the count is updated so the number includes this page.
      DwcaHunter::logger_write(self.object_id,
                               "Received %s names" % count)
    end

    break unless res['cursor']
    q[:cursor] = res['cursor']
  end
end
|
78
|
+
|
79
|
+
# Loads the downloaded JSON and builds two lookup tables:
#
# * @all_taxa     - every record, keyed by Freebase id
# * @cleaned_taxa - only records with a non-blank scientific name, with
#                   :parent_id rewritten to the nearest ancestor that also
#                   has a non-blank name (or nil)
#
# Records with blank names are left out of @cleaned_taxa, so a child that
# pointed at one would otherwise reference a parent id missing from the
# output; the ancestor walk in nearest_named_ancestor prevents that.
# Synonyms are normalised to an Array so later iteration cannot hit nil.
def organize_data
  @data = JSON.parse(File.read(@download_path, encoding: 'utf-8'))
  @data.each do |d|
    id = d["id"]
    parent = d['higher_classification']
    @all_taxa[id] = { id: id,
                      parent_id: parent ? parent["id"] : nil,
                      scientific_name: d['scientific_name'].to_s,
                      synonyms: Array(d['synonym_scientific_name']) }
  end

  @all_taxa.each do |k, v|
    next if v[:scientific_name].strip.empty?
    v[:parent_id] = nearest_named_ancestor(v)
    @cleaned_taxa[k] = v
  end
end

# Walks up the parent chain of +taxon+ and returns the id of the first
# ancestor that exists in @all_taxa and has a non-blank scientific name.
# Returns nil when the chain ends, references an unknown id, points back
# at the taxon itself, or loops.
def nearest_named_ancestor(taxon)
  visited = { taxon[:id] => true }
  parent_id = taxon[:parent_id]
  until parent_id.nil?
    return nil if visited[parent_id] # self-reference or cycle
    visited[parent_id] = true
    parent = @all_taxa[parent_id]
    unless parent
      puts "did not find parent %s" % parent_id
      return nil
    end
    return parent_id unless parent[:scientific_name].strip.empty?
    parent_id = parent[:parent_id]
  end
  nil
end
|
108
|
+
|
109
|
+
# Fills @core, a synonyms extension in @extensions and the @eml metadata
# from @cleaned_taxa, then delegates to the parent generate_dwca.
#
# Core columns:      taxonID, scientificName, parentNameUsageID
# Extension columns: taxonID, scientificName (one row per synonym)
#
# The extension header uses the Darwin Core spelling "taxonID"; the previous
# "TaxonID" is not a Darwin Core term.
def generate_dwca
  DwcaHunter::logger_write(self.object_id,
                           'Creating DarwinCore Archive file')
  @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
            'http://rs.tdwg.org/dwc/terms/scientificName',
            'http://rs.tdwg.org/dwc/terms/parentNameUsageID']]

  synonyms_extension = { data: [[
    'http://rs.tdwg.org/dwc/terms/taxonID',
    'http://rs.tdwg.org/dwc/terms/scientificName'
  ]], file_name: 'synonyms.txt' }
  @extensions << synonyms_extension
  DwcaHunter::logger_write(self.object_id,
                           'Creating synonyms extension for DarwinCore Archive file')
  count = 0
  @cleaned_taxa.each_value do |taxon|
    count += 1
    @core << [taxon[:id], taxon[:scientific_name], taxon[:parent_id]]
    if count % BATCH_SIZE == 0
      DwcaHunter::logger_write(self.object_id,
                               "Traversing %s extension data record" % count)
    end
    # Array() tolerates a taxon whose synonym list is nil.
    Array(taxon[:synonyms]).each do |name|
      synonyms_extension[:data] << [taxon[:id], name]
    end
  end
  @eml = {
    id: @uuid,
    title: @title,
    license: 'http://creativecommons.org/licenses/by-sa/3.0/',
    authors: [
      { url: 'http://www.freebase.com/home' }],
    abstract: 'An entity graph of people, places and things, ' +
              'built by a community that loves open data.',
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@mbl.edu' }],
    url: 'http://www.freebase.com/home'
  }
  super
end
|
150
|
+
|
151
|
+
end
|
152
|
+
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
|
3
|
+
class ResourceGNUB < DwcaHunter::Resource
|
4
|
+
# Sets the GNUB resource identity, download URL and path, and the empty
# working containers, then calls the parent initializer with +opts+.
#
# The dataset UUID is stored in @uuid, which is the variable generate_dwca
# reads for the EML id; it was previously assigned to @UUID and therefore
# never reached the metadata.
#
# NOTE(review): the URL points at a .zip while the local file is named
# data.tar.gz; unpack calls unpack_zip, so the name looks like a leftover.
# Confirm against the parent class before renaming.
#
# @param opts [Hash] options forwarded to the superclass
def initialize(opts = {})
  @command = 'gnub'
  @title = 'GNUB'
  @url = 'http://gnub.org/datadump/gni_export.zip'
  @uuid = 'd34ed224-78e7-485d-a478-adc2558a0f68'
  @download_path = File.join(Dir.tmpdir,
                             'dwca_hunter',
                             'gnub',
                             'data.tar.gz')
  @ranks = {}
  @kingdoms = {}
  @authors = {}
  @vernaculars = {}
  @synonyms = {}
  @synonym_of = {}
  @names = []
  @extensions = []
  super(opts)
  # @download_dir is expected to be set by the parent initializer.
  @gnub_dir = File.join(@download_dir, 'gnub')
end
|
24
|
+
|
25
|
+
# The GNUB dump is distributed as a zip archive, so unpacking is delegated
# to unpack_zip.
#
# @return the value returned by unpack_zip
def unpack
  unpack_zip
end
|
28
|
+
|
29
|
+
# Logs the start of extraction, reads the name records and then writes
# the DarwinCore Archive.
#
# @return the value returned by generate_dwca
def make_dwca
  DwcaHunter::logger_write(self.object_id, 'Extracting data')
  get_names
  generate_dwca
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
# Reads the first *.txt file found in @download_dir (tab-separated, first
# line is a header) and appends one hash per data row to @names.
#
# Columns read: 0 protolog, 1 protolog path, 2 name string, 3 rank,
# 4 nomenclatural code id (translated through get_codes).
# The taxon id is a v5 UUID derived from name string + protolog path + rank.
#
# The file is opened in block form so the handle is always closed. Rows
# that are short (trailing empty columns are dropped by strip/split) no
# longer raise: missing text columns are treated as empty strings.
#
# @raise [RuntimeError] when no .txt file is present in @download_dir
def get_names
  codes = get_codes
  file = Dir.entries(@download_dir).grep(/txt$/).first
  raise "No .txt data file found in #{@download_dir}" unless file

  File.open(File.join(@download_dir, file)) do |f|
    f.each_with_index do |line, i|
      next if i == 0 # header row
      row = line.strip
      next if row == ''
      data = row.split("\t")
      protolog = data[0].downcase
      protolog_path = data[1].to_s.downcase
      name_string = data[2].to_s
      rank = data[3].to_s
      code = codes[data[4].to_i]
      taxon_id = UUID.create_v5(name_string +
                                protolog_path +
                                rank, GNA_NAMESPACE)
      @names << { taxon_id: taxon_id,
                  name_string: name_string,
                  protolog: protolog,
                  protolog_path: protolog_path,
                  code: code,
                  rank: rank }
    end
  end
end
|
59
|
+
|
60
|
+
# Downloads the list of nomenclatural codes from the Global Names resolver
# and turns it into a lookup table.
#
# @return [Hash] maps each entry's :id to its :code
def get_codes
  codes_url = 'http://resolver.globalnames.org/nomenclatural_codes.json'
  payload = RestClient.get(codes_url)
  entries = JSON.parse(payload, symbolize_names: true)
  entries.each_with_object({}) do |entry, lookup|
    lookup[entry[:id]] = entry[:code]
  end
end
|
69
|
+
|
70
|
+
# Fills @core and @eml from @names and delegates to the parent
# generate_dwca.
#
# Core columns, in order: taxonID, originalNameUsageID,
# originalNameUsageIDPath, scientificName, nomenclaturalCode, taxonRank.
#
# Row values are emitted in the same order as the header. Previously the
# name string and the protolog path were swapped, so names were written
# under originalNameUsageIDPath and paths under scientificName.
def generate_dwca
  DwcaHunter::logger_write(self.object_id,
                           'Creating DarwinCore Archive file')
  @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
            'http://rs.tdwg.org/dwc/terms/originalNameUsageID',
            'http://globalnames.org/terms/originalNameUsageIDPath',
            'http://rs.tdwg.org/dwc/terms/scientificName',
            'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
            'http://rs.tdwg.org/dwc/terms/taxonRank']]
  @names.each do |n|
    @core << [n[:taxon_id], n[:protolog], n[:protolog_path],
              n[:name_string], n[:code], n[:rank]]
  end
  @eml = {
    id: @uuid,
    title: @title,
    authors: [
      { email: 'deepreef@bishopmuseum.org' }
    ],
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@gmail.com' }
    ],
    abstract: 'Global Names Usage Bank',
    url: 'http://www.zoobank.org'
  }
  super
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
@@ -0,0 +1,271 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
|
3
|
+
class ResourceITIS < DwcaHunter::Resource
|
4
|
+
# Sets the ITIS resource identity, download URL and path, and the empty
# lookup tables filled by the get_* readers, then calls the parent
# initializer with +opts+ and derives the directory holding the extracted
# ITIS tables.
#
# @param opts [Hash] options forwarded to the superclass
def initialize(opts = {})
  @command = 'itis'
  @title = 'ITIS'
  @uuid = '5d066e84-e512-4a2f-875c-0a605d3d9f35'
  @url = 'https://www.itis.gov/downloads/itisMySQLTables.tar.gz'
  @download_path = File.join(Dir.tmpdir, 'dwca_hunter', 'itis', 'data.tar.gz')

  # Lookup tables, one per ITIS table that is read.
  @ranks = {}
  @kingdoms = {}
  @authors = {}
  @vernaculars = {}
  @synonyms = {}
  @synonym_of = {}
  @names = {}

  @extensions = []
  super(opts)
  # Computed after super because it builds on @download_dir.
  @itis_dir = File.join(@download_dir, 'itis')
end
|
24
|
+
|
25
|
+
# Extracts the tarball, renames the extracted "itisMySQL..." directory to
# @itis_dir and leaves a marker file recording the original directory name.
def unpack
  unpack_tar
  extracted = Dir.entries(@download_dir).find { |entry| entry.match(/itisMySQL/) }
  source = File.join(@download_dir, extracted)
  FileUtils.mv(source, @itis_dir)

  # Empty file named after the extracted directory, prefixed with "version_".
  marker = File.join(@itis_dir, 'version_' + extracted)
  FileUtils.touch(marker)
end
|
33
|
+
|
34
|
+
# Logs the start of extraction, runs every table reader in order and then
# writes the DarwinCore Archive.
#
# @return the value returned by generate_dwca
def make_dwca
  DwcaHunter::logger_write(self.object_id, 'Extracting data')
  # get_names runs last: it looks up @authors and @ranks built earlier.
  %i[get_ranks get_kingdoms get_authors
     get_vernaculars get_synonyms get_names].each { |reader| send(reader) }
  generate_dwca
end
|
44
|
+
|
45
|
+
private
|
46
|
+
# Reads the pipe-delimited taxon_unit_types table into @ranks, keyed by
# "kingdom_id/rank_id" with the rank name as value.
#
# Table columns:
#   0 kingdom_id          integer  not null
#   1 rank_id             smallint not null
#   2 rank_name           char(15) not null
#   3 dir_parent_rank_id  smallint not null
#   4 req_parent_rank_id  smallint not null
#   5 update_date         date     not null
#
# The file is opened in block form so the handle is closed when reading
# finishes or fails, and lines with fewer than three columns (for example
# a blank line) are skipped instead of raising.
def get_ranks
  rank_file = File.join(@itis_dir, 'taxon_unit_types')
  File.open(rank_file, 'r:utf-8') do |f|
    f.each do |l|
      # Bytes are transcoded as ISO-8859-1 regardless of the read encoding.
      l.encode!('UTF-8',
                'ISO-8859-1',
                invalid: :replace,
                replace: '?')
      row = l.strip.split('|')
      next if row.size < 3
      @ranks[row[0].strip + '/' + row[1].strip] = row[2].strip
    end
  end
end
|
64
|
+
|
65
|
+
# Reads the pipe-delimited kingdoms table into @kingdoms, mapping
# kingdom id to kingdom name.
#
# Table columns:
#   0 kingdom_id    serial   not null
#   1 kingdom_name  char(10) not null
#   2 update_date   date     not null
#
# File.foreach closes the file once iteration ends; lines with fewer than
# two columns (for example a blank line) are skipped instead of raising.
def get_kingdoms
  File.foreach(File.join(@itis_dir, 'kingdoms')) do |l|
    data = l.strip.split('|')
    next if data.size < 2
    @kingdoms[data[0].strip] = data[1].strip
  end
end
|
76
|
+
|
77
|
+
# Reads the pipe-delimited taxon_authors_lkp table into @authors, mapping
# author id to the author string.
#
# Table columns:
#   0 taxon_author_id  serial          not null
#   1 taxon_author     varchar(100,30) not null
#   2 update_date      date            not null
#   3 kingdom_id       smallint        not null
#
# The file is opened in block form so the handle is always closed; lines
# with fewer than two columns are skipped instead of raising.
def get_authors
  File.open(File.join(@itis_dir, 'taxon_authors_lkp')) do |f|
    f.each do |l|
      # Bytes are transcoded as ISO-8859-1 regardless of the read encoding.
      l.encode!('UTF-8',
                'ISO-8859-1',
                invalid: :replace,
                replace: '?')
      data = l.strip.split('|')
      next if data.size < 2
      @authors[data[0].strip] = data[1].strip
    end
  end
end
|
93
|
+
|
94
|
+
# Reads the pipe-delimited vernaculars table into @vernaculars, keyed by
# TSN, with { name:, language: } as value. The language "unspecified" is
# stored as "Common name". Progress is logged every BATCH_SIZE lines.
#
# Table columns:
#   0 tsn              integer       not null
#   1 vernacular_name  varchar(80,5) not null
#   2 language         varchar(15)   not null
#   3 approved_ind     char(1)
#   4 update_date      date          not null
#   primary key (tsn, vernacular_name, language)
#
# The file is opened in block form so the handle is always closed, and
# lines without a TSN (for example blank lines) are skipped rather than
# stored under an empty key.
#
# NOTE(review): the table's key is (tsn, vernacular_name, language) but
# @vernaculars keeps one entry per TSN, so a later line for the same TSN
# replaces the earlier one. Keeping every name needs a matching change in
# generate_dwca, so the structure is left as is here.
def get_vernaculars
  File.open(File.join(@itis_dir, 'vernaculars')) do |f|
    f.each_with_index do |l, i|
      if i % BATCH_SIZE == 0
        DwcaHunter::logger_write(self.object_id,
                                 "Extracted %s vernacular names" % i)
      end
      # Bytes are transcoded as ISO-8859-1 regardless of the read encoding.
      l.encode!('UTF-8',
                'ISO-8859-1',
                invalid: :replace,
                replace: '?')
      data = l.split('|').map { |d| d.strip }
      name_tsn = data[0]
      next if name_tsn.to_s.empty?
      string = data[1]
      language = data[2]
      language = 'Common name' if language == 'unspecified'
      @vernaculars[name_tsn] = { name: string, language: language }
    end
  end
end
|
121
|
+
|
122
|
+
# Reads the pipe-delimited synonym_links table into @synonyms, mapping the
# TSN of a synonym to the TSN of its accepted name. Progress is logged
# every BATCH_SIZE lines.
#
# Table columns:
#   0 tsn           integer not null
#   1 tsn_accepted  integer not null
#   2 update_date   date    not null
#
# The file is opened in block form so the handle is always closed, and
# lines without a TSN (for example blank lines) are skipped rather than
# stored under an empty key.
def get_synonyms
  File.open(File.join(@itis_dir, 'synonym_links')) do |f|
    f.each_with_index do |l, i|
      if i % BATCH_SIZE == 0
        DwcaHunter::logger_write(self.object_id,
                                 "Extracted %s synonyms" % i)
      end
      # Bytes are transcoded as ISO-8859-1 regardless of the read encoding.
      l.encode!('UTF-8',
                'ISO-8859-1',
                invalid: :replace,
                replace: '?')
      data = l.split('|').map { |d| d.strip }
      synonym_name_tsn = data[0]
      next if synonym_name_tsn.to_s.empty?
      accepted_name_tsn = data[1]
      @synonyms[synonym_name_tsn] = accepted_name_tsn
    end
  end
end
|
143
|
+
|
144
|
+
# Reads the pipe-delimited taxonomic_units table into @names, keyed by
# TSN. Each value holds the full name (name parts plus author, when the
# author id is known in @authors), the canonical name (name parts only),
# the usage status, the parent TSN (nil when blank) and the rank name
# looked up in @ranks by "kingdom_id/rank_id" ('' when unknown).
# Progress is logged every BATCH_SIZE lines.
#
# Table columns:
#    0 tsn                 serial   not null
#    1 unit_ind1           char(1)
#    2 unit_name1          char(35) not null
#    3 unit_ind2           char(1)
#    4 unit_name2          varchar(35)
#    5 unit_ind3           varchar(7)
#    6 unit_name3          varchar(35)
#    7 unit_ind4           varchar(7)
#    8 unit_name4          varchar(35)
#    9 unnamed_taxon_ind   char(1)
#   10 usage               varchar(12,5) not null
#   11 unaccept_reason     varchar(50,9)
#   12 credibility_rtng    varchar(40,17) not null
#   13 completeness_rtng   char(10)
#   14 currency_rating     char(7)
#   15 phylo_sort_seq      smallint
#   16 initial_time_stamp  datetime year to second not null
#   17 parent_tsn          integer
#   18 taxon_author_id     integer
#   19 hybrid_author_id    integer
#   20 kingdom_id          smallint not null
#   21 rank_id             smallint not null
#   22 update_date         date not null
#   23 uncertain_prnt_ind  char(3)
#
# The file is opened in block form so the handle is always closed. Lines
# without a TSN are skipped, and a row too short to carry kingdom/rank
# ids yields an empty rank instead of raising.
def get_names
  File.open(File.join(@itis_dir, 'taxonomic_units')) do |f|
    f.each_with_index do |l, i|
      if i % BATCH_SIZE == 0
        DwcaHunter::logger_write(self.object_id,
                                 "Extracted %s names" % i)
      end
      # Bytes are transcoded as ISO-8859-1 regardless of the read encoding.
      l.encode!('UTF-8',
                'ISO-8859-1',
                invalid: :replace,
                replace: '?')
      data = l.split("|").map { |d| d.strip }
      name_tsn = data[0]
      next if name_tsn.to_s.empty?

      status = data[10]
      parent_tsn = data[17]
      author_id = data[18]
      kingdom_id = data[20]
      rank_id = data[21]

      parent_tsn = nil if parent_tsn == ''
      # Columns 1..8 alternate indicators/markers and name parts.
      name_parts = data[1..8] || []
      canonical_name = name_parts.join(' ').strip.gsub(/\s+/, ' ')
      full_parts = name_parts.dup
      full_parts << @authors[author_id] if @authors[author_id]
      name = full_parts.join(' ').strip.gsub(/\s+/, ' ')
      rank = @ranks["#{kingdom_id}/#{rank_id}"] || ''
      @names[name_tsn] = { name: name,
                           canonical_name: canonical_name,
                           status: status,
                           parent_tsn: parent_tsn,
                           rank: rank }
    end
  end
end
|
213
|
+
|
214
|
+
# Fills @core from @names/@synonyms, appends a vernacular-name extension
# built from @vernaculars, sets the @eml metadata and delegates to the
# parent generate_dwca.
#
# Core columns: taxonID, parentNameUsageID, acceptedNameUsageID,
# scientificName, nameComplete, taxonomicStatus, taxonRank.
def generate_dwca
  DwcaHunter::logger_write(self.object_id,
                           'Creating DarwinCore Archive file')
  core_header = ['http://rs.tdwg.org/dwc/terms/taxonID',
                 'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
                 'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
                 'http://rs.tdwg.org/dwc/terms/scientificName',
                 'http://rs.tdwg.org/ontology/voc/TaxonName#nameComplete',
                 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
                 'http://rs.tdwg.org/dwc/terms/taxonRank']
  vernacular_header = ['http://rs.tdwg.org/dwc/terms/taxonID',
                       'http://rs.tdwg.org/dwc/terms/vernacularName',
                       'http://purl.org/dc/terms/language']
  @core = [core_header]
  @extensions << {
    data: [vernacular_header],
    file_name: 'vernacular_names.txt',
    row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
  }

  @names.each do |tsn, record|
    accepted_id = @synonyms[tsn] || nil
    # A parent TSN that is blank or converts to 0 means "no parent".
    parent_id = record[:parent_tsn].to_i.zero? ? nil : record[:parent_tsn]
    @core << [tsn, parent_id, accepted_id, record[:name],
              record[:canonical_name], record[:status], record[:rank]]
  end

  # Rows go to the first registered extension.
  @vernaculars.each do |tsn, vernacular|
    @extensions[0][:data] << [tsn, vernacular[:name], vernacular[:language]]
  end

  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ email: 'itiswebmaster@itis.gov' }],
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@gmail.com' }
    ],
    abstract: 'The White House Subcommittee on Biodiversity and ' \
              'Ecosystem Dynamics has identified systematics as a ' \
              'research priority that is fundamental to ecosystem ' \
              'management and biodiversity conservation. This primary ' \
              'need identified by the Subcommittee requires ' \
              'improvements in the organization of, and access to, ' \
              'standardized nomenclature. ITIS (originally referred ' \
              'to as the Interagency Taxonomic Information System) ' \
              'was designed to fulfill these requirements. In the ' \
              'future, the ITIS will provide taxonomic data and a ' \
              'directory of taxonomic expertise that will support ' \
              'the system',
    url: 'http://www.itis.gov'
  }
  super
end
|
270
|
+
end
|
271
|
+
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module DwcaHunter
|
2
|
+
class ResourceMammalSpecies < DwcaHunter::Resource
|
3
|
+
# Sets the Mammal Species of the World resource identity, source URL,
# download path and working state, then passes +opts+ on to the parent
# initializer through a bare +super+.
#
# @count is the running numeric taxon id; id 1 is reserved for the root
# clade "Mammalia", which is pre-registered in @clades.
#
# @param opts [Hash] options forwarded unchanged to the superclass
def initialize(opts = {})
  @command = "mammal-species"
  @title = "The Mammal Species of The World"
  @uuid = "464dafec-1037-432d-8449-c0b309e0a030"
  @url = "http://www.departments.bucknell.edu"\
         "/biology/resources/msw3/export.asp"
  @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                             "mammalsp", "msw3-all.csv")
  @data = []
  @extensions = []
  @count = 1
  root = { rank: "class", id: @count }
  @clades = { "Mammalia" => root }
  super
end
|
17
|
+
|
18
|
+
# The source is downloaded as a plain CSV file, so nothing needs unpacking.
#
# @return [Boolean] always false
def needs_unpack?
  false
end
|
21
|
+
|
22
|
+
# Logs the start of extraction, converts the download to UTF-8, loads its
# rows and writes the DarwinCore Archive.
#
# @return the value returned by generate_dwca
def make_dwca
  DwcaHunter::logger_write(self.object_id, "Extracting data")
  # Order matters: collect_data reads the file that encode produces.
  encode
  collect_data
  generate_dwca
end
|
28
|
+
|
29
|
+
# Logs a notice and fetches the source file to @download_path with
# DwcaHunter::Downloader.
#
# NOTE(review): the address is read through a `url` reader rather than
# @url; presumably the parent class exposes it - confirm.
#
# @return the value returned by Downloader#download
def download
  notice = "Downloading file -- "\
           "it will take some time..."
  DwcaHunter::logger_write(self.object_id, notice)
  DwcaHunter::Downloader.new(url, @download_path).download
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
# Hands the downloaded file to DwcaHunter::Encoding.latin1_to_utf8.
# collect_data afterwards reads the same path with a ".utf_8" suffix.
#
# @return the value returned by latin1_to_utf8
def encode
  source_path = @download_path
  DwcaHunter::Encoding.latin1_to_utf8(source_path)
end
|
41
|
+
|
42
|
+
# Loads every row of the UTF-8 copy of the download (@download_path plus
# ".utf_8") into @data as a Hash keyed by symbolised header names.
#
# CSV options are passed as keyword arguments: handing the options Hash
# over positionally is taken as the file mode under Ruby 3 keyword
# separation. CSV.foreach also closes the file when iteration ends, which
# the previous block-less CSV.open did not.
def collect_data
  opts = { headers: true, header_converters: :symbol }
  CSV.foreach(@download_path + ".utf_8", **opts) do |row|
    @data << row.to_hash
  end
end
|
48
|
+
|
49
|
+
# Initialises the core table, the extensions and the EML metadata, then
# processes every record in @data (hierarchy first, then vernacular names
# and synonyms for the resulting taxon) and finally delegates to the
# parent generate_dwca.
def generate_dwca
  DwcaHunter::logger_write(self.object_id,
                           'Creating DarwinCore Archive file')
  core_init
  extensions_init
  eml_init

  @data.each do |record|
    # The taxon from process_hierarchy is handed to both follow-up steps.
    core_row = process_hierarchy(record)
    process_vernaculars(record, core_row)
    process_synonyms(record, core_row)
  end

  super
end
|
62
|
+
|
63
|
+
# Appends the record's common name, tagged as English, to the first
# extension's data. Does nothing when the record has no common name.
# U+0092 characters in the name are replaced with a plain apostrophe.
#
# @param rec [Hash] source row; reads :commonname
# @param taxon [Array] core row; its first element is used as the taxon id
def process_vernaculars(rec, taxon)
  common_name = rec[:commonname]
  return if common_name.to_s == ""

  cleaned = common_name.gsub("\u{0092}", "'")
  @extensions[0][:data] << [taxon[0], cleaned, "en"]
end
|
71
|
+
|
72
|
+
# Adds one "synonym" core row per synonym listed in the record, for taxa
# of rank species or subspecies only.
#
# The synonyms field is a ';'-separated list that may contain <i>/<b>
# markup and a trailing period. Entries containing "<u>" are skipped.
# An entry starting with a lower-case letter is prefixed with the
# record's genus. Each synonym gets the next id from @count and shares
# the accepted taxon's parent and rank.
#
# A record without synonyms (nil field) is ignored instead of raising,
# and blank fragments (e.g. from ";;") no longer produce empty-name rows.
#
# @param rec [Hash] source row; reads :synonyms and :genus
# @param taxon [Array] accepted core row (id first, parent id third,
#   rank last)
# @return [nil]
def process_synonyms(rec, taxon)
  accepted_id = taxon[0]
  parent_id = taxon[2]
  rank = taxon[-1]
  return unless ['species', 'subspecies'].include? rank

  cleaned = rec[:synonyms].to_s.gsub(/\.$/, "").
            gsub(/<[\/ib]+>/, "").gsub(/[\s]+/, " ")
  cleaned.split(";").each do |entry|
    s = entry.strip
    next if s.empty? || s.match(/<u>/)
    s = "#{rec[:genus]} #{s}" if s.match(/^[a-z]/)
    @count += 1
    @core << [@count, nil, parent_id, accepted_id, s, "synonym", rank]
  end
  nil
end
|
90
|
+
|
91
|
+
# Completes the most recently added core row: appends the record's author
# and date to its scientific name (collapsing whitespace) and stores the
# record's :id in the local-id column.
#
# @param rec [Hash] source row; reads :author, :date and :id
# @param rank [Symbol] accepted for the caller's convenience; not used here
# @return the record's :id
def process_name(rec, rank)
  row = @core.last
  with_authorship = [row[4], rec[:author], rec[:date]].join(" ")
  row[4] = with_authorship.gsub(/[\s]+/, " ").strip
  row[1] = rec[:id]
end
|
96
|
+
|
97
|
+
# Walks the record's classification from order down to subspecies,
# registering every clade not seen before in @clades and @core, and
# returns the core row of the record's own taxon.
#
# For each rank with a non-blank value the clade name is normalised
# (all-caps names are capitalised; species/subspecies become full names
# via adjust_clade). A new clade gets the next id from @count and a core
# row pointing at the previous clade as parent. Once the record's own rank
# (rec[:taxonlevel]) has been reached, the first clade handled is treated
# as the record's taxon: a new one is completed with process_name, an
# existing one is looked up in @core, and its row is returned.
#
# Changes from the previous version:
# * blank/nil rank values are skipped before any String method is called
#   on them, so missing columns no longer raise;
# * a nil :taxonlevel no longer raises;
# * when the record's taxon already exists, its existing core row is
#   returned instead of falling through.
#
# NOTE(review): if no clade is found at or below the record's rank, the
# method still falls through and returns the rank list from +each+, as
# before. Callers index into the result, so confirm whether such rows can
# occur in the source data.
#
# @param rec [Hash] source row keyed by rank symbols plus :taxonlevel
# @return [Array] the core row for the record's taxon
def process_hierarchy(rec)
  parent_id = @clades["Mammalia"][:id]
  row_rank = rec[:taxonlevel].to_s.downcase.to_sym
  is_row_rank = false
  [:order, :suborder, :infraorder, :superfamily, :family,
   :subfamily, :tribe, :genus, :subgenus,
   :species, :subspecies].each do |rank|
    is_row_rank = true if rank == row_rank
    clade = rec[rank].to_s
    next if clade == ""
    clade = clade.capitalize if clade.match(/^[A-Z]+$/)
    clade = adjust_clade(rec, rank, clade)
    if @clades.key?(clade)
      clade_id = @clades[clade][:id]
      if is_row_rank
        existing = @core.find { |row| row[0] == clade_id }
        return existing if existing
      end
    else
      @count += 1
      clade_id = @count
      @clades[clade] = { id: clade_id, rank: rank }
      @core << [clade_id, nil, parent_id, clade_id, clade, nil, rank.to_s]
      if is_row_rank
        process_name(rec, rank)
        return @core.last
      end
    end
    parent_id = clade_id
  end
end
|
124
|
+
|
125
|
+
# Returns the name to register for a clade. Species and subspecies are
# expanded to the full name built from the record's genus, species and
# (for subspecies) subspecies epithets, with whitespace collapsed; every
# other rank keeps the given clade name.
#
# @param rec [Hash] source row; reads :genus, :species and :subspecies
# @param rank [Symbol] rank being processed
# @param clade [String] clade name as found in the record
# @return [String]
def adjust_clade(rec, rank, clade)
  return clade unless [:species, :subspecies].include? rank

  epithets = [rec[:genus], rec[:species]]
  epithets.push(rec[:subspecies]) if rank == :subspecies
  epithets.join(" ").gsub(/[\s]+/, " ").strip
end
|
134
|
+
|
135
|
+
# Builds the EML metadata hash for the archive in @eml.
#
# The second author's surname is "Reeder" (as in the abstract below); it
# was previously misspelled "Reader".
def eml_init
  @eml = {
    id: @uuid,
    title: @title,
    authors: [
      { first_name: "Don",
        last_name: "Wilson" },
      { first_name: "DeeAnn",
        last_name: "Reeder" }
    ],
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    abstract: "Mammal Species of the World, 3rd edition (MSW3) is "\
      "a database of mammalian taxonomy, based upon the 2005 book "\
      "Mammal Species of the World. A Taxonomic and Geographic Reference "\
      "(3rd ed). Don E. Wilson & DeeAnn M. Reeder (editors).",
    url: "http://www.vertebrates.si.edu/msw/mswcfapp/msw/index.cfm"
  }
end
|
157
|
+
|
158
|
+
# Starts the core table with its header row followed by the row for the
# root clade "Mammalia", whose id is taken from @clades.
#
# Core columns: taxonID, localID, parentNameUsageID, acceptedNameUsageID,
# scientificName, taxonomicStatus, taxonRank.
def core_init
  header = ['http://rs.tdwg.org/dwc/terms/taxonID',
            'http://globalnames.org/terms/localID',
            'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
            'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
            'http://rs.tdwg.org/dwc/terms/scientificName',
            'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
            'http://rs.tdwg.org/dwc/terms/taxonRank']
  root_id = @clades["Mammalia"][:id]
  # The root is its own accepted name and has no parent.
  root_row = [root_id, nil, nil, root_id, "Mammalia", nil, "class"]
  @core = [header]
  @core << root_row
end
|
169
|
+
|
170
|
+
# Registers the vernacular-names extension (header row only) in
# @extensions.
#
# Extension columns: taxonID, vernacularName, language.
def extensions_init
  header = ['http://rs.tdwg.org/dwc/terms/taxonID',
            'http://rs.tdwg.org/dwc/terms/vernacularName',
            'http://purl.org/dc/terms/language']
  vernaculars = {
    data: [header],
    file_name: 'vernacular_names.txt',
    row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
  }
  @extensions << vernaculars
end
|
178
|
+
end
|
179
|
+
end
|