dwca_hunter 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.byebug_history +31 -0
- data/.document +5 -0
- data/.gitignore +58 -0
- data/.rspec +3 -0
- data/.rubocop.yml +33 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +133 -0
- data/LICENSE.txt +20 -0
- data/README.md +39 -0
- data/Rakefile +11 -0
- data/dwca_hunter.gemspec +42 -0
- data/exe/dwcahunter +77 -0
- data/files/birdlife_7.csv +11862 -0
- data/files/fishbase_taxon_cache.tsv +81000 -0
- data/files/reptile_checklist_2014_12.csv +15158 -0
- data/lib/dwca_hunter/downloader.rb +60 -0
- data/lib/dwca_hunter/encoding.rb +17 -0
- data/lib/dwca_hunter/resource.rb +101 -0
- data/lib/dwca_hunter/resources/arctos.rb +222 -0
- data/lib/dwca_hunter/resources/birdlife.rb +160 -0
- data/lib/dwca_hunter/resources/fishbase.rb +99 -0
- data/lib/dwca_hunter/resources/freebase.rb +152 -0
- data/lib/dwca_hunter/resources/gnub.rb +101 -0
- data/lib/dwca_hunter/resources/itis.rb +271 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
- data/lib/dwca_hunter/resources/ncbi.rb +174 -0
- data/lib/dwca_hunter/resources/opentree.rb +121 -0
- data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
- data/lib/dwca_hunter/resources/worms.rb +176 -0
- data/lib/dwca_hunter/url.rb +33 -0
- data/lib/dwca_hunter/version.rb +7 -0
- data/lib/dwca_hunter/xml.rb +33 -0
- data/lib/dwca_hunter.rb +53 -0
- metadata +250 -0
@@ -0,0 +1,152 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module DwcaHunter
  # Harvests organism classifications from the Freebase MQL read service and
  # converts them into a DarwinCore Archive: a core taxon table plus a
  # synonyms extension.
  class ResourceFreebase < DwcaHunter::Resource
    # MQL read endpoint; %s is replaced by the URL-encoded JSON query.
    MQLREAD_URL = "http://api.freebase.com/api/service/mqlread?query=%s"

    # Sets the resource identity and the empty working state, then forwards
    # the original arguments to the parent initializer (bare `super`).
    def initialize(opts = {})
      @command = "freebase"
      @title = 'Freebase'
      @uuid = 'bacd21f0-44e0-43e2-914c-70929916f257'
      @download_path = File.join(Dir.tmpdir,
                                 'dwca_hunter',
                                 'freebase',
                                 'data.json')
      @data = []
      @all_taxa = {}
      @cleaned_taxa = {}
      @extensions = []
      super
    end

    # Always false for this resource (the download is a single JSON file).
    def needs_unpack?
      false
    end

    # Reads the downloaded JSON, resolves the hierarchy and writes the
    # archive.
    def make_dwca
      organize_data
      generate_dwca
    end

    # Queries Freebase page by page and stores all collected records as
    # pretty-printed JSON in @download_path. Returns nil.
    def download
      DwcaHunter::logger_write(object_id,
                               'Querying freebase for species information...')
      q = {
        query: [{
          type: '/biology/organism_classification',
          id: nil,
          guid: nil,
          name: nil,
          scientific_name: nil,
          synonym_scientific_name: [],
          higher_classification: {
            id: nil,
            guid: nil,
            scientific_name: nil,
            optional: true,
          },
        }],
        cursor: true,
      }

      run_query(q)

      # Block form closes the file even when generating or writing fails.
      File.open(@download_path, 'w:utf-8') do |f|
        f.write(JSON.pretty_generate(@data))
      end
      nil
    end

    private

    # Repeats the MQL query, following the cursor returned with every page,
    # and appends each page of results to @data.
    #
    # q - the query Hash; its :cursor entry is updated in place.
    def run_query(q)
      count = 0
      requests_num = 0
      loop do
        # URI.encode no longer exists in Ruby 3; form-component encoding is
        # valid for a query-string value.
        freebase_url = MQLREAD_URL % URI.encode_www_form_component(q.to_json)
        res = JSON.parse(RestClient.get(freebase_url))
        requests_num += 1
        batch = res['result']
        break if batch.nil? || batch.empty?
        batch.each { |d| @data << d }
        count += batch.size
        if requests_num % 10 == 0
          DwcaHunter::logger_write(object_id,
                                   "Received %s names" % count)
        end
        # NOTE(review): a false/nil cursor is taken to mean "no more pages";
        # re-sending it would not continue the previous result set.
        break unless res['cursor']
        q[:cursor] = res['cursor']
      end
    end

    # Loads the downloaded records into @all_taxa and copies every taxon that
    # has a non-blank scientific name into @cleaned_taxa, re-pointing its
    # parent to the nearest ancestor that is itself kept.
    def organize_data
      @data = JSON.parse(File.read(@download_path, encoding: 'utf-8'))
      @data.each do |d|
        id = d["id"]
        parent = d['higher_classification']
        @all_taxa[id] = { id: id,
                          parent_id: parent ? parent["id"] : nil,
                          scientific_name: d['scientific_name'].to_s,
                          synonyms: d['synonym_scientific_name'] }
      end

      @all_taxa.each do |k, v|
        next if v[:scientific_name].strip == ""
        v[:parent_id] = named_ancestor_id(v)
        @cleaned_taxa[k] = v
      end
    end

    # Returns the id of the closest ancestor of +taxon+ that is present in
    # @all_taxa and has a non-blank scientific name, or nil when there is no
    # such ancestor. Blank-named ancestors are skipped because they are not
    # exported; a self-reference or a cycle also yields nil.
    def named_ancestor_id(taxon)
      seen = { taxon[:id] => true }
      parent_id = taxon[:parent_id]
      while parent_id
        return nil if seen[parent_id]
        seen[parent_id] = true
        parent = @all_taxa[parent_id]
        return nil unless parent
        return parent_id unless parent[:scientific_name].to_s.strip == ""
        parent_id = parent[:parent_id]
      end
      nil
    end

    # Fills @core, the synonyms extension and @eml from @cleaned_taxa, then
    # calls the parent implementation (bare `super`).
    def generate_dwca
      DwcaHunter::logger_write(object_id,
                               'Creating DarwinCore Archive file')
      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://rs.tdwg.org/dwc/terms/scientificName',
                'http://rs.tdwg.org/dwc/terms/parentNameUsageID']]

      # The header uses the same `taxonID` term as the core table.
      synonyms = { data: [[
        'http://rs.tdwg.org/dwc/terms/taxonID',
        'http://rs.tdwg.org/dwc/terms/scientificName',
      ]], file_name: 'synonyms.txt' }
      @extensions << synonyms
      DwcaHunter::logger_write(object_id,
                               'Creating synonyms extension for DarwinCore Archive file')
      count = 0
      @cleaned_taxa.each_value do |taxon|
        count += 1
        @core << [taxon[:id], taxon[:scientific_name], taxon[:parent_id]]
        if count % BATCH_SIZE == 0
          DwcaHunter::logger_write(object_id,
                                   "Traversing %s extension data record" % count)
        end
        # Array() tolerates records that carry no synonym list at all.
        Array(taxon[:synonyms]).each do |name|
          synonyms[:data] << [taxon[:id], name]
        end
      end
      @eml = {
        id: @uuid,
        title: @title,
        license: 'http://creativecommons.org/licenses/by-sa/3.0/',
        authors: [
          { url: 'http://www.freebase.com/home' }],
        abstract: 'An entity graph of people, places and things, ' +
                  'built by a community that loves open data.',
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@mbl.edu' }],
        url: 'http://www.freebase.com/home'
      }
      super
    end
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
  # Converts the GNUB export (a tab-separated text file inside a zip) into a
  # DarwinCore Archive.
  class ResourceGNUB < DwcaHunter::Resource
    # Sets the resource identity and the empty working state, passes +opts+
    # to the parent initializer and derives the GNUB working directory from
    # @download_dir.
    def initialize(opts = {})
      @command = 'gnub'
      @title = 'GNUB'
      @url = 'http://gnub.org/datadump/gni_export.zip'
      # generate_dwca reads @uuid; the original only assigned @UUID, which
      # left the EML id empty. @UUID is kept for anything still reading it.
      @uuid = 'd34ed224-78e7-485d-a478-adc2558a0f68'
      @UUID = @uuid
      @download_path = File.join(Dir.tmpdir,
                                 'dwca_hunter',
                                 'gnub',
                                 'data.tar.gz')
      @ranks = {}
      @kingdoms = {}
      @authors = {}
      @vernaculars = {}
      @synonyms = {}
      @synonym_of = {}
      @names = []
      @extensions = []
      super(opts)
      @gnub_dir = File.join(@download_dir, 'gnub')
    end

    # Delegates to unpack_zip (defined outside this class).
    def unpack
      unpack_zip
    end

    # Parses the export file and writes the archive.
    def make_dwca
      DwcaHunter::logger_write(object_id, 'Extracting data')
      get_names
      generate_dwca
    end

    private

    # Reads the first *.txt file found in @download_dir and appends one Hash
    # per data line to @names. The first line (header) and blank lines are
    # skipped.
    #
    # Raises RuntimeError when the directory contains no *.txt file.
    def get_names
      codes = get_codes
      file = Dir.entries(@download_dir).grep(/txt$/).first
      raise "No *.txt data file found in #{@download_dir}" unless file
      # File.foreach closes the file when iteration ends or raises.
      File.foreach(File.join(@download_dir, file)).with_index do |line, i|
        next if i == 0 || (data = line.strip) == ''
        data = data.split("\t")
        protolog = data[0].downcase
        protolog_path = data[1].downcase
        name_string = data[2]
        rank = data[3]
        code = codes[data[4].to_i]
        taxon_id = UUID.create_v5(name_string +
                                  protolog_path +
                                  rank, GNA_NAMESPACE)
        @names << { taxon_id: taxon_id,
                    name_string: name_string,
                    protolog: protolog,
                    protolog_path: protolog_path,
                    code: code,
                    rank: rank }
      end
    end

    # Fetches the nomenclatural codes list and returns a Hash that maps each
    # entry's :id to its :code.
    def get_codes
      codes_url = 'http://resolver.globalnames.org/nomenclatural_codes.json'
      codes = JSON.parse(RestClient.get(codes_url), symbolize_names: true)
      codes.each_with_object({}) do |c, res|
        res[c[:id]] = c[:code]
      end
    end

    # Fills @core and @eml from @names, then calls the parent implementation
    # (bare `super`).
    def generate_dwca
      DwcaHunter::logger_write(object_id,
                               'Creating DarwinCore Archive file')
      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://rs.tdwg.org/dwc/terms/originalNameUsageID',
                'http://globalnames.org/terms/originalNameUsageIDPath',
                'http://rs.tdwg.org/dwc/terms/scientificName',
                'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
                'http://rs.tdwg.org/dwc/terms/taxonRank']]
      @names.each do |n|
        # Column order follows the header above: the protolog path goes under
        # originalNameUsageIDPath and the name under scientificName.
        @core << [n[:taxon_id], n[:protolog], n[:protolog_path],
                  n[:name_string], n[:code], n[:rank]]
      end
      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          {email: 'deepreef@bishopmuseum.org'}
        ],
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@gmail.com' }
        ],
        abstract: 'Global Names Usage Bank',
        url: 'http://www.zoobank.org'
      }
      super
    end
  end
end
|
101
|
+
|
@@ -0,0 +1,271 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
  # Converts the ITIS table dump (pipe-delimited text files) into a
  # DarwinCore Archive with a vernacular-names extension.
  class ResourceITIS < DwcaHunter::Resource
    # Sets the resource identity and the empty lookup tables, passes +opts+
    # to the parent initializer and derives the ITIS working directory from
    # @download_dir.
    def initialize(opts = {})
      @command = 'itis'
      @title = 'ITIS'
      @url = 'https://www.itis.gov/downloads/itisMySQLTables.tar.gz'
      @uuid = '5d066e84-e512-4a2f-875c-0a605d3d9f35'
      @download_path = File.join(Dir.tmpdir,
                                 'dwca_hunter',
                                 'itis',
                                 'data.tar.gz')
      @ranks = {}
      @kingdoms = {}
      @authors = {}
      @vernaculars = {}
      @synonyms = {}
      @synonym_of = {}
      @names = {}
      @extensions = []
      super(opts)
      @itis_dir = File.join(@download_dir, 'itis')
    end

    # Unpacks the tarball, renames the extracted `itisMySQL*` directory to
    # @itis_dir and records the original directory name in a marker file.
    #
    # Raises RuntimeError when no `itisMySQL*` entry was extracted.
    def unpack
      unpack_tar
      dir = Dir.entries(@download_dir).find { |e| e =~ /itisMySQL/ }
      raise "No itisMySQL* directory found in #{@download_dir}" unless dir
      FileUtils.mv(File.join(@download_dir, dir), @itis_dir)

      # Create a file with the same name as the directory we extracted.
      FileUtils.touch(File.join(@itis_dir, 'version_' + dir))
    end

    # Loads every lookup table and then writes the archive.
    def make_dwca
      DwcaHunter::logger_write(object_id, 'Extracting data')
      get_ranks
      get_kingdoms
      get_authors
      get_vernaculars
      get_synonyms
      get_names
      generate_dwca
    end

    private

    # Yields every non-blank row of the pipe-delimited file +table+ (a file
    # name inside @itis_dir) as an Array of stripped String fields.
    #
    # Lines are transcoded from ISO-8859-1 to UTF-8. When +progress+ (a
    # format string with one %s) is given, the current line index is logged
    # every BATCH_SIZE lines. The file is closed when iteration finishes or
    # raises.
    def each_row(table, progress = nil)
      File.open(File.join(@itis_dir, table)) do |f|
        f.each_with_index do |l, i|
          if progress && i % BATCH_SIZE == 0
            DwcaHunter::logger_write(object_id, progress % i)
          end
          l.encode!('UTF-8',
                    'ISO-8859-1',
                    invalid: :replace,
                    replace: '?')
          row = l.split('|').map { |d| d.strip }
          # Blank lines carry no record; skipping them avoids junk entries.
          next if row.all? { |d| d.empty? }
          yield row
        end
      end
    end

    # Fills @ranks, keyed by "kingdom_id/rank_id", with rank names.
    def get_ranks
      # 0 kingdom_id integer not null
      # 1 rank_id smallint not null
      # 2 rank_name char(15) not null
      # 3 dir_parent_rank_id smallint not null
      # 4 req_parent_rank_id smallint not null
      # 5 update_date date not null
      each_row('taxon_unit_types') do |row|
        @ranks["#{row[0]}/#{row[1]}"] = row[2]
      end
    end

    # Fills @kingdoms: kingdom_id => kingdom_name.
    def get_kingdoms
      # 0 kingdom_id serial not null
      # 1 kingdom_name char(10) not null
      # 2 update_date date not null
      each_row('kingdoms') do |data|
        @kingdoms[data[0]] = data[1]
      end
    end

    # Fills @authors: taxon_author_id => taxon_author.
    def get_authors
      # 0 taxon_author_id serial not null
      # 1 taxon_author varchar(100,30) not null
      # 2 update_date date not null
      # 3 kingdom_id smallint not null
      each_row('taxon_authors_lkp') do |data|
        @authors[data[0]] = data[1]
      end
    end

    # Fills @vernaculars: tsn => Array of { name:, language: } hashes.
    # A list is kept per tsn because the table key below allows several
    # vernacular names for one taxon.
    def get_vernaculars
      # 0 tsn integer not null
      # 1 vernacular_name varchar(80,5) not null
      # 2 language varchar(15) not null
      # 3 approved_ind char(1)
      # 4 update_date date not null
      # 5 primary key (tsn,vernacular_name,language)
      #   constraint "itis".vernaculars_key
      each_row('vernaculars', "Extracted %s vernacular names") do |data|
        name_tsn = data[0]
        string = data[1]
        language = data[2]
        language = 'Common name' if language == 'unspecified'
        (@vernaculars[name_tsn] ||= []) << { name: string, language: language }
      end
    end

    # Fills @synonyms: synonym tsn => accepted tsn.
    def get_synonyms
      # 0 tsn integer not null
      # 1 tsn_accepted integer not null
      # 2 update_date date not null
      each_row('synonym_links', "Extracted %s synonyms") do |data|
        @synonyms[data[0]] = data[1]
      end
    end

    # Fills @names: tsn => { name:, canonical_name:, status:, parent_tsn:,
    # rank: }. :name is the joined name parts followed by the author (when
    # the author id is known); :canonical_name is the joined parts only.
    def get_names
      # 0 tsn serial not null
      # 1 unit_ind1 char(1)
      # 2 unit_name1 char(35) not null
      # 3 unit_ind2 char(1)
      # 4 unit_name2 varchar(35)
      # 5 unit_ind3 varchar(7)
      # 6 unit_name3 varchar(35)
      # 7 unit_ind4 varchar(7)
      # 8 unit_name4 varchar(35)
      # 9 unnamed_taxon_ind char(1)
      # 10 usage varchar(12,5) not null
      # 11 unaccept_reason varchar(50,9)
      # 12 credibility_rtng varchar(40,17) not null
      # 13 completeness_rtng char(10)
      # 14 currency_rating char(7)
      # 15 phylo_sort_seq smallint
      # 16 initial_time_stamp datetime year to second not null
      # 17 parent_tsn integer
      # 18 taxon_author_id integer
      # 19 hybrid_author_id integer
      # 20 kingdom_id smallint not null
      # 21 rank_id smallint not null
      # 22 update_date date not null
      # 23 uncertain_prnt_ind char(3)
      each_row('taxonomic_units', "Extracted %s names") do |data|
        name_tsn = data[0]
        status = data[10]
        parent_tsn = data[17]
        author_id = data[18]
        kingdom_id = data[20]
        rank_id = data[21]

        parent_tsn = nil if parent_tsn == ''
        # Fields 1..8 alternate indicators/markers and name parts.
        parts = data[1..8] || []
        canonical_name = parts.join(' ').strip.gsub(/\s+/, ' ')
        with_author = parts.dup
        with_author << @authors[author_id] if @authors[author_id]
        name = with_author.join(' ').strip.gsub(/\s+/, ' ')
        # Interpolation keeps a short row from raising on nil ids.
        rank = @ranks["#{kingdom_id}/#{rank_id}"] || ''
        @names[name_tsn] = { name: name,
                             canonical_name: canonical_name,
                             status: status,
                             parent_tsn: parent_tsn,
                             rank: rank }
      end
    end

    # Fills @core, the vernacular extension and @eml from the loaded tables,
    # then calls the parent implementation (bare `super`).
    def generate_dwca
      DwcaHunter::logger_write(object_id,
                               'Creating DarwinCore Archive file')
      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
                'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
                'http://rs.tdwg.org/dwc/terms/scientificName',
                'http://rs.tdwg.org/ontology/voc/TaxonName#nameComplete',
                'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
                'http://rs.tdwg.org/dwc/terms/taxonRank']]
      vernacular_ext = {
        data: [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://rs.tdwg.org/dwc/terms/vernacularName',
                'http://purl.org/dc/terms/language']],
        file_name: 'vernacular_names.txt',
        row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
      }
      @extensions << vernacular_ext

      @names.each do |k, d|
        accepted_id = @synonyms[k]
        # A parent tsn that is missing or converts to 0 means "no parent".
        parent_id = d[:parent_tsn].to_i == 0 ? nil : d[:parent_tsn]
        @core << [k, parent_id, accepted_id, d[:name], d[:canonical_name],
                  d[:status], d[:rank]]
      end

      @vernaculars.each do |k, names|
        names.each do |d|
          vernacular_ext[:data] << [k, d[:name], d[:language]]
        end
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          {email: 'itiswebmaster@itis.gov'}
        ],
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@gmail.com' }
        ],
        abstract: 'The White House Subcommittee on Biodiversity and ' +
                  'Ecosystem Dynamics has identified systematics as a ' +
                  'research priority that is fundamental to ecosystem ' +
                  'management and biodiversity conservation. This primary ' +
                  'need identified by the Subcommittee requires ' +
                  'improvements in the organization of, and access to, ' +
                  'standardized nomenclature. ITIS (originally referred ' +
                  'to as the Interagency Taxonomic Information System) ' +
                  'was designed to fulfill these requirements. In the ' +
                  'future, the ITIS will provide taxonomic data and a ' +
                  'directory of taxonomic expertise that will support ' +
                  'the system',
        url: 'http://www.itis.gov'
      }
      super
    end
  end
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module DwcaHunter
  # Converts the "Mammal Species of the World" CSV export into a DarwinCore
  # Archive with a vernacular-names extension.
  class ResourceMammalSpecies < DwcaHunter::Resource
    # Record columns that hold the classification, from highest to lowest.
    RANKS = [:order, :suborder, :infraorder, :superfamily, :family,
             :subfamily, :tribe, :genus, :subgenus,
             :species, :subspecies].freeze

    # Sets the resource identity and working state, seeds @clades with the
    # root class "Mammalia" (id 1), then forwards the original arguments to
    # the parent initializer (bare `super`).
    def initialize(opts = {})
      @command = "mammal-species"
      @title = "The Mammal Species of The World"
      @uuid = "464dafec-1037-432d-8449-c0b309e0a030"
      @data = []
      @extensions = []
      @count = 1
      @clades = {"Mammalia" => { rank: "class", id: @count}}
      @url = "http://www.departments.bucknell.edu"\
             "/biology/resources/msw3/export.asp"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                                 "mammalsp", "msw3-all.csv")
      super
    end

    # Always false for this resource (the download is a single CSV file).
    def needs_unpack?
      false
    end

    # Re-encodes the download, loads its rows and writes the archive.
    def make_dwca
      DwcaHunter::logger_write(object_id, "Extracting data")
      encode
      collect_data
      generate_dwca
    end

    # Downloads the CSV export to @download_path.
    def download
      DwcaHunter::logger_write(object_id, "Downloading file -- "\
                               "it will take some time...")
      # NOTE(review): `url` is not defined in this class; presumably a
      # reader provided by the parent -- confirm.
      dlr = DwcaHunter::Downloader.new(url, @download_path)
      dlr.download
    end

    private

    # Hands the downloaded file to the Latin-1 -> UTF-8 converter.
    def encode
      DwcaHunter::Encoding.latin1_to_utf8(@download_path)
    end

    # Loads every CSV row of the ".utf_8" copy of the download into @data as
    # a Hash with symbolized header keys.
    def collect_data
      # Literal keyword arguments work on Ruby 2 and 3 alike, and foreach
      # closes the file when it is done.
      CSV.foreach(@download_path + ".utf_8",
                  headers: true, header_converters: :symbol) do |row|
        @data << row.to_hash
      end
    end

    # Initializes core, extension and EML, processes every record and then
    # calls the parent implementation (bare `super`).
    def generate_dwca
      DwcaHunter::logger_write(object_id,
                               'Creating DarwinCore Archive file')
      core_init
      extensions_init
      eml_init
      @data.each do |rec|
        taxon = process_hierarchy(rec)
        # No new core row for this record means there is no taxon id to
        # attach vernaculars or synonyms to.
        next unless taxon
        process_vernaculars(rec, taxon)
        process_synonyms(rec, taxon)
      end
      super
    end

    # Adds the record's common name, if any, to the vernacular extension.
    #
    # taxon - the core row created for the record (taxon id at index 0).
    def process_vernaculars(rec, taxon)
      return if rec[:commonname].to_s == ""
      taxon_id = taxon[0]
      lang = "en"
      # U+0092 is replaced with a plain apostrophe.
      name = rec[:commonname].gsub("\u{0092}", "'")
      @extensions[0][:data] << [taxon_id, name, lang]
    end

    # Adds one core row with status "synonym" for every entry of the
    # record's semicolon-separated synonyms field. Only species and
    # subspecies rows are processed.
    #
    # taxon - the core row created for the record: taxon id at index 0,
    #         parent id at index 2, rank at the last index.
    def process_synonyms(rec, taxon)
      accepted_id = taxon[0]
      parent_id = taxon[2]
      rank = taxon[-1]
      return unless ['species', 'subspecies'].include? rank
      # to_s: an empty CSV field arrives as nil.
      synonyms = rec[:synonyms].to_s.gsub(/\.$/, "").
        gsub(/<[\/ib]+>/, "").gsub(/[\s]+/, " ").split(";")
      synonyms.map(&:strip).each do |s|
        next if s.empty? || s.match(/<u>/)
        # An entry starting with a lowercase letter is an epithet only;
        # prefix it with the record's genus.
        s = "#{rec[:genus]} #{s}" if s.match(/^[a-z]/)
        @count += 1
        @core << [@count, nil, parent_id, accepted_id, s, "synonym", rank]
      end
    end

    # Completes the most recently added core row with the record's author
    # and date (appended to the name) and its source id.
    # +rank+ is accepted but not used.
    def process_name(rec, rank)
      name =[@core.last[4], rec[:author], rec[:date]]
      @core.last[4] = name.join(" ").gsub(/[\s]+/, " ").strip
      @core.last[1] = rec[:id]
    end

    # Walks the record's classification from order down to subspecies,
    # creating a core row for every clade not seen before.
    #
    # Returns the core row created for the record's own rank (the first new
    # clade at or below rec[:taxonlevel]), or nil when no such row was
    # created.
    def process_hierarchy(rec)
      parent_id = @clades["Mammalia"][:id]
      row_rank = rec[:taxonlevel].to_s.downcase.to_sym
      is_row_rank = false
      RANKS.each do |rank|
        is_row_rank = true if rank == row_rank
        # Blank check comes first: an empty CSV field is nil.
        clade = rec[rank].to_s
        next if clade == ""
        clade = clade.capitalize if clade.match(/^[A-Z]+$/)
        clade = adjust_clade(rec, rank, clade)
        if @clades.key?(clade)
          clade_id = @clades[clade][:id]
        else
          @count += 1
          clade_id = @count
          @clades[clade] = { id: clade_id, rank: rank }
          @core << [clade_id, nil, parent_id, clade_id, clade, nil, rank.to_s]
          if is_row_rank
            process_name(rec, rank)
            return @core.last
          end
        end
        parent_id = clade_id
      end
      # Without this the method would return the receiver of `each`.
      nil
    end

    # For species and subspecies returns the full name built from genus,
    # species and (for subspecies) subspecies; otherwise returns +clade+
    # unchanged.
    def adjust_clade(rec, rank, clade)
      if [:species, :subspecies].include? rank
        clade = [rec[:genus], rec[:species]]
        clade << rec[:subspecies] if rank == :subspecies
        clade.join(" ").gsub(/[\s]+/, " ").strip
      else
        clade
      end
    end

    # Builds the EML metadata hash.
    def eml_init
      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Don",
            last_name: "Wilson" },
          { first_name: "DeeAnn",
            last_name: "Reeder" },
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Mammal Species of the World, 3rd edition (MSW3) is "\
          "a database of mammalian taxonomy, based upon the 2005 book "\
          "Mammal Species of the World. A Taxonomic and Geographic Reference "\
          "(3rd ed). Don E. Wilson & DeeAnn M. Reeder (editors).",
        url: "http://www.vertebrates.si.edu/msw/mswcfapp/msw/index.cfm"
      }
    end

    # Creates the core table with its header row and the root "Mammalia" row.
    def core_init
      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://globalnames.org/terms/localID',
                'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
                'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
                'http://rs.tdwg.org/dwc/terms/scientificName',
                'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
                'http://rs.tdwg.org/dwc/terms/taxonRank']]
      m = @clades["Mammalia"]
      @core << [m[:id], nil, nil, m[:id], "Mammalia", nil, "class"]
    end

    # Registers the vernacular-names extension with its header row.
    def extensions_init
      @extensions << { data: [['http://rs.tdwg.org/dwc/terms/taxonID',
                               'http://rs.tdwg.org/dwc/terms/vernacularName',
                               'http://purl.org/dc/terms/language']],
                       file_name: 'vernacular_names.txt',
                       row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
                     }
    end
  end
end
|