dwca_hunter 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.byebug_history +31 -0
- data/.document +5 -0
- data/.gitignore +58 -0
- data/.rspec +3 -0
- data/.rubocop.yml +33 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +133 -0
- data/LICENSE.txt +20 -0
- data/README.md +39 -0
- data/Rakefile +11 -0
- data/dwca_hunter.gemspec +42 -0
- data/exe/dwcahunter +77 -0
- data/files/birdlife_7.csv +11862 -0
- data/files/fishbase_taxon_cache.tsv +81000 -0
- data/files/reptile_checklist_2014_12.csv +15158 -0
- data/lib/dwca_hunter/downloader.rb +60 -0
- data/lib/dwca_hunter/encoding.rb +17 -0
- data/lib/dwca_hunter/resource.rb +101 -0
- data/lib/dwca_hunter/resources/arctos.rb +222 -0
- data/lib/dwca_hunter/resources/birdlife.rb +160 -0
- data/lib/dwca_hunter/resources/fishbase.rb +99 -0
- data/lib/dwca_hunter/resources/freebase.rb +152 -0
- data/lib/dwca_hunter/resources/gnub.rb +101 -0
- data/lib/dwca_hunter/resources/itis.rb +271 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
- data/lib/dwca_hunter/resources/ncbi.rb +174 -0
- data/lib/dwca_hunter/resources/opentree.rb +121 -0
- data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
- data/lib/dwca_hunter/resources/worms.rb +176 -0
- data/lib/dwca_hunter/url.rb +33 -0
- data/lib/dwca_hunter/version.rb +7 -0
- data/lib/dwca_hunter/xml.rb +33 -0
- data/lib/dwca_hunter.rb +53 -0
- metadata +250 -0
@@ -0,0 +1,174 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
|
3
|
+
class ResourceNCBI < DwcaHunter::Resource
|
4
|
+
|
5
|
+
# Sets the static configuration of the NCBI taxonomy resource and the empty
# containers the harvest fills in later, then hands over to the parent
# initializer (implicit +super+ forwards +opts+ unchanged).
#
# @param opts [Hash] options passed through to the parent class
def initialize(opts = {})
  # Identity of the resource.
  @command = 'ncbi'
  @title = 'NCBI'
  @uuid = '97d7633b-5f79-4307-a397-3c29402d9311'

  # Where the dump comes from and where it is stored locally.
  @url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
  download_dir_parts = [Dir.tmpdir, 'dwca_hunter', 'ncbi', 'data.tar.gz']
  @download_path = File.join(*download_dir_parts)

  # Name classes that are not treated as synonyms by get_classification.
  @collected_names = ['genbank common name', 'common name', 'valid']

  # Working state populated by get_names / get_classification / generate_dwca.
  @names = {}
  @data = []
  @core = []
  @extensions = []
  super
end
|
21
|
+
|
22
|
+
# Unpacks the resource by delegating to +unpack_tar+, which is defined
# outside this class body.
def unpack
  unpack_tar
end
|
25
|
+
|
26
|
+
# Runs the whole NCBI conversion in order: resolve file paths, index the
# names, build the classification records and write the archive.
def make_dwca
  set_vars
  get_names
  get_classification
  generate_dwca
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
# Stores the paths of the two dump files read by this class, both located
# directly inside @download_dir.
def set_vars
  @names_file, @nodes_file = %w[names.dmp nodes.dmp].map do |file_name|
    File.join(@download_dir, file_name)
  end
end
|
39
|
+
|
40
|
+
# Reads @names_file and indexes every name by taxon id and name class.
#
# Each line is split on "|" and every field is passed through +cleanup+.
# Field 0 is the taxon id, field 1 the name and field 3 the name class; the
# class "scientific name" is stored under the key "valid". Results accumulate
# in @names as { id => { name_class => [name, ...] } }.
#
# Quotes wrapping a word are removed from names and runs of whitespace are
# collapsed to a single space. Lines whose name field is not string-like are
# reported on stdout and skipped.
def get_names
  DwcaHunter::logger_write(object_id, 'Collecting names...')
  # File.foreach closes the file once iteration ends; a bare `open` left the
  # handle dangling until garbage collection.
  File.foreach(@names_file).with_index do |line, i|
    if i > 0 && i % BATCH_SIZE == 0
      DwcaHunter::logger_write(object_id, 'Collected %s names...' % i)
    end
    fields = line.split("|").map { |l| cleanup(l) }
    id = fields[0]
    # Presumably skips the root node; relies on +cleanup+ yielding the
    # Integer 1 for that id -- TODO confirm against Resource#cleanup.
    next if id == 1
    name = fields[1]
    name_type = fields[3]
    name_type = 'valid' if name_type == 'scientific name'
    # A missing or non-string name cannot be normalised; report and move on
    # instead of raising and rescuing NoMethodError.
    unless name.respond_to?(:gsub)
      puts "wrong name: %s" % name
      next
    end
    # \4 restores the separator (space, hyphen or end of string) that follows
    # the closing quote. The pattern has only four groups, so the previous
    # \5 was always empty and glued the following word to the unquoted one.
    name = name.gsub(/(^|\s)('|")(.*?)\2(\s|-|$)/, '\1\3\4').
           gsub(/\s+/, ' ')
    by_type = (@names[id] ||= {})
    (by_type[name_type] ||= []) << name
  end
end
|
65
|
+
|
66
|
+
# Reads @nodes_file and appends one record per taxon to @data.
#
# Each line is split on "|" and cleaned with +cleanup+: field 0 is the taxon
# id, field 1 the parent id and field 2 the rank. A taxon is only emitted
# when get_names stored a "valid" name for it. Its remaining name classes are
# sorted into two buckets:
# * classes listed in @collected_names become vernacular names;
# * every other class becomes synonyms, one entry per name, with the NCBI
#   name class as +taxonomicStatus+.
#
# The rank "no rank" is stored as an empty string and a parent id equal to 1
# is stored as nil.
def get_classification
  DwcaHunter.logger_write(object_id, "Building classification...")
  # File.foreach closes the file when iteration finishes.
  File.foreach(@nodes_file, encoding: "utf-8").with_index do |line, i|
    if i > 0 && i % BATCH_SIZE == 0
      DwcaHunter.logger_write(object_id, "Collected %s nodes..." % i)
    end
    fields = line.split('|').map { |l| cleanup(l) }
    id = fields[0]
    next if id == 1
    taxon_names = @names[id]
    next unless taxon_names && taxon_names["valid"]

    parent_tax_id = fields[1]
    parent_tax_id = nil if parent_tax_id == 1
    rank = fields[2]
    rank = "" if rank == "no rank"

    vernacular_names = []
    synonyms = []
    taxon_names.each do |name_type, names|
      next if name_type == "valid"
      if @collected_names.include?(name_type)
        vernacular_names.concat(names)
      else
        # One row per name: generate_dwca writes :scientificName straight
        # into a cell, so it must be a single string, not the whole list.
        names.each do |synonym_name|
          synonyms << { scientificName: synonym_name,
                        taxonomicStatus: name_type }
        end
      end
    end

    @data << {
      id: id,
      scientificName: taxon_names["valid"][0],
      parentNameUsageId: parent_tax_id,
      taxonRank: rank,
      taxonomicStatus: "valid",
      vernacularNames: vernacular_names,
      # Previously a hard-coded [] that threw the collected synonyms away.
      synonyms: synonyms
    }
  end
end
|
106
|
+
|
107
|
+
# Turns the records in @data into the tables of the archive and then calls
# the parent implementation.
#
# Fills three tables, each starting with a header row of term URIs:
# * @core -- taxon id, scientific name, parent id, rank;
# * vernacular_names.txt extension -- taxon id, vernacular name;
# * synonyms.txt extension -- taxon id, synonym, taxonomic status.
# Also sets @eml with the resource metadata.
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
  # All four are Darwin Core terms, so they belong to the dwc namespace
  # (not Dublin Core) and the identifier terms end in an upper-case "ID".
  @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
            "http://rs.tdwg.org/dwc/terms/taxonRank"]]
  DwcaHunter.logger_write(object_id, "Assembling Core Data")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id, "Traversing #{count} core " \
                                         "data record")
    end
    @core << [d[:id],
              d[:scientificName],
              d[:parentNameUsageId],
              d[:taxonRank]]
  end

  vernacular_ext = {
    data: [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/vernacularName"]],
    file_name: "vernacular_names.txt"
  }
  synonym_ext = {
    data: [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
    file_name: "synonyms.txt"
  }
  @extensions << vernacular_ext << synonym_ext

  DwcaHunter.logger_write(object_id, "Creating vernacular name " \
                                     "extension for DarwinCore Archive file")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing #{count} extension data record")
    end
    d[:vernacularNames].each do |vn|
      vernacular_ext[:data] << [d[:id], vn]
    end
    d[:synonyms].each do |synonym|
      synonym_ext[:data] << [d[:id],
                             synonym[:scientificName],
                             synonym[:taxonomicStatus]]
    end
  end

  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ url: "http://www.ncbi.org" }],
    abstract: "The National Center for Biotechnology Information " \
              "advances science and health by providing access to " \
              "biomedical and genomic information.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@mbl.edu" }
    ],
    url: @url
  }
  super
end
|
173
|
+
end
|
174
|
+
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
# Harvesting resource for Open Tree of Life
|
5
|
+
class ResourceOpenTree < DwcaHunter::Resource
|
6
|
+
# Sets the static configuration of the Open Tree of Life resource, the core
# header row and the EML metadata, then hands over to the parent initializer
# (implicit +super+ forwards +opts+ unchanged).
#
# @param opts [Hash] options passed through to the parent class
def initialize(opts = {})
  @command = "open-tree"
  @title = "Open Tree of Life Reference Taxonomy"
  @uuid = "e10865e2-cdd9-4f97-912f-08f3d5ef49f7"
  # Must be assigned before @eml is built: the metadata embeds @url, and it
  # used to be read while still nil.
  @url = "http://opendata.globalnames.org/id-crossmap/ott3.0.tgz"
  @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                             "opentree", "data.tar.gz")
  @data = []
  @extensions = []
  @count = 1
  @clades = {}
  # Darwin Core terms live in the dwc namespace and identifier terms end in
  # an upper-case "ID"; the globalnames.org terms are project-specific.
  @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://globalnames.org/terms/localID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
            "http://rs.tdwg.org/dwc/terms/taxonRank",
            "http://globalnames.org/ottCrossMaps",
            "http://globalnames.org/ottNotes"]]
  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ url: "https://tree.opentreeoflife.org" }],
    # Every fragment but the last ends with a space so that adjacent literals
    # do not run words together.
    abstract: "Open Tree of Life aims to construct a comprehensive, " \
              "dynamic and digitally-available tree of life by " \
              "synthesizing published phylogenetic trees along with " \
              "taxonomic data. The project is a collaborative effort " \
              "between 11 PIs across 10 institutions.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    url: @url
  }
  super
end
|
42
|
+
|
43
|
+
# Delegates to +unpack_tar+, but only when @needs_unpack is truthy.
def unpack
  return unless @needs_unpack

  unpack_tar
end
|
46
|
+
|
47
|
+
# Logs the start of extraction, collects the taxonomy data and then writes
# the archive.
def make_dwca
  DwcaHunter.logger_write(object_id, "Extracting data")
  collect_data
  generate_dwca
end
|
52
|
+
|
53
|
+
# Fetches the archive from +url+ into @download_path using
# DwcaHunter::Downloader. Does nothing when @needs_download is falsy.
def download
  if @needs_download
    DwcaHunter.logger_write(object_id, "Downloading file -- " \
                                       "it will take some time...")
    DwcaHunter::Downloader.new(url, @download_path).download
  end
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
# Resolves the input file paths, then loads the taxonomy rows.
def collect_data
  set_vars
  classification
end
|
67
|
+
|
68
|
+
# Stores the paths of taxonomy.tsv and synonyms.tsv, both expected inside the
# "ott" folder of @download_dir.
def set_vars
  ott_dir = File.join(@download_dir, "ott")
  @taxonomy = File.join(ott_dir, "taxonomy.tsv")
  @synonyms = File.join(ott_dir, "synonyms.tsv")
end
|
72
|
+
|
73
|
+
# Loads every line of the file at @taxonomy into @classification as an array
# of "|"-separated, whitespace-stripped fields. Also resets @names to an
# empty hash.
#
# NOTE(review): every line is kept, including the first one; if the file
# starts with a header line it ends up in the data -- confirm against the
# actual download.
def classification
  @classification = []
  @names = {}
  DwcaHunter.logger_write(object_id, "Building classification")
  # File.foreach closes the file when iteration finishes; a bare `open`
  # left the handle dangling until garbage collection.
  File.foreach(@taxonomy).with_index(1) do |line, line_number|
    if (line_number % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversed #{line_number} taxonomy lines")
    end
    @classification << line.split("|").map(&:strip)
  end
end
|
85
|
+
|
86
|
+
# Logs the start of archive creation, fills the core table and the synonym
# extension, then calls the parent implementation.
def generate_dwca
  ["Creating DarwinCore Archive file", "Assembling Core Data"].each do |msg|
    DwcaHunter.logger_write(object_id, msg)
  end
  generate_core
  generate_synonyms
  super
end
|
93
|
+
|
94
|
+
# Appends one core row per entry of @classification to @core.
#
# The columns of a classification entry are reordered to match the header
# row set up in +initialize+: column 0 fills both the taxon id and the local
# id, column 2 the scientific name, column 1 the parent id, column 3 the
# rank, and columns 4 and 5 the two globalnames.org fields.
#
# @count is the 1-based number of the record being processed; progress is
# logged whenever it is a multiple of BATCH_SIZE.
def generate_core
  @classification.each do |d|
    if (@count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id, "Traversing #{@count} core " \
                                         "data record")
    end
    @core << [d[0], d[0], d[2], d[1], d[3], d[4], d[5]]
    # The counter was never advanced before, so the progress message above
    # could not be reached.
    @count += 1
  end
end
|
103
|
+
|
104
|
+
# Source of synonym entries for generate_synonyms. Currently always an
# empty list.
def synonyms
  Array.new(0)
end
|
107
|
+
|
108
|
+
# Registers the synonyms.txt extension (header row of term URIs) and adds one
# row per entry returned by +synonyms+.
#
# Each entry is read as a Hash with the keys :id (taxon the synonym belongs
# to), :scientificName and :taxonomicStatus.
# NOTE(review): +synonyms+ is currently a stub returning [], so the :id key
# is the contract assumed here -- confirm once the stub is implemented.
def generate_synonyms
  synonym_ext = {
    data: [["http://rs.tdwg.org/dwc/terms/taxonId",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
    file_name: "synonyms.txt"
  }
  @extensions << synonym_ext

  synonyms.each do |synonym|
    # The row used to read the id from an undefined local `d`, which raised
    # NameError for any non-empty synonym list.
    synonym_ext[:data] << [synonym[:id], synonym[:scientificName],
                           synonym[:taxonomicStatus]]
  end
end
|
120
|
+
end
|
121
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'biodiversity'
|
3
|
+
require 'csv'
|
4
|
+
|
5
|
+
module DwcaHunter
|
6
|
+
class ResourceReptilesChecklist < DwcaHunter::Resource
|
7
|
+
# Sets the static configuration of the Reptile Database resource and its
# empty containers, then hands over to the parent initializer (implicit
# +super+ forwards +opts+ unchanged).
#
# @param opts [Hash] options passed through to the parent class
def initialize(opts = {})
  @command = "reptile-database"
  @title = "The Reptile Database"
  @uuid = "c24e0905-4980-4e1d-aff2-ee0ef54ea1f8"
  # This class overrides +download+ with a no-op and reads a CSV bundled
  # with the gem, so the archive name below is only a placeholder.
  path_parts = [Dir.tmpdir, 'dwca_hunter', 'reptilesdb', 'fake.tar.gz']
  @download_path = File.join(*path_parts)
  @data = []
  @extensions = []
  super
end
|
17
|
+
|
18
|
+
# Always false for this resource.
#
# @return [Boolean]
def needs_unpack?
  false
end
|
21
|
+
|
22
|
+
# Intentionally empty: organize_data reads a CSV file that ships with the
# gem, so nothing is fetched here.
def download
  nil
end
|
24
|
+
|
25
|
+
# Parses the bundled checklist into @data and then writes the archive.
def make_dwca
  organize_data
  generate_dwca
end
|
29
|
+
|
30
|
+
private
|
31
|
+
# Parses files/reptile_checklist_2014_12.csv (two directories above this
# file) into @data, one Hash per CSV row:
#
#   :species     -- normalized form of columns 0 and 1 joined with a space
#   :subspecies  -- normalized names, one per line of column 2
#   :vernaculars -- { name:, lang: } hashes from column 3 (see below)
#   :family      -- leading run of ASCII letters of column 4; the key is
#                   absent when the column is empty
#
# Column 3 holds one language group per line. A leading "E: " is dropped and
# the group is tagged "en"; a leading "G: " is dropped and the group is
# tagged "de". Names inside a group are comma-separated.
def organize_data
  DwcaHunter::logger_write(self.object_id,
                           "Organizing data")
  path = File.join(__dir__, "..",
                   "..", "files", "reptile_checklist_2014_12.csv")
  snp = ScientificNameParser.new
  # CSV.foreach closes the file once every row has been read; CSV.open
  # without a block left the handle open.
  @data = CSV.foreach(path).map do |row|
    record = {
      species: snp.parse(row[0..1].join(" "))[:scientificName][:normalized],
      subspecies: [],
      vernaculars: []
    }
    row[2].to_s.split("\n").each do |ssp|
      record[:subspecies] << snp.parse(ssp)[:scientificName][:normalized]
    end
    row[3].to_s.split("\n").each do |group|
      group = group.sub(/^E: /, '')
      # sub! returns nil when nothing was replaced, i.e. no "G: " prefix.
      lang = group.sub!(/^G: /, '') ? "de" : "en"
      group.split(",").each do |vernacular|
        record[:vernaculars] << { name: vernacular.strip, lang: lang }
      end
    end
    # String#[] with a regexp yields nil when the cell does not start with a
    # letter, where match(...)[0] raised NoMethodError.
    record[:family] = row[4][/^[A-Za-z]+/] if row[4]
    record
  end
end
|
67
|
+
|
68
|
+
# Builds the core table, the vernacular-name extension and the EML metadata
# from @data, then calls the parent implementation.
#
# The core is a four-level tree: one "Reptilia" class row, one row per
# distinct family (parent: the class), one row per species (parent: its
# family) and one row per subspecies (parent: its species). Vernacular names
# are attached to the species row.
#
# Ids are taken from a running counter. The counter is also advanced once at
# the start of every record, so ids are unique but not contiguous; this
# numbering is kept as is so that generated ids do not change.
def generate_dwca
  DwcaHunter::logger_write(self.object_id,
                           "Creating DarwinCore Archive file")
  @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
            'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
            'http://rs.tdwg.org/dwc/terms/scientificName',
            'http://rs.tdwg.org/dwc/terms/taxonRank']]
  @extensions << { data: [['http://rs.tdwg.org/dwc/terms/taxonID',
                           'http://rs.tdwg.org/dwc/terms/vernacularName',
                           'http://purl.org/dc/terms/language']],
                   file_name: 'vernacular_names.txt',
                   row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }
  families = {}
  count = 1
  class_id = count
  @core << [count, nil, "Reptilia", "class"]
  # Plain each: the index that each_with_index supplied was never used.
  @data.each do |record|
    count += 1
    family_id = families[record[:family]]
    unless family_id
      # First species of this family: create the family row.
      count += 1
      family_id = count
      families[record[:family]] = family_id
      @core << [family_id, class_id, record[:family], "family"]
    end
    count += 1
    species_id = count
    @core << [species_id, family_id, record[:species], "species"]
    record[:vernaculars].each do |v|
      @extensions[0][:data] << [species_id, v[:name], v[:lang]]
    end
    record[:subspecies].each do |ssp|
      count += 1
      @core << [count, species_id, ssp, "subspecies"]
    end
  end
  @eml = {
    id: @uuid,
    title: @title,
    authors: [
      {
        first_name: "Peter",
        last_name: "Uetz",
        # Was "info@reptile-database_org", which is not a valid address.
        email: "info@reptile-database.org"
      },
      {
        first_name: "Jiri",
        last_name: "Hosek",
        email: "jiri.hosek@reptarium.cz"
      }
    ],
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@gmail.com' }
    ],
    abstract: "This database provides a catalogue of all living reptile "\
              "species and their classification. The database covers "\
              "all living snakes, lizards, turtles, amphisbaenians, "\
              "tuataras, and crocodiles. Currently there are about "\
              "9,500 species including another 2,800 subspecies "\
              "(statistics). The database focuses on taxonomic data, "\
              "i.e. names and synonyms, distribution and type data "\
              "and literature references.",
    url: "http://www.reptile-database.org"
  }
  super
end
|
138
|
+
end
|
139
|
+
end
|