dwca_hunter 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.byebug_history +31 -0
- data/.document +5 -0
- data/.gitignore +58 -0
- data/.rspec +3 -0
- data/.rubocop.yml +33 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +15 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +133 -0
- data/LICENSE.txt +20 -0
- data/README.md +39 -0
- data/Rakefile +11 -0
- data/dwca_hunter.gemspec +42 -0
- data/exe/dwcahunter +77 -0
- data/files/birdlife_7.csv +11862 -0
- data/files/fishbase_taxon_cache.tsv +81000 -0
- data/files/reptile_checklist_2014_12.csv +15158 -0
- data/lib/dwca_hunter/downloader.rb +60 -0
- data/lib/dwca_hunter/encoding.rb +17 -0
- data/lib/dwca_hunter/resource.rb +101 -0
- data/lib/dwca_hunter/resources/arctos.rb +222 -0
- data/lib/dwca_hunter/resources/birdlife.rb +160 -0
- data/lib/dwca_hunter/resources/fishbase.rb +99 -0
- data/lib/dwca_hunter/resources/freebase.rb +152 -0
- data/lib/dwca_hunter/resources/gnub.rb +101 -0
- data/lib/dwca_hunter/resources/itis.rb +271 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +179 -0
- data/lib/dwca_hunter/resources/ncbi.rb +174 -0
- data/lib/dwca_hunter/resources/opentree.rb +121 -0
- data/lib/dwca_hunter/resources/reptiles_checklist.rb +139 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +350 -0
- data/lib/dwca_hunter/resources/worms.rb +176 -0
- data/lib/dwca_hunter/url.rb +33 -0
- data/lib/dwca_hunter/version.rb +7 -0
- data/lib/dwca_hunter/xml.rb +33 -0
- data/lib/dwca_hunter.rb +53 -0
- metadata +250 -0
@@ -0,0 +1,174 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module DwcaHunter
|
3
|
+
class ResourceNCBI < DwcaHunter::Resource
|
4
|
+
|
5
|
+
# Sets up the NCBI taxonomy resource: identifying metadata, the taxdump
# download location, and the empty containers that the harvesting steps
# fill in. +opts+ is passed on unchanged to the parent initializer
# (bare +super+ forwards the method's arguments).
def initialize(opts = {})
  # Resource identity.
  @command = 'ncbi'
  @title = 'NCBI'
  @uuid = '97d7633b-5f79-4307-a397-3c29402d9311'

  # Where the archive comes from and where it is stored locally.
  @url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
  @download_path = File.join(Dir.tmpdir, 'dwca_hunter', 'ncbi', 'data.tar.gz')

  # Name classes that are NOT treated as synonyms by get_classification.
  @collected_names = ['genbank common name', 'common name', 'valid']

  # Working state filled by get_names / get_classification / generate_dwca.
  @names = {}
  @data = []
  @core = []
  @extensions = []
  super
end
|
21
|
+
|
22
|
+
# Unpacks the downloaded archive by delegating to +unpack_tar+
# (not defined in this file).
def unpack
  unpack_tar
end
|
25
|
+
|
26
|
+
# Runs the conversion steps in order: resolve the dump file paths, read the
# names, read the node hierarchy, then assemble the DarwinCore Archive.
def make_dwca
  set_vars
  get_names
  get_classification
  generate_dwca
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
# Resolves the paths of the two dump files read by this resource,
# both located directly under @download_dir.
def set_vars
  @names_file, @nodes_file = %w[names.dmp nodes.dmp].map do |dump|
    File.join(@download_dir, dump)
  end
end
|
39
|
+
|
40
|
+
# Reads @names_file line by line and groups the names by taxon id and
# name class, filling @names as { id => { name_type => [name, ...] } }.
#
# Each line is split on "|" and every field is passed through +cleanup+
# (defined outside this file). Field 0 is the taxon id, field 1 the name
# and field 3 the name class; the class 'scientific name' is stored under
# the key 'valid'.
#
# NOTE(review): the `id == 1` and `respond_to?(:gsub)` checks only make
# sense if +cleanup+ turns purely numeric fields into Integers -- confirm
# against its definition.
def get_names
  DwcaHunter::logger_write(object_id, 'Collecting names...')
  # File.foreach closes the file when iteration ends; the encoding matches
  # the one get_classification uses for the sibling dump file.
  File.foreach(@names_file, encoding: 'utf-8').each_with_index do |line, i|
    if i > 0 && i % BATCH_SIZE == 0
      DwcaHunter::logger_write(object_id, 'Collected %s names...' % i)
    end
    fields = line.split('|').map { |field| cleanup(field) }
    id = fields[0]
    next if id == 1
    name = fields[1]
    name_type = fields[3]
    name_type = 'valid' if name_type == 'scientific name'
    # A missing or non-string name cannot be normalized: report and skip.
    unless name.respond_to?(:gsub)
      puts 'wrong name: %s' % name
      next
    end
    # Drop quotes wrapped around a word but keep the separator that
    # follows it (group 4); then collapse runs of whitespace.
    name = name.gsub(/(^|\s)('|")(.*?)\2(\s|-|$)/, '\1\3\4').
           gsub(/\s+/, ' ')
    ((@names[id] ||= {})[name_type] ||= []) << name
  end
end
|
65
|
+
|
66
|
+
# Reads @nodes_file and appends one record per taxon to @data.
#
# Each line is split on "|" and cleaned with +cleanup+ (defined outside
# this file); field 0 is the taxon id, field 1 the parent id and field 2
# the rank. A node is skipped when its id is 1 or when @names holds no
# 'valid' name for it. The rank "no rank" is stored as "" and a parent id
# of 1 is stored as nil.
#
# Vernacular names are every name whose class is listed in
# @collected_names, except 'valid'.
#
# NOTE(review): the previous version also built a list of synonyms from
# the remaining name classes but never stored it -- every record was
# written with `synonyms: []`. That output is preserved here and the dead
# computation removed; confirm whether synonyms were meant to be emitted.
def get_classification
  DwcaHunter.logger_write(object_id, "Building classification...")
  # File.foreach closes the file once iteration finishes.
  File.foreach(@nodes_file, encoding: "utf-8").each_with_index do |line, i|
    if i > 0 && i % BATCH_SIZE == 0
      DwcaHunter.logger_write(object_id, "Collected %s nodes..." % i)
    end
    id, parent_tax_id, rank = line.split("|").map { |field| cleanup(field) }
    next if id == 1

    taxon_names = @names[id]
    next unless taxon_names && taxon_names["valid"]

    rank = "" if rank == "no rank"
    parent_tax_id = nil if parent_tax_id == 1

    vernacular_names =
      taxon_names.each_with_object([]) do |(name_type, names), collected|
        next if name_type == "valid"
        collected.concat(names) if @collected_names.include?(name_type)
      end

    @data << {
      id: id,
      scientificName: taxon_names["valid"][0],
      parentNameUsageId: parent_tax_id,
      taxonRank: rank,
      taxonomicStatus: "valid",
      vernacularNames: vernacular_names,
      synonyms: []
    }
  end
end
|
106
|
+
|
107
|
+
# Builds the DarwinCore Archive inputs from @data and hands over to the
# parent implementation via +super+.
#
# Fills three instance variables:
# * @core       - header row plus one [id, name, parent id, rank] row per
#                 record in @data
# * @extensions - a vernacular-names table and a synonyms table, each a
#                 hash with :data (header row first) and :file_name
# * @eml        - dataset metadata
#
# NOTE(review): the core header mixes purl.org/dc/terms and
# rs.tdwg.org/dwc/terms URIs and spells "taxonId" differently from the
# vernacular header ("TaxonID"). The strings are left untouched because
# consumers may match on them -- confirm before normalizing.
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
  @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
            "http://purl.org/dc/terms/scientificName",
            "http://purl.org/dc/terms/parentNameUsageId",
            "http://purl.org/dc/terms/taxonRank"]]
  DwcaHunter.logger_write(object_id, "Assembling Core Data")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing #{count} core data record")
    end
    @core << d.values_at(:id, :scientificName, :parentNameUsageId,
                         :taxonRank)
  end

  # Keep direct references to the two tables so rows always land in the
  # extension created here, whatever @extensions already contains.
  vernacular_ext = {
    data: [["http://rs.tdwg.org/dwc/terms/TaxonID",
            "http://rs.tdwg.org/dwc/terms/vernacularName"]],
    file_name: "vernacular_names.txt"
  }
  synonym_ext = {
    data: [["http://rs.tdwg.org/dwc/terms/taxonId",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
    file_name: "synonyms.txt"
  }
  @extensions << vernacular_ext << synonym_ext

  DwcaHunter.logger_write(object_id, "Creating vernacular name " \
                                     "extension for DarwinCore Archive file")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing #{count} extension data record")
    end
    d[:vernacularNames].each do |vn|
      vernacular_ext[:data] << [d[:id], vn]
    end
    d[:synonyms].each do |synonym|
      synonym_ext[:data] << [d[:id],
                             synonym[:scientificName],
                             synonym[:taxonomicStatus]]
    end
  end

  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ url: "http://www.ncbi.org" }],
    abstract: "The National Center for Biotechnology Information " \
              "advances science and health by providing access to " \
              "biomedical and genomic information.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@mbl.edu" }
    ],
    url: @url
  }
  super
end
|
173
|
+
end
|
174
|
+
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
# Harvesting resource for Open Tree of Life
|
5
|
+
class ResourceOpenTree < DwcaHunter::Resource
|
6
|
+
# Sets up the Open Tree of Life resource: identifying metadata, download
# location, empty working containers, the core header row and the EML
# metadata. +opts+ is passed on unchanged to the parent initializer
# (bare +super+ forwards the method's arguments).
def initialize(opts = {})
  @command = "open-tree"
  @title = "Open Tree of Life Reference Taxonomy"
  @uuid = "e10865e2-cdd9-4f97-912f-08f3d5ef49f7"
  # @url must be assigned before @eml is built, because @eml copies it.
  @url = "http://opendata.globalnames.org/id-crossmap/ott3.0.tgz"
  @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                             "opentree", "data.tar.gz")
  @data = []
  @extensions = []
  @count = 1
  @clades = {}
  @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
            "http://globalnames.org/terms/localID",
            "http://purl.org/dc/terms/scientificName",
            "http://purl.org/dc/terms/parentNameUsageId",
            "http://purl.org/dc/terms/taxonRank",
            "http://globalnames.org/ottCrossMaps",
            "http://globalnames.org/ottNotes"]]
  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ url: "https://tree.opentreeoflife.org" }],
    # Each fragment ends with a space so the joined sentence reads
    # correctly.
    abstract: "Open Tree of Life aims to construct a comprehensive, " \
              "dynamic and digitally-available tree of life by " \
              "synthesizing published phylogenetic trees along with " \
              "taxonomic data. The project is a collaborative effort " \
              "between 11 PIs across 10 institutions.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    url: @url
  }
  super
end
|
42
|
+
|
43
|
+
# Unpacks the downloaded archive via +unpack_tar+ (not defined in this
# file), but only when @needs_unpack is truthy; otherwise returns nil.
def unpack
  return unless @needs_unpack

  unpack_tar
end
|
46
|
+
|
47
|
+
# Logs the start of extraction, reads the taxonomy into memory and then
# assembles the DarwinCore Archive.
def make_dwca
  DwcaHunter.logger_write(object_id, "Extracting data")
  collect_data
  generate_dwca
end
|
52
|
+
|
53
|
+
# Downloads the archive to @download_path through DwcaHunter::Downloader.
# Does nothing (and returns nil) unless @needs_download is truthy.
#
# NOTE(review): the source address comes from a +url+ method that is not
# defined in this file -- presumably a reader for @url; confirm.
def download
  return unless @needs_download

  DwcaHunter.logger_write(object_id,
                          "Downloading file -- it will take some time...")
  DwcaHunter::Downloader.new(url, @download_path).download
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
# Resolves the input file paths and then loads the taxonomy rows.
def collect_data
  set_vars
  classification
end
|
67
|
+
|
68
|
+
# Resolves the paths of the taxonomy and synonyms files, both expected in
# the "ott" directory under @download_dir.
def set_vars
  ott_dir = File.join(@download_dir, "ott")
  @taxonomy = File.join(ott_dir, "taxonomy.tsv")
  @synonyms = File.join(ott_dir, "synonyms.tsv")
end
|
72
|
+
|
73
|
+
# Loads the taxonomy file into @classification: one entry per line, each
# an array of the line's "|"-separated fields with surrounding whitespace
# stripped. Also resets @names to an empty hash. Progress is logged every
# BATCH_SIZE lines.
#
# NOTE(review): every line is kept, including the first one -- if the
# file starts with a column-header line it ends up as a data row; confirm
# against the actual file.
def classification
  @classification = []
  @names = {}
  DwcaHunter.logger_write(object_id, "Building classification")
  # File.foreach closes the file once iteration finishes.
  File.foreach(@taxonomy).with_index(1) do |line, line_no|
    if (line_no % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversed #{line_no} taxonomy lines")
    end
    @classification << line.split("|").map(&:strip)
  end
end
|
85
|
+
|
86
|
+
# Fills the core table and the synonyms extension, then lets the parent
# implementation (via +super+) take over.
def generate_dwca
  ["Creating DarwinCore Archive file", "Assembling Core Data"].each do |msg|
    DwcaHunter.logger_write(object_id, msg)
  end
  generate_core
  generate_synonyms
  super
end
|
93
|
+
|
94
|
+
# Appends one core row per taxonomy entry to @core.
#
# Fields of an entry are mapped by position: field 0 fills the first two
# columns, followed by fields 2, 1, 3, 4 and 5 -- matching the order of
# the header row set up in the initializer.
#
# Records are numbered starting from @count so that the progress message
# is emitted every BATCH_SIZE records; previously the counter was never
# advanced, so the check always saw the same value. @count itself is not
# modified.
def generate_core
  @classification.each.with_index(@count) do |fields, record_no|
    if (record_no % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id, "Traversing #{record_no} core " \
                                         "data record")
    end
    @core << fields.values_at(0, 0, 2, 1, 3, 4, 5)
  end
end
|
103
|
+
|
104
|
+
# Synonym records to be written by generate_synonyms. Always an empty
# list here, so the synonyms extension only ever contains its header row.
def synonyms
  []
end
|
107
|
+
|
108
|
+
# Appends the synonyms extension (header row plus one row per entry of
# +synonyms+) to @extensions.
#
# Rows are added to the extension created here rather than to
# @extensions.first, so they cannot end up in an unrelated table.
#
# NOTE(review): the previous version took the taxon id from an undefined
# local `d`, which would raise NameError as soon as +synonyms+ returned
# anything. Reading it from synonym[:id] is an assumption about the shape
# of a synonym record -- confirm once +synonyms+ is implemented.
def generate_synonyms
  extension = {
    data: [["http://rs.tdwg.org/dwc/terms/taxonId",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
    file_name: "synonyms.txt"
  }
  @extensions << extension

  synonyms.each do |synonym|
    extension[:data] << [synonym[:id], synonym[:scientificName],
                         synonym[:taxonomicStatus]]
  end
end
|
120
|
+
end
|
121
|
+
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'biodiversity'
|
3
|
+
require 'csv'
|
4
|
+
|
5
|
+
module DwcaHunter
|
6
|
+
class ResourceReptilesChecklist < DwcaHunter::Resource
|
7
|
+
# Sets up the Reptile Database resource: identifying metadata, empty
# working containers and a download path. +opts+ is passed on unchanged
# to the parent initializer (bare +super+ forwards the method's
# arguments).
def initialize(opts = {})
  @command = "reptile-database"
  @title = "The Reptile Database"
  @uuid = "c24e0905-4980-4e1d-aff2-ee0ef54ea1f8"
  # This class's own download is a no-op; the archive name below is a
  # placeholder.
  @download_path = File.join(Dir.tmpdir, 'dwca_hunter', 'reptilesdb',
                             'fake.tar.gz')
  @data = []
  @extensions = []
  super
end
|
17
|
+
|
18
|
+
# Always false for this resource.
def needs_unpack?
  false
end
|
21
|
+
|
22
|
+
# No-op: this resource fetches nothing; organize_data reads a CSV file
# located relative to this source file instead.
def download; end
|
24
|
+
|
25
|
+
# Parses the checklist into @data, then assembles the DarwinCore Archive.
def make_dwca
  organize_data
  generate_dwca
end
|
29
|
+
|
30
|
+
private
|
31
|
+
# Parses the reptile checklist CSV (located relative to this source file)
# into @data, one hash per CSV row:
#
# * :species     - normalized name parsed from columns 0 and 1 joined by
#                  a space
# * :subspecies  - normalized names, one per line of column 2
# * :vernaculars - { name:, lang: } hashes from column 3; each line is a
#                  comma-separated list, "en" by default and "de" when
#                  the line carries a "G: " prefix (an "E: " prefix is
#                  simply removed)
# * :family      - leading run of letters of column 4; the key is only
#                  present when column 4 has a value
#
# Names are normalized with ScientificNameParser (from the 'biodiversity'
# gem required at the top of the file).
def organize_data
  DwcaHunter::logger_write(self.object_id,
                           "Organizing data")
  path = File.join(__dir__, "..",
                   "..", "files", "reptile_checklist_2014_12.csv")
  snp = ScientificNameParser.new
  normalize = lambda do |name_string|
    snp.parse(name_string)[:scientificName][:normalized]
  end

  # CSV.foreach closes the file when iteration ends.
  @data = CSV.foreach(path).map do |row|
    res = {}
    res[:species] = normalize.call(row[0..1].join(" "))
    res[:subspecies] = (row[2] || "").split("\n").map do |ssp|
      normalize.call(ssp)
    end
    res[:vernaculars] = (row[3] || "").split("\n").flat_map do |entry|
      # Work on copies so the CSV fields themselves are not modified.
      # Entries contain no newline, so a prefix test is equivalent to the
      # line-anchored pattern used before.
      lang = "en"
      text = entry.sub(/\AE: /, "")
      if text.start_with?("G: ")
        lang = "de"
        text = text.sub(/\AG: /, "")
      end
      text.split(",").map do |vernacular|
        { name: vernacular.strip, lang: lang }
      end
    end
    res[:family] = row[4].match(/^[A-Za-z]+/)[0] if row[4]
    res
  end
end
|
67
|
+
|
68
|
+
# Builds the DarwinCore Archive inputs from @data and hands over to the
# parent implementation via +super+.
#
# The core table gets a single "Reptilia" class row (id 1), one row per
# distinct family, one per species and one per subspecies. Ids come from
# a running counter, so they are unique but not contiguous. Vernacular
# names go to the first entry of @extensions, keyed by species id.
def generate_dwca
  DwcaHunter::logger_write(self.object_id,
                           "Creating DarwinCore Archive file")
  @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
            'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
            'http://rs.tdwg.org/dwc/terms/scientificName',
            'http://rs.tdwg.org/dwc/terms/taxonRank']]
  @extensions << {
    data: [['http://rs.tdwg.org/dwc/terms/taxonID',
            'http://rs.tdwg.org/dwc/terms/vernacularName',
            'http://purl.org/dc/terms/language']],
    file_name: 'vernacular_names.txt',
    row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
  }

  family_ids = {}
  count = 1
  class_id = count
  @core << [class_id, nil, "Reptilia", "class"]

  @data.each do |record|
    # The counter advances once per record before any id is handed out.
    count += 1

    family = record[:family]
    family_id = family_ids[family]
    if family_id.nil?
      count += 1
      family_id = count
      family_ids[family] = family_id
      @core << [family_id, class_id, family, "family"]
    end

    count += 1
    species_id = count
    @core << [species_id, family_id, record[:species], "species"]

    record[:vernaculars].each do |vernacular|
      @extensions[0][:data] << [species_id, vernacular[:name],
                                vernacular[:lang]]
    end

    record[:subspecies].each do |subspecies|
      count += 1
      @core << [count, species_id, subspecies, "subspecies"]
    end
  end

  @eml = {
    id: @uuid,
    title: @title,
    authors: [
      { first_name: "Peter",
        last_name: "Uetz",
        email: "info@reptile-database_org" },
      { first_name: "Jiri",
        last_name: "Hosek",
        email: "jiri.hosek@reptarium.cz" }
    ],
    metadata_providers: [
      { first_name: 'Dmitry',
        last_name: 'Mozzherin',
        email: 'dmozzherin@gmail.com' }
    ],
    abstract: "This database provides a catalogue of all living reptile " \
              "species and their classification. The database covers " \
              "all living snakes, lizards, turtles, amphisbaenians, " \
              "tuataras, and crocodiles. Currently there are about " \
              "9,500 species including another 2,800 subspecies " \
              "(statistics). The database focuses on taxonomic data, " \
              "i.e. names and synonyms, distribution and type data " \
              "and literature references.",
    url: "http://www.reptile-database.org"
  }
  super
end
|
138
|
+
end
|
139
|
+
end
|