dwca_hunter 0.5.3 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +11 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +90 -84
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +13 -12
- data/exe/dwcahunter +1 -5
- data/lib/dwca_hunter.rb +33 -0
- data/lib/dwca_hunter/resource.rb +8 -3
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +115 -149
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ion.rb +98 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +186 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +166 -184
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +54 -32
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -0,0 +1,131 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Harvester for Index Fungorum (Species Fungorum): downloads a cached CSV
  # export and converts it into a DarwinCore Archive.
  #
  # NOTE(review): the class is called ResourceAOS although everything in it
  # concerns Index Fungorum. If another resource file defines the same
  # constant, Ruby would merge the two definitions. The name is kept here
  # because callers may reference it -- confirm and rename in a coordinated
  # change.
  class ResourceAOS < DwcaHunter::Resource
    # @param opts [Hash] options forwarded unchanged to DwcaHunter::Resource
    def initialize(opts = {})
      @command = "index-fungorum"
      @title = "Index Fungorum (Species Fungorum)"
      @url = "https://uofi.box.com/shared/static/54l3b7h4q4pwqq4fgqvx42h3d328fl1c.csv"
      # Stored as @uuid because generate_dwca reads @uuid for the EML id; the
      # previous @UUID spelling left the id nil.
      @uuid = "af06816a-0b28-4a09-8219-bd1d63289858"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "index-fungorum",
                                 "data.csv")
      @synonyms = []
      @names = []
      @extensions = []
      @synonyms_hash = {}
      super(opts)
    end

    # Fetches the CSV from @url into @download_path using curl.
    def download
      puts "Downloading csv from remote"
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # The download is a plain CSV file, so there is nothing to unpack.
    def unpack; end

    # Reads the downloaded CSV and writes the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches into the download directory and loads all names.
    # @download_dir is not assigned in this class; presumably the parent
    # initializer derives it from @download_path -- confirm.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Appends one hash per CSV row to @names. CSV.foreach closes the file
    # when iteration finishes, so no handle is left open.
    def collect_names
      @names_index = {}
      csv_path = File.join(@download_dir, "data.csv")
      CSV.foreach(csv_path, headers: true) do |row|
        @names << {
          taxon_id: row["RECORD NUMBER"],
          name_string: scientific_name(row),
          current_id: row["CURRENT NAME RECORD NUMBER"],
          kingdom: row["Kingdom name"],
          phylum: row["Phylum name"],
          klass: row["Class name"],
          order: row["Order name"],
          family: row["Family name"],
          code: "ICN"
        }
      end
    end

    # Joins name, authors and year with single spaces, skipping parts that
    # are missing or blank so the result has no stray whitespace.
    def scientific_name(row)
      parts = [row["NAME OF FUNGUS"], row["AUTHORS"],
               row["YEAR OF PUBLICATION"]]
      parts.map { |part| part.to_s.strip }.reject(&:empty?).join(" ")
    end

    # Builds the core table and EML metadata, then lets the parent class
    # produce the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      header = ["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/kingdom",
                "http://rs.tdwg.org/dwc/terms/phylum",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]
      # Column order of each row must match the header above.
      rows = @names.map do |n|
        [n[:taxon_id], n[:name_string], n[:current_id],
         n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
         n[:code]]
      end
      @core = [header] + rows

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Paul",
            last_name: "Kirk" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "The Index Fungorum, the global fungal nomenclator " \
          "coordinated and supported by the Index Fungorum Partnership, " \
          "contains names of fungi (including yeasts, lichens, chromistan " \
          "fungal analogues, protozoan fungal analogues and fossil forms) " \
          "at all ranks.\n\n" \
          "As a result of changes to the ICN (previously ICBN) relating to " \
          "registration of names and following the lead taken by MycoBank, " \
          "Index Fungorum now provides a mechanism to register names of " \
          "new taxa, new names, new combinations and new typifications — no " \
          "login is required. Names registered at Index Fungorum can be " \
          "published immediately through the Index Fungorum e-Publication " \
          "facility — an authorized login is required for this.\n\n" \
          "Species Fungorum is currently an RBG Kew coordinated initiative " \
          "to compile a global checklist of the fungi. You may search " \
          "systematically defined and taxonomically complete datasets - " \
          "global species databases - or the entire Species Fungorum. " \
          "Species Fungorum contributes the fungal component to the Species " \
          "2000 project and, in partnership with ITIS, to the Catalogue " \
          "of Life (currently used in the GBIF and EoL portal); for more " \
          "information regarding these global initiative visit their " \
          "websites. Please contact Paul Kirk if you you would like to " \
          "contribute to Species Fungorum.",
        url: @url
      }
      super
    end
  end
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Harvester for the IOC World Bird List: downloads a cached CSV conversion
  # of the master list and turns it into a DarwinCore Archive with a
  # vernacular-names extension.
  class ResourceIOCWorldBird < DwcaHunter::Resource
    # @param opts [Hash] options forwarded unchanged to DwcaHunter::Resource
    def initialize(opts = {})
      @command = 'ioc-world-bird'
      @title = 'IOC World Bird List'
      @url = 'https://uofi.box.com/shared/static/znsd734a78saq87hes979p5uspgkzy93.csv'
      # Stored as @uuid because generate_dwca reads @uuid for the EML id; the
      # previous @UUID spelling left the id nil.
      @uuid = '6421ffec-38e3-40fb-a6d9-af27238a47a1'
      @download_path = File.join(Dir.tmpdir,
                                 'dwca_hunter',
                                 'ioc-bird',
                                 'data.csv')
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the cached CSV from @url into @download_path using curl and
    # reminds the operator where a newer master list may be found.
    def download
      puts 'Downloading cached and converted to csv version.'
      puts 'CHECK FOR NEW VERSION at'
      puts 'https://www.worldbirdnames.org/ioc-lists/master-list-2/'
      puts 'Use libreoffice to convert to csv.'
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # The download is a plain CSV file, so there is nothing to unpack.
    def unpack; end

    # Reads the downloaded CSV and writes the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, 'Extracting data')
      get_names
      generate_dwca
    end

    private

    # Switches into the download directory and loads all names.
    # @download_dir is not assigned in this class; presumably the parent
    # initializer derives it from @download_path -- confirm.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Walks the CSV top to bottom, remembering the most recent order, family,
    # genus and species seen, and appends a name hash to @names for each row
    # that yields a species or subspecies. Species rows with an English name
    # also add an entry to @vernaculars. CSV.foreach closes the file when
    # iteration finishes.
    def collect_names
      @names_index = {}
      order = ''
      family = ''
      genus = ''
      species = ''
      count = 0
      CSV.foreach(File.join(@download_dir, 'data.csv'), headers: true) do |row|
        # Blank cells keep the value carried over from an earlier row.
        order_cell = row['Order']
        order = order_cell.capitalize if order_cell.to_s != ''

        family_cell = row['Family (Scientific)']
        family = family_cell.capitalize if family_cell.to_s != ''

        genus_cell = row['Genus']
        genus = genus_cell.capitalize if genus_cell.to_s != ''

        species_cell = row['Species (Scientific)']
        species = species_cell if species_cell.to_s != ''

        subspecies = row['Subspecies']
        next if species.to_s == ''

        count += 1
        taxon_id = "gn_#{count}"
        name = {
          taxon_id: taxon_id,
          kingdom: 'Animalia',
          phylum: 'Chordata',
          klass: 'Aves',
          order: order,
          family: family,
          genus: genus,
          code: 'ICZN'
        }
        if subspecies.to_s == ''
          auth = row['Authority'].to_s
          auth = DwcaHunter.normalize_authors(auth) if auth != ''
          name[:name_string] = clean("#{genus} #{species} #{auth}".strip)
          @names << name
          vernacular = row['Species (English)']
          if vernacular.to_s != ''
            @vernaculars << { taxon_id: taxon_id, vern: vernacular,
                              lang: 'en' }
          end
        else
          # NOTE(review): the authority is used verbatim here, whereas the
          # species branch passes it through DwcaHunter.normalize_authors --
          # confirm whether the difference is intended.
          name[:name_string] = clean(
            "#{genus} #{species} #{subspecies} #{row['Authority']}".strip
          )
          @names << name
        end
        # The species epithet is consumed by the row that emitted a name.
        species = ''
      end
    end

    # Removes dagger marks and collapses every whitespace run to one space.
    def clean(n)
      n.delete('†').gsub(/\s+/, ' ')
    end

    # Builds the core table, the vernacular extension and EML metadata, then
    # lets the parent class produce the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              'Creating DarwinCore Archive file')
      core_header = ['http://rs.tdwg.org/dwc/terms/taxonID',
                     'http://rs.tdwg.org/dwc/terms/scientificName',
                     'http://rs.tdwg.org/dwc/terms/kingdom',
                     'http://rs.tdwg.org/dwc/terms/phylum',
                     'http://rs.tdwg.org/dwc/terms/class',
                     'http://rs.tdwg.org/dwc/terms/order',
                     'http://rs.tdwg.org/dwc/terms/family',
                     'http://rs.tdwg.org/dwc/terms/genus',
                     'http://rs.tdwg.org/dwc/terms/nomenclaturalCode']
      # Column order of each row must match the header above.
      core_rows = @names.map do |n|
        [n[:taxon_id], n[:name_string],
         n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
         n[:genus], n[:code]]
      end
      @core = [core_header] + core_rows

      vernacular_header = [
        'http://rs.tdwg.org/dwc/terms/taxonID',
        'http://rs.tdwg.org/dwc/terms/vernacularName',
        'http://purl.org/dc/terms/language'
      ]
      vernacular_rows = @vernaculars.map do |v|
        [v[:taxon_id], v[:vern], v[:lang]]
      end
      @extensions << {
        data: [vernacular_header] + vernacular_rows,
        file_name: 'vernacular_names.txt',
        row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
      }

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: 'Per', last_name: 'Alstrom' },
          { first_name: 'Mike', last_name: 'Blair' },
          { first_name: 'Rauri', last_name: 'Bowie' },
          { first_name: 'Nigel', last_name: 'Redman' },
          { first_name: 'Jon', last_name: 'Fjeldsa' },
          { first_name: 'Phil', last_name: 'Gregory' },
          { first_name: 'Leo', last_name: 'Joseph' },
          { first_name: 'Peter', last_name: 'Kovalik' },
          { first_name: 'Adolfo', last_name: 'Navarro-Siguenza' },
          { first_name: 'David', last_name: 'Parkin' },
          { first_name: 'Alan', last_name: 'Peterson' },
          { first_name: 'Douglas', last_name: 'Pratt' },
          { first_name: 'Pam', last_name: 'Rasmussen' },
          { first_name: 'Frank', last_name: 'Rheindt' },
          { first_name: 'Robert', last_name: 'Ridgely' },
          { first_name: 'Peter', last_name: 'Ryan' },
          { first_name: 'George', last_name: 'Sangster' },
          { first_name: 'Dick', last_name: 'Schodde' },
          { first_name: 'Minturn', last_name: 'Wright' }
        ],
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@gmail.com' }
        ],
        abstract: 'The IOC World Bird List is an open access resource of ' \
          'the international community of ornithologists.',
        url: 'https://www.worldbirdnames.org'
      }
      super
    end
  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Harvester for the Index to Organism Names (ION): downloads a cached
  # tarball, reads the TSV inside it and produces a DarwinCore Archive.
  class ResourceION < DwcaHunter::Resource
    # @param opts [Hash] options forwarded unchanged to DwcaHunter::Resource
    def initialize(opts = {})
      @command = "ion"
      @title = "Index to Organism Names"
      @url = "https://uofi.box.com/shared/static/tklh8i6q2kb33g6ki33k6s3is06lo9np.gz"
      # Stored as @uuid because generate_dwca reads @uuid for the EML id; the
      # previous @UUID spelling left the id nil.
      @uuid = "1137dfa3-5b8c-487d-b497-dc0938605864"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "ion",
                                 "data.tar.gz")
      @names = []
      @extensions = []
      super(opts)
    end

    # Fetches the cached archive from @url into @download_path using curl.
    def download
      puts "Downloading cached verion of the file. Ask Rod Page to make new."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Delegates to unpack_tar, which is defined outside this class.
    def unpack
      unpack_tar
    end

    # Reads the unpacked TSV and writes the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches into the download directory and loads all names.
    # @download_dir is not assigned in this class; presumably the parent
    # initializer derives it from @download_path -- confirm.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Appends one hash per TSV row to @names and prints progress every
    # 10,000 rows. The block form of CSV.open closes the file on exit.
    def collect_names
      tsv_path = File.join(@download_dir, "ion.tsv")
      # NOTE(review): quote_char is a Cyrillic letter, presumably chosen
      # because it never occurs in the data so that ordinary double quotes
      # are read as plain text -- confirm.
      CSV.open(tsv_path, headers: true, col_sep: "\t",
                         quote_char: "щ") do |tsv|
        tsv.each_with_index do |row, i|
          @names << { taxon_id: row["id"],
                      name_string: row["nameComplete"],
                      auth: row["taxonAuthor"] }

          puts format("Processed %s names", i) if (i % 10_000).zero?
        end
      end
    end

    # Builds the core table and EML metadata, then lets the parent class
    # produce the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      header = ["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/scientificNameAuthorship"]
      rows = @names.map { |n| [n[:taxon_id], n[:name_string], n[:auth]] }
      @core = [header] + rows

      # NOTE(review): the abstract reads "approximately ,000 international
      # journals"; a leading digit appears to be missing -- confirm against
      # the publisher's text before correcting.
      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Nigel",
            last_name: "Robinson",
            email: "nigel.robinson@thomsonreuters.com" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "ION contains millions of animal names, both fossil and " \
          "recent, at all taxonomic ranks, reported from the scientific " \
          "literature. (Bacteria, plant and virus names will be added soon)." \
          "\n\n" \
          "These names are derived from premier Clarivate databases: " \
          "Zoological Record®, BIOSIS Previews®, and Biological Abstracts®. " \
          "All names are tied to at least one published article. Together, " \
          "these resources cover every aspect of the life sciences - " \
          "providing names from over 30 million scientific records, " \
          "including approximately ,000 international journals, patents, " \
          "books, and conference proceedings. They provide a powerful " \
          "foundation for the most complete collection of organism names " \
          "available today.",
        url: @url
      }
      super
    end
  end
end
|
@@ -8,7 +8,7 @@ module DwcaHunter
|
|
8
8
|
@command = "ipni"
|
9
9
|
@title = "The International Plant Names Index"
|
10
10
|
@abbr = "IPNI"
|
11
|
-
@url = "https://
|
11
|
+
@url = "https://uofi.box.com/shared/static/s0x4xjonxt54pi89n543gdmttrdqd6iv.xz"
|
12
12
|
@uuid = "6b3905ce-5025-49f3-9697-ddd5bdfb4ff0"
|
13
13
|
@download_path = File.join(Dir.tmpdir, "dwca_hunter", "ipni",
|
14
14
|
"ipni.csv.xz")
|
@@ -22,8 +22,9 @@ module DwcaHunter
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def download
|
25
|
-
puts "
|
25
|
+
puts "Download by hand from"
|
26
26
|
puts "https://storage.cloud.google.com/ipni-data/ipniWebName.csv.xz"
|
27
|
+
puts "and copy to given url"
|
27
28
|
`curl -s -L #{@url} -o #{@download_path}`
|
28
29
|
end
|
29
30
|
|