dwca_hunter 0.5.2 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +59 -135
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +7 -8
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +39 -8
- data/lib/dwca_hunter/resource.rb +5 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +121 -145
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/eol.rb +85 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ion.rb +98 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +31 -40
- data/files/birdlife_7.csv +0 -11862
- data/files/fishbase_taxon_cache.tsv +0 -81000
- data/files/reptile_checklist_2014_12.csv +0 -15158
- data/files/species-black.txt +0 -251
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -0,0 +1,131 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
class ResourceAOS < DwcaHunter::Resource
|
5
|
+
# Configures the Index Fungorum resource (command name, title, source URL,
# dataset UUID, download location, empty collectors) and then hands the
# options to the parent initializer.
#
# NOTE(review): the enclosing class is declared as ResourceAOS although every
# setting here describes Index Fungorum — confirm the class name does not
# collide with the AOS birds resource.
#
# @param opts [Hash] options forwarded unchanged to the parent class
def initialize(opts = {})
  @command = "index-fungorum"
  @title = "Index Fungorum (Species Fungorum)"
  @url = "https://uofi.box.com/shared/static/54l3b7h4q4pwqq4fgqvx42h3d328fl1c.csv"
  # generate_dwca reads @uuid (lower case); the former @UUID spelling left the
  # EML id nil.
  @uuid = "af06816a-0b28-4a09-8219-bd1d63289858"
  @download_path = File.join(Dir.tmpdir, "dwca_hunter", "index-fungorum",
                             "data.csv")
  @synonyms = []
  @names = []
  @extensions = []
  @synonyms_hash = {}
  super(opts)
end
|
20
|
+
|
21
|
+
# Announces the download and fetches @url into @download_path by shelling
# out to curl (silent mode, following redirects).
def download
  puts "Downloading csv from remote"
  %x(curl -s -L #{@url} -o #{@download_path})
end
|
25
|
+
|
26
|
+
# Intentionally a no-op: the downloaded file is stored as data.csv and is read
# directly by collect_names.
def unpack
end
|
27
|
+
|
28
|
+
# Entry point for building the archive: logs the start of extraction, loads
# the names from the downloaded CSV, then generates the archive.
#
# @return whatever generate_dwca returns
def make_dwca
  message = "Extracting data"
  DwcaHunter.logger_write(object_id, message)
  get_names
  generate_dwca
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
# Switches the process working directory to @download_dir and then collects
# the names from the downloaded file.
#
# @return whatever collect_names returns
def get_names
  working_dir = @download_dir
  Dir.chdir(working_dir)
  collect_names
end
|
40
|
+
|
41
|
+
# Reads data.csv from @download_dir and appends one hash per CSV row to
# @names (taxon id, full name string, current-name id, higher classification
# and the nomenclatural code "ICN").
#
# The file is read with CSV.foreach so the handle is closed once iteration
# finishes (the previous CSV.open call was never closed).
def collect_names
  @names_index = {}
  csv_path = File.join(@download_dir, "data.csv")
  CSV.foreach(csv_path, headers: true) do |row|
    # Join name, authors and year, skipping blank parts so that rows without
    # authorship or year do not end with stray spaces.
    name_parts = [row["NAME OF FUNGUS"], row["AUTHORS"],
                  row["YEAR OF PUBLICATION"]]
    full_name = name_parts.map { |part| part.to_s.strip }
                          .reject(&:empty?)
                          .join(" ")

    @names << {
      taxon_id: row["RECORD NUMBER"],
      name_string: full_name,
      current_id: row["CURRENT NAME RECORD NUMBER"],
      kingdom: row["Kingdom name"],
      phylum: row["Phylum name"],
      klass: row["Class name"],
      order: row["Order name"],
      family: row["Family name"],
      code: "ICN"
    }
  end
end
|
73
|
+
|
74
|
+
# Builds the core table (@core: header row of Darwin Core term URIs followed
# by one row per collected name) and the EML metadata hash (@eml), then
# delegates to the parent implementation.
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")

  dwc_prefix = "http://rs.tdwg.org/dwc/terms/"
  header = %w[taxonID scientificName acceptedNameUsageID kingdom phylum
              class order family nomenclaturalCode].map do |term|
    dwc_prefix + term
  end
  # Hash keys in the same order as the header columns above.
  columns = %i[taxon_id name_string current_id kingdom phylum klass order
               family code]
  @core = [header] + @names.map { |name| name.values_at(*columns) }

  abstract =
    "The Index Fungorum, the global fungal nomenclator " \
    "coordinated and supported by the Index Fungorum Partnership, " \
    "contains names of fungi (including yeasts, lichens, chromistan " \
    "fungal analogues, protozoan fungal analogues and fossil forms) " \
    "at all ranks.\n\n" \
    "As a result of changes to the ICN (previously ICBN) relating to " \
    "registration of names and following the lead taken by MycoBank, " \
    "Index Fungorum now provides a mechanism to register names of " \
    "new taxa, new names, new combinations and new typifications — no " \
    "login is required. Names registered at Index Fungorum can be " \
    "published immediately through the Index Fungorum e-Publication " \
    "facility — an authorized login is required for this.\n\n" \
    "Species Fungorum is currently an RBG Kew coordinated initiative " \
    "to compile a global checklist of the fungi. You may search " \
    "systematically defined and taxonomically complete datasets - " \
    "global species databases - or the entire Species Fungorum. " \
    "Species Fungorum contributes the fungal component to the Species " \
    "2000 project and, in partnership with ITIS, to the Catalogue " \
    "of Life (currently used in the GBIF and EoL portal); for more " \
    "information regarding these global initiative visit their " \
    "websites. Please contact Paul Kirk if you you would like to " \
    "contribute to Species Fungorum."

  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ first_name: "Paul", last_name: "Kirk" }],
    metadata_providers: [
      { first_name: "Dmitry", last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    abstract: abstract,
    url: @url
  }
  super
end
|
130
|
+
end
|
131
|
+
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
class ResourceIOCWorldBird < DwcaHunter::Resource
|
5
|
+
# Configures the IOC World Bird List resource (command name, title, source
# URL, dataset UUID, download location, empty collectors) and then hands the
# options to the parent initializer.
#
# @param opts [Hash] options forwarded unchanged to the parent class
def initialize(opts = {})
  @command = "ioc-world-bird"
  @title = "IOC World Bird List"
  @url = "https://uofi.box.com/shared/static/fbpuk5ghh9083nbzjdeyqtoqsvdnro01.csv"
  # generate_dwca reads @uuid (lower case); the former @UUID spelling left the
  # EML id nil.
  @uuid = "6421ffec-38e3-40fb-a6d9-af27238a47a1"
  @download_path = File.join(Dir.tmpdir, "dwca_hunter", "ioc-bird",
                             "data.csv")
  @synonyms = []
  @names = []
  @vernaculars = []
  @extensions = []
  @synonyms_hash = {}
  @vernaculars_hash = {}
  super(opts)
end
|
22
|
+
|
23
|
+
# Prints a reminder to check for a newer upstream list, then fetches the
# cached CSV at @url into @download_path by shelling out to curl (silent mode,
# following redirects).
def download
  puts "Downloading cached and converted to csv version.",
       "CHECK FOR NEW VERSION at",
       "https://www.worldbirdnames.org/ioc-lists/master-list-2/",
       "Use libreoffice to convert to csv."
  %x(curl -s -L #{@url} -o #{@download_path})
end
|
30
|
+
|
31
|
+
# Intentionally a no-op: the downloaded file is stored as data.csv and is read
# directly by collect_names.
def unpack
end
|
32
|
+
|
33
|
+
# Entry point for building the archive: logs the start of extraction, loads
# names and vernaculars from the downloaded CSV, then generates the archive.
#
# @return whatever generate_dwca returns
def make_dwca
  message = "Extracting data"
  DwcaHunter.logger_write(object_id, message)
  get_names
  generate_dwca
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
# Switches the process working directory to @download_dir and then collects
# the names from the downloaded file.
#
# @return whatever collect_names returns
def get_names
  working_dir = @download_dir
  Dir.chdir(working_dir)
  collect_names
end
|
45
|
+
|
46
|
+
# Reads data.csv from @download_dir and fills @names (one hash per species or
# subspecies row) and @vernaculars (English names of species rows).
#
# Order, family and genus are remembered from the last row that supplied
# them and applied to the following rows. The species epithet is cleared
# after every emitted row, so a row is skipped unless it carries its own
# "Species (Scientific)" value.
#
# The file is read with CSV.foreach so the handle is closed once iteration
# finishes (the previous CSV.open call was never closed).
#
# NOTE(review): authorship is passed through DwcaHunter.normalize_authors only
# for species rows; subspecies rows keep the raw "Authority" value — confirm
# this difference is intended.
def collect_names
  @names_index = {}
  csv_path = File.join(@download_dir, "data.csv")
  order = ""
  family = ""
  genus = ""
  species = ""
  count = 0

  CSV.foreach(csv_path, headers: true) do |row|
    # Carry the higher ranks forward whenever the row provides a new value.
    order = row["Order"].capitalize unless row["Order"].to_s.empty?
    unless row["Family (Scientific)"].to_s.empty?
      family = row["Family (Scientific)"].capitalize
    end
    genus = row["Genus"].capitalize unless row["Genus"].to_s.empty?
    unless row["Species (Scientific)"].to_s.empty?
      species = row["Species (Scientific)"]
    end
    subspecies = row["Subspecies"].to_s
    next if species.to_s.empty?

    count += 1
    taxon_id = "gn_#{count}"
    name = {
      taxon_id: taxon_id,
      kingdom: "Animalia",
      phylum: "Chordata",
      klass: "Aves",
      order: order,
      family: family,
      genus: genus,
      code: "ICZN"
    }

    if subspecies.empty?
      auth = row["Authority"].to_s
      auth = DwcaHunter.normalize_authors(auth) unless auth.empty?
      name[:name_string] = clean("#{genus} #{species} #{auth}".strip)
      @names << name

      vernacular = row["Species (English)"]
      unless vernacular.to_s.empty?
        @vernaculars << { taxon_id: taxon_id, vern: vernacular, lang: "en" }
      end
    else
      name[:name_string] = clean(
        "#{genus} #{species} #{subspecies} #{row['Authority']}".strip
      )
      @names << name
    end

    # The epithet applies to the emitted row only.
    species = ""
  end
end
|
108
|
+
|
109
|
+
# Removes every dagger sign (†) from the name and collapses each run of
# whitespace into a single space.
#
# @param n [String] raw name string
# @return [String] a new, cleaned string
def clean(n)
  n.delete("†").gsub(/\s+/, " ")
end
|
113
|
+
|
114
|
+
# Builds the core table (@core), appends a vernacular-name extension to
# @extensions, fills the EML metadata hash (@eml) and then delegates to the
# parent implementation.
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")

  dwc_prefix = "http://rs.tdwg.org/dwc/terms/"
  header = %w[taxonID scientificName kingdom phylum class order family
              genus nomenclaturalCode].map { |term| dwc_prefix + term }
  # Hash keys in the same order as the header columns above.
  columns = %i[taxon_id name_string kingdom phylum klass order family genus
               code]
  @core = [header] + @names.map { |name| name.values_at(*columns) }

  vernacular_rows = [[
    "http://rs.tdwg.org/dwc/terms/taxonID",
    "http://rs.tdwg.org/dwc/terms/vernacularName",
    "http://purl.org/dc/terms/language"
  ]]
  @vernaculars.each do |vernacular|
    vernacular_rows << vernacular.values_at(:taxon_id, :vern, :lang)
  end
  @extensions << {
    data: vernacular_rows,
    file_name: "vernacular_names.txt",
    row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
  }

  # [first name, last name] pairs, in the order they appear in the metadata.
  author_names = [
    %w[Per Alstrom], %w[Mike Blair], %w[Rauri Bowie], %w[Nigel Redman],
    %w[Jon Fjeldsa], %w[Phil Gregory], %w[Leo Joseph], %w[Peter Kovalik],
    %w[Adolfo Navarro-Siguenza], %w[David Parkin], %w[Alan Peterson],
    %w[Douglas Pratt], %w[Pam Rasmussen], %w[Frank Rheindt],
    %w[Robert Ridgely], %w[Peter Ryan], %w[George Sangster],
    %w[Dick Schodde], %w[Minturn Wright]
  ]

  @eml = {
    id: @uuid,
    title: @title,
    authors: author_names.map do |first, last|
      { first_name: first, last_name: last }
    end,
    metadata_providers: [
      { first_name: "Dmitry", last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    abstract: "The IOC World Bird List is an open access resource of " \
              "the international community of ornithologists.",
    url: "https://www.worldbirdnames.org"
  }
  super
end
|
199
|
+
end
|
200
|
+
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
class ResourceION < DwcaHunter::Resource
|
5
|
+
# Configures the Index to Organism Names (ION) resource (command name, title,
# source URL, dataset UUID, download location, empty collectors) and then
# hands the options to the parent initializer.
#
# @param opts [Hash] options forwarded unchanged to the parent class
def initialize(opts = {})
  @command = "ion"
  @title = "Index to Organism Names"
  @url = "https://uofi.box.com/shared/static/tklh8i6q2kb33g6ki33k6s3is06lo9np.gz"
  # generate_dwca reads @uuid (lower case); the former @UUID spelling left the
  # EML id nil.
  @uuid = "1137dfa3-5b8c-487d-b497-dc0938605864"
  @download_path = File.join(Dir.tmpdir, "dwca_hunter", "ion", "data.tar.gz")
  @names = []
  @extensions = []
  super(opts)
end
|
18
|
+
|
19
|
+
# Announces that a cached copy is being used, then fetches @url into
# @download_path by shelling out to curl (silent mode, following redirects).
def download
  puts "Downloading cached verion of the file. Ask Rod Page to make new."
  %x(curl -s -L #{@url} -o #{@download_path})
end
|
23
|
+
|
24
|
+
# Unpacks the downloaded archive by delegating to unpack_tar, which is not
# defined in this class (presumably inherited from DwcaHunter::Resource).
#
# @return whatever unpack_tar returns
def unpack
  unpack_tar
end
|
27
|
+
|
28
|
+
# Entry point for building the archive: logs the start of extraction, loads
# the names from the unpacked TSV, then generates the archive.
#
# @return whatever generate_dwca returns
def make_dwca
  message = "Extracting data"
  DwcaHunter.logger_write(object_id, message)
  get_names
  generate_dwca
end
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
# Switches the process working directory to @download_dir and then collects
# the names from the unpacked file.
#
# @return whatever collect_names returns
def get_names
  working_dir = @download_dir
  Dir.chdir(working_dir)
  collect_names
end
|
40
|
+
|
41
|
+
# Reads ion.tsv from @download_dir and appends one hash per row to @names
# (:taxon_id, :name_string, :auth). A progress line is printed for row 0 and
# for every 10,000th row after it.
#
# The block form of CSV.open is used so the file handle is closed when reading
# finishes (the previous non-block call was never closed).
def collect_names
  tsv_path = File.join(@download_dir, "ion.tsv")
  # quote_char is set to a Cyrillic letter, presumably so that double quotes
  # inside the data are read as ordinary characters — confirm against the
  # data file.
  CSV.open(tsv_path, headers: true, col_sep: "\t", quote_char: "щ") do |tsv|
    tsv.each_with_index do |row, i|
      @names << { taxon_id: row["id"],
                  name_string: row["nameComplete"],
                  auth: row["taxonAuthor"] }

      puts format("Processed %s names", i) if (i % 10_000).zero?
    end
  end
end
|
56
|
+
|
57
|
+
# Builds the core table (@core: header row of Darwin Core term URIs followed
# by one row per collected name) and the EML metadata hash (@eml), then
# delegates to the parent implementation.
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")

  dwc_prefix = "http://rs.tdwg.org/dwc/terms/"
  header = %w[taxonID scientificName scientificNameAuthorship].map do |term|
    dwc_prefix + term
  end
  @core = [header] +
          @names.map { |name| name.values_at(:taxon_id, :name_string, :auth) }

  # NOTE(review): the figure in front of ",000 international journals" looks
  # like it is missing from the text below — confirm against the upstream
  # description before changing it.
  abstract =
    "ION contains millions of animal names, both fossil and " \
    "recent, at all taxonomic ranks, reported from the scientific " \
    "literature. (Bacteria, plant and virus names will be added soon)." \
    "\n\n" \
    "These names are derived from premier Clarivate databases: " \
    "Zoological Record®, BIOSIS Previews®, and Biological Abstracts®. " \
    "All names are tied to at least one published article. Together, " \
    "these resources cover every aspect of the life sciences - " \
    "providing names from over 30 million scientific records, " \
    "including approximately ,000 international journals, patents, " \
    "books, and conference proceedings. They provide a powerful " \
    "foundation for the most complete collection of organism names " \
    "available today."

  @eml = {
    id: @uuid,
    title: @title,
    authors: [
      { first_name: "Nigel", last_name: "Robinson",
        email: "nigel.robinson@thomsonreuters.com" }
    ],
    metadata_providers: [
      { first_name: "Dmitry", last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    abstract: abstract,
    url: @url
  }
  super
end
|
97
|
+
end
|
98
|
+
end
|
@@ -8,7 +8,7 @@ module DwcaHunter
|
|
8
8
|
@command = "ipni"
|
9
9
|
@title = "The International Plant Names Index"
|
10
10
|
@abbr = "IPNI"
|
11
|
-
@url = "https://
|
11
|
+
@url = "https://uofi.box.com/shared/static/s0x4xjonxt54pi89n543gdmttrdqd6iv.xz"
|
12
12
|
@uuid = "6b3905ce-5025-49f3-9697-ddd5bdfb4ff0"
|
13
13
|
@download_path = File.join(Dir.tmpdir, "dwca_hunter", "ipni",
|
14
14
|
"ipni.csv.xz")
|
@@ -22,8 +22,9 @@ module DwcaHunter
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def download
|
25
|
-
puts "
|
25
|
+
puts "Download by hand from"
|
26
26
|
puts "https://storage.cloud.google.com/ipni-data/ipniWebName.csv.xz"
|
27
|
+
puts "and copy to given url"
|
27
28
|
`curl -s -L #{@url} -o #{@download_path}`
|
28
29
|
end
|
29
30
|
|