dwca_hunter 0.5.5 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +50 -77
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +7 -8
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +31 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +93 -91
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -127
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +27 -34
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -0,0 +1,155 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
# Harvests the ASM Mammal Diversity Database JSON export and converts it
# into a DarwinCore Archive: accepted names go to the core file, other
# names to a synonyms extension, and English common names to a
# vernacular-names extension.
class ResourceMammalDiversityDb < DwcaHunter::Resource
  # Sets the resource metadata, the local download location and the empty
  # collections that collect_names fills in.
  #
  # opts - options hash passed through unchanged to DwcaHunter::Resource.
  def initialize(opts = {})
    @command = "mammal-div-db"
    @title = "ASM Mammal Diversity Database"
    @url = "https://mammaldiversity.org/species-account/api.php?q=*"
    # generate_dwca reads @uuid for the EML id; the previous code only
    # assigned @UUID, which left the EML id nil.
    @uuid = "94270cdd-5424-4bb1-8324-46ccc5386dc7"
    # Former spelling, retained in case code outside this class reads it.
    @UUID = @uuid
    @download_path = File.join(Dir.tmpdir,
                               "dwca_hunter",
                               "mammal-div-db",
                               "data.json")
    @synonyms = []
    @names = []
    @vernaculars = []
    @extensions = []
    @synonyms_hash = {}
    @vernaculars_hash = {}
    super(opts)
  end

  # Fetches the JSON export with curl into @download_path.
  #
  # The command is passed as an argument list (no shell), so neither the
  # "*"/"?" in the URL nor a download path containing spaces can be
  # re-interpreted. "-H User-Agent:" sends the request without a
  # User-Agent header, exactly as before.
  def download
    DwcaHunter.logger_write(object_id, "Downloading")
    system("curl", @url, "-H", "User-Agent:", "-o", @download_path)
  end

  # Nothing to unpack: the download is a plain JSON file.
  def unpack; end

  # Parses the downloaded JSON and writes the archive.
  def make_dwca
    DwcaHunter.logger_write(object_id, "Extracting data")
    get_names
    generate_dwca
  end

  private

  # Switches the working directory to @download_dir, then collects names.
  def get_names
    Dir.chdir(@download_dir)
    collect_names
  end

  # Reads data.json from @download_dir and fills @names (taxonomicStatus
  # "accepted"), @synonyms (everything else) and @vernaculars.
  def collect_names
    @names_index = {}
    decoder = HTMLEntities.new
    json = File.read(File.join(@download_dir, "data.json"))
    data = JSON.parse(json, symbolize_names: true)
    # Each entry of :result is a [key, record] pair; only the record is used.
    data[:result].each do |_key, record|
      dwc = record[:dwc]
      name = taxon_record(record[:internal_id], dwc)
      if dwc[:taxonomicStatus] == "accepted"
        @names << name
      else
        @synonyms << name
      end

      vern = dwc[:vernacularName].to_s
      next if vern.empty?

      # The extension row must carry the same taxonID as the core row,
      # otherwise the vernacular name cannot be joined to its taxon. The
      # previous code used record[:id] here while the core row used
      # record[:internal_id].
      # NOTE(review): confirm :internal_id is the intended identifier.
      @vernaculars << {
        taxon_id: name[:taxon_id],
        vern: decoder.decode(vern),
        lang: "en"
      }
    end
    puts data[:result].size
  end

  # Builds the hash describing one taxon.
  #
  # taxon_id - identifier used for the taxonID column.
  # dwc      - the record's :dwc hash.
  def taxon_record(taxon_id, dwc)
    authorship = dwc.dig(:scientificNameAuthorship, :species)
    {
      taxon_id: taxon_id,
      kingdom: "Animalia",
      phylum: "Chordata",
      klass: "Mammalia",
      order: rank_name(dwc[:order]),
      family: rank_name(dwc[:family]),
      genus: rank_name(dwc[:genus]),
      name_string: "#{dwc[:scientificName]} #{authorship}".strip,
      rank: dwc[:taxonRank],
      # Written to the taxonomicStatus column of synonyms.txt; the
      # previous code copied taxonRank here by mistake.
      status: dwc[:taxonomicStatus],
      code: "ICZN"
    }
  end

  # Capitalizes a rank value and returns nil for "incertae ..." entries.
  # The check is case-insensitive because capitalize turns a leading
  # "incertae" into "Incertae"; a missing value yields "" instead of
  # raising.
  def rank_name(value)
    name = value.to_s.capitalize
    name.match?(/incertae/i) ? nil : name
  end

  # Assembles @core, the two extensions and @eml, then calls the parent
  # implementation.
  def generate_dwca
    DwcaHunter.logger_write(object_id,
                            "Creating DarwinCore Archive file")
    @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
              "http://rs.tdwg.org/dwc/terms/scientificName",
              "http://rs.tdwg.org/dwc/terms/kingdom",
              "http://rs.tdwg.org/dwc/terms/phylum",
              "http://rs.tdwg.org/dwc/terms/class",
              "http://rs.tdwg.org/dwc/terms/order",
              "http://rs.tdwg.org/dwc/terms/family",
              "http://rs.tdwg.org/dwc/terms/genus",
              "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
    @names.each do |n|
      @core << [n[:taxon_id], n[:name_string],
                n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
                n[:genus], n[:code]]
    end

    vernacular_rows = [[
      "http://rs.tdwg.org/dwc/terms/taxonID",
      "http://rs.tdwg.org/dwc/terms/vernacularName",
      "http://purl.org/dc/terms/language"
    ]]
    @vernaculars.each do |v|
      vernacular_rows << [v[:taxon_id], v[:vern], v[:lang]]
    end
    @extensions << {
      data: vernacular_rows,
      file_name: "vernacular_names.txt",
      row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
    }

    synonym_rows = [[
      "http://rs.tdwg.org/dwc/terms/taxonID",
      "http://rs.tdwg.org/dwc/terms/scientificName",
      "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
    ]]
    @synonyms.each do |s|
      synonym_rows << [s[:taxon_id], s[:name_string], s[:status]]
    end
    # NOTE(review): unlike the vernacular extension, no :row_type is given
    # here -- confirm the archive generator does not need one.
    @extensions << {
      data: synonym_rows,
      file_name: "synonyms.txt"
    }

    @eml = {
      id: @uuid,
      title: @title,
      authors: [
        { first_name: "C. J.",
          last_name: "Burgin" },
        { first_name: "J. P.",
          last_name: "Colella" },
        { first_name: "P. L.",
          last_name: "Kahn" },
        { first_name: "N. S.",
          last_name: "Upham" }
      ],
      metadata_providers: [
        { first_name: "Dmitry",
          last_name: "Mozzherin",
          email: "dmozzherin@gmail.com" }
      ],
      abstract: "Mammal Diversity Database. 2020. www.mammaldiversity.org. " \
                "American Society of Mammalogists. Accessed 2020-05-24 .",
      url: @url
    }
    super
  end
end
|
155
|
+
end
|
@@ -5,7 +5,7 @@ module DwcaHunter
|
|
5
5
|
# to DarwinCore Archive file
|
6
6
|
class ResourceMammalSpecies < DwcaHunter::Resource
|
7
7
|
def initialize(opts = {})
|
8
|
-
@parser =
|
8
|
+
@parser = Biodiversity::Parser
|
9
9
|
@black_sp = black_species
|
10
10
|
@command = "mammal-species"
|
11
11
|
@title = "The Mammal Species of The World"
|
@@ -99,9 +99,9 @@ module DwcaHunter
|
|
99
99
|
# rubocop:enable Metrics/AbcSize
|
100
100
|
|
101
101
|
def real_name?(str)
|
102
|
-
parsed = @parser.parse(str)
|
102
|
+
parsed = @parser.parse(str)
|
103
103
|
return false unless parsed[:parsed]
|
104
|
-
epithets = parsed[:
|
104
|
+
epithets = parsed[:canonicalName][:simple].split(" ")[1..-1]
|
105
105
|
return false if epithets.nil? || epithets.empty?
|
106
106
|
epithets.each do |e|
|
107
107
|
return false if @black_sp[e]
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
# Converts a cached MCZbase taxonomy CSV export into a DarwinCore Archive
# that contains only a core file.
class ResourceMCZ < DwcaHunter::Resource
  # Sets the resource metadata, the local download location and the empty
  # collections that collect_names fills in.
  #
  # opts - options hash passed through unchanged to DwcaHunter::Resource.
  def initialize(opts = {})
    @command = "mcz"
    @title = "MCZbase"
    @url = "https://uofi.box.com/shared/static/x1dp86l48hyjkwfl106ejj25ormkzwip.gz"
    # generate_dwca reads @uuid for the EML id; the previous code only
    # assigned @UUID, which left the EML id nil.
    @uuid = "c79d055b-211b-40de-8e27-618011656265"
    # Former spelling, retained in case code outside this class reads it.
    @UUID = @uuid
    @download_path = File.join(Dir.tmpdir,
                               "dwca_hunter",
                               "mcz",
                               "data.tar.gz")
    @synonyms = []
    @names = []
    @vernaculars = []
    @extensions = []
    @synonyms_hash = {}
    @vernaculars_hash = {}
    super(opts)
  end

  # Fetches the cached export with curl (silent, following redirects) into
  # @download_path. The command is passed as an argument list (no shell),
  # so a URL or path containing spaces or shell metacharacters is handed
  # to curl verbatim.
  def download
    puts "Downloading cached verion of the file. Ask MCZ for update."
    system("curl", "-s", "-L", @url, "-o", @download_path)
  end

  # Delegates extraction of the downloaded tarball to unpack_tar.
  def unpack
    unpack_tar
  end

  # Parses the extracted CSV and writes the archive.
  def make_dwca
    DwcaHunter.logger_write(object_id, "Extracting data")
    get_names
    generate_dwca
  end

  private

  # Switches the working directory to @download_dir, then collects names.
  def get_names
    Dir.chdir(@download_dir)
    collect_names
  end

  # Reads taxonomy_export_2020May26.csv from @download_dir and appends one
  # hash per row to @names. Rows have no identifier column in use here, so
  # taxon ids are synthesized as "gn_<1-based row number>".
  def collect_names
    @names_index = {}
    csv_path = File.join(@download_dir, "taxonomy_export_2020May26.csv")
    # Block form closes the file handle even if a row raises; the previous
    # code never closed it.
    CSV.open(csv_path, headers: true) do |csv|
      csv.each_with_index do |row, i|
        canonical = row["SCIENTIFIC_NAME"]
        authors = row["AUTHOR_TEXT"]
        @names << {
          taxon_id: "gn_#{i + 1}",
          name_string: "#{canonical} #{authors}".strip,
          kingdom: row["KINGDOM"],
          phylum: row["PHYLUM"],
          klass: row["PHYLCLASS"],
          order: row["PHYLORDER"],
          family: row["FAMILY"],
          genus: row["GENUS"],
          code: row["NOMENCLATURAL_CODE"]
        }
        puts format("Processed %s names", i) if (i % 10_000).zero?
      end
    end
  end

  # Assembles @core and @eml, then calls the parent implementation.
  def generate_dwca
    DwcaHunter.logger_write(object_id,
                            "Creating DarwinCore Archive file")
    @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
              "http://rs.tdwg.org/dwc/terms/scientificName",
              "http://rs.tdwg.org/dwc/terms/kingdom",
              "http://rs.tdwg.org/dwc/terms/phylum",
              "http://rs.tdwg.org/dwc/terms/class",
              "http://rs.tdwg.org/dwc/terms/order",
              "http://rs.tdwg.org/dwc/terms/family",
              "http://rs.tdwg.org/dwc/terms/genus",
              "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
    @names.each do |n|
      @core << [n[:taxon_id], n[:name_string],
                n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
                n[:genus], n[:code]]
    end

    @eml = {
      id: @uuid,
      title: @title,
      authors: [
        { first_name: "MCZ",
          last_name: "Harvard University" }
      ],
      metadata_providers: [
        { first_name: "Paul",
          last_name: "Morris" }
      ],

      abstract: "The Museum of Comparative Zoology was founded in 1859 on " \
      "the concept that collections are an integral and fundamental " \
      "component of zoological research and teaching. This more than " \
      "150-year-old commitment remains a strong and proud tradition for " \
      "the MCZ. The present-day MCZ contains over 21-million specimens in " \
      "ten research collections which comprise one of the world's richest " \
      "and most varied resources for studying the diversity of life. The " \
      "museum serves as the primary repository for zoological specimens " \
      "collected by past and present Harvard faculty-curators, staff and " \
      "associates conducting research around the world. As a premier " \
      "university museum and research institution, the specimens and " \
      "their related data are available to researchers of the scientific " \
      "and museum community. doi:10.5281/zenodo.891420",
      url: @url
    }
    super
  end
end
|
123
|
+
end
|
@@ -1,19 +1,19 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceNCBI < DwcaHunter::Resource
|
4
|
-
|
5
5
|
def initialize(opts = {})
|
6
|
-
@command =
|
7
|
-
@title =
|
8
|
-
@url =
|
9
|
-
@uuid =
|
6
|
+
@command = "ncbi"
|
7
|
+
@title = "NCBI"
|
8
|
+
@url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
|
9
|
+
@uuid = "97d7633b-5f79-4307-a397-3c29402d9311"
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
"dwca_hunter",
|
12
|
+
"ncbi",
|
13
|
+
"data.tar.gz")
|
14
14
|
@names = {}
|
15
15
|
@data = []
|
16
|
-
@collected_names = [
|
16
|
+
@collected_names = ["genbank common name", "common name", "valid"]
|
17
17
|
@core = []
|
18
18
|
@extensions = []
|
19
19
|
super
|
@@ -33,25 +33,24 @@ module DwcaHunter
|
|
33
33
|
private
|
34
34
|
|
35
35
|
def set_vars
|
36
|
-
@names_file = File.join(@download_dir,
|
37
|
-
@nodes_file = File.join(@download_dir,
|
36
|
+
@names_file = File.join(@download_dir, "names.dmp")
|
37
|
+
@nodes_file = File.join(@download_dir, "nodes.dmp")
|
38
38
|
end
|
39
39
|
|
40
40
|
def get_names
|
41
|
-
DwcaHunter
|
41
|
+
DwcaHunter.logger_write(object_id, "Collecting names...")
|
42
42
|
open(@names_file).each_with_index do |line, i|
|
43
|
-
if i > 0 && i % BATCH_SIZE == 0
|
44
|
-
|
45
|
-
end
|
46
|
-
line = line.split("|").map {|l| cleanup(l)}
|
43
|
+
DwcaHunter.logger_write(object_id, "Collected %s names..." % i) if i > 0 && i % BATCH_SIZE == 0
|
44
|
+
line = line.split("|").map { |l| cleanup(l) }
|
47
45
|
id = line[0]
|
48
46
|
next if id == 1
|
47
|
+
|
49
48
|
name = line[1]
|
50
49
|
name_type = line[3]
|
51
|
-
name_type =
|
50
|
+
name_type = "valid" if name_type == "scientific name"
|
52
51
|
begin
|
53
52
|
name = name.gsub(/(^|\s)('|")(.*?)\2(\s|-|$)/, '\1\3\5').
|
54
|
-
|
53
|
+
gsub(/\s+/, " ")
|
55
54
|
rescue NoMethodError
|
56
55
|
puts "wrong name: %s" % name
|
57
56
|
next
|
@@ -66,12 +65,11 @@ module DwcaHunter
|
|
66
65
|
def get_classification
|
67
66
|
DwcaHunter.logger_write(object_id, "Building classification...")
|
68
67
|
open(@nodes_file, "r:utf-8").each_with_index do |line, i|
|
69
|
-
if i > 0 && i % BATCH_SIZE == 0
|
70
|
-
|
71
|
-
end
|
72
|
-
line = line.split('|').map {|l| cleanup(l)}
|
68
|
+
DwcaHunter.logger_write(object_id, "Collected %s nodes..." % i) if i > 0 && i % BATCH_SIZE == 0
|
69
|
+
line = line.split("|").map { |l| cleanup(l) }
|
73
70
|
id = line[0]
|
74
71
|
next if id == 1
|
72
|
+
|
75
73
|
parent_tax_id = line[1]
|
76
74
|
rank = line[2]
|
77
75
|
hidden_flag = line[10]
|
@@ -80,6 +78,7 @@ module DwcaHunter
|
|
80
78
|
rank = "" if rank == "no rank"
|
81
79
|
parent_tax_id = nil if parent_tax_id == 1
|
82
80
|
next unless @names[id] && @names[id]["valid"]
|
81
|
+
|
83
82
|
vernacular_names = []
|
84
83
|
synonyms = []
|
85
84
|
@names[id].keys.each do |k|
|
@@ -34,7 +34,7 @@ module DwcaHunter
|
|
34
34
|
],
|
35
35
|
url: @url
|
36
36
|
}
|
37
|
-
@url = "http://
|
37
|
+
@url = "http://files.opentreeoflife.org/ott/ott3.2/ott3.2.tgz"
|
38
38
|
@download_path = File.join(Dir.tmpdir, "dwca_hunter",
|
39
39
|
"opentree", "data.tar.gz")
|
40
40
|
super
|
@@ -51,11 +51,11 @@ module DwcaHunter
|
|
51
51
|
end
|
52
52
|
|
53
53
|
def download
|
54
|
+
puts "Downloading cached data, update it at oot website!!"
|
54
55
|
return unless @needs_download
|
55
56
|
DwcaHunter.logger_write(object_id, "Downloading file -- "\
|
56
57
|
"it will take some time...")
|
57
|
-
|
58
|
-
dlr.download
|
58
|
+
`curl -L #{url} -o #{@download_path}`
|
59
59
|
end
|
60
60
|
|
61
61
|
private
|
@@ -66,8 +66,8 @@ module DwcaHunter
|
|
66
66
|
end
|
67
67
|
|
68
68
|
def set_vars
|
69
|
-
@taxonomy = File.join(@download_dir, "
|
70
|
-
@synonyms = File.join(@download_dir, "
|
69
|
+
@taxonomy = File.join(@download_dir, "ott3.2", "taxonomy.tsv")
|
70
|
+
@synonyms = File.join(@download_dir, "ott3.2", "synonyms.tsv")
|
71
71
|
end
|
72
72
|
|
73
73
|
def classification
|
@@ -0,0 +1,193 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
# Downloads four text exports from the Paleobiology Database, strips
# their metadata preambles, runs PaleodbHarvester over them and turns the
# species-rank taxa (accepted names and synonyms) into a DarwinCore
# Archive core file.
class ResourcePaleoBioDb < DwcaHunter::Resource
  OCCURANCE_URL = "http://paleobiodb.org/data1.2/occs/list.txt?" \
    "datainfo&rowcount&base_name=Life&taxon_reso=species&" \
    "idqual=certain&show=ecospace,loc,paleoloc,acconly"
  TAXA_URL = "http://paleobiodb.org/data1.2/taxa/list.txt?datainfo&" \
    "rowcount&base_name=Life&variant=all&" \
    "show=attr,common,app,parent,ecospace,ref,refattr,entname"
  REFS_URL = "http://paleobiodb.org/data1.2/taxa/refs.txt?datainfo&" \
    "rowcount&base_name=Life&select=taxonomy"
  TAXA_REFS_URL = "http://paleobiodb.org/data1.2/taxa/byref.txt?datainfo&" \
    "rowcount&base_name=Life&select=taxonomy"

  # Maps the base name of each downloaded file to its source URL. The key
  # spellings are part of the file names written by download and read by
  # remove_header_text, so they must not be changed independently.
  URLS = {
    occurences: OCCURANCE_URL,
    taxa: TAXA_URL,
    refs: REFS_URL,
    taxa_refs: TAXA_REFS_URL
  }.freeze

  # Sets the resource metadata and the empty collections used later.
  #
  # opts - options hash passed through unchanged to DwcaHunter::Resource.
  def initialize(opts = {})
    # opts = {download: false}
    @command = "paleodb"
    @title = "The Paleobiology Database"
    # generate_dwca reads @uuid for the EML id; the previous code only
    # assigned @UUID, which left the EML id nil.
    @uuid = "fad9970e-c358-4e1b-8cc3-f9ad2582751f"
    # Former spelling, retained in case code outside this class reads it.
    @UUID = @uuid
    @download_path = File.join(Dir.tmpdir,
                               "dwca_hunter",
                               "paleobiodb", "fake.csv")
    @synonyms = []
    @names = []
    @vernaculars = []
    @extensions = []
    @synonyms_hash = {}
    @vernaculars_hash = {}
    super(opts)
  end

  # Fetches every URL in URLS into "<key>.txt" under @download_dir, then
  # produces the matching "<key>.csv" files via remove_header_text.
  def download
    puts "Downloading from original."
    URLS.each do |k, v|
      puts "Getting #{k}"
      data = RestClient::Request.execute(method: :get, url: v, timeout: 600)
      # Block form guarantees the file is flushed and closed; fetching
      # first means a failed request no longer leaves a truncated file.
      File.open(File.join(@download_dir, "#{k}.txt"), "w:utf-8") do |f|
        f.write(data)
      end
    end
    remove_header_text
  end

  # Nothing to unpack: the downloads are plain text files.
  def unpack; end

  # Runs the harvester, loads the JSON files it is expected to leave under
  # "<download_dir>/json", and writes the archive.
  def make_dwca
    DwcaHunter.logger_write(object_id, "Extracting data")
    harvester = PaleodbHarvester.new(@download_dir)
    harvester.taxa
    harvester.refs
    harvester.taxa_refs
    harvester.occurences
    @taxa_json = read_json("taxa.json")
    @name_id_json = read_json("name_id.json")
    get_names
    generate_dwca
  end

  private

  # Parses one file from "<download_dir>/json" with symbolized keys.
  def read_json(file_name)
    path = File.join(@download_dir, "json", file_name)
    JSON.parse(File.read(path), symbolize_names: true)
  end

  # Copies each "<key>.txt" to "<key>.csv", dropping every line up to and
  # including the first one that contains "Records:" in double quotes.
  #
  # Both files are handled with block forms: the previous code never
  # closed the output file, so buffered rows could still be unwritten when
  # the harvester opened the CSV.
  def remove_header_text
    URLS.each_key do |k|
      source = File.join(@download_dir, "#{k}.txt")
      target = File.join(@download_dir, "#{k}.csv")
      File.open(target, "w:utf-8") do |fout|
        csv_started = false
        File.foreach(source) do |l|
          if csv_started
            fout.write(l)
          elsif l =~ /"Records:"/
            # The marker line itself is not copied.
            csv_started = true
          end
        end
      end
    end
  end

  # Appends accepted species first, then synonyms, to @names, printing a
  # progress line every 5000 rows.
  def get_names
    sp, syn = species
    sp.each_with_index do |r, i|
      puts format("Processing %s species", i) if (i % 5000).zero?
      append_accepted_species(r)
    end
    syn.each_with_index do |r, i|
      puts format("Processing %s synonyms", i) if (i % 5000).zero?
      append_synonyms(r)
    end
  end

  # Adds an accepted species; its accepted id is its own id and its
  # class/order/family/genus come from the parent chain.
  def append_accepted_species(row)
    c = classification({}, row)
    @names << {
      id: row[:id],
      acc_id: row[:id],
      klass: c[:class],
      order: c[:order],
      family: c[:family],
      genus: c[:genus],
      name: row[:name],
      auth: row[:auth]
    }
  end

  # Adds a synonym pointing at its accepted name via acc_id.
  def append_synonyms(row)
    id, acc_id = synonymId(row)
    @names << {
      id: id,
      name: row[:name],
      auth: row[:auth],
      acc_id: acc_id
    }
  end

  # Returns [id, accepted_id] for a synonym row.
  #
  # When the row's accepted id equals its own id, the accepted id is
  # looked up by accepted name in @name_id_json. If that lookup finds
  # nothing, a warning is printed and the row's own values are returned.
  # The previous code returned nil from its rescue, which made the caller
  # emit a row with a nil taxonID.
  def synonymId(row)
    id = row[:id]
    acc_id = row[:acc_id]
    return [id, acc_id] unless id == acc_id

    accepted = @name_id_json[row[:acc_name].to_s.to_sym]
    resolved = accepted[:id] if accepted.is_a?(Hash)
    if resolved
      acc_id = resolved
    else
      puts "Unable to get synonymId"
    end
    [id, acc_id]
  end

  # Walks up the parent chain of row in @taxa_json and returns a hash of
  # rank (as a symbol) => name, keeping the nearest ancestor for each rank.
  #
  # data - accepted for call compatibility only; as before, a fresh hash
  #        is always built and the argument is ignored.
  # row  - taxon hash with :id and :parent_id.
  #
  # The walk stops at a missing parent, at a taxon that is its own parent,
  # or when an ancestor repeats (guards against cycles in the data, which
  # previously looped forever). A nil parent_id ends the walk instead of
  # raising.
  def classification(data, row)
    lineage = {}
    visited = {}
    loop do
      parent_key = row[:parent_id].to_s.to_sym
      parent = @taxa_json[parent_key]
      break if parent.nil? || row[:parent_id] == row[:id] || visited[parent_key]

      visited[parent_key] = true
      rank = parent[:rank].to_s.to_sym
      lineage[rank] ||= parent[:name]
      row = parent
    end
    lineage
  end

  # Splits the species-rank taxa into [accepted, synonyms]. A taxon counts
  # as accepted when its name equals its accepted name or it has no
  # accepted id.
  def species
    species_rows = @taxa_json.values.select { |v| v[:rank] == "species" }
    species_rows.partition do |v|
      (v[:name] == v[:acc_name]) || v[:acc_id].nil?
    end
  end

  # Assembles @core and @eml, then calls the parent implementation.
  def generate_dwca
    DwcaHunter.logger_write(object_id,
                            "Creating DarwinCore Archive file")
    @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
              "http://rs.tdwg.org/dwc/terms/scientificName",
              "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
              "http://rs.tdwg.org/dwc/terms/class",
              "http://rs.tdwg.org/dwc/terms/order",
              "http://rs.tdwg.org/dwc/terms/family",
              "http://rs.tdwg.org/dwc/terms/genus",
              "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
    @names.each do |n|
      name_string = "#{n[:name]} #{n[:auth]}".strip
      # One cell per header column. The previous rows also carried
      # n[:kingdom] and n[:phylum] (never set by this class), which
      # shifted class/order/family/genus two columns to the right.
      @core << [n[:id], name_string, n[:acc_id],
                n[:klass], n[:order], n[:family],
                n[:genus], n[:code]]
    end

    @eml = {
      id: @uuid,
      title: @title,
      authors: [
        { email: "admin@paleobiodb.org" }
      ],
      metadata_providers: [
        { first_name: "Dmitry",
          last_name: "Mozzherin",
          email: "dmozzherin@gmail.com" }
      ],
      abstract: "The Paleobiology Database (PBDB) is a non-governmental, " \
                "non-profit public resource for paleontological data. It " \
                "has been organized and operated by a multi-disciplinary, " \
                "multi-institutional, international group of " \
                "paleobiological researchers. Its purpose is to provide " \
                "global, collection-based occurrence and taxonomic data " \
                "for organisms of all geological ages, as well data " \
                "services to allow easy access to data for independent " \
                "development of analytical tools, visualization software, " \
                "and applications of all types. The Database’s broader " \
                "goal is to encourage and enable data-driven collaborative " \
                "efforts that address large-scale paleobiological questions.",
      # NOTE(review): @url is never assigned in this class, so this is nil
      # unless the parent class sets it -- confirm.
      url: @url
    }
    super
  end
end
|
193
|
+
end
|