dwca_hunter 0.5.5 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +50 -77
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +7 -8
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +31 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +93 -91
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -127
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +27 -34
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -0,0 +1,168 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Harvester for the "Howard and Moore Complete Checklist of the Birds of the
  # World". Downloads a cached CSV copy of the checklist, reads one scientific
  # name (plus up to two English vernacular names) per row and turns the result
  # into a DarwinCore Archive.
  class ResourceHowardMoore < DwcaHunter::Resource
    # Sets the resource metadata and the empty accumulators, then hands over
    # to DwcaHunter::Resource.
    #
    # @param opts [Hash] options passed through unchanged to the superclass
    def initialize(opts = {})
      @command = "how-moore-birds"
      @title = "Howard and Moore Complete Checklist of the Birds of the World"
      @url = "https://uofi.box.com/shared/static/m71m541dr5unc41xzg4y51d92b7wiy2k.csv"
      # Lower-case on purpose: generate_dwca reads @uuid for the EML id.
      # The previous spelling (@UUID) left the EML id nil.
      @uuid = "85023fe5-bf2a-486b-bdae-3e61cefd41fd"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "how-moore-birds",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the cached CSV with curl into @download_path and reminds the
    # operator where to look for a newer edition.
    def download
      puts "Downloading cached verion of the file."
      puts "Check https://www.howardandmoore.org/howard-and-moore-database/"
      puts "If there is a more recent edition"
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain CSV file.
    def unpack; end

    # Reads the downloaded CSV and builds the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Changes the working directory to @download_dir and collects the names.
    # NOTE(review): @download_dir is not assigned in this class; presumably
    # the superclass derives it from @download_path -- confirm.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Walks data.csv row by row, appending one entry to @names per row and
    # any English vernacular names to @vernaculars. The block form of
    # CSV.open guarantees the file handle is closed when reading ends.
    def collect_names
      csv_path = File.join(@download_dir, "data.csv")
      CSV.open(csv_path, headers: true) do |csv|
        csv.each_with_index do |row, i|
          # Taxon ids are synthetic and 1-based: gn_1, gn_2, ...
          taxon_id = "gn_#{i + 1}"
          @names << name_record(taxon_id, row)
          add_vernaculars(taxon_id, row)

          puts format("Processed %s names", i) if (i % 10_000).zero?
        end
      end
    end

    # Builds the @names entry for one CSV row. Every row is placed under
    # Animalia / Chordata / Aves with the ICZN nomenclatural code.
    def name_record(taxon_id, row)
      { taxon_id: taxon_id,
        name_string: name_string_for(row),
        kingdom: "Animalia",
        phylum: "Chordata",
        klass: "Aves",
        family: row["FAMILY_NAME"].capitalize,
        genus: row["GENERA_NAME"].capitalize,
        code: "ICZN" }
    end

    # Assembles the scientific name string for one CSV row.
    #
    # When the row has no subspecies, or the species string already contains
    # the subspecies epithet, the result is "species author year"; otherwise
    # it is "species subspecies author year" using the subspecies authorship.
    def name_string_for(row)
      species = row["SPECIES_NAME"]
      subspecies = row["SUB_SPECIES_NAME"]
      species_au =
        "#{row['species_author']} #{row['species_rec_year']}".strip
      subspecies_au =
        "#{row['subspecies_author']} #{row['subspecies_rec_year']}".strip

      if subspecies.to_s == "" || species.include?(subspecies)
        "#{species} #{species_au}".strip
      else
        "#{species} #{subspecies} #{subspecies_au}".strip
      end
    end

    # Appends an English vernacular record for each non-empty English name
    # column of the row.
    def add_vernaculars(taxon_id, row)
      %w[species_english_name species_english_name2].each do |column|
        next if row[column].to_s == ""

        @vernaculars << { taxon_id: taxon_id, vern: row[column], lang: "en" }
      end
    end

    # Copies the vernacular names registered for +canonical+ in
    # @vernaculars_hash to @vernaculars under +taxon_id+.
    # Does nothing when the hash has no entry for +canonical+.
    def update_vernacular(taxon_id, canonical)
      return unless @vernaculars_hash.key?(canonical)

      @vernaculars_hash[canonical].each do |vern|
        @vernaculars << { taxon_id: taxon_id, vern: vern }
      end
    end

    # Copies the synonyms registered for +canonical+ in @synonyms_hash to
    # @synonyms under +taxon_id+.
    # Does nothing when the hash has no entry for +canonical+.
    def update_synonym(taxon_id, canonical)
      return unless @synonyms_hash.key?(canonical)

      @synonyms_hash[canonical].each do |syn|
        @synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
                       status: syn[:status] }
      end
    end

    # Fills @core (taxon rows), the vernacular-name extension and the @eml
    # metadata from the collected data, then delegates to the superclass.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      # The first row of @core holds the DarwinCore term for every column.
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/kingdom",
                "http://rs.tdwg.org/dwc/terms/phylum",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/genus",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string],
                  n[:kingdom], n[:phylum], n[:klass], n[:family],
                  n[:genus], n[:code]]
      end

      vernacular_extension = {
        data: [[
          "http://rs.tdwg.org/dwc/terms/taxonID",
          "http://rs.tdwg.org/dwc/terms/vernacularName",
          "http://purl.org/dc/terms/language"
        ]],
        file_name: "vernacular_names.txt",
        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
      }
      @extensions << vernacular_extension
      @vernaculars.each do |v|
        vernacular_extension[:data] << [v[:taxon_id], v[:vern], v[:lang]]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          {
            last_name: "Christidis"
          }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Christidis et al. 2018. The Howard and Moore Complete " \
                  "Checklist of the Birds of the World, version 4.1 " \
                  "(Downloadable checklist). " \
                  "Accessed from https://www.howardandmoore.org.",
        url: @url
      }
      super
    end
  end
end
|
@@ -0,0 +1,200 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
  # Harvester for the "IOC World Bird List". Downloads a cached CSV conversion
  # of the master list, reads scientific names (and English species names)
  # from it and turns the result into a DarwinCore Archive.
  class ResourceIOCWorldBird < DwcaHunter::Resource
    # Sets the resource metadata and the empty accumulators, then hands over
    # to DwcaHunter::Resource.
    #
    # @param opts [Hash] options passed through unchanged to the superclass
    def initialize(opts = {})
      @command = "ioc-world-bird"
      @title = "IOC World Bird List"
      @url = "https://uofi.box.com/shared/static/fbpuk5ghh9083nbzjdeyqtoqsvdnro01.csv"
      # Lower-case on purpose: generate_dwca reads @uuid for the EML id.
      # The previous spelling (@UUID) left the EML id nil.
      @uuid = "6421ffec-38e3-40fb-a6d9-af27238a47a1"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "ioc-bird",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the cached CSV with curl into @download_path and reminds the
    # operator where to look for (and how to convert) a newer version.
    def download
      puts "Downloading cached and converted to csv version."
      puts "CHECK FOR NEW VERSION at"
      puts "https://www.worldbirdnames.org/ioc-lists/master-list-2/"
      puts "Use libreoffice to convert to csv."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain CSV file.
    def unpack; end

    # Reads the downloaded CSV and builds the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Changes the working directory to @download_dir and collects the names.
    # NOTE(review): @download_dir is not assigned in this class; presumably
    # the superclass derives it from @download_path -- confirm.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Walks data.csv row by row and appends one entry to @names for every row
    # that has a species, plus an English vernacular record for species rows.
    #
    # Order, family and genus are remembered from the last row that supplied
    # them, so rows with those columns blank take the most recent value.
    # The block form of CSV.open guarantees the file handle is closed.
    def collect_names
      @names_index = {}
      order = ""
      family = ""
      genus = ""
      species = ""
      count = 0
      CSV.open(File.join(@download_dir, "data.csv"), headers: true) do |csv|
        csv.each do |row|
          order = carry_over(order, row["Order"])
          family = carry_over(family, row["Family (Scientific)"])
          genus = carry_over(genus, row["Genus"])

          row_species = row["Species (Scientific)"]
          species = row_species if row_species.to_s != ""
          subspecies = row["Subspecies"]
          next if species.to_s == ""

          # Taxon ids are synthetic and count only the rows actually kept.
          count += 1
          taxon_id = "gn_#{count}"
          name = {
            taxon_id: taxon_id,
            kingdom: "Animalia",
            phylum: "Chordata",
            klass: "Aves",
            order: order,
            family: family,
            genus: genus,
            code: "ICZN"
          }
          if subspecies.to_s == ""
            auth = row["Authority"].to_s
            auth = DwcaHunter.normalize_authors(auth) if auth != ""
            name[:name_string] = clean("#{genus} #{species} #{auth}".strip)
            add_vernacular(taxon_id, row["Species (English)"])
          else
            # NOTE(review): unlike the species branch, the authority is used
            # here without DwcaHunter.normalize_authors -- confirm intended.
            name[:name_string] = clean(
              "#{genus} #{species} #{subspecies} #{row['Authority']}".strip
            )
          end
          @names << name
          # NOTE(review): species is cleared after every stored row, so a
          # following row is kept only if it supplies its own species value
          # (unlike order/family/genus, which carry over) -- confirm intended.
          species = ""
        end
      end
    end

    # Returns the capitalized +value+ when it is present, otherwise keeps
    # the +current+ (previously seen) value.
    def carry_over(current, value)
      value.to_s == "" ? current : value.capitalize
    end

    # Appends an English vernacular record unless +vernacular+ is blank.
    def add_vernacular(taxon_id, vernacular)
      return if vernacular.to_s == ""

      @vernaculars << { taxon_id: taxon_id, vern: vernacular, lang: "en" }
    end

    # Removes dagger characters from a name string and collapses every run of
    # whitespace into a single space.
    def clean(n)
      n.gsub(/†/, "").gsub(/\s+/, " ")
    end

    # Fills @core (taxon rows), the vernacular-name extension and the @eml
    # metadata from the collected data, then delegates to the superclass.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      # The first row of @core holds the DarwinCore term for every column.
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/kingdom",
                "http://rs.tdwg.org/dwc/terms/phylum",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/genus",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << [n[:taxon_id], n[:name_string],
                  n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
                  n[:genus], n[:code]]
      end

      vernacular_extension = {
        data: [[
          "http://rs.tdwg.org/dwc/terms/taxonID",
          "http://rs.tdwg.org/dwc/terms/vernacularName",
          "http://purl.org/dc/terms/language"
        ]],
        file_name: "vernacular_names.txt",
        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
      }
      @extensions << vernacular_extension
      @vernaculars.each do |v|
        vernacular_extension[:data] << [v[:taxon_id], v[:vern], v[:lang]]
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Per", last_name: "Alstrom" },
          { first_name: "Mike", last_name: "Blair" },
          { first_name: "Rauri", last_name: "Bowie" },
          { first_name: "Nigel", last_name: "Redman" },
          { first_name: "Jon", last_name: "Fjeldsa" },
          { first_name: "Phil", last_name: "Gregory" },
          { first_name: "Leo", last_name: "Joseph" },
          { first_name: "Peter", last_name: "Kovalik" },
          { first_name: "Adolfo", last_name: "Navarro-Siguenza" },
          { first_name: "David", last_name: "Parkin" },
          { first_name: "Alan", last_name: "Peterson" },
          { first_name: "Douglas", last_name: "Pratt" },
          { first_name: "Pam", last_name: "Rasmussen" },
          { first_name: "Frank", last_name: "Rheindt" },
          { first_name: "Robert", last_name: "Ridgely" },
          { first_name: "Peter", last_name: "Ryan" },
          { first_name: "George", last_name: "Sangster" },
          { first_name: "Dick", last_name: "Schodde" },
          { first_name: "Minturn", last_name: "Wright" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "The IOC World Bird List is an open access resource of " \
                  "the international community of ornithologists.",
        url: "https://www.worldbirdnames.org"
      }
      super
    end
  end
end
|
@@ -8,7 +8,7 @@ module DwcaHunter
|
|
8
8
|
@command = "ipni"
|
9
9
|
@title = "The International Plant Names Index"
|
10
10
|
@abbr = "IPNI"
|
11
|
-
@url = "https://
|
11
|
+
@url = "https://uofi.box.com/shared/static/s0x4xjonxt54pi89n543gdmttrdqd6iv.xz"
|
12
12
|
@uuid = "6b3905ce-5025-49f3-9697-ddd5bdfb4ff0"
|
13
13
|
@download_path = File.join(Dir.tmpdir, "dwca_hunter", "ipni",
|
14
14
|
"ipni.csv.xz")
|
@@ -22,8 +22,9 @@ module DwcaHunter
|
|
22
22
|
end
|
23
23
|
|
24
24
|
def download
|
25
|
-
puts "
|
25
|
+
puts "Download by hand from"
|
26
26
|
puts "https://storage.cloud.google.com/ipni-data/ipniWebName.csv.xz"
|
27
|
+
puts "and copy to given url"
|
27
28
|
`curl -s -L #{@url} -o #{@download_path}`
|
28
29
|
end
|
29
30
|
|
@@ -1,15 +1,16 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceITIS < DwcaHunter::Resource
|
4
5
|
def initialize(opts = {})
|
5
|
-
@command =
|
6
|
-
@title =
|
7
|
-
@url =
|
8
|
-
@uuid =
|
6
|
+
@command = "itis"
|
7
|
+
@title = "ITIS"
|
8
|
+
@url = "https://www.itis.gov/downloads/itisMySQLTables.tar.gz"
|
9
|
+
@uuid = "5d066e84-e512-4a2f-875c-0a605d3d9f35"
|
9
10
|
@download_path = File.join(Dir.tmpdir,
|
10
|
-
|
11
|
-
|
12
|
-
|
11
|
+
"dwca_hunter",
|
12
|
+
"itis",
|
13
|
+
"data.tar.gz")
|
13
14
|
@ranks = {}
|
14
15
|
@kingdoms = {}
|
15
16
|
@authors = {}
|
@@ -19,20 +20,20 @@ module DwcaHunter
|
|
19
20
|
@names = {}
|
20
21
|
@extensions = []
|
21
22
|
super(opts)
|
22
|
-
@itis_dir = File.join(@download_dir,
|
23
|
+
@itis_dir = File.join(@download_dir, "itis")
|
23
24
|
end
|
24
25
|
|
25
26
|
def unpack
|
26
27
|
unpack_tar
|
27
|
-
dir = Dir.entries(@download_dir).select {|e| e.match(/itisMySQL/)}[0]
|
28
|
+
dir = Dir.entries(@download_dir).select { |e| e.match(/itisMySQL/) }[0]
|
28
29
|
FileUtils.mv(File.join(@download_dir, dir), @itis_dir)
|
29
30
|
|
30
31
|
# Create a file with the same name as the directory we extracted.
|
31
|
-
FileUtils.touch(File.join(@itis_dir,
|
32
|
+
FileUtils.touch(File.join(@itis_dir, "version_" + dir))
|
32
33
|
end
|
33
34
|
|
34
35
|
def make_dwca
|
35
|
-
DwcaHunter
|
36
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
36
37
|
get_ranks
|
37
38
|
get_kingdoms
|
38
39
|
get_authors
|
@@ -42,7 +43,8 @@ module DwcaHunter
|
|
42
43
|
generate_dwca
|
43
44
|
end
|
44
45
|
|
45
|
-
|
46
|
+
private
|
47
|
+
|
46
48
|
def get_ranks
|
47
49
|
# 0 kingdom_id integer not null
|
48
50
|
# 1 rank_id smallint not null
|
@@ -50,15 +52,15 @@ module DwcaHunter
|
|
50
52
|
# 3 dir_parent_rank_id smallint not null
|
51
53
|
# 4 req_parent_rank_id smallint not null
|
52
54
|
# 5 update_date date not null
|
53
|
-
rank_file = File.join(@itis_dir,
|
54
|
-
f = open(rank_file,
|
55
|
+
rank_file = File.join(@itis_dir, "taxon_unit_types")
|
56
|
+
f = open(rank_file, "r:utf-8")
|
55
57
|
f.each do |l|
|
56
|
-
l.encode!(
|
57
|
-
|
58
|
+
l.encode!("UTF-8",
|
59
|
+
"ISO-8859-1",
|
58
60
|
invalid: :replace,
|
59
|
-
replace:
|
60
|
-
row = l.strip.split(
|
61
|
-
@ranks[row[0].strip +
|
61
|
+
replace: "?")
|
62
|
+
row = l.strip.split("|")
|
63
|
+
@ranks[row[0].strip + "/" + row[1].strip] = row[2].strip
|
62
64
|
end
|
63
65
|
end
|
64
66
|
|
@@ -67,9 +69,9 @@ module DwcaHunter
|
|
67
69
|
# 1 kingdom_name char(10) not null
|
68
70
|
# 2 update_date date not null
|
69
71
|
|
70
|
-
f = open(File.join(@itis_dir,
|
72
|
+
f = open(File.join(@itis_dir, "kingdoms"))
|
71
73
|
f.each do |l|
|
72
|
-
data = l.strip.split(
|
74
|
+
data = l.strip.split("|")
|
73
75
|
@kingdoms[data[0].strip] = data[1].strip
|
74
76
|
end
|
75
77
|
end
|
@@ -80,13 +82,13 @@ module DwcaHunter
|
|
80
82
|
# 2 update_date date not null
|
81
83
|
# 3 kingdom_id smallint not null
|
82
84
|
|
83
|
-
f = open(File.join(@itis_dir,
|
85
|
+
f = open(File.join(@itis_dir, "taxon_authors_lkp"))
|
84
86
|
f.each do |l|
|
85
|
-
l.encode!(
|
86
|
-
|
87
|
+
l.encode!("UTF-8",
|
88
|
+
"ISO-8859-1",
|
87
89
|
invalid: :replace,
|
88
|
-
replace:
|
89
|
-
data = l.strip.split(
|
90
|
+
replace: "?")
|
91
|
+
data = l.strip.split("|")
|
90
92
|
@authors[data[0].strip] = data[1].strip
|
91
93
|
end
|
92
94
|
end
|
@@ -100,22 +102,22 @@ module DwcaHunter
|
|
100
102
|
# 5 primary key (tsn,vernacular_name,language)
|
101
103
|
# constraint "itis".vernaculars_key
|
102
104
|
|
103
|
-
f = open(File.join(@itis_dir,
|
105
|
+
f = open(File.join(@itis_dir, "vernaculars"))
|
104
106
|
f.each_with_index do |l, i|
|
105
107
|
if i % BATCH_SIZE == 0
|
106
|
-
DwcaHunter
|
107
|
-
|
108
|
+
DwcaHunter.logger_write(object_id,
|
109
|
+
"Extracted %s vernacular names" % i)
|
108
110
|
end
|
109
|
-
l.encode!(
|
110
|
-
|
111
|
+
l.encode!("UTF-8",
|
112
|
+
"ISO-8859-1",
|
111
113
|
invalid: :replace,
|
112
|
-
replace:
|
113
|
-
data = l.split(
|
114
|
+
replace: "?")
|
115
|
+
data = l.split("|").map(&:strip)
|
114
116
|
name_tsn = data[0]
|
115
117
|
string = data[1]
|
116
118
|
language = data[2]
|
117
|
-
language =
|
118
|
-
@vernaculars[name_tsn] = { name:string, language:language }
|
119
|
+
language = "Common name" if language == "unspecified"
|
120
|
+
@vernaculars[name_tsn] = { name: string, language: language }
|
119
121
|
end
|
120
122
|
end
|
121
123
|
|
@@ -124,17 +126,17 @@ module DwcaHunter
|
|
124
126
|
# 1 tsn_accepted integer not null
|
125
127
|
# 2 update_date date not null
|
126
128
|
|
127
|
-
f = open(File.join(@itis_dir,
|
129
|
+
f = open(File.join(@itis_dir, "synonym_links"))
|
128
130
|
f.each_with_index do |l, i|
|
129
131
|
if i % BATCH_SIZE == 0
|
130
|
-
DwcaHunter
|
131
|
-
|
132
|
+
DwcaHunter.logger_write(object_id,
|
133
|
+
"Extracted %s synonyms" % i)
|
132
134
|
end
|
133
|
-
l.encode!(
|
134
|
-
|
135
|
+
l.encode!("UTF-8",
|
136
|
+
"ISO-8859-1",
|
135
137
|
invalid: :replace,
|
136
|
-
replace:
|
137
|
-
data = l.split(
|
138
|
+
replace: "?")
|
139
|
+
data = l.split("|").map(&:strip)
|
138
140
|
synonym_name_tsn = data[0]
|
139
141
|
accepted_name_tsn = data[1]
|
140
142
|
@synonyms[synonym_name_tsn] = accepted_name_tsn
|
@@ -167,19 +169,19 @@ module DwcaHunter
|
|
167
169
|
# 22 update_date date not null
|
168
170
|
# 23 uncertain_prnt_ind char(3)
|
169
171
|
|
170
|
-
f = open(File.join(@itis_dir,
|
172
|
+
f = open(File.join(@itis_dir, "taxonomic_units"))
|
171
173
|
f.each_with_index do |l, i|
|
172
174
|
if i % BATCH_SIZE == 0
|
173
|
-
DwcaHunter
|
174
|
-
|
175
|
+
DwcaHunter.logger_write(object_id,
|
176
|
+
"Extracted %s names" % i)
|
175
177
|
end
|
176
|
-
l.encode!(
|
177
|
-
|
178
|
+
l.encode!("UTF-8",
|
179
|
+
"ISO-8859-1",
|
178
180
|
invalid: :replace,
|
179
|
-
replace:
|
180
|
-
data = l.split("|").map
|
181
|
-
name_tsn
|
182
|
-
x1
|
181
|
+
replace: "?")
|
182
|
+
data = l.split("|").map(&:strip)
|
183
|
+
name_tsn = data[0]
|
184
|
+
x1 = data[1]
|
183
185
|
name_part1 = data[2]
|
184
186
|
x2 = data[3]
|
185
187
|
name_part2 = data[4]
|
@@ -193,16 +195,15 @@ module DwcaHunter
|
|
193
195
|
kingdom_id = data[20]
|
194
196
|
rank_id = data[21]
|
195
197
|
|
196
|
-
parent_tsn = nil if parent_tsn ==
|
198
|
+
parent_tsn = nil if parent_tsn == ""
|
197
199
|
name = [x1, name_part1, x2, name_part2,
|
198
200
|
sp_marker1, name_part3, sp_marker2, name_part4]
|
199
201
|
canonical_name = name.clone
|
200
202
|
name << @authors[author_id] if @authors[author_id]
|
201
|
-
name = name.join(
|
202
|
-
canonical_name = canonical_name.join(
|
203
|
-
rank = @ranks[kingdom_id +
|
204
|
-
|
205
|
-
''
|
203
|
+
name = name.join(" ").strip.gsub(/\s+/, " ")
|
204
|
+
canonical_name = canonical_name.join(" ").strip.gsub(/\s+/, " ")
|
205
|
+
rank = @ranks[kingdom_id + "/" + rank_id] ||
|
206
|
+
""
|
206
207
|
@names[name_tsn] = { name: name,
|
207
208
|
canonical_name: canonical_name,
|
208
209
|
status: status,
|
@@ -212,58 +213,57 @@ module DwcaHunter
|
|
212
213
|
end
|
213
214
|
|
214
215
|
def generate_dwca
|
215
|
-
DwcaHunter
|
216
|
-
|
217
|
-
@core = [[
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
@extensions << { data: [[
|
225
|
-
|
226
|
-
|
227
|
-
file_name:
|
228
|
-
row_type:
|
229
|
-
|
230
|
-
@names.keys.each_with_index do |k, i|
|
216
|
+
DwcaHunter.logger_write(object_id,
|
217
|
+
"Creating DarwinCore Archive file")
|
218
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
219
|
+
"http://rs.tdwg.org/dwc/terms/parentNameUsageID",
|
220
|
+
"http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
|
221
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
222
|
+
"http://rs.tdwg.org/ontology/voc/TaxonName#nameComplete",
|
223
|
+
"http://rs.tdwg.org/dwc/terms/taxonomicStatus",
|
224
|
+
"http://rs.tdwg.org/dwc/terms/taxonRank"]]
|
225
|
+
@extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
|
226
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
227
|
+
"http://purl.org/dc/terms/language"]],
|
228
|
+
file_name: "vernacular_names.txt",
|
229
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }
|
230
|
+
@names.keys.each_with_index do |k, _i|
|
231
231
|
d = @names[k]
|
232
|
-
accepted_id = @synonyms[k]
|
232
|
+
accepted_id = @synonyms[k] || nil
|
233
233
|
parent_id = d[:parent_tsn].to_i == 0 ? nil : d[:parent_tsn]
|
234
234
|
row = [k, parent_id, accepted_id, d[:name], d[:canonical_name], d[:status], d[:rank]]
|
235
235
|
@core << row
|
236
236
|
end
|
237
237
|
|
238
|
-
@vernaculars.keys.each_with_index do |k,
|
238
|
+
@vernaculars.keys.each_with_index do |k, _i|
|
239
239
|
d = @vernaculars[k]
|
240
240
|
@extensions[0][:data] << [k, d[:name], d[:language]]
|
241
241
|
end
|
242
242
|
|
243
243
|
@eml = {
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
244
|
+
id: @uuid,
|
245
|
+
title: @title,
|
246
|
+
authors: [
|
247
|
+
{ email: "itiswebmaster@itis.gov" }
|
248
|
+
],
|
249
|
+
metadata_providers: [
|
250
|
+
{ first_name: "Dmitry",
|
251
|
+
last_name: "Mozzherin",
|
252
|
+
email: "dmozzherin@gmail.com" }
|
253
|
+
],
|
254
|
+
abstract: "The White House Subcommittee on Biodiversity and " \
|
255
|
+
"Ecosystem Dynamics has identified systematics as a " \
|
256
|
+
"research priority that is fundamental to ecosystem " \
|
257
|
+
"management and biodiversity conservation. This primary " \
|
258
|
+
"need identified by the Subcommittee requires " \
|
259
|
+
"improvements in the organization of, and access to, " \
|
260
|
+
"standardized nomenclature. ITIS (originally referred " \
|
261
|
+
"to as the Interagency Taxonomic Information System) " \
|
262
|
+
"was designed to fulfill these requirements. In the " \
|
263
|
+
"future, the ITIS will provide taxonomic data and a " \
|
264
|
+
"directory of taxonomic expertise that will support " \
|
265
|
+
"the system",
|
266
|
+
url: "http://www.itis.gov"
|
267
267
|
}
|
268
268
|
super
|
269
269
|
end
|