dwca_hunter 0.5.3 → 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,15 +1,16 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
3
4
  class ResourceITIS < DwcaHunter::Resource
4
5
  def initialize(opts = {})
5
- @command = 'itis'
6
- @title = 'ITIS'
7
- @url = 'https://www.itis.gov/downloads/itisMySQLTables.tar.gz'
8
- @uuid = '5d066e84-e512-4a2f-875c-0a605d3d9f35'
6
+ @command = "itis"
7
+ @title = "Integrated Taxonomic Information SystemITIS"
8
+ @url = "https://www.itis.gov/downloads/itisMySQLTables.tar.gz"
9
+ @uuid = "5d066e84-e512-4a2f-875c-0a605d3d9f35"
9
10
  @download_path = File.join(Dir.tmpdir,
10
- 'dwca_hunter',
11
- 'itis',
12
- 'data.tar.gz')
11
+ "dwca_hunter",
12
+ "itis",
13
+ "data.tar.gz")
13
14
  @ranks = {}
14
15
  @kingdoms = {}
15
16
  @authors = {}
@@ -19,20 +20,20 @@ module DwcaHunter
19
20
  @names = {}
20
21
  @extensions = []
21
22
  super(opts)
22
- @itis_dir = File.join(@download_dir, 'itis')
23
+ @itis_dir = File.join(@download_dir, "itis")
23
24
  end
24
25
 
25
26
  def unpack
26
27
  unpack_tar
27
- dir = Dir.entries(@download_dir).select {|e| e.match(/itisMySQL/)}[0]
28
+ dir = Dir.entries(@download_dir).select { |e| e.match(/itisMySQL/) }[0]
28
29
  FileUtils.mv(File.join(@download_dir, dir), @itis_dir)
29
30
 
30
31
  # Create a file with the same name as the directory we extracted.
31
- FileUtils.touch(File.join(@itis_dir, 'version_' + dir))
32
+ FileUtils.touch(File.join(@itis_dir, "version_" + dir))
32
33
  end
33
34
 
34
35
  def make_dwca
35
- DwcaHunter::logger_write(self.object_id, 'Extracting data')
36
+ DwcaHunter.logger_write(object_id, "Extracting data")
36
37
  get_ranks
37
38
  get_kingdoms
38
39
  get_authors
@@ -42,7 +43,8 @@ module DwcaHunter
42
43
  generate_dwca
43
44
  end
44
45
 
45
- private
46
+ private
47
+
46
48
  def get_ranks
47
49
  # 0 kingdom_id integer not null
48
50
  # 1 rank_id smallint not null
@@ -50,15 +52,15 @@ module DwcaHunter
50
52
  # 3 dir_parent_rank_id smallint not null
51
53
  # 4 req_parent_rank_id smallint not null
52
54
  # 5 update_date date not null
53
- rank_file = File.join(@itis_dir, 'taxon_unit_types')
54
- f = open(rank_file, 'r:utf-8')
55
+ rank_file = File.join(@itis_dir, "taxon_unit_types")
56
+ f = open(rank_file, "r:utf-8")
55
57
  f.each do |l|
56
- l.encode!('UTF-8',
57
- 'ISO-8859-1',
58
+ l.encode!("UTF-8",
59
+ "ISO-8859-1",
58
60
  invalid: :replace,
59
- replace: '?')
60
- row = l.strip.split('|')
61
- @ranks[row[0].strip + '/' + row[1].strip] = row[2].strip
61
+ replace: "?")
62
+ row = l.strip.split("|")
63
+ @ranks[row[0].strip + "/" + row[1].strip] = row[2].strip
62
64
  end
63
65
  end
64
66
 
@@ -67,9 +69,9 @@ module DwcaHunter
67
69
  # 1 kingdom_name char(10) not null
68
70
  # 2 update_date date not null
69
71
 
70
- f = open(File.join(@itis_dir, 'kingdoms'))
72
+ f = open(File.join(@itis_dir, "kingdoms"))
71
73
  f.each do |l|
72
- data = l.strip.split('|')
74
+ data = l.strip.split("|")
73
75
  @kingdoms[data[0].strip] = data[1].strip
74
76
  end
75
77
  end
@@ -80,13 +82,13 @@ module DwcaHunter
80
82
  # 2 update_date date not null
81
83
  # 3 kingdom_id smallint not null
82
84
 
83
- f = open(File.join(@itis_dir, 'taxon_authors_lkp'))
85
+ f = open(File.join(@itis_dir, "taxon_authors_lkp"))
84
86
  f.each do |l|
85
- l.encode!('UTF-8',
86
- 'ISO-8859-1',
87
+ l.encode!("UTF-8",
88
+ "ISO-8859-1",
87
89
  invalid: :replace,
88
- replace: '?')
89
- data = l.strip.split('|')
90
+ replace: "?")
91
+ data = l.strip.split("|")
90
92
  @authors[data[0].strip] = data[1].strip
91
93
  end
92
94
  end
@@ -100,22 +102,22 @@ module DwcaHunter
100
102
  # 5 primary key (tsn,vernacular_name,language)
101
103
  # constraint "itis".vernaculars_key
102
104
 
103
- f = open(File.join(@itis_dir, 'vernaculars'))
105
+ f = open(File.join(@itis_dir, "vernaculars"))
104
106
  f.each_with_index do |l, i|
105
107
  if i % BATCH_SIZE == 0
106
- DwcaHunter::logger_write(self.object_id,
107
- "Extracted %s vernacular names" % i)
108
+ DwcaHunter.logger_write(object_id,
109
+ "Extracted %s vernacular names" % i)
108
110
  end
109
- l.encode!('UTF-8',
110
- 'ISO-8859-1',
111
+ l.encode!("UTF-8",
112
+ "ISO-8859-1",
111
113
  invalid: :replace,
112
- replace: '?')
113
- data = l.split('|').map { |d| d.strip }
114
+ replace: "?")
115
+ data = l.split("|").map(&:strip)
114
116
  name_tsn = data[0]
115
117
  string = data[1]
116
118
  language = data[2]
117
- language = 'Common name' if language == 'unspecified'
118
- @vernaculars[name_tsn] = { name:string, language:language }
119
+ language = "Common name" if language == "unspecified"
120
+ @vernaculars[name_tsn] = { name: string, language: language }
119
121
  end
120
122
  end
121
123
 
@@ -124,17 +126,17 @@ module DwcaHunter
124
126
  # 1 tsn_accepted integer not null
125
127
  # 2 update_date date not null
126
128
 
127
- f = open(File.join(@itis_dir, 'synonym_links'))
129
+ f = open(File.join(@itis_dir, "synonym_links"))
128
130
  f.each_with_index do |l, i|
129
131
  if i % BATCH_SIZE == 0
130
- DwcaHunter::logger_write(self.object_id,
131
- "Extracted %s synonyms" % i)
132
+ DwcaHunter.logger_write(object_id,
133
+ "Extracted %s synonyms" % i)
132
134
  end
133
- l.encode!('UTF-8',
134
- 'ISO-8859-1',
135
+ l.encode!("UTF-8",
136
+ "ISO-8859-1",
135
137
  invalid: :replace,
136
- replace: '?')
137
- data = l.split('|').map { |d| d.strip }
138
+ replace: "?")
139
+ data = l.split("|").map(&:strip)
138
140
  synonym_name_tsn = data[0]
139
141
  accepted_name_tsn = data[1]
140
142
  @synonyms[synonym_name_tsn] = accepted_name_tsn
@@ -167,19 +169,19 @@ module DwcaHunter
167
169
  # 22 update_date date not null
168
170
  # 23 uncertain_prnt_ind char(3)
169
171
 
170
- f = open(File.join(@itis_dir, 'taxonomic_units'))
172
+ f = open(File.join(@itis_dir, "taxonomic_units"))
171
173
  f.each_with_index do |l, i|
172
174
  if i % BATCH_SIZE == 0
173
- DwcaHunter::logger_write(self.object_id,
174
- "Extracted %s names" % i)
175
+ DwcaHunter.logger_write(object_id,
176
+ "Extracted %s names" % i)
175
177
  end
176
- l.encode!('UTF-8',
177
- 'ISO-8859-1',
178
+ l.encode!("UTF-8",
179
+ "ISO-8859-1",
178
180
  invalid: :replace,
179
- replace: '?')
180
- data = l.split("|").map { |d| d.strip }
181
- name_tsn = data[0]
182
- x1 = data[1]
181
+ replace: "?")
182
+ data = l.split("|").map(&:strip)
183
+ name_tsn = data[0]
184
+ x1 = data[1]
183
185
  name_part1 = data[2]
184
186
  x2 = data[3]
185
187
  name_part2 = data[4]
@@ -193,16 +195,15 @@ module DwcaHunter
193
195
  kingdom_id = data[20]
194
196
  rank_id = data[21]
195
197
 
196
- parent_tsn = nil if parent_tsn == ''
198
+ parent_tsn = nil if parent_tsn == ""
197
199
  name = [x1, name_part1, x2, name_part2,
198
200
  sp_marker1, name_part3, sp_marker2, name_part4]
199
201
  canonical_name = name.clone
200
202
  name << @authors[author_id] if @authors[author_id]
201
- name = name.join(' ').strip.gsub(/\s+/, ' ')
202
- canonical_name = canonical_name.join(' ').strip.gsub(/\s+/, ' ')
203
- rank = @ranks[kingdom_id + '/' + rank_id] ?
204
- @ranks[kingdom_id + '/' + rank_id] :
205
- ''
203
+ name = name.join(" ").strip.gsub(/\s+/, " ")
204
+ canonical_name = canonical_name.join(" ").strip.gsub(/\s+/, " ")
205
+ rank = @ranks[kingdom_id + "/" + rank_id] ||
206
+ ""
206
207
  @names[name_tsn] = { name: name,
207
208
  canonical_name: canonical_name,
208
209
  status: status,
@@ -212,58 +213,57 @@ module DwcaHunter
212
213
  end
213
214
 
214
215
  def generate_dwca
215
- DwcaHunter::logger_write(self.object_id,
216
- 'Creating DarwinCore Archive file')
217
- @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
218
- 'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
219
- 'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
220
- 'http://rs.tdwg.org/dwc/terms/scientificName',
221
- 'http://rs.tdwg.org/ontology/voc/TaxonName#nameComplete',
222
- 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
223
- 'http://rs.tdwg.org/dwc/terms/taxonRank']]
224
- @extensions << { data: [['http://rs.tdwg.org/dwc/terms/taxonID',
225
- 'http://rs.tdwg.org/dwc/terms/vernacularName',
226
- 'http://purl.org/dc/terms/language']],
227
- file_name: 'vernacular_names.txt',
228
- row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
229
- }
230
- @names.keys.each_with_index do |k, i|
216
+ DwcaHunter.logger_write(object_id,
217
+ "Creating DarwinCore Archive file")
218
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
219
+ "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
220
+ "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
221
+ "http://rs.tdwg.org/dwc/terms/scientificName",
222
+ "http://rs.tdwg.org/ontology/voc/TaxonName#nameComplete",
223
+ "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
224
+ "http://rs.tdwg.org/dwc/terms/taxonRank"]]
225
+ @extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
226
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
227
+ "http://purl.org/dc/terms/language"]],
228
+ file_name: "vernacular_names.txt",
229
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }
230
+ @names.keys.each_with_index do |k, _i|
231
231
  d = @names[k]
232
- accepted_id = @synonyms[k] ? @synonyms[k] : nil
232
+ accepted_id = @synonyms[k] || nil
233
233
  parent_id = d[:parent_tsn].to_i == 0 ? nil : d[:parent_tsn]
234
234
  row = [k, parent_id, accepted_id, d[:name], d[:canonical_name], d[:status], d[:rank]]
235
235
  @core << row
236
236
  end
237
237
 
238
- @vernaculars.keys.each_with_index do |k, i|
238
+ @vernaculars.keys.each_with_index do |k, _i|
239
239
  d = @vernaculars[k]
240
240
  @extensions[0][:data] << [k, d[:name], d[:language]]
241
241
  end
242
242
 
243
243
  @eml = {
244
- id: @uuid,
245
- title: @title,
246
- authors: [
247
- {email: 'itiswebmaster@itis.gov'}
248
- ],
249
- metadata_providers: [
250
- { first_name: 'Dmitry',
251
- last_name: 'Mozzherin',
252
- email: 'dmozzherin@gmail.com' }
253
- ],
254
- abstract: 'The White House Subcommittee on Biodiversity and ' +
255
- 'Ecosystem Dynamics has identified systematics as a ' +
256
- 'research priority that is fundamental to ecosystem ' +
257
- 'management and biodiversity conservation. This primary ' +
258
- 'need identified by the Subcommittee requires ' +
259
- 'improvements in the organization of, and access to, ' +
260
- 'standardized nomenclature. ITIS (originally referred ' +
261
- 'to as the Interagency Taxonomic Information System) ' +
262
- 'was designed to fulfill these requirements. In the ' +
263
- 'future, the ITIS will provide taxonomic data and a ' +
264
- 'directory of taxonomic expertise that will support ' +
265
- 'the system',
266
- url: 'http://www.itis.gov'
244
+ id: @uuid,
245
+ title: @title,
246
+ authors: [
247
+ { email: "itiswebmaster@itis.gov" }
248
+ ],
249
+ metadata_providers: [
250
+ { first_name: "Dmitry",
251
+ last_name: "Mozzherin",
252
+ email: "dmozzherin@gmail.com" }
253
+ ],
254
+ abstract: "The White House Subcommittee on Biodiversity and " \
255
+ "Ecosystem Dynamics has identified systematics as a " \
256
+ "research priority that is fundamental to ecosystem " \
257
+ "management and biodiversity conservation. This primary " \
258
+ "need identified by the Subcommittee requires " \
259
+ "improvements in the organization of, and access to, " \
260
+ "standardized nomenclature. ITIS (originally referred " \
261
+ "to as the Interagency Taxonomic Information System) " \
262
+ "was designed to fulfill these requirements. In the " \
263
+ "future, the ITIS will provide taxonomic data and a " \
264
+ "directory of taxonomic expertise that will support " \
265
+ "the system",
266
+ url: "http://www.itis.gov"
267
267
  }
268
268
  super
269
269
  end
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Harvests the ASM Mammal Diversity Database (MDD) and converts it into
  # DarwinCore Archive tables.
  #
  # Each row of the CSV file found in the unpacked download yields one core
  # record and, when the corresponding columns are filled in, one
  # original-combination synonym and any number of vernacular names (all
  # tagged with language "en").
  class ResourceMammalDiversityDb < DwcaHunter::Resource
    def initialize(opts = {})
      @command = "mammal-div-db"
      @title = "ASM Mammal Diversity Database"
      @url = "https://www.mammaldiversity.org/assets/data/MDD.zip"
      # Must be @uuid (lower case): that is the variable generate_dwca reads
      # for the EML id.
      @uuid = "94270cdd-5424-4bb1-8324-46ccc5386dc7"
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "mammal-div-db",
                                 "data.zip")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches @url into @download_path with curl and returns curl's standard
    # output, as the previous backtick call did.
    #
    # The command is passed as an argument vector, so no shell is involved
    # and a download path containing spaces or shell metacharacters cannot
    # break or alter the command. One consequence: a missing curl binary now
    # raises Errno::ENOENT instead of being reported by the shell.
    def download
      DwcaHunter.logger_write(object_id, "Downloading")
      IO.popen(["curl", @url, "-H", "User-Agent:", "-o", @download_path],
               &:read)
    end

    # Delegates to unpack_zip, which is defined outside this file section.
    def unpack
      unpack_zip
    end

    # Reads the source data, then builds the archive tables.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    def get_names
      # NOTE(review): this changes the working directory of the whole
      # process. Kept because later steps may rely on it -- confirm.
      Dir.chdir(@download_dir)
      collect_names
    end

    # Returns the name of the first entry in @download_dir ending in ".csv".
    # Raises RuntimeError when there is none; the previous implementation
    # returned the full entries array in that case.
    def find_csv_file
      csv = Dir.entries(@download_dir).find { |f| f.end_with?(".csv") }
      raise "No CSV file found in #{@download_dir}" unless csv

      csv
    end

    # "<author> <year>" for the row, stripped; empty when both are missing.
    def authorship(row)
      "#{row['authoritySpeciesAuthor']} #{row['authoritySpeciesYear']}".strip
    end

    # Capitalized value of a classification column, or nil when the value is
    # empty or contains "incertae". The match is case-insensitive because
    # capitalize upcases a leading "i", which the old /incertae/ test missed.
    def classification_name(value)
      name = value.to_s.capitalize
      name unless name.empty? || name.match?(/incertae/i)
    end

    # Returns [rank, name_string] for the row's scientific name.
    # Names of more than two words are treated as subspecies.
    def assemble_name(row)
      name = row["sciName"].tr("_", " ")
      auth = authorship(row)
      # CSV fields are Strings, so compare with "1" rather than Integer 1;
      # skip the parentheses when there is nothing to wrap.
      parenthesized = row["authorityParentheses"].to_s.strip == "1"
      auth = "(#{auth})" if parenthesized && !auth.empty?
      rank = name.split.size > 2 ? "subspecies" : "species"
      [rank, "#{name} #{auth}".strip]
    end

    # Synonym record built from the row's original name combination.
    def assemble_synonym(row)
      name = row["originalNameCombination"].tr("_", " ")
      { taxon_id: row["id"],
        name_string: "#{name} #{authorship(row)}".strip,
        status: "synonym" }
    end

    # Vernacular records for the row: the main common name followed by the
    # "|"-separated other common names. Empty entries are dropped.
    def vernaculars(row)
      names = [row["mainCommonName"].to_s]
      names.concat(row["otherCommonNames"].to_s.split("|"))
      names.reject(&:empty?).map do |v|
        { taxon_id: row["id"], vern: v, lang: "en" }
      end
    end

    # Fills @names, @synonyms and @vernaculars from the CSV file.
    # CSV.foreach closes the file when iteration ends; the former CSV.open
    # call never closed it.
    def collect_names
      @names_index = {}
      csv_path = File.join(@download_dir, find_csv_file)
      CSV.foreach(csv_path, headers: true) do |row|
        rank, name_string = assemble_name(row)
        @names << {
          taxon_id: row["id"],
          kingdom: "Animalia",
          phylum: "Chordata",
          klass: "Mammalia",
          order: classification_name(row["order"]),
          family: classification_name(row["family"]),
          genus: classification_name(row["genus"]),
          name_string: name_string,
          rank: rank,
          code: "ICZN"
        }
        unless row["originalNameCombination"].to_s.empty?
          @synonyms << assemble_synonym(row)
        end
        @vernaculars.concat(vernaculars(row))
      end
    end

    # Builds @core, appends the vernacular and synonym tables to
    # @extensions, sets @eml and then calls the parent implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/kingdom",
                "http://rs.tdwg.org/dwc/terms/phylum",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/genus",
                "http://rs.tdwg.org/dwc/terms/taxonRank",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each do |n|
        @core << n.values_at(:taxon_id, :name_string, :kingdom, :phylum,
                             :klass, :order, :family, :genus, :rank, :code)
      end

      @extensions << {
        data: [[
          "http://rs.tdwg.org/dwc/terms/taxonID",
          "http://rs.tdwg.org/dwc/terms/vernacularName",
          "http://purl.org/dc/terms/language"
        ]],
        file_name: "vernacular_names.txt",
        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
      }
      @vernaculars.each do |v|
        @extensions[-1][:data] << v.values_at(:taxon_id, :vern, :lang)
      end

      @extensions << {
        data: [[
          "http://rs.tdwg.org/dwc/terms/taxonID",
          "http://rs.tdwg.org/dwc/terms/scientificName",
          "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
        ]],
        file_name: "synonyms.txt"
      }
      @synonyms.each do |s|
        @extensions[-1][:data] << s.values_at(:taxon_id, :name_string,
                                              :status)
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "C. J.",
            last_name: "Burgin" },
          { first_name: "J. P.",
            last_name: "Colella" },
          { first_name: "P. L.",
            last_name: "Kahn" },
          { first_name: "N. S.",
            last_name: "Upham" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "Mammal Diversity Database. 2021. www.mammaldiversity.org. " \
                  "American Society of Mammalogists. Accessed 2021-01-28.",
        url: @url
      }
      super
    end
  end
end