dwca_hunter 0.5.3 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,12 @@ module DwcaHunter
4
4
 
5
5
  def self.unzip(file, dir = nil)
6
6
  Dir.chdir(dir) if dir
7
- `unzip -qq -u #{file} > /dev/null 2>&1`
7
+ Zip::File.open(file) do |zip_file|
8
+ zip_file.each do |entry|
9
+ puts "Extracting #{entry.name}"
10
+ entry.extract
11
+ end
12
+ end
8
13
  end
9
14
 
10
15
  def self.gunzip(file, dir = nil)
@@ -13,8 +18,8 @@ module DwcaHunter
13
18
  end
14
19
 
15
20
  def initialize(opts)
16
- @needs_download = !(opts[:download] == false)
17
- @needs_unpack = !(opts[:unpack] == false)
21
+ @needs_download = (opts[:download] != false)
22
+ @needs_unpack = (opts[:unpack] != false)
18
23
  @download_dir, @download_file = File.split(@download_path)
19
24
  prepare_path if needs_download?
20
25
  end
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DwcaHunter
4
+ class ResourceAOS < DwcaHunter::Resource
5
+ def initialize(opts = {})
6
+ @command = "aos-birds"
7
+ @title = "American Ornithological Society"
8
+ @url = "http://checklist.americanornithology.org/taxa.csv"
9
+ @UUID = "91d38806-8435-479f-a18d-705e5cb0767c"
10
+ @download_path = File.join(Dir.tmpdir,
11
+ "dwca_hunter",
12
+ "aos",
13
+ "data.csv")
14
+ @synonyms = []
15
+ @names = []
16
+ @vernaculars = []
17
+ @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
20
+ super(opts)
21
+ end
22
+
23
+ def download
24
+ puts "Downloading csv from remote"
25
+ `curl -s -L #{@url} -o #{@download_path}`
26
+ end
27
+
28
+ def unpack; end
29
+
30
+ def make_dwca
31
+ DwcaHunter.logger_write(object_id, "Extracting data")
32
+ get_names
33
+ generate_dwca
34
+ end
35
+
36
+ private
37
+
38
+ def get_names
39
+ Dir.chdir(@download_dir)
40
+ collect_names
41
+ end
42
+
43
+ def collect_names
44
+ @names_index = {}
45
+ file = CSV.open(File.join(@download_dir, "data.csv"),
46
+ headers: true)
47
+ file.each_with_index do |row, _i|
48
+ taxon_id = row["id"]
49
+ name_string = row["species"]
50
+ kingdom = "Animalia"
51
+ phylum = "Chordata"
52
+ klass = "Aves"
53
+ order = row["order"]
54
+ family = row["family"]
55
+ genus = row["genus"]
56
+ code = "ICZN"
57
+
58
+ @names << {
59
+ taxon_id: taxon_id,
60
+ name_string: name_string,
61
+ kingdom: kingdom,
62
+ phylum: phylum,
63
+ klass: klass,
64
+ order: order,
65
+ family: family,
66
+ genus: genus,
67
+ code: code
68
+ }
69
+ if row["common_name"].to_s != ""
70
+ @vernaculars << {
71
+ taxon_id: taxon_id,
72
+ vern: row["common_name"],
73
+ lang: "en"
74
+ }
75
+ end
76
+ next unless row["french_name"].to_s != ""
77
+
78
+ @vernaculars << {
79
+ taxon_id: taxon_id,
80
+ vern: row["french_name"],
81
+ lang: "fr"
82
+ }
83
+ end
84
+ end
85
+
86
+ def generate_dwca
87
+ DwcaHunter.logger_write(object_id,
88
+ "Creating DarwinCore Archive file")
89
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
90
+ "http://rs.tdwg.org/dwc/terms/scientificName",
91
+ "http://rs.tdwg.org/dwc/terms/kingdom",
92
+ "http://rs.tdwg.org/dwc/terms/phylum",
93
+ "http://rs.tdwg.org/dwc/terms/class",
94
+ "http://rs.tdwg.org/dwc/terms/order",
95
+ "http://rs.tdwg.org/dwc/terms/family",
96
+ "http://rs.tdwg.org/dwc/terms/genus",
97
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
98
+ @names.each do |n|
99
+ @core << [n[:taxon_id], n[:name_string],
100
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
101
+ n[:genus], n[:code]]
102
+ end
103
+ @extensions << {
104
+ data: [[
105
+ "http://rs.tdwg.org/dwc/terms/taxonID",
106
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
107
+ "http://purl.org/dc/terms/language"
108
+ ]],
109
+ file_name: "vernacular_names.txt",
110
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
111
+ }
112
+
113
+ @vernaculars.each do |v|
114
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
115
+ end
116
+ @eml = {
117
+ id: @uuid,
118
+ title: @title,
119
+ authors: [
120
+ { first_name: "R. T.",
121
+ last_name: "Chesser" }
122
+ ],
123
+ metadata_providers: [
124
+ { first_name: "Dmitry",
125
+ last_name: "Mozzherin",
126
+ email: "dmozzherin@gmail.com" }
127
+ ],
128
+ abstract: "The American Ornithological Society's (AOS) Checklist is " \
129
+ "the official source on the taxonomy of birds found in North and " \
130
+ "Middle America, including adjacent islands. This list is produced " \
131
+ "by the North American Classification and Nomenclature Committee " \
132
+ "(NACC) of the AOS.\n\n" \
133
+ "Recommended citation: Chesser, R. T., K. J. Burns, C. Cicero, " \
134
+ "J. L. Dunn, A. W. Kratter, I. J. Lovette, P. C. Rasmussen, " \
135
+ "J. V. Remsen, Jr., D. F. Stotz, and K. Winker. 2019. Check-list " \
136
+ "of North American Birds (online). American Ornithological Society. " \
137
+ "http://checklist.aou.org/taxa",
138
+ url: @url
139
+ }
140
+ super
141
+ end
142
+ end
143
+ end
@@ -1,34 +1,36 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
3
4
  class ResourceArctos < DwcaHunter::Resource
4
-
5
5
  def initialize(opts = {})
6
- @command = 'arctos'
7
- @title = 'Arctos'
8
- @url = 'https://www.dropbox.com/s/jo44d1vd9bkdwm8/arctos.zip?dl=1'
9
- @UUID = 'eea8315d-a244-4625-859a-226675622312'
6
+ @command = "arctos"
7
+ @title = "Arctos"
8
+ @url = "http://arctos.database.museum/cache/gn_merge.tgz"
9
+ @UUID = "eea8315d-a244-4625-859a-226675622312"
10
10
  @download_path = File.join(Dir.tmpdir,
11
- 'dwca_hunter',
12
- 'arctos',
13
- 'data.zip')
11
+ "dwca_hunter",
12
+ "arctos",
13
+ "data.tar.gz")
14
14
  @synonyms = []
15
15
  @names = []
16
16
  @vernaculars = []
17
17
  @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
18
20
  super(opts)
19
21
  end
20
22
 
21
23
  def download
22
- puts "Downloading cached verion of the file. Ask Arctos to generate new."
23
- `curl -s -L #{@url} -o #{@download_path}`
24
+ puts "Downloading Arctos file."
25
+ `curl -s #{@url} -o #{@download_path}`
24
26
  end
25
27
 
26
28
  def unpack
27
- unpack_zip
29
+ unpack_tar
28
30
  end
29
31
 
30
32
  def make_dwca
31
- DwcaHunter::logger_write(self.object_id, 'Extracting data')
33
+ DwcaHunter.logger_write(object_id, "Extracting data")
32
34
  get_names
33
35
  generate_dwca
34
36
  end
@@ -37,190 +39,154 @@ module DwcaHunter
37
39
 
38
40
  def get_names
39
41
  Dir.chdir(@download_dir)
40
- Dir.entries(@download_dir).grep(/zip$/).each do |file|
41
- self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
42
- end
43
- collect_names
44
42
  collect_synonyms
45
43
  collect_vernaculars
44
+ collect_names
46
45
  end
47
46
 
48
47
  def collect_vernaculars
49
- file = open(File.join(@download_dir, 'flat_common_name.csv'))
50
- fields = {}
48
+ file = CSV.open(File.join(@download_dir, "globalnames_commonname.csv"),
49
+ headers: true)
51
50
  file.each_with_index do |row, i|
51
+ canonical = row["scientific_name"]
52
+ vernacular_name_string = row["common_name"]
52
53
 
53
- if i == 0
54
- fields = get_fields(row)
55
- next
54
+ if @vernaculars_hash.key?(canonical)
55
+ @vernaculars_hash[canonical] << vernacular_name_string
56
+ else
57
+ @vernaculars_hash[canonical] = [vernacular_name_string]
56
58
  end
57
59
 
58
- row = split_row(row)
59
-
60
- taxon_id = row[fields[:taxon_name_id]]
61
- vernacular_name_string = row[fields[:common_name]]
62
-
63
- @vernaculars << {
64
- taxon_id: taxon_id,
65
- vernacular_name_string: vernacular_name_string
66
- }
67
-
68
- puts "Processed %s vernaculars" % i if i % 10000 == 0
60
+ puts "Processed #{i} vernaculars"if (i % 100_000).zero?
69
61
  end
70
62
  end
71
63
 
72
64
  def collect_synonyms
73
- file = open(File.join(@download_dir, 'flat_relationships.csv'))
74
- fields = {}
65
+ file = CSV.open(File.join(@download_dir, "globalnames_relationships.csv"),
66
+ headers: true)
75
67
  file.each_with_index do |row, i|
76
- if i == 0
77
- fields = get_fields(row)
78
- next
68
+ canonical = row["scientific_name"]
69
+ if @synonyms_hash.key?(canonical)
70
+ @synonyms_hash[canonical] <<
71
+ { name_string: row["related_name"], status: row["taxon_relationship"] }
72
+ else
73
+ @synonyms_hash[canonical] = [
74
+ { name_string: row["related_name"], status: row["taxon_relationship"] }
75
+ ]
79
76
  end
80
-
81
- row = split_row(row)
82
- taxon_id = row[fields[:taxon_name_id]]
83
- @synonyms << {
84
- taxon_id: row[fields[:related_taxon_name_id]],
85
- local_id: taxon_id,
86
- name_string: @names_index[taxon_id],
87
- #synonym_authority: row[fields[:relation_authority]],
88
- taxonomic_status: row[fields[:taxon_relationship]],
89
- }
90
- puts "Processed %s synonyms" % i if i % 10000 == 0
77
+ puts "Processed #{i} synonyms" if (i % 100_000).zero?
91
78
  end
92
79
  end
93
80
 
94
81
  def collect_names
95
82
  @names_index = {}
96
- file = open(File.join(@download_dir, 'flat_classification.csv'))
97
- fields = {}
83
+ file = CSV.open(File.join(@download_dir, "globalnames_classification.csv"),
84
+ headers: true)
85
+
86
+ names = {}
98
87
  file.each_with_index do |row, i|
99
- if i == 0
100
- fields = get_fields(row)
101
- next
88
+ next if row["term_type"].nil?
89
+ name = row["scientific_name"]
90
+ if names.key?(name)
91
+ names[name] = names[name].
92
+ merge({row["term_type"].to_sym => row["term"]})
93
+ else
94
+ names[name] = {row["term_type"].to_sym => row["term"]}
102
95
  end
103
-
104
- next unless row[fields[:display_name]]
105
- row = split_row(row)
106
- taxon_id = row[fields[:taxon_name_id]]
107
- name_string = row[fields[:display_name]].gsub(/<\/?i>/,'')
108
- kingdom = row[fields[:kingdom]]
109
- phylum = row[fields[:phylum]]
110
- klass = row[fields[:phylclass]]
111
- subclass = row[fields[:subclass]]
112
- order = row[fields[:phylorder]]
113
- suborder = row[fields[:suborder]]
114
- superfamily = row[fields[:superfamily]]
115
- family = row[fields[:family]]
116
- subfamily = row[fields[:subfamily]]
117
- tribe = row[fields[:tribe]]
118
- genus = row[fields[:genus]]
119
- subgenus = row[fields[:subgenus]]
120
- species = row[fields[:species]]
121
- subspecies = row[fields[:subspecies]]
122
- code = row[fields[:nomenclatural_code]]
123
-
124
- @names << { taxon_id: taxon_id,
125
- local_id: taxon_id,
126
- name_string: name_string,
127
- kingdom: kingdom,
128
- phylum: phylum,
129
- klass: klass,
130
- order: order,
131
- family: family,
132
- genus: genus,
133
- code: code,
134
- }
135
-
136
- @names_index[taxon_id] = name_string
137
- puts "Processed %s names" % i if i % 10000 == 0
96
+ puts "Preprocessed #{i} rows" if (i % 100_000).zero?
97
+ end
98
+ names.each_with_index do |m, i|
99
+ canonical = m[0]
100
+ v = m[1]
101
+ taxon_id = "gn_#{i + 1}"
102
+ res ={ taxon_id: taxon_id,
103
+ name_string: canonical,
104
+ kingdom: v[:kingdom],
105
+ phylum: v[:phylum],
106
+ klass: v[:class],
107
+ order: v[:order],
108
+ family: v[:family],
109
+ genus: v[:genus],
110
+ species: v[:species],
111
+ authors: v[:author_text],
112
+ code: v[:nomenclatural_code] }
113
+ @names << res
114
+ update_vernacular(taxon_id, canonical)
115
+ update_synonym(taxon_id, canonical)
116
+ puts "Processed #{i} names" if (i % 100_000).zero?
138
117
  end
139
118
  end
140
119
 
141
- def split_row(row)
142
- row = row.strip.gsub(/^"/, '').gsub(/"$/, '')
143
- row.split('","')
144
- end
120
+ def update_vernacular(taxon_id, canonical)
121
+ return unless @vernaculars_hash.key?(canonical)
145
122
 
146
- def get_fields(row)
147
- row = row.split(",")
148
- encoding_options = {
149
- :invalid => :replace,
150
- :undef => :replace,
151
- :replace => '',
152
- :universal_newline => true
153
- }
154
- num_ary = (0...row.size).to_a
155
- row = row.map do |f|
156
- f = f.strip.downcase
157
- f = f.encode ::Encoding.find('ASCII'), encoding_options
158
- f.to_sym
123
+ @vernaculars_hash[canonical].each do |vern|
124
+ @vernaculars << { taxon_id: taxon_id, vern: vern }
159
125
  end
160
- res = Hash[row.zip(num_ary)]
161
- require 'byebug'; byebug
162
- puts ''
163
- res
164
126
  end
165
127
 
128
+ def update_synonym(taxon_id, canonical)
129
+ return unless @synonyms_hash.key?(canonical)
130
+
131
+ @synonyms_hash[canonical].each do |syn|
132
+ @synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
133
+ status: syn[:status] }
134
+ end
135
+ end
166
136
 
167
137
  def generate_dwca
168
- DwcaHunter::logger_write(self.object_id,
169
- 'Creating DarwinCore Archive file')
170
- @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
171
- 'http://globalnames.org/terms/localID',
172
- 'http://rs.tdwg.org/dwc/terms/scientificName',
173
- 'http://rs.tdwg.org/dwc/terms/kingdom',
174
- 'http://rs.tdwg.org/dwc/terms/phylum',
175
- 'http://rs.tdwg.org/dwc/terms/class',
176
- 'http://rs.tdwg.org/dwc/terms/order',
177
- 'http://rs.tdwg.org/dwc/terms/family',
178
- 'http://rs.tdwg.org/dwc/terms/genus',
179
- 'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
180
- ]]
138
+ DwcaHunter.logger_write(object_id,
139
+ "Creating DarwinCore Archive file")
140
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
141
+ "http://rs.tdwg.org/dwc/terms/scientificName",
142
+ "http://rs.tdwg.org/dwc/terms/kingdom",
143
+ "http://rs.tdwg.org/dwc/terms/phylum",
144
+ "http://rs.tdwg.org/dwc/terms/class",
145
+ "http://rs.tdwg.org/dwc/terms/order",
146
+ "http://rs.tdwg.org/dwc/terms/family",
147
+ "http://rs.tdwg.org/dwc/terms/genus",
148
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
181
149
  @names.each do |n|
182
- @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
183
- n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
184
- n[:genus], n[:code]]
150
+ @core << [n[:taxon_id], n[:name_string],
151
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
152
+ n[:genus], n[:code]]
185
153
  end
186
154
  @extensions << {
187
155
  data: [[
188
- 'http://rs.tdwg.org/dwc/terms/taxonID',
189
- 'http://rs.tdwg.org/dwc/terms/vernacularName']],
190
- file_name: 'vernacular_names.txt',
191
- row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }
156
+ "http://rs.tdwg.org/dwc/terms/taxonID",
157
+ "http://rs.tdwg.org/dwc/terms/vernacularName"
158
+ ]],
159
+ file_name: "vernacular_names.txt",
160
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
161
+ }
192
162
 
193
163
  @vernaculars.each do |v|
194
- @extensions[-1][:data] << [v[:taxon_id], v[:vernacular_name_string]]
164
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern]]
195
165
  end
196
166
 
197
167
  @extensions << {
198
168
  data: [[
199
- 'http://rs.tdwg.org/dwc/terms/taxonID',
200
- 'http://globalnames.org/terms/localID',
201
- 'http://rs.tdwg.org/dwc/terms/scientificName',
202
- 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
203
- ]],
204
- file_name: 'synonyms.txt',
205
- }
206
-
169
+ "http://rs.tdwg.org/dwc/terms/taxonID",
170
+ "http://rs.tdwg.org/dwc/terms/scientificName",
171
+ "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
172
+ ]],
173
+ file_name: "synonyms.txt"
174
+ }
207
175
  @synonyms.each do |s|
208
- @extensions[-1][:data] << [
209
- s[:taxon_id], s[:local_id],
210
- s[:name_string], s[:taxonomic_status]]
176
+ @extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
211
177
  end
212
178
  @eml = {
213
179
  id: @uuid,
214
180
  title: @title,
215
181
  authors: [
216
- {email: 'dustymc at gmail dot com'}
217
- ],
182
+ { email: "dustymc at gmail dot com" }
183
+ ],
218
184
  metadata_providers: [
219
- { first_name: 'Dmitry',
220
- last_name: 'Mozzherin',
221
- email: 'dmozzherin@gmail.com' }
222
- ],
223
- abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
185
+ { first_name: "Dmitry",
186
+ last_name: "Mozzherin",
187
+ email: "dmozzherin@gmail.com" }
188
+ ],
189
+ abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
224
190
  url: @url
225
191
  }
226
192
  super