dwca_hunter 0.5.2 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/.byebug_history +37 -0
  3. data/.gitignore +5 -0
  4. data/.rubocop.yml +3 -2
  5. data/.ruby-version +1 -1
  6. data/Gemfile.lock +59 -135
  7. data/LICENSE.txt +1 -1
  8. data/README.md +1 -1
  9. data/dwca_hunter.gemspec +7 -8
  10. data/exe/dwcahunter +1 -3
  11. data/lib/dwca_hunter.rb +39 -8
  12. data/lib/dwca_hunter/resource.rb +5 -0
  13. data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
  14. data/lib/dwca_hunter/resources/arctos.rb +121 -145
  15. data/lib/dwca_hunter/resources/clements.rb +151 -0
  16. data/lib/dwca_hunter/resources/eol.rb +85 -0
  17. data/lib/dwca_hunter/resources/freebase.rb +51 -49
  18. data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
  19. data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
  20. data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
  21. data/lib/dwca_hunter/resources/ion.rb +98 -0
  22. data/lib/dwca_hunter/resources/ipni.rb +3 -2
  23. data/lib/dwca_hunter/resources/itis.rb +99 -99
  24. data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
  25. data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
  26. data/lib/dwca_hunter/resources/mcz.rb +123 -0
  27. data/lib/dwca_hunter/resources/ncbi.rb +22 -23
  28. data/lib/dwca_hunter/resources/opentree.rb +5 -5
  29. data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
  30. data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
  31. data/lib/dwca_hunter/resources/sherborn.rb +91 -0
  32. data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
  33. data/lib/dwca_hunter/version.rb +1 -1
  34. metadata +31 -40
  35. data/files/birdlife_7.csv +0 -11862
  36. data/files/fishbase_taxon_cache.tsv +0 -81000
  37. data/files/reptile_checklist_2014_12.csv +0 -15158
  38. data/files/species-black.txt +0 -251
  39. data/ipni.csv.gz +0 -0
  40. data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -7,6 +7,11 @@ module DwcaHunter
7
7
  `unzip -qq -u #{file} > /dev/null 2>&1`
8
8
  end
9
9
 
10
+ def self.gunzip(file, dir = nil)
11
+ Dir.chdir(dir) if dir
12
+ `gunzip #{file}`
13
+ end
14
+
10
15
  def initialize(opts)
11
16
  @needs_download = !(opts[:download] == false)
12
17
  @needs_unpack = !(opts[:unpack] == false)
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DwcaHunter
4
+ class ResourceAOS < DwcaHunter::Resource
5
+ def initialize(opts = {})
6
+ @command = "aos-birds"
7
+ @title = "American Ornithological Society"
8
+ @url = "http://checklist.americanornithology.org/taxa.csv"
9
+ @UUID = "91d38806-8435-479f-a18d-705e5cb0767c"
10
+ @download_path = File.join(Dir.tmpdir,
11
+ "dwca_hunter",
12
+ "aos",
13
+ "data.csv")
14
+ @synonyms = []
15
+ @names = []
16
+ @vernaculars = []
17
+ @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
20
+ super(opts)
21
+ end
22
+
23
+ def download
24
+ puts "Downloading csv from remote"
25
+ `curl -s -L #{@url} -o #{@download_path}`
26
+ end
27
+
28
+ def unpack; end
29
+
30
+ def make_dwca
31
+ DwcaHunter.logger_write(object_id, "Extracting data")
32
+ get_names
33
+ generate_dwca
34
+ end
35
+
36
+ private
37
+
38
+ def get_names
39
+ Dir.chdir(@download_dir)
40
+ collect_names
41
+ end
42
+
43
+ def collect_names
44
+ @names_index = {}
45
+ file = CSV.open(File.join(@download_dir, "data.csv"),
46
+ headers: true)
47
+ file.each_with_index do |row, _i|
48
+ taxon_id = row["id"]
49
+ name_string = row["species"]
50
+ kingdom = "Animalia"
51
+ phylum = "Chordata"
52
+ klass = "Aves"
53
+ order = row["order"]
54
+ family = row["family"]
55
+ genus = row["genus"]
56
+ code = "ICZN"
57
+
58
+ @names << {
59
+ taxon_id: taxon_id,
60
+ name_string: name_string,
61
+ kingdom: kingdom,
62
+ phylum: phylum,
63
+ klass: klass,
64
+ order: order,
65
+ family: family,
66
+ genus: genus,
67
+ code: code
68
+ }
69
+ if row["common_name"].to_s != ""
70
+ @vernaculars << {
71
+ taxon_id: taxon_id,
72
+ vern: row["common_name"],
73
+ lang: "en"
74
+ }
75
+ end
76
+ next unless row["french_name"].to_s != ""
77
+
78
+ @vernaculars << {
79
+ taxon_id: taxon_id,
80
+ vern: row["french_name"],
81
+ lang: "fr"
82
+ }
83
+ end
84
+ end
85
+
86
+ def generate_dwca
87
+ DwcaHunter.logger_write(object_id,
88
+ "Creating DarwinCore Archive file")
89
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
90
+ "http://rs.tdwg.org/dwc/terms/scientificName",
91
+ "http://rs.tdwg.org/dwc/terms/kingdom",
92
+ "http://rs.tdwg.org/dwc/terms/phylum",
93
+ "http://rs.tdwg.org/dwc/terms/class",
94
+ "http://rs.tdwg.org/dwc/terms/order",
95
+ "http://rs.tdwg.org/dwc/terms/family",
96
+ "http://rs.tdwg.org/dwc/terms/genus",
97
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
98
+ @names.each do |n|
99
+ @core << [n[:taxon_id], n[:name_string],
100
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
101
+ n[:genus], n[:code]]
102
+ end
103
+ @extensions << {
104
+ data: [[
105
+ "http://rs.tdwg.org/dwc/terms/taxonID",
106
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
107
+ "http://purl.org/dc/terms/language"
108
+ ]],
109
+ file_name: "vernacular_names.txt",
110
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
111
+ }
112
+
113
+ @vernaculars.each do |v|
114
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
115
+ end
116
+ @eml = {
117
+ id: @uuid,
118
+ title: @title,
119
+ authors: [
120
+ { first_name: "R. T.",
121
+ last_name: "Chesser" }
122
+ ],
123
+ metadata_providers: [
124
+ { first_name: "Dmitry",
125
+ last_name: "Mozzherin",
126
+ email: "dmozzherin@gmail.com" }
127
+ ],
128
+ abstract: "The American Ornithological Society's (AOS) Checklist is " \
129
+ "the official source on the taxonomy of birds found in North and " \
130
+ "Middle America, including adjacent islands. This list is produced " \
131
+ "by the North American Classification and Nomenclature Committee " \
132
+ "(NACC) of the AOS.\n\n" \
133
+ "Recommended citation: Chesser, R. T., K. J. Burns, C. Cicero, " \
134
+ "J. L. Dunn, A. W. Kratter, I. J. Lovette, P. C. Rasmussen, " \
135
+ "J. V. Remsen, Jr., D. F. Stotz, and K. Winker. 2019. Check-list " \
136
+ "of North American Birds (online). American Ornithological Society. " \
137
+ "http://checklist.aou.org/taxa",
138
+ url: @url
139
+ }
140
+ super
141
+ end
142
+ end
143
+ end
@@ -1,30 +1,36 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
3
4
  class ResourceArctos < DwcaHunter::Resource
4
-
5
5
  def initialize(opts = {})
6
- @command = 'arctos'
7
- @title = 'Arctos'
8
- @url = 'http://arctos.database.museum/download/gncombined.zip'
9
- @UUID = 'eea8315d-a244-4625-859a-226675622312'
6
+ @command = "arctos"
7
+ @title = "Arctos"
8
+ @url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
9
+ @UUID = "eea8315d-a244-4625-859a-226675622312"
10
10
  @download_path = File.join(Dir.tmpdir,
11
- 'dwca_hunter',
12
- 'arctos',
13
- 'data.tar.gz')
11
+ "dwca_hunter",
12
+ "arctos",
13
+ "data.zip")
14
14
  @synonyms = []
15
15
  @names = []
16
16
  @vernaculars = []
17
17
  @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
18
20
  super(opts)
19
- @gnub_dir = File.join(@download_dir, 'gnub')
21
+ end
22
+
23
+ def download
24
+ puts "Downloading cached verion of the file. Ask Arctos to generate new."
25
+ `curl -s -L #{@url} -o #{@download_path}`
20
26
  end
21
27
 
22
28
  def unpack
23
- unpack_zip
29
+ unpack_tar
24
30
  end
25
31
 
26
32
  def make_dwca
27
- DwcaHunter::logger_write(self.object_id, 'Extracting data')
33
+ DwcaHunter.logger_write(object_id, "Extracting data")
28
34
  get_names
29
35
  generate_dwca
30
36
  end
@@ -33,190 +39,160 @@ module DwcaHunter
33
39
 
34
40
  def get_names
35
41
  Dir.chdir(@download_dir)
36
- Dir.entries(@download_dir).grep(/zip$/).each do |file|
37
- self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
38
- end
39
- collect_names
40
42
  collect_synonyms
41
43
  collect_vernaculars
44
+ collect_names
42
45
  end
43
46
 
44
47
  def collect_vernaculars
45
- file = open(File.join(@download_dir, 'common_name.csv'))
46
- fields = {}
48
+ file = CSV.open(File.join(@download_dir, "common_name.csv"),
49
+ headers: true)
47
50
  file.each_with_index do |row, i|
51
+ canonical = row["SCIENTIFIC_NAME"]
52
+ vernacular_name_string = row["COMMON_NAME"]
48
53
 
49
- if i == 0
50
- fields = get_fields(row)
51
- next
54
+ if @vernaculars_hash.key?(canonical)
55
+ @vernaculars_hash[canonical] << vernacular_name_string
56
+ else
57
+ @vernaculars_hash[canonical] = [vernacular_name_string]
52
58
  end
53
59
 
54
- row = split_row(row)
55
-
56
- taxon_id = row[fields[:taxon_name_id]]
57
- vernacular_name_string = row[fields[:common_name]]
58
-
59
- @vernaculars << {
60
- taxon_id: taxon_id,
61
- vernacular_name_string: vernacular_name_string
62
- }
63
-
64
- puts "Processed %s vernaculars" % i if i % 10000 == 0
60
+ puts "Processed %s vernaculars" % i if i % 10_000 == 0
65
61
  end
66
62
  end
67
63
 
68
64
  def collect_synonyms
69
- file = open(File.join(@download_dir, 'taxon_relations.csv'))
70
- fields = {}
65
+ file = CSV.open(File.join(@download_dir, "relationships.csv"),
66
+ headers: true)
71
67
  file.each_with_index do |row, i|
72
- if i == 0
73
- fields = get_fields(row)
74
- next
68
+ canonical = row["scientific_name"]
69
+ if @synonyms_hash.key?(canonical)
70
+ @synonyms_hash[canonical] <<
71
+ { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
72
+ else
73
+ @synonyms_hash[canonical] = [
74
+ { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
75
+ ]
75
76
  end
76
-
77
- row = split_row(row)
78
- taxon_id = row[fields[:taxon_name_id]]
79
- @synonyms << {
80
- taxon_id: row[fields[:related_taxon_name_id]],
81
- local_id: taxon_id,
82
- name_string: @names_index[taxon_id],
83
- #synonym_authority: row[fields[:relation_authority]],
84
- taxonomic_status: row[fields[:taxon_relationship]],
85
- }
86
- puts "Processed %s synonyms" % i if i % 10000 == 0
77
+ puts "Processed %s synonyms" % i if i % 10_000 == 0
87
78
  end
88
79
  end
89
80
 
90
81
  def collect_names
91
82
  @names_index = {}
92
- file = open(File.join(@download_dir, 'taxonomy.csv'))
93
- fields = {}
83
+ file = CSV.open(File.join(@download_dir, "classification.csv"),
84
+ headers: true)
94
85
  file.each_with_index do |row, i|
95
- if i == 0
96
- fields = get_fields(row)
97
- next
98
- end
99
- next unless row[fields[:display_name]]
100
- row = split_row(row)
101
- taxon_id = row[fields[:taxon_name_id]]
102
- name_string = row[fields[:display_name]].gsub(/<\/?i>/,'')
103
- kingdom = row[fields[:kingdom]]
104
- phylum = row[fields[:phylum]]
105
- klass = row[fields[:phylclass]]
106
- subclass = row[fields[:subclass]]
107
- order = row[fields[:phylorder]]
108
- suborder = row[fields[:suborder]]
109
- superfamily = row[fields[:superfamily]]
110
- family = row[fields[:family]]
111
- subfamily = row[fields[:subfamily]]
112
- tribe = row[fields[:tribe]]
113
- genus = row[fields[:genus]]
114
- subgenus = row[fields[:subgenus]]
115
- species = row[fields[:species]]
116
- subspecies = row[fields[:subspecies]]
117
- code = row[fields[:nomenclatural_code]]
118
-
86
+ next unless row["display_name"]
87
+
88
+ name_string = row["display_name"].gsub(%r{</?i>}, "")
89
+ canonical = row["scientific_name"]
90
+ kingdom = row["kingdom"]
91
+ phylum = row["phylum"]
92
+ klass = row["phylclass"]
93
+ subclass = row["subclass"]
94
+ order = row["phylorder"]
95
+ suborder = row["suborder"]
96
+ superfamily = row["superfamily"]
97
+ family = row["family"]
98
+ subfamily = row["subfamily"]
99
+ tribe = row["tribe"]
100
+ genus = row["genus"]
101
+ subgenus = row["subgenus"]
102
+ species = row["species"]
103
+ subspecies = row["subspecies"]
104
+ code = row["nomenclatural_code"]
105
+
106
+ taxon_id = "ARCT_#{i + 1}"
119
107
  @names << { taxon_id: taxon_id,
120
- local_id: taxon_id,
121
- name_string: name_string,
122
- kingdom: kingdom,
123
- phylum: phylum,
124
- klass: klass,
125
- order: order,
126
- family: family,
127
- genus: genus,
128
- code: code,
129
- }
130
-
131
- @names_index[taxon_id] = name_string
132
- puts "Processed %s names" % i if i % 10000 == 0
108
+ name_string: name_string,
109
+ kingdom: kingdom,
110
+ phylum: phylum,
111
+ klass: klass,
112
+ order: order,
113
+ family: family,
114
+ genus: genus,
115
+ code: code }
116
+
117
+ update_vernacular(taxon_id, canonical)
118
+ update_synonym(taxon_id, canonical)
119
+ puts "Processed %s names" % i if i % 10_000 == 0
133
120
  end
134
121
  end
135
122
 
136
- def split_row(row)
137
- row = row.strip.gsub(/^"/, '').gsub(/"$/, '')
138
- row.split('","')
139
- end
123
+ def update_vernacular(taxon_id, canonical)
124
+ return unless @vernaculars_hash.key?(canonical)
140
125
 
141
- def get_fields(row)
142
- row = row.split(",")
143
- encoding_options = {
144
- :invalid => :replace,
145
- :undef => :replace,
146
- :replace => '',
147
- :universal_newline => true
148
- }
149
- num_ary = (0...row.size).to_a
150
- row = row.map do |f|
151
- f = f.strip.downcase
152
- f = f.encode ::Encoding.find('ASCII'), encoding_options
153
- f.to_sym
126
+ @vernaculars_hash[canonical].each do |vern|
127
+ @vernaculars << { taxon_id: taxon_id, vern: vern }
154
128
  end
155
- Hash[row.zip(num_ary)]
156
129
  end
157
130
 
131
+ def update_synonym(taxon_id, canonical)
132
+ return unless @synonyms_hash.key?(canonical)
133
+
134
+ @synonyms_hash[canonical].each do |syn|
135
+ @synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
136
+ status: syn[:status] }
137
+ end
138
+ end
158
139
 
159
140
  def generate_dwca
160
- DwcaHunter::logger_write(self.object_id,
161
- 'Creating DarwinCore Archive file')
162
- @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
163
- 'http://globalnames.org/terms/localID',
164
- 'http://rs.tdwg.org/dwc/terms/scientificName',
165
- 'http://rs.tdwg.org/dwc/terms/kingdom',
166
- 'http://rs.tdwg.org/dwc/terms/phylum',
167
- 'http://rs.tdwg.org/dwc/terms/class',
168
- 'http://rs.tdwg.org/dwc/terms/order',
169
- 'http://rs.tdwg.org/dwc/terms/family',
170
- 'http://rs.tdwg.org/dwc/terms/genus',
171
- 'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
172
- ]]
141
+ DwcaHunter.logger_write(object_id,
142
+ "Creating DarwinCore Archive file")
143
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
144
+ "http://rs.tdwg.org/dwc/terms/scientificName",
145
+ "http://rs.tdwg.org/dwc/terms/kingdom",
146
+ "http://rs.tdwg.org/dwc/terms/phylum",
147
+ "http://rs.tdwg.org/dwc/terms/class",
148
+ "http://rs.tdwg.org/dwc/terms/order",
149
+ "http://rs.tdwg.org/dwc/terms/family",
150
+ "http://rs.tdwg.org/dwc/terms/genus",
151
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
173
152
  @names.each do |n|
174
- @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
175
- n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
176
- n[:genus], n[:code]]
153
+ @core << [n[:taxon_id], n[:name_string],
154
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
155
+ n[:genus], n[:code]]
177
156
  end
178
157
  @extensions << {
179
158
  data: [[
180
- 'http://rs.tdwg.org/dwc/terms/taxonID',
181
- 'http://rs.tdwg.org/dwc/terms/vernacularName']],
182
- file_name: 'vernacular_names.txt',
183
- row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }
159
+ "http://rs.tdwg.org/dwc/terms/taxonID",
160
+ "http://rs.tdwg.org/dwc/terms/vernacularName"
161
+ ]],
162
+ file_name: "vernacular_names.txt",
163
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
164
+ }
184
165
 
185
166
  @vernaculars.each do |v|
186
- @extensions[-1][:data] << [v[:taxon_id], v[:vernacular_name_string]]
167
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern]]
187
168
  end
188
169
 
189
170
  @extensions << {
190
171
  data: [[
191
- 'http://rs.tdwg.org/dwc/terms/taxonID',
192
- 'http://globalnames.org/terms/localID',
193
- 'http://rs.tdwg.org/dwc/terms/scientificName',
194
- 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
195
- ]],
196
- file_name: 'synonyms.txt',
197
- }
198
-
172
+ "http://rs.tdwg.org/dwc/terms/taxonID",
173
+ "http://rs.tdwg.org/dwc/terms/scientificName",
174
+ "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
175
+ ]],
176
+ file_name: "synonyms.txt"
177
+ }
199
178
  @synonyms.each do |s|
200
- @extensions[-1][:data] << [
201
- s[:taxon_id], s[:local_id],
202
- s[:name_string], s[:taxonomic_status]]
179
+ @extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
203
180
  end
204
181
  @eml = {
205
182
  id: @uuid,
206
183
  title: @title,
207
184
  authors: [
208
- {email: 'dustymc at gmail dot com'}
209
- ],
185
+ { email: "dustymc at gmail dot com" }
186
+ ],
210
187
  metadata_providers: [
211
- { first_name: 'Dmitry',
212
- last_name: 'Mozzherin',
213
- email: 'dmozzherin@gmail.com' }
214
- ],
215
- abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
188
+ { first_name: "Dmitry",
189
+ last_name: "Mozzherin",
190
+ email: "dmozzherin@gmail.com" }
191
+ ],
192
+ abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
216
193
  url: @url
217
194
  }
218
195
  super
219
196
  end
220
197
  end
221
198
  end
222
-