dwca_hunter 0.5.2 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40):
  1. checksums.yaml +4 -4
  2. data/.byebug_history +37 -0
  3. data/.gitignore +5 -0
  4. data/.rubocop.yml +3 -2
  5. data/.ruby-version +1 -1
  6. data/Gemfile.lock +59 -135
  7. data/LICENSE.txt +1 -1
  8. data/README.md +1 -1
  9. data/dwca_hunter.gemspec +7 -8
  10. data/exe/dwcahunter +1 -3
  11. data/lib/dwca_hunter.rb +39 -8
  12. data/lib/dwca_hunter/resource.rb +5 -0
  13. data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
  14. data/lib/dwca_hunter/resources/arctos.rb +121 -145
  15. data/lib/dwca_hunter/resources/clements.rb +151 -0
  16. data/lib/dwca_hunter/resources/eol.rb +85 -0
  17. data/lib/dwca_hunter/resources/freebase.rb +51 -49
  18. data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
  19. data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
  20. data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
  21. data/lib/dwca_hunter/resources/ion.rb +98 -0
  22. data/lib/dwca_hunter/resources/ipni.rb +3 -2
  23. data/lib/dwca_hunter/resources/itis.rb +99 -99
  24. data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
  25. data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
  26. data/lib/dwca_hunter/resources/mcz.rb +123 -0
  27. data/lib/dwca_hunter/resources/ncbi.rb +22 -23
  28. data/lib/dwca_hunter/resources/opentree.rb +5 -5
  29. data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
  30. data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
  31. data/lib/dwca_hunter/resources/sherborn.rb +91 -0
  32. data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
  33. data/lib/dwca_hunter/version.rb +1 -1
  34. metadata +31 -40
  35. data/files/birdlife_7.csv +0 -11862
  36. data/files/fishbase_taxon_cache.tsv +0 -81000
  37. data/files/reptile_checklist_2014_12.csv +0 -15158
  38. data/files/species-black.txt +0 -251
  39. data/ipni.csv.gz +0 -0
  40. data/ipniWebName.csv.xz?dl=1 +0 -0
data/lib/dwca_hunter/resource.rb
@@ -7,6 +7,11 @@ module DwcaHunter
7
7
  `unzip -qq -u #{file} > /dev/null 2>&1`
8
8
  end
9
9
 
10
+ def self.gunzip(file, dir = nil)
11
+ Dir.chdir(dir) if dir
12
+ `gunzip #{file}`
13
+ end
14
+
10
15
  def initialize(opts)
11
16
  @needs_download = !(opts[:download] == false)
12
17
  @needs_unpack = !(opts[:unpack] == false)
data/lib/dwca_hunter/resources/aos-birds.rb
@@ -0,0 +1,143 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DwcaHunter
4
+ class ResourceAOS < DwcaHunter::Resource
5
+ def initialize(opts = {})
6
+ @command = "aos-birds"
7
+ @title = "American Ornithological Society"
8
+ @url = "http://checklist.americanornithology.org/taxa.csv"
9
+ @UUID = "91d38806-8435-479f-a18d-705e5cb0767c"
10
+ @download_path = File.join(Dir.tmpdir,
11
+ "dwca_hunter",
12
+ "aos",
13
+ "data.csv")
14
+ @synonyms = []
15
+ @names = []
16
+ @vernaculars = []
17
+ @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
20
+ super(opts)
21
+ end
22
+
23
+ def download
24
+ puts "Downloading csv from remote"
25
+ `curl -s -L #{@url} -o #{@download_path}`
26
+ end
27
+
28
+ def unpack; end
29
+
30
+ def make_dwca
31
+ DwcaHunter.logger_write(object_id, "Extracting data")
32
+ get_names
33
+ generate_dwca
34
+ end
35
+
36
+ private
37
+
38
+ def get_names
39
+ Dir.chdir(@download_dir)
40
+ collect_names
41
+ end
42
+
43
+ def collect_names
44
+ @names_index = {}
45
+ file = CSV.open(File.join(@download_dir, "data.csv"),
46
+ headers: true)
47
+ file.each_with_index do |row, _i|
48
+ taxon_id = row["id"]
49
+ name_string = row["species"]
50
+ kingdom = "Animalia"
51
+ phylum = "Chordata"
52
+ klass = "Aves"
53
+ order = row["order"]
54
+ family = row["family"]
55
+ genus = row["genus"]
56
+ code = "ICZN"
57
+
58
+ @names << {
59
+ taxon_id: taxon_id,
60
+ name_string: name_string,
61
+ kingdom: kingdom,
62
+ phylum: phylum,
63
+ klass: klass,
64
+ order: order,
65
+ family: family,
66
+ genus: genus,
67
+ code: code
68
+ }
69
+ if row["common_name"].to_s != ""
70
+ @vernaculars << {
71
+ taxon_id: taxon_id,
72
+ vern: row["common_name"],
73
+ lang: "en"
74
+ }
75
+ end
76
+ next unless row["french_name"].to_s != ""
77
+
78
+ @vernaculars << {
79
+ taxon_id: taxon_id,
80
+ vern: row["french_name"],
81
+ lang: "fr"
82
+ }
83
+ end
84
+ end
85
+
86
+ def generate_dwca
87
+ DwcaHunter.logger_write(object_id,
88
+ "Creating DarwinCore Archive file")
89
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
90
+ "http://rs.tdwg.org/dwc/terms/scientificName",
91
+ "http://rs.tdwg.org/dwc/terms/kingdom",
92
+ "http://rs.tdwg.org/dwc/terms/phylum",
93
+ "http://rs.tdwg.org/dwc/terms/class",
94
+ "http://rs.tdwg.org/dwc/terms/order",
95
+ "http://rs.tdwg.org/dwc/terms/family",
96
+ "http://rs.tdwg.org/dwc/terms/genus",
97
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
98
+ @names.each do |n|
99
+ @core << [n[:taxon_id], n[:name_string],
100
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
101
+ n[:genus], n[:code]]
102
+ end
103
+ @extensions << {
104
+ data: [[
105
+ "http://rs.tdwg.org/dwc/terms/taxonID",
106
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
107
+ "http://purl.org/dc/terms/language"
108
+ ]],
109
+ file_name: "vernacular_names.txt",
110
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
111
+ }
112
+
113
+ @vernaculars.each do |v|
114
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
115
+ end
116
+ @eml = {
117
+ id: @uuid,
118
+ title: @title,
119
+ authors: [
120
+ { first_name: "R. T.",
121
+ last_name: "Chesser" }
122
+ ],
123
+ metadata_providers: [
124
+ { first_name: "Dmitry",
125
+ last_name: "Mozzherin",
126
+ email: "dmozzherin@gmail.com" }
127
+ ],
128
+ abstract: "The American Ornithological Society's (AOS) Checklist is " \
129
+ "the official source on the taxonomy of birds found in North and " \
130
+ "Middle America, including adjacent islands. This list is produced " \
131
+ "by the North American Classification and Nomenclature Committee " \
132
+ "(NACC) of the AOS.\n\n" \
133
+ "Recommended citation: Chesser, R. T., K. J. Burns, C. Cicero, " \
134
+ "J. L. Dunn, A. W. Kratter, I. J. Lovette, P. C. Rasmussen, " \
135
+ "J. V. Remsen, Jr., D. F. Stotz, and K. Winker. 2019. Check-list " \
136
+ "of North American Birds (online). American Ornithological Society. " \
137
+ "http://checklist.aou.org/taxa",
138
+ url: @url
139
+ }
140
+ super
141
+ end
142
+ end
143
+ end
data/lib/dwca_hunter/resources/arctos.rb
@@ -1,30 +1,36 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
3
4
  class ResourceArctos < DwcaHunter::Resource
4
-
5
5
  def initialize(opts = {})
6
- @command = 'arctos'
7
- @title = 'Arctos'
8
- @url = 'http://arctos.database.museum/download/gncombined.zip'
9
- @UUID = 'eea8315d-a244-4625-859a-226675622312'
6
+ @command = "arctos"
7
+ @title = "Arctos"
8
+ @url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
9
+ @UUID = "eea8315d-a244-4625-859a-226675622312"
10
10
  @download_path = File.join(Dir.tmpdir,
11
- 'dwca_hunter',
12
- 'arctos',
13
- 'data.tar.gz')
11
+ "dwca_hunter",
12
+ "arctos",
13
+ "data.zip")
14
14
  @synonyms = []
15
15
  @names = []
16
16
  @vernaculars = []
17
17
  @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
18
20
  super(opts)
19
- @gnub_dir = File.join(@download_dir, 'gnub')
21
+ end
22
+
23
+ def download
24
+ puts "Downloading cached verion of the file. Ask Arctos to generate new."
25
+ `curl -s -L #{@url} -o #{@download_path}`
20
26
  end
21
27
 
22
28
  def unpack
23
- unpack_zip
29
+ unpack_tar
24
30
  end
25
31
 
26
32
  def make_dwca
27
- DwcaHunter::logger_write(self.object_id, 'Extracting data')
33
+ DwcaHunter.logger_write(object_id, "Extracting data")
28
34
  get_names
29
35
  generate_dwca
30
36
  end
@@ -33,190 +39,160 @@ module DwcaHunter
33
39
 
34
40
  def get_names
35
41
  Dir.chdir(@download_dir)
36
- Dir.entries(@download_dir).grep(/zip$/).each do |file|
37
- self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
38
- end
39
- collect_names
40
42
  collect_synonyms
41
43
  collect_vernaculars
44
+ collect_names
42
45
  end
43
46
 
44
47
  def collect_vernaculars
45
- file = open(File.join(@download_dir, 'common_name.csv'))
46
- fields = {}
48
+ file = CSV.open(File.join(@download_dir, "common_name.csv"),
49
+ headers: true)
47
50
  file.each_with_index do |row, i|
51
+ canonical = row["SCIENTIFIC_NAME"]
52
+ vernacular_name_string = row["COMMON_NAME"]
48
53
 
49
- if i == 0
50
- fields = get_fields(row)
51
- next
54
+ if @vernaculars_hash.key?(canonical)
55
+ @vernaculars_hash[canonical] << vernacular_name_string
56
+ else
57
+ @vernaculars_hash[canonical] = [vernacular_name_string]
52
58
  end
53
59
 
54
- row = split_row(row)
55
-
56
- taxon_id = row[fields[:taxon_name_id]]
57
- vernacular_name_string = row[fields[:common_name]]
58
-
59
- @vernaculars << {
60
- taxon_id: taxon_id,
61
- vernacular_name_string: vernacular_name_string
62
- }
63
-
64
- puts "Processed %s vernaculars" % i if i % 10000 == 0
60
+ puts "Processed %s vernaculars" % i if i % 10_000 == 0
65
61
  end
66
62
  end
67
63
 
68
64
  def collect_synonyms
69
- file = open(File.join(@download_dir, 'taxon_relations.csv'))
70
- fields = {}
65
+ file = CSV.open(File.join(@download_dir, "relationships.csv"),
66
+ headers: true)
71
67
  file.each_with_index do |row, i|
72
- if i == 0
73
- fields = get_fields(row)
74
- next
68
+ canonical = row["scientific_name"]
69
+ if @synonyms_hash.key?(canonical)
70
+ @synonyms_hash[canonical] <<
71
+ { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
72
+ else
73
+ @synonyms_hash[canonical] = [
74
+ { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
75
+ ]
75
76
  end
76
-
77
- row = split_row(row)
78
- taxon_id = row[fields[:taxon_name_id]]
79
- @synonyms << {
80
- taxon_id: row[fields[:related_taxon_name_id]],
81
- local_id: taxon_id,
82
- name_string: @names_index[taxon_id],
83
- #synonym_authority: row[fields[:relation_authority]],
84
- taxonomic_status: row[fields[:taxon_relationship]],
85
- }
86
- puts "Processed %s synonyms" % i if i % 10000 == 0
77
+ puts "Processed %s synonyms" % i if i % 10_000 == 0
87
78
  end
88
79
  end
89
80
 
90
81
  def collect_names
91
82
  @names_index = {}
92
- file = open(File.join(@download_dir, 'taxonomy.csv'))
93
- fields = {}
83
+ file = CSV.open(File.join(@download_dir, "classification.csv"),
84
+ headers: true)
94
85
  file.each_with_index do |row, i|
95
- if i == 0
96
- fields = get_fields(row)
97
- next
98
- end
99
- next unless row[fields[:display_name]]
100
- row = split_row(row)
101
- taxon_id = row[fields[:taxon_name_id]]
102
- name_string = row[fields[:display_name]].gsub(/<\/?i>/,'')
103
- kingdom = row[fields[:kingdom]]
104
- phylum = row[fields[:phylum]]
105
- klass = row[fields[:phylclass]]
106
- subclass = row[fields[:subclass]]
107
- order = row[fields[:phylorder]]
108
- suborder = row[fields[:suborder]]
109
- superfamily = row[fields[:superfamily]]
110
- family = row[fields[:family]]
111
- subfamily = row[fields[:subfamily]]
112
- tribe = row[fields[:tribe]]
113
- genus = row[fields[:genus]]
114
- subgenus = row[fields[:subgenus]]
115
- species = row[fields[:species]]
116
- subspecies = row[fields[:subspecies]]
117
- code = row[fields[:nomenclatural_code]]
118
-
86
+ next unless row["display_name"]
87
+
88
+ name_string = row["display_name"].gsub(%r{</?i>}, "")
89
+ canonical = row["scientific_name"]
90
+ kingdom = row["kingdom"]
91
+ phylum = row["phylum"]
92
+ klass = row["phylclass"]
93
+ subclass = row["subclass"]
94
+ order = row["phylorder"]
95
+ suborder = row["suborder"]
96
+ superfamily = row["superfamily"]
97
+ family = row["family"]
98
+ subfamily = row["subfamily"]
99
+ tribe = row["tribe"]
100
+ genus = row["genus"]
101
+ subgenus = row["subgenus"]
102
+ species = row["species"]
103
+ subspecies = row["subspecies"]
104
+ code = row["nomenclatural_code"]
105
+
106
+ taxon_id = "ARCT_#{i + 1}"
119
107
  @names << { taxon_id: taxon_id,
120
- local_id: taxon_id,
121
- name_string: name_string,
122
- kingdom: kingdom,
123
- phylum: phylum,
124
- klass: klass,
125
- order: order,
126
- family: family,
127
- genus: genus,
128
- code: code,
129
- }
130
-
131
- @names_index[taxon_id] = name_string
132
- puts "Processed %s names" % i if i % 10000 == 0
108
+ name_string: name_string,
109
+ kingdom: kingdom,
110
+ phylum: phylum,
111
+ klass: klass,
112
+ order: order,
113
+ family: family,
114
+ genus: genus,
115
+ code: code }
116
+
117
+ update_vernacular(taxon_id, canonical)
118
+ update_synonym(taxon_id, canonical)
119
+ puts "Processed %s names" % i if i % 10_000 == 0
133
120
  end
134
121
  end
135
122
 
136
- def split_row(row)
137
- row = row.strip.gsub(/^"/, '').gsub(/"$/, '')
138
- row.split('","')
139
- end
123
+ def update_vernacular(taxon_id, canonical)
124
+ return unless @vernaculars_hash.key?(canonical)
140
125
 
141
- def get_fields(row)
142
- row = row.split(",")
143
- encoding_options = {
144
- :invalid => :replace,
145
- :undef => :replace,
146
- :replace => '',
147
- :universal_newline => true
148
- }
149
- num_ary = (0...row.size).to_a
150
- row = row.map do |f|
151
- f = f.strip.downcase
152
- f = f.encode ::Encoding.find('ASCII'), encoding_options
153
- f.to_sym
126
+ @vernaculars_hash[canonical].each do |vern|
127
+ @vernaculars << { taxon_id: taxon_id, vern: vern }
154
128
  end
155
- Hash[row.zip(num_ary)]
156
129
  end
157
130
 
131
+ def update_synonym(taxon_id, canonical)
132
+ return unless @synonyms_hash.key?(canonical)
133
+
134
+ @synonyms_hash[canonical].each do |syn|
135
+ @synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
136
+ status: syn[:status] }
137
+ end
138
+ end
158
139
 
159
140
  def generate_dwca
160
- DwcaHunter::logger_write(self.object_id,
161
- 'Creating DarwinCore Archive file')
162
- @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
163
- 'http://globalnames.org/terms/localID',
164
- 'http://rs.tdwg.org/dwc/terms/scientificName',
165
- 'http://rs.tdwg.org/dwc/terms/kingdom',
166
- 'http://rs.tdwg.org/dwc/terms/phylum',
167
- 'http://rs.tdwg.org/dwc/terms/class',
168
- 'http://rs.tdwg.org/dwc/terms/order',
169
- 'http://rs.tdwg.org/dwc/terms/family',
170
- 'http://rs.tdwg.org/dwc/terms/genus',
171
- 'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
172
- ]]
141
+ DwcaHunter.logger_write(object_id,
142
+ "Creating DarwinCore Archive file")
143
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
144
+ "http://rs.tdwg.org/dwc/terms/scientificName",
145
+ "http://rs.tdwg.org/dwc/terms/kingdom",
146
+ "http://rs.tdwg.org/dwc/terms/phylum",
147
+ "http://rs.tdwg.org/dwc/terms/class",
148
+ "http://rs.tdwg.org/dwc/terms/order",
149
+ "http://rs.tdwg.org/dwc/terms/family",
150
+ "http://rs.tdwg.org/dwc/terms/genus",
151
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
173
152
  @names.each do |n|
174
- @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
175
- n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
176
- n[:genus], n[:code]]
153
+ @core << [n[:taxon_id], n[:name_string],
154
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
155
+ n[:genus], n[:code]]
177
156
  end
178
157
  @extensions << {
179
158
  data: [[
180
- 'http://rs.tdwg.org/dwc/terms/taxonID',
181
- 'http://rs.tdwg.org/dwc/terms/vernacularName']],
182
- file_name: 'vernacular_names.txt',
183
- row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }
159
+ "http://rs.tdwg.org/dwc/terms/taxonID",
160
+ "http://rs.tdwg.org/dwc/terms/vernacularName"
161
+ ]],
162
+ file_name: "vernacular_names.txt",
163
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
164
+ }
184
165
 
185
166
  @vernaculars.each do |v|
186
- @extensions[-1][:data] << [v[:taxon_id], v[:vernacular_name_string]]
167
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern]]
187
168
  end
188
169
 
189
170
  @extensions << {
190
171
  data: [[
191
- 'http://rs.tdwg.org/dwc/terms/taxonID',
192
- 'http://globalnames.org/terms/localID',
193
- 'http://rs.tdwg.org/dwc/terms/scientificName',
194
- 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
195
- ]],
196
- file_name: 'synonyms.txt',
197
- }
198
-
172
+ "http://rs.tdwg.org/dwc/terms/taxonID",
173
+ "http://rs.tdwg.org/dwc/terms/scientificName",
174
+ "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
175
+ ]],
176
+ file_name: "synonyms.txt"
177
+ }
199
178
  @synonyms.each do |s|
200
- @extensions[-1][:data] << [
201
- s[:taxon_id], s[:local_id],
202
- s[:name_string], s[:taxonomic_status]]
179
+ @extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
203
180
  end
204
181
  @eml = {
205
182
  id: @uuid,
206
183
  title: @title,
207
184
  authors: [
208
- {email: 'dustymc at gmail dot com'}
209
- ],
185
+ { email: "dustymc at gmail dot com" }
186
+ ],
210
187
  metadata_providers: [
211
- { first_name: 'Dmitry',
212
- last_name: 'Mozzherin',
213
- email: 'dmozzherin@gmail.com' }
214
- ],
215
- abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
188
+ { first_name: "Dmitry",
189
+ last_name: "Mozzherin",
190
+ email: "dmozzherin@gmail.com" }
191
+ ],
192
+ abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
216
193
  url: @url
217
194
  }
218
195
  super
219
196
  end
220
197
  end
221
198
  end
222
-