dwca_hunter 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/.byebug_history +45 -0
  3. data/.gitignore +5 -0
  4. data/.rubocop.yml +3 -2
  5. data/.ruby-version +1 -1
  6. data/Gemfile.lock +61 -83
  7. data/LICENSE.txt +1 -1
  8. data/README.md +1 -1
  9. data/dwca_hunter.gemspec +9 -9
  10. data/exe/dwcahunter +1 -3
  11. data/lib/dwca_hunter.rb +39 -8
  12. data/lib/dwca_hunter/resource.rb +5 -0
  13. data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
  14. data/lib/dwca_hunter/resources/arctos.rb +121 -145
  15. data/lib/dwca_hunter/resources/clements.rb +151 -0
  16. data/lib/dwca_hunter/resources/eol.rb +85 -0
  17. data/lib/dwca_hunter/resources/freebase.rb +51 -49
  18. data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
  19. data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
  20. data/lib/dwca_hunter/resources/ipni.rb +111 -0
  21. data/lib/dwca_hunter/resources/itis.rb +99 -99
  22. data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
  23. data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
  24. data/lib/dwca_hunter/resources/mcz.rb +123 -0
  25. data/lib/dwca_hunter/resources/ncbi.rb +22 -23
  26. data/lib/dwca_hunter/resources/opentree.rb +5 -5
  27. data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
  28. data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
  29. data/lib/dwca_hunter/resources/sherborn.rb +91 -0
  30. data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
  31. data/lib/dwca_hunter/version.rb +1 -1
  32. metadata +46 -40
  33. data/files/birdlife_7.csv +0 -11862
  34. data/files/fishbase_taxon_cache.tsv +0 -81000
  35. data/files/reptile_checklist_2014_12.csv +0 -15158
  36. data/files/species-black.txt +0 -251
@@ -1,30 +1,36 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
3
4
  class ResourceArctos < DwcaHunter::Resource
4
-
5
5
  def initialize(opts = {})
6
- @command = 'arctos'
7
- @title = 'Arctos'
8
- @url = 'http://arctos.database.museum/download/gncombined.zip'
9
- @UUID = 'eea8315d-a244-4625-859a-226675622312'
6
+ @command = "arctos"
7
+ @title = "Arctos"
8
+ @url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
9
+ @UUID = "eea8315d-a244-4625-859a-226675622312"
10
10
  @download_path = File.join(Dir.tmpdir,
11
- 'dwca_hunter',
12
- 'arctos',
13
- 'data.tar.gz')
11
+ "dwca_hunter",
12
+ "arctos",
13
+ "data.zip")
14
14
  @synonyms = []
15
15
  @names = []
16
16
  @vernaculars = []
17
17
  @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
18
20
  super(opts)
19
- @gnub_dir = File.join(@download_dir, 'gnub')
21
+ end
22
+
23
+ def download
24
+ puts "Downloading cached verion of the file. Ask Arctos to generate new."
25
+ `curl -s -L #{@url} -o #{@download_path}`
20
26
  end
21
27
 
22
28
  def unpack
23
- unpack_zip
29
+ unpack_tar
24
30
  end
25
31
 
26
32
  def make_dwca
27
- DwcaHunter::logger_write(self.object_id, 'Extracting data')
33
+ DwcaHunter.logger_write(object_id, "Extracting data")
28
34
  get_names
29
35
  generate_dwca
30
36
  end
@@ -33,190 +39,160 @@ module DwcaHunter
33
39
 
34
40
  def get_names
35
41
  Dir.chdir(@download_dir)
36
- Dir.entries(@download_dir).grep(/zip$/).each do |file|
37
- self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
38
- end
39
- collect_names
40
42
  collect_synonyms
41
43
  collect_vernaculars
44
+ collect_names
42
45
  end
43
46
 
44
47
  def collect_vernaculars
45
- file = open(File.join(@download_dir, 'common_name.csv'))
46
- fields = {}
48
+ file = CSV.open(File.join(@download_dir, "common_name.csv"),
49
+ headers: true)
47
50
  file.each_with_index do |row, i|
51
+ canonical = row["SCIENTIFIC_NAME"]
52
+ vernacular_name_string = row["COMMON_NAME"]
48
53
 
49
- if i == 0
50
- fields = get_fields(row)
51
- next
54
+ if @vernaculars_hash.key?(canonical)
55
+ @vernaculars_hash[canonical] << vernacular_name_string
56
+ else
57
+ @vernaculars_hash[canonical] = [vernacular_name_string]
52
58
  end
53
59
 
54
- row = split_row(row)
55
-
56
- taxon_id = row[fields[:taxon_name_id]]
57
- vernacular_name_string = row[fields[:common_name]]
58
-
59
- @vernaculars << {
60
- taxon_id: taxon_id,
61
- vernacular_name_string: vernacular_name_string
62
- }
63
-
64
- puts "Processed %s vernaculars" % i if i % 10000 == 0
60
+ puts "Processed %s vernaculars" % i if i % 10_000 == 0
65
61
  end
66
62
  end
67
63
 
68
64
  def collect_synonyms
69
- file = open(File.join(@download_dir, 'taxon_relations.csv'))
70
- fields = {}
65
+ file = CSV.open(File.join(@download_dir, "relationships.csv"),
66
+ headers: true)
71
67
  file.each_with_index do |row, i|
72
- if i == 0
73
- fields = get_fields(row)
74
- next
68
+ canonical = row["scientific_name"]
69
+ if @synonyms_hash.key?(canonical)
70
+ @synonyms_hash[canonical] <<
71
+ { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
72
+ else
73
+ @synonyms_hash[canonical] = [
74
+ { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
75
+ ]
75
76
  end
76
-
77
- row = split_row(row)
78
- taxon_id = row[fields[:taxon_name_id]]
79
- @synonyms << {
80
- taxon_id: row[fields[:related_taxon_name_id]],
81
- local_id: taxon_id,
82
- name_string: @names_index[taxon_id],
83
- #synonym_authority: row[fields[:relation_authority]],
84
- taxonomic_status: row[fields[:taxon_relationship]],
85
- }
86
- puts "Processed %s synonyms" % i if i % 10000 == 0
77
+ puts "Processed %s synonyms" % i if i % 10_000 == 0
87
78
  end
88
79
  end
89
80
 
90
81
  def collect_names
91
82
  @names_index = {}
92
- file = open(File.join(@download_dir, 'taxonomy.csv'))
93
- fields = {}
83
+ file = CSV.open(File.join(@download_dir, "classification.csv"),
84
+ headers: true)
94
85
  file.each_with_index do |row, i|
95
- if i == 0
96
- fields = get_fields(row)
97
- next
98
- end
99
- next unless row[fields[:display_name]]
100
- row = split_row(row)
101
- taxon_id = row[fields[:taxon_name_id]]
102
- name_string = row[fields[:display_name]].gsub(/<\/?i>/,'')
103
- kingdom = row[fields[:kingdom]]
104
- phylum = row[fields[:phylum]]
105
- klass = row[fields[:phylclass]]
106
- subclass = row[fields[:subclass]]
107
- order = row[fields[:phylorder]]
108
- suborder = row[fields[:suborder]]
109
- superfamily = row[fields[:superfamily]]
110
- family = row[fields[:family]]
111
- subfamily = row[fields[:subfamily]]
112
- tribe = row[fields[:tribe]]
113
- genus = row[fields[:genus]]
114
- subgenus = row[fields[:subgenus]]
115
- species = row[fields[:species]]
116
- subspecies = row[fields[:subspecies]]
117
- code = row[fields[:nomenclatural_code]]
118
-
86
+ next unless row["display_name"]
87
+
88
+ name_string = row["display_name"].gsub(%r{</?i>}, "")
89
+ canonical = row["scientific_name"]
90
+ kingdom = row["kingdom"]
91
+ phylum = row["phylum"]
92
+ klass = row["phylclass"]
93
+ subclass = row["subclass"]
94
+ order = row["phylorder"]
95
+ suborder = row["suborder"]
96
+ superfamily = row["superfamily"]
97
+ family = row["family"]
98
+ subfamily = row["subfamily"]
99
+ tribe = row["tribe"]
100
+ genus = row["genus"]
101
+ subgenus = row["subgenus"]
102
+ species = row["species"]
103
+ subspecies = row["subspecies"]
104
+ code = row["nomenclatural_code"]
105
+
106
+ taxon_id = "ARCT_#{i + 1}"
119
107
  @names << { taxon_id: taxon_id,
120
- local_id: taxon_id,
121
- name_string: name_string,
122
- kingdom: kingdom,
123
- phylum: phylum,
124
- klass: klass,
125
- order: order,
126
- family: family,
127
- genus: genus,
128
- code: code,
129
- }
130
-
131
- @names_index[taxon_id] = name_string
132
- puts "Processed %s names" % i if i % 10000 == 0
108
+ name_string: name_string,
109
+ kingdom: kingdom,
110
+ phylum: phylum,
111
+ klass: klass,
112
+ order: order,
113
+ family: family,
114
+ genus: genus,
115
+ code: code }
116
+
117
+ update_vernacular(taxon_id, canonical)
118
+ update_synonym(taxon_id, canonical)
119
+ puts "Processed %s names" % i if i % 10_000 == 0
133
120
  end
134
121
  end
135
122
 
136
- def split_row(row)
137
- row = row.strip.gsub(/^"/, '').gsub(/"$/, '')
138
- row.split('","')
139
- end
123
+ def update_vernacular(taxon_id, canonical)
124
+ return unless @vernaculars_hash.key?(canonical)
140
125
 
141
- def get_fields(row)
142
- row = row.split(",")
143
- encoding_options = {
144
- :invalid => :replace,
145
- :undef => :replace,
146
- :replace => '',
147
- :universal_newline => true
148
- }
149
- num_ary = (0...row.size).to_a
150
- row = row.map do |f|
151
- f = f.strip.downcase
152
- f = f.encode ::Encoding.find('ASCII'), encoding_options
153
- f.to_sym
126
+ @vernaculars_hash[canonical].each do |vern|
127
+ @vernaculars << { taxon_id: taxon_id, vern: vern }
154
128
  end
155
- Hash[row.zip(num_ary)]
156
129
  end
157
130
 
131
+ def update_synonym(taxon_id, canonical)
132
+ return unless @synonyms_hash.key?(canonical)
133
+
134
+ @synonyms_hash[canonical].each do |syn|
135
+ @synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
136
+ status: syn[:status] }
137
+ end
138
+ end
158
139
 
159
140
  def generate_dwca
160
- DwcaHunter::logger_write(self.object_id,
161
- 'Creating DarwinCore Archive file')
162
- @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
163
- 'http://globalnames.org/terms/localID',
164
- 'http://rs.tdwg.org/dwc/terms/scientificName',
165
- 'http://rs.tdwg.org/dwc/terms/kingdom',
166
- 'http://rs.tdwg.org/dwc/terms/phylum',
167
- 'http://rs.tdwg.org/dwc/terms/class',
168
- 'http://rs.tdwg.org/dwc/terms/order',
169
- 'http://rs.tdwg.org/dwc/terms/family',
170
- 'http://rs.tdwg.org/dwc/terms/genus',
171
- 'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
172
- ]]
141
+ DwcaHunter.logger_write(object_id,
142
+ "Creating DarwinCore Archive file")
143
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
144
+ "http://rs.tdwg.org/dwc/terms/scientificName",
145
+ "http://rs.tdwg.org/dwc/terms/kingdom",
146
+ "http://rs.tdwg.org/dwc/terms/phylum",
147
+ "http://rs.tdwg.org/dwc/terms/class",
148
+ "http://rs.tdwg.org/dwc/terms/order",
149
+ "http://rs.tdwg.org/dwc/terms/family",
150
+ "http://rs.tdwg.org/dwc/terms/genus",
151
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
173
152
  @names.each do |n|
174
- @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
175
- n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
176
- n[:genus], n[:code]]
153
+ @core << [n[:taxon_id], n[:name_string],
154
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
155
+ n[:genus], n[:code]]
177
156
  end
178
157
  @extensions << {
179
158
  data: [[
180
- 'http://rs.tdwg.org/dwc/terms/taxonID',
181
- 'http://rs.tdwg.org/dwc/terms/vernacularName']],
182
- file_name: 'vernacular_names.txt',
183
- row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }
159
+ "http://rs.tdwg.org/dwc/terms/taxonID",
160
+ "http://rs.tdwg.org/dwc/terms/vernacularName"
161
+ ]],
162
+ file_name: "vernacular_names.txt",
163
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
164
+ }
184
165
 
185
166
  @vernaculars.each do |v|
186
- @extensions[-1][:data] << [v[:taxon_id], v[:vernacular_name_string]]
167
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern]]
187
168
  end
188
169
 
189
170
  @extensions << {
190
171
  data: [[
191
- 'http://rs.tdwg.org/dwc/terms/taxonID',
192
- 'http://globalnames.org/terms/localID',
193
- 'http://rs.tdwg.org/dwc/terms/scientificName',
194
- 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
195
- ]],
196
- file_name: 'synonyms.txt',
197
- }
198
-
172
+ "http://rs.tdwg.org/dwc/terms/taxonID",
173
+ "http://rs.tdwg.org/dwc/terms/scientificName",
174
+ "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
175
+ ]],
176
+ file_name: "synonyms.txt"
177
+ }
199
178
  @synonyms.each do |s|
200
- @extensions[-1][:data] << [
201
- s[:taxon_id], s[:local_id],
202
- s[:name_string], s[:taxonomic_status]]
179
+ @extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
203
180
  end
204
181
  @eml = {
205
182
  id: @uuid,
206
183
  title: @title,
207
184
  authors: [
208
- {email: 'dustymc at gmail dot com'}
209
- ],
185
+ { email: "dustymc at gmail dot com" }
186
+ ],
210
187
  metadata_providers: [
211
- { first_name: 'Dmitry',
212
- last_name: 'Mozzherin',
213
- email: 'dmozzherin@gmail.com' }
214
- ],
215
- abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
188
+ { first_name: "Dmitry",
189
+ last_name: "Mozzherin",
190
+ email: "dmozzherin@gmail.com" }
191
+ ],
192
+ abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
216
193
  url: @url
217
194
  }
218
195
  super
219
196
  end
220
197
  end
221
198
  end
222
-
@@ -0,0 +1,151 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DwcaHunter
4
+ class ResourceClements < DwcaHunter::Resource
5
+ def initialize(opts = {})
6
+ @command = "clements-ebird"
7
+ @title = "The eBird/Clements Checklist of Birds of the World"
8
+ @url = "https://uofi.box.com/shared/static/b4n8zqa99hq9rdga27skkh3870yhujgo.csv"
9
+ @UUID = "577c0b56-4a3c-4314-8724-14b304f601de"
10
+ @download_path = File.join(Dir.tmpdir,
11
+ "dwca_hunter",
12
+ "clements",
13
+ "data.csv")
14
+ @synonyms = []
15
+ @names = []
16
+ @vernaculars = []
17
+ @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
20
+ super(opts)
21
+ end
22
+
23
+ def download
24
+ puts "Downloading cached and modified version of the file."
25
+ puts "Go to https://www.birds.cornell.edu/clementschecklist/download/ " \
26
+ "for updates."
27
+ `curl -s -L #{@url} -o #{@download_path}`
28
+ end
29
+
30
+ def unpack
31
+ end
32
+
33
+ def make_dwca
34
+ DwcaHunter.logger_write(object_id, "Extracting data")
35
+ get_names
36
+ generate_dwca
37
+ end
38
+
39
+ private
40
+
41
+ def get_names
42
+ Dir.chdir(@download_dir)
43
+ collect_names
44
+ end
45
+
46
+ def collect_names
47
+ @names_index = {}
48
+ file = CSV.open(File.join(@download_dir, "data.csv"),
49
+ headers: true)
50
+ file.each_with_index do |row, i|
51
+ name_string = row["scientific name"]
52
+ canonical = name_string
53
+ kingdom = "Animalia"
54
+ phylum = "Chordata"
55
+ klass = "Aves"
56
+ order = row["order"]
57
+ family = row["family"]
58
+ code = "ICZN"
59
+
60
+ taxon_id = "gn_#{i + 1}"
61
+ @names << { taxon_id: taxon_id,
62
+ name_string: name_string,
63
+ kingdom: kingdom,
64
+ phylum: phylum,
65
+ klass: klass,
66
+ order: order,
67
+ family: family,
68
+ code: code }
69
+
70
+ if row["English name"].to_s != ""
71
+ @vernaculars << {
72
+ taxon_id: taxon_id,
73
+ vern: row["English name"],
74
+ lang: "end"
75
+ }
76
+ end
77
+
78
+ puts "Processed %s names" % i if i % 10_000 == 0
79
+ end
80
+ end
81
+
82
+ def generate_dwca
83
+ DwcaHunter.logger_write(object_id,
84
+ "Creating DarwinCore Archive file")
85
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
86
+ "http://rs.tdwg.org/dwc/terms/scientificName",
87
+ "http://rs.tdwg.org/dwc/terms/kingdom",
88
+ "http://rs.tdwg.org/dwc/terms/phylum",
89
+ "http://rs.tdwg.org/dwc/terms/class",
90
+ "http://rs.tdwg.org/dwc/terms/order",
91
+ "http://rs.tdwg.org/dwc/terms/family",
92
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
93
+ @names.each do |n|
94
+ @core << [n[:taxon_id], n[:name_string],
95
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
96
+ n[:code]]
97
+ end
98
+ @extensions << {
99
+ data: [[
100
+ "http://rs.tdwg.org/dwc/terms/taxonID",
101
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
102
+ "http://purl.org/dc/terms/language"
103
+ ]],
104
+ file_name: "vernacular_names.txt",
105
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
106
+ }
107
+
108
+ @vernaculars.each do |v|
109
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
110
+ end
111
+
112
+ @eml = {
113
+ id: @uuid,
114
+ title: @title,
115
+ authors: [
116
+ { first_name: "G. F.",
117
+ last_name: "Clements"
118
+ },
119
+ { first_name: "T. S.",
120
+ last_name: "Schulenberg"
121
+ },
122
+ { first_name: "M. J.",
123
+ last_name: "Iliff"
124
+ },
125
+ { first_name: "S. M.",
126
+ last_name: "Billerman"
127
+ },
128
+ { first_name: "T. A.",
129
+ last_name: "Fredericks"
130
+ },
131
+ { first_name: "B. L.",
132
+ last_name: "Sullivan"
133
+ },
134
+ { first_name: "C. L.",
135
+ last_name: "Wood"
136
+ },
137
+ ],
138
+ metadata_providers: [
139
+ { first_name: "Dmitry",
140
+ last_name: "Mozzherin",
141
+ email: "dmozzherin@gmail.com" }
142
+ ],
143
+ abstract: "The eBird/Clements Checklist of Birds of the World" \
144
+ ": v2019. Downloaded from " \
145
+ "https://www.birds.cornell.edu/clementschecklist/download/",
146
+ url: @url
147
+ }
148
+ super
149
+ end
150
+ end
151
+ end