dwca_hunter 0.5.1 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/.byebug_history +45 -0
  3. data/.gitignore +5 -0
  4. data/.rubocop.yml +3 -2
  5. data/.ruby-version +1 -1
  6. data/Gemfile.lock +61 -83
  7. data/LICENSE.txt +1 -1
  8. data/README.md +1 -1
  9. data/dwca_hunter.gemspec +9 -9
  10. data/exe/dwcahunter +1 -3
  11. data/lib/dwca_hunter.rb +39 -8
  12. data/lib/dwca_hunter/resource.rb +5 -0
  13. data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
  14. data/lib/dwca_hunter/resources/arctos.rb +121 -145
  15. data/lib/dwca_hunter/resources/clements.rb +151 -0
  16. data/lib/dwca_hunter/resources/eol.rb +85 -0
  17. data/lib/dwca_hunter/resources/freebase.rb +51 -49
  18. data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
  19. data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
  20. data/lib/dwca_hunter/resources/ipni.rb +111 -0
  21. data/lib/dwca_hunter/resources/itis.rb +99 -99
  22. data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
  23. data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
  24. data/lib/dwca_hunter/resources/mcz.rb +123 -0
  25. data/lib/dwca_hunter/resources/ncbi.rb +22 -23
  26. data/lib/dwca_hunter/resources/opentree.rb +5 -5
  27. data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
  28. data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
  29. data/lib/dwca_hunter/resources/sherborn.rb +91 -0
  30. data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
  31. data/lib/dwca_hunter/version.rb +1 -1
  32. metadata +46 -40
  33. data/files/birdlife_7.csv +0 -11862
  34. data/files/fishbase_taxon_cache.tsv +0 -81000
  35. data/files/reptile_checklist_2014_12.csv +0 -15158
  36. data/files/species-black.txt +0 -251
@@ -1,30 +1,36 @@
1
- # encoding: utf-8
1
+ # frozen_string_literal: true
2
+
2
3
  module DwcaHunter
3
4
  class ResourceArctos < DwcaHunter::Resource
4
-
5
5
  def initialize(opts = {})
6
- @command = 'arctos'
7
- @title = 'Arctos'
8
- @url = 'http://arctos.database.museum/download/gncombined.zip'
9
- @UUID = 'eea8315d-a244-4625-859a-226675622312'
6
+ @command = "arctos"
7
+ @title = "Arctos"
8
+ @url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
9
+ @UUID = "eea8315d-a244-4625-859a-226675622312"
10
10
  @download_path = File.join(Dir.tmpdir,
11
- 'dwca_hunter',
12
- 'arctos',
13
- 'data.tar.gz')
11
+ "dwca_hunter",
12
+ "arctos",
13
+ "data.zip")
14
14
  @synonyms = []
15
15
  @names = []
16
16
  @vernaculars = []
17
17
  @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
18
20
  super(opts)
19
- @gnub_dir = File.join(@download_dir, 'gnub')
21
+ end
22
+
23
+ def download
24
+ puts "Downloading cached verion of the file. Ask Arctos to generate new."
25
+ `curl -s -L #{@url} -o #{@download_path}`
20
26
  end
21
27
 
22
28
  def unpack
23
- unpack_zip
29
+ unpack_tar
24
30
  end
25
31
 
26
32
  def make_dwca
27
- DwcaHunter::logger_write(self.object_id, 'Extracting data')
33
+ DwcaHunter.logger_write(object_id, "Extracting data")
28
34
  get_names
29
35
  generate_dwca
30
36
  end
@@ -33,190 +39,160 @@ module DwcaHunter
33
39
 
34
40
  def get_names
35
41
  Dir.chdir(@download_dir)
36
- Dir.entries(@download_dir).grep(/zip$/).each do |file|
37
- self.class.unzip(file) unless File.exists?(file.gsub(/zip$/,'csv'))
38
- end
39
- collect_names
40
42
  collect_synonyms
41
43
  collect_vernaculars
44
+ collect_names
42
45
  end
43
46
 
44
47
  def collect_vernaculars
45
- file = open(File.join(@download_dir, 'common_name.csv'))
46
- fields = {}
48
+ file = CSV.open(File.join(@download_dir, "common_name.csv"),
49
+ headers: true)
47
50
  file.each_with_index do |row, i|
51
+ canonical = row["SCIENTIFIC_NAME"]
52
+ vernacular_name_string = row["COMMON_NAME"]
48
53
 
49
- if i == 0
50
- fields = get_fields(row)
51
- next
54
+ if @vernaculars_hash.key?(canonical)
55
+ @vernaculars_hash[canonical] << vernacular_name_string
56
+ else
57
+ @vernaculars_hash[canonical] = [vernacular_name_string]
52
58
  end
53
59
 
54
- row = split_row(row)
55
-
56
- taxon_id = row[fields[:taxon_name_id]]
57
- vernacular_name_string = row[fields[:common_name]]
58
-
59
- @vernaculars << {
60
- taxon_id: taxon_id,
61
- vernacular_name_string: vernacular_name_string
62
- }
63
-
64
- puts "Processed %s vernaculars" % i if i % 10000 == 0
60
+ puts "Processed %s vernaculars" % i if i % 10_000 == 0
65
61
  end
66
62
  end
67
63
 
68
64
  def collect_synonyms
69
- file = open(File.join(@download_dir, 'taxon_relations.csv'))
70
- fields = {}
65
+ file = CSV.open(File.join(@download_dir, "relationships.csv"),
66
+ headers: true)
71
67
  file.each_with_index do |row, i|
72
- if i == 0
73
- fields = get_fields(row)
74
- next
68
+ canonical = row["scientific_name"]
69
+ if @synonyms_hash.key?(canonical)
70
+ @synonyms_hash[canonical] <<
71
+ { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
72
+ else
73
+ @synonyms_hash[canonical] = [
74
+ { name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
75
+ ]
75
76
  end
76
-
77
- row = split_row(row)
78
- taxon_id = row[fields[:taxon_name_id]]
79
- @synonyms << {
80
- taxon_id: row[fields[:related_taxon_name_id]],
81
- local_id: taxon_id,
82
- name_string: @names_index[taxon_id],
83
- #synonym_authority: row[fields[:relation_authority]],
84
- taxonomic_status: row[fields[:taxon_relationship]],
85
- }
86
- puts "Processed %s synonyms" % i if i % 10000 == 0
77
+ puts "Processed %s synonyms" % i if i % 10_000 == 0
87
78
  end
88
79
  end
89
80
 
90
81
  def collect_names
91
82
  @names_index = {}
92
- file = open(File.join(@download_dir, 'taxonomy.csv'))
93
- fields = {}
83
+ file = CSV.open(File.join(@download_dir, "classification.csv"),
84
+ headers: true)
94
85
  file.each_with_index do |row, i|
95
- if i == 0
96
- fields = get_fields(row)
97
- next
98
- end
99
- next unless row[fields[:display_name]]
100
- row = split_row(row)
101
- taxon_id = row[fields[:taxon_name_id]]
102
- name_string = row[fields[:display_name]].gsub(/<\/?i>/,'')
103
- kingdom = row[fields[:kingdom]]
104
- phylum = row[fields[:phylum]]
105
- klass = row[fields[:phylclass]]
106
- subclass = row[fields[:subclass]]
107
- order = row[fields[:phylorder]]
108
- suborder = row[fields[:suborder]]
109
- superfamily = row[fields[:superfamily]]
110
- family = row[fields[:family]]
111
- subfamily = row[fields[:subfamily]]
112
- tribe = row[fields[:tribe]]
113
- genus = row[fields[:genus]]
114
- subgenus = row[fields[:subgenus]]
115
- species = row[fields[:species]]
116
- subspecies = row[fields[:subspecies]]
117
- code = row[fields[:nomenclatural_code]]
118
-
86
+ next unless row["display_name"]
87
+
88
+ name_string = row["display_name"].gsub(%r{</?i>}, "")
89
+ canonical = row["scientific_name"]
90
+ kingdom = row["kingdom"]
91
+ phylum = row["phylum"]
92
+ klass = row["phylclass"]
93
+ subclass = row["subclass"]
94
+ order = row["phylorder"]
95
+ suborder = row["suborder"]
96
+ superfamily = row["superfamily"]
97
+ family = row["family"]
98
+ subfamily = row["subfamily"]
99
+ tribe = row["tribe"]
100
+ genus = row["genus"]
101
+ subgenus = row["subgenus"]
102
+ species = row["species"]
103
+ subspecies = row["subspecies"]
104
+ code = row["nomenclatural_code"]
105
+
106
+ taxon_id = "ARCT_#{i + 1}"
119
107
  @names << { taxon_id: taxon_id,
120
- local_id: taxon_id,
121
- name_string: name_string,
122
- kingdom: kingdom,
123
- phylum: phylum,
124
- klass: klass,
125
- order: order,
126
- family: family,
127
- genus: genus,
128
- code: code,
129
- }
130
-
131
- @names_index[taxon_id] = name_string
132
- puts "Processed %s names" % i if i % 10000 == 0
108
+ name_string: name_string,
109
+ kingdom: kingdom,
110
+ phylum: phylum,
111
+ klass: klass,
112
+ order: order,
113
+ family: family,
114
+ genus: genus,
115
+ code: code }
116
+
117
+ update_vernacular(taxon_id, canonical)
118
+ update_synonym(taxon_id, canonical)
119
+ puts "Processed %s names" % i if i % 10_000 == 0
133
120
  end
134
121
  end
135
122
 
136
- def split_row(row)
137
- row = row.strip.gsub(/^"/, '').gsub(/"$/, '')
138
- row.split('","')
139
- end
123
+ def update_vernacular(taxon_id, canonical)
124
+ return unless @vernaculars_hash.key?(canonical)
140
125
 
141
- def get_fields(row)
142
- row = row.split(",")
143
- encoding_options = {
144
- :invalid => :replace,
145
- :undef => :replace,
146
- :replace => '',
147
- :universal_newline => true
148
- }
149
- num_ary = (0...row.size).to_a
150
- row = row.map do |f|
151
- f = f.strip.downcase
152
- f = f.encode ::Encoding.find('ASCII'), encoding_options
153
- f.to_sym
126
+ @vernaculars_hash[canonical].each do |vern|
127
+ @vernaculars << { taxon_id: taxon_id, vern: vern }
154
128
  end
155
- Hash[row.zip(num_ary)]
156
129
  end
157
130
 
131
+ def update_synonym(taxon_id, canonical)
132
+ return unless @synonyms_hash.key?(canonical)
133
+
134
+ @synonyms_hash[canonical].each do |syn|
135
+ @synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
136
+ status: syn[:status] }
137
+ end
138
+ end
158
139
 
159
140
  def generate_dwca
160
- DwcaHunter::logger_write(self.object_id,
161
- 'Creating DarwinCore Archive file')
162
- @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
163
- 'http://globalnames.org/terms/localID',
164
- 'http://rs.tdwg.org/dwc/terms/scientificName',
165
- 'http://rs.tdwg.org/dwc/terms/kingdom',
166
- 'http://rs.tdwg.org/dwc/terms/phylum',
167
- 'http://rs.tdwg.org/dwc/terms/class',
168
- 'http://rs.tdwg.org/dwc/terms/order',
169
- 'http://rs.tdwg.org/dwc/terms/family',
170
- 'http://rs.tdwg.org/dwc/terms/genus',
171
- 'http://rs.tdwg.org/dwc/terms/nomenclaturalCode',
172
- ]]
141
+ DwcaHunter.logger_write(object_id,
142
+ "Creating DarwinCore Archive file")
143
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
144
+ "http://rs.tdwg.org/dwc/terms/scientificName",
145
+ "http://rs.tdwg.org/dwc/terms/kingdom",
146
+ "http://rs.tdwg.org/dwc/terms/phylum",
147
+ "http://rs.tdwg.org/dwc/terms/class",
148
+ "http://rs.tdwg.org/dwc/terms/order",
149
+ "http://rs.tdwg.org/dwc/terms/family",
150
+ "http://rs.tdwg.org/dwc/terms/genus",
151
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
173
152
  @names.each do |n|
174
- @core << [n[:taxon_id], n[:taxon_id], n[:name_string],
175
- n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
176
- n[:genus], n[:code]]
153
+ @core << [n[:taxon_id], n[:name_string],
154
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
155
+ n[:genus], n[:code]]
177
156
  end
178
157
  @extensions << {
179
158
  data: [[
180
- 'http://rs.tdwg.org/dwc/terms/taxonID',
181
- 'http://rs.tdwg.org/dwc/terms/vernacularName']],
182
- file_name: 'vernacular_names.txt',
183
- row_type: 'http://rs.gbif.org/terms/1.0/VernacularName' }
159
+ "http://rs.tdwg.org/dwc/terms/taxonID",
160
+ "http://rs.tdwg.org/dwc/terms/vernacularName"
161
+ ]],
162
+ file_name: "vernacular_names.txt",
163
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
164
+ }
184
165
 
185
166
  @vernaculars.each do |v|
186
- @extensions[-1][:data] << [v[:taxon_id], v[:vernacular_name_string]]
167
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern]]
187
168
  end
188
169
 
189
170
  @extensions << {
190
171
  data: [[
191
- 'http://rs.tdwg.org/dwc/terms/taxonID',
192
- 'http://globalnames.org/terms/localID',
193
- 'http://rs.tdwg.org/dwc/terms/scientificName',
194
- 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
195
- ]],
196
- file_name: 'synonyms.txt',
197
- }
198
-
172
+ "http://rs.tdwg.org/dwc/terms/taxonID",
173
+ "http://rs.tdwg.org/dwc/terms/scientificName",
174
+ "http://rs.tdwg.org/dwc/terms/taxonomicStatus"
175
+ ]],
176
+ file_name: "synonyms.txt"
177
+ }
199
178
  @synonyms.each do |s|
200
- @extensions[-1][:data] << [
201
- s[:taxon_id], s[:local_id],
202
- s[:name_string], s[:taxonomic_status]]
179
+ @extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
203
180
  end
204
181
  @eml = {
205
182
  id: @uuid,
206
183
  title: @title,
207
184
  authors: [
208
- {email: 'dustymc at gmail dot com'}
209
- ],
185
+ { email: "dustymc at gmail dot com" }
186
+ ],
210
187
  metadata_providers: [
211
- { first_name: 'Dmitry',
212
- last_name: 'Mozzherin',
213
- email: 'dmozzherin@gmail.com' }
214
- ],
215
- abstract: 'Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.',
188
+ { first_name: "Dmitry",
189
+ last_name: "Mozzherin",
190
+ email: "dmozzherin@gmail.com" }
191
+ ],
192
+ abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
216
193
  url: @url
217
194
  }
218
195
  super
219
196
  end
220
197
  end
221
198
  end
222
-
@@ -0,0 +1,151 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DwcaHunter
4
+ class ResourceClements < DwcaHunter::Resource
5
+ def initialize(opts = {})
6
+ @command = "clements-ebird"
7
+ @title = "The eBird/Clements Checklist of Birds of the World"
8
+ @url = "https://uofi.box.com/shared/static/b4n8zqa99hq9rdga27skkh3870yhujgo.csv"
9
+ @UUID = "577c0b56-4a3c-4314-8724-14b304f601de"
10
+ @download_path = File.join(Dir.tmpdir,
11
+ "dwca_hunter",
12
+ "clements",
13
+ "data.csv")
14
+ @synonyms = []
15
+ @names = []
16
+ @vernaculars = []
17
+ @extensions = []
18
+ @synonyms_hash = {}
19
+ @vernaculars_hash = {}
20
+ super(opts)
21
+ end
22
+
23
+ def download
24
+ puts "Downloading cached and modified version of the file."
25
+ puts "Go to https://www.birds.cornell.edu/clementschecklist/download/ " \
26
+ "for updates."
27
+ `curl -s -L #{@url} -o #{@download_path}`
28
+ end
29
+
30
+ def unpack
31
+ end
32
+
33
+ def make_dwca
34
+ DwcaHunter.logger_write(object_id, "Extracting data")
35
+ get_names
36
+ generate_dwca
37
+ end
38
+
39
+ private
40
+
41
+ def get_names
42
+ Dir.chdir(@download_dir)
43
+ collect_names
44
+ end
45
+
46
+ def collect_names
47
+ @names_index = {}
48
+ file = CSV.open(File.join(@download_dir, "data.csv"),
49
+ headers: true)
50
+ file.each_with_index do |row, i|
51
+ name_string = row["scientific name"]
52
+ canonical = name_string
53
+ kingdom = "Animalia"
54
+ phylum = "Chordata"
55
+ klass = "Aves"
56
+ order = row["order"]
57
+ family = row["family"]
58
+ code = "ICZN"
59
+
60
+ taxon_id = "gn_#{i + 1}"
61
+ @names << { taxon_id: taxon_id,
62
+ name_string: name_string,
63
+ kingdom: kingdom,
64
+ phylum: phylum,
65
+ klass: klass,
66
+ order: order,
67
+ family: family,
68
+ code: code }
69
+
70
+ if row["English name"].to_s != ""
71
+ @vernaculars << {
72
+ taxon_id: taxon_id,
73
+ vern: row["English name"],
74
+ lang: "end"
75
+ }
76
+ end
77
+
78
+ puts "Processed %s names" % i if i % 10_000 == 0
79
+ end
80
+ end
81
+
82
+ def generate_dwca
83
+ DwcaHunter.logger_write(object_id,
84
+ "Creating DarwinCore Archive file")
85
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
86
+ "http://rs.tdwg.org/dwc/terms/scientificName",
87
+ "http://rs.tdwg.org/dwc/terms/kingdom",
88
+ "http://rs.tdwg.org/dwc/terms/phylum",
89
+ "http://rs.tdwg.org/dwc/terms/class",
90
+ "http://rs.tdwg.org/dwc/terms/order",
91
+ "http://rs.tdwg.org/dwc/terms/family",
92
+ "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
93
+ @names.each do |n|
94
+ @core << [n[:taxon_id], n[:name_string],
95
+ n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
96
+ n[:code]]
97
+ end
98
+ @extensions << {
99
+ data: [[
100
+ "http://rs.tdwg.org/dwc/terms/taxonID",
101
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
102
+ "http://purl.org/dc/terms/language"
103
+ ]],
104
+ file_name: "vernacular_names.txt",
105
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
106
+ }
107
+
108
+ @vernaculars.each do |v|
109
+ @extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
110
+ end
111
+
112
+ @eml = {
113
+ id: @uuid,
114
+ title: @title,
115
+ authors: [
116
+ { first_name: "G. F.",
117
+ last_name: "Clements"
118
+ },
119
+ { first_name: "T. S.",
120
+ last_name: "Schulenberg"
121
+ },
122
+ { first_name: "M. J.",
123
+ last_name: "Iliff"
124
+ },
125
+ { first_name: "S. M.",
126
+ last_name: "Billerman"
127
+ },
128
+ { first_name: "T. A.",
129
+ last_name: "Fredericks"
130
+ },
131
+ { first_name: "B. L.",
132
+ last_name: "Sullivan"
133
+ },
134
+ { first_name: "C. L.",
135
+ last_name: "Wood"
136
+ },
137
+ ],
138
+ metadata_providers: [
139
+ { first_name: "Dmitry",
140
+ last_name: "Mozzherin",
141
+ email: "dmozzherin@gmail.com" }
142
+ ],
143
+ abstract: "The eBird/Clements Checklist of Birds of the World" \
144
+ ": v2019. Downloaded from " \
145
+ "https://www.birds.cornell.edu/clementschecklist/download/",
146
+ url: @url
147
+ }
148
+ super
149
+ end
150
+ end
151
+ end