dwca_hunter 0.5.2 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/.byebug_history +37 -0
  3. data/.gitignore +5 -0
  4. data/.rubocop.yml +3 -2
  5. data/.ruby-version +1 -1
  6. data/Gemfile.lock +59 -135
  7. data/LICENSE.txt +1 -1
  8. data/README.md +1 -1
  9. data/dwca_hunter.gemspec +7 -8
  10. data/exe/dwcahunter +1 -3
  11. data/lib/dwca_hunter.rb +39 -8
  12. data/lib/dwca_hunter/resource.rb +5 -0
  13. data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
  14. data/lib/dwca_hunter/resources/arctos.rb +121 -145
  15. data/lib/dwca_hunter/resources/clements.rb +151 -0
  16. data/lib/dwca_hunter/resources/eol.rb +85 -0
  17. data/lib/dwca_hunter/resources/freebase.rb +51 -49
  18. data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
  19. data/lib/dwca_hunter/resources/index-fungorum.rb +131 -0
  20. data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
  21. data/lib/dwca_hunter/resources/ion.rb +98 -0
  22. data/lib/dwca_hunter/resources/ipni.rb +3 -2
  23. data/lib/dwca_hunter/resources/itis.rb +99 -99
  24. data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
  25. data/lib/dwca_hunter/resources/mammal_species.rb +9 -6
  26. data/lib/dwca_hunter/resources/mcz.rb +123 -0
  27. data/lib/dwca_hunter/resources/ncbi.rb +22 -23
  28. data/lib/dwca_hunter/resources/opentree.rb +5 -5
  29. data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
  30. data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
  31. data/lib/dwca_hunter/resources/sherborn.rb +91 -0
  32. data/lib/dwca_hunter/resources/wikispecies.rb +142 -129
  33. data/lib/dwca_hunter/version.rb +1 -1
  34. metadata +31 -40
  35. data/files/birdlife_7.csv +0 -11862
  36. data/files/fishbase_taxon_cache.tsv +0 -81000
  37. data/files/reptile_checklist_2014_12.csv +0 -15158
  38. data/files/species-black.txt +0 -251
  39. data/ipni.csv.gz +0 -0
  40. data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Harvests the Index Fungorum (Species Fungorum) CSV export and turns it
  # into a DarwinCore Archive.
  #
  # NOTE(review): the class is called ResourceAOS although everything in it
  # describes Index Fungorum. This looks like a copy-paste leftover and it
  # would reopen any other DwcaHunter::ResourceAOS instead of defining a new
  # class. The name is kept so existing references keep working; confirm and
  # rename (e.g. ResourceIndexFungorum) in a coordinated change.
  class ResourceAOS < DwcaHunter::Resource
    # Column order of the core file; must match the values picked from each
    # collected name in #generate_dwca.
    CORE_FIELDS = %i[taxon_id name_string current_id kingdom phylum klass
                     order family code].freeze

    # Sets resource metadata and empty accumulators, then delegates to the
    # parent initializer.
    #
    # @param opts [Hash] options forwarded unchanged to the parent class
    def initialize(opts = {})
      @command = "index-fungorum"
      @title = "Index Fungorum (Species Fungorum)"
      @url = "https://uofi.box.com/shared/static/54l3b7h4q4pwqq4fgqvx42h3d328fl1c.csv"
      # #generate_dwca reads @uuid for the EML id. The identifier used to be
      # stored only in @UUID, so this class never assigned @uuid. @UUID is
      # still set in case anything reads the old name.
      @uuid = "af06816a-0b28-4a09-8219-bd1d63289858"
      @UUID = @uuid
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "index-fungorum",
                                 "data.csv")
      @synonyms = []
      @names = []
      @extensions = []
      @synonyms_hash = {}
      super(opts)
    end

    # Fetches the CSV from @url into @download_path using curl.
    def download
      puts "Downloading csv from remote"
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain CSV file.
    def unpack; end

    # Reads the downloaded CSV and writes the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the working directory to @download_dir and collects names.
    # @download_dir is not assigned in this class; presumably the parent
    # class sets it.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Appends one hash per CSV row to @names.
    def collect_names
      @names_index = {}
      path = File.join(@download_dir, "data.csv")
      # The block form of CSV.open closes the file handle when the block ends.
      CSV.open(path, headers: true) do |csv|
        csv.each do |row|
          @names << {
            taxon_id: row["RECORD NUMBER"],
            name_string: name_with_authorship(row),
            current_id: row["CURRENT NAME RECORD NUMBER"],
            kingdom: row["Kingdom name"],
            phylum: row["Phylum name"],
            klass: row["Class name"],
            order: row["Order name"],
            family: row["Family name"],
            code: "ICN"
          }
        end
      end
    end

    # Joins name, authors and year with single spaces, skipping parts that
    # are missing or blank so no stray whitespace ends up in the name.
    #
    # @param row [CSV::Row] one row of the Index Fungorum export
    # @return [String]
    def name_with_authorship(row)
      parts = [row["NAME OF FUNGUS"], row["AUTHORS"],
               row["YEAR OF PUBLICATION"]]
      present = parts.map { |part| part.to_s.strip }.reject(&:empty?)
      present.join(" ")
    end

    # Fills @core and @eml from the collected names and lets the parent
    # class produce the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
                "http://rs.tdwg.org/dwc/terms/kingdom",
                "http://rs.tdwg.org/dwc/terms/phylum",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each { |n| @core << n.values_at(*CORE_FIELDS) }

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Paul",
            last_name: "Kirk" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "The Index Fungorum, the global fungal nomenclator " \
                  "coordinated and supported by the Index Fungorum Partnership, " \
                  "contains names of fungi (including yeasts, lichens, chromistan " \
                  "fungal analogues, protozoan fungal analogues and fossil forms) " \
                  "at all ranks.\n\n" \
                  "As a result of changes to the ICN (previously ICBN) relating to " \
                  "registration of names and following the lead taken by MycoBank, " \
                  "Index Fungorum now provides a mechanism to register names of " \
                  "new taxa, new names, new combinations and new typifications — no " \
                  "login is required. Names registered at Index Fungorum can be " \
                  "published immediately through the Index Fungorum e-Publication " \
                  "facility — an authorized login is required for this.\n\n" \
                  "Species Fungorum is currently an RBG Kew coordinated initiative " \
                  "to compile a global checklist of the fungi. You may search " \
                  "systematically defined and taxonomically complete datasets - " \
                  "global species databases - or the entire Species Fungorum. " \
                  "Species Fungorum contributes the fungal component to the Species " \
                  "2000 project and, in partnership with ITIS, to the Catalogue " \
                  "of Life (currently used in the GBIF and EoL portal); for more " \
                  "information regarding these global initiative visit their " \
                  "websites. Please contact Paul Kirk if you would like to " \
                  "contribute to Species Fungorum.",
        url: @url
      }
      super
    end
  end
end
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Harvests a CSV conversion of the IOC World Bird List and turns it into a
  # DarwinCore Archive with a vernacular-names extension.
  class ResourceIOCWorldBird < DwcaHunter::Resource
    # Column order of the core file; must match the values picked from each
    # collected name in #generate_dwca.
    CORE_FIELDS = %i[taxon_id name_string kingdom phylum klass order family
                     genus code].freeze

    # [first name, last name] pairs listed as authors in the EML metadata.
    AUTHORS = [
      %w[Per Alstrom], %w[Mike Blair], %w[Rauri Bowie], %w[Nigel Redman],
      %w[Jon Fjeldsa], %w[Phil Gregory], %w[Leo Joseph], %w[Peter Kovalik],
      %w[Adolfo Navarro-Siguenza], %w[David Parkin], %w[Alan Peterson],
      %w[Douglas Pratt], %w[Pam Rasmussen], %w[Frank Rheindt],
      %w[Robert Ridgely], %w[Peter Ryan], %w[George Sangster],
      %w[Dick Schodde], %w[Minturn Wright]
    ].freeze

    # Sets resource metadata and empty accumulators, then delegates to the
    # parent initializer.
    #
    # @param opts [Hash] options forwarded unchanged to the parent class
    def initialize(opts = {})
      @command = "ioc-world-bird"
      @title = "IOC World Bird List"
      @url = "https://uofi.box.com/shared/static/fbpuk5ghh9083nbzjdeyqtoqsvdnro01.csv"
      # #generate_dwca reads @uuid for the EML id. The identifier used to be
      # stored only in @UUID, so this class never assigned @uuid. @UUID is
      # still set in case anything reads the old name.
      @uuid = "6421ffec-38e3-40fb-a6d9-af27238a47a1"
      @UUID = @uuid
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "ioc-bird",
                                 "data.csv")
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Fetches the cached CSV from @url into @download_path using curl and
    # reminds the operator where to look for a newer master list.
    def download
      puts "Downloading cached and converted to csv version."
      puts "CHECK FOR NEW VERSION at"
      puts "https://www.worldbirdnames.org/ioc-lists/master-list-2/"
      puts "Use libreoffice to convert to csv."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain CSV file.
    def unpack; end

    # Reads the downloaded CSV and writes the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the working directory to @download_dir and collects names.
    # @download_dir is not assigned in this class; presumably the parent
    # class sets it.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Walks the CSV and fills @names (and @vernaculars for species rows).
    #
    # Order, family and genus are carried forward from the last row that
    # provided them. Species is NOT carried forward: a row with an empty
    # "Species (Scientific)" cell is skipped, even if it has a subspecies.
    def collect_names
      @names_index = {}
      order = ""
      family = ""
      genus = ""
      count = 0
      path = File.join(@download_dir, "data.csv")
      # The block form of CSV.open closes the file handle when the block ends.
      CSV.open(path, headers: true) do |csv|
        csv.each do |row|
          order = row["Order"].capitalize unless row["Order"].to_s.empty?
          unless row["Family (Scientific)"].to_s.empty?
            family = row["Family (Scientific)"].capitalize
          end
          genus = row["Genus"].capitalize unless row["Genus"].to_s.empty?

          species = row["Species (Scientific)"].to_s
          next if species.empty?

          subspecies = row["Subspecies"].to_s
          count += 1
          # The list has no stable identifiers, so ids are running numbers.
          taxon_id = "gn_#{count}"
          name = {
            taxon_id: taxon_id,
            kingdom: "Animalia",
            phylum: "Chordata",
            klass: "Aves",
            order: order,
            family: family,
            genus: genus,
            code: "ICZN"
          }

          if subspecies.empty?
            auth = row["Authority"].to_s
            auth = DwcaHunter.normalize_authors(auth) unless auth.empty?
            name[:name_string] = clean("#{genus} #{species} #{auth}")
            add_vernacular(taxon_id, row["Species (English)"])
          else
            # NOTE(review): unlike the species branch, the authority is used
            # as-is here (no normalize_authors) -- confirm this is intended.
            name[:name_string] = clean(
              "#{genus} #{species} #{subspecies} #{row['Authority']}"
            )
          end
          @names << name
        end
      end
    end

    # Records an English vernacular name for a taxon when one is present.
    #
    # @param taxon_id [String] id of the taxon the name belongs to
    # @param vernacular [String, nil] value of the "Species (English)" cell
    def add_vernacular(taxon_id, vernacular)
      return if vernacular.to_s.empty?

      @vernaculars << { taxon_id: taxon_id, vern: vernacular, lang: "en" }
    end

    # Removes dagger marks, collapses whitespace runs to a single space and
    # trims the ends. Trimming happens last so that a dagger at either end
    # of the string does not leave a stray space behind.
    #
    # @param n [String] raw name string
    # @return [String] cleaned name string
    def clean(n)
      n.delete("†").gsub(/\s+/, " ").strip
    end

    # Fills @core, the vernacular extension and @eml from the collected
    # data and lets the parent class produce the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/kingdom",
                "http://rs.tdwg.org/dwc/terms/phylum",
                "http://rs.tdwg.org/dwc/terms/class",
                "http://rs.tdwg.org/dwc/terms/order",
                "http://rs.tdwg.org/dwc/terms/family",
                "http://rs.tdwg.org/dwc/terms/genus",
                "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
      @names.each { |n| @core << n.values_at(*CORE_FIELDS) }

      vernacular_rows = [[
        "http://rs.tdwg.org/dwc/terms/taxonID",
        "http://rs.tdwg.org/dwc/terms/vernacularName",
        "http://purl.org/dc/terms/language"
      ]]
      @vernaculars.each do |v|
        vernacular_rows << v.values_at(:taxon_id, :vern, :lang)
      end
      @extensions << {
        data: vernacular_rows,
        file_name: "vernacular_names.txt",
        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
      }

      @eml = {
        id: @uuid,
        title: @title,
        authors: AUTHORS.map do |first, last|
          { first_name: first, last_name: last }
        end,
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "The IOC World Bird List is an open access resource of " \
                  "the international community of ornithologists.",
        url: "https://www.worldbirdnames.org"
      }
      super
    end
  end
end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Harvests a cached dump of the Index to Organism Names (ION) and turns it
  # into a DarwinCore Archive.
  class ResourceION < DwcaHunter::Resource
    # Sets resource metadata and empty accumulators, then delegates to the
    # parent initializer.
    #
    # @param opts [Hash] options forwarded unchanged to the parent class
    def initialize(opts = {})
      @command = "ion"
      @title = "Index to Organism Names"
      @url = "https://uofi.box.com/shared/static/tklh8i6q2kb33g6ki33k6s3is06lo9np.gz"
      # #generate_dwca reads @uuid for the EML id. The identifier used to be
      # stored only in @UUID, so this class never assigned @uuid. @UUID is
      # still set in case anything reads the old name.
      @uuid = "1137dfa3-5b8c-487d-b497-dc0938605864"
      @UUID = @uuid
      @download_path = File.join(Dir.tmpdir,
                                 "dwca_hunter",
                                 "ion",
                                 "data.tar.gz")
      @names = []
      @extensions = []
      super(opts)
    end

    # Fetches the cached archive from @url into @download_path using curl.
    def download
      puts "Downloading cached version of the file. Ask Rod Page to make new."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Extracts the downloaded tarball via unpack_tar, which is not defined in
    # this class (presumably inherited).
    def unpack
      unpack_tar
    end

    # Reads the extracted TSV and writes the DarwinCore Archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the working directory to @download_dir and collects names.
    # @download_dir is not assigned in this class; presumably the parent
    # class sets it.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Appends one hash per row of ion.tsv to @names and prints a progress
    # line every 10,000 rows (including row 0).
    def collect_names
      path = File.join(@download_dir, "ion.tsv")
      # quote_char is set to a Cyrillic letter, presumably so that ordinary
      # double quotes in the data are read literally -- confirm.
      csv_opts = { headers: true, col_sep: "\t", quote_char: "щ" }
      # The block form of CSV.open closes the file handle when the block ends.
      CSV.open(path, **csv_opts) do |csv|
        csv.each_with_index do |row, i|
          @names << { taxon_id: row["id"],
                      name_string: row["nameComplete"],
                      auth: row["taxonAuthor"] }

          puts format("Processed %s names", i) if (i % 10_000).zero?
        end
      end
    end

    # Fills @core and @eml from the collected names and lets the parent
    # class produce the archive.
    def generate_dwca
      DwcaHunter.logger_write(object_id,
                              "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/scientificNameAuthorship"]]
      @names.each do |n|
        @core << n.values_at(:taxon_id, :name_string, :auth)
      end

      # FIXME: the abstract below reads "approximately ,000 international
      # journals" -- a digit is missing in front of ",000". The text is left
      # untouched until the number is checked against the ION description.
      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Nigel",
            last_name: "Robinson",
            email: "nigel.robinson@thomsonreuters.com" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "ION contains millions of animal names, both fossil and " \
                  "recent, at all taxonomic ranks, reported from the scientific " \
                  "literature. (Bacteria, plant and virus names will be added soon)." \
                  "\n\n" \
                  "These names are derived from premier Clarivate databases: " \
                  "Zoological Record®, BIOSIS Previews®, and Biological Abstracts®. " \
                  "All names are tied to at least one published article. Together, " \
                  "these resources cover every aspect of the life sciences - " \
                  "providing names from over 30 million scientific records, " \
                  "including approximately ,000 international journals, patents, " \
                  "books, and conference proceedings. They provide a powerful " \
                  "foundation for the most complete collection of organism names " \
                  "available today.",
        url: @url
      }
      super
    end
  end
end
@@ -8,7 +8,7 @@ module DwcaHunter
8
8
  @command = "ipni"
9
9
  @title = "The International Plant Names Index"
10
10
  @abbr = "IPNI"
11
- @url = "https://www.dropbox.com/s/1n0sn80vkdir5nu/ipniWebName.csv.xz"
11
+ @url = "https://uofi.box.com/shared/static/s0x4xjonxt54pi89n543gdmttrdqd6iv.xz"
12
12
  @uuid = "6b3905ce-5025-49f3-9697-ddd5bdfb4ff0"
13
13
  @download_path = File.join(Dir.tmpdir, "dwca_hunter", "ipni",
14
14
  "ipni.csv.xz")
@@ -22,8 +22,9 @@ module DwcaHunter
22
22
  end
23
23
 
24
24
  def download
25
- puts "Downloading cached verion of the file. Get daily updated one from"
25
+ puts "Download by hand from"
26
26
  puts "https://storage.cloud.google.com/ipni-data/ipniWebName.csv.xz"
27
+ puts "and copy to given url"
27
28
  `curl -s -L #{@url} -o #{@download_path}`
28
29
  end
29
30