dwca_hunter 0.5.3 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Converts the Index Fungorum (Species Fungorum) CSV export into a
  # DarwinCore Archive.
  #
  # NOTE(review): the class is called ResourceAOS although its command, title
  # and data all describe Index Fungorum. The name is kept because callers may
  # reference it -- confirm it does not collide with another resource class.
  class ResourceAOS < DwcaHunter::Resource
    # DarwinCore terms heading the core file, in column order.
    CORE_TERMS = %w[
      http://rs.tdwg.org/dwc/terms/taxonID
      http://rs.tdwg.org/dwc/terms/scientificName
      http://rs.tdwg.org/dwc/terms/acceptedNameUsageID
      http://rs.tdwg.org/dwc/terms/kingdom
      http://rs.tdwg.org/dwc/terms/phylum
      http://rs.tdwg.org/dwc/terms/class
      http://rs.tdwg.org/dwc/terms/order
      http://rs.tdwg.org/dwc/terms/family
      http://rs.tdwg.org/dwc/terms/nomenclaturalCode
    ].freeze

    # Keys of a collected name, in the same order as CORE_TERMS.
    CORE_FIELDS = %i[
      taxon_id name_string current_id kingdom phylum klass order family code
    ].freeze

    # CSV columns that are joined to form the scientific name string.
    NAME_COLUMNS = ["NAME OF FUNGUS", "AUTHORS", "YEAR OF PUBLICATION"].freeze

    # Dataset description written into the EML metadata.
    ABSTRACT =
      "The Index Fungorum, the global fungal nomenclator " \
      "coordinated and supported by the Index Fungorum Partnership, " \
      "contains names of fungi (including yeasts, lichens, chromistan " \
      "fungal analogues, protozoan fungal analogues and fossil forms) " \
      "at all ranks.\n\n" \
      "As a result of changes to the ICN (previously ICBN) relating to " \
      "registration of names and following the lead taken by MycoBank, " \
      "Index Fungorum now provides a mechanism to register names of " \
      "new taxa, new names, new combinations and new typifications — no " \
      "login is required. Names registered at Index Fungorum can be " \
      "published immediately through the Index Fungorum e-Publication " \
      "facility — an authorized login is required for this.\n\n" \
      "Species Fungorum is currently an RBG Kew coordinated initiative " \
      "to compile a global checklist of the fungi. You may search " \
      "systematically defined and taxonomically complete datasets - " \
      "global species databases - or the entire Species Fungorum. " \
      "Species Fungorum contributes the fungal component to the Species " \
      "2000 project and, in partnership with ITIS, to the Catalogue " \
      "of Life (currently used in the GBIF and EoL portal); for more " \
      "information regarding these global initiatives visit their " \
      "websites. Please contact Paul Kirk if you would like to " \
      "contribute to Species Fungorum."

    private_constant :CORE_TERMS, :CORE_FIELDS, :NAME_COLUMNS, :ABSTRACT

    # Sets the resource metadata and empty accumulators, then passes the
    # options on to DwcaHunter::Resource.
    #
    # @param opts [Hash] options forwarded unchanged to the superclass
    def initialize(opts = {})
      @command = "index-fungorum"
      @title = "Index Fungorum (Species Fungorum)"
      @url = "https://uofi.box.com/shared/static/54l3b7h4q4pwqq4fgqvx42h3d328fl1c.csv"
      # Must be @uuid (lower case): that is the variable #generate_dwca reads
      # for the EML id.
      @uuid = "af06816a-0b28-4a09-8219-bd1d63289858"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "index-fungorum",
                                 "data.csv")
      @synonyms = []
      @names = []
      @extensions = []
      @synonyms_hash = {}
      super(opts)
    end

    # Fetches the CSV at @url into @download_path with curl (silent, following
    # redirects).
    #
    # @return [String] whatever curl wrote to standard output
    def download
      puts "Downloading csv from remote"
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain CSV file.
    def unpack; end

    # Reads the names from the downloaded CSV and writes the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the process working directory to @download_dir (not assigned in
    # this class; presumably set by the superclass) and collects the names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Appends one hash per CSV row to @names. The block form of CSV.foreach
    # closes the file once the last row has been read.
    def collect_names
      @names_index = {}
      CSV.foreach(File.join(@download_dir, "data.csv"), headers: true) do |row|
        @names << {
          taxon_id: row["RECORD NUMBER"],
          name_string: full_name(row),
          current_id: row["CURRENT NAME RECORD NUMBER"],
          kingdom: row["Kingdom name"],
          phylum: row["Phylum name"],
          klass: row["Class name"],
          order: row["Order name"],
          family: row["Family name"],
          code: "ICN"
        }
      end
    end

    # Joins name, authors and year with single spaces, leaving out blank parts
    # so that a missing author or year does not produce stray whitespace.
    #
    # @param row [CSV::Row] one record of the Index Fungorum export
    # @return [String] the assembled name string
    def full_name(row)
      NAME_COLUMNS.map { |column| row[column].to_s.strip }
                  .reject(&:empty?)
                  .join(" ")
    end

    # Fills @core (header row plus one row per collected name) and @eml, then
    # delegates to the superclass implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
      @core = [CORE_TERMS.dup]
      @names.each { |name| @core << name.values_at(*CORE_FIELDS) }

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Paul", last_name: "Kirk" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: ABSTRACT,
        url: @url
      }
      super
    end
  end
end
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Converts a cached CSV conversion of the IOC World Bird List into a
  # DarwinCore Archive with a vernacular-name extension.
  class ResourceIOCWorldBird < DwcaHunter::Resource
    # DarwinCore terms heading the core file, in column order.
    CORE_TERMS = %w[
      http://rs.tdwg.org/dwc/terms/taxonID
      http://rs.tdwg.org/dwc/terms/scientificName
      http://rs.tdwg.org/dwc/terms/kingdom
      http://rs.tdwg.org/dwc/terms/phylum
      http://rs.tdwg.org/dwc/terms/class
      http://rs.tdwg.org/dwc/terms/order
      http://rs.tdwg.org/dwc/terms/family
      http://rs.tdwg.org/dwc/terms/genus
      http://rs.tdwg.org/dwc/terms/nomenclaturalCode
    ].freeze

    # Keys of a collected name, in the same order as CORE_TERMS.
    CORE_FIELDS = %i[
      taxon_id name_string kingdom phylum klass order family genus code
    ].freeze

    # Terms heading the vernacular-name extension file.
    VERNACULAR_TERMS = %w[
      http://rs.tdwg.org/dwc/terms/taxonID
      http://rs.tdwg.org/dwc/terms/vernacularName
      http://purl.org/dc/terms/language
    ].freeze

    # First and last names of the authors listed in the EML metadata.
    AUTHOR_NAMES = [
      %w[Per Alstrom], %w[Mike Blair], %w[Rauri Bowie], %w[Nigel Redman],
      %w[Jon Fjeldsa], %w[Phil Gregory], %w[Leo Joseph], %w[Peter Kovalik],
      %w[Adolfo Navarro-Siguenza], %w[David Parkin], %w[Alan Peterson],
      %w[Douglas Pratt], %w[Pam Rasmussen], %w[Frank Rheindt],
      %w[Robert Ridgely], %w[Peter Ryan], %w[George Sangster],
      %w[Dick Schodde], %w[Minturn Wright]
    ].freeze

    private_constant :CORE_TERMS, :CORE_FIELDS, :VERNACULAR_TERMS,
                     :AUTHOR_NAMES

    # Sets the resource metadata and empty accumulators, then passes the
    # options on to DwcaHunter::Resource.
    #
    # @param opts [Hash] options forwarded unchanged to the superclass
    def initialize(opts = {})
      @command = 'ioc-world-bird'
      @title = 'IOC World Bird List'
      @url = 'https://uofi.box.com/shared/static/znsd734a78saq87hes979p5uspgkzy93.csv'
      # Must be @uuid (lower case): that is the variable #generate_dwca reads
      # for the EML id.
      @uuid = '6421ffec-38e3-40fb-a6d9-af27238a47a1'
      @download_path = File.join(Dir.tmpdir, 'dwca_hunter', 'ioc-bird',
                                 'data.csv')
      @synonyms = []
      @names = []
      @vernaculars = []
      @extensions = []
      @synonyms_hash = {}
      @vernaculars_hash = {}
      super(opts)
    end

    # Prints a reminder about where a newer list can be found, then fetches
    # the cached CSV at @url into @download_path with curl.
    #
    # @return [String] whatever curl wrote to standard output
    def download
      puts 'Downloading cached and converted to csv version.'
      puts 'CHECK FOR NEW VERSION at'
      puts 'https://www.worldbirdnames.org/ioc-lists/master-list-2/'
      puts 'Use libreoffice to convert to csv.'
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Nothing to unpack: the download is already a plain CSV file.
    def unpack; end

    # Reads the names from the downloaded CSV and writes the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, 'Extracting data')
      get_names
      generate_dwca
    end

    private

    # Switches the process working directory to @download_dir (not assigned in
    # this class; presumably set by the superclass) and collects the names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Walks the CSV and fills @names and @vernaculars.
    #
    # Order, family and genus are remembered from the most recent row that
    # supplied them. The species is NOT carried over: a row is emitted only
    # when its own 'Species (Scientific)' cell is filled.
    #
    # NOTE(review): as a consequence, rows that carry a subspecies but no
    # species are skipped -- confirm that is intended.
    def collect_names
      @names_index = {}
      order = ''
      family = ''
      genus = ''
      count = 0
      CSV.foreach(File.join(@download_dir, 'data.csv'), headers: true) do |row|
        order = rank_or_previous(row['Order'], order)
        family = rank_or_previous(row['Family (Scientific)'], family)
        genus = rank_or_previous(row['Genus'], genus)

        species = row['Species (Scientific)'].to_s
        next if species.empty?

        count += 1
        taxon_id = "gn_#{count}"
        name = {
          taxon_id: taxon_id,
          kingdom: 'Animalia',
          phylum: 'Chordata',
          klass: 'Aves',
          order: order,
          family: family,
          genus: genus,
          code: 'ICZN'
        }

        subspecies = row['Subspecies'].to_s
        if subspecies.empty?
          name[:name_string] = species_name(genus, species, row['Authority'])
          add_vernacular(taxon_id, row['Species (English)'])
        else
          # NOTE(review): unlike the species branch, the authority is used
          # here without DwcaHunter.normalize_authors -- confirm intended.
          name[:name_string] = clean(
            "#{genus} #{species} #{subspecies} #{row['Authority']}".strip
          )
        end
        @names << name
      end
    end

    # Returns the capitalized cell value, or +previous+ when the cell is blank.
    def rank_or_previous(value, previous)
      value.to_s.empty? ? previous : value.capitalize
    end

    # Builds the name string of a species row; a non-blank authority is passed
    # through DwcaHunter.normalize_authors first.
    def species_name(genus, species, authority)
      auth = authority.to_s
      auth = DwcaHunter.normalize_authors(auth) unless auth.empty?
      clean("#{genus} #{species} #{auth}".strip)
    end

    # Records an English vernacular name for +taxon_id+ unless it is blank.
    def add_vernacular(taxon_id, vernacular)
      return if vernacular.to_s.empty?

      @vernaculars << { taxon_id: taxon_id, vern: vernacular, lang: 'en' }
    end

    # Removes dagger marks and collapses runs of whitespace to single spaces.
    # The final strip drops an edge space that removing a dagger can expose.
    #
    # @param n [String] raw name string
    # @return [String] the tidied name string
    def clean(n)
      n.delete('†').gsub(/\s+/, ' ').strip
    end

    # Fills @core, appends the vernacular-name extension to @extensions and
    # sets @eml, then delegates to the superclass implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id, 'Creating DarwinCore Archive file')
      @core = [CORE_TERMS.dup]
      @names.each { |name| @core << name.values_at(*CORE_FIELDS) }

      vernacular_rows = [VERNACULAR_TERMS.dup]
      @vernaculars.each do |v|
        vernacular_rows << v.values_at(:taxon_id, :vern, :lang)
      end
      @extensions << {
        data: vernacular_rows,
        file_name: 'vernacular_names.txt',
        row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
      }

      @eml = {
        id: @uuid,
        title: @title,
        authors: AUTHOR_NAMES.map do |first, last|
          { first_name: first, last_name: last }
        end,
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@gmail.com' }
        ],
        abstract: 'The IOC World Bird List is an open access resource of ' \
                  'the international community of ornithologists.',
        url: 'https://www.worldbirdnames.org'
      }
      super
    end
  end
end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Converts a cached dump of the Index to Organism Names (ION) into a
  # DarwinCore Archive.
  class ResourceION < DwcaHunter::Resource
    # DarwinCore terms heading the core file, in column order.
    CORE_TERMS = %w[
      http://rs.tdwg.org/dwc/terms/taxonID
      http://rs.tdwg.org/dwc/terms/scientificName
      http://rs.tdwg.org/dwc/terms/scientificNameAuthorship
    ].freeze

    # Dataset description written into the EML metadata.
    # FIXME: "approximately ,000" has lost its leading digits; restore the
    # figure from the publisher's text.
    ABSTRACT =
      "ION contains millions of animal names, both fossil and " \
      "recent, at all taxonomic ranks, reported from the scientific " \
      "literature. (Bacteria, plant and virus names will be added soon)." \
      "\n\n" \
      "These names are derived from premier Clarivate databases: " \
      "Zoological Record®, BIOSIS Previews®, and Biological Abstracts®. " \
      "All names are tied to at least one published article. Together, " \
      "these resources cover every aspect of the life sciences - " \
      "providing names from over 30 million scientific records, " \
      "including approximately ,000 international journals, patents, " \
      "books, and conference proceedings. They provide a powerful " \
      "foundation for the most complete collection of organism names " \
      "available today."

    private_constant :CORE_TERMS, :ABSTRACT

    # Sets the resource metadata and empty accumulators, then passes the
    # options on to DwcaHunter::Resource.
    #
    # @param opts [Hash] options forwarded unchanged to the superclass
    def initialize(opts = {})
      @command = "ion"
      @title = "Index to Organism Names"
      @url = "https://uofi.box.com/shared/static/tklh8i6q2kb33g6ki33k6s3is06lo9np.gz"
      # Must be @uuid (lower case): that is the variable #generate_dwca reads
      # for the EML id.
      @uuid = "1137dfa3-5b8c-487d-b497-dc0938605864"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "ion",
                                 "data.tar.gz")
      @names = []
      @extensions = []
      super(opts)
    end

    # Fetches the cached archive at @url into @download_path with curl
    # (silent, following redirects).
    #
    # @return [String] whatever curl wrote to standard output
    def download
      puts "Downloading cached version of the file. Ask Rod Page to make new."
      `curl -s -L #{@url} -o #{@download_path}`
    end

    # Extracts the downloaded tarball via the inherited unpack_tar helper.
    def unpack
      unpack_tar
    end

    # Reads the names from the unpacked file and writes the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      get_names
      generate_dwca
    end

    private

    # Switches the process working directory to @download_dir (not assigned in
    # this class; presumably set by the superclass) and collects the names.
    def get_names
      Dir.chdir(@download_dir)
      collect_names
    end

    # Appends one hash per row of ion.tsv to @names and prints a progress line
    # every 10,000 rows. The block form of CSV.open closes the file when the
    # block returns.
    def collect_names
      path = File.join(@download_dir, "ion.tsv")
      # quote_char is presumably set to a character that does not occur in the
      # data so that ordinary quotes are read literally -- TODO confirm.
      CSV.open(path, headers: true, col_sep: "\t", quote_char: "щ") do |tsv|
        tsv.each_with_index do |row, i|
          @names << { taxon_id: row["id"],
                      name_string: row["nameComplete"],
                      auth: row["taxonAuthor"] }

          puts format("Processed %s names", i) if (i % 10_000).zero?
        end
      end
    end

    # Fills @core (header row plus one row per collected name) and @eml, then
    # delegates to the superclass implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
      @core = [CORE_TERMS.dup]
      @names.each do |name|
        @core << name.values_at(:taxon_id, :name_string, :auth)
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          { first_name: "Nigel",
            last_name: "Robinson",
            email: "nigel.robinson@thomsonreuters.com" }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: ABSTRACT,
        url: @url
      }
      super
    end
  end
end
@@ -8,7 +8,7 @@ module DwcaHunter
8
8
  @command = "ipni"
9
9
  @title = "The International Plant Names Index"
10
10
  @abbr = "IPNI"
11
- @url = "https://www.dropbox.com/s/1n0sn80vkdir5nu/ipniWebName.csv.xz"
11
+ @url = "https://uofi.box.com/shared/static/s0x4xjonxt54pi89n543gdmttrdqd6iv.xz"
12
12
  @uuid = "6b3905ce-5025-49f3-9697-ddd5bdfb4ff0"
13
13
  @download_path = File.join(Dir.tmpdir, "dwca_hunter", "ipni",
14
14
  "ipni.csv.xz")
@@ -22,8 +22,9 @@ module DwcaHunter
22
22
  end
23
23
 
24
24
  def download
25
- puts "Downloading cached verion of the file. Get daily updated one from"
25
+ puts "Download by hand from"
26
26
  puts "https://storage.cloud.google.com/ipni-data/ipniWebName.csv.xz"
27
+ puts "and copy to given url"
27
28
  `curl -s -L #{@url} -o #{@download_path}`
28
29
  end
29
30