dwca_hunter 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ # encoding: utf-8
2
module DwcaHunter
  # Harvesting resource for the NCBI taxonomy dump.
  #
  # Parses names.dmp and nodes.dmp from the download directory, then fills
  # @core, @extensions and @eml before delegating to the superclass'
  # generate_dwca.
  class ResourceNCBI < DwcaHunter::Resource
    # Sets resource metadata and empty working state, then calls the
    # superclass initializer with the same arguments.
    #
    # @param opts [Hash] options forwarded unchanged to the superclass
    def initialize(opts = {})
      @command = "ncbi"
      @title = "NCBI"
      @url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
      @uuid = "97d7633b-5f79-4307-a397-3c29402d9311"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter", "ncbi",
                                 "data.tar.gz")
      @names = {}
      @data = []
      # Name types that are exported: "valid" supplies the scientific name,
      # the remaining ones are written out as vernacular names.
      @collected_names = ["genbank common name", "common name", "valid"]
      @core = []
      @extensions = []
      super
    end

    # Unpacks the downloaded tarball via the inherited unpack_tar.
    def unpack
      unpack_tar
    end

    # Runs the full pipeline: locate dump files, read names, read nodes,
    # build the archive.
    def make_dwca
      set_vars
      get_names
      get_classification
      generate_dwca
    end

    private

    # Resolves the paths of the two dump files inside @download_dir.
    def set_vars
      @names_file = File.join(@download_dir, "names.dmp")
      @nodes_file = File.join(@download_dir, "nodes.dmp")
    end

    # Reads names.dmp into @names as { id => { name_type => [names] } }.
    # The "scientific name" type is stored under the key "valid". Rows whose
    # name field is not string-like are reported on stdout and skipped.
    def get_names
      DwcaHunter.logger_write(object_id, "Collecting names...")
      # Block form guarantees the handle is closed; the encoding matches the
      # one used for nodes.dmp.
      File.open(@names_file, "r:utf-8") do |file|
        file.each_line.with_index do |line, i|
          if i > 0 && (i % BATCH_SIZE).zero?
            DwcaHunter.logger_write(object_id, "Collected #{i} names...")
          end
          fields = line.split("|").map { |field| cleanup(field) }
          id = fields[0]
          next if id == 1

          name = fields[1]
          name_type = fields[3]
          name_type = "valid" if name_type == "scientific name"
          unless name.respond_to?(:gsub)
            puts "wrong name: #{name}"
            next
          end
          # Strip quotes around a quoted fragment but keep what surrounds it.
          # The pattern has four groups; the trailing separator is group 4
          # (referencing a fifth group silently dropped it and glued words
          # together).
          name = name.gsub(/(^|\s)('|")(.*?)\2(\s|-|$)/, '\1\3\4')
                     .gsub(/\s+/, " ")
          ((@names[id] ||= {})[name_type] ||= []) << name
        end
      end
    end

    # Reads nodes.dmp and appends one record to @data for every node that
    # has a "valid" name in @names.
    #
    # Name types outside @collected_names are not exported, so the :synonyms
    # list of every record is empty.
    def get_classification
      DwcaHunter.logger_write(object_id, "Building classification...")
      File.open(@nodes_file, "r:utf-8") do |file|
        file.each_line.with_index do |line, i|
          if i > 0 && (i % BATCH_SIZE).zero?
            DwcaHunter.logger_write(object_id, "Collected #{i} nodes...")
          end
          fields = line.split("|").map { |field| cleanup(field) }
          id = fields[0]
          next if id == 1

          names = @names[id]
          next unless names && names["valid"]

          parent_tax_id = fields[1]
          parent_tax_id = nil if parent_tax_id == 1
          rank = fields[2]
          rank = "" if rank == "no rank"
          # Keeps the order in which the name types were first seen.
          vernacular_names = names.flat_map do |type, values|
            type != "valid" && @collected_names.include?(type) ? values : []
          end
          @data << {
            id: id,
            scientificName: names["valid"][0],
            parentNameUsageId: parent_tax_id,
            taxonRank: rank,
            taxonomicStatus: "valid",
            vernacularNames: vernacular_names,
            synonyms: []
          }
        end
      end
    end

    # Builds @core, the vernacular (index 0) and synonym (index 1) entries of
    # @extensions and @eml from @data, then calls the superclass
    # implementation.
    #
    # NOTE(review): the term URIs below differ in spelling between the core
    # and the extensions (taxonId / TaxonID) and use a purl.org/dc prefix for
    # scientificName; left unchanged -- confirm what consumers expect.
    def generate_dwca
      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
                "http://purl.org/dc/terms/scientificName",
                "http://purl.org/dc/terms/parentNameUsageId",
                "http://purl.org/dc/terms/taxonRank"]]
      DwcaHunter.logger_write(object_id, "Assembling Core Data")
      @data.each.with_index(1) do |record, count|
        if (count % BATCH_SIZE).zero?
          DwcaHunter.logger_write(object_id,
                                  "Traversing #{count} core data record")
        end
        @core << [record[:id],
                  record[:scientificName],
                  record[:parentNameUsageId],
                  record[:taxonRank]]
      end
      @extensions << {
        data: [["http://rs.tdwg.org/dwc/terms/TaxonID",
                "http://rs.tdwg.org/dwc/terms/vernacularName"]],
        file_name: "vernacular_names.txt"
      }
      @extensions << {
        data: [["http://rs.tdwg.org/dwc/terms/taxonId",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
        file_name: "synonyms.txt"
      }

      DwcaHunter.logger_write(object_id, "Creating vernacular name " \
                                         "extension for DarwinCore Archive file")
      @data.each.with_index(1) do |record, count|
        if (count % BATCH_SIZE).zero?
          DwcaHunter.logger_write(object_id,
                                  "Traversing #{count} extension data record")
        end
        record[:vernacularNames].each do |vernacular|
          @extensions[0][:data] << [record[:id], vernacular]
        end
        record[:synonyms].each do |synonym|
          @extensions[1][:data] << [record[:id],
                                    synonym[:scientificName],
                                    synonym[:taxonomicStatus]]
        end
      end
      @eml = {
        id: @uuid,
        title: @title,
        authors: [{ url: "http://www.ncbi.org" }],
        abstract: "The National Center for Biotechnology Information " \
                  "advances science and health by providing access to " \
                  "biomedical and genomic information.",
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@mbl.edu" }
        ],
        url: @url
      }
      super
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Harvesting resource for Open Tree of Life
  #
  # Reads ott/taxonomy.tsv from the download directory and fills @core and
  # @extensions before delegating to the superclass' generate_dwca.
  class ResourceOpenTree < DwcaHunter::Resource
    # Sets resource metadata and empty working state, then calls the
    # superclass initializer with the same arguments.
    #
    # @param opts [Hash] options forwarded unchanged to the superclass
    def initialize(opts = {})
      @command = "open-tree"
      @title = "Open Tree of Life Reference Taxonomy"
      @uuid = "e10865e2-cdd9-4f97-912f-08f3d5ef49f7"
      # Assigned before @eml is built; @eml[:url] used to be read while @url
      # was still nil.
      @url = "http://opendata.globalnames.org/id-crossmap/ott3.0.tgz"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                                 "opentree", "data.tar.gz")
      @data = []
      @extensions = []
      @count = 1
      @clades = {}
      @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
                "http://globalnames.org/terms/localID",
                "http://purl.org/dc/terms/scientificName",
                "http://purl.org/dc/terms/parentNameUsageId",
                "http://purl.org/dc/terms/taxonRank",
                "http://globalnames.org/ottCrossMaps",
                "http://globalnames.org/ottNotes"]]
      @eml = {
        id: @uuid,
        title: @title,
        authors: [{ url: "https://tree.opentreeoflife.org" }],
        # Every fragment except the last ends with a space so the joined
        # sentence keeps its word breaks.
        abstract: "Open Tree of Life aims to construct a comprehensive, " \
                  "dynamic and digitally-available tree of life by " \
                  "synthesizing published phylogenetic trees along with " \
                  "taxonomic data. The project is a collaborative effort " \
                  "between 11 PIs across 10 institutions.",
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        url: @url
      }
      super
    end

    # Unpacks the downloaded tarball unless @needs_unpack is falsy.
    def unpack
      unpack_tar if @needs_unpack
    end

    # Reads the taxonomy and builds the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      collect_data
      generate_dwca
    end

    # Fetches @url into @download_path unless @needs_download is falsy.
    def download
      return unless @needs_download

      DwcaHunter.logger_write(object_id, "Downloading file -- " \
                                         "it will take some time...")
      DwcaHunter::Downloader.new(@url, @download_path).download
    end

    private

    # Resolves the input paths and loads the taxonomy rows.
    def collect_data
      set_vars
      classification
    end

    # Resolves the paths of the taxonomy and synonym files inside
    # @download_dir.
    def set_vars
      @taxonomy = File.join(@download_dir, "ott", "taxonomy.tsv")
      @synonyms = File.join(@download_dir, "ott", "synonyms.tsv")
    end

    # Loads every line of the taxonomy file into @classification as an array
    # of "|"-separated, stripped fields.
    #
    # NOTE(review): no line is skipped, so a header row, if the file has one,
    # ends up in @classification as data -- confirm against the input file.
    def classification
      @classification = []
      @names = {}
      DwcaHunter.logger_write(object_id, "Building classification")
      # Block form guarantees the handle is closed after reading.
      File.open(@taxonomy) do |file|
        file.each_line.with_index(1) do |line, line_number|
          if (line_number % BATCH_SIZE).zero?
            DwcaHunter.logger_write(object_id,
                                    "Traversed #{line_number} taxonomy lines")
          end
          @classification << line.split("|").map(&:strip)
        end
      end
    end

    # Fills @core and @extensions, then calls the superclass implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
      DwcaHunter.logger_write(object_id, "Assembling Core Data")
      generate_core
      generate_synonyms
      super
    end

    # Appends one core row per taxonomy row. Field 0 is used for both of the
    # first two core columns; fields 1 and 2 are swapped to match the column
    # order declared in @core.
    def generate_core
      @classification.each do |row|
        if (@count % BATCH_SIZE).zero?
          DwcaHunter.logger_write(object_id, "Traversing #{@count} core " \
                                             "data record")
        end
        @core << [row[0], row[0], row[2], row[1], row[3], row[4], row[5]]
        # Without this increment the progress message above could never
        # advance.
        @count += 1
      end
    end

    # Synonym records to export. Currently always empty, so the synonyms
    # extension holds only its header row. generate_synonyms reads the keys
    # :id, :scientificName and :taxonomicStatus from each record.
    def synonyms
      []
    end

    # Appends the synonyms extension (header plus one row per record returned
    # by #synonyms) to @extensions.
    def generate_synonyms
      @extensions <<
        { data: [["http://rs.tdwg.org/dwc/terms/taxonId",
                  "http://rs.tdwg.org/dwc/terms/scientificName",
                  "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
          file_name: "synonyms.txt" }

      synonyms.each do |synonym|
        # The taxon id comes from the synonym record itself; the previous
        # reference to an undefined local would have raised NameError.
        @extensions.first[:data] << [synonym[:id], synonym[:scientificName],
                                     synonym[:taxonomicStatus]]
      end
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # encoding: utf-8
2
+ require 'biodiversity'
3
+ require 'csv'
4
+
5
module DwcaHunter
  # Harvesting resource for The Reptile Database checklist.
  #
  # Nothing is downloaded: the data comes from a CSV file located relative to
  # this source file. Rows are turned into a class -> family -> species ->
  # subspecies core plus a vernacular-name extension.
  class ResourceReptilesChecklist < DwcaHunter::Resource
    # Sets resource metadata and empty working state, then calls the
    # superclass initializer with the same arguments.
    #
    # @param opts [Hash] options forwarded unchanged to the superclass
    def initialize(opts = {})
      @command = "reptile-database"
      @title = "The Reptile Database"
      @uuid = "c24e0905-4980-4e1d-aff2-ee0ef54ea1f8"
      @data = []
      @extensions = []
      @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                                 "reptilesdb", "fake.tar.gz")
      super
    end

    # @return [false] there is no archive to unpack for this resource
    def needs_unpack?
      false
    end

    # Intentionally a no-op: the checklist is read from a local file.
    def download; end

    # Parses the checklist and builds the archive.
    def make_dwca
      organize_data
      generate_dwca
    end

    private

    # Parses the checklist CSV into @data, one Hash per row with the keys
    # :species, :subspecies, :vernaculars and, when column 4 is present,
    # :family.
    #
    # Columns as read here: 0-1 joined into the species name, 2
    # newline-separated subspecies, 3 newline-separated vernacular entries
    # (comma-separated names, optional "E: " / "G: " prefix), 4 family.
    def organize_data
      DwcaHunter.logger_write(object_id, "Organizing data")
      path = File.join(__dir__, "..", "..", "files",
                       "reptile_checklist_2014_12.csv")
      snp = ScientificNameParser.new
      # CSV.foreach closes the file once iteration is done.
      @data = CSV.foreach(path).map do |row|
        record = {}
        species_name = row[0..1].join(" ")
        record[:species] = snp.parse(species_name)[:scientificName][:normalized]
        record[:subspecies] = row[2].to_s.split("\n").map do |ssp|
          snp.parse(ssp)[:scientificName][:normalized]
        end
        record[:vernaculars] = row[3].to_s.split("\n").flat_map do |entry|
          # Entries default to English; a "G: " prefix marks them as German.
          lang = "en"
          entry.gsub!(/^E: /, "")
          lang = "de" if entry.gsub!(/^G: /, "")
          entry.split(",").map do |vernacular|
            { name: vernacular.strip, lang: lang }
          end
        end
        # String#[] yields nil instead of raising when the cell does not
        # start with letters.
        record[:family] = row[4][/^[A-Za-z]+/] if row[4]
        record
      end
    end

    # Builds @core, the vernacular extension and @eml from @data, then calls
    # the superclass implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonRank"]]
      @extensions << {
        data: [["http://rs.tdwg.org/dwc/terms/taxonID",
                "http://rs.tdwg.org/dwc/terms/vernacularName",
                "http://purl.org/dc/terms/language"]],
        file_name: "vernacular_names.txt",
        row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
      }
      families = {}
      count = 1
      class_id = count
      @core << [class_id, nil, "Reptilia", "class"]
      @data.each do |record|
        # One id per record is consumed without being written; kept so the
        # generated ids stay the same as before.
        count += 1
        family = record[:family]
        family_id = families[family]
        unless family_id
          family_id = (count += 1)
          families[family] = family_id
          @core << [family_id, class_id, family, "family"]
        end
        species_id = (count += 1)
        @core << [species_id, family_id, record[:species], "species"]
        record[:vernaculars].each do |vernacular|
          @extensions[0][:data] << [species_id, vernacular[:name],
                                    vernacular[:lang]]
        end
        record[:subspecies].each do |ssp|
          count += 1
          @core << [count, species_id, ssp, "subspecies"]
        end
      end
      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          {
            first_name: "Peter",
            last_name: "Uetz",
            email: "info@reptile-database.org"
          },
          {
            first_name: "Jiri",
            last_name: "Hosek",
            email: "jiri.hosek@reptarium.cz"
          }
        ],
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        abstract: "This database provides a catalogue of all living reptile " \
                  "species and their classification. The database covers " \
                  "all living snakes, lizards, turtles, amphisbaenians, " \
                  "tuataras, and crocodiles. Currently there are about " \
                  "9,500 species including another 2,800 subspecies " \
                  "(statistics). The database focuses on taxonomic data, " \
                  "i.e. names and synonyms, distribution and type data " \
                  "and literature references.",
        url: "http://www.reptile-database.org"
      }
      super
    end
  end
end