dwca_hunter 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,174 @@
1
+ # encoding: utf-8
2
module DwcaHunter
  # Harvesting resource for the NCBI taxonomy dump.
  #
  # Reads `names.dmp` and `nodes.dmp` from the unpacked download directory and
  # assembles three tables for a DarwinCore Archive: the core taxa, a
  # vernacular-name extension and a synonym extension.
  class ResourceNCBI < DwcaHunter::Resource
    # A word or phrase wrapped in a matching pair of single or double quotes,
    # together with the separator before it (group 1) and after it (group 4).
    QUOTED_PHRASE = /(^|\s)('|")(.*?)\2(\s|-|$)/.freeze

    # @param opts [Hash] options passed through to the parent initializer
    def initialize(opts = {})
      @command = 'ncbi'
      @title = 'NCBI'
      @url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
      @uuid = '97d7633b-5f79-4307-a397-3c29402d9311'
      @download_path = File.join(Dir.tmpdir, 'dwca_hunter', 'ncbi',
                                 'data.tar.gz')
      # taxon id => { name type => [names] }, filled by #get_names
      @names = {}
      # one hash per accepted taxon, filled by #get_classification
      @data = []
      # Name types that are NOT treated as synonyms: "valid" is the accepted
      # name, the other two are exported as vernacular names.
      @collected_names = ['genbank common name', 'common name', 'valid']
      @core = []
      @extensions = []
      super
    end

    # Unpacks the downloaded tarball.
    def unpack
      unpack_tar
    end

    # Runs the whole conversion: locate the dump files, read the names, read
    # the nodes, then write the archive.
    def make_dwca
      set_vars
      get_names
      get_classification
      generate_dwca
    end

    private

    # Resolves the paths of the two dump files inside the download directory.
    def set_vars
      @names_file = File.join(@download_dir, 'names.dmp')
      @nodes_file = File.join(@download_dir, 'nodes.dmp')
    end

    # Reads names.dmp into @names, grouping the names of every taxon by name
    # type. The type "scientific name" is stored under the key "valid".
    def get_names
      DwcaHunter.logger_write(object_id, 'Collecting names...')
      # Block form closes the file; the encoding matches the nodes file.
      File.open(@names_file, 'r:utf-8') do |file|
        file.each_line.with_index do |line, i|
          if i > 0 && (i % BATCH_SIZE).zero?
            DwcaHunter.logger_write(object_id, 'Collected %s names...' % i)
          end
          fields = line.split('|').map { |field| cleanup(field) }
          id = fields[0]
          next if id == 1

          name = fields[1]
          name_type = fields[3]
          name_type = 'valid' if name_type == 'scientific name'
          # A name that is missing, or that `cleanup` returned as a
          # non-string, cannot be normalized; report it and move on.
          unless name.respond_to?(:gsub)
            puts "wrong name: %s" % name
            next
          end
          # Drop the quotes around a quoted phrase but keep both surrounding
          # separators, then squeeze runs of whitespace. (The replacement
          # previously referred to a non-existent fifth group, which glued
          # the phrase to the following word.)
          name = name.gsub(QUOTED_PHRASE, '\1\3\4').gsub(/\s+/, ' ')
          names_by_type = (@names[id] ||= {})
          (names_by_type[name_type] ||= []) << name
        end
      end
    end

    # Reads nodes.dmp and appends one record to @data for every node that has
    # a "valid" name in @names. The rank "no rank" becomes an empty string and
    # a parent id of 1 becomes nil.
    def get_classification
      DwcaHunter.logger_write(object_id, "Building classification...")
      File.open(@nodes_file, "r:utf-8") do |file|
        file.each_line.with_index do |line, i|
          if i > 0 && (i % BATCH_SIZE).zero?
            DwcaHunter.logger_write(object_id, "Collected %s nodes..." % i)
          end
          fields = line.split('|').map { |field| cleanup(field) }
          id = fields[0]
          next if id == 1

          parent_tax_id = fields[1]
          rank = fields[2]
          rank = "" if rank == "no rank"
          parent_tax_id = nil if parent_tax_id == 1

          names_by_type = @names[id]
          next unless names_by_type && names_by_type["valid"]

          vernacular_names = []
          synonyms = []
          names_by_type.each do |name_type, names|
            if @collected_names.include?(name_type)
              vernacular_names.concat(names) unless name_type == "valid"
            else
              # One synonym record per name, because #generate_dwca writes
              # :scientificName as a single cell.
              names.each do |synonym_name|
                synonyms << { scientificName: synonym_name,
                              taxonomicStatus: name_type }
              end
            end
          end

          @data << {
            id: id,
            scientificName: names_by_type["valid"][0],
            parentNameUsageId: parent_tax_id,
            taxonRank: rank,
            taxonomicStatus: "valid",
            vernacularNames: vernacular_names,
            # Previously always []: the collected synonyms were discarded.
            synonyms: synonyms
          }
        end
      end
    end

    # Builds @core, the two extension tables and @eml from @data, then hands
    # over to the parent implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
      @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
                "http://purl.org/dc/terms/scientificName",
                "http://purl.org/dc/terms/parentNameUsageId",
                "http://purl.org/dc/terms/taxonRank"]]
      DwcaHunter.logger_write(object_id, "Assembling Core Data")
      @data.each.with_index(1) do |d, count|
        if (count % BATCH_SIZE).zero?
          DwcaHunter.logger_write(object_id, "Traversing #{count} core " \
                                             "data record")
        end
        @core << [d[:id],
                  d[:scientificName],
                  d[:parentNameUsageId],
                  d[:taxonRank]]
      end

      # Keep direct references to the two tables instead of addressing them
      # by their position in @extensions.
      vernaculars = {
        data: [["http://rs.tdwg.org/dwc/terms/TaxonID",
                "http://rs.tdwg.org/dwc/terms/vernacularName"]],
        file_name: "vernacular_names.txt"
      }
      synonyms = {
        data: [["http://rs.tdwg.org/dwc/terms/taxonId",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
        file_name: "synonyms.txt"
      }
      @extensions << vernaculars
      @extensions << synonyms

      DwcaHunter.logger_write(object_id, "Creating vernacular name " \
                                         "extension for DarwinCore Archive file")
      @data.each.with_index(1) do |d, count|
        if (count % BATCH_SIZE).zero?
          DwcaHunter.logger_write(object_id,
                                  "Traversing #{count} extension data record")
        end
        d[:vernacularNames].each do |vn|
          vernaculars[:data] << [d[:id], vn]
        end
        d[:synonyms].each do |synonym|
          synonyms[:data] << [d[:id],
                              synonym[:scientificName],
                              synonym[:taxonomicStatus]]
        end
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [{ url: "http://www.ncbi.org" }],
        abstract: "The National Center for Biotechnology Information " \
                  "advances science and health by providing access to " \
                  "biomedical and genomic information.",
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@mbl.edu" }
        ],
        url: @url
      }
      super
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
module DwcaHunter
  # Harvesting resource for Open Tree of Life.
  #
  # Reads `ott/taxonomy.tsv` from the unpacked download directory and turns
  # its rows into the core table of a DarwinCore Archive. A synonym extension
  # is declared, but `synonyms` currently supplies no records for it.
  class ResourceOpenTree < DwcaHunter::Resource
    # @param opts [Hash] options passed through to the parent initializer
    def initialize(opts = {})
      @command = "open-tree"
      @title = "Open Tree of Life Reference Taxonomy"
      @uuid = "e10865e2-cdd9-4f97-912f-08f3d5ef49f7"
      # Assigned before @eml is built so that the EML :url is not nil.
      @url = "http://opendata.globalnames.org/id-crossmap/ott3.0.tgz"
      @download_path = File.join(Dir.tmpdir, "dwca_hunter",
                                 "opentree", "data.tar.gz")
      @data = []
      @extensions = []
      # Number of the core record being processed; used for progress logging.
      @count = 1
      @clades = {}
      @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
                "http://globalnames.org/terms/localID",
                "http://purl.org/dc/terms/scientificName",
                "http://purl.org/dc/terms/parentNameUsageId",
                "http://purl.org/dc/terms/taxonRank",
                "http://globalnames.org/ottCrossMaps",
                "http://globalnames.org/ottNotes"]]
      @eml = {
        id: @uuid,
        title: @title,
        authors: [{ url: "https://tree.opentreeoflife.org" }],
        # Every fragment but the last ends with a space so the sentences do
        # not run together when the literals are concatenated.
        abstract: "Open Tree of Life aims to construct a comprehensive, " \
                  "dynamic and digitally-available tree of life by " \
                  "synthesizing published phylogenetic trees along with " \
                  "taxonomic data. The project is a collaborative effort " \
                  "between 11 PIs across 10 institutions.",
        metadata_providers: [
          { first_name: "Dmitry",
            last_name: "Mozzherin",
            email: "dmozzherin@gmail.com" }
        ],
        url: @url
      }
      super
    end

    # Unpacks the downloaded tarball when @needs_unpack is set.
    def unpack
      unpack_tar if @needs_unpack
    end

    # Reads the taxonomy and writes the archive.
    def make_dwca
      DwcaHunter.logger_write(object_id, "Extracting data")
      collect_data
      generate_dwca
    end

    # Downloads the source file to @download_path when @needs_download is set.
    def download
      return unless @needs_download

      DwcaHunter.logger_write(object_id, "Downloading file -- "\
                                         "it will take some time...")
      dlr = DwcaHunter::Downloader.new(url, @download_path)
      dlr.download
    end

    private

    def collect_data
      set_vars
      classification
    end

    # Resolves the paths of the taxonomy and synonym files inside the
    # download directory.
    def set_vars
      @taxonomy = File.join(@download_dir, "ott", "taxonomy.tsv")
      @synonyms = File.join(@download_dir, "ott", "synonyms.tsv")
    end

    # Loads the taxonomy file into @classification: one array of stripped,
    # "|"-separated fields per line.
    # NOTE(review): every line is kept, including any header row the file may
    # start with -- confirm against the data.
    def classification
      @classification = []
      @names = {}
      DwcaHunter.logger_write(object_id, "Building classification")
      # File.foreach closes the file once the enumeration is finished.
      File.foreach(@taxonomy).with_index(1) do |line, line_no|
        if (line_no % BATCH_SIZE).zero?
          DwcaHunter.logger_write(object_id,
                                  "Traversed #{line_no} taxonomy lines")
        end
        @classification << line.split("|").map(&:strip)
      end
    end

    # Fills the core table and the synonym extension, then hands over to the
    # parent implementation.
    def generate_dwca
      DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
      DwcaHunter.logger_write(object_id, "Assembling Core Data")
      generate_core
      generate_synonyms
      super
    end

    # Appends one core row per classification record. Field 0 is written
    # twice (taxon id and local id) and fields 1 and 2 swap places.
    def generate_core
      @classification.each do |d|
        if (@count % BATCH_SIZE).zero?
          DwcaHunter.logger_write(object_id, "Traversing #{@count} core " \
                                             "data record")
        end
        @core << [d[0], d[0], d[2], d[1], d[3], d[4], d[5]]
        # Without this increment the progress message above never fires.
        @count += 1
      end
    end

    # Synonym records for the extension; none are produced at the moment.
    def synonyms
      []
    end

    # Registers the synonym extension and adds a row for every record
    # returned by #synonyms.
    def generate_synonyms
      extension = {
        data: [["http://rs.tdwg.org/dwc/terms/taxonId",
                "http://rs.tdwg.org/dwc/terms/scientificName",
                "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
        file_name: "synonyms.txt"
      }
      @extensions << extension

      synonyms.each do |synonym|
        # NOTE(review): this used to read the id from an undefined local `d`,
        # which would raise NameError for the first synonym. The id is assumed
        # to live under :id -- confirm once #synonyms returns data.
        extension[:data] << [synonym[:id], synonym[:scientificName],
                             synonym[:taxonomicStatus]]
      end
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # encoding: utf-8
2
+ require 'biodiversity'
3
+ require 'csv'
4
+
5
module DwcaHunter
  # Resource for The Reptile Database checklist.
  #
  # Nothing is downloaded or unpacked: the data is read from the CSV file
  # `files/reptile_checklist_2014_12.csv`, located two directories above this
  # file, and converted into a DarwinCore Archive with a vernacular-name
  # extension.
  class ResourceReptilesChecklist < DwcaHunter::Resource
    # @param opts [Hash] options passed through to the parent initializer
    def initialize(opts = {})
      @command = "reptile-database"
      @title = "The Reptile Database"
      @uuid = "c24e0905-4980-4e1d-aff2-ee0ef54ea1f8"
      @data = []
      @extensions = []
      @download_path = File.join(Dir.tmpdir, 'dwca_hunter',
                                 'reptilesdb', 'fake.tar.gz')
      super
    end

    # @return [false] there is no archive to unpack
    def needs_unpack?
      false
    end

    # Intentionally does nothing; #organize_data reads a local CSV file.
    def download
    end

    # Parses the checklist and writes the archive.
    def make_dwca
      organize_data
      generate_dwca
    end

    private

    # Reads the checklist CSV into @data, one hash per row with the keys
    # :species, :subspecies, :vernaculars and, when the row has a fifth
    # column, :family.
    def organize_data
      DwcaHunter::logger_write(self.object_id,
                               "Organizing data")
      path = File.join(__dir__, "..",
                       "..", "files", "reptile_checklist_2014_12.csv")
      snp = ScientificNameParser.new
      # CSV.foreach closes the file when the enumeration ends; a bare
      # CSV.open(path) left the handle open.
      @data = CSV.foreach(path).map { |row| reptile_record(row, snp) }
    end

    # Converts one CSV row into a record hash.
    # Columns, as used here: 0-1 species name parts, 2 subspecies (one per
    # line), 3 vernacular names (one language group per line), 4 family.
    def reptile_record(row, snp)
      record = {}
      species_name = row[0..1].join(" ")
      record[:species] = snp.parse(species_name)[:scientificName][:normalized]
      record[:subspecies] = subspecies_names(row[2], snp)
      record[:vernaculars] = vernacular_entries(row[3])
      # String#[] returns nil when the cell does not start with letters,
      # where `match(...)[0]` raised NoMethodError.
      record[:family] = row[4][/^[A-Za-z]+/] if row[4]
      record
    end

    # Normalized subspecies names from a newline-separated cell; [] when the
    # cell is empty.
    def subspecies_names(cell, snp)
      return [] unless cell

      cell.split("\n").map do |ssp|
        snp.parse(ssp)[:scientificName][:normalized]
      end
    end

    # Vernacular names from a newline-separated cell. A leading "E: " marker
    # is removed; a line that then starts with "G: " is tagged "de", every
    # other line "en". Names on one line are separated by commas. Blank
    # fragments (for example after a trailing comma) are skipped.
    def vernacular_entries(cell)
      return [] unless cell

      cell.split("\n").each_with_object([]) do |line, entries|
        text = line.sub(/\AE: /, '')
        lang = text.start_with?("G: ") ? "de" : "en"
        text = text.sub(/\AG: /, '')
        text.split(",").each do |fragment|
          vernacular = fragment.strip
          next if vernacular.empty?

          entries << { name: vernacular, lang: lang }
        end
      end
    end

    # Builds @core, the vernacular extension and @eml from @data, then hands
    # over to the parent implementation.
    def generate_dwca
      DwcaHunter::logger_write(self.object_id,
                               "Creating DarwinCore Archive file")
      @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
                'http://rs.tdwg.org/dwc/terms/scientificName',
                'http://rs.tdwg.org/dwc/terms/taxonRank']]
      # Keep a direct reference instead of assuming the table is
      # @extensions[0].
      vernaculars = {
        data: [['http://rs.tdwg.org/dwc/terms/taxonID',
                'http://rs.tdwg.org/dwc/terms/vernacularName',
                'http://purl.org/dc/terms/language']],
        file_name: 'vernacular_names.txt',
        row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
      }
      @extensions << vernaculars

      families = {}
      count = 1
      class_id = count
      @core << [class_id, nil, "Reptilia", "class"]
      @data.each do |record|
        # The counter is bumped once per record, again for a new family and
        # again for the species, so ids are unique but not contiguous. The
        # numbering is kept as it was so that generated ids do not change.
        count += 1
        family_id = families[record[:family]]
        unless family_id
          count += 1
          family_id = count
          families[record[:family]] = family_id
          @core << [family_id, class_id, record[:family], "family"]
        end
        count += 1
        species_id = count
        @core << [species_id, family_id, record[:species], "species"]
        record[:vernaculars].each do |v|
          vernaculars[:data] << [species_id, v[:name], v[:lang]]
        end
        record[:subspecies].each do |ssp|
          count += 1
          @core << [count, species_id, ssp, "subspecies"]
        end
      end

      @eml = {
        id: @uuid,
        title: @title,
        authors: [
          {
            first_name: "Peter",
            last_name: "Uetz",
            email: "info@reptile-database.org"
          },
          {
            first_name: "Jiri",
            last_name: "Hosek",
            email: "jiri.hosek@reptarium.cz"
          }
        ],
        metadata_providers: [
          { first_name: 'Dmitry',
            last_name: 'Mozzherin',
            email: 'dmozzherin@gmail.com' }
        ],
        abstract: "This database provides a catalogue of all living reptile "\
                  "species and their classification. The database covers "\
                  "all living snakes, lizards, turtles, amphisbaenians, "\
                  "tuataras, and crocodiles. Currently there are about "\
                  "9,500 species including another 2,800 subspecies "\
                  "(statistics). The database focuses on taxonomic data, "\
                  "i.e. names and synonyms, distribution and type data "\
                  "and literature references.",
        url: "http://www.reptile-database.org"
      }
      super
    end
  end
end