dwca_hunter 0.5.5 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.byebug_history +37 -0
- data/.gitignore +5 -0
- data/.rubocop.yml +3 -2
- data/.ruby-version +1 -1
- data/Gemfile.lock +50 -77
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/dwca_hunter.gemspec +7 -8
- data/exe/dwcahunter +1 -3
- data/lib/dwca_hunter.rb +31 -0
- data/lib/dwca_hunter/resources/aos-birds.rb +143 -0
- data/lib/dwca_hunter/resources/arctos.rb +93 -91
- data/lib/dwca_hunter/resources/clements.rb +151 -0
- data/lib/dwca_hunter/resources/freebase.rb +51 -49
- data/lib/dwca_hunter/resources/how-moore-birds.rb +168 -0
- data/lib/dwca_hunter/resources/ioc_word_bird.rb +200 -0
- data/lib/dwca_hunter/resources/ipni.rb +3 -2
- data/lib/dwca_hunter/resources/itis.rb +99 -99
- data/lib/dwca_hunter/resources/mammal_divdb.rb +155 -0
- data/lib/dwca_hunter/resources/mammal_species.rb +3 -3
- data/lib/dwca_hunter/resources/mcz.rb +123 -0
- data/lib/dwca_hunter/resources/ncbi.rb +22 -23
- data/lib/dwca_hunter/resources/opentree.rb +5 -5
- data/lib/dwca_hunter/resources/paleobiodb.rb +193 -0
- data/lib/dwca_hunter/resources/paleodb_harvester.rb +140 -0
- data/lib/dwca_hunter/resources/sherborn.rb +91 -0
- data/lib/dwca_hunter/resources/wikispecies.rb +142 -127
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +27 -34
- data/ipni.csv.gz +0 -0
- data/ipniWebName.csv.xz?dl=1 +0 -0
@@ -1,16 +1,16 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
module DwcaHunter
|
3
4
|
class ResourceArctos < DwcaHunter::Resource
|
4
|
-
|
5
5
|
def initialize(opts = {})
|
6
|
-
@command =
|
7
|
-
@title =
|
8
|
-
@url =
|
9
|
-
@UUID =
|
6
|
+
@command = "arctos"
|
7
|
+
@title = "Arctos"
|
8
|
+
@url = "https://www.dropbox.com/s/3rmny5d8cfm9mmp/arctos.tar.gz?dl=1"
|
9
|
+
@UUID = "eea8315d-a244-4625-859a-226675622312"
|
10
10
|
@download_path = File.join(Dir.tmpdir,
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
"dwca_hunter",
|
12
|
+
"arctos",
|
13
|
+
"data.zip")
|
14
14
|
@synonyms = []
|
15
15
|
@names = []
|
16
16
|
@vernaculars = []
|
@@ -22,7 +22,7 @@ module DwcaHunter
|
|
22
22
|
|
23
23
|
def download
|
24
24
|
puts "Downloading cached verion of the file. Ask Arctos to generate new."
|
25
|
-
|
25
|
+
`curl -s -L #{@url} -o #{@download_path}`
|
26
26
|
end
|
27
27
|
|
28
28
|
def unpack
|
@@ -30,7 +30,7 @@ module DwcaHunter
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def make_dwca
|
33
|
-
DwcaHunter
|
33
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
34
34
|
get_names
|
35
35
|
generate_dwca
|
36
36
|
end
|
@@ -45,121 +45,123 @@ module DwcaHunter
|
|
45
45
|
end
|
46
46
|
|
47
47
|
def collect_vernaculars
|
48
|
-
file = CSV.open(File.join(@download_dir,
|
49
|
-
|
48
|
+
file = CSV.open(File.join(@download_dir, "common_name.csv"),
|
49
|
+
headers: true)
|
50
50
|
file.each_with_index do |row, i|
|
51
|
+
canonical = row["SCIENTIFIC_NAME"]
|
52
|
+
vernacular_name_string = row["COMMON_NAME"]
|
51
53
|
|
52
|
-
canonical
|
53
|
-
vernacular_name_string = row['COMMON_NAME']
|
54
|
-
|
55
|
-
if @vernaculars_hash.has_key?(canonical)
|
54
|
+
if @vernaculars_hash.key?(canonical)
|
56
55
|
@vernaculars_hash[canonical] << vernacular_name_string
|
57
56
|
else
|
58
57
|
@vernaculars_hash[canonical] = [vernacular_name_string]
|
59
58
|
end
|
60
59
|
|
61
|
-
puts "Processed %s vernaculars" % i if i %
|
60
|
+
puts "Processed %s vernaculars" % i if i % 10_000 == 0
|
62
61
|
end
|
63
62
|
end
|
64
63
|
|
65
64
|
def collect_synonyms
|
66
|
-
file = CSV.open(File.join(@download_dir,
|
67
|
-
|
65
|
+
file = CSV.open(File.join(@download_dir, "relationships.csv"),
|
66
|
+
headers: true)
|
68
67
|
file.each_with_index do |row, i|
|
69
|
-
canonical = row[
|
70
|
-
if @synonyms_hash.
|
68
|
+
canonical = row["scientific_name"]
|
69
|
+
if @synonyms_hash.key?(canonical)
|
71
70
|
@synonyms_hash[canonical] <<
|
72
|
-
|
71
|
+
{ name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
|
73
72
|
else
|
74
73
|
@synonyms_hash[canonical] = [
|
75
|
-
|
74
|
+
{ name_string: row["related_name"], status: row["TAXON_RELATIONSHIP"] }
|
76
75
|
]
|
77
76
|
end
|
78
|
-
puts "Processed %s synonyms" % i if i %
|
77
|
+
puts "Processed %s synonyms" % i if i % 10_000 == 0
|
79
78
|
end
|
80
79
|
end
|
81
80
|
|
82
81
|
def collect_names
|
83
82
|
@names_index = {}
|
84
|
-
file = CSV.open(File.join(@download_dir,
|
85
|
-
|
83
|
+
file = CSV.open(File.join(@download_dir, "classification.csv"),
|
84
|
+
headers: true)
|
86
85
|
file.each_with_index do |row, i|
|
87
|
-
next unless
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
86
|
+
next unless row["display_name"]
|
87
|
+
|
88
|
+
name_string = row["display_name"].gsub(%r{</?i>}, "")
|
89
|
+
canonical = row["scientific_name"]
|
90
|
+
kingdom = row["kingdom"]
|
91
|
+
phylum = row["phylum"]
|
92
|
+
klass = row["phylclass"]
|
93
|
+
subclass = row["subclass"]
|
94
|
+
order = row["phylorder"]
|
95
|
+
suborder = row["suborder"]
|
96
|
+
superfamily = row["superfamily"]
|
97
|
+
family = row["family"]
|
98
|
+
subfamily = row["subfamily"]
|
99
|
+
tribe = row["tribe"]
|
100
|
+
genus = row["genus"]
|
101
|
+
subgenus = row["subgenus"]
|
102
|
+
species = row["species"]
|
103
|
+
subspecies = row["subspecies"]
|
104
|
+
code = row["nomenclatural_code"]
|
105
|
+
|
106
|
+
taxon_id = "ARCT_#{i + 1}"
|
107
107
|
@names << { taxon_id: taxon_id,
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
}
|
108
|
+
name_string: name_string,
|
109
|
+
kingdom: kingdom,
|
110
|
+
phylum: phylum,
|
111
|
+
klass: klass,
|
112
|
+
order: order,
|
113
|
+
family: family,
|
114
|
+
genus: genus,
|
115
|
+
code: code }
|
117
116
|
|
118
117
|
update_vernacular(taxon_id, canonical)
|
119
118
|
update_synonym(taxon_id, canonical)
|
120
|
-
puts "Processed %s names" % i if i %
|
119
|
+
puts "Processed %s names" % i if i % 10_000 == 0
|
121
120
|
end
|
122
121
|
end
|
123
122
|
|
124
123
|
def update_vernacular(taxon_id, canonical)
|
125
|
-
return unless @vernaculars_hash.
|
124
|
+
return unless @vernaculars_hash.key?(canonical)
|
125
|
+
|
126
126
|
@vernaculars_hash[canonical].each do |vern|
|
127
127
|
@vernaculars << { taxon_id: taxon_id, vern: vern }
|
128
128
|
end
|
129
129
|
end
|
130
130
|
|
131
131
|
def update_synonym(taxon_id, canonical)
|
132
|
-
return unless @synonyms_hash.
|
132
|
+
return unless @synonyms_hash.key?(canonical)
|
133
|
+
|
133
134
|
@synonyms_hash[canonical].each do |syn|
|
134
135
|
@synonyms << { taxon_id: taxon_id, name_string: syn[:name_string],
|
135
|
-
|
136
|
+
status: syn[:status] }
|
136
137
|
end
|
137
138
|
end
|
138
139
|
|
139
140
|
def generate_dwca
|
140
|
-
DwcaHunter
|
141
|
-
|
142
|
-
@core = [[
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
]]
|
141
|
+
DwcaHunter.logger_write(object_id,
|
142
|
+
"Creating DarwinCore Archive file")
|
143
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
144
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
145
|
+
"http://rs.tdwg.org/dwc/terms/kingdom",
|
146
|
+
"http://rs.tdwg.org/dwc/terms/phylum",
|
147
|
+
"http://rs.tdwg.org/dwc/terms/class",
|
148
|
+
"http://rs.tdwg.org/dwc/terms/order",
|
149
|
+
"http://rs.tdwg.org/dwc/terms/family",
|
150
|
+
"http://rs.tdwg.org/dwc/terms/genus",
|
151
|
+
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
152
152
|
@names.each do |n|
|
153
153
|
@core << [n[:taxon_id], n[:name_string],
|
154
|
-
|
155
|
-
|
154
|
+
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
155
|
+
n[:genus], n[:code]]
|
156
156
|
end
|
157
157
|
@extensions << {
|
158
158
|
data: [[
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
159
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
160
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName"
|
161
|
+
]],
|
162
|
+
file_name: "vernacular_names.txt",
|
163
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
|
164
|
+
}
|
163
165
|
|
164
166
|
@vernaculars.each do |v|
|
165
167
|
@extensions[-1][:data] << [v[:taxon_id], v[:vern]]
|
@@ -167,12 +169,12 @@ module DwcaHunter
|
|
167
169
|
|
168
170
|
@extensions << {
|
169
171
|
data: [[
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
file_name:
|
175
|
-
|
172
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
173
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
174
|
+
"http://rs.tdwg.org/dwc/terms/taxonomicStatus"
|
175
|
+
]],
|
176
|
+
file_name: "synonyms.txt"
|
177
|
+
}
|
176
178
|
@synonyms.each do |s|
|
177
179
|
@extensions[-1][:data] << [s[:taxon_id], s[:name_string], s[:status]]
|
178
180
|
end
|
@@ -180,14 +182,14 @@ module DwcaHunter
|
|
180
182
|
id: @uuid,
|
181
183
|
title: @title,
|
182
184
|
authors: [
|
183
|
-
{email:
|
184
|
-
|
185
|
+
{ email: "dustymc at gmail dot com" }
|
186
|
+
],
|
185
187
|
metadata_providers: [
|
186
|
-
{ first_name:
|
187
|
-
last_name:
|
188
|
-
email:
|
189
|
-
|
190
|
-
abstract:
|
188
|
+
{ first_name: "Dmitry",
|
189
|
+
last_name: "Mozzherin",
|
190
|
+
email: "dmozzherin@gmail.com" }
|
191
|
+
],
|
192
|
+
abstract: "Arctos is an ongoing effort to integrate access to specimen data, collection-management tools, and external resources on the internet.",
|
191
193
|
url: @url
|
192
194
|
}
|
193
195
|
super
|
@@ -0,0 +1,151 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DwcaHunter
|
4
|
+
class ResourceClements < DwcaHunter::Resource
|
5
|
+
def initialize(opts = {})
|
6
|
+
@command = "clements-ebird"
|
7
|
+
@title = "The eBird/Clements Checklist of Birds of the World"
|
8
|
+
@url = "https://uofi.box.com/shared/static/b4n8zqa99hq9rdga27skkh3870yhujgo.csv"
|
9
|
+
@UUID = "577c0b56-4a3c-4314-8724-14b304f601de"
|
10
|
+
@download_path = File.join(Dir.tmpdir,
|
11
|
+
"dwca_hunter",
|
12
|
+
"clements",
|
13
|
+
"data.csv")
|
14
|
+
@synonyms = []
|
15
|
+
@names = []
|
16
|
+
@vernaculars = []
|
17
|
+
@extensions = []
|
18
|
+
@synonyms_hash = {}
|
19
|
+
@vernaculars_hash = {}
|
20
|
+
super(opts)
|
21
|
+
end
|
22
|
+
|
23
|
+
def download
|
24
|
+
puts "Downloading cached and modified version of the file."
|
25
|
+
puts "Go to https://www.birds.cornell.edu/clementschecklist/download/ " \
|
26
|
+
"for updates."
|
27
|
+
`curl -s -L #{@url} -o #{@download_path}`
|
28
|
+
end
|
29
|
+
|
30
|
+
def unpack
|
31
|
+
end
|
32
|
+
|
33
|
+
def make_dwca
|
34
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
35
|
+
get_names
|
36
|
+
generate_dwca
|
37
|
+
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
def get_names
|
42
|
+
Dir.chdir(@download_dir)
|
43
|
+
collect_names
|
44
|
+
end
|
45
|
+
|
46
|
+
def collect_names
|
47
|
+
@names_index = {}
|
48
|
+
file = CSV.open(File.join(@download_dir, "data.csv"),
|
49
|
+
headers: true)
|
50
|
+
file.each_with_index do |row, i|
|
51
|
+
name_string = row["scientific name"]
|
52
|
+
canonical = name_string
|
53
|
+
kingdom = "Animalia"
|
54
|
+
phylum = "Chordata"
|
55
|
+
klass = "Aves"
|
56
|
+
order = row["order"]
|
57
|
+
family = row["family"]
|
58
|
+
code = "ICZN"
|
59
|
+
|
60
|
+
taxon_id = "gn_#{i + 1}"
|
61
|
+
@names << { taxon_id: taxon_id,
|
62
|
+
name_string: name_string,
|
63
|
+
kingdom: kingdom,
|
64
|
+
phylum: phylum,
|
65
|
+
klass: klass,
|
66
|
+
order: order,
|
67
|
+
family: family,
|
68
|
+
code: code }
|
69
|
+
|
70
|
+
if row["English name"].to_s != ""
|
71
|
+
@vernaculars << {
|
72
|
+
taxon_id: taxon_id,
|
73
|
+
vern: row["English name"],
|
74
|
+
lang: "end"
|
75
|
+
}
|
76
|
+
end
|
77
|
+
|
78
|
+
puts "Processed %s names" % i if i % 10_000 == 0
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def generate_dwca
|
83
|
+
DwcaHunter.logger_write(object_id,
|
84
|
+
"Creating DarwinCore Archive file")
|
85
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
86
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
87
|
+
"http://rs.tdwg.org/dwc/terms/kingdom",
|
88
|
+
"http://rs.tdwg.org/dwc/terms/phylum",
|
89
|
+
"http://rs.tdwg.org/dwc/terms/class",
|
90
|
+
"http://rs.tdwg.org/dwc/terms/order",
|
91
|
+
"http://rs.tdwg.org/dwc/terms/family",
|
92
|
+
"http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
|
93
|
+
@names.each do |n|
|
94
|
+
@core << [n[:taxon_id], n[:name_string],
|
95
|
+
n[:kingdom], n[:phylum], n[:klass], n[:order], n[:family],
|
96
|
+
n[:code]]
|
97
|
+
end
|
98
|
+
@extensions << {
|
99
|
+
data: [[
|
100
|
+
"http://rs.tdwg.org/dwc/terms/taxonID",
|
101
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
102
|
+
"http://purl.org/dc/terms/language"
|
103
|
+
]],
|
104
|
+
file_name: "vernacular_names.txt",
|
105
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName"
|
106
|
+
}
|
107
|
+
|
108
|
+
@vernaculars.each do |v|
|
109
|
+
@extensions[-1][:data] << [v[:taxon_id], v[:vern], v[:lang]]
|
110
|
+
end
|
111
|
+
|
112
|
+
@eml = {
|
113
|
+
id: @uuid,
|
114
|
+
title: @title,
|
115
|
+
authors: [
|
116
|
+
{ first_name: "G. F.",
|
117
|
+
last_name: "Clements"
|
118
|
+
},
|
119
|
+
{ first_name: "T. S.",
|
120
|
+
last_name: "Schulenberg"
|
121
|
+
},
|
122
|
+
{ first_name: "M. J.",
|
123
|
+
last_name: "Iliff"
|
124
|
+
},
|
125
|
+
{ first_name: "S. M.",
|
126
|
+
last_name: "Billerman"
|
127
|
+
},
|
128
|
+
{ first_name: "T. A.",
|
129
|
+
last_name: "Fredericks"
|
130
|
+
},
|
131
|
+
{ first_name: "B. L.",
|
132
|
+
last_name: "Sullivan"
|
133
|
+
},
|
134
|
+
{ first_name: "C. L.",
|
135
|
+
last_name: "Wood"
|
136
|
+
},
|
137
|
+
],
|
138
|
+
metadata_providers: [
|
139
|
+
{ first_name: "Dmitry",
|
140
|
+
last_name: "Mozzherin",
|
141
|
+
email: "dmozzherin@gmail.com" }
|
142
|
+
],
|
143
|
+
abstract: "The eBird/Clements Checklist of Birds of the World" \
|
144
|
+
": v2019. Downloaded from " \
|
145
|
+
"https://www.birds.cornell.edu/clementschecklist/download/",
|
146
|
+
url: @url
|
147
|
+
}
|
148
|
+
super
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
@@ -1,15 +1,15 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module DwcaHunter
|
4
4
|
class ResourceFreebase < DwcaHunter::Resource
|
5
5
|
def initialize(opts = {})
|
6
6
|
@command = "freebase"
|
7
|
-
@title =
|
8
|
-
@uuid =
|
7
|
+
@title = "Freebase"
|
8
|
+
@uuid = "bacd21f0-44e0-43e2-914c-70929916f257"
|
9
9
|
@download_path = File.join(Dir.tmpdir,
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
"dwca_hunter",
|
11
|
+
"freebase",
|
12
|
+
"data.json")
|
13
13
|
@data = []
|
14
14
|
@all_taxa = {}
|
15
15
|
@cleaned_taxa = {}
|
@@ -27,11 +27,11 @@ module DwcaHunter
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def download
|
30
|
-
DwcaHunter
|
31
|
-
|
30
|
+
DwcaHunter.logger_write(object_id,
|
31
|
+
"Querying freebase for species information...")
|
32
32
|
q = {
|
33
33
|
query: [{
|
34
|
-
type:
|
34
|
+
type: "/biology/organism_classification",
|
35
35
|
id: nil,
|
36
36
|
guid: nil,
|
37
37
|
name: nil,
|
@@ -41,16 +41,16 @@ module DwcaHunter
|
|
41
41
|
id: nil,
|
42
42
|
guid: nil,
|
43
43
|
scientific_name: nil,
|
44
|
-
optional: true
|
45
|
-
}
|
44
|
+
optional: true
|
45
|
+
}
|
46
46
|
}],
|
47
|
-
cursor: true
|
47
|
+
cursor: true
|
48
48
|
}
|
49
49
|
|
50
50
|
run_query(q)
|
51
51
|
|
52
52
|
data = JSON.pretty_generate @data
|
53
|
-
f = open(@download_path,
|
53
|
+
f = open(@download_path, "w:utf-8")
|
54
54
|
f.write(data)
|
55
55
|
f.close
|
56
56
|
end
|
@@ -60,31 +60,32 @@ module DwcaHunter
|
|
60
60
|
def run_query(q)
|
61
61
|
count = 0
|
62
62
|
requests_num = 0
|
63
|
-
|
63
|
+
loop do
|
64
64
|
freebase_url = "http://api.freebase.com/api/service/mqlread?query=%s" %
|
65
|
-
|
65
|
+
URI.encode(q.to_json)
|
66
66
|
res = JSON.load RestClient.get(freebase_url)
|
67
67
|
requests_num += 1
|
68
|
-
break if res[
|
68
|
+
break if res["result"].nil? || res["result"].empty?
|
69
|
+
|
69
70
|
if requests_num % 10 == 0
|
70
|
-
DwcaHunter
|
71
|
-
|
71
|
+
DwcaHunter.logger_write(object_id,
|
72
|
+
"Received %s names" % count)
|
72
73
|
end
|
73
|
-
count += res[
|
74
|
-
res[
|
75
|
-
q[:cursor] = res[
|
74
|
+
count += res["result"].size
|
75
|
+
res["result"].each { |d| @data << d }
|
76
|
+
q[:cursor] = res["cursor"]
|
76
77
|
end
|
77
78
|
end
|
78
79
|
|
79
80
|
def organize_data
|
80
|
-
@data = JSON.load(open(@download_path,
|
81
|
+
@data = JSON.load(open(@download_path, "r:utf-8").read)
|
81
82
|
@data.each do |d|
|
82
|
-
scientific_name = d[
|
83
|
+
scientific_name = d["scientific_name"].to_s
|
83
84
|
id = d["id"]
|
84
|
-
parent_id = d[
|
85
|
-
d[
|
85
|
+
parent_id = d["higher_classification"] ?
|
86
|
+
d["higher_classification"]["id"] :
|
86
87
|
nil
|
87
|
-
synonyms = d[
|
88
|
+
synonyms = d["synonym_scientific_name"]
|
88
89
|
@all_taxa[id] = { id: id,
|
89
90
|
parent_id: parent_id,
|
90
91
|
scientific_name: scientific_name,
|
@@ -93,6 +94,7 @@ module DwcaHunter
|
|
93
94
|
|
94
95
|
@all_taxa.each do |k, v|
|
95
96
|
next unless v[:scientific_name] && v[:scientific_name].strip != ""
|
97
|
+
|
96
98
|
parent_id = v[:parent_id]
|
97
99
|
until (@all_taxa[parent_id] &&
|
98
100
|
@all_taxa[parent_id][:scientific_name]) || parent_id.nil?
|
@@ -103,29 +105,28 @@ module DwcaHunter
|
|
103
105
|
v[:parent_id] = parent_id
|
104
106
|
@cleaned_taxa[k] = v
|
105
107
|
end
|
106
|
-
|
107
108
|
end
|
108
109
|
|
109
110
|
def generate_dwca
|
110
|
-
DwcaHunter
|
111
|
-
|
112
|
-
@core = [[
|
113
|
-
|
114
|
-
|
111
|
+
DwcaHunter.logger_write(object_id,
|
112
|
+
"Creating DarwinCore Archive file")
|
113
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
114
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
115
|
+
"http://rs.tdwg.org/dwc/terms/parentNameUsageID"]]
|
115
116
|
|
116
117
|
@extensions << { data: [[
|
117
|
-
|
118
|
-
|
119
|
-
]], file_name:
|
120
|
-
DwcaHunter
|
121
|
-
|
118
|
+
"http://rs.tdwg.org/dwc/terms/TaxonID",
|
119
|
+
"http://rs.tdwg.org/dwc/terms/scientificName"
|
120
|
+
]], file_name: "synonyms.txt" }
|
121
|
+
DwcaHunter.logger_write(object_id,
|
122
|
+
"Creating synonyms extension for DarwinCore Archive file")
|
122
123
|
count = 0
|
123
|
-
@cleaned_taxa.each do |
|
124
|
+
@cleaned_taxa.each do |_key, taxon|
|
124
125
|
count += 1
|
125
126
|
@core << [taxon[:id], taxon[:scientific_name], taxon[:parent_id]]
|
126
127
|
if count % BATCH_SIZE == 0
|
127
|
-
DwcaHunter
|
128
|
-
|
128
|
+
DwcaHunter.logger_write(object_id,
|
129
|
+
"Traversing %s extension data record" % count)
|
129
130
|
end
|
130
131
|
taxon[:synonyms].each do |name|
|
131
132
|
@extensions[-1][:data] << [taxon[:id], name]
|
@@ -134,19 +135,20 @@ module DwcaHunter
|
|
134
135
|
@eml = {
|
135
136
|
id: @uuid,
|
136
137
|
title: @title,
|
137
|
-
license:
|
138
|
+
license: "http://creativecommons.org/licenses/by-sa/3.0/",
|
138
139
|
authors: [
|
139
|
-
{ url:
|
140
|
-
|
141
|
-
|
140
|
+
{ url: "http://www.freebase.com/home" }
|
141
|
+
],
|
142
|
+
abstract: "An entity graph of people, places and things, " \
|
143
|
+
"built by a community that loves open data.",
|
142
144
|
metadata_providers: [
|
143
|
-
{ first_name:
|
144
|
-
last_name:
|
145
|
-
email:
|
146
|
-
|
145
|
+
{ first_name: "Dmitry",
|
146
|
+
last_name: "Mozzherin",
|
147
|
+
email: "dmozzherin@mbl.edu" }
|
148
|
+
],
|
149
|
+
url: "http://www.freebase.com/home"
|
147
150
|
}
|
148
151
|
super
|
149
152
|
end
|
150
|
-
|
151
153
|
end
|
152
154
|
end
|