dwca_hunter 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +3 -1
- data/dwca_hunter.gemspec +2 -0
- data/exe/dwcahunter +2 -0
- data/files/species-black.txt +251 -0
- data/lib/dwca_hunter.rb +2 -0
- data/lib/dwca_hunter/encoding.rb +11 -5
- data/lib/dwca_hunter/resources/gnub.rb +1 -2
- data/lib/dwca_hunter/resources/mammal_species.rb +73 -41
- data/lib/dwca_hunter/resources/wikispecies.rb +1 -2
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +31 -3
- data/lib/dwca_hunter/resources/worms.rb +0 -176
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e30e9b34ea1c46b021bd3c2ec66ccad4996d4a921c7ce78791b84940bd239f05
|
4
|
+
data.tar.gz: 1be0e7119fd38094f94a53d71460a8f97a59f2e46a5e9740b814f5dcc97b42cd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: df1d9bebe191ebf8ae72d601f05374edeaeffbb627d08d7981da582559439dd9ad173656d78a7c88b3a9765562efa8a7095c4eaa5ed0f1f3ef856be94f990b63
|
7
|
+
data.tar.gz: 703bbf2d197a55a8d4e8510e940f77562540983bb5eddc7a7496ed77af5e610ce0ce5d95a1faa25878a8f8b12a17613808898453a0bd6b7aa19a87dc7c5f000e
|
data/CHANGELOG.md
CHANGED
@@ -2,14 +2,23 @@
|
|
2
2
|
|
3
3
|
## Unreleased (placeholder for the next version)
|
4
4
|
|
5
|
+
## [v0.5.1]
|
6
|
+
|
7
|
+
- Add [#11] clean up data for Mammal Species of the World
|
8
|
+
- Add [#10] show logs during processing
|
9
|
+
|
5
10
|
## [v0.5.0]
|
11
|
+
|
6
12
|
- Add [#8] convert project to a `Ruby gem`
|
7
13
|
|
8
14
|
## Footnotes
|
9
15
|
|
10
16
|
This document follows [changelog guidelines]
|
11
17
|
|
18
|
+
[#11]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/11
|
19
|
+
[#10]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/10
|
12
20
|
[#8]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/8
|
21
|
+
[v0.5.1]: https://github.com/gnames/dwca_hunter/compare/v0.5.0...v0.5.1
|
13
22
|
[v0.5.0]: https://github.com/gnames/dwca_hunter/compare/v0.4.0...v0.5.0
|
14
23
|
|
15
24
|
[changelog guidelines]: https://github.com/olivierlacan/keep-a-changelog
|
data/Gemfile.lock
CHANGED
data/dwca_hunter.gemspec
CHANGED
@@ -26,8 +26,10 @@ Gem::Specification.new do |gem|
|
|
26
26
|
gem.executables = gem.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
27
27
|
gem.require_paths = ["lib"]
|
28
28
|
|
29
|
+
gem.add_dependency "biodiversity", "~> 3.5"
|
29
30
|
gem.add_dependency "dwc-archive", "~> 1.0"
|
30
31
|
gem.add_dependency "gn_uuid", "~> 0.5"
|
32
|
+
gem.add_dependency "htmlentities", "~> 4.3"
|
31
33
|
gem.add_dependency "nokogiri", "~> 1.8"
|
32
34
|
gem.add_dependency "rest-client", "~> 2.0"
|
33
35
|
gem.add_dependency "thor", "~> 0.19"
|
data/exe/dwcahunter
CHANGED
@@ -4,6 +4,8 @@
|
|
4
4
|
require "thor"
|
5
5
|
require "dwca_hunter"
|
6
6
|
|
7
|
+
DwcaHunter.logger = Logger.new($stdout)
|
8
|
+
|
7
9
|
# DwcaHunterCLI determines command line interface to the gem functionality
|
8
10
|
class DwcaHunterCLI < Thor
|
9
11
|
desc "list [SEARCH]", "lists available sources to convert filtered by search"
|
@@ -0,0 +1,251 @@
|
|
1
|
+
ab
|
2
|
+
above
|
3
|
+
account
|
4
|
+
ad
|
5
|
+
ae
|
6
|
+
al
|
7
|
+
all
|
8
|
+
allocation
|
9
|
+
also
|
10
|
+
anamorph
|
11
|
+
and
|
12
|
+
ap
|
13
|
+
are
|
14
|
+
areas
|
15
|
+
as
|
16
|
+
associated
|
17
|
+
at
|
18
|
+
available
|
19
|
+
awaiting
|
20
|
+
backbone
|
21
|
+
bacterium
|
22
|
+
basin
|
23
|
+
bird
|
24
|
+
bis
|
25
|
+
but
|
26
|
+
bv
|
27
|
+
by
|
28
|
+
ca
|
29
|
+
calls
|
30
|
+
cf
|
31
|
+
circular
|
32
|
+
cited
|
33
|
+
clams
|
34
|
+
clonal
|
35
|
+
clone
|
36
|
+
comes
|
37
|
+
comment
|
38
|
+
comments
|
39
|
+
construct
|
40
|
+
contrasts
|
41
|
+
coralline
|
42
|
+
coronaviridae
|
43
|
+
culture
|
44
|
+
cv
|
45
|
+
da
|
46
|
+
dc
|
47
|
+
de
|
48
|
+
de-
|
49
|
+
def
|
50
|
+
degrading
|
51
|
+
del
|
52
|
+
depletion
|
53
|
+
der
|
54
|
+
des
|
55
|
+
described
|
56
|
+
diatom
|
57
|
+
differ
|
58
|
+
display
|
59
|
+
do
|
60
|
+
du
|
61
|
+
dummy
|
62
|
+
ecological
|
63
|
+
ectosymbiont
|
64
|
+
ed
|
65
|
+
either
|
66
|
+
em
|
67
|
+
en
|
68
|
+
endosymbiont
|
69
|
+
enrichment
|
70
|
+
environmental
|
71
|
+
et
|
72
|
+
ex
|
73
|
+
examples
|
74
|
+
excluding
|
75
|
+
expression
|
76
|
+
extend
|
77
|
+
eyes
|
78
|
+
faeces
|
79
|
+
figures
|
80
|
+
fl
|
81
|
+
flying
|
82
|
+
fn
|
83
|
+
fo
|
84
|
+
follows
|
85
|
+
foot
|
86
|
+
for
|
87
|
+
form
|
88
|
+
forma
|
89
|
+
formerly
|
90
|
+
fosmid
|
91
|
+
fossils
|
92
|
+
fox
|
93
|
+
fr
|
94
|
+
fragments
|
95
|
+
from
|
96
|
+
fungal
|
97
|
+
ge
|
98
|
+
gen
|
99
|
+
genera
|
100
|
+
generic
|
101
|
+
genus
|
102
|
+
goes
|
103
|
+
group
|
104
|
+
he
|
105
|
+
host
|
106
|
+
hu
|
107
|
+
hybrid
|
108
|
+
id
|
109
|
+
im
|
110
|
+
in
|
111
|
+
incertae
|
112
|
+
ing
|
113
|
+
is
|
114
|
+
it
|
115
|
+
jejuni-like
|
116
|
+
jr
|
117
|
+
kg
|
118
|
+
la
|
119
|
+
large
|
120
|
+
largest
|
121
|
+
leech
|
122
|
+
leukemia-related
|
123
|
+
libraries
|
124
|
+
longer
|
125
|
+
luciferase
|
126
|
+
magnified
|
127
|
+
marker
|
128
|
+
mc
|
129
|
+
morphology
|
130
|
+
ms
|
131
|
+
my
|
132
|
+
name
|
133
|
+
names
|
134
|
+
new
|
135
|
+
nf
|
136
|
+
ng
|
137
|
+
nm
|
138
|
+
non
|
139
|
+
not
|
140
|
+
notes
|
141
|
+
nov
|
142
|
+
nr
|
143
|
+
ns
|
144
|
+
occurs
|
145
|
+
of
|
146
|
+
on
|
147
|
+
one
|
148
|
+
op
|
149
|
+
or
|
150
|
+
oral
|
151
|
+
other
|
152
|
+
ox
|
153
|
+
parasite
|
154
|
+
parasites
|
155
|
+
part
|
156
|
+
phylotype
|
157
|
+
pipefish
|
158
|
+
pl
|
159
|
+
plasmid
|
160
|
+
pomfret
|
161
|
+
population
|
162
|
+
populations
|
163
|
+
pr
|
164
|
+
probes
|
165
|
+
prophage
|
166
|
+
pt
|
167
|
+
queens
|
168
|
+
red
|
169
|
+
reference
|
170
|
+
references
|
171
|
+
regarded
|
172
|
+
region
|
173
|
+
regions
|
174
|
+
related
|
175
|
+
reported
|
176
|
+
resistance
|
177
|
+
retroviruses
|
178
|
+
revised
|
179
|
+
rhodolith
|
180
|
+
same
|
181
|
+
sample
|
182
|
+
samples
|
183
|
+
scales
|
184
|
+
se
|
185
|
+
sea
|
186
|
+
seahorse
|
187
|
+
sec
|
188
|
+
secondary
|
189
|
+
sect
|
190
|
+
section
|
191
|
+
see
|
192
|
+
seed
|
193
|
+
seems
|
194
|
+
segment
|
195
|
+
series
|
196
|
+
show
|
197
|
+
shuttle
|
198
|
+
sinus
|
199
|
+
so
|
200
|
+
soil
|
201
|
+
south
|
202
|
+
species
|
203
|
+
specimens
|
204
|
+
ss
|
205
|
+
st
|
206
|
+
strains
|
207
|
+
subsp
|
208
|
+
subspec
|
209
|
+
subsp-nov
|
210
|
+
summarized
|
211
|
+
symbiont
|
212
|
+
symbionts
|
213
|
+
synthase
|
214
|
+
taken
|
215
|
+
taxon
|
216
|
+
that
|
217
|
+
the
|
218
|
+
these
|
219
|
+
thin
|
220
|
+
this
|
221
|
+
three
|
222
|
+
to
|
223
|
+
towards
|
224
|
+
trapping
|
225
|
+
two
|
226
|
+
ty
|
227
|
+
type
|
228
|
+
und
|
229
|
+
under
|
230
|
+
unpublished
|
231
|
+
up
|
232
|
+
us
|
233
|
+
usually
|
234
|
+
van
|
235
|
+
var
|
236
|
+
variable
|
237
|
+
variant
|
238
|
+
variety
|
239
|
+
vector
|
240
|
+
vi
|
241
|
+
virus
|
242
|
+
von
|
243
|
+
voyager
|
244
|
+
was
|
245
|
+
waters
|
246
|
+
we
|
247
|
+
were
|
248
|
+
with
|
249
|
+
wrote
|
250
|
+
xx
|
251
|
+
zur
|
data/lib/dwca_hunter.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "biodiversity"
|
3
4
|
require "logger"
|
4
5
|
require "fileutils"
|
5
6
|
require "uri"
|
6
7
|
require "tmpdir"
|
7
8
|
require "net/http"
|
8
9
|
require "json"
|
10
|
+
require "htmlentities"
|
9
11
|
require "dwc_archive"
|
10
12
|
require "dwca_hunter/resource"
|
11
13
|
require "rest_client"
|
data/lib/dwca_hunter/encoding.rb
CHANGED
@@ -1,12 +1,18 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module DwcaHunter
|
4
|
+
# Encoding module fixes encoding issues with data
|
2
5
|
module Encoding
|
3
6
|
def self.latin1_to_utf8(file_path)
|
4
|
-
new_file = file_path +
|
5
|
-
puts "Creating
|
6
|
-
r = open(file_path)
|
7
|
-
w = open(new_file,
|
7
|
+
new_file = file_path + ".utf_8"
|
8
|
+
puts "Creating #{new_file}"
|
9
|
+
r = File.open(file_path)
|
10
|
+
w = File.open(new_file, "w:utf-8")
|
11
|
+
he = HTMLEntities.new
|
8
12
|
r.each do |l|
|
9
|
-
l
|
13
|
+
l = l
|
14
|
+
l = l.encode("UTF-8", "ISO-8859-1", invalid: :replace, replace: "?")
|
15
|
+
l = he.decode(l)
|
10
16
|
w.write l
|
11
17
|
end
|
12
18
|
r.close
|
@@ -1,13 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module DwcaHunter
|
4
|
+
# ResourceMammalSpecies converts "Mammal Species of the World" data
|
5
|
+
# to DarwinCore Archive file
|
2
6
|
class ResourceMammalSpecies < DwcaHunter::Resource
|
3
7
|
def initialize(opts = {})
|
8
|
+
@parser = ScientificNameParser.new
|
9
|
+
@black_sp = black_species
|
4
10
|
@command = "mammal-species"
|
5
11
|
@title = "The Mammal Species of The World"
|
6
12
|
@uuid = "464dafec-1037-432d-8449-c0b309e0a030"
|
7
13
|
@data = []
|
8
14
|
@extensions = []
|
9
15
|
@count = 1
|
10
|
-
@clades = {"Mammalia" => { rank: "class", id: @count}}
|
16
|
+
@clades = { "Mammalia" => { rank: "class", id: @count } }
|
11
17
|
@url = "http://www.departments.bucknell.edu"\
|
12
18
|
"/biology/resources/msw3/export.asp"
|
13
19
|
@download_path = File.join(Dir.tmpdir, "dwca_hunter",
|
@@ -20,14 +26,14 @@ module DwcaHunter
|
|
20
26
|
end
|
21
27
|
|
22
28
|
def make_dwca
|
23
|
-
DwcaHunter
|
29
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
24
30
|
encode
|
25
31
|
collect_data
|
26
32
|
generate_dwca
|
27
33
|
end
|
28
34
|
|
29
35
|
def download
|
30
|
-
DwcaHunter
|
36
|
+
DwcaHunter.logger_write(object_id, "Downloading file -- "\
|
31
37
|
"it will take some time...")
|
32
38
|
dlr = DwcaHunter::Downloader.new(url, @download_path)
|
33
39
|
dlr.download
|
@@ -47,8 +53,7 @@ module DwcaHunter
|
|
47
53
|
end
|
48
54
|
|
49
55
|
def generate_dwca
|
50
|
-
DwcaHunter
|
51
|
-
'Creating DarwinCore Archive file')
|
56
|
+
DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
|
52
57
|
core_init
|
53
58
|
extensions_init
|
54
59
|
eml_init
|
@@ -64,45 +69,63 @@ module DwcaHunter
|
|
64
69
|
return if rec[:commonname].to_s == ""
|
65
70
|
taxon_id = taxon[0]
|
66
71
|
lang = "en"
|
67
|
-
name = rec[:commonname].
|
72
|
+
name = rec[:commonname].tr("\u{0092}", "'")
|
68
73
|
@extensions[0][:data] << [taxon_id, name, lang]
|
69
|
-
|
70
74
|
end
|
71
75
|
|
76
|
+
# rubocop:disable Metrics/AbcSize
|
77
|
+
|
72
78
|
def process_synonyms(rec, taxon)
|
73
79
|
accepted_id = taxon[0]
|
74
80
|
parent_id = taxon[2]
|
75
81
|
rank = taxon[-1]
|
76
|
-
return unless [
|
82
|
+
return unless %w[species subspecies].include? rank
|
77
83
|
synonyms = rec[:synonyms].gsub(/\.$/, "").
|
78
|
-
|
84
|
+
gsub(%r{<[/ibsup]+>}, "").gsub(/[\s]+/, " ").split(";")
|
79
85
|
synonyms = synonyms.map(&:strip)
|
80
|
-
synonyms
|
81
|
-
next if s
|
82
|
-
if s
|
83
|
-
s = rec[:genus] + " " + s
|
84
|
-
end
|
86
|
+
synonyms.map do |s|
|
87
|
+
next if s =~ /<u>/
|
88
|
+
s = rec[:genus] + " " + s if s =~ /^[a-z]/
|
85
89
|
@count += 1
|
86
90
|
id = @count
|
87
|
-
|
91
|
+
if real_name?(s)
|
92
|
+
@core << [id, nil, parent_id, accepted_id, s, "synonym", rank]
|
93
|
+
else
|
94
|
+
puts "Rejected: #{s}"
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
# rubocop:enable Metrics/AbcSize
|
100
|
+
|
101
|
+
def real_name?(str)
|
102
|
+
parsed = @parser.parse(str)[:scientificName]
|
103
|
+
return false unless parsed[:parsed]
|
104
|
+
epithets = parsed[:canonical].split(" ")[1..-1]
|
105
|
+
return false if epithets.nil? || epithets.empty?
|
106
|
+
epithets.each do |e|
|
107
|
+
return false if @black_sp[e]
|
88
108
|
end
|
109
|
+
true
|
89
110
|
end
|
90
111
|
|
91
|
-
def process_name(rec
|
92
|
-
name =[@core.last[4], rec[:author], rec[:date]]
|
93
|
-
@core.last[4] = name.join(" ").gsub(/
|
112
|
+
def process_name(rec)
|
113
|
+
name = [@core.last[4], rec[:author], rec[:date]]
|
114
|
+
@core.last[4] = name.join(" ").gsub(%r{<[/ibsup]+>}, "").
|
115
|
+
gsub(/[\s]+/, " ").strip
|
94
116
|
@core.last[1] = rec[:id]
|
95
117
|
end
|
96
118
|
|
119
|
+
# rubocop:disable Metrics/AbcSize
|
120
|
+
|
97
121
|
def process_hierarchy(rec)
|
98
122
|
parent_id = @clades["Mammalia"][:id]
|
99
123
|
is_row_rank = false
|
100
|
-
[
|
101
|
-
|
102
|
-
|
103
|
-
is_row_rank = true if rank == rec[:taxonlevel].downcase.to_sym
|
124
|
+
%i[order suborder infraorder superfamily family
|
125
|
+
subfamily tribe genus subgenus species subspecies].each do |rank|
|
126
|
+
is_row_rank = true if rank == rec[:taxonlevel].downcase.to_sym
|
104
127
|
clade = rec[rank]
|
105
|
-
clade = clade.capitalize if clade
|
128
|
+
clade = clade.capitalize if clade =~ /^[A-Z]+$/
|
106
129
|
next if clade.to_s == ""
|
107
130
|
clade_id = nil
|
108
131
|
clade = adjust_clade(rec, rank, clade)
|
@@ -114,16 +137,17 @@ module DwcaHunter
|
|
114
137
|
@clades[clade] = { id: clade_id, rank: rank }
|
115
138
|
@core << [clade_id, nil, parent_id, clade_id, clade, nil, rank.to_s]
|
116
139
|
if is_row_rank
|
117
|
-
process_name(rec
|
140
|
+
process_name(rec)
|
118
141
|
return @core.last
|
119
142
|
end
|
120
143
|
end
|
121
144
|
parent_id = clade_id
|
122
145
|
end
|
123
146
|
end
|
147
|
+
# rubocop:enable Metrics/AbcSize
|
124
148
|
|
125
149
|
def adjust_clade(rec, rank, clade)
|
126
|
-
if [
|
150
|
+
if %i[species subspecies].include? rank
|
127
151
|
clade = [rec[:genus], rec[:species]]
|
128
152
|
clade << rec[:subspecies] if rank == :subspecies
|
129
153
|
clade.join(" ").gsub(/[\s]+/, " ").strip
|
@@ -140,13 +164,13 @@ module DwcaHunter
|
|
140
164
|
{ first_name: "Don",
|
141
165
|
last_name: "Wilson" },
|
142
166
|
{ first_name: "DeeAnn",
|
143
|
-
last_name: "Reader" }
|
144
|
-
|
167
|
+
last_name: "Reader" }
|
168
|
+
],
|
145
169
|
metadata_providers: [
|
146
170
|
{ first_name: "Dmitry",
|
147
171
|
last_name: "Mozzherin",
|
148
172
|
email: "dmozzherin@gmail.com" }
|
149
|
-
|
173
|
+
],
|
150
174
|
abstract: "Mammal Species of the World, 3rd edition (MSW3) is "\
|
151
175
|
"a database of mammalian taxonomy, based upon the 2005 book "\
|
152
176
|
"Mammal Species of the World. A Taxonomic and Geographic Reference "\
|
@@ -156,24 +180,32 @@ module DwcaHunter
|
|
156
180
|
end
|
157
181
|
|
158
182
|
def core_init
|
159
|
-
@core = [[
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
183
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
184
|
+
"http://globalnames.org/terms/localID",
|
185
|
+
"http://rs.tdwg.org/dwc/terms/parentNameUsageID",
|
186
|
+
"http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
|
187
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
188
|
+
"http://rs.tdwg.org/dwc/terms/taxonomicStatus",
|
189
|
+
"http://rs.tdwg.org/dwc/terms/taxonRank"]]
|
166
190
|
m = @clades["Mammalia"]
|
167
191
|
@core << [m[:id], nil, nil, m[:id], "Mammalia", nil, "class"]
|
168
192
|
end
|
169
193
|
|
194
|
+
def black_species
|
195
|
+
res = {}
|
196
|
+
path = File.join(__dir__, "..", "..", "..", "files", "species-black.txt")
|
197
|
+
File.open(path).each do |l|
|
198
|
+
res[l.strip] = 1
|
199
|
+
end
|
200
|
+
res
|
201
|
+
end
|
202
|
+
|
170
203
|
def extensions_init
|
171
|
-
@extensions << { data: [[
|
172
|
-
|
173
|
-
|
174
|
-
file_name:
|
175
|
-
row_type:
|
176
|
-
}
|
204
|
+
@extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
|
205
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
206
|
+
"http://purl.org/dc/terms/language"]],
|
207
|
+
file_name: "vernacular_names.txt",
|
208
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }
|
177
209
|
end
|
178
210
|
end
|
179
211
|
end
|
@@ -5,7 +5,7 @@ module DwcaHunter
|
|
5
5
|
@problems_file = open('problems.txt', 'w:utf-8')
|
6
6
|
@command = "wikispecies"
|
7
7
|
@title = 'Wikispecies'
|
8
|
-
@url = 'http://dumps.wikimedia.org/specieswiki/latest/'
|
8
|
+
@url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
|
9
9
|
'specieswiki-latest-pages-articles.xml.bz2'
|
10
10
|
@url = opts[:url] if opts[:url]
|
11
11
|
@uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
|
@@ -347,4 +347,3 @@ module DwcaHunter
|
|
347
347
|
|
348
348
|
end
|
349
349
|
end
|
350
|
-
|
data/lib/dwca_hunter/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwca_hunter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-08-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: biodiversity
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.5'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.5'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: dwc-archive
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,6 +52,20 @@ dependencies:
|
|
38
52
|
- - "~>"
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0.5'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: htmlentities
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '4.3'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '4.3'
|
41
69
|
- !ruby/object:Gem::Dependency
|
42
70
|
name: nokogiri
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -204,6 +232,7 @@ files:
|
|
204
232
|
- files/birdlife_7.csv
|
205
233
|
- files/fishbase_taxon_cache.tsv
|
206
234
|
- files/reptile_checklist_2014_12.csv
|
235
|
+
- files/species-black.txt
|
207
236
|
- lib/dwca_hunter.rb
|
208
237
|
- lib/dwca_hunter/downloader.rb
|
209
238
|
- lib/dwca_hunter/encoding.rb
|
@@ -219,7 +248,6 @@ files:
|
|
219
248
|
- lib/dwca_hunter/resources/opentree.rb
|
220
249
|
- lib/dwca_hunter/resources/reptiles_checklist.rb
|
221
250
|
- lib/dwca_hunter/resources/wikispecies.rb
|
222
|
-
- lib/dwca_hunter/resources/worms.rb
|
223
251
|
- lib/dwca_hunter/url.rb
|
224
252
|
- lib/dwca_hunter/version.rb
|
225
253
|
- lib/dwca_hunter/xml.rb
|
@@ -1,176 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module DwcaHunter
|
3
|
-
class ResourceWoRMS < DwcaHunter::Resource
|
4
|
-
def initialize(opts = {})
|
5
|
-
@command = 'worms'
|
6
|
-
@title = 'WoRMS'
|
7
|
-
@url = 'http://content60.eol.org/resources/26.tar.gz'
|
8
|
-
@uuid = '9d27a7ad-2e6a-4597-a79b-23fb3b2f8284'
|
9
|
-
@download_path = File.join(Dir.tmpdir,
|
10
|
-
'dwca_hunter',
|
11
|
-
'worms',
|
12
|
-
'data.tar.gz')
|
13
|
-
@fields = ['dc:identifier',
|
14
|
-
'dc:source',
|
15
|
-
'dwc:Kingdom',
|
16
|
-
'dwc:Phylum',
|
17
|
-
'dwc:Class',
|
18
|
-
'dwc:Order',
|
19
|
-
'dwc:Family',
|
20
|
-
'dwc:Genus',
|
21
|
-
'dwc:ScientificName']
|
22
|
-
@rank = { 1 => 'kingdom',
|
23
|
-
2 => 'phylum',
|
24
|
-
3 => 'class',
|
25
|
-
4 => 'order',
|
26
|
-
5 => 'family',
|
27
|
-
6 => 'genus',
|
28
|
-
7 => 'species' }
|
29
|
-
@known_paths = {}
|
30
|
-
@data = []
|
31
|
-
@extensions = []
|
32
|
-
@extensions << { data: [[
|
33
|
-
'http://rs.tdwg.org/dwc/terms/taxonId',
|
34
|
-
'http://rs.tdwg.org/dwc/terms/scientificName']],
|
35
|
-
file_name: 'synonyms.txt' }
|
36
|
-
@re = {
|
37
|
-
cdata: %r#\<\!\[CDATA\[(.*)\]\]\>#
|
38
|
-
}
|
39
|
-
@core = [[
|
40
|
-
'http://rs.tdwg.org/dwc/terms/taxonID',
|
41
|
-
'http://purl.org/dc/terms/parentNameUsageID',
|
42
|
-
'http://purl.org/dc/terms/source',
|
43
|
-
'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
|
44
|
-
'http://purl.org/dc/terms/scientificName',
|
45
|
-
'http://purl.org/dc/terms/taxonRank']]
|
46
|
-
super
|
47
|
-
end
|
48
|
-
|
49
|
-
def unpack
|
50
|
-
unpack_tar
|
51
|
-
end
|
52
|
-
|
53
|
-
def make_dwca
|
54
|
-
collect_data
|
55
|
-
make_core_data
|
56
|
-
generate_dwca
|
57
|
-
end
|
58
|
-
|
59
|
-
private
|
60
|
-
|
61
|
-
def collect_data
|
62
|
-
DwcaHunter::logger_write(self.object_id, 'Traversing xml file...')
|
63
|
-
xml_file = File.join(@download_dir, '26.xml')
|
64
|
-
f = open(xml_file, 'r:utf-8')
|
65
|
-
in_taxon = false
|
66
|
-
taxon = nil
|
67
|
-
count = 0
|
68
|
-
Nokogiri::XML::Reader(f).each do |node|
|
69
|
-
if !in_taxon && node.name == 'taxon'
|
70
|
-
in_taxon = true
|
71
|
-
taxon = {}
|
72
|
-
@fields.each { |field| taxon[field.to_sym] = nil }
|
73
|
-
taxon[:synonyms] = []
|
74
|
-
elsif in_taxon && node.name == 'taxon'
|
75
|
-
in_taxon = false
|
76
|
-
@data << taxon
|
77
|
-
taxon = nil
|
78
|
-
count += 1
|
79
|
-
if count % BATCH_SIZE == 0
|
80
|
-
DwcaHunter::logger_write(self.object_id,
|
81
|
-
"Extracted %s taxons" % count)
|
82
|
-
end
|
83
|
-
elsif in_taxon
|
84
|
-
item = node.name.to_sym
|
85
|
-
if taxon.has_key?(item) && !taxon[item]
|
86
|
-
text = node.inner_xml
|
87
|
-
if cdata = text.match(@re[:cdata])
|
88
|
-
text = cdata[1]
|
89
|
-
else
|
90
|
-
text = DwcaHunter::XML.unescape(text)
|
91
|
-
end
|
92
|
-
taxon[item] = text
|
93
|
-
elsif node.name == 'synonym' &&
|
94
|
-
(cdata = node.inner_xml.match(@re[:cdata]))
|
95
|
-
taxon[:synonyms] << cdata[1]
|
96
|
-
end
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
def get_gn_id(path_string)
|
102
|
-
gn_uuid = UUID.create_v5(path_string, GNA_NAMESPACE)
|
103
|
-
id = Base64.urlsafe_encode64(gn_uuid.raw_bytes)[0..-3]
|
104
|
-
"gn:" + id
|
105
|
-
end
|
106
|
-
|
107
|
-
def make_core_data
|
108
|
-
DwcaHunter::logger_write(self.object_id, 'Creating core data')
|
109
|
-
@data.each_with_index do |taxa, i|
|
110
|
-
if i % BATCH_SIZE == 0
|
111
|
-
DwcaHunter::logger_write(self.object_id,
|
112
|
-
'Traversing %s species for core' % i)
|
113
|
-
end
|
114
|
-
path = get_path(taxa)
|
115
|
-
parent_id = get_gn_id(path.join('|'))
|
116
|
-
@core << [taxa[:'dc:identifier'],
|
117
|
-
parent_id, taxa[:'dc:source'],
|
118
|
-
nil,
|
119
|
-
taxa[:'dwc:ScientificName'],
|
120
|
-
'species']
|
121
|
-
|
122
|
-
taxa[:synonyms].each do |synonym|
|
123
|
-
@extensions[0][:data] << [taxa[:'dc:identifier'], synonym]
|
124
|
-
end
|
125
|
-
|
126
|
-
until path.empty?
|
127
|
-
path_string = path.join("|")
|
128
|
-
unless @known_paths[path_string]
|
129
|
-
@known_paths[path_string] = 1
|
130
|
-
parent_id = (path.size == 1) ?
|
131
|
-
nil :
|
132
|
-
get_gn_id([path[0..-2]].join('|'))
|
133
|
-
id = get_gn_id(path_string)
|
134
|
-
@core << [id, parent_id, nil, nil, path[-1], @rank[path.size]]
|
135
|
-
end
|
136
|
-
path.pop
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def get_path(taxa)
|
142
|
-
path = []
|
143
|
-
@fields[2..-2].each do |field|
|
144
|
-
path << taxa[field.to_sym]
|
145
|
-
end
|
146
|
-
path
|
147
|
-
end
|
148
|
-
|
149
|
-
def generate_dwca
|
150
|
-
DwcaHunter::logger_write(self.object_id,
|
151
|
-
'Creating DarwinCore Archive file')
|
152
|
-
@eml = {
|
153
|
-
id: @uuid,
|
154
|
-
title: @title,
|
155
|
-
authors: [
|
156
|
-
{ email: 'info@marinespecies.org',
|
157
|
-
url: 'http://www.marinespecies.org' }
|
158
|
-
],
|
159
|
-
metadata_providers: [
|
160
|
-
{ first_name: 'Dmitry',
|
161
|
-
last_name: 'Mozzherin',
|
162
|
-
email: 'dmozzherin@gmail.com' }
|
163
|
-
],
|
164
|
-
abstract: 'The aim of a World Register of Marine Species (WoRMS) ' +
|
165
|
-
'is to provide an authoritative and comprehensive list ' +
|
166
|
-
'of names of marine organisms, including information ' +
|
167
|
-
'on synonymy. While highest priority goes to valid ' +
|
168
|
-
'names, other names in use are included so that this ' +
|
169
|
-
'register can serve as a guide to interpret taxonomic ' +
|
170
|
-
'literature.',
|
171
|
-
}
|
172
|
-
super
|
173
|
-
end
|
174
|
-
end
|
175
|
-
end
|
176
|
-
|