dwca_hunter 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Gemfile.lock +3 -1
- data/dwca_hunter.gemspec +2 -0
- data/exe/dwcahunter +2 -0
- data/files/species-black.txt +251 -0
- data/lib/dwca_hunter.rb +2 -0
- data/lib/dwca_hunter/encoding.rb +11 -5
- data/lib/dwca_hunter/resources/gnub.rb +1 -2
- data/lib/dwca_hunter/resources/mammal_species.rb +73 -41
- data/lib/dwca_hunter/resources/wikispecies.rb +1 -2
- data/lib/dwca_hunter/version.rb +1 -1
- metadata +31 -3
- data/lib/dwca_hunter/resources/worms.rb +0 -176
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e30e9b34ea1c46b021bd3c2ec66ccad4996d4a921c7ce78791b84940bd239f05
|
|
4
|
+
data.tar.gz: 1be0e7119fd38094f94a53d71460a8f97a59f2e46a5e9740b814f5dcc97b42cd
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: df1d9bebe191ebf8ae72d601f05374edeaeffbb627d08d7981da582559439dd9ad173656d78a7c88b3a9765562efa8a7095c4eaa5ed0f1f3ef856be94f990b63
|
|
7
|
+
data.tar.gz: 703bbf2d197a55a8d4e8510e940f77562540983bb5eddc7a7496ed77af5e610ce0ce5d95a1faa25878a8f8b12a17613808898453a0bd6b7aa19a87dc7c5f000e
|
data/CHANGELOG.md
CHANGED
|
@@ -2,14 +2,23 @@
|
|
|
2
2
|
|
|
3
3
|
## Unreleased (placeholder for the next version)
|
|
4
4
|
|
|
5
|
+
## [v0.5.1]
|
|
6
|
+
|
|
7
|
+
- Add [#11] clean up data for Mammal Species of the World
|
|
8
|
+
- Add [#10] show logs during processing
|
|
9
|
+
|
|
5
10
|
## [v0.5.0]
|
|
11
|
+
|
|
6
12
|
- Add [#8] convert project to a `Ruby gem`
|
|
7
13
|
|
|
8
14
|
## Footnotes
|
|
9
15
|
|
|
10
16
|
This document follows [changelog guidelines]
|
|
11
17
|
|
|
18
|
+
[#11]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/11
|
|
19
|
+
[#10]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/10
|
|
12
20
|
[#8]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/8
|
|
21
|
+
[v0.5.1]: https://github.com/gnames/dwca_hunter/compare/v0.5.0...v0.5.1
|
|
13
22
|
[v0.5.0]: https://github.com/gnames/dwca_hunter/compare/v0.4.0...v0.5.0
|
|
14
23
|
|
|
15
24
|
[changelog guidelines]: https://github.com/olivierlacan/keep-a-changelog
|
data/Gemfile.lock
CHANGED
data/dwca_hunter.gemspec
CHANGED
|
@@ -26,8 +26,10 @@ Gem::Specification.new do |gem|
|
|
|
26
26
|
gem.executables = gem.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
27
27
|
gem.require_paths = ["lib"]
|
|
28
28
|
|
|
29
|
+
gem.add_dependency "biodiversity", "~> 3.5"
|
|
29
30
|
gem.add_dependency "dwc-archive", "~> 1.0"
|
|
30
31
|
gem.add_dependency "gn_uuid", "~> 0.5"
|
|
32
|
+
gem.add_dependency "htmlentities", "~> 4.3"
|
|
31
33
|
gem.add_dependency "nokogiri", "~> 1.8"
|
|
32
34
|
gem.add_dependency "rest-client", "~> 2.0"
|
|
33
35
|
gem.add_dependency "thor", "~> 0.19"
|
data/exe/dwcahunter
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
require "thor"
|
|
5
5
|
require "dwca_hunter"
|
|
6
6
|
|
|
7
|
+
DwcaHunter.logger = Logger.new($stdout)
|
|
8
|
+
|
|
7
9
|
# DwcaHunterCLI determines command line interface to the gem functionality
|
|
8
10
|
class DwcaHunterCLI < Thor
|
|
9
11
|
desc "list [SEARCH]", "lists available sources to convert filtered by search"
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
ab
|
|
2
|
+
above
|
|
3
|
+
account
|
|
4
|
+
ad
|
|
5
|
+
ae
|
|
6
|
+
al
|
|
7
|
+
all
|
|
8
|
+
allocation
|
|
9
|
+
also
|
|
10
|
+
anamorph
|
|
11
|
+
and
|
|
12
|
+
ap
|
|
13
|
+
are
|
|
14
|
+
areas
|
|
15
|
+
as
|
|
16
|
+
associated
|
|
17
|
+
at
|
|
18
|
+
available
|
|
19
|
+
awaiting
|
|
20
|
+
backbone
|
|
21
|
+
bacterium
|
|
22
|
+
basin
|
|
23
|
+
bird
|
|
24
|
+
bis
|
|
25
|
+
but
|
|
26
|
+
bv
|
|
27
|
+
by
|
|
28
|
+
ca
|
|
29
|
+
calls
|
|
30
|
+
cf
|
|
31
|
+
circular
|
|
32
|
+
cited
|
|
33
|
+
clams
|
|
34
|
+
clonal
|
|
35
|
+
clone
|
|
36
|
+
comes
|
|
37
|
+
comment
|
|
38
|
+
comments
|
|
39
|
+
construct
|
|
40
|
+
contrasts
|
|
41
|
+
coralline
|
|
42
|
+
coronaviridae
|
|
43
|
+
culture
|
|
44
|
+
cv
|
|
45
|
+
da
|
|
46
|
+
dc
|
|
47
|
+
de
|
|
48
|
+
de-
|
|
49
|
+
def
|
|
50
|
+
degrading
|
|
51
|
+
del
|
|
52
|
+
depletion
|
|
53
|
+
der
|
|
54
|
+
des
|
|
55
|
+
described
|
|
56
|
+
diatom
|
|
57
|
+
differ
|
|
58
|
+
display
|
|
59
|
+
do
|
|
60
|
+
du
|
|
61
|
+
dummy
|
|
62
|
+
ecological
|
|
63
|
+
ectosymbiont
|
|
64
|
+
ed
|
|
65
|
+
either
|
|
66
|
+
em
|
|
67
|
+
en
|
|
68
|
+
endosymbiont
|
|
69
|
+
enrichment
|
|
70
|
+
environmental
|
|
71
|
+
et
|
|
72
|
+
ex
|
|
73
|
+
examples
|
|
74
|
+
excluding
|
|
75
|
+
expression
|
|
76
|
+
extend
|
|
77
|
+
eyes
|
|
78
|
+
faeces
|
|
79
|
+
figures
|
|
80
|
+
fl
|
|
81
|
+
flying
|
|
82
|
+
fn
|
|
83
|
+
fo
|
|
84
|
+
follows
|
|
85
|
+
foot
|
|
86
|
+
for
|
|
87
|
+
form
|
|
88
|
+
forma
|
|
89
|
+
formerly
|
|
90
|
+
fosmid
|
|
91
|
+
fossils
|
|
92
|
+
fox
|
|
93
|
+
fr
|
|
94
|
+
fragments
|
|
95
|
+
from
|
|
96
|
+
fungal
|
|
97
|
+
ge
|
|
98
|
+
gen
|
|
99
|
+
genera
|
|
100
|
+
generic
|
|
101
|
+
genus
|
|
102
|
+
goes
|
|
103
|
+
group
|
|
104
|
+
he
|
|
105
|
+
host
|
|
106
|
+
hu
|
|
107
|
+
hybrid
|
|
108
|
+
id
|
|
109
|
+
im
|
|
110
|
+
in
|
|
111
|
+
incertae
|
|
112
|
+
ing
|
|
113
|
+
is
|
|
114
|
+
it
|
|
115
|
+
jejuni-like
|
|
116
|
+
jr
|
|
117
|
+
kg
|
|
118
|
+
la
|
|
119
|
+
large
|
|
120
|
+
largest
|
|
121
|
+
leech
|
|
122
|
+
leukemia-related
|
|
123
|
+
libraries
|
|
124
|
+
longer
|
|
125
|
+
luciferase
|
|
126
|
+
magnified
|
|
127
|
+
marker
|
|
128
|
+
mc
|
|
129
|
+
morphology
|
|
130
|
+
ms
|
|
131
|
+
my
|
|
132
|
+
name
|
|
133
|
+
names
|
|
134
|
+
new
|
|
135
|
+
nf
|
|
136
|
+
ng
|
|
137
|
+
nm
|
|
138
|
+
non
|
|
139
|
+
not
|
|
140
|
+
notes
|
|
141
|
+
nov
|
|
142
|
+
nr
|
|
143
|
+
ns
|
|
144
|
+
occurs
|
|
145
|
+
of
|
|
146
|
+
on
|
|
147
|
+
one
|
|
148
|
+
op
|
|
149
|
+
or
|
|
150
|
+
oral
|
|
151
|
+
other
|
|
152
|
+
ox
|
|
153
|
+
parasite
|
|
154
|
+
parasites
|
|
155
|
+
part
|
|
156
|
+
phylotype
|
|
157
|
+
pipefish
|
|
158
|
+
pl
|
|
159
|
+
plasmid
|
|
160
|
+
pomfret
|
|
161
|
+
population
|
|
162
|
+
populations
|
|
163
|
+
pr
|
|
164
|
+
probes
|
|
165
|
+
prophage
|
|
166
|
+
pt
|
|
167
|
+
queens
|
|
168
|
+
red
|
|
169
|
+
reference
|
|
170
|
+
references
|
|
171
|
+
regarded
|
|
172
|
+
region
|
|
173
|
+
regions
|
|
174
|
+
related
|
|
175
|
+
reported
|
|
176
|
+
resistance
|
|
177
|
+
retroviruses
|
|
178
|
+
revised
|
|
179
|
+
rhodolith
|
|
180
|
+
same
|
|
181
|
+
sample
|
|
182
|
+
samples
|
|
183
|
+
scales
|
|
184
|
+
se
|
|
185
|
+
sea
|
|
186
|
+
seahorse
|
|
187
|
+
sec
|
|
188
|
+
secondary
|
|
189
|
+
sect
|
|
190
|
+
section
|
|
191
|
+
see
|
|
192
|
+
seed
|
|
193
|
+
seems
|
|
194
|
+
segment
|
|
195
|
+
series
|
|
196
|
+
show
|
|
197
|
+
shuttle
|
|
198
|
+
sinus
|
|
199
|
+
so
|
|
200
|
+
soil
|
|
201
|
+
south
|
|
202
|
+
species
|
|
203
|
+
specimens
|
|
204
|
+
ss
|
|
205
|
+
st
|
|
206
|
+
strains
|
|
207
|
+
subsp
|
|
208
|
+
subspec
|
|
209
|
+
subsp-nov
|
|
210
|
+
summarized
|
|
211
|
+
symbiont
|
|
212
|
+
symbionts
|
|
213
|
+
synthase
|
|
214
|
+
taken
|
|
215
|
+
taxon
|
|
216
|
+
that
|
|
217
|
+
the
|
|
218
|
+
these
|
|
219
|
+
thin
|
|
220
|
+
this
|
|
221
|
+
three
|
|
222
|
+
to
|
|
223
|
+
towards
|
|
224
|
+
trapping
|
|
225
|
+
two
|
|
226
|
+
ty
|
|
227
|
+
type
|
|
228
|
+
und
|
|
229
|
+
under
|
|
230
|
+
unpublished
|
|
231
|
+
up
|
|
232
|
+
us
|
|
233
|
+
usually
|
|
234
|
+
van
|
|
235
|
+
var
|
|
236
|
+
variable
|
|
237
|
+
variant
|
|
238
|
+
variety
|
|
239
|
+
vector
|
|
240
|
+
vi
|
|
241
|
+
virus
|
|
242
|
+
von
|
|
243
|
+
voyager
|
|
244
|
+
was
|
|
245
|
+
waters
|
|
246
|
+
we
|
|
247
|
+
were
|
|
248
|
+
with
|
|
249
|
+
wrote
|
|
250
|
+
xx
|
|
251
|
+
zur
|
data/lib/dwca_hunter.rb
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "biodiversity"
|
|
3
4
|
require "logger"
|
|
4
5
|
require "fileutils"
|
|
5
6
|
require "uri"
|
|
6
7
|
require "tmpdir"
|
|
7
8
|
require "net/http"
|
|
8
9
|
require "json"
|
|
10
|
+
require "htmlentities"
|
|
9
11
|
require "dwc_archive"
|
|
10
12
|
require "dwca_hunter/resource"
|
|
11
13
|
require "rest_client"
|
data/lib/dwca_hunter/encoding.rb
CHANGED
|
@@ -1,12 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module DwcaHunter
|
|
4
|
+
# Encoding module fixes encoding issues with data
|
|
2
5
|
module Encoding
|
|
3
6
|
def self.latin1_to_utf8(file_path)
|
|
4
|
-
new_file = file_path +
|
|
5
|
-
puts "Creating
|
|
6
|
-
r = open(file_path)
|
|
7
|
-
w = open(new_file,
|
|
7
|
+
new_file = file_path + ".utf_8"
|
|
8
|
+
puts "Creating #{new_file}"
|
|
9
|
+
r = File.open(file_path)
|
|
10
|
+
w = File.open(new_file, "w:utf-8")
|
|
11
|
+
he = HTMLEntities.new
|
|
8
12
|
r.each do |l|
|
|
9
|
-
l
|
|
13
|
+
l = l
|
|
14
|
+
l = l.encode("UTF-8", "ISO-8859-1", invalid: :replace, replace: "?")
|
|
15
|
+
l = he.decode(l)
|
|
10
16
|
w.write l
|
|
11
17
|
end
|
|
12
18
|
r.close
|
|
@@ -1,13 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module DwcaHunter
|
|
4
|
+
# ResourceMammalSpecies converts "Mammal Species of the World" data
|
|
5
|
+
# to DarwinCore Archive file
|
|
2
6
|
class ResourceMammalSpecies < DwcaHunter::Resource
|
|
3
7
|
def initialize(opts = {})
|
|
8
|
+
@parser = ScientificNameParser.new
|
|
9
|
+
@black_sp = black_species
|
|
4
10
|
@command = "mammal-species"
|
|
5
11
|
@title = "The Mammal Species of The World"
|
|
6
12
|
@uuid = "464dafec-1037-432d-8449-c0b309e0a030"
|
|
7
13
|
@data = []
|
|
8
14
|
@extensions = []
|
|
9
15
|
@count = 1
|
|
10
|
-
@clades = {"Mammalia" => { rank: "class", id: @count}}
|
|
16
|
+
@clades = { "Mammalia" => { rank: "class", id: @count } }
|
|
11
17
|
@url = "http://www.departments.bucknell.edu"\
|
|
12
18
|
"/biology/resources/msw3/export.asp"
|
|
13
19
|
@download_path = File.join(Dir.tmpdir, "dwca_hunter",
|
|
@@ -20,14 +26,14 @@ module DwcaHunter
|
|
|
20
26
|
end
|
|
21
27
|
|
|
22
28
|
def make_dwca
|
|
23
|
-
DwcaHunter
|
|
29
|
+
DwcaHunter.logger_write(object_id, "Extracting data")
|
|
24
30
|
encode
|
|
25
31
|
collect_data
|
|
26
32
|
generate_dwca
|
|
27
33
|
end
|
|
28
34
|
|
|
29
35
|
def download
|
|
30
|
-
DwcaHunter
|
|
36
|
+
DwcaHunter.logger_write(object_id, "Downloading file -- "\
|
|
31
37
|
"it will take some time...")
|
|
32
38
|
dlr = DwcaHunter::Downloader.new(url, @download_path)
|
|
33
39
|
dlr.download
|
|
@@ -47,8 +53,7 @@ module DwcaHunter
|
|
|
47
53
|
end
|
|
48
54
|
|
|
49
55
|
def generate_dwca
|
|
50
|
-
DwcaHunter
|
|
51
|
-
'Creating DarwinCore Archive file')
|
|
56
|
+
DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
|
|
52
57
|
core_init
|
|
53
58
|
extensions_init
|
|
54
59
|
eml_init
|
|
@@ -64,45 +69,63 @@ module DwcaHunter
|
|
|
64
69
|
return if rec[:commonname].to_s == ""
|
|
65
70
|
taxon_id = taxon[0]
|
|
66
71
|
lang = "en"
|
|
67
|
-
name = rec[:commonname].
|
|
72
|
+
name = rec[:commonname].tr("\u{0092}", "'")
|
|
68
73
|
@extensions[0][:data] << [taxon_id, name, lang]
|
|
69
|
-
|
|
70
74
|
end
|
|
71
75
|
|
|
76
|
+
# rubocop:disable Metrics/AbcSize
|
|
77
|
+
|
|
72
78
|
def process_synonyms(rec, taxon)
|
|
73
79
|
accepted_id = taxon[0]
|
|
74
80
|
parent_id = taxon[2]
|
|
75
81
|
rank = taxon[-1]
|
|
76
|
-
return unless [
|
|
82
|
+
return unless %w[species subspecies].include? rank
|
|
77
83
|
synonyms = rec[:synonyms].gsub(/\.$/, "").
|
|
78
|
-
|
|
84
|
+
gsub(%r{<[/ibsup]+>}, "").gsub(/[\s]+/, " ").split(";")
|
|
79
85
|
synonyms = synonyms.map(&:strip)
|
|
80
|
-
synonyms
|
|
81
|
-
next if s
|
|
82
|
-
if s
|
|
83
|
-
s = rec[:genus] + " " + s
|
|
84
|
-
end
|
|
86
|
+
synonyms.map do |s|
|
|
87
|
+
next if s =~ /<u>/
|
|
88
|
+
s = rec[:genus] + " " + s if s =~ /^[a-z]/
|
|
85
89
|
@count += 1
|
|
86
90
|
id = @count
|
|
87
|
-
|
|
91
|
+
if real_name?(s)
|
|
92
|
+
@core << [id, nil, parent_id, accepted_id, s, "synonym", rank]
|
|
93
|
+
else
|
|
94
|
+
puts "Rejected: #{s}"
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# rubocop:enable Metrics/AbcSize
|
|
100
|
+
|
|
101
|
+
def real_name?(str)
|
|
102
|
+
parsed = @parser.parse(str)[:scientificName]
|
|
103
|
+
return false unless parsed[:parsed]
|
|
104
|
+
epithets = parsed[:canonical].split(" ")[1..-1]
|
|
105
|
+
return false if epithets.nil? || epithets.empty?
|
|
106
|
+
epithets.each do |e|
|
|
107
|
+
return false if @black_sp[e]
|
|
88
108
|
end
|
|
109
|
+
true
|
|
89
110
|
end
|
|
90
111
|
|
|
91
|
-
def process_name(rec
|
|
92
|
-
name =[@core.last[4], rec[:author], rec[:date]]
|
|
93
|
-
@core.last[4] = name.join(" ").gsub(/
|
|
112
|
+
def process_name(rec)
|
|
113
|
+
name = [@core.last[4], rec[:author], rec[:date]]
|
|
114
|
+
@core.last[4] = name.join(" ").gsub(%r{<[/ibsup]+>}, "").
|
|
115
|
+
gsub(/[\s]+/, " ").strip
|
|
94
116
|
@core.last[1] = rec[:id]
|
|
95
117
|
end
|
|
96
118
|
|
|
119
|
+
# rubocop:disable Metrics/AbcSize
|
|
120
|
+
|
|
97
121
|
def process_hierarchy(rec)
|
|
98
122
|
parent_id = @clades["Mammalia"][:id]
|
|
99
123
|
is_row_rank = false
|
|
100
|
-
[
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
is_row_rank = true if rank == rec[:taxonlevel].downcase.to_sym
|
|
124
|
+
%i[order suborder infraorder superfamily family
|
|
125
|
+
subfamily tribe genus subgenus species subspecies].each do |rank|
|
|
126
|
+
is_row_rank = true if rank == rec[:taxonlevel].downcase.to_sym
|
|
104
127
|
clade = rec[rank]
|
|
105
|
-
clade = clade.capitalize if clade
|
|
128
|
+
clade = clade.capitalize if clade =~ /^[A-Z]+$/
|
|
106
129
|
next if clade.to_s == ""
|
|
107
130
|
clade_id = nil
|
|
108
131
|
clade = adjust_clade(rec, rank, clade)
|
|
@@ -114,16 +137,17 @@ module DwcaHunter
|
|
|
114
137
|
@clades[clade] = { id: clade_id, rank: rank }
|
|
115
138
|
@core << [clade_id, nil, parent_id, clade_id, clade, nil, rank.to_s]
|
|
116
139
|
if is_row_rank
|
|
117
|
-
process_name(rec
|
|
140
|
+
process_name(rec)
|
|
118
141
|
return @core.last
|
|
119
142
|
end
|
|
120
143
|
end
|
|
121
144
|
parent_id = clade_id
|
|
122
145
|
end
|
|
123
146
|
end
|
|
147
|
+
# rubocop:enable Metrics/AbcSize
|
|
124
148
|
|
|
125
149
|
def adjust_clade(rec, rank, clade)
|
|
126
|
-
if [
|
|
150
|
+
if %i[species subspecies].include? rank
|
|
127
151
|
clade = [rec[:genus], rec[:species]]
|
|
128
152
|
clade << rec[:subspecies] if rank == :subspecies
|
|
129
153
|
clade.join(" ").gsub(/[\s]+/, " ").strip
|
|
@@ -140,13 +164,13 @@ module DwcaHunter
|
|
|
140
164
|
{ first_name: "Don",
|
|
141
165
|
last_name: "Wilson" },
|
|
142
166
|
{ first_name: "DeeAnn",
|
|
143
|
-
last_name: "Reader" }
|
|
144
|
-
|
|
167
|
+
last_name: "Reader" }
|
|
168
|
+
],
|
|
145
169
|
metadata_providers: [
|
|
146
170
|
{ first_name: "Dmitry",
|
|
147
171
|
last_name: "Mozzherin",
|
|
148
172
|
email: "dmozzherin@gmail.com" }
|
|
149
|
-
|
|
173
|
+
],
|
|
150
174
|
abstract: "Mammal Species of the World, 3rd edition (MSW3) is "\
|
|
151
175
|
"a database of mammalian taxonomy, based upon the 2005 book "\
|
|
152
176
|
"Mammal Species of the World. A Taxonomic and Geographic Reference "\
|
|
@@ -156,24 +180,32 @@ module DwcaHunter
|
|
|
156
180
|
end
|
|
157
181
|
|
|
158
182
|
def core_init
|
|
159
|
-
@core = [[
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
183
|
+
@core = [["http://rs.tdwg.org/dwc/terms/taxonID",
|
|
184
|
+
"http://globalnames.org/terms/localID",
|
|
185
|
+
"http://rs.tdwg.org/dwc/terms/parentNameUsageID",
|
|
186
|
+
"http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
|
|
187
|
+
"http://rs.tdwg.org/dwc/terms/scientificName",
|
|
188
|
+
"http://rs.tdwg.org/dwc/terms/taxonomicStatus",
|
|
189
|
+
"http://rs.tdwg.org/dwc/terms/taxonRank"]]
|
|
166
190
|
m = @clades["Mammalia"]
|
|
167
191
|
@core << [m[:id], nil, nil, m[:id], "Mammalia", nil, "class"]
|
|
168
192
|
end
|
|
169
193
|
|
|
194
|
+
def black_species
|
|
195
|
+
res = {}
|
|
196
|
+
path = File.join(__dir__, "..", "..", "..", "files", "species-black.txt")
|
|
197
|
+
File.open(path).each do |l|
|
|
198
|
+
res[l.strip] = 1
|
|
199
|
+
end
|
|
200
|
+
res
|
|
201
|
+
end
|
|
202
|
+
|
|
170
203
|
def extensions_init
|
|
171
|
-
@extensions << { data: [[
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
file_name:
|
|
175
|
-
row_type:
|
|
176
|
-
}
|
|
204
|
+
@extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
|
|
205
|
+
"http://rs.tdwg.org/dwc/terms/vernacularName",
|
|
206
|
+
"http://purl.org/dc/terms/language"]],
|
|
207
|
+
file_name: "vernacular_names.txt",
|
|
208
|
+
row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }
|
|
177
209
|
end
|
|
178
210
|
end
|
|
179
211
|
end
|
|
@@ -5,7 +5,7 @@ module DwcaHunter
|
|
|
5
5
|
@problems_file = open('problems.txt', 'w:utf-8')
|
|
6
6
|
@command = "wikispecies"
|
|
7
7
|
@title = 'Wikispecies'
|
|
8
|
-
@url = 'http://dumps.wikimedia.org/specieswiki/latest/'
|
|
8
|
+
@url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
|
|
9
9
|
'specieswiki-latest-pages-articles.xml.bz2'
|
|
10
10
|
@url = opts[:url] if opts[:url]
|
|
11
11
|
@uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
|
|
@@ -347,4 +347,3 @@ module DwcaHunter
|
|
|
347
347
|
|
|
348
348
|
end
|
|
349
349
|
end
|
|
350
|
-
|
data/lib/dwca_hunter/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: dwca_hunter
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.0
|
|
4
|
+
version: 0.5.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Dmitry Mozzherin
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2018-
|
|
11
|
+
date: 2018-08-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: biodiversity
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '3.5'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '3.5'
|
|
13
27
|
- !ruby/object:Gem::Dependency
|
|
14
28
|
name: dwc-archive
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -38,6 +52,20 @@ dependencies:
|
|
|
38
52
|
- - "~>"
|
|
39
53
|
- !ruby/object:Gem::Version
|
|
40
54
|
version: '0.5'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: htmlentities
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - "~>"
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '4.3'
|
|
62
|
+
type: :runtime
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - "~>"
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '4.3'
|
|
41
69
|
- !ruby/object:Gem::Dependency
|
|
42
70
|
name: nokogiri
|
|
43
71
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -204,6 +232,7 @@ files:
|
|
|
204
232
|
- files/birdlife_7.csv
|
|
205
233
|
- files/fishbase_taxon_cache.tsv
|
|
206
234
|
- files/reptile_checklist_2014_12.csv
|
|
235
|
+
- files/species-black.txt
|
|
207
236
|
- lib/dwca_hunter.rb
|
|
208
237
|
- lib/dwca_hunter/downloader.rb
|
|
209
238
|
- lib/dwca_hunter/encoding.rb
|
|
@@ -219,7 +248,6 @@ files:
|
|
|
219
248
|
- lib/dwca_hunter/resources/opentree.rb
|
|
220
249
|
- lib/dwca_hunter/resources/reptiles_checklist.rb
|
|
221
250
|
- lib/dwca_hunter/resources/wikispecies.rb
|
|
222
|
-
- lib/dwca_hunter/resources/worms.rb
|
|
223
251
|
- lib/dwca_hunter/url.rb
|
|
224
252
|
- lib/dwca_hunter/version.rb
|
|
225
253
|
- lib/dwca_hunter/xml.rb
|
|
@@ -1,176 +0,0 @@
|
|
|
1
|
-
# encoding: utf-8
|
|
2
|
-
module DwcaHunter
|
|
3
|
-
class ResourceWoRMS < DwcaHunter::Resource
|
|
4
|
-
def initialize(opts = {})
|
|
5
|
-
@command = 'worms'
|
|
6
|
-
@title = 'WoRMS'
|
|
7
|
-
@url = 'http://content60.eol.org/resources/26.tar.gz'
|
|
8
|
-
@uuid = '9d27a7ad-2e6a-4597-a79b-23fb3b2f8284'
|
|
9
|
-
@download_path = File.join(Dir.tmpdir,
|
|
10
|
-
'dwca_hunter',
|
|
11
|
-
'worms',
|
|
12
|
-
'data.tar.gz')
|
|
13
|
-
@fields = ['dc:identifier',
|
|
14
|
-
'dc:source',
|
|
15
|
-
'dwc:Kingdom',
|
|
16
|
-
'dwc:Phylum',
|
|
17
|
-
'dwc:Class',
|
|
18
|
-
'dwc:Order',
|
|
19
|
-
'dwc:Family',
|
|
20
|
-
'dwc:Genus',
|
|
21
|
-
'dwc:ScientificName']
|
|
22
|
-
@rank = { 1 => 'kingdom',
|
|
23
|
-
2 => 'phylum',
|
|
24
|
-
3 => 'class',
|
|
25
|
-
4 => 'order',
|
|
26
|
-
5 => 'family',
|
|
27
|
-
6 => 'genus',
|
|
28
|
-
7 => 'species' }
|
|
29
|
-
@known_paths = {}
|
|
30
|
-
@data = []
|
|
31
|
-
@extensions = []
|
|
32
|
-
@extensions << { data: [[
|
|
33
|
-
'http://rs.tdwg.org/dwc/terms/taxonId',
|
|
34
|
-
'http://rs.tdwg.org/dwc/terms/scientificName']],
|
|
35
|
-
file_name: 'synonyms.txt' }
|
|
36
|
-
@re = {
|
|
37
|
-
cdata: %r#\<\!\[CDATA\[(.*)\]\]\>#
|
|
38
|
-
}
|
|
39
|
-
@core = [[
|
|
40
|
-
'http://rs.tdwg.org/dwc/terms/taxonID',
|
|
41
|
-
'http://purl.org/dc/terms/parentNameUsageID',
|
|
42
|
-
'http://purl.org/dc/terms/source',
|
|
43
|
-
'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
|
|
44
|
-
'http://purl.org/dc/terms/scientificName',
|
|
45
|
-
'http://purl.org/dc/terms/taxonRank']]
|
|
46
|
-
super
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
def unpack
|
|
50
|
-
unpack_tar
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
def make_dwca
|
|
54
|
-
collect_data
|
|
55
|
-
make_core_data
|
|
56
|
-
generate_dwca
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
private
|
|
60
|
-
|
|
61
|
-
def collect_data
|
|
62
|
-
DwcaHunter::logger_write(self.object_id, 'Traversing xml file...')
|
|
63
|
-
xml_file = File.join(@download_dir, '26.xml')
|
|
64
|
-
f = open(xml_file, 'r:utf-8')
|
|
65
|
-
in_taxon = false
|
|
66
|
-
taxon = nil
|
|
67
|
-
count = 0
|
|
68
|
-
Nokogiri::XML::Reader(f).each do |node|
|
|
69
|
-
if !in_taxon && node.name == 'taxon'
|
|
70
|
-
in_taxon = true
|
|
71
|
-
taxon = {}
|
|
72
|
-
@fields.each { |field| taxon[field.to_sym] = nil }
|
|
73
|
-
taxon[:synonyms] = []
|
|
74
|
-
elsif in_taxon && node.name == 'taxon'
|
|
75
|
-
in_taxon = false
|
|
76
|
-
@data << taxon
|
|
77
|
-
taxon = nil
|
|
78
|
-
count += 1
|
|
79
|
-
if count % BATCH_SIZE == 0
|
|
80
|
-
DwcaHunter::logger_write(self.object_id,
|
|
81
|
-
"Extracted %s taxons" % count)
|
|
82
|
-
end
|
|
83
|
-
elsif in_taxon
|
|
84
|
-
item = node.name.to_sym
|
|
85
|
-
if taxon.has_key?(item) && !taxon[item]
|
|
86
|
-
text = node.inner_xml
|
|
87
|
-
if cdata = text.match(@re[:cdata])
|
|
88
|
-
text = cdata[1]
|
|
89
|
-
else
|
|
90
|
-
text = DwcaHunter::XML.unescape(text)
|
|
91
|
-
end
|
|
92
|
-
taxon[item] = text
|
|
93
|
-
elsif node.name == 'synonym' &&
|
|
94
|
-
(cdata = node.inner_xml.match(@re[:cdata]))
|
|
95
|
-
taxon[:synonyms] << cdata[1]
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
def get_gn_id(path_string)
|
|
102
|
-
gn_uuid = UUID.create_v5(path_string, GNA_NAMESPACE)
|
|
103
|
-
id = Base64.urlsafe_encode64(gn_uuid.raw_bytes)[0..-3]
|
|
104
|
-
"gn:" + id
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
def make_core_data
|
|
108
|
-
DwcaHunter::logger_write(self.object_id, 'Creating core data')
|
|
109
|
-
@data.each_with_index do |taxa, i|
|
|
110
|
-
if i % BATCH_SIZE == 0
|
|
111
|
-
DwcaHunter::logger_write(self.object_id,
|
|
112
|
-
'Traversing %s species for core' % i)
|
|
113
|
-
end
|
|
114
|
-
path = get_path(taxa)
|
|
115
|
-
parent_id = get_gn_id(path.join('|'))
|
|
116
|
-
@core << [taxa[:'dc:identifier'],
|
|
117
|
-
parent_id, taxa[:'dc:source'],
|
|
118
|
-
nil,
|
|
119
|
-
taxa[:'dwc:ScientificName'],
|
|
120
|
-
'species']
|
|
121
|
-
|
|
122
|
-
taxa[:synonyms].each do |synonym|
|
|
123
|
-
@extensions[0][:data] << [taxa[:'dc:identifier'], synonym]
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
until path.empty?
|
|
127
|
-
path_string = path.join("|")
|
|
128
|
-
unless @known_paths[path_string]
|
|
129
|
-
@known_paths[path_string] = 1
|
|
130
|
-
parent_id = (path.size == 1) ?
|
|
131
|
-
nil :
|
|
132
|
-
get_gn_id([path[0..-2]].join('|'))
|
|
133
|
-
id = get_gn_id(path_string)
|
|
134
|
-
@core << [id, parent_id, nil, nil, path[-1], @rank[path.size]]
|
|
135
|
-
end
|
|
136
|
-
path.pop
|
|
137
|
-
end
|
|
138
|
-
end
|
|
139
|
-
end
|
|
140
|
-
|
|
141
|
-
def get_path(taxa)
|
|
142
|
-
path = []
|
|
143
|
-
@fields[2..-2].each do |field|
|
|
144
|
-
path << taxa[field.to_sym]
|
|
145
|
-
end
|
|
146
|
-
path
|
|
147
|
-
end
|
|
148
|
-
|
|
149
|
-
def generate_dwca
|
|
150
|
-
DwcaHunter::logger_write(self.object_id,
|
|
151
|
-
'Creating DarwinCore Archive file')
|
|
152
|
-
@eml = {
|
|
153
|
-
id: @uuid,
|
|
154
|
-
title: @title,
|
|
155
|
-
authors: [
|
|
156
|
-
{ email: 'info@marinespecies.org',
|
|
157
|
-
url: 'http://www.marinespecies.org' }
|
|
158
|
-
],
|
|
159
|
-
metadata_providers: [
|
|
160
|
-
{ first_name: 'Dmitry',
|
|
161
|
-
last_name: 'Mozzherin',
|
|
162
|
-
email: 'dmozzherin@gmail.com' }
|
|
163
|
-
],
|
|
164
|
-
abstract: 'The aim of a World Register of Marine Species (WoRMS) ' +
|
|
165
|
-
'is to provide an authoritative and comprehensive list ' +
|
|
166
|
-
'of names of marine organisms, including information ' +
|
|
167
|
-
'on synonymy. While highest priority goes to valid ' +
|
|
168
|
-
'names, other names in use are included so that this ' +
|
|
169
|
-
'register can serve as a guide to interpret taxonomic ' +
|
|
170
|
-
'literature.',
|
|
171
|
-
}
|
|
172
|
-
super
|
|
173
|
-
end
|
|
174
|
-
end
|
|
175
|
-
end
|
|
176
|
-
|