dwca_hunter 0.5.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a7db63e8ed5cb5d28892c4f43adaa015e52ae012a0363b6fe50aaa230038f04
4
- data.tar.gz: 3b8bde91aeec03e927ec7811c0ccfba4c411e5f1a3c7521415bc359add68b8e1
3
+ metadata.gz: e30e9b34ea1c46b021bd3c2ec66ccad4996d4a921c7ce78791b84940bd239f05
4
+ data.tar.gz: 1be0e7119fd38094f94a53d71460a8f97a59f2e46a5e9740b814f5dcc97b42cd
5
5
  SHA512:
6
- metadata.gz: 2992334297d7add2ad3875180b080c7278b7ab934440a5376a8fffb6943bcef9c585187ec67e1bf8737405e7153222bd65df494e4805ace843cfbfd07adf73c1
7
- data.tar.gz: 91218db042979509afd377008df68c82d04a08fe39b653119cdbe3b187a27202e5da11f173310617535c2acabc1aef65aec142d4e6226d0abb250dcec2d38eab
6
+ metadata.gz: df1d9bebe191ebf8ae72d601f05374edeaeffbb627d08d7981da582559439dd9ad173656d78a7c88b3a9765562efa8a7095c4eaa5ed0f1f3ef856be94f990b63
7
+ data.tar.gz: 703bbf2d197a55a8d4e8510e940f77562540983bb5eddc7a7496ed77af5e610ce0ce5d95a1faa25878a8f8b12a17613808898453a0bd6b7aa19a87dc7c5f000e
data/CHANGELOG.md CHANGED
@@ -2,14 +2,23 @@
2
2
 
3
3
  ## Unreleased (placeholder for the next version)
4
4
 
5
+ ## [v0.5.1]
6
+
7
+ - Add [#11] clean up data for Mammal Species of the World
8
+ - Add [#10] show logs during processing
9
+
5
10
  ## [v0.5.0]
11
+
6
12
  - Add [#8] convert project to a `Ruby gem`
7
13
 
8
14
  ## Footnotes
9
15
 
10
16
  This document follows [changelog guidelines]
11
17
 
18
+ [#11]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/11
19
+ [#10]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/10
12
20
  [#8]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/8
21
+ [v0.5.1]: https://github.com/gnames/dwca_hunter/compare/v0.5.0...v0.5.1
13
22
  [v0.5.0]: https://github.com/gnames/dwca_hunter/compare/v0.4.0...v0.5.0
14
23
 
15
24
  [changelog guidelines]: https://github.com/olivierlacan/keep-a-changelog
data/Gemfile.lock CHANGED
@@ -1,9 +1,11 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- dwca_hunter (0.5.0)
4
+ dwca_hunter (0.5.1)
5
+ biodiversity (~> 3.5)
5
6
  dwc-archive (~> 1.0)
6
7
  gn_uuid (~> 0.5)
8
+ htmlentities (~> 4.3)
7
9
  nokogiri (~> 1.8)
8
10
  rest-client (~> 2.0)
9
11
  thor (~> 0.19)
data/dwca_hunter.gemspec CHANGED
@@ -26,8 +26,10 @@ Gem::Specification.new do |gem|
26
26
  gem.executables = gem.files.grep(%r{^exe/}) { |f| File.basename(f) }
27
27
  gem.require_paths = ["lib"]
28
28
 
29
+ gem.add_dependency "biodiversity", "~> 3.5"
29
30
  gem.add_dependency "dwc-archive", "~> 1.0"
30
31
  gem.add_dependency "gn_uuid", "~> 0.5"
32
+ gem.add_dependency "htmlentities", "~> 4.3"
31
33
  gem.add_dependency "nokogiri", "~> 1.8"
32
34
  gem.add_dependency "rest-client", "~> 2.0"
33
35
  gem.add_dependency "thor", "~> 0.19"
data/exe/dwcahunter CHANGED
@@ -4,6 +4,8 @@
4
4
  require "thor"
5
5
  require "dwca_hunter"
6
6
 
7
+ DwcaHunter.logger = Logger.new($stdout)
8
+
7
9
  # DwcaHunterCLI determines command line interface to the gem functionality
8
10
  class DwcaHunterCLI < Thor
9
11
  desc "list [SEARCH]", "lists available sources to convert filtered by search"
@@ -0,0 +1,251 @@
1
+ ab
2
+ above
3
+ account
4
+ ad
5
+ ae
6
+ al
7
+ all
8
+ allocation
9
+ also
10
+ anamorph
11
+ and
12
+ ap
13
+ are
14
+ areas
15
+ as
16
+ associated
17
+ at
18
+ available
19
+ awaiting
20
+ backbone
21
+ bacterium
22
+ basin
23
+ bird
24
+ bis
25
+ but
26
+ bv
27
+ by
28
+ ca
29
+ calls
30
+ cf
31
+ circular
32
+ cited
33
+ clams
34
+ clonal
35
+ clone
36
+ comes
37
+ comment
38
+ comments
39
+ construct
40
+ contrasts
41
+ coralline
42
+ coronaviridae
43
+ culture
44
+ cv
45
+ da
46
+ dc
47
+ de
48
+ de-
49
+ def
50
+ degrading
51
+ del
52
+ depletion
53
+ der
54
+ des
55
+ described
56
+ diatom
57
+ differ
58
+ display
59
+ do
60
+ du
61
+ dummy
62
+ ecological
63
+ ectosymbiont
64
+ ed
65
+ either
66
+ em
67
+ en
68
+ endosymbiont
69
+ enrichment
70
+ environmental
71
+ et
72
+ ex
73
+ examples
74
+ excluding
75
+ expression
76
+ extend
77
+ eyes
78
+ faeces
79
+ figures
80
+ fl
81
+ flying
82
+ fn
83
+ fo
84
+ follows
85
+ foot
86
+ for
87
+ form
88
+ forma
89
+ formerly
90
+ fosmid
91
+ fossils
92
+ fox
93
+ fr
94
+ fragments
95
+ from
96
+ fungal
97
+ ge
98
+ gen
99
+ genera
100
+ generic
101
+ genus
102
+ goes
103
+ group
104
+ he
105
+ host
106
+ hu
107
+ hybrid
108
+ id
109
+ im
110
+ in
111
+ incertae
112
+ ing
113
+ is
114
+ it
115
+ jejuni-like
116
+ jr
117
+ kg
118
+ la
119
+ large
120
+ largest
121
+ leech
122
+ leukemia-related
123
+ libraries
124
+ longer
125
+ luciferase
126
+ magnified
127
+ marker
128
+ mc
129
+ morphology
130
+ ms
131
+ my
132
+ name
133
+ names
134
+ new
135
+ nf
136
+ ng
137
+ nm
138
+ non
139
+ not
140
+ notes
141
+ nov
142
+ nr
143
+ ns
144
+ occurs
145
+ of
146
+ on
147
+ one
148
+ op
149
+ or
150
+ oral
151
+ other
152
+ ox
153
+ parasite
154
+ parasites
155
+ part
156
+ phylotype
157
+ pipefish
158
+ pl
159
+ plasmid
160
+ pomfret
161
+ population
162
+ populations
163
+ pr
164
+ probes
165
+ prophage
166
+ pt
167
+ queens
168
+ red
169
+ reference
170
+ references
171
+ regarded
172
+ region
173
+ regions
174
+ related
175
+ reported
176
+ resistance
177
+ retroviruses
178
+ revised
179
+ rhodolith
180
+ same
181
+ sample
182
+ samples
183
+ scales
184
+ se
185
+ sea
186
+ seahorse
187
+ sec
188
+ secondary
189
+ sect
190
+ section
191
+ see
192
+ seed
193
+ seems
194
+ segment
195
+ series
196
+ show
197
+ shuttle
198
+ sinus
199
+ so
200
+ soil
201
+ south
202
+ species
203
+ specimens
204
+ ss
205
+ st
206
+ strains
207
+ subsp
208
+ subspec
209
+ subsp-nov
210
+ summarized
211
+ symbiont
212
+ symbionts
213
+ synthase
214
+ taken
215
+ taxon
216
+ that
217
+ the
218
+ these
219
+ thin
220
+ this
221
+ three
222
+ to
223
+ towards
224
+ trapping
225
+ two
226
+ ty
227
+ type
228
+ und
229
+ under
230
+ unpublished
231
+ up
232
+ us
233
+ usually
234
+ van
235
+ var
236
+ variable
237
+ variant
238
+ variety
239
+ vector
240
+ vi
241
+ virus
242
+ von
243
+ voyager
244
+ was
245
+ waters
246
+ we
247
+ were
248
+ with
249
+ wrote
250
+ xx
251
+ zur
data/lib/dwca_hunter.rb CHANGED
@@ -1,11 +1,13 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "biodiversity"
3
4
  require "logger"
4
5
  require "fileutils"
5
6
  require "uri"
6
7
  require "tmpdir"
7
8
  require "net/http"
8
9
  require "json"
10
+ require "htmlentities"
9
11
  require "dwc_archive"
10
12
  require "dwca_hunter/resource"
11
13
  require "rest_client"
@@ -1,12 +1,18 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module DwcaHunter
4
+ # Encoding module fixes encoding issues with data
2
5
  module Encoding
3
6
  def self.latin1_to_utf8(file_path)
4
- new_file = file_path + '.utf_8'
5
- puts "Creating %s" % new_file
6
- r = open(file_path)
7
- w = open(new_file, 'w:utf-8')
7
+ new_file = file_path + ".utf_8"
8
+ puts "Creating #{new_file}"
9
+ r = File.open(file_path)
10
+ w = File.open(new_file, "w:utf-8")
11
+ he = HTMLEntities.new
8
12
  r.each do |l|
9
- l.encode!('UTF-8', 'ISO-8859-1', invalid: :replace, replace: '?')
13
+ l = l
14
+ l = l.encode("UTF-8", "ISO-8859-1", invalid: :replace, replace: "?")
15
+ l = he.decode(l)
10
16
  w.write l
11
17
  end
12
18
  r.close
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ __END__
2
2
  module DwcaHunter
3
3
  class ResourceGNUB < DwcaHunter::Resource
4
4
  def initialize(opts = {})
@@ -98,4 +98,3 @@ module DwcaHunter
98
98
  end
99
99
  end
100
100
  end
101
-
@@ -1,13 +1,19 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module DwcaHunter
4
+ # ResourceMammalSpecies converts "Mammal Species of the World" data
5
+ # to DarwinCore Archive file
2
6
  class ResourceMammalSpecies < DwcaHunter::Resource
3
7
  def initialize(opts = {})
8
+ @parser = ScientificNameParser.new
9
+ @black_sp = black_species
4
10
  @command = "mammal-species"
5
11
  @title = "The Mammal Species of The World"
6
12
  @uuid = "464dafec-1037-432d-8449-c0b309e0a030"
7
13
  @data = []
8
14
  @extensions = []
9
15
  @count = 1
10
- @clades = {"Mammalia" => { rank: "class", id: @count}}
16
+ @clades = { "Mammalia" => { rank: "class", id: @count } }
11
17
  @url = "http://www.departments.bucknell.edu"\
12
18
  "/biology/resources/msw3/export.asp"
13
19
  @download_path = File.join(Dir.tmpdir, "dwca_hunter",
@@ -20,14 +26,14 @@ module DwcaHunter
20
26
  end
21
27
 
22
28
  def make_dwca
23
- DwcaHunter::logger_write(self.object_id, "Extracting data")
29
+ DwcaHunter.logger_write(object_id, "Extracting data")
24
30
  encode
25
31
  collect_data
26
32
  generate_dwca
27
33
  end
28
34
 
29
35
  def download
30
- DwcaHunter::logger_write(self.object_id, "Downloading file -- "\
36
+ DwcaHunter.logger_write(object_id, "Downloading file -- "\
31
37
  "it will take some time...")
32
38
  dlr = DwcaHunter::Downloader.new(url, @download_path)
33
39
  dlr.download
@@ -47,8 +53,7 @@ module DwcaHunter
47
53
  end
48
54
 
49
55
  def generate_dwca
50
- DwcaHunter::logger_write(self.object_id,
51
- 'Creating DarwinCore Archive file')
56
+ DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
52
57
  core_init
53
58
  extensions_init
54
59
  eml_init
@@ -64,45 +69,63 @@ module DwcaHunter
64
69
  return if rec[:commonname].to_s == ""
65
70
  taxon_id = taxon[0]
66
71
  lang = "en"
67
- name = rec[:commonname].gsub("\u{0092}", "'")
72
+ name = rec[:commonname].tr("\u{0092}", "'")
68
73
  @extensions[0][:data] << [taxon_id, name, lang]
69
-
70
74
  end
71
75
 
76
+ # rubocop:disable Metrics/AbcSize
77
+
72
78
  def process_synonyms(rec, taxon)
73
79
  accepted_id = taxon[0]
74
80
  parent_id = taxon[2]
75
81
  rank = taxon[-1]
76
- return unless ['species', 'subspecies'].include? rank
82
+ return unless %w[species subspecies].include? rank
77
83
  synonyms = rec[:synonyms].gsub(/\.$/, "").
78
- gsub(/<[\/ib]+>/, "").gsub(/[\s]+/, " ").split(";")
84
+ gsub(%r{<[/ibsup]+>}, "").gsub(/[\s]+/, " ").split(";")
79
85
  synonyms = synonyms.map(&:strip)
80
- synonyms = synonyms.map do |s|
81
- next if s.match(/<u>/)
82
- if s.match(/^[a-z]/)
83
- s = rec[:genus] + " " + s
84
- end
86
+ synonyms.map do |s|
87
+ next if s =~ /<u>/
88
+ s = rec[:genus] + " " + s if s =~ /^[a-z]/
85
89
  @count += 1
86
90
  id = @count
87
- @core << [id, nil, parent_id, accepted_id, s, "synonym", rank]
91
+ if real_name?(s)
92
+ @core << [id, nil, parent_id, accepted_id, s, "synonym", rank]
93
+ else
94
+ puts "Rejected: #{s}"
95
+ end
96
+ end
97
+ end
98
+
99
+ # rubocop:enable Metrics/AbcSize
100
+
101
+ def real_name?(str)
102
+ parsed = @parser.parse(str)[:scientificName]
103
+ return false unless parsed[:parsed]
104
+ epithets = parsed[:canonical].split(" ")[1..-1]
105
+ return false if epithets.nil? || epithets.empty?
106
+ epithets.each do |e|
107
+ return false if @black_sp[e]
88
108
  end
109
+ true
89
110
  end
90
111
 
91
- def process_name(rec, rank)
92
- name =[@core.last[4], rec[:author], rec[:date]]
93
- @core.last[4] = name.join(" ").gsub(/[\s]+/, " ").strip
112
+ def process_name(rec)
113
+ name = [@core.last[4], rec[:author], rec[:date]]
114
+ @core.last[4] = name.join(" ").gsub(%r{<[/ibsup]+>}, "").
115
+ gsub(/[\s]+/, " ").strip
94
116
  @core.last[1] = rec[:id]
95
117
  end
96
118
 
119
+ # rubocop:disable Metrics/AbcSize
120
+
97
121
  def process_hierarchy(rec)
98
122
  parent_id = @clades["Mammalia"][:id]
99
123
  is_row_rank = false
100
- [:order, :suborder, :infraorder, :superfamily, :family,
101
- :subfamily, :tribe, :genus, :subgenus,
102
- :species, :subspecies].each do |rank|
103
- is_row_rank = true if rank == rec[:taxonlevel].downcase.to_sym
124
+ %i[order suborder infraorder superfamily family
125
+ subfamily tribe genus subgenus species subspecies].each do |rank|
126
+ is_row_rank = true if rank == rec[:taxonlevel].downcase.to_sym
104
127
  clade = rec[rank]
105
- clade = clade.capitalize if clade.match(/^[A-Z]+$/)
128
+ clade = clade.capitalize if clade =~ /^[A-Z]+$/
106
129
  next if clade.to_s == ""
107
130
  clade_id = nil
108
131
  clade = adjust_clade(rec, rank, clade)
@@ -114,16 +137,17 @@ module DwcaHunter
114
137
  @clades[clade] = { id: clade_id, rank: rank }
115
138
  @core << [clade_id, nil, parent_id, clade_id, clade, nil, rank.to_s]
116
139
  if is_row_rank
117
- process_name(rec, rank)
140
+ process_name(rec)
118
141
  return @core.last
119
142
  end
120
143
  end
121
144
  parent_id = clade_id
122
145
  end
123
146
  end
147
+ # rubocop:enable Metrics/AbcSize
124
148
 
125
149
  def adjust_clade(rec, rank, clade)
126
- if [:species, :subspecies].include? rank
150
+ if %i[species subspecies].include? rank
127
151
  clade = [rec[:genus], rec[:species]]
128
152
  clade << rec[:subspecies] if rank == :subspecies
129
153
  clade.join(" ").gsub(/[\s]+/, " ").strip
@@ -140,13 +164,13 @@ module DwcaHunter
140
164
  { first_name: "Don",
141
165
  last_name: "Wilson" },
142
166
  { first_name: "DeeAnn",
143
- last_name: "Reader" },
144
- ],
167
+ last_name: "Reader" }
168
+ ],
145
169
  metadata_providers: [
146
170
  { first_name: "Dmitry",
147
171
  last_name: "Mozzherin",
148
172
  email: "dmozzherin@gmail.com" }
149
- ],
173
+ ],
150
174
  abstract: "Mammal Species of the World, 3rd edition (MSW3) is "\
151
175
  "a database of mammalian taxonomy, based upon the 2005 book "\
152
176
  "Mammal Species of the World. A Taxonomic and Geographic Reference "\
@@ -156,24 +180,32 @@ module DwcaHunter
156
180
  end
157
181
 
158
182
  def core_init
159
- @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
160
- 'http://globalnames.org/terms/localID',
161
- 'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
162
- 'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
163
- 'http://rs.tdwg.org/dwc/terms/scientificName',
164
- 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
165
- 'http://rs.tdwg.org/dwc/terms/taxonRank']]
183
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
184
+ "http://globalnames.org/terms/localID",
185
+ "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
186
+ "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
187
+ "http://rs.tdwg.org/dwc/terms/scientificName",
188
+ "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
189
+ "http://rs.tdwg.org/dwc/terms/taxonRank"]]
166
190
  m = @clades["Mammalia"]
167
191
  @core << [m[:id], nil, nil, m[:id], "Mammalia", nil, "class"]
168
192
  end
169
193
 
194
+ def black_species
195
+ res = {}
196
+ path = File.join(__dir__, "..", "..", "..", "files", "species-black.txt")
197
+ File.open(path).each do |l|
198
+ res[l.strip] = 1
199
+ end
200
+ res
201
+ end
202
+
170
203
  def extensions_init
171
- @extensions << { data: [['http://rs.tdwg.org/dwc/terms/taxonID',
172
- 'http://rs.tdwg.org/dwc/terms/vernacularName',
173
- 'http://purl.org/dc/terms/language']],
174
- file_name: 'vernacular_names.txt',
175
- row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
176
- }
204
+ @extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
205
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
206
+ "http://purl.org/dc/terms/language"]],
207
+ file_name: "vernacular_names.txt",
208
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }
177
209
  end
178
210
  end
179
211
  end
@@ -5,7 +5,7 @@ module DwcaHunter
5
5
  @problems_file = open('problems.txt', 'w:utf-8')
6
6
  @command = "wikispecies"
7
7
  @title = 'Wikispecies'
8
- @url = 'http://dumps.wikimedia.org/specieswiki/latest/' +
8
+ @url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
9
9
  'specieswiki-latest-pages-articles.xml.bz2'
10
10
  @url = opts[:url] if opts[:url]
11
11
  @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
@@ -347,4 +347,3 @@ module DwcaHunter
347
347
 
348
348
  end
349
349
  end
350
-
@@ -1,5 +1,5 @@
1
1
  module DwcaHunter
2
- VERSION = "0.5.0"
2
+ VERSION = "0.5.1"
3
3
 
4
4
  def self.version
5
5
  VERSION
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwca_hunter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-07-20 00:00:00.000000000 Z
11
+ date: 2018-08-04 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: biodiversity
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.5'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: dwc-archive
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -38,6 +52,20 @@ dependencies:
38
52
  - - "~>"
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0.5'
55
+ - !ruby/object:Gem::Dependency
56
+ name: htmlentities
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '4.3'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '4.3'
41
69
  - !ruby/object:Gem::Dependency
42
70
  name: nokogiri
43
71
  requirement: !ruby/object:Gem::Requirement
@@ -204,6 +232,7 @@ files:
204
232
  - files/birdlife_7.csv
205
233
  - files/fishbase_taxon_cache.tsv
206
234
  - files/reptile_checklist_2014_12.csv
235
+ - files/species-black.txt
207
236
  - lib/dwca_hunter.rb
208
237
  - lib/dwca_hunter/downloader.rb
209
238
  - lib/dwca_hunter/encoding.rb
@@ -219,7 +248,6 @@ files:
219
248
  - lib/dwca_hunter/resources/opentree.rb
220
249
  - lib/dwca_hunter/resources/reptiles_checklist.rb
221
250
  - lib/dwca_hunter/resources/wikispecies.rb
222
- - lib/dwca_hunter/resources/worms.rb
223
251
  - lib/dwca_hunter/url.rb
224
252
  - lib/dwca_hunter/version.rb
225
253
  - lib/dwca_hunter/xml.rb
@@ -1,176 +0,0 @@
1
- # encoding: utf-8
2
- module DwcaHunter
3
- class ResourceWoRMS < DwcaHunter::Resource
4
- def initialize(opts = {})
5
- @command = 'worms'
6
- @title = 'WoRMS'
7
- @url = 'http://content60.eol.org/resources/26.tar.gz'
8
- @uuid = '9d27a7ad-2e6a-4597-a79b-23fb3b2f8284'
9
- @download_path = File.join(Dir.tmpdir,
10
- 'dwca_hunter',
11
- 'worms',
12
- 'data.tar.gz')
13
- @fields = ['dc:identifier',
14
- 'dc:source',
15
- 'dwc:Kingdom',
16
- 'dwc:Phylum',
17
- 'dwc:Class',
18
- 'dwc:Order',
19
- 'dwc:Family',
20
- 'dwc:Genus',
21
- 'dwc:ScientificName']
22
- @rank = { 1 => 'kingdom',
23
- 2 => 'phylum',
24
- 3 => 'class',
25
- 4 => 'order',
26
- 5 => 'family',
27
- 6 => 'genus',
28
- 7 => 'species' }
29
- @known_paths = {}
30
- @data = []
31
- @extensions = []
32
- @extensions << { data: [[
33
- 'http://rs.tdwg.org/dwc/terms/taxonId',
34
- 'http://rs.tdwg.org/dwc/terms/scientificName']],
35
- file_name: 'synonyms.txt' }
36
- @re = {
37
- cdata: %r#\<\!\[CDATA\[(.*)\]\]\>#
38
- }
39
- @core = [[
40
- 'http://rs.tdwg.org/dwc/terms/taxonID',
41
- 'http://purl.org/dc/terms/parentNameUsageID',
42
- 'http://purl.org/dc/terms/source',
43
- 'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
44
- 'http://purl.org/dc/terms/scientificName',
45
- 'http://purl.org/dc/terms/taxonRank']]
46
- super
47
- end
48
-
49
- def unpack
50
- unpack_tar
51
- end
52
-
53
- def make_dwca
54
- collect_data
55
- make_core_data
56
- generate_dwca
57
- end
58
-
59
- private
60
-
61
- def collect_data
62
- DwcaHunter::logger_write(self.object_id, 'Traversing xml file...')
63
- xml_file = File.join(@download_dir, '26.xml')
64
- f = open(xml_file, 'r:utf-8')
65
- in_taxon = false
66
- taxon = nil
67
- count = 0
68
- Nokogiri::XML::Reader(f).each do |node|
69
- if !in_taxon && node.name == 'taxon'
70
- in_taxon = true
71
- taxon = {}
72
- @fields.each { |field| taxon[field.to_sym] = nil }
73
- taxon[:synonyms] = []
74
- elsif in_taxon && node.name == 'taxon'
75
- in_taxon = false
76
- @data << taxon
77
- taxon = nil
78
- count += 1
79
- if count % BATCH_SIZE == 0
80
- DwcaHunter::logger_write(self.object_id,
81
- "Extracted %s taxons" % count)
82
- end
83
- elsif in_taxon
84
- item = node.name.to_sym
85
- if taxon.has_key?(item) && !taxon[item]
86
- text = node.inner_xml
87
- if cdata = text.match(@re[:cdata])
88
- text = cdata[1]
89
- else
90
- text = DwcaHunter::XML.unescape(text)
91
- end
92
- taxon[item] = text
93
- elsif node.name == 'synonym' &&
94
- (cdata = node.inner_xml.match(@re[:cdata]))
95
- taxon[:synonyms] << cdata[1]
96
- end
97
- end
98
- end
99
- end
100
-
101
- def get_gn_id(path_string)
102
- gn_uuid = UUID.create_v5(path_string, GNA_NAMESPACE)
103
- id = Base64.urlsafe_encode64(gn_uuid.raw_bytes)[0..-3]
104
- "gn:" + id
105
- end
106
-
107
- def make_core_data
108
- DwcaHunter::logger_write(self.object_id, 'Creating core data')
109
- @data.each_with_index do |taxa, i|
110
- if i % BATCH_SIZE == 0
111
- DwcaHunter::logger_write(self.object_id,
112
- 'Traversing %s species for core' % i)
113
- end
114
- path = get_path(taxa)
115
- parent_id = get_gn_id(path.join('|'))
116
- @core << [taxa[:'dc:identifier'],
117
- parent_id, taxa[:'dc:source'],
118
- nil,
119
- taxa[:'dwc:ScientificName'],
120
- 'species']
121
-
122
- taxa[:synonyms].each do |synonym|
123
- @extensions[0][:data] << [taxa[:'dc:identifier'], synonym]
124
- end
125
-
126
- until path.empty?
127
- path_string = path.join("|")
128
- unless @known_paths[path_string]
129
- @known_paths[path_string] = 1
130
- parent_id = (path.size == 1) ?
131
- nil :
132
- get_gn_id([path[0..-2]].join('|'))
133
- id = get_gn_id(path_string)
134
- @core << [id, parent_id, nil, nil, path[-1], @rank[path.size]]
135
- end
136
- path.pop
137
- end
138
- end
139
- end
140
-
141
- def get_path(taxa)
142
- path = []
143
- @fields[2..-2].each do |field|
144
- path << taxa[field.to_sym]
145
- end
146
- path
147
- end
148
-
149
- def generate_dwca
150
- DwcaHunter::logger_write(self.object_id,
151
- 'Creating DarwinCore Archive file')
152
- @eml = {
153
- id: @uuid,
154
- title: @title,
155
- authors: [
156
- { email: 'info@marinespecies.org',
157
- url: 'http://www.marinespecies.org' }
158
- ],
159
- metadata_providers: [
160
- { first_name: 'Dmitry',
161
- last_name: 'Mozzherin',
162
- email: 'dmozzherin@gmail.com' }
163
- ],
164
- abstract: 'The aim of a World Register of Marine Species (WoRMS) ' +
165
- 'is to provide an authoritative and comprehensive list ' +
166
- 'of names of marine organisms, including information ' +
167
- 'on synonymy. While highest priority goes to valid ' +
168
- 'names, other names in use are included so that this ' +
169
- 'register can serve as a guide to interpret taxonomic ' +
170
- 'literature.',
171
- }
172
- super
173
- end
174
- end
175
- end
176
-