dwca_hunter 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a7db63e8ed5cb5d28892c4f43adaa015e52ae012a0363b6fe50aaa230038f04
4
- data.tar.gz: 3b8bde91aeec03e927ec7811c0ccfba4c411e5f1a3c7521415bc359add68b8e1
3
+ metadata.gz: e30e9b34ea1c46b021bd3c2ec66ccad4996d4a921c7ce78791b84940bd239f05
4
+ data.tar.gz: 1be0e7119fd38094f94a53d71460a8f97a59f2e46a5e9740b814f5dcc97b42cd
5
5
  SHA512:
6
- metadata.gz: 2992334297d7add2ad3875180b080c7278b7ab934440a5376a8fffb6943bcef9c585187ec67e1bf8737405e7153222bd65df494e4805ace843cfbfd07adf73c1
7
- data.tar.gz: 91218db042979509afd377008df68c82d04a08fe39b653119cdbe3b187a27202e5da11f173310617535c2acabc1aef65aec142d4e6226d0abb250dcec2d38eab
6
+ metadata.gz: df1d9bebe191ebf8ae72d601f05374edeaeffbb627d08d7981da582559439dd9ad173656d78a7c88b3a9765562efa8a7095c4eaa5ed0f1f3ef856be94f990b63
7
+ data.tar.gz: 703bbf2d197a55a8d4e8510e940f77562540983bb5eddc7a7496ed77af5e610ce0ce5d95a1faa25878a8f8b12a17613808898453a0bd6b7aa19a87dc7c5f000e
data/CHANGELOG.md CHANGED
@@ -2,14 +2,23 @@
2
2
 
3
3
  ## Unreleased (placeholder for the next version)
4
4
 
5
+ ## [v0.5.1]
6
+
7
+ - Add [#11] clean up data for Mammal Species of the World
8
+ - Add [#10] show logs during processing
9
+
5
10
  ## [v0.5.0]
11
+
6
12
  - Add [#8] convert project to a `Ruby gem`
7
13
 
8
14
  ## Footnotes
9
15
 
10
16
  This document follows [changelog guidelines]
11
17
 
18
+ [#11]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/11
19
+ [#10]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/10
12
20
  [#8]: https://github.com/GlobalNamesArchitecture/dwca_hunter/issues/8
21
+ [v0.5.1]: https://github.com/gnames/dwca_hunter/compare/v0.5.0...v0.5.1
13
22
  [v0.5.0]: https://github.com/gnames/dwca_hunter/compare/v0.4.0...v0.5.0
14
23
 
15
24
  [changelog guidelines]: https://github.com/olivierlacan/keep-a-changelog
data/Gemfile.lock CHANGED
@@ -1,9 +1,11 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- dwca_hunter (0.5.0)
4
+ dwca_hunter (0.5.1)
5
+ biodiversity (~> 3.5)
5
6
  dwc-archive (~> 1.0)
6
7
  gn_uuid (~> 0.5)
8
+ htmlentities (~> 4.3)
7
9
  nokogiri (~> 1.8)
8
10
  rest-client (~> 2.0)
9
11
  thor (~> 0.19)
data/dwca_hunter.gemspec CHANGED
@@ -26,8 +26,10 @@ Gem::Specification.new do |gem|
26
26
  gem.executables = gem.files.grep(%r{^exe/}) { |f| File.basename(f) }
27
27
  gem.require_paths = ["lib"]
28
28
 
29
+ gem.add_dependency "biodiversity", "~> 3.5"
29
30
  gem.add_dependency "dwc-archive", "~> 1.0"
30
31
  gem.add_dependency "gn_uuid", "~> 0.5"
32
+ gem.add_dependency "htmlentities", "~> 4.3"
31
33
  gem.add_dependency "nokogiri", "~> 1.8"
32
34
  gem.add_dependency "rest-client", "~> 2.0"
33
35
  gem.add_dependency "thor", "~> 0.19"
data/exe/dwcahunter CHANGED
@@ -4,6 +4,8 @@
4
4
  require "thor"
5
5
  require "dwca_hunter"
6
6
 
7
+ DwcaHunter.logger = Logger.new($stdout)
8
+
7
9
  # DwcaHunterCLI determines command line interface to the gem functionality
8
10
  class DwcaHunterCLI < Thor
9
11
  desc "list [SEARCH]", "lists available sources to convert filtered by search"
@@ -0,0 +1,251 @@
1
+ ab
2
+ above
3
+ account
4
+ ad
5
+ ae
6
+ al
7
+ all
8
+ allocation
9
+ also
10
+ anamorph
11
+ and
12
+ ap
13
+ are
14
+ areas
15
+ as
16
+ associated
17
+ at
18
+ available
19
+ awaiting
20
+ backbone
21
+ bacterium
22
+ basin
23
+ bird
24
+ bis
25
+ but
26
+ bv
27
+ by
28
+ ca
29
+ calls
30
+ cf
31
+ circular
32
+ cited
33
+ clams
34
+ clonal
35
+ clone
36
+ comes
37
+ comment
38
+ comments
39
+ construct
40
+ contrasts
41
+ coralline
42
+ coronaviridae
43
+ culture
44
+ cv
45
+ da
46
+ dc
47
+ de
48
+ de-
49
+ def
50
+ degrading
51
+ del
52
+ depletion
53
+ der
54
+ des
55
+ described
56
+ diatom
57
+ differ
58
+ display
59
+ do
60
+ du
61
+ dummy
62
+ ecological
63
+ ectosymbiont
64
+ ed
65
+ either
66
+ em
67
+ en
68
+ endosymbiont
69
+ enrichment
70
+ environmental
71
+ et
72
+ ex
73
+ examples
74
+ excluding
75
+ expression
76
+ extend
77
+ eyes
78
+ faeces
79
+ figures
80
+ fl
81
+ flying
82
+ fn
83
+ fo
84
+ follows
85
+ foot
86
+ for
87
+ form
88
+ forma
89
+ formerly
90
+ fosmid
91
+ fossils
92
+ fox
93
+ fr
94
+ fragments
95
+ from
96
+ fungal
97
+ ge
98
+ gen
99
+ genera
100
+ generic
101
+ genus
102
+ goes
103
+ group
104
+ he
105
+ host
106
+ hu
107
+ hybrid
108
+ id
109
+ im
110
+ in
111
+ incertae
112
+ ing
113
+ is
114
+ it
115
+ jejuni-like
116
+ jr
117
+ kg
118
+ la
119
+ large
120
+ largest
121
+ leech
122
+ leukemia-related
123
+ libraries
124
+ longer
125
+ luciferase
126
+ magnified
127
+ marker
128
+ mc
129
+ morphology
130
+ ms
131
+ my
132
+ name
133
+ names
134
+ new
135
+ nf
136
+ ng
137
+ nm
138
+ non
139
+ not
140
+ notes
141
+ nov
142
+ nr
143
+ ns
144
+ occurs
145
+ of
146
+ on
147
+ one
148
+ op
149
+ or
150
+ oral
151
+ other
152
+ ox
153
+ parasite
154
+ parasites
155
+ part
156
+ phylotype
157
+ pipefish
158
+ pl
159
+ plasmid
160
+ pomfret
161
+ population
162
+ populations
163
+ pr
164
+ probes
165
+ prophage
166
+ pt
167
+ queens
168
+ red
169
+ reference
170
+ references
171
+ regarded
172
+ region
173
+ regions
174
+ related
175
+ reported
176
+ resistance
177
+ retroviruses
178
+ revised
179
+ rhodolith
180
+ same
181
+ sample
182
+ samples
183
+ scales
184
+ se
185
+ sea
186
+ seahorse
187
+ sec
188
+ secondary
189
+ sect
190
+ section
191
+ see
192
+ seed
193
+ seems
194
+ segment
195
+ series
196
+ show
197
+ shuttle
198
+ sinus
199
+ so
200
+ soil
201
+ south
202
+ species
203
+ specimens
204
+ ss
205
+ st
206
+ strains
207
+ subsp
208
+ subspec
209
+ subsp-nov
210
+ summarized
211
+ symbiont
212
+ symbionts
213
+ synthase
214
+ taken
215
+ taxon
216
+ that
217
+ the
218
+ these
219
+ thin
220
+ this
221
+ three
222
+ to
223
+ towards
224
+ trapping
225
+ two
226
+ ty
227
+ type
228
+ und
229
+ under
230
+ unpublished
231
+ up
232
+ us
233
+ usually
234
+ van
235
+ var
236
+ variable
237
+ variant
238
+ variety
239
+ vector
240
+ vi
241
+ virus
242
+ von
243
+ voyager
244
+ was
245
+ waters
246
+ we
247
+ were
248
+ with
249
+ wrote
250
+ xx
251
+ zur
data/lib/dwca_hunter.rb CHANGED
@@ -1,11 +1,13 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "biodiversity"
3
4
  require "logger"
4
5
  require "fileutils"
5
6
  require "uri"
6
7
  require "tmpdir"
7
8
  require "net/http"
8
9
  require "json"
10
+ require "htmlentities"
9
11
  require "dwc_archive"
10
12
  require "dwca_hunter/resource"
11
13
  require "rest_client"
@@ -1,12 +1,18 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module DwcaHunter
4
+ # Encoding module fixes encoding issues with data
2
5
  module Encoding
3
6
  def self.latin1_to_utf8(file_path)
4
- new_file = file_path + '.utf_8'
5
- puts "Creating %s" % new_file
6
- r = open(file_path)
7
- w = open(new_file, 'w:utf-8')
7
+ new_file = file_path + ".utf_8"
8
+ puts "Creating #{new_file}"
9
+ r = File.open(file_path)
10
+ w = File.open(new_file, "w:utf-8")
11
+ he = HTMLEntities.new
8
12
  r.each do |l|
9
- l.encode!('UTF-8', 'ISO-8859-1', invalid: :replace, replace: '?')
13
+ l = l
14
+ l = l.encode("UTF-8", "ISO-8859-1", invalid: :replace, replace: "?")
15
+ l = he.decode(l)
10
16
  w.write l
11
17
  end
12
18
  r.close
@@ -1,4 +1,4 @@
1
- # encoding: utf-8
1
+ __END__
2
2
  module DwcaHunter
3
3
  class ResourceGNUB < DwcaHunter::Resource
4
4
  def initialize(opts = {})
@@ -98,4 +98,3 @@ module DwcaHunter
98
98
  end
99
99
  end
100
100
  end
101
-
@@ -1,13 +1,19 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module DwcaHunter
4
+ # ResourceMammalSpecies converts "Mammal Species of the World" data
5
+ # to DarwinCore Archive file
2
6
  class ResourceMammalSpecies < DwcaHunter::Resource
3
7
  def initialize(opts = {})
8
+ @parser = ScientificNameParser.new
9
+ @black_sp = black_species
4
10
  @command = "mammal-species"
5
11
  @title = "The Mammal Species of The World"
6
12
  @uuid = "464dafec-1037-432d-8449-c0b309e0a030"
7
13
  @data = []
8
14
  @extensions = []
9
15
  @count = 1
10
- @clades = {"Mammalia" => { rank: "class", id: @count}}
16
+ @clades = { "Mammalia" => { rank: "class", id: @count } }
11
17
  @url = "http://www.departments.bucknell.edu"\
12
18
  "/biology/resources/msw3/export.asp"
13
19
  @download_path = File.join(Dir.tmpdir, "dwca_hunter",
@@ -20,14 +26,14 @@ module DwcaHunter
20
26
  end
21
27
 
22
28
  def make_dwca
23
- DwcaHunter::logger_write(self.object_id, "Extracting data")
29
+ DwcaHunter.logger_write(object_id, "Extracting data")
24
30
  encode
25
31
  collect_data
26
32
  generate_dwca
27
33
  end
28
34
 
29
35
  def download
30
- DwcaHunter::logger_write(self.object_id, "Downloading file -- "\
36
+ DwcaHunter.logger_write(object_id, "Downloading file -- "\
31
37
  "it will take some time...")
32
38
  dlr = DwcaHunter::Downloader.new(url, @download_path)
33
39
  dlr.download
@@ -47,8 +53,7 @@ module DwcaHunter
47
53
  end
48
54
 
49
55
  def generate_dwca
50
- DwcaHunter::logger_write(self.object_id,
51
- 'Creating DarwinCore Archive file')
56
+ DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
52
57
  core_init
53
58
  extensions_init
54
59
  eml_init
@@ -64,45 +69,63 @@ module DwcaHunter
64
69
  return if rec[:commonname].to_s == ""
65
70
  taxon_id = taxon[0]
66
71
  lang = "en"
67
- name = rec[:commonname].gsub("\u{0092}", "'")
72
+ name = rec[:commonname].tr("\u{0092}", "'")
68
73
  @extensions[0][:data] << [taxon_id, name, lang]
69
-
70
74
  end
71
75
 
76
+ # rubocop:disable Metrics/AbcSize
77
+
72
78
  def process_synonyms(rec, taxon)
73
79
  accepted_id = taxon[0]
74
80
  parent_id = taxon[2]
75
81
  rank = taxon[-1]
76
- return unless ['species', 'subspecies'].include? rank
82
+ return unless %w[species subspecies].include? rank
77
83
  synonyms = rec[:synonyms].gsub(/\.$/, "").
78
- gsub(/<[\/ib]+>/, "").gsub(/[\s]+/, " ").split(";")
84
+ gsub(%r{<[/ibsup]+>}, "").gsub(/[\s]+/, " ").split(";")
79
85
  synonyms = synonyms.map(&:strip)
80
- synonyms = synonyms.map do |s|
81
- next if s.match(/<u>/)
82
- if s.match(/^[a-z]/)
83
- s = rec[:genus] + " " + s
84
- end
86
+ synonyms.map do |s|
87
+ next if s =~ /<u>/
88
+ s = rec[:genus] + " " + s if s =~ /^[a-z]/
85
89
  @count += 1
86
90
  id = @count
87
- @core << [id, nil, parent_id, accepted_id, s, "synonym", rank]
91
+ if real_name?(s)
92
+ @core << [id, nil, parent_id, accepted_id, s, "synonym", rank]
93
+ else
94
+ puts "Rejected: #{s}"
95
+ end
96
+ end
97
+ end
98
+
99
+ # rubocop:enable Metrics/AbcSize
100
+
101
+ def real_name?(str)
102
+ parsed = @parser.parse(str)[:scientificName]
103
+ return false unless parsed[:parsed]
104
+ epithets = parsed[:canonical].split(" ")[1..-1]
105
+ return false if epithets.nil? || epithets.empty?
106
+ epithets.each do |e|
107
+ return false if @black_sp[e]
88
108
  end
109
+ true
89
110
  end
90
111
 
91
- def process_name(rec, rank)
92
- name =[@core.last[4], rec[:author], rec[:date]]
93
- @core.last[4] = name.join(" ").gsub(/[\s]+/, " ").strip
112
+ def process_name(rec)
113
+ name = [@core.last[4], rec[:author], rec[:date]]
114
+ @core.last[4] = name.join(" ").gsub(%r{<[/ibsup]+>}, "").
115
+ gsub(/[\s]+/, " ").strip
94
116
  @core.last[1] = rec[:id]
95
117
  end
96
118
 
119
+ # rubocop:disable Metrics/AbcSize
120
+
97
121
  def process_hierarchy(rec)
98
122
  parent_id = @clades["Mammalia"][:id]
99
123
  is_row_rank = false
100
- [:order, :suborder, :infraorder, :superfamily, :family,
101
- :subfamily, :tribe, :genus, :subgenus,
102
- :species, :subspecies].each do |rank|
103
- is_row_rank = true if rank == rec[:taxonlevel].downcase.to_sym
124
+ %i[order suborder infraorder superfamily family
125
+ subfamily tribe genus subgenus species subspecies].each do |rank|
126
+ is_row_rank = true if rank == rec[:taxonlevel].downcase.to_sym
104
127
  clade = rec[rank]
105
- clade = clade.capitalize if clade.match(/^[A-Z]+$/)
128
+ clade = clade.capitalize if clade =~ /^[A-Z]+$/
106
129
  next if clade.to_s == ""
107
130
  clade_id = nil
108
131
  clade = adjust_clade(rec, rank, clade)
@@ -114,16 +137,17 @@ module DwcaHunter
114
137
  @clades[clade] = { id: clade_id, rank: rank }
115
138
  @core << [clade_id, nil, parent_id, clade_id, clade, nil, rank.to_s]
116
139
  if is_row_rank
117
- process_name(rec, rank)
140
+ process_name(rec)
118
141
  return @core.last
119
142
  end
120
143
  end
121
144
  parent_id = clade_id
122
145
  end
123
146
  end
147
+ # rubocop:enable Metrics/AbcSize
124
148
 
125
149
  def adjust_clade(rec, rank, clade)
126
- if [:species, :subspecies].include? rank
150
+ if %i[species subspecies].include? rank
127
151
  clade = [rec[:genus], rec[:species]]
128
152
  clade << rec[:subspecies] if rank == :subspecies
129
153
  clade.join(" ").gsub(/[\s]+/, " ").strip
@@ -140,13 +164,13 @@ module DwcaHunter
140
164
  { first_name: "Don",
141
165
  last_name: "Wilson" },
142
166
  { first_name: "DeeAnn",
143
- last_name: "Reader" },
144
- ],
167
+ last_name: "Reader" }
168
+ ],
145
169
  metadata_providers: [
146
170
  { first_name: "Dmitry",
147
171
  last_name: "Mozzherin",
148
172
  email: "dmozzherin@gmail.com" }
149
- ],
173
+ ],
150
174
  abstract: "Mammal Species of the World, 3rd edition (MSW3) is "\
151
175
  "a database of mammalian taxonomy, based upon the 2005 book "\
152
176
  "Mammal Species of the World. A Taxonomic and Geographic Reference "\
@@ -156,24 +180,32 @@ module DwcaHunter
156
180
  end
157
181
 
158
182
  def core_init
159
- @core = [['http://rs.tdwg.org/dwc/terms/taxonID',
160
- 'http://globalnames.org/terms/localID',
161
- 'http://rs.tdwg.org/dwc/terms/parentNameUsageID',
162
- 'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
163
- 'http://rs.tdwg.org/dwc/terms/scientificName',
164
- 'http://rs.tdwg.org/dwc/terms/taxonomicStatus',
165
- 'http://rs.tdwg.org/dwc/terms/taxonRank']]
183
+ @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
184
+ "http://globalnames.org/terms/localID",
185
+ "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
186
+ "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
187
+ "http://rs.tdwg.org/dwc/terms/scientificName",
188
+ "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
189
+ "http://rs.tdwg.org/dwc/terms/taxonRank"]]
166
190
  m = @clades["Mammalia"]
167
191
  @core << [m[:id], nil, nil, m[:id], "Mammalia", nil, "class"]
168
192
  end
169
193
 
194
+ def black_species
195
+ res = {}
196
+ path = File.join(__dir__, "..", "..", "..", "files", "species-black.txt")
197
+ File.open(path).each do |l|
198
+ res[l.strip] = 1
199
+ end
200
+ res
201
+ end
202
+
170
203
  def extensions_init
171
- @extensions << { data: [['http://rs.tdwg.org/dwc/terms/taxonID',
172
- 'http://rs.tdwg.org/dwc/terms/vernacularName',
173
- 'http://purl.org/dc/terms/language']],
174
- file_name: 'vernacular_names.txt',
175
- row_type: 'http://rs.gbif.org/terms/1.0/VernacularName'
176
- }
204
+ @extensions << { data: [["http://rs.tdwg.org/dwc/terms/taxonID",
205
+ "http://rs.tdwg.org/dwc/terms/vernacularName",
206
+ "http://purl.org/dc/terms/language"]],
207
+ file_name: "vernacular_names.txt",
208
+ row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }
177
209
  end
178
210
  end
179
211
  end
@@ -5,7 +5,7 @@ module DwcaHunter
5
5
  @problems_file = open('problems.txt', 'w:utf-8')
6
6
  @command = "wikispecies"
7
7
  @title = 'Wikispecies'
8
- @url = 'http://dumps.wikimedia.org/specieswiki/latest/' +
8
+ @url = 'http://dumps.wikimedia.org/specieswiki/latest/' \
9
9
  'specieswiki-latest-pages-articles.xml.bz2'
10
10
  @url = opts[:url] if opts[:url]
11
11
  @uuid = '68923690-0727-473c-b7c5-2ae9e601e3fd'
@@ -347,4 +347,3 @@ module DwcaHunter
347
347
 
348
348
  end
349
349
  end
350
-
@@ -1,5 +1,5 @@
1
1
  module DwcaHunter
2
- VERSION = "0.5.0"
2
+ VERSION = "0.5.1"
3
3
 
4
4
  def self.version
5
5
  VERSION
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dwca_hunter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-07-20 00:00:00.000000000 Z
11
+ date: 2018-08-04 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: biodiversity
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.5'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: dwc-archive
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -38,6 +52,20 @@ dependencies:
38
52
  - - "~>"
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0.5'
55
+ - !ruby/object:Gem::Dependency
56
+ name: htmlentities
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '4.3'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '4.3'
41
69
  - !ruby/object:Gem::Dependency
42
70
  name: nokogiri
43
71
  requirement: !ruby/object:Gem::Requirement
@@ -204,6 +232,7 @@ files:
204
232
  - files/birdlife_7.csv
205
233
  - files/fishbase_taxon_cache.tsv
206
234
  - files/reptile_checklist_2014_12.csv
235
+ - files/species-black.txt
207
236
  - lib/dwca_hunter.rb
208
237
  - lib/dwca_hunter/downloader.rb
209
238
  - lib/dwca_hunter/encoding.rb
@@ -219,7 +248,6 @@ files:
219
248
  - lib/dwca_hunter/resources/opentree.rb
220
249
  - lib/dwca_hunter/resources/reptiles_checklist.rb
221
250
  - lib/dwca_hunter/resources/wikispecies.rb
222
- - lib/dwca_hunter/resources/worms.rb
223
251
  - lib/dwca_hunter/url.rb
224
252
  - lib/dwca_hunter/version.rb
225
253
  - lib/dwca_hunter/xml.rb
@@ -1,176 +0,0 @@
1
- # encoding: utf-8
2
- module DwcaHunter
3
- class ResourceWoRMS < DwcaHunter::Resource
4
- def initialize(opts = {})
5
- @command = 'worms'
6
- @title = 'WoRMS'
7
- @url = 'http://content60.eol.org/resources/26.tar.gz'
8
- @uuid = '9d27a7ad-2e6a-4597-a79b-23fb3b2f8284'
9
- @download_path = File.join(Dir.tmpdir,
10
- 'dwca_hunter',
11
- 'worms',
12
- 'data.tar.gz')
13
- @fields = ['dc:identifier',
14
- 'dc:source',
15
- 'dwc:Kingdom',
16
- 'dwc:Phylum',
17
- 'dwc:Class',
18
- 'dwc:Order',
19
- 'dwc:Family',
20
- 'dwc:Genus',
21
- 'dwc:ScientificName']
22
- @rank = { 1 => 'kingdom',
23
- 2 => 'phylum',
24
- 3 => 'class',
25
- 4 => 'order',
26
- 5 => 'family',
27
- 6 => 'genus',
28
- 7 => 'species' }
29
- @known_paths = {}
30
- @data = []
31
- @extensions = []
32
- @extensions << { data: [[
33
- 'http://rs.tdwg.org/dwc/terms/taxonId',
34
- 'http://rs.tdwg.org/dwc/terms/scientificName']],
35
- file_name: 'synonyms.txt' }
36
- @re = {
37
- cdata: %r#\<\!\[CDATA\[(.*)\]\]\>#
38
- }
39
- @core = [[
40
- 'http://rs.tdwg.org/dwc/terms/taxonID',
41
- 'http://purl.org/dc/terms/parentNameUsageID',
42
- 'http://purl.org/dc/terms/source',
43
- 'http://rs.tdwg.org/dwc/terms/acceptedNameUsageID',
44
- 'http://purl.org/dc/terms/scientificName',
45
- 'http://purl.org/dc/terms/taxonRank']]
46
- super
47
- end
48
-
49
- def unpack
50
- unpack_tar
51
- end
52
-
53
- def make_dwca
54
- collect_data
55
- make_core_data
56
- generate_dwca
57
- end
58
-
59
- private
60
-
61
- def collect_data
62
- DwcaHunter::logger_write(self.object_id, 'Traversing xml file...')
63
- xml_file = File.join(@download_dir, '26.xml')
64
- f = open(xml_file, 'r:utf-8')
65
- in_taxon = false
66
- taxon = nil
67
- count = 0
68
- Nokogiri::XML::Reader(f).each do |node|
69
- if !in_taxon && node.name == 'taxon'
70
- in_taxon = true
71
- taxon = {}
72
- @fields.each { |field| taxon[field.to_sym] = nil }
73
- taxon[:synonyms] = []
74
- elsif in_taxon && node.name == 'taxon'
75
- in_taxon = false
76
- @data << taxon
77
- taxon = nil
78
- count += 1
79
- if count % BATCH_SIZE == 0
80
- DwcaHunter::logger_write(self.object_id,
81
- "Extracted %s taxons" % count)
82
- end
83
- elsif in_taxon
84
- item = node.name.to_sym
85
- if taxon.has_key?(item) && !taxon[item]
86
- text = node.inner_xml
87
- if cdata = text.match(@re[:cdata])
88
- text = cdata[1]
89
- else
90
- text = DwcaHunter::XML.unescape(text)
91
- end
92
- taxon[item] = text
93
- elsif node.name == 'synonym' &&
94
- (cdata = node.inner_xml.match(@re[:cdata]))
95
- taxon[:synonyms] << cdata[1]
96
- end
97
- end
98
- end
99
- end
100
-
101
- def get_gn_id(path_string)
102
- gn_uuid = UUID.create_v5(path_string, GNA_NAMESPACE)
103
- id = Base64.urlsafe_encode64(gn_uuid.raw_bytes)[0..-3]
104
- "gn:" + id
105
- end
106
-
107
- def make_core_data
108
- DwcaHunter::logger_write(self.object_id, 'Creating core data')
109
- @data.each_with_index do |taxa, i|
110
- if i % BATCH_SIZE == 0
111
- DwcaHunter::logger_write(self.object_id,
112
- 'Traversing %s species for core' % i)
113
- end
114
- path = get_path(taxa)
115
- parent_id = get_gn_id(path.join('|'))
116
- @core << [taxa[:'dc:identifier'],
117
- parent_id, taxa[:'dc:source'],
118
- nil,
119
- taxa[:'dwc:ScientificName'],
120
- 'species']
121
-
122
- taxa[:synonyms].each do |synonym|
123
- @extensions[0][:data] << [taxa[:'dc:identifier'], synonym]
124
- end
125
-
126
- until path.empty?
127
- path_string = path.join("|")
128
- unless @known_paths[path_string]
129
- @known_paths[path_string] = 1
130
- parent_id = (path.size == 1) ?
131
- nil :
132
- get_gn_id([path[0..-2]].join('|'))
133
- id = get_gn_id(path_string)
134
- @core << [id, parent_id, nil, nil, path[-1], @rank[path.size]]
135
- end
136
- path.pop
137
- end
138
- end
139
- end
140
-
141
- def get_path(taxa)
142
- path = []
143
- @fields[2..-2].each do |field|
144
- path << taxa[field.to_sym]
145
- end
146
- path
147
- end
148
-
149
- def generate_dwca
150
- DwcaHunter::logger_write(self.object_id,
151
- 'Creating DarwinCore Archive file')
152
- @eml = {
153
- id: @uuid,
154
- title: @title,
155
- authors: [
156
- { email: 'info@marinespecies.org',
157
- url: 'http://www.marinespecies.org' }
158
- ],
159
- metadata_providers: [
160
- { first_name: 'Dmitry',
161
- last_name: 'Mozzherin',
162
- email: 'dmozzherin@gmail.com' }
163
- ],
164
- abstract: 'The aim of a World Register of Marine Species (WoRMS) ' +
165
- 'is to provide an authoritative and comprehensive list ' +
166
- 'of names of marine organisms, including information ' +
167
- 'on synonymy. While highest priority goes to valid ' +
168
- 'names, other names in use are included so that this ' +
169
- 'register can serve as a guide to interpret taxonomic ' +
170
- 'literature.',
171
- }
172
- super
173
- end
174
- end
175
- end
176
-