dwc-archive 0.9.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +31 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +23 -0
  5. data/.ruby-version +1 -0
  6. data/.travis.yml +4 -5
  7. data/CHANGELOG +15 -7
  8. data/Gemfile +3 -15
  9. data/LICENSE +1 -1
  10. data/README.md +135 -111
  11. data/Rakefile +13 -54
  12. data/dwc-archive.gemspec +37 -0
  13. data/features/step_definitions/dwc-creator_steps.rb +5 -5
  14. data/features/step_definitions/dwc-reader_steps.rb +47 -28
  15. data/features/support/env.rb +1 -1
  16. data/lib/dwc_archive.rb +121 -0
  17. data/lib/dwc_archive/archive.rb +59 -0
  18. data/lib/dwc_archive/classification_normalizer.rb +382 -0
  19. data/lib/dwc_archive/core.rb +25 -0
  20. data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
  21. data/lib/dwc_archive/expander.rb +85 -0
  22. data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
  23. data/lib/dwc_archive/generator.rb +90 -0
  24. data/lib/dwc_archive/generator_eml_xml.rb +116 -0
  25. data/lib/dwc_archive/generator_meta_xml.rb +72 -0
  26. data/lib/dwc_archive/gnub_taxon.rb +14 -0
  27. data/lib/dwc_archive/ingester.rb +106 -0
  28. data/lib/dwc_archive/metadata.rb +56 -0
  29. data/lib/dwc_archive/taxon_normalized.rb +23 -0
  30. data/lib/dwc_archive/version.rb +6 -0
  31. data/lib/dwc_archive/xml_reader.rb +89 -0
  32. data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
  33. data/spec/files/generator_eml.xml +47 -0
  34. data/spec/files/generator_meta.xml +19 -0
  35. data/spec/lib/classification_normalizer_spec.rb +214 -0
  36. data/spec/lib/core_spec.rb +100 -0
  37. data/spec/lib/darwin_core_spec.rb +249 -0
  38. data/spec/lib/generator_eml_xml_spec.rb +22 -0
  39. data/spec/lib/generator_meta_xml_spec.rb +22 -0
  40. data/spec/lib/generator_spec.rb +124 -0
  41. data/spec/lib/gnub_taxon_spec.rb +32 -0
  42. data/spec/lib/metadata_spec.rb +89 -0
  43. data/spec/lib/taxon_normalized_spec.rb +142 -0
  44. data/spec/lib/xml_reader_spec.rb +11 -11
  45. data/spec/spec_helper.rb +78 -6
  46. metadata +180 -92
  47. data/.rvmrc +0 -1
  48. data/Gemfile.lock +0 -155
  49. data/VERSION +0 -1
  50. data/lib/dwc-archive.rb +0 -95
  51. data/lib/dwc-archive/.expander.rb.swo +0 -0
  52. data/lib/dwc-archive/archive.rb +0 -37
  53. data/lib/dwc-archive/classification_normalizer.rb +0 -424
  54. data/lib/dwc-archive/core.rb +0 -17
  55. data/lib/dwc-archive/expander.rb +0 -80
  56. data/lib/dwc-archive/generator.rb +0 -75
  57. data/lib/dwc-archive/generator_eml_xml.rb +0 -84
  58. data/lib/dwc-archive/generator_meta_xml.rb +0 -50
  59. data/lib/dwc-archive/ingester.rb +0 -101
  60. data/lib/dwc-archive/metadata.rb +0 -42
  61. data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
  62. data/lib/dwc-archive/xml_reader.rb +0 -64
  63. data/spec/lib/dwc-archive_spec.rb +0 -250
  64. data/spec/spec.opts +0 -1
data/.rvmrc DELETED
@@ -1 +0,0 @@
1
- rvm use ruby-1.9.3-p392@dwc-archive --create
@@ -1,155 +0,0 @@
1
- GEM
2
- remote: https://rubygems.org/
3
- specs:
4
- abstract (1.0.0)
5
- actionpack (3.0.8)
6
- activemodel (= 3.0.8)
7
- activesupport (= 3.0.8)
8
- builder (~> 2.1.2)
9
- erubis (~> 2.6.6)
10
- i18n (~> 0.5.0)
11
- rack (~> 1.2.1)
12
- rack-mount (~> 0.6.14)
13
- rack-test (~> 0.5.7)
14
- tzinfo (~> 0.3.23)
15
- activemodel (3.0.8)
16
- activesupport (= 3.0.8)
17
- builder (~> 2.1.2)
18
- i18n (~> 0.5.0)
19
- activesupport (3.0.8)
20
- archive-tar-minitar (0.5.2)
21
- awesome_print (1.1.0)
22
- binding_of_caller (0.7.1)
23
- debug_inspector (>= 0.0.1)
24
- biodiversity (3.1.0)
25
- parallel
26
- parallel (~> 0.6)
27
- rake (~> 10.0)
28
- treetop
29
- treetop (~> 1.4)
30
- unicode_utils (~> 1.4)
31
- builder (2.1.2)
32
- coderay (1.0.9)
33
- columnize (0.3.6)
34
- coolline (0.4.2)
35
- cucumber (1.3.1)
36
- builder (>= 2.1.2)
37
- diff-lcs (>= 1.1.3)
38
- gherkin (~> 2.12.0)
39
- multi_json (~> 1.3)
40
- debug_inspector (0.0.2)
41
- debugger (1.5.0)
42
- columnize (>= 0.3.1)
43
- debugger-linecache (~> 1.2.0)
44
- debugger-ruby_core_source (~> 1.2.0)
45
- debugger-linecache (1.2.0)
46
- debugger-ruby_core_source (1.2.0)
47
- diff-lcs (1.2.4)
48
- diffy (2.1.4)
49
- erubis (2.6.6)
50
- abstract (>= 1.0.0)
51
- gherkin (2.12.0)
52
- multi_json (~> 1.3)
53
- git (1.2.5)
54
- grit (2.5.0)
55
- diff-lcs (~> 1.1)
56
- mime-types (~> 1.15)
57
- posix-spawn (~> 0.3.6)
58
- hirb (0.7.1)
59
- i18n (0.5.0)
60
- jazz_hands (0.5.0)
61
- awesome_print (~> 1.1.0)
62
- coderay (~> 1.0.9)
63
- coolline (>= 0.4.0)
64
- hirb (~> 0.7.1)
65
- pry (~> 0.9.12)
66
- pry-debugger (~> 0.2.2)
67
- pry-doc (~> 0.4.4)
68
- pry-git (~> 0.2.3)
69
- pry-rails (~> 0.2.2)
70
- pry-remote (>= 0.1.7)
71
- pry-stack_explorer (~> 0.4.9)
72
- railties (>= 3.0, < 5.0)
73
- jeweler (1.8.4)
74
- bundler (~> 1.0)
75
- git (>= 1.2.5)
76
- rake
77
- rdoc
78
- json (1.7.7)
79
- method_source (0.8.1)
80
- mime-types (1.23)
81
- multi_json (1.7.3)
82
- nokogiri (1.5.9)
83
- parallel (0.7.0)
84
- parsley-store (0.3.2)
85
- biodiversity (~> 3.1.0)
86
- jeweler (~> 1.8)
87
- redis (~> 3.0)
88
- polyglot (0.3.3)
89
- posix-spawn (0.3.6)
90
- pry (0.9.12.1)
91
- coderay (~> 1.0.5)
92
- method_source (~> 0.8)
93
- slop (~> 3.4)
94
- pry-debugger (0.2.2)
95
- debugger (~> 1.3)
96
- pry (~> 0.9.10)
97
- pry-doc (0.4.5)
98
- pry (>= 0.9)
99
- yard (>= 0.8)
100
- pry-git (0.2.3)
101
- diffy
102
- grit
103
- pry (>= 0.9.8)
104
- pry-rails (0.2.2)
105
- pry (>= 0.9.10)
106
- pry-remote (0.1.7)
107
- pry (~> 0.9)
108
- slop (~> 3.0)
109
- pry-stack_explorer (0.4.9)
110
- binding_of_caller (>= 0.7)
111
- pry (~> 0.9.11)
112
- rack (1.2.8)
113
- rack-mount (0.6.14)
114
- rack (>= 1.0.0)
115
- rack-test (0.5.7)
116
- rack (>= 1.0)
117
- railties (3.0.8)
118
- actionpack (= 3.0.8)
119
- activesupport (= 3.0.8)
120
- rake (>= 0.8.7)
121
- thor (~> 0.14.4)
122
- rake (10.0.4)
123
- rdoc (4.0.1)
124
- json (~> 1.4)
125
- redis (3.0.4)
126
- rspec (2.13.0)
127
- rspec-core (~> 2.13.0)
128
- rspec-expectations (~> 2.13.0)
129
- rspec-mocks (~> 2.13.0)
130
- rspec-core (2.13.1)
131
- rspec-expectations (2.13.0)
132
- diff-lcs (>= 1.1.3, < 2.0)
133
- rspec-mocks (2.13.1)
134
- slop (3.4.4)
135
- thor (0.14.6)
136
- treetop (1.4.14)
137
- polyglot
138
- polyglot (>= 0.3.1)
139
- tzinfo (0.3.37)
140
- unicode_utils (1.4.0)
141
- yard (0.8.6.1)
142
-
143
- PLATFORMS
144
- ruby
145
-
146
- DEPENDENCIES
147
- archive-tar-minitar (~> 0.5)
148
- bundler (~> 1.3)
149
- cucumber (~> 1.3)
150
- debugger (~> 1.3)
151
- jazz_hands (~> 0.5)
152
- jeweler (~> 1.8)
153
- nokogiri (~> 1.5)
154
- parsley-store (~> 0.3.2)
155
- rspec (~> 2.13)
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.9.6
@@ -1,95 +0,0 @@
1
- # encoding: UTF-8
2
- $:.unshift(File.dirname(__FILE__)) unless
3
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
4
- R19 = RUBY_VERSION.split('.')[0..1].join('').to_i > 18
5
- raise "IMPORTANT: dwc-archive gem requires ruby >= 1.9.1" unless R19
6
- require 'fileutils'
7
- require 'ostruct'
8
- require 'digest'
9
- require 'csv'
10
- require 'logger'
11
- require 'dwc-archive/xml_reader'
12
- require 'dwc-archive/ingester'
13
- require 'dwc-archive/errors'
14
- require 'dwc-archive/expander'
15
- require 'dwc-archive/archive'
16
- require 'dwc-archive/core'
17
- require 'dwc-archive/extension'
18
- require 'dwc-archive/metadata'
19
- require 'dwc-archive/generator'
20
- require 'dwc-archive/generator_meta_xml'
21
- require 'dwc-archive/generator_eml_xml'
22
- require 'dwc-archive/classification_normalizer'
23
-
24
- class DarwinCore
25
-
26
- VERSION = open(File.join(File.dirname(__FILE__), '..', 'VERSION')).readline.strip
27
-
28
- attr_reader :archive, :core, :metadata, :extensions, :classification_normalizer
29
- alias :eml :metadata
30
-
31
- DEFAULT_TMP_DIR = "/tmp"
32
-
33
- def self.nil_field?(field)
34
- return true if [nil, '', '/N'].include?(field)
35
- false
36
- end
37
-
38
- def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
39
- Dir.entries(tmp_dir).each do |entry|
40
- path = File.join(tmp_dir, entry)
41
- if FileTest.directory?(path) && entry.match(/^dwc_[\d]+$/)
42
- FileUtils.rm_rf(path)
43
- end
44
- end
45
- end
46
-
47
- def self.logger
48
- @@logger ||= Logger.new(nil)
49
- end
50
-
51
- def self.logger=(logger)
52
- @@logger = logger
53
- end
54
-
55
- def self.logger_reset
56
- self.logger = Logger.new(nil)
57
- end
58
-
59
- def self.logger_write(obj_id, message, method = :info)
60
- self.logger.send(method, "|%s|%s|" % [obj_id, message])
61
- end
62
-
63
- def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
64
- @dwc_path = dwc_path
65
- @archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
66
- @core = DarwinCore::Core.new(self)
67
- @metadata = DarwinCore::Metadata.new(@archive)
68
- @extensions = get_extensions
69
- end
70
-
71
- # generates a hash from a classification data with path to each node, list of synonyms and vernacular names.
72
- def normalize_classification
73
- return nil unless has_parent_id?
74
- @classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self)
75
- @classification_normalizer.normalize
76
- end
77
-
78
- def has_parent_id?
79
- !!@core.fields.join('|').downcase.match(/highertaxonid|parentnameusageid/)
80
- end
81
-
82
- def checksum
83
- Digest::SHA1.hexdigest(open(@dwc_path).read)
84
- end
85
-
86
- private
87
- def get_extensions
88
- res = []
89
- root_key = @archive.meta.keys[0]
90
- ext = @archive.meta[root_key][:extension]
91
- return [] unless ext
92
- ext = [ext] if ext.class != Array
93
- ext.map { |e| DarwinCore::Extension.new(self, e) }
94
- end
95
- end
@@ -1,37 +0,0 @@
1
- require 'nokogiri'
2
- class DarwinCore
3
- class Archive
4
- attr_reader :meta, :eml
5
- def initialize(archive_path, tmp_dir)
6
- @archive_path = archive_path
7
- @tmp_dir = tmp_dir
8
- @expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
9
- @expander.unpack
10
- if valid?
11
- @meta = DarwinCore::XmlReader.from_xml(open(File.join(@expander.path, 'meta.xml')))
12
- @eml = files.include?("eml.xml") ? DarwinCore::XmlReader.from_xml(open(File.join(@expander.path, 'eml.xml'))) : nil
13
- else
14
- clean
15
- raise InvalidArchiveError
16
- end
17
- end
18
-
19
- def valid?
20
- valid = true
21
- valid = valid && @expander.path && FileTest.exists?(@expander.path)
22
- valid = valid && files && files.include?('meta.xml')
23
- end
24
-
25
- def files
26
- @expander.files
27
- end
28
-
29
- def files_path
30
- @expander.path
31
- end
32
-
33
- def clean
34
- @expander.clean
35
- end
36
- end
37
- end
@@ -1,424 +0,0 @@
1
- # encoding: utf-8
2
- require 'parsley-store'
3
-
4
- class DarwinCore
5
-
6
- class TaxonNormalized
7
- attr_accessor :id, :local_id, :global_id, :source, :parent_id,
8
- :classification_path_id, :classification_path,
9
- :linnean_classification_path, :current_name, :current_name_canonical,
10
- :synonyms, :vernacular_names, :rank, :status
11
-
12
- def initialize
13
- @id = @parent_id = @rank = @status = nil
14
- @current_name = ''
15
- @current_name_canonical = ''
16
- @source = ''
17
- @local_id = ''
18
- @global_id = ''
19
- @classification_path = []
20
- @classification_path_id = []
21
- @synonyms = []
22
- @vernacular_names = []
23
- @linnean_classification_path = []
24
- end
25
-
26
- end
27
-
28
- class GnubTaxon < TaxonNormalized
29
- attr_accessor :uuid, :uuid_path
30
-
31
- def initialize
32
- super
33
- @uuid = nil
34
- @uuid_path = []
35
- end
36
- end
37
-
38
- class SynonymNormalized < Struct.new(:id, :name, :canonical_name,
39
- :status, :source, :local_id,
40
- :global_id);end
41
- class VernacularNormalized < Struct.new(:name, :language, :locality,
42
- :country_code);end
43
-
44
- class ClassificationNormalizer
45
- attr_reader :error_names, :tree, :normalized_data
46
-
47
- def initialize(dwc_instance)
48
- @dwc = dwc_instance
49
- @core_fields = get_fields(@dwc.core)
50
- @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
51
- @normalized_data = {}
52
- @synonyms = {}
53
- @parser = ParsleyStore.new(1,2)
54
- @name_strings = {}
55
- @vernacular_name_strings = {}
56
- @error_names = []
57
- @tree = {}
58
- end
59
-
60
- def add_name_string(name_string)
61
- @name_strings[name_string] = 1 unless @name_strings[name_string]
62
- end
63
-
64
- def add_vernacular_name_string(name_string)
65
- unless @vernacular_name_strings[name_string]
66
- @vernacular_name_strings[name_string] = 1
67
- end
68
- end
69
-
70
- def name_strings(opts = {})
71
- opts = { with_hash: false }.merge(opts)
72
- if !!opts[:with_hash]
73
- @name_strings
74
- else
75
- @name_strings.keys
76
- end
77
- end
78
-
79
- def vernacular_name_strings(opts = {})
80
- opts = { with_hash: false }.merge(opts)
81
- if !!opts[:with_hash]
82
- @vernacular_name_strings
83
- else
84
- @vernacular_name_strings.keys
85
- end
86
- end
87
-
88
- def normalize(opts = {})
89
- opts = { :with_canonical_names => true,
90
- :with_extensions => true }.merge(opts)
91
- @with_canonical_names = !!opts[:with_canonical_names]
92
- DarwinCore.logger_write(@dwc.object_id,
93
- 'Started normalization of the classification')
94
- ingest_core
95
- DarwinCore.logger_write(@dwc.object_id,
96
- 'Calculating the classification parent/child paths')
97
- has_parent_id? ?
98
- calculate_classification_path :
99
- @normalized_data.keys.each { |id| @tree[id] = {} }
100
- DarwinCore.logger_write(@dwc.object_id, 'Ingesting data from extensions')
101
- if !!opts[:with_extensions]
102
- ingest_extensions
103
- end
104
- @normalized_data
105
- end
106
-
107
- private
108
-
109
- def get_canonical_name(a_scientific_name)
110
- if @with_canonical_names
111
- canonical_name = @parser.parse(a_scientific_name,
112
- :canonical_only => true)
113
- canonical_name.to_s.empty? ? a_scientific_name : canonical_name
114
- else
115
- nil
116
- end
117
- end
118
-
119
- def get_fields(element)
120
- data = element.fields.inject({}) do |res, f|
121
- field = f[:term].split('/')[-1]
122
- field = field ? field.downcase.to_sym : ''
123
- res[field] = f[:index].to_i
124
- res
125
- end
126
- data[:id] = element.id[:index] if element.id
127
- data
128
- end
129
-
130
- def status_synonym?(status)
131
- status && !!status.match(/^syn/)
132
- end
133
-
134
- def add_synonym_from_core(taxon_id, row)
135
- @synonyms[row[@core_fields[:id]]] = taxon_id
136
- taxon = @normalized_data[row[taxon_id]] ?
137
- @normalized_data[row[taxon_id]] :
138
- @normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
139
- synonym = SynonymNormalized.new(
140
- row[@core_fields[:id]],
141
- row[@core_fields[:scientificname]],
142
- row[@core_fields[:canonicalname]],
143
- @core_fields[:taxonomicstatus] ?
144
- row[@core_fields[:taxonomicstatus]] :
145
- nil,
146
- @core_fields[:source] ? row[@core_fields[:source]] : nil,
147
- @core_fields[:localid] ? row[@core_fields[:localid]] : nil,
148
- @core_fields[:globalid] ? row[@core_fields[:globalid]] : nil,
149
- )
150
- taxon.synonyms << synonym
151
- add_name_string(synonym.name)
152
- add_name_string(synonym.canonical_name)
153
- end
154
-
155
- def set_scientific_name(row, fields)
156
- row[fields[:scientificname]] = 'N/A' unless row[fields[:scientificname]]
157
- canonical_name = nil
158
- scientific_name = row[fields[:scientificname]].strip
159
- if separate_canonical_and_authorship?(row, fields)
160
- if @with_canonical_names
161
- canonical_name = row[fields[:scientificname]].strip
162
- end
163
- scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
164
- else
165
- if @with_canonical_names
166
- canonical_name = get_canonical_name(row[fields[:scientificname]])
167
- end
168
- end
169
- fields[:canonicalname] = row.size
170
- row << canonical_name
171
- row[fields[:scientificname]] = scientific_name
172
- end
173
-
174
- def separate_canonical_and_authorship?(row, fields)
175
- authorship = ''
176
- if fields[:scientificnameauthorship]
177
- authorship = row[fields[:scientificnameauthorship]].to_s.strip
178
- end
179
- !(authorship.empty? || row[fields[:scientificname]].index(authorship))
180
- end
181
-
182
- def ingest_core
183
- @normalized_data = {}
184
- has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
185
- raise DarwinCore::CoreFileError.new('Darwin Core core fields must ' +
186
- 'contain taxon id and scientific name') unless has_name_and_id
187
- @dwc.core.read do |rows|
188
- rows[1].each do |error|
189
- @error_names << { :data => error,
190
- :error => :reading_or_encoding_error }
191
- end
192
- rows[0].each do |r|
193
- set_scientific_name(r, @core_fields)
194
- #core has AcceptedNameUsageId
195
- if @core_fields[:acceptednameusageid] &&
196
- r[@core_fields[:acceptednameusageid]] &&
197
- r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
198
- add_synonym_from_core(@core_fields[:acceptednameusageid], r)
199
- elsif !@core_fields[:acceptednameusageid] &&
200
- @core_fields[:taxonomicstatus] &&
201
- status_synonym?(r[@core_fields[:taxonomicstatus]])
202
- add_synonym_from_core(parent_id, r) if has_parent_id?
203
- else
204
- unless @normalized_data[r[@core_fields[:id]]]
205
- if gnub_archive?
206
- new_taxon = DarwinCore::GnubTaxon.new
207
- else
208
- new_taxon = DarwinCore::TaxonNormalized.new
209
- end
210
- @normalized_data[r[@core_fields[:id]]] = new_taxon
211
- end
212
- taxon = @normalized_data[r[@core_fields[:id]]]
213
- if gnub_archive?
214
- taxon.uuid = r[@core_fields[:originalnameusageid]]
215
- taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
216
- split('|')
217
- end
218
- taxon.id = r[@core_fields[:id]]
219
- taxon.current_name = r[@core_fields[:scientificname]]
220
- taxon.current_name_canonical = r[@core_fields[:canonicalname]]
221
- taxon.parent_id = has_parent_id? ? r[parent_id] : nil
222
- taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
223
- if @core_fields[:taxonomicstatus]
224
- taxon.status = r[@core_fields[:taxonomicstatus]]
225
- end
226
- taxon.source = r[@core_fields[:source]] if @core_fields[:source]
227
- taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
228
- if @core_fields[:globalid]
229
- taxon.global_id = r[@core_fields[:globalid]]
230
- end
231
- taxon.linnean_classification_path =
232
- get_linnean_classification_path(r, taxon)
233
- add_name_string(taxon.current_name)
234
- has_canonical = taxon.current_name_canonical &&
235
- !taxon.current_name_canonical.empty?
236
- add_name_string(taxon.current_name_canonical) if has_canonical
237
- end
238
- end
239
- end
240
- end
241
-
242
- def has_parent_id?
243
- @has_parent_id ||= @core_fields.has_key?(:highertaxonid) ||
244
- @core_fields.has_key?(:parentnameusageid)
245
- end
246
-
247
- def parent_id
248
- parent_id_field = @core_fields[:highertaxonid] ||
249
- @core_fields[:parentnameusageid]
250
- end
251
-
252
- def calculate_classification_path
253
- @paths_num = 0
254
- @normalized_data.each do |taxon_id, taxon|
255
- next if !taxon.classification_path_id.empty?
256
- res = get_classification_path(taxon)
257
- next if res == 'error'
258
- end
259
- end
260
-
261
- def get_classification_path(taxon)
262
- return if !taxon.classification_path_id.empty?
263
- @paths_num += 1
264
- if @paths_num % 10000 == 0
265
- DarwinCore.logger_write(@dwc.object_id,
266
- "Calculated %s paths" % @paths_num)
267
- end
268
- current_node = {taxon.id => {}}
269
- if DarwinCore.nil_field?(taxon.parent_id)
270
- if @with_canonical_names
271
- taxon.classification_path << taxon.current_name_canonical
272
- end
273
- taxon.classification_path_id << taxon.id
274
- @tree.merge!(current_node)
275
- else
276
- parent_cp = parent_cpid = nil
277
- if @normalized_data[taxon.parent_id]
278
- if @with_canonical_names
279
- parent_cp = @normalized_data[taxon.parent_id].classification_path
280
- end
281
- parent_cpid = @normalized_data[taxon.parent_id].
282
- classification_path_id
283
- else
284
- current_parent = @normalized_data[@synonyms[taxon.parent_id]]
285
- if current_parent
286
- error = 'WARNING: The parent of the taxon ' +
287
- "\'#{taxon.current_name}\' is deprecated"
288
- @error_names << {:data => taxon,
289
- :error => :deprecated_parent,
290
- :current_parent => current_parent }
291
-
292
- if @with_canonical_names
293
- parent_cp = current_parent.classification_path
294
- end
295
- parent_cpid = current_parent.classification_path_id
296
- else
297
- error = 'WARNING: The parent of the taxon ' +
298
- "\'#{taxon.current_name}\' not found"
299
- @error_names << {:data => taxon,
300
- :error => :deprecated_parent, :current_parent => nil}
301
- end
302
- end
303
- return 'error' unless parent_cpid
304
- if parent_cpid.empty?
305
- res = 'error'
306
- begin
307
- res = get_classification_path(@normalized_data[taxon.parent_id])
308
- rescue SystemStackError
309
- @error_names << {:data => taxon,
310
- :error => :too_deep_hierarchy, :current_parent => nil}
311
- end
312
- return res if res == 'error'
313
- if @with_canonical_names
314
- taxon.classification_path += @normalized_data[taxon.parent_id].
315
- classification_path + [taxon.current_name_canonical]
316
- end
317
- taxon.classification_path_id += @normalized_data[taxon.parent_id].
318
- classification_path_id + [taxon.id]
319
- parent_node = @normalized_data[taxon.parent_id].
320
- classification_path_id.inject(@tree) {|node, id| node[id]}
321
- parent_node.merge!(current_node)
322
- else
323
- taxon.classification_path += parent_cp +
324
- [taxon.current_name_canonical] if @with_canonical_names
325
- taxon.classification_path_id += parent_cpid + [taxon.id]
326
- parent_node = @normalized_data[taxon.parent_id].
327
- classification_path_id.inject(@tree) {|node, id| node[id]}
328
- begin
329
- parent_node.merge!(current_node)
330
- rescue NoMethodError => e
331
- DarwinCore.logger_write(@dwc.object_id,
332
- "Error '%s' taxon %s" % [e.message, taxon.id])
333
- return 'error'
334
- end
335
- end
336
- end
337
- end
338
-
339
- def ingest_extensions
340
- @extensions.each do |e|
341
- ext, fields = *e
342
- ingest_synonyms(e) if (File.split(e[0].file_path).
343
- last.match(/synonym/i) &&
344
- fields.keys.include?(:scientificname))
345
- ingest_vernaculars(e) if fields.keys.include? :vernacularname
346
- end
347
- end
348
-
349
- def ingest_synonyms(extension)
350
- DarwinCore.logger_write(@dwc.object_id, 'Ingesting synonyms extension')
351
- ext, fields = *extension
352
- ext.read do |rows|
353
- rows[0].each do |r|
354
- set_scientific_name(r, fields)
355
- synonym = SynonymNormalized.new(
356
- nil,
357
- r[fields[:scientificname]],
358
- r[fields[:canonicalname]],
359
- fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil,
360
- fields[:source] ? r[fields[:source]] : nil,
361
- fields[:localid] ? r[fields[:localid]] : nil,
362
- fields[:globalid] ? r[fields[:globalid]] : nil,
363
- )
364
- if @normalized_data[r[fields[:id]]]
365
- @normalized_data[r[fields[:id]]].synonyms << synonym
366
- add_name_string(synonym.name)
367
- add_name_string(synonym.canonical_name)
368
- else
369
- @error_names << { :taxon => synonym,
370
- :error => :synonym_of_unknown_taxa }
371
- end
372
- end
373
- end
374
- end
375
-
376
- def ingest_vernaculars(extension)
377
- DarwinCore.logger_write(@dwc.object_id,
378
- 'Ingesting vernacular names extension')
379
- ext, fields = *extension
380
- ext.read do |rows|
381
- rows[0].each do |r|
382
-
383
- language = nil
384
- if fields[:language]
385
- language = r[fields[:language]]
386
- elsif fields[:languagecode]
387
- language = r[fields[:languagecode]]
388
- end
389
-
390
- locality = fields[:locality] ? r[fields[:locality]] : nil
391
-
392
- country_code = fields[:countrycode] ? r[fields[:countrycode]] : nil
393
-
394
- vernacular = VernacularNormalized.new(
395
- r[fields[:vernacularname]],
396
- language,
397
- locality,
398
- country_code)
399
- if @normalized_data[r[fields[:id]]]
400
- @normalized_data[r[fields[:id]]].vernacular_names << vernacular
401
- add_vernacular_name_string(vernacular.name)
402
- else
403
- @error_names << { :vernacular_name => vernacular,
404
- :error => :vernacular_of_unknown_taxa }
405
- end
406
- end
407
- end
408
- end
409
-
410
- #Collect linnean classification path only on species level
411
- def get_linnean_classification_path(row, taxon)
412
- res = []
413
- [:kingdom, :phylum, :class,
414
- :order, :family, :genus, :subgenus].each do |clade|
415
- res << [row[@core_fields[clade]], clade] if @core_fields[clade]
416
- end
417
- res
418
- end
419
-
420
- def gnub_archive?
421
- @core_fields[:originalnameusageidpath]
422
- end
423
- end
424
- end