dwc-archive 0.9.6 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -5
- data/CHANGELOG +15 -7
- data/Gemfile +3 -15
- data/LICENSE +1 -1
- data/README.md +135 -111
- data/Rakefile +13 -54
- data/dwc-archive.gemspec +37 -0
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +121 -0
- data/lib/dwc_archive/archive.rb +59 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
- data/lib/dwc_archive/expander.rb +85 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +90 -0
- data/lib/dwc_archive/generator_eml_xml.rb +116 -0
- data/lib/dwc_archive/generator_meta_xml.rb +72 -0
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +56 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +89 -0
- data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +47 -0
- data/spec/files/generator_meta.xml +19 -0
- data/spec/lib/classification_normalizer_spec.rb +214 -0
- data/spec/lib/core_spec.rb +100 -0
- data/spec/lib/darwin_core_spec.rb +249 -0
- data/spec/lib/generator_eml_xml_spec.rb +22 -0
- data/spec/lib/generator_meta_xml_spec.rb +22 -0
- data/spec/lib/generator_spec.rb +124 -0
- data/spec/lib/gnub_taxon_spec.rb +32 -0
- data/spec/lib/metadata_spec.rb +89 -0
- data/spec/lib/taxon_normalized_spec.rb +142 -0
- data/spec/lib/xml_reader_spec.rb +11 -11
- data/spec/spec_helper.rb +78 -6
- metadata +180 -92
- data/.rvmrc +0 -1
- data/Gemfile.lock +0 -155
- data/VERSION +0 -1
- data/lib/dwc-archive.rb +0 -95
- data/lib/dwc-archive/.expander.rb.swo +0 -0
- data/lib/dwc-archive/archive.rb +0 -37
- data/lib/dwc-archive/classification_normalizer.rb +0 -424
- data/lib/dwc-archive/core.rb +0 -17
- data/lib/dwc-archive/expander.rb +0 -80
- data/lib/dwc-archive/generator.rb +0 -75
- data/lib/dwc-archive/generator_eml_xml.rb +0 -84
- data/lib/dwc-archive/generator_meta_xml.rb +0 -50
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -42
- data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
- data/lib/dwc-archive/xml_reader.rb +0 -64
- data/spec/lib/dwc-archive_spec.rb +0 -250
- data/spec/spec.opts +0 -1
data/.rvmrc
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
rvm use ruby-1.9.3-p392@dwc-archive --create
|
data/Gemfile.lock
DELETED
@@ -1,155 +0,0 @@
|
|
1
|
-
GEM
|
2
|
-
remote: https://rubygems.org/
|
3
|
-
specs:
|
4
|
-
abstract (1.0.0)
|
5
|
-
actionpack (3.0.8)
|
6
|
-
activemodel (= 3.0.8)
|
7
|
-
activesupport (= 3.0.8)
|
8
|
-
builder (~> 2.1.2)
|
9
|
-
erubis (~> 2.6.6)
|
10
|
-
i18n (~> 0.5.0)
|
11
|
-
rack (~> 1.2.1)
|
12
|
-
rack-mount (~> 0.6.14)
|
13
|
-
rack-test (~> 0.5.7)
|
14
|
-
tzinfo (~> 0.3.23)
|
15
|
-
activemodel (3.0.8)
|
16
|
-
activesupport (= 3.0.8)
|
17
|
-
builder (~> 2.1.2)
|
18
|
-
i18n (~> 0.5.0)
|
19
|
-
activesupport (3.0.8)
|
20
|
-
archive-tar-minitar (0.5.2)
|
21
|
-
awesome_print (1.1.0)
|
22
|
-
binding_of_caller (0.7.1)
|
23
|
-
debug_inspector (>= 0.0.1)
|
24
|
-
biodiversity (3.1.0)
|
25
|
-
parallel
|
26
|
-
parallel (~> 0.6)
|
27
|
-
rake (~> 10.0)
|
28
|
-
treetop
|
29
|
-
treetop (~> 1.4)
|
30
|
-
unicode_utils (~> 1.4)
|
31
|
-
builder (2.1.2)
|
32
|
-
coderay (1.0.9)
|
33
|
-
columnize (0.3.6)
|
34
|
-
coolline (0.4.2)
|
35
|
-
cucumber (1.3.1)
|
36
|
-
builder (>= 2.1.2)
|
37
|
-
diff-lcs (>= 1.1.3)
|
38
|
-
gherkin (~> 2.12.0)
|
39
|
-
multi_json (~> 1.3)
|
40
|
-
debug_inspector (0.0.2)
|
41
|
-
debugger (1.5.0)
|
42
|
-
columnize (>= 0.3.1)
|
43
|
-
debugger-linecache (~> 1.2.0)
|
44
|
-
debugger-ruby_core_source (~> 1.2.0)
|
45
|
-
debugger-linecache (1.2.0)
|
46
|
-
debugger-ruby_core_source (1.2.0)
|
47
|
-
diff-lcs (1.2.4)
|
48
|
-
diffy (2.1.4)
|
49
|
-
erubis (2.6.6)
|
50
|
-
abstract (>= 1.0.0)
|
51
|
-
gherkin (2.12.0)
|
52
|
-
multi_json (~> 1.3)
|
53
|
-
git (1.2.5)
|
54
|
-
grit (2.5.0)
|
55
|
-
diff-lcs (~> 1.1)
|
56
|
-
mime-types (~> 1.15)
|
57
|
-
posix-spawn (~> 0.3.6)
|
58
|
-
hirb (0.7.1)
|
59
|
-
i18n (0.5.0)
|
60
|
-
jazz_hands (0.5.0)
|
61
|
-
awesome_print (~> 1.1.0)
|
62
|
-
coderay (~> 1.0.9)
|
63
|
-
coolline (>= 0.4.0)
|
64
|
-
hirb (~> 0.7.1)
|
65
|
-
pry (~> 0.9.12)
|
66
|
-
pry-debugger (~> 0.2.2)
|
67
|
-
pry-doc (~> 0.4.4)
|
68
|
-
pry-git (~> 0.2.3)
|
69
|
-
pry-rails (~> 0.2.2)
|
70
|
-
pry-remote (>= 0.1.7)
|
71
|
-
pry-stack_explorer (~> 0.4.9)
|
72
|
-
railties (>= 3.0, < 5.0)
|
73
|
-
jeweler (1.8.4)
|
74
|
-
bundler (~> 1.0)
|
75
|
-
git (>= 1.2.5)
|
76
|
-
rake
|
77
|
-
rdoc
|
78
|
-
json (1.7.7)
|
79
|
-
method_source (0.8.1)
|
80
|
-
mime-types (1.23)
|
81
|
-
multi_json (1.7.3)
|
82
|
-
nokogiri (1.5.9)
|
83
|
-
parallel (0.7.0)
|
84
|
-
parsley-store (0.3.2)
|
85
|
-
biodiversity (~> 3.1.0)
|
86
|
-
jeweler (~> 1.8)
|
87
|
-
redis (~> 3.0)
|
88
|
-
polyglot (0.3.3)
|
89
|
-
posix-spawn (0.3.6)
|
90
|
-
pry (0.9.12.1)
|
91
|
-
coderay (~> 1.0.5)
|
92
|
-
method_source (~> 0.8)
|
93
|
-
slop (~> 3.4)
|
94
|
-
pry-debugger (0.2.2)
|
95
|
-
debugger (~> 1.3)
|
96
|
-
pry (~> 0.9.10)
|
97
|
-
pry-doc (0.4.5)
|
98
|
-
pry (>= 0.9)
|
99
|
-
yard (>= 0.8)
|
100
|
-
pry-git (0.2.3)
|
101
|
-
diffy
|
102
|
-
grit
|
103
|
-
pry (>= 0.9.8)
|
104
|
-
pry-rails (0.2.2)
|
105
|
-
pry (>= 0.9.10)
|
106
|
-
pry-remote (0.1.7)
|
107
|
-
pry (~> 0.9)
|
108
|
-
slop (~> 3.0)
|
109
|
-
pry-stack_explorer (0.4.9)
|
110
|
-
binding_of_caller (>= 0.7)
|
111
|
-
pry (~> 0.9.11)
|
112
|
-
rack (1.2.8)
|
113
|
-
rack-mount (0.6.14)
|
114
|
-
rack (>= 1.0.0)
|
115
|
-
rack-test (0.5.7)
|
116
|
-
rack (>= 1.0)
|
117
|
-
railties (3.0.8)
|
118
|
-
actionpack (= 3.0.8)
|
119
|
-
activesupport (= 3.0.8)
|
120
|
-
rake (>= 0.8.7)
|
121
|
-
thor (~> 0.14.4)
|
122
|
-
rake (10.0.4)
|
123
|
-
rdoc (4.0.1)
|
124
|
-
json (~> 1.4)
|
125
|
-
redis (3.0.4)
|
126
|
-
rspec (2.13.0)
|
127
|
-
rspec-core (~> 2.13.0)
|
128
|
-
rspec-expectations (~> 2.13.0)
|
129
|
-
rspec-mocks (~> 2.13.0)
|
130
|
-
rspec-core (2.13.1)
|
131
|
-
rspec-expectations (2.13.0)
|
132
|
-
diff-lcs (>= 1.1.3, < 2.0)
|
133
|
-
rspec-mocks (2.13.1)
|
134
|
-
slop (3.4.4)
|
135
|
-
thor (0.14.6)
|
136
|
-
treetop (1.4.14)
|
137
|
-
polyglot
|
138
|
-
polyglot (>= 0.3.1)
|
139
|
-
tzinfo (0.3.37)
|
140
|
-
unicode_utils (1.4.0)
|
141
|
-
yard (0.8.6.1)
|
142
|
-
|
143
|
-
PLATFORMS
|
144
|
-
ruby
|
145
|
-
|
146
|
-
DEPENDENCIES
|
147
|
-
archive-tar-minitar (~> 0.5)
|
148
|
-
bundler (~> 1.3)
|
149
|
-
cucumber (~> 1.3)
|
150
|
-
debugger (~> 1.3)
|
151
|
-
jazz_hands (~> 0.5)
|
152
|
-
jeweler (~> 1.8)
|
153
|
-
nokogiri (~> 1.5)
|
154
|
-
parsley-store (~> 0.3.2)
|
155
|
-
rspec (~> 2.13)
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.9.6
|
data/lib/dwc-archive.rb
DELETED
@@ -1,95 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
$:.unshift(File.dirname(__FILE__)) unless
|
3
|
-
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
4
|
-
R19 = RUBY_VERSION.split('.')[0..1].join('').to_i > 18
|
5
|
-
raise "IMPORTANT: dwc-archive gem requires ruby >= 1.9.1" unless R19
|
6
|
-
require 'fileutils'
|
7
|
-
require 'ostruct'
|
8
|
-
require 'digest'
|
9
|
-
require 'csv'
|
10
|
-
require 'logger'
|
11
|
-
require 'dwc-archive/xml_reader'
|
12
|
-
require 'dwc-archive/ingester'
|
13
|
-
require 'dwc-archive/errors'
|
14
|
-
require 'dwc-archive/expander'
|
15
|
-
require 'dwc-archive/archive'
|
16
|
-
require 'dwc-archive/core'
|
17
|
-
require 'dwc-archive/extension'
|
18
|
-
require 'dwc-archive/metadata'
|
19
|
-
require 'dwc-archive/generator'
|
20
|
-
require 'dwc-archive/generator_meta_xml'
|
21
|
-
require 'dwc-archive/generator_eml_xml'
|
22
|
-
require 'dwc-archive/classification_normalizer'
|
23
|
-
|
24
|
-
class DarwinCore
|
25
|
-
|
26
|
-
VERSION = open(File.join(File.dirname(__FILE__), '..', 'VERSION')).readline.strip
|
27
|
-
|
28
|
-
attr_reader :archive, :core, :metadata, :extensions, :classification_normalizer
|
29
|
-
alias :eml :metadata
|
30
|
-
|
31
|
-
DEFAULT_TMP_DIR = "/tmp"
|
32
|
-
|
33
|
-
def self.nil_field?(field)
|
34
|
-
return true if [nil, '', '/N'].include?(field)
|
35
|
-
false
|
36
|
-
end
|
37
|
-
|
38
|
-
def self.clean_all(tmp_dir = DEFAULT_TMP_DIR)
|
39
|
-
Dir.entries(tmp_dir).each do |entry|
|
40
|
-
path = File.join(tmp_dir, entry)
|
41
|
-
if FileTest.directory?(path) && entry.match(/^dwc_[\d]+$/)
|
42
|
-
FileUtils.rm_rf(path)
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def self.logger
|
48
|
-
@@logger ||= Logger.new(nil)
|
49
|
-
end
|
50
|
-
|
51
|
-
def self.logger=(logger)
|
52
|
-
@@logger = logger
|
53
|
-
end
|
54
|
-
|
55
|
-
def self.logger_reset
|
56
|
-
self.logger = Logger.new(nil)
|
57
|
-
end
|
58
|
-
|
59
|
-
def self.logger_write(obj_id, message, method = :info)
|
60
|
-
self.logger.send(method, "|%s|%s|" % [obj_id, message])
|
61
|
-
end
|
62
|
-
|
63
|
-
def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
|
64
|
-
@dwc_path = dwc_path
|
65
|
-
@archive = DarwinCore::Archive.new(@dwc_path, tmp_dir)
|
66
|
-
@core = DarwinCore::Core.new(self)
|
67
|
-
@metadata = DarwinCore::Metadata.new(@archive)
|
68
|
-
@extensions = get_extensions
|
69
|
-
end
|
70
|
-
|
71
|
-
# generates a hash from a classification data with path to each node, list of synonyms and vernacular names.
|
72
|
-
def normalize_classification
|
73
|
-
return nil unless has_parent_id?
|
74
|
-
@classification_normalizer ||= DarwinCore::ClassificationNormalizer.new(self)
|
75
|
-
@classification_normalizer.normalize
|
76
|
-
end
|
77
|
-
|
78
|
-
def has_parent_id?
|
79
|
-
!!@core.fields.join('|').downcase.match(/highertaxonid|parentnameusageid/)
|
80
|
-
end
|
81
|
-
|
82
|
-
def checksum
|
83
|
-
Digest::SHA1.hexdigest(open(@dwc_path).read)
|
84
|
-
end
|
85
|
-
|
86
|
-
private
|
87
|
-
def get_extensions
|
88
|
-
res = []
|
89
|
-
root_key = @archive.meta.keys[0]
|
90
|
-
ext = @archive.meta[root_key][:extension]
|
91
|
-
return [] unless ext
|
92
|
-
ext = [ext] if ext.class != Array
|
93
|
-
ext.map { |e| DarwinCore::Extension.new(self, e) }
|
94
|
-
end
|
95
|
-
end
|
Binary file
|
data/lib/dwc-archive/archive.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
class DarwinCore
|
3
|
-
class Archive
|
4
|
-
attr_reader :meta, :eml
|
5
|
-
def initialize(archive_path, tmp_dir)
|
6
|
-
@archive_path = archive_path
|
7
|
-
@tmp_dir = tmp_dir
|
8
|
-
@expander = DarwinCore::Expander.new(@archive_path, @tmp_dir)
|
9
|
-
@expander.unpack
|
10
|
-
if valid?
|
11
|
-
@meta = DarwinCore::XmlReader.from_xml(open(File.join(@expander.path, 'meta.xml')))
|
12
|
-
@eml = files.include?("eml.xml") ? DarwinCore::XmlReader.from_xml(open(File.join(@expander.path, 'eml.xml'))) : nil
|
13
|
-
else
|
14
|
-
clean
|
15
|
-
raise InvalidArchiveError
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
def valid?
|
20
|
-
valid = true
|
21
|
-
valid = valid && @expander.path && FileTest.exists?(@expander.path)
|
22
|
-
valid = valid && files && files.include?('meta.xml')
|
23
|
-
end
|
24
|
-
|
25
|
-
def files
|
26
|
-
@expander.files
|
27
|
-
end
|
28
|
-
|
29
|
-
def files_path
|
30
|
-
@expander.path
|
31
|
-
end
|
32
|
-
|
33
|
-
def clean
|
34
|
-
@expander.clean
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
@@ -1,424 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'parsley-store'
|
3
|
-
|
4
|
-
class DarwinCore
|
5
|
-
|
6
|
-
class TaxonNormalized
|
7
|
-
attr_accessor :id, :local_id, :global_id, :source, :parent_id,
|
8
|
-
:classification_path_id, :classification_path,
|
9
|
-
:linnean_classification_path, :current_name, :current_name_canonical,
|
10
|
-
:synonyms, :vernacular_names, :rank, :status
|
11
|
-
|
12
|
-
def initialize
|
13
|
-
@id = @parent_id = @rank = @status = nil
|
14
|
-
@current_name = ''
|
15
|
-
@current_name_canonical = ''
|
16
|
-
@source = ''
|
17
|
-
@local_id = ''
|
18
|
-
@global_id = ''
|
19
|
-
@classification_path = []
|
20
|
-
@classification_path_id = []
|
21
|
-
@synonyms = []
|
22
|
-
@vernacular_names = []
|
23
|
-
@linnean_classification_path = []
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
|
28
|
-
class GnubTaxon < TaxonNormalized
|
29
|
-
attr_accessor :uuid, :uuid_path
|
30
|
-
|
31
|
-
def initialize
|
32
|
-
super
|
33
|
-
@uuid = nil
|
34
|
-
@uuid_path = []
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
class SynonymNormalized < Struct.new(:id, :name, :canonical_name,
|
39
|
-
:status, :source, :local_id,
|
40
|
-
:global_id);end
|
41
|
-
class VernacularNormalized < Struct.new(:name, :language, :locality,
|
42
|
-
:country_code);end
|
43
|
-
|
44
|
-
class ClassificationNormalizer
|
45
|
-
attr_reader :error_names, :tree, :normalized_data
|
46
|
-
|
47
|
-
def initialize(dwc_instance)
|
48
|
-
@dwc = dwc_instance
|
49
|
-
@core_fields = get_fields(@dwc.core)
|
50
|
-
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
|
51
|
-
@normalized_data = {}
|
52
|
-
@synonyms = {}
|
53
|
-
@parser = ParsleyStore.new(1,2)
|
54
|
-
@name_strings = {}
|
55
|
-
@vernacular_name_strings = {}
|
56
|
-
@error_names = []
|
57
|
-
@tree = {}
|
58
|
-
end
|
59
|
-
|
60
|
-
def add_name_string(name_string)
|
61
|
-
@name_strings[name_string] = 1 unless @name_strings[name_string]
|
62
|
-
end
|
63
|
-
|
64
|
-
def add_vernacular_name_string(name_string)
|
65
|
-
unless @vernacular_name_strings[name_string]
|
66
|
-
@vernacular_name_strings[name_string] = 1
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
def name_strings(opts = {})
|
71
|
-
opts = { with_hash: false }.merge(opts)
|
72
|
-
if !!opts[:with_hash]
|
73
|
-
@name_strings
|
74
|
-
else
|
75
|
-
@name_strings.keys
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
def vernacular_name_strings(opts = {})
|
80
|
-
opts = { with_hash: false }.merge(opts)
|
81
|
-
if !!opts[:with_hash]
|
82
|
-
@vernacular_name_strings
|
83
|
-
else
|
84
|
-
@vernacular_name_strings.keys
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
def normalize(opts = {})
|
89
|
-
opts = { :with_canonical_names => true,
|
90
|
-
:with_extensions => true }.merge(opts)
|
91
|
-
@with_canonical_names = !!opts[:with_canonical_names]
|
92
|
-
DarwinCore.logger_write(@dwc.object_id,
|
93
|
-
'Started normalization of the classification')
|
94
|
-
ingest_core
|
95
|
-
DarwinCore.logger_write(@dwc.object_id,
|
96
|
-
'Calculating the classification parent/child paths')
|
97
|
-
has_parent_id? ?
|
98
|
-
calculate_classification_path :
|
99
|
-
@normalized_data.keys.each { |id| @tree[id] = {} }
|
100
|
-
DarwinCore.logger_write(@dwc.object_id, 'Ingesting data from extensions')
|
101
|
-
if !!opts[:with_extensions]
|
102
|
-
ingest_extensions
|
103
|
-
end
|
104
|
-
@normalized_data
|
105
|
-
end
|
106
|
-
|
107
|
-
private
|
108
|
-
|
109
|
-
def get_canonical_name(a_scientific_name)
|
110
|
-
if @with_canonical_names
|
111
|
-
canonical_name = @parser.parse(a_scientific_name,
|
112
|
-
:canonical_only => true)
|
113
|
-
canonical_name.to_s.empty? ? a_scientific_name : canonical_name
|
114
|
-
else
|
115
|
-
nil
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
def get_fields(element)
|
120
|
-
data = element.fields.inject({}) do |res, f|
|
121
|
-
field = f[:term].split('/')[-1]
|
122
|
-
field = field ? field.downcase.to_sym : ''
|
123
|
-
res[field] = f[:index].to_i
|
124
|
-
res
|
125
|
-
end
|
126
|
-
data[:id] = element.id[:index] if element.id
|
127
|
-
data
|
128
|
-
end
|
129
|
-
|
130
|
-
def status_synonym?(status)
|
131
|
-
status && !!status.match(/^syn/)
|
132
|
-
end
|
133
|
-
|
134
|
-
def add_synonym_from_core(taxon_id, row)
|
135
|
-
@synonyms[row[@core_fields[:id]]] = taxon_id
|
136
|
-
taxon = @normalized_data[row[taxon_id]] ?
|
137
|
-
@normalized_data[row[taxon_id]] :
|
138
|
-
@normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
|
139
|
-
synonym = SynonymNormalized.new(
|
140
|
-
row[@core_fields[:id]],
|
141
|
-
row[@core_fields[:scientificname]],
|
142
|
-
row[@core_fields[:canonicalname]],
|
143
|
-
@core_fields[:taxonomicstatus] ?
|
144
|
-
row[@core_fields[:taxonomicstatus]] :
|
145
|
-
nil,
|
146
|
-
@core_fields[:source] ? row[@core_fields[:source]] : nil,
|
147
|
-
@core_fields[:localid] ? row[@core_fields[:localid]] : nil,
|
148
|
-
@core_fields[:globalid] ? row[@core_fields[:globalid]] : nil,
|
149
|
-
)
|
150
|
-
taxon.synonyms << synonym
|
151
|
-
add_name_string(synonym.name)
|
152
|
-
add_name_string(synonym.canonical_name)
|
153
|
-
end
|
154
|
-
|
155
|
-
def set_scientific_name(row, fields)
|
156
|
-
row[fields[:scientificname]] = 'N/A' unless row[fields[:scientificname]]
|
157
|
-
canonical_name = nil
|
158
|
-
scientific_name = row[fields[:scientificname]].strip
|
159
|
-
if separate_canonical_and_authorship?(row, fields)
|
160
|
-
if @with_canonical_names
|
161
|
-
canonical_name = row[fields[:scientificname]].strip
|
162
|
-
end
|
163
|
-
scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
|
164
|
-
else
|
165
|
-
if @with_canonical_names
|
166
|
-
canonical_name = get_canonical_name(row[fields[:scientificname]])
|
167
|
-
end
|
168
|
-
end
|
169
|
-
fields[:canonicalname] = row.size
|
170
|
-
row << canonical_name
|
171
|
-
row[fields[:scientificname]] = scientific_name
|
172
|
-
end
|
173
|
-
|
174
|
-
def separate_canonical_and_authorship?(row, fields)
|
175
|
-
authorship = ''
|
176
|
-
if fields[:scientificnameauthorship]
|
177
|
-
authorship = row[fields[:scientificnameauthorship]].to_s.strip
|
178
|
-
end
|
179
|
-
!(authorship.empty? || row[fields[:scientificname]].index(authorship))
|
180
|
-
end
|
181
|
-
|
182
|
-
def ingest_core
|
183
|
-
@normalized_data = {}
|
184
|
-
has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
|
185
|
-
raise DarwinCore::CoreFileError.new('Darwin Core core fields must ' +
|
186
|
-
'contain taxon id and scientific name') unless has_name_and_id
|
187
|
-
@dwc.core.read do |rows|
|
188
|
-
rows[1].each do |error|
|
189
|
-
@error_names << { :data => error,
|
190
|
-
:error => :reading_or_encoding_error }
|
191
|
-
end
|
192
|
-
rows[0].each do |r|
|
193
|
-
set_scientific_name(r, @core_fields)
|
194
|
-
#core has AcceptedNameUsageId
|
195
|
-
if @core_fields[:acceptednameusageid] &&
|
196
|
-
r[@core_fields[:acceptednameusageid]] &&
|
197
|
-
r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
|
198
|
-
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
|
199
|
-
elsif !@core_fields[:acceptednameusageid] &&
|
200
|
-
@core_fields[:taxonomicstatus] &&
|
201
|
-
status_synonym?(r[@core_fields[:taxonomicstatus]])
|
202
|
-
add_synonym_from_core(parent_id, r) if has_parent_id?
|
203
|
-
else
|
204
|
-
unless @normalized_data[r[@core_fields[:id]]]
|
205
|
-
if gnub_archive?
|
206
|
-
new_taxon = DarwinCore::GnubTaxon.new
|
207
|
-
else
|
208
|
-
new_taxon = DarwinCore::TaxonNormalized.new
|
209
|
-
end
|
210
|
-
@normalized_data[r[@core_fields[:id]]] = new_taxon
|
211
|
-
end
|
212
|
-
taxon = @normalized_data[r[@core_fields[:id]]]
|
213
|
-
if gnub_archive?
|
214
|
-
taxon.uuid = r[@core_fields[:originalnameusageid]]
|
215
|
-
taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
|
216
|
-
split('|')
|
217
|
-
end
|
218
|
-
taxon.id = r[@core_fields[:id]]
|
219
|
-
taxon.current_name = r[@core_fields[:scientificname]]
|
220
|
-
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
|
221
|
-
taxon.parent_id = has_parent_id? ? r[parent_id] : nil
|
222
|
-
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
|
223
|
-
if @core_fields[:taxonomicstatus]
|
224
|
-
taxon.status = r[@core_fields[:taxonomicstatus]]
|
225
|
-
end
|
226
|
-
taxon.source = r[@core_fields[:source]] if @core_fields[:source]
|
227
|
-
taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
|
228
|
-
if @core_fields[:globalid]
|
229
|
-
taxon.global_id = r[@core_fields[:globalid]]
|
230
|
-
end
|
231
|
-
taxon.linnean_classification_path =
|
232
|
-
get_linnean_classification_path(r, taxon)
|
233
|
-
add_name_string(taxon.current_name)
|
234
|
-
has_canonical = taxon.current_name_canonical &&
|
235
|
-
!taxon.current_name_canonical.empty?
|
236
|
-
add_name_string(taxon.current_name_canonical) if has_canonical
|
237
|
-
end
|
238
|
-
end
|
239
|
-
end
|
240
|
-
end
|
241
|
-
|
242
|
-
def has_parent_id?
|
243
|
-
@has_parent_id ||= @core_fields.has_key?(:highertaxonid) ||
|
244
|
-
@core_fields.has_key?(:parentnameusageid)
|
245
|
-
end
|
246
|
-
|
247
|
-
def parent_id
|
248
|
-
parent_id_field = @core_fields[:highertaxonid] ||
|
249
|
-
@core_fields[:parentnameusageid]
|
250
|
-
end
|
251
|
-
|
252
|
-
def calculate_classification_path
|
253
|
-
@paths_num = 0
|
254
|
-
@normalized_data.each do |taxon_id, taxon|
|
255
|
-
next if !taxon.classification_path_id.empty?
|
256
|
-
res = get_classification_path(taxon)
|
257
|
-
next if res == 'error'
|
258
|
-
end
|
259
|
-
end
|
260
|
-
|
261
|
-
def get_classification_path(taxon)
|
262
|
-
return if !taxon.classification_path_id.empty?
|
263
|
-
@paths_num += 1
|
264
|
-
if @paths_num % 10000 == 0
|
265
|
-
DarwinCore.logger_write(@dwc.object_id,
|
266
|
-
"Calculated %s paths" % @paths_num)
|
267
|
-
end
|
268
|
-
current_node = {taxon.id => {}}
|
269
|
-
if DarwinCore.nil_field?(taxon.parent_id)
|
270
|
-
if @with_canonical_names
|
271
|
-
taxon.classification_path << taxon.current_name_canonical
|
272
|
-
end
|
273
|
-
taxon.classification_path_id << taxon.id
|
274
|
-
@tree.merge!(current_node)
|
275
|
-
else
|
276
|
-
parent_cp = parent_cpid = nil
|
277
|
-
if @normalized_data[taxon.parent_id]
|
278
|
-
if @with_canonical_names
|
279
|
-
parent_cp = @normalized_data[taxon.parent_id].classification_path
|
280
|
-
end
|
281
|
-
parent_cpid = @normalized_data[taxon.parent_id].
|
282
|
-
classification_path_id
|
283
|
-
else
|
284
|
-
current_parent = @normalized_data[@synonyms[taxon.parent_id]]
|
285
|
-
if current_parent
|
286
|
-
error = 'WARNING: The parent of the taxon ' +
|
287
|
-
"\'#{taxon.current_name}\' is deprecated"
|
288
|
-
@error_names << {:data => taxon,
|
289
|
-
:error => :deprecated_parent,
|
290
|
-
:current_parent => current_parent }
|
291
|
-
|
292
|
-
if @with_canonical_names
|
293
|
-
parent_cp = current_parent.classification_path
|
294
|
-
end
|
295
|
-
parent_cpid = current_parent.classification_path_id
|
296
|
-
else
|
297
|
-
error = 'WARNING: The parent of the taxon ' +
|
298
|
-
"\'#{taxon.current_name}\' not found"
|
299
|
-
@error_names << {:data => taxon,
|
300
|
-
:error => :deprecated_parent, :current_parent => nil}
|
301
|
-
end
|
302
|
-
end
|
303
|
-
return 'error' unless parent_cpid
|
304
|
-
if parent_cpid.empty?
|
305
|
-
res = 'error'
|
306
|
-
begin
|
307
|
-
res = get_classification_path(@normalized_data[taxon.parent_id])
|
308
|
-
rescue SystemStackError
|
309
|
-
@error_names << {:data => taxon,
|
310
|
-
:error => :too_deep_hierarchy, :current_parent => nil}
|
311
|
-
end
|
312
|
-
return res if res == 'error'
|
313
|
-
if @with_canonical_names
|
314
|
-
taxon.classification_path += @normalized_data[taxon.parent_id].
|
315
|
-
classification_path + [taxon.current_name_canonical]
|
316
|
-
end
|
317
|
-
taxon.classification_path_id += @normalized_data[taxon.parent_id].
|
318
|
-
classification_path_id + [taxon.id]
|
319
|
-
parent_node = @normalized_data[taxon.parent_id].
|
320
|
-
classification_path_id.inject(@tree) {|node, id| node[id]}
|
321
|
-
parent_node.merge!(current_node)
|
322
|
-
else
|
323
|
-
taxon.classification_path += parent_cp +
|
324
|
-
[taxon.current_name_canonical] if @with_canonical_names
|
325
|
-
taxon.classification_path_id += parent_cpid + [taxon.id]
|
326
|
-
parent_node = @normalized_data[taxon.parent_id].
|
327
|
-
classification_path_id.inject(@tree) {|node, id| node[id]}
|
328
|
-
begin
|
329
|
-
parent_node.merge!(current_node)
|
330
|
-
rescue NoMethodError => e
|
331
|
-
DarwinCore.logger_write(@dwc.object_id,
|
332
|
-
"Error '%s' taxon %s" % [e.message, taxon.id])
|
333
|
-
return 'error'
|
334
|
-
end
|
335
|
-
end
|
336
|
-
end
|
337
|
-
end
|
338
|
-
|
339
|
-
def ingest_extensions
|
340
|
-
@extensions.each do |e|
|
341
|
-
ext, fields = *e
|
342
|
-
ingest_synonyms(e) if (File.split(e[0].file_path).
|
343
|
-
last.match(/synonym/i) &&
|
344
|
-
fields.keys.include?(:scientificname))
|
345
|
-
ingest_vernaculars(e) if fields.keys.include? :vernacularname
|
346
|
-
end
|
347
|
-
end
|
348
|
-
|
349
|
-
def ingest_synonyms(extension)
|
350
|
-
DarwinCore.logger_write(@dwc.object_id, 'Ingesting synonyms extension')
|
351
|
-
ext, fields = *extension
|
352
|
-
ext.read do |rows|
|
353
|
-
rows[0].each do |r|
|
354
|
-
set_scientific_name(r, fields)
|
355
|
-
synonym = SynonymNormalized.new(
|
356
|
-
nil,
|
357
|
-
r[fields[:scientificname]],
|
358
|
-
r[fields[:canonicalname]],
|
359
|
-
fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil,
|
360
|
-
fields[:source] ? r[fields[:source]] : nil,
|
361
|
-
fields[:localid] ? r[fields[:localid]] : nil,
|
362
|
-
fields[:globalid] ? r[fields[:globalid]] : nil,
|
363
|
-
)
|
364
|
-
if @normalized_data[r[fields[:id]]]
|
365
|
-
@normalized_data[r[fields[:id]]].synonyms << synonym
|
366
|
-
add_name_string(synonym.name)
|
367
|
-
add_name_string(synonym.canonical_name)
|
368
|
-
else
|
369
|
-
@error_names << { :taxon => synonym,
|
370
|
-
:error => :synonym_of_unknown_taxa }
|
371
|
-
end
|
372
|
-
end
|
373
|
-
end
|
374
|
-
end
|
375
|
-
|
376
|
-
def ingest_vernaculars(extension)
|
377
|
-
DarwinCore.logger_write(@dwc.object_id,
|
378
|
-
'Ingesting vernacular names extension')
|
379
|
-
ext, fields = *extension
|
380
|
-
ext.read do |rows|
|
381
|
-
rows[0].each do |r|
|
382
|
-
|
383
|
-
language = nil
|
384
|
-
if fields[:language]
|
385
|
-
language = r[fields[:language]]
|
386
|
-
elsif fields[:languagecode]
|
387
|
-
language = r[fields[:languagecode]]
|
388
|
-
end
|
389
|
-
|
390
|
-
locality = fields[:locality] ? r[fields[:locality]] : nil
|
391
|
-
|
392
|
-
country_code = fields[:countrycode] ? r[fields[:countrycode]] : nil
|
393
|
-
|
394
|
-
vernacular = VernacularNormalized.new(
|
395
|
-
r[fields[:vernacularname]],
|
396
|
-
language,
|
397
|
-
locality,
|
398
|
-
country_code)
|
399
|
-
if @normalized_data[r[fields[:id]]]
|
400
|
-
@normalized_data[r[fields[:id]]].vernacular_names << vernacular
|
401
|
-
add_vernacular_name_string(vernacular.name)
|
402
|
-
else
|
403
|
-
@error_names << { :vernacular_name => vernacular,
|
404
|
-
:error => :vernacular_of_unknown_taxa }
|
405
|
-
end
|
406
|
-
end
|
407
|
-
end
|
408
|
-
end
|
409
|
-
|
410
|
-
#Collect linnean classification path only on species level
|
411
|
-
def get_linnean_classification_path(row, taxon)
|
412
|
-
res = []
|
413
|
-
[:kingdom, :phylum, :class,
|
414
|
-
:order, :family, :genus, :subgenus].each do |clade|
|
415
|
-
res << [row[@core_fields[clade]], clade] if @core_fields[clade]
|
416
|
-
end
|
417
|
-
res
|
418
|
-
end
|
419
|
-
|
420
|
-
def gnub_archive?
|
421
|
-
@core_fields[:originalnameusageidpath]
|
422
|
-
end
|
423
|
-
end
|
424
|
-
end
|