dwc-archive 0.9.6 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +31 -0
- data/.rspec +3 -0
- data/.rubocop.yml +23 -0
- data/.ruby-version +1 -0
- data/.travis.yml +4 -5
- data/CHANGELOG +15 -7
- data/Gemfile +3 -15
- data/LICENSE +1 -1
- data/README.md +135 -111
- data/Rakefile +13 -54
- data/dwc-archive.gemspec +37 -0
- data/features/step_definitions/dwc-creator_steps.rb +5 -5
- data/features/step_definitions/dwc-reader_steps.rb +47 -28
- data/features/support/env.rb +1 -1
- data/lib/dwc_archive.rb +121 -0
- data/lib/dwc_archive/archive.rb +59 -0
- data/lib/dwc_archive/classification_normalizer.rb +382 -0
- data/lib/dwc_archive/core.rb +25 -0
- data/lib/{dwc-archive → dwc_archive}/errors.rb +2 -0
- data/lib/dwc_archive/expander.rb +85 -0
- data/lib/{dwc-archive → dwc_archive}/extension.rb +5 -3
- data/lib/dwc_archive/generator.rb +90 -0
- data/lib/dwc_archive/generator_eml_xml.rb +116 -0
- data/lib/dwc_archive/generator_meta_xml.rb +72 -0
- data/lib/dwc_archive/gnub_taxon.rb +14 -0
- data/lib/dwc_archive/ingester.rb +106 -0
- data/lib/dwc_archive/metadata.rb +56 -0
- data/lib/dwc_archive/taxon_normalized.rb +23 -0
- data/lib/dwc_archive/version.rb +6 -0
- data/lib/dwc_archive/xml_reader.rb +89 -0
- data/spec/files/file with characters(3).gz b/data/spec/files/file with → characters(3).tar.gz +0 -0
- data/spec/files/generator_eml.xml +47 -0
- data/spec/files/generator_meta.xml +19 -0
- data/spec/lib/classification_normalizer_spec.rb +214 -0
- data/spec/lib/core_spec.rb +100 -0
- data/spec/lib/darwin_core_spec.rb +249 -0
- data/spec/lib/generator_eml_xml_spec.rb +22 -0
- data/spec/lib/generator_meta_xml_spec.rb +22 -0
- data/spec/lib/generator_spec.rb +124 -0
- data/spec/lib/gnub_taxon_spec.rb +32 -0
- data/spec/lib/metadata_spec.rb +89 -0
- data/spec/lib/taxon_normalized_spec.rb +142 -0
- data/spec/lib/xml_reader_spec.rb +11 -11
- data/spec/spec_helper.rb +78 -6
- metadata +180 -92
- data/.rvmrc +0 -1
- data/Gemfile.lock +0 -155
- data/VERSION +0 -1
- data/lib/dwc-archive.rb +0 -95
- data/lib/dwc-archive/.expander.rb.swo +0 -0
- data/lib/dwc-archive/archive.rb +0 -37
- data/lib/dwc-archive/classification_normalizer.rb +0 -424
- data/lib/dwc-archive/core.rb +0 -17
- data/lib/dwc-archive/expander.rb +0 -80
- data/lib/dwc-archive/generator.rb +0 -75
- data/lib/dwc-archive/generator_eml_xml.rb +0 -84
- data/lib/dwc-archive/generator_meta_xml.rb +0 -50
- data/lib/dwc-archive/ingester.rb +0 -101
- data/lib/dwc-archive/metadata.rb +0 -42
- data/lib/dwc-archive/utf_regex_ruby18.rb +0 -10
- data/lib/dwc-archive/xml_reader.rb +0 -64
- data/spec/lib/dwc-archive_spec.rb +0 -250
- data/spec/spec.opts +0 -1
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Represents the core data file of a DarwinCore Archive, as declared by the
  # core section of the archive's parsed meta.xml.
  class Core
    include DarwinCore::Ingester

    # Attributes of the core's id element; nil when meta.xml declares none.
    attr_reader :id

    # @param dwc [Object] archive wrapper; must respond to #archive, whose
    #   result provides #files_path and the parsed #meta hash
    # @raise [DarwinCore::CoreFileError] when no core section can be found
    def initialize(dwc)
      @dwc = dwc
      @archive = @dwc.archive
      @path = @archive.files_path
      @data = find_core_data
      id_node = @data[:id]
      # Guarded so that a core without an id element does not crash on nil.
      @id = id_node && id_node[:attributes]
      init_attributes
    end

    private

    # Looks up the :core entry under the first (root) key of the parsed
    # meta.xml and fails with CoreFileError when it cannot be found, including
    # the case where the parsed hash is empty.
    def find_core_data
      meta = @archive.meta
      root = meta[meta.keys.first]
      core = root && root[:core]
      return core if core

      raise DarwinCore::CoreFileError,
            "Cannot find core in meta.xml, is meta.xml valid?"
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Unpacks a compressed archive (tar.gz or zip) into a scratch directory and
  # locates the directory that contains meta.xml.
  class Expander
    # @param archive_path [String] path to the compressed archive
    # @param tmp_dir [String] directory under which the scratch dir is created
    def initialize(archive_path, tmp_dir)
      @archive_path = archive_path
      @tmp_dir = tmp_dir
      @dir_path = DarwinCore.random_path(tmp_dir)
      @unpacker = init_unpacker
    end

    # Removes any previous scratch directory and unpacks the archive into it.
    #
    # @return [true] when the external unpacker exits with status 0
    # @raise [DarwinCore::FileNotFoundError] when the archive does not exist
    # @raise [DarwinCore::UnpackingError] when the extension is unsupported or
    #   the unpacker fails; the scratch directory is cleaned first
    def unpack
      clean
      raise DarwinCore::FileNotFoundError unless File.exist?(@archive_path)

      # Kernel#system returns true only for a zero exit status, so no extra
      # inspection of the child status is required.
      return true if @unpacker && @unpacker.call(@dir_path, @archive_path)

      clean
      raise DarwinCore::UnpackingError
    end

    # Directory holding meta.xml: the scratch directory itself or one of its
    # immediate subdirectories; nil when neither contains meta.xml.
    def path
      @path ||= files_path
    end

    # Delegates removal of the scratch directory to DarwinCore.clean.
    def clean
      DarwinCore.clean(@dir_path)
    end

    # Delegates listing of the unpacked files to DarwinCore.files.
    def files
      DarwinCore.files(path)
    end

    private

    # Picks the unpacker by file extension; nil for unsupported archives.
    def init_unpacker
      return tar_unpacker if @archive_path =~ /\.tar\.gz\z/i
      return zip_unpacker if @archive_path =~ /zip\z/i

      nil
    end

    def tar_unpacker
      proc do |tmp_path, archive_path|
        FileUtils.mkdir tmp_path
        run_quietly("tar", "-zxf", archive_path, "-C", tmp_path)
      end
    end

    def zip_unpacker
      proc do |tmp_path, archive_path|
        run_quietly("unzip", "-qq", "-d", tmp_path, archive_path)
      end
    end

    # Runs a command without a shell (arguments are passed verbatim, so
    # spaces, quotes and backslashes in paths need no escaping) and discards
    # its output. Returns true/false/nil exactly like Kernel#system.
    def run_quietly(*command)
      system(*command, out: File::NULL, err: File::NULL)
    end

    # Sorted directory entries without the "." and ".." pseudo-entries.
    def path_entries(dir)
      Dir.entries(dir).reject { |e| %w[. ..].include?(e) }.sort
    end

    def files_path
      entries = path_entries(@dir_path)
      entries.include?("meta.xml") ? @dir_path : search_for_file_path(entries)
    end

    # First immediate subdirectory that contains meta.xml, or nil.
    def search_for_file_path(entries)
      entries.map { |e| File.join(@dir_path, e) }.find do |candidate|
        FileTest.directory?(candidate) &&
          path_entries(candidate).include?("meta.xml")
      end
    end
  end
end
|
@@ -1,8 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
class DarwinCore
|
4
|
+
# Represents extensions of DarwinCore Archive
|
2
5
|
class Extension
|
3
6
|
include DarwinCore::Ingester
|
4
7
|
attr_reader :coreid
|
5
|
-
alias
|
8
|
+
alias id coreid
|
6
9
|
|
7
10
|
def initialize(dwc, data)
|
8
11
|
@dwc = dwc
|
@@ -10,8 +13,7 @@ class DarwinCore
|
|
10
13
|
@path = @archive.files_path
|
11
14
|
@data = data
|
12
15
|
@coreid = @data[:coreid][:attributes]
|
13
|
-
|
16
|
+
init_attributes
|
14
17
|
end
|
15
|
-
|
16
18
|
end
|
17
19
|
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # Builds a DarwinCore Archive: writes core and extension CSV files plus
  # meta.xml and eml.xml into a scratch directory and packs them with tar.
  class Generator
    attr_reader :eml_xml_data, :path

    # @param dwc_path [String] path of the tar.gz archive that #pack creates
    # @param tmp_dir [String] directory under which the scratch dir is created
    def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
      @dwc_path = dwc_path
      @path = DarwinCore.random_path(tmp_dir)
      FileUtils.mkdir(@path)
      @meta_xml_data = { extensions: [] }
      @eml_xml_data = { id: nil, title: nil, authors: [], abstract: nil,
                        citation: nil, url: nil }
      @write = "w:utf-8"
    end

    # Delegates removal of the scratch directory to DarwinCore.clean.
    def clean
      DarwinCore.clean(@path)
    end

    # Writes the core CSV file and records its description for meta.xml.
    #
    # @param data [Array<Array>] rows; the first row is the header and every
    #   header cell must be an http(s) term URI
    # @param file_name [String] name of the CSV file inside the archive
    # @param keep_headers [Boolean] write a header row of short term names
    # @raise [DarwinCore::GeneratorError] when the header is missing/invalid
    def add_core(data, file_name, keep_headers = true)
      opts = { type: "core", data: data, file_name: file_name,
               keep_headers: keep_headers }
      prepare_csv_file(opts)
    end

    # Same as #add_core for an extension file; row_type is recorded as the
    # extension's rowType in meta.xml.
    def add_extension(data, file_name, keep_headers = true,
                      row_type = "http://rs.tdwg.org/dwc/terms/Taxon")
      opts = { type: "extension", data: data, file_name: file_name,
               keep_headers: keep_headers, row_type: row_type }
      prepare_csv_file(opts)
    end

    def add_meta_xml
      meta = DarwinCore::Generator::MetaXml.new(@meta_xml_data, @path)
      meta.create
    end

    def add_eml_xml(data)
      @eml_xml_data = data
      eml = DarwinCore::Generator::EmlXml.new(@eml_xml_data, @path)
      eml.create
    end

    def files
      DarwinCore.files(@path)
    end

    # Packs every non-hidden entry of the scratch directory into the archive.
    # tar is run without a shell and with the scratch directory as its working
    # directory, so paths containing spaces or quotes are handled and tar
    # never runs in the wrong directory.
    #
    # @return [true, false, nil] result of Kernel#system; false when the
    #   scratch directory no longer exists
    def pack
      return false unless File.directory?(@path)

      entries = Dir.entries(@path).reject { |e| e.start_with?(".") }.sort
      system("tar", "-zcf", @dwc_path, *entries, chdir: @path)
    end

    private

    # Validates the header first, then records the meta.xml description and
    # writes the rows. The caller's data array is left untouched and no file
    # is created when validation fails.
    def prepare_csv_file(opts)
      header, *rows = opts[:data]
      fields = init_fields(header, opts[:type])
      attributes = prepare_attributes(opts, header)
      if opts[:type] == "core"
        @meta_xml_data[:core] = attributes
      else
        @meta_xml_data[:extensions] << attributes
      end
      rows.unshift(fields) if opts[:keep_headers]
      CSV.open(File.join(@path, opts[:file_name]), @write) do |csv|
        rows.each { |row| csv << row }
      end
      nil
    end

    # Description of one data file as consumed by Generator::MetaXml.
    def prepare_attributes(opts, header)
      res = { fields: header,
              ignoreHeaderLines: opts[:keep_headers] ? 1 : 0,
              location: opts[:file_name] }
      res[:rowType] = opts[:row_type] if opts[:row_type]
      res
    end

    # Converts header term URIs to their last path segment.
    def init_fields(header, file_type)
      err = "No header in #{file_type} data, or header fields are not urls"
      raise DarwinCore::GeneratorError, err unless header.respond_to?(:map)

      header.map do |f|
        f = f.to_s.strip
        raise DarwinCore::GeneratorError, err unless f =~ %r{\Ahttps?://}

        f.split("/")[-1]
      end
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
class DarwinCore
  class Generator
    # Creates the eml.xml file with meta information about the archive.
    class EmlXml
      # Namespace and schema attributes placed on the root eml element.
      SCHEMA_DATA = {
        :"xml:lang" => "en",
        :"xmlns:eml" => "eml://ecoinformatics.org/eml-2.1.1",
        :"xmlns:md" => "eml://ecoinformatics.org/methods-2.1.1",
        :"xmlns:proj" => "eml://ecoinformatics.org/project-2.1.1",
        :"xmlns:d" => "eml://ecoinformatics.org/dataset-2.1.1",
        :"xmlns:res" => "eml://ecoinformatics.org/resource-2.1.1",
        :"xmlns:dc" => "http://purl.org/dc/terms/",
        :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
        :"xsi:schemaLocation" => "eml://ecoinformatics.org/eml-2.1.1 "\
          "http://rs.gbif.org/schema/eml-gbif-profile/1.0.1/eml.xsd"
      }.freeze

      # @param data [Hash] metadata; keys read by this class are :id, :system,
      #   :title, :license, :abstract, :citation, :logo_url, :authors and
      #   :metadata_providers
      # @param path [String] directory in which eml.xml is written
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the EML document and writes it to eml.xml under the path
      # given to the constructor.
      #
      # @return [nil]
      def create
        schema_data = {
          packageId: "#{@data[:id]}/#{timestamp}",
          system: @data[:system] || "http://globalnames.org"
        }.merge(SCHEMA_DATA)
        builder = Nokogiri::XML::Builder.new do |xml|
          xml.eml(schema_data) { build_body(xml) }
        end
        save_eml(builder)
      end

      private

      def build_body(xml)
        build_dataset(xml)
        build_additional_metadata(xml)
        # Assign the first declared namespace to the element being built.
        xml.parent.namespace = xml.parent.namespace_definitions.first
      end

      # Serializes before opening the file, and uses the block form of
      # File.open so the handle is closed even when writing fails.
      def save_eml(builder)
        data = builder.to_xml
        File.open(File.join(@path, "eml.xml"), @write) { |f| f.write(data) }
        nil
      end

      def build_dataset(xml)
        xml.dataset(id: @data[:id]) do
          xml.title(@data[:title])
          xml.license(@data[:license])
          contacts = []
          build_authors(xml, contacts)
          build_metadata_providers(xml)
          xml.pubDate(Time.now.to_s)
          build_abstract(xml)
          build_contacts(xml, contacts)
        end
      end

      def build_abstract(xml)
        xml.abstract { xml.para(@data[:abstract]) }
      end

      # Emits one contact element per creator id collected by build_authors.
      def build_contacts(xml, contacts)
        contacts.each { |contact| xml.contact { xml.references(contact) } }
      end

      def build_metadata_providers(xml)
        (@data[:metadata_providers] || []).each do |a|
          xml.metadataProvider { build_person(xml, a) }
        end
      end

      # Emits a creator element per author (ids start at 1) and appends each
      # id to contacts. A missing :authors entry is treated as no authors.
      def build_authors(xml, contacts)
        (@data[:authors] || []).each_with_index do |a, i|
          creator_id = i + 1
          contacts << creator_id
          xml.creator(id: creator_id, scope: "document") do
            build_person(xml, a)
          end
        end
      end

      def build_additional_metadata(xml)
        xml.additionalMetadata do
          xml.metadata do
            xml.citation(@data[:citation])
            xml.resourceLogoUrl(@data[:logo_url]) if @data[:logo_url]
          end
        end
      end

      def build_person(xml, data)
        a = data
        xml.individualName do
          xml.givenName(a[:first_name])
          xml.surName(a[:last_name])
        end
        xml.organizationName(a[:organization]) if a[:organization]
        xml.positionName(a[:position]) if a[:position]
        xml.onlineUrl(a[:url]) if a[:url]
        xml.electronicMailAddress(a[:email])
      end

      # Current UTC time as "year-month-day::hour:min:sec" with no zero
      # padding (the "-" flag suppresses padding).
      def timestamp
        Time.now.getutc.strftime("%Y-%-m-%-d::%-H:%-M:%-S")
      end
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
class DarwinCore
  class Generator
    # Creates the DarwinCore meta.xml file describing core and extensions.
    class MetaXml
      # @param data [Hash] :core and :extensions descriptions; each has
      #   :fields, :ignoreHeaderLines and :location (extensions also :rowType)
      # @param path [String] directory in which meta.xml is written
      def initialize(data, path)
        @data = data
        @path = path
        @write = "w:utf-8"
      end

      # Builds the document and writes it to meta.xml under the path given
      # to the constructor.
      #
      # @return [nil]
      # @raise [DarwinCore::GeneratorError] when core data is missing or a
      #   file does not have exactly one taxonID field
      def create
        schema_uri = "http://rs.tdwg.org/dwc/terms/xsd/archive/ "\
          "http://darwincore.googlecode.com/svn/trunk/text/tdwg_dwc_text.xsd"
        builder = Nokogiri::XML::Builder.new do |xml|
          opts = { encoding: "UTF-8", fieldsTerminatedBy: ",",
                   fieldsEnclosedBy: '"', linesTerminatedBy: "\n",
                   rowType: "http://rs.tdwg.org/dwc/terms/Taxon" }
          build_archive(xml, opts, schema_uri)
        end
        save_meta(builder)
      end

      private

      # Serializes before opening the file, and uses the block form of
      # File.open so the handle is closed even when writing fails.
      def save_meta(builder)
        meta_xml_data = builder.to_xml
        File.open(File.join(@path, "meta.xml"), @write) do |meta_file|
          meta_file.write(meta_xml_data)
        end
        nil
      end

      def build_archive(xml, opts, schema_uri)
        xml.archive(xmlns: "http://rs.tdwg.org/dwc/text/",
                    :"xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance",
                    :"xsi:schemaLocation" => schema_uri) do
          build_core(xml, opts)
          build_extensions(xml, opts)
        end
      end

      def build_core(xml, opts)
        core = @data[:core]
        unless core
          raise DarwinCore::GeneratorError,
                "No core data, add core before creating meta.xml"
        end
        xml.core(opts.merge(ignoreHeaderLines: core[:ignoreHeaderLines])) do
          xml.files { xml.location(core[:location]) }
          taxon_id, fields = find_taxon_id(core[:fields])
          xml.id_(index: taxon_id[1])
          fields.each { |f| xml.field(term: f[0], index: f[1]) }
        end
      end

      def build_extensions(xml, opts)
        (@data[:extensions] || []).each do |e|
          xml.extension(opts.merge(ignoreHeaderLines: e[:ignoreHeaderLines],
                                   rowType: e[:rowType])) do
            xml.files { xml.location(e[:location]) }
            taxon_id, fields = find_taxon_id(e[:fields])
            xml.coreid(index: taxon_id[1])
            fields.each { |f| xml.field(term: f[0], index: f[1]) }
          end
        end
      end

      # Pairs every stripped term with its column index and separates the
      # single term ending in "/taxonID" (case-insensitive) from the rest.
      #
      # @return [Array] [[taxon_term, index], [[term, index], ...]]
      # @raise [DarwinCore::GeneratorError] unless exactly one term matches
      def find_taxon_id(data)
        indexed = data.each_with_index.map { |f, i| [f.strip, i] }
        taxon_id, fields = indexed.partition { |f| f[0] =~ %r{/taxonid\z}i }
        if taxon_id.size != 1
          raise DarwinCore::GeneratorError,
                "Expected exactly one taxonID field, found #{taxon_id.size}"
        end
        [taxon_id[0], fields]
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class DarwinCore
  # TaxonNormalized variant for the special case of Global Names Usage Bank
  # data: adds uuid and uuid_path attributes to the inherited state.
  class GnubTaxon < TaxonNormalized
    attr_accessor :uuid
    attr_accessor :uuid_path

    # Runs the parent initializer, then starts with an empty uuid_path and
    # no uuid.
    def initialize
      super
      @uuid_path = []
      @uuid = nil
    end
  end
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
class DarwinCore
  # Shared CSV-reading behaviour for classes that represent one data file of
  # the archive. The including class is expected to set @dwc, @path and @data
  # and then call init_attributes (see the private section).
  module Ingester
    attr_reader :data, :properties, :encoding, :file_path, :fields,
                :line_separator, :quote_character, :ignore_headers

    # Field delimiter used when parsing the file. The value is stored in
    # @field_separator, so a plain attr_reader under this name would always
    # return nil.
    def fields_separator
      @field_separator
    end

    # Number of newline-terminated lines in the data file (memoized).
    def size
      @size ||= init_size
    end

    # Reads the data file row by row.
    #
    # Rows with fewer columns than the highest declared field index, or with
    # bytes that are not valid UTF-8, are collected as errors. When a block
    # is given it receives [rows, errors] each time the row index reaches a
    # multiple of batch_size and once more after the last row; the
    # accumulators are reset after every in-loop yield.
    #
    # @param batch_size [Integer] rows between progress log entries/yields
    # @return [Array] [rows, errors] accumulated since the last in-loop yield
    def read(batch_size = 10_000)
      DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
      res = []
      errors = []
      args = define_csv_args
      min_size = @fields.map { |f| f[:index].to_i }.max + 1
      # Block form closes the handle even when parsing raises.
      File.open(@file_path) do |io|
        CSV.new(io, **args).each_with_index do |r, i|
          next if @ignore_headers && i == 0

          min_size > r.size ? errors << r : process_csv_row(res, errors, r)
          next if i == 0 || i % batch_size != 0

          DarwinCore.logger_write(@dwc.object_id,
                                  format("Ingested %s records from %s",
                                         i, name))
          next unless block_given?

          yield [res, errors]
          res = []
          errors = []
        end
      end
      yield [res, errors] if block_given?
      [res, errors]
    end

    private

    # CSV options; an empty quote character is replaced by "\b" (and stored
    # back in @quote_character) before being handed to CSV.
    def define_csv_args
      args = { col_sep: @field_separator }
      @quote_character = "\b" if @quote_character.empty?
      args.merge(quote_char: @quote_character)
    end

    # Lower-cased last segment of the including class name, used in logs.
    def name
      self.class.to_s.split("::")[-1].downcase
    end

    # Appends the row to result with every cell tagged as UTF-8, or to errors
    # when the row's bytes are not valid UTF-8.
    def process_csv_row(result, errors, row)
      if row.join("").force_encoding("utf-8").valid_encoding?
        result << row.map { |f| f.nil? ? nil : f.force_encoding("utf-8") }
      else
        errors << row
      end
    end

    # Derives reader settings from @data[:attributes] and resolves the file
    # path and field list.
    def init_attributes
      @properties = @data[:attributes]
      init_encoding
      @field_separator = init_field_separator
      @quote_character = @properties[:fieldsEnclosedBy] || ""
      @line_separator = @properties[:linesTerminatedBy] || "\n"
      @ignore_headers = @properties[:ignoreHeaderLines] &&
                        [1, true].include?(@properties[:ignoreHeaderLines])
      init_file_path
      init_fields
    end

    def init_encoding
      @encoding = @properties[:encoding] || "UTF-8"
      return if %w[utf-8 utf8 utf-16 utf16].include?(@encoding.downcase)

      raise DarwinCore::EncodingError,
            "No support for encodings other than utf-8 or utf-16 at the moment"
    end

    # Resolves the data file location from the first place that declares it.
    # The check happens before File.join, which would otherwise raise a
    # TypeError on nil and make the FileNotFoundError unreachable.
    def init_file_path
      file = @data[:location] ||
             @data[:attributes][:location] ||
             (@data[:files] && @data[:files][:location])
      raise DarwinCore::FileNotFoundError, "No file data" unless file

      @file_path = File.join(@path, file)
    end

    # Normalizes @data[:field] to an array (a single field arrives as a bare
    # hash, a missing one as nil) and extracts each field's attributes.
    def init_fields
      field = @data[:field]
      @data[:field] = [field].compact unless field.is_a?(Array)
      @fields = @data[:field].map { |f| f[:attributes] }
      return unless @fields.empty?

      raise DarwinCore::InvalidArchiveError, "No data fields are found"
    end

    def init_field_separator
      res = @properties[:fieldsTerminatedBy] || ","
      res = "\t" if res == "\\t"
      res
    end

    # Counts newline-terminated lines (what `wc -l` reports) without spawning
    # a shell, so paths with spaces or shell metacharacters are safe.
    def init_size
      File.open(@file_path, "rb") do |io|
        io.each_line.count { |line| line.end_with?("\n") }
      end
    end
  end
end
|