dwc-archive 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/features/step_definitions/dwc-archive_steps.rb +2 -2
- data/lib/dwc-archive/core.rb +2 -9
- data/lib/dwc-archive/errors.rb +1 -0
- data/lib/dwc-archive/extension.rb +3 -9
- data/lib/dwc-archive/ingester.rb +27 -9
- data/spec/files/data.tar.gz +0 -0
- metadata +22 -8
data/Rakefile
CHANGED
@@ -10,6 +10,7 @@ begin
|
|
10
10
|
gem.email = "dmozzherin at gmail dot com"
|
11
11
|
gem.homepage = "http://github.com/dimus/dwc-archive"
|
12
12
|
gem.authors = ["Dmitry Mozzherin"]
|
13
|
+
gem.add_dependency "fastercsv" if RUBY_VERSION.match /^1.8/
|
13
14
|
gem.add_development_dependency "rspec", ">= 1.2.9"
|
14
15
|
gem.add_development_dependency "cucumber", ">= 0"
|
15
16
|
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.2.2
|
data/features/step_definitions/dwc-archive_steps.rb
CHANGED
@@ -128,7 +128,7 @@ Then /^I can read its content into memory$/ do
|
|
128
128
|
core_data, core_errors = @dwc.core.read
|
129
129
|
core_data.class.should == Array
|
130
130
|
core_data.size.should == 584
|
131
|
-
core_errors.size.should ==
|
131
|
+
core_errors.size.should == 3
|
132
132
|
end
|
133
133
|
|
134
134
|
Then /^I can read extensions content into memory$/ do
|
@@ -147,7 +147,7 @@ Then /^I can read its core content using block$/ do
|
|
147
147
|
res << [r.size, err.size]
|
148
148
|
end
|
149
149
|
res << [tail_data.size, tail_errors.size]
|
150
|
-
res.should == [[
|
150
|
+
res.should == [[198,2],[200,0],[186,1]]
|
151
151
|
end
|
152
152
|
|
153
153
|
Then /^I can read extensions content using block$/ do
|
data/lib/dwc-archive/core.rb
CHANGED
@@ -8,16 +8,9 @@ class DarwinCore
|
|
8
8
|
root_key = @archive.meta.keys[0]
|
9
9
|
@data = @archive.meta[root_key][:core]
|
10
10
|
raise CoreFileError("Cannot found core in meta.xml, is meta.xml valid?") unless @data
|
11
|
-
@properties = @data[:attributes]
|
12
|
-
@encoding = @properties[:encoding] || 'UTF-8'
|
13
|
-
raise CoreFileError("No support for encodings other than utf-8 or utf-16 at the moment") unless ["utf-8", "utf8", "utf-16", "utf16"].include? @encoding.downcase
|
14
11
|
@id = @data[:id][:attributes]
|
15
|
-
|
16
|
-
|
17
|
-
@line_separator = @properties[:linesTerminatedBy] || "\n"
|
18
|
-
@ignore_headers = @properties[:ignoreHeaderLines] ? [1, true].include?(@properties[:ignoreHeaderLines]) : false
|
19
|
-
@file_path = get_file_path
|
20
|
-
@fields = get_fields
|
12
|
+
raise CoreFileError("Cannot find core identifier") unless @id
|
13
|
+
get_attributes(CoreFileError)
|
21
14
|
end
|
22
15
|
end
|
23
16
|
end
|
data/lib/dwc-archive/errors.rb
CHANGED
data/lib/dwc-archive/extension.rb
CHANGED
@@ -7,15 +7,9 @@ class DarwinCore
|
|
7
7
|
@archive = archive
|
8
8
|
@path = @archive.files_path
|
9
9
|
@data = data
|
10
|
-
@
|
11
|
-
|
12
|
-
|
13
|
-
@quote_character = @properties[:fieldsEnclosedBy] || ""
|
14
|
-
@line_separator = @properties[:linesTerminatedBy] || "\n"
|
15
|
-
@ignore_headers = @properties[:ignoreHeaderLines] ? [1, true].include?(@properties[:ignoreHeaderLines]) : false
|
16
|
-
@field_separator = get_field_separator
|
17
|
-
@file_path = get_file_path
|
18
|
-
@fields = get_fields
|
10
|
+
@coreid = @data[:coreid][:attributes]
|
11
|
+
raise ExtensionFileError("Extension has no coreid information") unless @coreid
|
12
|
+
get_attributes(ExtensionFileError)
|
19
13
|
end
|
20
14
|
|
21
15
|
end
|
data/lib/dwc-archive/ingester.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
class DarwinCore
|
1
|
+
class DarwinCore
|
2
2
|
module Ingester
|
3
3
|
attr_reader :data, :properties, :encoding, :fields_separator
|
4
4
|
attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
|
@@ -10,14 +10,7 @@ class DarwinCore
|
|
10
10
|
args.merge!({:quote_char => @quote_character}) if @quote_character != ''
|
11
11
|
CSV.open(@file_path, args).each_with_index do |r, i|
|
12
12
|
index_fix = 0; next if @ignore_headers && i == 0
|
13
|
-
|
14
|
-
if defined? FasterCSV
|
15
|
-
require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
|
16
|
-
UTF8RGX === str ? res << r : errors << r
|
17
|
-
else
|
18
|
-
str = str.force_encoding('utf-8')
|
19
|
-
str.encoding.name == "UTF-8" && str.valid_encoding? ? res << r : errors << r
|
20
|
-
end
|
13
|
+
@fields.size > (r.size - 1) ? errors << r : process_csv_row(res, errors, r)
|
21
14
|
if block_given? && (i + index_fix) % batch_size == 0
|
22
15
|
yield [res, errors]
|
23
16
|
res = []
|
@@ -28,6 +21,31 @@ class DarwinCore
|
|
28
21
|
end
|
29
22
|
|
30
23
|
private
|
24
|
+
def process_csv_row(result, errors, row)
|
25
|
+
str = row.join('')
|
26
|
+
if defined? FasterCSV
|
27
|
+
require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
|
28
|
+
UTF8RGX === str ? result << row : errors << row
|
29
|
+
else
|
30
|
+
str = str.force_encoding('utf-8')
|
31
|
+
str.encoding.name == "UTF-8" && str.valid_encoding? ? result << row : errors << row
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def get_attributes(exception)
|
36
|
+
@properties = @data[:attributes]
|
37
|
+
@encoding = @properties[:encoding] || 'UTF-8'
|
38
|
+
raise exception("No support for encodings other than utf-8 or utf-16 at the moment") unless ["utf-8", "utf8", "utf-16", "utf16"].include? @encoding.downcase
|
39
|
+
@field_separator = get_field_separator
|
40
|
+
@quote_character = @properties[:fieldsEnclosedBy] || ""
|
41
|
+
@line_separator = @properties[:linesTerminatedBy] || "\n"
|
42
|
+
@ignore_headers = @properties[:ignoreHeaderLines] ? [1, true].include?(@properties[:ignoreHeaderLines]) : false
|
43
|
+
@file_path = get_file_path
|
44
|
+
raise exception("No file data") unless @file_path
|
45
|
+
@fields = get_fields
|
46
|
+
raise exception("No data fields are found") if @fields.empty?
|
47
|
+
end
|
48
|
+
|
31
49
|
def get_file_path
|
32
50
|
file = @data[:location] || @data[:attributes][:location] || @data[:files][:location]
|
33
51
|
File.join(@path, file)
|
data/spec/files/data.tar.gz
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dwc-archive
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 1
|
10
|
-
version: 0.2.1
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dmitry Mozzherin
|
@@ -15,13 +15,27 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-05-
|
18
|
+
date: 2010-05-21 00:00:00 -04:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
name: rspec
|
22
|
+
name: fastercsv
|
23
23
|
prerelease: false
|
24
24
|
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: rspec
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
25
39
|
none: false
|
26
40
|
requirements:
|
27
41
|
- - ">="
|
@@ -33,11 +47,11 @@ dependencies:
|
|
33
47
|
- 9
|
34
48
|
version: 1.2.9
|
35
49
|
type: :development
|
36
|
-
version_requirements: *id001
|
50
|
+
version_requirements: *id002
|
37
51
|
- !ruby/object:Gem::Dependency
|
38
52
|
name: cucumber
|
39
53
|
prerelease: false
|
40
|
-
requirement: &id002 !ruby/object:Gem::Requirement
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
41
55
|
none: false
|
42
56
|
requirements:
|
43
57
|
- - ">="
|
@@ -47,7 +61,7 @@ dependencies:
|
|
47
61
|
- 0
|
48
62
|
version: "0"
|
49
63
|
type: :development
|
50
|
-
version_requirements: *id002
|
64
|
+
version_requirements: *id003
|
51
65
|
description: Darwin Core Archive is the current standard exchange format for GLobal Names Architecture modules. This gem makes it easy to incorporate files in Darwin Core Archive format into a ruby project.
|
52
66
|
email: dmozzherin at gmail dot com
|
53
67
|
executables: []
|