dwc-archive 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +22 -0
- data/VERSION +1 -1
- data/features/dwc-archive.feature +12 -0
- data/features/step_definitions/dwc-archive_steps.rb +37 -0
- data/lib/dwc-archive.rb +19 -1
- data/lib/dwc-archive/core.rb +14 -25
- data/lib/dwc-archive/errors.rb +1 -0
- data/lib/dwc-archive/extension.rb +14 -23
- data/lib/dwc-archive/ingester.rb +46 -0
- data/lib/ruby_extensions.rb +4 -1
- data/spec/files/data.tar.gz +0 -0
- data/spec/files/junk_dir_inside.zip +0 -0
- data/spec/lib/dwc-archive_spec.rb +1 -3
- metadata +6 -4
data/README.rdoc
CHANGED
@@ -22,6 +22,28 @@ Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
|
|
22
22
|
dwc.extensions # array of DarwinCore Star extensions
|
23
23
|
dwc.extensions[0].data # summary for an extension
|
24
24
|
|
25
|
+
# read content of the core data file into memory or used with a block
|
26
|
+
# it returns array of arrays of data
|
27
|
+
# rows that had a wrong encoding will be collected into errors array
|
28
|
+
data, errors = dwc.core.read
|
29
|
+
|
30
|
+
# read content using a block with getting back results in sets 100 rows each
|
31
|
+
results = []
|
32
|
+
tail_data, tail_errors = dwc.core.read(100) do |data, errors|
|
33
|
+
results << [data, errors]
|
34
|
+
end
|
35
|
+
results << [tail_data, tail_errors]
|
36
|
+
|
37
|
+
# read content of an extension data file into memory
|
38
|
+
data, errors = dwc.core.extensions[0].read
|
39
|
+
|
40
|
+
# read content of an extension data using block
|
41
|
+
results = []
|
42
|
+
tail_data, tail_errors = dwc.core.extensions[0](100) do |data, errors|
|
43
|
+
results << [data, errors]
|
44
|
+
end
|
45
|
+
results << [tail_data, tail_errors]
|
46
|
+
|
25
47
|
DarwinCore.clean_all # remove all expanded archives
|
26
48
|
|
27
49
|
== Note on Patches/Pull Requests
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.6
|
1
|
+
0.2.0
|
data/features/dwc-archive.feature
CHANGED
@@ -42,3 +42,15 @@ Feature: Creation of a Darwing Core Archive
|
|
42
42
|
Given acces to DarwinCore gem
|
43
43
|
When I use DarwinCore.clean_all method
|
44
44
|
Then all temporary directories created by DarwinCore are deleted
|
45
|
+
|
46
|
+
Scenario: Importing data into memory
|
47
|
+
Given path to a dwc file "data.tar.gz"
|
48
|
+
When I create a new DarwinCore instance
|
49
|
+
Then I can read its content into memory
|
50
|
+
Then I can read extensions content into memory
|
51
|
+
|
52
|
+
Scenario: Importing data using block
|
53
|
+
Given path to a dwc file "data.tar.gz"
|
54
|
+
When I create a new DarwinCore instance
|
55
|
+
Then I can read its core content using block
|
56
|
+
Then I can read extensions content using block
|
data/features/step_definitions/dwc-archive_steps.rb
CHANGED
@@ -124,3 +124,40 @@ Then /^"([^\"]*)" should send instance of "([^\"]*)" back$/ do |arg1, arg2|
|
|
124
124
|
res.class.to_s.should == arg2
|
125
125
|
end
|
126
126
|
|
127
|
+
Then /^I can read its content into memory$/ do
|
128
|
+
core_data, core_errors = @dwc.core.read
|
129
|
+
core_data.class.should == Array
|
130
|
+
core_data.size.should == 584
|
131
|
+
core_errors.size.should == 1
|
132
|
+
end
|
133
|
+
|
134
|
+
Then /^I can read extensions content into memory$/ do
|
135
|
+
ext = @dwc.extensions
|
136
|
+
ext.class.should == Array
|
137
|
+
ext_data, ext_errors = ext[0].read
|
138
|
+
ext_data.class.should == Array
|
139
|
+
ext_data.size.should == 1
|
140
|
+
ext_errors.size.should == 0
|
141
|
+
end
|
142
|
+
|
143
|
+
Then /^I can read its core content using block$/ do
|
144
|
+
res = []
|
145
|
+
@dwc.core.ignore_headers.should be_true
|
146
|
+
tail_data, tail_errors = @dwc.core.read(200) do |r, err|
|
147
|
+
res << [r.size, err.size]
|
148
|
+
end
|
149
|
+
res << [tail_data.size, tail_errors.size]
|
150
|
+
res.should == [[200,0],[200,0],[184,1]]
|
151
|
+
end
|
152
|
+
|
153
|
+
Then /^I can read extensions content using block$/ do
|
154
|
+
res = []
|
155
|
+
ext = @dwc.extensions[0]
|
156
|
+
ext.ignore_headers.should be_true
|
157
|
+
tail_data, tail_errors = ext.read(200) do |r, err|
|
158
|
+
res << [r.size, err.size]
|
159
|
+
end
|
160
|
+
res << [tail_data.size, tail_errors.size]
|
161
|
+
res.should == [[1,0]]
|
162
|
+
end
|
163
|
+
|
data/lib/dwc-archive.rb
CHANGED
@@ -3,6 +3,13 @@ $:.unshift(File.dirname(__FILE__)) unless
|
|
3
3
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
4
4
|
require 'ruby_extensions'
|
5
5
|
require 'fileutils'
|
6
|
+
begin
|
7
|
+
require 'fastercsv'
|
8
|
+
CSV = FasterCSV
|
9
|
+
rescue LoadError
|
10
|
+
require 'csv'
|
11
|
+
end
|
12
|
+
require 'dwc-archive/ingester'
|
6
13
|
require 'dwc-archive/errors'
|
7
14
|
require 'dwc-archive/expander'
|
8
15
|
require 'dwc-archive/archive'
|
@@ -15,6 +22,17 @@ class DarwinCore
|
|
15
22
|
alias :eml :metadata
|
16
23
|
|
17
24
|
DEFAULT_TMP_DIR = "/tmp"
|
25
|
+
UTF8RGX = /\A(
|
26
|
+
[\x09\x0A\x0D\x20-\x7E] # ASCII
|
27
|
+
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
|
28
|
+
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
|
29
|
+
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
|
30
|
+
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
|
31
|
+
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
|
32
|
+
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
|
33
|
+
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
|
34
|
+
)*\z/x unless defined? UTF8RGX
|
35
|
+
|
18
36
|
|
19
37
|
def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
|
20
38
|
@archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
|
@@ -38,7 +56,7 @@ class DarwinCore
|
|
38
56
|
root_key = @archive.meta.keys[0]
|
39
57
|
ext = @archive.meta[root_key][:extension]
|
40
58
|
return [] unless ext
|
41
|
-
ext = [ext]
|
59
|
+
ext = [ext] if ext.class != Array
|
42
60
|
ext.map { |e| DarwinCore::Extension.new(@archive, e) }
|
43
61
|
end
|
44
62
|
end
|
data/lib/dwc-archive/core.rb
CHANGED
@@ -1,34 +1,23 @@
|
|
1
1
|
class DarwinCore
|
2
2
|
class Core
|
3
|
+
include DarwinCore::Ingester
|
4
|
+
attr_reader :id
|
3
5
|
def initialize(archive)
|
4
6
|
@archive = archive
|
5
7
|
@path = @archive.files_path
|
6
8
|
root_key = @archive.meta.keys[0]
|
7
|
-
@core = @archive.meta[root_key][:core]
|
8
|
-
raise "Cannot found core in meta.xml, is meta.xml valid?" unless @core
|
9
|
+
@data = @archive.meta[root_key][:core]
|
10
|
+
raise CoreFileError("Cannot found core in meta.xml, is meta.xml valid?") unless @data
|
11
|
+
@properties = @data[:attributes]
|
12
|
+
@encoding = @properties[:encoding] || 'UTF-8'
|
13
|
+
raise CoreFileError("No support for encodings other than utf-8 or utf-16 at the moment") unless ["utf-8", "utf8", "utf-16", "utf16"].include? @encoding.downcase
|
14
|
+
@id = @data[:id][:attributes]
|
15
|
+
@field_separator = get_field_separator
|
16
|
+
@quote_character = @properties[:fieldsEnclosedBy] || ""
|
17
|
+
@line_separator = @properties[:linesTerminatedBy] || "\n"
|
18
|
+
@ignore_headers = @properties[:ignoreHeaderLines] ? [1, true].include?(@properties[:ignoreHeaderLines]) : false
|
19
|
+
@file_path = get_file_path
|
20
|
+
@fields = get_fields
|
9
21
|
end
|
10
|
-
|
11
|
-
def data
|
12
|
-
@core
|
13
|
-
end
|
14
|
-
|
15
|
-
def properties
|
16
|
-
@core[:attributes]
|
17
|
-
end
|
18
|
-
|
19
|
-
def file_path
|
20
|
-
file = @core[:files][:location] || @core[:location]
|
21
|
-
File.join(@path, file)
|
22
|
-
end
|
23
|
-
|
24
|
-
def id
|
25
|
-
@core[:id][:attributes]
|
26
|
-
end
|
27
|
-
|
28
|
-
def fields
|
29
|
-
@core[:field] = [@core[:field]] unless @core[:field].class == Array
|
30
|
-
@core[:field].map {|f| f[:attributes]}
|
31
|
-
end
|
32
|
-
|
33
22
|
end
|
34
23
|
end
|
data/lib/dwc-archive/errors.rb
CHANGED
data/lib/dwc-archive/extension.rb
CHANGED
@@ -1,31 +1,22 @@
|
|
1
1
|
class DarwinCore
|
2
2
|
class Extension
|
3
|
-
|
3
|
+
include DarwinCore::Ingester
|
4
|
+
attr_reader :coreid
|
5
|
+
|
6
|
+
def initialize(archive, data)
|
4
7
|
@archive = archive
|
5
8
|
@path = @archive.files_path
|
6
|
-
@
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
@
|
9
|
+
@data = data
|
10
|
+
@properties = @data[:attributes]
|
11
|
+
@coreid = @data[:coreid][:attributes]
|
12
|
+
@encoding = @properties[:encoding] || 'UTF-8'
|
13
|
+
@quote_character = @properties[:fieldsEnclosedBy] || ""
|
14
|
+
@line_separator = @properties[:linesTerminatedBy] || "\n"
|
15
|
+
@ignore_headers = @properties[:ignoreHeaderLines] ? [1, true].include?(@properties[:ignoreHeaderLines]) : false
|
16
|
+
@field_separator = get_field_separator
|
17
|
+
@file_path = get_file_path
|
18
|
+
@fields = get_fields
|
11
19
|
end
|
12
20
|
|
13
|
-
def properties
|
14
|
-
@extension[:attributes]
|
15
|
-
end
|
16
|
-
|
17
|
-
def file_path
|
18
|
-
file = @extension[:files][:location]
|
19
|
-
File.join(@path, file)
|
20
|
-
end
|
21
|
-
|
22
|
-
def coreid
|
23
|
-
@extension[:coreid][:attributes]
|
24
|
-
end
|
25
|
-
|
26
|
-
def fields
|
27
|
-
@extension[:field] = [@extension[:field]] unless @extension[:field].class == Array
|
28
|
-
@extension[:field].map {|f| f[:attributes]}
|
29
|
-
end
|
30
21
|
end
|
31
22
|
end
|
data/lib/dwc-archive/ingester.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
class DarwinCore
|
2
|
+
module Ingester
|
3
|
+
attr_reader :data, :properties, :encoding, :fields_separator
|
4
|
+
attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
|
5
|
+
def read(batch_size = 10000)
|
6
|
+
res = []
|
7
|
+
errors = []
|
8
|
+
index_fix = 1
|
9
|
+
args = {:col_sep => @field_separator}
|
10
|
+
args.merge!({:quote_char => @quote_character}) if @quote_character != ''
|
11
|
+
CSV.open(@file_path, args).each_with_index do |r, i|
|
12
|
+
index_fix = 0; next if @ignore_headers && i == 0
|
13
|
+
str = r.join('')
|
14
|
+
if defined? FasterCSV
|
15
|
+
UTF8RGX === str ? res << r : errors << r
|
16
|
+
else
|
17
|
+
str = str.force_encoding('utf-8')
|
18
|
+
str.encoding.name == "UTF-8" && str.valid_encoding? ? res << r : errors << r
|
19
|
+
end
|
20
|
+
if block_given? && (i + index_fix) % batch_size == 0
|
21
|
+
yield [res, errors]
|
22
|
+
res = []
|
23
|
+
errors = []
|
24
|
+
end
|
25
|
+
end
|
26
|
+
[res, errors]
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
def get_file_path
|
31
|
+
file = @data[:location] || @data[:attributes][:location] || @data[:files][:location]
|
32
|
+
File.join(@path, file)
|
33
|
+
end
|
34
|
+
|
35
|
+
def get_fields
|
36
|
+
@data[:field] = [data[:field]] if data[:field].class != Array
|
37
|
+
@data[:field].map {|f| f[:attributes]}
|
38
|
+
end
|
39
|
+
|
40
|
+
def get_field_separator
|
41
|
+
res = @properties[:fieldsTerminatedBy] || ','
|
42
|
+
res = "\t" if res == "\\t"
|
43
|
+
res
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/lib/ruby_extensions.rb
CHANGED
@@ -52,7 +52,10 @@ class Hash
|
|
52
52
|
end
|
53
53
|
|
54
54
|
def prepare(data)
|
55
|
-
|
55
|
+
return data if data.class != String
|
56
|
+
data = true if data.strip == "true"
|
57
|
+
data = false if data.strip == "false"
|
58
|
+
data.to_i.to_s == data ? data.to_i : data
|
56
59
|
end
|
57
60
|
end
|
58
61
|
|
data/spec/files/data.tar.gz
CHANGED
Binary file
|
Binary file
|
data/spec/lib/dwc-archive_spec.rb
CHANGED
@@ -6,9 +6,8 @@ describe DarwinCore do
|
|
6
6
|
end
|
7
7
|
|
8
8
|
describe ".new" do
|
9
|
-
|
10
9
|
it "should create DarwinCore instance out of archive file" do
|
11
|
-
['data.zip', 'data.tar.gz', 'minimal.tar.gz'].each do |file|
|
10
|
+
['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
|
12
11
|
file = File.join(@file_dir, file)
|
13
12
|
dwc = DarwinCore.new(file)
|
14
13
|
dwc.archive.valid?.should be_true
|
@@ -35,6 +34,5 @@ describe DarwinCore do
|
|
35
34
|
dwc = DarwinCore.new(file)
|
36
35
|
dwc.archive.valid?.should be_true
|
37
36
|
end
|
38
|
-
|
39
37
|
end
|
40
38
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 1
|
8
|
-
- 6
|
9
|
-
version: 0.1.6
|
7
|
+
- 2
|
8
|
+
- 0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Dmitry Mozzherin
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-05-
|
17
|
+
date: 2010-05-19 00:00:00 -04:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -69,6 +69,7 @@ files:
|
|
69
69
|
- lib/dwc-archive/errors.rb
|
70
70
|
- lib/dwc-archive/expander.rb
|
71
71
|
- lib/dwc-archive/extension.rb
|
72
|
+
- lib/dwc-archive/ingester.rb
|
72
73
|
- lib/dwc-archive/metadata.rb
|
73
74
|
- lib/ruby_extensions.rb
|
74
75
|
- spec/files/broken.tar.gz
|
@@ -77,6 +78,7 @@ files:
|
|
77
78
|
- spec/files/eml.xml
|
78
79
|
- spec/files/file with characters(3).gz
|
79
80
|
- spec/files/invalid.tar.gz
|
81
|
+
- spec/files/junk_dir_inside.zip
|
80
82
|
- spec/files/meta.xml
|
81
83
|
- spec/files/minimal.tar.gz
|
82
84
|
- spec/files/uncompressed
|