dwc-archive 0.1.6 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -22,6 +22,28 @@ Update to latest rubygems (v >= 1.3.6) which adds gemcutter sources by default.
22
22
  dwc.extensions # array of DarwinCore Star extensions
23
23
  dwc.extensions[0].data # summary for an extension
24
24
 
25
+ # read content of the core data file into memory, or use it with a block
26
+ # it returns an array of arrays of data
27
+ # rows that had a wrong encoding will be collected into the errors array
28
+ data, errors = dwc.core.read
29
+
30
+ # read content using a block, getting results back in sets of 100 rows each
31
+ results = []
32
+ tail_data, tail_errors = dwc.core.read(100) do |data, errors|
33
+ results << [data, errors]
34
+ end
35
+ results << [tail_data, tail_errors]
36
+
37
+ # read content of an extension data file into memory
38
+ data, errors = dwc.extensions[0].read
39
+
40
+ # read content of an extension data file using a block
41
+ results = []
42
+ tail_data, tail_errors = dwc.extensions[0].read(100) do |data, errors|
43
+ results << [data, errors]
44
+ end
45
+ results << [tail_data, tail_errors]
46
+
25
47
  DarwinCore.clean_all # remove all expanded archives
26
48
 
27
49
  == Note on Patches/Pull Requests
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.6
1
+ 0.2.0
@@ -42,3 +42,15 @@ Feature: Creation of a Darwing Core Archive
42
42
  Given acces to DarwinCore gem
43
43
  When I use DarwinCore.clean_all method
44
44
  Then all temporary directories created by DarwinCore are deleted
45
+
46
+ Scenario: Importing data into memory
47
+ Given path to a dwc file "data.tar.gz"
48
+ When I create a new DarwinCore instance
49
+ Then I can read its content into memory
50
+ Then I can read extensions content into memory
51
+
52
+ Scenario: Importing data using block
53
+ Given path to a dwc file "data.tar.gz"
54
+ When I create a new DarwinCore instance
55
+ Then I can read its core content using block
56
+ Then I can read extensions content using block
@@ -124,3 +124,40 @@ Then /^"([^\"]*)" should send instance of "([^\"]*)" back$/ do |arg1, arg2|
124
124
  res.class.to_s.should == arg2
125
125
  end
126
126
 
127
+ Then /^I can read its content into memory$/ do
128
+ core_data, core_errors = @dwc.core.read
129
+ core_data.class.should == Array
130
+ core_data.size.should == 584
131
+ core_errors.size.should == 1
132
+ end
133
+
134
+ Then /^I can read extensions content into memory$/ do
135
+ ext = @dwc.extensions
136
+ ext.class.should == Array
137
+ ext_data, ext_errors = ext[0].read
138
+ ext_data.class.should == Array
139
+ ext_data.size.should == 1
140
+ ext_errors.size.should == 0
141
+ end
142
+
143
+ Then /^I can read its core content using block$/ do
144
+ res = []
145
+ @dwc.core.ignore_headers.should be_true
146
+ tail_data, tail_errors = @dwc.core.read(200) do |r, err|
147
+ res << [r.size, err.size]
148
+ end
149
+ res << [tail_data.size, tail_errors.size]
150
+ res.should == [[200,0],[200,0],[184,1]]
151
+ end
152
+
153
+ Then /^I can read extensions content using block$/ do
154
+ res = []
155
+ ext = @dwc.extensions[0]
156
+ ext.ignore_headers.should be_true
157
+ tail_data, tail_errors = ext.read(200) do |r, err|
158
+ res << [r.size, err.size]
159
+ end
160
+ res << [tail_data.size, tail_errors.size]
161
+ res.should == [[1,0]]
162
+ end
163
+
data/lib/dwc-archive.rb CHANGED
@@ -3,6 +3,13 @@ $:.unshift(File.dirname(__FILE__)) unless
3
3
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
4
4
  require 'ruby_extensions'
5
5
  require 'fileutils'
6
+ begin
7
+ require 'fastercsv'
8
+ CSV = FasterCSV
9
+ rescue LoadError
10
+ require 'csv'
11
+ end
12
+ require 'dwc-archive/ingester'
6
13
  require 'dwc-archive/errors'
7
14
  require 'dwc-archive/expander'
8
15
  require 'dwc-archive/archive'
@@ -15,6 +22,17 @@ class DarwinCore
15
22
  alias :eml :metadata
16
23
 
17
24
  DEFAULT_TMP_DIR = "/tmp"
25
+ UTF8RGX = /\A(
26
+ [\x09\x0A\x0D\x20-\x7E] # ASCII
27
+ | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
28
+ | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
29
+ | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
30
+ | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
31
+ | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
32
+ | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
33
+ | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
34
+ )*\z/x unless defined? UTF8RGX
35
+
18
36
 
19
37
  def initialize(dwc_path, tmp_dir = DEFAULT_TMP_DIR)
20
38
  @archive = DarwinCore::Archive.new(dwc_path, tmp_dir)
@@ -38,7 +56,7 @@ class DarwinCore
38
56
  root_key = @archive.meta.keys[0]
39
57
  ext = @archive.meta[root_key][:extension]
40
58
  return [] unless ext
41
- ext = [ext] unless ext.class == Array
59
+ ext = [ext] if ext.class != Array
42
60
  ext.map { |e| DarwinCore::Extension.new(@archive, e) }
43
61
  end
44
62
  end
@@ -1,34 +1,23 @@
1
1
  class DarwinCore
2
2
  class Core
3
+ include DarwinCore::Ingester
4
+ attr_reader :id
3
5
  def initialize(archive)
4
6
  @archive = archive
5
7
  @path = @archive.files_path
6
8
  root_key = @archive.meta.keys[0]
7
- @core = @archive.meta[root_key][:core]
8
- raise "Cannot found core in meta.xml, is meta.xml valid?" unless @core
9
+ @data = @archive.meta[root_key][:core]
10
+ raise CoreFileError("Cannot found core in meta.xml, is meta.xml valid?") unless @data
11
+ @properties = @data[:attributes]
12
+ @encoding = @properties[:encoding] || 'UTF-8'
13
+ raise CoreFileError("No support for encodings other than utf-8 or utf-16 at the moment") unless ["utf-8", "utf8", "utf-16", "utf16"].include? @encoding.downcase
14
+ @id = @data[:id][:attributes]
15
+ @field_separator = get_field_separator
16
+ @quote_character = @properties[:fieldsEnclosedBy] || ""
17
+ @line_separator = @properties[:linesTerminatedBy] || "\n"
18
+ @ignore_headers = @properties[:ignoreHeaderLines] ? [1, true].include?(@properties[:ignoreHeaderLines]) : false
19
+ @file_path = get_file_path
20
+ @fields = get_fields
9
21
  end
10
-
11
- def data
12
- @core
13
- end
14
-
15
- def properties
16
- @core[:attributes]
17
- end
18
-
19
- def file_path
20
- file = @core[:files][:location] || @core[:location]
21
- File.join(@path, file)
22
- end
23
-
24
- def id
25
- @core[:id][:attributes]
26
- end
27
-
28
- def fields
29
- @core[:field] = [@core[:field]] unless @core[:field].class == Array
30
- @core[:field].map {|f| f[:attributes]}
31
- end
32
-
33
22
  end
34
23
  end
@@ -3,4 +3,5 @@ class DarwinCore
3
3
  class FileNotFoundError < Error; end
4
4
  class UnpackingError < Error; end
5
5
  class InvalidArchiveError < Error; end
6
+ class CoreFileError < Error; end
6
7
  end
@@ -1,31 +1,22 @@
1
1
  class DarwinCore
2
2
  class Extension
3
- def initialize(archive, extension)
3
+ include DarwinCore::Ingester
4
+ attr_reader :coreid
5
+
6
+ def initialize(archive, data)
4
7
  @archive = archive
5
8
  @path = @archive.files_path
6
- @extension = extension
7
- end
8
-
9
- def data
10
- @extension
9
+ @data = data
10
+ @properties = @data[:attributes]
11
+ @coreid = @data[:coreid][:attributes]
12
+ @encoding = @properties[:encoding] || 'UTF-8'
13
+ @quote_character = @properties[:fieldsEnclosedBy] || ""
14
+ @line_separator = @properties[:linesTerminatedBy] || "\n"
15
+ @ignore_headers = @properties[:ignoreHeaderLines] ? [1, true].include?(@properties[:ignoreHeaderLines]) : false
16
+ @field_separator = get_field_separator
17
+ @file_path = get_file_path
18
+ @fields = get_fields
11
19
  end
12
20
 
13
- def properties
14
- @extension[:attributes]
15
- end
16
-
17
- def file_path
18
- file = @extension[:files][:location]
19
- File.join(@path, file)
20
- end
21
-
22
- def coreid
23
- @extension[:coreid][:attributes]
24
- end
25
-
26
- def fields
27
- @extension[:field] = [@extension[:field]] unless @extension[:field].class == Array
28
- @extension[:field].map {|f| f[:attributes]}
29
- end
30
21
  end
31
22
  end
@@ -0,0 +1,46 @@
1
+ class DarwinCore
2
+ module Ingester
3
+ attr_reader :data, :properties, :encoding, :fields_separator
4
+ attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
5
+ def read(batch_size = 10000)
6
+ res = []
7
+ errors = []
8
+ index_fix = 1
9
+ args = {:col_sep => @field_separator}
10
+ args.merge!({:quote_char => @quote_character}) if @quote_character != ''
11
+ CSV.open(@file_path, args).each_with_index do |r, i|
12
+ index_fix = 0; next if @ignore_headers && i == 0
13
+ str = r.join('')
14
+ if defined? FasterCSV
15
+ UTF8RGX === str ? res << r : errors << r
16
+ else
17
+ str = str.force_encoding('utf-8')
18
+ str.encoding.name == "UTF-8" && str.valid_encoding? ? res << r : errors << r
19
+ end
20
+ if block_given? && (i + index_fix) % batch_size == 0
21
+ yield [res, errors]
22
+ res = []
23
+ errors = []
24
+ end
25
+ end
26
+ [res, errors]
27
+ end
28
+
29
+ private
30
+ def get_file_path
31
+ file = @data[:location] || @data[:attributes][:location] || @data[:files][:location]
32
+ File.join(@path, file)
33
+ end
34
+
35
+ def get_fields
36
+ @data[:field] = [data[:field]] if data[:field].class != Array
37
+ @data[:field].map {|f| f[:attributes]}
38
+ end
39
+
40
+ def get_field_separator
41
+ res = @properties[:fieldsTerminatedBy] || ','
42
+ res = "\t" if res == "\\t"
43
+ res
44
+ end
45
+ end
46
+ end
@@ -52,7 +52,10 @@ class Hash
52
52
  end
53
53
 
54
54
  def prepare(data)
55
- (data.class == String && data.to_i.to_s == data) ? data.to_i : data
55
+ return data if data.class != String
56
+ data = true if data.strip == "true"
57
+ data = false if data.strip == "false"
58
+ data.to_i.to_s == data ? data.to_i : data
56
59
  end
57
60
  end
58
61
 
Binary file
Binary file
@@ -6,9 +6,8 @@ describe DarwinCore do
6
6
  end
7
7
 
8
8
  describe ".new" do
9
-
10
9
  it "should create DarwinCore instance out of archive file" do
11
- ['data.zip', 'data.tar.gz', 'minimal.tar.gz'].each do |file|
10
+ ['data.zip', 'data.tar.gz', 'minimal.tar.gz', 'junk_dir_inside.zip'].each do |file|
12
11
  file = File.join(@file_dir, file)
13
12
  dwc = DarwinCore.new(file)
14
13
  dwc.archive.valid?.should be_true
@@ -35,6 +34,5 @@ describe DarwinCore do
35
34
  dwc = DarwinCore.new(file)
36
35
  dwc.archive.valid?.should be_true
37
36
  end
38
-
39
37
  end
40
38
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
8
- - 6
9
- version: 0.1.6
7
+ - 2
8
+ - 0
9
+ version: 0.2.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Dmitry Mozzherin
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-05-06 00:00:00 -04:00
17
+ date: 2010-05-19 00:00:00 -04:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -69,6 +69,7 @@ files:
69
69
  - lib/dwc-archive/errors.rb
70
70
  - lib/dwc-archive/expander.rb
71
71
  - lib/dwc-archive/extension.rb
72
+ - lib/dwc-archive/ingester.rb
72
73
  - lib/dwc-archive/metadata.rb
73
74
  - lib/ruby_extensions.rb
74
75
  - spec/files/broken.tar.gz
@@ -77,6 +78,7 @@ files:
77
78
  - spec/files/eml.xml
78
79
  - spec/files/file with characters(3).gz
79
80
  - spec/files/invalid.tar.gz
81
+ - spec/files/junk_dir_inside.zip
80
82
  - spec/files/meta.xml
81
83
  - spec/files/minimal.tar.gz
82
84
  - spec/files/uncompressed