imw 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +34 -14
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/imw.rb +9 -6
- data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
- data/lib/imw/archives/rar.rb +19 -0
- data/lib/imw/archives/tar.rb +19 -0
- data/lib/imw/archives/tarbz2.rb +73 -0
- data/lib/imw/archives/targz.rb +73 -0
- data/lib/imw/archives/zip.rb +51 -0
- data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
- data/lib/imw/compressed_files/bz2.rb +16 -0
- data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
- data/lib/imw/compressed_files/gz.rb +16 -0
- data/lib/imw/formats.rb +31 -0
- data/lib/imw/formats/delimited.rb +90 -0
- data/lib/imw/formats/excel.rb +125 -0
- data/lib/imw/formats/json.rb +51 -0
- data/lib/imw/formats/sgml.rb +69 -0
- data/lib/imw/formats/yaml.rb +51 -0
- data/lib/imw/resource.rb +108 -10
- data/lib/imw/schemes.rb +21 -0
- data/lib/imw/schemes/hdfs.rb +240 -0
- data/lib/imw/schemes/http.rb +166 -0
- data/lib/imw/schemes/local.rb +219 -0
- data/lib/imw/schemes/remote.rb +114 -0
- data/lib/imw/schemes/s3.rb +135 -0
- data/lib/imw/tools.rb +8 -0
- data/lib/imw/{transforms → tools}/archiver.rb +1 -1
- data/lib/imw/{transforms → tools}/transferer.rb +10 -10
- data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
- data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
- data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
- data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
- data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
- data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
- data/spec/imw/compressed_files/bz2_spec.rb +15 -0
- data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
- data/spec/imw/compressed_files/gz_spec.rb +15 -0
- data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
- data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
- data/spec/imw/resource_spec.rb +4 -4
- data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
- data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
- data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
- data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
- data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
- data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
- data/spec/imw/tools/transferer_spec.rb +113 -0
- metadata +69 -71
- data/lib/imw/resources.rb +0 -118
- data/lib/imw/resources/archives_and_compressed.rb +0 -32
- data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
- data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
- data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
- data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
- data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
- data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
- data/lib/imw/resources/formats.rb +0 -32
- data/lib/imw/resources/formats/delimited.rb +0 -92
- data/lib/imw/resources/formats/excel.rb +0 -125
- data/lib/imw/resources/formats/json.rb +0 -53
- data/lib/imw/resources/formats/sgml.rb +0 -72
- data/lib/imw/resources/formats/yaml.rb +0 -53
- data/lib/imw/resources/local.rb +0 -198
- data/lib/imw/resources/remote.rb +0 -110
- data/lib/imw/resources/schemes.rb +0 -19
- data/lib/imw/resources/schemes/hdfs.rb +0 -242
- data/lib/imw/resources/schemes/http.rb +0 -161
- data/lib/imw/resources/schemes/s3.rb +0 -137
- data/lib/imw/transforms.rb +0 -8
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
- data/spec/imw/transforms/transferer_spec.rb +0 -113
@@ -10,9 +10,9 @@ module IMW
|
|
10
10
|
:compress => '',
|
11
11
|
:extension => 'bz2'
|
12
12
|
} unless defined?(COMPRESSION_SETTINGS)
|
13
|
-
|
14
|
-
module Resources
|
15
13
|
|
14
|
+
module CompressedFiles
|
15
|
+
|
16
16
|
# Defines methods for compressing a file. The default compression
|
17
17
|
# program is defined in IMW::COMPRESSION_SETTINGS though a
|
18
18
|
# particular resource can change the values in its
|
@@ -70,8 +70,6 @@ module IMW
|
|
70
70
|
copy.mv(path) if copy.exist?
|
71
71
|
end
|
72
72
|
end
|
73
|
-
|
74
73
|
end
|
75
74
|
end
|
76
75
|
end
|
77
|
-
|
data/lib/imw/formats.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module IMW
|
2
|
+
module Formats
|
3
|
+
autoload :Csv, 'imw/formats/delimited'
|
4
|
+
autoload :Tsv, 'imw/formats/delimited'
|
5
|
+
autoload :Excel, 'imw/formats/excel'
|
6
|
+
autoload :Json, 'imw/formats/json'
|
7
|
+
autoload :Xml, 'imw/formats/sgml'
|
8
|
+
autoload :Xsl, 'imw/formats/sgml'
|
9
|
+
autoload :Html, 'imw/formats/sgml'
|
10
|
+
autoload :Xhtml, 'imw/formats/sgml'
|
11
|
+
autoload :Rdf, 'imw/formats/sgml'
|
12
|
+
autoload :Yaml, 'imw/formats/yaml'
|
13
|
+
|
14
|
+
# Handlers which augment a resource with data format specific
|
15
|
+
# methods.
|
16
|
+
HANDLERS = [
|
17
|
+
[ "Formats::Csv", /\.csv$/ ],
|
18
|
+
[ "Formats::Tsv", /\.tsv$/ ],
|
19
|
+
[ "Formats::Excel", /\.xslx?$/ ],
|
20
|
+
[ "Formats::Json", /\.json$/ ],
|
21
|
+
[ "Formats::Xml", /\.xml$/ ],
|
22
|
+
[ "Formats::Xsl", /\.xsl$/ ],
|
23
|
+
[ "Formats::Html", /\.html?$/ ],
|
24
|
+
[ "Formats::Xhtml", /\.xhtml?$/ ],
|
25
|
+
[ "Formats::Rdf", /\.rdf?$/ ],
|
26
|
+
[ "Formats::Yaml", /\.ya?ml$/ ]
|
27
|
+
]
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module IMW
|
2
|
+
module Formats
|
3
|
+
|
4
|
+
# Defines methods used for parsing and writing delimited data
|
5
|
+
# formats (CSV, TSV, &c.) with the FasterCSV library. This
|
6
|
+
# module is not used to directly extend a resource. Instead,
|
7
|
+
# more specific modules (e.g. - IMW::Formats::Csv)
|
8
|
+
# include this one and also define +delimited_options+ which is
|
9
|
+
# actually what's passed to FasterCSV.
|
10
|
+
#
|
11
|
+
# @abstract
|
12
|
+
module Delimited
|
13
|
+
|
14
|
+
attr_accessor :delimited_settings
|
15
|
+
|
16
|
+
# Return the data in this delimited resource as an array of
|
17
|
+
# arrays.
|
18
|
+
#
|
19
|
+
# Yield each outer array (row) if passed a block.
|
20
|
+
#
|
21
|
+
# @return [Array] the full data matrix
|
22
|
+
# @yield [Array] each row of the data
|
23
|
+
def load &block
|
24
|
+
require 'fastercsv'
|
25
|
+
FasterCSV.parse(read, delimited_options, &block)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Map each row in this delimited resource.
|
29
|
+
#
|
30
|
+
# @yield [Array] each row of the data
|
31
|
+
def map &block
|
32
|
+
load.map(&block)
|
33
|
+
end
|
34
|
+
|
35
|
+
# Dump an array of arrays into this resource.
|
36
|
+
#
|
37
|
+
# @param [Array] data array of arrays to dump
|
38
|
+
# @param [Hash] options
|
39
|
+
# @option options [true, false] :persist Keep this resource's IO object open after dumping
|
40
|
+
def dump data, options={}
|
41
|
+
require 'fastercsv'
|
42
|
+
data.each do |row|
|
43
|
+
write(FasterCSV.generate_line(row, delimited_options))
|
44
|
+
end
|
45
|
+
io.close unless options[:persist]
|
46
|
+
self
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
module Csv
|
51
|
+
include Delimited
|
52
|
+
|
53
|
+
# Default options to be passed to
|
54
|
+
# FasterCSV[http://fastercsv.rubyforge.org/]; see its
|
55
|
+
# documentation for more information.
|
56
|
+
#
|
57
|
+
# @return [Hash]
|
58
|
+
def delimited_options
|
59
|
+
@delimited_options ||= {
|
60
|
+
:col_sep => ',',
|
61
|
+
:headers => false,
|
62
|
+
:return_headers => false,
|
63
|
+
:write_headers => true,
|
64
|
+
:skip_blanks => false,
|
65
|
+
:force_quotes => false
|
66
|
+
}
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
module Tsv
|
71
|
+
include Delimited
|
72
|
+
|
73
|
+
# Default options to be passed to
|
74
|
+
# FasterCSV[http://fastercsv.rubyforge.org/]; see its
|
75
|
+
# documentation for more information.
|
76
|
+
#
|
77
|
+
# @return [Hash]
|
78
|
+
def delimited_options
|
79
|
+
@delimited_options ||= {
|
80
|
+
:col_sep => "\t",
|
81
|
+
:headers => false,
|
82
|
+
:return_headers => false,
|
83
|
+
:write_headers => true,
|
84
|
+
:skip_blanks => false,
|
85
|
+
:force_quotes => false
|
86
|
+
}
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
module IMW
|
2
|
+
module Formats
|
3
|
+
|
4
|
+
# Defines methods for reading and writing Microsoft Excel data.
|
5
|
+
module Excel
|
6
|
+
|
7
|
+
attr_accessor :book, :sheet
|
8
|
+
|
9
|
+
def self.extended obj
|
10
|
+
if obj.exist?
|
11
|
+
@book = Spreadsheet.open path
|
12
|
+
@sheet = book.worksheet(0)
|
13
|
+
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
|
18
|
+
def book
|
19
|
+
return @book if @book
|
20
|
+
if exists?
|
21
|
+
@book = Spreadsheet.open(path)
|
22
|
+
else
|
23
|
+
@book = Spreadsheet::Workbook.new
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def sheet
|
28
|
+
@sheet = @book.create_worksheet
|
29
|
+
@sheet
|
30
|
+
end
|
31
|
+
|
32
|
+
#If an Excel file exists at the location specified by uri then
|
33
|
+
#it is opened and can be read out with a subsequent call to
|
34
|
+
#load(). Otherwise, a new workbook is created and can be written
|
35
|
+
#to with the dump() method.
|
36
|
+
def initialize uri, mode='r', options={}
|
37
|
+
self.uri = uri
|
38
|
+
@max_lines = options[:max_lines] || 65000
|
39
|
+
@idx = 0
|
40
|
+
@book_idx = 0
|
41
|
+
@sht_idx = 0
|
42
|
+
unless self.exist?
|
43
|
+
make_new_book
|
44
|
+
make_new_sheet
|
45
|
+
else
|
46
|
+
get_existing_book
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
#Returns the data in an existing workbook as an
|
51
|
+
#array of arrays. Only capable of reading a single sheet.
|
52
|
+
def load
|
53
|
+
@sheet.map{|row| row.to_a}
|
54
|
+
end
|
55
|
+
|
56
|
+
#Dumps data, which is assumed to be an array of arrays, to a
|
57
|
+
#newly created Excel workbook. Attempting to dump to a book
|
58
|
+
#that already exists will typically result in file corruption.
|
59
|
+
#Raises a 'too many lines' error if the number of lines
|
60
|
+
#of data exceeds max_lines.
|
61
|
+
def dump data
|
62
|
+
data.each do |line|
|
63
|
+
raise "too many lines" if too_many?
|
64
|
+
self << line
|
65
|
+
end
|
66
|
+
save unless no_data?
|
67
|
+
end
|
68
|
+
|
69
|
+
#Processes a single line of data and updates internal variables.
|
70
|
+
#You shouldn't need to call this directly.
|
71
|
+
def << line
|
72
|
+
@sheet.row(@sht_row).concat( line )
|
73
|
+
@sht_row += 1
|
74
|
+
@idx += 1
|
75
|
+
end
|
76
|
+
|
77
|
+
#Instantiates a new Excel workbook in memory. You shouldn't
|
78
|
+
#need to call this directly.
|
79
|
+
def make_new_book
|
80
|
+
@book = Spreadsheet::Workbook.new
|
81
|
+
@book_idx += 1
|
82
|
+
end
|
83
|
+
|
84
|
+
#Makes a new worksheet for a pre-existing Excel workbook.
|
85
|
+
#This should be called after recovering from the
|
86
|
+
#'too many lines' error.
|
87
|
+
def make_new_sheet
|
88
|
+
@sheet = @book.create_worksheet
|
89
|
+
@sht_idx += 1
|
90
|
+
@sht_row = 0 #always start at row 0 in a new sheet
|
91
|
+
end
|
92
|
+
|
93
|
+
#Opens an existing Excel workbook. You shouldn't need to
|
94
|
+
#call this directly.
|
95
|
+
def get_existing_book
|
96
|
+
@sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
|
97
|
+
@sht_idx += 1
|
98
|
+
end
|
99
|
+
|
100
|
+
#Increments the current sheet to the next one in
|
101
|
+
#an open book. Not necessary at the moment.
|
102
|
+
def incr_sheet
|
103
|
+
@sheet = book.worksheet @sht_idx
|
104
|
+
end
|
105
|
+
|
106
|
+
#There are too many lines if the number of rows attempting
|
107
|
+
#to be written exceeds max_lines.
|
108
|
+
def too_many?
|
109
|
+
@sht_row >= @max_lines
|
110
|
+
end
|
111
|
+
|
112
|
+
#There is no data if the number of rows attempting to be written
|
113
|
+
#is zero.
|
114
|
+
def no_data?
|
115
|
+
@sht_row == 0
|
116
|
+
end
|
117
|
+
|
118
|
+
#Saves the workbook.
|
119
|
+
def save
|
120
|
+
@book.write path
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module IMW
|
2
|
+
module Formats
|
3
|
+
|
4
|
+
# Defines methods for reading and writing JSON data.
|
5
|
+
module Json
|
6
|
+
|
7
|
+
# Return the content of this resource.
|
8
|
+
#
|
9
|
+
# Will try to be smart about iterating over the data when
|
10
|
+
# passed a block.
|
11
|
+
#
|
12
|
+
# - if the outermost JSON data structure is an array, then
|
13
|
+
# yield each element
|
14
|
+
#
|
15
|
+
# - if the outermost JSON data structure is a mapping, then
|
16
|
+
# yield each key, value pair
|
17
|
+
#
|
18
|
+
# - otherwise just yield the structure
|
19
|
+
#
|
20
|
+
# @return [Hash, Array, String, Fixnum] whatever the JSON contained
|
21
|
+
def load &block
|
22
|
+
require 'json'
|
23
|
+
json = JSON.parse(read)
|
24
|
+
if block_given?
|
25
|
+
case json
|
26
|
+
when Array
|
27
|
+
json.each { |obj| yield obj }
|
28
|
+
when Hash
|
29
|
+
json.each_pair { |key, value| yield key, value }
|
30
|
+
else
|
31
|
+
yield json
|
32
|
+
end
|
33
|
+
else
|
34
|
+
json
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# Dump the +data+ into this resource. It must be opened for
|
39
|
+
# writing.
|
40
|
+
#
|
41
|
+
# @param [Hash, String, Array, Fixnum] data the Ruby object to dump
|
42
|
+
# @option options [true, false] :persist (false) Don't close the IO object after writing
|
43
|
+
def dump data, options={}
|
44
|
+
require 'json'
|
45
|
+
write(data.to_json)
|
46
|
+
io.close unless options[:persist]
|
47
|
+
self
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module IMW
|
2
|
+
module Formats
|
3
|
+
|
4
|
+
# Defines methods to parse SGML-derived data formats (XML, HTML,
|
5
|
+
# &c.). This module isn't directly used to extend resources.
|
6
|
+
# Instead, more specific modules (e.g. -
|
7
|
+
# IMW::Formats::Xml) are used.
|
8
|
+
module Sgml
|
9
|
+
|
10
|
+
# Parse this resource using Hpricot and return (or yield if
|
11
|
+
# given a block) the resulting Hpricot::Doc.
|
12
|
+
#
|
13
|
+
# @return [Hpricot::Doc]
|
14
|
+
# @yield [Hpricot::Doc]
|
15
|
+
def load &block
|
16
|
+
require 'hpricot'
|
17
|
+
sgml = Hpricot(io)
|
18
|
+
if block_given?
|
19
|
+
yield sgml
|
20
|
+
else
|
21
|
+
sgml
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# Parse the Hpricot::Doc of this resource with the given
|
26
|
+
# +parser+.
|
27
|
+
#
|
28
|
+
# The parser can either be an IMW::Parsers::HtmlParser or a
|
29
|
+
# hash which will be used to build such a parser. See the
|
30
|
+
# documentation for IMW::Parsers::HtmlParser for more
|
31
|
+
# information.
|
32
|
+
#
|
33
|
+
# @param [Hash, IMW::Parsers::HtmlParser] parser
|
34
|
+
# @return [Hash] the parser's output
|
35
|
+
def parse parser
|
36
|
+
if parser.is_a?(IMW::Parsers::HtmlParser)
|
37
|
+
parser.parse(load)
|
38
|
+
else
|
39
|
+
IMW::Parsers::HtmlParser.new(parser).parse(load)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
# Defines methods for XML data.
|
45
|
+
module Xml
|
46
|
+
include Sgml
|
47
|
+
end
|
48
|
+
|
49
|
+
# Defines methods for XSL data.
|
50
|
+
module Xsl
|
51
|
+
include Sgml
|
52
|
+
end
|
53
|
+
|
54
|
+
# Defines methods for XHTML data.
|
55
|
+
module Xhtml
|
56
|
+
include Sgml
|
57
|
+
end
|
58
|
+
|
59
|
+
# Defines methods for HTML data.
|
60
|
+
module Html
|
61
|
+
include Sgml
|
62
|
+
end
|
63
|
+
|
64
|
+
# Defines methods for RDF data.
|
65
|
+
module Rdf
|
66
|
+
include Sgml
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module IMW
|
2
|
+
module Formats
|
3
|
+
|
4
|
+
# Provides methods for reading and writing YAML data.
|
5
|
+
module Yaml
|
6
|
+
|
7
|
+
# Return the content of this resource.
|
8
|
+
#
|
9
|
+
# Will try to be smart about iterating over the data when
|
10
|
+
# passed a block.
|
11
|
+
#
|
12
|
+
# - if the outermost YAML data structure is an array, then
|
13
|
+
# yield each element
|
14
|
+
#
|
15
|
+
# - if the outermost YAML data structure is a mapping, then
|
16
|
+
# yield each key, value pair
|
17
|
+
#
|
18
|
+
# - otherwise just yield the structure
|
19
|
+
#
|
20
|
+
# @return [Hash, Array, String, Fixnum] whatever the YAML contained
|
21
|
+
def load &block
|
22
|
+
require 'yaml'
|
23
|
+
yaml = YAML.load(read)
|
24
|
+
if block_given?
|
25
|
+
case yaml
|
26
|
+
when Array
|
27
|
+
yaml.each { |obj| yield obj }
|
28
|
+
when Hash
|
29
|
+
yaml.each_pair { |key, value| yield key, value }
|
30
|
+
else
|
31
|
+
yield yaml
|
32
|
+
end
|
33
|
+
else
|
34
|
+
yaml
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
# Dump the +data+ into this resource. It must be opened for
|
39
|
+
# writing.
|
40
|
+
#
|
41
|
+
# @param [Hash, String, Array, Fixnum] data the Ruby object to dump
|
42
|
+
# @option options [true, false] :persist (false) Don't close the IO object after writing
|
43
|
+
def dump data, options={}
|
44
|
+
require 'yaml'
|
45
|
+
write(data.to_yaml)
|
46
|
+
io.close unless options[:persist]
|
47
|
+
self
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|