imw 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +34 -14
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/imw.rb +9 -6
- data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
- data/lib/imw/archives/rar.rb +19 -0
- data/lib/imw/archives/tar.rb +19 -0
- data/lib/imw/archives/tarbz2.rb +73 -0
- data/lib/imw/archives/targz.rb +73 -0
- data/lib/imw/archives/zip.rb +51 -0
- data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
- data/lib/imw/compressed_files/bz2.rb +16 -0
- data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
- data/lib/imw/compressed_files/gz.rb +16 -0
- data/lib/imw/formats.rb +31 -0
- data/lib/imw/formats/delimited.rb +90 -0
- data/lib/imw/formats/excel.rb +125 -0
- data/lib/imw/formats/json.rb +51 -0
- data/lib/imw/formats/sgml.rb +69 -0
- data/lib/imw/formats/yaml.rb +51 -0
- data/lib/imw/resource.rb +108 -10
- data/lib/imw/schemes.rb +21 -0
- data/lib/imw/schemes/hdfs.rb +240 -0
- data/lib/imw/schemes/http.rb +166 -0
- data/lib/imw/schemes/local.rb +219 -0
- data/lib/imw/schemes/remote.rb +114 -0
- data/lib/imw/schemes/s3.rb +135 -0
- data/lib/imw/tools.rb +8 -0
- data/lib/imw/{transforms → tools}/archiver.rb +1 -1
- data/lib/imw/{transforms → tools}/transferer.rb +10 -10
- data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
- data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
- data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
- data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
- data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
- data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
- data/spec/imw/compressed_files/bz2_spec.rb +15 -0
- data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
- data/spec/imw/compressed_files/gz_spec.rb +15 -0
- data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
- data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
- data/spec/imw/resource_spec.rb +4 -4
- data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
- data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
- data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
- data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
- data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
- data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
- data/spec/imw/tools/transferer_spec.rb +113 -0
- metadata +69 -71
- data/lib/imw/resources.rb +0 -118
- data/lib/imw/resources/archives_and_compressed.rb +0 -32
- data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
- data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
- data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
- data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
- data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
- data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
- data/lib/imw/resources/formats.rb +0 -32
- data/lib/imw/resources/formats/delimited.rb +0 -92
- data/lib/imw/resources/formats/excel.rb +0 -125
- data/lib/imw/resources/formats/json.rb +0 -53
- data/lib/imw/resources/formats/sgml.rb +0 -72
- data/lib/imw/resources/formats/yaml.rb +0 -53
- data/lib/imw/resources/local.rb +0 -198
- data/lib/imw/resources/remote.rb +0 -110
- data/lib/imw/resources/schemes.rb +0 -19
- data/lib/imw/resources/schemes/hdfs.rb +0 -242
- data/lib/imw/resources/schemes/http.rb +0 -161
- data/lib/imw/resources/schemes/s3.rb +0 -137
- data/lib/imw/transforms.rb +0 -8
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
- data/spec/imw/transforms/transferer_spec.rb +0 -113
@@ -10,9 +10,9 @@ module IMW
|
|
10
10
|
:compress => '',
|
11
11
|
:extension => 'bz2'
|
12
12
|
} unless defined?(COMPRESSION_SETTINGS)
|
13
|
-
|
14
|
-
module Resources
|
15
13
|
|
14
|
+
module CompressedFiles
|
15
|
+
|
16
16
|
# Defines methods for compressing a file. The default compression
|
17
17
|
# program is defined in IMW::COMPRESSION_SETTINGS though a
|
18
18
|
# particular resource can change the values in its
|
@@ -70,8 +70,6 @@ module IMW
|
|
70
70
|
copy.mv(path) if copy.exist?
|
71
71
|
end
|
72
72
|
end
|
73
|
-
|
74
73
|
end
|
75
74
|
end
|
76
75
|
end
|
77
|
-
|
data/lib/imw/formats.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module IMW
  # Namespace for modules that add data-format specific behaviour
  # (parsing and writing) to a resource.
  module Formats
    # Each format module is loaded lazily, the first time its constant
    # is referenced.
    autoload :Csv,   'imw/formats/delimited'
    autoload :Tsv,   'imw/formats/delimited'
    autoload :Excel, 'imw/formats/excel'
    autoload :Json,  'imw/formats/json'
    autoload :Xml,   'imw/formats/sgml'
    autoload :Xsl,   'imw/formats/sgml'
    autoload :Html,  'imw/formats/sgml'
    autoload :Xhtml, 'imw/formats/sgml'
    autoload :Rdf,   'imw/formats/sgml'
    autoload :Yaml,  'imw/formats/yaml'

    # Handlers which augment a resource with data format specific
    # methods.
    #
    # Each entry is a [module name, pattern] pair; every pattern is
    # anchored on a file extension.  Presumably the pattern is matched
    # against a resource's URI to pick the module -- confirm against
    # the code that consumes this list.
    HANDLERS = [
      [ "Formats::Csv",   /\.csv$/    ],
      [ "Formats::Tsv",   /\.tsv$/    ],
      # Excel workbooks are .xls/.xlsx ("xsl" is the stylesheet
      # extension handled by Formats::Xsl below).
      [ "Formats::Excel", /\.xlsx?$/  ],
      [ "Formats::Json",  /\.json$/   ],
      [ "Formats::Xml",   /\.xml$/    ],
      [ "Formats::Xsl",   /\.xsl$/    ],
      [ "Formats::Html",  /\.html?$/  ],
      [ "Formats::Xhtml", /\.xhtml?$/ ],
      # The "f" is required: an optional one would also claim ".rd".
      [ "Formats::Rdf",   /\.rdf$/    ],
      [ "Formats::Yaml",  /\.ya?ml$/  ]
    ]
  end
end
|
30
|
+
|
31
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module IMW
  module Formats

    # Defines methods used for parsing and writing delimited data
    # formats (CSV, TSV, &c.) with the FasterCSV library.  This
    # module is not used to directly extend a resource.  Instead,
    # more specific modules (e.g. - IMW::Formats::Csv) include this
    # one and also define +delimited_options+ which is actually
    # what's passed to FasterCSV.
    #
    # The host object must provide +read+, +write+ and +io+.
    #
    # @abstract
    module Delimited

      # Kept for backward compatibility; nothing in this file reads it.
      attr_accessor :delimited_settings

      # Lets a resource replace the options hash handed to FasterCSV
      # (the readers in Csv/Tsv only build defaults when none is set).
      attr_writer :delimited_options

      # Return the data in this delimited resource as an array of
      # arrays.
      #
      # Yield each outer array (row) if passed a block.
      #
      # NOTE(review): with a block the return value is whatever
      # FasterCSV.parse returns for block form -- verify before
      # relying on it.
      #
      # @return [Array] the full data matrix
      # @yield [Array] each row of the data
      def load &block
        require 'fastercsv'
        FasterCSV.parse(read, delimited_options, &block)
      end

      # Map each row in this delimited resource.
      #
      # @yield [Array] each row of the data
      # @return [Array] the block's result for every row
      def map &block
        load.map(&block)
      end

      # Dump an array of arrays into this resource.
      #
      # @param [Array] data array of arrays to dump
      # @param [Hash] options
      # @option options [true, false] :persist Keep this resource's IO object open after dumping
      # @return [self]
      def dump data, options={}
        require 'fastercsv'
        data.each do |row|
          write(FasterCSV.generate_line(row, delimited_options))
        end
        io.close unless options[:persist]
        self
      end

      private

      # Build a fresh default options hash for the given column
      # separator.  A new hash is returned on every call so resources
      # never share (and accidentally mutate) each other's options.
      #
      # @param [String] col_sep the column separator
      # @return [Hash]
      def default_delimited_options col_sep
        {
          :col_sep        => col_sep,
          :headers        => false,
          :return_headers => false,
          :write_headers  => true,
          :skip_blanks    => false,
          :force_quotes   => false
        }
      end
    end

    # Comma-separated values.
    module Csv
      include Delimited

      # Default options to be passed to
      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
      # documentation for more information.
      #
      # @return [Hash]
      def delimited_options
        @delimited_options ||= default_delimited_options(',')
      end
    end

    # Tab-separated values.
    module Tsv
      include Delimited

      # Default options to be passed to
      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
      # documentation for more information.
      #
      # @return [Hash]
      def delimited_options
        @delimited_options ||= default_delimited_options("\t")
      end
    end
  end
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
module IMW
  module Formats

    # Defines methods for reading and writing Microsoft Excel data
    # with the Spreadsheet library.
    #
    # The host object must respond to +path+ and +exist?+ (and to
    # +uri=+ when it is built through +initialize+).
    module Excel

      # Row limit per sheet used when no :max_lines option is given.
      DEFAULT_MAX_LINES = 65000

      # Readers are defined explicitly below so they can build the
      # workbook/sheet on demand.
      attr_writer :book, :sheet

      # Hook run when an object is extended with this module.
      #
      # The state is set up on +obj+ itself (instance variables
      # assigned directly in this hook would land on the module).  An
      # existing file is opened and its first sheet selected; otherwise
      # a new in-memory workbook with one sheet is prepared.
      #
      # @param [Object] obj the object being extended
      def self.extended obj
        obj.instance_eval { setup_excel }
      end

      # The workbook backing this resource, opened from +path+ if the
      # resource exists and created in memory otherwise.
      #
      # @return [Spreadsheet::Workbook]
      def book
        return @book if @book
        require_spreadsheet
        @book = if exist?
                  Spreadsheet.open(path)
                else
                  Spreadsheet::Workbook.new
                end
      end

      # The current worksheet.  Defaults to the workbook's first sheet,
      # creating one only when the workbook has none, so repeated calls
      # return the same sheet instead of adding a sheet each time.
      #
      # @return [Spreadsheet::Worksheet]
      def sheet
        @sheet ||= (book.worksheet(0) || book.create_worksheet)
      end

      # If an Excel file exists at the location specified by uri then
      # it is opened and can be read out with a subsequent call to
      # load().  Otherwise, a new workbook is created and can be written
      # to with the dump() method.
      #
      # @param [String] uri
      # @param [String] mode accepted for interface compatibility; not used here
      # @param [Hash] options
      # @option options [Integer] :max_lines (65000) rows allowed per sheet
      def initialize uri, mode='r', options={}
        self.uri = uri
        setup_excel(options)
      end

      # Returns the data in an existing workbook as an
      # array of arrays.  Only capable of reading a single sheet.
      #
      # @return [Array<Array>]
      def load
        sheet.map { |row| row.to_a }
      end

      # Dumps data, which is assumed to be an array of arrays, to a
      # newly created Excel workbook.  Attempting to dump to a book
      # that already exists will typically result in file corruption.
      # Raises a 'too many lines' error if the number of lines
      # of data exceeds max_lines.  Nothing is saved when no row has
      # been written.
      #
      # @param [Array<Array>] data
      def dump data
        data.each do |line|
          raise "too many lines" if too_many?
          self << line
        end
        save unless no_data?
      end

      # Processes a single line of data and updates internal variables.
      # You shouldn't need to call this directly.
      #
      # @param [Array] line the cells of one row
      def << line
        sheet.row(@sht_row).concat(line)
        @sht_row += 1
        @idx     += 1
      end

      # Instantiates a new Excel workbook in memory.  You shouldn't
      # need to call this directly.
      def make_new_book
        require_spreadsheet
        @book      = Spreadsheet::Workbook.new
        @book_idx += 1
      end

      # Makes a new worksheet for a pre-existing Excel workbook.
      # This should be called after recovering from the
      # 'too many lines' error.
      def make_new_sheet
        @sheet    = book.create_worksheet
        @sht_idx += 1
        @sht_row  = 0 # always start at row 0 in a new sheet
      end

      # Opens an existing Excel workbook and selects its first sheet.
      # You shouldn't need to call this directly.
      def get_existing_book
        require_spreadsheet
        @book     = Spreadsheet.open(path)
        @sheet    = @book.worksheet(0) || @book.create_worksheet
        # Original author's note: appending new data to an existing
        # book "doesn't work" yet; the row count is tracked regardless.
        @sht_row  = @sheet.row_count
        @sht_idx += 1
      end

      # Increments the current sheet to the next one in
      # an open book.  Not necessary at the moment.
      def incr_sheet
        @sheet = book.worksheet @sht_idx
      end

      # There are too many lines if the number of rows attempting
      # to be written exceeds max_lines.
      def too_many?
        @sht_row >= @max_lines
      end

      # There is no data if the number of rows attempting to be written
      # is zero.
      def no_data?
        @sht_row == 0
      end

      # Saves the workbook to this resource's path.
      def save
        book.write path
      end

      private

      # Initialise the counters and the workbook/sheet pair.  Shared by
      # +initialize+ and the +extended+ hook so both entry points leave
      # the object in the same state.
      #
      # @param [Hash] options
      # @option options [Integer] :max_lines rows allowed per sheet
      def setup_excel options={}
        @max_lines = options[:max_lines] || DEFAULT_MAX_LINES
        @idx       = 0
        @book_idx  = 0
        @sht_idx   = 0
        if exist?
          get_existing_book
        else
          make_new_book
          make_new_sheet
        end
      end

      # Load the Spreadsheet library on first use, skipping the require
      # when the constant is already defined.
      def require_spreadsheet
        require 'spreadsheet' unless defined?(::Spreadsheet)
      end
    end
  end
end
|
125
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module IMW
  module Formats

    # Defines methods for reading and writing JSON data.
    #
    # The host object must provide +read+, +write+ and +io+.
    module Json

      # Return the content of this resource.
      #
      # Will try to be smart about iterating over the data when
      # passed a block.
      #
      # - if the outermost JSON data structure is an array, then
      #   yield each element
      #
      # - if the outermost JSON data structure is a mapping, then
      #   yield each key, value pair
      #
      # - otherwise just yield the structure
      #
      # The parsed structure is returned whether or not a block is
      # given, so the return value does not depend on the block.
      #
      # @return [Hash, Array, String, Numeric] whatever the JSON contained
      def load
        require 'json'
        json = JSON.parse(read)
        if block_given?
          case json
          when Array
            json.each { |obj| yield obj }
          when Hash
            json.each_pair { |key, value| yield key, value }
          else
            yield json
          end
        end
        json
      end

      # Dump the +data+ into this resource.  It must be opened for
      # writing.
      #
      # @param [Hash, String, Array, Numeric] data the Ruby object to dump
      # @param [Hash] options
      # @option options [true, false] :persist (false) Don't close the IO object after writing
      # @return [self]
      def dump data, options={}
        require 'json'
        write(data.to_json)
        io.close unless options[:persist]
        self
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module IMW
  module Formats

    # Shared behaviour for SGML-derived formats (XML, HTML, &c.).
    # Resources are not extended with this module directly; the
    # format-specific modules below (e.g. - IMW::Formats::Xml)
    # include it.
    #
    # The host object must provide +io+.
    module Sgml

      # Hand this resource's +io+ to Hpricot and return the resulting
      # document, or yield it when a block is given (in which case the
      # block's value is returned).
      #
      # @return [Hpricot::Doc]
      # @yield [Hpricot::Doc]
      def load &block
        require 'hpricot'
        document = Hpricot(io)
        return document unless block_given?
        yield document
      end

      # Run +parser+ over the document returned by +load+.
      #
      # +parser+ may be a ready-made IMW::Parsers::HtmlParser, or
      # anything else (typically a hash), which is then handed to
      # IMW::Parsers::HtmlParser.new to build one.  See the
      # documentation for IMW::Parsers::HtmlParser for more
      # information.
      #
      # @param [Hash, IMW::Parsers::HtmlParser] parser
      # @return [Hash] the parser's output
      def parse parser
        html_parser =
          parser.is_a?(IMW::Parsers::HtmlParser) ? parser : IMW::Parsers::HtmlParser.new(parser)
        html_parser.parse(load)
      end
    end

    # SGML behaviour for XML data.
    module Xml
      include Sgml
    end

    # SGML behaviour for XSL data.
    module Xsl
      include Sgml
    end

    # SGML behaviour for XHTML data.
    module Xhtml
      include Sgml
    end

    # SGML behaviour for HTML data.
    module Html
      include Sgml
    end

    # SGML behaviour for RDF data.
    module Rdf
      include Sgml
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module IMW
  module Formats

    # Provides methods for reading and writing YAML data.
    #
    # The host object must provide +read+, +write+ and +io+.
    module Yaml

      # Return the content of this resource.
      #
      # Will try to be smart about iterating over the data when
      # passed a block.
      #
      # - if the outermost YAML data structure is an array, then
      #   yield each element
      #
      # - if the outermost YAML data structure is a mapping, then
      #   yield each key, value pair
      #
      # - otherwise just yield the structure
      #
      # The parsed structure is returned whether or not a block is
      # given, so the return value does not depend on the block.
      #
      # @return [Hash, Array, String, Numeric] whatever the YAML contained
      def load
        require 'yaml'
        # NOTE(review): depending on the YAML library version,
        # YAML.load may instantiate arbitrary objects; consider
        # YAML.safe_load if this resource can hold untrusted data.
        yaml = YAML.load(read)
        if block_given?
          case yaml
          when Array
            yaml.each { |obj| yield obj }
          when Hash
            yaml.each_pair { |key, value| yield key, value }
          else
            yield yaml
          end
        end
        yaml
      end

      # Dump the +data+ into this resource.  It must be opened for
      # writing.
      #
      # @param [Hash, String, Array, Numeric] data the Ruby object to dump
      # @param [Hash] options
      # @option options [true, false] :persist (false) Don't close the IO object after writing
      # @return [self]
      def dump data, options={}
        require 'yaml'
        write(data.to_yaml)
        io.close unless options[:persist]
        self
      end
    end
  end
end
|