imw 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. data/README.rdoc +34 -14
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/lib/imw.rb +9 -6
  5. data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
  6. data/lib/imw/archives/rar.rb +19 -0
  7. data/lib/imw/archives/tar.rb +19 -0
  8. data/lib/imw/archives/tarbz2.rb +73 -0
  9. data/lib/imw/archives/targz.rb +73 -0
  10. data/lib/imw/archives/zip.rb +51 -0
  11. data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
  12. data/lib/imw/compressed_files/bz2.rb +16 -0
  13. data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
  14. data/lib/imw/compressed_files/gz.rb +16 -0
  15. data/lib/imw/formats.rb +31 -0
  16. data/lib/imw/formats/delimited.rb +90 -0
  17. data/lib/imw/formats/excel.rb +125 -0
  18. data/lib/imw/formats/json.rb +51 -0
  19. data/lib/imw/formats/sgml.rb +69 -0
  20. data/lib/imw/formats/yaml.rb +51 -0
  21. data/lib/imw/resource.rb +108 -10
  22. data/lib/imw/schemes.rb +21 -0
  23. data/lib/imw/schemes/hdfs.rb +240 -0
  24. data/lib/imw/schemes/http.rb +166 -0
  25. data/lib/imw/schemes/local.rb +219 -0
  26. data/lib/imw/schemes/remote.rb +114 -0
  27. data/lib/imw/schemes/s3.rb +135 -0
  28. data/lib/imw/tools.rb +8 -0
  29. data/lib/imw/{transforms → tools}/archiver.rb +1 -1
  30. data/lib/imw/{transforms → tools}/transferer.rb +10 -10
  31. data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
  32. data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
  33. data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
  34. data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
  35. data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
  36. data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
  37. data/spec/imw/compressed_files/bz2_spec.rb +15 -0
  38. data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
  39. data/spec/imw/compressed_files/gz_spec.rb +15 -0
  40. data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
  41. data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
  42. data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
  43. data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
  44. data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
  45. data/spec/imw/resource_spec.rb +4 -4
  46. data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
  47. data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
  48. data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
  49. data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
  50. data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
  51. data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
  52. data/spec/imw/tools/transferer_spec.rb +113 -0
  53. metadata +69 -71
  54. data/lib/imw/resources.rb +0 -118
  55. data/lib/imw/resources/archives_and_compressed.rb +0 -32
  56. data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
  57. data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
  58. data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
  59. data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
  60. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
  61. data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
  62. data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
  63. data/lib/imw/resources/formats.rb +0 -32
  64. data/lib/imw/resources/formats/delimited.rb +0 -92
  65. data/lib/imw/resources/formats/excel.rb +0 -125
  66. data/lib/imw/resources/formats/json.rb +0 -53
  67. data/lib/imw/resources/formats/sgml.rb +0 -72
  68. data/lib/imw/resources/formats/yaml.rb +0 -53
  69. data/lib/imw/resources/local.rb +0 -198
  70. data/lib/imw/resources/remote.rb +0 -110
  71. data/lib/imw/resources/schemes.rb +0 -19
  72. data/lib/imw/resources/schemes/hdfs.rb +0 -242
  73. data/lib/imw/resources/schemes/http.rb +0 -161
  74. data/lib/imw/resources/schemes/s3.rb +0 -137
  75. data/lib/imw/transforms.rb +0 -8
  76. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
  77. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
  78. data/spec/imw/transforms/transferer_spec.rb +0 -113
@@ -10,9 +10,9 @@ module IMW
10
10
  :compress => '',
11
11
  :extension => 'bz2'
12
12
  } unless defined?(COMPRESSION_SETTINGS)
13
-
14
- module Resources
15
13
 
14
+ module CompressedFiles
15
+
16
16
  # Defines methods for compressing a file. The default compression
17
17
  # program is defined in IMW::COMPRESSION_SETTINGS though a
18
18
  # particular resource can change the values in its
@@ -70,8 +70,6 @@ module IMW
70
70
  copy.mv(path) if copy.exist?
71
71
  end
72
72
  end
73
-
74
73
  end
75
74
  end
76
75
  end
77
-
@@ -0,0 +1,16 @@
1
module IMW
  module CompressedFiles

    # Mixin for gzip-compressed resources.
    #
    # Pulls in IMW::CompressedFiles::Base and supplies the gzip-specific
    # decompression settings.
    # NOTE(review): Base is defined elsewhere; presumably it reads
    # +compression_settings+ to build the decompression command -- confirm.
    module Gz

      include IMW::CompressedFiles::Base

      # The program and command-line flags used to decompress this
      # resource.  The hash is built once and memoized, so callers may
      # tweak the returned hash to change the values for this resource.
      #
      # @return [Hash] +:decompression_program+ (+:gunzip+) and
      #   +:decompress+ (the flags passed to it, <tt>'-fd'</tt>)
      def compression_settings
        @compression_settings ||= {
          :decompression_program => :gunzip,
          :decompress            => '-fd'
        }
      end

    end
  end
end
@@ -0,0 +1,31 @@
1
module IMW
  # Namespace for modules that add data-format specific behaviour
  # (parsing and writing) to a resource.
  module Formats
    # Format modules are loaded lazily, the first time the constant is
    # referenced.  Several constants share one file.
    autoload :Csv,   'imw/formats/delimited'
    autoload :Tsv,   'imw/formats/delimited'
    autoload :Excel, 'imw/formats/excel'
    autoload :Json,  'imw/formats/json'
    autoload :Xml,   'imw/formats/sgml'
    autoload :Xsl,   'imw/formats/sgml'
    autoload :Html,  'imw/formats/sgml'
    autoload :Xhtml, 'imw/formats/sgml'
    autoload :Rdf,   'imw/formats/sgml'
    autoload :Yaml,  'imw/formats/yaml'

    # Handlers which augment a resource with data format specific
    # methods.
    #
    # Each entry is a pair: the name of the handler module (as a String,
    # so that it is not autoloaded just by building this table) and the
    # Regexp that selects it.
    # NOTE(review): presumably the Regexp is matched against the
    # resource's path/URI by the code consuming this table -- confirm.
    HANDLERS = [
      [ "Formats::Csv",   /\.csv$/    ],
      [ "Formats::Tsv",   /\.tsv$/    ],
      # Excel workbooks are .xls / .xlsx; ".xsl" is an XSL stylesheet and
      # belongs to Formats::Xsl below.
      [ "Formats::Excel", /\.xlsx?$/  ],
      [ "Formats::Json",  /\.json$/   ],
      [ "Formats::Xml",   /\.xml$/    ],
      [ "Formats::Xsl",   /\.xsl$/    ],
      [ "Formats::Html",  /\.html?$/  ],
      [ "Formats::Xhtml", /\.xhtml?$/ ],
      # Only ".rdf" -- an optional trailing "f" would also claim ".rd".
      [ "Formats::Rdf",   /\.rdf$/    ],
      [ "Formats::Yaml",  /\.ya?ml$/  ]
    ]
  end
end
30
+
31
+
@@ -0,0 +1,90 @@
1
module IMW
  module Formats

    # Defines methods used for parsing and writing delimited data
    # formats (CSV, TSV, &c.) with the FasterCSV library.  This
    # module is not used to directly extend a resource.  Instead,
    # more specific modules (e.g. - IMW::Formats::Csv) include this
    # one and also define +delimited_options+ which is actually
    # what's passed to FasterCSV.
    #
    # The including object is expected to provide +read+, +write+ and
    # +io+ (they are called here but defined elsewhere).
    #
    # @abstract
    module Delimited

      # Options shared by every delimited flavour; the concrete
      # modules add the column separator.
      DEFAULT_OPTIONS = {
        :headers        => false,
        :return_headers => false,
        :write_headers  => true,
        :skip_blanks    => false,
        :force_quotes   => false
      }.freeze

      # Not read by anything in this module; kept so existing callers
      # of +delimited_settings+ / +delimited_settings=+ keep working.
      attr_accessor :delimited_settings

      # Replace the options handed to FasterCSV for this resource.
      attr_writer :delimited_options

      # Return the data in this delimited resource as an array of
      # arrays.
      #
      # Yield each outer array (row) if passed a block.
      #
      # @return [Array] the full data matrix
      # @yield [Array] each row of the data
      def load(&block)
        require 'fastercsv' # loaded lazily, only when delimited data is used
        FasterCSV.parse(read, delimited_options, &block)
      end

      # Map each row in this delimited resource.
      #
      # @yield [Array] each row of the data
      # @return [Array] the block's result for every row
      def map(&block)
        load.map(&block)
      end

      # Dump an array of arrays into this resource.
      #
      # @param [Array] data array of arrays to dump
      # @param [Hash] options
      # @option options [true, false] :persist Keep this resource's IO object open after dumping
      # @return [self]
      def dump(data, options = {})
        require 'fastercsv'
        data.each do |row|
          write(FasterCSV.generate_line(row, delimited_options))
        end
        io.close unless options[:persist]
        self
      end
    end

    # Comma-separated values.
    module Csv
      include Delimited

      # Default options to be passed to
      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
      # documentation for more information.
      #
      # The hash is built once per resource and may be modified (or
      # replaced with +delimited_options=+).
      #
      # @return [Hash]
      def delimited_options
        @delimited_options ||= Delimited::DEFAULT_OPTIONS.merge(:col_sep => ',')
      end
    end

    # Tab-separated values.
    module Tsv
      include Delimited

      # Default options to be passed to
      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
      # documentation for more information.
      #
      # The hash is built once per resource and may be modified (or
      # replaced with +delimited_options=+).
      #
      # @return [Hash]
      def delimited_options
        @delimited_options ||= Delimited::DEFAULT_OPTIONS.merge(:col_sep => "\t")
      end
    end
  end
end
@@ -0,0 +1,125 @@
1
module IMW
  module Formats

    # Defines methods for reading and writing Microsoft Excel data.
    #
    # The object using this module must provide +path+ and +exist?+
    # (both are called here but defined elsewhere).
    #
    # NOTE(review): this file never requires the Spreadsheet library;
    # it is assumed to be loaded by the time these methods run --
    # confirm where that happens.
    module Excel

      # Row limit applied when no +:max_lines+ option is given.
      DEFAULT_MAX_LINES = 65000

      attr_writer :book, :sheet

      # Prepare +obj+ when it is extended with this module.
      #
      # +extend+ never calls +initialize+, so the row/sheet counters
      # are set up here; otherwise +<<+ and +too_many?+ would operate
      # on nil.  If the file already exists its workbook and first
      # sheet are opened straight away.
      #
      # @param [Object] obj the resource being extended
      def self.extended obj
        obj.send(:reset_excel_state)
        obj.get_existing_book if obj.exist?
      end

      # The workbook behind this resource.  Opened from +path+ when
      # the file exists, otherwise a new in-memory workbook; created
      # on first use and then reused.
      #
      # @return [Spreadsheet::Workbook]
      def book
        return @book if @book
        if exist?
          @book = Spreadsheet.open(path)
        else
          make_new_book
        end
        @book
      end

      # The current worksheet: the first sheet of the workbook, or a
      # freshly created one when the workbook has none.  Created on
      # first use and then reused, so repeated calls do not keep
      # adding sheets.
      #
      # @return [Spreadsheet::Worksheet]
      def sheet
        return @sheet if @sheet
        @sheet = book.worksheet(0)
        make_new_sheet unless @sheet
        @sheet
      end

      # If an Excel file exists at the location specified by uri then
      # it is opened and can be read out with a subsequent call to
      # load().  Otherwise, a new workbook is created and can be written
      # to with the dump() method.
      #
      # Only runs when this module is mixed into a class that is then
      # instantiated; objects that are extended go through
      # Excel.extended instead.
      #
      # @param [String] uri
      # @param [String] mode unused here
      # @param [Hash] options
      # @option options [Integer] :max_lines (65000) rows allowed per sheet
      def initialize uri, mode='r', options={}
        self.uri = uri
        reset_excel_state(options[:max_lines])
        if exist?
          get_existing_book
        else
          make_new_book
          make_new_sheet
        end
      end

      # Returns the data in an existing workbook as an
      # array of arrays.  Only capable of reading a single sheet.
      #
      # @return [Array<Array>]
      def load
        sheet.map { |row| row.to_a }
      end

      # Dumps data, which is assumed to be an array of arrays, to a
      # newly created Excel workbook.  Attempting to dump to a book
      # that already exists will typically result in file corruption.
      # Raises a 'too many lines' error if the number of lines
      # of data exceeds max_lines.
      #
      # @param [Array<Array>] data
      # @param [Hash] options accepted so the signature matches the
      #   other format modules' +dump+; currently ignored
      # @raise [RuntimeError] when the row limit is reached
      def dump data, options={}
        data.each do |line|
          raise "too many lines" if too_many?
          self << line
        end
        save unless no_data?
      end

      # Processes a single line of data and updates internal variables.
      # You shouldn't need to call this directly.
      #
      # @param [Array] line the cells of one row
      def << line
        sheet.row(@sht_row).concat(line)
        @sht_row += 1
        @idx     += 1
      end

      # Instantiates a new Excel workbook in memory.  You shouldn't
      # need to call this directly.
      def make_new_book
        @book = Spreadsheet::Workbook.new
        @book_idx += 1
      end

      # Makes a new worksheet for a pre-existing Excel workbook.
      # This should be called after recovering from the
      # 'too many lines' error.
      def make_new_sheet
        @sheet = book.create_worksheet
        @sht_idx += 1
        @sht_row = 0 # always start at row 0 in a new sheet
      end

      # Opens an existing Excel workbook.  You shouldn't need to
      # call this directly.
      def get_existing_book
        # +sheet+ opens the workbook and its first sheet on demand.
        # Positioning at the end is meant to allow appending new data,
        # which (per the original author) does not work yet.
        @sht_row = sheet.row_count
        @sht_idx += 1
      end

      # Increments the current sheet to the next one in
      # an open book.  Not necessary at the moment.
      def incr_sheet
        @sheet = book.worksheet @sht_idx
      end

      # There are too many lines if the number of rows attempting
      # to be written exceeds max_lines.
      def too_many?
        @sht_row >= @max_lines
      end

      # There is no data if the number of rows attempting to be written
      # is zero.
      def no_data?
        @sht_row == 0
      end

      # Saves the workbook.
      def save
        book.write path
      end

      private

      # (Re)initialise the bookkeeping counters.  An explicit
      # +max_lines+ wins, then any limit already set on the object,
      # then DEFAULT_MAX_LINES.
      def reset_excel_state max_lines=nil
        @max_lines = max_lines || @max_lines || DEFAULT_MAX_LINES
        @idx       = 0
        @book_idx  = 0
        @sht_idx   = 0
        @sht_row   = 0
      end
    end
  end
end
125
+
@@ -0,0 +1,51 @@
1
module IMW
  module Formats

    # Defines methods for reading and writing JSON data.
    #
    # The object using this module must provide +read+, +write+ and
    # +io+ (they are called here but defined elsewhere).
    module Json

      # Return the content of this resource.
      #
      # Will try to be smart about iterating over the data when
      # passed a block.
      #
      # - if the outermost JSON data structure is an array, then
      #   yield each element
      #
      # - if the outermost JSON data structure is a mapping, then
      #   yield each key, value pair
      #
      # - otherwise just yield the structure
      #
      # @return [Hash, Array, String, Integer] whatever the JSON contained
      #   (when no block is given)
      def load
        require 'json' # loaded lazily, only when JSON data is used
        json = JSON.parse(read)
        return json unless block_given?

        case json
        when Array then json.each { |obj| yield obj }
        when Hash  then json.each_pair { |key, value| yield key, value }
        else yield json
        end
      end

      # Dump the +data+ into this resource.  It must be opened for
      # writing.
      #
      # @param [Hash, String, Array, Integer] data the Ruby object to dump
      # @option options [true, false] :persist (false) Don't close the IO object after writing
      # @return [self]
      def dump data, options={}
        require 'json' # also what defines #to_json on core objects
        write(data.to_json)
        io.close unless options[:persist]
        self
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
module IMW
  module Formats

    # Defines methods to parse SGML-derived data formats (XML, HTML,
    # &c.).  This module isn't directly used to extend resources.
    # Instead, more specific modules (e.g. - IMW::Formats::Xml) are
    # used.
    #
    # The object using this module must provide +io+ (called here but
    # defined elsewhere).
    module Sgml

      # Parse this resource using Hpricot and return (or yield if
      # given a block) the resulting Hpricot::Doc.
      #
      # @return [Hpricot::Doc] the document, or the block's result when
      #   a block is given
      # @yield [Hpricot::Doc]
      def load
        require 'hpricot' # loaded lazily, only when SGML data is used
        sgml = Hpricot(io)
        block_given? ? yield(sgml) : sgml
      end

      # Parse the Hpricot::Doc of this resource with the given
      # +parser+.
      #
      # The parser can either be an IMW::Parsers::HtmlParser or a
      # hash which will be used to build such a parser.  See the
      # documentation for IMW::Parsers::HtmlParser for more
      # information.
      #
      # @param [Hash, IMW::Parsers::HtmlParser] parser
      # @return [Hash] the parser's output
      def parse parser
        # Anything that is not already a parser is treated as its spec.
        parser = IMW::Parsers::HtmlParser.new(parser) unless parser.is_a?(IMW::Parsers::HtmlParser)
        parser.parse(load)
      end
    end

    # Defines methods for XML data.
    module Xml
      include Sgml
    end

    # Defines methods for XSL data.
    module Xsl
      include Sgml
    end

    # Defines methods for XHTML data.
    module Xhtml
      include Sgml
    end

    # Defines methods for HTML data.
    module Html
      include Sgml
    end

    # Defines methods for RDF data.
    module Rdf
      include Sgml
    end
  end
end
@@ -0,0 +1,51 @@
1
module IMW
  module Formats

    # Provides methods for reading and writing YAML data.
    #
    # The object using this module must provide +read+, +write+ and
    # +io+ (they are called here but defined elsewhere).
    module Yaml

      # Return the content of this resource.
      #
      # Will try to be smart about iterating over the data when
      # passed a block.
      #
      # - if the outermost YAML data structure is an array, then
      #   yield each element
      #
      # - if the outermost YAML data structure is a mapping, then
      #   yield each key, value pair
      #
      # - otherwise just yield the structure
      #
      # @return [Hash, Array, String, Integer] whatever the YAML contained
      #   (when no block is given)
      def load
        require 'yaml' # loaded lazily, only when YAML data is used
        # NOTE(review): YAML.load can instantiate arbitrary objects on
        # some Ruby/Psych versions.  If resources may come from
        # untrusted sources, consider YAML.safe_load here.
        yaml = YAML.load(read)
        return yaml unless block_given?

        case yaml
        when Array then yaml.each { |obj| yield obj }
        when Hash  then yaml.each_pair { |key, value| yield key, value }
        else yield yaml
        end
      end

      # Dump the +data+ into this resource.  It must be opened for
      # writing.
      #
      # @param [Hash, String, Array, Integer] data the Ruby object to dump
      # @option options [true, false] :persist (false) Don't close the IO object after writing
      # @return [self]
      def dump data, options={}
        require 'yaml' # also what defines #to_yaml on core objects
        write(data.to_yaml)
        io.close unless options[:persist]
        self
      end
    end
  end
end