imw 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. data/README.rdoc +34 -14
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/lib/imw.rb +9 -6
  5. data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
  6. data/lib/imw/archives/rar.rb +19 -0
  7. data/lib/imw/archives/tar.rb +19 -0
  8. data/lib/imw/archives/tarbz2.rb +73 -0
  9. data/lib/imw/archives/targz.rb +73 -0
  10. data/lib/imw/archives/zip.rb +51 -0
  11. data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
  12. data/lib/imw/compressed_files/bz2.rb +16 -0
  13. data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
  14. data/lib/imw/compressed_files/gz.rb +16 -0
  15. data/lib/imw/formats.rb +31 -0
  16. data/lib/imw/formats/delimited.rb +90 -0
  17. data/lib/imw/formats/excel.rb +125 -0
  18. data/lib/imw/formats/json.rb +51 -0
  19. data/lib/imw/formats/sgml.rb +69 -0
  20. data/lib/imw/formats/yaml.rb +51 -0
  21. data/lib/imw/resource.rb +108 -10
  22. data/lib/imw/schemes.rb +21 -0
  23. data/lib/imw/schemes/hdfs.rb +240 -0
  24. data/lib/imw/schemes/http.rb +166 -0
  25. data/lib/imw/schemes/local.rb +219 -0
  26. data/lib/imw/schemes/remote.rb +114 -0
  27. data/lib/imw/schemes/s3.rb +135 -0
  28. data/lib/imw/tools.rb +8 -0
  29. data/lib/imw/{transforms → tools}/archiver.rb +1 -1
  30. data/lib/imw/{transforms → tools}/transferer.rb +10 -10
  31. data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
  32. data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
  33. data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
  34. data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
  35. data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
  36. data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
  37. data/spec/imw/compressed_files/bz2_spec.rb +15 -0
  38. data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
  39. data/spec/imw/compressed_files/gz_spec.rb +15 -0
  40. data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
  41. data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
  42. data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
  43. data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
  44. data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
  45. data/spec/imw/resource_spec.rb +4 -4
  46. data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
  47. data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
  48. data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
  49. data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
  50. data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
  51. data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
  52. data/spec/imw/tools/transferer_spec.rb +113 -0
  53. metadata +69 -71
  54. data/lib/imw/resources.rb +0 -118
  55. data/lib/imw/resources/archives_and_compressed.rb +0 -32
  56. data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
  57. data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
  58. data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
  59. data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
  60. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
  61. data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
  62. data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
  63. data/lib/imw/resources/formats.rb +0 -32
  64. data/lib/imw/resources/formats/delimited.rb +0 -92
  65. data/lib/imw/resources/formats/excel.rb +0 -125
  66. data/lib/imw/resources/formats/json.rb +0 -53
  67. data/lib/imw/resources/formats/sgml.rb +0 -72
  68. data/lib/imw/resources/formats/yaml.rb +0 -53
  69. data/lib/imw/resources/local.rb +0 -198
  70. data/lib/imw/resources/remote.rb +0 -110
  71. data/lib/imw/resources/schemes.rb +0 -19
  72. data/lib/imw/resources/schemes/hdfs.rb +0 -242
  73. data/lib/imw/resources/schemes/http.rb +0 -161
  74. data/lib/imw/resources/schemes/s3.rb +0 -137
  75. data/lib/imw/transforms.rb +0 -8
  76. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
  77. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
  78. data/spec/imw/transforms/transferer_spec.rb +0 -113
@@ -10,9 +10,9 @@ module IMW
10
10
  :compress => '',
11
11
  :extension => 'bz2'
12
12
  } unless defined?(COMPRESSION_SETTINGS)
13
-
14
- module Resources
15
13
 
14
+ module CompressedFiles
15
+
16
16
  # Defines methods for compressing a file. The default compression
17
17
  # program is defined in IMW::COMPRESSION_SETTINGS though a
18
18
  # particular resource can change the values in its
@@ -70,8 +70,6 @@ module IMW
70
70
  copy.mv(path) if copy.exist?
71
71
  end
72
72
  end
73
-
74
73
  end
75
74
  end
76
75
  end
77
-
@@ -0,0 +1,16 @@
1
+ module IMW
2
+ module CompressedFiles
3
+ module Gz
4
+
5
+ include IMW::CompressedFiles::Base
6
+
7
+ def compression_settings
8
+ @compression_settings ||= {
9
+ :decompression_program => :gunzip,
10
+ :decompress => '-fd'
11
+ }
12
+ end
13
+
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,31 @@
1
+ module IMW
2
+ module Formats
3
+ autoload :Csv, 'imw/formats/delimited'
4
+ autoload :Tsv, 'imw/formats/delimited'
5
+ autoload :Excel, 'imw/formats/excel'
6
+ autoload :Json, 'imw/formats/json'
7
+ autoload :Xml, 'imw/formats/sgml'
8
+ autoload :Xsl, 'imw/formats/sgml'
9
+ autoload :Html, 'imw/formats/sgml'
10
+ autoload :Xhtml, 'imw/formats/sgml'
11
+ autoload :Rdf, 'imw/formats/sgml'
12
+ autoload :Yaml, 'imw/formats/yaml'
13
+
14
+ # Handlers which augment a resource with data format specific
15
+ # methods.
16
+ HANDLERS = [
17
+ [ "Formats::Csv", /\.csv$/ ],
18
+ [ "Formats::Tsv", /\.tsv$/ ],
19
+ [ "Formats::Excel", /\.xslx?$/ ],
20
+ [ "Formats::Json", /\.json$/ ],
21
+ [ "Formats::Xml", /\.xml$/ ],
22
+ [ "Formats::Xsl", /\.xsl$/ ],
23
+ [ "Formats::Html", /\.html?$/ ],
24
+ [ "Formats::Xhtml", /\.xhtml?$/ ],
25
+ [ "Formats::Rdf", /\.rdf?$/ ],
26
+ [ "Formats::Yaml", /\.ya?ml$/ ]
27
+ ]
28
+ end
29
+ end
30
+
31
+
@@ -0,0 +1,90 @@
1
+ module IMW
2
+ module Formats
3
+
4
+ # Defines methods used for parsing and writing delimited data
5
+ # formats (CSV, TSV, &c.) with the FasterCSV library. This
6
+ # module is not used to directly extend a resource. Instead,
7
+ # more specific modules (e.g. - IMW::Formats::Csv)
8
+ # include this one and also define +delimited_options+ which is
9
+ # actually what's passed to FasterCSV.
10
+ #
11
+ # @abstract
12
+ module Delimited
13
+
14
+ attr_accessor :delimited_settings
15
+
16
+ # Return the data in this delimited resource as an array of
17
+ # arrays.
18
+ #
19
+ # Yield each outer array (row) if passed a block.
20
+ #
21
+ # @return [Array] the full data matrix
22
+ # @yield [Array] each row of the data
23
+ def load &block
24
+ require 'fastercsv'
25
+ FasterCSV.parse(read, delimited_options, &block)
26
+ end
27
+
28
+ # Map each row in this delimited resource.
29
+ #
30
+ # @yield [Array] each row of the data
31
+ def map &block
32
+ load.map(&block)
33
+ end
34
+
35
+ # Dump an array of arrays into this resource.
36
+ #
37
+ # @param [Array] data array of arrays to dump
38
+ # @param [Hash] options
39
+ # @option options [true, false] :persist Keep this resource's IO object open after dumping
40
+ def dump data, options={}
41
+ require 'fastercsv'
42
+ data.each do |row|
43
+ write(FasterCSV.generate_line(row, delimited_options))
44
+ end
45
+ io.close unless options[:persist]
46
+ self
47
+ end
48
+ end
49
+
50
+ module Csv
51
+ include Delimited
52
+
53
+ # Default options to be passed to
54
+ # FasterCSV[http://fastercsv.rubyforge.org/]; see its
55
+ # documentation for more information.
56
+ #
57
+ # @return [Hash]
58
+ def delimited_options
59
+ @delimited_options ||= {
60
+ :col_sep => ',',
61
+ :headers => false,
62
+ :return_headers => false,
63
+ :write_headers => true,
64
+ :skip_blanks => false,
65
+ :force_quotes => false
66
+ }
67
+ end
68
+ end
69
+
70
+ module Tsv
71
+ include Delimited
72
+
73
+ # Default options to be passed to
74
+ # FasterCSV[http://fastercsv.rubyforge.org/]; see its
75
+ # documentation for more information.
76
+ #
77
+ # @return [Hash]
78
+ def delimited_options
79
+ @delimited_options ||= {
80
+ :col_sep => "\t",
81
+ :headers => false,
82
+ :return_headers => false,
83
+ :write_headers => true,
84
+ :skip_blanks => false,
85
+ :force_quotes => false
86
+ }
87
+ end
88
+ end
89
+ end
90
+ end
@@ -0,0 +1,125 @@
1
+ module IMW
2
+ module Formats
3
+
4
+ # Defines methods for reading and writing Microsoft Excel data.
5
+ module Excel
6
+
7
+ attr_accessor :book, :sheet
8
+
9
+ def self.extended obj
10
+ if obj.exist?
11
+ @book = Spreadsheet.open path
12
+ @sheet = book.worksheet(0)
13
+
14
+ end
15
+ end
16
+
17
+
18
+ def book
19
+ return @book if @book
20
+ if exists?
21
+ @book = Spreadsheet.open(path)
22
+ else
23
+ @book = Spreadsheet::Workbook.new
24
+ end
25
+ end
26
+
27
+ def sheet
28
+ @sheet = @book.create_worksheet
29
+ @sheet
30
+ end
31
+
32
+ #If an Excel file exists at the location specified by uri then
33
+ #it is opened and can be read out with a subsequent call to
34
+ #load(). Otherwise, a new workbook is created and can be written
35
+ #to with the dump() method.
36
+ def initialize uri, mode='r', options={}
37
+ self.uri = uri
38
+ @max_lines = options[:max_lines] || 65000
39
+ @idx = 0
40
+ @book_idx = 0
41
+ @sht_idx = 0
42
+ unless self.exist?
43
+ make_new_book
44
+ make_new_sheet
45
+ else
46
+ get_existing_book
47
+ end
48
+ end
49
+
50
+ #Returns the data in an existing workbook as an
51
+ #array of arrays. Only capable of reading a single sheet.
52
+ def load
53
+ @sheet.map{|row| row.to_a}
54
+ end
55
+
56
+ #Dumps data, which is assumed to be an array of arrays, to a
57
+ #newly created Excel workbook. Attempting to dump to a book
58
+ #that already exists will typically result in file corruption.
59
+ #Raises a 'too many lines' error if the number of lines
60
+ #of data exceeds max_lines.
61
+ def dump data
62
+ data.each do |line|
63
+ raise "too many lines" if too_many?
64
+ self << line
65
+ end
66
+ save unless no_data?
67
+ end
68
+
69
+ #Processes a single line of data and updates internal variables.
70
+ #You shouldn't need to call this directly.
71
+ def << line
72
+ @sheet.row(@sht_row).concat( line )
73
+ @sht_row += 1
74
+ @idx += 1
75
+ end
76
+
77
+ #Instantiates a new Excel workbook in memory. You shouldn't
78
+ #need to call this directly.
79
+ def make_new_book
80
+ @book = Spreadsheet::Workbook.new
81
+ @book_idx += 1
82
+ end
83
+
84
+ #Makes a new worksheet for a pre-existing Excel workbook.
85
+ #This should be called after recovering from the
86
+ #'too many lines' error.
87
+ def make_new_sheet
88
+ @sheet = @book.create_worksheet
89
+ @sht_idx += 1
90
+ @sht_row = 0 #always start at row 0 in a new sheet
91
+ end
92
+
93
+ #Opens an existing Excel workbook. You shouldn't need to
94
+ #call this directly.
95
+ def get_existing_book
96
+ @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
97
+ @sht_idx += 1
98
+ end
99
+
100
+ #Increments the current sheet to the next one in
101
+ #an open book. Not necessary at the moment.
102
+ def incr_sheet
103
+ @sheet = book.worksheet @sht_idx
104
+ end
105
+
106
+ #There are too many lines if the number of rows attempting
107
+ #to be written exceeds max_lines.
108
+ def too_many?
109
+ @sht_row >= @max_lines
110
+ end
111
+
112
+ #There is no data if the number of rows attempting to be written
113
+ #is zero.
114
+ def no_data?
115
+ @sht_row == 0
116
+ end
117
+
118
+ #Saves the workbook.
119
+ def save
120
+ @book.write path
121
+ end
122
+ end
123
+ end
124
+ end
125
+
@@ -0,0 +1,51 @@
1
+ module IMW
2
+ module Formats
3
+
4
+ # Defines methods for reading and writing JSON data.
5
+ module Json
6
+
7
+ # Return the content of this resource.
8
+ #
9
+ # Will try to be smart about iterating over the data when
10
+ # passed a block.
11
+ #
12
+ # - if the outermost JSON data structure is an array, then
13
+ # yield each element
14
+ #
15
+ # - if the outermost JSON data structure is a mapping, then
16
+ # yield each key, value pair
17
+ #
18
+ # - otherwise just yield the structure
19
+ #
20
+ # @return [Hash, Array, String, Fixnum] whatever the JSON contained
21
+ def load &block
22
+ require 'json'
23
+ json = JSON.parse(read)
24
+ if block_given?
25
+ case json
26
+ when Array
27
+ json.each { |obj| yield obj }
28
+ when Hash
29
+ json.each_pair { |key, value| yield key, value }
30
+ else
31
+ yield json
32
+ end
33
+ else
34
+ json
35
+ end
36
+ end
37
+
38
+ # Dump the +data+ into this resource. It must be opened for
39
+ # writing.
40
+ #
41
+ # @param [Hash, String, Array, Fixnum] data the Ruby object to dump
42
+ # @option options [true, false] :persist (false) Don't close the IO object after writing
43
+ def dump data, options={}
44
+ require 'json'
45
+ write(data.to_json)
46
+ io.close unless options[:persist]
47
+ self
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,69 @@
1
+ module IMW
2
+ module Formats
3
+
4
+ # Defines methods to parse SGML-derived data formats (XML, HTML,
5
+ # &c.). This module isn't directly used to extend resources.
6
+ # Instead, more specific modules (e.g. -
7
+ # IMW::Formats::Xml) are used.
8
+ module Sgml
9
+
10
+ # Parse this resource using Hpricot and return (or yield if
11
+ # given a block) the resulting Hpricot::Doc.
12
+ #
13
+ # @return [Hpricot::Doc]
14
+ # @yield [Hpricot::Doc]
15
+ def load &block
16
+ require 'hpricot'
17
+ sgml = Hpricot(io)
18
+ if block_given?
19
+ yield sgml
20
+ else
21
+ sgml
22
+ end
23
+ end
24
+
25
+ # Parse the Hpricot::Doc of this resource with the given
26
+ # +parser+.
27
+ #
28
+ # The parser can either be an IMW::Parsers::HtmlParser or a
29
+ # hash which will be used to build such a parser. See the
30
+ # documentation for IMW::Parsers::HtmlParser for more
31
+ # information.
32
+ #
33
+ # @param [Hash, IMW::Parsers::HtmlParser] parser
34
+ # @return [Hash] the parser's output
35
+ def parse parser
36
+ if parser.is_a?(IMW::Parsers::HtmlParser)
37
+ parser.parse(load)
38
+ else
39
+ IMW::Parsers::HtmlParser.new(parser).parse(load)
40
+ end
41
+ end
42
+ end
43
+
44
+ # Defines methods for XML data.
45
+ module Xml
46
+ include Sgml
47
+ end
48
+
49
+ # Defines methods for XSL data.
50
+ module Xsl
51
+ include Sgml
52
+ end
53
+
54
+ # Defines methods for XHTML data.
55
+ module Xhtml
56
+ include Sgml
57
+ end
58
+
59
+ # Defines methods for HTML data.
60
+ module Html
61
+ include Sgml
62
+ end
63
+
64
+ # Defines methods for RDF data.
65
+ module Rdf
66
+ include Sgml
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,51 @@
1
+ module IMW
2
+ module Formats
3
+
4
+ # Provides methods for reading and writing YAML data.
5
+ module Yaml
6
+
7
+ # Return the content of this resource.
8
+ #
9
+ # Will try to be smart about iterating over the data when
10
+ # passed a block.
11
+ #
12
+ # - if the outermost YAML data structure is an array, then
13
+ # yield each element
14
+ #
15
+ # - if the outermost YAML data structure is a mapping, then
16
+ # yield each key, value pair
17
+ #
18
+ # - otherwise just yield the structure
19
+ #
20
+ # @return [Hash, Array, String, Fixnum] whatever the YAML contained
21
+ def load &block
22
+ require 'yaml'
23
+ yaml = YAML.load(read)
24
+ if block_given?
25
+ case yaml
26
+ when Array
27
+ yaml.each { |obj| yield obj }
28
+ when Hash
29
+ yaml.each_pair { |key, value| yield key, value }
30
+ else
31
+ yield yaml
32
+ end
33
+ else
34
+ yaml
35
+ end
36
+ end
37
+
38
+ # Dump the +data+ into this resource. It must be opened for
39
+ # writing.
40
+ #
41
+ # @param [Hash, String, Array, Fixnum] data the Ruby object to dump
42
+ # @option options [true, false] :persist (false) Don't close the IO object after writing
43
+ def dump data, options={}
44
+ require 'yaml'
45
+ write(data.to_yaml)
46
+ io.close unless options[:persist]
47
+ self
48
+ end
49
+ end
50
+ end
51
+ end