imw 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. data/README.rdoc +34 -14
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/lib/imw.rb +9 -6
  5. data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
  6. data/lib/imw/archives/rar.rb +19 -0
  7. data/lib/imw/archives/tar.rb +19 -0
  8. data/lib/imw/archives/tarbz2.rb +73 -0
  9. data/lib/imw/archives/targz.rb +73 -0
  10. data/lib/imw/archives/zip.rb +51 -0
  11. data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
  12. data/lib/imw/compressed_files/bz2.rb +16 -0
  13. data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
  14. data/lib/imw/compressed_files/gz.rb +16 -0
  15. data/lib/imw/formats.rb +31 -0
  16. data/lib/imw/formats/delimited.rb +90 -0
  17. data/lib/imw/formats/excel.rb +125 -0
  18. data/lib/imw/formats/json.rb +51 -0
  19. data/lib/imw/formats/sgml.rb +69 -0
  20. data/lib/imw/formats/yaml.rb +51 -0
  21. data/lib/imw/resource.rb +108 -10
  22. data/lib/imw/schemes.rb +21 -0
  23. data/lib/imw/schemes/hdfs.rb +240 -0
  24. data/lib/imw/schemes/http.rb +166 -0
  25. data/lib/imw/schemes/local.rb +219 -0
  26. data/lib/imw/schemes/remote.rb +114 -0
  27. data/lib/imw/schemes/s3.rb +135 -0
  28. data/lib/imw/tools.rb +8 -0
  29. data/lib/imw/{transforms → tools}/archiver.rb +1 -1
  30. data/lib/imw/{transforms → tools}/transferer.rb +10 -10
  31. data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
  32. data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
  33. data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
  34. data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
  35. data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
  36. data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
  37. data/spec/imw/compressed_files/bz2_spec.rb +15 -0
  38. data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
  39. data/spec/imw/compressed_files/gz_spec.rb +15 -0
  40. data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
  41. data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
  42. data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
  43. data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
  44. data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
  45. data/spec/imw/resource_spec.rb +4 -4
  46. data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
  47. data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
  48. data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
  49. data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
  50. data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
  51. data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
  52. data/spec/imw/tools/transferer_spec.rb +113 -0
  53. metadata +69 -71
  54. data/lib/imw/resources.rb +0 -118
  55. data/lib/imw/resources/archives_and_compressed.rb +0 -32
  56. data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
  57. data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
  58. data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
  59. data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
  60. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
  61. data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
  62. data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
  63. data/lib/imw/resources/formats.rb +0 -32
  64. data/lib/imw/resources/formats/delimited.rb +0 -92
  65. data/lib/imw/resources/formats/excel.rb +0 -125
  66. data/lib/imw/resources/formats/json.rb +0 -53
  67. data/lib/imw/resources/formats/sgml.rb +0 -72
  68. data/lib/imw/resources/formats/yaml.rb +0 -53
  69. data/lib/imw/resources/local.rb +0 -198
  70. data/lib/imw/resources/remote.rb +0 -110
  71. data/lib/imw/resources/schemes.rb +0 -19
  72. data/lib/imw/resources/schemes/hdfs.rb +0 -242
  73. data/lib/imw/resources/schemes/http.rb +0 -161
  74. data/lib/imw/resources/schemes/s3.rb +0 -137
  75. data/lib/imw/transforms.rb +0 -8
  76. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
  77. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
  78. data/spec/imw/transforms/transferer_spec.rb +0 -113
@@ -1,18 +0,0 @@
1
- module IMW
2
- module Resources
3
- module CompressedFiles
4
- module Bz2
5
-
6
- include IMW::Resources::CompressedFile
7
-
8
- def compression_settings
9
- @compression_settings ||= {
10
- :decompression_program => :bzip2,
11
- :decompress => '-fd'
12
- }
13
- end
14
-
15
- end
16
- end
17
- end
18
- end
@@ -1,18 +0,0 @@
1
- module IMW
2
- module Resources
3
- module CompressedFiles
4
- module Gz
5
-
6
- include IMW::Resources::CompressedFile
7
-
8
- def compression_settings
9
- @compression_settings ||= {
10
- :decompression_program => :gunzip,
11
- :decompress => '-fd'
12
- }
13
- end
14
-
15
- end
16
- end
17
- end
18
- end
@@ -1,23 +0,0 @@
1
- require 'imw/resources/archive'
2
-
3
- module IMW
4
- module Resources
5
- module Archives
6
- module Rar
7
-
8
- include IMW::Resources::Archive
9
-
10
- def archive_settings
11
- @archive_settings ||= {
12
- :program => :rar,
13
- :create => ['a', '-o+', '-inul'],
14
- :append => ['a', '-o+', '-inul'],
15
- :list => "vb",
16
- :extract => ['x', '-o+', '-inul']
17
- }
18
- end
19
- end
20
- end
21
- end
22
- end
23
-
@@ -1,23 +0,0 @@
1
- require 'imw/resources/archive'
2
-
3
- module IMW
4
- module Resources
5
- module Archives
6
- module Tar
7
-
8
- include IMW::Resources::Archive
9
-
10
- def archive_settings
11
- @archive_settings ||= {
12
- :create => "-cf",
13
- :append => "-rf",
14
- :list => "-tf",
15
- :extract => "-xf",
16
- :program => :tar
17
- }
18
- end
19
- end
20
- end
21
- end
22
- end
23
-
@@ -1,78 +0,0 @@
1
- require 'imw/resources/archive'
2
- require 'imw/resources/compressed_file'
3
-
4
- module IMW
5
- module Resources
6
- module Archives
7
- module Tarbz2
8
-
9
- #
10
- # It's a compressed file
11
- #
12
-
13
- include IMW::Resources::CompressedFile
14
-
15
- def compression_settings
16
- @compression_settings ||= {
17
- :program => :bzip2,
18
- :decompression_program => :bunzip2,
19
- :decompress => '',
20
- :extension => 'bz2'
21
- }
22
- end
23
-
24
- #
25
- # But it's also an archive
26
- #
27
-
28
- include IMW::Resources::Archive
29
-
30
- def archive_settings
31
- @archive_settings ||= {
32
- :program => :tar,
33
- :create => '-cf',
34
- :list => "-tjf",
35
- :extract => "-xjf"
36
- }
37
- end
38
-
39
- # Overrides default behavior of IMW::Files::Archive#create to
40
- # compress files after creating them.
41
- def create *input_paths
42
- IMW.system(archive_settings[:program], archive_settings[:create], path_between_archive_and_compression, *input_paths.flatten)
43
- IMW.open(path_between_archive_and_compression).compress!
44
- end
45
-
46
- def decompressed_basename
47
- case extname
48
- when '.tar.bz2' then basename[0..-5] # .tar.bz2 => .tar
49
- when '.tbz2' then basename.gsub(/tbz2$/, 'tar') # .tbz2 => .tar
50
- else basename[0..-(extname.size + 1)]
51
- end
52
- end
53
-
54
-
55
- protected
56
- def path_between_archive_and_compression
57
- File.join(dirname,name + '.tar')
58
- end
59
-
60
- public
61
-
62
- #
63
- # It's a compressed file AND an archive!
64
- #
65
-
66
- def extname
67
- case path
68
- when /\.tar\.bz2$/ then '.tar.bz2'
69
- when /\.tbz2$/ then '.tbz2'
70
- else File.extname(path)
71
- end
72
- end
73
-
74
- end
75
- end
76
- end
77
- end
78
-
@@ -1,78 +0,0 @@
1
- require 'imw/resources/archive'
2
- require 'imw/resources/compressed_file'
3
-
4
- module IMW
5
- module Resources
6
- module Archives
7
- module Targz
8
-
9
- #
10
- # It's a compressed file
11
- #
12
-
13
- include IMW::Resources::CompressedFile
14
-
15
- def compression_settings
16
- @compression_settings ||= {
17
- :program => :gzip,
18
- :decompression_program => :gunzip,
19
- :decompress => '',
20
- :extension => 'gz'
21
- }
22
- end
23
-
24
- #
25
- # But it's also an archive
26
- #
27
-
28
- include IMW::Resources::Archive
29
-
30
- def archive_settings
31
- @archive_settings ||= {
32
- :program => :tar,
33
- :list => "-tzf",
34
- :create => '-cf',
35
- :extract => "-xzf"
36
- }
37
- end
38
-
39
- # Overrides default behavior of IMW::Files::Archive#create to
40
- # compress files after creating them.
41
- def create *input_paths
42
- IMW.system(archive_settings[:program], archive_settings[:create].split, path_between_archive_and_compression, *input_paths.flatten)
43
- tar = IMW.open(path_between_archive_and_compression)
44
- tar.compression_settings = compression_settings
45
- tar.compress!
46
- end
47
-
48
- def decompressed_basename
49
- case extname
50
- when '.tar.gz' then basename[0..-4] # .tar.gz => .tar
51
- when '.tgz' then basename.gsub(/tgz$/, 'tar') # .tgz => .tar
52
- else basename[0..-(extname.size + 1)]
53
- end
54
- end
55
-
56
- protected
57
- def path_between_archive_and_compression
58
- File.join(dirname,name + '.tar')
59
- end
60
- public
61
-
62
- #
63
- # It's both an archive and a compressed file!
64
- #
65
-
66
- def extname
67
- case path
68
- when /\.tar\.gz$/ then '.tar.gz'
69
- when /\.tgz$/ then '.tgz'
70
- else File.extname(path)
71
- end
72
- end
73
-
74
- end
75
- end
76
- end
77
- end
78
-
@@ -1,57 +0,0 @@
1
- require 'imw/resources/archive'
2
-
3
- module IMW
4
- module Resources
5
- module Archives
6
- module Zip
7
-
8
- include IMW::Resources::Archive
9
-
10
- def archive_settings
11
- @archive_settings ||= {
12
- :program => :zip,
13
- :create => "-qqr",
14
- :append => "-qqg",
15
- :list => "-l",
16
- :extract => "-qqo",
17
- :unarchiving_program => :unzip
18
- }
19
- end
20
-
21
- protected
22
-
23
- # The `unzip' program outputs data in a very annoying format:
24
- #
25
- # Archive: data.zip
26
- # Length Date Time Name
27
- # -------- ---- ---- ----
28
- # 18510 07-28-08 15:58 data/4d7Qrgz7.csv
29
- # 3418 07-28-08 15:41 data/7S.csv
30
- # 23353 07-28-08 15:41 data/g.csv
31
- # 711 07-28-08 15:58 data/g.xml
32
- # 1095 07-28-08 15:41 data/L.xml
33
- # 2399 07-28-08 15:58 data/mTAu9H3.xml
34
- # 152 07-28-08 15:58 data/vaHBS2t5R.dat
35
- # -------- -------
36
- # 49638 7 files
37
- #
38
- # which is parsed by this method.
39
- def archive_contents_string_to_array string
40
- rows = string.split("\n")
41
- # ignore the first 3 lines of the output and also discard the
42
- # last 2 (5 = 2 + 3)
43
- file_rows = rows[3,(rows.length - 5)]
44
- file_rows.map do |row|
45
- if row
46
- columns = row.lstrip.rstrip.split(' ')
47
- # grab the filename in the fourth column
48
- columns[3..-1].join(' ')
49
- end
50
- end.compact
51
- end
52
- end
53
- end
54
- end
55
- end
56
-
57
-
@@ -1,32 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Formats
4
- autoload :Csv, 'imw/resources/formats/delimited'
5
- autoload :Tsv, 'imw/resources/formats/delimited'
6
- autoload :Excel, 'imw/resources/formats/excel'
7
- autoload :Json, 'imw/resources/formats/json'
8
- autoload :Xml, 'imw/resources/formats/sgml'
9
- autoload :Xsl, 'imw/resources/formats/sgml'
10
- autoload :Html, 'imw/resources/formats/sgml'
11
- autoload :Xhtml, 'imw/resources/formats/sgml'
12
- autoload :Rdf, 'imw/resources/formats/sgml'
13
- autoload :Yaml, 'imw/resources/formats/yaml'
14
-
15
- # Handlers which augment a resource with data format specific
16
- # methods.
17
- FORMAT_HANDLERS = [
18
- [ "Formats::Csv", /\.csv$/ ],
19
- [ "Formats::Tsv", /\.tsv$/ ],
20
- [ "Formats::Excel", /\.xslx?$/ ],
21
- [ "Formats::Json", /\.json$/ ],
22
- [ "Formats::Xml", /\.xml$/ ],
23
- [ "Formats::Xsl", /\.xsl$/ ],
24
- [ "Formats::Html", /\.html?$/ ],
25
- [ "Formats::Xhtml", /\.xhtml?$/ ],
26
- [ "Formats::Rdf", /\.rdf?$/ ],
27
- [ "Formats::Yaml", /\.ya?ml$/ ]
28
- ]
29
- end
30
- end
31
- end
32
-
@@ -1,92 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Formats
4
-
5
- # Defines methods used for parsing and writing delimited data
6
- # formats (CSV, TSV, &c.) with the FasterCSV library. This
7
- # module is not used to directly extend a resource. Instead,
8
- # more specific modules (e.g. - IMW::Resources::Formats::Csv)
9
- # include this one and also define +delimited_options+ which is
10
- # actually what's passed to FasterCSV.
11
- #
12
- # @abstract
13
- module Delimited
14
-
15
- attr_accessor :delimited_settings
16
-
17
- # Return the data in this delimited resource as an array of
18
- # arrays.
19
- #
20
- # Yield each outer array (row) if passed a block.
21
- #
22
- # @return [Array] the full data matrix
23
- # @yield [Array] each row of the data
24
- def load &block
25
- require 'fastercsv'
26
- FasterCSV.parse(read, delimited_options, &block)
27
- end
28
-
29
- # Map each row in this delimited resource.
30
- #
31
- # @yield [Array] each row of the data
32
- def map &block
33
- load.map(&block)
34
- end
35
-
36
- # Dump an array of arrays into this resource.
37
- #
38
- # @param [Array] data array of arrays to dump
39
- # @param [Hash] options
40
- # @option options [true, false] :persist Keep this resource's IO object open after dumping
41
- def dump data, options={}
42
- require 'fastercsv'
43
- data.each do |row|
44
- write(FasterCSV.generate_line(row, delimited_options))
45
- end
46
- io.close unless options[:persist]
47
- self
48
- end
49
- end
50
-
51
- module Csv
52
- include Delimited
53
-
54
- # Default options to be passed to
55
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
56
- # documentation for more information.
57
- #
58
- # @return [Hash]
59
- def delimited_options
60
- @delimited_options ||= {
61
- :col_sep => ',',
62
- :headers => false,
63
- :return_headers => false,
64
- :write_headers => true,
65
- :skip_blanks => false,
66
- :force_quotes => false
67
- }
68
- end
69
- end
70
-
71
- module Tsv
72
- include Delimited
73
-
74
- # Default options to be passed to
75
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
76
- # documentation for more information.
77
- #
78
- # @return [Hash]
79
- def delimited_options
80
- @delimited_options ||= {
81
- :col_sep => "\t",
82
- :headers => false,
83
- :return_headers => false,
84
- :write_headers => true,
85
- :skip_blanks => false,
86
- :force_quotes => false
87
- }
88
- end
89
- end
90
- end
91
- end
92
- end
@@ -1,125 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Formats
4
-
5
- # Defines methods for reading and writing Microsoft Excel data.
6
- module Excel
7
-
8
- attr_accessor :book, :sheet
9
-
10
- def self.extended obj
11
- if obj.exist?
12
- @book = Spreadsheet.open path
13
- @sheet = book.worksheet(0)
14
-
15
- end
16
- end
17
-
18
-
19
- def book
20
- return @book if @book
21
- if exists?
22
- @book = Spreadsheet.open(path)
23
- else
24
- @book = Spreadsheet::Workbook.new
25
- end
26
- end
27
-
28
- def sheet
29
- @sheet = @book.create_worksheet
30
- @sheet
31
- end
32
-
33
- #If an Excel file exists at the location specified by uri then
34
- #it is opened and can be read out with a subsequent call to
35
- #load(). Otherwise, a new workbook is created and can be written
36
- #to with the dump() method.
37
- def initialize uri, mode='r', options={}
38
- self.uri = uri
39
- @max_lines = options[:max_lines] || 65000
40
- @idx = 0
41
- @book_idx = 0
42
- @sht_idx = 0
43
- unless self.exist?
44
- make_new_book
45
- make_new_sheet
46
- else
47
- get_existing_book
48
- end
49
- end
50
-
51
- #Returns the data in an existing workbook as an
52
- #array of arrays. Only capable of reading a single sheet.
53
- def load
54
- @sheet.map{|row| row.to_a}
55
- end
56
-
57
- #Dumps data, which is assumed to be an array of arrays, to a
58
- #newly created Excel workbook. Attempting to dump to a book
59
- #that already exists will typically result in file corruption.
60
- #Raises a 'too many lines' error if the number of lines
61
- #of data exceeds max_lines.
62
- def dump data
63
- data.each do |line|
64
- raise "too many lines" if too_many?
65
- self << line
66
- end
67
- save unless no_data?
68
- end
69
-
70
- #Processes a single line of data and updates internal variables.
71
- #You shouldn't need to call this directly.
72
- def << line
73
- @sheet.row(@sht_row).concat( line )
74
- @sht_row += 1
75
- @idx += 1
76
- end
77
-
78
- #Instantiates a new Excel workbook in memory. You shouldn't
79
- #need to call this directly.
80
- def make_new_book
81
- @book = Spreadsheet::Workbook.new
82
- @book_idx += 1
83
- end
84
-
85
- #Makes a new worksheet for a pre-existing Excel workbook.
86
- #This should be called after recovering from the
87
- #'too many lines' error.
88
- def make_new_sheet
89
- @sheet = @book.create_worksheet
90
- @sht_idx += 1
91
- @sht_row = 0 #always start at row 0 in a new sheet
92
- end
93
-
94
- #Opens an existing Excel workbook. You shouldn't need to
95
- #call this directly.
96
- def get_existing_book
97
- @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
98
- @sht_idx += 1
99
- end
100
-
101
- #Increments the current sheet to the next one in
102
- #an open book. Not necessary at the moment.
103
- def incr_sheet
104
- @sheet = book.worksheet @sht_idx
105
- end
106
-
107
- #There are too many lines if the number of rows attempting
108
- #to be written exceeds max_lines.
109
- def too_many?
110
- @sht_row >= @max_lines
111
- end
112
-
113
- #There is no data if the number of rows attempting to be written
114
- #is zero.
115
- def no_data?
116
- @sht_row == 0
117
- end
118
-
119
- #Saves the workbook.
120
- def save
121
- @book.write path
122
- end
123
- end
124
- end
125
- end