imw 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. data/README.rdoc +34 -14
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/lib/imw.rb +9 -6
  5. data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
  6. data/lib/imw/archives/rar.rb +19 -0
  7. data/lib/imw/archives/tar.rb +19 -0
  8. data/lib/imw/archives/tarbz2.rb +73 -0
  9. data/lib/imw/archives/targz.rb +73 -0
  10. data/lib/imw/archives/zip.rb +51 -0
  11. data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
  12. data/lib/imw/compressed_files/bz2.rb +16 -0
  13. data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
  14. data/lib/imw/compressed_files/gz.rb +16 -0
  15. data/lib/imw/formats.rb +31 -0
  16. data/lib/imw/formats/delimited.rb +90 -0
  17. data/lib/imw/formats/excel.rb +125 -0
  18. data/lib/imw/formats/json.rb +51 -0
  19. data/lib/imw/formats/sgml.rb +69 -0
  20. data/lib/imw/formats/yaml.rb +51 -0
  21. data/lib/imw/resource.rb +108 -10
  22. data/lib/imw/schemes.rb +21 -0
  23. data/lib/imw/schemes/hdfs.rb +240 -0
  24. data/lib/imw/schemes/http.rb +166 -0
  25. data/lib/imw/schemes/local.rb +219 -0
  26. data/lib/imw/schemes/remote.rb +114 -0
  27. data/lib/imw/schemes/s3.rb +135 -0
  28. data/lib/imw/tools.rb +8 -0
  29. data/lib/imw/{transforms → tools}/archiver.rb +1 -1
  30. data/lib/imw/{transforms → tools}/transferer.rb +10 -10
  31. data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
  32. data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
  33. data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
  34. data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
  35. data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
  36. data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
  37. data/spec/imw/compressed_files/bz2_spec.rb +15 -0
  38. data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
  39. data/spec/imw/compressed_files/gz_spec.rb +15 -0
  40. data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
  41. data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
  42. data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
  43. data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
  44. data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
  45. data/spec/imw/resource_spec.rb +4 -4
  46. data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
  47. data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
  48. data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
  49. data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
  50. data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
  51. data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
  52. data/spec/imw/tools/transferer_spec.rb +113 -0
  53. metadata +69 -71
  54. data/lib/imw/resources.rb +0 -118
  55. data/lib/imw/resources/archives_and_compressed.rb +0 -32
  56. data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
  57. data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
  58. data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
  59. data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
  60. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
  61. data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
  62. data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
  63. data/lib/imw/resources/formats.rb +0 -32
  64. data/lib/imw/resources/formats/delimited.rb +0 -92
  65. data/lib/imw/resources/formats/excel.rb +0 -125
  66. data/lib/imw/resources/formats/json.rb +0 -53
  67. data/lib/imw/resources/formats/sgml.rb +0 -72
  68. data/lib/imw/resources/formats/yaml.rb +0 -53
  69. data/lib/imw/resources/local.rb +0 -198
  70. data/lib/imw/resources/remote.rb +0 -110
  71. data/lib/imw/resources/schemes.rb +0 -19
  72. data/lib/imw/resources/schemes/hdfs.rb +0 -242
  73. data/lib/imw/resources/schemes/http.rb +0 -161
  74. data/lib/imw/resources/schemes/s3.rb +0 -137
  75. data/lib/imw/transforms.rb +0 -8
  76. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
  77. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
  78. data/spec/imw/transforms/transferer_spec.rb +0 -113
@@ -1,18 +0,0 @@
1
- module IMW
2
- module Resources
3
- module CompressedFiles
4
- module Bz2
5
-
6
- include IMW::Resources::CompressedFile
7
-
8
- def compression_settings
9
- @compression_settings ||= {
10
- :decompression_program => :bzip2,
11
- :decompress => '-fd'
12
- }
13
- end
14
-
15
- end
16
- end
17
- end
18
- end
@@ -1,18 +0,0 @@
1
- module IMW
2
- module Resources
3
- module CompressedFiles
4
- module Gz
5
-
6
- include IMW::Resources::CompressedFile
7
-
8
- def compression_settings
9
- @compression_settings ||= {
10
- :decompression_program => :gunzip,
11
- :decompress => '-fd'
12
- }
13
- end
14
-
15
- end
16
- end
17
- end
18
- end
@@ -1,23 +0,0 @@
1
- require 'imw/resources/archive'
2
-
3
- module IMW
4
- module Resources
5
- module Archives
6
- module Rar
7
-
8
- include IMW::Resources::Archive
9
-
10
- def archive_settings
11
- @archive_settings ||= {
12
- :program => :rar,
13
- :create => ['a', '-o+', '-inul'],
14
- :append => ['a', '-o+', '-inul'],
15
- :list => "vb",
16
- :extract => ['x', '-o+', '-inul']
17
- }
18
- end
19
- end
20
- end
21
- end
22
- end
23
-
@@ -1,23 +0,0 @@
1
- require 'imw/resources/archive'
2
-
3
- module IMW
4
- module Resources
5
- module Archives
6
- module Tar
7
-
8
- include IMW::Resources::Archive
9
-
10
- def archive_settings
11
- @archive_settings ||= {
12
- :create => "-cf",
13
- :append => "-rf",
14
- :list => "-tf",
15
- :extract => "-xf",
16
- :program => :tar
17
- }
18
- end
19
- end
20
- end
21
- end
22
- end
23
-
@@ -1,78 +0,0 @@
1
- require 'imw/resources/archive'
2
- require 'imw/resources/compressed_file'
3
-
4
- module IMW
5
- module Resources
6
- module Archives
7
- module Tarbz2
8
-
9
- #
10
- # It's a compressed file
11
- #
12
-
13
- include IMW::Resources::CompressedFile
14
-
15
- def compression_settings
16
- @compression_settings ||= {
17
- :program => :bzip2,
18
- :decompression_program => :bunzip2,
19
- :decompress => '',
20
- :extension => 'bz2'
21
- }
22
- end
23
-
24
- #
25
- # But it's also an archive
26
- #
27
-
28
- include IMW::Resources::Archive
29
-
30
- def archive_settings
31
- @archive_settings ||= {
32
- :program => :tar,
33
- :create => '-cf',
34
- :list => "-tjf",
35
- :extract => "-xjf"
36
- }
37
- end
38
-
39
- # Overrides default behvaior of IMW::Files::Archive#create to
40
- # compress files after creating them.
41
- def create *input_paths
42
- IMW.system(archive_settings[:program], archive_settings[:create], path_between_archive_and_compression, *input_paths.flatten)
43
- IMW.open(path_between_archive_and_compression).compress!
44
- end
45
-
46
- def decompressed_basename
47
- case extname
48
- when '.tar.bz2' then basename[0..-5] # .tar.bz2 => .tar
49
- when '.tbz2' then basename.gsub(/tbz2$/, 'tar') # .tbz2 => .tar
50
- else basename[0..-(extname.size + 1)]
51
- end
52
- end
53
-
54
-
55
- protected
56
- def path_between_archive_and_compression
57
- File.join(dirname,name + '.tar')
58
- end
59
-
60
- public
61
-
62
- #
63
- # It's a compressed file AND an archive!
64
- #
65
-
66
- def extname
67
- case path
68
- when /\.tar\.bz2$/ then '.tar.bz2'
69
- when /\.tbz2$/ then '.tbz2'
70
- else File.extname(path)
71
- end
72
- end
73
-
74
- end
75
- end
76
- end
77
- end
78
-
@@ -1,78 +0,0 @@
1
- require 'imw/resources/archive'
2
- require 'imw/resources/compressed_file'
3
-
4
- module IMW
5
- module Resources
6
- module Archives
7
- module Targz
8
-
9
- #
10
- # It's a compressed file
11
- #
12
-
13
- include IMW::Resources::CompressedFile
14
-
15
- def compression_settings
16
- @compression_settings ||= {
17
- :program => :gzip,
18
- :decompression_program => :gunzip,
19
- :decompress => '',
20
- :extension => 'gz'
21
- }
22
- end
23
-
24
- #
25
- # But it's also an archive
26
- #
27
-
28
- include IMW::Resources::Archive
29
-
30
- def archive_settings
31
- @archive_settings ||= {
32
- :program => :tar,
33
- :list => "-tzf",
34
- :create => '-cf',
35
- :extract => "-xzf"
36
- }
37
- end
38
-
39
- # Overrides default behvaior of IMW::Files::Archive#create to
40
- # compress files after creating them.
41
- def create *input_paths
42
- IMW.system(archive_settings[:program], archive_settings[:create].split, path_between_archive_and_compression, *input_paths.flatten)
43
- tar = IMW.open(path_between_archive_and_compression)
44
- tar.compression_settings = compression_settings
45
- tar.compress!
46
- end
47
-
48
- def decompressed_basename
49
- case extname
50
- when '.tar.gz' then basename[0..-4] # .tar.gz => .tar
51
- when '.tgz' then basename.gsub(/tgz$/, 'tar') # .tgz => .tar
52
- else basename[0..-(extname.size + 1)]
53
- end
54
- end
55
-
56
- protected
57
- def path_between_archive_and_compression
58
- File.join(dirname,name + '.tar')
59
- end
60
- public
61
-
62
- #
63
- # It's both an archive and a compressed file!
64
- #
65
-
66
- def extname
67
- case path
68
- when /\.tar\.gz$/ then '.tar.gz'
69
- when /\.tgz$/ then '.tgz'
70
- else File.extname(path)
71
- end
72
- end
73
-
74
- end
75
- end
76
- end
77
- end
78
-
@@ -1,57 +0,0 @@
1
- require 'imw/resources/archive'
2
-
3
- module IMW
4
- module Resources
5
- module Archives
6
- module Zip
7
-
8
- include IMW::Resources::Archive
9
-
10
- def archive_settings
11
- @archive_settings ||= {
12
- :program => :zip,
13
- :create => "-qqr",
14
- :append => "-qqg",
15
- :list => "-l",
16
- :extract => "-qqo",
17
- :unarchiving_program => :unzip
18
- }
19
- end
20
-
21
- protected
22
-
23
- # The `unzip' program outputs data in a very annoying format:
24
- #
25
- # Archive: data.zip
26
- # Length Date Time Name
27
- # -------- ---- ---- ----
28
- # 18510 07-28-08 15:58 data/4d7Qrgz7.csv
29
- # 3418 07-28-08 15:41 data/7S.csv
30
- # 23353 07-28-08 15:41 data/g.csv
31
- # 711 07-28-08 15:58 data/g.xml
32
- # 1095 07-28-08 15:41 data/L.xml
33
- # 2399 07-28-08 15:58 data/mTAu9H3.xml
34
- # 152 07-28-08 15:58 data/vaHBS2t5R.dat
35
- # -------- -------
36
- # 49638 7 files
37
- #
38
- # which is parsed by this method.
39
- def archive_contents_string_to_array string
40
- rows = string.split("\n")
41
- # ignore the first 3 lines of the output and also discared the
42
- # last 2 (5 = 2 + 3)
43
- file_rows = rows[3,(rows.length - 5)]
44
- file_rows.map do |row|
45
- if row
46
- columns = row.lstrip.rstrip.split(' ')
47
- # grab the filename in the fourth column
48
- columns[3..-1].join(' ')
49
- end
50
- end.compact
51
- end
52
- end
53
- end
54
- end
55
- end
56
-
57
-
@@ -1,32 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Formats
4
- autoload :Csv, 'imw/resources/formats/delimited'
5
- autoload :Tsv, 'imw/resources/formats/delimited'
6
- autoload :Excel, 'imw/resources/formats/excel'
7
- autoload :Json, 'imw/resources/formats/json'
8
- autoload :Xml, 'imw/resources/formats/sgml'
9
- autoload :Xsl, 'imw/resources/formats/sgml'
10
- autoload :Html, 'imw/resources/formats/sgml'
11
- autoload :Xhtml, 'imw/resources/formats/sgml'
12
- autoload :Rdf, 'imw/resources/formats/sgml'
13
- autoload :Yaml, 'imw/resources/formats/yaml'
14
-
15
- # Handlers which augment a resource with data format specific
16
- # methods.
17
- FORMAT_HANDLERS = [
18
- [ "Formats::Csv", /\.csv$/ ],
19
- [ "Formats::Tsv", /\.tsv$/ ],
20
- [ "Formats::Excel", /\.xslx?$/ ],
21
- [ "Formats::Json", /\.json$/ ],
22
- [ "Formats::Xml", /\.xml$/ ],
23
- [ "Formats::Xsl", /\.xsl$/ ],
24
- [ "Formats::Html", /\.html?$/ ],
25
- [ "Formats::Xhtml", /\.xhtml?$/ ],
26
- [ "Formats::Rdf", /\.rdf?$/ ],
27
- [ "Formats::Yaml", /\.ya?ml$/ ]
28
- ]
29
- end
30
- end
31
- end
32
-
@@ -1,92 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Formats
4
-
5
- # Defines methods used for parsing and writing delimited data
6
- # formats (CSV, TSV, &c.) with the FasterCSV library. This
7
- # module is not used to directly extend a resource. Instead,
8
- # more specific modules (e.g. - IMW::Resources::Formats::Csv)
9
- # include this one and also define +delimited_options+ which is
10
- # actually what's passed to FasterCSV.
11
- #
12
- # @abstract
13
- module Delimited
14
-
15
- attr_accessor :delimited_settings
16
-
17
- # Return the data in this delimited resource as an array of
18
- # arrays.
19
- #
20
- # Yield each outer array (row) if passed a block.
21
- #
22
- # @return [Array] the full data matrix
23
- # @yield [Array] each row of the data
24
- def load &block
25
- require 'fastercsv'
26
- FasterCSV.parse(read, delimited_options, &block)
27
- end
28
-
29
- # Map each row in this delimited resource.
30
- #
31
- # @yield [Array] each row of the data
32
- def map &block
33
- load.map(&block)
34
- end
35
-
36
- # Dump an array of arrays into this resource.
37
- #
38
- # @param [Array] data array of arrays to dump
39
- # @param [Hash] options
40
- # @option options [true, false] :persist Keep this resource's IO object open after dumping
41
- def dump data, options={}
42
- require 'fastercsv'
43
- data.each do |row|
44
- write(FasterCSV.generate_line(row, delimited_options))
45
- end
46
- io.close unless options[:persist]
47
- self
48
- end
49
- end
50
-
51
- module Csv
52
- include Delimited
53
-
54
- # Default options to be passed to
55
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
56
- # documentation for more information.
57
- #
58
- # @return [Hash]
59
- def delimited_options
60
- @delimited_options ||= {
61
- :col_sep => ',',
62
- :headers => false,
63
- :return_headers => false,
64
- :write_headers => true,
65
- :skip_blanks => false,
66
- :force_quotes => false
67
- }
68
- end
69
- end
70
-
71
- module Tsv
72
- include Delimited
73
-
74
- # Default options to be passed to
75
- # FasterCSV[http://fastercsv.rubyforge.org/]; see its
76
- # documentation for more information.
77
- #
78
- # @return [Hash]
79
- def delimited_options
80
- @delimited_options ||= {
81
- :col_sep => "\t",
82
- :headers => false,
83
- :return_headers => false,
84
- :write_headers => true,
85
- :skip_blanks => false,
86
- :force_quotes => false
87
- }
88
- end
89
- end
90
- end
91
- end
92
- end
@@ -1,125 +0,0 @@
1
- module IMW
2
- module Resources
3
- module Formats
4
-
5
- # Defines methods for reading and writing Microsoft Excel data.
6
- module Excel
7
-
8
- attr_accessor :book, :sheet
9
-
10
- def self.extended obj
11
- if obj.exist?
12
- @book = Spreadsheet.open path
13
- @sheet = book.worksheet(0)
14
-
15
- end
16
- end
17
-
18
-
19
- def book
20
- return @book if @book
21
- if exists?
22
- @book = Spreadsheet.open(path)
23
- else
24
- @book = Spreadsheet::Workbook.new
25
- end
26
- end
27
-
28
- def sheet
29
- @sheet = @book.create_worksheet
30
- @sheet
31
- end
32
-
33
- #If an Excel file exists at the location specified by uri then
34
- #it is opened and can be read out with a subsequent call to
35
- #load(). Otherwise, a new workbook is created and can be written
36
- #to with the dump() method.
37
- def initialize uri, mode='r', options={}
38
- self.uri = uri
39
- @max_lines = options[:max_lines] || 65000
40
- @idx = 0
41
- @book_idx = 0
42
- @sht_idx = 0
43
- unless self.exist?
44
- make_new_book
45
- make_new_sheet
46
- else
47
- get_existing_book
48
- end
49
- end
50
-
51
- #Returns the data in an existing workbook as an
52
- #array of arrays. Only capable of reading a single sheet.
53
- def load
54
- @sheet.map{|row| row.to_a}
55
- end
56
-
57
- #Dumps data, which is assumed to be an array of arrays, to a
58
- #newly created Excel workbook. Attempting to dump to a book
59
- #that already exists will typically result in file corruption.
60
- #Raises a 'too many lines' error if the number of lines
61
- #of data exceeds max_lines.
62
- def dump data
63
- data.each do |line|
64
- raise "too many lines" if too_many?
65
- self << line
66
- end
67
- save unless no_data?
68
- end
69
-
70
- #Processes a single line of data and updates internal variables.
71
- #You shouldn't need to call this directly.
72
- def << line
73
- @sheet.row(@sht_row).concat( line )
74
- @sht_row += 1
75
- @idx += 1
76
- end
77
-
78
- #Instantiates a new Excel workbook in memory. You shouldn't
79
- #need to call this directly.
80
- def make_new_book
81
- @book = Spreadsheet::Workbook.new
82
- @book_idx += 1
83
- end
84
-
85
- #Makes a new worksheet for a pre-existing Excel workbook.
86
- #This should be called after recovering from the
87
- #'too many lines' error.
88
- def make_new_sheet
89
- @sheet = @book.create_worksheet
90
- @sht_idx += 1
91
- @sht_row = 0 #always start at row 0 in a new sheet
92
- end
93
-
94
- #Opens an existing Excel workbook. You shoudn't need to
95
- #call this directly.
96
- def get_existing_book
97
- @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
98
- @sht_idx += 1
99
- end
100
-
101
- #Increments the current sheet to the next one in
102
- #an open book. Not necessary at the moment.
103
- def incr_sheet
104
- @sheet = book.worksheet @sht_idx
105
- end
106
-
107
- #There are too many lines if the number of rows attempting
108
- #to be written exceeds max_lines.
109
- def too_many?
110
- @sht_row >= @max_lines
111
- end
112
-
113
- #There is no data if the number of rows attempting to be written
114
- #is zero.
115
- def no_data?
116
- @sht_row == 0
117
- end
118
-
119
- #Saves the workbook.
120
- def save
121
- @book.write path
122
- end
123
- end
124
- end
125
- end