imw 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. data/README.rdoc +34 -14
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/lib/imw.rb +9 -6
  5. data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
  6. data/lib/imw/archives/rar.rb +19 -0
  7. data/lib/imw/archives/tar.rb +19 -0
  8. data/lib/imw/archives/tarbz2.rb +73 -0
  9. data/lib/imw/archives/targz.rb +73 -0
  10. data/lib/imw/archives/zip.rb +51 -0
  11. data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
  12. data/lib/imw/compressed_files/bz2.rb +16 -0
  13. data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
  14. data/lib/imw/compressed_files/gz.rb +16 -0
  15. data/lib/imw/formats.rb +31 -0
  16. data/lib/imw/formats/delimited.rb +90 -0
  17. data/lib/imw/formats/excel.rb +125 -0
  18. data/lib/imw/formats/json.rb +51 -0
  19. data/lib/imw/formats/sgml.rb +69 -0
  20. data/lib/imw/formats/yaml.rb +51 -0
  21. data/lib/imw/resource.rb +108 -10
  22. data/lib/imw/schemes.rb +21 -0
  23. data/lib/imw/schemes/hdfs.rb +240 -0
  24. data/lib/imw/schemes/http.rb +166 -0
  25. data/lib/imw/schemes/local.rb +219 -0
  26. data/lib/imw/schemes/remote.rb +114 -0
  27. data/lib/imw/schemes/s3.rb +135 -0
  28. data/lib/imw/tools.rb +8 -0
  29. data/lib/imw/{transforms → tools}/archiver.rb +1 -1
  30. data/lib/imw/{transforms → tools}/transferer.rb +10 -10
  31. data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
  32. data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
  33. data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
  34. data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
  35. data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
  36. data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
  37. data/spec/imw/compressed_files/bz2_spec.rb +15 -0
  38. data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
  39. data/spec/imw/compressed_files/gz_spec.rb +15 -0
  40. data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
  41. data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
  42. data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
  43. data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
  44. data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
  45. data/spec/imw/resource_spec.rb +4 -4
  46. data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
  47. data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
  48. data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
  49. data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
  50. data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
  51. data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
  52. data/spec/imw/tools/transferer_spec.rb +113 -0
  53. metadata +69 -71
  54. data/lib/imw/resources.rb +0 -118
  55. data/lib/imw/resources/archives_and_compressed.rb +0 -32
  56. data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
  57. data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
  58. data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
  59. data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
  60. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
  61. data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
  62. data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
  63. data/lib/imw/resources/formats.rb +0 -32
  64. data/lib/imw/resources/formats/delimited.rb +0 -92
  65. data/lib/imw/resources/formats/excel.rb +0 -125
  66. data/lib/imw/resources/formats/json.rb +0 -53
  67. data/lib/imw/resources/formats/sgml.rb +0 -72
  68. data/lib/imw/resources/formats/yaml.rb +0 -53
  69. data/lib/imw/resources/local.rb +0 -198
  70. data/lib/imw/resources/remote.rb +0 -110
  71. data/lib/imw/resources/schemes.rb +0 -19
  72. data/lib/imw/resources/schemes/hdfs.rb +0 -242
  73. data/lib/imw/resources/schemes/http.rb +0 -161
  74. data/lib/imw/resources/schemes/s3.rb +0 -137
  75. data/lib/imw/transforms.rb +0 -8
  76. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
  77. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
  78. data/spec/imw/transforms/transferer_spec.rb +0 -113
data/README.rdoc CHANGED
@@ -25,14 +25,13 @@ data. It has the following goals:
25
25
  The Infinite Monkeywrench is a powerful tool but it is not always the
26
26
  right one to use. IMW is **not** designed for
27
27
 
28
- * Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan], Monkeyshines[http://github.com/infochimps/monkeyshines], and Edamame[http://github.com/infochimps/edamame].)
28
+ * Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan] and Monkeyshines[http://github.com/infochimps/monkeyshines])
29
29
 
30
30
  * Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
31
31
 
32
- * Data mining
33
-
34
- * Data visualization
32
+ * Data mining or statistical analysis
35
33
 
34
+ * Visualization
36
35
 
37
36
  = Setup
38
37
 
@@ -47,21 +46,42 @@ and then install IMW
47
46
 
48
47
  $ sudo gem install imw
49
48
 
50
- = IMW Basics
49
+ In all the examples that follow it is assumed that you've installed
50
+ IMW and required it in a script via
51
51
 
52
- The central goal of IMW is to make workflow involved in processing a
53
- dataset from a raw source to a finished product as simple as possible.
52
+ require 'rubygems'
53
+ require 'imw'
54
54
 
55
- To help achieve this goal, IMW creates lots of convenient structures
56
- and methods. The following sections provide a tour of these.
55
+ = Resources
57
56
 
58
- It is assumed that you've installed IMW and required it in a script
59
- via
57
+ IMW is centered around processing resources. A resource can be
58
+ _anything_ with a URI and you create one using IMW.open.
60
59
 
61
- require 'rubygems'
62
- require 'imw'
60
+ csv = IMW.open('/path/to/my_data.csv')
61
+ html = IMW.open('http://www.infochimps.com')
62
+ tar_bz2 = IMW.open(
63
+
64
+ IMW dynamically extends a resource with modules appropriate to it when
65
+ you open it. In the above case, +csv+ would be automatically extended
66
+ by the IMW::Resources::Formats::Csv module, among others:
67
+
68
+ csv.resource_modules
69
+ => [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Formats::Csv]
70
+
71
+ while +html+ will use a different set
72
+
73
+ html.resource_modules
74
+ => [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Formats::Csv]
75
+
76
+
77
+ Consult the documentation for the modules a resource uses to learn
78
+ what it can do.
79
+
80
+ Since resources are built around the idea of URIs, you can explicitly i
81
+
82
+ == Manipulating Paths
63
83
 
64
- == Paths
84
+ You can p
65
85
 
66
86
  IMW holds a registry of paths that you can define on the fly or store
67
87
  in a configuration file.
data/Rakefile CHANGED
@@ -21,7 +21,7 @@ end
21
21
 
22
22
  desc "Build tags"
23
23
  task :tags do
24
- system "etags -R bin etc examples lib spec"
24
+ system "etags -R README.rdoc bin etc examples lib spec"
25
25
  end
26
26
 
27
27
  desc "Build docs"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.2.1
data/lib/imw.rb CHANGED
@@ -26,12 +26,15 @@ require 'imw/utils'
26
26
  # Repositories are collections of datasets and it is on these
27
27
  # collections that the +imw+ command line tool operates.
28
28
  module IMW
29
- autoload :Resource, 'imw/resource'
30
- autoload :Resources, 'imw/resources'
31
- autoload :Repository, 'imw/repository'
32
- autoload :Dataset, 'imw/dataset'
33
- autoload :Transforms, 'imw/transforms'
34
- autoload :Parsers, 'imw/parsers'
29
+ autoload :Resource, 'imw/resource'
30
+ autoload :Schemes, 'imw/schemes'
31
+ autoload :Archives, 'imw/archives'
32
+ autoload :CompressedFiles, 'imw/compressed_files'
33
+ autoload :Formats, 'imw/formats'
34
+ autoload :Tools, 'imw/tools'
35
+ autoload :Parsers, 'imw/parsers'
36
+ autoload :Dataset, 'imw/dataset'
37
+ autoload :Repository, 'imw/repository'
35
38
 
36
39
  # Open a resource at the given +uri+. The resource will
37
40
  # automatically be extended by modules which make sense given the
@@ -1,13 +1,24 @@
1
1
  module IMW
2
- module Resources
3
2
 
4
- module Archives
5
- autoload :Rar, 'imw/resources/archives_and_compressed/rar'
6
- autoload :Tar, 'imw/resources/archives_and_compressed/tar'
7
- autoload :Tarbz2, 'imw/resources/archives_and_compressed/tarbz2'
8
- autoload :Targz, 'imw/resources/archives_and_compressed/targz'
9
- autoload :Zip, 'imw/resources/archives_and_compressed/zip'
10
- end
3
+ # Contains modules which define the behavior of archive files.
4
+ module Archives
5
+
6
+ # Handlers for archives.
7
+ HANDLERS = [
8
+ ["Archives::Tarbz2", Proc.new { |r| r.is_local? && r.path =~ /\.tar\.bz2$/ } ],
9
+ ["Archives::Tarbz2", Proc.new { |r| r.is_local? && r.path =~ /\.tbz2$/ } ],
10
+ ["Archives::Targz", Proc.new { |r| r.is_local? && r.path =~ /\.tar\.gz$/ } ],
11
+ ["Archives::Targz", Proc.new { |r| r.is_local? && r.path =~ /\.tgz$/ } ],
12
+ ["Archives::Tar", Proc.new { |r| r.is_local? && r.path =~ /\.tar$/ } ],
13
+ ["Archives::Rar", Proc.new { |r| r.is_local? && r.path =~ /\.rar$/ } ],
14
+ ["Archives::Zip", Proc.new { |r| r.is_local? && r.path =~ /\.zip$/ } ]
15
+ ]
16
+
17
+ autoload :Rar, 'imw/archives/rar'
18
+ autoload :Tar, 'imw/archives/tar'
19
+ autoload :Tarbz2, 'imw/archives/tarbz2'
20
+ autoload :Targz, 'imw/archives/targz'
21
+ autoload :Zip, 'imw/archives/zip'
11
22
 
12
23
  # Defines methods for creating, appending to, extracting, and
13
24
  # listing an archive file. This module isn't used to directly
@@ -15,7 +26,7 @@ module IMW
15
26
  # (e.g. - IMW::Resources::Archives::Tarbz2) include this module
16
27
  # and define the specific settings (command-line flags, &c.)
17
28
  # required to make things work.
18
- module Archive
29
+ module Base
19
30
 
20
31
  attr_accessor :archive_settings
21
32
 
@@ -94,4 +105,3 @@ module IMW
94
105
  end
95
106
  end
96
107
 
97
-
@@ -0,0 +1,19 @@
1
+ module IMW
2
+ module Archives
3
+ module Rar
4
+
5
+ include IMW::Archives::Base
6
+
7
+ def archive_settings
8
+ @archive_settings ||= {
9
+ :program => :rar,
10
+ :create => ['a', '-o+', '-inul'],
11
+ :append => ['a', '-o+', '-inul'],
12
+ :list => "vb",
13
+ :extract => ['x', '-o+', '-inul']
14
+ }
15
+ end
16
+ end
17
+ end
18
+ end
19
+
@@ -0,0 +1,19 @@
1
+ module IMW
2
+ module Archives
3
+ module Tar
4
+
5
+ include IMW::Archives::Base
6
+
7
+ def archive_settings
8
+ @archive_settings ||= {
9
+ :create => "-cf",
10
+ :append => "-rf",
11
+ :list => "-tf",
12
+ :extract => "-xf",
13
+ :program => :tar
14
+ }
15
+ end
16
+ end
17
+ end
18
+ end
19
+
@@ -0,0 +1,73 @@
1
+ module IMW
2
+ module Archives
3
+ module Tarbz2
4
+
5
+ #
6
+ # It's a compressed file
7
+ #
8
+
9
+ include IMW::CompressedFiles::Base
10
+
11
+ def compression_settings
12
+ @compression_settings ||= {
13
+ :program => :bzip2,
14
+ :decompression_program => :bunzip2,
15
+ :decompress => '',
16
+ :extension => 'bz2'
17
+ }
18
+ end
19
+
20
+ #
21
+ # But it's also an archive
22
+ #
23
+
24
+ include IMW::Archives::Base
25
+
26
+ def archive_settings
27
+ @archive_settings ||= {
28
+ :program => :tar,
29
+ :create => '-cf',
30
+ :list => "-tjf",
31
+ :extract => "-xjf"
32
+ }
33
+ end
34
+
35
+ # Overrides default behavior of IMW::Files::Archive#create to
36
+ # compress files after creating them.
37
+ def create *input_paths
38
+ IMW.system(archive_settings[:program], archive_settings[:create], path_between_archive_and_compression, *input_paths.flatten)
39
+ IMW.open(path_between_archive_and_compression).compress!
40
+ end
41
+
42
+ def decompressed_basename
43
+ case extname
44
+ when '.tar.bz2' then basename[0..-5] # .tar.bz2 => .tar
45
+ when '.tbz2' then basename.gsub(/tbz2$/, 'tar') # .tbz2 => .tar
46
+ else basename[0..-(extname.size + 1)]
47
+ end
48
+ end
49
+
50
+
51
+ protected
52
+ def path_between_archive_and_compression
53
+ File.join(dirname,name + '.tar')
54
+ end
55
+
56
+ public
57
+
58
+ #
59
+ # It's a compressed file AND an archive!
60
+ #
61
+
62
+ def extname
63
+ case path
64
+ when /\.tar\.bz2$/ then '.tar.bz2'
65
+ when /\.tbz2$/ then '.tbz2'
66
+ else File.extname(path)
67
+ end
68
+ end
69
+
70
+ end
71
+ end
72
+ end
73
+
@@ -0,0 +1,73 @@
1
+ module IMW
2
+ module Archives
3
+ module Targz
4
+
5
+ #
6
+ # It's a compressed file
7
+ #
8
+
9
+ include IMW::CompressedFiles::Base
10
+
11
+ def compression_settings
12
+ @compression_settings ||= {
13
+ :program => :gzip,
14
+ :decompression_program => :gunzip,
15
+ :decompress => '',
16
+ :extension => 'gz'
17
+ }
18
+ end
19
+
20
+ #
21
+ # But it's also an archive
22
+ #
23
+
24
+ include IMW::Archives::Base
25
+
26
+ def archive_settings
27
+ @archive_settings ||= {
28
+ :program => :tar,
29
+ :list => "-tzf",
30
+ :create => '-cf',
31
+ :extract => "-xzf"
32
+ }
33
+ end
34
+
35
+ # Overrides default behavior of IMW::Files::Archive#create to
36
+ # compress files after creating them.
37
+ def create *input_paths
38
+ IMW.system(archive_settings[:program], archive_settings[:create].split, path_between_archive_and_compression, *input_paths.flatten)
39
+ tar = IMW.open(path_between_archive_and_compression)
40
+ tar.compression_settings = compression_settings
41
+ tar.compress!
42
+ end
43
+
44
+ def decompressed_basename
45
+ case extname
46
+ when '.tar.gz' then basename[0..-4] # .tar.gz => .tar
47
+ when '.tgz' then basename.gsub(/tgz$/, 'tar') # .tgz => .tar
48
+ else basename[0..-(extname.size + 1)]
49
+ end
50
+ end
51
+
52
+ protected
53
+ def path_between_archive_and_compression
54
+ File.join(dirname,name + '.tar')
55
+ end
56
+ public
57
+
58
+ #
59
+ # It's both an archive and a compressed file!
60
+ #
61
+
62
+ def extname
63
+ case path
64
+ when /\.tar\.gz$/ then '.tar.gz'
65
+ when /\.tgz$/ then '.tgz'
66
+ else File.extname(path)
67
+ end
68
+ end
69
+
70
+ end
71
+ end
72
+ end
73
+
@@ -0,0 +1,51 @@
1
+ module IMW
2
+ module Archives
3
+ module Zip
4
+
5
+ include IMW::Archives::Base
6
+
7
+ def archive_settings
8
+ @archive_settings ||= {
9
+ :program => :zip,
10
+ :create => "-qqr",
11
+ :append => "-qqg",
12
+ :list => "-l",
13
+ :extract => "-qqo",
14
+ :unarchiving_program => :unzip
15
+ }
16
+ end
17
+
18
+ protected
19
+
20
+ # The `unzip' program outputs data in a very annoying format:
21
+ #
22
+ # Archive: data.zip
23
+ # Length Date Time Name
24
+ # -------- ---- ---- ----
25
+ # 18510 07-28-08 15:58 data/4d7Qrgz7.csv
26
+ # 3418 07-28-08 15:41 data/7S.csv
27
+ # 23353 07-28-08 15:41 data/g.csv
28
+ # 711 07-28-08 15:58 data/g.xml
29
+ # 1095 07-28-08 15:41 data/L.xml
30
+ # 2399 07-28-08 15:58 data/mTAu9H3.xml
31
+ # 152 07-28-08 15:58 data/vaHBS2t5R.dat
32
+ # -------- -------
33
+ # 49638 7 files
34
+ #
35
+ # which is parsed by this method.
36
+ def archive_contents_string_to_array string
37
+ rows = string.split("\n")
38
+ # ignore the first 3 lines of the output and also discard the
39
+ # last 2 (5 = 2 + 3)
40
+ file_rows = rows[3,(rows.length - 5)]
41
+ file_rows.map do |row|
42
+ if row
43
+ columns = row.lstrip.rstrip.split(' ')
44
+ # grab the filename in the fourth column
45
+ columns[3..-1].join(' ')
46
+ end
47
+ end.compact
48
+ end
49
+ end
50
+ end
51
+ end
@@ -1,10 +1,19 @@
1
1
  module IMW
2
- module Resources
3
-
4
- module CompressedFiles
5
- autoload :Bz2, 'imw/resources/archives_and_compressed/bz2'
6
- autoload :Gz, 'imw/resources/archives_and_compressed/gz'
7
- end
2
+
3
+ # Contains modules which define the behavior of compressed files.
4
+ module CompressedFiles
5
+ autoload :Bz2, 'imw/compressed_files/bz2'
6
+ autoload :Gz, 'imw/compressed_files/gz'
7
+ autoload :Compressible, 'imw/compressed_files/compressible'
8
+
9
+ # Handlers which include modules for compressed file formats as
10
+ # well as the IMW::CompressedFiles::Compressible module for
11
+ # compressing regular files.
12
+ HANDLERS = [
13
+ ["CompressedFiles::Compressible", Proc.new { |r| r.is_local? && r.is_file? && r.path != /\.(bz2|gz|tgz|tbz2)$/ } ],
14
+ ["CompressedFiles::Gz", Proc.new { |r| r.is_local? && r.path =~ /\.gz$/ && r.path !~ /\.tar\.gz$/ && r.path !~ /\.tgz$/ } ],
15
+ ["CompressedFiles::Bz2", Proc.new { |r| r.is_local? && r.path =~ /\.bz2$/ && r.path !~ /\.tar\.bz2$/ && r.path !~ /\.tbz2$/ } ]
16
+ ]
8
17
 
9
18
  # Defines methods for decompressing a compressed file. This
10
19
  # module isn't used to directly extend an IMW::Resource --
@@ -12,7 +21,7 @@ module IMW
12
21
  # IMW::Resources::CompressedFiles::Bz2) include this module and
13
22
  # further define the command-line flags &c. needed to make
14
23
  # everything work.
15
- module CompressedFile
24
+ module Base
16
25
 
17
26
  attr_accessor :compression_settings
18
27
 
@@ -80,10 +89,6 @@ module IMW
80
89
  copy.mv(path) if copy && copy.exist?
81
90
  end
82
91
  end
83
-
84
92
  end
85
93
  end
86
94
  end
87
-
88
-
89
-
@@ -0,0 +1,16 @@
1
+ module IMW
2
+ module CompressedFiles
3
+ module Bz2
4
+
5
+ include IMW::CompressedFiles::Base
6
+
7
+ def compression_settings
8
+ @compression_settings ||= {
9
+ :decompression_program => :bzip2,
10
+ :decompress => '-fd'
11
+ }
12
+ end
13
+
14
+ end
15
+ end
16
+ end