imw 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. data/README.rdoc +34 -14
  2. data/Rakefile +1 -1
  3. data/VERSION +1 -1
  4. data/lib/imw.rb +9 -6
  5. data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
  6. data/lib/imw/archives/rar.rb +19 -0
  7. data/lib/imw/archives/tar.rb +19 -0
  8. data/lib/imw/archives/tarbz2.rb +73 -0
  9. data/lib/imw/archives/targz.rb +73 -0
  10. data/lib/imw/archives/zip.rb +51 -0
  11. data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
  12. data/lib/imw/compressed_files/bz2.rb +16 -0
  13. data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
  14. data/lib/imw/compressed_files/gz.rb +16 -0
  15. data/lib/imw/formats.rb +31 -0
  16. data/lib/imw/formats/delimited.rb +90 -0
  17. data/lib/imw/formats/excel.rb +125 -0
  18. data/lib/imw/formats/json.rb +51 -0
  19. data/lib/imw/formats/sgml.rb +69 -0
  20. data/lib/imw/formats/yaml.rb +51 -0
  21. data/lib/imw/resource.rb +108 -10
  22. data/lib/imw/schemes.rb +21 -0
  23. data/lib/imw/schemes/hdfs.rb +240 -0
  24. data/lib/imw/schemes/http.rb +166 -0
  25. data/lib/imw/schemes/local.rb +219 -0
  26. data/lib/imw/schemes/remote.rb +114 -0
  27. data/lib/imw/schemes/s3.rb +135 -0
  28. data/lib/imw/tools.rb +8 -0
  29. data/lib/imw/{transforms → tools}/archiver.rb +1 -1
  30. data/lib/imw/{transforms → tools}/transferer.rb +10 -10
  31. data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
  32. data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
  33. data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
  34. data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
  35. data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
  36. data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
  37. data/spec/imw/compressed_files/bz2_spec.rb +15 -0
  38. data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
  39. data/spec/imw/compressed_files/gz_spec.rb +15 -0
  40. data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
  41. data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
  42. data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
  43. data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
  44. data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
  45. data/spec/imw/resource_spec.rb +4 -4
  46. data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
  47. data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
  48. data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
  49. data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
  50. data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
  51. data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
  52. data/spec/imw/tools/transferer_spec.rb +113 -0
  53. metadata +69 -71
  54. data/lib/imw/resources.rb +0 -118
  55. data/lib/imw/resources/archives_and_compressed.rb +0 -32
  56. data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
  57. data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
  58. data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
  59. data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
  60. data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
  61. data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
  62. data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
  63. data/lib/imw/resources/formats.rb +0 -32
  64. data/lib/imw/resources/formats/delimited.rb +0 -92
  65. data/lib/imw/resources/formats/excel.rb +0 -125
  66. data/lib/imw/resources/formats/json.rb +0 -53
  67. data/lib/imw/resources/formats/sgml.rb +0 -72
  68. data/lib/imw/resources/formats/yaml.rb +0 -53
  69. data/lib/imw/resources/local.rb +0 -198
  70. data/lib/imw/resources/remote.rb +0 -110
  71. data/lib/imw/resources/schemes.rb +0 -19
  72. data/lib/imw/resources/schemes/hdfs.rb +0 -242
  73. data/lib/imw/resources/schemes/http.rb +0 -161
  74. data/lib/imw/resources/schemes/s3.rb +0 -137
  75. data/lib/imw/transforms.rb +0 -8
  76. data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
  77. data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
  78. data/spec/imw/transforms/transferer_spec.rb +0 -113
data/README.rdoc CHANGED
@@ -25,14 +25,13 @@ data. It has the following goals:
25
25
  The Infinite Monkeywrench is a powerful tool but it is not always the
26
26
  right one to use. IMW is **not** designed for
27
27
 
28
- * Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan], Monkeyshines[http://github.com/infochimps/monkeyshines], and Edamame[http://github.com/infochimps/edamame].)
28
+ * Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan] and Monkeyshines[http://github.com/infochimps/monkeyshines])
29
29
 
30
30
  * Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
31
31
 
32
- * Data mining
33
-
34
- * Data visualization
32
+ * Data mining or statistical analysis
35
33
 
34
+ * Visualization
36
35
 
37
36
  = Setup
38
37
 
@@ -47,21 +46,42 @@ and then install IMW
47
46
 
48
47
  $ sudo gem install imw
49
48
 
50
- = IMW Basics
49
+ In all the examples that follow it is assumed that you've installed
50
+ IMW and required it in a script via
51
51
 
52
- The central goal of IMW is to make workflow involved in processing a
53
- dataset from a raw source to a finished product as simple as possible.
52
+ require 'rubygems'
53
+ require 'imw'
54
54
 
55
- To help achieve this goal, IMW creates lots of convenient structures
56
- and methods. The following sections provide a tour of these.
55
+ = Resources
57
56
 
58
- It is assumed that you've installed IMW and required it in a script
59
- via
57
+ IMW is centered around processing resources. A resource can be
58
+ _anything_ with a URI and you create one using IMW.open.
60
59
 
61
- require 'rubygems'
62
- require 'imw'
60
+ csv = IMW.open('/path/to/my_data.csv')
61
+ html = IMW.open('http://www.infochimps.com')
62
+ tar_bz2 = IMW.open('/path/to/my_archive.tar.bz2')
63
+
64
+ IMW dynamically extends a resource with modules appropriate to it when
65
+ you open it. In the above case, +csv+ would be automatically extended
66
+ by the IMW::Resources::Formats::Csv module, among others:
67
+
68
+ csv.resource_modules
69
+ => [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Formats::Csv]
70
+
71
+ while +html+ will use a different set
72
+
73
+ html.resource_modules
74
+ => [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Formats::Csv]
75
+
76
+
77
+ Consult the documentation for the modules a resource uses to learn
78
+ what it can do.
79
+
80
+ Since resources are built around the idea of URIs, you can explicitly i
81
+
82
+ == Manipulating Paths
63
83
 
64
- == Paths
84
+ You can p
65
85
 
66
86
  IMW holds a registry of paths that you can define on the fly or store
67
87
  in a configuration file.
data/Rakefile CHANGED
@@ -21,7 +21,7 @@ end
21
21
 
22
22
  desc "Build tags"
23
23
  task :tags do
24
- system "etags -R bin etc examples lib spec"
24
+ system "etags -R README.rdoc bin etc examples lib spec"
25
25
  end
26
26
 
27
27
  desc "Build docs"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.0
1
+ 0.2.1
data/lib/imw.rb CHANGED
@@ -26,12 +26,15 @@ require 'imw/utils'
26
26
  # Repositories are collections of datasets and it is on these
27
27
  # collections that the +imw+ command line tool operates.
28
28
  module IMW
29
- autoload :Resource, 'imw/resource'
30
- autoload :Resources, 'imw/resources'
31
- autoload :Repository, 'imw/repository'
32
- autoload :Dataset, 'imw/dataset'
33
- autoload :Transforms, 'imw/transforms'
34
- autoload :Parsers, 'imw/parsers'
29
+ autoload :Resource, 'imw/resource'
30
+ autoload :Schemes, 'imw/schemes'
31
+ autoload :Archives, 'imw/archives'
32
+ autoload :CompressedFiles, 'imw/compressed_files'
33
+ autoload :Formats, 'imw/formats'
34
+ autoload :Tools, 'imw/tools'
35
+ autoload :Parsers, 'imw/parsers'
36
+ autoload :Dataset, 'imw/dataset'
37
+ autoload :Repository, 'imw/repository'
35
38
 
36
39
  # Open a resource at the given +uri+. The resource will
37
40
  # automatically be extended by modules which make sense given the
@@ -1,13 +1,24 @@
1
1
  module IMW
2
- module Resources
3
2
 
4
- module Archives
5
- autoload :Rar, 'imw/resources/archives_and_compressed/rar'
6
- autoload :Tar, 'imw/resources/archives_and_compressed/tar'
7
- autoload :Tarbz2, 'imw/resources/archives_and_compressed/tarbz2'
8
- autoload :Targz, 'imw/resources/archives_and_compressed/targz'
9
- autoload :Zip, 'imw/resources/archives_and_compressed/zip'
10
- end
3
+ # Contains modules which define the behavior of archive files.
4
+ module Archives
5
+
6
+ # Handlers for archives.
7
+ HANDLERS = [
8
+ ["Archives::Tarbz2", Proc.new { |r| r.is_local? && r.path =~ /\.tar\.bz2$/ } ],
9
+ ["Archives::Tarbz2", Proc.new { |r| r.is_local? && r.path =~ /\.tbz2$/ } ],
10
+ ["Archives::Targz", Proc.new { |r| r.is_local? && r.path =~ /\.tar\.gz$/ } ],
11
+ ["Archives::Targz", Proc.new { |r| r.is_local? && r.path =~ /\.tgz$/ } ],
12
+ ["Archives::Tar", Proc.new { |r| r.is_local? && r.path =~ /\.tar$/ } ],
13
+ ["Archives::Rar", Proc.new { |r| r.is_local? && r.path =~ /\.rar$/ } ],
14
+ ["Archives::Zip", Proc.new { |r| r.is_local? && r.path =~ /\.zip$/ } ]
15
+ ]
16
+
17
+ autoload :Rar, 'imw/archives/rar'
18
+ autoload :Tar, 'imw/archives/tar'
19
+ autoload :Tarbz2, 'imw/archives/tarbz2'
20
+ autoload :Targz, 'imw/archives/targz'
21
+ autoload :Zip, 'imw/archives/zip'
11
22
 
12
23
  # Defines methods for creating, appending to, extracting, and
13
24
  # listing an archive file. This module isn't used to directly
@@ -15,7 +26,7 @@ module IMW
15
26
  # (e.g. - IMW::Archives::Tarbz2) include this module
16
27
  # and define the specific settings (command-line flags, &c.)
17
28
  # required to make things work.
18
- module Archive
29
+ module Base
19
30
 
20
31
  attr_accessor :archive_settings
21
32
 
@@ -94,4 +105,3 @@ module IMW
94
105
  end
95
106
  end
96
107
 
97
-
@@ -0,0 +1,19 @@
1
+ module IMW
2
+ module Archives
3
+ module Rar
4
+
5
+ include IMW::Archives::Base
6
+
7
+ def archive_settings
8
+ @archive_settings ||= {
9
+ :program => :rar,
10
+ :create => ['a', '-o+', '-inul'],
11
+ :append => ['a', '-o+', '-inul'],
12
+ :list => "vb",
13
+ :extract => ['x', '-o+', '-inul']
14
+ }
15
+ end
16
+ end
17
+ end
18
+ end
19
+
@@ -0,0 +1,19 @@
1
+ module IMW
2
+ module Archives
3
+ module Tar
4
+
5
+ include IMW::Archives::Base
6
+
7
+ def archive_settings
8
+ @archive_settings ||= {
9
+ :create => "-cf",
10
+ :append => "-rf",
11
+ :list => "-tf",
12
+ :extract => "-xf",
13
+ :program => :tar
14
+ }
15
+ end
16
+ end
17
+ end
18
+ end
19
+
@@ -0,0 +1,73 @@
1
+ module IMW
2
+ module Archives
3
+ module Tarbz2
4
+
5
+ #
6
+ # It's a compressed file
7
+ #
8
+
9
+ include IMW::CompressedFiles::Base
10
+
11
+ def compression_settings
12
+ @compression_settings ||= {
13
+ :program => :bzip2,
14
+ :decompression_program => :bunzip2,
15
+ :decompress => '',
16
+ :extension => 'bz2'
17
+ }
18
+ end
19
+
20
+ #
21
+ # But it's also an archive
22
+ #
23
+
24
+ include IMW::Archives::Base
25
+
26
+ def archive_settings
27
+ @archive_settings ||= {
28
+ :program => :tar,
29
+ :create => '-cf',
30
+ :list => "-tjf",
31
+ :extract => "-xjf"
32
+ }
33
+ end
34
+
35
+ # Overrides default behavior of IMW::Archives::Base#create to
36
+ # compress files after creating them.
37
+ def create *input_paths
38
+ IMW.system(archive_settings[:program], archive_settings[:create], path_between_archive_and_compression, *input_paths.flatten)
39
+ IMW.open(path_between_archive_and_compression).compress!
40
+ end
41
+
42
+ def decompressed_basename
43
+ case extname
44
+ when '.tar.bz2' then basename[0..-5] # .tar.bz2 => .tar
45
+ when '.tbz2' then basename.gsub(/tbz2$/, 'tar') # .tbz2 => .tar
46
+ else basename[0..-(extname.size + 1)]
47
+ end
48
+ end
49
+
50
+
51
+ protected
52
+ def path_between_archive_and_compression
53
+ File.join(dirname,name + '.tar')
54
+ end
55
+
56
+ public
57
+
58
+ #
59
+ # It's a compressed file AND an archive!
60
+ #
61
+
62
+ def extname
63
+ case path
64
+ when /\.tar\.bz2$/ then '.tar.bz2'
65
+ when /\.tbz2$/ then '.tbz2'
66
+ else File.extname(path)
67
+ end
68
+ end
69
+
70
+ end
71
+ end
72
+ end
73
+
@@ -0,0 +1,73 @@
1
+ module IMW
2
+ module Archives
3
+ module Targz
4
+
5
+ #
6
+ # It's a compressed file
7
+ #
8
+
9
+ include IMW::CompressedFiles::Base
10
+
11
+ def compression_settings
12
+ @compression_settings ||= {
13
+ :program => :gzip,
14
+ :decompression_program => :gunzip,
15
+ :decompress => '',
16
+ :extension => 'gz'
17
+ }
18
+ end
19
+
20
+ #
21
+ # But it's also an archive
22
+ #
23
+
24
+ include IMW::Archives::Base
25
+
26
+ def archive_settings
27
+ @archive_settings ||= {
28
+ :program => :tar,
29
+ :list => "-tzf",
30
+ :create => '-cf',
31
+ :extract => "-xzf"
32
+ }
33
+ end
34
+
35
+ # Overrides default behavior of IMW::Archives::Base#create to
36
+ # compress files after creating them.
37
+ def create *input_paths
38
+ IMW.system(archive_settings[:program], archive_settings[:create].split, path_between_archive_and_compression, *input_paths.flatten)
39
+ tar = IMW.open(path_between_archive_and_compression)
40
+ tar.compression_settings = compression_settings
41
+ tar.compress!
42
+ end
43
+
44
+ def decompressed_basename
45
+ case extname
46
+ when '.tar.gz' then basename[0..-4] # .tar.gz => .tar
47
+ when '.tgz' then basename.gsub(/tgz$/, 'tar') # .tgz => .tar
48
+ else basename[0..-(extname.size + 1)]
49
+ end
50
+ end
51
+
52
+ protected
53
+ def path_between_archive_and_compression
54
+ File.join(dirname,name + '.tar')
55
+ end
56
+ public
57
+
58
+ #
59
+ # It's both an archive and a compressed file!
60
+ #
61
+
62
+ def extname
63
+ case path
64
+ when /\.tar\.gz$/ then '.tar.gz'
65
+ when /\.tgz$/ then '.tgz'
66
+ else File.extname(path)
67
+ end
68
+ end
69
+
70
+ end
71
+ end
72
+ end
73
+
@@ -0,0 +1,51 @@
1
+ module IMW
2
+ module Archives
3
+ module Zip
4
+
5
+ include IMW::Archives::Base
6
+
7
+ def archive_settings
8
+ @archive_settings ||= {
9
+ :program => :zip,
10
+ :create => "-qqr",
11
+ :append => "-qqg",
12
+ :list => "-l",
13
+ :extract => "-qqo",
14
+ :unarchiving_program => :unzip
15
+ }
16
+ end
17
+
18
+ protected
19
+
20
+ # The `unzip' program outputs data in a very annoying format:
21
+ #
22
+ # Archive: data.zip
23
+ # Length Date Time Name
24
+ # -------- ---- ---- ----
25
+ # 18510 07-28-08 15:58 data/4d7Qrgz7.csv
26
+ # 3418 07-28-08 15:41 data/7S.csv
27
+ # 23353 07-28-08 15:41 data/g.csv
28
+ # 711 07-28-08 15:58 data/g.xml
29
+ # 1095 07-28-08 15:41 data/L.xml
30
+ # 2399 07-28-08 15:58 data/mTAu9H3.xml
31
+ # 152 07-28-08 15:58 data/vaHBS2t5R.dat
32
+ # -------- -------
33
+ # 49638 7 files
34
+ #
35
+ # which is parsed by this method.
36
+ def archive_contents_string_to_array string
37
+ rows = string.split("\n")
38
+ # ignore the first 3 lines of the output and also discard the
39
+ # last 2 (5 = 2 + 3)
40
+ file_rows = rows[3,(rows.length - 5)]
41
+ file_rows.map do |row|
42
+ if row
43
+ columns = row.lstrip.rstrip.split(' ')
44
+ # grab the filename in the fourth column
45
+ columns[3..-1].join(' ')
46
+ end
47
+ end.compact
48
+ end
49
+ end
50
+ end
51
+ end
@@ -1,10 +1,19 @@
1
1
  module IMW
2
- module Resources
3
-
4
- module CompressedFiles
5
- autoload :Bz2, 'imw/resources/archives_and_compressed/bz2'
6
- autoload :Gz, 'imw/resources/archives_and_compressed/gz'
7
- end
2
+
3
+ # Contains modules which define the behavior of compressed files.
4
+ module CompressedFiles
5
+ autoload :Bz2, 'imw/compressed_files/bz2'
6
+ autoload :Gz, 'imw/compressed_files/gz'
7
+ autoload :Compressible, 'imw/compressed_files/compressible'
8
+
9
+ # Handlers which include modules for compressed file formats as
10
+ # well as the IMW::CompressedFiles::Compressible module for
11
+ # compressing regular files.
12
+ HANDLERS = [
13
+ ["CompressedFiles::Compressible", Proc.new { |r| r.is_local? && r.is_file? && r.path != /\.(bz2|gz|tgz|tbz2)$/ } ],
14
+ ["CompressedFiles::Gz", Proc.new { |r| r.is_local? && r.path =~ /\.gz$/ && r.path !~ /\.tar\.gz$/ && r.path !~ /\.tgz$/ } ],
15
+ ["CompressedFiles::Bz2", Proc.new { |r| r.is_local? && r.path =~ /\.bz2$/ && r.path !~ /\.tar\.bz2$/ && r.path !~ /\.tbz2$/ } ]
16
+ ]
8
17
 
9
18
  # Defines methods for decompressing a compressed file. This
10
19
  # module isn't used to directly extend an IMW::Resource --
@@ -12,7 +21,7 @@ module IMW
12
21
  # IMW::CompressedFiles::Bz2) include this module and
13
22
  # further define the command-line flags &c. needed to make
14
23
  # everything work.
15
- module CompressedFile
24
+ module Base
16
25
 
17
26
  attr_accessor :compression_settings
18
27
 
@@ -80,10 +89,6 @@ module IMW
80
89
  copy.mv(path) if copy && copy.exist?
81
90
  end
82
91
  end
83
-
84
92
  end
85
93
  end
86
94
  end
87
-
88
-
89
-
@@ -0,0 +1,16 @@
1
+ module IMW
2
+ module CompressedFiles
3
+ module Bz2
4
+
5
+ include IMW::CompressedFiles::Base
6
+
7
+ def compression_settings
8
+ @compression_settings ||= {
9
+ :decompression_program => :bzip2,
10
+ :decompress => '-fd'
11
+ }
12
+ end
13
+
14
+ end
15
+ end
16
+ end