imw 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +34 -14
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/imw.rb +9 -6
- data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
- data/lib/imw/archives/rar.rb +19 -0
- data/lib/imw/archives/tar.rb +19 -0
- data/lib/imw/archives/tarbz2.rb +73 -0
- data/lib/imw/archives/targz.rb +73 -0
- data/lib/imw/archives/zip.rb +51 -0
- data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
- data/lib/imw/compressed_files/bz2.rb +16 -0
- data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
- data/lib/imw/compressed_files/gz.rb +16 -0
- data/lib/imw/formats.rb +31 -0
- data/lib/imw/formats/delimited.rb +90 -0
- data/lib/imw/formats/excel.rb +125 -0
- data/lib/imw/formats/json.rb +51 -0
- data/lib/imw/formats/sgml.rb +69 -0
- data/lib/imw/formats/yaml.rb +51 -0
- data/lib/imw/resource.rb +108 -10
- data/lib/imw/schemes.rb +21 -0
- data/lib/imw/schemes/hdfs.rb +240 -0
- data/lib/imw/schemes/http.rb +166 -0
- data/lib/imw/schemes/local.rb +219 -0
- data/lib/imw/schemes/remote.rb +114 -0
- data/lib/imw/schemes/s3.rb +135 -0
- data/lib/imw/tools.rb +8 -0
- data/lib/imw/{transforms → tools}/archiver.rb +1 -1
- data/lib/imw/{transforms → tools}/transferer.rb +10 -10
- data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
- data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
- data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
- data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
- data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
- data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
- data/spec/imw/compressed_files/bz2_spec.rb +15 -0
- data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
- data/spec/imw/compressed_files/gz_spec.rb +15 -0
- data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
- data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
- data/spec/imw/resource_spec.rb +4 -4
- data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
- data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
- data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
- data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
- data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
- data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
- data/spec/imw/tools/transferer_spec.rb +113 -0
- metadata +69 -71
- data/lib/imw/resources.rb +0 -118
- data/lib/imw/resources/archives_and_compressed.rb +0 -32
- data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
- data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
- data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
- data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
- data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
- data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
- data/lib/imw/resources/formats.rb +0 -32
- data/lib/imw/resources/formats/delimited.rb +0 -92
- data/lib/imw/resources/formats/excel.rb +0 -125
- data/lib/imw/resources/formats/json.rb +0 -53
- data/lib/imw/resources/formats/sgml.rb +0 -72
- data/lib/imw/resources/formats/yaml.rb +0 -53
- data/lib/imw/resources/local.rb +0 -198
- data/lib/imw/resources/remote.rb +0 -110
- data/lib/imw/resources/schemes.rb +0 -19
- data/lib/imw/resources/schemes/hdfs.rb +0 -242
- data/lib/imw/resources/schemes/http.rb +0 -161
- data/lib/imw/resources/schemes/s3.rb +0 -137
- data/lib/imw/transforms.rb +0 -8
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
- data/spec/imw/transforms/transferer_spec.rb +0 -113
data/README.rdoc
CHANGED
@@ -25,14 +25,13 @@ data. It has the following goals:
|
|
25
25
|
The Infinite Monkeywrench is a powerful tool but it is not always the
|
26
26
|
right one to use. IMW is **not** designed for
|
27
27
|
|
28
|
-
* Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan]
|
28
|
+
* Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan] and Monkeyshines[http://github.com/infochimps/monkeyshines])
|
29
29
|
|
30
30
|
* Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
|
31
31
|
|
32
|
-
* Data mining
|
33
|
-
|
34
|
-
* Data visualization
|
32
|
+
* Data mining or statistical analysis
|
35
33
|
|
34
|
+
* Visualization
|
36
35
|
|
37
36
|
= Setup
|
38
37
|
|
@@ -47,21 +46,42 @@ and then install IMW
|
|
47
46
|
|
48
47
|
$ sudo gem install imw
|
49
48
|
|
50
|
-
|
49
|
+
In all the examples that follow it is assumed that you've installed
|
50
|
+
IMW and required it in a script via
|
51
51
|
|
52
|
-
|
53
|
-
|
52
|
+
require 'rubygems'
|
53
|
+
require 'imw'
|
54
54
|
|
55
|
-
|
56
|
-
and methods. The following sections provide a tour of these.
|
55
|
+
= Resources
|
57
56
|
|
58
|
-
|
59
|
-
|
57
|
+
IMW is centered around processing resources. A resource can be
|
58
|
+
_anything_ with a URI and you create one using IMW.open.
|
60
59
|
|
61
|
-
|
62
|
-
|
60
|
+
csv = IMW.open('/path/to/my_data.csv')
|
61
|
+
html = IMW.open('http://www.infochimps.com')
|
62
|
+
tar_bz2 = IMW.open(
|
63
|
+
|
64
|
+
IMW dynamically extends a resource with modules appropriate to it when
|
65
|
+
you open it. In the above case, +csv+ would be automatically extended
|
66
|
+
by the IMW::Resources::Formats::Csv module, among others:
|
67
|
+
|
68
|
+
csv.resource_modules
|
69
|
+
=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Formats::Csv]
|
70
|
+
|
71
|
+
while +html+ will use a different set
|
72
|
+
|
73
|
+
html.resource_modules
|
74
|
+
=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Formats::Csv]
|
75
|
+
|
76
|
+
|
77
|
+
Consult the documentation for the modules a resource uses to learn
|
78
|
+
what it can do.
|
79
|
+
|
80
|
+
Since resources are built around the idea of URIs, you can explicitly i
|
81
|
+
|
82
|
+
== Manipulating Paths
|
63
83
|
|
64
|
-
|
84
|
+
You can p
|
65
85
|
|
66
86
|
IMW holds a registry of paths that you can define on the fly or store
|
67
87
|
in a configuration file.
|
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.0
|
1
|
+
0.2.1
|
data/lib/imw.rb
CHANGED
@@ -26,12 +26,15 @@ require 'imw/utils'
|
|
26
26
|
# Repositories are collections of datasets and it is on these
|
27
27
|
# collections that the +imw+ command line tool operates.
|
28
28
|
module IMW
|
29
|
-
autoload :Resource,
|
30
|
-
autoload :
|
31
|
-
autoload :
|
32
|
-
autoload :
|
33
|
-
autoload :
|
34
|
-
autoload :
|
29
|
+
autoload :Resource, 'imw/resource'
|
30
|
+
autoload :Schemes, 'imw/schemes'
|
31
|
+
autoload :Archives, 'imw/archives'
|
32
|
+
autoload :CompressedFiles, 'imw/compressed_files'
|
33
|
+
autoload :Formats, 'imw/formats'
|
34
|
+
autoload :Tools, 'imw/tools'
|
35
|
+
autoload :Parsers, 'imw/parsers'
|
36
|
+
autoload :Dataset, 'imw/dataset'
|
37
|
+
autoload :Repository, 'imw/repository'
|
35
38
|
|
36
39
|
# Open a resource at the given +uri+. The resource will
|
37
40
|
# automatically be extended by modules which make sense given the
|
@@ -1,13 +1,24 @@
|
|
1
1
|
module IMW
|
2
|
-
module Resources
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
# Contains modules which define the behavior of archive files.
|
4
|
+
module Archives
|
5
|
+
|
6
|
+
# Handlers for archives.
|
7
|
+
HANDLERS = [
|
8
|
+
["Archives::Tarbz2", Proc.new { |r| r.is_local? && r.path =~ /\.tar\.bz2$/ } ],
|
9
|
+
["Archives::Tarbz2", Proc.new { |r| r.is_local? && r.path =~ /\.tbz2$/ } ],
|
10
|
+
["Archives::Targz", Proc.new { |r| r.is_local? && r.path =~ /\.tar\.gz$/ } ],
|
11
|
+
["Archives::Targz", Proc.new { |r| r.is_local? && r.path =~ /\.tgz$/ } ],
|
12
|
+
["Archives::Tar", Proc.new { |r| r.is_local? && r.path =~ /\.tar$/ } ],
|
13
|
+
["Archives::Rar", Proc.new { |r| r.is_local? && r.path =~ /\.rar$/ } ],
|
14
|
+
["Archives::Zip", Proc.new { |r| r.is_local? && r.path =~ /\.zip$/ } ]
|
15
|
+
]
|
16
|
+
|
17
|
+
autoload :Rar, 'imw/archives/rar'
|
18
|
+
autoload :Tar, 'imw/archives/tar'
|
19
|
+
autoload :Tarbz2, 'imw/archives/tarbz2'
|
20
|
+
autoload :Targz, 'imw/archives/targz'
|
21
|
+
autoload :Zip, 'imw/archives/zip'
|
11
22
|
|
12
23
|
# Defines methods for creating, appending to, extracting, and
|
13
24
|
# listing an archive file. This module isn't used to directly
|
@@ -15,7 +26,7 @@ module IMW
|
|
15
26
|
# (e.g. - IMW::Resources::Archives::Tarbz2) include this module
|
16
27
|
# and define the specific settings (command-line flags, &c.)
|
17
28
|
# required to make things work.
|
18
|
-
module
|
29
|
+
module Base
|
19
30
|
|
20
31
|
attr_accessor :archive_settings
|
21
32
|
|
@@ -94,4 +105,3 @@ module IMW
|
|
94
105
|
end
|
95
106
|
end
|
96
107
|
|
97
|
-
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Rar
|
4
|
+
|
5
|
+
include IMW::Archives::Base
|
6
|
+
|
7
|
+
def archive_settings
|
8
|
+
@archive_settings ||= {
|
9
|
+
:program => :rar,
|
10
|
+
:create => ['a', '-o+', '-inul'],
|
11
|
+
:append => ['a', '-o+', '-inul'],
|
12
|
+
:list => "vb",
|
13
|
+
:extract => ['x', '-o+', '-inul']
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Tar
|
4
|
+
|
5
|
+
include IMW::Archives::Base
|
6
|
+
|
7
|
+
def archive_settings
|
8
|
+
@archive_settings ||= {
|
9
|
+
:create => "-cf",
|
10
|
+
:append => "-rf",
|
11
|
+
:list => "-tf",
|
12
|
+
:extract => "-xf",
|
13
|
+
:program => :tar
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Tarbz2
|
4
|
+
|
5
|
+
#
|
6
|
+
# It's a compressed file
|
7
|
+
#
|
8
|
+
|
9
|
+
include IMW::CompressedFiles::Base
|
10
|
+
|
11
|
+
def compression_settings
|
12
|
+
@compression_settings ||= {
|
13
|
+
:program => :bzip2,
|
14
|
+
:decompression_program => :bunzip2,
|
15
|
+
:decompress => '',
|
16
|
+
:extension => 'bz2'
|
17
|
+
}
|
18
|
+
end
|
19
|
+
|
20
|
+
#
|
21
|
+
# But it's also an archive
|
22
|
+
#
|
23
|
+
|
24
|
+
include IMW::Archives::Base
|
25
|
+
|
26
|
+
def archive_settings
|
27
|
+
@archive_settings ||= {
|
28
|
+
:program => :tar,
|
29
|
+
:create => '-cf',
|
30
|
+
:list => "-tjf",
|
31
|
+
:extract => "-xjf"
|
32
|
+
}
|
33
|
+
end
|
34
|
+
|
35
|
+
# Overrides default behavior of IMW::Files::Archive#create to
|
36
|
+
# compress files after creating them.
|
37
|
+
def create *input_paths
|
38
|
+
IMW.system(archive_settings[:program], archive_settings[:create], path_between_archive_and_compression, *input_paths.flatten)
|
39
|
+
IMW.open(path_between_archive_and_compression).compress!
|
40
|
+
end
|
41
|
+
|
42
|
+
def decompressed_basename
|
43
|
+
case extname
|
44
|
+
when '.tar.bz2' then basename[0..-5] # .tar.bz2 => .tar
|
45
|
+
when '.tbz2' then basename.gsub(/tbz2$/, 'tar') # .tbz2 => .tar
|
46
|
+
else basename[0..-(extname.size + 1)]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
|
51
|
+
protected
|
52
|
+
def path_between_archive_and_compression
|
53
|
+
File.join(dirname,name + '.tar')
|
54
|
+
end
|
55
|
+
|
56
|
+
public
|
57
|
+
|
58
|
+
#
|
59
|
+
# It's a compressed file AND an archive!
|
60
|
+
#
|
61
|
+
|
62
|
+
def extname
|
63
|
+
case path
|
64
|
+
when /\.tar\.bz2$/ then '.tar.bz2'
|
65
|
+
when /\.tbz2$/ then '.tbz2'
|
66
|
+
else File.extname(path)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Targz
|
4
|
+
|
5
|
+
#
|
6
|
+
# It's a compressed file
|
7
|
+
#
|
8
|
+
|
9
|
+
include IMW::CompressedFiles::Base
|
10
|
+
|
11
|
+
def compression_settings
|
12
|
+
@compression_settings ||= {
|
13
|
+
:program => :gzip,
|
14
|
+
:decompression_program => :gunzip,
|
15
|
+
:decompress => '',
|
16
|
+
:extension => 'gz'
|
17
|
+
}
|
18
|
+
end
|
19
|
+
|
20
|
+
#
|
21
|
+
# But it's also an archive
|
22
|
+
#
|
23
|
+
|
24
|
+
include IMW::Archives::Base
|
25
|
+
|
26
|
+
def archive_settings
|
27
|
+
@archive_settings ||= {
|
28
|
+
:program => :tar,
|
29
|
+
:list => "-tzf",
|
30
|
+
:create => '-cf',
|
31
|
+
:extract => "-xzf"
|
32
|
+
}
|
33
|
+
end
|
34
|
+
|
35
|
+
# Overrides default behavior of IMW::Files::Archive#create to
|
36
|
+
# compress files after creating them.
|
37
|
+
def create *input_paths
|
38
|
+
IMW.system(archive_settings[:program], archive_settings[:create].split, path_between_archive_and_compression, *input_paths.flatten)
|
39
|
+
tar = IMW.open(path_between_archive_and_compression)
|
40
|
+
tar.compression_settings = compression_settings
|
41
|
+
tar.compress!
|
42
|
+
end
|
43
|
+
|
44
|
+
def decompressed_basename
|
45
|
+
case extname
|
46
|
+
when '.tar.gz' then basename[0..-4] # .tar.gz => .tar
|
47
|
+
when '.tgz' then basename.gsub(/tgz$/, 'tar') # .tgz => .tar
|
48
|
+
else basename[0..-(extname.size + 1)]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
protected
|
53
|
+
def path_between_archive_and_compression
|
54
|
+
File.join(dirname,name + '.tar')
|
55
|
+
end
|
56
|
+
public
|
57
|
+
|
58
|
+
#
|
59
|
+
# It's both an archive and a compressed file!
|
60
|
+
#
|
61
|
+
|
62
|
+
def extname
|
63
|
+
case path
|
64
|
+
when /\.tar\.gz$/ then '.tar.gz'
|
65
|
+
when /\.tgz$/ then '.tgz'
|
66
|
+
else File.extname(path)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Zip
|
4
|
+
|
5
|
+
include IMW::Archives::Base
|
6
|
+
|
7
|
+
def archive_settings
|
8
|
+
@archive_settings ||= {
|
9
|
+
:program => :zip,
|
10
|
+
:create => "-qqr",
|
11
|
+
:append => "-qqg",
|
12
|
+
:list => "-l",
|
13
|
+
:extract => "-qqo",
|
14
|
+
:unarchiving_program => :unzip
|
15
|
+
}
|
16
|
+
end
|
17
|
+
|
18
|
+
protected
|
19
|
+
|
20
|
+
# The `unzip' program outputs data in a very annoying format:
|
21
|
+
#
|
22
|
+
# Archive: data.zip
|
23
|
+
# Length Date Time Name
|
24
|
+
# -------- ---- ---- ----
|
25
|
+
# 18510 07-28-08 15:58 data/4d7Qrgz7.csv
|
26
|
+
# 3418 07-28-08 15:41 data/7S.csv
|
27
|
+
# 23353 07-28-08 15:41 data/g.csv
|
28
|
+
# 711 07-28-08 15:58 data/g.xml
|
29
|
+
# 1095 07-28-08 15:41 data/L.xml
|
30
|
+
# 2399 07-28-08 15:58 data/mTAu9H3.xml
|
31
|
+
# 152 07-28-08 15:58 data/vaHBS2t5R.dat
|
32
|
+
# -------- -------
|
33
|
+
# 49638 7 files
|
34
|
+
#
|
35
|
+
# which is parsed by this method.
|
36
|
+
def archive_contents_string_to_array string
|
37
|
+
rows = string.split("\n")
|
38
|
+
# ignore the first 3 lines of the output and also discard the
|
39
|
+
# last 2 (5 = 2 + 3)
|
40
|
+
file_rows = rows[3,(rows.length - 5)]
|
41
|
+
file_rows.map do |row|
|
42
|
+
if row
|
43
|
+
columns = row.lstrip.rstrip.split(' ')
|
44
|
+
# grab the filename in the fourth column
|
45
|
+
columns[3..-1].join(' ')
|
46
|
+
end
|
47
|
+
end.compact
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -1,10 +1,19 @@
|
|
1
1
|
module IMW
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
2
|
+
|
3
|
+
# Contains modules which define the behavior of compressed files.
|
4
|
+
module CompressedFiles
|
5
|
+
autoload :Bz2, 'imw/compressed_files/bz2'
|
6
|
+
autoload :Gz, 'imw/compressed_files/gz'
|
7
|
+
autoload :Compressible, 'imw/compressed_files/compressible'
|
8
|
+
|
9
|
+
# Handlers which include modules for compressed file formats as
|
10
|
+
# well as the IMW::CompressedFiles::Compressible module for
|
11
|
+
# compressing regular files.
|
12
|
+
HANDLERS = [
|
13
|
+
["CompressedFiles::Compressible", Proc.new { |r| r.is_local? && r.is_file? && r.path != /\.(bz2|gz|tgz|tbz2)$/ } ],
|
14
|
+
["CompressedFiles::Gz", Proc.new { |r| r.is_local? && r.path =~ /\.gz$/ && r.path !~ /\.tar\.gz$/ && r.path !~ /\.tgz$/ } ],
|
15
|
+
["CompressedFiles::Bz2", Proc.new { |r| r.is_local? && r.path =~ /\.bz2$/ && r.path !~ /\.tar\.bz2$/ && r.path !~ /\.tbz2$/ } ]
|
16
|
+
]
|
8
17
|
|
9
18
|
# Defines methods for decompressing a compressed file. This
|
10
19
|
# module isn't used to directly extend an IMW::Resource --
|
@@ -12,7 +21,7 @@ module IMW
|
|
12
21
|
# IMW::Resources::CompressedFiles::Bz2) include this module and
|
13
22
|
# further define the command-line flags &c. needed to make
|
14
23
|
# everything work.
|
15
|
-
module
|
24
|
+
module Base
|
16
25
|
|
17
26
|
attr_accessor :compression_settings
|
18
27
|
|
@@ -80,10 +89,6 @@ module IMW
|
|
80
89
|
copy.mv(path) if copy && copy.exist?
|
81
90
|
end
|
82
91
|
end
|
83
|
-
|
84
92
|
end
|
85
93
|
end
|
86
94
|
end
|
87
|
-
|
88
|
-
|
89
|
-
|