imw 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +34 -14
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/imw.rb +9 -6
- data/lib/imw/{resources/archive.rb → archives.rb} +20 -10
- data/lib/imw/archives/rar.rb +19 -0
- data/lib/imw/archives/tar.rb +19 -0
- data/lib/imw/archives/tarbz2.rb +73 -0
- data/lib/imw/archives/targz.rb +73 -0
- data/lib/imw/archives/zip.rb +51 -0
- data/lib/imw/{resources/compressed_file.rb → compressed_files.rb} +16 -11
- data/lib/imw/compressed_files/bz2.rb +16 -0
- data/lib/imw/{resources → compressed_files}/compressible.rb +2 -4
- data/lib/imw/compressed_files/gz.rb +16 -0
- data/lib/imw/formats.rb +31 -0
- data/lib/imw/formats/delimited.rb +90 -0
- data/lib/imw/formats/excel.rb +125 -0
- data/lib/imw/formats/json.rb +51 -0
- data/lib/imw/formats/sgml.rb +69 -0
- data/lib/imw/formats/yaml.rb +51 -0
- data/lib/imw/resource.rb +108 -10
- data/lib/imw/schemes.rb +21 -0
- data/lib/imw/schemes/hdfs.rb +240 -0
- data/lib/imw/schemes/http.rb +166 -0
- data/lib/imw/schemes/local.rb +219 -0
- data/lib/imw/schemes/remote.rb +114 -0
- data/lib/imw/schemes/s3.rb +135 -0
- data/lib/imw/tools.rb +8 -0
- data/lib/imw/{transforms → tools}/archiver.rb +1 -1
- data/lib/imw/{transforms → tools}/transferer.rb +10 -10
- data/spec/imw/{resources/archive_spec.rb → archive_spec.rb} +3 -3
- data/spec/imw/{resources/archives_and_compressed → archives}/rar_spec.rb +2 -2
- data/spec/imw/{resources/archives_and_compressed → archives}/tar_spec.rb +2 -2
- data/spec/imw/{resources/archives_and_compressed → archives}/tarbz2_spec.rb +4 -4
- data/spec/imw/{resources/archives_and_compressed → archives}/targz_spec.rb +4 -4
- data/spec/imw/{resources/archives_and_compressed → archives}/zip_spec.rb +2 -2
- data/spec/imw/compressed_files/bz2_spec.rb +15 -0
- data/spec/imw/{resources → compressed_files}/compressible_spec.rb +1 -1
- data/spec/imw/compressed_files/gz_spec.rb +15 -0
- data/spec/imw/{resources/compressed_file_spec.rb → compressed_files_spec.rb} +3 -3
- data/spec/imw/{resources/formats → formats}/delimited_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/json_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/sgml_spec.rb +2 -2
- data/spec/imw/{resources/formats → formats}/yaml_spec.rb +2 -2
- data/spec/imw/resource_spec.rb +4 -4
- data/spec/imw/{resources/schemes → schemes}/hdfs_spec.rb +7 -7
- data/spec/imw/{resources/schemes → schemes}/http_spec.rb +2 -2
- data/spec/imw/{resources → schemes}/local_spec.rb +5 -5
- data/spec/imw/{resources → schemes}/remote_spec.rb +7 -3
- data/spec/imw/{resources/schemes → schemes}/s3_spec.rb +2 -2
- data/spec/imw/{transforms → tools}/archiver_spec.rb +2 -2
- data/spec/imw/tools/transferer_spec.rb +113 -0
- metadata +69 -71
- data/lib/imw/resources.rb +0 -118
- data/lib/imw/resources/archives_and_compressed.rb +0 -32
- data/lib/imw/resources/archives_and_compressed/bz2.rb +0 -18
- data/lib/imw/resources/archives_and_compressed/gz.rb +0 -18
- data/lib/imw/resources/archives_and_compressed/rar.rb +0 -23
- data/lib/imw/resources/archives_and_compressed/tar.rb +0 -23
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +0 -78
- data/lib/imw/resources/archives_and_compressed/targz.rb +0 -78
- data/lib/imw/resources/archives_and_compressed/zip.rb +0 -57
- data/lib/imw/resources/formats.rb +0 -32
- data/lib/imw/resources/formats/delimited.rb +0 -92
- data/lib/imw/resources/formats/excel.rb +0 -125
- data/lib/imw/resources/formats/json.rb +0 -53
- data/lib/imw/resources/formats/sgml.rb +0 -72
- data/lib/imw/resources/formats/yaml.rb +0 -53
- data/lib/imw/resources/local.rb +0 -198
- data/lib/imw/resources/remote.rb +0 -110
- data/lib/imw/resources/schemes.rb +0 -19
- data/lib/imw/resources/schemes/hdfs.rb +0 -242
- data/lib/imw/resources/schemes/http.rb +0 -161
- data/lib/imw/resources/schemes/s3.rb +0 -137
- data/lib/imw/transforms.rb +0 -8
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +0 -15
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +0 -15
- data/spec/imw/transforms/transferer_spec.rb +0 -113
data/README.rdoc
CHANGED
@@ -25,14 +25,13 @@ data. It has the following goals:
|
|
25
25
|
The Infinite Monkeywrench is a powerful tool but it is not always the
|
26
26
|
right one to use. IMW is **not** designed for
|
27
27
|
|
28
|
-
* Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan]
|
28
|
+
* Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan] and Monkeyshines[http://github.com/infochimps/monkeyshines])
|
29
29
|
|
30
30
|
* Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
|
31
31
|
|
32
|
-
* Data mining
|
33
|
-
|
34
|
-
* Data visualization
|
32
|
+
* Data mining or statistical analysis
|
35
33
|
|
34
|
+
* Visualization
|
36
35
|
|
37
36
|
= Setup
|
38
37
|
|
@@ -47,21 +46,42 @@ and then install IMW
|
|
47
46
|
|
48
47
|
$ sudo gem install imw
|
49
48
|
|
50
|
-
|
49
|
+
In all the examples that follow it is assumed that you've installed
|
50
|
+
IMW and required it in a script via
|
51
51
|
|
52
|
-
|
53
|
-
|
52
|
+
require 'rubygems'
|
53
|
+
require 'imw'
|
54
54
|
|
55
|
-
|
56
|
-
and methods. The following sections provide a tour of these.
|
55
|
+
= Resources
|
57
56
|
|
58
|
-
|
59
|
-
|
57
|
+
IMW is centered around processing resources. A resource can be
|
58
|
+
_anything_ with a URI and you create one using IMW.open.
|
60
59
|
|
61
|
-
|
62
|
-
|
60
|
+
csv = IMW.open('/path/to/my_data.csv')
|
61
|
+
html = IMW.open('http://www.infochimps.com')
|
62
|
+
tar_bz2 = IMW.open(
|
63
|
+
|
64
|
+
IMW dynamically extends a resource with modules appropriate to it when
|
65
|
+
you open it. In the above case, +csv+ would be automatically extended
|
66
|
+
by the IMW::Resources::Formats::Csv module, among others:
|
67
|
+
|
68
|
+
csv.resource_modules
|
69
|
+
=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Formats::Csv]
|
70
|
+
|
71
|
+
while +html+ will use a different set
|
72
|
+
|
73
|
+
html.resource_modules
|
74
|
+
=> [IMW::Resources::LocalObj, IMW::Resources::LocalFile, IMW::Resources::Compressible, IMW::Resources::Formats::Csv]
|
75
|
+
|
76
|
+
|
77
|
+
Consult the documentation for the modules a resource uses to learn
|
78
|
+
what it can do.
|
79
|
+
|
80
|
+
Since resources are built around the idea of URIs, you can explicitly i
|
81
|
+
|
82
|
+
== Manipulating Paths
|
63
83
|
|
64
|
-
|
84
|
+
You can p
|
65
85
|
|
66
86
|
IMW holds a registry of paths that you can define on the fly or store
|
67
87
|
in a configuration file.
|
data/Rakefile
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.1
|
data/lib/imw.rb
CHANGED
@@ -26,12 +26,15 @@ require 'imw/utils'
|
|
26
26
|
# Repositories are collections of datasets and it is on these
|
27
27
|
# collections that the +imw+ command line tool operates.
|
28
28
|
module IMW
|
29
|
-
autoload :Resource,
|
30
|
-
autoload :
|
31
|
-
autoload :
|
32
|
-
autoload :
|
33
|
-
autoload :
|
34
|
-
autoload :
|
29
|
+
autoload :Resource, 'imw/resource'
|
30
|
+
autoload :Schemes, 'imw/schemes'
|
31
|
+
autoload :Archives, 'imw/archives'
|
32
|
+
autoload :CompressedFiles, 'imw/compressed_files'
|
33
|
+
autoload :Formats, 'imw/formats'
|
34
|
+
autoload :Tools, 'imw/tools'
|
35
|
+
autoload :Parsers, 'imw/parsers'
|
36
|
+
autoload :Dataset, 'imw/dataset'
|
37
|
+
autoload :Repository, 'imw/repository'
|
35
38
|
|
36
39
|
# Open a resource at the given +uri+. The resource will
|
37
40
|
# automatically be extended by modules which make sense given the
|
@@ -1,13 +1,24 @@
|
|
1
1
|
module IMW
|
2
|
-
module Resources
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
# Contains modules which define the behavior of archive files.
|
4
|
+
module Archives
|
5
|
+
|
6
|
+
# Handlers for archives.
|
7
|
+
HANDLERS = [
|
8
|
+
["Archives::Tarbz2", Proc.new { |r| r.is_local? && r.path =~ /\.tar\.bz2$/ } ],
|
9
|
+
["Archives::Tarbz2", Proc.new { |r| r.is_local? && r.path =~ /\.tbz2$/ } ],
|
10
|
+
["Archives::Targz", Proc.new { |r| r.is_local? && r.path =~ /\.tar\.gz$/ } ],
|
11
|
+
["Archives::Targz", Proc.new { |r| r.is_local? && r.path =~ /\.tgz$/ } ],
|
12
|
+
["Archives::Tar", Proc.new { |r| r.is_local? && r.path =~ /\.tar$/ } ],
|
13
|
+
["Archives::Rar", Proc.new { |r| r.is_local? && r.path =~ /\.rar$/ } ],
|
14
|
+
["Archives::Zip", Proc.new { |r| r.is_local? && r.path =~ /\.zip$/ } ]
|
15
|
+
]
|
16
|
+
|
17
|
+
autoload :Rar, 'imw/archives/rar'
|
18
|
+
autoload :Tar, 'imw/archives/tar'
|
19
|
+
autoload :Tarbz2, 'imw/archives/tarbz2'
|
20
|
+
autoload :Targz, 'imw/archives/targz'
|
21
|
+
autoload :Zip, 'imw/archives/zip'
|
11
22
|
|
12
23
|
# Defines methods for creating, appending to, extracting, and
|
13
24
|
# listing an archive file. This module isn't used to directly
|
@@ -15,7 +26,7 @@ module IMW
|
|
15
26
|
# (e.g. - IMW::Archives::Tarbz2) include this module
|
16
27
|
# and define the specific settings (command-line flags, &c.)
|
17
28
|
# required to make things work.
|
18
|
-
module
|
29
|
+
module Base
|
19
30
|
|
20
31
|
attr_accessor :archive_settings
|
21
32
|
|
@@ -94,4 +105,3 @@ module IMW
|
|
94
105
|
end
|
95
106
|
end
|
96
107
|
|
97
|
-
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Rar
|
4
|
+
|
5
|
+
include IMW::Archives::Base
|
6
|
+
|
7
|
+
def archive_settings
|
8
|
+
@archive_settings ||= {
|
9
|
+
:program => :rar,
|
10
|
+
:create => ['a', '-o+', '-inul'],
|
11
|
+
:append => ['a', '-o+', '-inul'],
|
12
|
+
:list => "vb",
|
13
|
+
:extract => ['x', '-o+', '-inul']
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Tar
|
4
|
+
|
5
|
+
include IMW::Archives::Base
|
6
|
+
|
7
|
+
def archive_settings
|
8
|
+
@archive_settings ||= {
|
9
|
+
:create => "-cf",
|
10
|
+
:append => "-rf",
|
11
|
+
:list => "-tf",
|
12
|
+
:extract => "-xf",
|
13
|
+
:program => :tar
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Tarbz2
|
4
|
+
|
5
|
+
#
|
6
|
+
# It's a compressed file
|
7
|
+
#
|
8
|
+
|
9
|
+
include IMW::CompressedFiles::Base
|
10
|
+
|
11
|
+
def compression_settings
|
12
|
+
@compression_settings ||= {
|
13
|
+
:program => :bzip2,
|
14
|
+
:decompression_program => :bunzip2,
|
15
|
+
:decompress => '',
|
16
|
+
:extension => 'bz2'
|
17
|
+
}
|
18
|
+
end
|
19
|
+
|
20
|
+
#
|
21
|
+
# But it's also an archive
|
22
|
+
#
|
23
|
+
|
24
|
+
include IMW::Archives::Base
|
25
|
+
|
26
|
+
def archive_settings
|
27
|
+
@archive_settings ||= {
|
28
|
+
:program => :tar,
|
29
|
+
:create => '-cf',
|
30
|
+
:list => "-tjf",
|
31
|
+
:extract => "-xjf"
|
32
|
+
}
|
33
|
+
end
|
34
|
+
|
35
|
+
# Overrides default behavior of IMW::Archives::Base#create to
|
36
|
+
# compress files after creating them.
|
37
|
+
def create *input_paths
|
38
|
+
IMW.system(archive_settings[:program], archive_settings[:create], path_between_archive_and_compression, *input_paths.flatten)
|
39
|
+
IMW.open(path_between_archive_and_compression).compress!
|
40
|
+
end
|
41
|
+
|
42
|
+
def decompressed_basename
|
43
|
+
case extname
|
44
|
+
when '.tar.bz2' then basename[0..-5] # .tar.bz2 => .tar
|
45
|
+
when '.tbz2' then basename.gsub(/tbz2$/, 'tar') # .tbz2 => .tar
|
46
|
+
else basename[0..-(extname.size + 1)]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
|
51
|
+
protected
|
52
|
+
def path_between_archive_and_compression
|
53
|
+
File.join(dirname,name + '.tar')
|
54
|
+
end
|
55
|
+
|
56
|
+
public
|
57
|
+
|
58
|
+
#
|
59
|
+
# It's a compressed file AND an archive!
|
60
|
+
#
|
61
|
+
|
62
|
+
def extname
|
63
|
+
case path
|
64
|
+
when /\.tar\.bz2$/ then '.tar.bz2'
|
65
|
+
when /\.tbz2$/ then '.tbz2'
|
66
|
+
else File.extname(path)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Targz
|
4
|
+
|
5
|
+
#
|
6
|
+
# It's a compressed file
|
7
|
+
#
|
8
|
+
|
9
|
+
include IMW::CompressedFiles::Base
|
10
|
+
|
11
|
+
def compression_settings
|
12
|
+
@compression_settings ||= {
|
13
|
+
:program => :gzip,
|
14
|
+
:decompression_program => :gunzip,
|
15
|
+
:decompress => '',
|
16
|
+
:extension => 'gz'
|
17
|
+
}
|
18
|
+
end
|
19
|
+
|
20
|
+
#
|
21
|
+
# But it's also an archive
|
22
|
+
#
|
23
|
+
|
24
|
+
include IMW::Archives::Base
|
25
|
+
|
26
|
+
def archive_settings
|
27
|
+
@archive_settings ||= {
|
28
|
+
:program => :tar,
|
29
|
+
:list => "-tzf",
|
30
|
+
:create => '-cf',
|
31
|
+
:extract => "-xzf"
|
32
|
+
}
|
33
|
+
end
|
34
|
+
|
35
|
+
# Overrides default behavior of IMW::Archives::Base#create to
|
36
|
+
# compress files after creating them.
|
37
|
+
def create *input_paths
|
38
|
+
IMW.system(archive_settings[:program], archive_settings[:create].split, path_between_archive_and_compression, *input_paths.flatten)
|
39
|
+
tar = IMW.open(path_between_archive_and_compression)
|
40
|
+
tar.compression_settings = compression_settings
|
41
|
+
tar.compress!
|
42
|
+
end
|
43
|
+
|
44
|
+
def decompressed_basename
|
45
|
+
case extname
|
46
|
+
when '.tar.gz' then basename[0..-4] # .tar.gz => .tar
|
47
|
+
when '.tgz' then basename.gsub(/tgz$/, 'tar') # .tgz => .tar
|
48
|
+
else basename[0..-(extname.size + 1)]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
protected
|
53
|
+
def path_between_archive_and_compression
|
54
|
+
File.join(dirname,name + '.tar')
|
55
|
+
end
|
56
|
+
public
|
57
|
+
|
58
|
+
#
|
59
|
+
# It's both an archive and a compressed file!
|
60
|
+
#
|
61
|
+
|
62
|
+
def extname
|
63
|
+
case path
|
64
|
+
when /\.tar\.gz$/ then '.tar.gz'
|
65
|
+
when /\.tgz$/ then '.tgz'
|
66
|
+
else File.extname(path)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module IMW
|
2
|
+
module Archives
|
3
|
+
module Zip
|
4
|
+
|
5
|
+
include IMW::Archives::Base
|
6
|
+
|
7
|
+
def archive_settings
|
8
|
+
@archive_settings ||= {
|
9
|
+
:program => :zip,
|
10
|
+
:create => "-qqr",
|
11
|
+
:append => "-qqg",
|
12
|
+
:list => "-l",
|
13
|
+
:extract => "-qqo",
|
14
|
+
:unarchiving_program => :unzip
|
15
|
+
}
|
16
|
+
end
|
17
|
+
|
18
|
+
protected
|
19
|
+
|
20
|
+
# The `unzip' program outputs data in a very annoying format:
|
21
|
+
#
|
22
|
+
# Archive: data.zip
|
23
|
+
# Length Date Time Name
|
24
|
+
# -------- ---- ---- ----
|
25
|
+
# 18510 07-28-08 15:58 data/4d7Qrgz7.csv
|
26
|
+
# 3418 07-28-08 15:41 data/7S.csv
|
27
|
+
# 23353 07-28-08 15:41 data/g.csv
|
28
|
+
# 711 07-28-08 15:58 data/g.xml
|
29
|
+
# 1095 07-28-08 15:41 data/L.xml
|
30
|
+
# 2399 07-28-08 15:58 data/mTAu9H3.xml
|
31
|
+
# 152 07-28-08 15:58 data/vaHBS2t5R.dat
|
32
|
+
# -------- -------
|
33
|
+
# 49638 7 files
|
34
|
+
#
|
35
|
+
# which is parsed by this method.
|
36
|
+
def archive_contents_string_to_array string
|
37
|
+
rows = string.split("\n")
|
38
|
+
# ignore the first 3 lines of the output and also discard the
|
39
|
+
# last 2 (5 = 2 + 3)
|
40
|
+
file_rows = rows[3,(rows.length - 5)]
|
41
|
+
file_rows.map do |row|
|
42
|
+
if row
|
43
|
+
columns = row.lstrip.rstrip.split(' ')
|
44
|
+
# grab the filename in the fourth column
|
45
|
+
columns[3..-1].join(' ')
|
46
|
+
end
|
47
|
+
end.compact
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -1,10 +1,19 @@
|
|
1
1
|
module IMW
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
2
|
+
|
3
|
+
# Contains modules which define the behavior of compressed files.
|
4
|
+
module CompressedFiles
|
5
|
+
autoload :Bz2, 'imw/compressed_files/bz2'
|
6
|
+
autoload :Gz, 'imw/compressed_files/gz'
|
7
|
+
autoload :Compressible, 'imw/compressed_files/compressible'
|
8
|
+
|
9
|
+
# Handlers which include modules for compressed file formats as
|
10
|
+
# well as the IMW::CompressedFiles::Compressible module for
|
11
|
+
# compressing regular files.
|
12
|
+
HANDLERS = [
|
13
|
+
["CompressedFiles::Compressible", Proc.new { |r| r.is_local? && r.is_file? && r.path != /\.(bz2|gz|tgz|tbz2)$/ } ],
|
14
|
+
["CompressedFiles::Gz", Proc.new { |r| r.is_local? && r.path =~ /\.gz$/ && r.path !~ /\.tar\.gz$/ && r.path !~ /\.tgz$/ } ],
|
15
|
+
["CompressedFiles::Bz2", Proc.new { |r| r.is_local? && r.path =~ /\.bz2$/ && r.path !~ /\.tar\.bz2$/ && r.path !~ /\.tbz2$/ } ]
|
16
|
+
]
|
8
17
|
|
9
18
|
# Defines methods for decompressing a compressed file. This
|
10
19
|
# module isn't used to directly extend an IMW::Resource --
|
@@ -12,7 +21,7 @@ module IMW
|
|
12
21
|
# IMW::CompressedFiles::Bz2) include this module and
|
13
22
|
# further define the command-line flags &c. needed to make
|
14
23
|
# everything work.
|
15
|
-
module
|
24
|
+
module Base
|
16
25
|
|
17
26
|
attr_accessor :compression_settings
|
18
27
|
|
@@ -80,10 +89,6 @@ module IMW
|
|
80
89
|
copy.mv(path) if copy && copy.exist?
|
81
90
|
end
|
82
91
|
end
|
83
|
-
|
84
92
|
end
|
85
93
|
end
|
86
94
|
end
|
87
|
-
|
88
|
-
|
89
|
-
|