imw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/files/archive.rb -- describes archives of files
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# Module for describing known archive types. An including archive
|
|
7
|
+
# type's class must define an instance variable +archive+ which is a
|
|
8
|
+
# hash with the following required keys:
|
|
9
|
+
#
|
|
10
|
+
# <tt>:program</tt>:: a symbol naming the program to be used. It
|
|
11
|
+
# should match one of the symbols in <tt>IMW::EXTERNAL_PROGRAMS</tt>
|
|
12
|
+
#
|
|
13
|
+
# <tt>:create_flags</tt>:: a string of flags to pass to the archiving
|
|
14
|
+
# program when creating the archive
|
|
15
|
+
#
|
|
16
|
+
# <tt>:append_flags</tt>:: a string of flags to pass to the archiving
|
|
17
|
+
# program when appending files to the archive
|
|
18
|
+
#
|
|
19
|
+
# <tt>:extract_flags</tt>:: a string of flags to pass to the archiving
|
|
20
|
+
# program when extracting the archive
|
|
21
|
+
#
|
|
22
|
+
# <tt>:list_flags</tt>:: a string of flags to pass to the archiving
|
|
23
|
+
# program when listing the archive's contents
|
|
24
|
+
#
|
|
25
|
+
# The +archive+ hash may also contain the entry:
|
|
26
|
+
#
|
|
27
|
+
# <tt>:unarchiving_program</tt>:: a symbol naming the program to be
|
|
28
|
+
# used to list/extract the archive. Useful only if this program
|
|
29
|
+
# differs from the program used to create the archive in the first
|
|
30
|
+
# place (i.e. - zip & unzip).
|
|
31
|
+
#
|
|
32
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
33
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
34
|
+
# License:: GPL 3.0
|
|
35
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
36
|
+
#
|
|
37
|
+
# puts "#{File.basename(__FILE__)}: Put it all in one place so that when something goes wrong you'll know it immediately. You'll regret it, but at least you'll know." # at bottom
|
|
38
|
+
module IMW
|
|
39
|
+
module Files
|
|
40
|
+
|
|
41
|
+
module BasicFile
|
|
42
|
+
|
|
43
|
+
# Is this file an archive?
|
|
44
|
+
def archive?
|
|
45
|
+
false
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
module Archive
|
|
50
|
+
|
|
51
|
+
attr_reader :archive
|
|
52
|
+
|
|
53
|
+
# Is this file an archive?
|
|
54
|
+
def archive?
|
|
55
|
+
true
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
public
|
|
59
|
+
# Create this archive containing the given +paths+, which can be
|
|
60
|
+
# either a string or list of strings to be interpreted as paths
|
|
61
|
+
# to files/directories by the shell.
|
|
62
|
+
#
|
|
63
|
+
# Options:
|
|
64
|
+
# <tt>:force</tt> (false):: overwrite any existing archive at this path.
|
|
65
|
+
def create paths, opts = {}
|
|
66
|
+
opts = opts.reverse_merge({:force => false})
|
|
67
|
+
raise IMW::Error.new("An archive already exists at #{@path}.") if exist? and not opts[:force]
|
|
68
|
+
raise IMW::Error.new("Cannot create an archive of type #{@extname}") unless @archive[:create_flags]
|
|
69
|
+
paths = [paths] if paths.class == String
|
|
70
|
+
IMW.system IMW::EXTERNAL_PROGRAMS[@archive[:program]], @archive[:create_flags], @path, *paths
|
|
71
|
+
self
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Append to this archive the given +paths+, which can be
|
|
75
|
+
# either a string or list of strings to be interpreted as paths
|
|
76
|
+
# to files/directories by the shell.
|
|
77
|
+
def append paths
|
|
78
|
+
raise IMW::Error.new("Cannot append to an archive of type #{@archive[:program]}.") unless @archive[:append_flags]
|
|
79
|
+
paths = [paths] if paths.class == String
|
|
80
|
+
IMW.system IMW::EXTERNAL_PROGRAMS[@archive[:program]], @archive[:append_flags], @path, *paths
|
|
81
|
+
self
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
# Extract the files from this archive to the current directory.
|
|
85
|
+
def extract
|
|
86
|
+
raise IMW::Error.new("Cannot extract, #{@path} does not exist.") unless exist?
|
|
87
|
+
program = (@archive[:unarchiving_program] or @archive[:program])
|
|
88
|
+
IMW.system IMW::EXTERNAL_PROGRAMS[program], @archive[:extract_flags], @path
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
# Return a (sorted) list of contents in this archive.
|
|
92
|
+
def contents
|
|
93
|
+
raise IMW::Error.new("Cannot list contents, #{@path} does not exist.") unless exist?
|
|
94
|
+
program = (@archive[:unarchiving_program] or @archive[:program])
|
|
95
|
+
output = ''
|
|
96
|
+
command = [IMW::EXTERNAL_PROGRAMS[program], @archive[:list_flags], @path].join ' '
|
|
97
|
+
output += `#{command}`
|
|
98
|
+
archive_contents_string_to_array(output)
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Parse and format the output from the archive program's "list"
|
|
102
|
+
# command into an array of filenames.
|
|
103
|
+
#
|
|
104
|
+
# An including class can customize this method to match the
|
|
105
|
+
# output from the archiving program of that class.
|
|
106
|
+
def archive_contents_string_to_array string
|
|
107
|
+
string.split("\n")
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/files/file.rb -- base class for files
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# Defines a base class for classes for specific filetypes to subclass.
|
|
7
|
+
#
|
|
8
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
9
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
10
|
+
# License:: GPL 3.0
|
|
11
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
12
|
+
#
|
|
13
|
+
# puts "#{File.basename(__FILE__)}: At the very bottom of the office building, wedged between a small boulder and a rotting log you see a weathered manilla file folder. The writing on the tab is too faded to make out." # at bottom
|
|
14
|
+
module IMW
|
|
15
|
+
module Files
|
|
16
|
+
module BasicFile
|
|
17
|
+
|
|
18
|
+
attr_reader :uri, :host, :path, :dirname, :basename, :extname, :name
|
|
19
|
+
|
|
20
|
+
protected
|
|
21
|
+
|
|
22
|
+
# Set this file's URI and derive +host+, +path+, +dirname+,
# +basename+, +extname+ and +name+ from it.
#
# +uri+ may be a String, which is parsed with <tt>URI.parse</tt>, or
# an already-parsed URI object, which is stored as given.  (A
# non-String argument used to leave <tt>@uri</tt> unset, so the
# following call to <tt>uri.host</tt> failed on +nil+.)
#
# Local paths are expanded to absolute paths; remote paths are kept
# exactly as they appear in the URI.
def uri=(uri)
  @uri      = uri.is_a?(String) ? URI.parse(uri) : uri
  @host     = self.uri.host
  @path     = local? ? ::File.expand_path(self.uri.path) : self.uri.path
  @dirname  = ::File.dirname(path)
  @basename = ::File.basename(path)
  @extname  = find_extname
  # +name+ is the basename with the (possibly multi-part) extension removed.
  @name     = @basename[0, @basename.length - @extname.length]
end
|
|
31
|
+
|
|
32
|
+
# Some files (like <tt>.tar.gz</tt>) have an "extra" extension.
|
|
33
|
+
# Classes in the <tt>IMW::Files</tt> module should define a
|
|
34
|
+
# class method <tt>extname</tt> which returns their full
|
|
35
|
+
# extension.
|
|
36
|
+
def find_extname
|
|
37
|
+
self.class.respond_to?(:extname) ? self.class.extname(path) : ::File.extname(path)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
public
|
|
41
|
+
|
|
42
|
+
# Is this file on the local machine (the host of the file's URI is nil or 'file')?
|
|
43
|
+
# Is this file on the local machine?
#
# A file is considered local when its URI has no host, when the host
# is the literal string 'file' (the historical check, kept so existing
# callers see no change), or when the URI uses the +file+ scheme with
# an empty host.  The last case matters because URI.parse may report
# an empty-string host rather than +nil+ for <tt>file:///path</tt>
# URIs, which previously made such files look remote.
def local?
  return true if host.nil? || host == 'file'

  location = uri
  # Guard with respond_to? so a missing or unusual uri object simply
  # counts as "not local" instead of raising.
  location.respond_to?(:scheme) && location.scheme == 'file' && host.to_s.empty?
end
|
|
46
|
+
|
|
47
|
+
# Is this file on a remote machine?
|
|
48
|
+
def remote?
|
|
49
|
+
(! local?)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Steal a bunch of class methods from File which only take a
|
|
53
|
+
# path as a first argument.
|
|
54
|
+
[:executable?, :executable_real?, :file?, :directory?, :ftype, :owned?, :pipe?, :readable?, :readable_real?, :setgid?, :setuid?, :size, :size?, :socket?, :split, :stat, :sticky?, :writable?, :writable_real?, :zero?].each do |class_method|
|
|
55
|
+
define_method class_method do
|
|
56
|
+
File.send(class_method, path) if local?
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Is there a real file at the path of this File? Will attempt
|
|
61
|
+
# to open files online too to check.
|
|
62
|
+
def exist?
|
|
63
|
+
if local?
|
|
64
|
+
::File.exist?(path) ? true : false
|
|
65
|
+
else
|
|
66
|
+
begin
|
|
67
|
+
true if open(uri)
|
|
68
|
+
rescue SocketError
|
|
69
|
+
false
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
alias_method :exists?, :exist?
|
|
74
|
+
|
|
75
|
+
# Delete this file.
|
|
76
|
+
def rm
|
|
77
|
+
raise IMW::PathError.new("cannot delete remote file #{uri}") unless local?
|
|
78
|
+
raise IMW::PathError.new("cannot delete #{uri}, doesn't exist!") unless exist?
|
|
79
|
+
FileUtils.rm path
|
|
80
|
+
end
|
|
81
|
+
alias_method :rm!, :rm
|
|
82
|
+
|
|
83
|
+
# Copy this file to +new_path+.
|
|
84
|
+
def cp new_path
|
|
85
|
+
raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
|
|
86
|
+
if local?
|
|
87
|
+
FileUtils.cp path, new_path
|
|
88
|
+
else
|
|
89
|
+
# FIXME better way to do this?
|
|
90
|
+
File.open(new_path,'w') { |f| f.write(open(uri).read) }
|
|
91
|
+
end
|
|
92
|
+
self.class.new(new_path)
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
# Copy this file to +dir+.
|
|
96
|
+
def cp_to_dir dir
|
|
97
|
+
cp File.join(File.expand_path(dir),basename)
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
# Move this file to +new_path+.
|
|
101
|
+
def mv new_path
|
|
102
|
+
raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
|
|
103
|
+
if local?
|
|
104
|
+
FileUtils.mv path, new_path
|
|
105
|
+
else
|
|
106
|
+
# FIXME better way to do this?
|
|
107
|
+
File.open(new_path,'w') { |f| f.write(open(uri).read) }
|
|
108
|
+
end
|
|
109
|
+
self.class.new(new_path)
|
|
110
|
+
end
|
|
111
|
+
alias_method :mv!, :mv
|
|
112
|
+
|
|
113
|
+
# Move this file to +dir+.
|
|
114
|
+
def mv_to_dir dir
|
|
115
|
+
mv File.join(File.expand_path(dir),basename)
|
|
116
|
+
end
|
|
117
|
+
alias_method :mv_to_dir!, :mv_to_dir
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/files/binary.rb -- binary files
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# Class for handling binary data.
|
|
7
|
+
#
|
|
8
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
9
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
10
|
+
# License:: GPL 3.0
|
|
11
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
12
|
+
#
|
|
13
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
14
|
+
module IMW
|
|
15
|
+
module Files
|
|
16
|
+
|
|
17
|
+
class Binary
|
|
18
|
+
|
|
19
|
+
include IMW::Files::BasicFile
|
|
20
|
+
include IMW::Files::Compressible
|
|
21
|
+
|
|
22
|
+
def initialize uri
|
|
23
|
+
self.uri= uri
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/files/compressed_file.rb -- class describing compressed files
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# Compression of files is handled via the
|
|
7
|
+
# <tt>IMW::Files::Compressible</tt> module which can be included by
|
|
8
|
+
# any object that has a <tt>@path</tt> attribute. The methods defined
|
|
9
|
+
# there compress files and return this
|
|
10
|
+
# <tt>IMW::Files::CompressedFile</tt> object which has methods for
|
|
11
|
+
# decompression.
|
|
12
|
+
#
|
|
13
|
+
# A subclass of this class must define a +compression+ instance
|
|
14
|
+
# variable which is a hash with the following keys:
|
|
15
|
+
#
|
|
16
|
+
# <tt>:program</tt>:: a symbol naming the program used for
|
|
17
|
+
# compression/decompression which must be one of the symbols in
|
|
18
|
+
# <tt>IMW::EXTERNAL_PROGRAMS</tt>
|
|
19
|
+
#
|
|
20
|
+
# <tt>:decompression_flags</tt>:: a string of flags to pass to the
|
|
21
|
+
# compression program when decompressing the file.
|
|
22
|
+
#
|
|
23
|
+
# A subclass must also define the method +decompressed_path+ which
|
|
24
|
+
# returns the path of the file post-decompression.
|
|
25
|
+
#
|
|
26
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
27
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
28
|
+
# License:: GPL 3.0
|
|
29
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
30
|
+
#
|
|
31
|
+
# puts "#{File.basename(__FILE__)}: Have you ever folded up the wrapper of a soda straw into a little accordion shape and let a drop of water soak into it?" # at bottom
|
|
32
|
+
module IMW
|
|
33
|
+
module Files
|
|
34
|
+
|
|
35
|
+
module BasicFile
|
|
36
|
+
def compressed?
|
|
37
|
+
false
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# A module which provides methods for decompressing a compressed
|
|
42
|
+
# file. An including class should define an instance variable
|
|
43
|
+
# <tt>@compression</tt> with two keys:
|
|
44
|
+
#
|
|
45
|
+
# <tt>:program</tt>:: a symbol from <tt>IMW::EXTERNAL_PROGRAMS</tt>
|
|
46
|
+
# <tt>:decompression_flags</tt>:: a string specifying flags to pass to the decompression program
|
|
47
|
+
module CompressedFile
|
|
48
|
+
|
|
49
|
+
attr_reader :compression
|
|
50
|
+
|
|
51
|
+
# Is this file compressed?
|
|
52
|
+
def compressed?
|
|
53
|
+
true
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# Construct the command passed to the shell to decompress this
|
|
57
|
+
# file.
|
|
58
|
+
def decompression_command
|
|
59
|
+
[IMW::EXTERNAL_PROGRAMS[@compression[:program]],@compression[:decompression_flags],@path].join ' '
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
public
|
|
63
|
+
# Decompress this file in its present directory overwriting any
|
|
64
|
+
# existing files and without saving the original compressed
|
|
65
|
+
# file.
|
|
66
|
+
def decompress!
|
|
67
|
+
raise IMW::PathError.new("cannot decompress #{@path}, doesn't exist!") unless exist?
|
|
68
|
+
FileUtils.cd(@dirname) { IMW.system decompression_command }
|
|
69
|
+
IMW.open(decompressed_path)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
# Decompress this file in its present directory, overwriting any
|
|
73
|
+
# existing files while keeping the original compressed file.
|
|
74
|
+
#
|
|
75
|
+
# The implementation is a little stupid, as the file is
|
|
76
|
+
# needlessly copied.
|
|
77
|
+
def decompress
|
|
78
|
+
raise IMW::PathError.new("cannot decompress #{@path}, doesn't exist!") unless exist?
|
|
79
|
+
begin
|
|
80
|
+
FileUtils.cp(@path,@path + 'copy')
|
|
81
|
+
decompress!
|
|
82
|
+
ensure
|
|
83
|
+
FileUtils.mv(@path + 'copy',@path)
|
|
84
|
+
end
|
|
85
|
+
IMW.open(decompressed_path)
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
#
|
|
2
|
+
# h2. lib/imw/files/compressed_files_and_archives.rb -- require farm
|
|
3
|
+
#
|
|
4
|
+
# == About
|
|
5
|
+
#
|
|
6
|
+
# Just required all the archive and compressed formats (+tar+, +bz2+,
|
|
7
|
+
# &c.)
|
|
8
|
+
#
|
|
9
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
11
|
+
# License:: GPL 3.0
|
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
|
13
|
+
#
|
|
14
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
15
|
+
module IMW
|
|
16
|
+
module Files
|
|
17
|
+
|
|
18
|
+
# A class to wrap a +tar+ archive.
|
|
19
|
+
#
|
|
20
|
+
# Creation, appending, listing, and extraction flags are stored in
|
|
21
|
+
# <tt>IMW::Files::Tar::DEFAULT_FLAGS</tt> and all are passed to
|
|
22
|
+
# the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
|
|
23
|
+
class Tar
|
|
24
|
+
|
|
25
|
+
include IMW::Files::BasicFile
|
|
26
|
+
include IMW::Files::Archive
|
|
27
|
+
include IMW::Files::Compressible
|
|
28
|
+
|
|
29
|
+
# The default flags used creating, appending to, listing, and
|
|
30
|
+
# extracting a tar archive.
|
|
31
|
+
DEFAULT_FLAGS = {
|
|
32
|
+
:create => "-cf",
|
|
33
|
+
:append => "-rf",
|
|
34
|
+
:list => "-tf",
|
|
35
|
+
:extract => "-xf",
|
|
36
|
+
:program => :tar
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
def initialize uri, *args
|
|
40
|
+
self.uri= uri
|
|
41
|
+
@archive = {
|
|
42
|
+
:program => DEFAULT_FLAGS[:program],
|
|
43
|
+
:create_flags => DEFAULT_FLAGS[:create],
|
|
44
|
+
:append_flags => DEFAULT_FLAGS[:append],
|
|
45
|
+
:list_flags => DEFAULT_FLAGS[:list],
|
|
46
|
+
:extract_flags => DEFAULT_FLAGS[:extract]
|
|
47
|
+
}
|
|
48
|
+
end
|
|
49
|
+
end # Tar
|
|
50
|
+
|
|
51
|
+
# A class to wrap a <tt>tar.gz</tt> archive.
|
|
52
|
+
#
|
|
53
|
+
# Creation, appending, listing, and extraction flags are stored in
|
|
54
|
+
# <tt>IMW::Files::TarGz::DEFAULT_FLAGS</tt> and all are passed to
|
|
55
|
+
# the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
|
|
56
|
+
class TarGz
|
|
57
|
+
|
|
58
|
+
include IMW::Files::BasicFile
|
|
59
|
+
include IMW::Files::Archive
|
|
60
|
+
include IMW::Files::CompressedFile
|
|
61
|
+
|
|
62
|
+
# The default flags used creating, appending to, listing, and
|
|
63
|
+
# extracting a <tt>tar.gz</tt> archive.
|
|
64
|
+
DEFAULT_FLAGS = {
|
|
65
|
+
:decompression_program => :gzip,
|
|
66
|
+
:decompression_flags => '-fd',
|
|
67
|
+
:archive_program => :tar,
|
|
68
|
+
:archive_list_flags => "-tf",
|
|
69
|
+
:archive_extract_flags => "-xzf"
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
def initialize uri, *args
|
|
73
|
+
self.uri= uri
|
|
74
|
+
@compression = {
|
|
75
|
+
:program => DEFAULT_FLAGS[:decompression_program],
|
|
76
|
+
:decompression_flags => DEFAULT_FLAGS[:decompression_flags]
|
|
77
|
+
}
|
|
78
|
+
@archive = {
|
|
79
|
+
:program => DEFAULT_FLAGS[:archive_program],
|
|
80
|
+
:list_flags => DEFAULT_FLAGS[:archive_list_flags],
|
|
81
|
+
:extract_flags => DEFAULT_FLAGS[:archive_extract_flags]
|
|
82
|
+
}
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
# Returns the path of the file after decompression.
|
|
86
|
+
def decompressed_path
|
|
87
|
+
if /\.tar\.gz$/.match @path then
|
|
88
|
+
@path.gsub /\.tar\.gz$/, ".tar"
|
|
89
|
+
elsif /\.tgz$/.match @path then
|
|
90
|
+
@path.gsub /\.tgz$/, ".tar"
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# Return the full extension of +path+ for a gzipped tar archive:
# ".tar.gz" or ".tgz".
#
# For any other path this falls back to <tt>::File.extname</tt>
# instead of returning +nil+, because callers subtract the length of
# the returned extension from the basename and would fail on +nil+.
def self.extname(path)
  if /\.tar\.gz$/.match(path)
    ".tar.gz"
  elsif /\.tgz$/.match(path)
    ".tgz"
  else
    ::File.extname(path)
  end
end
|
|
101
|
+
|
|
102
|
+
end # TarGz
|
|
103
|
+
|
|
104
|
+
# A class to wrap a <tt>tar.bz2</tt> archive.
|
|
105
|
+
#
|
|
106
|
+
# Creation, appending, listing, and extraction flags are stored in
|
|
107
|
+
# <tt>IMW::Files::TarBz2::DEFAULT_FLAGS</tt> and all are passed to
|
|
108
|
+
# the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
|
|
109
|
+
class TarBz2
|
|
110
|
+
|
|
111
|
+
include IMW::Files::BasicFile
|
|
112
|
+
include IMW::Files::Archive
|
|
113
|
+
include IMW::Files::CompressedFile
|
|
114
|
+
|
|
115
|
+
# The default flags used creating, appending to, listing, and
|
|
116
|
+
# extracting a <tt>tar.bz2</tt> archive.
|
|
117
|
+
DEFAULT_FLAGS = {
|
|
118
|
+
:decompression_program => :bzip2,
|
|
119
|
+
:decompression_flags => '-fd',
|
|
120
|
+
:archive_program => :tar,
|
|
121
|
+
:archive_create_flags => '-cf',
|
|
122
|
+
:archive_list_flags => "-tf",
|
|
123
|
+
:archive_extract_flags => "-xjf"
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
# Return the full extension of +path+ for a bzip2-compressed tar
# archive: ".tar.bz2" or ".tbz2".
#
# For any other path this falls back to <tt>::File.extname</tt>
# instead of returning +nil+, because callers subtract the length of
# the returned extension from the basename and would fail on +nil+.
def self.extname(path)
  if /\.tar\.bz2$/.match(path)
    ".tar.bz2"
  elsif /\.tbz2$/.match(path)
    ".tbz2"
  else
    ::File.extname(path)
  end
end
|
|
133
|
+
|
|
134
|
+
# Wrap the <tt>tar.bz2</tt> archive at +uri+.
#
# Populates <tt>@compression</tt> (program and flags used to
# decompress) and <tt>@archive</tt> (program and flags used to
# create, list and extract) from +DEFAULT_FLAGS+.  Extra positional
# arguments are accepted and ignored.
def initialize(uri, *args)
  self.uri = uri
  @compression = {
    :program             => DEFAULT_FLAGS[:decompression_program],
    # The key in DEFAULT_FLAGS is :decompression_flags; looking up
    # :decompression returned nil and dropped the '-fd' flags.
    :decompression_flags => DEFAULT_FLAGS[:decompression_flags]
  }
  @archive = {
    :program       => DEFAULT_FLAGS[:archive_program],
    :list_flags    => DEFAULT_FLAGS[:archive_list_flags],
    :extract_flags => DEFAULT_FLAGS[:archive_extract_flags],
    :create_flags  => DEFAULT_FLAGS[:archive_create_flags]
  }
end
|
|
147
|
+
|
|
148
|
+
# Returns the path of the file after decompression.
|
|
149
|
+
def decompressed_path
|
|
150
|
+
if /\.tar\.bz2$/.match @path then
|
|
151
|
+
@path.gsub /\.tar\.bz2$/, ".tar"
|
|
152
|
+
elsif /\.tbz2$/.match @path then
|
|
153
|
+
@path.gsub /\.tbz2$/, ".tar"
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
# Overrides default behavior of IMW::Files::Archive#create to
|
|
158
|
+
# compress files after creating them.
|
|
159
|
+
def create paths, opts={}
|
|
160
|
+
opts = opts.reverse_merge({:force => false})
|
|
161
|
+
raise IMW::Error.new("An archive already exists at #{@path}.") if exist? and not opts[:force]
|
|
162
|
+
paths = [paths] if paths.class == String
|
|
163
|
+
IMW.system IMW::EXTERNAL_PROGRAMS[@archive[:program]], @archive[:create_flags], path_between_archive_and_compression, *paths
|
|
164
|
+
IMW.open(path_between_archive_and_compression).compress!(:bzip2)
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
protected
|
|
168
|
+
def path_between_archive_and_compression
|
|
169
|
+
File.join(dirname,name + '.tar')
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
end # TarBz2
|
|
173
|
+
|
|
174
|
+
# A class to wrap a +rar+ archive.
|
|
175
|
+
#
|
|
176
|
+
# Creation, appending, listing, and extraction flags are stored in
|
|
177
|
+
# <tt>IMW::Files::Rar::DEFAULT_FLAGS</tt> and all are passed to
|
|
178
|
+
# the <tt>:rar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
|
|
179
|
+
class Rar
|
|
180
|
+
|
|
181
|
+
include IMW::Files::BasicFile
|
|
182
|
+
include IMW::Files::Archive
|
|
183
|
+
|
|
184
|
+
# The default flags used creating, appending to, listing, and
|
|
185
|
+
# extracting a rar archive.
|
|
186
|
+
DEFAULT_FLAGS = {
|
|
187
|
+
:create => "a -r -o+ -inul",
|
|
188
|
+
:append => "a -r -o+ -inul",
|
|
189
|
+
:list => "vb",
|
|
190
|
+
:extract => "x -o+ -inul"
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
def initialize uri, *args
|
|
194
|
+
self.uri= uri
|
|
195
|
+
@archive = {
|
|
196
|
+
:program => :rar,
|
|
197
|
+
:create_flags => DEFAULT_FLAGS[:create],
|
|
198
|
+
:append_flags => DEFAULT_FLAGS[:append],
|
|
199
|
+
:list_flags => DEFAULT_FLAGS[:list],
|
|
200
|
+
:extract_flags => DEFAULT_FLAGS[:extract]
|
|
201
|
+
}
|
|
202
|
+
end
|
|
203
|
+
end # Rar
|
|
204
|
+
|
|
205
|
+
# A class to wrap a +zip+ archive.
|
|
206
|
+
#
|
|
207
|
+
# Creation, appending, listing, and extraction flags are stored in
|
|
208
|
+
# <tt>IMW::Files::Zip::DEFAULT_FLAGS</tt> and all are passed to
|
|
209
|
+
# the <tt>:zip</tt> and <tt>:unzip</tt> programs in
|
|
210
|
+
# <tt>IMW::EXTERNAL_PROGRAMS</tt>.
|
|
211
|
+
class Zip
|
|
212
|
+
|
|
213
|
+
include IMW::Files::BasicFile
|
|
214
|
+
include IMW::Files::Archive
|
|
215
|
+
|
|
216
|
+
# The default flags used creating, appending to, listing, and
|
|
217
|
+
# extracting a zip archive.
|
|
218
|
+
DEFAULT_FLAGS = {
|
|
219
|
+
:create => "-q -r",
|
|
220
|
+
:append => "-q -g",
|
|
221
|
+
:list => "-l",
|
|
222
|
+
:extract => "-q -o",
|
|
223
|
+
:unarchiving_program => :unzip
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
def initialize uri, *args
|
|
227
|
+
self.uri= uri
|
|
228
|
+
@archive = {
|
|
229
|
+
:program => :zip,
|
|
230
|
+
:create_flags => DEFAULT_FLAGS[:create],
|
|
231
|
+
:append_flags => DEFAULT_FLAGS[:append],
|
|
232
|
+
:list_flags => DEFAULT_FLAGS[:list],
|
|
233
|
+
:extract_flags => DEFAULT_FLAGS[:extract],
|
|
234
|
+
:unarchiving_program => DEFAULT_FLAGS[:unarchiving_program]
|
|
235
|
+
}
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
# The `unzip' program outputs data in a very annoying format:
|
|
239
|
+
#
|
|
240
|
+
# Archive: data.zip
|
|
241
|
+
# Length Date Time Name
|
|
242
|
+
# -------- ---- ---- ----
|
|
243
|
+
# 18510 07-28-08 15:58 data/4d7Qrgz7.csv
|
|
244
|
+
# 3418 07-28-08 15:41 data/7S.csv
|
|
245
|
+
# 23353 07-28-08 15:41 data/g.csv
|
|
246
|
+
# 711 07-28-08 15:58 data/g.xml
|
|
247
|
+
# 1095 07-28-08 15:41 data/L.xml
|
|
248
|
+
# 2399 07-28-08 15:58 data/mTAu9H3.xml
|
|
249
|
+
# 152 07-28-08 15:58 data/vaHBS2t5R.dat
|
|
250
|
+
# -------- -------
|
|
251
|
+
# 49638 7 files
|
|
252
|
+
#
|
|
253
|
+
# which is parsed by this method.
|
|
254
|
+
# Parse the listing printed by +unzip+ into an array of file names.
#
# +string+ is the raw program output: three header lines, one row
# per archived file (length, date, time, name), then two footer
# lines.  Returns the names in listing order; output too short to
# contain any file rows yields an empty array.
def archive_contents_string_to_array(string)
  rows = string.split("\n")
  # Ignore the first 3 lines of the output and discard the last 2
  # (5 = 2 + 3).  The slice is nil when there are fewer than 3 rows
  # or the computed length is negative, so fall back to no rows.
  file_rows = rows[3, rows.length - 5] || []
  file_rows.map do |row|
    # Use the non-bang strip: lstrip!/rstrip! return nil when there is
    # nothing to remove, which broke the original chained call on rows
    # without leading whitespace.  Splitting into at most 4 fields
    # keeps everything after the time column as the file name, with
    # any spaces inside the name left intact.
    row.strip.split(' ', 4)[3]
  end.compact
end
|
|
269
|
+
end # Zip
|
|
270
|
+
|
|
271
|
+
# A class to wrap a <tt>gz</tt> compressed file.
|
|
272
|
+
#
|
|
273
|
+
# The decompressing flags are stored in
|
|
274
|
+
# <tt>IMW::Files::Gz::DEFAULT_FLAGS</tt> and all are passed to the
|
|
275
|
+
# <tt>:gzip</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
|
|
276
|
+
class Gz
|
|
277
|
+
|
|
278
|
+
include IMW::Files::BasicFile
|
|
279
|
+
include IMW::Files::CompressedFile
|
|
280
|
+
|
|
281
|
+
# The default flags used in extracting a <tt>gz</tt> file.
|
|
282
|
+
DEFAULT_FLAGS = {
|
|
283
|
+
:program => :gzip,
|
|
284
|
+
:decompression => '-fd'
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
def initialize uri, *args
|
|
288
|
+
self.uri= uri
|
|
289
|
+
@compression = {
|
|
290
|
+
:program => DEFAULT_FLAGS[:program],
|
|
291
|
+
:decompression_flags => DEFAULT_FLAGS[:decompression]
|
|
292
|
+
}
|
|
293
|
+
end
|
|
294
|
+
|
|
295
|
+
def decompressed_path
|
|
296
|
+
@path.gsub /\.gz$/, ""
|
|
297
|
+
end
|
|
298
|
+
end # Gz
|
|
299
|
+
|
|
300
|
+
# A class to wrap a <tt>bz2</tt> compressed file.
|
|
301
|
+
#
|
|
302
|
+
# The decompressing flags are stored in
|
|
303
|
+
# <tt>IMW::Files::Bz2::DEFAULT_FLAGS</tt> and all are passed to
|
|
304
|
+
# the <tt>:bzip2</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
|
|
305
|
+
class Bz2
|
|
306
|
+
|
|
307
|
+
include IMW::Files::BasicFile
|
|
308
|
+
include IMW::Files::CompressedFile
|
|
309
|
+
|
|
310
|
+
# The default flags used in extracting a <tt>bz2</tt> file.
|
|
311
|
+
DEFAULT_FLAGS = {
|
|
312
|
+
:program => :bzip2,
|
|
313
|
+
:decompression => '-fd'
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
def initialize uri, *args
|
|
317
|
+
self.uri= uri
|
|
318
|
+
raise IMW::Error.new("#{@extname} is not a valid extension for a bzip2 compressed file.") unless @extname == '.bz2'
|
|
319
|
+
@compression = {
|
|
320
|
+
:program => DEFAULT_FLAGS[:program],
|
|
321
|
+
:decompression_flags => DEFAULT_FLAGS[:decompression]
|
|
322
|
+
}
|
|
323
|
+
end
|
|
324
|
+
|
|
325
|
+
# Returns the path of the file after decompression.
|
|
326
|
+
def decompressed_path
|
|
327
|
+
@path.gsub /\.bz2$/, ""
|
|
328
|
+
end
|
|
329
|
+
end # Bz2
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# make sure that tar.bz2 precedes bz2 and so on...
|
|
333
|
+
FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::TarBz2]
|
|
334
|
+
FILE_REGEXPS << [/\.tbz2$/, IMW::Files::TarBz2]
|
|
335
|
+
|
|
336
|
+
FILE_REGEXPS << [/\.tar\.gz$/, IMW::Files::TarGz]
|
|
337
|
+
FILE_REGEXPS << [/\.tgz$/, IMW::Files::TarGz]
|
|
338
|
+
|
|
339
|
+
FILE_REGEXPS << [/\.tar$/, IMW::Files::Tar]
|
|
340
|
+
FILE_REGEXPS << [/\.bz2$/, IMW::Files::Bz2]
|
|
341
|
+
FILE_REGEXPS << [/\.gz$/, IMW::Files::Gz]
|
|
342
|
+
FILE_REGEXPS << [/\.rar$/, IMW::Files::Rar]
|
|
343
|
+
FILE_REGEXPS << [/\.zip$/, IMW::Files::Zip]
|
|
344
|
+
|
|
345
|
+
end # Files
|
|
346
|
+
end # IMW
|
|
347
|
+
|
|
348
|
+
|