imw 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
data/lib/imw/files/json.rb
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
|
2
|
-
# h2. lib/imw/files/json.rb -- describes json files
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# A class for working with JSON files.
|
7
|
-
#
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
10
|
-
# License:: GPL 3.0
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
12
|
-
#
|
13
|
-
# puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottom
|
14
|
-
|
15
|
-
require 'json'
|
16
|
-
require 'imw/files/text'
|
17
|
-
|
18
|
-
module IMW
|
19
|
-
module Files
|
20
|
-
|
21
|
-
# Handler for JSON files: parses the file's contents on load and
# serializes data with +to_json+ on dump.
class Json < IMW::Files::Text

  # Open the JSON file at +uri+ in +mode+ ('r' or 'w').  +options+ is
  # passed through to IMW::Files::Text, as the Xml and Html handlers do.
  def initialize uri, mode='r', options = {}
    super uri, mode, options
  end

  # Return the parsed contents of this JSON file.
  #
  # The file is read with File.read, which closes its handle as soon
  # as the read finishes, rather than leaving an extra open File behind.
  #
  # FIXME what to do if a block is passed in?  (+block+ is currently
  # ignored.)
  def load &block
    JSON.parse File.read(@path)
  end

  # Dump +data+ to this file as JSON.  +options+ is handed to
  # IMW::Files::Text#dump (e.g. <tt>:close => true</tt>).
  def dump data, options = {}
    super data.to_json, options
  end
end
|
39
|
-
# Register this handler for paths ending in ".json".
# NOTE(review): FILE_REGEXPS is not defined in any of the lines visible
# here (lib/imw/files.rb defines EXTENSION_HANDLERS instead) -- confirm
# it exists before this file is loaded.
FILE_REGEXPS << [/\.json$/, IMW::Files::Json]
|
40
|
-
end
|
41
|
-
end
|
data/lib/imw/files/sgml.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
require 'hpricot'
|
2
|
-
require 'imw/files/text'
|
3
|
-
require 'imw/parsers/html_parser'
|
4
|
-
|
5
|
-
module IMW
|
6
|
-
module Files
|
7
|
-
|
8
|
-
# Behaviour shared by the SGML-family handlers: the including class
# stores a parsed document in +doc+ and unknown methods are answered
# by that document.
module Sgml

  attr_accessor :doc

  # Delegate to Hpricot: any method not defined on the including
  # object is sent to +doc+, together with its arguments and block.
  def method_missing method, *args, &block
    @doc.send method, *args, &block
  end

  # Keep respond_to? truthful about the delegation above.  This does
  # not fall through to +super+ because method_missing never does
  # either: only what +doc+ understands is actually reachable.
  def respond_to_missing? method, include_private = false
    @doc.respond_to?(method, include_private)
  end

  # Parse this file using the IMW::Parsers::HtmlParser.  The
  # parser can either be passed in directly or constructed from a
  # passed hash of specs and/or matchers.
  def parse *args
    parser = args.first.is_a?(IMW::Parsers::HtmlParser) ? args.first : IMW::Parsers::HtmlParser.new(*args)
    parser.parse(self)
  end

end
|
26
|
-
|
27
|
-
# XML file handler: a Text file whose contents are additionally
# parsed with Hpricot.XML into +doc+ (see Sgml).
class Xml < IMW::Files::Text
  include Sgml

  # Open +uri+ as a text file, then parse it into +doc+.
  def initialize uri, mode='r', options={}
    super uri, mode, options
    # Block form of open: the handle used only for parsing is closed
    # as soon as Hpricot has consumed it instead of being leaked.
    @doc = open(uri) { |handle| Hpricot.XML(handle) }
  end
end
|
34
|
-
|
35
|
-
# HTML file handler: a Text file whose contents are additionally
# parsed with Hpricot into +doc+ (see Sgml).
class Html < IMW::Files::Text
  include Sgml

  # Open +uri+ as a text file, then parse it into +doc+.
  def initialize uri, mode='r', options={}
    super uri, mode, options
    # Block form of open: the handle used only for parsing is closed
    # as soon as Hpricot has consumed it instead of being leaked.
    @doc = open(uri) { |handle| Hpricot(handle) }
  end
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
|
46
|
-
|
data/lib/imw/files/text.rb
DELETED
@@ -1,68 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
module Files
|
3
|
-
|
4
|
-
# Used to process text files when no more specialized class is suitable.
#
#   f = IMW::Files::Text.new '/path/to/my_file.dat'
#   f.entries do |line|
#     # ...
#   end
#
# Missing methods will be passed to the associated file handle
# (either IO or StringIO depending on whether the URI passed in
# was local or remote) so the usual stuff like read or each_line
# still works.
class Text

  include IMW::Files::BasicFile
  include IMW::Files::Compressible

  attr_reader :file, :parser

  # Open +uri+ in +mode+ ('r' or 'w').  +options+ is accepted for
  # interface compatibility with the other handlers and is not used
  # here.
  #
  # Raises IMW::PathError when asked to write to a remote URI.
  def initialize uri, mode='r', options = {}
    self.uri= uri
    raise IMW::PathError.new("Cannot write to remote file #{uri}") if mode == 'w' && remote?
    @file = open(uri, mode)
  end

  # Return the contents of this text file as a string.
  def load
    file.read
  end

  # Return an array with each line of this file (line endings
  # removed).  If given a block, pass each line to the block instead.
  def entries &block
    if block_given?
      file.each { |line| yield line.chomp }
    else
      file.map { |line| line.chomp }
    end
  end

  # Dump +data+ to this file as a string.  Close the file handle
  # if passed in :close.
  #
  # Strings are written verbatim so that already-serialized text
  # (e.g. the JSON or YAML produced by subclasses) is not wrapped in
  # quotes and escapes; any other object is written in its +inspect+
  # form.
  def dump data, options={}
    file.write(data.is_a?(String) ? data : data.inspect)
    file.close if options[:close]
  end

  # Forward unknown methods to the underlying file handle, including
  # the block, so that calls such as <tt>each_line { ... }</tt> work.
  def method_missing method, *args, &block
    file.send method, *args, &block
  end

  # Keep respond_to? consistent with the delegation in method_missing.
  def respond_to_missing? method, include_private = false
    (!file.nil? && file.respond_to?(method, include_private)) || super
  end

  # Parse this file with an IMW::Parsers::RegexpParser built from
  # +parser_spec+.  The <tt>:lines</tt> entry of the spec is passed to
  # the parser's parse! call rather than to its constructor.  The
  # caller's hash is left untouched.
  def parse parser_spec, &block
    spec  = parser_spec.dup
    lines = spec.delete(:lines)
    @parser = IMW::Parsers::RegexpParser.new(spec)
    parser.parse!(file, {:lines => lines}, &block)
  end

end
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
# puts "#{File.basename(__FILE__)}: Don't forget to put a nametag on your Monkeywrench or one of the other chimps might steal it!" # at bottom
|
data/lib/imw/files/yaml.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/files/yaml.rb -- describes yaml files
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# A class for working with YAML files.
|
7
|
-
#
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
10
|
-
# License:: GPL 3.0
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
12
|
-
#
|
13
|
-
|
14
|
-
require 'yaml'
|
15
|
-
require 'imw/files/text'
|
16
|
-
|
17
|
-
module IMW
|
18
|
-
module Files
|
19
|
-
|
20
|
-
# Handler for YAML files: parses the file on load and serializes data
# with +to_yaml+ on dump.
class Yaml < IMW::Files::Text

  # Open the YAML file at +uri+ in +mode+ ('r' or 'w').  +options+ is
  # passed through to IMW::Files::Text, as the Xml and Html handlers do.
  def initialize uri, mode='r', options = {}
    super uri, mode, options
  end

  # Return the contents of this YAML file.
  #
  # NOTE(review): YAML.load_file may instantiate objects named by tags
  # in the document, depending on the YAML library version -- only use
  # it on trusted files.
  #
  # FIXME what to do if a block is passed in?  (+block+ is currently
  # ignored.)
  def load &block
    YAML.load_file @path
  end

  # Dump +data+ to this file as YAML.  +options+ is handed to
  # IMW::Files::Text#dump (e.g. <tt>:close => true</tt>).
  def dump data, options = {}
    super data.to_yaml, options
  end

end
|
39
|
-
|
40
|
-
# Register this handler for paths ending in ".yaml" and ".yml".
# NOTE(review): FILE_REGEXPS is not defined in any of the lines visible
# here (lib/imw/files.rb defines EXTENSION_HANDLERS instead) -- confirm
# it exists before this file is loaded.
FILE_REGEXPS << [/\.yaml$/, IMW::Files::Yaml]
|
41
|
-
FILE_REGEXPS << [/\.yml$/, IMW::Files::Yaml]
|
42
|
-
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
# puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottom
|
data/lib/imw/files.rb
DELETED
@@ -1,125 +0,0 @@
|
|
1
|
-
require 'uri'
|
2
|
-
require 'open-uri'
|
3
|
-
require 'imw/utils'
|
4
|
-
require 'imw/files/basicfile'
|
5
|
-
require 'imw/files/directory'
|
6
|
-
require 'imw/files/archive'
|
7
|
-
require 'imw/files/compressible'
|
8
|
-
require 'imw/files/compressed_file'
|
9
|
-
|
10
|
-
module IMW
|
11
|
-
|
12
|
-
# Open +path+ and return an object that handles it.
#
# A directory (checked after File.expand_path) is wrapped in
# Files::Directory.  Anything else is wrapped in the class chosen by
# Files.file_class_for, opened in mode 'w' when <tt>:write => true</tt>
# is given and in mode 'r' otherwise.  If a block is given the handler
# is yielded to it before being returned.
#
#   IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
#
def self.open path, options = {}, &block
  handler =
    if File.directory?(File.expand_path(path))
      Files::Directory.new(path)
    else
      mode = options[:write] ? 'w' : 'r'
      Files.file_class_for(path, options).new(path, mode, options)
    end
  yield handler if block_given?
  handler
end
|
30
|
-
|
31
|
-
# Same as +open+ but defaults to opening +path+ for writing: the
# <tt>:write</tt> option is set to true unless the caller supplied it.
# A given block is forwarded to +open+, which yields the handler to it.
def self.open! path, options = {}, &block
  self.open path, options.reverse_merge(:write => true), &block
end
|
34
|
-
|
35
|
-
# Registry that maps paths (by extension) and URIs (by scheme) to the
# IMW::Files classes that handle them.
module Files

  # There is certainly a cleaner way to do this.
  autoload :Text,    'imw/files/text'
  autoload :Binary,  'imw/files/binary'
  autoload :Yaml,    'imw/files/yaml'
  autoload :Csv,     'imw/files/csv'
  autoload :Json,    'imw/files/json'
  autoload :Bz2,     'imw/files/compressed_files_and_archives'
  autoload :Gz,      'imw/files/compressed_files_and_archives'
  autoload :Tar,     'imw/files/compressed_files_and_archives'
  autoload :Tarbz2,  'imw/files/compressed_files_and_archives'
  autoload :Targz,   'imw/files/compressed_files_and_archives'
  autoload :Rar,     'imw/files/compressed_files_and_archives'
  autoload :Zip,     'imw/files/compressed_files_and_archives'
  autoload :Xml,     'imw/files/sgml'
  autoload :Html,    'imw/files/sgml'
  autoload :Excel,   'imw/files/excel'

  # An array used to match files to classes to handle them.  The
  # first element of each array is the regexp and the second names
  # the class to handle the file.
  #
  #   IMW::Files::EXTENSION_HANDLERS << [ /\.csv$/, :csv ]         #=> IMW::Files::Csv
  #   IMW::Files::EXTENSION_HANDLERS << [ /\.txt$/, "Text" ]       #=> IMW::Files::Text
  #   IMW::Files::EXTENSION_HANDLERS << [ /\.myclass$/, MyClass ]  #=> MyClass
  #
  # Elements at the end of the array have greater precedence which
  # allows, say, <tt>.tar.gz</tt> to be handled differently from
  # <tt>.gz</tt>.
  #
  # NOTE(review): :tsv has no autoload entry above; presumably
  # imw/files/csv defines Tsv -- confirm.
  EXTENSION_HANDLERS = [
    [/\.txt$/,       :text],
    [/\.dat$/,       :text],
    [/\.ascii$/,     :text],
    [/\.yaml$/,      :yaml],
    [/\.yml$/,       :yaml],
    [/\.csv$/,       :csv],
    [/\.tsv$/,       :tsv],
    [/\.json$/,      :json],
    [/\.bz2$/,       :bz2],
    [/\.gz$/,        :gz],
    [/\.tar\.bz2$/,  :tarbz2],
    [/\.tbz2$/,      :tarbz2],
    [/\.tar\.gz$/,   :targz],
    [/\.tgz$/,       :targz],
    [/\.tar$/,       :tar],
    [/\.rar$/,       :rar],
    [/\.zip$/,       :zip],
    [/\.xml$/,       :xml],
    [/\.html$/,      :html],
    [/\.htm$/,       :html],
    [/\.xlsx?$/,     :excel]
  ]

  # Same layout as EXTENSION_HANDLERS but matched against the URI
  # scheme of the path; consulted only when no extension matched.
  SCHEME_HANDLERS = [
    [/http/, :html]
  ]

  # Return the class that should handle +path+.
  #
  # Lookup order: the <tt>:as</tt> option (which is removed from
  # +options+), then EXTENSION_HANDLERS, then SCHEME_HANDLERS, then
  # Text.  A handler given as a Class is returned as is; a symbol or
  # string is capitalized and looked up as a constant of this module.
  #
  # This method is public: it is called from outside this module
  # (IMW.open and the packagers use it).
  def self.file_class_for path, options = {}
    klass   = options.delete(:as)
    klass ||= handler_for(EXTENSION_HANDLERS, path)
    klass ||= handler_for(SCHEME_HANDLERS, scheme_of(path))
    klass ||= :text # just stick with text if still not set

    klass.is_a?(Class) ? klass : const_get(klass.to_s.downcase.capitalize)
  end

  # Return the handler of the last entry in +handlers+ whose regexp
  # matches +string+ (the end has greater precedence), or nil.
  def self.handler_for handlers, string
    return nil if string.nil?
    handlers.reverse_each do |regexp, thing|
      return thing if regexp =~ string
    end
    nil
  end
  private_class_method :handler_for

  # Return the URI scheme of +path+, or nil when +path+ has none or
  # cannot be parsed as a URI at all (e.g. a local path containing
  # spaces), so that such paths fall back to Text instead of raising.
  def self.scheme_of path
    URI.parse(path).scheme
  rescue URI::InvalidURIError
    nil
  end
  private_class_method :scheme_of
end
|
125
|
-
end
|
@@ -1,126 +0,0 @@
|
|
1
|
-
module IMW
|
2
|
-
module Packagers
|
3
|
-
|
4
|
-
# Packages an Array of input files into a single output archive.
# When the archive is extracted, all the input files given will be
# in a single directory with a chosen name.  The path to the output
# archive determines both the name of the archive and its type (tar,
# tar.bz2, zip, &c.).
#
# If any of the input files are themselves archives, they will first
# be extracted, with only their contents winding up in the final
# directory (the file hierarchy of the archive will be preserved).
# If any of the input files are compressed, they will first be
# uncompressed before being added to the directory.
#
# Input files can be renamed by passing in a Hash instead of an
# Array.  Each key in this hash is the path to an input file and its
# value is the new basename to give it.  If the basename is +nil+
# then the original path's basename will be used.
class Archiver

  attr_accessor :name, :inputs

  # +name+ becomes the name of the directory inside the archive;
  # +inputs+ is an Array or Hash as described on the class.
  def initialize name, inputs
    @name = name
    add_inputs inputs
  end

  # Merge +new_inputs+ into +inputs+, keyed by expanded path.
  #
  # FIXME Instead of requiring +new_inputs+ to be either an Array
  # or Hash just iterate through whatever it is using +each+ and
  # see if the iterate can be interpreted as a mapping between
  # strings.
  def add_inputs new_inputs
    @inputs ||= {}
    if new_inputs.is_a?(Array)
      new_inputs.each do |input|
        @inputs[File.expand_path(input)] = File.basename(input)
      end
    else
      new_inputs.each_pair do |input, basename|
        @inputs[File.expand_path(input)] = (basename || File.basename(input))
      end
    end
  end

  # Errors recorded so far by add_processing_error.
  def errors
    @errors ||= []
  end

  # Log +error+ as a warning and remember it in +errors+.
  def add_processing_error error
    IMW.logger.warn error
    errors << error
  end

  # True when no processing errors have been recorded.
  def success?
    errors.empty?
  end

  # A temporary directory to work in.  Its contents will
  # ultimately consist of a directory named for the package
  # containing all the input files.  Named from the current time in
  # seconds and the process id, and memoized per archiver.
  def tmp_dir
    @tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), (Time.now.to_i.to_s + "-" + $$.to_s)) # guaranteed unique on a node
  end

  # Remove the temporary directory and everything in it.
  def clean!
    FileUtils.rm_rf(tmp_dir)
  end

  # A directory which will contain all the content being packaged,
  # including the contents of any archives that were included in
  # the list of files to process.
  def dir
    @dir ||= File.join(tmp_dir, name.to_s)
  end

  # Copy every input into +dir+, extracting archives and
  # decompressing compressed files along the way.
  #
  # FIXME This needs to be made idempotent -- calling prepare
  # twice should not do any work the second time (unless the user
  # is insistent and passes a :force option -- or maybe use bang
  # and not-bang versions of the method for this distinction).
  def prepare!
    FileUtils.mkdir_p dir unless File.exist?(dir)
    inputs.each_pair do |path, basename|
      new_path = File.join(dir, basename)
      file     = IMW.open(path, :as => IMW::Files.file_class_for(basename)) # file's original path is meaningless: RackMultipart20091203-958-1nkgc61-0
      case
      when file.archive?
        FileUtils.cd(dir) do
          file.extract
        end
      when file.compressed?
        file.cp(new_path).decompress!
      else
        file.cp(new_path)
      end
    end
  end

  # Package the contents of the temporary directory to an archive
  # at +output+ but return exceptions instead of raising them.
  # Only RuntimeError (and subclasses) is captured this way.
  def package output, options={}
    begin
      # Pass the caller's options on; writing `options={}` at the call
      # site would replace them with an empty hash.
      package! output, options
    rescue RuntimeError => e
      return e
    end
  end

  # Package the contents of the temporary directory to an archive
  # at +output+ (a path or an already opened IMW file object) and
  # return the output object.  A processing error is recorded when
  # the archive does not exist afterwards.
  def package! output, options={}
    output = IMW.open(output) if output.is_a?(String)
    FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
    output.rm! if output.exist?
    FileUtils.cd(tmp_dir) do
      temp_output = IMW.open(output.basename)
      temp_output.create(name.to_s + '/*').mv(output.path)
      temp_output.rm if temp_output.exist?
      # exist? is the predicate already used on these objects above.
      add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exist?
    end
    output
  end
end
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
@@ -1,36 +0,0 @@
|
|
1
|
-
require 'aws/s3'
|
2
|
-
module IMW
|
3
|
-
module Packagers
|
4
|
-
# Uploads local files to an Amazon S3 bucket through the aws/s3
# library and remembers the response of the last upload.
class S3Mover

  attr_reader :last_response
  attr_accessor :bucket_name

  # +options+ may contain <tt>:bucket_name</tt>; every other entry is
  # handed to AWS::S3::Base.establish_connection!.  The caller's hash
  # is not modified.
  def initialize options={}
    connection_options = options.dup
    @bucket_name = connection_options.delete(:bucket_name)
    AWS::S3::Base.establish_connection!(connection_options)
  end

  # True when the last upload was answered with an HTTP 200 (OK)
  # response; falsy when nothing has been uploaded yet.
  def success?
    last_response && last_response.response.class == Net::HTTPOK
  end

  # Upload +local_path+ to +remote_path+ but return exceptions
  # instead of raising them.  Only RuntimeError (and subclasses) is
  # captured this way.
  def upload local_path, remote_path
    begin
      upload! local_path, remote_path
    rescue RuntimeError => e
      return e
    end
  end

  # Upload +local_path+ to +remote_path+ in the configured bucket and
  # return (and remember) the response.
  def upload! local_path, remote_path
    # File.open with a block closes the handle once the upload call
    # returns, and treats +local_path+ strictly as a file name.
    File.open(local_path, 'rb') do |io|
      @last_response = AWS::S3::S3Object.store(remote_path, io, bucket_name)
    end
  end

end
|
35
|
-
end
|
36
|
-
end
|
data/lib/imw/packagers.rb
DELETED
data/lib/imw/utils/components.rb
DELETED
@@ -1,61 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/utils/components.rb -- define separate components of IMW
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Defines a hash <tt>IMW::COMPONENTS</tt> which keys component names
|
7
|
-
# to the files to be required to implement each component and defines
|
8
|
-
# methods to load these files.
|
9
|
-
#
|
10
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
-
# License:: GPL 3.0
|
13
|
-
# Website:: http://infinitemonkeywrench.org/
|
14
|
-
#
|
15
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
16
|
-
|
17
|
-
require 'imw/utils/error'
|
18
|
-
|
19
|
-
module IMW
|
20
|
-
|
21
|
-
# Maps each IMW component name to what has to be loaded for it: a
# single file, a list of files, or the name of another component (an
# alias).  Components can be accessed using
# <tt>IMW.load_components</tt> or <tt>IMW#imw_components</tt>.
COMPONENTS = {
  :datamapper       => %w[imw/dataset/datamapper imw/dataset/datamapper/time_and_user_stamps],
  :data_mapper      => :datamapper,
  :html_parser      => "imw/parsers/html_parser",
  :flat_file_parser => "imw/parsers/flat_file_parser",
  :line_parser      => "imw/parsers/line_parser",
  :infochimps       => %w[imw/infochimps/infochimps_resource imw/infochimps/icss]
}
|
32
|
-
|
33
|
-
# Load components of IMW as needed,
#
#   IMW.load_components :datamapper, :flat_file_parser
#
# Each argument is first passed to +require+ as is.  If that fails
# with a LoadError the argument is looked up in IMW::COMPONENTS and
# whatever it maps to (a file, a list of files, or another component
# name) is loaded the same way.
#
# Raises IMW::Error when a name can neither be required nor found in
# IMW::COMPONENTS.  The message ends with the underlying LoadError so
# that a dependency missing from inside a component file is not
# mistaken for a misspelled component name.
def self.load_components *args
  args.each do |component_name|
    begin
      require component_name.to_s
    rescue LoadError => load_error
      component = IMW::COMPONENTS[component_name]
      unless component
        raise IMW::Error, "#{component_name} is an invalid IMW component. See IMW::COMPONENTS. (#{load_error.message})"
      end
      # A component maps to one entry or to an Array of entries.
      load_components(*[component].flatten)
    end
  end
end
|
51
|
-
|
52
|
-
# Instance-level shortcut for IMW.load_components, available to
# anything that includes IMW:
#
#   include IMW
#   imw_components :datamapper, :flat_file_parser
def imw_components *args
  IMW.load_components(*args)
end
|
59
|
-
|
60
|
-
end
|
61
|
-
|
data/lib/imw/utils/config.rb
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/utils/config.rb -- configuration parsing
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# IMW looks for configuration settings in the following places, in
|
7
|
-
# order of increasing precedence:
|
8
|
-
#
|
9
|
-
# 1. Settings defined directly in this file.
|
10
|
-
#
|
11
|
-
# 2. From the <tt>etc/imwrc</tt> file in the IMW root directory.
|
12
|
-
#
|
13
|
-
# 3. From the <tt>.imwrc</tt> file in the user's home directory (the
|
14
|
-
# filename can be changed; see
|
15
|
-
# <tt>IMW::Config::USER_CONFIG_FILE_BASENAME</tt>).
|
16
|
-
#
|
17
|
-
# 4. From the file defined by the environment variable +IMWRC+ (the
|
18
|
-
# value can be changed; see
|
19
|
-
# <tt>IMW::Config::USER_CONFIG_FILE_ENV_VARIABLE</tt>).
|
20
|
-
#
|
21
|
-
# Settings not found in one configuration location will be searched
|
22
|
-
# for in locations of lesser precedence.
|
23
|
-
#
|
24
|
-
# *Note:* configuration files are plain Ruby code that will be directly
|
25
|
-
# evaluated.
|
26
|
-
#
|
27
|
-
# Relevant settings include
|
28
|
-
#
|
29
|
-
# * interfaces with external programs (+tar+, +wget+, &c.)
|
30
|
-
# * paths to directories where IMW reads/writes files
|
31
|
-
# * correspondences between file extensions and IMW file classes
|
32
|
-
#
|
33
|
-
# For more detailed information, see the default configuration file,
|
34
|
-
# <tt>etc/imwrc</tt>.
|
35
|
-
#
|
36
|
-
#
|
37
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
38
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
39
|
-
# License:: GPL 3.0
|
40
|
-
# Website:: http://infinitemonkeywrench.org/
|
41
|
-
#
|
42
|
-
|
43
|
-
# Empty namespace declaration: this file defines nothing inside IMW.
module IMW
|
44
|
-
end
|
45
|
-
|
46
|
-
# puts "#{File.basename(__FILE__)}: You carefully adjust the settings on your Monkeywrench. Glob-monsters: beware!!" # at bottom
|
@@ -1,8 +0,0 @@
|
|
1
|
-
# :nodoc:
|
2
|
-
# for when cattr_accessor is all you need
|
3
|
-
#
|
4
|
-
require 'active_support/core_ext/array/extract_options'
|
5
|
-
# Reopen Array to mix in ActiveSupport's ExtractOptions core extension
# (loaded by the require above); presumably needed by the
# attribute_accessors file required below -- confirm.
class Array #:nodoc:
|
6
|
-
include ActiveSupport::CoreExtensions::Array::ExtractOptions
|
7
|
-
end
|
8
|
-
require 'active_support/core_ext/class/attribute_accessors'
|
@@ -1,27 +0,0 @@
|
|
1
|
-
require 'imw/utils/extensions/string'
|
2
|
-
require 'imw/utils/extensions/array'
|
3
|
-
require 'imw/utils/extensions/hash'
|
4
|
-
require 'imw/utils/extensions/dir'
|
5
|
-
require 'imw/utils/extensions/struct'
|
6
|
-
require 'imw/utils/extensions/symbol'
|
7
|
-
require 'imw/utils/extensions/file_core'
|
8
|
-
require 'active_support/core_ext/module/aliasing'
|
9
|
-
require 'active_support/core_ext/object/blank'
|
10
|
-
require 'active_support/core_ext/object/misc'
|
11
|
-
#require 'active_support/core_ext/blank.rb'
|
12
|
-
require 'imw/utils/extensions/class/attribute_accessors'
|
13
|
-
# require 'ostruct'
|
14
|
-
require 'set'
|
15
|
-
|
16
|
-
module IMW
|
17
|
-
# A replacement for the standard system call which raises an
|
18
|
-
# IMW::SystemCallError if the command fails as well as printing the
|
19
|
-
# command appended to the end of <tt>error_message</tt>.
|
20
|
-
def self.system *commands
|
21
|
-
command = commands.flatten.join ' '
|
22
|
-
Kernel.system(command)
|
23
|
-
raise IMW::SystemCallError.new(command) unless $?.success?
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
|
@@ -1,24 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/utils/extensions/dir.rb -- directory extensions
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# The Ruby +Dir+ module is rubbish. Time to clean it up a bit!
|
7
|
-
#
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
10
|
-
# License:: GPL 3.0
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
12
|
-
#
|
13
|
-
|
14
|
-
|
15
|
-
class Dir

  # Return every entry of this directory joined onto the directory's
  # own path, leaving out the `.' and `..' entries.
  def abs_contents
    real_entries = entries.reject { |name| name == '.' || name == '..' }
    real_entries.map { |name| File.join(path, name) }
  end
end
|
23
|
-
|
24
|
-
# puts "#{File.basename(__FILE__)}: You open the folder and see a long list of names. Some have been crossed out -- ominously..." # at bottom
|