imw 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -1
- data/Rakefile +10 -0
- data/TODO +18 -0
- data/VERSION +1 -1
- data/bin/imw +1 -1
- data/etc/imwrc.rb +0 -50
- data/examples/dataset.rb +12 -0
- data/lib/imw/boot.rb +55 -9
- data/lib/imw/dataset/paths.rb +15 -24
- data/lib/imw/dataset/workflow.rb +131 -72
- data/lib/imw/dataset.rb +94 -186
- data/lib/imw/parsers/html_parser.rb +1 -1
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +3 -27
- data/lib/imw/resource.rb +190 -0
- data/lib/imw/resources/archive.rb +97 -0
- data/lib/imw/resources/archives_and_compressed/bz2.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/gz.rb +18 -0
- data/lib/imw/resources/archives_and_compressed/rar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tar.rb +23 -0
- data/lib/imw/resources/archives_and_compressed/tarbz2.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/targz.rb +78 -0
- data/lib/imw/resources/archives_and_compressed/zip.rb +57 -0
- data/lib/imw/resources/archives_and_compressed.rb +32 -0
- data/lib/imw/resources/compressed_file.rb +89 -0
- data/lib/imw/resources/compressible.rb +77 -0
- data/lib/imw/resources/formats/delimited.rb +92 -0
- data/lib/imw/resources/formats/excel.rb +125 -0
- data/lib/imw/resources/formats/json.rb +53 -0
- data/lib/imw/resources/formats/sgml.rb +72 -0
- data/lib/imw/resources/formats/yaml.rb +53 -0
- data/lib/imw/resources/formats.rb +32 -0
- data/lib/imw/resources/local.rb +198 -0
- data/lib/imw/resources/remote.rb +110 -0
- data/lib/imw/resources/schemes/hdfs.rb +242 -0
- data/lib/imw/resources/schemes/http.rb +161 -0
- data/lib/imw/resources/schemes/s3.rb +137 -0
- data/lib/imw/resources/schemes.rb +19 -0
- data/lib/imw/resources.rb +118 -0
- data/lib/imw/runner.rb +5 -4
- data/lib/imw/transforms/archiver.rb +215 -0
- data/lib/imw/transforms/transferer.rb +103 -0
- data/lib/imw/transforms.rb +8 -0
- data/lib/imw/utils/error.rb +26 -30
- data/lib/imw/utils/extensions/array.rb +5 -15
- data/lib/imw/utils/extensions/hash.rb +6 -16
- data/lib/imw/utils/extensions/hpricot.rb +0 -14
- data/lib/imw/utils/extensions/string.rb +5 -15
- data/lib/imw/utils/extensions/symbol.rb +0 -13
- data/lib/imw/utils/extensions.rb +65 -0
- data/lib/imw/utils/log.rb +14 -13
- data/lib/imw/utils/misc.rb +0 -6
- data/lib/imw/utils/paths.rb +101 -42
- data/lib/imw/utils/version.rb +8 -9
- data/lib/imw/utils.rb +2 -18
- data/lib/imw.rb +92 -17
- data/spec/data/sample.csv +1 -1
- data/spec/data/sample.json +1 -0
- data/spec/data/sample.tsv +1 -1
- data/spec/data/sample.txt +1 -1
- data/spec/data/sample.xml +1 -1
- data/spec/data/sample.yaml +1 -1
- data/spec/imw/dataset/paths_spec.rb +32 -0
- data/spec/imw/dataset/workflow_spec.rb +41 -0
- data/spec/imw/resource_spec.rb +79 -0
- data/spec/imw/resources/archive_spec.rb +69 -0
- data/spec/imw/resources/archives_and_compressed/bz2_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/gz_spec.rb +15 -0
- data/spec/imw/resources/archives_and_compressed/rar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tar_spec.rb +16 -0
- data/spec/imw/resources/archives_and_compressed/tarbz2_spec.rb +24 -0
- data/spec/imw/resources/archives_and_compressed/targz_spec.rb +21 -0
- data/spec/imw/resources/archives_and_compressed/zip_spec.rb +16 -0
- data/spec/imw/resources/compressed_file_spec.rb +48 -0
- data/spec/imw/resources/compressible_spec.rb +36 -0
- data/spec/imw/resources/formats/delimited_spec.rb +33 -0
- data/spec/imw/resources/formats/json_spec.rb +32 -0
- data/spec/imw/resources/formats/sgml_spec.rb +24 -0
- data/spec/imw/resources/formats/yaml_spec.rb +41 -0
- data/spec/imw/resources/local_spec.rb +98 -0
- data/spec/imw/resources/remote_spec.rb +35 -0
- data/spec/imw/resources/schemes/hdfs_spec.rb +61 -0
- data/spec/imw/resources/schemes/http_spec.rb +19 -0
- data/spec/imw/resources/schemes/s3_spec.rb +19 -0
- data/spec/imw/transforms/archiver_spec.rb +120 -0
- data/spec/imw/transforms/transferer_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +5 -33
- data/spec/imw/utils/shared_paths_spec.rb +29 -0
- data/spec/spec_helper.rb +5 -5
- data/spec/support/paths_matcher.rb +67 -0
- data/spec/support/random.rb +39 -36
- metadata +88 -75
- data/lib/imw/dataset/task.rb +0 -41
- data/lib/imw/files/archive.rb +0 -113
- data/lib/imw/files/basicfile.rb +0 -122
- data/lib/imw/files/binary.rb +0 -28
- data/lib/imw/files/compressed_file.rb +0 -93
- data/lib/imw/files/compressed_files_and_archives.rb +0 -334
- data/lib/imw/files/compressible.rb +0 -103
- data/lib/imw/files/csv.rb +0 -113
- data/lib/imw/files/directory.rb +0 -62
- data/lib/imw/files/excel.rb +0 -84
- data/lib/imw/files/json.rb +0 -41
- data/lib/imw/files/sgml.rb +0 -46
- data/lib/imw/files/text.rb +0 -68
- data/lib/imw/files/yaml.rb +0 -46
- data/lib/imw/files.rb +0 -125
- data/lib/imw/packagers/archiver.rb +0 -126
- data/lib/imw/packagers/s3_mover.rb +0 -36
- data/lib/imw/packagers.rb +0 -8
- data/lib/imw/utils/components.rb +0 -61
- data/lib/imw/utils/config.rb +0 -46
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +0 -8
- data/lib/imw/utils/extensions/core.rb +0 -27
- data/lib/imw/utils/extensions/dir.rb +0 -24
- data/lib/imw/utils/extensions/file_core.rb +0 -64
- data/lib/imw/utils/extensions/typed_struct.rb +0 -22
- data/lib/imw/utils/extensions/uri.rb +0 -59
- data/lib/imw/utils/view/dump_csv.rb +0 -112
- data/lib/imw/utils/view/dump_csv_older.rb +0 -117
- data/lib/imw/utils/view.rb +0 -113
- data/spec/imw/dataset/datamapper/uri_spec.rb +0 -43
- data/spec/imw/dataset/datamapper_spec_helper.rb +0 -11
- data/spec/imw/files/archive_spec.rb +0 -118
- data/spec/imw/files/basicfile_spec.rb +0 -121
- data/spec/imw/files/bz2_spec.rb +0 -32
- data/spec/imw/files/compressed_file_spec.rb +0 -96
- data/spec/imw/files/compressible_spec.rb +0 -100
- data/spec/imw/files/file_spec.rb +0 -144
- data/spec/imw/files/gz_spec.rb +0 -32
- data/spec/imw/files/rar_spec.rb +0 -33
- data/spec/imw/files/tar_spec.rb +0 -31
- data/spec/imw/files/text_spec.rb +0 -23
- data/spec/imw/files/zip_spec.rb +0 -31
- data/spec/imw/files_spec.rb +0 -38
- data/spec/imw/packagers/archiver_spec.rb +0 -125
- data/spec/imw/packagers/s3_mover_spec.rb +0 -7
- data/spec/imw/utils/extensions/file_core_spec.rb +0 -72
- data/spec/imw/utils/extensions/find_spec.rb +0 -113
- data/spec/imw/workflow/rip/local_spec.rb +0 -89
- data/spec/imw/workflow/rip_spec.rb +0 -27
- data/spec/support/archive_contents_matcher.rb +0 -94
- data/spec/support/directory_contents_matcher.rb +0 -61
data/lib/imw/files/json.rb
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# h2. lib/imw/files/json.rb -- describes json files
|
|
3
|
-
#
|
|
4
|
-
# == About
|
|
5
|
-
#
|
|
6
|
-
# A class for working with JSON files.
|
|
7
|
-
#
|
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
10
|
-
# License:: GPL 3.0
|
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
|
12
|
-
#
|
|
13
|
-
# puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottobm
|
|
14
|
-
|
|
15
|
-
require 'json'
|
|
16
|
-
require 'imw/files/text'
|
|
17
|
-
|
|
18
|
-
module IMW
|
|
19
|
-
module Files
|
|
20
|
-
|
|
21
|
-
class Json < IMW::Files::Text
|
|
22
|
-
|
|
23
|
-
def initialize uri, mode='r', options = {}
|
|
24
|
-
super uri, mode
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
# Return the contents of this JSON file.
|
|
28
|
-
#
|
|
29
|
-
# FIXME what to do if a block is passed in?
|
|
30
|
-
def load &block
|
|
31
|
-
JSON.parse File.new(@path).read
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
# Dump +data+ to this file as JSON.
|
|
35
|
-
def dump data
|
|
36
|
-
super data.to_json
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
FILE_REGEXPS << [/\.json$/, IMW::Files::Json]
|
|
40
|
-
end
|
|
41
|
-
end
|
data/lib/imw/files/sgml.rb
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
require 'hpricot'
|
|
2
|
-
require 'imw/files/text'
|
|
3
|
-
require 'imw/parsers/html_parser'
|
|
4
|
-
|
|
5
|
-
module IMW
|
|
6
|
-
module Files
|
|
7
|
-
|
|
8
|
-
module Sgml
|
|
9
|
-
|
|
10
|
-
attr_accessor :doc
|
|
11
|
-
|
|
12
|
-
# Delegate to Hpricot
|
|
13
|
-
def method_missing method, *args, &block
|
|
14
|
-
@doc.send method, *args, &block
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
# Parse this file using the IMW::Parsers::HtmlParser. The
|
|
18
|
-
# parser can either be passed in directly or constructed from a
|
|
19
|
-
# passed hash of specs and/or matchers.
|
|
20
|
-
def parse *args
|
|
21
|
-
parser = args.first.is_a?(IMW::Parsers::HtmlParser) ? args.first : IMW::Parsers::HtmlParser.new(*args)
|
|
22
|
-
parser.parse(self)
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
class Xml < IMW::Files::Text
|
|
28
|
-
include Sgml
|
|
29
|
-
def initialize uri, mode='r', options={}
|
|
30
|
-
super uri, mode, options
|
|
31
|
-
@doc = Hpricot.XML(open(uri))
|
|
32
|
-
end
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
class Html < IMW::Files::Text
|
|
36
|
-
include Sgml
|
|
37
|
-
def initialize uri, mode='r', options={}
|
|
38
|
-
super uri, mode, options
|
|
39
|
-
@doc = Hpricot(open(uri))
|
|
40
|
-
end
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
data/lib/imw/files/text.rb
DELETED
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Files
|
|
3
|
-
|
|
4
|
-
# Used to process text files when no more specialized class is suitable.
|
|
5
|
-
#
|
|
6
|
-
# f = IMW::Files::Text.new '/path/to/my_file.dat'
|
|
7
|
-
# f.load do |line|
|
|
8
|
-
# # ...
|
|
9
|
-
# end
|
|
10
|
-
#
|
|
11
|
-
# Missing methods will be passed to the associated file handle
|
|
12
|
-
# (either IO or StringIO depending on whether the URI passed in
|
|
13
|
-
# was local or remote) so the usual stuff like read or each_line
|
|
14
|
-
# still works.
|
|
15
|
-
class Text
|
|
16
|
-
|
|
17
|
-
include IMW::Files::BasicFile
|
|
18
|
-
include IMW::Files::Compressible
|
|
19
|
-
|
|
20
|
-
attr_reader :file, :parser
|
|
21
|
-
|
|
22
|
-
def initialize uri, mode='r', options = {}
|
|
23
|
-
self.uri= uri
|
|
24
|
-
raise IMW::PathError.new("Cannot write to remote file #{uri}") if mode == 'w' && remote?
|
|
25
|
-
@file = open(uri, mode)
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
# Return the contents of this text file as a string.
|
|
29
|
-
def load
|
|
30
|
-
file.read
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
# Return an array with each line of this file. If given a
|
|
34
|
-
# block, pass each line to the block.
|
|
35
|
-
def entries &block
|
|
36
|
-
if block_given?
|
|
37
|
-
file.each do |line|
|
|
38
|
-
yield line.chomp
|
|
39
|
-
end
|
|
40
|
-
else
|
|
41
|
-
file.map do |line|
|
|
42
|
-
line.chomp
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
# Dump +data+ to this file as a string. Close the file handle
|
|
48
|
-
# if passed in :close.
|
|
49
|
-
def dump data, options={}
|
|
50
|
-
file.write(data.inspect)
|
|
51
|
-
file.close if options[:close]
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
def method_missing method, *args
|
|
55
|
-
file.send method, *args
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
def parse parser_spec, &block
|
|
59
|
-
lines = parser_spec.delete(:lines)
|
|
60
|
-
@parser = IMW::Parsers::RegexpParser.new(parser_spec)
|
|
61
|
-
parser.parse!(file, {:lines => lines}, &block)
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
end
|
|
65
|
-
end
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
# puts "#{File.basename(__FILE__)}: Don't forget to put a nametag on your Monkeywrench or one of the other chimps might steal it!" # at bottom
|
data/lib/imw/files/yaml.rb
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# h2. lib/imw/files/yaml.rb -- describes yaml files
|
|
3
|
-
#
|
|
4
|
-
# == About
|
|
5
|
-
#
|
|
6
|
-
# A class for working with YAML files.
|
|
7
|
-
#
|
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
10
|
-
# License:: GPL 3.0
|
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
|
12
|
-
#
|
|
13
|
-
|
|
14
|
-
require 'yaml'
|
|
15
|
-
require 'imw/files/text'
|
|
16
|
-
|
|
17
|
-
module IMW
|
|
18
|
-
module Files
|
|
19
|
-
|
|
20
|
-
class Yaml < IMW::Files::Text
|
|
21
|
-
|
|
22
|
-
def initialize uri, mode='r', options = {}
|
|
23
|
-
super uri, mode
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
# Return the contents of this YAML file.
|
|
27
|
-
#
|
|
28
|
-
# FIXME what to do if a block is passed in?
|
|
29
|
-
def load &block
|
|
30
|
-
YAML.load_file @path
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
# Dump +data+ to this file as YAML.
|
|
34
|
-
def dump data
|
|
35
|
-
super data.to_yaml
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
FILE_REGEXPS << [/\.yaml$/, IMW::Files::Yaml]
|
|
41
|
-
FILE_REGEXPS << [/\.yml$/, IMW::Files::Yaml]
|
|
42
|
-
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
# puts "#{File.basename(__FILE__)}: Yet another clever comment." # at bottobm
|
data/lib/imw/files.rb
DELETED
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
require 'uri'
|
|
2
|
-
require 'open-uri'
|
|
3
|
-
require 'imw/utils'
|
|
4
|
-
require 'imw/files/basicfile'
|
|
5
|
-
require 'imw/files/directory'
|
|
6
|
-
require 'imw/files/archive'
|
|
7
|
-
require 'imw/files/compressible'
|
|
8
|
-
require 'imw/files/compressed_file'
|
|
9
|
-
|
|
10
|
-
module IMW
|
|
11
|
-
|
|
12
|
-
# Parse +path+ and return an appropriate handler. Pass in <tt>:write
|
|
13
|
-
# => true</tt> to open for writing.
|
|
14
|
-
#
|
|
15
|
-
# IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv')
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
def self.open path, options = {}, &block
|
|
19
|
-
if File.directory?(File.expand_path(path))
|
|
20
|
-
dir = Files::Directory.new(path)
|
|
21
|
-
yield dir if block_given?
|
|
22
|
-
dir
|
|
23
|
-
else
|
|
24
|
-
mode = options[:write] ? 'w' : 'r'
|
|
25
|
-
file = Files.file_class_for(path, options).new(path, mode, options)
|
|
26
|
-
yield file if block_given?
|
|
27
|
-
file
|
|
28
|
-
end
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
def self.open! path, options = {}, &block
|
|
32
|
-
self.open path, options.reverse_merge(:write => true)
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
module Files
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
# There is certainly a cleaner way to do this.
|
|
39
|
-
autoload :Text, 'imw/files/text'
|
|
40
|
-
autoload :Binary, 'imw/files/binary'
|
|
41
|
-
autoload :Yaml, 'imw/files/yaml'
|
|
42
|
-
autoload :Csv, 'imw/files/csv'
|
|
43
|
-
autoload :Json, 'imw/files/json'
|
|
44
|
-
autoload :Bz2, 'imw/files/compressed_files_and_archives'
|
|
45
|
-
autoload :Gz, 'imw/files/compressed_files_and_archives'
|
|
46
|
-
autoload :Tar, 'imw/files/compressed_files_and_archives'
|
|
47
|
-
autoload :Tarbz2, 'imw/files/compressed_files_and_archives'
|
|
48
|
-
autoload :Targz, 'imw/files/compressed_files_and_archives'
|
|
49
|
-
autoload :Rar, 'imw/files/compressed_files_and_archives'
|
|
50
|
-
autoload :Zip, 'imw/files/compressed_files_and_archives'
|
|
51
|
-
autoload :Xml, 'imw/files/sgml'
|
|
52
|
-
autoload :Html, 'imw/files/sgml'
|
|
53
|
-
autoload :Excel, 'imw/files/excel'
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
# An array used to match files to classes to handle them. The
|
|
57
|
-
# first element of each array is the regexp and the second names
|
|
58
|
-
# the class to handle the file.
|
|
59
|
-
#
|
|
60
|
-
# IMW::Files::EXTENSION_HANDLERS << [ /\.csv$/, :csv ] #=> IMW::Files::Csv
|
|
61
|
-
# IMW::Files::EXTENSION_HANDLERS << [ /\.txt$/, "Text" ] #=> IMW::Files::Text
|
|
62
|
-
# IMW::Files::EXTENSION_HANDLERS << [ /\.myclass%/, MyClass ] #=> MyClass
|
|
63
|
-
#
|
|
64
|
-
# Elements at the end of the array have greater precedence which
|
|
65
|
-
# allows, say, <tt>.tar.gz</tt> to be handled differently from
|
|
66
|
-
# <tt>.gz</tt>.
|
|
67
|
-
EXTENSION_HANDLERS = [
|
|
68
|
-
[/\.txt$/, :text],
|
|
69
|
-
[/\.txt$/, :text],
|
|
70
|
-
[/\.dat$/, :text],
|
|
71
|
-
[/\.ascii$/, :text],
|
|
72
|
-
[/\.yaml$/, :yaml],
|
|
73
|
-
[/\.yml$/, :yaml],
|
|
74
|
-
[/\.csv$/, :csv],
|
|
75
|
-
[/\.tsv$/, :tsv],
|
|
76
|
-
[/\.json$/, :json],
|
|
77
|
-
[/\.bz2$/, :bz2],
|
|
78
|
-
[/\.gz$/, :gz],
|
|
79
|
-
[/\.tar\.bz2$/, :tarbz2],
|
|
80
|
-
[/\.tbz2$/, :tarbz2],
|
|
81
|
-
[/\.tar\.gz$/, :targz],
|
|
82
|
-
[/\.tgz$/, :targz],
|
|
83
|
-
[/\.tar$/, :tar],
|
|
84
|
-
[/\.rar$/, :rar],
|
|
85
|
-
[/\.zip$/, :zip],
|
|
86
|
-
[/\.xml$/, :xml],
|
|
87
|
-
[/\.html$/, :html],
|
|
88
|
-
[/\.htm$/, :html],
|
|
89
|
-
[/\.xlsx?$/, :excel]
|
|
90
|
-
]
|
|
91
|
-
|
|
92
|
-
SCHEME_HANDLERS = [
|
|
93
|
-
[/http/, :html]
|
|
94
|
-
]
|
|
95
|
-
|
|
96
|
-
protected
|
|
97
|
-
def self.file_class_for path, options = {}
|
|
98
|
-
klass = options.delete(:as)
|
|
99
|
-
|
|
100
|
-
# try to choose klass from path extension if not already set
|
|
101
|
-
unless klass
|
|
102
|
-
EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
|
|
103
|
-
next unless regexp =~ path
|
|
104
|
-
klass = thing
|
|
105
|
-
break
|
|
106
|
-
end
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
# try to choose klass from uri scheme if not already set
|
|
110
|
-
unless klass
|
|
111
|
-
scheme = URI.parse(path).scheme
|
|
112
|
-
SCHEME_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
|
|
113
|
-
next unless regexp =~ scheme
|
|
114
|
-
klass = thing
|
|
115
|
-
break
|
|
116
|
-
end
|
|
117
|
-
end
|
|
118
|
-
|
|
119
|
-
# just stick with text if still not set
|
|
120
|
-
klass = :text unless klass
|
|
121
|
-
|
|
122
|
-
klass.is_a?(Class) ? klass : class_eval(klass.to_s.downcase.capitalize)
|
|
123
|
-
end
|
|
124
|
-
end
|
|
125
|
-
end
|
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Packagers
|
|
3
|
-
|
|
4
|
-
# Packages an Array of input files into a single output archive.
|
|
5
|
-
# When the archive is extracted, all the input files given will be
|
|
6
|
-
# in a single directory with a chosen name. The path to the output
|
|
7
|
-
# archive determines both the name of the archive and its type (tar,
|
|
8
|
-
# tar.bz2, zip, &c.).
|
|
9
|
-
#
|
|
10
|
-
# If any of the input files are themselves archives, they will first
|
|
11
|
-
# be extracted, with only their contents winding up in the final
|
|
12
|
-
# directory (the file hierarchy of the archive will be preserved).
|
|
13
|
-
# If any of the input files are compressed, they will first be
|
|
14
|
-
# uncompressed before being added to the directory.
|
|
15
|
-
#
|
|
16
|
-
# Input files can be renamed by passing in a Hash instead of an
|
|
17
|
-
# Array. Each key in this hash is the path to an input file and its
|
|
18
|
-
# value is the new basename to give it. If the basename is +nil+
|
|
19
|
-
# then the original path's basename will be used.
|
|
20
|
-
class Archiver
|
|
21
|
-
|
|
22
|
-
attr_accessor :name, :inputs
|
|
23
|
-
|
|
24
|
-
def initialize name, inputs
|
|
25
|
-
@name = name
|
|
26
|
-
add_inputs inputs
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
# FIXME Instead of requiring +new_inputs+ to be either an Array
|
|
30
|
-
# or Hash just iterate through whatever it is using +each+ and
|
|
31
|
-
# see if the iterate can be interpreted as a mapping between
|
|
32
|
-
# strings.
|
|
33
|
-
def add_inputs new_inputs
|
|
34
|
-
@inputs ||= {}
|
|
35
|
-
if new_inputs.is_a?(Array)
|
|
36
|
-
new_inputs.each do |input|
|
|
37
|
-
@inputs[File.expand_path(input)] = File.basename(input)
|
|
38
|
-
end
|
|
39
|
-
else
|
|
40
|
-
new_inputs.each_pair do |input, basename|
|
|
41
|
-
@inputs[File.expand_path(input)] = (basename || File.basename(input))
|
|
42
|
-
end
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
def errors
|
|
47
|
-
@errors ||= []
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
def add_processing_error error
|
|
51
|
-
IMW.logger.warn error
|
|
52
|
-
errors << error
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def success?
|
|
56
|
-
errors.empty?
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
# A temporary directory to work in. Its contents will
|
|
60
|
-
# ultimately consist of a directory named for the package
|
|
61
|
-
# containing all the input files.
|
|
62
|
-
def tmp_dir
|
|
63
|
-
@tmp_dir ||= File.join(IMW.path_to(:tmp_root, 'packager'), (Time.now.to_i.to_s + "-" + $$.to_s)) # guaranteed unique on a node
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
def clean!
|
|
67
|
-
FileUtils.rm_rf(tmp_dir)
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
# A directory which will contain all the content being packaged,
|
|
71
|
-
# including the contents of any archives that were included in
|
|
72
|
-
# the list of files to process.
|
|
73
|
-
def dir
|
|
74
|
-
@dir ||= File.join(tmp_dir, name.to_s)
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
# FIXME This needs to be made idempotent -- calling prepare
|
|
78
|
-
# twice should not do any work the second time (unless the user
|
|
79
|
-
# is insistent and passes a :force option -- or maybe use bang
|
|
80
|
-
# and not-bang versions of the method for this distinction).
|
|
81
|
-
def prepare!
|
|
82
|
-
FileUtils.mkdir_p dir unless File.exist?(dir)
|
|
83
|
-
inputs.each_pair do |path, basename|
|
|
84
|
-
new_path = File.join(dir, basename)
|
|
85
|
-
file = IMW.open(path, :as => IMW::Files.file_class_for(basename)) # file's original path is meaningless: RackMultipart20091203-958-1nkgc61-0
|
|
86
|
-
case
|
|
87
|
-
when file.archive?
|
|
88
|
-
FileUtils.cd(dir) do
|
|
89
|
-
file.extract
|
|
90
|
-
end
|
|
91
|
-
when file.compressed?
|
|
92
|
-
file.cp(new_path).decompress!
|
|
93
|
-
else
|
|
94
|
-
file.cp(new_path)
|
|
95
|
-
end
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
# Package the contents of the temporary directory to an archive
|
|
100
|
-
# at +output+ but return exceptions instead of raising them.
|
|
101
|
-
def package output, options={}
|
|
102
|
-
begin
|
|
103
|
-
package! output, options={}
|
|
104
|
-
rescue RuntimeError => e
|
|
105
|
-
return e
|
|
106
|
-
end
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
# Package the contents of the temporary directory to an archive
|
|
110
|
-
# at +output+.
|
|
111
|
-
def package! output, options={}
|
|
112
|
-
output = IMW.open(output) if output.is_a?(String)
|
|
113
|
-
FileUtils.mkdir_p(output.dirname) unless File.exist?(output.dirname)
|
|
114
|
-
output.rm! if output.exist?
|
|
115
|
-
FileUtils.cd(tmp_dir) do
|
|
116
|
-
temp_output = IMW.open(output.basename)
|
|
117
|
-
packaged_output = temp_output.create(name.to_s + '/*').mv(output.path)
|
|
118
|
-
temp_output.rm if temp_output.exist?
|
|
119
|
-
add_processing_error "Archiver: couldn't create archive #{output.path}" unless output.exists?
|
|
120
|
-
end
|
|
121
|
-
output
|
|
122
|
-
end
|
|
123
|
-
end
|
|
124
|
-
end
|
|
125
|
-
end
|
|
126
|
-
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
require 'aws/s3'
|
|
2
|
-
module IMW
|
|
3
|
-
module Packagers
|
|
4
|
-
class S3Mover
|
|
5
|
-
|
|
6
|
-
attr_reader :last_response
|
|
7
|
-
attr_accessor :bucket_name
|
|
8
|
-
|
|
9
|
-
def initialize options={}
|
|
10
|
-
@bucket_name = options.delete(:bucket_name)
|
|
11
|
-
AWS::S3::Base.establish_connection!(options)
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
def success?
|
|
15
|
-
errors.empty?
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
def success?
|
|
19
|
-
last_response && last_response.response.class == Net::HTTPOK
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
def upload local_path, remote_path
|
|
23
|
-
begin
|
|
24
|
-
upload! local_path, remote_path
|
|
25
|
-
rescue RuntimeError => e
|
|
26
|
-
return e
|
|
27
|
-
end
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
def upload! local_path, remote_path
|
|
31
|
-
@last_response = AWS::S3::S3Object.store(remote_path, open(local_path), bucket_name)
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
end
|
|
35
|
-
end
|
|
36
|
-
end
|
data/lib/imw/packagers.rb
DELETED
data/lib/imw/utils/components.rb
DELETED
|
@@ -1,61 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# h2. lib/imw/utils/components.rb -- define separate components of IMW
|
|
3
|
-
#
|
|
4
|
-
# == About
|
|
5
|
-
#
|
|
6
|
-
# Defines a hash <tt>IMW::COMPONENTS</tt> which keys component names
|
|
7
|
-
# to the files to be required to implement each component and defines
|
|
8
|
-
# methods to load these files.
|
|
9
|
-
#
|
|
10
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
11
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
12
|
-
# License:: GPL 3.0
|
|
13
|
-
# Website:: http://infinitemonkeywrench.org/
|
|
14
|
-
#
|
|
15
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
|
16
|
-
|
|
17
|
-
require 'imw/utils/error'
|
|
18
|
-
|
|
19
|
-
module IMW
|
|
20
|
-
|
|
21
|
-
# Defines IMW components and the files required by each. Components
|
|
22
|
-
# can be accessed using <tt>IMW.load_components</tt> or
|
|
23
|
-
# <tt>IMW#imw_components</tt>.
|
|
24
|
-
COMPONENTS = {
|
|
25
|
-
:datamapper => ["imw/dataset/datamapper","imw/dataset/datamapper/time_and_user_stamps"],
|
|
26
|
-
:data_mapper => :datamapper,
|
|
27
|
-
:html_parser => "imw/parsers/html_parser",
|
|
28
|
-
:flat_file_parser => "imw/parsers/flat_file_parser",
|
|
29
|
-
:line_parser => "imw/parsers/line_parser",
|
|
30
|
-
:infochimps => ["imw/infochimps/infochimps_resource","imw/infochimps/icss"]
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
# Load components of IMW as needed,
|
|
34
|
-
#
|
|
35
|
-
# IMW.load_components :datamapper, :flat_file_parser
|
|
36
|
-
def self.load_components *args
|
|
37
|
-
args.each do |component_name|
|
|
38
|
-
begin
|
|
39
|
-
require component_name.to_s
|
|
40
|
-
rescue LoadError
|
|
41
|
-
component = IMW::COMPONENTS[component_name]
|
|
42
|
-
raise IMW::Error.new("#{component_name} is an invalid IMW component. See IMW::COMPONENTS.") unless component
|
|
43
|
-
if component.is_a? Array then
|
|
44
|
-
IMW.load_components *component
|
|
45
|
-
else
|
|
46
|
-
IMW.load_components component
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# Load components of IMW as needed,
|
|
53
|
-
#
|
|
54
|
-
# include IMW
|
|
55
|
-
# imw_components :datamapper, :flat_file_parser
|
|
56
|
-
def imw_components *args
|
|
57
|
-
IMW.load_components *args
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
end
|
|
61
|
-
|
data/lib/imw/utils/config.rb
DELETED
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# h2. lib/imw/utils/config.rb -- configuration parsing
|
|
3
|
-
#
|
|
4
|
-
# == About
|
|
5
|
-
#
|
|
6
|
-
# IMW looks for configuration settings in the following places, in
|
|
7
|
-
# order of increasing precedence:
|
|
8
|
-
#
|
|
9
|
-
# 1. Settings defined directly in this file.
|
|
10
|
-
#
|
|
11
|
-
# 2. From the <tt>etc/imwrc</tt> file in the IMW root directory.
|
|
12
|
-
#
|
|
13
|
-
# 3. From the <tt>.imwrc</tt> file in the user's home directory (the
|
|
14
|
-
# filename can be changed; see
|
|
15
|
-
# <tt>IMW::Config::USER_CONFIG_FILE_BASENAME</tt>).
|
|
16
|
-
#
|
|
17
|
-
# 4. From the file defined by the environment variable +IMWRC+ (the
|
|
18
|
-
# value can be changed; see
|
|
19
|
-
# <tt>IMW::Config::USER_CONFIG_FILE_ENV_VARIABLE</tt>
|
|
20
|
-
#
|
|
21
|
-
# Settings not found in one configuration location will be searched
|
|
22
|
-
# for in locations of lesser precedence.
|
|
23
|
-
#
|
|
24
|
-
# *Note:* configuration files are plain Ruby code that will be directly
|
|
25
|
-
# evaluated.
|
|
26
|
-
#
|
|
27
|
-
# Relevant settings include
|
|
28
|
-
#
|
|
29
|
-
# * interfaces with external programs (+tar+, +wget+, &c.)
|
|
30
|
-
# * paths to directories where IMW reads/writes files
|
|
31
|
-
# * correspondences between file extensions and IMW file classes
|
|
32
|
-
#
|
|
33
|
-
# For more detailed information, see the default configuration file,
|
|
34
|
-
# <tt>etc/imwrc</tt>.
|
|
35
|
-
#
|
|
36
|
-
#
|
|
37
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
38
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
39
|
-
# License:: GPL 3.0
|
|
40
|
-
# Website:: http://infinitemonkeywrench.org/
|
|
41
|
-
#
|
|
42
|
-
|
|
43
|
-
module IMW
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
# puts "#{File.basename(__FILE__)}: You carefully adjust the settings on your Monkeywrench. Glob-monsters: beware!!" # at bottom
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
# :nodoc:
|
|
2
|
-
# for when cattr_accessor is all you need
|
|
3
|
-
#
|
|
4
|
-
require 'active_support/core_ext/array/extract_options'
|
|
5
|
-
class Array #:nodoc:
|
|
6
|
-
include ActiveSupport::CoreExtensions::Array::ExtractOptions
|
|
7
|
-
end
|
|
8
|
-
require 'active_support/core_ext/class/attribute_accessors'
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
require 'imw/utils/extensions/string'
|
|
2
|
-
require 'imw/utils/extensions/array'
|
|
3
|
-
require 'imw/utils/extensions/hash'
|
|
4
|
-
require 'imw/utils/extensions/dir'
|
|
5
|
-
require 'imw/utils/extensions/struct'
|
|
6
|
-
require 'imw/utils/extensions/symbol'
|
|
7
|
-
require 'imw/utils/extensions/file_core'
|
|
8
|
-
require 'active_support/core_ext/module/aliasing'
|
|
9
|
-
require 'active_support/core_ext/object/blank'
|
|
10
|
-
require 'active_support/core_ext/object/misc'
|
|
11
|
-
#require 'active_support/core_ext/blank.rb'
|
|
12
|
-
require 'imw/utils/extensions/class/attribute_accessors'
|
|
13
|
-
# require 'ostruct'
|
|
14
|
-
require 'set'
|
|
15
|
-
|
|
16
|
-
module IMW
|
|
17
|
-
# A replacement for the standard system call which raises an
|
|
18
|
-
# IMW::SystemCallError if the command fails as well as printing the
|
|
19
|
-
# command appended to the end of <tt>error_message</tt>.
|
|
20
|
-
def self.system *commands
|
|
21
|
-
command = commands.flatten.join ' '
|
|
22
|
-
Kernel.system(command)
|
|
23
|
-
raise IMW::SystemCallError.new(command) unless $?.success?
|
|
24
|
-
end
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
#
|
|
2
|
-
# h2. lib/imw/utils/extensions/dir.rb -- directory extensions
|
|
3
|
-
#
|
|
4
|
-
# == About
|
|
5
|
-
#
|
|
6
|
-
# The Ruby +Dir+ module is rubbish. Time to clean it up a bit!
|
|
7
|
-
#
|
|
8
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
9
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
10
|
-
# License:: GPL 3.0
|
|
11
|
-
# Website:: http://infinitemonkeywrench.org/
|
|
12
|
-
#
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class Dir
|
|
16
|
-
|
|
17
|
-
# Return the absolute paths of files and directories in the
|
|
18
|
-
# directory, leaving out `.' and `..' entries.
|
|
19
|
-
def abs_contents
|
|
20
|
-
self.entries.map {|entry| File.join(self.path,entry) unless entry == '.' || entry == '..'}.compact
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
# puts "#{File.basename(__FILE__)}: You open the folder and see along list of names. Some have been crossed out -- ominously..." # at bottom
|