imw 0.2.18 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
data/lib/imw/runner.rb
DELETED
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
require 'imw'
|
|
2
|
-
require 'optparse'
|
|
3
|
-
|
|
4
|
-
module IMW
|
|
5
|
-
|
|
6
|
-
RunnerError = Class.new(IMW::Error)
|
|
7
|
-
|
|
8
|
-
class Runner
|
|
9
|
-
|
|
10
|
-
DEFAULT_OPTIONS = {
|
|
11
|
-
:requires => [],
|
|
12
|
-
:selectors => [],
|
|
13
|
-
:dry_run => false
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
attr_reader :args, :options
|
|
17
|
-
|
|
18
|
-
def initialize *args
|
|
19
|
-
@args = args
|
|
20
|
-
@options = DEFAULT_OPTIONS.dup
|
|
21
|
-
parser.parse!(args)
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
def parser
|
|
25
|
-
OptionParser.new do |opts|
|
|
26
|
-
opts.banner = "usage: imw [OPTIONS] TASK"
|
|
27
|
-
opts.separator <<EOF
|
|
28
|
-
|
|
29
|
-
Run TASK for all datasets in the repository. IMW will read any
|
|
30
|
-
*.imw files in the current directory by default.
|
|
31
|
-
|
|
32
|
-
Options include
|
|
33
|
-
|
|
34
|
-
EOF
|
|
35
|
-
|
|
36
|
-
opts.on('-v', '--verbose', "Print verbose output") do
|
|
37
|
-
IMW.verbose = true # class level, see IMW::Runner.verbose?
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
opts.on('-d', '--skip-dependencies', "Execute given tasks without invoking dependencies first") do
|
|
41
|
-
options[:execute] = true
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
opts.on('-l', '--list', "List datasets in repository") do
|
|
45
|
-
options[:list] = true
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
opts.on('-s', '--selector SELECTOR', "Filter datasets by regexp SELECTOR. Can be given more than once.") do |selector|
|
|
49
|
-
options[:selectors] << selector
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
opts.on('-r', '--require PATH', "Require PATH. Can be given more than once.") do |path|
|
|
53
|
-
options[:requires] << path
|
|
54
|
-
end
|
|
55
|
-
|
|
56
|
-
end
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
def require_files
|
|
60
|
-
Dir['*.imw'].each { |path| load File.expand_path(path) }
|
|
61
|
-
Dir['*.rb'].each { |path| require path.gsub(/\.rb$/,'') }
|
|
62
|
-
options[:requires].each do |path|
|
|
63
|
-
IMW.open(path) do |requireable|
|
|
64
|
-
if requireable.directory?
|
|
65
|
-
requireable["**/*.rb"].each { |file| require file }
|
|
66
|
-
requireable["**/*.imw"].each { |file| load file }
|
|
67
|
-
else
|
|
68
|
-
require requireable.path
|
|
69
|
-
end
|
|
70
|
-
end
|
|
71
|
-
end
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
def task
|
|
75
|
-
args.first
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
def handles
|
|
79
|
-
if options[:selectors].blank?
|
|
80
|
-
IMW.repository.keys.sort
|
|
81
|
-
else
|
|
82
|
-
IMW.repository.handles.map do |handle|
|
|
83
|
-
handle if options[:selectors].all? { |selector| handle.to_s =~ Regexp.new(selector) }
|
|
84
|
-
end.compact.sort
|
|
85
|
-
end
|
|
86
|
-
end
|
|
87
|
-
|
|
88
|
-
def datasets
|
|
89
|
-
handles.map { |handle| IMW.repository[handle] }
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
def list!
|
|
93
|
-
puts handles
|
|
94
|
-
exit
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
def run_task!
|
|
98
|
-
datasets.each do |dataset|
|
|
99
|
-
dataset[task].send(options[:execute] ? :execute : :invoke)
|
|
100
|
-
end
|
|
101
|
-
exit
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
def run!
|
|
105
|
-
require_files
|
|
106
|
-
case
|
|
107
|
-
when options[:list]
|
|
108
|
-
list!
|
|
109
|
-
when task.blank?
|
|
110
|
-
puts parser
|
|
111
|
-
exit 1
|
|
112
|
-
else
|
|
113
|
-
run_task!
|
|
114
|
-
end
|
|
115
|
-
end
|
|
116
|
-
end
|
|
117
|
-
end
|
|
118
|
-
|
data/lib/imw/schemes.rb
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Schemes
|
|
3
|
-
autoload :Local, 'imw/schemes/local'
|
|
4
|
-
autoload :Remote, 'imw/schemes/remote'
|
|
5
|
-
autoload :S3, 'imw/schemes/s3'
|
|
6
|
-
autoload :HTTP, 'imw/schemes/http'
|
|
7
|
-
autoload :HTTPS, 'imw/schemes/http'
|
|
8
|
-
autoload :HDFS, 'imw/schemes/hdfs'
|
|
9
|
-
autoload :SQL, 'imw/schemes/sql'
|
|
10
|
-
|
|
11
|
-
HANDLERS = [
|
|
12
|
-
["Schemes::Local::Base", Proc.new { |resource| resource.scheme == 'file' || resource.scheme.blank? } ],
|
|
13
|
-
["Schemes::Remote::Base", Proc.new { |resource| resource.scheme != 'file' && resource.scheme.present? } ],
|
|
14
|
-
["Schemes::S3", %r{^s3://}i ],
|
|
15
|
-
["Schemes::HTTP", %r{^http://}i ],
|
|
16
|
-
["Schemes::HTTPS", %r{^https://}i ],
|
|
17
|
-
["Schemes::HDFS", %r{^hdfs://}i ],
|
|
18
|
-
["Schemes::SQL::Base", %r{^\w+sql://}i ]
|
|
19
|
-
]
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
|
data/lib/imw/schemes/ftp.rb
DELETED
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Schemes
|
|
3
|
-
|
|
4
|
-
# Defines methods for reading and writing data from an FTP server.
|
|
5
|
-
#
|
|
6
|
-
# IMW.open('ftp://user:pass@my_bucket/path/to/some/file.csv')
|
|
7
|
-
#
|
|
8
|
-
# Learn more about {Amazon Web Services}[http://aws.amazon.com].
|
|
9
|
-
module FTP
|
|
10
|
-
|
|
11
|
-
module Base
|
|
12
|
-
|
|
13
|
-
# Is this resource an FTP resource?
|
|
14
|
-
#
|
|
15
|
-
# @return [true, false]
|
|
16
|
-
def on_ftp?
|
|
17
|
-
true
|
|
18
|
-
end
|
|
19
|
-
alias_method :is_ftp?, :on_ftp?
|
|
20
|
-
|
|
21
|
-
# Copy this resource to the +new_uri+.
|
|
22
|
-
#
|
|
23
|
-
# @param [String, IMW::Resource] new_uri
|
|
24
|
-
# @return [IMW::Resource] the new resource
|
|
25
|
-
def cp new_uri
|
|
26
|
-
local_obj = IMW.open(new_uri)
|
|
27
|
-
raise IMW::PathError.new("FTP resources (#{self}) can only be downloaded to a local path") unless local_obj.is_local?
|
|
28
|
-
local_obj.dir.should_exist!
|
|
29
|
-
FTP.open(host, user, password) do |ftp|
|
|
30
|
-
ftp.get(path, local_obj.path)
|
|
31
|
-
end
|
|
32
|
-
local_obj
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
# Does this resource exist on S3?
|
|
36
|
-
#
|
|
37
|
-
# @return [true, false]
|
|
38
|
-
def exist?
|
|
39
|
-
s3_object.exists?
|
|
40
|
-
end
|
|
41
|
-
alias_method :exists?, :exist?
|
|
42
|
-
|
|
43
|
-
# Remove this resource from S3.
|
|
44
|
-
#
|
|
45
|
-
# @return [IMW::Resource] the deleted object
|
|
46
|
-
def rm
|
|
47
|
-
s3_object.delete
|
|
48
|
-
end
|
|
49
|
-
alias_method :rm!, :rm
|
|
50
|
-
|
|
51
|
-
# Return the S3N URL for this S3 object
|
|
52
|
-
#
|
|
53
|
-
# resource = IMW.open('s3://my_bucket/path/to/some/obj')
|
|
54
|
-
# resource.s3n_url
|
|
55
|
-
# => 's3n://my_bucket/path/to/some/obj'
|
|
56
|
-
#
|
|
57
|
-
# @return [String]
|
|
58
|
-
def s3n_url
|
|
59
|
-
uri.to_s.gsub(/^s3:/, 's3n:')
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
# Return the contents of this S3 object.
|
|
63
|
-
#
|
|
64
|
-
# @return [String]
|
|
65
|
-
def read
|
|
66
|
-
s3_object.value
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
# Store +source+ into +destination+.
|
|
70
|
-
#
|
|
71
|
-
# @param [String, IMW::Resource, #io] source
|
|
72
|
-
# @param [String, IMW::Resource, #path, #bucket] destination
|
|
73
|
-
# @return [IMW::Resource] the new S3 object
|
|
74
|
-
def self.put source, destination
|
|
75
|
-
source = IMW.open(source)
|
|
76
|
-
destintation = IMW.open(destination)
|
|
77
|
-
raise IMW::ArgumentError.new("destination must be on S3 -- #{destination} given") unless destination.on_s3?
|
|
78
|
-
make_connection!
|
|
79
|
-
AWS::S3::S3Object.store(destination.path, source.io, destination.bucket)
|
|
80
|
-
destination
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
# Download +source+ from S3 into +destination+.
|
|
84
|
-
#
|
|
85
|
-
# @param [String, IMW::Resource, #path, #bucket] source
|
|
86
|
-
# @param [String, IMW::Resource, #write] destination
|
|
87
|
-
# @return [IMW::Resource] the new resource
|
|
88
|
-
def self.get source, destination
|
|
89
|
-
source = IMW.open(source)
|
|
90
|
-
destination = IMW.open!(destination)
|
|
91
|
-
raise IMW::ArgumentError.new("source must be on S3 -- #{source} given") unless source.on_s3?
|
|
92
|
-
make_connection!
|
|
93
|
-
AWS::S3::S3Object.stream(source.path, source.bucket) do |chunk|
|
|
94
|
-
destination.write(chunk)
|
|
95
|
-
end
|
|
96
|
-
destination.close
|
|
97
|
-
destination.reopen
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
# Copy S3 resource +source+ to +destination+.
|
|
101
|
-
#
|
|
102
|
-
# @param [String, IMW::Resource, #path, #bucket] source
|
|
103
|
-
# @param [String, IMW::Resource, #path, #bucket] destination
|
|
104
|
-
# @return [IMW::Resource] the new resource
|
|
105
|
-
def self.copy source, destination
|
|
106
|
-
source = IMW.open(source)
|
|
107
|
-
destination = IMW.open(destination)
|
|
108
|
-
raise IMW::PathError.new("Bucket names must be non-blank and match to 'copy'") unless source.bucket.present? && destination.bucket.present? && source.bucket == destination.bucket
|
|
109
|
-
make_connection!
|
|
110
|
-
AWS::S3::Object.copy(source.path, destination.path, destination.bucket)
|
|
111
|
-
destination
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
# Return the resource at the base path of this resource joined
|
|
115
|
-
# to +path+.
|
|
116
|
-
#
|
|
117
|
-
# IMW.open('s3://bucket/path/to/dir').join('subdir')
|
|
118
|
-
# #=> IMW::Resource at 's3://bucket/path/to/dir/subdir'
|
|
119
|
-
#
|
|
120
|
-
# @param [Array<String>] paths
|
|
121
|
-
# @return [IMW::Resource]
|
|
122
|
-
def join *paths
|
|
123
|
-
IMW.open(File.join(stripped_uri.to_s, *paths))
|
|
124
|
-
end
|
|
125
|
-
|
|
126
|
-
protected
|
|
127
|
-
# Make an S3 connection.
|
|
128
|
-
#
|
|
129
|
-
# Uses settings defined in IMW::AWS_CREDENTIALS.
|
|
130
|
-
#
|
|
131
|
-
# @return the result of AWS::S3::Base.establish_connection!, memoized in @connection
|
|
132
|
-
def self.make_connection!
|
|
133
|
-
return @connection if @connection
|
|
134
|
-
raise IMW::Error.new("Must define a constant IMW::AWS_CREDENTIALS with an :access_key_id and a :secret_access_key before using S3 resources") unless defined?(IMW::AWS_CREDENTIALS)
|
|
135
|
-
require 'aws/s3'
|
|
136
|
-
@connection = AWS::S3::Base.establish_connection!(IMW::AWS_CREDENTIALS)
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
end
|
|
140
|
-
end
|
|
141
|
-
end
|
|
142
|
-
|
data/lib/imw/schemes/hdfs.rb
DELETED
|
@@ -1,251 +0,0 @@
|
|
|
1
|
-
module IMW
|
|
2
|
-
module Schemes
|
|
3
|
-
|
|
4
|
-
# Defines methods for reading and writing data to/from an
|
|
5
|
-
# HDFS[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
|
|
6
|
-
#
|
|
7
|
-
# Learn more about Hadoop[http://hadoop.apache.org] and the
|
|
8
|
-
# {Hadoop Distributed
|
|
9
|
-
# Filesystem}[http://hadoop.apache.org/common/docs/current/hdfs_design.html].
|
|
10
|
-
module HDFS
|
|
11
|
-
|
|
12
|
-
# Checks to see if this is a file or directory
|
|
13
|
-
def self.extended obj
|
|
14
|
-
obj.extend(obj.is_directory? ? HDFSDirectory : HDFSFile)
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
# Is this resource an HDFS resource?
|
|
18
|
-
#
|
|
19
|
-
# @return [true, false]
|
|
20
|
-
def on_hdfs?
|
|
21
|
-
true
|
|
22
|
-
end
|
|
23
|
-
alias_method :is_hdfs?, :on_hdfs?
|
|
24
|
-
|
|
25
|
-
# Copy this resource to the +new_uri+.
|
|
26
|
-
#
|
|
27
|
-
# @param [String, IMW::Resource] new_uri
|
|
28
|
-
# @return [IMW::Resource] the new resource
|
|
29
|
-
def cp new_uri
|
|
30
|
-
IMW::Tools::Transferer.new(:cp, self, new_uri).transfer!
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
# Move this resource to the +new_uri+.
|
|
34
|
-
#
|
|
35
|
-
# @param [String, IMW::Resource] new_uri
|
|
36
|
-
# @return [IMW::Resource] the new resource
|
|
37
|
-
def mv new_uri
|
|
38
|
-
IMW::Tools::Transferer.new(:mv, self, new_uri).transfer!
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
# Delete this resource from the HDFS.
|
|
42
|
-
#
|
|
43
|
-
# @option options [true,false] :skip_trash
|
|
44
|
-
def rm options={}
|
|
45
|
-
should_exist!("Cannot delete.")
|
|
46
|
-
args = [:rm]
|
|
47
|
-
args << '-skipTrash' if options[:skip] || options[:skip_trash] || options[:skipTrash]
|
|
48
|
-
args << path
|
|
49
|
-
HDFS.fs(*args)
|
|
50
|
-
self
|
|
51
|
-
end
|
|
52
|
-
alias_method :rm!, :rm
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
# Does this path exist on the HDFS?
|
|
56
|
-
#
|
|
57
|
-
# @return [true, false]
|
|
58
|
-
def exist?
|
|
59
|
-
return @exist unless @exist.nil?
|
|
60
|
-
refresh!
|
|
61
|
-
@exist
|
|
62
|
-
end
|
|
63
|
-
alias_method :exists?, :exist?
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
# Return the size (in bytes) of this resource on the HDFS.
|
|
67
|
-
#
|
|
68
|
-
# This value is cached. Call +refresh!+ to refresh the cache
|
|
69
|
-
# manually.
|
|
70
|
-
#
|
|
71
|
-
# @return [Fixnum]
|
|
72
|
-
def size
|
|
73
|
-
return @size unless @size.nil?
|
|
74
|
-
refresh!
|
|
75
|
-
should_exist!("Cannot report size")
|
|
76
|
-
@size
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
# Return the number of directories contained at or below this
|
|
80
|
-
# path on the HDFS.
|
|
81
|
-
#
|
|
82
|
-
# This value is cached. Call +refresh!+ to refresh the cache
|
|
83
|
-
# manually.
|
|
84
|
-
#
|
|
85
|
-
# @return [Fixnum]
|
|
86
|
-
def num_dirs
|
|
87
|
-
return @num_dirs unless @num_dirs.nil?
|
|
88
|
-
refresh!
|
|
89
|
-
should_exist!("Cannot report number of directories.")
|
|
90
|
-
@num_dirs
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
# Return the number of files contained at or below this path
|
|
94
|
-
# on the HDFS.
|
|
95
|
-
#
|
|
96
|
-
# This value is cached. Call +refresh!+ to refresh the cache
|
|
97
|
-
# manually.
|
|
98
|
-
#
|
|
99
|
-
# @return [Fixnum]
|
|
100
|
-
def num_files
|
|
101
|
-
return @num_files unless @num_files.nil?
|
|
102
|
-
refresh!
|
|
103
|
-
should_exist!("Cannot report number of files.")
|
|
104
|
-
@num_files
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
# Is this resource an HDFS directory?
|
|
108
|
-
#
|
|
109
|
-
# @return [true, false]
|
|
110
|
-
def is_directory?
|
|
111
|
-
exist? && num_dirs > 0
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
# Refresh the cached file properties.
|
|
115
|
-
#
|
|
116
|
-
# @return [IMW::Resource] this resource
|
|
117
|
-
def refresh!
|
|
118
|
-
response = HDFS.fs(:count, path)
|
|
119
|
-
if response.blank? || response =~ /^Can not find listing for/
|
|
120
|
-
@exist = false
|
|
121
|
-
@num_dirs, @num_files, @size, @hdfs_path = false, false, false, false
|
|
122
|
-
else
|
|
123
|
-
@exist = true
|
|
124
|
-
parts = response.split
|
|
125
|
-
@num_dirs, @num_files, @size = parts[0..2].map(&:to_i)
|
|
126
|
-
@hdfs_path = parts.last
|
|
127
|
-
end
|
|
128
|
-
self
|
|
129
|
-
end
|
|
130
|
-
|
|
131
|
-
# Execute +command+ with +args+ on the Hadoop Distributed
|
|
132
|
-
# Filesystem (HDFS).
|
|
133
|
-
#
|
|
134
|
-
# If passed a block, yield each line of the output from the
|
|
135
|
-
# command, else just return the output.
|
|
136
|
-
#
|
|
137
|
-
# Try running `hadoop fs -help' for more information.
|
|
138
|
-
#
|
|
139
|
-
# @param [String, Symbol] command the command to run.
|
|
140
|
-
# @param [String, Symbol] args the arguments to pass the command
|
|
141
|
-
# @yield [String] each line of the command's output
|
|
142
|
-
# @return [String] the command's output
|
|
143
|
-
def self.fs command, *args
|
|
144
|
-
command_string = "#{executable} fs -#{command} #{args.compact.map(&:to_str).join(' ')}"
|
|
145
|
-
command_string += " 2>&1" if command == :count # FIXME or else it just spams the screen when we do HDFS#refresh!
|
|
146
|
-
output = `#{command_string}`.chomp
|
|
147
|
-
if block_given?
|
|
148
|
-
output.split("\n").each do |line|
|
|
149
|
-
yield line
|
|
150
|
-
end
|
|
151
|
-
else
|
|
152
|
-
output
|
|
153
|
-
end
|
|
154
|
-
end
|
|
155
|
-
|
|
156
|
-
protected
|
|
157
|
-
# Returns the path to the Hadoop executable.
|
|
158
|
-
#
|
|
159
|
-
# @return [String]
|
|
160
|
-
def self.executable
|
|
161
|
-
@executable ||= begin
|
|
162
|
-
string = `which hadoop`.chomp
|
|
163
|
-
raise IMW::Error.new("Could not find hadoop command. Is Hadoop installed?") if string.blank?
|
|
164
|
-
string
|
|
165
|
-
end
|
|
166
|
-
end
|
|
167
|
-
end
|
|
168
|
-
|
|
169
|
-
# Defines methods for reading data from HDFS files.
|
|
170
|
-
module HDFSFile
|
|
171
|
-
|
|
172
|
-
# Return the contents of this HDFS file as a string.
|
|
173
|
-
#
|
|
174
|
-
# Be VERY careful how you use this!
|
|
175
|
-
#
|
|
176
|
-
# @return [String]
|
|
177
|
-
def read
|
|
178
|
-
HDFS.fs(:cat, path)
|
|
179
|
-
end
|
|
180
|
-
|
|
181
|
-
# Iterate through each line of this HDFS resource.
|
|
182
|
-
#
|
|
183
|
-
# @yield [String] each line of the file
|
|
184
|
-
def each &block
|
|
185
|
-
HDFS.fs(:cat, path, &block)
|
|
186
|
-
end
|
|
187
|
-
|
|
188
|
-
# Return a handle on a StringIO object representing the
|
|
189
|
-
# content in this HDFS file.
|
|
190
|
-
#
|
|
191
|
-
# Be VERY careful how you use this! It is a StringIO object
|
|
192
|
-
# so the whole HDFS file is read into a string before
|
|
193
|
-
# returning the handle.
|
|
194
|
-
#
|
|
195
|
-
# @return [StringIO]
|
|
196
|
-
def io
|
|
197
|
-
@io ||= StringIO.new(read)
|
|
198
|
-
end
|
|
199
|
-
|
|
200
|
-
# Map over the lines of this HDFS resource.
|
|
201
|
-
#
|
|
202
|
-
# @yield [String] each line of the file
|
|
203
|
-
# @return [Array] the result of the block on each line
|
|
204
|
-
def map &block
|
|
205
|
-
[].tap do |output|
|
|
206
|
-
HDFS.fs(:cat, path) do |line|
|
|
207
|
-
output << block.call(line)
|
|
208
|
-
end
|
|
209
|
-
end
|
|
210
|
-
end
|
|
211
|
-
|
|
212
|
-
end
|
|
213
|
-
|
|
214
|
-
# Defines methods for listing contents of HDFS directories.
|
|
215
|
-
module HDFSDirectory
|
|
216
|
-
|
|
217
|
-
# Return the paths of all files and directories directly below
|
|
218
|
-
# this directory on the HDFS.
|
|
219
|
-
#
|
|
220
|
-
# @return [Array<String>]
|
|
221
|
-
def contents
|
|
222
|
-
[].tap do |paths|
|
|
223
|
-
HDFS.fs(:ls, path) do |line|
|
|
224
|
-
next if line =~ /^Found.*items$/
|
|
225
|
-
paths << line.split.last
|
|
226
|
-
end
|
|
227
|
-
end
|
|
228
|
-
end
|
|
229
|
-
|
|
230
|
-
# Return the resources directly below this directory on the
|
|
231
|
-
# HDFS.
|
|
232
|
-
#
|
|
233
|
-
# @return [Array<IMW::Resource>]
|
|
234
|
-
def resources
|
|
235
|
-
contents.map { |path| IMW.open(path) }
|
|
236
|
-
end
|
|
237
|
-
|
|
238
|
-
# Return the resource at the base path of this resource joined
|
|
239
|
-
# to +path+.
|
|
240
|
-
#
|
|
241
|
-
# IMW.open('hdfs:///path/to/dir').join('subdir')
|
|
242
|
-
# #=> IMW::Resource at 'hdfs:///path/to/dir/subdir'
|
|
243
|
-
#
|
|
244
|
-
# @param [Array<String>] paths
|
|
245
|
-
# @return [IMW::Resource]
|
|
246
|
-
def join *paths
|
|
247
|
-
IMW.open(File.join(stripped_uri.to_s, *paths))
|
|
248
|
-
end
|
|
249
|
-
end
|
|
250
|
-
end
|
|
251
|
-
end
|