masamune 0.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +54 -0
- data/Rakefile +15 -0
- data/bin/masamune-elastic-mapreduce +4 -0
- data/bin/masamune-hive +4 -0
- data/bin/masamune-psql +4 -0
- data/bin/masamune-shell +4 -0
- data/lib/masamune.rb +56 -0
- data/lib/masamune/accumulate.rb +60 -0
- data/lib/masamune/actions.rb +38 -0
- data/lib/masamune/actions/data_flow.rb +131 -0
- data/lib/masamune/actions/date_parse.rb +75 -0
- data/lib/masamune/actions/elastic_mapreduce.rb +68 -0
- data/lib/masamune/actions/execute.rb +52 -0
- data/lib/masamune/actions/filesystem.rb +37 -0
- data/lib/masamune/actions/hadoop_filesystem.rb +40 -0
- data/lib/masamune/actions/hadoop_streaming.rb +41 -0
- data/lib/masamune/actions/hive.rb +74 -0
- data/lib/masamune/actions/postgres.rb +76 -0
- data/lib/masamune/actions/postgres_admin.rb +34 -0
- data/lib/masamune/actions/s3cmd.rb +44 -0
- data/lib/masamune/actions/transform.rb +89 -0
- data/lib/masamune/after_initialize_callbacks.rb +55 -0
- data/lib/masamune/cached_filesystem.rb +110 -0
- data/lib/masamune/commands.rb +37 -0
- data/lib/masamune/commands/elastic_mapreduce.rb +119 -0
- data/lib/masamune/commands/hadoop_filesystem.rb +57 -0
- data/lib/masamune/commands/hadoop_streaming.rb +116 -0
- data/lib/masamune/commands/hive.rb +178 -0
- data/lib/masamune/commands/interactive.rb +37 -0
- data/lib/masamune/commands/postgres.rb +128 -0
- data/lib/masamune/commands/postgres_admin.rb +72 -0
- data/lib/masamune/commands/postgres_common.rb +33 -0
- data/lib/masamune/commands/retry_with_backoff.rb +60 -0
- data/lib/masamune/commands/s3cmd.rb +70 -0
- data/lib/masamune/commands/shell.rb +202 -0
- data/lib/masamune/configuration.rb +195 -0
- data/lib/masamune/data_plan.rb +31 -0
- data/lib/masamune/data_plan/builder.rb +66 -0
- data/lib/masamune/data_plan/elem.rb +190 -0
- data/lib/masamune/data_plan/engine.rb +162 -0
- data/lib/masamune/data_plan/rule.rb +292 -0
- data/lib/masamune/data_plan/set.rb +176 -0
- data/lib/masamune/environment.rb +164 -0
- data/lib/masamune/filesystem.rb +567 -0
- data/lib/masamune/has_environment.rb +40 -0
- data/lib/masamune/helpers.rb +27 -0
- data/lib/masamune/helpers/postgres.rb +84 -0
- data/lib/masamune/io.rb +33 -0
- data/lib/masamune/last_element.rb +53 -0
- data/lib/masamune/method_logger.rb +41 -0
- data/lib/masamune/multi_io.rb +39 -0
- data/lib/masamune/schema.rb +36 -0
- data/lib/masamune/schema/catalog.rb +233 -0
- data/lib/masamune/schema/column.rb +527 -0
- data/lib/masamune/schema/dimension.rb +133 -0
- data/lib/masamune/schema/event.rb +121 -0
- data/lib/masamune/schema/fact.rb +133 -0
- data/lib/masamune/schema/map.rb +265 -0
- data/lib/masamune/schema/row.rb +133 -0
- data/lib/masamune/schema/store.rb +115 -0
- data/lib/masamune/schema/table.rb +308 -0
- data/lib/masamune/schema/table_reference.rb +76 -0
- data/lib/masamune/spec_helper.rb +23 -0
- data/lib/masamune/string_format.rb +34 -0
- data/lib/masamune/tasks/elastic_mapreduce_thor.rb +60 -0
- data/lib/masamune/tasks/hive_thor.rb +55 -0
- data/lib/masamune/tasks/postgres_thor.rb +47 -0
- data/lib/masamune/tasks/shell_thor.rb +63 -0
- data/lib/masamune/template.rb +77 -0
- data/lib/masamune/thor.rb +186 -0
- data/lib/masamune/thor_loader.rb +38 -0
- data/lib/masamune/topological_hash.rb +34 -0
- data/lib/masamune/transform.rb +47 -0
- data/lib/masamune/transform/bulk_upsert.psql.erb +64 -0
- data/lib/masamune/transform/bulk_upsert.rb +52 -0
- data/lib/masamune/transform/consolidate_dimension.rb +54 -0
- data/lib/masamune/transform/deduplicate_dimension.psql.erb +52 -0
- data/lib/masamune/transform/deduplicate_dimension.rb +53 -0
- data/lib/masamune/transform/define_event_view.hql.erb +51 -0
- data/lib/masamune/transform/define_event_view.rb +60 -0
- data/lib/masamune/transform/define_index.psql.erb +34 -0
- data/lib/masamune/transform/define_schema.hql.erb +23 -0
- data/lib/masamune/transform/define_schema.psql.erb +79 -0
- data/lib/masamune/transform/define_schema.rb +56 -0
- data/lib/masamune/transform/define_table.hql.erb +34 -0
- data/lib/masamune/transform/define_table.psql.erb +95 -0
- data/lib/masamune/transform/define_table.rb +40 -0
- data/lib/masamune/transform/define_unique.psql.erb +30 -0
- data/lib/masamune/transform/insert_reference_values.psql.erb +43 -0
- data/lib/masamune/transform/insert_reference_values.rb +64 -0
- data/lib/masamune/transform/load_dimension.rb +47 -0
- data/lib/masamune/transform/load_fact.rb +45 -0
- data/lib/masamune/transform/operator.rb +96 -0
- data/lib/masamune/transform/relabel_dimension.psql.erb +76 -0
- data/lib/masamune/transform/relabel_dimension.rb +39 -0
- data/lib/masamune/transform/rollup_fact.psql.erb +79 -0
- data/lib/masamune/transform/rollup_fact.rb +149 -0
- data/lib/masamune/transform/snapshot_dimension.psql.erb +75 -0
- data/lib/masamune/transform/snapshot_dimension.rb +74 -0
- data/lib/masamune/transform/stage_dimension.psql.erb +39 -0
- data/lib/masamune/transform/stage_dimension.rb +83 -0
- data/lib/masamune/transform/stage_fact.psql.erb +80 -0
- data/lib/masamune/transform/stage_fact.rb +111 -0
- data/lib/masamune/version.rb +25 -0
- data/spec/fixtures/aggregate.sql.erb +25 -0
- data/spec/fixtures/comment.sql.erb +27 -0
- data/spec/fixtures/invalid.sql.erb +23 -0
- data/spec/fixtures/relative.sql.erb +23 -0
- data/spec/fixtures/simple.sql.erb +28 -0
- data/spec/fixtures/whitespace.sql.erb +30 -0
- data/spec/masamune/actions/elastic_mapreduce_spec.rb +108 -0
- data/spec/masamune/actions/execute_spec.rb +50 -0
- data/spec/masamune/actions/hadoop_filesystem_spec.rb +44 -0
- data/spec/masamune/actions/hadoop_streaming_spec.rb +74 -0
- data/spec/masamune/actions/hive_spec.rb +117 -0
- data/spec/masamune/actions/postgres_admin_spec.rb +58 -0
- data/spec/masamune/actions/postgres_spec.rb +134 -0
- data/spec/masamune/actions/s3cmd_spec.rb +44 -0
- data/spec/masamune/actions/transform_spec.rb +144 -0
- data/spec/masamune/after_initialization_callbacks_spec.rb +61 -0
- data/spec/masamune/cached_filesystem_spec.rb +167 -0
- data/spec/masamune/commands/hadoop_filesystem_spec.rb +50 -0
- data/spec/masamune/commands/hadoop_streaming_spec.rb +106 -0
- data/spec/masamune/commands/hive_spec.rb +117 -0
- data/spec/masamune/commands/postgres_admin_spec.rb +69 -0
- data/spec/masamune/commands/postgres_spec.rb +100 -0
- data/spec/masamune/commands/retry_with_backoff_spec.rb +116 -0
- data/spec/masamune/commands/s3cmd_spec.rb +50 -0
- data/spec/masamune/commands/shell_spec.rb +101 -0
- data/spec/masamune/configuration_spec.rb +102 -0
- data/spec/masamune/data_plan/builder_spec.rb +91 -0
- data/spec/masamune/data_plan/elem_spec.rb +102 -0
- data/spec/masamune/data_plan/engine_spec.rb +356 -0
- data/spec/masamune/data_plan/rule_spec.rb +407 -0
- data/spec/masamune/data_plan/set_spec.rb +517 -0
- data/spec/masamune/environment_spec.rb +65 -0
- data/spec/masamune/filesystem_spec.rb +1421 -0
- data/spec/masamune/helpers/postgres_spec.rb +95 -0
- data/spec/masamune/schema/catalog_spec.rb +613 -0
- data/spec/masamune/schema/column_spec.rb +696 -0
- data/spec/masamune/schema/dimension_spec.rb +137 -0
- data/spec/masamune/schema/event_spec.rb +75 -0
- data/spec/masamune/schema/fact_spec.rb +117 -0
- data/spec/masamune/schema/map_spec.rb +593 -0
- data/spec/masamune/schema/row_spec.rb +28 -0
- data/spec/masamune/schema/store_spec.rb +49 -0
- data/spec/masamune/schema/table_spec.rb +395 -0
- data/spec/masamune/string_format_spec.rb +60 -0
- data/spec/masamune/tasks/elastic_mapreduce_thor_spec.rb +57 -0
- data/spec/masamune/tasks/hive_thor_spec.rb +75 -0
- data/spec/masamune/tasks/postgres_thor_spec.rb +42 -0
- data/spec/masamune/tasks/shell_thor_spec.rb +51 -0
- data/spec/masamune/template_spec.rb +77 -0
- data/spec/masamune/thor_spec.rb +238 -0
- data/spec/masamune/transform/bulk_upsert.dimension_spec.rb +200 -0
- data/spec/masamune/transform/consolidate_dimension_spec.rb +62 -0
- data/spec/masamune/transform/deduplicate_dimension_spec.rb +84 -0
- data/spec/masamune/transform/define_event_view_spec.rb +84 -0
- data/spec/masamune/transform/define_schema_spec.rb +83 -0
- data/spec/masamune/transform/define_table.dimension_spec.rb +306 -0
- data/spec/masamune/transform/define_table.fact_spec.rb +291 -0
- data/spec/masamune/transform/define_table.table_spec.rb +525 -0
- data/spec/masamune/transform/insert_reference_values.dimension_spec.rb +111 -0
- data/spec/masamune/transform/insert_reference_values.fact_spec.rb +149 -0
- data/spec/masamune/transform/load_dimension_spec.rb +76 -0
- data/spec/masamune/transform/load_fact_spec.rb +89 -0
- data/spec/masamune/transform/relabel_dimension_spec.rb +102 -0
- data/spec/masamune/transform/rollup_fact_spec.rb +333 -0
- data/spec/masamune/transform/snapshot_dimension_spec.rb +103 -0
- data/spec/masamune/transform/stage_dimension_spec.rb +115 -0
- data/spec/masamune/transform/stage_fact_spec.rb +204 -0
- data/spec/masamune_spec.rb +32 -0
- data/spec/spec_helper.rb +41 -0
- data/spec/support/masamune/example_group.rb +36 -0
- data/spec/support/masamune/mock_command.rb +99 -0
- data/spec/support/masamune/mock_delegate.rb +51 -0
- data/spec/support/masamune/mock_filesystem.rb +96 -0
- data/spec/support/masamune/thor_mute.rb +35 -0
- data/spec/support/rspec/example/action_example_group.rb +34 -0
- data/spec/support/rspec/example/task_example_group.rb +80 -0
- data/spec/support/rspec/example/transform_example_group.rb +36 -0
- data/spec/support/shared_examples/postgres_common_examples.rb +53 -0
- metadata +462 -0
# The MIT License (MIT)
#
# Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

require 'thread'
require 'tmpdir'
require 'logger'
# FIX: FileUtils is used by #symlink_latest_log but was previously only
# loaded transitively through 'tmpdir'; require it explicitly.
require 'fileutils'

require 'masamune/version'
require 'masamune/multi_io'

module Masamune
  # Shared runtime context for masamune actions, commands and tasks.
  # Owns the configuration, the filesystem path registry, logging
  # (with a "latest" symlink convenience) and a simple file-based
  # exclusive lock for cross-process coordination.
  class Environment
    attr_accessor :parent
    attr_accessor :filesystem
    attr_accessor :catalog

    # @param parent [Object, nil] optional enclosing environment/context
    def initialize(parent = nil)
      self.parent = parent
    end

    # @return [String] human-readable version banner, e.g. "masamune 0.11.0"
    def version
      "masamune #{Masamune::VERSION}"
    end

    # Yields the lazily-built configuration for block-style setup.
    def configure
      yield configuration
    end

    # @return [Masamune::Configuration] memoized configuration bound to self
    def configuration
      @configuration ||= Masamune::Configuration.new(self)
    end

    # @return [Mutex] memoized mutex guarding console/trace output
    def mutex
      @mutex ||= Mutex.new
    end

    # Runs the given block while holding an exclusive, non-blocking flock
    # on a lock file under :run_dir. If the lock is already held elsewhere
    # the block is skipped and an error is logged (no exception is raised).
    #
    # @param name [String] logical lock name; combined with configuration.lock
    # @raise [RuntimeError] when the :run_dir filesystem path is not defined
    def with_exclusive_lock(name, &block)
      raise 'filesystem path :run_dir not defined' unless filesystem.has_path?(:run_dir)
      lock_name = [name, configuration.lock].compact.join(':')
      logger.debug("acquiring lock '#{lock_name}'")
      # NOTE: RHS with parentheses is the private #lock_file method; the
      # local variable then shadows it for the ensure clause below.
      lock_file = lock_file(lock_name)
      # LOCK_NB makes flock return false immediately instead of blocking;
      # 0 signals the lock was acquired.
      lock_status = lock_file.flock(File::LOCK_EX | File::LOCK_NB)
      if lock_status == 0
        yield if block_given?
      else
        logger.error "acquire lock attempt failed for '#{lock_name}'"
      end
    ensure
      if lock_file
        logger.debug("releasing lock '#{lock_name}'")
        lock_file.flock(File::LOCK_UN)
      end
    end

    # Template for new log file names. Intentionally `||` rather than `||=`:
    # when unset, each call produces a fresh timestamp/pid-based name
    # (#log_file_name memoizes the final resolved path).
    def log_file_template
      @log_file_template || "#{Time.now.to_i}-#{$$}.log"
    end

    # Overrides the log file template and resets the logger so the next
    # access re-opens a log file with the new name.
    def log_file_template=(log_file_template)
      @log_file_template = log_file_template
      reload_logger!
    end

    # Drops the memoized logger and log file name; they are rebuilt lazily.
    def reload_logger!
      @logger = @log_file_name = nil
    end

    # @return [String] memoized absolute path of the current log file
    def log_file_name
      @log_file_name ||= filesystem.get_path(:log_dir, log_file_template)
    end

    # @return [Logger] memoized logger; refreshes the :log_dir/latest symlink
    #   on first construction
    def logger
      @logger ||= Logger.new(log_file_io).tap do
        symlink_latest_log
      end
    end

    # Logs the joined message and echoes it to $stdout unless quiet/debug.
    # Serialized through #mutex so multi-threaded output does not interleave.
    def console(*a)
      line = a.join(' ').chomp
      mutex.synchronize do
        logger.info(line)
        $stdout.puts line unless configuration.quiet || configuration.debug
        $stdout.flush
        $stderr.flush
      end
    end

    # Logs the joined message and echoes it to $stdout only in verbose
    # (non-debug) mode. Serialized through #mutex like #console.
    def trace(*a)
      line = a.join(' ').chomp
      mutex.synchronize do
        logger.info(line)
        $stdout.puts line if configuration.verbose && !configuration.debug
        $stdout.flush
        $stderr.flush
      end
    end

    # @return [Masamune::MethodLogger] memoized filesystem facade rooted at
    #   the gem directory; destructive operations are wrapped for logging
    def filesystem
      @filesystem ||= begin
        filesystem = Masamune::Filesystem.new
        filesystem.add_path :root_dir, File.expand_path('../../../', __FILE__)
        filesystem = Masamune::MethodLogger.new(filesystem, :copy_file_to_file, :copy_file_to_dir, :remove_dir, :move_file_to_file, :move_file_to_dir, :move_dir)
        filesystem
      end
    end

    # @return [Masamune::Schema::Catalog] memoized schema catalog
    def catalog
      @catalog ||= Masamune::Schema::Catalog.new(self)
    end

    # @return [Masamune::Helpers::Hive] memoized hive helper
    def hive_helper
      @hive_helper ||= Masamune::Helpers::Hive.new(self)
    end

    # @return [Masamune::Helpers::Postgres] memoized postgres helper
    def postgres_helper
      @postgres_helper ||= Masamune::Helpers::Postgres.new(self)
    end

    private

    # Opens (creating if necessary) the lock file for the given name under
    # :run_dir. The handle is only used for flock, never written to.
    def lock_file(name)
      path = filesystem.get_path(:run_dir, "#{name}.lock")
      File.open(path, File::CREAT, 0644)
    end

    # IO target for the logger: an append-mode file under :log_dir (teed to
    # $stderr in debug mode), or $stderr/nil when no :log_dir is defined.
    def log_file_io
      if filesystem.has_path?(:log_dir)
        log_file = File.open(log_file_name, 'a')
        log_file.sync = true
        configuration.debug ? Masamune::MultiIO.new($stderr, log_file) : log_file
      else
        configuration.debug ? $stderr : nil
      end
    end

    # Points :log_dir/latest at the current log file. Best effort: any
    # failure is logged and swallowed so logging setup never aborts the run.
    def symlink_latest_log
      return unless filesystem.has_path?(:log_dir)
      latest = filesystem.path(:log_dir, 'latest')
      # FIX: File.exists? is deprecated (removed in Ruby 3.2) -> File.exist?.
      # Also check File.symlink?: a dangling symlink makes File.exist? return
      # false (it follows the link), which previously left the stale link in
      # place and made ln_s raise.
      FileUtils.rm(latest) if File.symlink?(latest) || File.exist?(latest)
      FileUtils.ln_s(log_file_name, latest)
    rescue => e
      logger.error(e)
    end
  end
end
# The MIT License (MIT)
#
# Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# FIX: require the stdlib classes this file uses directly instead of relying
# on transitive loads (Tempfile, Set, OpenStruct, SecureRandom, Etc, StringIO,
# FileUtils, Time.parse).
require 'tempfile'
require 'set'
require 'ostruct'
require 'securerandom'
require 'etc'
require 'stringio'
require 'fileutils'
require 'time'

require 'masamune/has_environment'

module Masamune
  # Unified filesystem abstraction over local files, HDFS and S3.
  # Operations dispatch on the path scheme: `hdfs://` and `file://` go
  # through `hadoop fs`, `s3://`/`s3n://` through `s3cmd`, and everything
  # else uses FileUtils. Also maintains a registry of named, parameterized
  # paths (:log_dir, :run_dir, ...) and a set of immutable path prefixes
  # that destructive operations refuse to touch.
  class Filesystem
    include Masamune::HasEnvironment
    include Masamune::Accumulate
    include Masamune::Actions::S3Cmd
    include Masamune::Actions::HadoopFilesystem

    # Effective mode for files we create, honoring the process umask.
    FILE_MODE = 0777 - File.umask

    def initialize
      @paths = {}
      @immutable_paths = {}
    end

    def clear!
      # Intentionally unimplemented
    end

    # Registers a named path. Options (string or symbol keys):
    #   :mkdir     - create the directory immediately
    #   :immutable - protect the path from destructive operations
    # @return [self] for chaining
    def add_path(symbol, path, options = {})
      options ||= {}
      options.symbolize_keys!
      eager_path = eager_load_path path
      @paths[symbol.to_sym] = [eager_path, options]
      mkdir!(eager_path) if options[:mkdir]
      add_immutable_path(eager_path) if options[:immutable]
      self
    end

    # Resolves a named path, joining any extra components and expanding
    # %param placeholders. Returns a lazy Proc until any non-:root_dir path
    # has been registered, so paths defined later can still resolve.
    # @raise [RuntimeError] when the symbol was never registered
    def get_path(symbol, *extra)
      lazy_path = lambda do |fs|
        fs.has_path?(symbol) or raise "Path :#{symbol} not defined"
        path, options = fs.paths[symbol]

        mkdir!(path) if options[:mkdir]
        expand_params(fs, extra.any? ? File.join(path, extra) : path)
      end

      if eager_load_paths?
        eager_load_path lazy_path.call(self)
      else
        lazy_path
      end
    end
    alias :path :get_path

    # @return [Boolean] whether the named path has been registered
    def has_path?(symbol)
      @paths.has_key?(symbol)
    end

    # @return [Hash] registered paths, symbol => [path, options]
    def paths
      @paths
    end

    # Substitutes %key placeholders in path with configuration params.
    def expand_params(fs, path)
      new_path = path.dup
      fs.environment.configuration.params.each do |key, value|
        new_path.gsub!("%#{key.to_s}", value.to_s)
      end
      new_path
    end

    # A path is relative when it has no remote scheme and no leading slash.
    def relative_path?(path)
      return false if remote_prefix(path)
      path[0] != '/'
    end

    # Yields every ancestor directory of path, shallowest first, preserving
    # any remote scheme prefix. Stops before yielding path itself.
    # Accumulating variant (no block) returns them as an Array.
    def parent_paths(path, &block)
      if prefix = remote_prefix(path)
        node = path.split(prefix).last
      else
        prefix = ''
        node = path
      end

      return if prefix.blank? && node.blank?
      parent_paths = node ? File.expand_path(node, '/').split('/') : []
      parent_paths.reject! { |x| x.blank? }
      parent_paths.prepend('/') if node =~ %r{\A/}
      tmp = []
      parent_paths.each do |part|
        tmp << part
        current_path = prefix + File.join(tmp)
        break if current_path == path
        yield current_path
      end
    end
    method_accumulate :parent_paths

    # @return [Boolean] true when path has no parents (e.g. '/' or 's3://bucket')
    # @raise [ArgumentError] for nil, blank or relative paths
    def root_path?(path)
      raise ArgumentError, 'path cannot be nil' if path.nil?
      raise ArgumentError, 'path cannot be blank' if path.blank?
      raise ArgumentError, 'path cannot be relative' if relative_path?(path)
      parent_paths(path).length < 1
    end

    # @return [String, nil] the first local path that exists and is a regular file
    def resolve_file(paths = [])
      # FIX: File.exists? is deprecated (removed in Ruby 3.2) -> File.exist?
      Array.wrap(paths).select { |path| File.exist?(path) && File.file?(path) }.first
    end

    # Parent directory of path (scheme-aware); falls back to path itself
    # when it has no parents.
    def dirname(path)
      parent_paths(path).last || path
    end

    # Last path component, with any remote scheme prefix stripped;
    # nil for nil/blank input.
    def basename(path)
      return unless path
      node = remote_prefix(path) ? path.split(remote_prefix(path)).last : path
      return if node.nil? || node.blank?
      node.split('/').last
    end

    # Creates empty files, dispatching per scheme. Parent directories are
    # created first except on S3, where prefixes need no directories.
    def touch!(*files)
      files.uniq!
      files.group_by { |path| type(path) }.each do |type, file_set|
        mkdir!(*file_set.map { |file| File.dirname(file) }) unless type == :s3
        case type
        when :hdfs
          hadoop_fs('-touchz', *file_set)
        when :s3
          # S3 has no touch; upload an empty temp file per key.
          empty = Tempfile.new('masamune')
          file_set.each do |file|
            s3cmd('put', empty.path, s3b(file, dir: false))
          end
        when :local
          FileUtils.touch(file_set, file_util_args)
          FileUtils.chmod(FILE_MODE, file_set, file_util_args)
        end
      end
    end

    # @return [Boolean] whether the file exists in its backing store
    def exists?(file)
      case type(file)
      when :hdfs
        hadoop_fs('-test', '-e', file, safe: true).success?
      when :s3
        # An `s3cmd ls` listing may include siblings; collect exact-name
        # matches and report whether any listed entry was the file itself.
        result = Set.new
        s3cmd('ls', s3b(file), safe: true) do |line|
          date, time, size, name = line.split(/\s+/)
          result << (name == file)
        end
        result.any?
      when :local
        # FIX: File.exists? is deprecated (removed in Ruby 3.2) -> File.exist?
        File.exist?(file)
      end
    end

    # Yields OpenStruct(name:, mtime:, size:) for every entry matching the
    # glob pattern, recursively. mtime is truncated to the minute in UTC so
    # listings from different backends compare consistently.
    # Accumulating variant (no block) returns them as an Array.
    def glob_stat(pattern, &block)
      case type(pattern)
      when :hdfs
        hadoop_fs('-ls', '-R', pattern, safe: true) do |line|
          next if line =~ /\AFound \d+ items/
          size, date, time, name = line.split(/\s+/).last(4)
          next unless size && date && time && name
          prefixed_name = remote_prefix(pattern) + name
          yield OpenStruct.new(name: prefixed_name, mtime: Time.parse("#{date} #{time} +0000").at_beginning_of_minute.utc, size: size.to_i)
        end
      when :s3
        file_glob, file_regexp = glob_split(pattern, recursive: true)
        s3cmd('ls', '--recursive', s3b(file_glob), safe: true) do |line|
          next if line =~ /\$folder$/
          date, time, size, name = line.split(/\s+/)
          next unless size && date && time && name
          next unless name =~ file_regexp
          yield OpenStruct.new(name: name, mtime: Time.parse("#{date} #{time} +0000").at_beginning_of_minute.utc, size: size.to_i)
        end
      when :local
        Dir.glob(pattern.gsub(%r{/\*\Z}, '/**/*')) do |file|
          stat = File.stat(file)
          yield OpenStruct.new(name: file, mtime: stat.mtime.at_beginning_of_minute.utc, size: stat.size.to_i)
        end
      end
    end
    method_accumulate :glob_stat

    # Stat for a single file or directory (no wildcards). Directories are
    # summarized as max(mtime) and sum(size) over their contents.
    def stat(file_or_dir)
      raise ArgumentError, 'cannot contain wildcard' if file_or_dir.include?('*')
      result = glob_stat(file_or_dir)
      return unless result.any?
      return result.first if result.size == 1
      max_time = result.map { |stat| stat.try(:mtime) }.compact.max
      sum_size = result.map { |stat| stat.try(:size) }.compact.reduce(:+)
      OpenStruct.new(name: file_or_dir, mtime: max_time, size: sum_size)
    end

    # Creates directories (including parents), dispatching per scheme.
    # On S3 a placeholder '.not_empty' key stands in for the directory.
    def mkdir!(*dirs)
      dirs.uniq!
      dirs.group_by { |path| type(path) }.each do |type, dir_set|
        case type
        when :hdfs
          hadoop_fs('-mkdir', '-p', *dir_set)
        when :s3
          touch! *dir_set.map { |dir| File.join(dir, '.not_empty') }
        when :local
          FileUtils.mkdir_p(dir_set, file_util_args)
        end
      end
    end

    # Yields every file name matching the glob pattern.
    # Accumulating variant (no block) returns them as an Array.
    def glob(pattern, &block)
      case type(pattern)
      when :hdfs
        file_glob, file_regexp = glob_split(pattern)
        hadoop_fs('-ls', pattern, safe: true) do |line|
          next if line =~ /\AFound \d+ items/
          name = line.split(/\s+/).last
          next unless name
          prefixed_name = remote_prefix(pattern) + name
          next unless prefixed_name && prefixed_name =~ file_regexp
          yield q(pattern, prefixed_name)
        end
      when :s3
        file_glob, file_regexp = glob_split(pattern)
        s3cmd('ls', '--recursive', s3b(file_glob), safe: true) do |line|
          next if line =~ /\$folder$/
          name = line.split(/\s+/).last
          next unless name && name =~ file_regexp
          yield q(pattern, name)
        end
      when :local
        Dir.glob(pattern.gsub(%r{/\*\Z}, '/**/*')) do |file|
          yield file
        end
      end
    end
    method_accumulate :glob

    # Like #glob, optionally sorted. options[:order] == :basename sorts by
    # file basename; anything else preserves the backing store's order.
    def glob_sort(pattern, options = {})
      result = glob(pattern)
      case options[:order]
      when :basename
        result.sort { |x,y| File.basename(x) <=> File.basename(y) }
      else
        result
      end
    end

    # Copies src file to the exact dst file path (scheme-aware).
    def copy_file_to_file(src, dst)
      check_immutable_path!(dst)
      mkdir!(dirname(dst)) unless type(dst) == :s3
      copy_file_helper(src, dst, false)
    end

    # Copies src file into the dst directory (scheme-aware). No-op (false)
    # when src already lives in dst.
    def copy_file_to_dir(src, dst)
      check_immutable_path!(dst)
      mkdir!(dst) unless type(dst) == :s3
      return false if dirname(src) == dst
      copy_file_helper(src, dst, true)
    end

    # Recursively copies the src directory into dst, per scheme pair.
    def copy_dir(src, dst)
      check_immutable_path!(dst)
      case [type(src), type(dst)]
      when [:hdfs, :hdfs]
        copy_file_to_dir(src, dst)
      when [:hdfs, :local]
        copy_file_to_dir(src, dst)
      when [:hdfs, :s3]
        copy_file_to_dir(src, dst)
      when [:s3, :s3]
        s3cmd('cp', '--recursive', s3b(src, dir: true), s3b(dst, dir: true))
      when [:s3, :local]
        # s3cmd get drops contents directly into dst; recreate the source
        # directory name under dst to mirror cp -r semantics.
        fixed_dst = File.join(dst, src.split('/')[-1])
        FileUtils.mkdir_p(fixed_dst, file_util_args)
        s3cmd('get', '--recursive', '--skip-existing', s3b(src, dir: true), fixed_dst)
      when [:s3, :hdfs]
        copy_file_to_dir(src, dst)
      when [:local, :local]
        FileUtils.mkdir_p(dst, file_util_args)
        FileUtils.cp_r(src, dst, file_util_args)
      when [:local, :hdfs]
        copy_file_to_dir(src, dst)
      when [:local, :s3]
        s3cmd('put', '--recursive', src, s3b(dst, dir: true))
      end
    end

    # Removes a single file (scheme-aware); refuses immutable paths.
    def remove_file(file)
      check_immutable_path!(file)
      case type(file)
      when :hdfs
        hadoop_fs('-rm', file)
      when :s3
        s3cmd('del', s3b(file, dir: false))
      when :local
        FileUtils.rm(file, file_util_args)
      end
    end

    # Recursively removes a directory; refuses root and immutable paths.
    def remove_dir(dir)
      raise "#{dir} is root path, cannot remove" if root_path?(dir)
      check_immutable_path!(dir)
      case type(dir)
      when :hdfs
        hadoop_fs('-rmr', dir)
      when :s3
        s3cmd('del', '--recursive', s3b(dir, dir: true))
        # Also remove the legacy EMR "_$folder$" placeholder key.
        s3cmd('del', '--recursive', s3b("#{dir}_$folder$"))
      when :local
        FileUtils.rmtree(dir, file_util_args)
      end
    end

    # Moves src file to the exact dst file path (scheme-aware).
    def move_file_to_file(src, dst)
      check_immutable_path!(src)
      mkdir!(dirname(dst)) unless type(dst) == :s3
      move_file_helper(src, dst, false)
    end

    # Moves src file into the dst directory (scheme-aware).
    def move_file_to_dir(src, dst)
      check_immutable_path!(src)
      mkdir!(dst) unless type(dst) == :s3
      move_file_helper(src, dst, true)
    end

    # Moves a directory across schemes, falling back to copy-then-remove
    # where no native move exists.
    def move_dir(src, dst)
      check_immutable_path!(src)
      case [type(src), type(dst)]
      when [:hdfs, :hdfs]
        move_file_to_file(src, dst)
      when [:hdfs, :local]
        copy_file_to_dir(src, dst)
        remove_dir(src)
      when [:s3, :s3]
        s3cmd('mv', '--recursive', d(src), f(dst))
      when [:s3, :local]
        s3cmd('get', '--recursive', d(src), f(dst))
        remove_dir(src)
      when [:s3, :hdfs]
        copy_file_to_dir(src, dst)
        remove_dir(src)
      when [:hdfs, :s3]
        copy_file_to_dir(src, d(dst))
        remove_dir(src)
      when [:local, :local]
        move_file_to_file(src, dst)
      when [:local, :hdfs]
        move_file_to_file(src, dst)
      when [:local, :s3]
        s3cmd('put', '--recursive', d(src), d(dst))
        remove_dir(src)
      end
    end

    # Concatenates the given files into a rewound StringIO.
    # NOTE: only the :local scheme is implemented; remote files are skipped.
    def cat(*files)
      StringIO.new.tap do |buf|
        files.group_by { |path| type(path) }.each do |type, file_set|
          case type
          when :local
            file_set.map do |file|
              # FIX: File.exists? is deprecated (removed in Ruby 3.2) -> File.exist?
              next unless File.exist?(file)
              next if File.directory?(file)
              buf << File.read(file)
            end
          end
        end
        buf.rewind
      end
    end

    # Writes buf to dst, creating parent directories.
    # NOTE: only the :local scheme is implemented.
    def write(buf, dst)
      case type(dst)
      when :local
        mkdir!(File.dirname(dst))
        File.open(dst, 'w') do |file|
          file.write buf
        end
      end
    end

    # Recursively chowns files. Trailing hash accepts :user/:group,
    # defaulting to the current user and group. Skipped entirely on S3.
    def chown!(*files)
      opts = files.last.is_a?(Hash) ? files.pop : {}
      user, group = opts.fetch(:user, current_user), opts.fetch(:group, current_group)

      files.group_by { |path| type(path) }.each do |type, file_set|
        case type
        when :hdfs
          hadoop_fs('-chown', '-R', [user, group].compact.join(':'), *file_set)
        when :s3
          # NOTE intentionally skip
        when :local
          FileUtils.chown_R(user, group, file_set, file_util_args)
        end
      end
    end

    # Creates and returns a uniquely-named empty file under the named path.
    def mktemp!(path)
      get_path(path, SecureRandom.base64).tap do |file|
        touch!(file)
      end
    end

    # Creates and returns a uniquely-named directory under the named path.
    def mktempdir!(path)
      get_path(path, SecureRandom.base64).tap do |dir|
        mkdir!(dir)
      end
    end

    # Splits a glob into [listing-prefix, match-regexp]: the prefix up to
    # the first '*' is what we ask the backend to list, the regexp filters
    # the listing client-side.
    def glob_split(input, options = {})
      [ input.include?('*') ? input.split('*').first + '*' : input, glob_to_regexp(input, options) ]
    end

    # Converts a glob to a Regexp. With wildcards (or :recursive) the match
    # is prefix-anchored and '*' becomes a non-greedy '.*?'; a trailing '/*'
    # also matches the bare directory. Otherwise the match is exact.
    def glob_to_regexp(input, options = {})
      if input.include?('*') || options.fetch(:recursive, false)
        %r|\A#{Regexp.escape(input).gsub('\\*', '.*?').gsub(%r{\/\.\*\?\z}, '/?.*?')}|
      else
        /\A#{Regexp.escape(input)}\z/
      end
    end

    private

    # Resolves a registered path value: plain String or a lazy Proc of self.
    def eager_load_path(path)
      case path
      when String
        path
      when Proc
        path.call(self)
      else
        raise "Unknown path #{path.inspect}"
      end
    end

    # @return [String, nil] the scheme prefix (s3://bucket, file://, hdfs://)
    #   or nil for a plain local path
    def remote_prefix(dir)
      dir[%r{\As3n?://.*?(?=/)}] ||
      dir[%r{\As3n?://.*?\Z}] ||
      dir[%r{\Afile://}] ||
      dir[%r{\Ahdfs://}]
    end

    # Ensures a file:// prefix on local paths for hadoop fs commands.
    def local_prefix(file)
      return file if remote_prefix(file)
      "file://#{file}"
    end

    # Paths resolve eagerly once any path besides :root_dir is registered.
    def eager_load_paths?
      @paths.reject { |key,_| key == :root_dir }.any?
    end

    # Classifies a path by scheme. file:// is routed through hadoop fs
    # alongside hdfs:// by design.
    def type(path)
      case path
      when %r{\Afile://}, %r{\Ahdfs://}
        :hdfs
      when %r{\As3n?://}
        :s3
      else
        :local
      end
    end

    # Common FileUtils options honoring no-op and verbose configuration.
    def file_util_args
      {noop: configuration.no_op, verbose: configuration.verbose}
    end

    # Prefixes file with dir's remote scheme when it lacks one, so names
    # from backend listings are returned fully qualified.
    def qualify_file(dir, file)
      if prefix = remote_prefix(dir) and file !~ /\A#{Regexp.escape(prefix)}/
        "#{prefix}/#{file.sub(%r{\A/+}, '')}"
      else
        file
      end
    end
    alias :q :qualify_file

    # Guarantees a trailing slash (directory form).
    def ensure_dir(dir)
      File.join(dir, '/')
    end
    alias :d :ensure_dir

    # Strips a trailing slash (file form).
    def ensure_file(file)
      file.chomp('/')
    end
    alias :f :ensure_file

    # Registers a prefix regexp protecting path from destructive operations.
    def add_immutable_path(path)
      @immutable_paths[path] = /\A#{Regexp.escape(path)}/
    end

    # @raise [RuntimeError] when file falls under any immutable prefix
    def check_immutable_path!(file)
      @immutable_paths.each do |path, regex|
        raise "#{path} is marked as immutable, cannot modify #{file}" if file[regex].present?
      end
    end

    def current_user
      Etc.getlogin
    end

    # Primary group name of the current user; nil when lookup fails
    # (rescue intentionally swallows Etc errors).
    def current_group
      Etc.getgrgid(Etc.getpwnam(current_user).gid).name
    rescue
    end

    # Single-file copy dispatch; dir indicates whether dst is a directory.
    def copy_file_helper(src, dst, dir)
      case [type(src), type(dst)]
      when [:hdfs, :hdfs]
        hadoop_fs('-cp', src, dst)
      when [:hdfs, :local]
        hadoop_fs('-copyToLocal', src, local_prefix(dst))
      when [:hdfs, :s3]
        hadoop_fs('-cp', src, s3n(dst))
      when [:s3, :s3]
        s3cmd('cp', src, s3b(dst, dir: dir))
      when [:s3, :local]
        s3cmd('get', src, dst)
      when [:s3, :hdfs]
        hadoop_fs('-cp', s3n(src), dst)
      when [:local, :local]
        FileUtils.cp(src, dst, file_util_args)
      when [:local, :hdfs]
        hadoop_fs('-copyFromLocal', local_prefix(src), dst)
      when [:local, :s3]
        s3cmd('put', src, s3b(dst, dir: dir))
      end
    end

    # Single-file move dispatch; falls back to copy-then-delete where the
    # backend has no native move.
    def move_file_helper(src, dst, dir)
      case [type(src), type(dst)]
      when [:hdfs, :hdfs]
        hadoop_fs('-mv', src, dst)
      when [:hdfs, :local]
        # NOTE: moveToLocal: Option '-moveToLocal' is not implemented yet
        hadoop_fs('-copyToLocal', src, local_prefix(dst))
        hadoop_fs('-rm', src)
      when [:hdfs, :s3]
        copy_file_to_file(src, s3n(dst, dir: dir))
        hadoop_fs('-rm', src)
      when [:s3, :s3]
        s3cmd('mv', src, s3b(dst, dir: dir))
      when [:s3, :local]
        s3cmd('get', src, dst)
        s3cmd('del', src)
      when [:s3, :hdfs]
        hadoop_fs('-mv', s3n(src), dst)
      when [:local, :local]
        FileUtils.mv(src, dst, file_util_args)
        FileUtils.chmod(FILE_MODE, dst, file_util_args)
      when [:local, :hdfs]
        hadoop_fs('-moveFromLocal', local_prefix(src), dst)
      when [:local, :s3]
        s3cmd('put', src, s3b(dst, dir: dir))
        FileUtils.rm(src, file_util_args)
      end
    end
  end
end