masamune 0.11.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +54 -0
- data/Rakefile +15 -0
- data/bin/masamune-elastic-mapreduce +4 -0
- data/bin/masamune-hive +4 -0
- data/bin/masamune-psql +4 -0
- data/bin/masamune-shell +4 -0
- data/lib/masamune.rb +56 -0
- data/lib/masamune/accumulate.rb +60 -0
- data/lib/masamune/actions.rb +38 -0
- data/lib/masamune/actions/data_flow.rb +131 -0
- data/lib/masamune/actions/date_parse.rb +75 -0
- data/lib/masamune/actions/elastic_mapreduce.rb +68 -0
- data/lib/masamune/actions/execute.rb +52 -0
- data/lib/masamune/actions/filesystem.rb +37 -0
- data/lib/masamune/actions/hadoop_filesystem.rb +40 -0
- data/lib/masamune/actions/hadoop_streaming.rb +41 -0
- data/lib/masamune/actions/hive.rb +74 -0
- data/lib/masamune/actions/postgres.rb +76 -0
- data/lib/masamune/actions/postgres_admin.rb +34 -0
- data/lib/masamune/actions/s3cmd.rb +44 -0
- data/lib/masamune/actions/transform.rb +89 -0
- data/lib/masamune/after_initialize_callbacks.rb +55 -0
- data/lib/masamune/cached_filesystem.rb +110 -0
- data/lib/masamune/commands.rb +37 -0
- data/lib/masamune/commands/elastic_mapreduce.rb +119 -0
- data/lib/masamune/commands/hadoop_filesystem.rb +57 -0
- data/lib/masamune/commands/hadoop_streaming.rb +116 -0
- data/lib/masamune/commands/hive.rb +178 -0
- data/lib/masamune/commands/interactive.rb +37 -0
- data/lib/masamune/commands/postgres.rb +128 -0
- data/lib/masamune/commands/postgres_admin.rb +72 -0
- data/lib/masamune/commands/postgres_common.rb +33 -0
- data/lib/masamune/commands/retry_with_backoff.rb +60 -0
- data/lib/masamune/commands/s3cmd.rb +70 -0
- data/lib/masamune/commands/shell.rb +202 -0
- data/lib/masamune/configuration.rb +195 -0
- data/lib/masamune/data_plan.rb +31 -0
- data/lib/masamune/data_plan/builder.rb +66 -0
- data/lib/masamune/data_plan/elem.rb +190 -0
- data/lib/masamune/data_plan/engine.rb +162 -0
- data/lib/masamune/data_plan/rule.rb +292 -0
- data/lib/masamune/data_plan/set.rb +176 -0
- data/lib/masamune/environment.rb +164 -0
- data/lib/masamune/filesystem.rb +567 -0
- data/lib/masamune/has_environment.rb +40 -0
- data/lib/masamune/helpers.rb +27 -0
- data/lib/masamune/helpers/postgres.rb +84 -0
- data/lib/masamune/io.rb +33 -0
- data/lib/masamune/last_element.rb +53 -0
- data/lib/masamune/method_logger.rb +41 -0
- data/lib/masamune/multi_io.rb +39 -0
- data/lib/masamune/schema.rb +36 -0
- data/lib/masamune/schema/catalog.rb +233 -0
- data/lib/masamune/schema/column.rb +527 -0
- data/lib/masamune/schema/dimension.rb +133 -0
- data/lib/masamune/schema/event.rb +121 -0
- data/lib/masamune/schema/fact.rb +133 -0
- data/lib/masamune/schema/map.rb +265 -0
- data/lib/masamune/schema/row.rb +133 -0
- data/lib/masamune/schema/store.rb +115 -0
- data/lib/masamune/schema/table.rb +308 -0
- data/lib/masamune/schema/table_reference.rb +76 -0
- data/lib/masamune/spec_helper.rb +23 -0
- data/lib/masamune/string_format.rb +34 -0
- data/lib/masamune/tasks/elastic_mapreduce_thor.rb +60 -0
- data/lib/masamune/tasks/hive_thor.rb +55 -0
- data/lib/masamune/tasks/postgres_thor.rb +47 -0
- data/lib/masamune/tasks/shell_thor.rb +63 -0
- data/lib/masamune/template.rb +77 -0
- data/lib/masamune/thor.rb +186 -0
- data/lib/masamune/thor_loader.rb +38 -0
- data/lib/masamune/topological_hash.rb +34 -0
- data/lib/masamune/transform.rb +47 -0
- data/lib/masamune/transform/bulk_upsert.psql.erb +64 -0
- data/lib/masamune/transform/bulk_upsert.rb +52 -0
- data/lib/masamune/transform/consolidate_dimension.rb +54 -0
- data/lib/masamune/transform/deduplicate_dimension.psql.erb +52 -0
- data/lib/masamune/transform/deduplicate_dimension.rb +53 -0
- data/lib/masamune/transform/define_event_view.hql.erb +51 -0
- data/lib/masamune/transform/define_event_view.rb +60 -0
- data/lib/masamune/transform/define_index.psql.erb +34 -0
- data/lib/masamune/transform/define_schema.hql.erb +23 -0
- data/lib/masamune/transform/define_schema.psql.erb +79 -0
- data/lib/masamune/transform/define_schema.rb +56 -0
- data/lib/masamune/transform/define_table.hql.erb +34 -0
- data/lib/masamune/transform/define_table.psql.erb +95 -0
- data/lib/masamune/transform/define_table.rb +40 -0
- data/lib/masamune/transform/define_unique.psql.erb +30 -0
- data/lib/masamune/transform/insert_reference_values.psql.erb +43 -0
- data/lib/masamune/transform/insert_reference_values.rb +64 -0
- data/lib/masamune/transform/load_dimension.rb +47 -0
- data/lib/masamune/transform/load_fact.rb +45 -0
- data/lib/masamune/transform/operator.rb +96 -0
- data/lib/masamune/transform/relabel_dimension.psql.erb +76 -0
- data/lib/masamune/transform/relabel_dimension.rb +39 -0
- data/lib/masamune/transform/rollup_fact.psql.erb +79 -0
- data/lib/masamune/transform/rollup_fact.rb +149 -0
- data/lib/masamune/transform/snapshot_dimension.psql.erb +75 -0
- data/lib/masamune/transform/snapshot_dimension.rb +74 -0
- data/lib/masamune/transform/stage_dimension.psql.erb +39 -0
- data/lib/masamune/transform/stage_dimension.rb +83 -0
- data/lib/masamune/transform/stage_fact.psql.erb +80 -0
- data/lib/masamune/transform/stage_fact.rb +111 -0
- data/lib/masamune/version.rb +25 -0
- data/spec/fixtures/aggregate.sql.erb +25 -0
- data/spec/fixtures/comment.sql.erb +27 -0
- data/spec/fixtures/invalid.sql.erb +23 -0
- data/spec/fixtures/relative.sql.erb +23 -0
- data/spec/fixtures/simple.sql.erb +28 -0
- data/spec/fixtures/whitespace.sql.erb +30 -0
- data/spec/masamune/actions/elastic_mapreduce_spec.rb +108 -0
- data/spec/masamune/actions/execute_spec.rb +50 -0
- data/spec/masamune/actions/hadoop_filesystem_spec.rb +44 -0
- data/spec/masamune/actions/hadoop_streaming_spec.rb +74 -0
- data/spec/masamune/actions/hive_spec.rb +117 -0
- data/spec/masamune/actions/postgres_admin_spec.rb +58 -0
- data/spec/masamune/actions/postgres_spec.rb +134 -0
- data/spec/masamune/actions/s3cmd_spec.rb +44 -0
- data/spec/masamune/actions/transform_spec.rb +144 -0
- data/spec/masamune/after_initialization_callbacks_spec.rb +61 -0
- data/spec/masamune/cached_filesystem_spec.rb +167 -0
- data/spec/masamune/commands/hadoop_filesystem_spec.rb +50 -0
- data/spec/masamune/commands/hadoop_streaming_spec.rb +106 -0
- data/spec/masamune/commands/hive_spec.rb +117 -0
- data/spec/masamune/commands/postgres_admin_spec.rb +69 -0
- data/spec/masamune/commands/postgres_spec.rb +100 -0
- data/spec/masamune/commands/retry_with_backoff_spec.rb +116 -0
- data/spec/masamune/commands/s3cmd_spec.rb +50 -0
- data/spec/masamune/commands/shell_spec.rb +101 -0
- data/spec/masamune/configuration_spec.rb +102 -0
- data/spec/masamune/data_plan/builder_spec.rb +91 -0
- data/spec/masamune/data_plan/elem_spec.rb +102 -0
- data/spec/masamune/data_plan/engine_spec.rb +356 -0
- data/spec/masamune/data_plan/rule_spec.rb +407 -0
- data/spec/masamune/data_plan/set_spec.rb +517 -0
- data/spec/masamune/environment_spec.rb +65 -0
- data/spec/masamune/filesystem_spec.rb +1421 -0
- data/spec/masamune/helpers/postgres_spec.rb +95 -0
- data/spec/masamune/schema/catalog_spec.rb +613 -0
- data/spec/masamune/schema/column_spec.rb +696 -0
- data/spec/masamune/schema/dimension_spec.rb +137 -0
- data/spec/masamune/schema/event_spec.rb +75 -0
- data/spec/masamune/schema/fact_spec.rb +117 -0
- data/spec/masamune/schema/map_spec.rb +593 -0
- data/spec/masamune/schema/row_spec.rb +28 -0
- data/spec/masamune/schema/store_spec.rb +49 -0
- data/spec/masamune/schema/table_spec.rb +395 -0
- data/spec/masamune/string_format_spec.rb +60 -0
- data/spec/masamune/tasks/elastic_mapreduce_thor_spec.rb +57 -0
- data/spec/masamune/tasks/hive_thor_spec.rb +75 -0
- data/spec/masamune/tasks/postgres_thor_spec.rb +42 -0
- data/spec/masamune/tasks/shell_thor_spec.rb +51 -0
- data/spec/masamune/template_spec.rb +77 -0
- data/spec/masamune/thor_spec.rb +238 -0
- data/spec/masamune/transform/bulk_upsert.dimension_spec.rb +200 -0
- data/spec/masamune/transform/consolidate_dimension_spec.rb +62 -0
- data/spec/masamune/transform/deduplicate_dimension_spec.rb +84 -0
- data/spec/masamune/transform/define_event_view_spec.rb +84 -0
- data/spec/masamune/transform/define_schema_spec.rb +83 -0
- data/spec/masamune/transform/define_table.dimension_spec.rb +306 -0
- data/spec/masamune/transform/define_table.fact_spec.rb +291 -0
- data/spec/masamune/transform/define_table.table_spec.rb +525 -0
- data/spec/masamune/transform/insert_reference_values.dimension_spec.rb +111 -0
- data/spec/masamune/transform/insert_reference_values.fact_spec.rb +149 -0
- data/spec/masamune/transform/load_dimension_spec.rb +76 -0
- data/spec/masamune/transform/load_fact_spec.rb +89 -0
- data/spec/masamune/transform/relabel_dimension_spec.rb +102 -0
- data/spec/masamune/transform/rollup_fact_spec.rb +333 -0
- data/spec/masamune/transform/snapshot_dimension_spec.rb +103 -0
- data/spec/masamune/transform/stage_dimension_spec.rb +115 -0
- data/spec/masamune/transform/stage_fact_spec.rb +204 -0
- data/spec/masamune_spec.rb +32 -0
- data/spec/spec_helper.rb +41 -0
- data/spec/support/masamune/example_group.rb +36 -0
- data/spec/support/masamune/mock_command.rb +99 -0
- data/spec/support/masamune/mock_delegate.rb +51 -0
- data/spec/support/masamune/mock_filesystem.rb +96 -0
- data/spec/support/masamune/thor_mute.rb +35 -0
- data/spec/support/rspec/example/action_example_group.rb +34 -0
- data/spec/support/rspec/example/task_example_group.rb +80 -0
- data/spec/support/rspec/example/transform_example_group.rb +36 -0
- data/spec/support/shared_examples/postgres_common_examples.rb +53 -0
- metadata +462 -0
data/lib/masamune/environment.rb
@@ -0,0 +1,164 @@
+ # The MIT License (MIT)
+ #
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in
+ # all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ # THE SOFTWARE.
+
+ require 'thread'
+ require 'tmpdir'
+ require 'logger'
+
+ require 'masamune/version'
+ require 'masamune/multi_io'
+
+ module Masamune
+   class Environment
+     attr_accessor :parent
+     attr_accessor :filesystem
+     attr_accessor :catalog
+
+     def initialize(parent = nil)
+       self.parent = parent
+     end
+
+     def version
+       "masamune #{Masamune::VERSION}"
+     end
+
+     def configure
+       yield configuration
+     end
+
+     def configuration
+       @configuration ||= Masamune::Configuration.new(self)
+     end
+
+     def mutex
+       @mutex ||= Mutex.new
+     end
+
+     def with_exclusive_lock(name, &block)
+       raise 'filesystem path :run_dir not defined' unless filesystem.has_path?(:run_dir)
+       lock_name = [name, configuration.lock].compact.join(':')
+       logger.debug("acquiring lock '#{lock_name}'")
+       lock_file = lock_file(lock_name)
+       lock_status = lock_file.flock(File::LOCK_EX | File::LOCK_NB)
+       if lock_status == 0
+         yield if block_given?
+       else
+         logger.error "acquire lock attempt failed for '#{lock_name}'"
+       end
+     ensure
+       if lock_file
+         logger.debug("releasing lock '#{lock_name}'")
+         lock_file.flock(File::LOCK_UN)
+       end
+     end
+
+     def log_file_template
+       @log_file_template || "#{Time.now.to_i}-#{$$}.log"
+     end
+
+     def log_file_template=(log_file_template)
+       @log_file_template = log_file_template
+       reload_logger!
+     end
+
+     def reload_logger!
+       @logger = @log_file_name = nil
+     end
+
+     def log_file_name
+       @log_file_name ||= filesystem.get_path(:log_dir, log_file_template)
+     end
+
+     def logger
+       @logger ||= Logger.new(log_file_io).tap do
+         symlink_latest_log
+       end
+     end
+
+     def console(*a)
+       line = a.join(' ').chomp
+       mutex.synchronize do
+         logger.info(line)
+         $stdout.puts line unless configuration.quiet || configuration.debug
+         $stdout.flush
+         $stderr.flush
+       end
+     end
+
+     def trace(*a)
+       line = a.join(' ').chomp
+       mutex.synchronize do
+         logger.info(line)
+         $stdout.puts line if configuration.verbose && !configuration.debug
+         $stdout.flush
+         $stderr.flush
+       end
+     end
+
+     def filesystem
+       @filesystem ||= begin
+         filesystem = Masamune::Filesystem.new
+         filesystem.add_path :root_dir, File.expand_path('../../../', __FILE__)
+         filesystem = Masamune::MethodLogger.new(filesystem, :copy_file_to_file, :copy_file_to_dir, :remove_dir, :move_file_to_file, :move_file_to_dir, :move_dir)
+         filesystem
+       end
+     end
+
+     def catalog
+       @catalog ||= Masamune::Schema::Catalog.new(self)
+     end
+
+     def hive_helper
+       @hive_helper ||= Masamune::Helpers::Hive.new(self)
+     end
+
+     def postgres_helper
+       @postgres_helper ||= Masamune::Helpers::Postgres.new(self)
+     end
+
+     private
+
+     def lock_file(name)
+       path = filesystem.get_path(:run_dir, "#{name}.lock")
+       File.open(path, File::CREAT, 0644)
+     end
+
+     def log_file_io
+       if filesystem.has_path?(:log_dir)
+         log_file = File.open(log_file_name, 'a')
+         log_file.sync = true
+         configuration.debug ? Masamune::MultiIO.new($stderr, log_file) : log_file
+       else
+         configuration.debug ? $stderr : nil
+       end
+     end
+
+     def symlink_latest_log
+       return unless filesystem.has_path?(:log_dir)
+       latest = filesystem.path(:log_dir, 'latest')
+       FileUtils.rm(latest) if File.exists?(latest)
+       FileUtils.ln_s(log_file_name, latest)
+     rescue => e
+       logger.error(e)
+     end
+   end
+ end
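The `Environment` class above wires together configuration, logging, and cross-process locking. Below is a minimal usage sketch based only on the methods visible in this diff; it assumes `require 'masamune'` loads these classes, that `Masamune::Configuration` attributes such as `quiet` are writable, and the `/tmp` paths and lock name are purely illustrative.

```ruby
require 'masamune'

env = Masamune::Environment.new

# Register the directories the environment expects: :run_dir is required
# before with_exclusive_lock can create its lock file, and :log_dir lets the
# logger write to a timestamped file and maintain the 'latest' symlink.
env.filesystem.add_path(:run_dir, '/tmp/masamune/run', mkdir: true)
env.filesystem.add_path(:log_dir, '/tmp/masamune/log', mkdir: true)

# configure yields the lazily built Masamune::Configuration instance.
env.configure do |config|
  config.quiet = true # assumed attribute; suppresses console echo to $stdout
end

env.console('starting job') # always logged; echoed to $stdout unless quiet/debug

# Serializes work across processes via flock on <run_dir>/<name>.lock;
# if the lock is already held, the block is skipped and an error is logged.
env.with_exclusive_lock('nightly-load') do
  # ... exclusive work ...
end
```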
data/lib/masamune/filesystem.rb
@@ -0,0 +1,567 @@
+ # The MIT License (MIT)
+ #
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in
+ # all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ # THE SOFTWARE.
+
+ require 'masamune/has_environment'
+
+ module Masamune
+   class Filesystem
+     include Masamune::HasEnvironment
+     include Masamune::Accumulate
+     include Masamune::Actions::S3Cmd
+     include Masamune::Actions::HadoopFilesystem
+
+     FILE_MODE = 0777 - File.umask
+
+     def initialize
+       @paths = {}
+       @immutable_paths = {}
+     end
+
+     def clear!
+       # Intentionally unimplemented
+     end
+
+     def add_path(symbol, path, options = {})
+       options ||= {}
+       options.symbolize_keys!
+       eager_path = eager_load_path path
+       @paths[symbol.to_sym] = [eager_path, options]
+       mkdir!(eager_path) if options[:mkdir]
+       add_immutable_path(eager_path) if options[:immutable]
+       self
+     end
+
+     def get_path(symbol, *extra)
+       lazy_path = lambda do |fs|
+         fs.has_path?(symbol) or raise "Path :#{symbol} not defined"
+         path, options = fs.paths[symbol]
+
+         mkdir!(path) if options[:mkdir]
+         expand_params(fs, extra.any? ? File.join(path, extra) : path)
+       end
+
+       if eager_load_paths?
+         eager_load_path lazy_path.call(self)
+       else
+         lazy_path
+       end
+     end
+     alias :path :get_path
+
+     def has_path?(symbol)
+       @paths.has_key?(symbol)
+     end
+
+     def paths
+       @paths
+     end
+
+     def expand_params(fs, path)
+       new_path = path.dup
+       fs.environment.configuration.params.each do |key, value|
+         new_path.gsub!("%#{key.to_s}", value.to_s)
+       end
+       new_path
+     end
+
+     def relative_path?(path)
+       return false if remote_prefix(path)
+       path[0] != '/'
+     end
+
+     def parent_paths(path, &block)
+       if prefix = remote_prefix(path)
+         node = path.split(prefix).last
+       else
+         prefix = ''
+         node = path
+       end
+
+       return if prefix.blank? && node.blank?
+       parent_paths = node ? File.expand_path(node, '/').split('/') : []
+       parent_paths.reject! { |x| x.blank? }
+       parent_paths.prepend('/') if node =~ %r{\A/}
+       tmp = []
+       parent_paths.each do |part|
+         tmp << part
+         current_path = prefix + File.join(tmp)
+         break if current_path == path
+         yield current_path
+       end
+     end
+     method_accumulate :parent_paths
+
+     def root_path?(path)
+       raise ArgumentError, 'path cannot be nil' if path.nil?
+       raise ArgumentError, 'path cannot be blank' if path.blank?
+       raise ArgumentError, 'path cannot be relative' if relative_path?(path)
+       parent_paths(path).length < 1
+     end
+
+     def resolve_file(paths = [])
+       Array.wrap(paths).select { |path| File.exists?(path) && File.file?(path) }.first
+     end
+
+     def dirname(path)
+       parent_paths(path).last || path
+     end
+
+     def basename(path)
+       return unless path
+       node = remote_prefix(path) ? path.split(remote_prefix(path)).last : path
+       return if node.nil? || node.blank?
+       node.split('/').last
+     end
+
+     def touch!(*files)
+       files.uniq!
+       files.group_by { |path| type(path) }.each do |type, file_set|
+         mkdir!(*file_set.map { |file| File.dirname(file) }) unless type == :s3
+         case type
+         when :hdfs
+           hadoop_fs('-touchz', *file_set)
+         when :s3
+           empty = Tempfile.new('masamune')
+           file_set.each do |file|
+             s3cmd('put', empty.path, s3b(file, dir: false))
+           end
+         when :local
+           FileUtils.touch(file_set, file_util_args)
+           FileUtils.chmod(FILE_MODE, file_set, file_util_args)
+         end
+       end
+     end
+
+     def exists?(file)
+       case type(file)
+       when :hdfs
+         hadoop_fs('-test', '-e', file, safe: true).success?
+       when :s3
+         result = Set.new
+         s3cmd('ls', s3b(file), safe: true) do |line|
+           date, time, size, name = line.split(/\s+/)
+           result << (name == file)
+         end
+         result.any?
+       when :local
+         File.exists?(file)
+       end
+     end
+
+     def glob_stat(pattern, &block)
+       case type(pattern)
+       when :hdfs
+         hadoop_fs('-ls', '-R', pattern, safe: true) do |line|
+           next if line =~ /\AFound \d+ items/
+           size, date, time, name = line.split(/\s+/).last(4)
+           next unless size && date && time && name
+           prefixed_name = remote_prefix(pattern) + name
+           yield OpenStruct.new(name: prefixed_name, mtime: Time.parse("#{date} #{time} +0000").at_beginning_of_minute.utc, size: size.to_i)
+         end
+       when :s3
+         file_glob, file_regexp = glob_split(pattern, recursive: true)
+         s3cmd('ls', '--recursive', s3b(file_glob), safe: true) do |line|
+           next if line =~ /\$folder$/
+           date, time, size, name = line.split(/\s+/)
+           next unless size && date && time && name
+           next unless name =~ file_regexp
+           yield OpenStruct.new(name: name, mtime: Time.parse("#{date} #{time} +0000").at_beginning_of_minute.utc, size: size.to_i)
+         end
+       when :local
+         Dir.glob(pattern.gsub(%r{/\*\Z}, '/**/*')) do |file|
+           stat = File.stat(file)
+           yield OpenStruct.new(name: file, mtime: stat.mtime.at_beginning_of_minute.utc, size: stat.size.to_i)
+         end
+       end
+     end
+     method_accumulate :glob_stat
+
+     def stat(file_or_dir)
+       raise ArgumentError, 'cannot contain wildcard' if file_or_dir.include?('*')
+       result = glob_stat(file_or_dir)
+       return unless result.any?
+       return result.first if result.size == 1
+       max_time = result.map { |stat| stat.try(:mtime) }.compact.max
+       sum_size = result.map { |stat| stat.try(:size) }.compact.reduce(:+)
+       OpenStruct.new(name: file_or_dir, mtime: max_time, size: sum_size)
+     end
+
+     def mkdir!(*dirs)
+       dirs.uniq!
+       dirs.group_by { |path| type(path) }.each do |type, dir_set|
+         case type
+         when :hdfs
+           hadoop_fs('-mkdir', '-p', *dir_set)
+         when :s3
+           touch! *dir_set.map { |dir| File.join(dir, '.not_empty') }
+         when :local
+           FileUtils.mkdir_p(dir_set, file_util_args)
+         end
+       end
+     end
+
+     def glob(pattern, &block)
+       case type(pattern)
+       when :hdfs
+         file_glob, file_regexp = glob_split(pattern)
+         hadoop_fs('-ls', pattern, safe: true) do |line|
+           next if line =~ /\AFound \d+ items/
+           name = line.split(/\s+/).last
+           next unless name
+           prefixed_name = remote_prefix(pattern) + name
+           next unless prefixed_name && prefixed_name =~ file_regexp
+           yield q(pattern, prefixed_name)
+         end
+       when :s3
+         file_glob, file_regexp = glob_split(pattern)
+         s3cmd('ls', '--recursive', s3b(file_glob), safe: true) do |line|
+           next if line =~ /\$folder$/
+           name = line.split(/\s+/).last
+           next unless name && name =~ file_regexp
+           yield q(pattern, name)
+         end
+       when :local
+         Dir.glob(pattern.gsub(%r{/\*\Z}, '/**/*')) do |file|
+           yield file
+         end
+       end
+     end
+     method_accumulate :glob
+
+     def glob_sort(pattern, options = {})
+       result = glob(pattern)
+       case options[:order]
+       when :basename
+         result.sort { |x,y| File.basename(x) <=> File.basename(y) }
+       else
+         result
+       end
+     end
+
+     def copy_file_to_file(src, dst)
+       check_immutable_path!(dst)
+       mkdir!(dirname(dst)) unless type(dst) == :s3
+       copy_file_helper(src, dst, false)
+     end
+
+     def copy_file_to_dir(src, dst)
+       check_immutable_path!(dst)
+       mkdir!(dst) unless type(dst) == :s3
+       return false if dirname(src) == dst
+       copy_file_helper(src, dst, true)
+     end
+
+     def copy_dir(src, dst)
+       check_immutable_path!(dst)
+       case [type(src), type(dst)]
+       when [:hdfs, :hdfs]
+         copy_file_to_dir(src, dst)
+       when [:hdfs, :local]
+         copy_file_to_dir(src, dst)
+       when [:hdfs, :s3]
+         copy_file_to_dir(src, dst)
+       when [:s3, :s3]
+         s3cmd('cp', '--recursive', s3b(src, dir: true), s3b(dst, dir: true))
+       when [:s3, :local]
+         fixed_dst = File.join(dst, src.split('/')[-1])
+         FileUtils.mkdir_p(fixed_dst, file_util_args)
+         s3cmd('get', '--recursive', '--skip-existing', s3b(src, dir: true), fixed_dst)
+       when [:s3, :hdfs]
+         copy_file_to_dir(src, dst)
+       when [:local, :local]
+         FileUtils.mkdir_p(dst, file_util_args)
+         FileUtils.cp_r(src, dst, file_util_args)
+       when [:local, :hdfs]
+         copy_file_to_dir(src, dst)
+       when [:local, :s3]
+         s3cmd('put', '--recursive', src, s3b(dst, dir: true))
+       end
+     end
+
+     def remove_file(file)
+       check_immutable_path!(file)
+       case type(file)
+       when :hdfs
+         hadoop_fs('-rm', file)
+       when :s3
+         s3cmd('del', s3b(file, dir: false))
+       when :local
+         FileUtils.rm(file, file_util_args)
+       end
+     end
+
+     def remove_dir(dir)
+       raise "#{dir} is root path, cannot remove" if root_path?(dir)
+       check_immutable_path!(dir)
+       case type(dir)
+       when :hdfs
+         hadoop_fs('-rmr', dir)
+       when :s3
+         s3cmd('del', '--recursive', s3b(dir, dir: true))
+         s3cmd('del', '--recursive', s3b("#{dir}_$folder$"))
+       when :local
+         FileUtils.rmtree(dir, file_util_args)
+       end
+     end
+
+     def move_file_to_file(src, dst)
+       check_immutable_path!(src)
+       mkdir!(dirname(dst)) unless type(dst) == :s3
+       move_file_helper(src, dst, false)
+     end
+
+     def move_file_to_dir(src, dst)
+       check_immutable_path!(src)
+       mkdir!(dst) unless type(dst) == :s3
+       move_file_helper(src, dst, true)
+     end
+
+     def move_dir(src, dst)
+       check_immutable_path!(src)
+       case [type(src), type(dst)]
+       when [:hdfs, :hdfs]
+         move_file_to_file(src, dst)
+       when [:hdfs, :local]
+         copy_file_to_dir(src, dst)
+         remove_dir(src)
+       when [:s3, :s3]
+         s3cmd('mv', '--recursive', d(src), f(dst))
+       when [:s3, :local]
+         s3cmd('get', '--recursive', d(src), f(dst))
+         remove_dir(src)
+       when [:s3, :hdfs]
+         copy_file_to_dir(src, dst)
+         remove_dir(src)
+       when [:hdfs, :s3]
+         copy_file_to_dir(src, d(dst))
+         remove_dir(src)
+       when [:local, :local]
+         move_file_to_file(src, dst)
+       when [:local, :hdfs]
+         move_file_to_file(src, dst)
+       when [:local, :s3]
+         s3cmd('put', '--recursive', d(src), d(dst))
+         remove_dir(src)
+       end
+     end
+
+     def cat(*files)
+       StringIO.new.tap do |buf|
+         files.group_by { |path| type(path) }.each do |type, file_set|
+           case type
+           when :local
+             file_set.map do |file|
+               next unless File.exists?(file)
+               next if File.directory?(file)
+               buf << File.read(file)
+             end
+           end
+         end
+         buf.rewind
+       end
+     end
+
+     def write(buf, dst)
+       case type(dst)
+       when :local
+         mkdir!(File.dirname(dst))
+         File.open(dst, 'w') do |file|
+           file.write buf
+         end
+       end
+     end
+
+     def chown!(*files)
+       opts = files.last.is_a?(Hash) ? files.pop : {}
+       user, group = opts.fetch(:user, current_user), opts.fetch(:group, current_group)
+
+       files.group_by { |path| type(path) }.each do |type, file_set|
+         case type
+         when :hdfs
+           hadoop_fs('-chown', '-R', [user, group].compact.join(':'), *file_set)
+         when :s3
+           # NOTE intentionally skip
+         when :local
+           FileUtils.chown_R(user, group, file_set, file_util_args)
+         end
+       end
+     end
+
+     def mktemp!(path)
+       get_path(path, SecureRandom.base64).tap do |file|
+         touch!(file)
+       end
+     end
+
+     def mktempdir!(path)
+       get_path(path, SecureRandom.base64).tap do |dir|
+         mkdir!(dir)
+       end
+     end
+
+     def glob_split(input, options = {})
+       [ input.include?('*') ? input.split('*').first + '*' : input, glob_to_regexp(input, options) ]
+     end
+
+     def glob_to_regexp(input, options = {})
+       if input.include?('*') || options.fetch(:recursive, false)
+         %r|\A#{Regexp.escape(input).gsub('\\*', '.*?').gsub(%r{\/\.\*\?\z}, '/?.*?')}|
+       else
+         /\A#{Regexp.escape(input)}\z/
+       end
+     end
+
+     private
+
+     def eager_load_path(path)
+       case path
+       when String
+         path
+       when Proc
+         path.call(self)
+       else
+         raise "Unknown path #{path.inspect}"
+       end
+     end
+
+     def remote_prefix(dir)
+       dir[%r{\As3n?://.*?(?=/)}] ||
+       dir[%r{\As3n?://.*?\Z}] ||
+       dir[%r{\Afile://}] ||
+       dir[%r{\Ahdfs://}]
+     end
+
+     def local_prefix(file)
+       return file if remote_prefix(file)
+       "file://#{file}"
+     end
+
+     def eager_load_paths?
+       @paths.reject { |key,_| key == :root_dir }.any?
+     end
+
+     def type(path)
+       case path
+       when %r{\Afile://}, %r{\Ahdfs://}
+         :hdfs
+       when %r{\As3n?://}
+         :s3
+       else
+         :local
+       end
+     end
+
+     def file_util_args
+       {noop: configuration.no_op, verbose: configuration.verbose}
+     end
+
+     def qualify_file(dir, file)
+       if prefix = remote_prefix(dir) and file !~ /\A#{Regexp.escape(prefix)}/
+         "#{prefix}/#{file.sub(%r{\A/+}, '')}"
+       else
+         file
+       end
+     end
+     alias :q :qualify_file
+
+     def ensure_dir(dir)
+       File.join(dir, '/')
+     end
+     alias :d :ensure_dir
+
+     def ensure_file(file)
+       file.chomp('/')
+     end
+     alias :f :ensure_file
+
+     def add_immutable_path(path)
+       @immutable_paths[path] = /\A#{Regexp.escape(path)}/
+     end
+
+     def check_immutable_path!(file)
+       @immutable_paths.each do |path, regex|
+         raise "#{path} is marked as immutable, cannot modify #{file}" if file[regex].present?
+       end
+     end
+
+     def current_user
+       Etc.getlogin
+     end
+
+     def current_group
+       Etc.getgrgid(Etc.getpwnam(current_user).gid).name
+     rescue
+     end
+
+     def copy_file_helper(src, dst, dir)
+       case [type(src), type(dst)]
+       when [:hdfs, :hdfs]
+         hadoop_fs('-cp', src, dst)
+       when [:hdfs, :local]
+         hadoop_fs('-copyToLocal', src, local_prefix(dst))
+       when [:hdfs, :s3]
+         hadoop_fs('-cp', src, s3n(dst))
+       when [:s3, :s3]
+         s3cmd('cp', src, s3b(dst, dir: dir))
+       when [:s3, :local]
+         s3cmd('get', src, dst)
+       when [:s3, :hdfs]
+         hadoop_fs('-cp', s3n(src), dst)
+       when [:local, :local]
+         FileUtils.cp(src, dst, file_util_args)
+       when [:local, :hdfs]
+         hadoop_fs('-copyFromLocal', local_prefix(src), dst)
+       when [:local, :s3]
+         s3cmd('put', src, s3b(dst, dir: dir))
+       end
+     end
+
+     def move_file_helper(src, dst, dir)
+       case [type(src), type(dst)]
+       when [:hdfs, :hdfs]
+         hadoop_fs('-mv', src, dst)
+       when [:hdfs, :local]
+         # NOTE: moveToLocal: Option '-moveToLocal' is not implemented yet
+         hadoop_fs('-copyToLocal', src, local_prefix(dst))
+         hadoop_fs('-rm', src)
+       when [:hdfs, :s3]
+         copy_file_to_file(src, s3n(dst, dir: dir))
+         hadoop_fs('-rm', src)
+       when [:s3, :s3]
+         s3cmd('mv', src, s3b(dst, dir: dir))
+       when [:s3, :local]
+         s3cmd('get', src, dst)
+         s3cmd('del', src)
+       when [:s3, :hdfs]
+         hadoop_fs('-mv', s3n(src), dst)
+       when [:local, :local]
+         FileUtils.mv(src, dst, file_util_args)
+         FileUtils.chmod(FILE_MODE, dst, file_util_args)
+       when [:local, :hdfs]
+         hadoop_fs('-moveFromLocal', local_prefix(src), dst)
+       when [:local, :s3]
+         s3cmd('put', src, s3b(dst, dir: dir))
+         FileUtils.rm(src, file_util_args)
+       end
+     end
+   end
+ end
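`Filesystem` unifies local, HDFS (`hdfs://`, `file://`), and S3 (`s3://`, `s3n://`) paths behind one API and keeps a registry of named paths whose `%`-parameters are expanded from `configuration.params`. The sketch below is illustrative only: the bucket, directory names, and `%project` parameter are invented; it assumes `configuration.params` returns a writable Hash-like object and that `env.filesystem` (wrapped in `Masamune::MethodLogger`) delegates `add_path`/`get_path` through to the underlying `Masamune::Filesystem`.

```ruby
require 'masamune'

env = Masamune::Environment.new
fs  = env.filesystem

# %-parameters embedded in registered paths are expanded on lookup.
env.configuration.params[:project] = 'acme'
fs.add_path(:incoming_dir, 's3://example-bucket/%project/incoming')
fs.add_path(:archive_dir,  '/var/masamune/%project/archive', mkdir: true)

fs.get_path(:incoming_dir)                # => "s3://example-bucket/acme/incoming"
fs.get_path(:archive_dir, '2015-01-01')   # => "/var/masamune/acme/archive/2015-01-01"

# Operations dispatch on the path scheme: s3:// and s3n:// shell out to s3cmd,
# hdfs:// and file:// to hadoop fs, and everything else goes through FileUtils.
fs.copy_file_to_dir('/tmp/events.csv', fs.get_path(:incoming_dir))
fs.glob(fs.get_path(:incoming_dir, '*.csv')) do |file|
  fs.move_file_to_dir(file, fs.get_path(:archive_dir, '2015-01-01'))
end
```

Paths registered with `immutable: true` are protected: destructive operations such as `remove_dir` raise via `check_immutable_path!` when the target falls under an immutable prefix.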