masamune 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +54 -0
  4. data/Rakefile +15 -0
  5. data/bin/masamune-elastic-mapreduce +4 -0
  6. data/bin/masamune-hive +4 -0
  7. data/bin/masamune-psql +4 -0
  8. data/bin/masamune-shell +4 -0
  9. data/lib/masamune.rb +56 -0
  10. data/lib/masamune/accumulate.rb +60 -0
  11. data/lib/masamune/actions.rb +38 -0
  12. data/lib/masamune/actions/data_flow.rb +131 -0
  13. data/lib/masamune/actions/date_parse.rb +75 -0
  14. data/lib/masamune/actions/elastic_mapreduce.rb +68 -0
  15. data/lib/masamune/actions/execute.rb +52 -0
  16. data/lib/masamune/actions/filesystem.rb +37 -0
  17. data/lib/masamune/actions/hadoop_filesystem.rb +40 -0
  18. data/lib/masamune/actions/hadoop_streaming.rb +41 -0
  19. data/lib/masamune/actions/hive.rb +74 -0
  20. data/lib/masamune/actions/postgres.rb +76 -0
  21. data/lib/masamune/actions/postgres_admin.rb +34 -0
  22. data/lib/masamune/actions/s3cmd.rb +44 -0
  23. data/lib/masamune/actions/transform.rb +89 -0
  24. data/lib/masamune/after_initialize_callbacks.rb +55 -0
  25. data/lib/masamune/cached_filesystem.rb +110 -0
  26. data/lib/masamune/commands.rb +37 -0
  27. data/lib/masamune/commands/elastic_mapreduce.rb +119 -0
  28. data/lib/masamune/commands/hadoop_filesystem.rb +57 -0
  29. data/lib/masamune/commands/hadoop_streaming.rb +116 -0
  30. data/lib/masamune/commands/hive.rb +178 -0
  31. data/lib/masamune/commands/interactive.rb +37 -0
  32. data/lib/masamune/commands/postgres.rb +128 -0
  33. data/lib/masamune/commands/postgres_admin.rb +72 -0
  34. data/lib/masamune/commands/postgres_common.rb +33 -0
  35. data/lib/masamune/commands/retry_with_backoff.rb +60 -0
  36. data/lib/masamune/commands/s3cmd.rb +70 -0
  37. data/lib/masamune/commands/shell.rb +202 -0
  38. data/lib/masamune/configuration.rb +195 -0
  39. data/lib/masamune/data_plan.rb +31 -0
  40. data/lib/masamune/data_plan/builder.rb +66 -0
  41. data/lib/masamune/data_plan/elem.rb +190 -0
  42. data/lib/masamune/data_plan/engine.rb +162 -0
  43. data/lib/masamune/data_plan/rule.rb +292 -0
  44. data/lib/masamune/data_plan/set.rb +176 -0
  45. data/lib/masamune/environment.rb +164 -0
  46. data/lib/masamune/filesystem.rb +567 -0
  47. data/lib/masamune/has_environment.rb +40 -0
  48. data/lib/masamune/helpers.rb +27 -0
  49. data/lib/masamune/helpers/postgres.rb +84 -0
  50. data/lib/masamune/io.rb +33 -0
  51. data/lib/masamune/last_element.rb +53 -0
  52. data/lib/masamune/method_logger.rb +41 -0
  53. data/lib/masamune/multi_io.rb +39 -0
  54. data/lib/masamune/schema.rb +36 -0
  55. data/lib/masamune/schema/catalog.rb +233 -0
  56. data/lib/masamune/schema/column.rb +527 -0
  57. data/lib/masamune/schema/dimension.rb +133 -0
  58. data/lib/masamune/schema/event.rb +121 -0
  59. data/lib/masamune/schema/fact.rb +133 -0
  60. data/lib/masamune/schema/map.rb +265 -0
  61. data/lib/masamune/schema/row.rb +133 -0
  62. data/lib/masamune/schema/store.rb +115 -0
  63. data/lib/masamune/schema/table.rb +308 -0
  64. data/lib/masamune/schema/table_reference.rb +76 -0
  65. data/lib/masamune/spec_helper.rb +23 -0
  66. data/lib/masamune/string_format.rb +34 -0
  67. data/lib/masamune/tasks/elastic_mapreduce_thor.rb +60 -0
  68. data/lib/masamune/tasks/hive_thor.rb +55 -0
  69. data/lib/masamune/tasks/postgres_thor.rb +47 -0
  70. data/lib/masamune/tasks/shell_thor.rb +63 -0
  71. data/lib/masamune/template.rb +77 -0
  72. data/lib/masamune/thor.rb +186 -0
  73. data/lib/masamune/thor_loader.rb +38 -0
  74. data/lib/masamune/topological_hash.rb +34 -0
  75. data/lib/masamune/transform.rb +47 -0
  76. data/lib/masamune/transform/bulk_upsert.psql.erb +64 -0
  77. data/lib/masamune/transform/bulk_upsert.rb +52 -0
  78. data/lib/masamune/transform/consolidate_dimension.rb +54 -0
  79. data/lib/masamune/transform/deduplicate_dimension.psql.erb +52 -0
  80. data/lib/masamune/transform/deduplicate_dimension.rb +53 -0
  81. data/lib/masamune/transform/define_event_view.hql.erb +51 -0
  82. data/lib/masamune/transform/define_event_view.rb +60 -0
  83. data/lib/masamune/transform/define_index.psql.erb +34 -0
  84. data/lib/masamune/transform/define_schema.hql.erb +23 -0
  85. data/lib/masamune/transform/define_schema.psql.erb +79 -0
  86. data/lib/masamune/transform/define_schema.rb +56 -0
  87. data/lib/masamune/transform/define_table.hql.erb +34 -0
  88. data/lib/masamune/transform/define_table.psql.erb +95 -0
  89. data/lib/masamune/transform/define_table.rb +40 -0
  90. data/lib/masamune/transform/define_unique.psql.erb +30 -0
  91. data/lib/masamune/transform/insert_reference_values.psql.erb +43 -0
  92. data/lib/masamune/transform/insert_reference_values.rb +64 -0
  93. data/lib/masamune/transform/load_dimension.rb +47 -0
  94. data/lib/masamune/transform/load_fact.rb +45 -0
  95. data/lib/masamune/transform/operator.rb +96 -0
  96. data/lib/masamune/transform/relabel_dimension.psql.erb +76 -0
  97. data/lib/masamune/transform/relabel_dimension.rb +39 -0
  98. data/lib/masamune/transform/rollup_fact.psql.erb +79 -0
  99. data/lib/masamune/transform/rollup_fact.rb +149 -0
  100. data/lib/masamune/transform/snapshot_dimension.psql.erb +75 -0
  101. data/lib/masamune/transform/snapshot_dimension.rb +74 -0
  102. data/lib/masamune/transform/stage_dimension.psql.erb +39 -0
  103. data/lib/masamune/transform/stage_dimension.rb +83 -0
  104. data/lib/masamune/transform/stage_fact.psql.erb +80 -0
  105. data/lib/masamune/transform/stage_fact.rb +111 -0
  106. data/lib/masamune/version.rb +25 -0
  107. data/spec/fixtures/aggregate.sql.erb +25 -0
  108. data/spec/fixtures/comment.sql.erb +27 -0
  109. data/spec/fixtures/invalid.sql.erb +23 -0
  110. data/spec/fixtures/relative.sql.erb +23 -0
  111. data/spec/fixtures/simple.sql.erb +28 -0
  112. data/spec/fixtures/whitespace.sql.erb +30 -0
  113. data/spec/masamune/actions/elastic_mapreduce_spec.rb +108 -0
  114. data/spec/masamune/actions/execute_spec.rb +50 -0
  115. data/spec/masamune/actions/hadoop_filesystem_spec.rb +44 -0
  116. data/spec/masamune/actions/hadoop_streaming_spec.rb +74 -0
  117. data/spec/masamune/actions/hive_spec.rb +117 -0
  118. data/spec/masamune/actions/postgres_admin_spec.rb +58 -0
  119. data/spec/masamune/actions/postgres_spec.rb +134 -0
  120. data/spec/masamune/actions/s3cmd_spec.rb +44 -0
  121. data/spec/masamune/actions/transform_spec.rb +144 -0
  122. data/spec/masamune/after_initialization_callbacks_spec.rb +61 -0
  123. data/spec/masamune/cached_filesystem_spec.rb +167 -0
  124. data/spec/masamune/commands/hadoop_filesystem_spec.rb +50 -0
  125. data/spec/masamune/commands/hadoop_streaming_spec.rb +106 -0
  126. data/spec/masamune/commands/hive_spec.rb +117 -0
  127. data/spec/masamune/commands/postgres_admin_spec.rb +69 -0
  128. data/spec/masamune/commands/postgres_spec.rb +100 -0
  129. data/spec/masamune/commands/retry_with_backoff_spec.rb +116 -0
  130. data/spec/masamune/commands/s3cmd_spec.rb +50 -0
  131. data/spec/masamune/commands/shell_spec.rb +101 -0
  132. data/spec/masamune/configuration_spec.rb +102 -0
  133. data/spec/masamune/data_plan/builder_spec.rb +91 -0
  134. data/spec/masamune/data_plan/elem_spec.rb +102 -0
  135. data/spec/masamune/data_plan/engine_spec.rb +356 -0
  136. data/spec/masamune/data_plan/rule_spec.rb +407 -0
  137. data/spec/masamune/data_plan/set_spec.rb +517 -0
  138. data/spec/masamune/environment_spec.rb +65 -0
  139. data/spec/masamune/filesystem_spec.rb +1421 -0
  140. data/spec/masamune/helpers/postgres_spec.rb +95 -0
  141. data/spec/masamune/schema/catalog_spec.rb +613 -0
  142. data/spec/masamune/schema/column_spec.rb +696 -0
  143. data/spec/masamune/schema/dimension_spec.rb +137 -0
  144. data/spec/masamune/schema/event_spec.rb +75 -0
  145. data/spec/masamune/schema/fact_spec.rb +117 -0
  146. data/spec/masamune/schema/map_spec.rb +593 -0
  147. data/spec/masamune/schema/row_spec.rb +28 -0
  148. data/spec/masamune/schema/store_spec.rb +49 -0
  149. data/spec/masamune/schema/table_spec.rb +395 -0
  150. data/spec/masamune/string_format_spec.rb +60 -0
  151. data/spec/masamune/tasks/elastic_mapreduce_thor_spec.rb +57 -0
  152. data/spec/masamune/tasks/hive_thor_spec.rb +75 -0
  153. data/spec/masamune/tasks/postgres_thor_spec.rb +42 -0
  154. data/spec/masamune/tasks/shell_thor_spec.rb +51 -0
  155. data/spec/masamune/template_spec.rb +77 -0
  156. data/spec/masamune/thor_spec.rb +238 -0
  157. data/spec/masamune/transform/bulk_upsert.dimension_spec.rb +200 -0
  158. data/spec/masamune/transform/consolidate_dimension_spec.rb +62 -0
  159. data/spec/masamune/transform/deduplicate_dimension_spec.rb +84 -0
  160. data/spec/masamune/transform/define_event_view_spec.rb +84 -0
  161. data/spec/masamune/transform/define_schema_spec.rb +83 -0
  162. data/spec/masamune/transform/define_table.dimension_spec.rb +306 -0
  163. data/spec/masamune/transform/define_table.fact_spec.rb +291 -0
  164. data/spec/masamune/transform/define_table.table_spec.rb +525 -0
  165. data/spec/masamune/transform/insert_reference_values.dimension_spec.rb +111 -0
  166. data/spec/masamune/transform/insert_reference_values.fact_spec.rb +149 -0
  167. data/spec/masamune/transform/load_dimension_spec.rb +76 -0
  168. data/spec/masamune/transform/load_fact_spec.rb +89 -0
  169. data/spec/masamune/transform/relabel_dimension_spec.rb +102 -0
  170. data/spec/masamune/transform/rollup_fact_spec.rb +333 -0
  171. data/spec/masamune/transform/snapshot_dimension_spec.rb +103 -0
  172. data/spec/masamune/transform/stage_dimension_spec.rb +115 -0
  173. data/spec/masamune/transform/stage_fact_spec.rb +204 -0
  174. data/spec/masamune_spec.rb +32 -0
  175. data/spec/spec_helper.rb +41 -0
  176. data/spec/support/masamune/example_group.rb +36 -0
  177. data/spec/support/masamune/mock_command.rb +99 -0
  178. data/spec/support/masamune/mock_delegate.rb +51 -0
  179. data/spec/support/masamune/mock_filesystem.rb +96 -0
  180. data/spec/support/masamune/thor_mute.rb +35 -0
  181. data/spec/support/rspec/example/action_example_group.rb +34 -0
  182. data/spec/support/rspec/example/task_example_group.rb +80 -0
  183. data/spec/support/rspec/example/transform_example_group.rb +36 -0
  184. data/spec/support/shared_examples/postgres_common_examples.rb +53 -0
  185. metadata +462 -0
@@ -0,0 +1,164 @@
1
+ # The MIT License (MIT)
2
+ #
3
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
+ require 'thread'
24
+ require 'tmpdir'
25
+ require 'logger'
26
+
27
+ require 'masamune/version'
28
+ require 'masamune/multi_io'
29
+
30
+ module Masamune
31
+ class Environment
32
+ attr_accessor :parent
33
+ attr_accessor :filesystem
34
+ attr_accessor :catalog
35
+
36
# Creates an environment and records the (optional) enclosing environment.
#
# parent - object stored through the +parent=+ accessor; defaults to nil.
def initialize(parent = nil)
  self.parent = parent
end
39
+
40
# Returns the banner string: the word "masamune", a space, then
# Masamune::VERSION.
def version
  ['masamune', Masamune::VERSION].join(' ')
end
43
+
44
# Yields the memoized configuration object to the caller's block so it can
# be adjusted in place. Returns whatever the block returns.
def configure
  yield configuration
end
47
+
48
# Lazily builds the Masamune::Configuration bound to this environment and
# returns the same instance on every later call.
def configuration
  return @configuration if @configuration
  @configuration = Masamune::Configuration.new(self)
end
51
+
52
# Lazily creates the Mutex used by #console and #trace and reuses it
# afterwards.
def mutex
  return @mutex if @mutex
  @mutex = Mutex.new
end
55
+
56
# Runs the given block while holding an exclusive, non-blocking flock on a
# lock file located under the filesystem's :run_dir path.
#
# The lock name is +name+ joined with configuration.lock by ':' (nil parts
# are dropped). When the lock cannot be obtained the block is skipped and an
# error is logged; nothing is raised for that case.
#
# name  - identifier used to build the lock file name.
# block - optional work to perform while the lock is held.
#
# Returns the block's value when the lock was acquired.
# Raises RuntimeError when the filesystem has no :run_dir path.
def with_exclusive_lock(name, &block)
  raise 'filesystem path :run_dir not defined' unless filesystem.has_path?(:run_dir)
  lock_name = [name, configuration.lock].compact.join(':')
  logger.debug("acquiring lock '#{lock_name}'")
  handle = lock_file(lock_name)
  # flock returns 0 on success and false when LOCK_NB would have blocked.
  if handle.flock(File::LOCK_EX | File::LOCK_NB) == 0
    yield if block_given?
  else
    logger.error "acquire lock attempt failed for '#{lock_name}'"
  end
ensure
  if handle
    logger.debug("releasing lock '#{lock_name}'")
    handle.flock(File::LOCK_UN)
    # Close the descriptor too; previously every call left one open.
    handle.close
  end
end
73
+
74
# Returns the explicitly assigned log file template, or a default of the
# form "<epoch seconds>-<process id>.log" computed at call time.
def log_file_template
  return @log_file_template if @log_file_template
  "#{Time.now.to_i}-#{Process.pid}.log"
end
77
+
78
# Stores a new log file template and discards the cached logger and log file
# name (via #reload_logger!) so the next use picks up the new template.
def log_file_template=(log_file_template)
  @log_file_template = log_file_template
  reload_logger!
end
82
+
83
# Forgets the memoized log file name and logger; both are rebuilt lazily the
# next time they are requested. Returns nil.
def reload_logger!
  @log_file_name = nil
  @logger = nil
end
86
+
87
# Resolves (once) the log file location by asking the filesystem for the
# :log_dir path extended with the current log file template.
def log_file_name
  return @log_file_name if @log_file_name
  @log_file_name = filesystem.get_path(:log_dir, log_file_template)
end
90
+
91
# Returns the memoized Logger writing to #log_file_io, creating it on first
# use and then refreshing the "latest" symlink.
#
# The logger is stored in @logger *before* symlink_latest_log runs. That
# method reports its own failures through #logger; with the previous
# `Logger.new(...).tap { symlink_latest_log }` form @logger was still nil at
# that point, so a failing symlink re-entered this method and recursed
# without bound.
def logger
  return @logger if @logger
  @logger = Logger.new(log_file_io)
  symlink_latest_log
  @logger
end
96
+
97
# Logs a message at info level and echoes it to $stdout unless the
# configuration is quiet or in debug mode.
#
# a - message fragments; joined with single spaces, trailing newline removed.
#
# The whole operation runs under #mutex, and both standard streams are
# flushed afterwards.
def console(*a)
  message = a.join(' ').chomp
  mutex.synchronize do
    logger.info(message)
    suppressed = configuration.quiet || configuration.debug
    $stdout.puts message unless suppressed
    $stdout.flush
    $stderr.flush
  end
end
106
+
107
# Logs a message at info level and echoes it to $stdout only when the
# configuration is verbose and not in debug mode.
#
# a - message fragments; joined with single spaces, trailing newline removed.
#
# The whole operation runs under #mutex, and both standard streams are
# flushed afterwards.
def trace(*a)
  message = a.join(' ').chomp
  mutex.synchronize do
    logger.info(message)
    echo = configuration.verbose && !configuration.debug
    $stdout.puts message if echo
    $stdout.flush
    $stderr.flush
  end
end
116
+
117
# Lazily builds the default filesystem: a Masamune::Filesystem with a
# :root_dir path (three levels above this source file), wrapped in a
# Masamune::MethodLogger for the listed copy/move/remove operations.
# The wrapped object is memoized.
def filesystem
  return @filesystem if @filesystem
  base = Masamune::Filesystem.new
  base.add_path :root_dir, File.expand_path('../../../', __FILE__)
  @filesystem = Masamune::MethodLogger.new(base, :copy_file_to_file, :copy_file_to_dir, :remove_dir, :move_file_to_file, :move_file_to_dir, :move_dir)
end
125
+
126
# Lazily builds and memoizes the Masamune::Schema::Catalog for this
# environment.
def catalog
  return @catalog if @catalog
  @catalog = Masamune::Schema::Catalog.new(self)
end
129
+
130
# Lazily builds and memoizes the Masamune::Helpers::Hive helper for this
# environment.
def hive_helper
  return @hive_helper if @hive_helper
  @hive_helper = Masamune::Helpers::Hive.new(self)
end
133
+
134
# Lazily builds and memoizes the Masamune::Helpers::Postgres helper for this
# environment.
def postgres_helper
  return @postgres_helper if @postgres_helper
  @postgres_helper = Masamune::Helpers::Postgres.new(self)
end
137
+
138
+ private
139
+
140
# Opens (creating it with mode 0644 if missing) the file "<name>.lock"
# under the :run_dir path and returns the open File. Closing it is the
# caller's responsibility.
def lock_file(name)
  lock_path = filesystem.get_path(:run_dir, "#{name}.lock")
  File.open(lock_path, File::CREAT, 0644)
end
144
+
145
# Chooses the IO the logger writes to.
#
# Without a :log_dir path: $stderr in debug mode, otherwise nil.
# With a :log_dir path: the log file opened for appending with sync enabled;
# in debug mode it is combined with $stderr through Masamune::MultiIO.
def log_file_io
  unless filesystem.has_path?(:log_dir)
    return configuration.debug ? $stderr : nil
  end
  sink = File.open(log_file_name, 'a')
  sink.sync = true
  return sink unless configuration.debug
  Masamune::MultiIO.new($stderr, sink)
end
154
+
155
# Points the "latest" entry inside :log_dir at the current log file.
# Does nothing when no :log_dir path is configured. Any StandardError is
# logged rather than raised.
#
# File.symlink? is checked in addition to File.exist? because exist? follows
# the link: a dangling "latest" (its old target already deleted) reported
# false, was not removed, and made ln_s fail. File.exist? also replaces
# File.exists?, which no longer exists on Ruby 3.2+.
def symlink_latest_log
  return unless filesystem.has_path?(:log_dir)
  latest = filesystem.path(:log_dir, 'latest')
  FileUtils.rm(latest) if File.exist?(latest) || File.symlink?(latest)
  FileUtils.ln_s(log_file_name, latest)
rescue => e
  logger.error(e)
end
163
+ end
164
+ end
@@ -0,0 +1,567 @@
1
+ # The MIT License (MIT)
2
+ #
3
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
+ require 'masamune/has_environment'
24
+
25
+ module Masamune
26
+ class Filesystem
27
+ include Masamune::HasEnvironment
28
+ include Masamune::Accumulate
29
+ include Masamune::Actions::S3Cmd
30
+ include Masamune::Actions::HadoopFilesystem
31
+
32
+ FILE_MODE = 0777 - File.umask
33
+
34
# Starts with no registered paths and no immutable path patterns.
def initialize
  # symbol => [path, options], filled by #add_path
  @paths = {}
  # path => anchored Regexp, filled by #add_immutable_path
  @immutable_paths = {}
end
38
+
39
# No-op on this class; returns nil.
# NOTE(review): presumably overridden by caching subclasses/wrappers — confirm.
def clear!
  # Intentionally unimplemented
end
42
+
43
# Registers +path+ under +symbol+.
#
# symbol  - name of the path; stored as a Symbol.
# path    - String, or a Proc resolved immediately through eager_load_path.
# options - Hash (keys are symbolized in place; nil is treated as empty):
#           :mkdir     - create the directory now (and on later lookups)
#           :immutable - register the path as immutable
#
# Returns self so calls can be chained.
def add_path(symbol, path, options = {})
  options ||= {}
  options.symbolize_keys!
  resolved = eager_load_path(path)
  @paths[symbol.to_sym] = [resolved, options]
  mkdir!(resolved) if options[:mkdir]
  add_immutable_path(resolved) if options[:immutable]
  self
end
52
+
53
# Looks up a registered path, optionally extended with extra segments.
#
# symbol - name the path was registered under.
# extra  - additional path components appended with File.join.
#
# When eager_load_paths? is true the resolved String is returned; otherwise
# a lambda taking a filesystem is returned so resolution can happen later.
# Resolution raises RuntimeError if the symbol is unknown, creates the
# directory when the path was registered with :mkdir, and expands
# configuration parameters via expand_params.
def get_path(symbol, *extra)
  resolver = lambda do |fs|
    raise "Path :#{symbol} not defined" unless fs.has_path?(symbol)
    base, opts = fs.paths[symbol]

    mkdir!(base) if opts[:mkdir]
    target = extra.any? ? File.join(base, extra) : base
    expand_params(fs, target)
  end

  return resolver unless eager_load_paths?
  eager_load_path resolver.call(self)
end
68
+ alias :path :get_path
69
+
70
# True when a path has been registered under exactly this key.
# Keys are stored as Symbols by #add_path; no conversion happens here.
def has_path?(symbol)
  @paths.key?(symbol)
end
73
+
74
# The registry of paths: symbol => [path, options]. Returns the live Hash,
# not a copy.
def paths
  @paths
end
77
+
78
# Substitutes every "%<key>" occurrence in +path+ with the matching value
# from fs.environment.configuration.params.
#
# fs   - object exposing environment.configuration.params (key/value pairs).
# path - String template; it is duplicated, never modified.
#
# Returns the expanded copy.
def expand_params(fs, path)
  fs.environment.configuration.params.each_with_object(path.dup) do |(key, value), expanded|
    expanded.gsub!("%#{key}", value.to_s)
  end
end
85
+
86
# A path is relative when it has no remote prefix (see remote_prefix) and
# does not begin with '/'.
def relative_path?(path)
  remote_prefix(path) ? false : path[0] != '/'
end
90
+
91
# Yields each ancestor of +path+, from the outermost down to the direct
# parent; +path+ itself is never yielded.
#
# A remote prefix (see remote_prefix) is split off first and re-attached to
# every yielded ancestor. The remainder is normalized with
# File.expand_path(node, '/'). Nothing is yielded when both prefix and
# remainder are blank.
#
# NOTE(review): the class wraps this with `method_accumulate`; presumably
# that collects the yielded values into an Array when no block is given —
# confirm in Masamune::Accumulate.
def parent_paths(path, &block)
  prefix = remote_prefix(path)
  if prefix
    node = path.split(prefix).last
  else
    prefix = ''
    node = path
  end

  return if prefix.blank? && node.blank?
  segments = node ? File.expand_path(node, '/').split('/') : []
  segments.reject! { |segment| segment.blank? }
  segments.prepend('/') if node =~ %r{\A/}
  visited = []
  segments.each do |segment|
    visited << segment
    candidate = prefix + File.join(visited)
    break if candidate == path
    yield candidate
  end
end
111
+ method_accumulate :parent_paths
112
+
113
# True when +path+ has no parent paths, i.e. it is a filesystem or bucket
# root.
#
# Raises ArgumentError for nil, blank, or relative input (each with its own
# message).
def root_path?(path)
  raise ArgumentError, 'path cannot be nil' if path.nil?
  raise ArgumentError, 'path cannot be blank' if path.blank?
  raise ArgumentError, 'path cannot be relative' if relative_path?(path)
  ancestors = parent_paths(path)
  ancestors.length < 1
end
119
+
120
# Returns the first candidate that is an existing regular file on the local
# disk, or nil when none qualifies.
#
# paths - a single path or a list of paths (wrapped with Array.wrap).
#
# File.file? already returns false for missing entries, so the separate
# existence check (which used File.exists?, gone in Ruby 3.2+) is dropped.
def resolve_file(paths = [])
  Array.wrap(paths).find { |path| File.file?(path) }
end
123
+
124
# Returns the closest ancestor reported by parent_paths, or +path+ itself
# when it has no ancestors.
def dirname(path)
  nearest_parent = parent_paths(path).last
  nearest_parent || path
end
127
+
128
# Returns the final '/'-separated component of +path+ after stripping any
# remote prefix. Returns nil for nil input or when nothing remains after the
# prefix.
def basename(path)
  return unless path
  prefix = remote_prefix(path)
  node = prefix ? path.split(prefix).last : path
  return if node.nil? || node.blank?
  node.split('/').last
end
134
+
135
# Creates empty files (or refreshes their timestamps), dispatching on the
# storage type of each path.
#
# files - paths to touch; duplicates are ignored.
#
# Parent directories are created first for non-S3 targets. HDFS uses
# `hadoop_fs -touchz`; S3 uploads an empty temporary file through s3cmd;
# local files are touched and chmod-ed to FILE_MODE.
#
# FileUtils options are passed with ** because FileUtils takes them as
# keywords; a positional Hash raises ArgumentError on Ruby 3+.
def touch!(*files)
  files.uniq!
  files.group_by { |path| type(path) }.each do |kind, file_set|
    mkdir!(*file_set.map { |file| File.dirname(file) }) unless kind == :s3
    case kind
    when :hdfs
      hadoop_fs('-touchz', *file_set)
    when :s3
      empty = Tempfile.new('masamune')
      begin
        file_set.each do |file|
          s3cmd('put', empty.path, s3b(file, dir: false))
        end
      ensure
        # Release the temp file now instead of waiting for GC.
        # NOTE(review): assumes s3cmd has finished reading the file when it
        # returns — confirm against Masamune::Actions::S3Cmd.
        empty.close!
      end
    when :local
      FileUtils.touch(file_set, **file_util_args)
      FileUtils.chmod(FILE_MODE, file_set, **file_util_args)
    end
  end
end
153
+
154
# Reports whether +file+ exists on its storage backend.
#
# HDFS: result of `hadoop_fs -test -e`.
# S3:   true when an `s3cmd ls` line's fourth whitespace-separated field
#       equals +file+ exactly.
# Local: File.exist? (File.exists? no longer exists on Ruby 3.2+).
def exists?(file)
  case type(file)
  when :hdfs
    hadoop_fs('-test', '-e', file, safe: true).success?
  when :s3
    found = false
    s3cmd('ls', s3b(file), safe: true) do |line|
      _date, _time, _size, name = line.split(/\s+/)
      found ||= (name == file)
    end
    found
  when :local
    File.exist?(file)
  end
end
169
+
170
# Yields one OpenStruct(name, mtime, size) per entry matching +pattern+.
#
# HDFS:  parses `hadoop_fs -ls -R` output, taking the last four
#        whitespace-separated fields as size, date, time and name.
# S3:    parses `s3cmd ls --recursive` output (date, time, size, name),
#        skipping "$folder" markers and names that do not match the pattern.
# Local: Dir.glob, with a trailing "/*" widened to "/**/*".
#
# mtime is truncated to the minute and converted to UTC; remote listing
# timestamps are parsed as +0000.
#
# NOTE(review): the class wraps this with `method_accumulate`; presumably it
# returns the yielded values as an Array when called without a block —
# confirm in Masamune::Accumulate.
def glob_stat(pattern, &block)
  case type(pattern)
  when :hdfs
    hadoop_fs('-ls', '-R', pattern, safe: true) do |line|
      next if line =~ /\AFound \d+ items/
      size, date, time, name = line.split(/\s+/).last(4)
      next unless size && date && time && name
      qualified = remote_prefix(pattern) + name
      modified = Time.parse("#{date} #{time} +0000").at_beginning_of_minute.utc
      yield OpenStruct.new(name: qualified, mtime: modified, size: size.to_i)
    end
  when :s3
    listing_glob, matcher = glob_split(pattern, recursive: true)
    s3cmd('ls', '--recursive', s3b(listing_glob), safe: true) do |line|
      next if line =~ /\$folder$/
      date, time, size, name = line.split(/\s+/)
      next unless size && date && time && name
      next unless name =~ matcher
      modified = Time.parse("#{date} #{time} +0000").at_beginning_of_minute.utc
      yield OpenStruct.new(name: name, mtime: modified, size: size.to_i)
    end
  when :local
    recursive = pattern.gsub(%r{/\*\Z}, '/**/*')
    Dir.glob(recursive) do |file|
      info = File.stat(file)
      yield OpenStruct.new(name: file, mtime: info.mtime.at_beginning_of_minute.utc, size: info.size.to_i)
    end
  end
end
196
+ method_accumulate :glob_stat
197
+
198
# Summarizes a single file or a directory using glob_stat.
#
# Returns nil when nothing matches, the lone entry when exactly one matches,
# and otherwise an OpenStruct named after the argument whose mtime is the
# newest entry's and whose size is the sum of all entries.
#
# Raises ArgumentError when the argument contains '*'.
def stat(file_or_dir)
  raise ArgumentError, 'cannot contain wildcard' if file_or_dir.include?('*')
  entries = glob_stat(file_or_dir)
  return unless entries.any?
  return entries.first if entries.size == 1
  newest = entries.map { |entry| entry.try(:mtime) }.compact.max
  total = entries.map { |entry| entry.try(:size) }.compact.reduce(:+)
  OpenStruct.new(name: file_or_dir, mtime: newest, size: total)
end
207
+
208
# Creates directories, dispatching on the storage type of each path.
#
# dirs - directories to create; duplicates are ignored.
#
# HDFS uses `hadoop_fs -mkdir -p`; S3 has no directories, so a ".not_empty"
# placeholder object is touched inside each one; local paths use
# FileUtils.mkdir_p.
#
# FileUtils options are passed with ** because FileUtils takes them as
# keywords; a positional Hash raises ArgumentError on Ruby 3+.
def mkdir!(*dirs)
  dirs.uniq!
  dirs.group_by { |path| type(path) }.each do |kind, dir_set|
    case kind
    when :hdfs
      hadoop_fs('-mkdir', '-p', *dir_set)
    when :s3
      touch!(*dir_set.map { |dir| File.join(dir, '.not_empty') })
    when :local
      FileUtils.mkdir_p(dir_set, **file_util_args)
    end
  end
end
221
+
222
# Yields the name of every entry matching +pattern+.
#
# HDFS:  lists with `hadoop_fs -ls`, re-attaches the remote prefix to the
#        last field of each line, filters by the pattern's regexp and yields
#        the qualified name.
# S3:    lists recursively from the pattern's literal prefix, skips
#        "$folder" markers, filters by regexp and yields the qualified name.
# Local: Dir.glob, with a trailing "/*" widened to "/**/*".
#
# NOTE(review): the class wraps this with `method_accumulate`; presumably it
# returns the yielded values as an Array when called without a block —
# confirm in Masamune::Accumulate.
def glob(pattern, &block)
  case type(pattern)
  when :hdfs
    _listing_glob, matcher = glob_split(pattern)
    hadoop_fs('-ls', pattern, safe: true) do |line|
      next if line =~ /\AFound \d+ items/
      entry = line.split(/\s+/).last
      next unless entry
      qualified = remote_prefix(pattern) + entry
      next unless qualified && qualified =~ matcher
      yield q(pattern, qualified)
    end
  when :s3
    listing_glob, matcher = glob_split(pattern)
    s3cmd('ls', '--recursive', s3b(listing_glob), safe: true) do |line|
      next if line =~ /\$folder$/
      entry = line.split(/\s+/).last
      next unless entry && entry =~ matcher
      yield q(pattern, entry)
    end
  when :local
    recursive = pattern.gsub(%r{/\*\Z}, '/**/*')
    Dir.glob(recursive) { |file| yield file }
  end
end
248
+ method_accumulate :glob
249
+
250
# Returns the glob results for +pattern+, ordered by File.basename when
# options[:order] is :basename and in glob order otherwise.
def glob_sort(pattern, options = {})
  matches = glob(pattern)
  return matches unless options[:order] == :basename
  matches.sort { |left, right| File.basename(left) <=> File.basename(right) }
end
259
+
260
# Copies the file +src+ to the file path +dst+.
#
# Fails (via check_immutable_path!) when +dst+ lies under an immutable path.
# The destination's parent directory is created first unless +dst+ is on S3.
def copy_file_to_file(src, dst)
  check_immutable_path!(dst)
  on_s3 = type(dst) == :s3
  mkdir!(dirname(dst)) unless on_s3
  copy_file_helper(src, dst, false)
end
265
+
266
# Copies the file +src+ into the directory +dst+.
#
# Fails (via check_immutable_path!) when +dst+ lies under an immutable path.
# The directory is created first unless it is on S3. Returns false without
# copying when +src+ already lives directly in +dst+.
def copy_file_to_dir(src, dst)
  check_immutable_path!(dst)
  on_s3 = type(dst) == :s3
  mkdir!(dst) unless on_s3
  dirname(src) == dst ? false : copy_file_helper(src, dst, true)
end
272
+
273
# Recursively copies the directory +src+ into +dst+, choosing the mechanism
# from the (source type, destination type) pair.
#
# Fails (via check_immutable_path!) when +dst+ lies under an immutable path.
# Any pair involving HDFS delegates to copy_file_to_dir; S3 pairs use
# recursive s3cmd commands; local-to-local uses FileUtils.cp_r.
#
# FileUtils options are passed with ** because FileUtils takes them as
# keywords; a positional Hash raises ArgumentError on Ruby 3+.
def copy_dir(src, dst)
  check_immutable_path!(dst)
  case [type(src), type(dst)]
  when [:hdfs, :hdfs], [:hdfs, :local], [:hdfs, :s3], [:s3, :hdfs], [:local, :hdfs]
    copy_file_to_dir(src, dst)
  when [:s3, :s3]
    s3cmd('cp', '--recursive', s3b(src, dir: true), s3b(dst, dir: true))
  when [:s3, :local]
    # Download into a sub-directory named after the last source segment.
    fixed_dst = File.join(dst, src.split('/')[-1])
    FileUtils.mkdir_p(fixed_dst, **file_util_args)
    s3cmd('get', '--recursive', '--skip-existing', s3b(src, dir: true), fixed_dst)
  when [:local, :local]
    FileUtils.mkdir_p(dst, **file_util_args)
    FileUtils.cp_r(src, dst, **file_util_args)
  when [:local, :s3]
    s3cmd('put', '--recursive', src, s3b(dst, dir: true))
  end
end
299
+
300
# Deletes a single file from its storage backend.
#
# Fails (via check_immutable_path!) when +file+ lies under an immutable
# path. HDFS uses `hadoop_fs -rm`, S3 uses `s3cmd del`, local uses
# FileUtils.rm.
#
# FileUtils options are passed with ** because FileUtils takes them as
# keywords; a positional Hash raises ArgumentError on Ruby 3+.
def remove_file(file)
  check_immutable_path!(file)
  case type(file)
  when :hdfs
    hadoop_fs('-rm', file)
  when :s3
    s3cmd('del', s3b(file, dir: false))
  when :local
    FileUtils.rm(file, **file_util_args)
  end
end
311
+
312
# Recursively deletes a directory from its storage backend.
#
# Raises RuntimeError when +dir+ is a root path, and fails (via
# check_immutable_path!) when it lies under an immutable path.
# HDFS uses `hadoop_fs -rmr`; S3 deletes the prefix and its "_$folder$"
# marker; local uses FileUtils.rmtree.
#
# FileUtils options are passed with ** because FileUtils takes them as
# keywords; a positional Hash raises ArgumentError on Ruby 3+.
def remove_dir(dir)
  raise "#{dir} is root path, cannot remove" if root_path?(dir)
  check_immutable_path!(dir)
  case type(dir)
  when :hdfs
    hadoop_fs('-rmr', dir)
  when :s3
    s3cmd('del', '--recursive', s3b(dir, dir: true))
    s3cmd('del', '--recursive', s3b("#{dir}_$folder$"))
  when :local
    FileUtils.rmtree(dir, **file_util_args)
  end
end
325
+
326
# Moves the file +src+ to the file path +dst+.
#
# Fails (via check_immutable_path!) when +src+ lies under an immutable path;
# only the source is checked here. The destination's parent directory is
# created first unless +dst+ is on S3.
def move_file_to_file(src, dst)
  check_immutable_path!(src)
  on_s3 = type(dst) == :s3
  mkdir!(dirname(dst)) unless on_s3
  move_file_helper(src, dst, false)
end
331
+
332
# Moves the file +src+ into the directory +dst+.
#
# Fails (via check_immutable_path!) when +src+ lies under an immutable path;
# only the source is checked here. The directory is created first unless it
# is on S3.
def move_file_to_dir(src, dst)
  check_immutable_path!(src)
  on_s3 = type(dst) == :s3
  mkdir!(dst) unless on_s3
  move_file_helper(src, dst, true)
end
337
+
338
# Moves the directory +src+ to +dst+, choosing the mechanism from the
# (source type, destination type) pair.
#
# Fails (via check_immutable_path!) when +src+ lies under an immutable path.
# Pairs with a native move delegate to move_file_to_file or `s3cmd mv`; the
# others copy first and then remove the source directory.
def move_dir(src, dst)
  check_immutable_path!(src)
  case [type(src), type(dst)]
  when [:hdfs, :hdfs], [:local, :local], [:local, :hdfs]
    move_file_to_file(src, dst)
  when [:hdfs, :local], [:s3, :hdfs]
    copy_file_to_dir(src, dst)
    remove_dir(src)
  when [:s3, :s3]
    s3cmd('mv', '--recursive', d(src), f(dst))
  when [:s3, :local]
    s3cmd('get', '--recursive', d(src), f(dst))
    remove_dir(src)
  when [:hdfs, :s3]
    copy_file_to_dir(src, d(dst))
    remove_dir(src)
  when [:local, :s3]
    s3cmd('put', '--recursive', d(src), d(dst))
    remove_dir(src)
  end
end
366
+
367
# Concatenates the contents of the given local files into a StringIO that is
# rewound and ready for reading.
#
# files - paths to read. Only local paths are read; remote paths, missing
#         entries and directories are skipped silently.
#
# Uses File.exist? (File.exists? no longer exists on Ruby 3.2+) and iterates
# with each, since the mapped values were never used.
def cat(*files)
  StringIO.new.tap do |buf|
    files.group_by { |path| type(path) }.each do |kind, file_set|
      next unless kind == :local
      file_set.each do |file|
        next unless File.exist?(file)
        next if File.directory?(file)
        buf << File.read(file)
      end
    end
    buf.rewind
  end
end
382
+
383
# Writes +buf+ to the local file +dst+, creating its parent directory and
# truncating any existing content. Non-local destinations are ignored and
# yield nil.
def write(buf, dst)
  return unless type(dst) == :local
  mkdir!(File.dirname(dst))
  File.open(dst, 'w') { |file| file.write buf }
end
392
+
393
# Recursively changes ownership of the given paths.
#
# files - paths, optionally followed by an options Hash:
#         :user  - owner name (default: current_user)
#         :group - group name (default: current_group)
#
# HDFS uses `hadoop_fs -chown -R user:group`; S3 paths are skipped; local
# paths use FileUtils.chown_R.
#
# FileUtils options are passed with ** because FileUtils takes them as
# keywords; a positional Hash raises ArgumentError on Ruby 3+.
def chown!(*files)
  opts = files.last.is_a?(Hash) ? files.pop : {}
  user, group = opts.fetch(:user, current_user), opts.fetch(:group, current_group)

  files.group_by { |path| type(path) }.each do |kind, file_set|
    case kind
    when :hdfs
      hadoop_fs('-chown', '-R', [user, group].compact.join(':'), *file_set)
    when :s3
      # NOTE intentionally skip
    when :local
      FileUtils.chown_R(user, group, file_set, **file_util_args)
    end
  end
end
408
+
409
# Creates an empty, randomly named file under the registered path +path+ and
# returns its location.
#
# The name comes from SecureRandom.urlsafe_base64. Plain base64 can contain
# '/', which turned the "file name" into extra path segments.
def mktemp!(path)
  get_path(path, SecureRandom.urlsafe_base64).tap do |file|
    touch!(file)
  end
end
414
+
415
# Creates a randomly named directory under the registered path +path+ and
# returns its location.
#
# The name comes from SecureRandom.urlsafe_base64. Plain base64 can contain
# '/', which produced unintended nested directories.
def mktempdir!(path)
  get_path(path, SecureRandom.urlsafe_base64).tap do |dir|
    mkdir!(dir)
  end
end
420
+
421
# Splits a glob into the part usable for a remote listing and a regexp for
# filtering the listing afterwards.
#
# input   - glob pattern.
# options - forwarded to glob_to_regexp.
#
# Returns [listing_glob, regexp]. listing_glob is +input+ cut after its
# first '*', or +input+ unchanged when it has no wildcard.
#
# The cut is taken by index: `input.split('*').first` is nil when the input
# consists only of '*' characters, which raised NoMethodError.
def glob_split(input, options = {})
  star = input.index('*')
  listing_glob = star ? input[0..star] : input
  [listing_glob, glob_to_regexp(input, options)]
end
424
+
425
# Converts a glob into a Regexp anchored at the start of the string.
#
# input   - glob pattern; everything except '*' is matched literally.
# options - :recursive (default false) forces the open-ended form even when
#           the input has no wildcard.
#
# Without a wildcard (and without :recursive) the whole string must match.
# Otherwise each '*' becomes a lazy ".*?", a trailing "/*" also matches the
# directory itself, and the end of the string is left unanchored.
def glob_to_regexp(input, options = {})
  escaped = Regexp.escape(input)
  unless input.include?('*') || options.fetch(:recursive, false)
    return /\A#{escaped}\z/
  end
  body = escaped.gsub('\\*', '.*?').gsub(%r{\/\.\*\?\z}, '/?.*?')
  %r|\A#{body}|
end
432
+
433
+ private
434
+
435
# Turns a path specification into a concrete value.
#
# path - a String (returned unchanged) or a Proc (called with this
#        filesystem; its result is returned).
#
# Raises RuntimeError for any other kind of object.
def eager_load_path(path)
  return path if path.is_a?(String)
  return path.call(self) if path.is_a?(Proc)
  raise "Unknown path #{path.inspect}"
end
445
+
446
# Extracts the remote scheme prefix of +dir+, or nil for plain local paths.
#
# For s3:// and s3n:// the prefix includes the bucket ("s3://bucket"); for
# file:// and hdfs:// it is just the scheme.
def remote_prefix(dir)
  schemes = [
    %r{\As3n?://.*?(?=/)}, # bucket followed by a key
    %r{\As3n?://.*?\Z},    # bucket only
    %r{\Afile://},
    %r{\Ahdfs://}
  ]
  schemes.each do |scheme|
    matched = dir[scheme]
    return matched if matched
  end
  nil
end
452
+
453
# Prepends "file://" to +file+ unless it already carries a remote prefix.
def local_prefix(file)
  remote_prefix(file) ? file : "file://#{file}"
end
457
+
458
# True once any path other than :root_dir has been registered.
def eager_load_paths?
  @paths.any? { |key, _| key != :root_dir }
end
461
+
462
# Classifies a path by its scheme.
#
# Returns :hdfs for file:// and hdfs:// URLs, :s3 for s3:// and s3n://
# URLs, and :local for everything else.
def type(path)
  return :hdfs if %r{\Afile://} === path || %r{\Ahdfs://} === path
  return :s3 if %r{\As3n?://} === path
  :local
end
472
+
473
# Options shared by every FileUtils call: :noop mirrors configuration.no_op
# and :verbose mirrors configuration.verbose.
def file_util_args
  args = {}
  args[:noop] = configuration.no_op
  args[:verbose] = configuration.verbose
  args
end
476
+
477
# Makes sure +file+ carries the same remote prefix as +dir+.
#
# Returns +file+ unchanged when +dir+ has no remote prefix or +file+ already
# starts with it; otherwise returns the prefix joined to +file+ (leading
# slashes removed) with a single '/'.
def qualify_file(dir, file)
  prefix = remote_prefix(dir)
  return file unless prefix
  return file if file =~ /\A#{Regexp.escape(prefix)}/
  "#{prefix}/#{file.sub(%r{\A/+}, '')}"
end
484
+ alias :q :qualify_file
485
+
486
# Returns +dir+ with a trailing '/', joined via File.join so an existing
# trailing slash is not doubled.
def ensure_dir(dir)
  File.join(dir, '/')
end
489
+ alias :d :ensure_dir
490
+
491
# Returns +file+ without one trailing '/', if it has one.
def ensure_file(file)
  file.sub(%r{/\z}, '')
end
494
+ alias :f :ensure_file
495
+
496
# Marks +path+ as immutable by storing a regexp that matches any string
# starting with it (the path is matched literally).
def add_immutable_path(path)
  literal = Regexp.escape(path)
  @immutable_paths[path] = /\A#{literal}/
end
499
+
500
# Raises RuntimeError when +file+ starts with any path registered as
# immutable; otherwise does nothing.
def check_immutable_path!(file)
  @immutable_paths.each do |path, regex|
    next unless file[regex].present?
    raise "#{path} is marked as immutable, cannot modify #{file}"
  end
end
505
+
506
# Login name of the user running the process, as reported by Etc.getlogin
# (which may be nil).
def current_user
  Etc.getlogin
end
509
+
510
# Name of the primary group of current_user, or nil when it cannot be
# determined (any StandardError during the lookup is swallowed on purpose).
def current_group
  gid = Etc.getpwnam(current_user).gid
  Etc.getgrgid(gid).name
rescue StandardError
  nil
end
514
+
515
# Copies one file, choosing the command from the (source type, destination
# type) pair.
#
# src - source path.
# dst - destination path.
# dir - true when +dst+ is a directory; forwarded to s3b for S3 targets.
#
# HDFS pairs use hadoop_fs (-cp / -copyToLocal / -copyFromLocal), S3 pairs
# use s3cmd (cp / get / put), local-to-local uses FileUtils.cp.
#
# FileUtils options are passed with ** because FileUtils takes them as
# keywords; a positional Hash raises ArgumentError on Ruby 3+.
def copy_file_helper(src, dst, dir)
  case [type(src), type(dst)]
  when [:hdfs, :hdfs]
    hadoop_fs('-cp', src, dst)
  when [:hdfs, :local]
    hadoop_fs('-copyToLocal', src, local_prefix(dst))
  when [:hdfs, :s3]
    hadoop_fs('-cp', src, s3n(dst))
  when [:s3, :s3]
    s3cmd('cp', src, s3b(dst, dir: dir))
  when [:s3, :local]
    s3cmd('get', src, dst)
  when [:s3, :hdfs]
    hadoop_fs('-cp', s3n(src), dst)
  when [:local, :local]
    FileUtils.cp(src, dst, **file_util_args)
  when [:local, :hdfs]
    hadoop_fs('-copyFromLocal', local_prefix(src), dst)
  when [:local, :s3]
    s3cmd('put', src, s3b(dst, dir: dir))
  end
end
537
+
538
# Moves one file, choosing the command from the (source type, destination
# type) pair.
#
# src - source path.
# dst - destination path.
# dir - true when +dst+ is a directory; forwarded to s3b / s3n for S3
#       targets.
#
# Where the backend has no native move the file is copied and the source is
# then deleted. After a local-to-local move the destination is chmod-ed to
# FILE_MODE.
#
# FileUtils options are passed with ** because FileUtils takes them as
# keywords; a positional Hash raises ArgumentError on Ruby 3+.
def move_file_helper(src, dst, dir)
  case [type(src), type(dst)]
  when [:hdfs, :hdfs]
    hadoop_fs('-mv', src, dst)
  when [:hdfs, :local]
    # NOTE: moveToLocal: Option '-moveToLocal' is not implemented yet
    hadoop_fs('-copyToLocal', src, local_prefix(dst))
    hadoop_fs('-rm', src)
  when [:hdfs, :s3]
    copy_file_to_file(src, s3n(dst, dir: dir))
    hadoop_fs('-rm', src)
  when [:s3, :s3]
    s3cmd('mv', src, s3b(dst, dir: dir))
  when [:s3, :local]
    s3cmd('get', src, dst)
    s3cmd('del', src)
  when [:s3, :hdfs]
    hadoop_fs('-mv', s3n(src), dst)
  when [:local, :local]
    FileUtils.mv(src, dst, **file_util_args)
    FileUtils.chmod(FILE_MODE, dst, **file_util_args)
  when [:local, :hdfs]
    hadoop_fs('-moveFromLocal', local_prefix(src), dst)
  when [:local, :s3]
    s3cmd('put', src, s3b(dst, dir: dir))
    FileUtils.rm(src, **file_util_args)
  end
end
566
+ end
567
+ end