masamune 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +54 -0
  4. data/Rakefile +15 -0
  5. data/bin/masamune-elastic-mapreduce +4 -0
  6. data/bin/masamune-hive +4 -0
  7. data/bin/masamune-psql +4 -0
  8. data/bin/masamune-shell +4 -0
  9. data/lib/masamune.rb +56 -0
  10. data/lib/masamune/accumulate.rb +60 -0
  11. data/lib/masamune/actions.rb +38 -0
  12. data/lib/masamune/actions/data_flow.rb +131 -0
  13. data/lib/masamune/actions/date_parse.rb +75 -0
  14. data/lib/masamune/actions/elastic_mapreduce.rb +68 -0
  15. data/lib/masamune/actions/execute.rb +52 -0
  16. data/lib/masamune/actions/filesystem.rb +37 -0
  17. data/lib/masamune/actions/hadoop_filesystem.rb +40 -0
  18. data/lib/masamune/actions/hadoop_streaming.rb +41 -0
  19. data/lib/masamune/actions/hive.rb +74 -0
  20. data/lib/masamune/actions/postgres.rb +76 -0
  21. data/lib/masamune/actions/postgres_admin.rb +34 -0
  22. data/lib/masamune/actions/s3cmd.rb +44 -0
  23. data/lib/masamune/actions/transform.rb +89 -0
  24. data/lib/masamune/after_initialize_callbacks.rb +55 -0
  25. data/lib/masamune/cached_filesystem.rb +110 -0
  26. data/lib/masamune/commands.rb +37 -0
  27. data/lib/masamune/commands/elastic_mapreduce.rb +119 -0
  28. data/lib/masamune/commands/hadoop_filesystem.rb +57 -0
  29. data/lib/masamune/commands/hadoop_streaming.rb +116 -0
  30. data/lib/masamune/commands/hive.rb +178 -0
  31. data/lib/masamune/commands/interactive.rb +37 -0
  32. data/lib/masamune/commands/postgres.rb +128 -0
  33. data/lib/masamune/commands/postgres_admin.rb +72 -0
  34. data/lib/masamune/commands/postgres_common.rb +33 -0
  35. data/lib/masamune/commands/retry_with_backoff.rb +60 -0
  36. data/lib/masamune/commands/s3cmd.rb +70 -0
  37. data/lib/masamune/commands/shell.rb +202 -0
  38. data/lib/masamune/configuration.rb +195 -0
  39. data/lib/masamune/data_plan.rb +31 -0
  40. data/lib/masamune/data_plan/builder.rb +66 -0
  41. data/lib/masamune/data_plan/elem.rb +190 -0
  42. data/lib/masamune/data_plan/engine.rb +162 -0
  43. data/lib/masamune/data_plan/rule.rb +292 -0
  44. data/lib/masamune/data_plan/set.rb +176 -0
  45. data/lib/masamune/environment.rb +164 -0
  46. data/lib/masamune/filesystem.rb +567 -0
  47. data/lib/masamune/has_environment.rb +40 -0
  48. data/lib/masamune/helpers.rb +27 -0
  49. data/lib/masamune/helpers/postgres.rb +84 -0
  50. data/lib/masamune/io.rb +33 -0
  51. data/lib/masamune/last_element.rb +53 -0
  52. data/lib/masamune/method_logger.rb +41 -0
  53. data/lib/masamune/multi_io.rb +39 -0
  54. data/lib/masamune/schema.rb +36 -0
  55. data/lib/masamune/schema/catalog.rb +233 -0
  56. data/lib/masamune/schema/column.rb +527 -0
  57. data/lib/masamune/schema/dimension.rb +133 -0
  58. data/lib/masamune/schema/event.rb +121 -0
  59. data/lib/masamune/schema/fact.rb +133 -0
  60. data/lib/masamune/schema/map.rb +265 -0
  61. data/lib/masamune/schema/row.rb +133 -0
  62. data/lib/masamune/schema/store.rb +115 -0
  63. data/lib/masamune/schema/table.rb +308 -0
  64. data/lib/masamune/schema/table_reference.rb +76 -0
  65. data/lib/masamune/spec_helper.rb +23 -0
  66. data/lib/masamune/string_format.rb +34 -0
  67. data/lib/masamune/tasks/elastic_mapreduce_thor.rb +60 -0
  68. data/lib/masamune/tasks/hive_thor.rb +55 -0
  69. data/lib/masamune/tasks/postgres_thor.rb +47 -0
  70. data/lib/masamune/tasks/shell_thor.rb +63 -0
  71. data/lib/masamune/template.rb +77 -0
  72. data/lib/masamune/thor.rb +186 -0
  73. data/lib/masamune/thor_loader.rb +38 -0
  74. data/lib/masamune/topological_hash.rb +34 -0
  75. data/lib/masamune/transform.rb +47 -0
  76. data/lib/masamune/transform/bulk_upsert.psql.erb +64 -0
  77. data/lib/masamune/transform/bulk_upsert.rb +52 -0
  78. data/lib/masamune/transform/consolidate_dimension.rb +54 -0
  79. data/lib/masamune/transform/deduplicate_dimension.psql.erb +52 -0
  80. data/lib/masamune/transform/deduplicate_dimension.rb +53 -0
  81. data/lib/masamune/transform/define_event_view.hql.erb +51 -0
  82. data/lib/masamune/transform/define_event_view.rb +60 -0
  83. data/lib/masamune/transform/define_index.psql.erb +34 -0
  84. data/lib/masamune/transform/define_schema.hql.erb +23 -0
  85. data/lib/masamune/transform/define_schema.psql.erb +79 -0
  86. data/lib/masamune/transform/define_schema.rb +56 -0
  87. data/lib/masamune/transform/define_table.hql.erb +34 -0
  88. data/lib/masamune/transform/define_table.psql.erb +95 -0
  89. data/lib/masamune/transform/define_table.rb +40 -0
  90. data/lib/masamune/transform/define_unique.psql.erb +30 -0
  91. data/lib/masamune/transform/insert_reference_values.psql.erb +43 -0
  92. data/lib/masamune/transform/insert_reference_values.rb +64 -0
  93. data/lib/masamune/transform/load_dimension.rb +47 -0
  94. data/lib/masamune/transform/load_fact.rb +45 -0
  95. data/lib/masamune/transform/operator.rb +96 -0
  96. data/lib/masamune/transform/relabel_dimension.psql.erb +76 -0
  97. data/lib/masamune/transform/relabel_dimension.rb +39 -0
  98. data/lib/masamune/transform/rollup_fact.psql.erb +79 -0
  99. data/lib/masamune/transform/rollup_fact.rb +149 -0
  100. data/lib/masamune/transform/snapshot_dimension.psql.erb +75 -0
  101. data/lib/masamune/transform/snapshot_dimension.rb +74 -0
  102. data/lib/masamune/transform/stage_dimension.psql.erb +39 -0
  103. data/lib/masamune/transform/stage_dimension.rb +83 -0
  104. data/lib/masamune/transform/stage_fact.psql.erb +80 -0
  105. data/lib/masamune/transform/stage_fact.rb +111 -0
  106. data/lib/masamune/version.rb +25 -0
  107. data/spec/fixtures/aggregate.sql.erb +25 -0
  108. data/spec/fixtures/comment.sql.erb +27 -0
  109. data/spec/fixtures/invalid.sql.erb +23 -0
  110. data/spec/fixtures/relative.sql.erb +23 -0
  111. data/spec/fixtures/simple.sql.erb +28 -0
  112. data/spec/fixtures/whitespace.sql.erb +30 -0
  113. data/spec/masamune/actions/elastic_mapreduce_spec.rb +108 -0
  114. data/spec/masamune/actions/execute_spec.rb +50 -0
  115. data/spec/masamune/actions/hadoop_filesystem_spec.rb +44 -0
  116. data/spec/masamune/actions/hadoop_streaming_spec.rb +74 -0
  117. data/spec/masamune/actions/hive_spec.rb +117 -0
  118. data/spec/masamune/actions/postgres_admin_spec.rb +58 -0
  119. data/spec/masamune/actions/postgres_spec.rb +134 -0
  120. data/spec/masamune/actions/s3cmd_spec.rb +44 -0
  121. data/spec/masamune/actions/transform_spec.rb +144 -0
  122. data/spec/masamune/after_initialization_callbacks_spec.rb +61 -0
  123. data/spec/masamune/cached_filesystem_spec.rb +167 -0
  124. data/spec/masamune/commands/hadoop_filesystem_spec.rb +50 -0
  125. data/spec/masamune/commands/hadoop_streaming_spec.rb +106 -0
  126. data/spec/masamune/commands/hive_spec.rb +117 -0
  127. data/spec/masamune/commands/postgres_admin_spec.rb +69 -0
  128. data/spec/masamune/commands/postgres_spec.rb +100 -0
  129. data/spec/masamune/commands/retry_with_backoff_spec.rb +116 -0
  130. data/spec/masamune/commands/s3cmd_spec.rb +50 -0
  131. data/spec/masamune/commands/shell_spec.rb +101 -0
  132. data/spec/masamune/configuration_spec.rb +102 -0
  133. data/spec/masamune/data_plan/builder_spec.rb +91 -0
  134. data/spec/masamune/data_plan/elem_spec.rb +102 -0
  135. data/spec/masamune/data_plan/engine_spec.rb +356 -0
  136. data/spec/masamune/data_plan/rule_spec.rb +407 -0
  137. data/spec/masamune/data_plan/set_spec.rb +517 -0
  138. data/spec/masamune/environment_spec.rb +65 -0
  139. data/spec/masamune/filesystem_spec.rb +1421 -0
  140. data/spec/masamune/helpers/postgres_spec.rb +95 -0
  141. data/spec/masamune/schema/catalog_spec.rb +613 -0
  142. data/spec/masamune/schema/column_spec.rb +696 -0
  143. data/spec/masamune/schema/dimension_spec.rb +137 -0
  144. data/spec/masamune/schema/event_spec.rb +75 -0
  145. data/spec/masamune/schema/fact_spec.rb +117 -0
  146. data/spec/masamune/schema/map_spec.rb +593 -0
  147. data/spec/masamune/schema/row_spec.rb +28 -0
  148. data/spec/masamune/schema/store_spec.rb +49 -0
  149. data/spec/masamune/schema/table_spec.rb +395 -0
  150. data/spec/masamune/string_format_spec.rb +60 -0
  151. data/spec/masamune/tasks/elastic_mapreduce_thor_spec.rb +57 -0
  152. data/spec/masamune/tasks/hive_thor_spec.rb +75 -0
  153. data/spec/masamune/tasks/postgres_thor_spec.rb +42 -0
  154. data/spec/masamune/tasks/shell_thor_spec.rb +51 -0
  155. data/spec/masamune/template_spec.rb +77 -0
  156. data/spec/masamune/thor_spec.rb +238 -0
  157. data/spec/masamune/transform/bulk_upsert.dimension_spec.rb +200 -0
  158. data/spec/masamune/transform/consolidate_dimension_spec.rb +62 -0
  159. data/spec/masamune/transform/deduplicate_dimension_spec.rb +84 -0
  160. data/spec/masamune/transform/define_event_view_spec.rb +84 -0
  161. data/spec/masamune/transform/define_schema_spec.rb +83 -0
  162. data/spec/masamune/transform/define_table.dimension_spec.rb +306 -0
  163. data/spec/masamune/transform/define_table.fact_spec.rb +291 -0
  164. data/spec/masamune/transform/define_table.table_spec.rb +525 -0
  165. data/spec/masamune/transform/insert_reference_values.dimension_spec.rb +111 -0
  166. data/spec/masamune/transform/insert_reference_values.fact_spec.rb +149 -0
  167. data/spec/masamune/transform/load_dimension_spec.rb +76 -0
  168. data/spec/masamune/transform/load_fact_spec.rb +89 -0
  169. data/spec/masamune/transform/relabel_dimension_spec.rb +102 -0
  170. data/spec/masamune/transform/rollup_fact_spec.rb +333 -0
  171. data/spec/masamune/transform/snapshot_dimension_spec.rb +103 -0
  172. data/spec/masamune/transform/stage_dimension_spec.rb +115 -0
  173. data/spec/masamune/transform/stage_fact_spec.rb +204 -0
  174. data/spec/masamune_spec.rb +32 -0
  175. data/spec/spec_helper.rb +41 -0
  176. data/spec/support/masamune/example_group.rb +36 -0
  177. data/spec/support/masamune/mock_command.rb +99 -0
  178. data/spec/support/masamune/mock_delegate.rb +51 -0
  179. data/spec/support/masamune/mock_filesystem.rb +96 -0
  180. data/spec/support/masamune/thor_mute.rb +35 -0
  181. data/spec/support/rspec/example/action_example_group.rb +34 -0
  182. data/spec/support/rspec/example/task_example_group.rb +80 -0
  183. data/spec/support/rspec/example/transform_example_group.rb +36 -0
  184. data/spec/support/shared_examples/postgres_common_examples.rb +53 -0
  185. metadata +462 -0
@@ -0,0 +1,164 @@
1
+ # The MIT License (MIT)
2
+ #
3
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
+ require 'thread'
24
+ require 'tmpdir'
25
+ require 'logger'
26
+
27
+ require 'masamune/version'
28
+ require 'masamune/multi_io'
29
+
30
module Masamune
  # Shared runtime context: lazily builds and memoizes the configuration,
  # filesystem, catalog, helpers and logger, and offers file-based locking
  # plus console/trace output helpers.
  class Environment
    attr_accessor :parent
    # Only writers are generated here; the readers below add lazy defaults.
    attr_writer :filesystem, :catalog

    # @param parent [Object, nil] optional owner of this environment
    def initialize(parent = nil)
      self.parent = parent
    end

    # @return [String] program name followed by Masamune::VERSION
    def version
      "masamune #{Masamune::VERSION}"
    end

    # Yields the configuration object to the caller's block.
    def configure
      yield configuration
    end

    def configuration
      @configuration ||= Masamune::Configuration.new(self)
    end

    def mutex
      @mutex ||= Mutex.new
    end

    # Runs the block while holding a non-blocking exclusive flock on
    # "<name>[:<configuration.lock>].lock" under the :run_dir path.
    # When the lock cannot be acquired the block is skipped and an error is
    # logged.
    #
    # @raise [RuntimeError] when the filesystem has no :run_dir path
    # @return the block's value when the lock was acquired
    def with_exclusive_lock(name, &block)
      raise 'filesystem path :run_dir not defined' unless filesystem.has_path?(:run_dir)
      lock_name = [name, configuration.lock].compact.join(':')
      logger.debug("acquiring lock '#{lock_name}'")
      handle = lock_file(lock_name)
      if handle.flock(File::LOCK_EX | File::LOCK_NB) == 0
        yield if block_given?
      else
        logger.error "acquire lock attempt failed for '#{lock_name}'"
      end
    ensure
      if handle
        logger.debug("releasing lock '#{lock_name}'")
        begin
          handle.flock(File::LOCK_UN)
        ensure
          # Close the descriptor so repeated locking does not leak handles.
          handle.close
        end
      end
    end

    # @return [String] explicit template, or "<epoch seconds>-<pid>.log"
    def log_file_template
      @log_file_template || "#{Time.now.to_i}-#{Process.pid}.log"
    end

    # Changing the template discards the memoized logger and log file name.
    def log_file_template=(log_file_template)
      @log_file_template = log_file_template
      reload_logger!
    end

    def reload_logger!
      @logger = @log_file_name = nil
    end

    def log_file_name
      @log_file_name ||= filesystem.get_path(:log_dir, log_file_template)
    end

    # Builds the logger on first use and then refreshes the 'latest' symlink.
    # The logger is assigned BEFORE symlinking so that a symlink failure,
    # which is reported through #logger, cannot re-enter this method and
    # recurse.
    def logger
      return @logger if @logger
      @logger = Logger.new(log_file_io)
      symlink_latest_log
      @logger
    end

    # Logs the joined arguments and echoes them to stdout unless the
    # configuration is quiet or in debug mode.
    def console(*a)
      log_and_print(a) { !(configuration.quiet || configuration.debug) }
    end

    # Logs the joined arguments and echoes them to stdout only when the
    # configuration is verbose and not in debug mode.
    def trace(*a)
      log_and_print(a) { configuration.verbose && !configuration.debug }
    end

    def filesystem
      @filesystem ||= begin
        fs = Masamune::Filesystem.new
        fs.add_path :root_dir, File.expand_path('../../../', __FILE__)
        Masamune::MethodLogger.new(fs, :copy_file_to_file, :copy_file_to_dir, :remove_dir, :move_file_to_file, :move_file_to_dir, :move_dir)
      end
    end

    def catalog
      @catalog ||= Masamune::Schema::Catalog.new(self)
    end

    def hive_helper
      @hive_helper ||= Masamune::Helpers::Hive.new(self)
    end

    def postgres_helper
      @postgres_helper ||= Masamune::Helpers::Postgres.new(self)
    end

    private

    # Shared body of #console and #trace; the block decides whether the line
    # is echoed to stdout. Output is serialized through #mutex.
    def log_and_print(parts)
      line = parts.join(' ').chomp
      mutex.synchronize do
        logger.info(line)
        $stdout.puts line if yield
        $stdout.flush
        $stderr.flush
      end
    end

    # Opens (creating if needed) the lock file for +name+ under :run_dir.
    def lock_file(name)
      path = filesystem.get_path(:run_dir, "#{name}.lock")
      File.open(path, File::CREAT, 0644)
    end

    # Chooses the logger sink: the log file when :log_dir is defined (teed to
    # stderr in debug mode), otherwise stderr in debug mode or nil.
    def log_file_io
      if filesystem.has_path?(:log_dir)
        log_file = File.open(log_file_name, 'a')
        log_file.sync = true
        configuration.debug ? Masamune::MultiIO.new($stderr, log_file) : log_file
      else
        configuration.debug ? $stderr : nil
      end
    end

    # Points <log_dir>/latest at the current log file. Failures are logged,
    # never raised.
    def symlink_latest_log
      return unless filesystem.has_path?(:log_dir)
      latest = filesystem.path(:log_dir, 'latest')
      # File.symlink? also catches a dangling link, which File.exist? misses.
      FileUtils.rm(latest) if File.symlink?(latest) || File.exist?(latest)
      FileUtils.ln_s(log_file_name, latest)
    rescue => e
      logger.error(e)
    end
  end
end
@@ -0,0 +1,567 @@
1
+ # The MIT License (MIT)
2
+ #
3
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
+ require 'masamune/has_environment'
24
+
25
+ module Masamune
26
+ class Filesystem
27
+ include Masamune::HasEnvironment
28
+ include Masamune::Accumulate
29
+ include Masamune::Actions::S3Cmd
30
+ include Masamune::Actions::HadoopFilesystem
31
+
32
+ FILE_MODE = 0777 - File.umask
33
+
34
# Starts with an empty path registry and an empty immutable-path table.
def initialize
  @paths, @immutable_paths = {}, {}
end
38
+
39
# No-op in this class; always returns nil.
def clear!
  # Intentionally unimplemented
end
42
+
43
# Registers +path+ under +symbol+ (stored as a Symbol).
#
# @param symbol [#to_sym] name of the path
# @param path [String, Proc] literal path, or a callable resolved immediately
# @param options [Hash, nil] :mkdir creates the directory right away;
#   :immutable marks the path as protected from modification
# @return [self]
def add_path(symbol, path, options = {})
  # Work on a symbolized copy so the caller's hash is never mutated.
  options = (options || {}).symbolize_keys
  eager_path = eager_load_path(path)
  @paths[symbol.to_sym] = [eager_path, options]
  mkdir!(eager_path) if options[:mkdir]
  add_immutable_path(eager_path) if options[:immutable]
  self
end
52
+
53
# Resolves the path registered under +symbol+, optionally joined with +extra+
# segments, with %param placeholders expanded.
#
# Returns the resolved String when paths are loaded eagerly; otherwise
# returns a lambda taking a filesystem and resolving against it later.
#
# @raise [RuntimeError] (on resolution) when +symbol+ is not registered
def get_path(symbol, *extra)
  resolver = lambda do |fs|
    raise "Path :#{symbol} not defined" unless fs.has_path?(symbol)
    base, opts = fs.paths[symbol]

    mkdir!(base) if opts[:mkdir]
    target = extra.any? ? File.join(base, extra) : base
    expand_params(fs, target)
  end

  return resolver unless eager_load_paths?
  eager_load_path(resolver.call(self))
end
alias :path :get_path
69
+
70
# @return [Boolean] whether a path is registered under exactly this key
def has_path?(symbol)
  @paths.key?(symbol)
end
73
+
74
# @return [Hash] the registry written by #add_path: Symbol => [path, options]
def paths
  @paths
end
77
+
78
# Returns a copy of +path+ in which every "%<key>" occurrence is replaced by
# the matching value from fs.environment.configuration.params.
# The input string is not modified.
def expand_params(fs, path)
  params = fs.environment.configuration.params
  params.reduce(path.dup) do |expanded, (key, value)|
    expanded.gsub("%#{key}", value.to_s)
  end
end
85
+
86
# A path is relative when it has no remote prefix and does not start with '/'.
def relative_path?(path)
  !remote_prefix(path) && !path.start_with?('/')
end
90
+
91
# Yields each ancestor of +path+ from the outermost inwards, stopping before
# +path+ itself. For prefixed paths (see #remote_prefix) the prefix is kept on
# every yielded ancestor.
def parent_paths(path, &block)
  prefix = remote_prefix(path)
  node = prefix ? path.split(prefix).last : path
  prefix ||= ''

  return if prefix.blank? && node.blank?
  segments = node ? File.expand_path(node, '/').split('/').reject { |part| part.blank? } : []
  segments.unshift('/') if node =~ %r{\A/}
  segments.each_index do |index|
    candidate = prefix + File.join(segments[0..index])
    break if candidate == path
    yield candidate
  end
end
111
+ method_accumulate :parent_paths
112
+
113
# @return [Boolean] true when +path+ has no parent paths
# @raise [ArgumentError] when +path+ is nil, blank or relative
def root_path?(path)
  problem =
    if path.nil?
      'path cannot be nil'
    elsif path.blank?
      'path cannot be blank'
    elsif relative_path?(path)
      'path cannot be relative'
    end
  raise ArgumentError, problem if problem
  parent_paths(path).empty?
end
119
+
120
# Returns the first candidate in +paths+ that is an existing regular file on
# the local filesystem, or nil when none is.
#
# File.file? already implies existence, so the separate File.exists? call
# (removed in Ruby 3.2) is dropped.
def resolve_file(paths = [])
  Array.wrap(paths).find { |path| File.file?(path) }
end
123
+
124
# Returns the closest parent of +path+, or +path+ itself when it has none.
def dirname(path)
  closest_parent = parent_paths(path).last
  closest_parent || path
end
127
+
128
# Returns the final '/'-separated component of +path+ after removing any
# remote prefix; nil when +path+ is nil or nothing remains after the prefix.
def basename(path)
  return unless path
  prefix = remote_prefix(path)
  node = prefix ? path.split(prefix).last : path
  return if node.nil? || node.blank?
  node.split('/').last
end
134
+
135
# Creates each of +files+ (duplicates ignored) on whichever backend its path
# belongs to. Parent directories are created first for non-S3 paths.
#
# * :hdfs  - `hadoop fs -touchz`
# * :s3    - uploads an empty temporary file to each target
# * :local - FileUtils.touch followed by chmod to FILE_MODE
def touch!(*files)
  files.uniq.group_by { |path| type(path) }.each do |kind, file_set|
    mkdir!(*file_set.map { |file| File.dirname(file) }) unless kind == :s3
    case kind
    when :hdfs
      hadoop_fs('-touchz', *file_set)
    when :s3
      empty = Tempfile.new('masamune')
      begin
        file_set.each do |file|
          s3cmd('put', empty.path, s3b(file, dir: false))
        end
      ensure
        # Release the temporary file right away instead of waiting for GC.
        empty.close!
      end
    when :local
      # FileUtils takes keyword options; a positional Hash raises on Ruby 3.
      FileUtils.touch(file_set, **file_util_args)
      FileUtils.chmod(FILE_MODE, file_set, **file_util_args)
    end
  end
end
153
+
154
# Reports whether +file+ exists on its backend.
#
# * :hdfs  - `hadoop fs -test -e`
# * :s3    - true when an `s3cmd ls` line names exactly +file+ in its
#            fourth whitespace-separated column
# * :local - File.exist? (File.exists? was removed in Ruby 3.2)
def exists?(file)
  case type(file)
  when :hdfs
    hadoop_fs('-test', '-e', file, safe: true).success?
  when :s3
    found = false
    s3cmd('ls', s3b(file), safe: true) do |line|
      # Listing columns are: date, time, size, name.
      found ||= (line.split(/\s+/)[3] == file)
    end
    found
  when :local
    File.exist?(file)
  end
end
169
+
170
+ def glob_stat(pattern, &block)
171
+ case type(pattern)
172
+ when :hdfs
173
+ hadoop_fs('-ls', '-R', pattern, safe: true) do |line|
174
+ next if line =~ /\AFound \d+ items/
175
+ size, date, time, name = line.split(/\s+/).last(4)
176
+ next unless size && date && time && name
177
+ prefixed_name = remote_prefix(pattern) + name
178
+ yield OpenStruct.new(name: prefixed_name, mtime: Time.parse("#{date} #{time} +0000").at_beginning_of_minute.utc, size: size.to_i)
179
+ end
180
+ when :s3
181
+ file_glob, file_regexp = glob_split(pattern, recursive: true)
182
+ s3cmd('ls', '--recursive', s3b(file_glob), safe: true) do |line|
183
+ next if line =~ /\$folder$/
184
+ date, time, size, name = line.split(/\s+/)
185
+ next unless size && date && time && name
186
+ next unless name =~ file_regexp
187
+ yield OpenStruct.new(name: name, mtime: Time.parse("#{date} #{time} +0000").at_beginning_of_minute.utc, size: size.to_i)
188
+ end
189
+ when :local
190
+ Dir.glob(pattern.gsub(%r{/\*\Z}, '/**/*')) do |file|
191
+ stat = File.stat(file)
192
+ yield OpenStruct.new(name: file, mtime: stat.mtime.at_beginning_of_minute.utc, size: stat.size.to_i)
193
+ end
194
+ end
195
+ end
196
+ method_accumulate :glob_stat
197
+
198
# Summarizes a single file or directory (no wildcards allowed).
#
# Returns nil when nothing matches and the lone entry when exactly one does;
# otherwise an OpenStruct named +file_or_dir+ carrying the newest mtime and
# the summed size of all entries.
#
# @raise [ArgumentError] when +file_or_dir+ contains '*'
def stat(file_or_dir)
  raise ArgumentError, 'cannot contain wildcard' if file_or_dir.include?('*')
  entries = glob_stat(file_or_dir)
  return unless entries.any?
  return entries.first if entries.size == 1
  newest = entries.map { |entry| entry.try(:mtime) }.compact.max
  total = entries.map { |entry| entry.try(:size) }.compact.reduce(:+)
  OpenStruct.new(name: file_or_dir, mtime: newest, size: total)
end
207
+
208
# Creates each of +dirs+ (duplicates ignored) on its backend.
#
# * :hdfs  - `hadoop fs -mkdir -p`
# * :s3    - touches a '.not_empty' marker file inside each directory
# * :local - FileUtils.mkdir_p
def mkdir!(*dirs)
  dirs.uniq.group_by { |dir| type(dir) }.each do |kind, dir_set|
    case kind
    when :hdfs
      hadoop_fs('-mkdir', '-p', *dir_set)
    when :s3
      touch!(*dir_set.map { |dir| File.join(dir, '.not_empty') })
    when :local
      # FileUtils takes keyword options; a positional Hash raises on Ruby 3.
      FileUtils.mkdir_p(dir_set, **file_util_args)
    end
  end
end
221
+
222
# Yields the name of every entry matching +pattern+.
#
# * :hdfs  - parses `hadoop fs -ls` output; names are prefixed and filtered
#            with the pattern's regexp
# * :s3    - parses `s3cmd ls --recursive` output, skipping '$folder$' lines
# * :local - Dir.glob, with a trailing '/*' widened to '/**/*'
def glob(pattern, &block)
  case type(pattern)
  when :hdfs
    _, matcher = glob_split(pattern)
    hadoop_fs('-ls', pattern, safe: true) do |line|
      next if line =~ /\AFound \d+ items/
      entry = line.split(/\s+/).last
      next unless entry
      qualified = remote_prefix(pattern) + entry
      next unless qualified && qualified =~ matcher
      yield q(pattern, qualified)
    end
  when :s3
    listing_root, matcher = glob_split(pattern)
    s3cmd('ls', '--recursive', s3b(listing_root), safe: true) do |line|
      next if line =~ /\$folder$/
      entry = line.split(/\s+/).last
      next unless entry && entry =~ matcher
      yield q(pattern, entry)
    end
  when :local
    Dir.glob(pattern.gsub(%r{/\*\Z}, '/**/*')) { |file| yield file }
  end
end
248
+ method_accumulate :glob
249
+
250
# Returns the #glob results, sorted by file basename when
# options[:order] is :basename and left in listing order otherwise.
def glob_sort(pattern, options = {})
  matches = glob(pattern)
  return matches unless options[:order] == :basename
  matches.sort { |left, right| File.basename(left) <=> File.basename(right) }
end
259
+
260
# Copies file +src+ to file +dst+. The destination must not be immutable;
# its parent directory is created first unless the destination is on S3.
def copy_file_to_file(src, dst)
  check_immutable_path!(dst)
  destination_on_s3 = type(dst) == :s3
  mkdir!(dirname(dst)) unless destination_on_s3
  copy_file_helper(src, dst, false)
end
265
+
266
# Copies file +src+ into directory +dst+. The directory is created first
# unless it is on S3. Returns false without copying when +src+ already lives
# in +dst+.
def copy_file_to_dir(src, dst)
  check_immutable_path!(dst)
  mkdir!(dst) if type(dst) != :s3
  if dirname(src) == dst
    false
  else
    copy_file_helper(src, dst, true)
  end
end
272
+
273
# Recursively copies directory +src+ to +dst+, dispatching on the
# (source, destination) backend pair. The destination must not be immutable.
# Every pair involving HDFS delegates to #copy_file_to_dir.
def copy_dir(src, dst)
  check_immutable_path!(dst)
  case [type(src), type(dst)]
  when [:hdfs, :hdfs], [:hdfs, :local], [:hdfs, :s3], [:s3, :hdfs], [:local, :hdfs]
    copy_file_to_dir(src, dst)
  when [:s3, :s3]
    s3cmd('cp', '--recursive', s3b(src, dir: true), s3b(dst, dir: true))
  when [:s3, :local]
    # Download into a sub-directory named after the last source segment.
    fixed_dst = File.join(dst, src.split('/')[-1])
    FileUtils.mkdir_p(fixed_dst, **file_util_args)
    s3cmd('get', '--recursive', '--skip-existing', s3b(src, dir: true), fixed_dst)
  when [:local, :local]
    # FileUtils takes keyword options; a positional Hash raises on Ruby 3.
    FileUtils.mkdir_p(dst, **file_util_args)
    FileUtils.cp_r(src, dst, **file_util_args)
  when [:local, :s3]
    s3cmd('put', '--recursive', src, s3b(dst, dir: true))
  end
end
299
+
300
# Deletes +file+ on its backend after checking it is not under an immutable
# path.
def remove_file(file)
  check_immutable_path!(file)
  case type(file)
  when :hdfs
    hadoop_fs('-rm', file)
  when :s3
    s3cmd('del', s3b(file, dir: false))
  when :local
    # FileUtils takes keyword options; a positional Hash raises on Ruby 3.
    FileUtils.rm(file, **file_util_args)
  end
end
311
+
312
# Recursively deletes +dir+ on its backend.
#
# @raise [RuntimeError] when +dir+ is a root path or under an immutable path
def remove_dir(dir)
  raise "#{dir} is root path, cannot remove" if root_path?(dir)
  check_immutable_path!(dir)
  case type(dir)
  when :hdfs
    hadoop_fs('-rmr', dir)
  when :s3
    s3cmd('del', '--recursive', s3b(dir, dir: true))
    # Also remove the "<dir>_$folder$" marker object.
    s3cmd('del', '--recursive', s3b("#{dir}_$folder$"))
  when :local
    # FileUtils takes keyword options; a positional Hash raises on Ruby 3.
    FileUtils.rmtree(dir, **file_util_args)
  end
end
325
+
326
# Moves file +src+ to file +dst+. The source must not be immutable; the
# destination's parent directory is created first unless it is on S3.
def move_file_to_file(src, dst)
  check_immutable_path!(src)
  destination_on_s3 = type(dst) == :s3
  mkdir!(dirname(dst)) unless destination_on_s3
  move_file_helper(src, dst, false)
end
331
+
332
# Moves file +src+ into directory +dst+. The source must not be immutable;
# the directory is created first unless it is on S3.
def move_file_to_dir(src, dst)
  check_immutable_path!(src)
  mkdir!(dst) if type(dst) != :s3
  move_file_helper(src, dst, true)
end
337
+
338
# Moves directory +src+ to +dst+, dispatching on the (source, destination)
# backend pair. The source must not be immutable. Pairs without a native move
# are handled as a copy followed by removal of the source.
def move_dir(src, dst)
  check_immutable_path!(src)
  case [type(src), type(dst)]
  when [:hdfs, :hdfs], [:local, :local], [:local, :hdfs]
    move_file_to_file(src, dst)
  when [:hdfs, :local], [:s3, :hdfs]
    copy_file_to_dir(src, dst)
    remove_dir(src)
  when [:s3, :s3]
    s3cmd('mv', '--recursive', d(src), f(dst))
  when [:s3, :local]
    s3cmd('get', '--recursive', d(src), f(dst))
    remove_dir(src)
  when [:hdfs, :s3]
    copy_file_to_dir(src, d(dst))
    remove_dir(src)
  when [:local, :s3]
    s3cmd('put', '--recursive', d(src), d(dst))
    remove_dir(src)
  end
end
366
+
367
# Concatenates the contents of the given local files, in order, into a
# StringIO rewound to its start. Non-local paths, missing files and
# directories are skipped.
#
# @return [StringIO]
def cat(*files)
  StringIO.new.tap do |buf|
    files.group_by { |path| type(path) }.each do |kind, file_set|
      next unless kind == :local
      file_set.each do |file|
        # File.exists? was removed in Ruby 3.2.
        next unless File.exist?(file)
        next if File.directory?(file)
        buf << File.read(file)
      end
    end
    buf.rewind
  end
end
382
+
383
# Writes +buf+ to the local file +dst+, creating its parent directory first
# and truncating any existing file. Non-local destinations are ignored and
# yield nil.
def write(buf, dst)
  return unless type(dst) == :local
  mkdir!(File.dirname(dst))
  File.open(dst, 'w') { |file| file.write buf }
end
392
+
393
# Recursively changes ownership of +files+. A trailing Hash may carry :user
# and :group; each falls back to the current user/group when its key is
# absent. S3 paths are skipped.
def chown!(*files)
  opts = files.last.is_a?(Hash) ? files.pop : {}
  # Block form: the current user/group is only looked up when needed.
  user = opts.fetch(:user) { current_user }
  group = opts.fetch(:group) { current_group }

  files.group_by { |path| type(path) }.each do |kind, file_set|
    case kind
    when :hdfs
      hadoop_fs('-chown', '-R', [user, group].compact.join(':'), *file_set)
    when :s3
      # NOTE intentionally skip
    when :local
      # FileUtils takes keyword options; a positional Hash raises on Ruby 3.
      FileUtils.chown_R(user, group, file_set, **file_util_args)
    end
  end
end
408
+
409
# Creates an empty, randomly named file under the path registered as +path+
# and returns its name.
#
# The name uses the URL-safe alphabet: plain base64 can contain '/', which
# would silently turn the random name into nested directories.
def mktemp!(path)
  get_path(path, SecureRandom.urlsafe_base64).tap do |file|
    touch!(file)
  end
end
414
+
415
# Creates a randomly named directory under the path registered as +path+ and
# returns its name.
#
# The name uses the URL-safe alphabet: plain base64 can contain '/', which
# would silently turn the random name into nested directories.
def mktempdir!(path)
  get_path(path, SecureRandom.urlsafe_base64).tap do |dir|
    mkdir!(dir)
  end
end
420
+
421
# Splits a glob into [listing prefix, matching regexp].
#
# The prefix is everything up to and including the first '*', or the whole
# input when it has no wildcard. The regexp comes from #glob_to_regexp.
# Slicing at the first '*' also handles an input of just '*', where
# String#split returns an empty array.
def glob_split(input, options = {})
  first_star = input.index('*')
  listing_prefix = first_star ? input[0...first_star] + '*' : input
  [listing_prefix, glob_to_regexp(input, options)]
end
424
+
425
# Converts a glob into a Regexp anchored at the start of the string.
#
# Without a wildcard (and without options[:recursive]) the match is exact.
# Otherwise each '*' becomes a lazy '.*?', a trailing '/*' also accepts the
# bare directory, and the regexp is left open at the end.
def glob_to_regexp(input, options = {})
  escaped = Regexp.escape(input)
  unless input.include?('*') || options.fetch(:recursive, false)
    return /\A#{escaped}\z/
  end
  body = escaped.gsub('\\*', '.*?').gsub(%r{\/\.\*\?\z}, '/?.*?')
  %r|\A#{body}|
end
432
+
433
+ private
434
+
435
# Resolves a path definition: Strings are returned unchanged, Procs are
# called with this filesystem.
#
# @raise [RuntimeError] for any other kind of value
def eager_load_path(path)
  return path if path.is_a?(String)
  return path.call(self) if path.is_a?(Proc)
  raise "Unknown path #{path.inspect}"
end
445
+
446
# Returns the scheme prefix of +dir+, or nil for plain paths:
# "s3://bucket" / "s3n://bucket" for S3 locations, "file://" or "hdfs://"
# for those schemes.
def remote_prefix(dir)
  [
    %r{\As3n?://.*?(?=/)},
    %r{\As3n?://.*?\Z},
    %r{\Afile://},
    %r{\Ahdfs://}
  ].each do |pattern|
    found = dir[pattern]
    return found if found
  end
  nil
end
452
+
453
# Prepends "file://" to +file+ unless it already carries a remote prefix.
def local_prefix(file)
  remote_prefix(file) ? file : "file://#{file}"
end
457
+
458
# True once any path other than :root_dir has been registered.
def eager_load_paths?
  @paths.any? { |key, _| key != :root_dir }
end
461
+
462
# Classifies +path+ by its scheme: "file://" and "hdfs://" map to :hdfs,
# "s3://" and "s3n://" map to :s3, everything else is :local.
def type(path)
  case path
  when %r{\A(?:file|hdfs)://} then :hdfs
  when %r{\As3n?://} then :s3
  else :local
  end
end
472
+
473
# Options handed to FileUtils calls, taken from the configuration's
# no_op and verbose settings.
def file_util_args
  { noop: configuration.no_op, verbose: configuration.verbose }
end
476
+
477
# Prefixes +file+ with the remote prefix of +dir+ when +dir+ has one and
# +file+ does not already start with it; otherwise returns +file+ unchanged.
# Leading slashes on +file+ are collapsed into the single joining '/'.
def qualify_file(dir, file)
  prefix = remote_prefix(dir)
  return file unless prefix
  return file if file.start_with?(prefix)
  "#{prefix}/#{file.sub(%r{\A/+}, '')}"
end
alias :q :qualify_file
485
+
486
# Returns +dir+ with a trailing '/' (File.join avoids doubling an existing
# one).
def ensure_dir(dir)
  File.join(dir, '/')
end
alias :d :ensure_dir
490
+
491
# Returns +file+ with one trailing '/' removed, if present.
def ensure_file(file)
  file.chomp('/')
end
alias :f :ensure_file
495
+
496
# Records +path+ as immutable, keyed by the path itself and mapped to a
# regexp that matches any string starting with it.
def add_immutable_path(path)
  @immutable_paths[path] = Regexp.new('\A' + Regexp.escape(path))
end
499
+
500
# Raises when +file+ falls under any path registered as immutable.
#
# @raise [RuntimeError] naming the immutable path and the offending file
def check_immutable_path!(file)
  @immutable_paths.each do |protected_path, matcher|
    next unless file[matcher].present?
    raise "#{protected_path} is marked as immutable, cannot modify #{file}"
  end
end
505
+
506
# Login name of the user running the process, as reported by Etc.getlogin.
def current_user
  Etc.getlogin
end
509
+
510
# Name of the current user's primary group, or nil when it cannot be
# determined (any StandardError during the lookup is swallowed).
def current_group
  gid = Etc.getpwnam(current_user).gid
  Etc.getgrgid(gid).name
rescue StandardError
  nil
end
514
+
515
# Copies a single file from +src+ to +dst+, dispatching on the
# (source, destination) backend pair.
#
# @param dir [Boolean] whether +dst+ names a directory; only consulted when
#   the destination is on S3
def copy_file_helper(src, dst, dir)
  case [type(src), type(dst)]
  when [:hdfs, :hdfs]
    hadoop_fs('-cp', src, dst)
  when [:hdfs, :local]
    hadoop_fs('-copyToLocal', src, local_prefix(dst))
  when [:hdfs, :s3]
    hadoop_fs('-cp', src, s3n(dst))
  when [:s3, :s3]
    s3cmd('cp', src, s3b(dst, dir: dir))
  when [:s3, :local]
    s3cmd('get', src, dst)
  when [:s3, :hdfs]
    hadoop_fs('-cp', s3n(src), dst)
  when [:local, :local]
    # FileUtils takes keyword options; a positional Hash raises on Ruby 3.
    FileUtils.cp(src, dst, **file_util_args)
  when [:local, :hdfs]
    hadoop_fs('-copyFromLocal', local_prefix(src), dst)
  when [:local, :s3]
    s3cmd('put', src, s3b(dst, dir: dir))
  end
end
537
+
538
# Moves a single file from +src+ to +dst+, dispatching on the
# (source, destination) backend pair. Pairs without a native move are a
# copy followed by deletion of the source.
#
# @param dir [Boolean] whether +dst+ names a directory; only consulted when
#   the destination is on S3
def move_file_helper(src, dst, dir)
  case [type(src), type(dst)]
  when [:hdfs, :hdfs]
    hadoop_fs('-mv', src, dst)
  when [:hdfs, :local]
    # NOTE: moveToLocal: Option '-moveToLocal' is not implemented yet
    hadoop_fs('-copyToLocal', src, local_prefix(dst))
    hadoop_fs('-rm', src)
  when [:hdfs, :s3]
    copy_file_to_file(src, s3n(dst, dir: dir))
    hadoop_fs('-rm', src)
  when [:s3, :s3]
    s3cmd('mv', src, s3b(dst, dir: dir))
  when [:s3, :local]
    s3cmd('get', src, dst)
    s3cmd('del', src)
  when [:s3, :hdfs]
    hadoop_fs('-mv', s3n(src), dst)
  when [:local, :local]
    # FileUtils takes keyword options; a positional Hash raises on Ruby 3.
    FileUtils.mv(src, dst, **file_util_args)
    FileUtils.chmod(FILE_MODE, dst, **file_util_args)
  when [:local, :hdfs]
    hadoop_fs('-moveFromLocal', local_prefix(src), dst)
  when [:local, :s3]
    s3cmd('put', src, s3b(dst, dir: dir))
    FileUtils.rm(src, **file_util_args)
  end
end
566
+ end
567
+ end