masamune 0.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (185) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +54 -0
  4. data/Rakefile +15 -0
  5. data/bin/masamune-elastic-mapreduce +4 -0
  6. data/bin/masamune-hive +4 -0
  7. data/bin/masamune-psql +4 -0
  8. data/bin/masamune-shell +4 -0
  9. data/lib/masamune.rb +56 -0
  10. data/lib/masamune/accumulate.rb +60 -0
  11. data/lib/masamune/actions.rb +38 -0
  12. data/lib/masamune/actions/data_flow.rb +131 -0
  13. data/lib/masamune/actions/date_parse.rb +75 -0
  14. data/lib/masamune/actions/elastic_mapreduce.rb +68 -0
  15. data/lib/masamune/actions/execute.rb +52 -0
  16. data/lib/masamune/actions/filesystem.rb +37 -0
  17. data/lib/masamune/actions/hadoop_filesystem.rb +40 -0
  18. data/lib/masamune/actions/hadoop_streaming.rb +41 -0
  19. data/lib/masamune/actions/hive.rb +74 -0
  20. data/lib/masamune/actions/postgres.rb +76 -0
  21. data/lib/masamune/actions/postgres_admin.rb +34 -0
  22. data/lib/masamune/actions/s3cmd.rb +44 -0
  23. data/lib/masamune/actions/transform.rb +89 -0
  24. data/lib/masamune/after_initialize_callbacks.rb +55 -0
  25. data/lib/masamune/cached_filesystem.rb +110 -0
  26. data/lib/masamune/commands.rb +37 -0
  27. data/lib/masamune/commands/elastic_mapreduce.rb +119 -0
  28. data/lib/masamune/commands/hadoop_filesystem.rb +57 -0
  29. data/lib/masamune/commands/hadoop_streaming.rb +116 -0
  30. data/lib/masamune/commands/hive.rb +178 -0
  31. data/lib/masamune/commands/interactive.rb +37 -0
  32. data/lib/masamune/commands/postgres.rb +128 -0
  33. data/lib/masamune/commands/postgres_admin.rb +72 -0
  34. data/lib/masamune/commands/postgres_common.rb +33 -0
  35. data/lib/masamune/commands/retry_with_backoff.rb +60 -0
  36. data/lib/masamune/commands/s3cmd.rb +70 -0
  37. data/lib/masamune/commands/shell.rb +202 -0
  38. data/lib/masamune/configuration.rb +195 -0
  39. data/lib/masamune/data_plan.rb +31 -0
  40. data/lib/masamune/data_plan/builder.rb +66 -0
  41. data/lib/masamune/data_plan/elem.rb +190 -0
  42. data/lib/masamune/data_plan/engine.rb +162 -0
  43. data/lib/masamune/data_plan/rule.rb +292 -0
  44. data/lib/masamune/data_plan/set.rb +176 -0
  45. data/lib/masamune/environment.rb +164 -0
  46. data/lib/masamune/filesystem.rb +567 -0
  47. data/lib/masamune/has_environment.rb +40 -0
  48. data/lib/masamune/helpers.rb +27 -0
  49. data/lib/masamune/helpers/postgres.rb +84 -0
  50. data/lib/masamune/io.rb +33 -0
  51. data/lib/masamune/last_element.rb +53 -0
  52. data/lib/masamune/method_logger.rb +41 -0
  53. data/lib/masamune/multi_io.rb +39 -0
  54. data/lib/masamune/schema.rb +36 -0
  55. data/lib/masamune/schema/catalog.rb +233 -0
  56. data/lib/masamune/schema/column.rb +527 -0
  57. data/lib/masamune/schema/dimension.rb +133 -0
  58. data/lib/masamune/schema/event.rb +121 -0
  59. data/lib/masamune/schema/fact.rb +133 -0
  60. data/lib/masamune/schema/map.rb +265 -0
  61. data/lib/masamune/schema/row.rb +133 -0
  62. data/lib/masamune/schema/store.rb +115 -0
  63. data/lib/masamune/schema/table.rb +308 -0
  64. data/lib/masamune/schema/table_reference.rb +76 -0
  65. data/lib/masamune/spec_helper.rb +23 -0
  66. data/lib/masamune/string_format.rb +34 -0
  67. data/lib/masamune/tasks/elastic_mapreduce_thor.rb +60 -0
  68. data/lib/masamune/tasks/hive_thor.rb +55 -0
  69. data/lib/masamune/tasks/postgres_thor.rb +47 -0
  70. data/lib/masamune/tasks/shell_thor.rb +63 -0
  71. data/lib/masamune/template.rb +77 -0
  72. data/lib/masamune/thor.rb +186 -0
  73. data/lib/masamune/thor_loader.rb +38 -0
  74. data/lib/masamune/topological_hash.rb +34 -0
  75. data/lib/masamune/transform.rb +47 -0
  76. data/lib/masamune/transform/bulk_upsert.psql.erb +64 -0
  77. data/lib/masamune/transform/bulk_upsert.rb +52 -0
  78. data/lib/masamune/transform/consolidate_dimension.rb +54 -0
  79. data/lib/masamune/transform/deduplicate_dimension.psql.erb +52 -0
  80. data/lib/masamune/transform/deduplicate_dimension.rb +53 -0
  81. data/lib/masamune/transform/define_event_view.hql.erb +51 -0
  82. data/lib/masamune/transform/define_event_view.rb +60 -0
  83. data/lib/masamune/transform/define_index.psql.erb +34 -0
  84. data/lib/masamune/transform/define_schema.hql.erb +23 -0
  85. data/lib/masamune/transform/define_schema.psql.erb +79 -0
  86. data/lib/masamune/transform/define_schema.rb +56 -0
  87. data/lib/masamune/transform/define_table.hql.erb +34 -0
  88. data/lib/masamune/transform/define_table.psql.erb +95 -0
  89. data/lib/masamune/transform/define_table.rb +40 -0
  90. data/lib/masamune/transform/define_unique.psql.erb +30 -0
  91. data/lib/masamune/transform/insert_reference_values.psql.erb +43 -0
  92. data/lib/masamune/transform/insert_reference_values.rb +64 -0
  93. data/lib/masamune/transform/load_dimension.rb +47 -0
  94. data/lib/masamune/transform/load_fact.rb +45 -0
  95. data/lib/masamune/transform/operator.rb +96 -0
  96. data/lib/masamune/transform/relabel_dimension.psql.erb +76 -0
  97. data/lib/masamune/transform/relabel_dimension.rb +39 -0
  98. data/lib/masamune/transform/rollup_fact.psql.erb +79 -0
  99. data/lib/masamune/transform/rollup_fact.rb +149 -0
  100. data/lib/masamune/transform/snapshot_dimension.psql.erb +75 -0
  101. data/lib/masamune/transform/snapshot_dimension.rb +74 -0
  102. data/lib/masamune/transform/stage_dimension.psql.erb +39 -0
  103. data/lib/masamune/transform/stage_dimension.rb +83 -0
  104. data/lib/masamune/transform/stage_fact.psql.erb +80 -0
  105. data/lib/masamune/transform/stage_fact.rb +111 -0
  106. data/lib/masamune/version.rb +25 -0
  107. data/spec/fixtures/aggregate.sql.erb +25 -0
  108. data/spec/fixtures/comment.sql.erb +27 -0
  109. data/spec/fixtures/invalid.sql.erb +23 -0
  110. data/spec/fixtures/relative.sql.erb +23 -0
  111. data/spec/fixtures/simple.sql.erb +28 -0
  112. data/spec/fixtures/whitespace.sql.erb +30 -0
  113. data/spec/masamune/actions/elastic_mapreduce_spec.rb +108 -0
  114. data/spec/masamune/actions/execute_spec.rb +50 -0
  115. data/spec/masamune/actions/hadoop_filesystem_spec.rb +44 -0
  116. data/spec/masamune/actions/hadoop_streaming_spec.rb +74 -0
  117. data/spec/masamune/actions/hive_spec.rb +117 -0
  118. data/spec/masamune/actions/postgres_admin_spec.rb +58 -0
  119. data/spec/masamune/actions/postgres_spec.rb +134 -0
  120. data/spec/masamune/actions/s3cmd_spec.rb +44 -0
  121. data/spec/masamune/actions/transform_spec.rb +144 -0
  122. data/spec/masamune/after_initialization_callbacks_spec.rb +61 -0
  123. data/spec/masamune/cached_filesystem_spec.rb +167 -0
  124. data/spec/masamune/commands/hadoop_filesystem_spec.rb +50 -0
  125. data/spec/masamune/commands/hadoop_streaming_spec.rb +106 -0
  126. data/spec/masamune/commands/hive_spec.rb +117 -0
  127. data/spec/masamune/commands/postgres_admin_spec.rb +69 -0
  128. data/spec/masamune/commands/postgres_spec.rb +100 -0
  129. data/spec/masamune/commands/retry_with_backoff_spec.rb +116 -0
  130. data/spec/masamune/commands/s3cmd_spec.rb +50 -0
  131. data/spec/masamune/commands/shell_spec.rb +101 -0
  132. data/spec/masamune/configuration_spec.rb +102 -0
  133. data/spec/masamune/data_plan/builder_spec.rb +91 -0
  134. data/spec/masamune/data_plan/elem_spec.rb +102 -0
  135. data/spec/masamune/data_plan/engine_spec.rb +356 -0
  136. data/spec/masamune/data_plan/rule_spec.rb +407 -0
  137. data/spec/masamune/data_plan/set_spec.rb +517 -0
  138. data/spec/masamune/environment_spec.rb +65 -0
  139. data/spec/masamune/filesystem_spec.rb +1421 -0
  140. data/spec/masamune/helpers/postgres_spec.rb +95 -0
  141. data/spec/masamune/schema/catalog_spec.rb +613 -0
  142. data/spec/masamune/schema/column_spec.rb +696 -0
  143. data/spec/masamune/schema/dimension_spec.rb +137 -0
  144. data/spec/masamune/schema/event_spec.rb +75 -0
  145. data/spec/masamune/schema/fact_spec.rb +117 -0
  146. data/spec/masamune/schema/map_spec.rb +593 -0
  147. data/spec/masamune/schema/row_spec.rb +28 -0
  148. data/spec/masamune/schema/store_spec.rb +49 -0
  149. data/spec/masamune/schema/table_spec.rb +395 -0
  150. data/spec/masamune/string_format_spec.rb +60 -0
  151. data/spec/masamune/tasks/elastic_mapreduce_thor_spec.rb +57 -0
  152. data/spec/masamune/tasks/hive_thor_spec.rb +75 -0
  153. data/spec/masamune/tasks/postgres_thor_spec.rb +42 -0
  154. data/spec/masamune/tasks/shell_thor_spec.rb +51 -0
  155. data/spec/masamune/template_spec.rb +77 -0
  156. data/spec/masamune/thor_spec.rb +238 -0
  157. data/spec/masamune/transform/bulk_upsert.dimension_spec.rb +200 -0
  158. data/spec/masamune/transform/consolidate_dimension_spec.rb +62 -0
  159. data/spec/masamune/transform/deduplicate_dimension_spec.rb +84 -0
  160. data/spec/masamune/transform/define_event_view_spec.rb +84 -0
  161. data/spec/masamune/transform/define_schema_spec.rb +83 -0
  162. data/spec/masamune/transform/define_table.dimension_spec.rb +306 -0
  163. data/spec/masamune/transform/define_table.fact_spec.rb +291 -0
  164. data/spec/masamune/transform/define_table.table_spec.rb +525 -0
  165. data/spec/masamune/transform/insert_reference_values.dimension_spec.rb +111 -0
  166. data/spec/masamune/transform/insert_reference_values.fact_spec.rb +149 -0
  167. data/spec/masamune/transform/load_dimension_spec.rb +76 -0
  168. data/spec/masamune/transform/load_fact_spec.rb +89 -0
  169. data/spec/masamune/transform/relabel_dimension_spec.rb +102 -0
  170. data/spec/masamune/transform/rollup_fact_spec.rb +333 -0
  171. data/spec/masamune/transform/snapshot_dimension_spec.rb +103 -0
  172. data/spec/masamune/transform/stage_dimension_spec.rb +115 -0
  173. data/spec/masamune/transform/stage_fact_spec.rb +204 -0
  174. data/spec/masamune_spec.rb +32 -0
  175. data/spec/spec_helper.rb +41 -0
  176. data/spec/support/masamune/example_group.rb +36 -0
  177. data/spec/support/masamune/mock_command.rb +99 -0
  178. data/spec/support/masamune/mock_delegate.rb +51 -0
  179. data/spec/support/masamune/mock_filesystem.rb +96 -0
  180. data/spec/support/masamune/thor_mute.rb +35 -0
  181. data/spec/support/rspec/example/action_example_group.rb +34 -0
  182. data/spec/support/rspec/example/task_example_group.rb +80 -0
  183. data/spec/support/rspec/example/transform_example_group.rb +36 -0
  184. data/spec/support/shared_examples/postgres_common_examples.rb +53 -0
  185. metadata +462 -0
@@ -0,0 +1,133 @@
1
+ # The MIT License (MIT)
2
+ #
3
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
module Masamune::Schema
  # Table subclass describing a dimension. The dimension +type+ decides the
  # name suffix and which reserved bookkeeping columns are added on creation.
  class Dimension < Table
    # Builds the table through Table#initialize (same options), then appends
    # the reserved columns that belong to this dimension +type+.
    def initialize(opts = {})
      super
      initialize_dimension_columns!
    end

    # Name suffix for this dimension, prefixed with the parent's suffix when
    # the dimension has a parent.
    def suffix
      own_suffix =
        case type
        when :mini
          'type'
        when :one, :two, :four, :date
          'dimension'
        else
          type.to_s
        end
      parent ? [parent.suffix, own_suffix].compact.join('_') : own_suffix
    end

    # Column whose id is :start_at, or nil when absent.
    def start_key
      columns.values.find { |column| column.id == :start_at }
    end

    # Column whose id is :end_at, or nil when absent.
    def end_key
      columns.values.find { |column| column.id == :end_at }
    end

    # Column whose id is :version, or nil when absent.
    def version_key
      columns.values.find { |column| column.id == :version }
    end

    # Memoized companion table of type :ledger built from this dimension's
    # non-reserved columns and references, with this dimension as parent.
    def ledger_table
      @ledger_table ||= self.class.new(id: id, type: :ledger, store: store, columns: ledger_table_columns, references: references.values, parent: self)
    end

    # Ids of the columns managed by the dimension itself for the current
    # +type+; other types defer to the superclass.
    def reserved_column_ids
      case type
      when :one, :date
        [:last_modified_at]
      when :two
        [:start_at, :end_at, :version, :last_modified_at]
      when :four
        [:parent_id, :record_id, :start_at, :end_at, :version, :last_modified_at]
      when :ledger
        [:source_kind, :source_uuid, :start_at, :last_modified_at, :delta]
      else
        super
      end
    end

    private

    # Copies of this dimension's columns for the ledger table. Surrogate key
    # and reserved columns are left out entirely (flat_map with an empty
    # array, so no nil placeholders reach the new table); :key_value columns
    # are split into "<id>_now" / "<id>_was" copies.
    def ledger_table_columns
      columns.values.flat_map do |column|
        next [] if column.surrogate_key || reserved_column_ids.include?(column.id)

        if column.type == :key_value
          %w(now was).map do |state|
            column.dup.tap do |copy|
              copy.id = "#{column.id}_#{state}"
              copy.strict = false
            end
          end
        else
          copy = column.dup
          # Surrogate keys were skipped above, so only natural keys and
          # references to auto surrogate keys keep their strict setting.
          copy.strict = false unless column.natural_key || (column.reference && column.reference.surrogate_key.auto)
          [copy]
        end
      end
    end

    # Adds the integer 'id' surrogate key for the dimension types that use one.
    def initialize_surrogate_key_column!
      case type
      when :mini, :one, :two, :four, :ledger, :date
        initialize_column! id: 'id', type: :integer, surrogate_key: true
      end
    end

    # Adds the reserved columns matching +type+ (see #reserved_column_ids).
    def initialize_dimension_columns!
      case type
      when :one, :date
        initialize_column! id: 'last_modified_at', type: :timestamp, default: 'NOW()'
      when :two
        initialize_versioned_columns!
      when :four
        # The ledger table is registered before the reserved columns are added.
        children << ledger_table
        # FIXME: derive type from parent
        initialize_column! id: 'parent_id', type: :integer, null: true, reference: ledger_table
        initialize_column! id: 'record_id', type: :integer, null: true, reference: ledger_table
        initialize_versioned_columns!
      when :ledger
        initialize_column! id: 'source_kind', type: :string, unique: 'natural'
        initialize_column! id: 'source_uuid', type: :string, unique: 'natural'
        initialize_column! id: 'start_at', type: :timestamp, index: true, unique: 'natural'
        initialize_column! id: 'last_modified_at', type: :timestamp, default: 'NOW()'
        initialize_column! id: 'delta', type: :integer
      when :stage
        if inherit
          parent.reserved_columns.each do |_, column|
            initialize_column! column.as_hash
          end
        end
      end
    end

    # start_at / end_at / version / last_modified_at columns shared by the
    # :two and :four dimension types.
    def initialize_versioned_columns!
      initialize_column! id: 'start_at', type: :timestamp, default: 'TO_TIMESTAMP(0)', index: true, unique: 'natural'
      initialize_column! id: 'end_at', type: :timestamp, null: true, index: true
      initialize_column! id: 'version', type: :integer, default: 1, null: true, index: true
      initialize_column! id: 'last_modified_at', type: :timestamp, default: 'NOW()'
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # The MIT License (MIT)
2
+ #
3
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
module Masamune::Schema
  # Describes an event: an id, a store, and a list of attributes that are
  # expanded into columns (see #columns).
  class Event
    # A single event attribute. Immutable attributes map to one column;
    # mutable ones map to a "<id>_now" / "<id>_was" column pair.
    class Attribute
      attr_accessor :id
      attr_accessor :type
      attr_accessor :array
      attr_accessor :immutable

      # @param opts [Hash] :id (required), :type (default :integer),
      #   :array (default false), :immutable (default false)
      # @raise [ArgumentError] when :id is missing
      def initialize(opts = {})
        # Work on a symbolized copy so the caller's hash is left untouched.
        opts = opts.symbolize_keys
        raise ArgumentError, 'required parameter id: missing' unless opts.key?(:id)
        self.id = opts[:id].to_sym
        self.type = opts.fetch(:type, :integer).to_sym
        self.array = opts.fetch(:array, false)
        self.immutable = opts.fetch(:immutable, false)
      end

      # Yields one [column_id, Column] pair per column derived from this
      # attribute. Returns an Enumerator when called without a block.
      def as_columns(event)
        return enum_for(:as_columns, event) unless block_given?
        column_ids = immutable ? [id] : [:"#{id}_now", :"#{id}_was"]
        column_ids.each do |column_id|
          yield [column_id, Column.new(id: column_id, type: type, array: array, parent: event)]
        end
      end
    end

    DEFAULT_ATTRIBUTES =
    {
      id:         nil,
      store:      nil,
      attributes: [],
      debug:      false
    }

    DEFAULT_ATTRIBUTES.keys.each do |attr|
      attr_accessor attr
    end

    # @param opts [Hash] any key of DEFAULT_ATTRIBUTES; :id is required
    # @raise [ArgumentError] when :id is missing
    def initialize(opts = {})
      # Work on a symbolized copy so the caller's hash is left untouched.
      opts = opts.symbolize_keys
      raise ArgumentError, 'required parameter id: missing' unless opts.key?(:id)
      DEFAULT_ATTRIBUTES.merge(opts).each do |name, value|
        public_send("#{name}=", value)
      end
    end

    # Explicit @name when set, otherwise "<id>_<suffix>".
    def name
      @name || [id, suffix].compact.join('_')
    end

    def suffix
      'event'
    end

    # Stores the given attributes indexed by their id; nil is treated as an
    # empty list.
    def attributes=(attributes)
      @attributes = (attributes || []).each_with_object({}) do |attribute, index|
        index[attribute.id] = attribute
      end
    end

    # Memoized column map: :uuid and :type first, then the attribute columns,
    # then :delta and :created_at.
    def columns
      @columns ||= {}.tap do |columns|
        columns[:uuid] = Column.new id: :uuid, type: :uuid, parent: self
        columns[:type] = Column.new id: :type, type: :string, parent: self
        attributes.each_value do |attribute|
          attribute.as_columns(self) do |column_id, column|
            columns[column_id] = column
          end
        end
        columns[:delta] = Column.new id: :delta, type: :integer, parent: self
        columns[:created_at] = Column.new id: :created_at, type: :timestamp, parent: self
      end
    end

    # Ids of the columns every event carries regardless of its attributes.
    def reserved_column_ids
      @reserved_column_ids ||= [:uuid, :type, :delta, :created_at]
    end

    # Columns derived from attributes only.
    def unreserved_columns
      columns.reject { |_, column| reserved_column_ids.include?(column.id) }
    end

    def create_type
      @create_type ||= "#{id}_create"
    end

    def update_type
      @update_type ||= "#{id}_update"
    end

    def delete_type
      @delete_type ||= "#{id}_delete"
    end

    # Looks a column up by name (String or Symbol); nil when unknown.
    def dereference_column_name(name)
      columns[name.to_sym]
    end
  end
end
@@ -0,0 +1,133 @@
1
+ # The MIT License (MIT)
2
+ #
3
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
module Masamune::Schema
  # Table subclass describing a fact table with an optional grain, partition
  # specification and time range.
  class Fact < Table
    SUPPORTED_GRAINS = [:transaction, :hourly, :daily, :monthly]

    # The writer is defined explicitly below (it validates the value), so only
    # the reader is generated here.
    attr_reader :grain
    attr_accessor :partition
    attr_accessor :range

    # @param opts [Hash] Table options plus :grain and :partition; :type
    #   defaults to :fact
    # @raise [ArgumentError] when :grain is not one of SUPPORTED_GRAINS
    def initialize(opts = {})
      # Symbolized copy: the deletes below must not alter the caller's hash.
      opts = opts.symbolize_keys
      self.grain = opts.delete(:grain)
      @partition = opts.delete(:partition)
      super opts.reverse_merge(type: :fact)
      initialize_fact_columns!
      foreign_key_columns.each do |column|
        column.index << column.name
      end
      # A table built without a time_key column has nothing to index here.
      key = time_key
      key.index << key.name if key
    end

    # Base id joined with the grain (when set), as a Symbol.
    def id
      [@id, grain].compact.join('_').to_sym
    end

    # Sets the grain; a nil/false argument leaves the current value untouched.
    # @raise [ArgumentError] for a grain outside SUPPORTED_GRAINS
    def grain=(grain = nil)
      return unless grain
      raise ArgumentError, "unknown grain '#{grain}'" unless SUPPORTED_GRAINS.include?(grain.to_sym)
      @grain = grain.to_sym
    end

    # Superclass suffix extended with the range suffix (when a range is set),
    # without repeating parts.
    def suffix
      inherited = super
      [*inherited.split('_'), range.try(:suffix)].compact.uniq.join('_')
    end

    # First column referencing a table of type :date, or nil.
    def date_column
      columns.values.find { |column| column && column.reference && column.reference.type == :date }
    end

    # Column whose id is :time_key, or nil when absent.
    def time_key
      columns.values.find { |column| column.id == :time_key }
    end

    # Stage table from the superclass, adjusted to carry this fact's base id,
    # store, range and grain, with uniqueness disabled on every column.
    def stage_table(*a)
      super.tap do |stage|
        stage.id = @id
        stage.store = store
        stage.range = range
        stage.grain = grain
        stage.columns.each do |_, column|
          column.unique = false
        end
      end
    end

    # Memoized child fact covering the partition range that +date+ falls in.
    def partition_table(date)
      partition_range = partition_rule.bind_date(date)
      @partition_tables ||= {}
      @partition_tables[partition_range] ||= self.class.new(id: @id, store: store, columns: partition_table_columns, parent: self, range: partition_range, grain: grain, inherit: true)
    end

    def partitions
      columns.select { |_, column| column.partition }
    end

    def measures
      columns.select { |_, column| column.measure }
    end

    # CHECK constraint limiting time_key to the table's range; nil without a
    # range.
    def constraints
      return unless range
      "CHECK (time_key >= #{range.start_time.to_i} AND time_key < #{range.stop_time.to_i})"
    end

    def reserved_column_ids
      case type
      when :fact
        [:time_key, :last_modified_at]
      else
        super
      end
    end

    private

    # Intentionally empty: this class adds no surrogate key column.
    def initialize_surrogate_key_column!
    end

    # Adds the reserved columns for +type+; last_modified_at is skipped for
    # stores of type :hive.
    def initialize_fact_columns!
      case type
      when :fact
        initialize_column! id: 'time_key', type: :integer, index: true
        initialize_column! id: 'last_modified_at', type: :timestamp, default: 'NOW()' unless store.type == :hive
      when :stage
        if inherit
          parent.reserved_columns.each do |_, column|
            initialize_column! column.as_hash
          end
        end
      end
    end

    def partition_rule
      @partition_rule ||= Masamune::DataPlan::Rule.new(nil, :tmp, :target, table: name, partition: @partition)
    end

    def partition_table_columns
      unreserved_columns.map { |_, column| column.dup }
    end
  end
end
@@ -0,0 +1,265 @@
1
+ # The MIT License (MIT)
2
+ #
3
+ # Copyright (c) 2014-2015, VMware, Inc. All Rights Reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+
23
+ require 'csv'
24
+
25
+ module Masamune::Schema
26
+ class Map
27
# IO decorator whose #gets re-quotes each line so that brace-delimited
# (JSON-like) fields containing the separator survive CSV parsing.
class JSONEncoder < SimpleDelegator
  # @param io [#gets] underlying stream
  # @param store [#json_encoding, #format] encoding settings
  def initialize(io, store)
    super(io)
    @store = store
  end

  # Reads one line from the wrapped stream. Returns nil at end of input, the
  # raw line when the store already uses :quoted JSON encoding, and otherwise
  # the line with every field quoted (surrounding whitespace stripped).
  def gets(*a)
    line = __getobj__.gets(*a)
    return line if line.nil? || skip?
    encode(line, separator).join(separator)
  end

  private

  def skip?
    @store.json_encoding == :quoted
  end

  # Splits +line+ on +separator+, ignoring separators inside braces. A depth
  # counter (rather than a flag) keeps nested objects such as
  # {"a":{"b":1},"c":2} in one field. Braces inside quoted strings are not
  # treated specially.
  def encode(line, separator)
    fields = []
    buffer = ''.dup
    depth = 0
    line.strip.each_char do |char|
      if char == separator && depth.zero?
        fields << quote(buffer)
        buffer = ''.dup
        next
      end
      depth += 1 if char == '{'
      # An unmatched closing brace must not push the depth below zero.
      depth -= 1 if char == '}' && depth > 0
      buffer << char
    end
    fields << quote(buffer)
  end

  # Wraps +buffer+ in double quotes (doubling embedded quotes) unless it is
  # already quoted.
  def quote(buffer)
    return buffer if buffer =~ /\A".*"\z/
    %Q{"#{buffer.gsub('"', '""')}"}
  end

  def separator
    @separator ||= (@store.format == :tsv ? "\t" : ',')
  end
end
82
+
83
# Reads rows of a table from, or writes rows of a table to, a bound IO using
# the CSV/TSV settings of the table's store.
class Buffer
  extend Forwardable

  def_delegators :@io, :flush, :path

  # @param table [#store, #columns, #name] table the rows belong to
  # @param options [Hash] :distinct => true drops rows already written
  def initialize(table, options = {})
    @table = table
    @store = table.store
    @lines = 0
    @options = options
  end

  # Attaches the stream to read from / write to and discards any CSV writer
  # created for a previously bound stream.
  def bind(io)
    @io = io.set_encoding('binary', 'UTF-8', undef: :replace)
    @csv = nil
  end

  # Parses the bound stream and yields each row as a Hash. Lines starting
  # with '#' are skipped; rows that fail to parse are logged and yielded as
  # nil.
  # @raise [RuntimeError] when no stream has been bound
  def each
    raise 'must call Buffer#bind first' unless @io
    headers = @store.headers || @table.columns.keys
    # Options are splatted as keywords so this works with both the positional
    # hash (older csv) and keyword-only (csv 3.x) signatures.
    CSV.parse(JSONEncoder.new(@io, @store), **options.merge(headers: headers)) do |data|
      next if data.to_s =~ /\A#/
      yield safe_row(data)
    end
  end

  # Serializes +data+ as a row of the table and writes it to the bound
  # stream. Rows missing required columns are logged and skipped.
  # @raise [RuntimeError] when no stream has been bound
  def append(data)
    raise 'must call Buffer#bind first' unless @io
    row = Masamune::Schema::Row.new(parent: @table, values: data.to_hash)
    write_headers = @store.headers && @lines < 1
    @csv ||= CSV.new(@io, **options.merge(headers: row.headers, write_headers: write_headers))
    missing = row.missing_required_columns
    if missing.any?
      @store.logger.warn("row '#{row.to_hash}' is missing required columns '#{missing.map(&:name).join(',')}', skipping")
    else
      serialized = row.serialize
      @csv << serialized if append?(serialized)
    end
    @lines += 1
  end

  private

  def options
    { skip_blanks: true }.tap do |opts|
      opts[:col_sep] = "\t" if @store.format == :tsv
    end
  end

  # Converts parsed CSV data into a Hash via a non-strict Row. Returns nil
  # (not the logger's return value) when the row cannot be built, so callers
  # can tell a skipped row from real data.
  def safe_row(data)
    Masamune::Schema::Row.new(parent: @table, values: data.to_hash, strict: false).to_hash
  rescue
    @store.logger.warn("failed to parse '#{data.to_hash}' for #{@table.name}, skipping")
    nil
  end

  # True when +elem+ should be written: always, unless :distinct is set and
  # the element has been seen before.
  def append?(elem)
    return true unless @options[:distinct]
    @seen ||= Set.new
    @seen.add?(elem)
  end
end
143
+
144
# Options accepted by the constructor, with the value used when an option is
# not supplied. The default function returns its input row unchanged.
DEFAULT_ATTRIBUTES = {
  source:   nil,
  target:   nil,
  columns:  nil,
  store:    nil,
  function: ->(row) { row },
  distinct: false,
  debug:    false
}

# One reader/writer pair per supported option.
attr_accessor(*DEFAULT_ATTRIBUTES.keys)
158
+
159
# @param opts [Hash] any key of DEFAULT_ATTRIBUTES; :source and :target are
#   required
# @raise [ArgumentError] when :source or :target is missing
def initialize(opts = {})
  # Work on a symbolized copy so the caller's hash is left untouched.
  opts = opts.symbolize_keys
  raise ArgumentError, 'required parameter source: missing' unless opts.key?(:source)
  raise ArgumentError, 'required parameter target: missing' unless opts.key?(:target)
  DEFAULT_ATTRIBUTES.merge(opts).each do |name, value|
    public_send("#{name}=", value)
  end
end
167
+
168
# Stores the table that rows are read from.
def source=(source)
  @source = source
end
171
+
172
# Stores the table that rows are written to. A target of type :four is
# replaced by its ledger table.
# FIXME: avoid implicit conversions
def target=(target)
  @target =
    if target.type == :four
      target.ledger_table
    else
      target
    end
end
176
+
177
# Column names produced by the map function, discovered by running it on a
# row made of the source columns' default values.
# @raise [ArgumentError] when the function yields no output for that row
def intermediate_columns
  sample = Array.wrap(function.call(default_row(source.columns))).first
  unless sample
    raise ArgumentError, "function for map between '#{source.name}' and '#{target.name}' does not return output for default input"
  end
  sample.keys
end
183
+
184
# Stage table of the target limited to the explicitly configured columns or,
# when none are configured, to the columns the function produces.
def intermediate
  stage_columns = columns || intermediate_columns
  target.stage_table(columns: stage_columns, inherit: false)
end
187
+
188
# Runs the map function over every row of +input_files+ and appends the
# results to +output_file+.
# @param input_files a single file or an Array/Set of files (see
#   convert_files)
# @param output_file stream or path that receives the output rows
# @return the intermediate stage table describing the written rows
def apply(input_files, output_file)
  input_buffer = Buffer.new(source)
  # Build the stage table once: it is both the schema the output buffer
  # writes with and the return value, and building it may invoke the map
  # function.
  stage = intermediate
  output_buffer = Buffer.new(stage, distinct: distinct)
  self.class.convert_files(input_files).each do |input_file|
    open_stream(input_file, 'r') do |input_stream|
      input_buffer.bind(input_stream)
      open_stream(output_file, 'a+') do |output_stream|
        output_buffer.bind(output_stream)
        apply_buffer(input_buffer, output_buffer)
      end
    end
  end
  stage
end
202
+
203
# Yields an IO for +file+ and returns the block's value.
# * IO / StringIO: flushed, then yielded as is (the caller keeps ownership).
# * String / Tempfile / anything responding to #to_path (e.g. Pathname):
#   opened with +mode+ and closed again when the block returns.
# Any other object is ignored: nothing is yielded and nil is returned.
def open_stream(file, mode)
  case file
  when IO, StringIO
    file.flush
    yield file
  when String, Tempfile
    File.open(file, mode) { |io| yield io }
  else
    File.open(file, mode) { |io| yield io } if file.respond_to?(:to_path)
  end
end
214
+
215
class << self
  # Reduces a file-like object to its path, flushing pending writes first so
  # that a reader opening the path sees everything written so far. Objects
  # without #path are returned unchanged.
  def convert_file(file)
    return file unless file.respond_to?(:path)
    file.flush if flushable?(file)
    file.path
  end

  # Normalizes a single file or an Array/Set of files into an Array of
  # converted files (see convert_file).
  def convert_files(files)
    case files
    when Set, Array
      files.map { |file| convert_file(file) }
    else
      [convert_file(files)]
    end
  end

  private

  # True when +file+ can be flushed and is still open. Objects exposing
  # #open? are asked directly; standard IO objects (File, Tempfile) only
  # provide #closed?, so that is used as the fallback.
  def flushable?(file)
    return false unless file.respond_to?(:flush)
    return file.open? if file.respond_to?(:open?)
    file.respond_to?(:closed?) && !file.closed?
  end
end
236
+
237
+ private
238
+
239
# Builds a row that maps each column's name to its default Ruby value.
def default_row(columns)
  columns.each_with_object({}) do |(_, column), row|
    row[column.name] = column.default_ruby_value
  end
end
246
+
247
# Feeds every input row through the map function, appends each resulting row
# to the output buffer, then flushes the output buffer.
def apply_buffer(input_buffer, output_buffer)
  input_buffer.each do |input|
    safe_apply_function(input) { |output| output_buffer.append(output) }
  end
  output_buffer.flush
end
255
+
256
# Applies the map function to +input+ and yields every resulting row.
# A nil/false input is ignored. Any StandardError raised by the function or
# the block is logged as a warning and the input is skipped.
def safe_apply_function(input)
  return unless input
  Array.wrap(function.call(input)).each do |output|
    yield output
  end
rescue
  message = "failed to process '#{input}' for #{target.name}, skipping"
  # The store is optional (it defaults to nil); without a store logger the
  # warning goes to standard error instead of raising from the handler.
  logger = @store.logger if @store.respond_to?(:logger)
  logger ? logger.warn(message) : warn(message)
end
264
+ end
265
+ end