bricolage-streamingload 0.1.0

This diff shows the content of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
@@ -0,0 +1,62 @@
+ module Bricolage
+
+   module StreamingLoad
+
+     class ManifestFile
+
+       def ManifestFile.create(ds, job_id:, object_urls:, logger:, noop: false, &block)
+         manifest = new(ds, job_id, object_urls, logger: logger, noop: noop)
+         manifest.create_temporary(&block)
+       end
+
+       def initialize(ds, job_id, object_urls, logger:, noop: false)
+         @ds = ds
+         @job_id = job_id
+         @object_urls = object_urls
+         @logger = logger
+         @noop = noop
+       end
+
+       def credential_string
+         @ds.credential_string
+       end
+
+       def name
+         @name ||= "manifest-#{@job_id}.json"
+       end
+
+       def url
+         @url ||= @ds.url(name)
+       end
+
+       def content
+         @content ||= begin
+           ents = @object_urls.map {|url|
+             { "url" => url, "mandatory" => true }
+           }
+           obj = { "entries" => ents }
+           JSON.pretty_generate(obj)
+         end
+       end
+
+       def put
+         @logger.info "s3: put: #{url}"
+         @ds.object(name).put(body: content) unless @noop
+       end
+
+       def delete
+         @logger.info "s3: delete: #{url}"
+         @ds.object(name).delete unless @noop
+       end
+
+       def create_temporary
+         put
+         yield self
+         delete
+       end
+
+     end
+
+   end
+
+ end
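
ManifestFile wraps the lifecycle of a Redshift COPY manifest: `content` builds the manifest JSON from the object URLs, and `create_temporary` puts it to S3, yields it, and deletes it afterwards. A minimal usage sketch follows; `s3_ds` is a placeholder for an S3 data source responding to `url`, `object`, and `credential_string` (it is not defined in this diff), and `json`/`logger` are loaded by the caller.

    require 'json'
    require 'logger'

    # `s3_ds` is an assumed S3 data source (#url, #object, #credential_string);
    # it is not part of this gem's diff.
    Bricolage::StreamingLoad::ManifestFile.create(
      s3_ds,
      job_id: 42,
      object_urls: ['s3://bucket/data/00.json.gz', 's3://bucket/data/01.json.gz'],
      logger: Logger.new($stderr),
      noop: true                 # with noop: true no S3 request is actually issued
    ) do |manifest|
      # The manifest exists on S3 only inside this block (put -> yield -> delete).
      puts manifest.url
      puts manifest.content      # {"entries": [{"url": ..., "mandatory": true}, ...]}
    end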
@@ -0,0 +1,211 @@
+ require 'bricolage/streamingload/task'
+ require 'bricolage/streamingload/loaderparams'
+ require 'bricolage/sqlutils'
+ require 'json'
+ require 'securerandom'
+ require 'forwardable'
+
+ module Bricolage
+
+   module StreamingLoad
+
+     class LoadableObject
+
+       extend Forwardable
+
+       def initialize(event, components)
+         @event = event
+         @components = components
+       end
+
+       attr_reader :event
+
+       def_delegator '@event', :url
+       def_delegator '@event', :size
+       def_delegator '@event', :message_id
+       def_delegator '@event', :receipt_handle
+       def_delegator '@components', :schema_name
+       def_delegator '@components', :table_name
+
+       def data_source_id
+         "#{schema_name}.#{table_name}"
+       end
+
+       alias qualified_name data_source_id
+
+       def event_time
+         @event.time
+       end
+
+     end
+
+     class ObjectBuffer
+
+       include SQLUtils
+
+       def initialize(control_data_source:, logger:)
+         @ctl_ds = control_data_source
+         @logger = logger
+       end
+
+       def put(obj)
+         @ctl_ds.open {|conn|
+           insert_object(conn, obj)
+         }
+       end
+
+       def flush_tasks
+         task_ids = []
+         @ctl_ds.open {|conn|
+           conn.transaction {|txn|
+             task_ids = insert_tasks(conn)
+             insert_task_object_mappings(conn) unless task_ids.empty?
+           }
+         }
+         return task_ids.map {|id| LoadTask.create(task_id: id) }
+       end
+
+       private
+
+       def insert_object(conn, obj)
+         #HACK - suppress log per object
+         log_level = @logger.level
+         @logger.level = Logger::ERROR
+         conn.update(<<-EndSQL)
+             insert into strload_objects
+                 (object_url
+                 , object_size
+                 , data_source_id
+                 , message_id
+                 , event_time
+                 , submit_time
+                 )
+             select
+                 #{s obj.url}
+                 , #{obj.size}
+                 , #{s obj.data_source_id}
+                 , #{s obj.message_id}
+                 , '#{obj.event_time}' AT TIME ZONE 'JST'
+                 , current_timestamp
+             from
+                 strload_tables
+             where
+                 data_source_id = #{s obj.data_source_id}
+             ;
+         EndSQL
+         @logger.level = log_level
+       end
+
+       def insert_tasks(conn)
+         vals = conn.query_values(<<-EndSQL)
+             insert into
+                 strload_tasks (task_class, schema_name, table_name, submit_time)
+             select
+                 'streaming_load_v3'
+                 , tbl.schema_name
+                 , tbl.table_name
+                 , current_timestamp
+             from
+                 strload_tables tbl
+                 inner join (
+                     select
+                         data_source_id
+                         , count(*) as object_count
+                     from (
+                         select
+                             min(object_id) as object_id
+                             , object_url
+                         from
+                             strload_objects
+                         group by
+                             object_url
+                     ) uniq_objects
+                     inner join strload_objects
+                     using(object_id)
+                     left outer join strload_task_objects
+                     using(object_id)
+                     where
+                         task_id is null -- not assigned to a task
+                     group by
+                         data_source_id
+                 ) obj -- number of objects not assigned to a task per schema_name.table_name (won't return zero)
+                 using (data_source_id)
+                 left outer join (
+                     select
+                         schema_name
+                         , table_name
+                         , max(submit_time) as latest_submit_time
+                     from
+                         strload_tasks
+                     group by
+                         schema_name, table_name
+                 ) task -- preceding task's submit time
+                 using(schema_name, table_name)
+             where
+                 not tbl.disabled -- not disabled
+                 and (
+                     obj.object_count > tbl.load_batch_size -- batch_size exceeded?
+                     or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
+                     or latest_submit_time is null -- no last task
+                 )
+             returning task_id
+             ;
+         EndSQL
+         @logger.info "Number of tasks created: #{vals.size}"
+         vals
+       end
+
+       def insert_task_object_mappings(conn)
+         conn.update(<<-EndSQL)
+             insert into
+                 strload_task_objects
+             select
+                 task_id
+                 , object_id
+             from (
+                 select
+                     row_number() over(partition by task.task_id order by obj.object_id) as object_count
+                     , task.task_id
+                     , obj.object_id
+                     , load_batch_size
+                 from (
+                     select
+                         min(object_id) as object_id
+                         , object_url
+                         , data_source_id
+                     from
+                         strload_objects
+                     group by
+                         2, 3
+                 ) obj
+                 inner join (
+                     select
+                         min(task_id) as task_id -- oldest task
+                         , tbl.data_source_id
+                         , max(load_batch_size) as load_batch_size
+                     from
+                         strload_tasks
+                         inner join strload_tables tbl
+                         using(schema_name, table_name)
+                     where
+                         task_id not in (select distinct task_id from strload_task_objects) -- no assigned objects
+                     group by
+                         2
+                 ) task -- tasks without objects
+                 using(data_source_id)
+                 left outer join strload_task_objects task_obj
+                 using(object_id)
+                 where
+                     task_obj.object_id is null -- not assigned to a task
+             ) as t
+             where
+                 object_count <= load_batch_size -- limit number of objects assigned to single task
+             ;
+         EndSQL
+       end
+
+     end
+
+   end
+
+ end
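
In short, `put` records one S3 object per call in `strload_objects`, and `flush_tasks` creates a `strload_tasks` row for every table whose unassigned-object count exceeds `load_batch_size`, whose `load_interval` has elapsed since the latest task, or which has no task yet, then maps the unassigned objects to those tasks. A minimal usage sketch, assuming `ctl_ds` is a PostgreSQL control data source for the `strload_*` tables and `event`/`components` come from the SQS event and the URL pattern match (none of which are defined in this file):

    require 'logger'

    # Assumed inputs (not defined in this diff): `ctl_ds` is the control-table
    # data source, `event` an S3 event from SQS, `components` a URLPatterns match.
    buffer = Bricolage::StreamingLoad::ObjectBuffer.new(
      control_data_source: ctl_ds,
      logger: Logger.new($stderr)
    )

    obj = Bricolage::StreamingLoad::LoadableObject.new(event, components)
    buffer.put(obj)              # one INSERT into strload_objects

    tasks = buffer.flush_tasks   # insert_tasks + insert_task_object_mappings in one transaction
    tasks.each {|task| puts task.id }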
@@ -0,0 +1,124 @@
+ require 'bricolage/sqsdatasource'
+ require 'json'
+
+ module Bricolage
+
+   module StreamingLoad
+
+     class Task < SQSMessage
+
+       def Task.get_concrete_class(msg, rec)
+         case
+         when rec['eventName'] == 'streaming_load_v3' then LoadTask
+         else
+           raise "[FATAL] unknown SQS message record: eventSource=#{rec['eventSource']} event=#{rec['eventName']} message_id=#{msg.message_id}"
+         end
+       end
+
+       def message_type
+         raise "#{self.class}\#message_type must be implemented"
+       end
+
+       def data?
+         false
+       end
+
+     end
+
+
+     class LoadTask < Task
+
+       def LoadTask.create(task_id:, force: false)
+         super name: 'streaming_load_v3', task_id: task_id, force: force
+       end
+
+       def LoadTask.parse_sqs_record(msg, rec)
+         {
+           task_id: rec['taskId'],
+           force: rec['force'],
+         }
+       end
+
+       def LoadTask.load(conn, task_id, force: false)
+         rec = conn.query_row(<<-EndSQL)
+             select
+                 task_class
+                 , tbl.schema_name
+                 , tbl.table_name
+                 , disabled
+             from
+                 strload_tasks tsk
+                 inner join strload_tables tbl
+                 using(schema_name, table_name)
+             where
+                 task_id = #{task_id}
+             ;
+         EndSQL
+         object_urls = conn.query_values(<<-EndSQL)
+             select
+                 object_url
+             from
+                 strload_task_objects
+                 inner join strload_objects
+                 using (object_id)
+                 inner join strload_tasks
+                 using (task_id)
+             where
+                 task_id = #{task_id}
+             ;
+         EndSQL
+         return nil unless rec
+         new(
+           name: rec['task_class'],
+           time: nil,
+           source: nil,
+           task_id: task_id,
+           schema: rec['schema_name'],
+           table: rec['table_name'],
+           object_urls: object_urls,
+           disabled: rec['disabled'] == 'f' ? false : true,
+           force: force
+         )
+       end
+
+       alias message_type name
+
+       def init_message(task_id:, schema: nil, table: nil, object_urls: nil, disabled: false, force: false)
+         @id = task_id
+         @force = force
+
+         # Effective only for queue reader process
+         @schema = schema
+         @table = table
+         @object_urls = object_urls
+         @disabled = disabled
+       end
+
+       attr_reader :id, :force
+
+       #
+       # For writer only
+       #
+
+       attr_reader :schema, :table, :object_urls, :disabled
+
+       def qualified_name
+         "#{@schema}.#{@table}"
+       end
+
+       def body
+         obj = super
+         obj['taskId'] = @id
+         obj['schemaName'] = @schema
+         obj['tableName'] = @table
+         obj['objectUrls'] = @object_urls
+         obj['disabled'] = @disabled
+         obj['force'] = @force
+         obj
+       end
+
+     end
+
+   end # module StreamingLoad
+
+ end # module Bricolage
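
For reference, `LoadTask#body` defines the task-specific part of the SQS message payload, while `parse_sqs_record` reads back only `taskId` and `force` on the consumer side; the remaining fields come from `SQSMessage#body`, which is outside this diff. The fields contributed by this class look roughly like the following (values are illustrative only):

    # Illustrative values; the base SQSMessage fields are omitted.
    {
      'taskId'     => 42,
      'schemaName' => 'app',
      'tableName'  => 'events',
      'objectUrls' => ['s3://bucket/data/00.json.gz'],
      'disabled'   => false,
      'force'      => false
    }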
@@ -0,0 +1,59 @@
+ module Bricolage
+
+   module StreamingLoad
+
+     class URLPatternNotMatched < StandardError; end
+
+
+     class URLPatterns
+
+       def URLPatterns.for_config(configs)
+         new(configs.map {|c|
+           Pattern.new(url: c.fetch('url'), schema: c.fetch('schema'), table: c.fetch('table'))
+         })
+       end
+
+       def initialize(patterns)
+         @patterns = patterns
+       end
+
+       def match(url)
+         @patterns.each do |pat|
+           components = pat.match(url)
+           return components if components
+         end
+         raise URLPatternNotMatched, "no URL pattern matches the object url: #{url.inspect}"
+       end
+
+       class Pattern
+         def initialize(url:, schema:, table:)
+           @url_pattern = /\A#{url}\z/
+           @schema = schema
+           @table = table
+         end
+
+         attr_reader :url_pattern
+         attr_reader :schema
+         attr_reader :table
+
+         def match(url)
+           m = @url_pattern.match(url) or return nil
+           Components.new(get_component(m, @schema), get_component(m, @table))
+         end
+
+         def get_component(m, label)
+           if /\A%/ =~ label
+             m[label[1..-1]]
+           else
+             label
+           end
+         end
+       end
+
+       Components = Struct.new(:schema_name, :table_name)
+
+     end
+
+   end # module StreamingLoad
+
+ end # module Bricolage
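
A `%`-prefixed schema or table value refers to a named capture group in the URL regexp; any other value is used literally. A small usage sketch (the bucket layout and config hash below are made up for illustration):

    patterns = Bricolage::StreamingLoad::URLPatterns.for_config([
      {
        'url'    => 's3://example-bucket/(?<schema>\w+)\.(?<table>\w+)/.*\.gz',
        'schema' => '%schema',   # "%name" picks up the named capture group "name"
        'table'  => '%table'
      }
    ])

    c = patterns.match('s3://example-bucket/app.events/2016/01/01/00.json.gz')
    c.schema_name   # => "app"
    c.table_name    # => "events"

    patterns.match('s3://other-bucket/unknown')   # raises URLPatternNotMatched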