bricolage-streamingload 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,62 @@
1
+ module Bricolage
2
+
3
+ module StreamingLoad
4
+
5
# A JSON manifest listing the object URLs of one load job.
#
# The manifest is stored through the data source object +ds+, which must
# respond to +url(name)+, +object(name)+ (returning something with
# +put(body:)+ and +delete+) and +credential_string+.
class ManifestFile

  # Uploads a manifest, yields it to the block, then removes it again.
  #
  # ds::          data source that stores the manifest
  # job_id::      identifier embedded in the manifest file name
  # object_urls:: URLs to be listed as manifest entries
  # logger::      object responding to +info+
  # noop::        when true, nothing is actually uploaded or deleted
  #
  # Returns the value of the block.
  def ManifestFile.create(ds, job_id:, object_urls:, logger:, noop: false, &block)
    manifest = new(ds, job_id, object_urls, logger: logger, noop: noop)
    manifest.create_temporary(&block)
  end

  def initialize(ds, job_id, object_urls, logger:, noop: false)
    @ds = ds
    @job_id = job_id
    @object_urls = object_urls
    @logger = logger
    @noop = noop
  end

  # Credential string of the underlying data source.
  def credential_string
    @ds.credential_string
  end

  # File name of the manifest, derived from the job ID.
  def name
    @name ||= "manifest-#{@job_id}.json"
  end

  # Full URL of the manifest as reported by the data source.
  def url
    @url ||= @ds.url(name)
  end

  # Pretty-printed JSON document: one entry per object URL, each marked
  # as mandatory.
  def content
    @content ||= begin
      ents = @object_urls.map {|url|
        { "url" => url, "mandatory" => true }
      }
      obj = { "entries" => ents }
      JSON.pretty_generate(obj)
    end
  end

  # Uploads the manifest (only logs when in noop mode).
  def put
    @logger.info "s3: put: #{url}"
    @ds.object(name).put(body: content) unless @noop
  end

  # Removes the manifest (only logs when in noop mode).
  def delete
    @logger.info "s3: delete: #{url}"
    @ds.object(name).delete unless @noop
  end

  # Uploads the manifest, yields self, and always deletes the manifest
  # afterwards -- even when the block raises -- so that a failed load does
  # not leave a stale manifest behind.  If the upload itself fails, no
  # delete is attempted.
  #
  # Returns the value of the block.
  def create_temporary
    put
    begin
      yield self
    ensure
      delete
    end
  end

end
59
+
60
+ end
61
+
62
+ end
@@ -0,0 +1,211 @@
1
+ require 'bricolage/streamingload/task'
2
+ require 'bricolage/streamingload/loaderparams'
3
+ require 'bricolage/sqlutils'
4
+ require 'json'
5
+ require 'securerandom'
6
+ require 'forwardable'
7
+
8
+ module Bricolage
9
+
10
+ module StreamingLoad
11
+
12
# Pairs an event with the schema/table components resolved for it, and
# exposes the attributes of both through a single object.
class LoadableObject

  extend Forwardable

  def initialize(event, components)
    @event = event
    @components = components
  end

  attr_reader :event

  # Attributes answered by the wrapped event.
  def_delegators '@event', :url, :size, :message_id, :receipt_handle
  def_delegator '@event', :time, :event_time

  # Attributes answered by the resolved components.
  def_delegators '@components', :schema_name, :table_name

  # "<schema_name>.<table_name>"
  def data_source_id
    "#{schema_name}.#{table_name}"
  end

  alias qualified_name data_source_id

end
41
+
42
# Buffers incoming objects in the control database (strload_objects) and
# groups the buffered objects into load tasks (strload_tasks /
# strload_task_objects).
class ObjectBuffer

  include SQLUtils

  # control_data_source:: data source whose +open+ yields a connection
  # logger::              logger used for progress messages
  def initialize(control_data_source:, logger:)
    @ctl_ds = control_data_source
    @logger = logger
  end

  # Records one object in strload_objects.
  def put(obj)
    @ctl_ds.open {|conn|
      insert_object(conn, obj)
    }
  end

  # Creates new tasks and assigns buffered objects to them inside a single
  # transaction.  Returns one LoadTask message per task created.
  def flush_tasks
    task_ids = []
    @ctl_ds.open {|conn|
      conn.transaction {
        task_ids = insert_tasks(conn)
        insert_task_object_mappings(conn) unless task_ids.empty?
      }
    }
    return task_ids.map {|id| LoadTask.create(task_id: id) }
  end

  private

  # Inserts +obj+ into strload_objects.  The insert selects from
  # strload_tables, so no row is written when the object's data_source_id
  # is not registered there.
  def insert_object(conn, obj)
    #HACK - suppress log per object
    log_level = @logger.level
    @logger.level = Logger::ERROR
    begin
      conn.update(<<-EndSQL)
        insert into strload_objects
        (object_url
        , object_size
        , data_source_id
        , message_id
        , event_time
        , submit_time
        )
        select
        #{s obj.url}
        , #{obj.size}
        , #{s obj.data_source_id}
        , #{s obj.message_id}
        , '#{obj.event_time}' AT TIME ZONE 'JST'
        , current_timestamp
        from
        strload_tables
        where
        data_source_id = #{s obj.data_source_id}
        ;
      EndSQL
    ensure
      # Always restore the level; otherwise one failed insert would leave
      # the shared logger silenced at ERROR for the rest of the process.
      @logger.level = log_level
    end
  end

  # Inserts one task per enabled table that has unassigned objects and
  # whose batch size or load interval is exceeded (or that has no task
  # yet).  Returns the task IDs produced by the "returning" clause.
  def insert_tasks(conn)
    vals = conn.query_values(<<-EndSQL)
      insert into
      strload_tasks (task_class, schema_name, table_name, submit_time)
      select
      'streaming_load_v3'
      , tbl.schema_name
      , tbl.table_name
      , current_timestamp
      from
      strload_tables tbl
      inner join (
      select
      data_source_id
      , count(*) as object_count
      from (
      select
      min(object_id) as object_id
      , object_url
      from
      strload_objects
      group by
      object_url
      ) uniq_objects
      inner join strload_objects
      using(object_id)
      left outer join strload_task_objects
      using(object_id)
      where
      task_id is null -- not assigned to a task
      group by
      data_source_id
      ) obj -- number of objects not assigned to a task per schema_name.table_name (won't return zero)
      using (data_source_id)
      left outer join (
      select
      schema_name
      , table_name
      , max(submit_time) as latest_submit_time
      from
      strload_tasks
      group by
      schema_name, table_name
      ) task -- preceeding task's submit time
      using(schema_name, table_name)
      where
      not tbl.disabled -- not disabled
      and (
      obj.object_count > tbl.load_batch_size -- batch_size exceeded?
      or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
      or latest_submit_time is null -- no last task
      )
      returning task_id
      ;
    EndSQL
    @logger.info "Number of task created: #{vals.size}"
    vals
  end

  # Assigns unassigned objects (de-duplicated by URL) to the oldest task
  # of their data source that has no objects yet, at most load_batch_size
  # objects per task.
  def insert_task_object_mappings(conn)
    conn.update(<<-EndSQL)
      insert into
      strload_task_objects
      select
      task_id
      , object_id
      from (
      select
      row_number() over(partition by task.task_id order by obj.object_id) as object_count
      , task.task_id
      , obj.object_id
      , load_batch_size
      from (
      select
      min(object_id) as object_id
      , object_url
      , data_source_id
      from
      strload_objects
      group by
      2, 3
      ) obj
      inner join (
      select
      min(task_id) as task_id -- oldest task
      , tbl.data_source_id
      , max(load_batch_size) as load_batch_size
      from
      strload_tasks
      inner join strload_tables tbl
      using(schema_name, table_name)
      where
      task_id not in (select distinct task_id from strload_task_objects) -- no assigned objects
      group by
      2
      ) task -- tasks without objects
      using(data_source_id)
      left outer join strload_task_objects task_obj
      using(object_id)
      where
      task_obj.object_id is null -- not assigned to a task
      ) as t
      where
      object_count <= load_batch_size -- limit number of objects assigned to single task
      ;
    EndSQL
  end

end
208
+
209
+ end
210
+
211
+ end
@@ -0,0 +1,124 @@
1
+ require 'bricolage/sqsdatasource'
2
+ require 'json'
3
+
4
+ module Bricolage
5
+
6
+ module StreamingLoad
7
+
8
# Base class of the task messages exchanged over SQS.
class Task < SQSMessage

  # Chooses the message class for an SQS record by its 'eventName'.
  # Raises for any record that is not a known task.
  def self.get_concrete_class(msg, rec)
    return LoadTask if rec['eventName'] == 'streaming_load_v3'

    raise "[FATAL] unknown SQS message record: eventSource=#{rec['eventSource']} event=#{rec['eventName']} message_id=#{msg.message_id}"
  end

  # Subclasses must override this.
  def message_type
    raise "#{self.class}\#message_type must be implemented"
  end

  # Task messages are never data messages.
  def data?
    false
  end

end
27
+
28
+
29
# Task message of class 'streaming_load_v3'.
class LoadTask < Task

  # Builds a message that carries only the task ID and the force flag.
  def LoadTask.create(task_id:, force: false)
    super name: 'streaming_load_v3', task_id: task_id, force: force
  end

  # Extracts the task attributes from an SQS record.
  def LoadTask.parse_sqs_record(msg, rec)
    {
      task_id: rec['taskId'],
      force: rec['force'],
    }
  end

  # Loads the task +task_id+ together with its object URLs through +conn+.
  # Returns nil when no such task (joined with its table) exists.
  #
  # NOTE(review): task_id is interpolated into the SQL as-is -- presumably
  # it is always an integer; confirm against callers.
  def LoadTask.load(conn, task_id, force: false)
    rec = conn.query_row(<<-EndSQL)
      select
      task_class
      , tbl.schema_name
      , tbl.table_name
      , disabled
      from
      strload_tasks tsk
      inner join strload_tables tbl
      using(schema_name, table_name)
      where
      task_id = #{task_id}
      ;
    EndSQL
    # Bail out before issuing the second query when the task is unknown.
    return nil unless rec
    object_urls = conn.query_values(<<-EndSQL)
      select
      object_url
      from
      strload_task_objects
      inner join strload_objects
      using (object_id)
      inner join strload_tasks
      using (task_id)
      where
      task_id = #{task_id}
      ;
    EndSQL
    new(
      name: rec['task_class'],
      time: nil,
      source: nil,
      task_id: task_id,
      schema: rec['schema_name'],
      table: rec['table_name'],
      object_urls: object_urls,
      # 'f' is the textual boolean returned by the driver; a native false
      # (in case the connection type-casts booleans) also means "not
      # disabled".  Anything else keeps the conservative "disabled".
      disabled: !['f', false].include?(rec['disabled']),
      force: force
    )
  end

  alias message_type name

  def init_message(task_id:, schema: nil, table: nil, object_urls: nil, disabled: false, force: false)
    @id = task_id
    @force = force

    # Effective only for queue reader process
    @schema = schema
    @table = table
    @object_urls = object_urls
    @disabled = disabled
  end

  attr_reader :id, :force

  #
  # For writer only
  #

  attr_reader :schema, :table, :object_urls, :disabled

  # "<schema>.<table>"
  def qualified_name
    "#{@schema}.#{@table}"
  end

  # Message body: the superclass body extended with the task attributes.
  def body
    obj = super
    obj['taskId'] = @id
    obj['schemaName'] = @schema
    obj['tableName'] = @table
    obj['objectUrls'] = @object_urls
    obj['disabled'] = @disabled
    obj['force'] = @force
    obj
  end

end
121
+
122
+ end # module StreamingLoad
123
+
124
+ end # module Bricolage
@@ -0,0 +1,59 @@
1
+ module Bricolage
2
+
3
+ module StreamingLoad
4
+
5
# Raised when an object URL matches none of the configured URL patterns.
class URLPatternNotMatched < StandardError
end
6
+
7
+
8
# Ordered list of URL patterns that map an object URL to a schema name
# and a table name.
class URLPatterns

  # Result of a successful match.
  Components = Struct.new(:schema_name, :table_name)

  # Builds the pattern list from an array of hashes, each of which must
  # have the keys 'url', 'schema' and 'table'.
  def URLPatterns.for_config(configs)
    patterns = configs.map do |conf|
      Pattern.new(url: conf.fetch('url'), schema: conf.fetch('schema'), table: conf.fetch('table'))
    end
    new(patterns)
  end

  def initialize(patterns)
    @patterns = patterns
  end

  # Returns the Components of the first pattern matching +url+.
  # Raises URLPatternNotMatched when no pattern matches.
  def match(url)
    components = nil
    # any? stops at the first pattern whose match is truthy.
    @patterns.any? { |pattern| components = pattern.match(url) }
    return components if components

    raise URLPatternNotMatched, "no URL pattern matches the object url: #{url.inspect}"
  end

  # One URL regular expression plus the schema/table labels to report.
  class Pattern
    attr_reader :url_pattern
    attr_reader :schema
    attr_reader :table

    # +url+ is regular-expression source; it is anchored to the whole string.
    def initialize(url:, schema:, table:)
      @url_pattern = /\A#{url}\z/
      @schema = schema
      @table = table
    end

    # Returns Components when +url+ matches the whole pattern, otherwise nil.
    def match(url)
      m = @url_pattern.match(url)
      return nil unless m

      Components.new(get_component(m, @schema), get_component(m, @table))
    end

    # A label starting with "%" names a capture group of the match;
    # any other label is returned literally.
    def get_component(m, label)
      return label unless /\A%/ =~ label

      m[label[1..-1]]
    end
  end

end
56
+
57
+ end # module StreamingLoad
58
+
59
+ end # module Bricolage