bricolage-streamingload 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +19 -0
- data/bin/bricolage-streaming-dispatcher +6 -0
- data/bin/bricolage-streaming-loader +6 -0
- data/lib/bricolage/sqsdatasource.rb +299 -0
- data/lib/bricolage/sqswrapper.rb +77 -0
- data/lib/bricolage/streamingload/dispatcher.rb +181 -0
- data/lib/bricolage/streamingload/event.rb +139 -0
- data/lib/bricolage/streamingload/loader.rb +144 -0
- data/lib/bricolage/streamingload/loaderparams.rb +153 -0
- data/lib/bricolage/streamingload/loaderservice.rb +163 -0
- data/lib/bricolage/streamingload/manifest.rb +62 -0
- data/lib/bricolage/streamingload/objectbuffer.rb +211 -0
- data/lib/bricolage/streamingload/task.rb +124 -0
- data/lib/bricolage/streamingload/urlpatterns.rb +59 -0
- data/lib/bricolage/streamingload/version.rb +5 -0
- data/test/all.rb +3 -0
- data/test/streamingload/test_event.rb +30 -0
- metadata +148 -0
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'json'

module Bricolage

  module StreamingLoad

    # A temporary S3 manifest file listing the object URLs for one load.
    # The manifest is uploaded before the load and removed afterwards.
    class ManifestFile

      # Creates a manifest, uploads it, yields it to the block, and always
      # removes it afterwards (see #create_temporary).
      #
      # ds          -- data source providing #credential_string, #url, #object.
      # job_id      -- unique id embedded in the manifest file name.
      # object_urls -- S3 object URLs to list in the manifest.
      # noop        -- when true, skip all S3 writes (log only).
      def ManifestFile.create(ds, job_id:, object_urls:, logger:, noop: false, &block)
        manifest = new(ds, job_id, object_urls, logger: logger, noop: noop)
        manifest.create_temporary(&block)
      end

      def initialize(ds, job_id, object_urls, logger:, noop: false)
        @ds = ds
        @job_id = job_id
        @object_urls = object_urls
        @logger = logger
        @noop = noop
      end

      def credential_string
        @ds.credential_string
      end

      # File name of the manifest; memoized.
      def name
        @name ||= "manifest-#{@job_id}.json"
      end

      # Full URL of the manifest on the data source; memoized.
      def url
        @url ||= @ds.url(name)
      end

      # JSON manifest body:
      #   {"entries": [{"url": ..., "mandatory": true}, ...]}
      # Every entry is marked mandatory; memoized.
      def content
        @content ||= begin
          ents = @object_urls.map {|url|
            { "url" => url, "mandatory" => true }
          }
          obj = { "entries" => ents }
          JSON.pretty_generate(obj)
        end
      end

      # Uploads the manifest body (skipped when noop).
      def put
        @logger.info "s3: put: #{url}"
        @ds.object(name).put(body: content) unless @noop
      end

      # Deletes the manifest from S3 (skipped when noop).
      def delete
        @logger.info "s3: delete: #{url}"
        @ds.object(name).delete unless @noop
      end

      # Uploads the manifest, yields self, and removes the manifest again.
      # FIX: deletion now runs in an ensure block, so the temporary file is
      # cleaned up even when the block raises (previously a failing block
      # leaked the manifest on S3).
      def create_temporary
        put
        yield self
      ensure
        delete
      end

    end

  end

end
|
@@ -0,0 +1,211 @@
|
|
1
|
+
require 'bricolage/streamingload/task'
|
2
|
+
require 'bricolage/streamingload/loaderparams'
|
3
|
+
require 'bricolage/sqlutils'
|
4
|
+
require 'json'
|
5
|
+
require 'securerandom'
|
6
|
+
require 'forwardable'
|
7
|
+
|
8
|
+
module Bricolage
|
9
|
+
|
10
|
+
module StreamingLoad
|
11
|
+
|
12
|
+
# A single S3 object queued for loading: pairs the raw SQS event with the
# (schema, table) components resolved from its URL.
class LoadableObject

  extend Forwardable

  def initialize(event, components)
    @event = event
    @components = components
  end

  attr_reader :event

  # Raw event attributes.
  def_delegators '@event', :url, :size, :message_id, :receipt_handle
  def_delegator '@event', :time, :event_time

  # Resolved destination table components.
  def_delegators '@components', :schema_name, :table_name

  # "schema.table" identifier of the destination table.
  def data_source_id
    "#{schema_name}.#{table_name}"
  end

  alias qualified_name data_source_id

end
|
41
|
+
|
42
|
+
# Persists incoming objects into the control DB (strload_objects) and
# periodically groups unassigned objects into load tasks (strload_tasks /
# strload_task_objects).  All grouping logic lives in SQL against the
# control data source.
class ObjectBuffer

  include SQLUtils

  # control_data_source -- control DB data source (provides #open with a connection).
  # logger              -- shared logger; its level is temporarily raised in
  #                        #insert_object (see the HACK note there).
  def initialize(control_data_source:, logger:)
    @ctl_ds = control_data_source
    @logger = logger
  end

  # Records one incoming object in strload_objects.
  def put(obj)
    @ctl_ds.open {|conn|
      insert_object(conn, obj)
    }
  end

  # Creates tasks for tables whose pending objects exceed the batch size or
  # whose load interval has expired, then assigns unassigned objects to the
  # new tasks.  Both statements run in a single transaction.
  # Returns the created tasks as LoadTask messages (possibly empty).
  def flush_tasks
    task_ids = []
    @ctl_ds.open {|conn|
      conn.transaction {|txn|
        task_ids = insert_tasks(conn)
        # Object assignment only makes sense when at least one task was made.
        insert_task_object_mappings(conn) unless task_ids.empty?
      }
    }
    return task_ids.map {|id| LoadTask.create(task_id: id) }
  end

  private

  # Inserts one object row.  The select-from-strload_tables form means the
  # insert is a no-op when the object's data_source_id has no registered
  # table row.
  def insert_object(conn, obj)
    #HACK - suppress log per object
    # NOTE(review): mutating the shared logger level is not thread-safe and
    # is not restored on exception — confirm single-threaded use.
    log_level = @logger.level
    @logger.level = Logger::ERROR
    # NOTE(review): obj.event_time is interpolated without the `s` escaping
    # helper used for the other values; presumably it is a trusted Time —
    # verify against the caller.
    conn.update(<<-EndSQL)
        insert into strload_objects
            (object_url
            , object_size
            , data_source_id
            , message_id
            , event_time
            , submit_time
            )
        select
            #{s obj.url}
            , #{obj.size}
            , #{s obj.data_source_id}
            , #{s obj.message_id}
            , '#{obj.event_time}' AT TIME ZONE 'JST'
            , current_timestamp
        from
            strload_tables
        where
            data_source_id = #{s obj.data_source_id}
        ;
    EndSQL
    @logger.level = log_level
  end

  # Creates one task per enabled table that either has more unassigned
  # (deduplicated by object_url) objects than load_batch_size, or whose
  # latest task is older than load_interval (or has no task at all).
  # Returns the new task ids via INSERT ... RETURNING.
  def insert_tasks(conn)
    vals = conn.query_values(<<-EndSQL)
        insert into
            strload_tasks (task_class, schema_name, table_name, submit_time)
        select
            'streaming_load_v3'
            , tbl.schema_name
            , tbl.table_name
            , current_timestamp
        from
            strload_tables tbl
            inner join (
                select
                    data_source_id
                    , count(*) as object_count
                from (
                    select
                        min(object_id) as object_id
                        , object_url
                    from
                        strload_objects
                    group by
                        object_url
                    ) uniq_objects
                    inner join strload_objects
                    using(object_id)
                    left outer join strload_task_objects
                    using(object_id)
                where
                    task_id is null -- not assigned to a task
                group by
                    data_source_id
                ) obj -- number of objects not assigned to a task per schema_name.table_name (won't return zero)
                using (data_source_id)
            left outer join (
                select
                    schema_name
                    , table_name
                    , max(submit_time) as latest_submit_time
                from
                    strload_tasks
                group by
                    schema_name, table_name
                ) task -- preceeding task's submit time
                using(schema_name, table_name)
        where
            not tbl.disabled -- not disabled
            and (
                obj.object_count > tbl.load_batch_size -- batch_size exceeded?
                or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
                or latest_submit_time is null -- no last task
            )
        returning task_id
        ;
    EndSQL
    @logger.info "Number of task created: #{vals.size}"
    vals
  end

  # Assigns unassigned, deduplicated objects to the tasks that currently
  # have no objects, at most load_batch_size objects per task (enforced via
  # row_number() over each task's ordered objects).
  def insert_task_object_mappings(conn)
    conn.update(<<-EndSQL)
        insert into
            strload_task_objects
        select
            task_id
            , object_id
        from (
            select
                row_number() over(partition by task.task_id order by obj.object_id) as object_count
                , task.task_id
                , obj.object_id
                , load_batch_size
            from (
                select
                    min(object_id) as object_id
                    , object_url
                    , data_source_id
                from
                    strload_objects
                group by
                    2, 3
                ) obj
                inner join (
                    select
                        min(task_id) as task_id -- oldest task
                        , tbl.data_source_id
                        , max(load_batch_size) as load_batch_size
                    from
                        strload_tasks
                        inner join strload_tables tbl
                        using(schema_name, table_name)
                    where
                        task_id not in (select distinct task_id from strload_task_objects) -- no assigned objects
                    group by
                        2
                    ) task -- tasks without objects
                    using(data_source_id)
                left outer join strload_task_objects task_obj
                    using(object_id)
            where
                task_obj.object_id is null -- not assigned to a task
            ) as t
        where
            object_count <= load_batch_size -- limit number of objects assigned to single task
        ;
    EndSQL
  end

end
|
208
|
+
|
209
|
+
end
|
210
|
+
|
211
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'bricolage/sqsdatasource'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
module Bricolage
|
5
|
+
|
6
|
+
module StreamingLoad
|
7
|
+
|
8
|
+
# Base class for task messages received over SQS.
class Task < SQSMessage

  # Maps an SQS record to its concrete Task subclass.  Only the
  # 'streaming_load_v3' event is known; anything else is fatal.
  def Task.get_concrete_class(msg, rec)
    return LoadTask if rec['eventName'] == 'streaming_load_v3'
    raise "[FATAL] unknown SQS message record: eventSource=#{rec['eventSource']} event=#{rec['eventName']} message_id=#{msg.message_id}"
  end

  # Subclasses must override this with their message type string.
  def message_type
    raise "#{self.class}\#message_type must be implemented"
  end

  # Task messages never carry data payloads.
  def data?
    false
  end

end
|
27
|
+
|
28
|
+
|
29
|
+
# SQS message representing one streaming-load task.  Built either from an
# incoming SQS record (reader side) or from the control DB (writer side).
class LoadTask < Task

  # Builds an outgoing task message for the given task id.
  def LoadTask.create(task_id:, force: false)
    super name: 'streaming_load_v3', task_id: task_id, force: force
  end

  # Extracts init_message keyword arguments from an SQS record.
  def LoadTask.parse_sqs_record(msg, rec)
    {
      task_id: rec['taskId'],
      force: rec['force'],
    }
  end

  # Loads task attributes and its object URL list from the control DB.
  # Returns nil when the task id is unknown.
  def LoadTask.load(conn, task_id, force: false)
    # NOTE(review): task_id is interpolated into SQL unescaped; callers must
    # pass a trusted (integer) id — confirm call sites.
    rec = conn.query_row(<<-EndSQL)
        select
            task_class
            , tbl.schema_name
            , tbl.table_name
            , disabled
        from
            strload_tasks tsk
            inner join strload_tables tbl
            using(schema_name, table_name)
        where
            task_id = #{task_id}
        ;
    EndSQL
    # FIX: bail out before the object-URL query; previously the second query
    # ran even for unknown tasks and its result was discarded.
    return nil unless rec
    object_urls = conn.query_values(<<-EndSQL)
        select
            object_url
        from
            strload_task_objects
            inner join strload_objects
            using (object_id)
            inner join strload_tasks
            using (task_id)
        where
            task_id = #{task_id}
        ;
    EndSQL
    new(
      name: rec['task_class'],
      time: nil,
      source: nil,
      task_id: task_id,
      schema: rec['schema_name'],
      table: rec['table_name'],
      object_urls: object_urls,
      disabled: rec['disabled'] == 'f' ? false : true,   # PostgreSQL boolean text: 'f' / 't'
      force: force
    )
  end

  alias message_type name

  # Initializes message attributes.  schema/table/object_urls/disabled are
  # only populated on the queue reader (writer) side.
  def init_message(task_id:, schema: nil, table: nil, object_urls: nil, disabled: false, force: false)
    @id = task_id
    @force = force

    # Effective only for queue reader process
    @schema = schema
    @table = table
    @object_urls = object_urls
    @disabled = disabled
  end

  attr_reader :id, :force

  #
  # For writer only
  #

  attr_reader :schema, :table, :object_urls, :disabled

  # "schema.table" name of the destination table.
  def qualified_name
    "#{@schema}.#{@table}"
  end

  # Serializable message body: superclass fields plus task attributes.
  def body
    obj = super
    obj['taskId'] = @id
    obj['schemaName'] = @schema
    obj['tableName'] = @table
    obj['objectUrls'] = @object_urls
    obj['disabled'] = @disabled
    obj['force'] = @force
    obj
  end

end
|
121
|
+
|
122
|
+
end # module StreamingLoad
|
123
|
+
|
124
|
+
end # module Bricolage
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Bricolage

  module StreamingLoad

    # Raised when an object URL matches none of the configured patterns.
    class URLPatternNotMatched < StandardError; end


    # An ordered list of rules mapping object URLs to (schema, table) pairs.
    class URLPatterns

      # Builds the pattern list from config entries, each a hash with
      # 'url' (regexp source), 'schema' and 'table' keys.
      def URLPatterns.for_config(configs)
        pats = configs.map do |conf|
          Pattern.new(url: conf.fetch('url'), schema: conf.fetch('schema'), table: conf.fetch('table'))
        end
        new(pats)
      end

      def initialize(patterns)
        @patterns = patterns
      end

      # Returns the Components of the first pattern matching url.
      # Raises URLPatternNotMatched when nothing matches.
      def match(url)
        found = nil
        @patterns.each do |pattern|
          found = pattern.match(url)
          break if found
        end
        unless found
          raise URLPatternNotMatched, "no URL pattern matches the object url: #{url.inspect}"
        end
        found
      end

      # One mapping rule: a whole-string URL regexp plus schema/table labels.
      class Pattern
        def initialize(url:, schema:, table:)
          # Anchor with \A..\z so the whole URL must match.
          @url_pattern = /\A#{url}\z/
          @schema = schema
          @table = table
        end

        attr_reader :url_pattern, :schema, :table

        # Returns Components when url matches this pattern, nil otherwise.
        def match(url)
          md = @url_pattern.match(url)
          return nil unless md
          Components.new(get_component(md, @schema), get_component(md, @table))
        end

        # A label beginning with '%' names a capture group of the URL
        # regexp; any other label is taken literally.
        def get_component(md, label)
          label.start_with?('%') ? md[label[1..-1]] : label
        end
      end

      Components = Struct.new(:schema_name, :table_name)

    end

  end # module StreamingLoad

end # module Bricolage
|