bricolage-streamingload 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +19 -0
- data/bin/bricolage-streaming-dispatcher +6 -0
- data/bin/bricolage-streaming-loader +6 -0
- data/lib/bricolage/sqsdatasource.rb +299 -0
- data/lib/bricolage/sqswrapper.rb +77 -0
- data/lib/bricolage/streamingload/dispatcher.rb +181 -0
- data/lib/bricolage/streamingload/event.rb +139 -0
- data/lib/bricolage/streamingload/loader.rb +144 -0
- data/lib/bricolage/streamingload/loaderparams.rb +153 -0
- data/lib/bricolage/streamingload/loaderservice.rb +163 -0
- data/lib/bricolage/streamingload/manifest.rb +62 -0
- data/lib/bricolage/streamingload/objectbuffer.rb +211 -0
- data/lib/bricolage/streamingload/task.rb +124 -0
- data/lib/bricolage/streamingload/urlpatterns.rb +59 -0
- data/lib/bricolage/streamingload/version.rb +5 -0
- data/test/all.rb +3 -0
- data/test/streamingload/test_event.rb +30 -0
- metadata +148 -0
@@ -0,0 +1,62 @@
|
|
1
|
+
module Bricolage

  module StreamingLoad

    # A Redshift COPY manifest file materialized on S3.
    #
    # The manifest JSON lists every object URL of a load task so that a
    # single COPY statement can load them all.  The file is written to the
    # given S3 data source, used, and removed again (see .create /
    # #create_temporary).
    class ManifestFile

      # Builds a manifest for +object_urls+, uploads it, yields it to the
      # block, and removes it afterwards.  Returns the block's value.
      #
      # ds          :: S3 data source (must respond to #url, #object, #credential_string)
      # job_id      :: unique id embedded in the manifest file name
      # object_urls :: array of S3 object URL strings to be loaded
      # noop        :: when true, no S3 request is issued (log only)
      def ManifestFile.create(ds, job_id:, object_urls:, logger:, noop: false, &block)
        manifest = new(ds, job_id, object_urls, logger: logger, noop: noop)
        manifest.create_temporary(&block)
      end

      def initialize(ds, job_id, object_urls, logger:, noop: false)
        @ds = ds
        @job_id = job_id
        @object_urls = object_urls
        @logger = logger
        @noop = noop
      end

      # AWS credential string for the COPY statement, delegated to the
      # data source.
      def credential_string
        @ds.credential_string
      end

      # File name of the manifest, derived from the job id.
      def name
        @name ||= "manifest-#{@job_id}.json"
      end

      # Full S3 URL of the manifest file.
      def url
        @url ||= @ds.url(name)
      end

      # Manifest body: {"entries": [{"url": ..., "mandatory": true}, ...]}
      # Every entry is mandatory so that a missing object fails the COPY.
      def content
        @content ||= begin
          ents = @object_urls.map {|url|
            { "url" => url, "mandatory" => true }
          }
          obj = { "entries" => ents }
          JSON.pretty_generate(obj)
        end
      end

      # Uploads the manifest to S3 (skipped when noop).
      def put
        @logger.info "s3: put: #{url}"
        @ds.object(name).put(body: content) unless @noop
      end

      # Removes the manifest from S3 (skipped when noop).
      def delete
        @logger.info "s3: delete: #{url}"
        @ds.object(name).delete unless @noop
      end

      # Uploads the manifest, yields self, and ALWAYS deletes the file
      # afterwards.  Returns the block's value.
      #
      # FIX: #delete now runs in an ensure clause; previously a raising
      # block leaked the temporary manifest file on S3.
      def create_temporary
        put
        begin
          yield self
        ensure
          delete
        end
      end

    end

  end

end
|
@@ -0,0 +1,211 @@
|
|
1
|
+
require 'bricolage/streamingload/task'
|
2
|
+
require 'bricolage/streamingload/loaderparams'
|
3
|
+
require 'bricolage/sqlutils'
|
4
|
+
require 'json'
|
5
|
+
require 'securerandom'
|
6
|
+
require 'forwardable'
|
7
|
+
|
8
|
+
module Bricolage
|
9
|
+
|
10
|
+
module StreamingLoad
|
11
|
+
|
12
|
+
# Couples one incoming S3 object event with the destination (schema,
# table) components resolved from its URL, exposing both through a
# single read-only facade.
class LoadableObject

  extend Forwardable

  # event      :: S3 object event (provides url, size, message_id,
  #               receipt_handle, time)
  # components :: resolved destination (provides schema_name, table_name)
  def initialize(event, components)
    @event = event
    @components = components
  end

  attr_reader :event

  # Attributes coming from the event side.
  def_delegators '@event', :url, :size, :message_id, :receipt_handle

  # Attributes coming from the resolved destination side.
  def_delegators '@components', :schema_name, :table_name

  # Identifier of the destination data source: "schema.table".
  def data_source_id
    "#{schema_name}.#{table_name}"
  end

  alias qualified_name data_source_id

  # Event timestamp, as carried by the underlying event.
  def event_time
    @event.time
  end

end
|
41
|
+
|
42
|
+
# Buffers incoming loadable objects in the control database and groups
# them into load tasks (strload_objects / strload_tasks /
# strload_task_objects tables).
class ObjectBuffer

  include SQLUtils

  # control_data_source :: PostgreSQL-compatible control DB data source
  #                        (must respond to #open yielding a connection)
  def initialize(control_data_source:, logger:)
    @ctl_ds = control_data_source
    @logger = logger
  end

  # Records a single loadable object in strload_objects.
  def put(obj)
    @ctl_ds.open {|conn|
      insert_object(conn, obj)
    }
  end

  # Creates load tasks for all tables whose batch size or load interval
  # is exceeded, assigns unassigned objects to them, and returns the
  # new tasks as LoadTask messages.
  def flush_tasks
    task_ids = []
    @ctl_ds.open {|conn|
      conn.transaction {|txn|
        task_ids = insert_tasks(conn)
        insert_task_object_mappings(conn) unless task_ids.empty?
      }
    }
    return task_ids.map {|id| LoadTask.create(task_id: id) }
  end

  private

  # Inserts one object row.  The row is only created when the object's
  # data_source_id exists in strload_tables.
  def insert_object(conn, obj)
    #HACK - suppress log per object
    log_level = @logger.level
    @logger.level = Logger::ERROR
    begin
      conn.update(<<-EndSQL)
          insert into strload_objects
              (object_url
              , object_size
              , data_source_id
              , message_id
              , event_time
              , submit_time
              )
          select
              #{s obj.url}
              , #{obj.size}
              , #{s obj.data_source_id}
              , #{s obj.message_id}
              , '#{obj.event_time}' AT TIME ZONE 'JST'
              , current_timestamp
          from
              strload_tables
          where
              data_source_id = #{s obj.data_source_id}
          ;
      EndSQL
    ensure
      # FIX: restore the log level even when the insert raises;
      # previously a DB error left the logger silenced at ERROR forever.
      @logger.level = log_level
    end
  end

  # Creates one task per table which either has more unassigned objects
  # than its load_batch_size, has exceeded its load_interval since the
  # last task, or has never had a task.  Returns the new task ids.
  def insert_tasks(conn)
    vals = conn.query_values(<<-EndSQL)
        insert into
            strload_tasks (task_class, schema_name, table_name, submit_time)
        select
            'streaming_load_v3'
            , tbl.schema_name
            , tbl.table_name
            , current_timestamp
        from
            strload_tables tbl
            inner join (
                select
                    data_source_id
                    , count(*) as object_count
                from (
                    select
                        min(object_id) as object_id
                        , object_url
                    from
                        strload_objects
                    group by
                        object_url
                    ) uniq_objects
                    inner join strload_objects
                    using(object_id)
                    left outer join strload_task_objects
                    using(object_id)
                where
                    task_id is null -- not assigned to a task
                group by
                    data_source_id
            ) obj -- number of objects not assigned to a task per schema_name.table_name (won't return zero)
            using (data_source_id)
            left outer join (
                select
                    schema_name
                    , table_name
                    , max(submit_time) as latest_submit_time
                from
                    strload_tasks
                group by
                    schema_name, table_name
            ) task -- preceeding task's submit time
            using(schema_name, table_name)
        where
            not tbl.disabled -- not disabled
            and (
                obj.object_count > tbl.load_batch_size -- batch_size exceeded?
                or extract(epoch from current_timestamp - latest_submit_time) > load_interval -- load_interval exceeded?
                or latest_submit_time is null -- no last task
            )
        returning task_id
        ;
    EndSQL
    @logger.info "Number of task created: #{vals.size}"
    vals
  end

  # Assigns unassigned (deduplicated) objects to the oldest task of
  # their data source, at most load_batch_size objects per task.
  def insert_task_object_mappings(conn)
    conn.update(<<-EndSQL)
        insert into
            strload_task_objects
        select
            task_id
            , object_id
        from (
            select
                row_number() over(partition by task.task_id order by obj.object_id) as object_count
                , task.task_id
                , obj.object_id
                , load_batch_size
            from (
                select
                    min(object_id) as object_id
                    , object_url
                    , data_source_id
                from
                    strload_objects
                group by
                    2, 3
                ) obj
                inner join (
                    select
                        min(task_id) as task_id -- oldest task
                        , tbl.data_source_id
                        , max(load_batch_size) as load_batch_size
                    from
                        strload_tasks
                        inner join strload_tables tbl
                        using(schema_name, table_name)
                    where
                        task_id not in (select distinct task_id from strload_task_objects) -- no assigned objects
                    group by
                        2
                ) task -- tasks without objects
                using(data_source_id)
                left outer join strload_task_objects task_obj
                using(object_id)
            where
                task_obj.object_id is null -- not assigned to a task
            ) as t
        where
            object_count <= load_batch_size -- limit number of objects assigned to single task
        ;
    EndSQL
  end

end
|
208
|
+
|
209
|
+
end
|
210
|
+
|
211
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
require 'bricolage/sqsdatasource'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
module Bricolage
|
5
|
+
|
6
|
+
module StreamingLoad
|
7
|
+
|
8
|
+
# Abstract base class for task messages received via SQS.
# Concrete subclasses are selected by Task.get_concrete_class from the
# record's eventName.
class Task < SQSMessage

  # Returns the Task subclass handling the given SQS record.
  # Raises for unrecognized event names.
  def Task.get_concrete_class(msg, rec)
    if rec['eventName'] == 'streaming_load_v3'
      LoadTask
    else
      raise "[FATAL] unknown SQS message record: eventSource=#{rec['eventSource']} event=#{rec['eventName']} message_id=#{msg.message_id}"
    end
  end

  # Subclass responsibility.
  def message_type
    raise "#{self.class}\#message_type must be implemented"
  end

  # Task messages never carry data records.
  def data?
    false
  end

end
|
27
|
+
|
28
|
+
|
29
|
+
# SQS message representing one streaming-load task ('streaming_load_v3').
#
# NOTE(review): construction and serialization depend on the SQSMessage
# superclass protocol (keyword ctor with name:/time:/source:, #init_message
# hook, #body returning a Hash) — defined in bricolage/sqsdatasource, not
# visible here; confirm against that file.
class LoadTask < Task

  # Builds a new task message for the given task id.
  # force: when true, the loader runs the task even if already done /
  # disabled (presumably — verify against loader).
  def LoadTask.create(task_id:, force: false)
    super name: 'streaming_load_v3', task_id: task_id, force: force
  end

  # Extracts LoadTask-specific attributes from a raw SQS record.
  def LoadTask.parse_sqs_record(msg, rec)
    {
      task_id: rec['taskId'],
      force: rec['force'],
    }
  end

  # Reconstructs a full task (including its object URL list) from the
  # control DB.  Returns nil when the task id does not exist.
  #
  # NOTE(review): task_id is interpolated into SQL unquoted — safe only
  # while task_id is a DB-generated integer; confirm callers never pass
  # untrusted input.
  def LoadTask.load(conn, task_id, force: false)
    rec = conn.query_row(<<-EndSQL)
      select
          task_class
          , tbl.schema_name
          , tbl.table_name
          , disabled
      from
          strload_tasks tsk
          inner join strload_tables tbl
          using(schema_name, table_name)
      where
          task_id = #{task_id}
      ;
    EndSQL
    object_urls = conn.query_values(<<-EndSQL)
      select
          object_url
      from
          strload_task_objects
          inner join strload_objects
          using (object_id)
          inner join strload_tasks
          using (task_id)
      where
          task_id = #{task_id}
      ;
    EndSQL
    return nil unless rec
    new(
      name: rec['task_class'],
      time: nil,
      source: nil,
      task_id: task_id,
      schema: rec['schema_name'],
      table: rec['table_name'],
      object_urls: object_urls,
      # PostgreSQL boolean comes back as text: 'f' => false, anything else => true
      disabled: rec['disabled'] == 'f' ? false : true,
      force: force
    )
  end

  # The message type equals the task name ('streaming_load_v3').
  alias message_type name

  # Called by the SQSMessage constructor with the parsed attributes.
  def init_message(task_id:, schema: nil, table: nil, object_urls: nil, disabled: false, force: false)
    @id = task_id
    @force = force

    # Effective only for queue reader process
    @schema = schema
    @table = table
    @object_urls = object_urls
    @disabled = disabled
  end

  attr_reader :id, :force

  #
  # For writer only
  #

  attr_reader :schema, :table, :object_urls, :disabled

  # "schema.table" of the destination table.
  def qualified_name
    "#{@schema}.#{@table}"
  end

  # Serializes this task into the SQS message body hash (camelCase keys,
  # merged over the superclass body).
  def body
    obj = super
    obj['taskId'] = @id
    obj['schemaName'] = @schema
    obj['tableName'] = @table
    obj['objectUrls'] = @object_urls
    obj['disabled'] = @disabled
    obj['force'] = @force
    obj
  end

end
|
121
|
+
|
122
|
+
end # module StreamingLoad
|
123
|
+
|
124
|
+
end # module Bricolage
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Bricolage

  module StreamingLoad

    # Raised when no configured pattern matches an object URL.
    class URLPatternNotMatched < StandardError; end


    # Resolves S3 object URLs to their destination (schema, table) by
    # matching against an ordered list of configured regexp patterns.
    class URLPatterns

      # Builds a URLPatterns from configuration entries; each entry must
      # provide 'url' (a regexp source), 'schema' and 'table'.
      def URLPatterns.for_config(configs)
        patterns = configs.map do |conf|
          Pattern.new(url: conf.fetch('url'), schema: conf.fetch('schema'), table: conf.fetch('table'))
        end
        new(patterns)
      end

      def initialize(patterns)
        @patterns = patterns
      end

      # Returns the Components of the first pattern matching +url+.
      # Raises URLPatternNotMatched when no pattern matches.
      def match(url)
        matched = nil
        @patterns.find do |pattern|
          matched = pattern.match(url)
        end
        return matched if matched
        raise URLPatternNotMatched, "no URL pattern matches the object url: #{url.inspect}"
      end

      # One configured URL pattern plus its schema/table labels.
      class Pattern
        def initialize(url:, schema:, table:)
          # Anchor the configured regexp to the whole URL string.
          @url_pattern = /\A#{url}\z/
          @schema = schema
          @table = table
        end

        attr_reader :url_pattern, :schema, :table

        # Returns Components for +url+, or nil when it does not match.
        def match(url)
          md = @url_pattern.match(url)
          return nil unless md
          Components.new(get_component(md, @schema), get_component(md, @table))
        end

        # A '%name' label resolves to the named capture group; any other
        # label is taken literally.
        def get_component(md, label)
          if label.start_with?('%')
            md[label[1..-1]]
          else
            label
          end
        end
      end

      Components = Struct.new(:schema_name, :table_name)

    end

  end # module StreamingLoad

end # module Bricolage
|