apisonator 2.100.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +317 -0
- data/Gemfile +11 -0
- data/Gemfile.base +65 -0
- data/Gemfile.lock +319 -0
- data/Gemfile.on_prem +1 -0
- data/Gemfile.on_prem.lock +297 -0
- data/LICENSE +202 -0
- data/NOTICE +15 -0
- data/README.md +230 -0
- data/Rakefile +287 -0
- data/apisonator.gemspec +47 -0
- data/app/api/api.rb +13 -0
- data/app/api/internal/alert_limits.rb +32 -0
- data/app/api/internal/application_keys.rb +49 -0
- data/app/api/internal/application_referrer_filters.rb +43 -0
- data/app/api/internal/applications.rb +77 -0
- data/app/api/internal/errors.rb +54 -0
- data/app/api/internal/events.rb +42 -0
- data/app/api/internal/internal.rb +104 -0
- data/app/api/internal/metrics.rb +40 -0
- data/app/api/internal/service_tokens.rb +46 -0
- data/app/api/internal/services.rb +58 -0
- data/app/api/internal/stats.rb +42 -0
- data/app/api/internal/usagelimits.rb +62 -0
- data/app/api/internal/utilization.rb +23 -0
- data/bin/3scale_backend +223 -0
- data/bin/3scale_backend_worker +26 -0
- data/config.ru +4 -0
- data/config/puma.rb +192 -0
- data/config/schedule.rb +9 -0
- data/ext/mkrf_conf.rb +64 -0
- data/lib/3scale/backend.rb +67 -0
- data/lib/3scale/backend/alert_limit.rb +56 -0
- data/lib/3scale/backend/alerts.rb +137 -0
- data/lib/3scale/backend/analytics/kinesis.rb +3 -0
- data/lib/3scale/backend/analytics/kinesis/adapter.rb +180 -0
- data/lib/3scale/backend/analytics/kinesis/exporter.rb +86 -0
- data/lib/3scale/backend/analytics/kinesis/job.rb +135 -0
- data/lib/3scale/backend/analytics/redshift.rb +3 -0
- data/lib/3scale/backend/analytics/redshift/adapter.rb +367 -0
- data/lib/3scale/backend/analytics/redshift/importer.rb +83 -0
- data/lib/3scale/backend/analytics/redshift/job.rb +33 -0
- data/lib/3scale/backend/application.rb +330 -0
- data/lib/3scale/backend/application_events.rb +76 -0
- data/lib/3scale/backend/background_job.rb +65 -0
- data/lib/3scale/backend/configurable.rb +20 -0
- data/lib/3scale/backend/configuration.rb +151 -0
- data/lib/3scale/backend/configuration/loader.rb +42 -0
- data/lib/3scale/backend/constants.rb +19 -0
- data/lib/3scale/backend/cors.rb +84 -0
- data/lib/3scale/backend/distributed_lock.rb +67 -0
- data/lib/3scale/backend/environment.rb +21 -0
- data/lib/3scale/backend/error_storage.rb +52 -0
- data/lib/3scale/backend/errors.rb +343 -0
- data/lib/3scale/backend/event_storage.rb +120 -0
- data/lib/3scale/backend/experiment.rb +84 -0
- data/lib/3scale/backend/extensions.rb +5 -0
- data/lib/3scale/backend/extensions/array.rb +19 -0
- data/lib/3scale/backend/extensions/hash.rb +26 -0
- data/lib/3scale/backend/extensions/nil_class.rb +13 -0
- data/lib/3scale/backend/extensions/redis.rb +44 -0
- data/lib/3scale/backend/extensions/string.rb +13 -0
- data/lib/3scale/backend/extensions/time.rb +110 -0
- data/lib/3scale/backend/failed_jobs_scheduler.rb +141 -0
- data/lib/3scale/backend/job_fetcher.rb +122 -0
- data/lib/3scale/backend/listener.rb +728 -0
- data/lib/3scale/backend/listener_metrics.rb +99 -0
- data/lib/3scale/backend/logging.rb +48 -0
- data/lib/3scale/backend/logging/external.rb +44 -0
- data/lib/3scale/backend/logging/external/impl.rb +93 -0
- data/lib/3scale/backend/logging/external/impl/airbrake.rb +66 -0
- data/lib/3scale/backend/logging/external/impl/bugsnag.rb +69 -0
- data/lib/3scale/backend/logging/external/impl/default.rb +18 -0
- data/lib/3scale/backend/logging/external/resque.rb +57 -0
- data/lib/3scale/backend/logging/logger.rb +18 -0
- data/lib/3scale/backend/logging/middleware.rb +62 -0
- data/lib/3scale/backend/logging/middleware/json_writer.rb +21 -0
- data/lib/3scale/backend/logging/middleware/text_writer.rb +60 -0
- data/lib/3scale/backend/logging/middleware/writer.rb +143 -0
- data/lib/3scale/backend/logging/worker.rb +107 -0
- data/lib/3scale/backend/manifest.rb +80 -0
- data/lib/3scale/backend/memoizer.rb +277 -0
- data/lib/3scale/backend/metric.rb +275 -0
- data/lib/3scale/backend/metric/collection.rb +91 -0
- data/lib/3scale/backend/oauth.rb +4 -0
- data/lib/3scale/backend/oauth/token.rb +26 -0
- data/lib/3scale/backend/oauth/token_key.rb +30 -0
- data/lib/3scale/backend/oauth/token_storage.rb +313 -0
- data/lib/3scale/backend/oauth/token_value.rb +25 -0
- data/lib/3scale/backend/period.rb +3 -0
- data/lib/3scale/backend/period/boundary.rb +107 -0
- data/lib/3scale/backend/period/cache.rb +28 -0
- data/lib/3scale/backend/period/period.rb +402 -0
- data/lib/3scale/backend/queue_storage.rb +16 -0
- data/lib/3scale/backend/rack.rb +49 -0
- data/lib/3scale/backend/rack/exception_catcher.rb +136 -0
- data/lib/3scale/backend/rack/internal_error_catcher.rb +23 -0
- data/lib/3scale/backend/rack/prometheus.rb +19 -0
- data/lib/3scale/backend/saas.rb +6 -0
- data/lib/3scale/backend/saas_analytics.rb +4 -0
- data/lib/3scale/backend/server.rb +30 -0
- data/lib/3scale/backend/server/falcon.rb +52 -0
- data/lib/3scale/backend/server/puma.rb +71 -0
- data/lib/3scale/backend/service.rb +317 -0
- data/lib/3scale/backend/service_token.rb +97 -0
- data/lib/3scale/backend/stats.rb +8 -0
- data/lib/3scale/backend/stats/aggregator.rb +170 -0
- data/lib/3scale/backend/stats/aggregators/base.rb +72 -0
- data/lib/3scale/backend/stats/aggregators/response_code.rb +58 -0
- data/lib/3scale/backend/stats/aggregators/usage.rb +34 -0
- data/lib/3scale/backend/stats/bucket_reader.rb +135 -0
- data/lib/3scale/backend/stats/bucket_storage.rb +108 -0
- data/lib/3scale/backend/stats/cleaner.rb +195 -0
- data/lib/3scale/backend/stats/codes_commons.rb +14 -0
- data/lib/3scale/backend/stats/delete_job_def.rb +60 -0
- data/lib/3scale/backend/stats/key_generator.rb +73 -0
- data/lib/3scale/backend/stats/keys.rb +104 -0
- data/lib/3scale/backend/stats/partition_eraser_job.rb +58 -0
- data/lib/3scale/backend/stats/partition_generator_job.rb +46 -0
- data/lib/3scale/backend/stats/period_commons.rb +34 -0
- data/lib/3scale/backend/stats/stats_parser.rb +141 -0
- data/lib/3scale/backend/stats/storage.rb +113 -0
- data/lib/3scale/backend/statsd.rb +14 -0
- data/lib/3scale/backend/storable.rb +35 -0
- data/lib/3scale/backend/storage.rb +40 -0
- data/lib/3scale/backend/storage_async.rb +4 -0
- data/lib/3scale/backend/storage_async/async_redis.rb +21 -0
- data/lib/3scale/backend/storage_async/client.rb +205 -0
- data/lib/3scale/backend/storage_async/pipeline.rb +79 -0
- data/lib/3scale/backend/storage_async/resque_extensions.rb +30 -0
- data/lib/3scale/backend/storage_helpers.rb +278 -0
- data/lib/3scale/backend/storage_key_helpers.rb +9 -0
- data/lib/3scale/backend/storage_sync.rb +43 -0
- data/lib/3scale/backend/transaction.rb +62 -0
- data/lib/3scale/backend/transactor.rb +177 -0
- data/lib/3scale/backend/transactor/limit_headers.rb +54 -0
- data/lib/3scale/backend/transactor/notify_batcher.rb +139 -0
- data/lib/3scale/backend/transactor/notify_job.rb +47 -0
- data/lib/3scale/backend/transactor/process_job.rb +33 -0
- data/lib/3scale/backend/transactor/report_job.rb +84 -0
- data/lib/3scale/backend/transactor/status.rb +236 -0
- data/lib/3scale/backend/transactor/usage_report.rb +182 -0
- data/lib/3scale/backend/usage.rb +63 -0
- data/lib/3scale/backend/usage_limit.rb +115 -0
- data/lib/3scale/backend/use_cases/provider_key_change_use_case.rb +60 -0
- data/lib/3scale/backend/util.rb +17 -0
- data/lib/3scale/backend/validators.rb +26 -0
- data/lib/3scale/backend/validators/base.rb +36 -0
- data/lib/3scale/backend/validators/key.rb +17 -0
- data/lib/3scale/backend/validators/limits.rb +57 -0
- data/lib/3scale/backend/validators/oauth_key.rb +15 -0
- data/lib/3scale/backend/validators/oauth_setting.rb +15 -0
- data/lib/3scale/backend/validators/redirect_uri.rb +33 -0
- data/lib/3scale/backend/validators/referrer.rb +60 -0
- data/lib/3scale/backend/validators/service_state.rb +15 -0
- data/lib/3scale/backend/validators/state.rb +15 -0
- data/lib/3scale/backend/version.rb +5 -0
- data/lib/3scale/backend/views/oauth_access_tokens.builder +14 -0
- data/lib/3scale/backend/views/oauth_app_id_by_token.builder +4 -0
- data/lib/3scale/backend/worker.rb +87 -0
- data/lib/3scale/backend/worker_async.rb +88 -0
- data/lib/3scale/backend/worker_metrics.rb +44 -0
- data/lib/3scale/backend/worker_sync.rb +32 -0
- data/lib/3scale/bundler_shim.rb +17 -0
- data/lib/3scale/prometheus_server.rb +10 -0
- data/lib/3scale/tasks/connectivity.rake +41 -0
- data/lib/3scale/tasks/helpers.rb +3 -0
- data/lib/3scale/tasks/helpers/environment.rb +23 -0
- data/lib/3scale/tasks/stats.rake +131 -0
- data/lib/3scale/tasks/swagger.rake +46 -0
- data/licenses.xml +1215 -0
- metadata +227 -0
require '3scale/backend/logging'

module ThreeScale
  module Backend
    module Analytics
      module Kinesis
        # Batches backend stats events and ships them to a Kinesis Firehose
        # delivery stream, buffering leftovers in Redis between runs.
        #
        # Each Kinesis record is rounded to the nearest 5KB to calculate the
        # cost. Each of our events is a hash with a few keys: service,
        # metric, period, time, value, etc. This means that the size of one
        # of our events is nowhere near 5KB. For that reason, we need to make
        # sure that we send many events in each record.
        # The max size for each record is 1000KB. In each record batch, Kinesis
        # accepts a maximum of 4MB.
        #
        # We will try to optimize the batching process later. For now, I will
        # just put 1000 events in each record. And batches of 5 records max.
        #
        # When we receive a number of events not big enough to fill a record,
        # those events are marked as pending events.
        # Kinesis can return errors, when that happens, the events of the
        # records that failed are re-enqueued as pending events.
        # The list of pending events is stored in Redis, so we do not fail to
        # process any events in case of downtime or errors.
        class Adapter
          include Logging

          EVENTS_PER_RECORD = 1000
          private_constant :EVENTS_PER_RECORD

          MAX_RECORDS_PER_BATCH = 5
          private_constant :MAX_RECORDS_PER_BATCH

          EVENTS_PER_BATCH = EVENTS_PER_RECORD * MAX_RECORDS_PER_BATCH
          private_constant :EVENTS_PER_BATCH

          # Redis set holding the serialized events waiting to be sent.
          KINESIS_PENDING_EVENTS_KEY = 'send_to_kinesis:pending_events'.freeze
          private_constant :KINESIS_PENDING_EVENTS_KEY

          # We need to limit the number of pending events stored in Redis.
          # The Redis database can grow very quickly if a few consecutive jobs
          # fail. I am going to limit the number of pending events to 600k
          # (10 jobs approx.). If that limit is reached, we will disable the
          # creation of buckets in the system, but we will continue trying to
          # send the failed events. We will lose data, but that is better than
          # collapsing the whole Redis.
          # We will try to find a better alternative once we cannot afford to
          # miss events. Right now, we are just deleting the stats keys with
          # period = minute, so we can restore everything else.
          MAX_PENDING_EVENTS = 600_000
          private_constant :MAX_PENDING_EVENTS

          MAX_PENDING_EVENTS_REACHED_MSG =
            'Bucket creation has been disabled. Max pending events reached'.freeze
          private_constant :MAX_PENDING_EVENTS_REACHED_MSG

          # @param stream_name [String] Firehose delivery stream name
          # @param kinesis_client [#put_record_batch] AWS Firehose client
          # @param storage [#smembers, #sadd, #del, #scard, #pipelined] Redis client
          def initialize(stream_name, kinesis_client, storage)
            @stream_name = stream_name
            @kinesis_client = kinesis_client
            @storage = storage
          end

          # Merges the given events with the pending ones and sends them in
          # batches when at least one full record can be filled; otherwise
          # everything is kept as pending.
          def send_events(events)
            pending_events = stored_pending_events + events

            # Only disable indicating emergency if bucket storage is enabled.
            # We do not want to indicate emergency if it was disabled manually.
            if limit_pending_events_reached?(pending_events.size) && Stats::Storage.enabled?
              Stats::Storage.disable!(true)
              log_bucket_creation_disabled
            end

            # Batch events until we can fill at least one record
            if pending_events.size >= EVENTS_PER_RECORD
              failed_events = send_events_in_batches(pending_events)
              store_pending_events(failed_events)
            else
              store_pending_events(pending_events)
            end
          end

          # Sends the pending events to Kinesis, even if there are not enough of
          # them to fill 1 record.
          # Returns the number of events correctly sent to Kinesis
          def flush(limit = nil)
            pending_events = stored_pending_events
            events_to_flush = limit ? pending_events.take(limit) : pending_events
            failed_events = send_events_in_batches(events_to_flush)
            store_pending_events(pending_events - events_to_flush + failed_events)
            events_to_flush.size - failed_events.size
          end

          # Number of events currently buffered in Redis.
          def num_pending_events
            storage.scard(KINESIS_PENDING_EVENTS_KEY)
          end

          private

          attr_reader :stream_name, :kinesis_client, :storage

          # Deserializes the pending events stored in the Redis set.
          def stored_pending_events
            storage.smembers(KINESIS_PENDING_EVENTS_KEY).map do |pending_event|
              JSON.parse(pending_event, symbolize_names: true)
            end
          end

          def limit_pending_events_reached?(count)
            count > MAX_PENDING_EVENTS
          end

          def log_bucket_creation_disabled
            logger.info(MAX_PENDING_EVENTS_REACHED_MSG)
          end

          # Returns the failed events
          def send_events_in_batches(events)
            failed_events = []

            events.each_slice(EVENTS_PER_BATCH) do |events_slice|
              begin
                kinesis_resp = kinesis_client.put_record_batch(
                  { delivery_stream_name: stream_name,
                    records: events_to_kinesis_records(events_slice) })
                failed_events << failed_events_kinesis_resp(
                  kinesis_resp[:request_responses], events_slice)
              rescue Aws::Firehose::Errors::ServiceError
                # The whole batch failed; re-enqueue all of its events.
                failed_events << events_slice
              end
            end

            failed_events.flatten
          end

          def events_to_kinesis_records(events)
            # Record format expected by Kinesis:
            # [{ data: "data_event_group_1" }, { data: "data_event_group_2" }]
            events.each_slice(EVENTS_PER_RECORD).map do |events_slice|
              { data: events_to_pseudo_json(events_slice) }
            end
          end

          # We want to send to Kinesis events that can be read by Redshift.
          # Redshift expects events in JSON format without the '[]' and
          # without separating them with commas.
          # We put each event in a separated line, that will make their parsing
          # easier, but it is not needed by Redshift.
          def events_to_pseudo_json(events)
            events.map { |event| event.to_json }.join("\n") + "\n"
          end

          # Maps the per-record failure indexes reported by Firehose back to
          # the slice of original events that each failed record contained.
          def failed_events_kinesis_resp(request_responses, events)
            failed_records_indexes = failed_records_indexes(request_responses)
            failed_records_indexes.flat_map do |failed_record_index|
              events_index_start = failed_record_index * EVENTS_PER_RECORD
              events_index_end = events_index_start + EVENTS_PER_RECORD - 1
              events[events_index_start..events_index_end]
            end
          end

          # Indexes of the responses that carry a non-nil error_code.
          def failed_records_indexes(request_responses)
            result = []
            request_responses.each_with_index do |response, index|
              result << index unless response[:error_code].nil?
            end
            result
          end

          # Atomically replaces the pending-events set with the given events.
          def store_pending_events(events)
            storage.pipelined do
              storage.del(KINESIS_PENDING_EVENTS_KEY)
              events.each do |event|
                storage.sadd(KINESIS_PENDING_EVENTS_KEY, event.to_json)
              end
            end
          end
        end
      end
    end
  end
end
module ThreeScale
  module Backend
    module Analytics
      module Kinesis

        # The main responsibility of this class is to schedule Kinesis jobs.
        # We know that the distributed locking algorithm that we are using
        # guarantees that two jobs will not be running at the same time except
        # in some corner cases, like in the case of a failure of one of the Redis
        # masters. However, this is not a problem in our case. If two Kinesis
        # jobs run at the same time, they will probably export the same events to
        # Kinesis. However, they will not be imported twice into Redshift because
        # the import method that we use detects that two events are the same and
        # only imports one. This detection is done using the 'time_gen' field
        # that we attach to each event before they are send to Kinesis.
        class Exporter
          # Redis flag that toggles the whole export pipeline.
          SEND_TO_KINESIS_ENABLED_KEY = 'send_to_kinesis:enabled'.freeze
          private_constant :SEND_TO_KINESIS_ENABLED_KEY

          # TTL of the distributed lock, so a crashed job cannot block
          # scheduling forever.
          TTL_JOB_RUNNING_KEY_SEC = 360
          private_constant :TTL_JOB_RUNNING_KEY_SEC

          class << self
            def enable
              storage.set(SEND_TO_KINESIS_ENABLED_KEY, '1')
            end

            def disable
              storage.del(SEND_TO_KINESIS_ENABLED_KEY)
            end

            def enabled?
              storage.get(SEND_TO_KINESIS_ENABLED_KEY).to_i == 1
            end

            # Enqueues a Kinesis Job if exporting is enabled and the
            # distributed lock can be acquired.
            def schedule_job
              if enabled?
                lock_key = dist_lock.lock
                if lock_key
                  # Capture the clock once so the job's end-time argument and
                  # the enqueue-time argument refer to the same instant.
                  now = Time.now.utc
                  Resque.enqueue(Job, now, lock_key, now.to_f)
                end
              end
            end

            # Synchronously flushes pending events (up to limit, if given).
            # Returns the number of events flushed.
            def flush_pending_events(limit = nil)
              flushed_events = 0
              if enabled?
                lock_key = dist_lock.lock
                if lock_key
                  flushed_events = kinesis_adapter.flush(limit)
                  job_finished(lock_key) # flush is not asynchronous
                end
              end
              flushed_events
            end

            def num_pending_events
              kinesis_adapter.num_pending_events
            end

            # To be called by a kinesis job once it exits so other jobs can run
            def job_finished(lock_key)
              dist_lock.unlock if lock_key == dist_lock.current_lock_key
            end

            private

            def storage
              Backend::Storage.instance
            end

            def kinesis_adapter
              Stats::Storage.kinesis_adapter
            end

            def dist_lock
              @dist_lock ||= DistributedLock.new(self.name,
                                                 TTL_JOB_RUNNING_KEY_SEC,
                                                 storage)
            end
          end
        end
      end
    end
  end
end
require '3scale/backend/logging'

module ThreeScale
  module Backend
    module Analytics
      module Kinesis
        # This job works as follows:
        # 1) Reads the pending events from the buckets that have not been read.
        # 2) Parses and filters those events.
        # 3) Sends the events to the Kinesis adapter.
        # 4) Updates the latest bucket read, to avoid processing buckets more
        #    than once.
        # The events are sent in batches to Kinesis, but the component that does
        # that batching is the Kinesis adapter.
        #
        # Before sending the events to Kinesis, we attach a 'time_gen' attribute
        # to each of them. This is a timestamp that indicates approximately when
        # the event was generated based on the bucket where it was stored.
        # We need this attribute because we will have repeated event keys in
        # Redis and we will need to know which one contains the most updated
        # value.
        # Notice that we do not send all the events that are in the buckets to
        # Kinesis. This job reads several buckets each time it runs. Some events
        # can be repeated across those buckets. However, the job will only send
        # to Kinesis the latest value (the one in the most recent bucket). This
        # reduces the information that we need to parse, filter, and send.
        # We need the extra field 'time_gen', because we cannot safely assume any
        # order in S3 when sending events to Kinesis.
        class Job < BackgroundJob
          @queue = :stats

          # Periods whose events are never exported.
          FILTERED_EVENT_PERIODS = %w(week eternity).freeze
          private_constant :FILTERED_EVENT_PERIODS

          # We need to limit the amount of buckets that a job can process.
          # Otherwise, there is the possibility that the job would not finish
          # before its expiration time, and the next one would start processing
          # the same buckets.
          MAX_BUCKETS = 60
          private_constant :MAX_BUCKETS

          # Substrings matched against raw event keys to filter them out.
          FILTERED_EVENT_PERIODS_STR = FILTERED_EVENT_PERIODS.map do |period|
            "/#{period}".freeze
          end.freeze
          private_constant :FILTERED_EVENT_PERIODS_STR

          class << self
            include Logging

            # @param end_time_utc [String] upper bound for buckets to read
            # @param lock_key [String] distributed-lock key to release on exit
            # @return [Array(true, String)] success flag and log message
            def perform_logged(end_time_utc, lock_key, _enqueue_time)
              # end_time_utc will be a string when the worker processes this job.
              # The parameter is passed through Redis as a string. We need to
              # convert it back.
              events_sent = 0

              end_time = DateTime.parse(end_time_utc).to_time.utc
              pending_events = bucket_reader.pending_events_in_buckets(
                end_time_utc: end_time, max_buckets: MAX_BUCKETS)

              unless pending_events[:events].empty?
                events = prepare_events(pending_events[:latest_bucket],
                                        pending_events[:events])
                kinesis_adapter.send_events(events)
                bucket_reader.latest_bucket_read = pending_events[:latest_bucket]
                events_sent = events.size

                # We might use a different strategy to delete buckets in the
                # future, but for now, we are going to delete the buckets as they
                # are read
                bucket_storage.delete_range(pending_events[:latest_bucket])
              end

              Exporter.job_finished(lock_key)
              [true, msg_events_sent(events_sent)]
            end

            private

            # NOTE: filter_events mutates `events` in place before the lazy
            # parse; keep the call order as-is.
            def prepare_events(bucket, events)
              filter_events(events)
              parsed_events = parse_events(events.lazy)
              add_time_gen_to_events(parsed_events, bucket_to_timestamp(bucket)).force
            end

            # Parses the events and discards the invalid ones
            def parse_events(events)
              events.map do |k, v|
                begin
                  Stats::StatsParser.parse(k, v)
                rescue Stats::StatsParser::StatsKeyValueInvalid
                  logger.notify("Invalid stats key-value. k: #{k}. v: #{v}")
                  nil
                end
              end.reject(&:nil?)
            end

            # We do not want to send all the events to Kinesis.
            # This method filters them.
            def filter_events(events)
              events.reject! do |event|
                FILTERED_EVENT_PERIODS_STR.any? do |filtered_period|
                  event.include?(filtered_period)
                end
              end
            end

            def add_time_gen_to_events(events, time_gen)
              events.map { |event| event[:time_gen] = time_gen; event }
            end

            # Converts a bucket name (parseable timestamp) into the string
            # format stored in the 'time_gen' field.
            def bucket_to_timestamp(bucket)
              DateTime.parse(bucket).to_time.utc.strftime('%Y%m%d %H:%M:%S')
            end

            def msg_events_sent(n_events)
              "#{n_events} events have been sent to the Kinesis adapter"
            end

            def bucket_storage
              Stats::Storage.bucket_storage
            end

            def bucket_reader
              Stats::Storage.bucket_reader
            end

            def kinesis_adapter
              Stats::Storage.kinesis_adapter
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,367 @@
|
|
1
|
+
require 'pg'
|
2
|
+
|
3
|
+
module ThreeScale
|
4
|
+
module Backend
|
5
|
+
module Analytics
|
6
|
+
module Redshift
|
7
|
+
# This class imports the events stored by Kinesis in S3 into Redshift.
|
8
|
+
# It keeps track of the events that have been imported so it does not
|
9
|
+
# read twice the same S3 path.
|
10
|
+
#
|
11
|
+
# We store 'repeated' events in S3. This means that we can find several
|
12
|
+
# times the same {service, instance, uinstance, metric, period, timestamp}
|
13
|
+
# combination.
|
14
|
+
#
|
15
|
+
# In order to avoid storing repeated information in Redshift we need to
|
16
|
+
# perform UPSERTs. The algorithm followed is the one explained in the
|
17
|
+
# official Redshift documentation:
|
18
|
+
# http://docs.aws.amazon.com/redshift/latest/dg/t_updating-inserting-using-staging-tables-.html
|
19
|
+
# The process is as follows:
|
20
|
+
# 1) Create a temporary table with the data imported from S3, including
|
21
|
+
# duplicates.
|
22
|
+
# Two attributes can have nulls: cinstance and uinstance. We replace
|
23
|
+
# those nulls with ''. I have observed substantial performance gains
|
24
|
+
# because of this.
|
25
|
+
# 2) Perform the necessary operations in the temp table to remove
|
26
|
+
# duplicates. (In our case this basically consists of an inner-join).
|
27
|
+
# 3) Inside a transaction, delete all the events that are in the temp
|
28
|
+
# table from the final table. Next, insert the ones in the temp
|
29
|
+
# table into the final table. Finally, remove the temp table.
|
30
|
+
# 4) Last, we perform a vacuum, because Redshift does not automatically
|
31
|
+
# reclaim and reuse space that has been freed after deletes or
|
32
|
+
# updates. The vacuum operation also leaves the table sorted.
|
33
|
+
# More info:
|
34
|
+
# http://docs.aws.amazon.com/redshift/latest/dg/t_Reclaiming_storage_space202.html
|
35
|
+
# Right now, we are going to vacuum every time we insert new data,
|
36
|
+
# we will see if for performance reasons we need to do it less often.
|
37
|
+
class Adapter
|
38
|
+
|
39
|
+
module SQL
|
40
|
+
SCHEMA = 'backend'.freeze
|
41
|
+
|
42
|
+
# This importer relies on some tables or views that are created in
|
43
|
+
# Redshift to function correctly.
|
44
|
+
TABLES = { events: "#{SCHEMA}.events".freeze,
|
45
|
+
latest_s3_path_read: "#{SCHEMA}.latest_s3_path_read".freeze,
|
46
|
+
temp: "#{SCHEMA}.temp_events".freeze,
|
47
|
+
unique_imported_events: "#{SCHEMA}.unique_imported_events".freeze }.freeze
|
48
|
+
|
49
|
+
EVENT_ATTRS = %w(service cinstance uinstance metric period timestamp time_gen).freeze
|
50
|
+
JOIN_EVENT_ATTRS = (EVENT_ATTRS - ['time_gen']).freeze
|
51
|
+
|
52
|
+
EXISTING_TABLES =
|
53
|
+
'SELECT table_name '\
|
54
|
+
'FROM information_schema.tables '\
|
55
|
+
"WHERE table_schema = '#{SCHEMA}';".freeze
|
56
|
+
|
57
|
+
CREATE_TEMP_TABLES =
|
58
|
+
"DROP TABLE IF EXISTS #{TABLES[:temp]} CASCADE; "\
|
59
|
+
"CREATE TABLE #{TABLES[:temp]} (LIKE #{TABLES[:events]}); "\
|
60
|
+
"DROP TABLE IF EXISTS #{TABLES[:unique_imported_events]} CASCADE; "\
|
61
|
+
"CREATE TABLE #{TABLES[:unique_imported_events]} (LIKE #{TABLES[:events]}); "\
|
62
|
+
'COMMIT;'.freeze
|
63
|
+
|
64
|
+
CLEAN_TEMP_TABLES =
|
65
|
+
"DROP TABLE #{TABLES[:unique_imported_events]}; "\
|
66
|
+
"DROP TABLE #{TABLES[:temp]};".freeze
|
67
|
+
|
68
|
+
LATEST_TIMESTAMP_READ = "SELECT s3_path FROM #{TABLES[:latest_s3_path_read]}".freeze
|
69
|
+
|
70
|
+
VACUUM = "VACUUM FULL #{TABLES[:events]}".freeze
|
71
|
+
|
72
|
+
class << self
|
73
|
+
|
74
|
+
def insert_imported_events
|
75
|
+
'BEGIN TRANSACTION; '\
|
76
|
+
"DELETE FROM #{TABLES[:events]} "\
|
77
|
+
"USING #{TABLES[:unique_imported_events]} u "\
|
78
|
+
"WHERE #{TABLES[:events]}.timestamp >= "\
|
79
|
+
"(SELECT MIN(timestamp) FROM #{TABLES[:unique_imported_events]}) "\
|
80
|
+
"AND #{join_comparisons(TABLES[:events], 'u', JOIN_EVENT_ATTRS)} "\
|
81
|
+
"AND (#{TABLES[:events]}.time_gen < u.time_gen); "\
|
82
|
+
"INSERT INTO #{TABLES[:events]} "\
|
83
|
+
"SELECT * FROM #{TABLES[:unique_imported_events]};" \
|
84
|
+
'END TRANSACTION;'.freeze
|
85
|
+
end
|
86
|
+
|
87
|
+
# In order to get unique events, I use an inner-join with the same
|
88
|
+
# table. There might be several rows with the same {service, instance,
|
89
|
+
# uinstance, metric, period, timestamp} and different time_gen and
|
90
|
+
# value. From those rows, we want to get just the one with the highest
|
91
|
+
# time_gen. We cannot get the one with the highest value because we
|
92
|
+
# support SET operations. That means that a value of '0' can be more
|
93
|
+
# recent than '50'.
|
94
|
+
#
|
95
|
+
# The way to solve this is as follows: find out the max time_gen
|
96
|
+
# grouping the 'repeated' events, and then perform an inner-join to
|
97
|
+
# select the row with the most recent data.
|
98
|
+
#
|
99
|
+
# Note that we are only getting events with period != 'minute' and
|
100
|
+
# service = master. This is what is required for the dashboard project.
|
101
|
+
# We will need to change this when we start importing data to a
|
102
|
+
# Redshift cluster used as a source for the stats API.
|
103
|
+
def fill_table_unique_imported
|
104
|
+
"INSERT INTO #{TABLES[:unique_imported_events]} "\
|
105
|
+
'SELECT e.service, e.cinstance, e.uinstance, e.metric, e.period, '\
|
106
|
+
'e.timestamp, e.time_gen, e.value '\
|
107
|
+
'FROM '\
|
108
|
+
'(SELECT service, cinstance, uinstance, metric, period, '\
|
109
|
+
'MAX(time_gen) AS max_time_gen, timestamp '\
|
110
|
+
"FROM #{TABLES[:temp]} "\
|
111
|
+
"WHERE period != 'minute' AND service = '#{master_service}' "\
|
112
|
+
'GROUP BY service, cinstance, uinstance, metric, period, timestamp) AS e1 '\
|
113
|
+
"INNER JOIN #{TABLES[:temp]} e "\
|
114
|
+
"ON #{join_comparisons('e', 'e1', JOIN_EVENT_ATTRS)} "\
|
115
|
+
'AND e.time_gen = e1.max_time_gen ' \
|
116
|
+
'GROUP BY e.service, e.cinstance, e.uinstance, e.metric, e.period, '\
|
117
|
+
'e.timestamp, e.time_gen, e.value'.freeze
|
118
|
+
end
|
119
|
+
|
120
|
+
# Once we have imported some events and have made sure that we have
|
121
|
+
# selected only the ones that are more recent, we need to delete the
|
122
|
+
# ones that do not need to be imported. Those are the ones that have
|
123
|
+
# a time_gen older than that of the same event in the events table.
|
124
|
+
def delete_outdated_from_unique_imported
|
125
|
+
"DELETE FROM #{TABLES[:unique_imported_events]} "\
|
126
|
+
'USING (SELECT * '\
|
127
|
+
"FROM #{TABLES[:events]} e "\
|
128
|
+
'WHERE e.time_gen >= (SELECT MIN(time_gen) '\
|
129
|
+
"FROM #{TABLES[:unique_imported_events]})) AS e "\
|
130
|
+
"WHERE #{join_comparisons(
|
131
|
+
TABLES[:unique_imported_events], 'e', JOIN_EVENT_ATTRS)} "\
|
132
|
+
"AND (#{TABLES[:unique_imported_events]}.time_gen <= e.time_gen);".freeze
|
133
|
+
end
|
134
|
+
|
135
|
+
def import_s3_path(path, access_key_id, secret_access_key)
|
136
|
+
"COPY #{TABLES[:temp]} "\
|
137
|
+
"FROM '#{path}' "\
|
138
|
+
"CREDENTIALS '#{amazon_credentials(access_key_id,
|
139
|
+
secret_access_key)}' "\
|
140
|
+
"FORMAT AS JSON 'auto' "\
|
141
|
+
"TIMEFORMAT 'auto';"
|
142
|
+
end
|
143
|
+
|
144
|
+
def delete_nulls_from_imported
|
145
|
+
attrs_with_nulls = %w(cinstance uinstance)
|
146
|
+
attrs_with_nulls.map do |attr|
|
147
|
+
replace_nulls(TABLES[:temp], attr, '')
|
148
|
+
end.join(' ')
|
149
|
+
end
|
150
|
+
|
151
|
+
def store_timestamp_read(timestamp)
|
152
|
+
"DELETE FROM #{TABLES[:latest_s3_path_read]}; "\
|
153
|
+
"INSERT INTO #{TABLES[:latest_s3_path_read]} VALUES ('#{timestamp}');"
|
154
|
+
end
|
155
|
+
|
156
|
+
def duplicated_events
|
157
|
+
'SELECT COUNT(*) '\
|
158
|
+
'FROM (SELECT COUNT(*) AS count '\
|
159
|
+
"FROM #{TABLES[:events]} "\
|
160
|
+
"GROUP BY #{JOIN_EVENT_ATTRS.join(',')}) AS group_counts "\
|
161
|
+
'WHERE group_counts.count > 1;'
|
162
|
+
end
|
163
|
+
|
164
|
+
private
|
165
|
+
|
166
|
+
# Credentials string in the format expected by the Redshift COPY
# command's CREDENTIALS clause.
def amazon_credentials(access_key_id, secret_access_key)
  format('aws_access_key_id=%s;aws_secret_access_key=%s',
         access_key_id, secret_access_key)
end
|
170
|
+
|
171
|
+
# SQL that sets the given attribute to the given value on every row of
# the table where that attribute is NULL.
def replace_nulls(table, attr, value)
  format("UPDATE %s SET %s = '%s' WHERE %s IS NULL;",
         table, attr, value, attr)
end
|
176
|
+
|
177
|
+
# Given 2 tables and an array of attributes, generates a string like:
# table1.attr1 = table2.attr1 AND table1.attr2 = table2.attr2 AND ...
# (with a trailing space), useful for the WHERE clauses of JOINs.
def join_comparisons(table1, table2, attrs)
  comparisons = attrs.map { |attr| "#{table1}.#{attr} = #{table2}.#{attr}" }
  comparisons.join(' AND ') << ' '
end
|
186
|
+
|
187
|
+
# Id of the master service, as set in the backend configuration.
def master_service
  Backend.configuration.master_service_id
end
|
190
|
+
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
# This private class is responsible for calculating the S3 paths that
# have not been imported into Redshift yet.
class S3EventPaths

  # Events in the S3 bucket are grouped in directories, one per hour.
  DIR_CREATION_INTERVAL = 60 * 60
  private_constant :DIR_CREATION_INTERVAL

  # Safety margin: a path is only read once its hour has finished and
  # these extra seconds have elapsed, so no more events can land in it.
  # For example, '2016/02/25/00' is not read until
  # 2016-02-25 01:00 + DIR_BACKUP_TIME_S.
  DIR_BACKUP_TIME_S = 60 * 10
  private_constant :DIR_BACKUP_TIME_S

  class << self

    # Returns the UTC hours after latest_read whose S3 paths are safe
    # to read, in chronological order. latest_read is a timestamp
    # string parseable by DateTime.parse (e.g. 'YYYYMMDDHH').
    def pending_paths(latest_read)
      now_utc = Time.now.utc
      first_pending = DateTime.parse(latest_read).to_time.utc +
                      DIR_CREATION_INTERVAL

      (first_pending.to_i..now_utc.to_i)
        .step(DIR_CREATION_INTERVAL)
        .take_while { |secs| can_get_events?(now_utc, Time.at(secs)) }
        .map { |secs| Time.at(secs).utc }
    end

    private

    # An hour directory is readable once it is fully in the past plus
    # the backup margin.
    def can_get_events?(now, time)
      now - time > DIR_CREATION_INTERVAL + DIR_BACKUP_TIME_S
    end

  end

end
|
232
|
+
private_constant :S3EventPaths
|
233
|
+
|
234
|
+
S3_BUCKET = 'backend-events'.freeze
|
235
|
+
private_constant :S3_BUCKET
|
236
|
+
|
237
|
+
S3_EVENTS_BASE_PATH = "s3://#{S3_BUCKET}/".freeze
|
238
|
+
private_constant :S3_EVENTS_BASE_PATH
|
239
|
+
|
240
|
+
REQUIRED_TABLES = [SQL::TABLES[:events],
|
241
|
+
SQL::TABLES[:latest_s3_path_read]].freeze
|
242
|
+
private_constant :REQUIRED_TABLES
|
243
|
+
|
244
|
+
MissingRequiredTables = Class.new(ThreeScale::Backend::Error)
|
245
|
+
MissingLatestS3PathRead = Class.new(ThreeScale::Backend::Error)
|
246
|
+
|
247
|
+
class << self
|
248
|
+
|
249
|
+
# Imports every pending hourly S3 path into Redshift, advancing the
# 'latest path read' marker after each one so an interrupted run can
# resume safely. Returns the last hour imported, or nil if nothing was
# pending. Pass silent = true to suppress progress output on stdout.
def insert_pending_events(silent = false)
  check_redshift_tables

  pending = S3EventPaths.pending_paths(latest_timestamp_read)
  pending.each do |hour_utc|
    puts "Loading events generated in hour: #{hour_utc}" unless silent
    save_in_redshift(s3_path(hour_utc))
    save_latest_read(hour_utc)
  end
  pending.last
end
|
260
|
+
|
261
|
+
# Imports one specific S3 path into Redshift. Its main use case is
# uploading past events to a path and importing only that path.
# Raises MissingRequiredTables if the events table is absent.
def insert_path(path)
  # Only the 'events' table is required here; the 'latest_s3_path_read'
  # marker is not involved when importing an explicit path.
  events_table_missing =
    !existing_tables_with_schema.include?(SQL::TABLES[:events])
  raise MissingRequiredTables, 'Events table is missing' if events_table_missing

  save_in_redshift("#{S3_EVENTS_BASE_PATH}#{path}")
end
|
273
|
+
|
274
|
+
# Returns a timestamp with format 'YYYYMMDDHH', or nil when the DB
# contains no record of the latest timestamp read.
def latest_timestamp_read
  result = execute_command(SQL::LATEST_TIMESTAMP_READ)
  result.ntuples.zero? ? nil : result.first['s3_path']
end
|
281
|
+
|
282
|
+
# Returns whether the data in the DB is consistent. Right now this only
# checks that no event appears more than once, but it could be extended
# in the future with more checks.
def consistent_data?
  duplicates = execute_command(SQL.duplicated_events).first['count']
  duplicates.to_i.zero?
end
|
288
|
+
|
289
|
+
private
|
290
|
+
|
291
|
+
# Backend-wide configuration object.
def config
  Backend.configuration
end
|
294
|
+
|
295
|
+
# Redshift connection parameters as a Hash, passed to the pg
# connection constructor in redshift_connection.
def redshift_config
  config.redshift.to_h
end
|
298
|
+
|
299
|
+
# Lazily-created, memoized connection to Redshift.
# NOTE(review): PGconn is a deprecated alias removed in pg >= 1.0;
# consider PG::Connection if the pg gem is ever upgraded.
def redshift_connection
  @connection ||= PGconn.new(redshift_config)
end
|
302
|
+
|
303
|
+
# Runs the given SQL command on the Redshift connection and returns
# the result object produced by the pg driver.
def execute_command(command)
  redshift_connection.exec(command)
end
|
306
|
+
|
307
|
+
# Validates that Redshift is ready for an import: all required tables
# must exist and the 'latest read' marker must contain a value.
# Raises MissingRequiredTables or MissingLatestS3PathRead otherwise.
def check_redshift_tables
  unless required_tables_exist?
    raise MissingRequiredTables, 'Some of the required tables are not in Redshift.'
  end

  return if latest_timestamp_read_exists?
  raise MissingLatestS3PathRead,
        "The 'latest read' table does not contain any values"
end
|
317
|
+
|
318
|
+
# Names (without schema prefix) of the tables currently present in
# Redshift, according to the EXISTING_TABLES query.
def existing_tables
  execute_command(SQL::EXISTING_TABLES).map { |row| row['table_name'] }
end
|
321
|
+
|
322
|
+
# Same as existing_tables, but each table name is prefixed with the
# schema, matching the format used by REQUIRED_TABLES.
def existing_tables_with_schema
  existing_tables.map { |table| "#{SQL::SCHEMA}.#{table}" }
end
|
325
|
+
|
326
|
+
# True when every table in REQUIRED_TABLES is present in Redshift.
def required_tables_exist?
  present = existing_tables_with_schema
  REQUIRED_TABLES.all? { |table| present.include?(table) }
end
|
332
|
+
|
333
|
+
# Imports the events under the given S3 path and merges them into the
# events table. The steps run in a fixed order: COPY into temp tables,
# normalize NULLs, deduplicate, drop outdated rows, insert into the
# events table, then clean up the temp tables and vacuum.
def save_in_redshift(path)
  import_s3_path(path)

  merge_steps = [
    SQL.delete_nulls_from_imported,
    SQL.fill_table_unique_imported,
    SQL.delete_outdated_from_unique_imported,
    SQL.insert_imported_events,
    SQL::CLEAN_TEMP_TABLES,
    SQL::VACUUM
  ]
  merge_steps.each do |sql_command|
    execute_command(sql_command)
  end
end
|
342
|
+
|
343
|
+
# Persists the given UTC time as the latest S3 path read, using the
# 'YYYYMMDDHH' format that latest_timestamp_read returns.
def save_latest_read(time_utc)
  execute_command(SQL.store_timestamp_read(time_utc.strftime('%Y%m%d%H')))
end
|
346
|
+
|
347
|
+
# Creates the temporary import tables and COPYs the events stored under
# the given S3 path into them, using the AWS credentials from the
# configuration.
def import_s3_path(path)
  execute_command(SQL::CREATE_TEMP_TABLES)
  copy_command = SQL.import_s3_path(path,
                                    config.aws_access_key_id,
                                    config.aws_secret_access_key)
  execute_command(copy_command)
end
|
352
|
+
|
353
|
+
# Whether the 'latest S3 path read' marker table has at least one row.
def latest_timestamp_read_exists?
  !execute_command(SQL::LATEST_TIMESTAMP_READ).ntuples.zero?
end
|
356
|
+
|
357
|
+
# Full S3 path ('s3://<bucket>/YYYY/MM/DD/HH') for the given UTC hour.
def s3_path(time_utc)
  S3_EVENTS_BASE_PATH + time_utc.strftime('%Y/%m/%d/%H')
end
|
360
|
+
|
361
|
+
end
|
362
|
+
|
363
|
+
end
|
364
|
+
end
|
365
|
+
end
|
366
|
+
end
|
367
|
+
end
|