apisonator 2.100.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +317 -0
- data/Gemfile +11 -0
- data/Gemfile.base +65 -0
- data/Gemfile.lock +319 -0
- data/Gemfile.on_prem +1 -0
- data/Gemfile.on_prem.lock +297 -0
- data/LICENSE +202 -0
- data/NOTICE +15 -0
- data/README.md +230 -0
- data/Rakefile +287 -0
- data/apisonator.gemspec +47 -0
- data/app/api/api.rb +13 -0
- data/app/api/internal/alert_limits.rb +32 -0
- data/app/api/internal/application_keys.rb +49 -0
- data/app/api/internal/application_referrer_filters.rb +43 -0
- data/app/api/internal/applications.rb +77 -0
- data/app/api/internal/errors.rb +54 -0
- data/app/api/internal/events.rb +42 -0
- data/app/api/internal/internal.rb +104 -0
- data/app/api/internal/metrics.rb +40 -0
- data/app/api/internal/service_tokens.rb +46 -0
- data/app/api/internal/services.rb +58 -0
- data/app/api/internal/stats.rb +42 -0
- data/app/api/internal/usagelimits.rb +62 -0
- data/app/api/internal/utilization.rb +23 -0
- data/bin/3scale_backend +223 -0
- data/bin/3scale_backend_worker +26 -0
- data/config.ru +4 -0
- data/config/puma.rb +192 -0
- data/config/schedule.rb +9 -0
- data/ext/mkrf_conf.rb +64 -0
- data/lib/3scale/backend.rb +67 -0
- data/lib/3scale/backend/alert_limit.rb +56 -0
- data/lib/3scale/backend/alerts.rb +137 -0
- data/lib/3scale/backend/analytics/kinesis.rb +3 -0
- data/lib/3scale/backend/analytics/kinesis/adapter.rb +180 -0
- data/lib/3scale/backend/analytics/kinesis/exporter.rb +86 -0
- data/lib/3scale/backend/analytics/kinesis/job.rb +135 -0
- data/lib/3scale/backend/analytics/redshift.rb +3 -0
- data/lib/3scale/backend/analytics/redshift/adapter.rb +367 -0
- data/lib/3scale/backend/analytics/redshift/importer.rb +83 -0
- data/lib/3scale/backend/analytics/redshift/job.rb +33 -0
- data/lib/3scale/backend/application.rb +330 -0
- data/lib/3scale/backend/application_events.rb +76 -0
- data/lib/3scale/backend/background_job.rb +65 -0
- data/lib/3scale/backend/configurable.rb +20 -0
- data/lib/3scale/backend/configuration.rb +151 -0
- data/lib/3scale/backend/configuration/loader.rb +42 -0
- data/lib/3scale/backend/constants.rb +19 -0
- data/lib/3scale/backend/cors.rb +84 -0
- data/lib/3scale/backend/distributed_lock.rb +67 -0
- data/lib/3scale/backend/environment.rb +21 -0
- data/lib/3scale/backend/error_storage.rb +52 -0
- data/lib/3scale/backend/errors.rb +343 -0
- data/lib/3scale/backend/event_storage.rb +120 -0
- data/lib/3scale/backend/experiment.rb +84 -0
- data/lib/3scale/backend/extensions.rb +5 -0
- data/lib/3scale/backend/extensions/array.rb +19 -0
- data/lib/3scale/backend/extensions/hash.rb +26 -0
- data/lib/3scale/backend/extensions/nil_class.rb +13 -0
- data/lib/3scale/backend/extensions/redis.rb +44 -0
- data/lib/3scale/backend/extensions/string.rb +13 -0
- data/lib/3scale/backend/extensions/time.rb +110 -0
- data/lib/3scale/backend/failed_jobs_scheduler.rb +141 -0
- data/lib/3scale/backend/job_fetcher.rb +122 -0
- data/lib/3scale/backend/listener.rb +728 -0
- data/lib/3scale/backend/listener_metrics.rb +99 -0
- data/lib/3scale/backend/logging.rb +48 -0
- data/lib/3scale/backend/logging/external.rb +44 -0
- data/lib/3scale/backend/logging/external/impl.rb +93 -0
- data/lib/3scale/backend/logging/external/impl/airbrake.rb +66 -0
- data/lib/3scale/backend/logging/external/impl/bugsnag.rb +69 -0
- data/lib/3scale/backend/logging/external/impl/default.rb +18 -0
- data/lib/3scale/backend/logging/external/resque.rb +57 -0
- data/lib/3scale/backend/logging/logger.rb +18 -0
- data/lib/3scale/backend/logging/middleware.rb +62 -0
- data/lib/3scale/backend/logging/middleware/json_writer.rb +21 -0
- data/lib/3scale/backend/logging/middleware/text_writer.rb +60 -0
- data/lib/3scale/backend/logging/middleware/writer.rb +143 -0
- data/lib/3scale/backend/logging/worker.rb +107 -0
- data/lib/3scale/backend/manifest.rb +80 -0
- data/lib/3scale/backend/memoizer.rb +277 -0
- data/lib/3scale/backend/metric.rb +275 -0
- data/lib/3scale/backend/metric/collection.rb +91 -0
- data/lib/3scale/backend/oauth.rb +4 -0
- data/lib/3scale/backend/oauth/token.rb +26 -0
- data/lib/3scale/backend/oauth/token_key.rb +30 -0
- data/lib/3scale/backend/oauth/token_storage.rb +313 -0
- data/lib/3scale/backend/oauth/token_value.rb +25 -0
- data/lib/3scale/backend/period.rb +3 -0
- data/lib/3scale/backend/period/boundary.rb +107 -0
- data/lib/3scale/backend/period/cache.rb +28 -0
- data/lib/3scale/backend/period/period.rb +402 -0
- data/lib/3scale/backend/queue_storage.rb +16 -0
- data/lib/3scale/backend/rack.rb +49 -0
- data/lib/3scale/backend/rack/exception_catcher.rb +136 -0
- data/lib/3scale/backend/rack/internal_error_catcher.rb +23 -0
- data/lib/3scale/backend/rack/prometheus.rb +19 -0
- data/lib/3scale/backend/saas.rb +6 -0
- data/lib/3scale/backend/saas_analytics.rb +4 -0
- data/lib/3scale/backend/server.rb +30 -0
- data/lib/3scale/backend/server/falcon.rb +52 -0
- data/lib/3scale/backend/server/puma.rb +71 -0
- data/lib/3scale/backend/service.rb +317 -0
- data/lib/3scale/backend/service_token.rb +97 -0
- data/lib/3scale/backend/stats.rb +8 -0
- data/lib/3scale/backend/stats/aggregator.rb +170 -0
- data/lib/3scale/backend/stats/aggregators/base.rb +72 -0
- data/lib/3scale/backend/stats/aggregators/response_code.rb +58 -0
- data/lib/3scale/backend/stats/aggregators/usage.rb +34 -0
- data/lib/3scale/backend/stats/bucket_reader.rb +135 -0
- data/lib/3scale/backend/stats/bucket_storage.rb +108 -0
- data/lib/3scale/backend/stats/cleaner.rb +195 -0
- data/lib/3scale/backend/stats/codes_commons.rb +14 -0
- data/lib/3scale/backend/stats/delete_job_def.rb +60 -0
- data/lib/3scale/backend/stats/key_generator.rb +73 -0
- data/lib/3scale/backend/stats/keys.rb +104 -0
- data/lib/3scale/backend/stats/partition_eraser_job.rb +58 -0
- data/lib/3scale/backend/stats/partition_generator_job.rb +46 -0
- data/lib/3scale/backend/stats/period_commons.rb +34 -0
- data/lib/3scale/backend/stats/stats_parser.rb +141 -0
- data/lib/3scale/backend/stats/storage.rb +113 -0
- data/lib/3scale/backend/statsd.rb +14 -0
- data/lib/3scale/backend/storable.rb +35 -0
- data/lib/3scale/backend/storage.rb +40 -0
- data/lib/3scale/backend/storage_async.rb +4 -0
- data/lib/3scale/backend/storage_async/async_redis.rb +21 -0
- data/lib/3scale/backend/storage_async/client.rb +205 -0
- data/lib/3scale/backend/storage_async/pipeline.rb +79 -0
- data/lib/3scale/backend/storage_async/resque_extensions.rb +30 -0
- data/lib/3scale/backend/storage_helpers.rb +278 -0
- data/lib/3scale/backend/storage_key_helpers.rb +9 -0
- data/lib/3scale/backend/storage_sync.rb +43 -0
- data/lib/3scale/backend/transaction.rb +62 -0
- data/lib/3scale/backend/transactor.rb +177 -0
- data/lib/3scale/backend/transactor/limit_headers.rb +54 -0
- data/lib/3scale/backend/transactor/notify_batcher.rb +139 -0
- data/lib/3scale/backend/transactor/notify_job.rb +47 -0
- data/lib/3scale/backend/transactor/process_job.rb +33 -0
- data/lib/3scale/backend/transactor/report_job.rb +84 -0
- data/lib/3scale/backend/transactor/status.rb +236 -0
- data/lib/3scale/backend/transactor/usage_report.rb +182 -0
- data/lib/3scale/backend/usage.rb +63 -0
- data/lib/3scale/backend/usage_limit.rb +115 -0
- data/lib/3scale/backend/use_cases/provider_key_change_use_case.rb +60 -0
- data/lib/3scale/backend/util.rb +17 -0
- data/lib/3scale/backend/validators.rb +26 -0
- data/lib/3scale/backend/validators/base.rb +36 -0
- data/lib/3scale/backend/validators/key.rb +17 -0
- data/lib/3scale/backend/validators/limits.rb +57 -0
- data/lib/3scale/backend/validators/oauth_key.rb +15 -0
- data/lib/3scale/backend/validators/oauth_setting.rb +15 -0
- data/lib/3scale/backend/validators/redirect_uri.rb +33 -0
- data/lib/3scale/backend/validators/referrer.rb +60 -0
- data/lib/3scale/backend/validators/service_state.rb +15 -0
- data/lib/3scale/backend/validators/state.rb +15 -0
- data/lib/3scale/backend/version.rb +5 -0
- data/lib/3scale/backend/views/oauth_access_tokens.builder +14 -0
- data/lib/3scale/backend/views/oauth_app_id_by_token.builder +4 -0
- data/lib/3scale/backend/worker.rb +87 -0
- data/lib/3scale/backend/worker_async.rb +88 -0
- data/lib/3scale/backend/worker_metrics.rb +44 -0
- data/lib/3scale/backend/worker_sync.rb +32 -0
- data/lib/3scale/bundler_shim.rb +17 -0
- data/lib/3scale/prometheus_server.rb +10 -0
- data/lib/3scale/tasks/connectivity.rake +41 -0
- data/lib/3scale/tasks/helpers.rb +3 -0
- data/lib/3scale/tasks/helpers/environment.rb +23 -0
- data/lib/3scale/tasks/stats.rake +131 -0
- data/lib/3scale/tasks/swagger.rake +46 -0
- data/licenses.xml +1215 -0
- metadata +227 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
require 'json'
require '3scale/backend/logging'

module ThreeScale
  module Backend
    module Analytics
      module Kinesis
        # Buffers stats events and ships them to a Kinesis Firehose
        # delivery stream in batches.
        #
        # Each Kinesis record is rounded to the nearest 5KB to calculate
        # the cost. Each of our events is a hash with a few keys: service,
        # metric, period, time, value, etc. This means that the size of one
        # of our events is nowhere near 5KB. For that reason, we need to
        # make sure that we send many events in each record.
        # The max size for each record is 1000KB. In each record batch,
        # Kinesis accepts a maximum of 4MB.
        #
        # We will try to optimize the batching process later. For now, we
        # just put 1000 events in each record, and batches of 5 records max.
        #
        # When we receive a number of events not big enough to fill a
        # record, those events are marked as pending events.
        # Kinesis can return errors; when that happens, the events of the
        # records that failed are re-enqueued as pending events.
        # The list of pending events is stored in Redis, so we do not fail
        # to process any events in case of downtime or errors.
        class Adapter
          include Logging

          # Number of events packed into one Kinesis record.
          EVENTS_PER_RECORD = 1000
          private_constant :EVENTS_PER_RECORD

          MAX_RECORDS_PER_BATCH = 5
          private_constant :MAX_RECORDS_PER_BATCH

          EVENTS_PER_BATCH = EVENTS_PER_RECORD * MAX_RECORDS_PER_BATCH
          private_constant :EVENTS_PER_BATCH

          # Redis set that holds the JSON-serialized pending events.
          KINESIS_PENDING_EVENTS_KEY = 'send_to_kinesis:pending_events'.freeze
          private_constant :KINESIS_PENDING_EVENTS_KEY

          # We need to limit the number of pending events stored in Redis.
          # The Redis database can grow very quickly if a few consecutive
          # jobs fail. We limit the number of pending events to 600k
          # (10 jobs approx.). If that limit is reached, we will disable
          # the creation of buckets in the system, but we will continue
          # trying to send the failed events. We will lose data, but that
          # is better than collapsing the whole Redis.
          # We will try to find a better alternative once we cannot afford
          # to miss events. Right now, we are just deleting the stats keys
          # with period = minute, so we can restore everything else.
          MAX_PENDING_EVENTS = 600_000
          private_constant :MAX_PENDING_EVENTS

          MAX_PENDING_EVENTS_REACHED_MSG =
            'Bucket creation has been disabled. Max pending events reached'.freeze
          private_constant :MAX_PENDING_EVENTS_REACHED_MSG

          # @param stream_name [String] Firehose delivery stream name
          # @param kinesis_client [#put_record_batch] Firehose client
          # @param storage [#smembers, #sadd, #del, #scard, #pipelined] Redis client
          def initialize(stream_name, kinesis_client, storage)
            @stream_name = stream_name
            @kinesis_client = kinesis_client
            @storage = storage
          end

          # Sends the given events plus the stored pending ones. Events that
          # do not fill at least one record, or that fail to be sent, are
          # stored back as pending events.
          def send_events(events)
            pending_events = stored_pending_events + events

            # Only disable indicating emergency if bucket storage is enabled.
            # We do not want to indicate emergency if it was disabled manually.
            if limit_pending_events_reached?(pending_events.size) && Stats::Storage.enabled?
              Stats::Storage.disable!(true)
              log_bucket_creation_disabled
            end

            # Batch events until we can fill at least one record
            if pending_events.size >= EVENTS_PER_RECORD
              failed_events = send_events_in_batches(pending_events)
              store_pending_events(failed_events)
            else
              store_pending_events(pending_events)
            end
          end

          # Sends the pending events to Kinesis, even if there are not
          # enough of them to fill 1 record.
          # Returns the number of events correctly sent to Kinesis.
          def flush(limit = nil)
            pending_events = stored_pending_events
            events_to_flush = limit ? pending_events.take(limit) : pending_events
            failed_events = send_events_in_batches(events_to_flush)
            store_pending_events(pending_events - events_to_flush + failed_events)
            events_to_flush.size - failed_events.size
          end

          def num_pending_events
            storage.scard(KINESIS_PENDING_EVENTS_KEY)
          end

          private

          attr_reader :stream_name, :kinesis_client, :storage

          # Reads the pending events back from Redis, deserializing each
          # JSON member into a symbol-keyed hash.
          def stored_pending_events
            storage.smembers(KINESIS_PENDING_EVENTS_KEY).map do |pending_event|
              JSON.parse(pending_event, symbolize_names: true)
            end
          end

          def limit_pending_events_reached?(count)
            count > MAX_PENDING_EVENTS
          end

          def log_bucket_creation_disabled
            logger.info(MAX_PENDING_EVENTS_REACHED_MSG)
          end

          # Returns the failed events
          def send_events_in_batches(events)
            failed_events = []

            events.each_slice(EVENTS_PER_BATCH) do |events_slice|
              begin
                kinesis_resp = kinesis_client.put_record_batch(
                  { delivery_stream_name: stream_name,
                    records: events_to_kinesis_records(events_slice) })
                failed_events << failed_events_kinesis_resp(
                  kinesis_resp[:request_responses], events_slice)
              rescue Aws::Firehose::Errors::ServiceError
                # The whole batch failed: re-enqueue every event in it.
                failed_events << events_slice
              end
            end

            failed_events.flatten
          end

          def events_to_kinesis_records(events)
            # Record format expected by Kinesis:
            # [{ data: "data_event_group_1" }, { data: "data_event_group_2" }]
            events.each_slice(EVENTS_PER_RECORD).map do |events_slice|
              { data: events_to_pseudo_json(events_slice) }
            end
          end

          # We want to send to Kinesis events that can be read by Redshift.
          # Redshift expects events in JSON format without the '[]' and
          # without separating them with commas.
          # We put each event in a separated line, that will make their
          # parsing easier, but it is not needed by Redshift.
          def events_to_pseudo_json(events)
            events.map { |event| event.to_json }.join("\n") + "\n"
          end

          # Maps the per-record responses back to the events contained in
          # the records that failed.
          def failed_events_kinesis_resp(request_responses, events)
            failed_records_indexes = failed_records_indexes(request_responses)
            failed_records_indexes.flat_map do |failed_record_index|
              events_index_start = failed_record_index * EVENTS_PER_RECORD
              events_index_end = events_index_start + EVENTS_PER_RECORD - 1
              events[events_index_start..events_index_end]
            end
          end

          def failed_records_indexes(request_responses)
            result = []
            request_responses.each_with_index do |response, index|
              result << index unless response[:error_code].nil?
            end
            result
          end

          # Atomically replaces the pending-events set with the given events.
          def store_pending_events(events)
            storage.pipelined do
              storage.del(KINESIS_PENDING_EVENTS_KEY)
              events.each do |event|
                storage.sadd(KINESIS_PENDING_EVENTS_KEY, event.to_json)
              end
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
module ThreeScale
  module Backend
    module Analytics
      module Kinesis

        # The main responsibility of this class is to schedule Kinesis jobs.
        # We know that the distributed locking algorithm that we are using
        # guarantees that two jobs will not be running at the same time
        # except in some corner cases, like in the case of a failure of one
        # of the Redis masters. However, this is not a problem in our case.
        # If two Kinesis jobs run at the same time, they will probably
        # export the same events to Kinesis. However, they will not be
        # imported twice into Redshift because the import method that we
        # use detects that two events are the same and only imports one.
        # This detection is done using the 'time_gen' field that we attach
        # to each event before they are sent to Kinesis.
        class Exporter
          SEND_TO_KINESIS_ENABLED_KEY = 'send_to_kinesis:enabled'.freeze
          private_constant :SEND_TO_KINESIS_ENABLED_KEY

          # TTL of the distributed lock; a crashed job's lock expires after
          # this many seconds so new jobs can be scheduled again.
          TTL_JOB_RUNNING_KEY_SEC = 360
          private_constant :TTL_JOB_RUNNING_KEY_SEC

          class << self
            def enable
              storage.set(SEND_TO_KINESIS_ENABLED_KEY, '1')
            end

            def disable
              storage.del(SEND_TO_KINESIS_ENABLED_KEY)
            end

            def enabled?
              storage.get(SEND_TO_KINESIS_ENABLED_KEY).to_i == 1
            end

            # Enqueues a Kinesis export job if exporting is enabled and the
            # distributed lock can be acquired. The lock key is passed to
            # the job so it can release the lock when it finishes.
            def schedule_job
              if enabled?
                lock_key = dist_lock.lock
                if lock_key
                  # Read the clock once so the job's end time and the
                  # enqueue timestamp refer to the same instant.
                  now = Time.now.utc
                  Resque.enqueue(Job, now, lock_key, now.to_f)
                end
              end
            end

            # Flushes up to +limit+ pending events synchronously (all of
            # them when +limit+ is nil). Returns the number of events
            # flushed (0 when disabled or the lock is taken).
            def flush_pending_events(limit = nil)
              flushed_events = 0
              if enabled?
                lock_key = dist_lock.lock
                if lock_key
                  flushed_events = kinesis_adapter.flush(limit)
                  job_finished(lock_key) # flush is not asynchronous
                end
              end
              flushed_events
            end

            def num_pending_events
              kinesis_adapter.num_pending_events
            end

            # To be called by a kinesis job once it exits so other jobs can run
            def job_finished(lock_key)
              dist_lock.unlock if lock_key == dist_lock.current_lock_key
            end

            private

            def storage
              Backend::Storage.instance
            end

            def kinesis_adapter
              Stats::Storage.kinesis_adapter
            end

            def dist_lock
              @dist_lock ||= DistributedLock.new(self.name,
                                                 TTL_JOB_RUNNING_KEY_SEC,
                                                 storage)
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
require '3scale/backend/logging'

module ThreeScale
  module Backend
    module Analytics
      module Kinesis
        # This job works as follows:
        # 1) Reads the pending events from the buckets that have not been read.
        # 2) Parses and filters those events.
        # 3) Sends the events to the Kinesis adapter.
        # 4) Updates the latest bucket read, to avoid processing buckets
        #    more than once.
        # The events are sent in batches to Kinesis, but the component that
        # does that batching is the Kinesis adapter.
        #
        # Before sending the events to Kinesis, we attach a 'time_gen'
        # attribute to each of them. This is a timestamp that indicates
        # approximately when the event was generated based on the bucket
        # where it was stored. We need this attribute because we will have
        # repeated event keys in Redis and we will need to know which one
        # contains the most updated value.
        # Notice that we do not send all the events that are in the buckets
        # to Kinesis. This job reads several buckets each time it runs.
        # Some events can be repeated across those buckets. However, the
        # job will only send to Kinesis the latest value (the one in the
        # most recent bucket). This reduces the information that we need to
        # parse, filter, and send.
        # We need the extra field 'time_gen', because we cannot safely
        # assume any order in S3 when sending events to Kinesis.
        class Job < BackgroundJob
          @queue = :stats

          # Periods whose events are never exported to Kinesis.
          FILTERED_EVENT_PERIODS = %w(week eternity).freeze
          private_constant :FILTERED_EVENT_PERIODS

          # We need to limit the amount of buckets that a job can process.
          # Otherwise, there is the possibility that the job would not
          # finish before its expiration time, and the next one would start
          # processing the same buckets.
          MAX_BUCKETS = 60
          private_constant :MAX_BUCKETS

          # Substrings ("/week", "/eternity") used to match filtered
          # periods inside raw stats keys.
          FILTERED_EVENT_PERIODS_STR = FILTERED_EVENT_PERIODS.map do |period|
            "/#{period}".freeze
          end.freeze
          private_constant :FILTERED_EVENT_PERIODS_STR

          class << self
            include Logging

            # Entry point invoked by the background-job machinery.
            # @param end_time_utc [String] upper bound for buckets to read
            # @param lock_key [String] distributed-lock key to release on exit
            # @param _enqueue_time [Float] unused here; kept for job metrics
            def perform_logged(end_time_utc, lock_key, _enqueue_time)
              # end_time_utc will be a string when the worker processes this
              # job. The parameter is passed through Redis as a string. We
              # need to convert it back.
              events_sent = 0

              end_time = DateTime.parse(end_time_utc).to_time.utc
              pending_events = bucket_reader.pending_events_in_buckets(
                end_time_utc: end_time, max_buckets: MAX_BUCKETS)

              unless pending_events[:events].empty?
                events = prepare_events(pending_events[:latest_bucket],
                                        pending_events[:events])
                kinesis_adapter.send_events(events)
                bucket_reader.latest_bucket_read = pending_events[:latest_bucket]
                events_sent = events.size

                # We might use a different strategy to delete buckets in the
                # future, but for now, we are going to delete the buckets as
                # they are read
                bucket_storage.delete_range(pending_events[:latest_bucket])
              end

              Exporter.job_finished(lock_key)
              [true, msg_events_sent(events_sent)]
            end

            private

            # Filters, parses, and timestamps the raw events read from the
            # buckets, returning the array ready to hand to the adapter.
            def prepare_events(bucket, events)
              filter_events(events)
              parsed_events = parse_events(events.lazy)
              add_time_gen_to_events(parsed_events, bucket_to_timestamp(bucket)).force
            end

            # Parses the events and discards the invalid ones
            def parse_events(events)
              events.map do |k, v|
                begin
                  Stats::StatsParser.parse(k, v)
                rescue Stats::StatsParser::StatsKeyValueInvalid
                  logger.notify("Invalid stats key-value. k: #{k}. v: #{v}")
                  nil
                end
              end.reject(&:nil?)
            end

            # We do not want to send all the events to Kinesis.
            # This method filters them (mutates +events+ in place).
            def filter_events(events)
              events.reject! do |event|
                FILTERED_EVENT_PERIODS_STR.any? do |filtered_period|
                  event.include?(filtered_period)
                end
              end
            end

            def add_time_gen_to_events(events, time_gen)
              events.map { |event| event[:time_gen] = time_gen; event }
            end

            # Converts a bucket name into the 'YYYYMMDD HH:MM:SS' timestamp
            # attached to its events as 'time_gen'.
            def bucket_to_timestamp(bucket)
              DateTime.parse(bucket).to_time.utc.strftime('%Y%m%d %H:%M:%S')
            end

            def msg_events_sent(n_events)
              "#{n_events} events have been sent to the Kinesis adapter"
            end

            def bucket_storage
              Stats::Storage.bucket_storage
            end

            def bucket_reader
              Stats::Storage.bucket_reader
            end

            def kinesis_adapter
              Stats::Storage.kinesis_adapter
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
require 'pg'
|
|
2
|
+
|
|
3
|
+
module ThreeScale
|
|
4
|
+
module Backend
|
|
5
|
+
module Analytics
|
|
6
|
+
module Redshift
|
|
7
|
+
# This class imports the events stored by Kinesis in S3 into Redshift.
|
|
8
|
+
# It keeps track of the events that have been imported so it does not
|
|
9
|
+
# read twice the same S3 path.
|
|
10
|
+
#
|
|
11
|
+
# We store 'repeated' events in S3. This means that we can find several
|
|
12
|
+
# times the same {service, instance, uinstance, metric, period, timestamp}
|
|
13
|
+
# combination.
|
|
14
|
+
#
|
|
15
|
+
# In order to avoid storing repeated information in Redshift we need to
|
|
16
|
+
# perform UPSERTs. The algorithm followed is the one explained in the
|
|
17
|
+
# official Redshift documentation:
|
|
18
|
+
# http://docs.aws.amazon.com/redshift/latest/dg/t_updating-inserting-using-staging-tables-.html
|
|
19
|
+
# The process is as follows:
|
|
20
|
+
# 1) Create a temporary table with the data imported from S3, including
|
|
21
|
+
# duplicates.
|
|
22
|
+
# Two attributes can have nulls: cinstance and uinstance. We replace
|
|
23
|
+
# those nulls with ''. I have observed substantial performance gains
|
|
24
|
+
# because of this.
|
|
25
|
+
# 2) Perform the necessary operations in the temp table to remove
|
|
26
|
+
# duplicates. (In our case this basically consists of an inner-join).
|
|
27
|
+
# 3) Inside a transaction, delete all the events that are in the temp
|
|
28
|
+
# table from the final table. Next, insert the ones in the temp
|
|
29
|
+
# table into the final table. Finally, remove the temp table.
|
|
30
|
+
# 4) Last, we perform a vacuum, because Redshift does not automatically
|
|
31
|
+
# reclaim and reuse space that has been freed after deletes or
|
|
32
|
+
# updates. The vacuum operation also leaves the table sorted.
|
|
33
|
+
# More info:
|
|
34
|
+
# http://docs.aws.amazon.com/redshift/latest/dg/t_Reclaiming_storage_space202.html
|
|
35
|
+
# Right now, we are going to vacuum every time we insert new data,
|
|
36
|
+
# we will see if for performance reasons we need to do it less often.
|
|
37
|
+
class Adapter
|
|
38
|
+
|
|
39
|
+
module SQL
|
|
40
|
+
SCHEMA = 'backend'.freeze
|
|
41
|
+
|
|
42
|
+
# This importer relies on some tables or views that are created in
|
|
43
|
+
# Redshift to function correctly.
|
|
44
|
+
TABLES = { events: "#{SCHEMA}.events".freeze,
|
|
45
|
+
latest_s3_path_read: "#{SCHEMA}.latest_s3_path_read".freeze,
|
|
46
|
+
temp: "#{SCHEMA}.temp_events".freeze,
|
|
47
|
+
unique_imported_events: "#{SCHEMA}.unique_imported_events".freeze }.freeze
|
|
48
|
+
|
|
49
|
+
EVENT_ATTRS = %w(service cinstance uinstance metric period timestamp time_gen).freeze
|
|
50
|
+
JOIN_EVENT_ATTRS = (EVENT_ATTRS - ['time_gen']).freeze
|
|
51
|
+
|
|
52
|
+
EXISTING_TABLES =
|
|
53
|
+
'SELECT table_name '\
|
|
54
|
+
'FROM information_schema.tables '\
|
|
55
|
+
"WHERE table_schema = '#{SCHEMA}';".freeze
|
|
56
|
+
|
|
57
|
+
CREATE_TEMP_TABLES =
|
|
58
|
+
"DROP TABLE IF EXISTS #{TABLES[:temp]} CASCADE; "\
|
|
59
|
+
"CREATE TABLE #{TABLES[:temp]} (LIKE #{TABLES[:events]}); "\
|
|
60
|
+
"DROP TABLE IF EXISTS #{TABLES[:unique_imported_events]} CASCADE; "\
|
|
61
|
+
"CREATE TABLE #{TABLES[:unique_imported_events]} (LIKE #{TABLES[:events]}); "\
|
|
62
|
+
'COMMIT;'.freeze
|
|
63
|
+
|
|
64
|
+
CLEAN_TEMP_TABLES =
|
|
65
|
+
"DROP TABLE #{TABLES[:unique_imported_events]}; "\
|
|
66
|
+
"DROP TABLE #{TABLES[:temp]};".freeze
|
|
67
|
+
|
|
68
|
+
LATEST_TIMESTAMP_READ = "SELECT s3_path FROM #{TABLES[:latest_s3_path_read]}".freeze
|
|
69
|
+
|
|
70
|
+
VACUUM = "VACUUM FULL #{TABLES[:events]}".freeze
|
|
71
|
+
|
|
72
|
+
class << self
|
|
73
|
+
|
|
74
|
+
def insert_imported_events
|
|
75
|
+
'BEGIN TRANSACTION; '\
|
|
76
|
+
"DELETE FROM #{TABLES[:events]} "\
|
|
77
|
+
"USING #{TABLES[:unique_imported_events]} u "\
|
|
78
|
+
"WHERE #{TABLES[:events]}.timestamp >= "\
|
|
79
|
+
"(SELECT MIN(timestamp) FROM #{TABLES[:unique_imported_events]}) "\
|
|
80
|
+
"AND #{join_comparisons(TABLES[:events], 'u', JOIN_EVENT_ATTRS)} "\
|
|
81
|
+
"AND (#{TABLES[:events]}.time_gen < u.time_gen); "\
|
|
82
|
+
"INSERT INTO #{TABLES[:events]} "\
|
|
83
|
+
"SELECT * FROM #{TABLES[:unique_imported_events]};" \
|
|
84
|
+
'END TRANSACTION;'.freeze
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# In order to get unique events, I use an inner-join with the same
|
|
88
|
+
# table. There might be several rows with the same {service, instance,
|
|
89
|
+
# uinstance, metric, period, timestamp} and different time_gen and
|
|
90
|
+
# value. From those rows, we want to get just the one with the highest
|
|
91
|
+
# time_gen. We cannot get the one with the highest value because we
|
|
92
|
+
# support SET operations. That means that a value of '0' can be more
|
|
93
|
+
# recent than '50'.
|
|
94
|
+
#
|
|
95
|
+
# The way to solve this is as follows: find out the max time_gen
|
|
96
|
+
# grouping the 'repeated' events, and then perform an inner-join to
|
|
97
|
+
# select the row with the most recent data.
|
|
98
|
+
#
|
|
99
|
+
# Note that we are only getting events with period != 'minute' and
|
|
100
|
+
# service = master. This is what is required for the dashboard project.
|
|
101
|
+
# We will need to change this when we start importing data to a
|
|
102
|
+
# Redshift cluster used as a source for the stats API.
|
|
103
|
+
def fill_table_unique_imported
|
|
104
|
+
"INSERT INTO #{TABLES[:unique_imported_events]} "\
|
|
105
|
+
'SELECT e.service, e.cinstance, e.uinstance, e.metric, e.period, '\
|
|
106
|
+
'e.timestamp, e.time_gen, e.value '\
|
|
107
|
+
'FROM '\
|
|
108
|
+
'(SELECT service, cinstance, uinstance, metric, period, '\
|
|
109
|
+
'MAX(time_gen) AS max_time_gen, timestamp '\
|
|
110
|
+
"FROM #{TABLES[:temp]} "\
|
|
111
|
+
"WHERE period != 'minute' AND service = '#{master_service}' "\
|
|
112
|
+
'GROUP BY service, cinstance, uinstance, metric, period, timestamp) AS e1 '\
|
|
113
|
+
"INNER JOIN #{TABLES[:temp]} e "\
|
|
114
|
+
"ON #{join_comparisons('e', 'e1', JOIN_EVENT_ATTRS)} "\
|
|
115
|
+
'AND e.time_gen = e1.max_time_gen ' \
|
|
116
|
+
'GROUP BY e.service, e.cinstance, e.uinstance, e.metric, e.period, '\
|
|
117
|
+
'e.timestamp, e.time_gen, e.value'.freeze
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
# Once we have imported some events and have made sure that we have
# selected only the ones that are more recent, we need to delete the
# ones that do not need to be imported. Those are the ones that have
# a time_gen older than that of the same event in the events table.
#
# Fix: the statement is interpolated per call, so the '.freeze' the
# original carried only froze a brand-new string each time; it has
# been removed for consistency with the other query builders.
def delete_outdated_from_unique_imported
  "DELETE FROM #{TABLES[:unique_imported_events]} "\
  'USING (SELECT * '\
  "FROM #{TABLES[:events]} e "\
  'WHERE e.time_gen >= (SELECT MIN(time_gen) '\
  "FROM #{TABLES[:unique_imported_events]})) AS e "\
  "WHERE #{join_comparisons(
    TABLES[:unique_imported_events], 'e', JOIN_EVENT_ATTRS)} "\
  "AND (#{TABLES[:unique_imported_events]}.time_gen <= e.time_gen);"
end
|
|
134
|
+
|
|
135
|
+
# Builds the Redshift COPY command that loads the JSON events stored
# under the given S3 path into the temporary table.
def import_s3_path(path, access_key_id, secret_access_key)
  credentials = amazon_credentials(access_key_id, secret_access_key)
  "COPY #{TABLES[:temp]} "\
  "FROM '#{path}' "\
  "CREDENTIALS '#{credentials}' "\
  "FORMAT AS JSON 'auto' "\
  "TIMEFORMAT 'auto';"
end
|
|
143
|
+
|
|
144
|
+
# SQL that rewrites NULL cinstance/uinstance values in the temp table
# to empty strings, so that later joins compare those columns reliably.
def delete_nulls_from_imported
  %w(cinstance uinstance)
    .map { |attr| replace_nulls(TABLES[:temp], attr, '') }
    .join(' ')
end
|
|
150
|
+
|
|
151
|
+
# SQL that replaces the (single-row) contents of the 'latest S3 path
# read' table with the given timestamp.
def store_timestamp_read(timestamp)
  clear = "DELETE FROM #{TABLES[:latest_s3_path_read]}; "
  store = "INSERT INTO #{TABLES[:latest_s3_path_read]} VALUES ('#{timestamp}');"
  clear + store
end
|
|
155
|
+
|
|
156
|
+
# SQL that counts how many event groups appear more than once in the
# events table. The query returns 0 when there are no duplicates.
def duplicated_events
  group_attrs = JOIN_EVENT_ATTRS.join(',')
  'SELECT COUNT(*) '\
  'FROM (SELECT COUNT(*) AS count '\
  "FROM #{TABLES[:events]} "\
  "GROUP BY #{group_attrs}) AS group_counts "\
  'WHERE group_counts.count > 1;'
end
|
|
163
|
+
|
|
164
|
+
private

# Formats the AWS credentials in the syntax expected by Redshift's
# COPY ... CREDENTIALS clause.
def amazon_credentials(access_key_id, secret_access_key)
  format('aws_access_key_id=%s;aws_secret_access_key=%s',
         access_key_id, secret_access_key)
end
|
|
170
|
+
|
|
171
|
+
# SQL that rewrites NULLs in the given column of the given table to
# the given value.
def replace_nulls(table, attr, value)
  ["UPDATE #{table}",
   "SET #{attr} = '#{value}'",
   "WHERE #{attr} IS NULL;"].join(' ')
end
|
|
176
|
+
|
|
177
|
+
# Given 2 tables and an array of attributes, generates a string
# like this:
# table1.attr1 = table2.attr1 AND table1.attr2 = table2.attr2 AND ...
# (with a trailing space), which is helpful to build the WHERE clauses
# of certain JOINs.
def join_comparisons(table1, table2, attrs)
  comparisons = attrs.map { |attr| "#{table1}.#{attr} = #{table2}.#{attr}" }
  "#{comparisons.join(' AND ')} "
end
|
|
186
|
+
|
|
187
|
+
# ID of the master service, taken from the backend configuration.
# Only events of this service are selected by
# fill_table_unique_imported.
def master_service
  Backend.configuration.master_service_id
end
|
|
190
|
+
|
|
191
|
+
end
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
# This private class is the responsible for calculating the S3 paths
# that we have not imported to Redshift yet.
class S3EventPaths

  # The events in our S3 bucket are classified in paths.
  # Paths are created every hour.
  DIR_CREATION_INTERVAL = 60*60
  private_constant :DIR_CREATION_INTERVAL

  # When we read a path we want to be sure that no more events will be
  # stored. For that reason, we will wait a few minutes after the hour
  # ends just to be safe. For example, we will not read the path
  # '2016/02/25/00' until 2016-02-25 01:00 + DIR_BACKUP_TIME_S.
  DIR_BACKUP_TIME_S = 60*10
  private_constant :DIR_BACKUP_TIME_S

  class << self

    # Returns the UTC times (one per hourly path) generated after
    # latest_read that are already safe to import, in ascending order.
    def pending_paths(latest_read)
      now_utc = Time.now.utc
      first_pending = DateTime.parse(latest_read).to_time.utc + DIR_CREATION_INTERVAL

      # Once a step is too recent to be safe, all later ones are too,
      # so take_while matches the original early-break behavior.
      (first_pending.to_i..now_utc.to_i)
        .step(DIR_CREATION_INTERVAL)
        .take_while { |secs| can_get_events?(now_utc, Time.at(secs)) }
        .map { |secs| Time.at(secs).utc }
    end

    private

    def can_get_events?(now, time)
      now - time > DIR_CREATION_INTERVAL + DIR_BACKUP_TIME_S
    end

  end

end
|
|
232
|
+
# S3EventPaths is an implementation detail of this class.
private_constant :S3EventPaths

# Bucket where the backend events are stored.
S3_BUCKET = 'backend-events'.freeze
private_constant :S3_BUCKET

S3_EVENTS_BASE_PATH = "s3://#{S3_BUCKET}/".freeze
private_constant :S3_EVENTS_BASE_PATH

# Tables that must exist in Redshift before a regular import can run.
REQUIRED_TABLES = [SQL::TABLES[:events],
                   SQL::TABLES[:latest_s3_path_read]].freeze
private_constant :REQUIRED_TABLES

# Raised when some of the REQUIRED_TABLES are missing in Redshift.
MissingRequiredTables = Class.new(ThreeScale::Backend::Error)
# Raised when the 'latest S3 path read' table contains no rows.
MissingLatestS3PathRead = Class.new(ThreeScale::Backend::Error)
|
|
246
|
+
|
|
247
|
+
class << self
|
|
248
|
+
|
|
249
|
+
# Imports every pending S3 path into Redshift, persisting the 'latest
# path read' after each one so an interrupted run resumes correctly.
# Returns the time (UTC) of the last path imported, or nil when there
# was nothing pending. Pass silent = true to suppress progress output.
def insert_pending_events(silent = false)
  check_redshift_tables

  pending = S3EventPaths.pending_paths(latest_timestamp_read)

  pending.each do |pending_time_utc|
    puts "Loading events generated in hour: #{pending_time_utc}" unless silent
    save_in_redshift(s3_path(pending_time_utc))
    save_latest_read(pending_time_utc)
  end

  pending.last
end
|
|
260
|
+
|
|
261
|
+
# Imports a specific S3 path into Redshift.
# Right now, its main use case consists of uploading past events to
# a path and importing only that path.
# Raises MissingRequiredTables if the events table does not exist.
def insert_path(path)
  # Only the 'events' table matters here; 'latest_s3_path_read' is
  # not updated by this operation.
  events_table = SQL::TABLES[:events]
  unless existing_tables_with_schema.include?(events_table)
    raise MissingRequiredTables, 'Events table is missing'
  end

  save_in_redshift("#{S3_EVENTS_BASE_PATH}#{path}")
end
|
|
273
|
+
|
|
274
|
+
# Returns a timestamp with format 'YYYYMMDDHH' or nil if the latest
# timestamp read does not exist in the DB.
def latest_timestamp_read
  result = execute_command(SQL::LATEST_TIMESTAMP_READ)
  result.ntuples.zero? ? nil : result.first['s3_path']
end
|
|
281
|
+
|
|
282
|
+
# Returns whether the data in the DB is consistent. Right now, this
# method only checks if there are duplicated events, but it could be
# extended in the future.
def consistent_data?
  duplicates = execute_command(SQL::duplicated_events).first['count'].to_i
  duplicates.zero?
end
|
|
288
|
+
|
|
289
|
+
private

# Backend configuration object; source of the Redshift and AWS
# settings used by the methods below.
def config
  Backend.configuration
end
|
|
294
|
+
|
|
295
|
+
# Redshift connection parameters as a Hash, in the form expected by
# PGconn.new.
def redshift_config
  config.redshift.to_h
end
|
|
298
|
+
|
|
299
|
+
# Memoized connection to Redshift (established lazily on first use).
# NOTE(review): PGconn is a legacy alias of PG::Connection in recent
# versions of the 'pg' gem — confirm the gem version before renaming.
def redshift_connection
  @connection ||= PGconn.new(redshift_config)
end
|
|
302
|
+
|
|
303
|
+
# Runs the given SQL command against Redshift and returns the result
# object produced by the connection's exec.
def execute_command(command)
  redshift_connection.exec(command)
end
|
|
306
|
+
|
|
307
|
+
# Verifies that a regular import can run: all required tables exist
# and the 'latest path read' table has a value.
# Raises MissingRequiredTables or MissingLatestS3PathRead otherwise.
def check_redshift_tables
  unless required_tables_exist?
    raise MissingRequiredTables, 'Some of the required tables are not in Redshift.'
  end

  return if latest_timestamp_read_exists?

  raise MissingLatestS3PathRead,
        "The 'latest read' table does not contain any values"
end
|
|
317
|
+
|
|
318
|
+
# Names (without schema prefix) of the tables currently present in
# Redshift, as reported by the EXISTING_TABLES query.
def existing_tables
  execute_command(SQL::EXISTING_TABLES).map { |row| row['table_name'] }
end
|
|
321
|
+
|
|
322
|
+
# Existing table names qualified with the schema: 'events' becomes
# '<schema>.events', matching the format used in SQL::TABLES.
def existing_tables_with_schema
  existing_tables.map { |table| [SQL::SCHEMA, table].join('.') }
end
|
|
325
|
+
|
|
326
|
+
# True when every table in REQUIRED_TABLES is present in Redshift.
def required_tables_exist?
  (REQUIRED_TABLES - existing_tables_with_schema).empty?
end
|
|
332
|
+
|
|
333
|
+
# Loads the events under the given S3 path into the temp tables, then
# runs the post-import pipeline: normalize NULLs, keep only the most
# recent version of each event, drop outdated rows, merge into the
# events table, clean up and vacuum. Order matters.
def save_in_redshift(path)
  import_s3_path(path)

  post_import = [SQL.delete_nulls_from_imported,
                 SQL.fill_table_unique_imported,
                 SQL.delete_outdated_from_unique_imported,
                 SQL.insert_imported_events,
                 SQL::CLEAN_TEMP_TABLES,
                 SQL::VACUUM]

  post_import.each { |command| execute_command(command) }
end
|
|
342
|
+
|
|
343
|
+
# Persists the given time (UTC) as the latest S3 path imported, using
# the 'YYYYMMDDHH' format expected by latest_timestamp_read.
def save_latest_read(time_utc)
  execute_command(SQL.store_timestamp_read(time_utc.strftime('%Y%m%d%H')))
end
|
|
346
|
+
|
|
347
|
+
# Creates the temp tables (if needed) and runs the COPY command that
# imports the events stored under the given S3 path into them.
def import_s3_path(path)
  execute_command(SQL::CREATE_TEMP_TABLES)
  execute_command(SQL.import_s3_path(
    path, config.aws_access_key_id, config.aws_secret_access_key))
end
|
|
352
|
+
|
|
353
|
+
# True when the 'latest path read' table contains at least one row.
def latest_timestamp_read_exists?
  rows = execute_command(SQL::LATEST_TIMESTAMP_READ).ntuples
  rows > 0
end
|
|
356
|
+
|
|
357
|
+
# Full S3 path ('s3://<bucket>/YYYY/MM/DD/HH') for the given hour.
def s3_path(time_utc)
  S3_EVENTS_BASE_PATH + time_utc.strftime('%Y/%m/%d/%H')
end
|
|
360
|
+
|
|
361
|
+
end
|
|
362
|
+
|
|
363
|
+
end
|
|
364
|
+
end
|
|
365
|
+
end
|
|
366
|
+
end
|
|
367
|
+
end
|