xcflushd 1.0.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
+ require 'uri'
+ require 'xcflushd/runner'
+
+ module Xcflushd
+   module GLIHelpers
+     POSITIVE_N_RE = /\A[1-9]\d*\z/.freeze
+
+     class PositiveMinMaxInt
+       # this allows 0 or more as MIN, 1 or more as MAX
+       POSITIVE_MIN_MAX_RE = /\A(?<min>\d+):(?<max>[1-9]\d*)\z/.freeze
+       private_constant :POSITIVE_MIN_MAX_RE
+
+       def self.match(str)
+         md = POSITIVE_MIN_MAX_RE.match str
+         return false if md.nil?
+         min, max = [md[:min].to_i, md[:max].to_i]
+         return false if max < min
+         new min, max
+       end
+
+       attr_reader :min, :max
+
+       def initialize(min, max)
+         @min, @max = min, max
+       end
+
+       def to_a
+         [self]
+       end
+     end
+
+     # URI parsing for GLI
+     class GenericURI
+       # https://tools.ietf.org/html/rfc3986#appendix-A
+       SCHEME_RE = /[[:alpha:]][[[:alpha:]][[:digit:]]\+-\.]*:\/\//
+       private_constant :SCHEME_RE
+
+       def self.new(s, default_port = nil)
+         # URI.parse won't correctly parse a URI without a scheme
+         unless SCHEME_RE.match s
+           s = "generic://#{s}"
+         end
+         uri = URI.parse(s)
+         # exit with an error if no host parsed
+         return false unless uri.host
+         if !uri.port && default_port
+           uri.port = default_port
+         end
+         uri.define_singleton_method :to_a do
+           [self]
+         end
+         uri
+       end
+     end
+
+     class RedisURI
+       DEFAULT_PORT = 6379
+       private_constant :DEFAULT_PORT
+
+       def self.match(s)
+         GenericURI.new(s, DEFAULT_PORT)
+       end
+     end
+
+     class BackendURI
+       def self.match(s)
+         GenericURI.new(s)
+       end
+     end
+
+     def start_xcflusher(options)
+       Xcflushd::Runner.run(Hash[options.map { |k, v| [k.to_s.tr('-', '_').to_sym, v] }])
+     end
+
+     def set_title(title)
+       if Process.respond_to? :setproctitle
+         Process.setproctitle title
+       else
+         $0 = title
+       end
+     end
+   end
+ end
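
Note: the matchers in GLIHelpers above all follow the same convention: match returns a parsed object on success and false when the input does not qualify. The following standalone sketch is not part of the gem; it only exercises the classes above, and the require path is assumed from the module name:

    # Illustrative only; assumes the file above is loadable as 'xcflushd/gli_helpers'.
    require 'xcflushd/gli_helpers'

    helpers = Xcflushd::GLIHelpers

    range = helpers::PositiveMinMaxInt.match('2:10')
    [range.min, range.max]                     # => [2, 10]
    helpers::PositiveMinMaxInt.match('10:2')   # => false (max < min is rejected)

    redis = helpers::RedisURI.match('localhost')
    [redis.host, redis.port]                   # => ["localhost", 6379] (default port applied)
    backend = helpers::BackendURI.match('su1.3scale.net:443')
    [backend.host, backend.port]               # => ["su1.3scale.net", 443]
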
@@ -0,0 +1,9 @@
+ require 'logger'
+
+ module Xcflushd
+   class Logger
+     def self.new(*args)
+       ::Logger.new(*args)
+     end
+   end
+ end
@@ -0,0 +1,253 @@
+ require 'xcflushd/threading'
+
+ module Xcflushd
+   # Apart from flushing all the cached reports and renewing the authorizations
+   # periodically, we need to provide a mechanism to renew a specific auth at
+   # any time. The information needed is the combination of service, application
+   # credentials and metric.
+   #
+   # When the client looks for the auth of a combination in the cache, it might
+   # not be there. It could be an authorization that has never been cached or one
+   # that has expired. In that case, we need to provide a way to check a
+   # specific authorization without waiting for the next flush cycle.
+   #
+   # We use Redis publish/subscribe to solve this problem. We use two different
+   # types of channels:
+   #   1) Auth requests channel. It's the channel where the client specifies the
+   #      combinations that need to be checked. xcflushd is subscribed to the
+   #      channel. There is only one channel of this type.
+   #   2) Responses channel. Every time there's a request for a specific
+   #      combination, a channel of this type is created. The client is
+   #      subscribed to this channel, and xcflushd will publish the authorization
+   #      status once it gets it from 3scale.
+   class PriorityAuthRenewer
+
+     # Number of times that a response is published
+     TIMES_TO_PUBLISH = 5
+     private_constant :TIMES_TO_PUBLISH
+
+     # We need two separate Redis clients: one for subscribing to a channel and
+     # the other one to publish to different channels. It is specified in the
+     # Redis website: http://redis.io/topics/pubsub
+     def initialize(authorizer, storage, redis_pub, redis_sub,
+                    auth_ttl, logger, threads)
+       @authorizer = authorizer
+       @storage = storage
+       @redis_pub = redis_pub
+       @redis_sub = redis_sub
+       @auth_ttl = auth_ttl
+       @logger = logger
+
+       # We can receive several requests to renew the authorization of a
+       # combination while we are already renewing it. We want to avoid
+       # performing several calls to 3scale asking for the same thing. For that
+       # reason, we use a map to keep track of the combinations that we are
+       # renewing.
+       # This map is updated from different threads. We use Concurrent::Map to
+       # ensure thread-safety.
+       @current_auths = Concurrent::Map.new
+
+       min_threads, max_threads = if threads
+                                    [threads.min, threads.max]
+                                  else
+                                    Threading.default_threads_value
+                                  end
+
+       @thread_pool = Concurrent::ThreadPoolExecutor.new(
+         min_threads: min_threads,
+         max_threads: max_threads)
+     end
+
+     def shutdown
+       @thread_pool.shutdown
+     end
+
+     def wait_for_termination(secs = nil)
+       @thread_pool.wait_for_termination(secs)
+     end
+
+     def terminate
+       @thread_pool.kill
+     end
+
+     def start
+       begin
+         subscribe_to_requests_channel
+       rescue StandardError => e
+         logger.error("PriorityAuthRenewer can't subscribe to the requests "\
+                      "channel - #{e.class} #{e.message} #{e.cause}")
+         raise e
+       end
+     end
+
+     private
+
+     attr_reader :authorizer, :storage, :redis_pub, :redis_sub, :auth_ttl,
+                 :logger, :current_auths, :thread_pool
+
+     def subscribe_to_requests_channel
+       redis_sub.subscribe(StorageKeys::AUTH_REQUESTS_CHANNEL) do |on|
+         on.subscribe do |channel, _subscriptions|
+           logger.info("PriorityAuthRenewer correctly subscribed to #{channel}")
+         end
+
+         on.message do |_channel, msg|
+           begin
+             # The renew and publish operations need to be done asynchronously.
+             # Renewing the authorizations involves getting them from 3scale,
+             # making network requests, and also updating Redis. We cannot block
+             # until we get all that done. That is why we need to treat the
+             # messages received in the channel concurrently.
+             unless currently_authorizing?(msg)
+               async_renew_and_publish_task(msg).execute
+             end
+           rescue Concurrent::RejectedExecutionError => e
+             # This error is raised when we try to submit a task to the thread
+             # pool and it is rejected.
+             # After we call shutdown() on the thread pool, this error will be
+             # raised. We do not want to log errors in this case.
+             unless thread_pool.shuttingdown?
+               logger.error('Error while treating a message received in the '\
+                            "requests channel: #{e.message}")
+             end
+           rescue StandardError => e
+             # If we do not rescue from an exception raised while treating a
+             # message, the redis client instance used stops receiving messages.
+             # We need to make sure that we'll rescue in all cases.
+             # Keep in mind that this will not rescue from exceptions raised in
+             # async tasks because they are executed in different threads.
+             logger.error('Error while treating a message received in the '\
+                          "requests channel: #{e.message}")
+           end
+         end
+       end
+     end
+
+     # Apart from renewing the auth of the combination received, we also renew
+     # all the metrics of the associated application. The reason is that to renew
+     # a single metric we need to perform one call to 3scale, and to renew all
+     # the limited metrics of an application we also need one. If the metric
+     # received does not have limits defined, we need to perform two calls, but
+     # it is still worth renewing all of them for that price.
+     #
+     # Note: Some exceptions can be raised inside the futures that are executed
+     # by the thread pool. For example, when 3scale is not accessible, when
+     # renewing the cached authorizations fails, or when publishing to the
+     # response channels fails. Trying to recover from all those cases does not
+     # seem to be worth it. The request that published the message will wait for
+     # a response that will not arrive and eventually it will time out. However,
+     # if the request retries, it is likely to succeed, as the kinds of errors
+     # listed above are (hopefully) temporary.
+     def async_renew_and_publish_task(channel_msg)
+       Concurrent::Future.new(executor: thread_pool) do
+         success = true
+         begin
+           combination = auth_channel_msg_2_combination(channel_msg)
+           app_auths = app_authorizations(combination)
+           renew(combination[:service_id], combination[:credentials], app_auths)
+           metric_auth = app_auths[combination[:metric]]
+         rescue StandardError
+           # If we did not rescue here, we would not be able to process the same
+           # message again.
+           success = false
+         ensure
+           mark_auth_task_as_finished(channel_msg)
+         end
+
+         # We only publish a message when there aren't any errors. When
+         # success is false, we could have renewed some auths, so this could
+         # be more fine-grained and ping the subscribers that are not interested
+         # in the auths that failed. Also, as we do not publish anything when
+         # there is an error, the subscriber waits until it times out.
+         # This is good enough for now, but there is room for improvement.
+         publish_auth_repeatedly(combination, metric_auth) if success
+       end
+     end
+
+     def auth_channel_msg_2_combination(msg)
+       StorageKeys.pubsub_auth_msg_2_auth_info(msg)
+     end
+
+     def app_authorizations(combination)
+       authorizer.authorizations(combination[:service_id],
+                                 combination[:credentials],
+                                 [combination[:metric]])
+     end
+
+     def renew(service_id, credentials, auths)
+       storage.renew_auths(service_id, credentials, auths, auth_ttl)
+     end
+
+     def channel_for_combination(combination)
+       StorageKeys.pubsub_auths_resp_channel(combination[:service_id],
+                                             combination[:credentials],
+                                             combination[:metric])
+     end
+
+     def publish_auth_repeatedly(combination, authorization)
+       # There is a race condition here. A renew and publish task is only run
+       # when there is not another one renewing the same combination. When there
+       # is another, the incoming request does not trigger a new task, but waits
+       # for the publish below. The request could miss the published message
+       # if events happened in this order:
+       #   1) The request publishes the combination it needs in the requests
+       #      channel.
+       #   2) A new task is not executed, because there is another renewing
+       #      the same combination.
+       #   3) That task publishes the result.
+       #   4) The request subscribes to receive the result, but now it is
+       #      too late.
+       # I cannot think of an easy way to solve this. There is some time
+       # between the moment the request performs the publish and the
+       # subscribe actions. To mitigate the problem we can publish several
+       # times over a short period. We will see if this is good enough.
+       # Trade-off: publishing too many times increases the Redis load. Waiting
+       # too long makes the incoming request slow.
+       publish_failures = 0
+       TIMES_TO_PUBLISH.times do |t|
+         begin
+           publish_auth(combination, authorization)
+         rescue
+           publish_failures += 1
+         end
+         sleep((1.0/50)*((t+1)**2))
+       end
+
+       if publish_failures > 0
+         logger.warn('There was an error while publishing a response in the '\
+                     "priority channel. Combination: #{combination}".freeze)
+       end
+     end
+
+     def publish_auth(combination, authorization)
+       msg = if authorization.authorized?
+               '1'.freeze
+             else
+               authorization.reason ? "0:#{authorization.reason}" : '0'.freeze
+             end
+
+       redis_pub.publish(channel_for_combination(combination), msg)
+     end
+
+     def currently_authorizing?(channel_msg)
+       # A simple solution would be something like:
+       #   if !current_auths[channel_msg]
+       #     current_auths[channel_msg] = true;
+       #     perform_work
+       #     current_auths.delete(channel_msg)
+       #   end
+       # The problem is that the read/write is not atomic. Therefore, several
+       # threads could enter the if at the same time, repeating work. That is
+       # why we use concurrent-ruby's Map#put_if_absent, which is atomic.
+
+       # The value we set in the map is not relevant. #put_if_absent returns
+       # nil when the key is not in the map, which means that we are not
+       # currently authorizing it. That is all we care about.
+       current_auths.put_if_absent(channel_msg, true) != nil
+     end
+
+     def mark_auth_task_as_finished(channel_msg)
+       current_auths.delete(channel_msg)
+     end
+   end
+ end
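
Note: the deduplication in #currently_authorizing? above hinges on Concurrent::Map#put_if_absent being atomic, so only one thread ends up renewing a given combination. A standalone sketch of that pattern (the key string and the counter are made up for illustration, not taken from the gem):

    # Illustration only: several threads race to claim the same combination,
    # but the atomic put_if_absent lets exactly one of them do the work.
    require 'concurrent'

    current_auths = Concurrent::Map.new
    work_done = Concurrent::AtomicFixnum.new(0)

    4.times.map do
      Thread.new do
        # nil means the key was absent, i.e. nobody else is renewing it.
        if current_auths.put_if_absent('service:creds:metric', true).nil?
          work_done.increment  # here xcflushd would submit the renew-and-publish task
        end
      end
    end.each(&:join)

    work_done.value  # => 1
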
@@ -0,0 +1,70 @@
+ require '3scale_client'
+
+ module Xcflushd
+   class Reporter
+
+     class ReporterError < Flusher::XcflushdError
+       def initialize(service_id, transaction, specific_msg)
+         super("Error reporting this transaction: #{transaction} "\
+               "for service with id #{service_id}. "\
+               "#{specific_msg}")
+       end
+     end
+
+     # Exception raised when the 3scale client is not called with the right
+     # params. This happens when there are programming errors.
+     class ThreeScaleBadParams < ReporterError
+       def initialize(service_id, transaction)
+         super(service_id, transaction,
+               'There might be a bug in the program.'.freeze)
+       end
+     end
+
+     # Exception raised when the 3scale client is called with the right params
+     # but it returns a ServerError. Most of the time this means that 3scale is
+     # unreachable, although it could also be caused by a bug in the 3scale
+     # service management API.
+     class ThreeScaleInternalError < ReporterError
+       def initialize(service_id, transaction)
+         super(service_id, transaction, '3scale seems to be unreachable.'.freeze)
+       end
+     end
+
+     # Exception raised when the 3scale client made the call, but did not
+     # succeed. This happens when the credentials are invalid. For example, when
+     # an invalid provider key is used.
+     class ThreeScaleAuthError < ReporterError
+       def initialize(service_id, transaction)
+         super(service_id, transaction,
+               'Invalid credentials. Check the provider key'.freeze)
+       end
+     end
+
+     def initialize(threescale_client)
+       @threescale_client = threescale_client
+     end
+
+     def report(service_id, credentials, usage)
+       transaction = credentials.creds.merge(usage: usage)
+
+       begin
+         resp = threescale_client.report(transactions: [transaction],
+                                         service_id: service_id)
+       # TODO: get rid of the coupling with ThreeScale::ServerError
+       rescue ThreeScale::ServerError, SocketError
+         # We'll get a SocketError if there's a timeout when contacting 3scale.
+         raise ThreeScaleInternalError.new(service_id, transaction)
+       rescue ArgumentError
+         raise ThreeScaleBadParams.new(service_id, transaction)
+       end
+
+       raise ThreeScaleAuthError.new(service_id, transaction) unless resp.success?
+       true
+     end
+
+     private
+
+     attr_reader :threescale_client
+
+   end
+ end
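
Note: the Reporter above maps 3scale client outcomes onto an exception hierarchy rooted at ReporterError. A hypothetical caller (this is not the gem's actual Flusher; the require paths, host, keys, metric name and the Struct stand-in for the credentials object are all placeholders) could use it to tell retryable failures from permanent ones:

    # Sketch only; with placeholder keys the call will most likely end in the
    # ThreeScaleAuthError branch.
    require 'xcflushd'
    require '3scale_client'

    client      = ThreeScale::Client.new(provider_key: 'A_PROVIDER_KEY',
                                         host: 'su1.3scale.net')
    reporter    = Xcflushd::Reporter.new(client)
    # Reporter#report only needs an object that responds to #creds with a Hash.
    credentials = Struct.new(:creds).new({ user_key: 'A_USER_KEY' })

    begin
      reporter.report('a_service_id', credentials, { 'hits' => 1 })
    rescue Xcflushd::Reporter::ThreeScaleInternalError => e
      # 3scale unreachable: safe to retry on the next flush cycle.
      warn "temporary failure: #{e.message}"
    rescue Xcflushd::Reporter::ReporterError => e
      # Auth error or bad params: retrying will not help.
      warn "permanent failure: #{e.message}"
    end
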
@@ -0,0 +1,165 @@
+ require 'xcflushd'
+ require 'redis'
+ require '3scale_client'
+ require 'xcflushd/3scale_client_ext'
+
+ module Xcflushd
+   class Runner
+     class << self
+       # Amount of time to wait before retrying the subscription to the
+       # priority auth renewal pubsub channel.
+       PRIORITY_SUBSCRIPTION_RETRY_WAIT = 5
+       private_constant :PRIORITY_SUBSCRIPTION_RETRY_WAIT
+       # Maximum time to wait for a graceful shutdown before becoming more
+       # aggressive at killing thread pools.
+       DEFAULT_MAX_TERM_WAIT = 30
+       private_constant :DEFAULT_MAX_TERM_WAIT
+       # Because Ruby does not give us a way to be woken up from sleep, we
+       # sleep in small intervals and check whether we have been signalled.
+       MAX_IDLING_SIGNAL_LATENCY = 5
+       private_constant :MAX_IDLING_SIGNAL_LATENCY
+
+       def run(opts = {})
+         setup_sighandlers
+
+         @max_term_wait = opts[:max_term_wait] || DEFAULT_MAX_TERM_WAIT
+         @logger = Logger.new(STDOUT)
+
+         redis_host = opts[:redis].host
+         redis_port = opts[:redis].port
+         redis = Redis.new(host: redis_host, port: redis_port, driver: :hiredis)
+         storage = Storage.new(redis, @logger, StorageKeys)
+
+         threescale = ThreeScale::Client.new(provider_key: opts[:provider_key],
+                                             host: opts[:backend].host,
+                                             port: opts[:backend].port ||
+                                                   (opts[:secure] ? 443 : 80),
+                                             secure: opts[:secure],
+                                             persistent: true)
+         reporter = Reporter.new(threescale)
+         authorizer = Authorizer.new(threescale)
+
+         redis_pub = Redis.new(host: redis_host, port: redis_port, driver: :hiredis)
+         redis_sub = Redis.new(host: redis_host, port: redis_port, driver: :hiredis)
+
+         auth_ttl = opts[:auth_ttl]
+
+         error_handler = FlusherErrorHandler.new(@logger, storage)
+         @flusher = Flusher.new(reporter, authorizer, storage,
+                                auth_ttl, error_handler, opts[:threads])
+
+         @prio_auth_renewer = PriorityAuthRenewer.new(authorizer, storage,
+                                                      redis_pub, redis_sub,
+                                                      auth_ttl, @logger,
+                                                      opts[:prio_threads])
+
+         @prio_auth_renewer_thread = start_priority_auth_renewer
+
+         flush_periodically(opts[:frequency])
+       end
+
+       private
+
+       def start_priority_auth_renewer
+         Thread.new do
+           loop do
+             break if @exit
+             begin
+               @prio_auth_renewer.start
+             rescue StandardError
+               sleep PRIORITY_SUBSCRIPTION_RETRY_WAIT
+             end
+           end
+         end
+       end
+
+       def flush_periodically(flush_freq)
+         loop do
+           break if @exit
+           begin
+             @logger.info('Flushing...')
+             flusher_start = Time.now
+             next_flush = flusher_start + flush_freq
+             @flusher.flush
+             flusher_runtime = Time.now - flusher_start
+             @logger.info("Flush completed in #{flusher_runtime} seconds")
+           rescue StandardError => e
+             # Make sure we handle all standard errors so that the flusher
+             # keeps running.
+             @logger.error(e)
+           end
+           loop do
+             # sleep in small intervals to check if signalled
+             break if @exit
+             time_remaining = next_flush - Time.now
+             break if time_remaining <= 0
+             sleep([MAX_IDLING_SIGNAL_LATENCY, time_remaining].min)
+           end
+         end
+         @logger.info('Exiting')
+       rescue Exception => e
+         @logger.fatal("Unhandled exception #{e.class}, shutting down: #{e.cause} - #{e}")
+       ensure
+         shutdown
+       end
+
+       # Shutting down xcflushd
+       #
+       # We issue shutdown commands to the thread pools in the auth renewer and
+       # the flusher, wait a bit for a graceful termination and then proceed
+       # with more drastic measures.
+       #
+       # Note that there is no @prio_auth_renewer_thread.join(timeout).
+       #
+       # This is because that thread is blocked in the Redis pubsub mechanism.
+       # Since that is handled by the Redis gem and there is no way to exit it
+       # unless an unhandled exception is raised or an explicit unsubscribe
+       # command is issued from within one of the pubsub message handlers, we
+       # can't do much to issue an unsubscribe command (it would be issued from
+       # an external place and would block on the Redis gem's internal
+       # synchronization primitives).
+       #
+       # Therefore if we did the join we would be wasting that time once the
+       # thread pool is terminated, so we just go ahead and kill the thread right
+       # away (in terminate).
+       #
+       def shutdown
+         shutdown_deadline = Time.now + @max_term_wait
+         tasks = [@prio_auth_renewer, @flusher]
+         tasks.each do |task|
+           with_logged_shutdown { task.shutdown }
+         end
+         tasks.each do |task|
+           with_logged_shutdown do
+             task.wait_for_termination(shutdown_deadline - Time.now)
+           end
+         end
+       ensure
+         terminate
+       end
+
+       def terminate
+         [@prio_auth_renewer, @flusher, @prio_auth_renewer_thread].each do |task|
+           with_logged_shutdown { task.terminate }
+         end
+       end
+
+       def with_logged_shutdown
+         yield
+       rescue Exception => e
+         begin
+           @logger.error("while shutting down: #{e.class}, cause #{e.cause} - #{e}")
+         rescue Exception
+           # we want to avoid barfing if logger also breaks so that further
+           # processing can continue.
+         end
+       end
+
+       def setup_sighandlers
+         @exit = false
+         Signal.trap('EXIT') { @exit = true }
+         Signal.trap('INT') { @exit = true }
+       end
+     end
+   end
+ end
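
Note: Runner.run above expects the already-parsed option objects that the CLI builds through GLIHelpers#start_xcflusher (in particular, :redis and :backend are URI-like objects responding to #host and #port, not strings). A hypothetical direct invocation, with placeholder values and require paths assumed from the module names:

    # Sketch only; the values are placeholders and the call blocks running the
    # flush loop until the process is interrupted.
    require 'xcflushd'
    require 'xcflushd/gli_helpers'

    Xcflushd::Runner.run(
      redis:        Xcflushd::GLIHelpers::RedisURI.match('localhost:6379'),
      backend:      Xcflushd::GLIHelpers::BackendURI.match('su1.3scale.net'),
      provider_key: 'A_PROVIDER_KEY',
      secure:       true,           # with no backend port given, 443 is used
      auth_ttl:     900,
      frequency:    300,            # seconds between flush cycles
      threads:      nil,
      prio_threads: nil)            # nil falls back to Threading.default_threads_value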