buildkite-test_collector 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,331 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "socket_connection"
4
+
5
+ module Buildkite::TestCollector
6
+ class Session
7
+ # Picked 75 as the magic timeout number as it's longer than the TCP timeout of 60s 🤷‍♀️
8
+ CONFIRMATION_TIMEOUT = ENV.fetch("BUILDKITE_ANALYTICS_CONFIRMATION_TIMEOUT") { 75 }.to_i
9
+ MAX_RECONNECTION_ATTEMPTS = ENV.fetch("BUILDKITE_ANALYTICS_RECONNECTION_ATTEMPTS") { 3 }.to_i
10
+ WAIT_BETWEEN_RECONNECTIONS = ENV.fetch("BUILDKITE_ANALYTICS_RECONNECTION_WAIT") { 5 }.to_i
11
+
12
+ class RejectedSubscription < StandardError; end
13
+ class InitialConnectionFailure < StandardError; end
14
+
15
+ DISCONNECTED_EXCEPTIONS = [
16
+ SocketConnection::HandshakeError,
17
+ RejectedSubscription,
18
+ TimeoutError,
19
+ InitialConnectionFailure,
20
+ SocketConnection::SocketError
21
+ ]
22
+
23
+ def initialize(url, authorization_header, channel)
24
+ @establish_subscription_queue = Queue.new
25
+ @channel = channel
26
+
27
+ @unconfirmed_idents = {}
28
+ @idents_mutex = Mutex.new
29
+ @send_queue = Queue.new
30
+ @empty = ConditionVariable.new
31
+ @closing = false
32
+ @eot_queued = false
33
+ @eot_queued_mutex = Mutex.new
34
+ @reconnection_mutex = Mutex.new
35
+
36
+ @url = url
37
+ @authorization_header = authorization_header
38
+
39
+ reconnection_count = 0
40
+
41
+ begin
42
+ reconnection_count += 1
43
+ connect
44
+ rescue TimeoutError, InitialConnectionFailure => e
45
+ Buildkite::TestCollector.logger.warn("rspec-buildkite-analytics could not establish an initial connection with Buildkite due to #{e}. Attempting retry #{reconnection_count} of #{MAX_RECONNECTION_ATTEMPTS}...")
46
+ if reconnection_count > MAX_RECONNECTION_ATTEMPTS
47
+ Buildkite::TestCollector.logger.error "rspec-buildkite-analytics could not establish an initial connection with Buildkite due to #{e.message} after #{MAX_RECONNECTION_ATTEMPTS} attempts. You may be missing some data for this test suite, please contact support if this issue persists."
48
+ else
49
+ sleep(WAIT_BETWEEN_RECONNECTIONS)
50
+ Buildkite::TestCollector.logger.warn("retrying reconnection")
51
+ retry
52
+ end
53
+ end
54
+ init_write_thread
55
+ end
56
+
57
+ def disconnected(connection)
58
+ @reconnection_mutex.synchronize do
59
+ # When the first thread detects a disconnection, it calls the disconnect method
60
+ # with the current connection. This thread grabs the reconnection mutex and does the
61
+ # reconnection, which then updates the value of @connection.
62
+ #
63
+ # At some point in that process, the second thread would have detected the
64
+ # disconnection too, and it also calls it with the current connection. However, the
65
+ # second thread can't run the reconnection code because of the mutex. By the
66
+ # time the mutex is released, the value of @connection has been refreshed, and so
67
+ # the second thread returns early and does not reattempt the reconnection.
68
+ return unless connection == @connection
69
+ Buildkite::TestCollector.logger.debug("starting reconnection")
70
+
71
+ reconnection_count = 0
72
+
73
+ begin
74
+ reconnection_count += 1
75
+ connect
76
+ init_write_thread
77
+ rescue *DISCONNECTED_EXCEPTIONS => e
78
+ Buildkite::TestCollector.logger.warn("failed reconnection attempt #{reconnection_count} due to #{e}")
79
+ if reconnection_count > MAX_RECONNECTION_ATTEMPTS
80
+ Buildkite::TestCollector.logger.error "rspec-buildkite-analytics experienced a disconnection and could not reconnect to Buildkite due to #{e.message}. Please contact support."
81
+ raise e
82
+ else
83
+ sleep(WAIT_BETWEEN_RECONNECTIONS)
84
+ Buildkite::TestCollector.logger.warn("retrying reconnection")
85
+ retry
86
+ end
87
+ end
88
+ end
89
+ retransmit
90
+ end
91
+
92
+ def close(examples_count)
93
+ @closing = true
94
+ @examples_count = examples_count
95
+ Buildkite::TestCollector.logger.debug("closing socket connection")
96
+
97
+ # Because the server only sends us confirmations after every 10mb of
98
+ # data it uploads to S3, we'll never get confirmation of the
99
+ # identifiers of the last upload part unless we send an explicit finish,
100
+ # to which the server will respond with the last bits of data
101
+ send_eot
102
+
103
+ # After EOT, we wait for 75 seconds for the send queue to be drained and for the
104
+ # server to confirm the last idents. If everything has already been confirmed we can
105
+ # proceed without waiting.
106
+ @idents_mutex.synchronize do
107
+ if @unconfirmed_idents.any?
108
+ Buildkite::TestCollector.logger.debug "Waiting for Buildkite Test Analytics to send results..."
109
+ Buildkite::TestCollector.logger.debug("waiting for last confirm")
110
+
111
+ @empty.wait(@idents_mutex, CONFIRMATION_TIMEOUT)
112
+ end
113
+ end
114
+
115
+ # Then we always disconnect cos we can't wait forever? 🤷‍♀️
116
+ @connection.close
117
+ # We kill the write thread cos it's got a while loop in it, so it won't finish otherwise
118
+ @write_thread&.kill
119
+
120
+ Buildkite::TestCollector.logger.info "Buildkite Test Analytics completed"
121
+ Buildkite::TestCollector.logger.debug("socket connection closed")
122
+ end
123
+
124
+ def handle(_connection, data)
125
+ data = JSON.parse(data)
126
+ case data["type"]
127
+ when "ping"
128
+ # In absence of other message, the server sends us a ping every 3 seconds
129
+ # We are currently not doing anything with these
130
+ Buildkite::TestCollector.logger.debug("received ping")
131
+ when "welcome", "confirm_subscription"
132
+ # Push these two messages onto the queue, so that we block on waiting for the
133
+ # initializing phase to complete
134
+ @establish_subscription_queue.push(data)
135
+ Buildkite::TestCollector.logger.debug("received #{data['type']}")
136
+ when "reject_subscription"
137
+ Buildkite::TestCollector.logger.debug("received rejected_subscription")
138
+ raise RejectedSubscription
139
+ else
140
+ process_message(data)
141
+ end
142
+ end
143
+
144
+ def write_result(result)
145
+ queue_and_track_result(result.id, result.as_hash)
146
+
147
+ Buildkite::TestCollector.logger.debug("added #{result.id} to send queue")
148
+ end
149
+
150
+ def unconfirmed_idents_count
151
+ @idents_mutex.synchronize do
152
+ @unconfirmed_idents.count
153
+ end
154
+ end
155
+
156
+ private
157
+
158
+ def connect
159
+ Buildkite::TestCollector.logger.debug("starting socket connection process")
160
+
161
+ @connection = SocketConnection.new(self, @url, {
162
+ "Authorization" => @authorization_header,
163
+ })
164
+
165
+ wait_for_welcome
166
+
167
+ @connection.transmit({
168
+ "command" => "subscribe",
169
+ "identifier" => @channel
170
+ })
171
+
172
+ wait_for_confirm
173
+
174
+ Buildkite::TestCollector.logger.info "Connected to Buildkite Test Analytics!"
175
+ Buildkite::TestCollector.logger.debug("connected")
176
+ end
177
+
178
+ def init_write_thread
179
+ # As this method can be called multiple times in the
180
+ # reconnection process, kill prev write threads (if any) before
181
+ # setting up the new one
182
+ @write_thread&.kill
183
+
184
+ @write_thread = Thread.new do
185
+ Buildkite::TestCollector.logger.debug("hello from write thread")
186
+ # Pretty sure this eternal loop is fine cos the call to queue.pop is blocking
187
+ loop do
188
+ data = @send_queue.pop
189
+ message_type = data["action"]
190
+
191
+ if message_type == "end_of_transmission"
192
+ # Because of the unpredictable sequencing between the test suite finishing
193
+ # (EOT gets queued) and disconnections happening (retransmit results gets
194
+ # queued), we don't want to send an EOT before any retransmits are sent.
195
+ if @send_queue.length > 0
196
+ @send_queue << data
197
+ Buildkite::TestCollector.logger.debug("putting eot at back of queue")
198
+ next
199
+ end
200
+ @eot_queued_mutex.synchronize do
201
+ @eot_queued = false
202
+ end
203
+ end
204
+
205
+ @connection.transmit({
206
+ "identifier" => @channel,
207
+ "command" => "message",
208
+ "data" => data.to_json
209
+ })
210
+
211
+ if Buildkite::TestCollector.debug_enabled
212
+ ids = if message_type == "record_results"
213
+ data["results"].map { |result| result["id"] }
214
+ end
215
+ Buildkite::TestCollector.logger.debug("transmitted #{message_type} #{ids}")
216
+ end
217
+ end
218
+ end
219
+ end
220
+
221
+ def pop_with_timeout(message_type)
222
+ Timeout.timeout(30, Buildkite::TestCollector::TimeoutError, "Timeout: Waited 30 seconds for #{message_type}") do
223
+ @establish_subscription_queue.pop
224
+ end
225
+ end
226
+
227
+ def wait_for_welcome
228
+ welcome = pop_with_timeout("welcome")
229
+
230
+ if welcome && welcome != { "type" => "welcome" }
231
+ raise InitialConnectionFailure.new("Wrong message received, expected a welcome, but received: #{welcome.inspect}")
232
+ end
233
+ end
234
+
235
+ def wait_for_confirm
236
+ confirm = pop_with_timeout("confirm")
237
+
238
+ if confirm && confirm != { "type" => "confirm_subscription", "identifier" => @channel }
239
+ raise InitialConnectionFailure.new("Wrong message received, expected a confirm, but received: #{confirm.inspect}")
240
+ end
241
+ end
242
+
243
+ def queue_and_track_result(ident, result_as_hash)
244
+ @idents_mutex.synchronize do
245
+ @unconfirmed_idents[ident] = result_as_hash
246
+
247
+ @send_queue << {
248
+ "action" => "record_results",
249
+ "results" => [result_as_hash]
250
+ }
251
+ end
252
+ end
253
+
254
+ def confirm_idents(idents)
255
+ retransmit_required = @closing
256
+
257
+ @idents_mutex.synchronize do
258
+ # Remove received idents from unconfirmed_idents
259
+ idents.each { |key| @unconfirmed_idents.delete(key) }
260
+
261
+ Buildkite::TestCollector.logger.debug("received confirm for indentifiers: #{idents}")
262
+
263
+ # This @empty ConditionVariable broadcasts every time that @unconfirmed_idents is
264
+ # empty, which will happen about every 10mb of data as that's when the server
265
+ # sends back confirmations.
266
+ #
267
+ # However, there aren't any threads waiting on this signal until after we
268
+ # send the EOT message, so the prior broadcasts shouldn't do anything.
269
+ if @unconfirmed_idents.empty?
270
+ @empty.broadcast
271
+
272
+ retransmit_required = false
273
+
274
+ Buildkite::TestCollector.logger.debug("all identifiers have been confirmed")
275
+ else
276
+ Buildkite::TestCollector.logger.debug("still waiting on confirm for identifiers: #{@unconfirmed_idents.keys}")
277
+ end
278
+ end
279
+
280
+ # If we're closing, any unconfirmed results need to be retransmitted.
281
+ retransmit if retransmit_required
282
+ end
283
+
284
+ def send_eot
285
+ @eot_queued_mutex.synchronize do
286
+ return if @eot_queued
287
+
288
+ @send_queue << {
289
+ "action" => "end_of_transmission",
290
+ "examples_count" => @examples_count.to_json
291
+ }
292
+ @eot_queued = true
293
+
294
+ Buildkite::TestCollector.logger.debug("added EOT to send queue")
295
+ end
296
+ end
297
+
298
+ def process_message(data)
299
+ # Check we're getting the data we expect
300
+ return unless data["identifier"] == @channel
301
+
302
+ case
303
+ when data["message"].key?("confirm")
304
+ confirm_idents(data["message"]["confirm"])
305
+ else
306
+ # unhandled message
307
+ Buildkite::TestCollector.logger.debug("received unhandled message #{data["message"]}")
308
+ end
309
+ end
310
+
311
+ def retransmit
312
+ @idents_mutex.synchronize do
313
+ results = @unconfirmed_idents.values
314
+
315
+ # queue the contents of the buffer, unless it's empty
316
+ if results.any?
317
+ @send_queue << {
318
+ "action" => "record_results",
319
+ "results" => results
320
+ }
321
+
322
+ Buildkite::TestCollector.logger.debug("queueing up retransmitted results #{@unconfirmed_idents.keys}")
323
+ end
324
+ end
325
+
326
+ # if we were disconnected in the closing phase, then resend the EOT
327
+ # message so the server can persist the last upload part
328
+ send_eot if @closing
329
+ end
330
+ end
331
+ end
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "socket"
4
+ require "openssl"
5
+ require "json"
6
+
7
+ module Buildkite::TestCollector
8
+ class SocketConnection
9
+ class HandshakeError < StandardError; end
10
+ class SocketError < StandardError; end
11
+
12
+ def initialize(session, url, headers)
13
+ uri = URI.parse(url)
14
+ @session = session
15
+ protocol = "http"
16
+
17
+ begin
18
+ socket = TCPSocket.new(uri.host, uri.port || (uri.scheme == "wss" ? 443 : 80))
19
+
20
+ if uri.scheme == "wss"
21
+ ctx = OpenSSL::SSL::SSLContext.new
22
+ protocol = "https"
23
+
24
+ ctx.min_version = :TLS1_2
25
+ ctx.verify_mode = OpenSSL::SSL::VERIFY_PEER
26
+ ctx.cert_store = OpenSSL::X509::Store.new.tap(&:set_default_paths)
27
+
28
+ socket = OpenSSL::SSL::SSLSocket.new(socket, ctx)
29
+ socket.connect
30
+ end
31
+ rescue
32
+ # We are rescuing all here, as there are a range of Errno errors that could be
33
+ # raised when we fail to establish a TCP connection
34
+ raise SocketError
35
+ end
36
+
37
+ @socket = socket
38
+
39
+ headers = { "Origin" => "#{protocol}://#{uri.host}" }.merge(headers)
40
+ handshake = WebSocket::Handshake::Client.new(url: url, headers: headers)
41
+
42
+ @socket.write handshake.to_s
43
+
44
+ until handshake.finished?
45
+ if byte = @socket.getc
46
+ handshake << byte
47
+ end
48
+ end
49
+
50
+ # The errors below are raised when we establish the TCP connection, but get back
51
+ # an error, i.e. in dev we can still connect to puma-dev while nginx isn't
52
+ # running, or in prod we can hit a load balancer while app is down
53
+ unless handshake.valid?
54
+ case handshake.error
55
+ when Exception, String
56
+ raise HandshakeError.new(handshake.error)
57
+ when nil
58
+ raise HandshakeError.new("Invalid handshake")
59
+ else
60
+ raise HandshakeError.new(handshake.error.inspect)
61
+ end
62
+ end
63
+
64
+ @version = handshake.version
65
+
66
+ # Setting up a new thread that listens on the socket, and processes incoming
67
+ # comms from the server
68
+ @thread = Thread.new do
69
+ Buildkite::TestCollector.logger.debug("listening in on socket")
70
+ frame = WebSocket::Frame::Incoming::Client.new
71
+
72
+ while @socket
73
+ frame << @socket.readpartial(4096)
74
+
75
+ while data = frame.next
76
+ @session.handle(self, data.data)
77
+ end
78
+ end
79
+ # These get re-raise from session, we should fail gracefully
80
+ rescue *Buildkite::TestCollector::Session::DISCONNECTED_EXCEPTIONS => e
81
+ Buildkite::TestCollector.logger.error("We could not establish a connection with Buildkite Test Analytics. The error was: #{e.message}. If this is a problem, please contact support.")
82
+ rescue EOFError => e
83
+ Buildkite::TestCollector.logger.warn("#{e}")
84
+ if @socket
85
+ Buildkite::TestCollector.logger.warn("attempting disconnected flow")
86
+ @session.disconnected(self)
87
+ disconnect
88
+ end
89
+ rescue Errno::ECONNRESET => e
90
+ Buildkite::TestCollector.logger.error("#{e}")
91
+ if @socket
92
+ Buildkite::TestCollector.logger.error("attempting disconnected flow")
93
+ @session.disconnected(self)
94
+ disconnect
95
+ end
96
+ rescue IOError
97
+ # This is fine to ignore
98
+ Buildkite::TestCollector.logger.error("IOError")
99
+ rescue IndexError
100
+ # I don't like that we're doing this but I think it's the best of the options
101
+ #
102
+ # This relates to this issue https://github.com/ruby/openssl/issues/452
103
+ # A fix for it has been released but the repercussions of overriding
104
+ # the OpenSSL version in the stdlib seem worse than catching this error here.
105
+ Buildkite::TestCollector.logger.error("IndexError")
106
+ if @socket
107
+ Buildkite::TestCollector.logger.error("attempting disconnected flow")
108
+ @session.disconnected(self)
109
+ disconnect
110
+ end
111
+ end
112
+ end
113
+
114
+ def transmit(data, type: :text)
115
+ # this line prevents us from calling disconnect twice
116
+ return if @socket.nil?
117
+
118
+ raw_data = data.to_json
119
+ frame = WebSocket::Frame::Outgoing::Client.new(data: raw_data, type: :text, version: @version)
120
+ @socket.write(frame.to_s)
121
+ rescue Errno::EPIPE, Errno::ECONNRESET, OpenSSL::SSL::SSLError => e
122
+ return unless @socket
123
+ return if type == :close
124
+ Buildkite::TestCollector.logger.error("got #{e}, attempting disconnected flow")
125
+ @session.disconnected(self)
126
+ disconnect
127
+ rescue IndexError
128
+ # I don't like that we're doing this but I think it's the best of the options
129
+ #
130
+ # This relates to this issue https://github.com/ruby/openssl/issues/452
131
+ # A fix for it has been released but the repercussions of overriding
132
+ # the OpenSSL version in the stdlib seem worse than catching this error here.
133
+ Buildkite::TestCollector.logger.error("IndexError")
134
+ if @socket
135
+ Buildkite::TestCollector.logger.error("attempting disconnected flow")
136
+ @session.disconnected(self)
137
+ disconnect
138
+ end
139
+ end
140
+
141
+ def close
142
+ Buildkite::TestCollector.logger.debug("socket close")
143
+ transmit(nil, type: :close)
144
+ disconnect
145
+ end
146
+
147
+ private
148
+
149
+ def disconnect
150
+ Buildkite::TestCollector.logger.debug("socket disconnect")
151
+ socket = @socket
152
+ @socket = nil
153
+ socket&.close
154
+ @thread&.join unless @thread == Thread.current
155
+ end
156
+ end
157
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/core_ext/hash/indifferent_access"
4
+
5
+ module Buildkite::TestCollector
6
+ class Tracer
7
+ class Span
8
+ attr_accessor :section, :start_at, :end_at, :detail, :children
9
+
10
+ def initialize(section, start_at, end_at, detail)
11
+ @section = section
12
+ @start_at = start_at
13
+ @end_at = end_at
14
+ @detail = detail
15
+ @children = []
16
+ end
17
+
18
+ def as_hash
19
+ {
20
+ section: section,
21
+ start_at: start_at,
22
+ end_at: end_at,
23
+ duration: end_at - start_at,
24
+ detail: detail,
25
+ children: children.map(&:as_hash),
26
+ }.with_indifferent_access
27
+ end
28
+ end
29
+
30
+ def initialize
31
+ @top = Span.new(:top, Concurrent.monotonic_time, nil, {})
32
+ @stack = [@top]
33
+ end
34
+
35
+ def enter(section, **detail)
36
+ new_entry = Span.new(section, Concurrent.monotonic_time, nil, detail)
37
+ current_span.children << new_entry
38
+ @stack << new_entry
39
+ end
40
+
41
+ def leave
42
+ current_span.end_at = Concurrent.monotonic_time
43
+ @stack.pop
44
+ end
45
+
46
+ def backfill(section, duration, **detail)
47
+ new_entry = Span.new(section, Concurrent.monotonic_time - duration, Concurrent.monotonic_time, detail)
48
+ current_span.children << new_entry
49
+ end
50
+
51
+ def current_span
52
+ @stack.last
53
+ end
54
+
55
+ def finalize
56
+ raise "Stack not empty" unless @stack.size == 1
57
+ @top.end_at = Concurrent.monotonic_time
58
+ self
59
+ end
60
+
61
+ def history
62
+ @top.as_hash
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "openssl"
4
+ require "websocket"
5
+
6
+ require_relative "tracer"
7
+ require_relative "network"
8
+ require_relative "object"
9
+ require_relative "session"
10
+ require_relative "ci"
11
+ require_relative "http_client"
12
+
13
+ require "active_support"
14
+ require "active_support/notifications"
15
+
16
+ require "securerandom"
17
+
18
+ module Buildkite::TestCollector
19
+ class Uploader
20
+ def self.traces
21
+ @traces ||= {}
22
+ end
23
+
24
+ REQUEST_EXCEPTIONS = [
25
+ URI::InvalidURIError,
26
+ Net::HTTPBadResponse,
27
+ Net::HTTPHeaderSyntaxError,
28
+ Net::ReadTimeout,
29
+ Net::OpenTimeout,
30
+ OpenSSL::SSL::SSLError,
31
+ OpenSSL::SSL::SSLErrorWaitReadable,
32
+ EOFError
33
+ ]
34
+
35
+ def self.configure
36
+ Buildkite::TestCollector.logger.debug("hello from main thread")
37
+
38
+ if Buildkite::TestCollector.api_token
39
+ http = Buildkite::TestCollector::HTTPClient.new(Buildkite::TestCollector.url)
40
+
41
+ response = begin
42
+ http.post
43
+ rescue *Buildkite::TestCollector::Uploader::REQUEST_EXCEPTIONS => e
44
+ Buildkite::TestCollector.logger.error "Buildkite Test Analytics: Error communicating with the server: #{e.message}"
45
+ end
46
+
47
+ return unless response
48
+
49
+ case response.code
50
+ when "401"
51
+ Buildkite::TestCollector.logger.info "Buildkite Test Analytics: Invalid Suite API key. Please double check your Suite API key."
52
+ when "200"
53
+ json = JSON.parse(response.body)
54
+
55
+ if (socket_url = json["cable"]) && (channel = json["channel"])
56
+ Buildkite::TestCollector.session = Buildkite::TestCollector::Session.new(socket_url, http.authorization_header, channel)
57
+ end
58
+ else
59
+ request_id = response.to_hash["x-request-id"]
60
+ Buildkite::TestCollector.logger.info "rspec-buildkite-analytics could not establish an initial connection with Buildkite. You may be missing some data for this test suite, please contact support."
61
+ end
62
+ else
63
+ if !!ENV["BUILDKITE_BUILD_ID"]
64
+ Buildkite::TestCollector.logger.info "Buildkite Test Analytics: No Suite API key provided. You can get the API key from your Suite settings page."
65
+ end
66
+ end
67
+ end
68
+
69
+ def self.tracer
70
+ Thread.current[:_buildkite_tracer]
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Buildkite
4
+ module TestCollector
5
+ VERSION = "1.0.0"
6
+ NAME = "buildkite-test_collector"
7
+ end
8
+ end