buildkite-test_collector 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,331 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "socket_connection"
4
+
5
+ module Buildkite::TestCollector
6
+ class Session
7
+ # Picked 75 as the magic timeout number as it's longer than the TCP timeout of 60s 🤷‍♀️
8
+ CONFIRMATION_TIMEOUT = ENV.fetch("BUILDKITE_ANALYTICS_CONFIRMATION_TIMEOUT") { 75 }.to_i
9
+ MAX_RECONNECTION_ATTEMPTS = ENV.fetch("BUILDKITE_ANALYTICS_RECONNECTION_ATTEMPTS") { 3 }.to_i
10
+ WAIT_BETWEEN_RECONNECTIONS = ENV.fetch("BUILDKITE_ANALYTICS_RECONNECTION_WAIT") { 5 }.to_i
11
+
12
+ class RejectedSubscription < StandardError; end
13
+ class InitialConnectionFailure < StandardError; end
14
+
15
+ DISCONNECTED_EXCEPTIONS = [
16
+ SocketConnection::HandshakeError,
17
+ RejectedSubscription,
18
+ TimeoutError,
19
+ InitialConnectionFailure,
20
+ SocketConnection::SocketError
21
+ ]
22
+
23
+ def initialize(url, authorization_header, channel)
24
+ @establish_subscription_queue = Queue.new
25
+ @channel = channel
26
+
27
+ @unconfirmed_idents = {}
28
+ @idents_mutex = Mutex.new
29
+ @send_queue = Queue.new
30
+ @empty = ConditionVariable.new
31
+ @closing = false
32
+ @eot_queued = false
33
+ @eot_queued_mutex = Mutex.new
34
+ @reconnection_mutex = Mutex.new
35
+
36
+ @url = url
37
+ @authorization_header = authorization_header
38
+
39
+ reconnection_count = 0
40
+
41
+ begin
42
+ reconnection_count += 1
43
+ connect
44
+ rescue TimeoutError, InitialConnectionFailure => e
45
+ Buildkite::TestCollector.logger.warn("rspec-buildkite-analytics could not establish an initial connection with Buildkite due to #{e}. Attempting retry #{reconnection_count} of #{MAX_RECONNECTION_ATTEMPTS}...")
46
+ if reconnection_count > MAX_RECONNECTION_ATTEMPTS
47
+ Buildkite::TestCollector.logger.error "rspec-buildkite-analytics could not establish an initial connection with Buildkite due to #{e.message} after #{MAX_RECONNECTION_ATTEMPTS} attempts. You may be missing some data for this test suite, please contact support if this issue persists."
48
+ else
49
+ sleep(WAIT_BETWEEN_RECONNECTIONS)
50
+ Buildkite::TestCollector.logger.warn("retrying reconnection")
51
+ retry
52
+ end
53
+ end
54
+ init_write_thread
55
+ end
56
+
57
+ def disconnected(connection)
58
+ @reconnection_mutex.synchronize do
59
+ # When the first thread detects a disconnection, it calls the disconnect method
60
+ # with the current connection. This thread grabs the reconnection mutex and does the
61
+ # reconnection, which then updates the value of @connection.
62
+ #
63
+ # At some point in that process, the second thread would have detected the
64
+ # disconnection too, and it also calls it with the current connection. However, the
65
+ # second thread can't run the reconnection code because of the mutex. By the
66
+ # time the mutex is released, the value of @connection has been refreshed, and so
67
+ # the second thread returns early and does not reattempt the reconnection.
68
+ return unless connection == @connection
69
+ Buildkite::TestCollector.logger.debug("starting reconnection")
70
+
71
+ reconnection_count = 0
72
+
73
+ begin
74
+ reconnection_count += 1
75
+ connect
76
+ init_write_thread
77
+ rescue *DISCONNECTED_EXCEPTIONS => e
78
+ Buildkite::TestCollector.logger.warn("failed reconnection attempt #{reconnection_count} due to #{e}")
79
+ if reconnection_count > MAX_RECONNECTION_ATTEMPTS
80
+ Buildkite::TestCollector.logger.error "rspec-buildkite-analytics experienced a disconnection and could not reconnect to Buildkite due to #{e.message}. Please contact support."
81
+ raise e
82
+ else
83
+ sleep(WAIT_BETWEEN_RECONNECTIONS)
84
+ Buildkite::TestCollector.logger.warn("retrying reconnection")
85
+ retry
86
+ end
87
+ end
88
+ end
89
+ retransmit
90
+ end
91
+
92
+ def close(examples_count)
93
+ @closing = true
94
+ @examples_count = examples_count
95
+ Buildkite::TestCollector.logger.debug("closing socket connection")
96
+
97
+ # Because the server only sends us confirmations after every 10mb of
98
+ # data it uploads to S3, we'll never get confirmation of the
99
+ # identifiers of the last upload part unless we send an explicit finish,
100
+ # to which the server will respond with the last bits of data
101
+ send_eot
102
+
103
+ # After EOT, we wait for 75 seconds for the send queue to be drained and for the
104
+ # server to confirm the last idents. If everything has already been confirmed we can
105
+ # proceed without waiting.
106
+ @idents_mutex.synchronize do
107
+ if @unconfirmed_idents.any?
108
+ Buildkite::TestCollector.logger.debug "Waiting for Buildkite Test Analytics to send results..."
109
+ Buildkite::TestCollector.logger.debug("waiting for last confirm")
110
+
111
+ @empty.wait(@idents_mutex, CONFIRMATION_TIMEOUT)
112
+ end
113
+ end
114
+
115
+ # Then we always disconnect cos we can't wait forever? 🤷‍♀️
116
+ @connection.close
117
+ # We kill the write thread cos it's got a while loop in it, so it won't finish otherwise
118
+ @write_thread&.kill
119
+
120
+ Buildkite::TestCollector.logger.info "Buildkite Test Analytics completed"
121
+ Buildkite::TestCollector.logger.debug("socket connection closed")
122
+ end
123
+
124
+ def handle(_connection, data)
125
+ data = JSON.parse(data)
126
+ case data["type"]
127
+ when "ping"
128
+ # In absence of other message, the server sends us a ping every 3 seconds
129
+ # We are currently not doing anything with these
130
+ Buildkite::TestCollector.logger.debug("received ping")
131
+ when "welcome", "confirm_subscription"
132
+ # Push these two messages onto the queue, so that we block on waiting for the
133
+ # initializing phase to complete
134
+ @establish_subscription_queue.push(data)
135
+ Buildkite::TestCollector.logger.debug("received #{data['type']}")
136
+ when "reject_subscription"
137
+ Buildkite::TestCollector.logger.debug("received rejected_subscription")
138
+ raise RejectedSubscription
139
+ else
140
+ process_message(data)
141
+ end
142
+ end
143
+
144
+ def write_result(result)
145
+ queue_and_track_result(result.id, result.as_hash)
146
+
147
+ Buildkite::TestCollector.logger.debug("added #{result.id} to send queue")
148
+ end
149
+
150
+ def unconfirmed_idents_count
151
+ @idents_mutex.synchronize do
152
+ @unconfirmed_idents.count
153
+ end
154
+ end
155
+
156
+ private
157
+
158
+ def connect
159
+ Buildkite::TestCollector.logger.debug("starting socket connection process")
160
+
161
+ @connection = SocketConnection.new(self, @url, {
162
+ "Authorization" => @authorization_header,
163
+ })
164
+
165
+ wait_for_welcome
166
+
167
+ @connection.transmit({
168
+ "command" => "subscribe",
169
+ "identifier" => @channel
170
+ })
171
+
172
+ wait_for_confirm
173
+
174
+ Buildkite::TestCollector.logger.info "Connected to Buildkite Test Analytics!"
175
+ Buildkite::TestCollector.logger.debug("connected")
176
+ end
177
+
178
+ def init_write_thread
179
+ # As this method can be called multiple times in the
180
+ # reconnection process, kill prev write threads (if any) before
181
+ # setting up the new one
182
+ @write_thread&.kill
183
+
184
+ @write_thread = Thread.new do
185
+ Buildkite::TestCollector.logger.debug("hello from write thread")
186
+ # Pretty sure this eternal loop is fine cos the call to queue.pop is blocking
187
+ loop do
188
+ data = @send_queue.pop
189
+ message_type = data["action"]
190
+
191
+ if message_type == "end_of_transmission"
192
+ # Because of the unpredictable sequencing between the test suite finishing
193
+ # (EOT gets queued) and disconnections happening (retransmit results gets
194
+ # queued), we don't want to send an EOT before any retransmits are sent.
195
+ if @send_queue.length > 0
196
+ @send_queue << data
197
+ Buildkite::TestCollector.logger.debug("putting eot at back of queue")
198
+ next
199
+ end
200
+ @eot_queued_mutex.synchronize do
201
+ @eot_queued = false
202
+ end
203
+ end
204
+
205
+ @connection.transmit({
206
+ "identifier" => @channel,
207
+ "command" => "message",
208
+ "data" => data.to_json
209
+ })
210
+
211
+ if Buildkite::TestCollector.debug_enabled
212
+ ids = if message_type == "record_results"
213
+ data["results"].map { |result| result["id"] }
214
+ end
215
+ Buildkite::TestCollector.logger.debug("transmitted #{message_type} #{ids}")
216
+ end
217
+ end
218
+ end
219
+ end
220
+
221
+ def pop_with_timeout(message_type)
222
+ Timeout.timeout(30, Buildkite::TestCollector::TimeoutError, "Timeout: Waited 30 seconds for #{message_type}") do
223
+ @establish_subscription_queue.pop
224
+ end
225
+ end
226
+
227
+ def wait_for_welcome
228
+ welcome = pop_with_timeout("welcome")
229
+
230
+ if welcome && welcome != { "type" => "welcome" }
231
+ raise InitialConnectionFailure.new("Wrong message received, expected a welcome, but received: #{welcome.inspect}")
232
+ end
233
+ end
234
+
235
+ def wait_for_confirm
236
+ confirm = pop_with_timeout("confirm")
237
+
238
+ if confirm && confirm != { "type" => "confirm_subscription", "identifier" => @channel }
239
+ raise InitialConnectionFailure.new("Wrong message received, expected a confirm, but received: #{confirm.inspect}")
240
+ end
241
+ end
242
+
243
+ def queue_and_track_result(ident, result_as_hash)
244
+ @idents_mutex.synchronize do
245
+ @unconfirmed_idents[ident] = result_as_hash
246
+
247
+ @send_queue << {
248
+ "action" => "record_results",
249
+ "results" => [result_as_hash]
250
+ }
251
+ end
252
+ end
253
+
254
+ def confirm_idents(idents)
255
+ retransmit_required = @closing
256
+
257
+ @idents_mutex.synchronize do
258
+ # Remove received idents from unconfirmed_idents
259
+ idents.each { |key| @unconfirmed_idents.delete(key) }
260
+
261
+ Buildkite::TestCollector.logger.debug("received confirm for indentifiers: #{idents}")
262
+
263
+ # This @empty ConditionVariable broadcasts every time that @unconfirmed_idents is
264
+ # empty, which will happen about every 10mb of data as that's when the server
265
+ # sends back confirmations.
266
+ #
267
+ # However, there aren't any threads waiting on this signal until after we
268
+ # send the EOT message, so the prior broadcasts shouldn't do anything.
269
+ if @unconfirmed_idents.empty?
270
+ @empty.broadcast
271
+
272
+ retransmit_required = false
273
+
274
+ Buildkite::TestCollector.logger.debug("all identifiers have been confirmed")
275
+ else
276
+ Buildkite::TestCollector.logger.debug("still waiting on confirm for identifiers: #{@unconfirmed_idents.keys}")
277
+ end
278
+ end
279
+
280
+ # If we're closing, any unconfirmed results need to be retransmitted.
281
+ retransmit if retransmit_required
282
+ end
283
+
284
+ def send_eot
285
+ @eot_queued_mutex.synchronize do
286
+ return if @eot_queued
287
+
288
+ @send_queue << {
289
+ "action" => "end_of_transmission",
290
+ "examples_count" => @examples_count.to_json
291
+ }
292
+ @eot_queued = true
293
+
294
+ Buildkite::TestCollector.logger.debug("added EOT to send queue")
295
+ end
296
+ end
297
+
298
+ def process_message(data)
299
+ # Check we're getting the data we expect
300
+ return unless data["identifier"] == @channel
301
+
302
+ case
303
+ when data["message"].key?("confirm")
304
+ confirm_idents(data["message"]["confirm"])
305
+ else
306
+ # unhandled message
307
+ Buildkite::TestCollector.logger.debug("received unhandled message #{data["message"]}")
308
+ end
309
+ end
310
+
311
+ def retransmit
312
+ @idents_mutex.synchronize do
313
+ results = @unconfirmed_idents.values
314
+
315
+ # queue the contents of the buffer, unless it's empty
316
+ if results.any?
317
+ @send_queue << {
318
+ "action" => "record_results",
319
+ "results" => results
320
+ }
321
+
322
+ Buildkite::TestCollector.logger.debug("queueing up retransmitted results #{@unconfirmed_idents.keys}")
323
+ end
324
+ end
325
+
326
+ # if we were disconnected in the closing phase, then resend the EOT
327
+ # message so the server can persist the last upload part
328
+ send_eot if @closing
329
+ end
330
+ end
331
+ end
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "socket"
4
+ require "openssl"
5
+ require "json"
6
+
7
+ module Buildkite::TestCollector
8
+ class SocketConnection
9
+ class HandshakeError < StandardError; end
10
+ class SocketError < StandardError; end
11
+
12
+ def initialize(session, url, headers)
13
+ uri = URI.parse(url)
14
+ @session = session
15
+ protocol = "http"
16
+
17
+ begin
18
+ socket = TCPSocket.new(uri.host, uri.port || (uri.scheme == "wss" ? 443 : 80))
19
+
20
+ if uri.scheme == "wss"
21
+ ctx = OpenSSL::SSL::SSLContext.new
22
+ protocol = "https"
23
+
24
+ ctx.min_version = :TLS1_2
25
+ ctx.verify_mode = OpenSSL::SSL::VERIFY_PEER
26
+ ctx.cert_store = OpenSSL::X509::Store.new.tap(&:set_default_paths)
27
+
28
+ socket = OpenSSL::SSL::SSLSocket.new(socket, ctx)
29
+ socket.connect
30
+ end
31
+ rescue
32
+ # We are rescuing all here, as there are a range of Errno errors that could be
33
+ # raised when we fail to establish a TCP connection
34
+ raise SocketError
35
+ end
36
+
37
+ @socket = socket
38
+
39
+ headers = { "Origin" => "#{protocol}://#{uri.host}" }.merge(headers)
40
+ handshake = WebSocket::Handshake::Client.new(url: url, headers: headers)
41
+
42
+ @socket.write handshake.to_s
43
+
44
+ until handshake.finished?
45
+ if byte = @socket.getc
46
+ handshake << byte
47
+ end
48
+ end
49
+
50
+ # The errors below are raised when we establish the TCP connection, but get back
51
+ # an error, i.e. in dev we can still connect to puma-dev while nginx isn't
52
+ # running, or in prod we can hit a load balancer while app is down
53
+ unless handshake.valid?
54
+ case handshake.error
55
+ when Exception, String
56
+ raise HandshakeError.new(handshake.error)
57
+ when nil
58
+ raise HandshakeError.new("Invalid handshake")
59
+ else
60
+ raise HandshakeError.new(handshake.error.inspect)
61
+ end
62
+ end
63
+
64
+ @version = handshake.version
65
+
66
+ # Setting up a new thread that listens on the socket, and processes incoming
67
+ # comms from the server
68
+ @thread = Thread.new do
69
+ Buildkite::TestCollector.logger.debug("listening in on socket")
70
+ frame = WebSocket::Frame::Incoming::Client.new
71
+
72
+ while @socket
73
+ frame << @socket.readpartial(4096)
74
+
75
+ while data = frame.next
76
+ @session.handle(self, data.data)
77
+ end
78
+ end
79
+ # These get re-raise from session, we should fail gracefully
80
+ rescue *Buildkite::TestCollector::Session::DISCONNECTED_EXCEPTIONS => e
81
+ Buildkite::TestCollector.logger.error("We could not establish a connection with Buildkite Test Analytics. The error was: #{e.message}. If this is a problem, please contact support.")
82
+ rescue EOFError => e
83
+ Buildkite::TestCollector.logger.warn("#{e}")
84
+ if @socket
85
+ Buildkite::TestCollector.logger.warn("attempting disconnected flow")
86
+ @session.disconnected(self)
87
+ disconnect
88
+ end
89
+ rescue Errno::ECONNRESET => e
90
+ Buildkite::TestCollector.logger.error("#{e}")
91
+ if @socket
92
+ Buildkite::TestCollector.logger.error("attempting disconnected flow")
93
+ @session.disconnected(self)
94
+ disconnect
95
+ end
96
+ rescue IOError
97
+ # This is fine to ignore
98
+ Buildkite::TestCollector.logger.error("IOError")
99
+ rescue IndexError
100
+ # I don't like that we're doing this but I think it's the best of the options
101
+ #
102
+ # This relates to this issue https://github.com/ruby/openssl/issues/452
103
+ # A fix for it has been released but the repercussions of overriding
104
+ # the OpenSSL version in the stdlib seem worse than catching this error here.
105
+ Buildkite::TestCollector.logger.error("IndexError")
106
+ if @socket
107
+ Buildkite::TestCollector.logger.error("attempting disconnected flow")
108
+ @session.disconnected(self)
109
+ disconnect
110
+ end
111
+ end
112
+ end
113
+
114
+ def transmit(data, type: :text)
115
+ # this line prevents us from calling disconnect twice
116
+ return if @socket.nil?
117
+
118
+ raw_data = data.to_json
119
+ frame = WebSocket::Frame::Outgoing::Client.new(data: raw_data, type: :text, version: @version)
120
+ @socket.write(frame.to_s)
121
+ rescue Errno::EPIPE, Errno::ECONNRESET, OpenSSL::SSL::SSLError => e
122
+ return unless @socket
123
+ return if type == :close
124
+ Buildkite::TestCollector.logger.error("got #{e}, attempting disconnected flow")
125
+ @session.disconnected(self)
126
+ disconnect
127
+ rescue IndexError
128
+ # I don't like that we're doing this but I think it's the best of the options
129
+ #
130
+ # This relates to this issue https://github.com/ruby/openssl/issues/452
131
+ # A fix for it has been released but the repercussions of overriding
132
+ # the OpenSSL version in the stdlib seem worse than catching this error here.
133
+ Buildkite::TestCollector.logger.error("IndexError")
134
+ if @socket
135
+ Buildkite::TestCollector.logger.error("attempting disconnected flow")
136
+ @session.disconnected(self)
137
+ disconnect
138
+ end
139
+ end
140
+
141
+ def close
142
+ Buildkite::TestCollector.logger.debug("socket close")
143
+ transmit(nil, type: :close)
144
+ disconnect
145
+ end
146
+
147
+ private
148
+
149
+ def disconnect
150
+ Buildkite::TestCollector.logger.debug("socket disconnect")
151
+ socket = @socket
152
+ @socket = nil
153
+ socket&.close
154
+ @thread&.join unless @thread == Thread.current
155
+ end
156
+ end
157
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/core_ext/hash/indifferent_access"
4
+
5
+ module Buildkite::TestCollector
6
+ class Tracer
7
+ class Span
8
+ attr_accessor :section, :start_at, :end_at, :detail, :children
9
+
10
+ def initialize(section, start_at, end_at, detail)
11
+ @section = section
12
+ @start_at = start_at
13
+ @end_at = end_at
14
+ @detail = detail
15
+ @children = []
16
+ end
17
+
18
+ def as_hash
19
+ {
20
+ section: section,
21
+ start_at: start_at,
22
+ end_at: end_at,
23
+ duration: end_at - start_at,
24
+ detail: detail,
25
+ children: children.map(&:as_hash),
26
+ }.with_indifferent_access
27
+ end
28
+ end
29
+
30
+ def initialize
31
+ @top = Span.new(:top, Concurrent.monotonic_time, nil, {})
32
+ @stack = [@top]
33
+ end
34
+
35
+ def enter(section, **detail)
36
+ new_entry = Span.new(section, Concurrent.monotonic_time, nil, detail)
37
+ current_span.children << new_entry
38
+ @stack << new_entry
39
+ end
40
+
41
+ def leave
42
+ current_span.end_at = Concurrent.monotonic_time
43
+ @stack.pop
44
+ end
45
+
46
+ def backfill(section, duration, **detail)
47
+ new_entry = Span.new(section, Concurrent.monotonic_time - duration, Concurrent.monotonic_time, detail)
48
+ current_span.children << new_entry
49
+ end
50
+
51
+ def current_span
52
+ @stack.last
53
+ end
54
+
55
+ def finalize
56
+ raise "Stack not empty" unless @stack.size == 1
57
+ @top.end_at = Concurrent.monotonic_time
58
+ self
59
+ end
60
+
61
+ def history
62
+ @top.as_hash
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "openssl"
4
+ require "websocket"
5
+
6
+ require_relative "tracer"
7
+ require_relative "network"
8
+ require_relative "object"
9
+ require_relative "session"
10
+ require_relative "ci"
11
+ require_relative "http_client"
12
+
13
+ require "active_support"
14
+ require "active_support/notifications"
15
+
16
+ require "securerandom"
17
+
18
+ module Buildkite::TestCollector
19
+ class Uploader
20
+ def self.traces
21
+ @traces ||= {}
22
+ end
23
+
24
+ REQUEST_EXCEPTIONS = [
25
+ URI::InvalidURIError,
26
+ Net::HTTPBadResponse,
27
+ Net::HTTPHeaderSyntaxError,
28
+ Net::ReadTimeout,
29
+ Net::OpenTimeout,
30
+ OpenSSL::SSL::SSLError,
31
+ OpenSSL::SSL::SSLErrorWaitReadable,
32
+ EOFError
33
+ ]
34
+
35
+ def self.configure
36
+ Buildkite::TestCollector.logger.debug("hello from main thread")
37
+
38
+ if Buildkite::TestCollector.api_token
39
+ http = Buildkite::TestCollector::HTTPClient.new(Buildkite::TestCollector.url)
40
+
41
+ response = begin
42
+ http.post
43
+ rescue *Buildkite::TestCollector::Uploader::REQUEST_EXCEPTIONS => e
44
+ Buildkite::TestCollector.logger.error "Buildkite Test Analytics: Error communicating with the server: #{e.message}"
45
+ end
46
+
47
+ return unless response
48
+
49
+ case response.code
50
+ when "401"
51
+ Buildkite::TestCollector.logger.info "Buildkite Test Analytics: Invalid Suite API key. Please double check your Suite API key."
52
+ when "200"
53
+ json = JSON.parse(response.body)
54
+
55
+ if (socket_url = json["cable"]) && (channel = json["channel"])
56
+ Buildkite::TestCollector.session = Buildkite::TestCollector::Session.new(socket_url, http.authorization_header, channel)
57
+ end
58
+ else
59
+ request_id = response.to_hash["x-request-id"]
60
+ Buildkite::TestCollector.logger.info "rspec-buildkite-analytics could not establish an initial connection with Buildkite. You may be missing some data for this test suite, please contact support."
61
+ end
62
+ else
63
+ if !!ENV["BUILDKITE_BUILD_ID"]
64
+ Buildkite::TestCollector.logger.info "Buildkite Test Analytics: No Suite API key provided. You can get the API key from your Suite settings page."
65
+ end
66
+ end
67
+ end
68
+
69
+ def self.tracer
70
+ Thread.current[:_buildkite_tracer]
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
module Buildkite
  module TestCollector
    # Name of the package.
    NAME = "buildkite-test_collector"
    # Released version of the package.
    VERSION = "1.0.0"
  end
end