cem_acpt 0.12.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c9db8e060c748eb655248cac9a3a00d67802081d0d63b31f9f3a5daa6385c13f
4
- data.tar.gz: 365a1e9551b3721869e7b9dc649408f544be47eddc84cccb3b8cf0f0ab025178
3
+ metadata.gz: 1e37459e007c8e05a53fddf7920cefb42223e7e490785a3601372fdf172de82f
4
+ data.tar.gz: 1147801fce2be7b5d0a1847d817449ad2492c8ec04707d5a82713663b2406804
5
5
  SHA512:
6
- metadata.gz: e33957643fc4601962aff311034849f6840972ba8d47860ebabcaef9b42cab1e8cf874aa06be1f0e3743d8511c494a2ec92d57d92f74ef0f3cfa673d08e14b39
7
- data.tar.gz: 9fadc4795a4b18a1ad889719c69b2962c70c19b54775cebafd00c67f3d876e118913a06267b973d17080942a67937deab2b00bee6bec97b7c36741910fa1e488
6
+ metadata.gz: e3e28851afbfdeeb9a2d4ca0cfc0fb10f304ba121111422416f5a50757fc9ea50bdc30db20149d00b0d04dcf91d548e73c03276863601037417ce1c39a914090
7
+ data.tar.gz: 232de289f1ba433d6e5c25737cadffc8641dae7052075e359e663f935225547b44f3e430f8509c9f2519f8d8abf41204b83c76964eee387cf554f45bafd04b49
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- cem_acpt (0.12.1)
4
+ cem_acpt (0.12.2)
5
5
  async-http (>= 0.60, < 0.70)
6
6
  bcrypt_pbkdf (>= 1.0, < 2.0)
7
7
  deep_merge (>= 1.2, < 2.0)
@@ -45,6 +45,7 @@ module CemAcpt
45
45
  daemon: {
46
46
  port: 8084,
47
47
  ready_timeout: 60,
48
+ poll_interval: 10,
48
49
  },
49
50
  profiles: {
50
51
  openscap: {},
@@ -9,33 +9,42 @@ require_relative 'result'
9
9
  module CemAcpt
10
10
  module Scan
11
11
  # HTTP client for the on-node scan daemon. Mirrors the role of
12
- # {CemAcpt::Goss::Api}: build URIs, GET them, parse the JSON response,
13
- # turn the response into a typed result.
12
+ # {CemAcpt::Goss::Api}: build URIs, drive the async scan endpoints, parse
13
+ # JSON responses, turn the result into a typed {Result}.
14
14
  #
15
15
  # The daemon is installed by {CemAcpt::Provision::Linux#scan_provision_commands}
16
- # and serves two endpoints on the configurable scan port:
16
+ # and serves four endpoints on the configurable scan port:
17
17
  #
18
- # GET /health -> 200 OK once the daemon has started and the scanner
19
- # binaries it needs are present on disk.
20
- # GET /scan -> 200 with a JSON body shaped like:
21
- # { "score": 87.4, "passed_count": 187, "failed_count": 27,
22
- # "not_applicable_count": 14, "error_count": 0, "rules": [...] }
23
- # Non-200 responses or unparseable bodies raise
24
- # {ScannerInvocationError}.
18
+ # GET /health -> 200 OK once the daemon has started and the scanner
19
+ # binaries it needs are present on disk.
20
+ # POST /scan -> 202 Accepted; kicks off the scan in a background thread.
21
+ # 409 Conflict if a scan is already running.
22
+ # GET /status -> { "state": "idle" | "running" | "done" | "error" }.
23
+ # GET /result -> 200 with the JSON result when state is terminal,
24
+ # 404 otherwise.
25
+ #
26
+ # {#scan} drives the async flow: POST /scan, then poll /status, then
27
+ # GET /result. Each request is short-lived to avoid the connection-drop
28
+ # failure mode that affected the previous synchronous /scan design.
25
29
  class DaemonClient
26
30
  DEFAULT_PORT = 8084
27
31
  DEFAULT_READY_TIMEOUT = 60
28
- DEFAULT_HTTP_TIMEOUT = 1800 # 30 minutes — scans can be long
32
+ DEFAULT_HTTP_TIMEOUT = 1800 # 30 minutes — overall deadline for a scan
33
+ DEFAULT_POLL_INTERVAL = 10
34
+ DEFAULT_REQUEST_TIMEOUT = 60
29
35
 
30
36
  # @param host [String] The IP or DNS name of the test node.
31
37
  # @param port [Integer] The port the daemon listens on.
32
38
  # @param ready_timeout [Integer] How long to wait for /health.
33
- # @param http_timeout [Integer] How long a single /scan request may take.
34
- def initialize(host:, port: DEFAULT_PORT, ready_timeout: DEFAULT_READY_TIMEOUT, http_timeout: DEFAULT_HTTP_TIMEOUT)
39
+ # @param http_timeout [Integer] Overall deadline for a scan (POST + polls + result fetch).
40
+ # @param poll_interval [Integer] Seconds between /status polls.
41
+ def initialize(host:, port: DEFAULT_PORT, ready_timeout: DEFAULT_READY_TIMEOUT,
42
+ http_timeout: DEFAULT_HTTP_TIMEOUT, poll_interval: DEFAULT_POLL_INTERVAL)
35
43
  @host = host
36
44
  @port = port
37
45
  @ready_timeout = ready_timeout
38
46
  @http_timeout = http_timeout
47
+ @poll_interval = poll_interval
39
48
  end
40
49
 
41
50
  # Polls /health until it returns 200 or the timeout elapses.
@@ -56,35 +65,90 @@ module CemAcpt
56
65
  raise DaemonNotReadyError, msg
57
66
  end
58
67
 
59
- # Hits /scan and turns the response into a {Result}.
68
+ # Kicks off the scan, polls until it finishes, and fetches the result.
60
69
  # @param test_case [String] Acceptance-test directory name.
61
70
  # @param scanner [Symbol] :openscap or :ciscat.
62
71
  # @param profile [String] Scanner-native profile id.
63
72
  # @param threshold [Float] Pass threshold (0-100).
64
73
  # @return [Result]
65
- # @raise [ScannerInvocationError] on non-200 or unparseable body.
74
+ # @raise [ScannerInvocationError] on protocol errors or scan deadline exceeded.
66
75
  def scan(test_case:, scanner:, profile:, threshold:)
67
- uri = URI("http://#{@host}:#{@port}/scan")
68
- status, body = get(uri, timeout: @http_timeout)
76
+ start_scan
77
+ wait_for_completion
78
+ body = fetch_result
79
+ Result.new(test_case: test_case, scanner: scanner, profile: profile, threshold: threshold, body: body)
80
+ end
81
+
82
+ private
83
+
84
+ def start_scan
85
+ status, body = post(URI("http://#{@host}:#{@port}/scan"), timeout: DEFAULT_REQUEST_TIMEOUT)
86
+ return if status.to_i == 202
87
+
88
+ raise ScannerInvocationError, "Scan kickoff at #{@host}:#{@port} returned status #{status}: #{body.inspect}"
89
+ end
90
+
91
+ def wait_for_completion
92
+ deadline = Time.now + @http_timeout
93
+ loop do
94
+ raise ScannerInvocationError, "Scan did not finish within #{@http_timeout}s" if Time.now > deadline
95
+
96
+ _status, body = get(URI("http://#{@host}:#{@port}/status"), timeout: DEFAULT_REQUEST_TIMEOUT)
97
+ case body['state']
98
+ when 'done', 'error' then return
99
+ when 'running' then sleep @poll_interval
100
+ else
101
+ raise ScannerInvocationError, "Unexpected scan state from #{@host}:#{@port}: #{body['state'].inspect}"
102
+ end
103
+ end
104
+ end
105
+
106
+ def fetch_result
107
+ status, body = get(URI("http://#{@host}:#{@port}/result"), timeout: DEFAULT_REQUEST_TIMEOUT)
69
108
  unless status.to_i == 200
70
109
  raise ScannerInvocationError, "Scan daemon at #{@host}:#{@port} returned status #{status}: #{body.inspect}"
71
110
  end
72
111
 
73
- Result.new(test_case: test_case, scanner: scanner, profile: profile, threshold: threshold, body: body)
112
+ body
74
113
  end
75
114
 
76
- private
77
-
78
115
  def get(uri, timeout:)
116
+ request(uri, Net::HTTP::Get.new(uri.request_uri), timeout: timeout)
117
+ end
118
+
119
+ def post(uri, timeout:)
120
+ request(uri, Net::HTTP::Post.new(uri.request_uri), timeout: timeout)
121
+ end
122
+
123
+ def request(uri, req, timeout:)
79
124
  http = Net::HTTP.new(uri.host, uri.port)
80
125
  http.read_timeout = timeout
81
126
  http.open_timeout = [timeout, 30].min
82
- response = http.get(uri.request_uri)
127
+ http.start
128
+ enable_tcp_keepalive(http)
129
+ response = http.request(req)
83
130
  body = response.body.to_s
84
131
  parsed = body.empty? ? {} : JSON.parse(body)
85
132
  [response.code, parsed]
86
133
  rescue JSON::ParserError => e
87
134
  raise ScannerInvocationError, "Scan daemon at #{@host}:#{@port} returned non-JSON body: #{e.message}"
135
+ ensure
136
+ http.finish if http.started?
137
+ end
138
+
139
+ def enable_tcp_keepalive(http)
140
+ socket = http.instance_variable_get(:@socket)&.io
141
+ return unless socket
142
+
143
+ socket.setsockopt(Socket::SOL_SOCKET, Socket::SO_KEEPALIVE, true)
144
+ # Linux uses TCP_KEEPIDLE; macOS uses TCP_KEEPALIVE for the same idle-time option.
145
+ idle_opt = Socket.const_defined?(:TCP_KEEPIDLE) ? Socket::TCP_KEEPIDLE : Socket::TCP_KEEPALIVE
146
+ socket.setsockopt(Socket::IPPROTO_TCP, idle_opt, 60)
147
+ socket.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_KEEPINTVL, 10)
148
+ socket.setsockopt(Socket::IPPROTO_TCP, Socket::TCP_KEEPCNT, 6)
149
+ rescue Errno::ENOPROTOOPT
150
+ # Platform supports SO_KEEPALIVE but not the fine-grained probe constants;
151
+ # SO_KEEPALIVE alone was already set and provides partial protection.
88
152
  end
89
153
  end
90
154
  end
@@ -315,6 +315,7 @@ module CemAcpt
315
315
  logger.info('CemAcpt::TestRunner') { 'Running scan action...' }
316
316
  port = config.get('cem_acpt_scan.daemon.port') || CemAcpt::Scan::DaemonClient::DEFAULT_PORT
317
317
  ready_timeout = config.get('cem_acpt_scan.daemon.ready_timeout') || CemAcpt::Scan::DaemonClient::DEFAULT_READY_TIMEOUT
318
+ poll_interval = config.get('cem_acpt_scan.daemon.poll_interval') || CemAcpt::Scan::DaemonClient::DEFAULT_POLL_INTERVAL
318
319
  global_threshold = (config.get('cem_acpt_scan.threshold') || 80.0).to_f
319
320
  per_case_thresholds = config.get('cem_acpt_scan.test_thresholds') || {}
320
321
  scan_output = config.get('cem_acpt_scan.scan_output')
@@ -326,7 +327,7 @@ module CemAcpt
326
327
  scan_meta = scan_meta_for(test_name)
327
328
  threshold = per_case_thresholds[test_name] || per_case_thresholds[test_name.to_sym] || global_threshold
328
329
 
329
- client = CemAcpt::Scan::DaemonClient.new(host: host, port: port, ready_timeout: ready_timeout)
330
+ client = CemAcpt::Scan::DaemonClient.new(host: host, port: port, ready_timeout: ready_timeout, poll_interval: poll_interval)
330
331
  client.wait_until_ready
331
332
  result = client.scan(
332
333
  test_case: test_name,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module CemAcpt
4
- VERSION = '0.12.1'
4
+ VERSION = '0.12.2'
5
5
  end
@@ -127,6 +127,13 @@ def perform_scan
127
127
  end
128
128
  end
129
129
 
130
+ # Module-level scan state guarded by a Mutex. POST /scan flips state to
131
+ # 'running' and spawns a background thread; the thread parks the result and
132
+ # flips state to 'done' or 'error' on completion. GET /status and GET /result
133
+ # read the state under the mutex. One scan in flight per daemon.
134
+ SCAN_STATE = { state: 'idle', result: nil, started_at: nil }
135
+ SCAN_MUTEX = Mutex.new
136
+
130
137
  port = ENV.fetch('SCAN_DAEMON_PORT', '8084').to_i
131
138
  server = WEBrick::HTTPServer.new(Port: port, BindAddress: '0.0.0.0')
132
139
 
@@ -136,11 +143,53 @@ server.mount_proc('/health') do |_req, res|
136
143
  res.body = JSON.generate('status' => 'ok')
137
144
  end
138
145
 
139
- server.mount_proc('/scan') do |_req, res|
140
- payload = perform_scan
141
- res.status = payload['error'] ? 500 : 200
146
+ server.mount_proc('/scan') do |req, res|
147
+ res['Content-Type'] = 'application/json'
148
+ unless req.request_method == 'POST'
149
+ res.status = 405
150
+ res.body = JSON.generate('error' => 'use POST /scan')
151
+ next
152
+ end
153
+ SCAN_MUTEX.synchronize do
154
+ if SCAN_STATE[:state] == 'running'
155
+ res.status = 409
156
+ res.body = JSON.generate('error' => 'scan already running')
157
+ next
158
+ end
159
+ SCAN_STATE[:state] = 'running'
160
+ SCAN_STATE[:started_at] = Time.now
161
+ SCAN_STATE[:result] = nil
162
+ end
163
+ Thread.new do
164
+ payload = perform_scan
165
+ SCAN_MUTEX.synchronize do
166
+ SCAN_STATE[:state] = payload['error'] ? 'error' : 'done'
167
+ SCAN_STATE[:result] = payload
168
+ end
169
+ end
170
+ res.status = 202
171
+ res.body = JSON.generate('status' => 'started')
172
+ end
173
+
174
+ server.mount_proc('/status') do |_req, res|
175
+ res['Content-Type'] = 'application/json'
176
+ SCAN_MUTEX.synchronize do
177
+ res.status = 200
178
+ res.body = JSON.generate('state' => SCAN_STATE[:state])
179
+ end
180
+ end
181
+
182
+ server.mount_proc('/result') do |_req, res|
142
183
  res['Content-Type'] = 'application/json'
143
- res.body = JSON.generate(payload)
184
+ SCAN_MUTEX.synchronize do
185
+ if %w[done error].include?(SCAN_STATE[:state])
186
+ res.status = 200
187
+ res.body = JSON.generate(SCAN_STATE[:result])
188
+ else
189
+ res.status = 404
190
+ res.body = JSON.generate('error' => "no result available, state: #{SCAN_STATE[:state]}")
191
+ end
192
+ end
144
193
  end
145
194
 
146
195
  trap('INT') { server.shutdown }
@@ -0,0 +1,294 @@
1
+ # CEM-6799 — Async scan protocol and TCP keepalive on DaemonClient
2
+
3
+ ## Summary
4
+
5
+ `DaemonClient#scan` makes a single synchronous `GET /scan` and holds the TCP
6
+ connection open for the full scan duration (15–30 minutes for STIG/CIS
7
+ benchmarks). Two failure modes follow from this:
8
+
9
+ 1. **Idle connection drop.** During the long wait no data flows; GCP's VPC
10
+ firewall, Cloud NAT, or OS-level connection tracking drops the idle session
11
+ and the client raises `Errno::ECONNRESET` or `Errno::ENOTCONN` at
12
+ `read_nonblock`. Confirmed in local runs (`ENOTCONN` after ~1007 s) and
13
+ GitHub Actions CI (`ECONNRESET` after ~912 s).
14
+ 2. **Response delivery failure.** Even when the scan completes successfully on
15
+ the node and writes its result file to disk, the response body cannot be
16
+ delivered back over a connection that has been idle long enough to be torn
17
+ down. Confirmed on Ubuntu 24.04: `ciscat-results.json` (81 KB) and
18
+ `ciscat-results-ARF.xml` (8.7 MB) both written to
19
+ `/opt/cem_acpt/scan/reports/`, but the client raised `ENOTCONN` at 736 s
20
+ while reading the response.
21
+
22
+ The fix has two parts:
23
+
24
+ 1. **Async scan protocol.** `/scan` becomes a kickoff endpoint that returns
25
+ immediately; the daemon runs the scan in a background thread. The client
26
+ polls a new `/status` endpoint with short-lived HTTP calls, then fetches the
27
+ result via a new `/result` endpoint once the daemon reports completion.
28
+ Long-lived connections are eliminated entirely.
29
+ 2. **TCP keepalive on `DaemonClient#get` and `#post`.** Belt-and-suspenders: even the
30
+ short-lived polling calls enable `SO_KEEPALIVE` so any transient idle period
31
+ inside a single HTTP exchange is protected.
32
+
33
+ ## Functional Behavior
34
+
35
+ ### Daemon endpoints — `lib/terraform/gcp/linux/scan/scan_service.rb`
36
+
37
+ **Before:**
38
+
39
+ - `GET /health` — 200 OK when daemon is up.
40
+ - `GET /scan` — blocks for the scan duration; returns 200 with result JSON or
41
+ 500 with an error.
42
+
43
+ **After:**
44
+
45
+ - `GET /health` — unchanged.
46
+ - `POST /scan` — kicks off the scan in a background thread; returns `202
47
+ Accepted` with `{"status": "started"}` immediately. Returns `409 Conflict`
48
+ with `{"error": "scan already running"}` if a scan is already in flight.
49
+ - `GET /status` — new. Returns `{"state": "idle" | "running" | "done" |
50
+ "error"}` so the client can poll.
51
+ - `GET /result` — new. Returns `200` with the result JSON when the daemon's
52
+ state is `done` or `error`; `404` with `{"error": "no result available, state:
53
+ <current>"}` otherwise.
54
+
55
+ Scan state is held in module-level variables guarded by a `Mutex`. There is no
56
+ new class — the daemon script stays script-shaped:
57
+
58
+ ```ruby
59
+ SCAN_STATE = { state: 'idle', result: nil, started_at: nil }
60
+ SCAN_MUTEX = Mutex.new
61
+
62
+ server.mount_proc('/scan') do |req, res|
63
+ res['Content-Type'] = 'application/json'
64
+ unless req.request_method == 'POST'
65
+ res.status = 405
66
+ res.body = JSON.generate('error' => 'use POST /scan')
67
+ next
68
+ end
69
+ SCAN_MUTEX.synchronize do
70
+ if SCAN_STATE[:state] == 'running'
71
+ res.status = 409
72
+ res.body = JSON.generate('error' => 'scan already running')
73
+ next
74
+ end
75
+ SCAN_STATE[:state] = 'running'
76
+ SCAN_STATE[:started_at] = Time.now
77
+ SCAN_STATE[:result] = nil
78
+ end
79
+ Thread.new do
80
+ result = perform_scan
81
+ SCAN_MUTEX.synchronize do
82
+ SCAN_STATE[:state] = result['error'] ? 'error' : 'done'
83
+ SCAN_STATE[:result] = result
84
+ end
85
+ end
86
+ res.status = 202
87
+ res.body = JSON.generate('status' => 'started')
88
+ end
89
+
90
+ server.mount_proc('/status') do |_req, res|
91
+ res['Content-Type'] = 'application/json'
92
+ SCAN_MUTEX.synchronize do
93
+ res.status = 200
94
+ res.body = JSON.generate('state' => SCAN_STATE[:state])
95
+ end
96
+ end
97
+
98
+ server.mount_proc('/result') do |_req, res|
99
+ res['Content-Type'] = 'application/json'
100
+ SCAN_MUTEX.synchronize do
101
+ if %w[done error].include?(SCAN_STATE[:state])
102
+ res.status = 200
103
+ res.body = JSON.generate(SCAN_STATE[:result])
104
+ else
105
+ res.status = 404
106
+ res.body = JSON.generate('error' => "no result available, state: #{SCAN_STATE[:state]}")
107
+ end
108
+ end
109
+ end
110
+ ```
111
+
112
+ ### Client — `DaemonClient#scan` (`lib/cem_acpt/scan/daemon_client.rb`)
113
+
114
+ **Before:** single `GET /scan` with a 30-minute `read_timeout`.
115
+
116
+ **After:** POST → poll → GET. Each request is a short-lived connection.
117
+
118
+ ```ruby
119
+ DEFAULT_POLL_INTERVAL = 10
120
+ DEFAULT_REQUEST_TIMEOUT = 60
121
+
122
+ def scan(test_case:, scanner:, profile:, threshold:)
123
+ start_scan
124
+ wait_for_completion
125
+ body = fetch_result
126
+ Result.new(test_case: test_case, scanner: scanner, profile: profile, threshold: threshold, body: body)
127
+ end
128
+
129
+ private
130
+
131
+ def start_scan
132
+ status, body = post(URI("http://#{@host}:#{@port}/scan"), timeout: DEFAULT_REQUEST_TIMEOUT)
133
+ return if status.to_i == 202
134
+ raise ScannerInvocationError, "Scan kickoff at #{@host}:#{@port} returned status #{status}: #{body.inspect}"
135
+ end
136
+
137
+ def wait_for_completion
138
+ deadline = Time.now + @http_timeout
139
+ loop do
140
+ raise ScannerInvocationError, "Scan did not finish within #{@http_timeout}s" if Time.now > deadline
141
+
142
+ _status, body = get(URI("http://#{@host}:#{@port}/status"), timeout: DEFAULT_REQUEST_TIMEOUT)
143
+ case body['state']
144
+ when 'done', 'error' then return
145
+ when 'running' then sleep @poll_interval
146
+ else
147
+ raise ScannerInvocationError, "Unexpected scan state: #{body['state'].inspect}"
148
+ end
149
+ end
150
+ end
151
+
152
+ def fetch_result
153
+ status, body = get(URI("http://#{@host}:#{@port}/result"), timeout: DEFAULT_REQUEST_TIMEOUT)
154
+ raise ScannerInvocationError, "Scan daemon at #{@host}:#{@port} returned status #{status}: #{body.inspect}" unless status.to_i == 200
155
+ body
156
+ end
157
+ ```
158
+
159
+ `@poll_interval` defaults to `DEFAULT_POLL_INTERVAL` (10 s); configurable via
160
+ `cem_acpt_scan.daemon.poll_interval`.
161
+
162
+ A new private `post` helper mirrors `get` (same keepalive setup, same JSON
163
+ handling, no request body needed for `POST /scan` so it sends an empty body).
164
+
165
+ ### TCP keepalive — `DaemonClient#enable_tcp_keepalive`
166
+
167
+ Unchanged from the current keepalive implementation. Sets `SO_KEEPALIVE`,
168
+ `TCP_KEEPIDLE` (Linux) / `TCP_KEEPALIVE` (macOS), `TCP_KEEPINTVL`,
169
+ `TCP_KEEPCNT` on every socket opened by `get` and `post`. Kept as
170
+ belt-and-suspenders even though connections are now short-lived.
171
+
172
+ ## Non-Goals
173
+
174
+ - Persisting scan state across daemon restarts (state is in memory; a daemon
175
+ crash mid-scan loses the result).
176
+ - Multiple concurrent scans on a single node. The daemon serializes to one scan
177
+ at a time; concurrent `POST /scan` returns 409.
178
+ - Cancelling an in-flight scan via the API.
179
+ - Authentication on the scan endpoints. The daemon is firewalled to the
180
+ provisioning host; this is unchanged.
181
+
182
+ ## Configuration
183
+
184
+ New config key:
185
+
186
+ - `cem_acpt_scan.daemon.poll_interval` — Integer, seconds between `/status`
187
+ polls. Default: `10`.
188
+
189
+ Existing keys unchanged:
190
+
191
+ - `cem_acpt_scan.daemon.port` — daemon port.
192
+ - `cem_acpt_scan.daemon.ready_timeout` — `/health` poll timeout.
193
+ - The host-side `http_timeout` (default 1800 s) is the total deadline for a
194
+ scan, including all polling, not the timeout for any single HTTP request.
195
+
196
+ ## Edge Cases
197
+
198
+ - **Daemon crashes mid-scan.** The result file may still be on disk in
199
+ `/opt/cem_acpt/scan/reports/`, but the daemon's in-memory state is gone. On
200
+ daemon restart, `/status` returns `idle`; on its next successful poll the client
201
+ raises `ScannerInvocationError` (`Unexpected scan state`). Acceptable for now;
202
+ persistence is a non-goal.
203
+ - **`POST /scan` while a scan is running.** Returns 409. The client treats this
204
+ as an error.
205
+ - **`GET /result` while scan is running.** Returns 404 with state diagnostic.
206
+ The client only calls `/result` after `/status` reports a terminal state, so
207
+ this is for safety.
208
+ - **Background thread raises.** `perform_scan` already wraps errors and returns
209
+ a hash with an `error` key. The thread sets state to `error` and stores the
210
+ hash; the client picks it up via `/result`. The existing `ScannerInvocationError`
211
+ path in `DaemonClient` handles this when `Result.new` sees an error body.
212
+ - **macOS without `TCP_KEEPIDLE`.** The keepalive code already handles this
213
+ via `Socket.const_defined?(:TCP_KEEPIDLE)` falling back to
214
+ `Socket::TCP_KEEPALIVE`.
215
+
216
+ ## Constraints / Invariants
217
+
218
+ - `DaemonClient` public interface is unchanged: `wait_until_ready` and `scan`
219
+ with the same signatures and return shapes.
220
+ - The daemon script remains a single-file standalone systemd service.
221
+ - One scan at a time per daemon. The daemon is per-node, so this matches the
222
+ per-test-case model.
223
+
224
+ ## Alternatives Considered
225
+
226
+ **SCP the result file off the node.** After kicking off the scan, fetch the
227
+ result JSON via SSH/SCP from `/opt/cem_acpt/scan/reports/` rather than HTTP.
228
+ Rejected — would require threading SSH credentials through the scan layer (the
229
+ scan path currently uses HTTP only) and would bypass the daemon's error
230
+ encoding.
231
+
232
+ **Keep `GET /scan` synchronous with TCP keepalive only.** This was the original
233
+ CEM-6799 plan. Rejected because Ubuntu 24.04 testing on 2026-06-23 showed the
234
+ scan completing successfully but the response body being undeliverable: the
235
+ connection had been torn down despite keepalive probes. Keepalive on its own is
236
+ not sufficient.
237
+
238
+ ## Tests
239
+
240
+ ### `spec/cem_acpt/scan/daemon_client_spec.rb`
241
+
242
+ - Update existing `#scan` examples to stub three sequential calls: a `POST
243
+ /scan` returning 202, one or more `GET /status` returning `running` then
244
+ `done`, and a `GET /result` returning the JSON body.
245
+ - Add an example: `POST /scan` returns 409 → `ScannerInvocationError` raised.
246
+ - Add an example: `GET /status` returns an unrecognized state →
247
+ `ScannerInvocationError` raised.
248
+ - Add an example: poll loop hits `@http_timeout` without `done` →
249
+ `ScannerInvocationError` raised with "did not finish within" message.
250
+ - Existing `#enable_tcp_keepalive` examples unchanged.
251
+
252
+ ### `spec/terraform/gcp/linux/scan/scan_service_spec.rb`
253
+
254
+ - Add text-grep assertions that `/status` and `/result` `mount_proc` blocks are
255
+ present, mirroring the style of the existing flag-presence assertions.
256
+ - Add behavioral coverage: load `scan_service.rb` under the existing WEBrick
257
+ stub harness, drive the new endpoints directly. Stub `perform_scan` to return
258
+ a fixed hash. Assert state transitions `idle → running → done` and that
259
+ `/result` returns the stubbed hash.
260
+
261
+ ### Fixture mirror
262
+
263
+ - `spec/fixtures/config_testing/user_config_dir/terraform/gcp/linux/scan/scan_service.rb`
264
+ updated in lockstep.
265
+ - `spec/fixtures/config_testing/user_config_dir/terraform_checksum.txt`
266
+ regenerated.
267
+
268
+ ## Acceptance Criteria
269
+
270
+ - [ ] `POST /scan` kicks off the scan in a background thread and returns 202.
271
+ - [ ] `POST /scan` returns 409 when a scan is already running.
272
+ - [ ] `GET /status` returns the current state.
273
+ - [ ] `GET /result` returns 200 with the result when state is terminal, 404 otherwise.
274
+ - [ ] `DaemonClient#scan` posts, polls, and fetches the result with short-lived requests.
275
+ - [ ] Poll interval is configurable via `cem_acpt_scan.daemon.poll_interval` (default 10 s).
276
+ - [ ] TCP keepalive remains enabled on every `DaemonClient#get` / `#post` call.
277
+ - [ ] Existing `daemon_client_spec.rb` examples pass after update.
278
+ - [ ] New unit tests for the async client flow pass.
279
+ - [ ] New behavioral tests for `/status` and `/result` pass.
280
+ - [ ] Long scans (15+ minutes) no longer fail with `ECONNRESET` or `ENOTCONN`.
281
+
282
+ ## Files Touched
283
+
284
+ - `lib/cem_acpt/scan/daemon_client.rb` — async `scan` flow, new private `post`
285
+ helper, keepalive retained.
286
+ - `lib/terraform/gcp/linux/scan/scan_service.rb` — `POST /scan` semantics,
287
+ `/status` and `/result` endpoints, module-level state + mutex.
288
+ - `lib/cem_acpt/config/cem_acpt_scan.rb` — `poll_interval` default under
289
+ `cem_acpt_scan.daemon`.
290
+ - `spec/fixtures/config_testing/user_config_dir/terraform/gcp/linux/scan/scan_service.rb` — fixture mirror update.
291
+ - `spec/fixtures/config_testing/user_config_dir/terraform_checksum.txt` — checksum update.
292
+ - `spec/cem_acpt/scan/daemon_client_spec.rb` — async-flow tests.
293
+ - `spec/terraform/gcp/linux/scan/scan_service_spec.rb` — `/status` and `/result` tests.
294
+ - `specifications/CEM-6799.md` — this file.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cem_acpt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.1
4
+ version: 0.12.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - puppetlabs
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-06-10 00:00:00.000000000 Z
11
+ date: 2026-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: async-http
@@ -349,6 +349,7 @@ files:
349
349
  - specifications/CEM-6762.md
350
350
  - specifications/CEM-6765.md
351
351
  - specifications/CEM-6798.md
352
+ - specifications/CEM-6799.md
352
353
  homepage: https://github.com/puppetlabs/cem_acpt
353
354
  licenses:
354
355
  - proprietary