haechi 1.3.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,12 @@
4
4
  // through a bounded sliding buffer (cross-frame matches caught up to
5
5
  // streaming.maxMatchBytes), and all other string leaves in a frame get
6
6
  // within-frame protection. The whole stream is audited once at the end.
7
+ //
8
+ // P1-CR-005 — a frame whose data: payload is not JSON is NOT raw-passed. A
9
+ // CONTROL frame (the [DONE] sentinel, comment-only, empty/keepalive) has no
10
+ // inspectable text and passes through; a non-JSON CONTENT frame is inspected as
11
+ // text (single-shot protector.protectText, distinct from the delta buffer) so
12
+ // plain-text PII/secrets cannot bypass protection in inspect mode.
7
13
 
8
14
  const SSE_DONE = "[DONE]";
9
15
 
@@ -49,14 +55,48 @@ export async function inspectResponseStream({ source, sink, streaming, protector
49
55
  const frame = { raw, body: raw.trim() };
50
56
  const parsed = parseFrame(frame, wireFormat);
51
57
  if (!parsed.ok) {
52
- // Non-JSON frame (e.g. `data: [DONE]`, comments, keep-alives): pass
53
- // through verbatim there is nothing to inspect.
54
- sink.write(frame.raw);
58
+ // P1-CR-005 — a parse-failed frame is one of two things:
59
+ // (1) a CONTROL frame with no inspectable text (the SSE [DONE] sentinel,
60
+ // a comment-only frame, an empty/whitespace/keepalive frame) — there
61
+ // is genuinely nothing to protect, so pass it through verbatim; or
62
+ // (2) a CONTENT frame whose data: payload is NOT JSON (plain text,
63
+ // partial/malformed JSON, provider-specific text). That text CAN carry
64
+ // PII/secrets, so it must be INSPECTED AS TEXT, not raw-passed.
65
+ if (parsed.control || parsed.text == null) {
66
+ sink.write(frame.raw);
67
+ return;
68
+ }
69
+ // Inspect the reconstructed data text as a single self-contained payload.
70
+ // protectText is DISTINCT from the delta-channel push/flush buffer, so a
71
+ // non-JSON frame's text never corrupts the JSON delta sliding buffer. A
72
+ // block-action detection fails the stream closed; otherwise re-emit the
73
+ // protected text (preserving the original wire wrapper / event: lines).
74
+ const protectedText = await protector.protectText(parsed.text);
75
+ if (protectedText.blocked) {
76
+ blocked = true;
77
+ return;
78
+ }
79
+ sink.write(serializeTextFrame(protectedText.text, wireFormat, frame));
55
80
  return;
56
81
  }
57
82
 
58
83
  const json = parsed.json;
59
84
 
85
+ // A bare PRIMITIVE JSON value (string/number/boolean/null) has no object
86
+ // structure for the delta/extras object path — a deltaPath setByPath on a
87
+ // string root would throw an uncaught TypeError on an attacker-influenceable
88
+ // frame. A JSON string can itself carry PII, so inspect the re-serialized
89
+ // value as text (same single-shot path as a non-JSON content frame).
90
+ if (json === null || typeof json !== "object") {
91
+ const protectedPrimitive = await protector.protectText(JSON.stringify(json));
92
+ if (protectedPrimitive.blocked) {
93
+ blocked = true;
94
+ return;
95
+ }
96
+ sink.write(serializeTextFrame(protectedPrimitive.text, wireFormat, frame));
97
+ return;
98
+ }
99
+
60
100
  // A delta-terminating frame: flush the held tail (as a valid delta frame)
61
101
  // before emitting it, so the residual is correctly ordered.
62
102
  if (flushOnType && flushOnType.values.includes(getByPath(json, flushOnType.path))) {
@@ -157,28 +197,64 @@ function createFrameSplitter(format) {
157
197
  };
158
198
  }
159
199
 
200
+ // Parse a frame. On success: { ok:true, json }. On failure the caller needs to
201
+ // know WHICH kind of failure it is (P1-CR-005):
202
+ // - { ok:false, control:true } → a CONTROL frame (no inspectable
203
+ // text: [DONE], comment-only, empty/
204
+ // whitespace/keepalive) → pass raw.
205
+ // - { ok:false, control:false, text } → a CONTENT frame whose data: payload
206
+ // is non-JSON → inspect `text` as text.
207
/**
 * Pattern that recognises an SSE `data:` field line.
 *
 * Matching is deliberately LENIENT: (non-spec) spaces or tabs before the field
 * name are tolerated.
 *
 * SECURITY (P1-CR-005 follow-up): the parser (which inspects/redacts) and the
 * serializers (which re-emit) MUST agree on what counts as a data line. If one
 * side were stricter, a line such as ` data: <pii>` could be skipped by
 * inspection or emitted verbatim next to its redacted copy. Every data-line
 * decision therefore goes through this one pattern.
 */
const SSE_DATA_LINE = /^[ \t]*data:/;

/**
 * Extract the payload of an SSE `data:` field line.
 *
 * The payload is everything after the colon, with at most ONE leading space
 * removed (per the SSE spec). The remainder is taken by slicing rather than
 * with a `(.*)$` capture: `.` does not match CR, U+2028 or U+2029, so a capture
 * would reject a line containing any of them even though SSE_DATA_LINE accepts
 * it, and that line's text would then escape inspection. Such characters are
 * kept in the returned payload unchanged.
 *
 * @param {string} line - One line of a frame (already split on "\n").
 * @returns {string | null} The payload, or null when the line is not a data field.
 */
function sseDataPayload(line) {
  const match = SSE_DATA_LINE.exec(line);
  if (match === null) {
    return null;
  }
  const payload = line.slice(match[0].length);
  return payload.startsWith(" ") ? payload.slice(1) : payload;
}
220
+
160
221
  function parseFrame(frame, format) {
161
222
  if (!frame) {
162
- return { ok: false };
223
+ return { ok: false, control: true, text: null };
163
224
  }
164
225
  let payload = frame.body;
165
226
  if (format === "sse") {
227
+ // An empty/whitespace/comment-only/keepalive frame has no data: line → a
228
+ // CONTROL frame with nothing to inspect.
229
+ if (payload === "") {
230
+ return { ok: false, control: true, text: null };
231
+ }
166
232
  const dataLines = payload
167
233
  .split("\n")
168
- .filter((line) => line.startsWith("data:"))
169
- .map((line) => line.slice(5).trim());
234
+ .map(sseDataPayload)
235
+ .filter((value) => value !== null);
170
236
  if (dataLines.length === 0) {
171
- return { ok: false };
237
+ // Comment-only (`:` lines) or field-only (event:/id:/retry:) frame.
238
+ return { ok: false, control: true, text: null };
172
239
  }
173
- payload = dataLines.join("");
240
+ // P2-CR-013 — the SSE model joins multiple data: lines with a NEWLINE, not
241
+ // "". Newlines are valid JSON whitespace between tokens / inside a string, so
242
+ // a multi-line JSON event still JSON.parses; a multi-line plain-text event is
243
+ // reconstructed with its newlines before text inspection.
244
+ payload = dataLines.join("\n");
174
245
  if (payload === SSE_DONE) {
175
- return { ok: false };
246
+ // The [DONE] sentinel: a CONTROL frame, never inspected.
247
+ return { ok: false, control: true, text: null };
176
248
  }
249
+ } else if (payload === "") {
250
+ // NDJSON: an empty/whitespace line is a CONTROL/keepalive frame.
251
+ return { ok: false, control: true, text: null };
177
252
  }
178
253
  try {
179
254
  return { ok: true, json: JSON.parse(payload) };
180
255
  } catch {
181
- return { ok: false };
256
+ // Non-JSON CONTENT: surface the reconstructed payload text for inspection.
257
+ return { ok: false, control: false, text: payload };
182
258
  }
183
259
  }
184
260
 
@@ -195,8 +271,10 @@ function serializeFrame(json, format, original) {
195
271
  const out = [];
196
272
  let dataWritten = false;
197
273
  for (const line of lines) {
198
- if (line.startsWith("data:")) {
199
- // Collapse any (multi-line) data payload into the single new body.
274
+ if (SSE_DATA_LINE.test(line)) {
275
+ // Collapse any (multi-line) data payload into the single new body. Use
276
+ // the SAME lenient match as the parser so a ` data:` line is replaced,
277
+ // never emitted verbatim (which would leak the original plaintext).
200
278
  if (!dataWritten) {
201
279
  out.push(`data: ${body}`);
202
280
  dataWritten = true;
@@ -216,6 +294,43 @@ function serializeFrame(json, format, original) {
216
294
  return original && original.raw.endsWith("\n") ? `${body}\n` : `${body}\n`;
217
295
  }
218
296
 
297
/**
 * P1-CR-005 — re-serialize a parse-failed CONTENT frame after its data text has
 * been inspected/transformed. Unlike serializeFrame, this carries ARBITRARY
 * text rather than a JSON-stringified object.
 *
 * SSE: the protected text is emitted as `data:` lines, one per "\n"-separated
 * line of `text`, and the frame ends with a blank line. When the original frame
 * is available, its non-data lines (event:/id:/retry:/`:` comments) are kept in
 * place and the whole original data block is replaced at the position of its
 * first data line. Data lines are recognised with SSE_DATA_LINE — the same
 * lenient match the parser uses — so a ` data:` line is replaced rather than
 * emitted verbatim (which would leak the original plaintext).
 *
 * Any other format (NDJSON): the text followed by a single newline.
 *
 * @param {string} text - Protected payload text to emit.
 * @param {string} format - "sse" for event-stream framing; anything else is line framing.
 * @param {{ raw: string } | null | undefined} original - The frame being replaced; only `raw` is read.
 * @returns {string} The serialized frame, terminator included.
 */
function serializeTextFrame(text, format, original) {
  if (format !== "sse") {
    return `${text}\n`;
  }

  const dataLines = text.split("\n").map((line) => `data: ${line}`);
  if (!original || typeof original.raw !== "string") {
    return `${dataLines.join("\n")}\n\n`;
  }

  // Drop the frame terminator before splitting. CR is stripped along with LF so
  // a CRLF-terminated frame does not leave a stray "\r" pseudo-line behind.
  const originalLines = original.raw.replace(/[\r\n]+$/, "").split("\n");

  const out = [];
  let dataWritten = false;
  for (const line of originalLines) {
    if (!SSE_DATA_LINE.test(line)) {
      out.push(line);
      continue;
    }
    // First data line: splice in the protected block. Later ones: drop.
    if (!dataWritten) {
      out.push(...dataLines);
      dataWritten = true;
    }
  }
  if (!dataWritten) {
    out.push(...dataLines);
  }
  return `${out.join("\n")}\n\n`;
}
333
+
219
334
  export function getByPath(value, path) {
220
335
  let current = value;
221
336
  for (const part of path) {