oxtail 0.9.1 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,206 @@
1
- import { existsSync, readFileSync } from "node:fs";
1
+ import { closeSync, existsSync, fstatSync, openSync, readFileSync, readSync } from "node:fs";
2
+ // Defaults are deliberately conservative: a casual read returns at most ~20
3
+ // recent messages and ~24KB of text (~6k tokens). To pull a full transcript,
4
+ // callers explicitly raise `limit` (up to MAX_LIMIT) and `maxBytes` (up to
5
+ // MAX_MAX_BYTES) — an explicit override rather than an easy `full` footgun.
6
+ export const DEFAULT_LIMIT = 20; // message-count fallback used when `opts.limit` is missing or non-finite
7
+ export const MAX_LIMIT = 1000; // upper clamp applied to `opts.limit` (lower clamp is 1)
8
+ export const DEFAULT_MAX_BYTES = 24_000; // byte-budget fallback used when `opts.maxBytes` is missing or non-finite
9
+ export const MIN_MAX_BYTES = 256; // lower clamp applied to `opts.maxBytes`
10
+ export const MAX_MAX_BYTES = 1_000_000; // upper clamp applied to `opts.maxBytes`
11
+ export const DEFAULT_CHUNK_SIZE = 65_536; // tail-scan read size fallback used when `opts.chunkSize` is missing or non-finite
12
+ export const MIN_CHUNK_SIZE = 16; // floor applied to the tail-scan chunk size (no upper bound is applied)
2
13
  function clamp(n, lo, hi) {
3
14
  return Math.max(lo, Math.min(hi, n));
4
15
  }
16
+ // Non-finite inputs (NaN/±Infinity) would slip past clamp() and produce nonsense
17
+ // (e.g. NaN budget → slice(NaN) returns everything, or zero with a bogus
18
+ // truncation flag). Coerce anything non-finite to the supplied default so the
19
+ // exported reader API is robust even when called directly (not just via zod).
20
+ // Per Codex Phase-B hardening note.
21
/**
 * Return `n` only when it is a real, finite number; otherwise `fallback`.
 *
 * `Number.isFinite` never coerces, so strings, null, undefined, NaN and
 * ±Infinity all take the fallback path.
 *
 * @param {*} n - Candidate value (typically a caller-supplied option).
 * @param {*} fallback - Value returned when `n` is not a finite number.
 * @returns {*} `n` or `fallback`.
 */
function finiteOr(n, fallback) {
  if (Number.isFinite(n)) {
    return n;
  }
  return fallback;
}
24
+ // Truncate `s` to at most `maxBytes` UTF-8 bytes WITHOUT splitting a multi-byte
25
+ // code point. Iterating the string yields whole code points, so we never emit a
26
+ // partial/garbled character at the boundary.
27
/**
 * Truncate `s` to at most `maxBytes` UTF-8 bytes without splitting a code point.
 *
 * The string is walked one code point at a time; each code point's UTF-8 width
 * is derived arithmetically from its value, and the result is produced with a
 * single `slice` instead of a per-character `Buffer.byteLength` call and string
 * append. A lone surrogate counts as 3 bytes, matching how Node encodes it
 * (as U+FFFD) when measuring or writing UTF-8.
 *
 * @param {string} s - Text to truncate.
 * @param {number} maxBytes - Maximum UTF-8 byte length of the result.
 * @returns {string} `s` itself when it already fits, otherwise the longest
 *   prefix of whole code points whose UTF-8 encoding is <= `maxBytes` bytes.
 */
function truncateToBytes(s, maxBytes) {
  if (Buffer.byteLength(s, "utf8") <= maxBytes) {
    return s;
  }
  let end = 0; // UTF-16 index one past the last code point that fits
  let bytes = 0; // UTF-8 bytes consumed by s.slice(0, end)
  while (end < s.length) {
    const cp = s.codePointAt(end);
    let width = 4;
    if (cp < 0x80) {
      width = 1;
    } else if (cp < 0x800) {
      width = 2;
    } else if (cp < 0x10000) {
      width = 3;
    }
    if (bytes + width > maxBytes) {
      break;
    }
    bytes += width;
    // Astral code points occupy two UTF-16 units (a surrogate pair).
    end += cp > 0xffff ? 2 : 1;
  }
  return s.slice(0, end);
}
41
+ // Apply the byte budget to an already count-tailed, chronological message list.
42
+ // Walk newest→oldest so the MOST RECENT content is what survives the budget
43
+ // (tail-preserving). The oldest message that crosses the budget is head-
44
+ // truncated with a marker; everything older than it is dropped. Returns the
45
+ // kept messages back in chronological order.
46
/**
 * Fit a chronological message list into a UTF-8 byte budget, favouring the
 * newest messages.
 *
 * Messages are visited newest → oldest. Each one that fits is kept whole and
 * its size is deducted from the budget. The first message that does not fit
 * ends the walk: if any budget is left, that message is kept with its text cut
 * to the remaining bytes plus a `…[+NB truncated]` marker (the marker itself is
 * not counted against the budget); everything older is dropped.
 *
 * @param {Array<{text: string}>} messages - Messages in chronological order.
 * @param {number} maxBytes - Byte budget for the combined message text.
 * @returns {{kept: Array, bytesTruncated: boolean}} Surviving messages in
 *   chronological order, and whether the budget cut anything.
 */
function applyByteBudget(messages, maxBytes) {
  const newestFirst = [];
  let budget = maxBytes;
  let overflowed = false;
  for (let idx = messages.length - 1; idx >= 0 && !overflowed; idx -= 1) {
    const msg = messages[idx];
    const size = Buffer.byteLength(msg.text, "utf8");
    const fits = size <= budget;
    if (fits) {
      newestFirst.push(msg);
      budget -= size;
    } else {
      // First message that does not fit: keep what the budget allows, then stop.
      overflowed = true;
      if (budget > 0) {
        const head = truncateToBytes(msg.text, budget);
        const droppedBytes = size - Buffer.byteLength(head, "utf8");
        newestFirst.push({ ...msg, text: `${head}…[+${droppedBytes}B truncated]` });
      }
    }
  }
  return { kept: newestFirst.reverse(), bytesTruncated: overflowed };
}
69
+ // Shared finalize step for both readers: count-tail to `limit`, then apply the
70
+ // byte budget, then gate timestamps. Keeps the two truncation signals distinct.
71
/**
 * Turn the full list of parsed messages into the reader result.
 *
 * Steps, in order: normalise `limit` / `maxBytes` (non-finite → default, floor,
 * clamp), keep only the last `limit` messages, apply the byte budget to that
 * window, and blank out timestamps unless `includeTimestamps` is set.
 *
 * @param {Array<{role: string, text: string, timestamp: *}>} all - Every parsed
 *   message, oldest first.
 * @param {{limit?: number, maxBytes?: number, includeTimestamps?: boolean}} opts
 * @returns {object} Result with `messages`, the combined `truncated` flag, the
 *   separate `count_truncated` / `bytes_truncated` flags, and `total_messages`
 *   (always exact on this path).
 */
function finalize(all, opts) {
  const requestedLimit = Math.floor(finiteOr(opts.limit, DEFAULT_LIMIT));
  const limit = clamp(requestedLimit, 1, MAX_LIMIT);
  const requestedBytes = Math.floor(finiteOr(opts.maxBytes, DEFAULT_MAX_BYTES));
  const maxBytes = clamp(requestedBytes, MIN_MAX_BYTES, MAX_MAX_BYTES);
  const includeTimestamps = opts.includeTimestamps ?? false;

  const total = all.length;
  const countTruncated = total > limit;
  const windowed = countTruncated ? all.slice(-limit) : all.slice();
  const budgeted = applyByteBudget(windowed, maxBytes);

  const messages = [];
  for (const entry of budgeted.kept) {
    messages.push({
      role: entry.role,
      text: entry.text,
      timestamp: includeTimestamps ? entry.timestamp : null,
    });
  }

  return {
    messages,
    truncated: countTruncated || budgeted.bytesTruncated,
    count_truncated: countTruncated,
    bytes_truncated: budgeted.bytesTruncated,
    total_messages: total,
    total_messages_exact: true,
  };
}
93
+ const EMPTY_RESULT = { // result shape for a transcript path that does not exist
94
+ messages: [], // NOTE(review): this array (and the object) is one shared instance; the readers return it by reference, so a caller mutation would persist
95
+ truncated: false,
96
+ count_truncated: false,
97
+ bytes_truncated: false,
98
+ total_messages: 0,
99
+ total_messages_exact: true, // zero messages is an exact count
100
+ };
101
+ // Split a buffer on the newline byte (0x0A). Safe for UTF-8 because 0x0A never
102
+ // appears inside a multi-byte sequence (continuation/lead bytes are all ≥ 0x80).
103
+ // The trailing segment (after the last newline) is always included, possibly
104
+ // empty. Returned as views; callers copy the one they retain across reads.
105
/**
 * Split a buffer into segments on the newline byte (0x0A).
 *
 * Uses the native `indexOf` byte search rather than inspecting every byte in a
 * JavaScript loop. Splitting on 0x0A is safe for UTF-8 input because that byte
 * never occurs inside a multi-byte sequence.
 *
 * @param {Buffer} buf - Bytes to split.
 * @returns {Buffer[]} Segments between newlines, newline bytes excluded. The
 *   segment after the last newline is always present (possibly empty), so the
 *   result has exactly one more element than there are newlines. Segments are
 *   views over `buf`, not copies.
 */
function splitBufferByNewline(buf) {
  const segments = [];
  let start = 0;
  let newlineAt = buf.indexOf(0x0a, start);
  while (newlineAt !== -1) {
    segments.push(buf.subarray(start, newlineAt));
    start = newlineAt + 1;
    newlineAt = buf.indexOf(0x0a, start);
  }
  segments.push(buf.subarray(start));
  return segments;
}
117
+ // Reverse-tail reader: walk the file backward in chunks, decoding only complete
118
+ // lines, until we've collected `limit` messages or reached the start of file.
119
+ // `parseLine` is the same per-line→message logic the full-scan path uses, so the
120
+ // returned messages are byte-identical to a full scan; only the SCAN STRATEGY
121
+ // differs. UTF-8 safety: incomplete leftmost lines are carried as raw BYTES and
122
+ // only decoded once a newline to their left completes them (or BOF is reached),
123
+ // so a multi-byte char split across a chunk boundary is always reassembled
124
+ // before decoding.
125
/**
 * Reverse-tail reader: collect the newest `limit` messages by reading the file
 * backward in chunks instead of loading and parsing all of it.
 *
 * Lines are split on the raw newline byte and only complete lines are decoded;
 * the incomplete leftmost line of each chunk is carried as bytes until a chunk
 * further left completes it (or the start of file is reached), so a multi-byte
 * character split across a chunk boundary is reassembled before decoding.
 *
 * @param {string} path - File to read. Must exist; `openSync` throws otherwise.
 * @param {(line: string) => (object|null)} parseLine - Per-line parser that
 *   returns a `{ role, text, timestamp }` message or null to skip the line.
 * @param {{limit?: number, maxBytes?: number, includeTimestamps?: boolean,
 *   chunkSize?: number}} opts - Non-finite numeric options fall back to the
 *   module defaults; `limit` and `maxBytes` are clamped, `chunkSize` is floored
 *   at MIN_CHUNK_SIZE.
 * @returns {object} Same shape as the full-scan result. When the scan stopped
 *   because `limit` was reached, `total_messages` is null and
 *   `total_messages_exact` / `count_truncated` report that older messages may
 *   exist above the window.
 */
function readTailScan(path, parseLine, opts) {
  const limit = clamp(Math.floor(finiteOr(opts.limit, DEFAULT_LIMIT)), 1, MAX_LIMIT);
  const maxBytes = clamp(Math.floor(finiteOr(opts.maxBytes, DEFAULT_MAX_BYTES)), MIN_MAX_BYTES, MAX_MAX_BYTES);
  const includeTimestamps = opts.includeTimestamps ?? false;
  const chunkSize = Math.max(MIN_CHUNK_SIZE, Math.floor(finiteOr(opts.chunkSize, DEFAULT_CHUNK_SIZE)));
  const newestFirst = [];
  // `hitLimit` means we stopped because `limit` messages were collected, so more
  // may exist above the window. Exactness keys on this, not on reaching offset
  // 0: a small file fits in one chunk, so every byte can be read while older
  // messages in that chunk are still skipped.
  let hitLimit = false;
  const fd = openSync(path, "r");
  try {
    let pos = fstatSync(fd).size;
    let leftover = Buffer.alloc(0); // bytes of the not-yet-complete leftmost line
    while (pos > 0 && !hitLimit) {
      const readSize = Math.min(chunkSize, pos);
      pos -= readSize;
      const chunk = Buffer.allocUnsafe(readSize);
      // readSync may return fewer bytes than requested, and returns 0 at EOF
      // (e.g. the file shrank after fstat). Keep reading until the chunk is
      // full or EOF, and never decode the uninitialized remainder of `chunk`.
      let filled = 0;
      while (filled < readSize) {
        const got = readSync(fd, chunk, filled, readSize - filled, pos + filled);
        if (got === 0) {
          break;
        }
        filled += got;
      }
      // On a short read the bytes to the right of this chunk no longer exist,
      // so the carried `leftover` is not contiguous with it and is dropped
      // rather than spliced onto unrelated data.
      const buf = filled < readSize
        ? chunk.subarray(0, filled)
        : Buffer.concat([chunk, leftover]);
      const segments = splitBufferByNewline(buf);
      // segments[0] is the new leftmost partial (it extends further left unless
      // we are at the start of file); copy it so `buf` is not retained.
      leftover = Buffer.from(segments[0]);
      // segments[1..] are complete lines; walk right → left so newest is first.
      for (let i = segments.length - 1; i >= 1; i--) {
        const line = segments[i].toString("utf8");
        if (!line) {
          continue;
        }
        const m = parseLine(line);
        if (m) {
          newestFirst.push(m);
          if (newestFirst.length >= limit) {
            hitLimit = true;
            break;
          }
        }
      }
    }
    // Whole file consumed without capping: the final leftover is the file's
    // first line; process it so the count is complete and exact.
    if (!hitLimit && pos === 0) {
      const line = leftover.toString("utf8");
      if (line) {
        const m = parseLine(line);
        if (m) {
          newestFirst.push(m);
        }
      }
    }
  } finally {
    closeSync(fd);
  }
  const exact = !hitLimit; // every message accounted for iff we never capped
  const chronological = newestFirst.slice().reverse();
  const { kept, bytesTruncated } = applyByteBudget(chronological, maxBytes);
  const messages = kept.map((m) => ({
    role: m.role,
    text: m.text,
    timestamp: includeTimestamps ? m.timestamp : null,
  }));
  return {
    messages,
    truncated: !exact || bytesTruncated,
    count_truncated: !exact,
    bytes_truncated: bytesTruncated,
    total_messages: exact ? newestFirst.length : null,
    total_messages_exact: exact,
  };
}
197
+ // A bare number is accepted as a legacy `{ limit }` for backward compat with
198
+ // older call sites/tests that passed a message count positionally.
199
/**
 * Normalise the reader's second argument into an options object.
 *
 * A bare number is the legacy positional message count and becomes
 * `{ limit: n }`; null/undefined become an empty object; anything else is
 * returned as-is (same reference, not copied).
 *
 * @param {number|object|null|undefined} opts - Caller-supplied options.
 * @returns {object} Options object.
 */
function normalizeOptions(opts) {
  if (opts === undefined || opts === null) {
    return {};
  }
  return typeof opts === "number" ? { limit: opts } : opts;
}
5
204
  function extractTextFromClaudeContent(content) {
6
205
  if (typeof content === "string")
7
206
  return content;
@@ -18,36 +217,43 @@ function extractTextFromClaudeContent(content) {
18
217
  }
19
218
  return parts.join("\n");
20
219
  }
21
- export function readClaudeTranscript(path, limit = 100) {
22
- if (!existsSync(path)) {
23
- return { messages: [], truncated: false, total_messages: 0 };
220
+ // Per-line parse for Claude transcripts. Returns null for any line that isn't a
221
+ // non-empty user/assistant message (malformed JSON, wrong type/role, empty
222
+ // text). Shared by the full-scan and tail-scan paths so they agree exactly.
223
/**
 * Parse one line of a Claude transcript into a message.
 *
 * @param {string} line - A single non-empty line of the transcript file.
 * @returns {{role: string, text: string, timestamp: *}|null} The message, or
 *   null when the line is not a non-empty user/assistant message: malformed
 *   JSON, JSON that is not an object (including the literal `null`), wrong
 *   `type` or `message.role`, or empty extracted text.
 */
function parseClaudeLine(line) {
  let obj;
  try {
    obj = JSON.parse(line);
  } catch {
    return null;
  }
  // JSON.parse succeeds on `null` and on bare primitives; reading `.type` off
  // null would throw and abort the whole read, so reject non-objects up front.
  if (obj === null || typeof obj !== "object") {
    return null;
  }
  if (obj.type !== "user" && obj.type !== "assistant") {
    return null;
  }
  const role = obj.message?.role;
  if (role !== "user" && role !== "assistant") {
    return null;
  }
  const text = extractTextFromClaudeContent(obj.message?.content);
  if (!text) {
    return null;
  }
  return { role, text, timestamp: obj.timestamp ?? null };
}
241
/**
 * Read a Claude transcript file and return its most recent messages.
 *
 * @param {string} path - Path to the transcript file.
 * @param {number|object} [opts] - Options object (`limit`, `maxBytes`,
 *   `includeTimestamps`, `tailScan`, `chunkSize`); a bare number is accepted as
 *   a legacy `limit`.
 * @returns {object} Result with `messages`, `truncated`, `count_truncated`,
 *   `bytes_truncated`, `total_messages` and `total_messages_exact`. A missing
 *   file yields an empty result rather than an error.
 */
export function readClaudeTranscript(path, opts) {
  const options = normalizeOptions(opts);
  if (!existsSync(path)) {
    // Hand back a fresh object and array: returning the shared EMPTY_RESULT by
    // reference would let one caller's mutation leak into every later result.
    return { ...EMPTY_RESULT, messages: [] };
  }
  if (options.tailScan) {
    return readTailScan(path, parseClaudeLine, options);
  }
  const raw = readFileSync(path, "utf8");
  const messages = [];
  for (const line of raw.split("\n")) {
    if (!line) {
      continue;
    }
    const m = parseClaudeLine(line);
    if (m) {
      messages.push(m);
    }
  }
  return finalize(messages, options);
}
52
258
  // Codex CLI injects two kinds of blocks into the first user message of a
53
259
  // rollout that look identical to user input at the role/type level:
@@ -83,37 +289,44 @@ function extractTextFromCodexContent(content) {
83
289
  }
84
290
  return parts.join("\n");
85
291
  }
86
- export function readCodexTranscript(path, limit = 100) {
87
- if (!existsSync(path)) {
88
- return { messages: [], truncated: false, total_messages: 0 };
292
+ // Per-line parse for Codex rollouts. Drops non-message response_items, wrong
293
+ // roles, injected AGENTS.md/environment_context blocks, and empty text. Shared
294
+ // by the full-scan and tail-scan paths.
295
/**
 * Parse one line of a Codex rollout into a message.
 *
 * @param {string} line - A single non-empty line of the rollout file.
 * @returns {{role: string, text: string, timestamp: *}|null} The message, or
 *   null when the line is not a usable user/assistant message: malformed JSON,
 *   JSON that is not an object (including the literal `null`), a record that
 *   is not a `response_item` carrying a `message` payload, a wrong role, or
 *   empty extracted text.
 */
function parseCodexLine(line) {
  let obj;
  try {
    obj = JSON.parse(line);
  } catch {
    return null;
  }
  // JSON.parse succeeds on `null` and on bare primitives; reading `.type` off
  // null would throw and abort the whole read, so reject non-objects up front.
  if (obj === null || typeof obj !== "object") {
    return null;
  }
  if (obj.type !== "response_item") {
    return null;
  }
  const p = obj.payload;
  if (!p || p.type !== "message") {
    return null;
  }
  const role = p.role;
  if (role !== "user" && role !== "assistant") {
    return null;
  }
  const text = extractTextFromCodexContent(p.content);
  if (!text) {
    return null;
  }
  return { role, text, timestamp: obj.timestamp ?? null };
}
316
/**
 * Read a Codex rollout file and return its most recent messages.
 *
 * @param {string} path - Path to the rollout file.
 * @param {number|object} [opts] - Options object (`limit`, `maxBytes`,
 *   `includeTimestamps`, `tailScan`, `chunkSize`); a bare number is accepted as
 *   a legacy `limit`.
 * @returns {object} Result with `messages`, `truncated`, `count_truncated`,
 *   `bytes_truncated`, `total_messages` and `total_messages_exact`. A missing
 *   file yields an empty result rather than an error.
 */
export function readCodexTranscript(path, opts) {
  const options = normalizeOptions(opts);
  if (!existsSync(path)) {
    // Hand back a fresh object and array: returning the shared EMPTY_RESULT by
    // reference would let one caller's mutation leak into every later result.
    return { ...EMPTY_RESULT, messages: [] };
  }
  if (options.tailScan) {
    return readTailScan(path, parseCodexLine, options);
  }
  const raw = readFileSync(path, "utf8");
  const messages = [];
  for (const line of raw.split("\n")) {
    if (!line) {
      continue;
    }
    const m = parseCodexLine(line);
    if (m) {
      messages.push(m);
    }
  }
  return finalize(messages, options);
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "oxtail",
3
- "version": "0.9.1",
3
+ "version": "0.10.1",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "description": "Coordination layer for parallel AI coding agent sessions, exposed over MCP.",