@c3-oss/prosa 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin/prosa.js CHANGED
@@ -98,16 +98,27 @@ var init_hash = __esm({
98
98
  // src/core/cas/index.ts
99
99
  var cas_exports = {};
100
100
  __export(cas_exports, {
101
+ createPendingObjects: () => createPendingObjects,
102
+ ensureDir: () => ensureDir,
103
+ flushPendingObjects: () => flushPendingObjects,
101
104
  getBytes: () => getBytes,
102
105
  getJson: () => getJson,
103
106
  getObjectMeta: () => getObjectMeta,
104
107
  getText: () => getText,
105
108
  putBytes: () => putBytes,
106
109
  putJson: () => putJson,
107
- putText: () => putText
110
+ putText: () => putText,
111
+ stageBytes: () => stageBytes,
112
+ stageJson: () => stageJson,
113
+ stageText: () => stageText
108
114
  });
109
115
  import { mkdir as mkdir2, readFile as readFile2, writeFile as writeFile2 } from "fs/promises";
110
116
  import path2 from "path";
117
+ async function ensureDir(absoluteDir) {
118
+ if (ensuredDirs.has(absoluteDir)) return;
119
+ await mkdir2(absoluteDir, { recursive: true });
120
+ ensuredDirs.add(absoluteDir);
121
+ }
111
122
  async function putBytes(bundle, bytes, options = {}) {
112
123
  const hash = blake3Hex(bytes);
113
124
  const objectId = objectIdFromHash(hash);
@@ -121,7 +132,7 @@ async function putBytes(bundle, bytes, options = {}) {
121
132
  const { bytes: stored, compression } = compressBytes(bytes);
122
133
  const storagePath = objectStoragePath(hash, compression);
123
134
  const absolutePath = path2.join(bundle.path, storagePath);
124
- await mkdir2(path2.dirname(absolutePath), { recursive: true });
135
+ await ensureDir(path2.dirname(absolutePath));
125
136
  await writeFile2(absolutePath, stored);
126
137
  prepare(
127
138
  bundle.db,
@@ -185,18 +196,126 @@ function getObjectMeta(bundle, objectId) {
185
196
  FROM objects WHERE object_id = ?`
186
197
  ).get(objectId) ?? null;
187
198
  }
199
+ function createPendingObjects() {
200
+ return { byId: /* @__PURE__ */ new Map() };
201
+ }
202
+ function stageBytes(pending, bytes, options = {}) {
203
+ const buf = Buffer.isBuffer(bytes) ? bytes : Buffer.from(bytes);
204
+ const hash = blake3Hex(buf);
205
+ const objectId = objectIdFromHash(hash);
206
+ if (!pending.byId.has(objectId)) {
207
+ pending.byId.set(objectId, {
208
+ objectId,
209
+ hash,
210
+ bytes: buf,
211
+ mimeType: options.mimeType ?? null,
212
+ encoding: options.encoding ?? null
213
+ });
214
+ }
215
+ return objectId;
216
+ }
217
+ function stageText(pending, text, options = {}) {
218
+ return stageBytes(pending, Buffer.from(text, "utf8"), {
219
+ mimeType: options.mimeType ?? "text/plain; charset=utf-8",
220
+ encoding: "utf-8"
221
+ });
222
+ }
223
+ function stageJson(pending, value) {
224
+ return stageBytes(pending, Buffer.from(JSON.stringify(value), "utf8"), {
225
+ mimeType: "application/json",
226
+ encoding: "utf-8"
227
+ });
228
+ }
229
+ async function flushPendingObjects(bundle, pending) {
230
+ if (pending.byId.size === 0) return;
231
+ const ids = [...pending.byId.keys()];
232
+ const existingIds = queryExistingObjectIds(bundle, ids);
233
+ const toWrite = [];
234
+ for (const obj of pending.byId.values()) {
235
+ if (existingIds.has(obj.objectId)) continue;
236
+ const { bytes: compressedBytes, compression } = compressBytes(obj.bytes);
237
+ const storagePath = objectStoragePath(obj.hash, compression);
238
+ toWrite.push({
239
+ staged: obj,
240
+ compression,
241
+ compressedBytes,
242
+ storagePath,
243
+ absolutePath: path2.join(bundle.path, storagePath)
244
+ });
245
+ }
246
+ if (toWrite.length > 0) {
247
+ await writeFilesParallel(toWrite);
248
+ }
249
+ const insertObject = prepare(
250
+ bundle.db,
251
+ `INSERT OR IGNORE INTO objects (
252
+ object_id, hash_alg, hash, size_bytes, compressed_size_bytes,
253
+ compression, mime_type, encoding, storage_path, created_at
254
+ ) VALUES (?, 'blake3', ?, ?, ?, ?, ?, ?, ?, ?)`
255
+ );
256
+ const now = (/* @__PURE__ */ new Date()).toISOString();
257
+ for (const p of toWrite) {
258
+ insertObject.run(
259
+ p.staged.objectId,
260
+ p.staged.hash,
261
+ p.staged.bytes.byteLength,
262
+ p.compression === "zstd" ? p.compressedBytes.byteLength : null,
263
+ p.compression,
264
+ p.staged.mimeType,
265
+ p.staged.encoding,
266
+ p.storagePath,
267
+ now
268
+ );
269
+ }
270
+ }
271
+ function queryExistingObjectIds(bundle, ids) {
272
+ const found = /* @__PURE__ */ new Set();
273
+ if (ids.length === 0) return found;
274
+ const CHUNK = 500;
275
+ for (let start = 0; start < ids.length; start += CHUNK) {
276
+ const slice = ids.slice(start, start + CHUNK);
277
+ const placeholders = slice.map(() => "?").join(",");
278
+ const rows = bundle.db.prepare(
279
+ `SELECT object_id FROM objects WHERE object_id IN (${placeholders})`
280
+ ).all(...slice);
281
+ for (const row of rows) found.add(row.object_id);
282
+ }
283
+ return found;
284
+ }
285
+ async function writeFilesParallel(tasks) {
286
+ let cursor = 0;
287
+ const workers = [];
288
+ const limit = Math.min(FS_WRITE_CONCURRENCY, tasks.length);
289
+ for (let w = 0; w < limit; w++) {
290
+ workers.push(
291
+ (async () => {
292
+ while (true) {
293
+ const i = cursor++;
294
+ if (i >= tasks.length) return;
295
+ const task = tasks[i];
296
+ await ensureDir(path2.dirname(task.absolutePath));
297
+ await writeFile2(task.absolutePath, task.compressedBytes);
298
+ }
299
+ })()
300
+ );
301
+ }
302
+ await Promise.all(workers);
303
+ }
304
+ var ensuredDirs, FS_WRITE_CONCURRENCY;
188
305
  var init_cas = __esm({
189
306
  "src/core/cas/index.ts"() {
190
307
  "use strict";
191
308
  init_db();
192
309
  init_compress();
193
310
  init_hash();
311
+ ensuredDirs = /* @__PURE__ */ new Set();
312
+ FS_WRITE_CONCURRENCY = 16;
194
313
  }
195
314
  });
196
315
 
197
316
  // src/services/indexing.ts
198
- import { mkdir as mkdir4, rm, writeFile as writeFile4 } from "fs/promises";
199
- import path12 from "path";
317
+ import { mkdir as mkdir4, rm as rm2, writeFile as writeFile5 } from "fs/promises";
318
+ import path13 from "path";
200
319
  function enableFts5Triggers(bundle) {
201
320
  bundle.db.exec(FTS5_TRIGGER_SQL);
202
321
  }
@@ -291,7 +410,7 @@ async function rebuildTantivyIndex(bundle) {
291
410
  try {
292
411
  const tantivy = await import("@oxdev03/node-tantivy-binding");
293
412
  const schema = new tantivy.SchemaBuilder().addTextField("doc_id", { stored: true, tokenizerName: "raw" }).addTextField("entity_type", { stored: true, tokenizerName: "raw" }).addTextField("entity_id", { stored: true, tokenizerName: "raw" }).addTextField("session_id", { stored: true, tokenizerName: "raw" }).addTextField("project_id", { stored: true, tokenizerName: "raw" }).addTextField("timestamp", { stored: true, tokenizerName: "raw" }).addTextField("role", { stored: true, tokenizerName: "raw" }).addTextField("tool_name", { stored: true, tokenizerName: "raw" }).addTextField("canonical_tool_type", { stored: true, tokenizerName: "raw" }).addTextField("field_kind", { stored: true, tokenizerName: "raw" }).addTextField("text", { stored: true }).build();
294
- await rm(bundle.paths.tantivy, { recursive: true, force: true });
413
+ await rm2(bundle.paths.tantivy, { recursive: true, force: true });
295
414
  await mkdir4(bundle.paths.tantivy, { recursive: true });
296
415
  const index = new tantivy.Index(schema, bundle.paths.tantivy, false);
297
416
  const writer = index.writer(5e7, 1);
@@ -320,8 +439,8 @@ async function rebuildTantivyIndex(bundle) {
320
439
  }
321
440
  writer.commit();
322
441
  index.reload();
323
- await writeFile4(
324
- path12.join(bundle.paths.tantivy, "prosa-index.json"),
442
+ await writeFile5(
443
+ path13.join(bundle.paths.tantivy, "prosa-index.json"),
325
444
  `${JSON.stringify(
326
445
  {
327
446
  engine: "tantivy",
@@ -1001,7 +1120,8 @@ var PROSA_PARSER_VERSION = "0.1.0";
1001
1120
  var PROSA_SCHEMA_VERSION = 2;
1002
1121
 
1003
1122
  // src/cli/commands/compile.ts
1004
- import path13 from "path";
1123
+ import os2 from "os";
1124
+ import path14 from "path";
1005
1125
  import { Command } from "commander";
1006
1126
 
1007
1127
  // src/core/bundle.ts
@@ -1627,8 +1747,9 @@ async function recordError(bundle, batchId, args) {
1627
1747
  // src/core/ingest/idempotency.ts
1628
1748
  init_compress();
1629
1749
  init_hash();
1750
+ init_cas();
1630
1751
  init_db();
1631
- import { access as access2, mkdir as mkdir3, readFile as readFile3, stat as stat2, writeFile as writeFile3 } from "fs/promises";
1752
+ import { access as access2, readFile as readFile3, stat as stat2, writeFile as writeFile3 } from "fs/promises";
1632
1753
  import path3 from "path";
1633
1754
  async function registerSourceFile(bundle, args) {
1634
1755
  const st = await stat2(args.absolutePath);
@@ -1714,7 +1835,7 @@ async function preserveRawSourceBytes(bundle, bytes) {
1714
1835
  const { bytes: stored, compression } = compressBytes(bytes);
1715
1836
  const storagePath = rawSourceStoragePath(hash, compression);
1716
1837
  const absolutePath = path3.join(bundle.path, storagePath);
1717
- await mkdir3(path3.dirname(absolutePath), { recursive: true });
1838
+ await ensureDir(path3.dirname(absolutePath));
1718
1839
  if (!await fileExists(absolutePath)) {
1719
1840
  await writeFile3(absolutePath, stored);
1720
1841
  }
@@ -1814,17 +1935,34 @@ async function readdirSafe(dir) {
1814
1935
 
1815
1936
  // src/importers/claude/index.ts
1816
1937
  var PREVIEW_MAX = 4e3;
1817
- async function compileClaude(bundle, root) {
1938
+ async function compileClaude(bundle, root, options = {}) {
1939
+ const logger = options.logger;
1818
1940
  const batch = startBatch(bundle, "claude", [root]);
1819
1941
  const counts = emptyCounts();
1942
+ logger?.info({ batch_id: batch.batch_id, root }, "claude batch started");
1820
1943
  try {
1821
1944
  for await (const file of discoverClaudeFiles(root)) {
1822
1945
  counts.source_files_seen++;
1946
+ logger?.debug(
1947
+ {
1948
+ path: file.filePath,
1949
+ project_slug: file.projectSlug,
1950
+ is_subagent: file.isSubagent
1951
+ },
1952
+ "claude source file discovered"
1953
+ );
1823
1954
  try {
1824
- const fc = await compileClaudeFile(bundle, batch, file);
1955
+ const fc = await compileClaudeFile(bundle, batch, file, logger);
1825
1956
  addCounts(counts, fc);
1826
1957
  } catch (error) {
1827
1958
  counts.errors++;
1959
+ logger?.warn(
1960
+ {
1961
+ err: error,
1962
+ path: file.filePath
1963
+ },
1964
+ "claude source file failed"
1965
+ );
1828
1966
  await recordError(bundle, batch.batch_id, {
1829
1967
  kind: "claude_file_failed",
1830
1968
  message: error instanceof Error ? error.message : String(error),
@@ -1833,9 +1971,12 @@ async function compileClaude(bundle, root) {
1833
1971
  }
1834
1972
  }
1835
1973
  linkSubagentParents(bundle);
1974
+ logger?.debug({ batch_id: batch.batch_id }, "claude subagent parent links refreshed");
1836
1975
  finishBatch(bundle, batch, counts, "completed");
1976
+ logger?.info({ batch_id: batch.batch_id, counts }, "claude batch completed");
1837
1977
  } catch (error) {
1838
1978
  finishBatch(bundle, batch, counts, "failed");
1979
+ logger?.error({ err: error, batch_id: batch.batch_id, counts }, "claude batch failed");
1839
1980
  throw error;
1840
1981
  }
1841
1982
  return { batch, counts };
@@ -1889,7 +2030,7 @@ function addCounts(target, source) {
1889
2030
  target.edges += source.edges;
1890
2031
  target.errors += source.errors;
1891
2032
  }
1892
- async function compileClaudeFile(bundle, batch, file) {
2033
+ async function compileClaudeFile(bundle, batch, file, logger) {
1893
2034
  const counts = emptyFileCounts();
1894
2035
  const { row: sourceFile, alreadyKnown } = await registerSourceFile(bundle, {
1895
2036
  sourceTool: "claude",
@@ -1899,9 +2040,17 @@ async function compileClaudeFile(bundle, batch, file) {
1899
2040
  });
1900
2041
  if (alreadyKnown) {
1901
2042
  counts.source_files_skipped = 1;
2043
+ logger?.debug(
2044
+ { path: file.filePath, source_file_id: sourceFile.source_file_id },
2045
+ "claude source file skipped"
2046
+ );
1902
2047
  return counts;
1903
2048
  }
1904
2049
  counts.source_files_imported = 1;
2050
+ logger?.debug(
2051
+ { path: file.filePath, source_file_id: sourceFile.source_file_id },
2052
+ "claude source file registered"
2053
+ );
1905
2054
  const text = await readFile4(file.filePath, "utf8");
1906
2055
  const rawLines = text.split("\n");
1907
2056
  const lines = rawLines[rawLines.length - 1] === "" ? rawLines.slice(0, -1) : rawLines;
@@ -1918,7 +2067,8 @@ async function compileClaudeFile(bundle, batch, file) {
1918
2067
  artifacts: [],
1919
2068
  edges: [],
1920
2069
  searchDocs: [],
1921
- uuidToMessageId: /* @__PURE__ */ new Map()
2070
+ uuidToMessageId: /* @__PURE__ */ new Map(),
2071
+ objects: createPendingObjects()
1922
2072
  };
1923
2073
  let modelFirst = null;
1924
2074
  let modelLast = null;
@@ -1933,7 +2083,7 @@ async function compileClaudeFile(bundle, batch, file) {
1933
2083
  const lineNo = i + 1;
1934
2084
  const ordinal = i;
1935
2085
  const lineBytes = Buffer.from(line, "utf8");
1936
- const rawObjectId = await putBytes(bundle, lineBytes, {
2086
+ const rawObjectId = stageBytes(pending.objects, lineBytes, {
1937
2087
  mimeType: "application/jsonl-line",
1938
2088
  encoding: "utf-8"
1939
2089
  });
@@ -1944,7 +2094,7 @@ async function compileClaudeFile(bundle, batch, file) {
1944
2094
  } catch {
1945
2095
  parserStatus = "failed";
1946
2096
  }
1947
- const decodedObjectId = parsed != null && parserStatus === "ok" ? await putJson(bundle, parsed) : null;
2097
+ const decodedObjectId = null;
1948
2098
  const nativeId = parsed?.uuid ?? null;
1949
2099
  const rawRecordId2 = rawRecordId(sourceFile.source_file_id, ordinal, rawObjectId);
1950
2100
  pending.rawRecords.push({
@@ -2042,7 +2192,7 @@ async function compileClaudeFile(bundle, batch, file) {
2042
2192
  raw_record_id: rawRecordId2
2043
2193
  });
2044
2194
  if (content.length > PREVIEW_MAX) {
2045
- const fullId = await putText(bundle, content);
2195
+ const fullId = stageText(pending.objects, content);
2046
2196
  const last = pending.blocks[pending.blocks.length - 1];
2047
2197
  if (last) last.text_object_id = fullId;
2048
2198
  }
@@ -2182,6 +2332,7 @@ async function compileClaudeFile(bundle, batch, file) {
2182
2332
  pending.session.git_branch_initial ??= branchInitial;
2183
2333
  }
2184
2334
  buildSearchDocs(pending);
2335
+ await flushPendingObjects(bundle, pending.objects);
2185
2336
  transactional(bundle.db, () => {
2186
2337
  flushPending(bundle, pending, { modelFirst, modelLast });
2187
2338
  });
@@ -2194,6 +2345,10 @@ async function compileClaudeFile(bundle, batch, file) {
2194
2345
  counts.tool_results = pending.toolResults.length;
2195
2346
  counts.artifacts = pending.artifacts.length;
2196
2347
  counts.edges = pending.edges.length;
2348
+ logger?.debug(
2349
+ { path: file.filePath, source_file_id: sourceFile.source_file_id, counts },
2350
+ "claude source file imported"
2351
+ );
2197
2352
  return counts;
2198
2353
  }
2199
2354
  function createSessionFromFirstRecord(file, parsed, meta, ts, rawRecordId2) {
@@ -2241,7 +2396,7 @@ async function processContentBlock(bundle, sessionId2, messageId2, eventId2, blo
2241
2396
  event_id: null,
2242
2397
  ordinal: blockOrdinal,
2243
2398
  block_type: "text",
2244
- text_object_id: text.length > PREVIEW_MAX ? await putText(bundle, text) : null,
2399
+ text_object_id: text.length > PREVIEW_MAX ? stageText(pending.objects, text) : null,
2245
2400
  text_inline: text.slice(0, PREVIEW_MAX),
2246
2401
  is_error: 0,
2247
2402
  visibility: "default",
@@ -2257,7 +2412,7 @@ async function processContentBlock(bundle, sessionId2, messageId2, eventId2, blo
2257
2412
  event_id: null,
2258
2413
  ordinal: blockOrdinal,
2259
2414
  block_type: "thinking",
2260
- text_object_id: text.length > PREVIEW_MAX ? await putText(bundle, text) : null,
2415
+ text_object_id: text.length > PREVIEW_MAX ? stageText(pending.objects, text) : null,
2261
2416
  text_inline: text.slice(0, PREVIEW_MAX),
2262
2417
  is_error: 0,
2263
2418
  visibility: "hidden_by_default",
@@ -2269,7 +2424,7 @@ async function processContentBlock(bundle, sessionId2, messageId2, eventId2, blo
2269
2424
  const tu = block;
2270
2425
  const sourceCallId = tu.id ?? `${blockOrdinal}`;
2271
2426
  const toolName = tu.name ?? "unknown";
2272
- const argsId = tu.input != null ? await putJson(bundle, tu.input) : null;
2427
+ const argsId = tu.input != null ? stageJson(pending.objects, tu.input) : null;
2273
2428
  const command = inferCommandFromArgs(toolName, tu.input);
2274
2429
  const filePath = inferPathFromArgs(tu.input);
2275
2430
  const tcId = toolCallId(sessionId2, sourceCallId);
@@ -2310,13 +2465,14 @@ async function processContentBlock(bundle, sessionId2, messageId2, eventId2, blo
2310
2465
  const sourceCallId = tr.tool_use_id ?? null;
2311
2466
  const isError = tr.is_error === true ? 1 : 0;
2312
2467
  const text = stringifyOrNull(tr.content) ?? "";
2468
+ const overflowId = text.length > PREVIEW_MAX ? stageText(pending.objects, text) : null;
2313
2469
  pending.blocks.push({
2314
2470
  block_id: blkId,
2315
2471
  message_id: messageId2,
2316
2472
  event_id: null,
2317
2473
  ordinal: blockOrdinal,
2318
2474
  block_type: "tool_result",
2319
- text_object_id: text.length > PREVIEW_MAX ? await putText(bundle, text) : null,
2475
+ text_object_id: overflowId,
2320
2476
  text_inline: text.slice(0, PREVIEW_MAX),
2321
2477
  is_error: isError,
2322
2478
  visibility: "default",
@@ -2335,7 +2491,7 @@ async function processContentBlock(bundle, sessionId2, messageId2, eventId2, blo
2335
2491
  duration_ms: null,
2336
2492
  stdout_object_id: null,
2337
2493
  stderr_object_id: null,
2338
- output_object_id: text.length > PREVIEW_MAX ? await putText(bundle, text) : null,
2494
+ output_object_id: overflowId,
2339
2495
  preview: text.slice(0, PREVIEW_MAX),
2340
2496
  raw_record_id: rawRecordId2
2341
2497
  });
@@ -2750,17 +2906,27 @@ async function* walk(dir) {
2750
2906
 
2751
2907
  // src/importers/codex/index.ts
2752
2908
  var PREVIEW_MAX2 = 4e3;
2753
- async function compileCodex(bundle, root) {
2909
+ async function compileCodex(bundle, root, options = {}) {
2910
+ const logger = options.logger;
2754
2911
  const batch = startBatch(bundle, "codex", [root]);
2755
2912
  const counts = emptyCounts();
2913
+ logger?.info({ batch_id: batch.batch_id, root }, "codex batch started");
2756
2914
  try {
2757
2915
  for await (const filePath of discoverCodexSessions(root)) {
2758
2916
  counts.source_files_seen++;
2917
+ logger?.debug({ path: filePath }, "codex source file discovered");
2759
2918
  try {
2760
- const fileCounts = await compileCodexFile(bundle, batch, filePath);
2919
+ const fileCounts = await compileCodexFile(bundle, batch, filePath, logger);
2761
2920
  addCounts2(counts, fileCounts);
2762
2921
  } catch (error) {
2763
2922
  counts.errors++;
2923
+ logger?.warn(
2924
+ {
2925
+ err: error,
2926
+ path: filePath
2927
+ },
2928
+ "codex source file failed"
2929
+ );
2764
2930
  await recordError(bundle, batch.batch_id, {
2765
2931
  kind: "codex_file_failed",
2766
2932
  message: error instanceof Error ? error.message : String(error),
@@ -2769,9 +2935,12 @@ async function compileCodex(bundle, root) {
2769
2935
  }
2770
2936
  }
2771
2937
  linkSubagentParents2(bundle);
2938
+ logger?.debug({ batch_id: batch.batch_id }, "codex subagent parent links refreshed");
2772
2939
  finishBatch(bundle, batch, counts, "completed");
2940
+ logger?.info({ batch_id: batch.batch_id, counts }, "codex batch completed");
2773
2941
  } catch (error) {
2774
2942
  finishBatch(bundle, batch, counts, "failed");
2943
+ logger?.error({ err: error, batch_id: batch.batch_id, counts }, "codex batch failed");
2775
2944
  throw error;
2776
2945
  }
2777
2946
  return { batch, counts };
@@ -2824,7 +2993,7 @@ function addCounts2(target, source) {
2824
2993
  target.edges += source.edges;
2825
2994
  target.errors += source.errors;
2826
2995
  }
2827
- async function compileCodexFile(bundle, batch, filePath) {
2996
+ async function compileCodexFile(bundle, batch, filePath, logger) {
2828
2997
  const counts = emptyFileCounts2();
2829
2998
  const { row: sourceFileRow, alreadyKnown } = await registerSourceFile(bundle, {
2830
2999
  sourceTool: "codex",
@@ -2833,9 +3002,17 @@ async function compileCodexFile(bundle, batch, filePath) {
2833
3002
  });
2834
3003
  if (alreadyKnown) {
2835
3004
  counts.source_files_skipped = 1;
3005
+ logger?.debug(
3006
+ { path: filePath, source_file_id: sourceFileRow.source_file_id },
3007
+ "codex source file skipped"
3008
+ );
2836
3009
  return counts;
2837
3010
  }
2838
3011
  counts.source_files_imported = 1;
3012
+ logger?.debug(
3013
+ { path: filePath, source_file_id: sourceFileRow.source_file_id },
3014
+ "codex source file registered"
3015
+ );
2839
3016
  const text = await readFile5(filePath, "utf8");
2840
3017
  const rawLines = text.split("\n");
2841
3018
  const lines = rawLines[rawLines.length - 1] === "" ? rawLines.slice(0, -1) : rawLines;
@@ -2852,7 +3029,8 @@ async function compileCodexFile(bundle, batch, filePath) {
2852
3029
  toolResults: [],
2853
3030
  artifacts: [],
2854
3031
  edges: [],
2855
- searchDocs: []
3032
+ searchDocs: [],
3033
+ objects: createPendingObjects()
2856
3034
  };
2857
3035
  let sessionStartTs = null;
2858
3036
  let sessionEndTs = null;
@@ -2866,7 +3044,7 @@ async function compileCodexFile(bundle, batch, filePath) {
2866
3044
  const lineNo = i + 1;
2867
3045
  const ordinal = i;
2868
3046
  const lineBytes = Buffer.from(line, "utf8");
2869
- const rawObjectId = await putBytes(bundle, lineBytes, {
3047
+ const rawObjectId = stageBytes(pending.objects, lineBytes, {
2870
3048
  mimeType: "application/jsonl-line",
2871
3049
  encoding: "utf-8"
2872
3050
  });
@@ -2877,7 +3055,7 @@ async function compileCodexFile(bundle, batch, filePath) {
2877
3055
  } catch {
2878
3056
  parserStatus = "failed";
2879
3057
  }
2880
- const decodedObjectId = parsed != null && parserStatus === "ok" ? await putJson(bundle, parsed) : null;
3058
+ const decodedObjectId = null;
2881
3059
  const nativeId = parsed ? extractNativeId(parsed) : null;
2882
3060
  const rawRecordId2 = rawRecordId(sourceFileRow.source_file_id, ordinal, rawObjectId);
2883
3061
  pending.rawRecords.push({
@@ -3046,6 +3224,7 @@ async function compileCodexFile(bundle, batch, filePath) {
3046
3224
  pending.session.start_ts ??= sessionStartTs;
3047
3225
  }
3048
3226
  buildSearchDocs2(pending);
3227
+ await flushPendingObjects(bundle, pending.objects);
3049
3228
  transactional(bundle.db, () => {
3050
3229
  flushPending2(bundle, pending, {
3051
3230
  sessionEndTs,
@@ -3064,6 +3243,10 @@ async function compileCodexFile(bundle, batch, filePath) {
3064
3243
  counts.tool_results = pending.toolResults.length;
3065
3244
  counts.artifacts = pending.artifacts.length;
3066
3245
  counts.edges = pending.edges.length;
3246
+ logger?.debug(
3247
+ { path: filePath, source_file_id: sourceFileRow.source_file_id, counts },
3248
+ "codex source file imported"
3249
+ );
3067
3250
  return counts;
3068
3251
  }
3069
3252
  function handleResponseItem(_bundle, sessionId2, currentTurnId, rawRecordId2, ordinal, ts, ri, payloadObjectId, nextMsgOrdinal, currentModel, pending) {
@@ -3223,8 +3406,8 @@ async function handleEventMsg(bundle, sessionId2, currentTurnId, rawRecordId2, o
3223
3406
  const subtype = em.type ?? "unknown";
3224
3407
  if (subtype === "exec_command_end") {
3225
3408
  const sourceCallId = em.call_id ?? null;
3226
- const stdoutId = em.stdout ? await putText(bundle, em.stdout, { mimeType: "text/plain" }) : null;
3227
- const stderrId = em.stderr ? await putText(bundle, em.stderr, { mimeType: "text/plain" }) : null;
3409
+ const stdoutId = em.stdout ? stageText(pending.objects, em.stdout, { mimeType: "text/plain" }) : null;
3410
+ const stderrId = em.stderr ? stageText(pending.objects, em.stderr, { mimeType: "text/plain" }) : null;
3228
3411
  const preview = (em.formatted_output ?? em.aggregated_output ?? em.stdout ?? "").slice(
3229
3412
  0,
3230
3413
  PREVIEW_MAX2
@@ -3871,17 +4054,34 @@ async function readdirSafe2(dir) {
3871
4054
 
3872
4055
  // src/importers/cursor/index.ts
3873
4056
  var PREVIEW_MAX3 = 4e3;
3874
- async function compileCursor(bundle, root) {
4057
+ async function compileCursor(bundle, root, options = {}) {
4058
+ const logger = options.logger;
3875
4059
  const batch = startBatch(bundle, "cursor", [root]);
3876
4060
  const counts = emptyCounts();
4061
+ logger?.info({ batch_id: batch.batch_id, root }, "cursor batch started");
3877
4062
  try {
3878
4063
  for await (const store of discoverCursorStores(root)) {
3879
4064
  counts.source_files_seen++;
4065
+ logger?.debug(
4066
+ {
4067
+ path: store.filePath,
4068
+ workspace_id: store.workspaceId,
4069
+ agent_id: store.agentId
4070
+ },
4071
+ "cursor store discovered"
4072
+ );
3880
4073
  try {
3881
- const fc = await compileCursorStore(bundle, batch, store);
4074
+ const fc = await compileCursorStore(bundle, batch, store, logger);
3882
4075
  addCounts3(counts, fc);
3883
4076
  } catch (error) {
3884
4077
  counts.errors++;
4078
+ logger?.warn(
4079
+ {
4080
+ err: error,
4081
+ path: store.filePath
4082
+ },
4083
+ "cursor store failed"
4084
+ );
3885
4085
  await recordError(bundle, batch.batch_id, {
3886
4086
  kind: "cursor_store_failed",
3887
4087
  message: error instanceof Error ? error.message : String(error),
@@ -3890,8 +4090,10 @@ async function compileCursor(bundle, root) {
3890
4090
  }
3891
4091
  }
3892
4092
  finishBatch(bundle, batch, counts, "completed");
4093
+ logger?.info({ batch_id: batch.batch_id, counts }, "cursor batch completed");
3893
4094
  } catch (error) {
3894
4095
  finishBatch(bundle, batch, counts, "failed");
4096
+ logger?.error({ err: error, batch_id: batch.batch_id, counts }, "cursor batch failed");
3895
4097
  throw error;
3896
4098
  }
3897
4099
  return { batch, counts };
@@ -3926,7 +4128,7 @@ function addCounts3(target, source) {
3926
4128
  target.edges += source.edges;
3927
4129
  target.errors += source.errors;
3928
4130
  }
3929
- async function compileCursorStore(bundle, batch, store) {
4131
+ async function compileCursorStore(bundle, batch, store, logger) {
3930
4132
  const counts = emptyFileCounts3();
3931
4133
  const { row: sourceFile, alreadyKnown } = await registerSourceFile(bundle, {
3932
4134
  sourceTool: "cursor",
@@ -3936,9 +4138,17 @@ async function compileCursorStore(bundle, batch, store) {
3936
4138
  });
3937
4139
  if (alreadyKnown) {
3938
4140
  counts.source_files_skipped = 1;
4141
+ logger?.debug(
4142
+ { path: store.filePath, source_file_id: sourceFile.source_file_id },
4143
+ "cursor store skipped"
4144
+ );
3939
4145
  return counts;
3940
4146
  }
3941
4147
  counts.source_files_imported = 1;
4148
+ logger?.debug(
4149
+ { path: store.filePath, source_file_id: sourceFile.source_file_id },
4150
+ "cursor store registered"
4151
+ );
3942
4152
  const cdb = new Database2(store.filePath, { readonly: true, fileMustExist: true });
3943
4153
  try {
3944
4154
  const pending = {
@@ -3951,7 +4161,8 @@ async function compileCursorStore(bundle, batch, store) {
3951
4161
  toolCallsList: [],
3952
4162
  toolResults: [],
3953
4163
  artifacts: [],
3954
- searchDocs: []
4164
+ searchDocs: [],
4165
+ objects: createPendingObjects()
3955
4166
  };
3956
4167
  const metaRow = cdb.prepare(`SELECT value FROM meta WHERE key='0'`).get();
3957
4168
  let meta = {};
@@ -3963,7 +4174,7 @@ async function compileCursorStore(bundle, batch, store) {
3963
4174
  } catch {
3964
4175
  meta = {};
3965
4176
  }
3966
- const metaObjId = await putBytes(bundle, Buffer.from(metaText, "utf8"), {
4177
+ const metaObjId = stageBytes(pending.objects, Buffer.from(metaText, "utf8"), {
3967
4178
  mimeType: "application/json",
3968
4179
  encoding: "utf-8"
3969
4180
  });
@@ -4002,7 +4213,7 @@ async function compileCursorStore(bundle, batch, store) {
4002
4213
  const blob = blobs[i];
4003
4214
  if (!blob) continue;
4004
4215
  const ordinal = i + 1;
4005
- const blobObjectId = await putBytes(bundle, blob.data);
4216
+ const blobObjectId = stageBytes(pending.objects, blob.data);
4006
4217
  const blobRawId = rawRecordId(sourceFile.source_file_id, ordinal, blobObjectId);
4007
4218
  let parsed = null;
4008
4219
  const firstByte = blob.data[0];
@@ -4022,7 +4233,7 @@ async function compileCursorStore(bundle, batch, store) {
4022
4233
  json_pointer: `blobs/${blob.id}`,
4023
4234
  native_id: blob.id,
4024
4235
  raw_object_id: blobObjectId,
4025
- decoded_json_object_id: parsed != null ? await putJson(bundle, parsed) : null,
4236
+ decoded_json_object_id: parsed != null ? stageJson(pending.objects, parsed) : null,
4026
4237
  parser_status: parsed != null ? "ok" : looksJson ? "failed" : "partial",
4027
4238
  confidence: "low",
4028
4239
  // timeline order from blob list isn't canonical
@@ -4077,6 +4288,7 @@ async function compileCursorStore(bundle, batch, store) {
4077
4288
  }
4078
4289
  }
4079
4290
  buildSearchDocs3(pending);
4291
+ await flushPendingObjects(bundle, pending.objects);
4080
4292
  transactional(bundle.db, () => {
4081
4293
  flushPending3(bundle, pending);
4082
4294
  });
@@ -4088,6 +4300,10 @@ async function compileCursorStore(bundle, batch, store) {
4088
4300
  counts.tool_calls = pending.toolCallsList.length;
4089
4301
  counts.tool_results = pending.toolResults.length;
4090
4302
  counts.artifacts = pending.artifacts.length;
4303
+ logger?.debug(
4304
+ { path: store.filePath, source_file_id: sourceFile.source_file_id, counts },
4305
+ "cursor store imported"
4306
+ );
4091
4307
  return counts;
4092
4308
  } finally {
4093
4309
  cdb.close();
@@ -4112,7 +4328,7 @@ function mapRole(role) {
4112
4328
  }
4113
4329
  async function pushTextBlock(bundle, pending, messageId2, ordinal, blockType, text, rawRecordId2, visibility = "default") {
4114
4330
  if (!text) return;
4115
- const overflow = text.length > PREVIEW_MAX3 ? await putText(bundle, text) : null;
4331
+ const overflow = text.length > PREVIEW_MAX3 ? stageText(pending.objects, text) : null;
4116
4332
  pending.blocks.push({
4117
4333
  block_id: blockId(messageId2, ordinal),
4118
4334
  message_id: messageId2,
@@ -4163,7 +4379,7 @@ async function processContentItem(bundle, sessionId2, messageId2, eventId2, ordi
4163
4379
  if (t === "tool-call") {
4164
4380
  const sourceCallId = item.toolCallId ?? `${ordinal}`;
4165
4381
  const toolName = item.toolName ?? "unknown";
4166
- const argsObjectId = item.args != null ? await putJson(bundle, item.args) : null;
4382
+ const argsObjectId = item.args != null ? stageJson(pending.objects, item.args) : null;
4167
4383
  const tcId = toolCallId(sessionId2, sourceCallId);
4168
4384
  pending.blocks.push({
4169
4385
  block_id: blockId(messageId2, ordinal),
@@ -4200,7 +4416,7 @@ async function processContentItem(bundle, sessionId2, messageId2, eventId2, ordi
4200
4416
  if (t === "tool-result") {
4201
4417
  const sourceCallId = item.toolCallId ?? `${ordinal}`;
4202
4418
  const text = stringifyOrNull3(item.result) ?? "";
4203
- const overflow = text.length > PREVIEW_MAX3 ? await putText(bundle, text) : null;
4419
+ const overflow = text.length > PREVIEW_MAX3 ? stageText(pending.objects, text) : null;
4204
4420
  const isError = readIsError(item) ? 1 : 0;
4205
4421
  pending.blocks.push({
4206
4422
  block_id: blockId(messageId2, ordinal),
@@ -4564,17 +4780,34 @@ async function readdirSafe3(dir) {
4564
4780
 
4565
4781
  // src/importers/gemini/index.ts
4566
4782
  var PREVIEW_MAX4 = 4e3;
4567
- async function compileGemini(bundle, root) {
4783
+ async function compileGemini(bundle, root, options = {}) {
4784
+ const logger = options.logger;
4568
4785
  const batch = startBatch(bundle, "gemini", [root]);
4569
4786
  const counts = emptyCounts();
4787
+ logger?.info({ batch_id: batch.batch_id, root }, "gemini batch started");
4570
4788
  try {
4571
4789
  for await (const file of discoverGeminiChats(root)) {
4572
4790
  counts.source_files_seen++;
4791
+ logger?.debug(
4792
+ {
4793
+ path: file.filePath,
4794
+ project_dir: file.projectDir,
4795
+ project_root: file.projectRoot
4796
+ },
4797
+ "gemini source file discovered"
4798
+ );
4573
4799
  try {
4574
- const fc = await compileGeminiFile(bundle, batch, file);
4800
+ const fc = await compileGeminiFile(bundle, batch, file, logger);
4575
4801
  addCounts4(counts, fc);
4576
4802
  } catch (error) {
4577
4803
  counts.errors++;
4804
+ logger?.warn(
4805
+ {
4806
+ err: error,
4807
+ path: file.filePath
4808
+ },
4809
+ "gemini source file failed"
4810
+ );
4578
4811
  await recordError(bundle, batch.batch_id, {
4579
4812
  kind: "gemini_file_failed",
4580
4813
  message: error instanceof Error ? error.message : String(error),
@@ -4583,8 +4816,10 @@ async function compileGemini(bundle, root) {
4583
4816
  }
4584
4817
  }
4585
4818
  finishBatch(bundle, batch, counts, "completed");
4819
+ logger?.info({ batch_id: batch.batch_id, counts }, "gemini batch completed");
4586
4820
  } catch (error) {
4587
4821
  finishBatch(bundle, batch, counts, "failed");
4822
+ logger?.error({ err: error, batch_id: batch.batch_id, counts }, "gemini batch failed");
4588
4823
  throw error;
4589
4824
  }
4590
4825
  return { batch, counts };
@@ -4619,7 +4854,7 @@ function addCounts4(target, source) {
4619
4854
  target.edges += source.edges;
4620
4855
  target.errors += source.errors;
4621
4856
  }
4622
- async function compileGeminiFile(bundle, batch, file) {
4857
+ async function compileGeminiFile(bundle, batch, file, logger) {
4623
4858
  const counts = emptyFileCounts4();
4624
4859
  const { row: sourceFile, alreadyKnown } = await registerSourceFile(bundle, {
4625
4860
  sourceTool: "gemini",
@@ -4629,12 +4864,21 @@ async function compileGeminiFile(bundle, batch, file) {
4629
4864
  });
4630
4865
  if (alreadyKnown) {
4631
4866
  counts.source_files_skipped = 1;
4867
+ logger?.debug(
4868
+ { path: file.filePath, source_file_id: sourceFile.source_file_id },
4869
+ "gemini source file skipped"
4870
+ );
4632
4871
  return counts;
4633
4872
  }
4634
4873
  counts.source_files_imported = 1;
4874
+ logger?.debug(
4875
+ { path: file.filePath, source_file_id: sourceFile.source_file_id },
4876
+ "gemini source file registered"
4877
+ );
4635
4878
  const text = await readFile7(file.filePath, "utf8");
4636
4879
  const parsed = JSON.parse(text);
4637
- const fileObjectId = await putBytes(bundle, Buffer.from(text, "utf8"), {
4880
+ const objects = createPendingObjects();
4881
+ const fileObjectId = stageBytes(objects, Buffer.from(text, "utf8"), {
4638
4882
  mimeType: "application/json",
4639
4883
  encoding: "utf-8"
4640
4884
  });
@@ -4664,7 +4908,8 @@ async function compileGeminiFile(bundle, batch, file) {
4664
4908
  toolResults: [],
4665
4909
  artifacts: [],
4666
4910
  searchDocs: [],
4667
- project: null
4911
+ project: null,
4912
+ objects
4668
4913
  };
4669
4914
  const sourceSid = parsed.sessionId ?? path11.basename(file.filePath, ".json");
4670
4915
  const sessionPk = sessionId("gemini", sourceSid);
@@ -4701,6 +4946,7 @@ async function compileGeminiFile(bundle, batch, file) {
4701
4946
  );
4702
4947
  }
4703
4948
  buildSearchDocs4(pending);
4949
+ await flushPendingObjects(bundle, pending.objects);
4704
4950
  transactional(bundle.db, () => {
4705
4951
  flushPending4(bundle, pending);
4706
4952
  });
@@ -4712,12 +4958,16 @@ async function compileGeminiFile(bundle, batch, file) {
4712
4958
  counts.tool_calls = pending.toolCallsList.length;
4713
4959
  counts.tool_results = pending.toolResults.length;
4714
4960
  counts.artifacts = pending.artifacts.length;
4961
+ logger?.debug(
4962
+ { path: file.filePath, source_file_id: sourceFile.source_file_id, counts },
4963
+ "gemini source file imported"
4964
+ );
4715
4965
  return counts;
4716
4966
  }
4717
4967
  async function processMessage(bundle, sessionId2, sourceFileId2, index, msg, batchId, pending) {
4718
4968
  const ordinal = index + 1;
4719
4969
  const ts = msg.timestamp ?? null;
4720
- const payloadId = await putJson(bundle, msg);
4970
+ const payloadId = stageJson(pending.objects, msg);
4721
4971
  const pointer = `/messages/${index}`;
4722
4972
  const rawObjectIdInput = sha256Hex(`${pointer}
4723
4973
  ${JSON.stringify(msg)}`);
@@ -4834,7 +5084,7 @@ ${JSON.stringify(msg)}`);
4834
5084
  }
4835
5085
  async function pushTextBlock2(bundle, pending, messageId2, blockOrdinal, blockType, text, rawRecordId2, visibility = "default") {
4836
5086
  if (!text) return;
4837
- const overflowId = text.length > PREVIEW_MAX4 ? await putText(bundle, text) : null;
5087
+ const overflowId = text.length > PREVIEW_MAX4 ? stageText(pending.objects, text) : null;
4838
5088
  pending.blocks.push({
4839
5089
  block_id: blockId(messageId2, blockOrdinal),
4840
5090
  message_id: messageId2,
@@ -4851,7 +5101,7 @@ async function processToolCall(bundle, sessionId2, messageId2, eventId2, index,
4851
5101
  const sourceCallId = tc.id ?? `${messageId2}:${index}`;
4852
5102
  const toolName = tc.name ?? "unknown";
4853
5103
  const toolCallId2 = toolCallId(sessionId2, sourceCallId);
4854
- const argsObjectId = tc.args ? await putJson(bundle, tc.args) : null;
5104
+ const argsObjectId = tc.args ? stageJson(pending.objects, tc.args) : null;
4855
5105
  pending.toolCallsList.push({
4856
5106
  tool_call_id: toolCallId2,
4857
5107
  message_id: messageId2,
@@ -4870,7 +5120,7 @@ async function processToolCall(bundle, sessionId2, messageId2, eventId2, index,
4870
5120
  });
4871
5121
  const isError = tc.status === "error" ? 1 : 0;
4872
5122
  const resultText = renderToolResultText(tc.result);
4873
- const overflowId = resultText.length > PREVIEW_MAX4 ? await putText(bundle, resultText) : null;
5123
+ const overflowId = resultText.length > PREVIEW_MAX4 ? stageText(pending.objects, resultText) : null;
4874
5124
  pending.toolResults.push({
4875
5125
  tool_result_id: toolResultId(sessionId2, sourceCallId),
4876
5126
  tool_call_id: toolCallId2,
@@ -4887,7 +5137,7 @@ async function processToolCall(bundle, sessionId2, messageId2, eventId2, index,
4887
5137
  const rd = tc.resultDisplay;
4888
5138
  if (rd.fileDiff || rd.filePath) {
4889
5139
  const diffText = rd.fileDiff ?? "";
4890
- const diffId = diffText ? await putText(bundle, diffText, { mimeType: "text/x-diff" }) : null;
5140
+ const diffId = diffText ? stageText(pending.objects, diffText, { mimeType: "text/x-diff" }) : null;
4891
5141
  pending.artifacts.push({
4892
5142
  artifact_id: artifactId(sessionId2, "gemini", `${toolCallId2}:diff`),
4893
5143
  kind: "diff",
@@ -5242,55 +5492,312 @@ function flushPending4(bundle, pending) {
5242
5492
  }
5243
5493
  }
5244
5494
 
5495
+ // src/services/export/parquet.ts
5496
+ import { mkdir as mkdir3, rm, writeFile as writeFile4 } from "fs/promises";
5497
+ import path12 from "path";
5498
+ import { DuckDBConnection } from "@duckdb/node-api";
5499
// Names of the SQLite tables handled by the Parquet export. Each entry is
// copied to `<table>.parquet` by exportBundleParquet and exposed as a view of
// the same name by queryDuckDbParquet.
var PARQUET_TABLES = [
  "objects", "source_files", "import_batches",
  "raw_records", "import_errors", "uncertainties",
  "projects", "sessions", "turns",
  "events", "messages", "content_blocks",
  "tool_calls", "tool_results", "artifacts",
  "edges", "search_docs"
];
5518
/**
 * Export every table in PARQUET_TABLES from the bundle's SQLite database to
 * one Parquet file per table, then write a `manifest.json` next to them.
 *
 * @param {{ bundlePath: string, outDir?: string }} options - bundle location
 *   and optional output directory (defaults to the bundle's parquet path).
 * @returns {Promise<{ outDir: string, manifestPath: string, files: Object, counts: Object }>}
 *   resolved output directory, manifest path, per-table file paths and the
 *   per-table row counts taken from the bundle snapshot.
 */
async function exportBundleParquet(options) {
  const snapshot = await openBundleSnapshot(options.bundlePath);
  const outDir = path12.resolve(options.outDir ?? snapshot.defaultOutDir);
  await mkdir3(outDir, { recursive: true });

  // Target path for each table's Parquet file.
  const files = {};
  for (const table of PARQUET_TABLES) {
    files[table] = path12.join(outDir, `${table}.parquet`);
  }
  const manifestPath = path12.join(outDir, "manifest.json");

  // Remove leftovers from a previous export before writing anything new.
  const stalePaths = [...Object.values(files), manifestPath];
  for (const stalePath of stalePaths) {
    await rm(stalePath, { force: true });
  }

  const connection = await createDuckDbConnection();
  try {
    await attachSqlite(connection, snapshot.dbPath);
    for (const table of PARQUET_TABLES) {
      const source = `prosa.${quoteIdentifier(table)}`;
      const target = sqlString(files[table]);
      await connection.run(`COPY (SELECT * FROM ${source}) TO ${target} (FORMAT parquet)`);
    }
  } finally {
    connection.closeSync();
  }

  // Per-table manifest entries: file name plus the row count from the snapshot.
  const tables = {};
  for (const table of PARQUET_TABLES) {
    tables[table] = {
      file: path12.basename(files[table]),
      rows: snapshot.counts[table]
    };
  }
  const manifest = {
    exported_at: (/* @__PURE__ */ new Date()).toISOString(),
    source_db: snapshot.dbPath,
    schema_version: snapshot.schemaVersion,
    parser_version: snapshot.parserVersion,
    tables
  };
  await writeFile4(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`, "utf8");

  return { outDir, manifestPath, files, counts: snapshot.counts };
}
5559
/**
 * Run a SQL query in DuckDB against a previously exported Parquet directory.
 *
 * A view named after each entry of PARQUET_TABLES is created over
 * `<parquetDir>/<table>.parquet` before `options.sql` is executed.
 *
 * @param {{ parquetDir: string, sql: string }} options - export directory and
 *   the SQL text to execute.
 * @returns {Promise<{ columns: string[], rows: Object[] }>} de-duplicated
 *   column names and the result rows as JSON-compatible objects.
 * @throws {Error} a "Parquet export not found" error (with the DuckDB error
 *   attached as `cause`) when a Parquet file is missing; any other error is
 *   rethrown unchanged.
 */
async function queryDuckDbParquet(options) {
  const parquetDir = path12.resolve(options.parquetDir);
  const connection = await createDuckDbConnection();
  try {
    for (const table of PARQUET_TABLES) {
      const parquetFile = path12.join(parquetDir, `${table}.parquet`);
      await connection.run(
        `CREATE OR REPLACE VIEW ${quoteIdentifier(table)} AS SELECT * FROM read_parquet(${sqlString(parquetFile)})`
      );
    }
    const reader = await connection.runAndReadAll(options.sql);
    return {
      columns: reader.deduplicatedColumnNames(),
      rows: reader.getRowObjectsJson()
    };
  } catch (error) {
    if (isMissingParquetError(error)) {
      // Keep the underlying DuckDB error reachable for debugging.
      throw new Error(
        `Parquet export not found in ${parquetDir}; run \`prosa export parquet --store <path>\` first`,
        { cause: error }
      );
    }
    throw error;
  } finally {
    connection.closeSync();
  }
}
5586
/**
 * Create a fresh DuckDB connection via `DuckDBConnection.create()`.
 *
 * @returns {Promise<Object>} the connection produced by the DuckDB client.
 */
async function createDuckDbConnection() {
  const connection = await DuckDBConnection.create();
  return connection;
}
5589
/**
 * Install and load DuckDB's sqlite extension, then attach the SQLite database
 * at `dbPath` under the alias `prosa`.
 *
 * @param {{ run(sql: string): Promise<unknown> }} connection - DuckDB connection.
 * @param {string} dbPath - path of the SQLite database file to attach.
 * @returns {Promise<void>}
 * @throws {Error} a descriptive error whose `cause` is the original failure
 *   when any of the three statements fails.
 */
async function attachSqlite(connection, dbPath) {
  try {
    await connection.run("INSTALL sqlite");
    await connection.run("LOAD sqlite");
    await connection.run(`ATTACH ${sqlString(dbPath)} AS prosa (TYPE sqlite)`);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    // Preserve the original error (and its stack) instead of only its message.
    throw new Error(
      `DuckDB could not attach prosa.sqlite via the sqlite extension: ${detail}`,
      { cause: error }
    );
  }
}
5600
/**
 * Open the bundle briefly to capture what the Parquet export needs: the
 * database path, manifest versions, default output directory and a row count
 * for every table in PARQUET_TABLES. The bundle is closed before returning,
 * including when counting fails.
 *
 * @param {string} bundlePath - bundle directory to open.
 * @returns {Promise<{ dbPath: string, schemaVersion: *, parserVersion: *, defaultOutDir: string, counts: Object }>}
 */
async function openBundleSnapshot(bundlePath) {
  const bundle = await openBundle(bundlePath);
  try {
    const counts = {};
    for (const table of PARQUET_TABLES) {
      const statement = bundle.db.prepare(`SELECT count(*) AS n FROM ${quoteIdentifier(table)}`);
      const row = statement.get();
      // A missing row is reported as zero rows.
      counts[table] = row?.n ?? 0;
    }
    const snapshot = {
      dbPath: bundle.paths.db,
      schemaVersion: bundle.manifest.schema_version,
      parserVersion: bundle.manifest.parser_version,
      defaultOutDir: bundle.paths.parquet,
      counts
    };
    return snapshot;
  } finally {
    closeBundle(bundle);
  }
}
5620
/**
 * Quote a SQL identifier with double quotes, doubling any embedded `"`.
 *
 * @param {string} value - raw identifier (for example a table name).
 * @returns {string} the identifier wrapped in double quotes.
 */
function quoteIdentifier(value) {
  // Literal replacement: replaceAll avoids a stateful /g regex.
  return `"${value.replaceAll('"', '""')}"`;
}
5623
/**
 * Render a value as a single-quoted SQL string literal, doubling any
 * embedded `'`.
 *
 * @param {string} value - raw text (for example a file path).
 * @returns {string} the text wrapped in single quotes.
 */
function sqlString(value) {
  // Literal replacement: replaceAll avoids a stateful /g regex.
  return `'${value.replaceAll("'", "''")}'`;
}
5626
/**
 * Decide whether an error message describes a missing Parquet file.
 *
 * The message must contain one of "No files found", "does not exist" or
 * "not found", and must also mention ".parquet" (both case-insensitive).
 *
 * @param {unknown} error - thrown value; non-Error values are stringified.
 * @returns {boolean} true when both conditions hold.
 */
function isMissingParquetError(error) {
  const text = error instanceof Error ? error.message : String(error);
  const reportsMissing = /No files found|does not exist|not found/i.test(text);
  if (!reportsMissing) {
    return false;
  }
  return /\.parquet/i.test(text);
}
5630
+
5245
5631
  // src/cli/commands/compile.ts
5246
5632
  init_indexing();
5633
+
5634
+ // src/cli/logger.ts
5635
+ import pino from "pino";
5636
+ import pretty from "pino-pretty";
5637
/**
 * Build the pino logger used by the compile commands. Both variants write
 * synchronously to file descriptor 2.
 *
 * @param {{ verbose?: boolean, jsonLogs?: boolean }} options - `verbose`
 *   selects the "debug" level (otherwise "info"); `jsonLogs` selects raw
 *   JSON output instead of pino-pretty formatting.
 * @returns {Object} the configured pino logger.
 */
function createCliLogger(options) {
  const level = options.verbose === true ? "debug" : "info";
  const loggerOptions = { base: void 0, level };

  if (options.jsonLogs !== true) {
    // Human-readable single-line output; colours only when stderr is a TTY.
    const prettyStream = pretty({
      colorize: process.stderr.isTTY,
      destination: 2,
      ignore: "pid,hostname",
      singleLine: true,
      sync: true,
      translateTime: "SYS:yyyy-mm-dd HH:MM:ss.l"
    });
    return pino(loggerOptions, prettyStream);
  }

  const jsonStream = pino.destination({ dest: 2, sync: true });
  return pino(loggerOptions, jsonStream);
}
5657
+
5658
+ // src/cli/commands/compile.ts
5659
// Supported import sources. Each row lists: subcommand name, command
// description, help text for the sessions path, the path segments below the
// user's home directory that form the default sessions path, and the importer.
var PROVIDERS = [
  [
    "codex",
    "Import Codex CLI session histories into the bundle.",
    "root of Codex CLI sessions",
    [".codex", "sessions"],
    compileCodex
  ],
  [
    "claude",
    "Import Claude Code project histories into the bundle.",
    "root of Claude Code projects",
    [".claude", "projects"],
    compileClaude
  ],
  [
    "gemini",
    "Import Gemini CLI session histories into the bundle.",
    "root of Gemini CLI tmp dir",
    [".gemini", "tmp"],
    compileGemini
  ],
  [
    "cursor",
    "Import Cursor agent stores into the bundle.",
    "root of Cursor agent stores",
    [".cursor", "chats"],
    compileCursor
  ]
].map(([name, description, pathHelp, homeSegments, compile]) => ({
  name,
  description,
  pathHelp,
  // Evaluated lazily so the home directory is looked up at call time.
  defaultSessionsPath: () => path14.join(os2.homedir(), ...homeSegments),
  compile
}));
5247
5689
  function compileCommand() {
5248
- return new Command("compile").description("Import session histories from one or more agent CLIs into the bundle.").option("--codex <path>", "root of Codex CLI sessions (e.g. ~/.codex/sessions)").option("--claude <path>", "root of Claude Code projects (e.g. ~/.claude/projects)").option("--gemini <path>", "root of Gemini CLI tmp dir (e.g. ~/.gemini/tmp)").option("--cursor <path>", "root of Cursor agent stores (e.g. ~/.cursor/chats)").option("--store <path>", "bundle directory", defaultBundlePath()).option("--defer-index", "skip immediate FTS5 updates; run `prosa index fts5` later").action(
5249
- async (options) => {
5250
- if (!options.codex && !options.claude && !options.gemini && !options.cursor) {
5251
- process.stderr.write(
5252
- "no source specified \u2014 pass at least one of --codex / --claude / --gemini / --cursor\n"
5253
- );
5254
- process.exit(2);
5255
- }
5256
- const bundle = await openBundle(path13.resolve(options.store));
5257
- let importedAny = false;
5690
+ const command = addCompileLogOptions(
5691
+ new Command("compile").description(
5692
+ "Import session histories from one agent CLI into the bundle."
5693
+ )
5694
+ );
5695
+ for (const provider of PROVIDERS) {
5696
+ command.addCommand(providerCompileCommand(provider));
5697
+ }
5698
+ command.action(() => {
5699
+ command.help({ error: true });
5700
+ });
5701
+ return command;
5702
+ }
5703
/**
 * Build the `compile-all` command, which runs every provider in PROVIDERS
 * against its default sessions path.
 *
 * Accepts `--store <path>` like the per-provider subcommands; it defaults to
 * `defaultBundlePath()`, so invocations without the flag behave as before.
 *
 * @returns {Command} the configured command.
 */
function compileAllCommand() {
  return addCompileLogOptions(new Command("compile-all"))
    .description("Import all agent CLI session histories using default source paths.")
    .option("--store <path>", "bundle directory", defaultBundlePath())
    .option("--defer-index", "skip immediate FTS5 updates; run `prosa index fts5` later")
    .action(async (options) => {
      await runCompiles({
        providers: PROVIDERS,
        storePath: options.store,
        deferIndex: options.deferIndex === true,
        logOptions: options
      });
    });
}
5713
/**
 * Build the `compile <provider>` subcommand for a single provider.
 *
 * @param {{ name: string, description: string, pathHelp: string, defaultSessionsPath: () => string, compile: Function }} provider
 *   provider entry (see PROVIDERS).
 * @returns {Command} the configured subcommand.
 */
function providerCompileCommand(provider) {
  // Resolve the default once; it is the option's default value.
  const defaultSessionsPath = provider.defaultSessionsPath();
  return addCompileLogOptions(new Command(provider.name))
    .description(provider.description)
    // Commander appends "(default: …)" to the help text for options that have
    // a default value, so the description must not repeat it.
    .option("--sessions-path <path>", provider.pathHelp, defaultSessionsPath)
    .option("--store <path>", "bundle directory", defaultBundlePath())
    .option("--defer-index", "skip immediate FTS5 updates; run `prosa index fts5` later")
    .action(async (options, command) => {
      await runCompiles({
        providers: [provider],
        storePath: options.store,
        deferIndex: options.deferIndex === true,
        sessionsPath: options.sessionsPath,
        // Includes --verbose / --json-logs given on the parent `compile` command.
        logOptions: command.optsWithGlobals()
      });
    });
}
5730
/**
 * Register the shared logging flags (`--verbose`, `--json-logs`) on a command.
 *
 * @param {{ option: Function }} command - command to extend.
 * @returns {*} whatever the last chained `option()` call returns.
 */
function addCompileLogOptions(command) {
  const withVerbose = command.option("--verbose", "emit debug logs during compilation");
  return withVerbose.option(
    "--json-logs",
    "emit raw newline-delimited JSON logs instead of pretty logs"
  );
}
5733
/**
 * Run one or more provider imports against a bundle, then refresh the derived
 * indexes and the Parquet export.
 *
 * Steps: open the bundle; optionally disable FTS5 triggers; compile each
 * provider in order; mark indexes; rebuild the tantivy index when anything was
 * imported; re-enable triggers and close the bundle; finally export Parquet
 * when anything was imported. Tantivy and Parquet failures are logged and do
 * not fail the command.
 *
 * @param {{ providers: Array, storePath: string, deferIndex: boolean, sessionsPath?: string, logOptions: Object }} options
 *   providers to run, bundle path, whether FTS5 updates are deferred, an
 *   optional sessions path overriding each provider's default, and the
 *   options handed to createCliLogger.
 * @returns {Promise<void>}
 */
async function runCompiles(options) {
  const logger = createCliLogger(options.logOptions);
  const storePath = resolvePath(options.storePath);
  logger.info({ store_path: storePath }, "opening bundle");
  const bundle = await openBundle(storePath);
  let importedAny = false;
  try {
    if (options.deferIndex) {
      logger.info("disabling FTS5 triggers for deferred indexing");
      disableFts5Triggers(bundle);
    }
    for (const provider of options.providers) {
      const sourcePath = resolvePath(options.sessionsPath ?? provider.defaultSessionsPath());
      const providerLogger = logger.child({
        source_tool: provider.name,
        source_path: sourcePath
      });
      providerLogger.info("starting compile");
      const r = await provider.compile(bundle, sourcePath, { logger: providerLogger });
      importedAny ||= r.counts.source_files_imported > 0;
      providerLogger.info(
        {
          batch_id: r.batch.batch_id,
          counts: r.counts
        },
        "compile finished"
      );
      printCounts(provider.name, r.batch.batch_id, r.counts);
    }
    logger.info({ changed: importedAny, fts5_deferred: options.deferIndex }, "marking indexes");
    markIndexesAfterImport(bundle, {
      changed: importedAny,
      fts5Deferred: options.deferIndex
    });
    if (importedAny) {
      try {
        logger.info("rebuilding tantivy index");
        const status = await rebuildTantivyIndex(bundle);
        process.stdout.write(`tantivy: indexed ${status.indexed_doc_count} docs\n`);
      } catch (error) {
        // Best effort: the index can be rebuilt later from SQLite.
        logger.error({ err: error }, "tantivy rebuild failed; SQLite data is intact");
      }
    }
  } finally {
    try {
      if (options.deferIndex) {
        logger.info("re-enabling FTS5 triggers");
        enableFts5Triggers(bundle);
      }
    } finally {
      // Always release the bundle, even if re-enabling the triggers throws.
      closeBundle(bundle);
      logger.info({ store_path: storePath }, "bundle closed");
    }
  }
  // The export reopens the bundle itself, so it runs after the close above.
  if (importedAny) {
    try {
      logger.info({ store_path: storePath }, "exporting parquet");
      const result = await exportBundleParquet({ bundlePath: storePath });
      const tableCount = Object.keys(result.files).length;
      process.stdout.write(`parquet: wrote ${tableCount} tables to ${result.outDir}\n`);
    } catch (error) {
      // Best effort: the export can be re-run from SQLite.
      logger.error({ err: error }, "parquet export failed; SQLite data is intact");
    }
  }
}
5797
/**
 * Resolve a user-supplied path to an absolute path, expanding a leading `~`
 * to the current user's home directory.
 *
 * `~` alone and `~/…` are expanded on every platform. Where the platform
 * separator is not `/` (Windows), `~\…` is expanded as well. Anything else,
 * including `~name/…`, is passed to `path.resolve` unchanged.
 *
 * @param {string} p - path as typed by the user.
 * @returns {string} absolute path.
 */
function resolvePath(p) {
  if (p === "~") return os2.homedir();
  const hasHomePrefix =
    p.startsWith("~/") || (path14.sep !== "/" && p.startsWith(`~${path14.sep}`));
  if (hasHomePrefix) return path14.join(os2.homedir(), p.slice(2));
  return path14.resolve(p);
}
5295
5802
  function printCounts(label, batchId, c) {
5296
5803
  process.stdout.write(
@@ -5419,142 +5926,6 @@ function renderToolCall(c) {
5419
5926
  return lines.join("\n");
5420
5927
  }
5421
5928
 
5422
- // src/services/export/parquet.ts
5423
- import { mkdir as mkdir5, rm as rm2, writeFile as writeFile5 } from "fs/promises";
5424
- import path14 from "path";
5425
- import { DuckDBConnection } from "@duckdb/node-api";
5426
- var PARQUET_TABLES = [
5427
- "objects",
5428
- "source_files",
5429
- "import_batches",
5430
- "raw_records",
5431
- "import_errors",
5432
- "uncertainties",
5433
- "projects",
5434
- "sessions",
5435
- "turns",
5436
- "events",
5437
- "messages",
5438
- "content_blocks",
5439
- "tool_calls",
5440
- "tool_results",
5441
- "artifacts",
5442
- "edges",
5443
- "search_docs"
5444
- ];
5445
- async function exportBundleParquet(options) {
5446
- const snapshot = await openBundleSnapshot(options.bundlePath);
5447
- const outDir = path14.resolve(options.outDir ?? snapshot.defaultOutDir);
5448
- await mkdir5(outDir, { recursive: true });
5449
- const files = Object.fromEntries(
5450
- PARQUET_TABLES.map((table) => [table, path14.join(outDir, `${table}.parquet`)])
5451
- );
5452
- const manifestPath = path14.join(outDir, "manifest.json");
5453
- for (const file of [...Object.values(files), manifestPath]) {
5454
- await rm2(file, { force: true });
5455
- }
5456
- const connection = await createDuckDbConnection();
5457
- try {
5458
- await attachSqlite(connection, snapshot.dbPath);
5459
- for (const table of PARQUET_TABLES) {
5460
- await connection.run(
5461
- `COPY (SELECT * FROM prosa.${quoteIdentifier(table)}) TO ${sqlString(files[table])} (FORMAT parquet)`
5462
- );
5463
- }
5464
- } finally {
5465
- connection.closeSync();
5466
- }
5467
- const manifest = {
5468
- exported_at: (/* @__PURE__ */ new Date()).toISOString(),
5469
- source_db: snapshot.dbPath,
5470
- schema_version: snapshot.schemaVersion,
5471
- parser_version: snapshot.parserVersion,
5472
- tables: Object.fromEntries(
5473
- PARQUET_TABLES.map((table) => [
5474
- table,
5475
- {
5476
- file: path14.basename(files[table]),
5477
- rows: snapshot.counts[table]
5478
- }
5479
- ])
5480
- )
5481
- };
5482
- await writeFile5(manifestPath, `${JSON.stringify(manifest, null, 2)}
5483
- `, "utf8");
5484
- return { outDir, manifestPath, files, counts: snapshot.counts };
5485
- }
5486
- async function queryDuckDbParquet(options) {
5487
- const parquetDir = path14.resolve(options.parquetDir);
5488
- const connection = await createDuckDbConnection();
5489
- try {
5490
- for (const table of PARQUET_TABLES) {
5491
- await connection.run(
5492
- `CREATE OR REPLACE VIEW ${quoteIdentifier(table)} AS SELECT * FROM read_parquet(${sqlString(
5493
- path14.join(parquetDir, `${table}.parquet`)
5494
- )})`
5495
- );
5496
- }
5497
- const reader = await connection.runAndReadAll(options.sql);
5498
- return {
5499
- columns: reader.deduplicatedColumnNames(),
5500
- rows: reader.getRowObjectsJson()
5501
- };
5502
- } catch (error) {
5503
- if (isMissingParquetError(error)) {
5504
- throw new Error(
5505
- `Parquet export not found in ${parquetDir}; run \`prosa export parquet --store <path>\` first`
5506
- );
5507
- }
5508
- throw error;
5509
- } finally {
5510
- connection.closeSync();
5511
- }
5512
- }
5513
- async function createDuckDbConnection() {
5514
- return DuckDBConnection.create();
5515
- }
5516
- async function attachSqlite(connection, dbPath) {
5517
- try {
5518
- await connection.run("INSTALL sqlite");
5519
- await connection.run("LOAD sqlite");
5520
- await connection.run(`ATTACH ${sqlString(dbPath)} AS prosa (TYPE sqlite)`);
5521
- } catch (error) {
5522
- throw new Error(
5523
- `DuckDB could not attach prosa.sqlite via the sqlite extension: ${error instanceof Error ? error.message : String(error)}`
5524
- );
5525
- }
5526
- }
5527
- async function openBundleSnapshot(bundlePath) {
5528
- const bundle = await openBundle(bundlePath);
5529
- try {
5530
- const counts = Object.fromEntries(
5531
- PARQUET_TABLES.map((table) => {
5532
- const row = bundle.db.prepare(`SELECT count(*) AS n FROM ${quoteIdentifier(table)}`).get();
5533
- return [table, row?.n ?? 0];
5534
- })
5535
- );
5536
- return {
5537
- dbPath: bundle.paths.db,
5538
- schemaVersion: bundle.manifest.schema_version,
5539
- parserVersion: bundle.manifest.parser_version,
5540
- defaultOutDir: bundle.paths.parquet,
5541
- counts
5542
- };
5543
- } finally {
5544
- closeBundle(bundle);
5545
- }
5546
- }
5547
- function quoteIdentifier(value) {
5548
- return `"${value.replace(/"/g, '""')}"`;
5549
- }
5550
- function sqlString(value) {
5551
- return `'${value.replace(/'/g, "''")}'`;
5552
- }
5553
- function isMissingParquetError(error) {
5554
- const message = error instanceof Error ? error.message : String(error);
5555
- return /No files found|does not exist|not found/i.test(message) && /\.parquet/i.test(message);
5556
- }
5557
-
5558
5929
  // src/cli/commands/export.ts
5559
5930
  function exportCommand() {
5560
5931
  const session = new Command2("session").description("Export a single session to a human-readable format.").argument("<session-id>", "prosa session_id").requiredOption("--format <fmt>", 'currently only "markdown" is supported').option("--out <path>", "write to file instead of stdout").option("--store <path>", "bundle directory", defaultBundlePath()).action(async (sessionId2, options) => {
@@ -6454,6 +6825,7 @@ async function runCli(argv) {
6454
6825
  ).version(PROSA_PARSER_VERSION, "-v, --version");
6455
6826
  program.addCommand(initCommand());
6456
6827
  program.addCommand(compileCommand());
6828
+ program.addCommand(compileAllCommand());
6457
6829
  program.addCommand(indexCommand());
6458
6830
  program.addCommand(sessionsCommand());
6459
6831
  program.addCommand(searchCommand());