agentboss 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/README.md +34 -0
  2. package/bin/aboss.js +288 -0
  3. package/client/dist/assets/index-C1wFD_Vo.css +1 -0
  4. package/client/dist/assets/index-DBj1Ujlx.js +137 -0
  5. package/client/dist/index.html +34 -0
  6. package/package.json +64 -0
  7. package/server/analysis/daily-aggregator.js +258 -0
  8. package/server/analysis/difficulty.js +129 -0
  9. package/server/analysis/dimensions/ai-knowledge.js +172 -0
  10. package/server/analysis/dimensions/ai-tools.js +161 -0
  11. package/server/analysis/dimensions/judgement.js +107 -0
  12. package/server/analysis/dimensions/llm-merge.js +57 -0
  13. package/server/analysis/dimensions/output-quality.js +167 -0
  14. package/server/analysis/dimensions/problem-definition.js +104 -0
  15. package/server/analysis/dimensions/system-thinking.js +225 -0
  16. package/server/analysis/evidence-builder.js +104 -0
  17. package/server/analysis/job.js +273 -0
  18. package/server/analysis/report-builder.js +581 -0
  19. package/server/analysis/scoring-v2.js +72 -0
  20. package/server/analysis/text-signals.js +179 -0
  21. package/server/analysis/thresholds-v2.js +358 -0
  22. package/server/api/advice.js +124 -0
  23. package/server/api/analysis.js +141 -0
  24. package/server/api/execution.js +330 -0
  25. package/server/api/metrics.js +277 -0
  26. package/server/api/overview.js +308 -0
  27. package/server/api/project.js +255 -0
  28. package/server/api/reports.js +125 -0
  29. package/server/api/sessions.js +118 -0
  30. package/server/api/settings.js +119 -0
  31. package/server/db/connection.js +175 -0
  32. package/server/db/queries.js +1051 -0
  33. package/server/db/schema.js +487 -0
  34. package/server/etl/active-time.js +150 -0
  35. package/server/etl/backfill-subagents.js +178 -0
  36. package/server/etl/claude-code.js +826 -0
  37. package/server/etl/detect.js +341 -0
  38. package/server/etl/judge-filter.js +117 -0
  39. package/server/etl/opencode.js +606 -0
  40. package/server/execution/job.js +662 -0
  41. package/server/execution/prompt.js +227 -0
  42. package/server/execution/runner.js +218 -0
  43. package/server/index.js +94 -0
  44. package/server/llm/advice-prompt.js +339 -0
  45. package/server/llm/advice.js +384 -0
  46. package/server/llm/analysis-prompt.js +162 -0
  47. package/server/llm/cli-runner.js +249 -0
  48. package/server/llm/judge-prompts.js +179 -0
  49. package/server/llm/judge.js +118 -0
  50. package/server/llm/project-advice-prompt.js +332 -0
  51. package/server/llm/project-advice.js +491 -0
  52. package/server/llm/session-analyzer.js +122 -0
  53. package/server/utils/project.js +80 -0
@@ -0,0 +1,826 @@
1
+ /**
2
+ * Claude Code ETL collector for Agent Boss
3
+ *
4
+ * Reads session, message, and tool-call data from Claude Code's JSONL/JSON
5
+ * files under ~/.claude/ and writes unified rows into boss.db.
6
+ *
7
+ * Source layout (§4.3):
8
+ * ~/.claude/
9
+ * projects/
10
+ * <encoded-path>/ (e.g. "-Users-user-project")
11
+ * sessions-index.json (JSON array of session metadata)
12
+ * <sessionId>.jsonl (one JSON object per line)
13
+ *
14
+ * @author Felix
15
+ */
16
+
17
+ const fs = require('fs');
18
+ const path = require('path');
19
+ const { saveDb } = require('../db/connection');
20
+ const { isJudgePrompt } = require('./judge-filter');
21
+ const {
22
+ upsertSession,
23
+ bulkInsertMessages,
24
+ bulkInsertParts,
25
+ bulkInsertToolCalls,
26
+ getEtlState,
27
+ updateEtlState,
28
+ } = require('../db/queries');
29
+
30
+ // ---------------------------------------------------------------------------
31
+ // Constants
32
+ // ---------------------------------------------------------------------------
33
+
34
/** Value stored in the `source` field of every row this collector produces. */
const SOURCE = 'claude-code';

/** Number of sessions handled between two intermediate ETL checkpoints. */
const BATCH_SIZE = 50;

/**
 * Rough pricing table in USD per million tokens, keyed by model family.
 * Only plain input and output tokens are priced here.
 */
const MODEL_PRICING = {
  sonnet: { input: 3, output: 15 }, // Sonnet-class models
  opus: { input: 15, output: 75 }, // Opus-class models
  haiku: { input: 0.25, output: 1.25 }, // Haiku-class models
};
46
+
47
+ // ---------------------------------------------------------------------------
48
+ // Date / time helpers
49
+ // ---------------------------------------------------------------------------
50
+
51
/**
 * Reduce an ISO 8601 timestamp to its UTC calendar day (YYYY-MM-DD).
 *
 * @param {string|null} iso Timestamp to convert; falsy values yield null.
 * @returns {string|null} The UTC date, or null when the input is missing or
 *   cannot be parsed.
 */
function isoToDate(iso) {
  if (!iso) {
    return null;
  }

  let day = null;
  try {
    // toISOString() throws a RangeError for an Invalid Date, which is how
    // unparseable input ends up as null.
    const normalised = new Date(iso).toISOString();
    day = normalised.substring(0, 10);
  } catch (_) {
    day = null;
  }
  return day;
}
64
+
65
/**
 * Compute the difference between two ISO timestamps in whole minutes.
 *
 * The result is rounded to the nearest minute and clamped at zero, so an end
 * time that precedes the start time yields 0.
 *
 * @param {string} startIso Start timestamp.
 * @param {string} endIso End timestamp.
 * @returns {number} Non-negative minute count; 0 when either timestamp is
 *   missing or cannot be parsed.
 */
function diffMinutes(startIso, endIso) {
  if (!startIso || !endIso) return 0;

  let elapsedMs;
  try {
    elapsedMs = new Date(endIso).getTime() - new Date(startIso).getTime();
  } catch (_) {
    return 0;
  }

  // An unparseable date does not throw: getTime() returns NaN, and NaN would
  // otherwise pass straight through Math.round / Math.max to the caller.
  if (!Number.isFinite(elapsedMs)) return 0;

  return Math.max(0, Math.round(elapsedMs / 60000));
}
80
+
81
+ // ---------------------------------------------------------------------------
82
+ // Cost estimation
83
+ // ---------------------------------------------------------------------------
84
+
85
/**
 * Pick the pricing tier that matches a model ID.
 *
 * Matching is a case-insensitive substring test: "opus" is checked first,
 * then "haiku". Everything else, including a missing ID, is priced as Sonnet.
 *
 * @param {string|null} modelId
 * @returns {{input: number, output: number}} Entry of MODEL_PRICING.
 */
function getPricing(modelId) {
  const fallback = MODEL_PRICING.sonnet;
  if (!modelId) {
    return fallback;
  }

  const needle = modelId.toLowerCase();
  for (const family of ['opus', 'haiku']) {
    if (needle.includes(family)) {
      return MODEL_PRICING[family];
    }
  }
  return fallback;
}
97
+
98
/**
 * Estimate the USD cost of a token count for a given model.
 *
 * Uses the per-million-token rates returned by getPricing(). Falsy token
 * counts are treated as 0.
 *
 * @param {number} inputTokens
 * @param {number} outputTokens
 * @param {string|null} modelId
 * @returns {number} Estimated cost in USD.
 */
function estimateCost(inputTokens, outputTokens, modelId) {
  const { input: inputRate, output: outputRate } = getPricing(modelId);
  const inputCost = ((inputTokens || 0) * inputRate) / 1_000_000;
  const outputCost = ((outputTokens || 0) * outputRate) / 1_000_000;
  return inputCost + outputCost;
}
112
+
113
+ // ---------------------------------------------------------------------------
114
+ // Encoded path helpers
115
+ // ---------------------------------------------------------------------------
116
+
117
/**
 * Turn a Claude Code encoded directory name back into a filesystem path.
 * e.g. "-Users-user-project" → "/Users/user/project"
 *
 * Every dash becomes a slash, so the decoding is lossy: a directory whose
 * real name contains "-" cannot be told apart from a nested path.
 *
 * @param {string} encodedName
 * @returns {string}
 */
function decodeProjectPath(encodedName) {
  const segments = encodedName.split('-');
  return segments.join('/');
}
127
+
128
+ // ---------------------------------------------------------------------------
129
+ // JSONL parsing
130
+ // ---------------------------------------------------------------------------
131
+
132
/**
 * Read and parse a JSONL file, returning the parsed objects paired with
 * their 1-based line number.
 *
 * Blank lines are ignored. Lines that fail to parse, or that parse to
 * something other than a JSON object (null, a number, a string, an array),
 * are skipped and reported through `log`. Callers dereference `data.type` /
 * `data.message` directly, so a bare `null` line must never reach them.
 *
 * @param {string} filePath
 * @param {(msg: string) => void} log
 * @returns {{lineNumber: number, data: object}[]} Empty when the file cannot
 *   be read.
 */
function readJsonl(filePath, log) {
  let content;
  try {
    content = fs.readFileSync(filePath, 'utf8');
  } catch (err) {
    log(`Cannot read JSONL file ${filePath}: ${err.message}`);
    return [];
  }

  const results = [];
  content.split('\n').forEach((rawLine, index) => {
    // trim() also removes the trailing "\r" of CRLF files.
    const line = rawLine.trim();
    if (!line) return;

    const lineNumber = index + 1;
    let data;
    try {
      data = JSON.parse(line);
    } catch (err) {
      log(`JSONL parse error at ${filePath}:${lineNumber}: ${err.message}`);
      return;
    }

    if (data === null || typeof data !== 'object' || Array.isArray(data)) {
      log(`JSONL parse error at ${filePath}:${lineNumber}: expected a JSON object`);
      return;
    }

    results.push({ lineNumber, data });
  });

  return results;
}
166
+
167
+ // ---------------------------------------------------------------------------
168
+ // Mapping functions
169
+ // ---------------------------------------------------------------------------
170
+
171
/**
 * Determine the model that appears most often across parsed JSONL entries.
 *
 * Only entries carrying a truthy `message.model` are counted. Counts live in
 * a Map so that model names which happen to match inherited object keys
 * ("constructor", "toString", ...) are tallied like any other name. On a
 * tie the model that was seen first wins.
 *
 * @param {{lineNumber: number, data: object}[]} entries
 * @returns {string|null} The most frequent model, or null when no entry
 *   names one.
 */
function findMostCommonModel(entries) {
  const counts = new Map();
  for (const { data } of entries) {
    const model = data.message && data.message.model;
    if (!model) continue;
    // Keys are stringified so the return type stays a string.
    const key = String(model);
    counts.set(key, (counts.get(key) || 0) + 1);
  }

  let bestModel = null;
  let bestCount = 0;
  for (const [model, count] of counts) {
    if (count > bestCount) {
      bestModel = model;
      bestCount = count;
    }
  }
  return bestModel;
}
195
+
196
/**
 * Opening tags of the synthetic "user" messages that Claude Code injects for
 * housekeeping: slash-command echoes, stdout captures, resume caveats and so
 * on. These XML-tagged blobs are not what the human typed, so they make poor
 * session titles.
 */
const SYNTHETIC_USER_TAGS = [
  '<local-command-caveat',
  '<local-command-stdout',
  '<local-command-stderr',
  '<command-name',
  '<command-message',
  '<command-args',
  '<bash-input',
  '<bash-stdout',
  '<bash-stderr',
  '<system-reminder',
  '<user-memory-input',
];

/**
 * Tell whether a piece of user text is synthetic housekeeping output.
 *
 * @param {*} text
 * @returns {boolean} false for non-strings; true for empty or whitespace-only
 *   strings and for text whose first non-whitespace characters are one of
 *   SYNTHETIC_USER_TAGS.
 */
function isSyntheticUserText(text) {
  if (typeof text !== 'string') {
    return false;
  }

  const leading = text.trimStart();
  if (leading === '') {
    return true;
  }

  for (const tag of SYNTHETIC_USER_TAGS) {
    if (leading.startsWith(tag)) {
      return true;
    }
  }
  return false;
}
222
+
223
/**
 * Derive a session title from the first *real* user message, i.e. the first
 * user entry whose text is not rejected by isSyntheticUserText().
 *
 * For array content only the first non-empty text block of a message is
 * considered. Titles are cut to at most 100 characters.
 *
 * @param {{lineNumber: number, data: object}[]} entries
 * @returns {string|null} The title, or null when no suitable message exists.
 */
function extractTitle(entries) {
  const MAX_TITLE_LENGTH = 100;

  // First non-empty text carried by a message's content, or null.
  const firstText = (content) => {
    if (typeof content === 'string') return content;
    if (!Array.isArray(content)) return null;
    const hit = content
      .map((block) => (typeof block === 'string' ? block : block && block.text))
      .find((candidate) => typeof candidate === 'string' && candidate.length > 0);
    return hit === undefined ? null : hit;
  };

  for (const { data } of entries) {
    if (data.type !== 'user') continue;

    const text = firstText(data.message && data.message.content);
    if (!text || isSyntheticUserText(text)) continue;

    return text.length > MAX_TITLE_LENGTH ? text.slice(0, MAX_TITLE_LENGTH) : text;
  }
  return null;
}
253
+
254
/**
 * Sum the token usage reported by every JSONL entry of a session.
 *
 * Entries without `message.usage` are ignored; missing counters count as 0.
 *
 * @param {{lineNumber: number, data: object}[]} entries
 * @returns {{input: number, output: number, cacheRead: number, cacheWrite: number}}
 */
function aggregateTokens(entries) {
  const totals = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 };

  entries.forEach(({ data }) => {
    const usage = data.message && data.message.usage;
    if (!usage) return;

    totals.input += usage.input_tokens || 0;
    totals.output += usage.output_tokens || 0;
    totals.cacheRead += usage.cache_read_input_tokens || 0;
    totals.cacheWrite += usage.cache_creation_input_tokens || 0;
  });

  return totals;
}
276
+
277
/**
 * Build the unified session row for one Claude Code session.
 *
 * Start and end times come from the sessions-index entry when present.
 * Otherwise they fall back to the first and last timestamps found in the
 * JSONL entries (file order), which covers sessionMeta objects synthesised
 * from a directory listing.
 *
 * @param {object} sessionMeta Entry from sessions-index.json
 * @param {{lineNumber: number, data: object}[]} entries Parsed JSONL lines
 * @param {string} projectPath Decoded project path
 * @returns {object} Unified session row.
 */
function mapSession(sessionMeta, entries, projectPath) {
  let messageCount = 0;
  let toolCallCount = 0;
  let errorCount = 0;
  const timestamps = [];

  for (const { data } of entries) {
    if (data.type === 'user' || data.type === 'assistant') messageCount += 1;
    if (data.toolName) toolCallCount += 1;

    const toolFailed = Boolean(data.toolUseResult && data.toolUseResult.error);
    const stoppedOnError = Boolean(data.message && data.message.stop_reason === 'error');
    if (toolFailed || stoppedOnError) errorCount += 1;

    if (typeof data.timestamp === 'string' && data.timestamp) {
      timestamps.push(data.timestamp);
    }
  }

  const tokens = aggregateTokens(entries);
  const model = findMostCommonModel(entries);

  const firstTs = timestamps[0] || null;
  const lastTs = timestamps[timestamps.length - 1] || null;
  const created = sessionMeta.created || firstTs || null;
  const modified = sessionMeta.modified || lastTs || created || null;

  return {
    id: sessionMeta.sessionId,
    source: SOURCE,
    date: isoToDate(created),
    started_at: created,
    ended_at: modified,
    duration_minutes: diffMinutes(created, modified),
    active_minutes: null, // filled in later by the active-time calculator
    message_count: messageCount,
    tokens_input: tokens.input,
    tokens_output: tokens.output,
    tokens_reasoning: 0,
    tokens_cache_read: tokens.cacheRead,
    tokens_cache_write: tokens.cacheWrite,
    cost_usd: estimateCost(tokens.input, tokens.output, model),
    project: sessionMeta.projectPath || projectPath || null,
    title: extractTitle(entries),
    model,
    error_count: errorCount,
    tool_call_count: toolCallCount,
    summary_additions: 0,
    summary_deletions: 0,
    summary_files: 0,
    reverted: 0,
    time_compacting: 0,
  };
}
337
+
338
/**
 * Flatten a Claude Code tool_result `content` payload into a short string.
 *
 * The payload may be a string or an array mixing plain strings and
 * {type:"text", text} blocks; blocks without a string `text` (images etc.)
 * are dropped. Array pieces are joined with newlines. The trimmed result is
 * capped at 1024 characters, plus an ellipsis when it was cut, so one
 * tool_result cannot dominate the per-message text cap.
 *
 * @param {*} content
 * @returns {string} Empty string for null/undefined or unsupported payloads.
 */
function stringifyToolResult(content) {
  const MAX_CHARS = 1024;
  if (content == null) return '';

  let flattened = '';
  if (typeof content === 'string') {
    flattened = content;
  } else if (Array.isArray(content)) {
    flattened = content
      .map((block) => {
        if (typeof block === 'string') return block;
        if (block && typeof block.text === 'string') return block.text;
        return null;
      })
      .filter((piece) => piece !== null)
      .join('\n');
  }

  const trimmed = flattened.trim();
  return trimmed.length > MAX_CHARS ? `${trimmed.slice(0, MAX_CHARS)}…` : trimmed;
}
364
+
365
/**
 * Map a single user/assistant JSONL entry to a unified message row.
 *
 * The text column is what the raw-conversation replay renders. Claude Code
 * content is either a plain string or an array of typed blocks:
 *   - {type:"text", text}              prose, stored as-is
 *   - {type:"thinking", thinking}      stored with a "[thinking] " prefix
 *   - {type:"tool_result", content}    flattened via stringifyToolResult and
 *                                      stored with a "[tool_result] " prefix
 *   - {type:"tool_use", ...}           skipped; tool calls are shown in the
 *                                      tool-call timeline instead
 * The joined text is capped at 4096 characters so the column stays bounded.
 * `content_length` is the sum of the piece lengths before that cap.
 *
 * @param {string} sessionId
 * @param {number} lineNumber
 * @param {object} data Parsed JSONL line
 * @returns {object} Unified message row.
 */
function mapMessage(sessionId, lineNumber, data) {
  const MAX_TEXT_CHARS = 4096;
  const message = data.message;
  const usage = (message && message.usage) || {};
  const modelId = (message && message.model) || null;
  const content = message && message.content;

  // Returns [countedLength, transcriptText] for a block, or null to skip it.
  const renderBlock = (block) => {
    if (typeof block === 'string') return [block.length, block];
    if (!block || typeof block !== 'object') return null;
    if (typeof block.text === 'string' && block.text.length) {
      return [block.text.length, block.text];
    }
    if (typeof block.thinking === 'string' && block.thinking.length) {
      return [block.thinking.length, `[thinking] ${block.thinking}`];
    }
    if (block.type === 'tool_result') {
      const piece = stringifyToolResult(block.content);
      return piece ? [piece.length, `[tool_result] ${piece}`] : null;
    }
    return null;
  };

  let blocks = [];
  if (typeof content === 'string') {
    blocks = [content];
  } else if (Array.isArray(content)) {
    blocks = content;
  }

  const pieces = [];
  let contentLength = 0;
  for (const block of blocks) {
    const rendered = renderBlock(block);
    if (!rendered) continue;
    contentLength += rendered[0];
    pieces.push(rendered[1]);
  }

  const joined = pieces.join('\n').trim();
  const text = joined ? joined.slice(0, MAX_TEXT_CHARS) : null;

  const inputTokens = usage.input_tokens || 0;
  const outputTokens = usage.output_tokens || 0;

  return {
    id: `cc_${sessionId}_${lineNumber}`,
    session_id: sessionId,
    source: SOURCE,
    role: data.type,
    timestamp: data.timestamp || null,
    tokens_input: inputTokens,
    tokens_output: outputTokens,
    tokens_reasoning: 0,
    cost_usd: estimateCost(inputTokens, outputTokens, modelId),
    content_length: contentLength,
    is_error: 0,
    model_id: modelId,
    text,
  };
}
441
+
442
/**
 * Map a single JSONL entry to a unified part row.
 *
 * The part id is derived from the session id and the entry's line number.
 *
 * @param {string} sessionId
 * @param {number} lineNumber
 * @param {object} data Parsed JSONL line
 * @param {string|null} messageId ID of the parent message (if applicable)
 * @returns {object} Unified part row.
 */
function mapPart(sessionId, lineNumber, data, messageId) {
  const partId = `cc_${sessionId}_part_${lineNumber}`;
  const { type, timestamp } = data;

  return {
    id: partId,
    message_id: messageId,
    session_id: sessionId,
    source: SOURCE,
    type: type || null,
    timestamp: timestamp || null,
  };
}
461
+
462
/**
 * Pull the target file path out of a tool input object.
 *
 * The keys `path`, `file_path` and `filePath` are tried in that order; the
 * first one holding a string wins.
 *
 * @param {object|null} toolInput
 * @returns {string|null} The path, or null when none of the keys is a string.
 */
function extractTargetFile(toolInput) {
  if (!toolInput) {
    return null;
  }

  for (const key of ['path', 'file_path', 'filePath']) {
    const candidate = toolInput[key];
    if (typeof candidate === 'string') {
      return candidate;
    }
  }
  return null;
}
474
+
475
/**
 * Map a JSONL entry that carries `toolName` to a unified tool-call row.
 *
 * The call counts as failed when `toolUseResult.error` is truthy; that value
 * is stringified into `error_message`.
 *
 * @param {string} sessionId
 * @param {number} lineNumber
 * @param {object} data Parsed JSONL line
 * @returns {object} Unified tool-call row.
 */
function mapToolCall(sessionId, lineNumber, data) {
  const toolCallId = `cc_${sessionId}_tool_${lineNumber}`;
  const failure = data.toolUseResult ? data.toolUseResult.error : null;
  const failed = Boolean(failure);

  return {
    id: toolCallId,
    // NOTE(review): part_id reuses the tool-call id, whereas mapPart() builds
    // ids as `cc_<session>_part_<line>`. Confirm whether this is meant to
    // reference a part row.
    part_id: toolCallId,
    session_id: sessionId,
    source: SOURCE,
    tool_name: data.toolName || null,
    timestamp: data.timestamp || null,
    status: failed ? 'error' : 'success',
    error_message: failed ? String(failure) : null,
    target_file: extractTargetFile(data.toolInput),
  };
}
499
+
500
+ // ---------------------------------------------------------------------------
501
+ // Core ETL: process a single session
502
+ // ---------------------------------------------------------------------------
503
+
504
/**
 * Documentation for processSession(), which is defined further below, after
 * the entryUserText() helper.
 *
 * Process a single Claude Code session: read its JSONL file, parse entries,
 * map to unified rows, and write to boss.db.
 *
 * @param {object} bossDb sql.js database instance (boss.db)
 * @param {string} projectDir Path to the project directory under projects/
 * @param {object} sessionMeta Entry from sessions-index.json
 * @param {string} projectPath Decoded project path
 * @param {(msg: string) => void} log Progress callback
 * @returns {{messages: number, parts: number, toolCalls: number, errors: number, skipped?: boolean}}
 *   `skipped` is true when the session was recognised as a judge prompt and
 *   therefore not imported.
 */
515
/**
 * Return the plain text of a user JSONL entry: the content itself when it is
 * a string, otherwise the first array element that is a string or has a
 * string `text`. Used only for judge-artifact detection.
 *
 * @param {object} data Parsed JSONL line
 * @returns {string} The text, or '' when none is found.
 */
function entryUserText(data) {
  const content = data.message && data.message.content;
  if (typeof content === 'string') {
    return content;
  }
  if (!Array.isArray(content)) {
    return '';
  }

  const textOf = (block) => {
    if (typeof block === 'string') return block;
    if (block && typeof block.text === 'string') return block.text;
    return undefined;
  };
  const firstText = content.map(textOf).find((candidate) => candidate !== undefined);
  return firstText === undefined ? '' : firstText;
}
530
+
531
/**
 * Import one Claude Code session into boss.db.
 *
 * Reads `<projectDir>/<sessionId>.jsonl`, maps its entries to unified
 * message / part / tool-call rows plus one session row, and writes them.
 * A session whose first user entry is recognised by isJudgePrompt() is not
 * imported and is reported with `skipped: true`.
 *
 * @param {object} bossDb sql.js database instance (boss.db)
 * @param {string} projectDir Path to the project directory under projects/
 * @param {object} sessionMeta Entry from sessions-index.json
 * @param {string} projectPath Decoded project path
 * @param {(msg: string) => void} log Progress callback
 * @returns {{messages: number, parts: number, toolCalls: number, errors: number, skipped?: boolean}}
 */
function processSession(bossDb, projectDir, sessionMeta, projectPath, log) {
  const { sessionId } = sessionMeta;
  const nothingImported = () => ({ messages: 0, parts: 0, toolCalls: 0, errors: 0 });

  const jsonlPath = path.join(projectDir, `${sessionId}.jsonl`);
  if (!fs.existsSync(jsonlPath)) {
    log(`Session JSONL not found: ${jsonlPath}`);
    return nothingImported();
  }

  const entries = readJsonl(jsonlPath, log);
  if (entries.length === 0) {
    return nothingImported();
  }

  // Sessions created by our own LLM judge (`claude -p` logs every judge call
  // as a session) must not be re-imported, or they would feed back into the
  // analysis. Only the first user entry decides.
  const firstUser = entries.find(({ data }) => data.type === 'user');
  if (firstUser && isJudgePrompt(entryUserText(firstUser.data))) {
    return { ...nothingImported(), skipped: true };
  }

  // --- Build unified rows ---
  const messages = [];
  const parts = [];
  const toolCalls = [];
  let errorCount = 0;

  for (const { lineNumber, data } of entries) {
    const isMessage = data.type === 'user' || data.type === 'assistant';
    if (isMessage) {
      messages.push(mapMessage(sessionId, lineNumber, data));
      parts.push(mapPart(sessionId, lineNumber, data, `cc_${sessionId}_${lineNumber}`));
    }

    if (!data.toolName) continue;

    const toolCall = mapToolCall(sessionId, lineNumber, data);
    toolCalls.push(toolCall);

    // Tool-call entries get a part too, attached to the most recent message
    // collected so far (of either role), or to none if there is no message yet.
    const parentId = messages.length > 0 ? messages[messages.length - 1].id : null;
    parts.push(mapPart(sessionId, lineNumber, data, parentId));

    if (toolCall.status === 'error') {
      errorCount += 1;
    }
  }

  // --- Session row ---
  const sessionRow = mapSession(sessionMeta, entries, projectPath);
  // Replace mapSession's estimate with the count of failed tool calls.
  sessionRow.error_count = errorCount;

  // --- Write to boss.db ---
  upsertSession(bossDb, sessionRow);
  bulkInsertMessages(bossDb, messages);
  bulkInsertParts(bossDb, parts);
  bulkInsertToolCalls(bossDb, toolCalls);

  return {
    messages: messages.length,
    parts: parts.length,
    toolCalls: toolCalls.length,
    errors: errorCount,
  };
}
609
+
610
+ // ---------------------------------------------------------------------------
611
+ // Main entry point
612
+ // ---------------------------------------------------------------------------
613
+
614
/**
 * Build a zeroed totals object in the shape collectClaudeCode() returns.
 *
 * @returns {{sessionCount: number, messageCount: number, partCount: number, toolCallCount: number, errorSessionCount: number}}
 */
function emptyCollectTotals() {
  return {
    sessionCount: 0,
    messageCount: 0,
    partCount: 0,
    toolCallCount: 0,
    errorSessionCount: 0,
  };
}

/**
 * Read a project's sessions-index.json.
 *
 * @param {string} indexPath Full path to sessions-index.json
 * @param {string} folderName Project folder name, used in log messages
 * @param {(msg: string) => void} log
 * @returns {object[]|null} The parsed array, or null when the file is
 *   missing, unparseable, or not an array (the caller then scans *.jsonl).
 */
function readSessionsIndexFile(indexPath, folderName, log) {
  if (!fs.existsSync(indexPath)) return null;
  try {
    const parsed = JSON.parse(fs.readFileSync(indexPath, 'utf8'));
    if (Array.isArray(parsed)) return parsed;
    log(`sessions-index.json in ${folderName} is not an array; falling back to JSONL scan`);
  } catch (err) {
    log(`Cannot parse sessions-index.json in ${folderName} (${err.message}); falling back to JSONL scan`);
  }
  return null;
}

/**
 * Synthesise minimal session metadata from the *.jsonl files of a project
 * directory. `modified` is the file mtime as an ISO string (or "now" when
 * the file cannot be stat'ed) so the incremental-sync watermark still works.
 *
 * @param {string} projectDir
 * @returns {{sessionId: string, modified: string}[]}
 * @throws when the directory itself cannot be listed.
 */
function scanSessionFiles(projectDir) {
  const sessions = [];
  for (const fileName of fs.readdirSync(projectDir)) {
    if (!fileName.endsWith('.jsonl')) continue;
    let modified;
    try {
      modified = fs.statSync(path.join(projectDir, fileName)).mtime.toISOString();
    } catch (_) {
      modified = new Date().toISOString();
    }
    sessions.push({ sessionId: fileName.slice(0, -'.jsonl'.length), modified });
  }
  return sessions;
}

/**
 * Collect data from Claude Code and write to boss.db.
 *
 * Performs incremental sync based on etl_state.last_session_time: only
 * sessions whose `modified` is later than that watermark are processed (all
 * sessions on the first run). Each session is individually wrapped in
 * try/catch so that one bad file never stops the whole ETL run.
 *
 * The database is saved every BATCH_SIZE sessions. Those intermediate
 * checkpoints deliberately keep the watermark of the *start* of the run:
 * sessions are visited folder by folder, not in time order, so advancing the
 * watermark mid-run would make an interrupted run permanently skip older
 * sessions it had not reached yet. The watermark only advances in the final
 * state update.
 *
 * Note: a session that fails is counted in `errorSessionCount` but does not
 * hold the watermark back, so it is not retried unless it is modified again.
 *
 * @param {object} bossDb sql.js database instance
 * @param {string} claudeCodePath path to ~/.claude/
 * @param {object} [options] { onProgress: (msg) => void }
 * @returns {Promise<{sessionCount: number, messageCount: number, partCount: number, toolCallCount: number, errorSessionCount: number}>}
 */
async function collectClaudeCode(bossDb, claudeCodePath, options = {}) {
  const log = options.onProgress || (() => {});

  // -- 1. Read ETL watermark --------------------------------------------------
  const etlState = getEtlState(bossDb, SOURCE);
  const lastSessionTime = etlState ? etlState.last_session_time || null : null;
  const lastSessionId = etlState ? etlState.last_session_id || null : null;
  log(`ETL watermark: last_session_time = ${lastSessionTime}`);

  // -- 2. Find all project directories ----------------------------------------
  const projectsDir = path.join(claudeCodePath, 'projects');
  if (!fs.existsSync(projectsDir) || !fs.statSync(projectsDir).isDirectory()) {
    log(`Projects directory not found: ${projectsDir}`);
    return emptyCollectTotals();
  }

  let projectFolders;
  try {
    projectFolders = fs
      .readdirSync(projectsDir, { withFileTypes: true })
      .filter((d) => d.isDirectory());
  } catch (err) {
    log(`Cannot read projects directory: ${err.message}`);
    return emptyCollectTotals();
  }

  log(`Found ${projectFolders.length} project folder(s)`);

  const totals = emptyCollectTotals();
  let latestSessionTime = lastSessionTime;
  let latestSessionId = lastSessionId;
  let processedInBatch = 0;

  // -- 3. Iterate over project directories ------------------------------------
  for (const folder of projectFolders) {
    const projectDir = path.join(projectsDir, folder.name);
    const indexPath = path.join(projectDir, 'sessions-index.json');
    const projectPath = decodeProjectPath(folder.name) || folder.name;

    // 3a. Prefer sessions-index.json; fall back to scanning *.jsonl files,
    //     since the index is not always present.
    let sessionsIndex = readSessionsIndexFile(indexPath, folder.name, log);
    if (!sessionsIndex) {
      try {
        sessionsIndex = scanSessionFiles(projectDir);
      } catch (err) {
        log(`Cannot read ${folder.name}: ${err.message}`);
        continue;
      }
      if (!sessionsIndex.length) {
        log(`No .jsonl files in ${folder.name}, skipping`);
        continue;
      }
    }

    // 3b. Keep only sessions modified after the run-start watermark.
    const newSessions = sessionsIndex.filter((s) => {
      if (!s || !s.sessionId) return false;
      if (!lastSessionTime) return true; // first run: process all
      if (!s.modified) return false;
      return s.modified > lastSessionTime;
    });

    if (newSessions.length === 0) {
      continue;
    }

    log(
      `Project ${folder.name}: ${newSessions.length} new session(s) ` +
        `(of ${sessionsIndex.length} total)`
    );

    // 3c. Process each new session
    for (const sessionMeta of newSessions) {
      try {
        const result = processSession(bossDb, projectDir, sessionMeta, projectPath, log);

        if (!result.skipped) {
          totals.sessionCount++;
          totals.messageCount += result.messages;
          totals.partCount += result.parts;
          totals.toolCallCount += result.toolCalls;
        }

        // Track the high-water mark. Sessions without a `modified` value are
        // ignored here so the watermark can never become undefined.
        if (
          sessionMeta.modified &&
          (!latestSessionTime || sessionMeta.modified > latestSessionTime)
        ) {
          latestSessionTime = sessionMeta.modified;
          latestSessionId = sessionMeta.sessionId;
        }
      } catch (err) {
        totals.errorSessionCount++;
        log(`Error processing session ${sessionMeta.sessionId}: ${err.message}`);
      }

      // Periodic save after each batch. The persisted watermark stays at its
      // run-start value (see the function comment for why).
      processedInBatch++;
      if (processedInBatch >= BATCH_SIZE) {
        updateEtlState(bossDb, SOURCE, {
          last_sync_at: new Date().toISOString(),
          last_session_id: lastSessionId,
          last_session_time: lastSessionTime,
          status: 'running',
        });
        saveDb();
        log(`Batch checkpoint: ${totals.sessionCount} sessions processed so far`);
        processedInBatch = 0;
      }
    }
  }

  // -- 4. Final ETL state update ----------------------------------------------
  updateEtlState(bossDb, SOURCE, {
    last_sync_at: new Date().toISOString(),
    last_session_id: latestSessionId,
    last_session_time: latestSessionTime,
    status: 'idle',
  });
  saveDb();

  log(
    `ETL complete: ${totals.sessionCount} sessions, ` +
      `${totals.messageCount} messages, ${totals.partCount} parts, ` +
      `${totals.toolCallCount} tool calls` +
      (totals.errorSessionCount ? `, ${totals.errorSessionCount} failed` : '')
  );

  return totals;
}
821
+
822
+ // ---------------------------------------------------------------------------
823
+ // Exports
824
+ // ---------------------------------------------------------------------------
825
+
826
+ module.exports = { collectClaudeCode };