clementine-agent 1.0.69 → 1.0.70
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/brain/adapters/pdf.js +10 -4
- package/dist/brain/ingestion-pipeline.js +46 -26
- package/dist/cli/dashboard.js +28 -2
- package/package.json +1 -1
|
@@ -13,18 +13,24 @@ export async function* parsePdf(filePath) {
|
|
|
13
13
|
try {
|
|
14
14
|
buf = readFileSync(filePath);
|
|
15
15
|
}
|
|
16
|
-
catch {
|
|
17
|
-
|
|
16
|
+
catch (err) {
|
|
17
|
+
throw new Error(`Failed to read PDF ${path.basename(filePath)}: ${err instanceof Error ? err.message : String(err)}`);
|
|
18
18
|
}
|
|
19
19
|
let result;
|
|
20
20
|
try {
|
|
21
21
|
result = await pdfParse(buf);
|
|
22
22
|
}
|
|
23
|
-
catch {
|
|
24
|
-
|
|
23
|
+
catch (err) {
|
|
24
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
25
|
+
const hint = /password/i.test(msg) ? ' (looks password-protected)' : '';
|
|
26
|
+
throw new Error(`Failed to parse PDF ${path.basename(filePath)}${hint}: ${msg}`);
|
|
25
27
|
}
|
|
26
28
|
const hint = path.basename(filePath, path.extname(filePath));
|
|
27
29
|
const pages = splitPages(result.text);
|
|
30
|
+
const hasAnyText = pages.some((p) => p.trim().length > 0);
|
|
31
|
+
if (!hasAnyText) {
|
|
32
|
+
throw new Error(`PDF ${path.basename(filePath)} has no extractable text — likely image-only (OCR is not supported). Re-export with a text layer or transcribe it first.`);
|
|
33
|
+
}
|
|
28
34
|
for (let i = 0; i < pages.length; i++) {
|
|
29
35
|
const pageText = pages[i].trim();
|
|
30
36
|
if (!pageText)
|
|
@@ -75,32 +75,41 @@ export async function runIngestion(opts) {
|
|
|
75
75
|
const samples = [];
|
|
76
76
|
const pendingStructured = [];
|
|
77
77
|
for (const iter of recordIterators) {
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
await
|
|
78
|
+
// Wrap so one unreadable file doesn't abort an otherwise-good folder
|
|
79
|
+
// ingest — record the adapter error and move on to the next iterator.
|
|
80
|
+
try {
|
|
81
|
+
for await (const record of iter) {
|
|
82
|
+
recordsIn += 1;
|
|
83
|
+
if (opts.limit && recordsIn > opts.limit)
|
|
84
|
+
break;
|
|
85
|
+
const flowPath = classifyRecord(record, intelligenceMode);
|
|
86
|
+
if (flowPath === 'structured') {
|
|
87
|
+
if (!schemaMapping && samples.length < SAMPLE_SIZE) {
|
|
88
|
+
samples.push(record);
|
|
89
|
+
pendingStructured.push(record);
|
|
90
|
+
continue;
|
|
91
|
+
}
|
|
92
|
+
if (!schemaMapping && samples.length >= SAMPLE_SIZE) {
|
|
93
|
+
schemaMapping = await inferSchema(samples, source.slug);
|
|
94
|
+
await applyStructuredColumns(schemaMapping);
|
|
95
|
+
for (const s of pendingStructured) {
|
|
96
|
+
await processStructured(s, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
|
|
97
|
+
}
|
|
98
|
+
pendingStructured.length = 0;
|
|
99
|
+
}
|
|
100
|
+
if (schemaMapping) {
|
|
101
|
+
await processStructured(record, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
|
|
94
102
|
}
|
|
95
|
-
pendingStructured.length = 0;
|
|
96
103
|
}
|
|
97
|
-
|
|
98
|
-
await
|
|
104
|
+
else {
|
|
105
|
+
await processFreeForm(record, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
|
|
99
106
|
}
|
|
100
107
|
}
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
108
|
+
}
|
|
109
|
+
catch (err) {
|
|
110
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
111
|
+
errors.push({ error: msg });
|
|
112
|
+
report('parsing', msg);
|
|
104
113
|
}
|
|
105
114
|
}
|
|
106
115
|
// Flush structured records that never reached the schema-infer threshold
|
|
@@ -207,11 +216,22 @@ async function processStructured(record, mapping, source, opts, store, _report,
|
|
|
207
216
|
}
|
|
208
217
|
async function processFreeForm(record, source, opts, store, report, planned, _errors, _writtenSummaries, counters) {
|
|
209
218
|
try {
|
|
210
|
-
report('distilling');
|
|
211
219
|
const chunks = chunkContent(record.content, 3000);
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
220
|
+
report('distilling', chunks.length > 1 ? `chunk 0/${chunks.length}` : undefined);
|
|
221
|
+
// Parallelize per-chunk Haiku calls in small batches — one chunk at a
|
|
222
|
+
// time on a 30KB PDF adds up to 60–90s; 5-way concurrency cuts it to
|
|
223
|
+
// ~15s without pushing the API's rate limits.
|
|
224
|
+
const CONCURRENCY = 5;
|
|
225
|
+
const distillations = new Array(chunks.length);
|
|
226
|
+
let completed = 0;
|
|
227
|
+
for (let i = 0; i < chunks.length; i += CONCURRENCY) {
|
|
228
|
+
const batch = chunks.slice(i, i + CONCURRENCY);
|
|
229
|
+
const results = await Promise.all(batch.map((chunk) => distillChunk(chunk, record.metadata ?? {})));
|
|
230
|
+
for (let j = 0; j < results.length; j++)
|
|
231
|
+
distillations[i + j] = results[j];
|
|
232
|
+
completed += results.length;
|
|
233
|
+
if (chunks.length > 1)
|
|
234
|
+
report('distilling', `chunk ${completed}/${chunks.length}`);
|
|
215
235
|
}
|
|
216
236
|
const targetFolder = sanitizeFolder(source.targetFolder || `04-Ingest/${source.slug}`, source.slug);
|
|
217
237
|
const partial = combineDistillations(record, distillations, source.slug, targetFolder);
|
package/dist/cli/dashboard.js
CHANGED
|
@@ -2483,6 +2483,8 @@ export async function cmdDashboard(opts) {
|
|
|
2483
2483
|
title: r.title, tags: r.tags, targetRelPath: r.targetRelPath,
|
|
2484
2484
|
body: (r.body || '').slice(0, 800),
|
|
2485
2485
|
})),
|
|
2486
|
+
recordsIn: result.recordsIn,
|
|
2487
|
+
errors: result.errors.slice(0, 10),
|
|
2486
2488
|
});
|
|
2487
2489
|
}
|
|
2488
2490
|
catch (err) {
|
|
@@ -10210,14 +10212,25 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
|
|
|
10210
10212
|
|
|
10211
10213
|
const manifest = manifestData.manifest;
|
|
10212
10214
|
const preview = finalData.preview || [];
|
|
10215
|
+
const errorsList = finalData.errors || [];
|
|
10213
10216
|
const manifestRows = Object.entries(manifest.formats || {})
|
|
10214
10217
|
.map(([fmt, n]) => '<tr><td>' + escapeHtml(fmt) + '</td><td>' + n + '</td></tr>').join('');
|
|
10218
|
+
const warnBlock = errorsList.length
|
|
10219
|
+
? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">' +
|
|
10220
|
+
'<div style="font-weight:600;margin-bottom:4px">' + errorsList.length + ' file(s) could not be ingested</div>' +
|
|
10221
|
+
errorsList.map((e) => '<div style="font-family:monospace;font-size:12px">• ' + escapeHtml(e.error) + '</div>').join('') +
|
|
10222
|
+
'</div>'
|
|
10223
|
+
: '';
|
|
10224
|
+
const emptyNote = (preview.length === 0 && !errorsList.length)
|
|
10225
|
+
? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">No records extracted. The file may be empty or in an unsupported format.</div>'
|
|
10226
|
+
: '';
|
|
10215
10227
|
manifestEl.innerHTML =
|
|
10216
10228
|
'<div class="card" style="padding:12px"><div style="font-weight:600;margin-bottom:8px">Manifest</div>' +
|
|
10217
10229
|
'<div style="color:var(--muted);font-size:13px;margin-bottom:8px">' +
|
|
10218
10230
|
manifest.totalFiles + ' file(s), ' + brainHumanBytes(manifest.totalBytes) +
|
|
10219
10231
|
' · scanned in ' + Math.floor((Date.now() - progress.startedAt) / 1000) + 's</div>' +
|
|
10220
|
-
'<table class="data-table"><thead><tr><th>Format</th><th>Count</th></tr></thead><tbody>' + manifestRows + '</tbody></table
|
|
10232
|
+
'<table class="data-table"><thead><tr><th>Format</th><th>Count</th></tr></thead><tbody>' + manifestRows + '</tbody></table>' +
|
|
10233
|
+
warnBlock + emptyNote + '</div>';
|
|
10221
10234
|
if (preview.length) {
|
|
10222
10235
|
const previewHtml = preview.slice(0, 10).map((p, i) =>
|
|
10223
10236
|
'<div class="card" style="padding:12px;margin-bottom:8px">' +
|
|
@@ -10280,14 +10293,27 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
|
|
|
10280
10293
|
return;
|
|
10281
10294
|
}
|
|
10282
10295
|
const elapsed = Math.floor((Date.now() - progress.startedAt) / 1000);
|
|
10296
|
+
const errList = finalData.errors || [];
|
|
10297
|
+
const headerColor = (finalData.recordsWritten > 0) ? '#4ade80' : '#e5a84a';
|
|
10298
|
+
const headerIcon = (finalData.recordsWritten > 0) ? '✓' : '⚠';
|
|
10299
|
+
const headerText = (finalData.recordsWritten > 0)
|
|
10300
|
+
? 'Ingestion complete'
|
|
10301
|
+
: 'Ingestion finished, but nothing was written';
|
|
10302
|
+
const errBlock = errList.length
|
|
10303
|
+
? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">' +
|
|
10304
|
+
'<div style="font-weight:600;margin-bottom:4px">' + errList.length + ' error(s)</div>' +
|
|
10305
|
+
errList.map((e) => '<div style="font-family:monospace;font-size:12px">• ' + escapeHtml(e.error) + '</div>').join('') +
|
|
10306
|
+
'</div>'
|
|
10307
|
+
: '';
|
|
10283
10308
|
progEl.innerHTML =
|
|
10284
10309
|
'<div class="card" style="padding:12px">' +
|
|
10285
|
-
'<div style="font-weight:600;color
|
|
10310
|
+
'<div style="font-weight:600;color:' + headerColor + '">' + headerIcon + ' ' + headerText + ' · ' + elapsed + 's</div>' +
|
|
10286
10311
|
'<div>Records in: ' + finalData.recordsIn + '</div>' +
|
|
10287
10312
|
'<div>Records written: ' + finalData.recordsWritten + '</div>' +
|
|
10288
10313
|
'<div>Records skipped: ' + finalData.recordsSkipped + '</div>' +
|
|
10289
10314
|
'<div>Records failed: ' + finalData.recordsFailed + '</div>' +
|
|
10290
10315
|
(finalData.overviewNotePath ? '<div style="margin-top:8px">Overview note: <code>' + escapeHtml(finalData.overviewNotePath) + '</code></div>' : '') +
|
|
10316
|
+
errBlock +
|
|
10291
10317
|
'</div>';
|
|
10292
10318
|
}
|
|
10293
10319
|
|