clementine-agent 1.0.69 → 1.0.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,14 @@
1
1
  /**
2
- * Clementine — PDF adapter (text layer only).
2
+ * Clementine — PDF adapter.
3
3
  *
4
- * Yields one RawRecord per PDF page. pdf-parse concatenates pages with
5
- * a form-feed separator (\f), so we split on that after extraction.
4
+ * Fast path: pdf-parse extracts the text layer. Yields one RawRecord per
5
+ * page (pdf-parse concatenates pages with \f).
6
+ *
7
+ * OCR fallback: for image-only / scanned PDFs, pdf-parse returns empty
8
+ * text. We then ask Claude Code to read the PDF itself — its built-in
9
+ * Read tool handles PDFs natively (including vision for scanned pages),
10
+ * and the call goes through the Agent SDK so it works with the user's
11
+ * OAuth session (no separate ANTHROPIC_API_KEY required).
6
12
  */
7
13
  import type { RawRecord } from '../../types.js';
8
14
  export declare function parsePdf(filePath: string): AsyncIterable<RawRecord>;
@@ -1,30 +1,66 @@
1
1
  /**
2
- * Clementine — PDF adapter (text layer only).
2
+ * Clementine — PDF adapter.
3
3
  *
4
- * Yields one RawRecord per PDF page. pdf-parse concatenates pages with
5
- * a form-feed separator (\f), so we split on that after extraction.
4
+ * Fast path: pdf-parse extracts the text layer. Yields one RawRecord per
5
+ * page (pdf-parse concatenates pages with \f).
6
+ *
7
+ * OCR fallback: for image-only / scanned PDFs, pdf-parse returns empty
8
+ * text. We then ask Claude Code to read the PDF itself — its built-in
9
+ * Read tool handles PDFs natively (including vision for scanned pages),
10
+ * and the call goes through the Agent SDK so it works with the user's
11
+ * OAuth session (no separate ANTHROPIC_API_KEY required).
6
12
  */
7
13
  import { readFileSync } from 'node:fs';
8
14
  import path from 'node:path';
9
15
  import pdfParse from 'pdf-parse';
10
16
  import { contentHash } from './common.js';
17
+ import { MODELS } from '../../config.js';
11
18
  export async function* parsePdf(filePath) {
12
19
  let buf;
13
20
  try {
14
21
  buf = readFileSync(filePath);
15
22
  }
16
- catch {
17
- return;
23
+ catch (err) {
24
+ throw new Error(`Failed to read PDF ${path.basename(filePath)}: ${err instanceof Error ? err.message : String(err)}`);
18
25
  }
19
26
  let result;
20
27
  try {
21
28
  result = await pdfParse(buf);
22
29
  }
23
- catch {
24
- return;
30
+ catch (err) {
31
+ const msg = err instanceof Error ? err.message : String(err);
32
+ const hint = /password/i.test(msg) ? ' (looks password-protected)' : '';
33
+ throw new Error(`Failed to parse PDF ${path.basename(filePath)}${hint}: ${msg}`);
25
34
  }
26
35
  const hint = path.basename(filePath, path.extname(filePath));
27
36
  const pages = splitPages(result.text);
37
+ const hasAnyText = pages.some((p) => p.trim().length > 0);
38
+ if (!hasAnyText) {
39
+ // Image-only / scanned PDF — fall back to Claude's native PDF reading.
40
+ const ocrPages = await ocrPdfViaClaude(filePath);
41
+ if (ocrPages.length === 0) {
42
+ throw new Error(`PDF ${path.basename(filePath)} has no extractable text and OCR returned nothing. The file may be corrupt or empty.`);
43
+ }
44
+ for (let i = 0; i < ocrPages.length; i++) {
45
+ const pageText = ocrPages[i].trim();
46
+ if (!pageText)
47
+ continue;
48
+ yield {
49
+ externalId: `pdf-ocr-${hint}-p${i + 1}-${contentHash(pageText)}`,
50
+ content: pageText,
51
+ rawPayload: pageText,
52
+ metadata: {
53
+ adapter: 'pdf',
54
+ extraction: 'claude-ocr',
55
+ source_file: filePath,
56
+ page: i + 1,
57
+ total_pages: ocrPages.length,
58
+ content_hash: contentHash(pageText),
59
+ },
60
+ };
61
+ }
62
+ return;
63
+ }
28
64
  for (let i = 0; i < pages.length; i++) {
29
65
  const pageText = pages[i].trim();
30
66
  if (!pageText)
@@ -35,6 +71,7 @@ export async function* parsePdf(filePath) {
35
71
  rawPayload: pageText,
36
72
  metadata: {
37
73
  adapter: 'pdf',
74
+ extraction: 'text-layer',
38
75
  source_file: filePath,
39
76
  page: i + 1,
40
77
  total_pages: result.numpages,
@@ -44,6 +81,50 @@ export async function* parsePdf(filePath) {
44
81
  };
45
82
  }
46
83
  }
84
+ /**
85
+ * OCR fallback via the Claude Agent SDK. Asks Claude Code to Read the PDF
86
+ * and transcribe every page verbatim, separated by \f. Returns one string
87
+ * per page. Empty array on failure (caller decides how to handle).
88
+ */
89
+ async function ocrPdfViaClaude(filePath) {
90
+ try {
91
+ const { query } = await import('@anthropic-ai/claude-agent-sdk');
92
+ const stream = query({
93
+ prompt: `Read the PDF at ${JSON.stringify(filePath)} using the Read tool. Transcribe every page's text verbatim — preserve the reading order, headings, lists, and paragraphs exactly as they appear. Separate pages with the form-feed character (\\f). Do NOT summarize, paraphrase, add commentary, or wrap in code fences. Output only the transcribed text.`,
94
+ options: {
95
+ model: MODELS.haiku,
96
+ maxTurns: 4, // Read tool call + response (a few turns of thinking is fine)
97
+ systemPrompt: 'You are a faithful OCR transcriber. Copy text exactly as written. When the PDF has images or scans, read the text from them using vision. Never invent content.',
98
+ // Claude Code's built-in Read tool handles PDFs (text + vision)
99
+ allowedTools: ['Read'],
100
+ permissionMode: 'bypassPermissions',
101
+ settingSources: [],
102
+ },
103
+ });
104
+ let text = '';
105
+ for await (const message of stream) {
106
+ if (message.type === 'assistant') {
107
+ const content = message
108
+ .message?.content ?? [];
109
+ for (const block of content) {
110
+ if (block.type === 'text' && typeof block.text === 'string') {
111
+ text += block.text;
112
+ }
113
+ }
114
+ }
115
+ else if (message.type === 'result') {
116
+ break;
117
+ }
118
+ }
119
+ const cleaned = text.trim();
120
+ if (cleaned.length < 20)
121
+ return [];
122
+ return splitPages(cleaned);
123
+ }
124
+ catch {
125
+ return [];
126
+ }
127
+ }
47
128
  /** pdf-parse inserts \f between pages. Fall back to paragraph-size chunks if not. */
48
129
  function splitPages(text) {
49
130
  if (text.includes('\f'))
@@ -75,32 +75,41 @@ export async function runIngestion(opts) {
75
75
  const samples = [];
76
76
  const pendingStructured = [];
77
77
  for (const iter of recordIterators) {
78
- for await (const record of iter) {
79
- recordsIn += 1;
80
- if (opts.limit && recordsIn > opts.limit)
81
- break;
82
- const flowPath = classifyRecord(record, intelligenceMode);
83
- if (flowPath === 'structured') {
84
- if (!schemaMapping && samples.length < SAMPLE_SIZE) {
85
- samples.push(record);
86
- pendingStructured.push(record);
87
- continue;
88
- }
89
- if (!schemaMapping && samples.length >= SAMPLE_SIZE) {
90
- schemaMapping = await inferSchema(samples, source.slug);
91
- await applyStructuredColumns(schemaMapping);
92
- for (const s of pendingStructured) {
93
- await processStructured(s, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
78
+ // Wrap so one unreadable file doesn't abort an otherwise-good folder
79
+ // ingest — record the adapter error and move on to the next iterator.
80
+ try {
81
+ for await (const record of iter) {
82
+ recordsIn += 1;
83
+ if (opts.limit && recordsIn > opts.limit)
84
+ break;
85
+ const flowPath = classifyRecord(record, intelligenceMode);
86
+ if (flowPath === 'structured') {
87
+ if (!schemaMapping && samples.length < SAMPLE_SIZE) {
88
+ samples.push(record);
89
+ pendingStructured.push(record);
90
+ continue;
91
+ }
92
+ if (!schemaMapping && samples.length >= SAMPLE_SIZE) {
93
+ schemaMapping = await inferSchema(samples, source.slug);
94
+ await applyStructuredColumns(schemaMapping);
95
+ for (const s of pendingStructured) {
96
+ await processStructured(s, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
97
+ }
98
+ pendingStructured.length = 0;
99
+ }
100
+ if (schemaMapping) {
101
+ await processStructured(record, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
94
102
  }
95
- pendingStructured.length = 0;
96
103
  }
97
- if (schemaMapping) {
98
- await processStructured(record, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
104
+ else {
105
+ await processFreeForm(record, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
99
106
  }
100
107
  }
101
- else {
102
- await processFreeForm(record, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
103
- }
108
+ }
109
+ catch (err) {
110
+ const msg = err instanceof Error ? err.message : String(err);
111
+ errors.push({ error: msg });
112
+ report('parsing', msg);
104
113
  }
105
114
  }
106
115
  // Flush structured records that never reached the schema-infer threshold
@@ -207,11 +216,22 @@ async function processStructured(record, mapping, source, opts, store, _report,
207
216
  }
208
217
  async function processFreeForm(record, source, opts, store, report, planned, _errors, _writtenSummaries, counters) {
209
218
  try {
210
- report('distilling');
211
219
  const chunks = chunkContent(record.content, 3000);
212
- const distillations = [];
213
- for (const chunk of chunks) {
214
- distillations.push(await distillChunk(chunk, record.metadata ?? {}));
220
+ report('distilling', chunks.length > 1 ? `chunk 0/${chunks.length}` : undefined);
221
+ // Parallelize per-chunk Haiku calls in small batches — one chunk at a
222
+ // time on a 30KB PDF adds up to 60–90s; 5-way concurrency cuts it to
223
+ // ~15s without pushing the API's rate limits.
224
+ const CONCURRENCY = 5;
225
+ const distillations = new Array(chunks.length);
226
+ let completed = 0;
227
+ for (let i = 0; i < chunks.length; i += CONCURRENCY) {
228
+ const batch = chunks.slice(i, i + CONCURRENCY);
229
+ const results = await Promise.all(batch.map((chunk) => distillChunk(chunk, record.metadata ?? {})));
230
+ for (let j = 0; j < results.length; j++)
231
+ distillations[i + j] = results[j];
232
+ completed += results.length;
233
+ if (chunks.length > 1)
234
+ report('distilling', `chunk ${completed}/${chunks.length}`);
215
235
  }
216
236
  const targetFolder = sanitizeFolder(source.targetFolder || `04-Ingest/${source.slug}`, source.slug);
217
237
  const partial = combineDistillations(record, distillations, source.slug, targetFolder);
@@ -2483,6 +2483,8 @@ export async function cmdDashboard(opts) {
2483
2483
  title: r.title, tags: r.tags, targetRelPath: r.targetRelPath,
2484
2484
  body: (r.body || '').slice(0, 800),
2485
2485
  })),
2486
+ recordsIn: result.recordsIn,
2487
+ errors: result.errors.slice(0, 10),
2486
2488
  });
2487
2489
  }
2488
2490
  catch (err) {
@@ -10210,14 +10212,25 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
10210
10212
 
10211
10213
  const manifest = manifestData.manifest;
10212
10214
  const preview = finalData.preview || [];
10215
+ const errorsList = finalData.errors || [];
10213
10216
  const manifestRows = Object.entries(manifest.formats || {})
10214
10217
  .map(([fmt, n]) => '<tr><td>' + escapeHtml(fmt) + '</td><td>' + n + '</td></tr>').join('');
10218
+ const warnBlock = errorsList.length
10219
+ ? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">' +
10220
+ '<div style="font-weight:600;margin-bottom:4px">' + errorsList.length + ' file(s) could not be ingested</div>' +
10221
+ errorsList.map((e) => '<div style="font-family:monospace;font-size:12px">• ' + escapeHtml(e.error) + '</div>').join('') +
10222
+ '</div>'
10223
+ : '';
10224
+ const emptyNote = (preview.length === 0 && !errorsList.length)
10225
+ ? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">No records extracted. The file may be empty or in an unsupported format.</div>'
10226
+ : '';
10215
10227
  manifestEl.innerHTML =
10216
10228
  '<div class="card" style="padding:12px"><div style="font-weight:600;margin-bottom:8px">Manifest</div>' +
10217
10229
  '<div style="color:var(--muted);font-size:13px;margin-bottom:8px">' +
10218
10230
  manifest.totalFiles + ' file(s), ' + brainHumanBytes(manifest.totalBytes) +
10219
10231
  ' · scanned in ' + Math.floor((Date.now() - progress.startedAt) / 1000) + 's</div>' +
10220
- '<table class="data-table"><thead><tr><th>Format</th><th>Count</th></tr></thead><tbody>' + manifestRows + '</tbody></table></div>';
10232
+ '<table class="data-table"><thead><tr><th>Format</th><th>Count</th></tr></thead><tbody>' + manifestRows + '</tbody></table>' +
10233
+ warnBlock + emptyNote + '</div>';
10221
10234
  if (preview.length) {
10222
10235
  const previewHtml = preview.slice(0, 10).map((p, i) =>
10223
10236
  '<div class="card" style="padding:12px;margin-bottom:8px">' +
@@ -10280,14 +10293,27 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
10280
10293
  return;
10281
10294
  }
10282
10295
  const elapsed = Math.floor((Date.now() - progress.startedAt) / 1000);
10296
+ const errList = finalData.errors || [];
10297
+ const headerColor = (finalData.recordsWritten > 0) ? '#4ade80' : '#e5a84a';
10298
+ const headerIcon = (finalData.recordsWritten > 0) ? '✓' : '⚠';
10299
+ const headerText = (finalData.recordsWritten > 0)
10300
+ ? 'Ingestion complete'
10301
+ : 'Ingestion finished, but nothing was written';
10302
+ const errBlock = errList.length
10303
+ ? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">' +
10304
+ '<div style="font-weight:600;margin-bottom:4px">' + errList.length + ' error(s)</div>' +
10305
+ errList.map((e) => '<div style="font-family:monospace;font-size:12px">• ' + escapeHtml(e.error) + '</div>').join('') +
10306
+ '</div>'
10307
+ : '';
10283
10308
  progEl.innerHTML =
10284
10309
  '<div class="card" style="padding:12px">' +
10285
- '<div style="font-weight:600;color:#4ade80">✓ Ingestion complete · ' + elapsed + 's</div>' +
10310
+ '<div style="font-weight:600;color:' + headerColor + '">' + headerIcon + ' ' + headerText + ' · ' + elapsed + 's</div>' +
10286
10311
  '<div>Records in: ' + finalData.recordsIn + '</div>' +
10287
10312
  '<div>Records written: ' + finalData.recordsWritten + '</div>' +
10288
10313
  '<div>Records skipped: ' + finalData.recordsSkipped + '</div>' +
10289
10314
  '<div>Records failed: ' + finalData.recordsFailed + '</div>' +
10290
10315
  (finalData.overviewNotePath ? '<div style="margin-top:8px">Overview note: <code>' + escapeHtml(finalData.overviewNotePath) + '</code></div>' : '') +
10316
+ errBlock +
10291
10317
  '</div>';
10292
10318
  }
10293
10319
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clementine-agent",
3
- "version": "1.0.69",
3
+ "version": "1.0.71",
4
4
  "description": "Clementine — Personal AI Assistant (TypeScript)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",