npm - clementine-agent - Versions diffs - 1.0.69 → 1.0.71 - Mend

clementine-agent 1.0.69 → 1.0.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/brain/adapters/pdf.d.ts +9 -3
package/dist/brain/adapters/pdf.js +88 -7
package/dist/brain/ingestion-pipeline.js +46 -26
package/dist/cli/dashboard.js +28 -2
package/package.json +1 -1

package/dist/brain/adapters/pdf.d.ts CHANGED Viewed

@@ -1,8 +1,14 @@
 /**
- * Clementine — PDF adapter (text layer only).
+ * Clementine — PDF adapter.
  *
- * Yields one RawRecord per PDF page. pdf-parse concatenates pages with
- * a form-feed separator (\f), so we split on that after extraction.
+ * Fast path: pdf-parse extracts the text layer. Yields one RawRecord per
+ * page (pdf-parse concatenates pages with \f).
+ *
+ * OCR fallback: for image-only / scanned PDFs, pdf-parse returns empty
+ * text. We then ask Claude Code to read the PDF itself — its built-in
+ * Read tool handles PDFs natively (including vision for scanned pages),
+ * and the call goes through the Agent SDK so it works with the user's
+ * OAuth session (no separate ANTHROPIC_API_KEY required).
  */
 import type { RawRecord } from '../../types.js';
 export declare function parsePdf(filePath: string): AsyncIterable<RawRecord>;

package/dist/brain/adapters/pdf.js CHANGED Viewed

@@ -1,30 +1,66 @@
 /**
- * Clementine — PDF adapter (text layer only).
+ * Clementine — PDF adapter.
  *
- * Yields one RawRecord per PDF page. pdf-parse concatenates pages with
- * a form-feed separator (\f), so we split on that after extraction.
+ * Fast path: pdf-parse extracts the text layer. Yields one RawRecord per
+ * page (pdf-parse concatenates pages with \f).
+ *
+ * OCR fallback: for image-only / scanned PDFs, pdf-parse returns empty
+ * text. We then ask Claude Code to read the PDF itself — its built-in
+ * Read tool handles PDFs natively (including vision for scanned pages),
+ * and the call goes through the Agent SDK so it works with the user's
+ * OAuth session (no separate ANTHROPIC_API_KEY required).
  */
 import { readFileSync } from 'node:fs';
 import path from 'node:path';
 import pdfParse from 'pdf-parse';
 import { contentHash } from './common.js';
+import { MODELS } from '../../config.js';
 export async function* parsePdf(filePath) {
     let buf;
     try {
         buf = readFileSync(filePath);
     }
-    catch {
-        return;
+    catch (err) {
+        throw new Error(`Failed to read PDF ${path.basename(filePath)}: ${err instanceof Error ? err.message : String(err)}`);
     }
     let result;
     try {
         result = await pdfParse(buf);
     }
-    catch {
-        return;
+    catch (err) {
+        const msg = err instanceof Error ? err.message : String(err);
+        const hint = /password/i.test(msg) ? ' (looks password-protected)' : '';
+        throw new Error(`Failed to parse PDF ${path.basename(filePath)}${hint}: ${msg}`);
     }
     const hint = path.basename(filePath, path.extname(filePath));
     const pages = splitPages(result.text);
+    const hasAnyText = pages.some((p) => p.trim().length > 0);
+    if (!hasAnyText) {
+        // Image-only / scanned PDF — fall back to Claude's native PDF reading.
+        const ocrPages = await ocrPdfViaClaude(filePath);
+        if (ocrPages.length === 0) {
+            throw new Error(`PDF ${path.basename(filePath)} has no extractable text and OCR returned nothing. The file may be corrupt or empty.`);
+        }
+        for (let i = 0; i < ocrPages.length; i++) {
+            const pageText = ocrPages[i].trim();
+            if (!pageText)
+                continue;
+            yield {
+                externalId: `pdf-ocr-${hint}-p${i + 1}-${contentHash(pageText)}`,
+                content: pageText,
+                rawPayload: pageText,
+                metadata: {
+                    adapter: 'pdf',
+                    extraction: 'claude-ocr',
+                    source_file: filePath,
+                    page: i + 1,
+                    total_pages: ocrPages.length,
+                    content_hash: contentHash(pageText),
+                },
+            };
+        }
+        return;
+    }
     for (let i = 0; i < pages.length; i++) {
         const pageText = pages[i].trim();
         if (!pageText)
@@ -35,6 +71,7 @@ export async function* parsePdf(filePath) {
             rawPayload: pageText,
             metadata: {
                 adapter: 'pdf',
+                extraction: 'text-layer',
                 source_file: filePath,
                 page: i + 1,
                 total_pages: result.numpages,
@@ -44,6 +81,50 @@ export async function* parsePdf(filePath) {
         };
     }
 }
+/**
+ * OCR fallback via the Claude Agent SDK. Asks Claude Code to Read the PDF
+ * and transcribe every page verbatim, separated by \f. Returns one string
+ * per page. Empty array on failure (caller decides how to handle).
+ */
+async function ocrPdfViaClaude(filePath) {
+    try {
+        const { query } = await import('@anthropic-ai/claude-agent-sdk');
+        const stream = query({
+            prompt: `Read the PDF at ${JSON.stringify(filePath)} using the Read tool. Transcribe every page's text verbatim — preserve the reading order, headings, lists, and paragraphs exactly as they appear. Separate pages with the form-feed character (\\f). Do NOT summarize, paraphrase, add commentary, or wrap in code fences. Output only the transcribed text.`,
+            options: {
+                model: MODELS.haiku,
+                maxTurns: 4, // Read tool call + response (a few turns of thinking is fine)
+                systemPrompt: 'You are a faithful OCR transcriber. Copy text exactly as written. When the PDF has images or scans, read the text from them using vision. Never invent content.',
+                // Claude Code's built-in Read tool handles PDFs (text + vision)
+                allowedTools: ['Read'],
+                permissionMode: 'bypassPermissions',
+                settingSources: [],
+            },
+        });
+        let text = '';
+        for await (const message of stream) {
+            if (message.type === 'assistant') {
+                const content = message
+                    .message?.content ?? [];
+                for (const block of content) {
+                    if (block.type === 'text' && typeof block.text === 'string') {
+                        text += block.text;
+                    }
+                }
+            }
+            else if (message.type === 'result') {
+                break;
+            }
+        }
+        const cleaned = text.trim();
+        if (cleaned.length < 20)
+            return [];
+        return splitPages(cleaned);
+    }
+    catch {
+        return [];
+    }
+}
 /** pdf-parse inserts \f between pages. Fall back to paragraph-size chunks if not. */
 function splitPages(text) {
     if (text.includes('\f'))

package/dist/brain/ingestion-pipeline.js CHANGED Viewed

@@ -75,32 +75,41 @@ export async function runIngestion(opts) {
         const samples = [];
         const pendingStructured = [];
         for (const iter of recordIterators) {
-            for await (const record of iter) {
-                recordsIn += 1;
-                if (opts.limit && recordsIn > opts.limit)
-                    break;
-                const flowPath = classifyRecord(record, intelligenceMode);
-                if (flowPath === 'structured') {
-                    if (!schemaMapping && samples.length < SAMPLE_SIZE) {
-                        samples.push(record);
-                        pendingStructured.push(record);
-                        continue;
-                    }
-                    if (!schemaMapping && samples.length >= SAMPLE_SIZE) {
-                        schemaMapping = await inferSchema(samples, source.slug);
-                        await applyStructuredColumns(schemaMapping);
-                        for (const s of pendingStructured) {
-                            await processStructured(s, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
+            // Wrap so one unreadable file doesn't abort an otherwise-good folder
+            // ingest — record the adapter error and move on to the next iterator.
+            try {
+                for await (const record of iter) {
+                    recordsIn += 1;
+                    if (opts.limit && recordsIn > opts.limit)
+                        break;
+                    const flowPath = classifyRecord(record, intelligenceMode);
+                    if (flowPath === 'structured') {
+                        if (!schemaMapping && samples.length < SAMPLE_SIZE) {
+                            samples.push(record);
+                            pendingStructured.push(record);
+                            continue;
+                        }
+                        if (!schemaMapping && samples.length >= SAMPLE_SIZE) {
+                            schemaMapping = await inferSchema(samples, source.slug);
+                            await applyStructuredColumns(schemaMapping);
+                            for (const s of pendingStructured) {
+                                await processStructured(s, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
+                            }
+                            pendingStructured.length = 0;
+                        }
+                        if (schemaMapping) {
+                            await processStructured(record, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
                         }
-                        pendingStructured.length = 0;
                     }
-                    if (schemaMapping) {
-                        await processStructured(record, schemaMapping, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
+                    else {
+                        await processFreeForm(record, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
                     }
                 }
-                else {
-                    await processFreeForm(record, source, opts, store, report, plannedRecords, errors, writtenSummaries, counters());
-                }
+            }
+            catch (err) {
+                const msg = err instanceof Error ? err.message : String(err);
+                errors.push({ error: msg });
+                report('parsing', msg);
             }
         }
         // Flush structured records that never reached the schema-infer threshold
@@ -207,11 +216,22 @@ async function processStructured(record, mapping, source, opts, store, _report,
 }
 async function processFreeForm(record, source, opts, store, report, planned, _errors, _writtenSummaries, counters) {
     try {
-        report('distilling');
         const chunks = chunkContent(record.content, 3000);
-        const distillations = [];
-        for (const chunk of chunks) {
-            distillations.push(await distillChunk(chunk, record.metadata ?? {}));
+        report('distilling', chunks.length > 1 ? `chunk 0/${chunks.length}` : undefined);
+        // Parallelize per-chunk Haiku calls in small batches — one chunk at a
+        // time on a 30KB PDF adds up to 60–90s; 5-way concurrency cuts it to
+        // ~15s without pushing the API's rate limits.
+        const CONCURRENCY = 5;
+        const distillations = new Array(chunks.length);
+        let completed = 0;
+        for (let i = 0; i < chunks.length; i += CONCURRENCY) {
+            const batch = chunks.slice(i, i + CONCURRENCY);
+            const results = await Promise.all(batch.map((chunk) => distillChunk(chunk, record.metadata ?? {})));
+            for (let j = 0; j < results.length; j++)
+                distillations[i + j] = results[j];
+            completed += results.length;
+            if (chunks.length > 1)
+                report('distilling', `chunk ${completed}/${chunks.length}`);
         }
         const targetFolder = sanitizeFolder(source.targetFolder || `04-Ingest/${source.slug}`, source.slug);
         const partial = combineDistillations(record, distillations, source.slug, targetFolder);

package/dist/cli/dashboard.js CHANGED Viewed

@@ -2483,6 +2483,8 @@ export async function cmdDashboard(opts) {
                     title: r.title, tags: r.tags, targetRelPath: r.targetRelPath,
                     body: (r.body || '').slice(0, 800),
                 })),
+                recordsIn: result.recordsIn,
+                errors: result.errors.slice(0, 10),
             });
         }
         catch (err) {
@@ -10210,14 +10212,25 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
           const manifest = manifestData.manifest;
           const preview = finalData.preview || [];
+          const errorsList = finalData.errors || [];
           const manifestRows = Object.entries(manifest.formats || {})
             .map(([fmt, n]) => '<tr><td>' + escapeHtml(fmt) + '</td><td>' + n + '</td></tr>').join('');
+          const warnBlock = errorsList.length
+            ? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">' +
+              '<div style="font-weight:600;margin-bottom:4px">' + errorsList.length + ' file(s) could not be ingested</div>' +
+              errorsList.map((e) => '<div style="font-family:monospace;font-size:12px">• ' + escapeHtml(e.error) + '</div>').join('') +
+              '</div>'
+            : '';
+          const emptyNote = (preview.length === 0 && !errorsList.length)
+            ? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">No records extracted. The file may be empty or in an unsupported format.</div>'
+            : '';
           manifestEl.innerHTML =
             '<div class="card" style="padding:12px"><div style="font-weight:600;margin-bottom:8px">Manifest</div>' +
             '<div style="color:var(--muted);font-size:13px;margin-bottom:8px">' +
             manifest.totalFiles + ' file(s), ' + brainHumanBytes(manifest.totalBytes) +
             ' · scanned in ' + Math.floor((Date.now() - progress.startedAt) / 1000) + 's</div>' +
-            '<table class="data-table"><thead><tr><th>Format</th><th>Count</th></tr></thead><tbody>' + manifestRows + '</tbody></table></div>';
+            '<table class="data-table"><thead><tr><th>Format</th><th>Count</th></tr></thead><tbody>' + manifestRows + '</tbody></table>' +
+            warnBlock + emptyNote + '</div>';
           if (preview.length) {
             const previewHtml = preview.slice(0, 10).map((p, i) =>
               '<div class="card" style="padding:12px;margin-bottom:8px">' +
@@ -10280,14 +10293,27 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
             return;
           }
           const elapsed = Math.floor((Date.now() - progress.startedAt) / 1000);
+          const errList = finalData.errors || [];
+          const headerColor = (finalData.recordsWritten > 0) ? '#4ade80' : '#e5a84a';
+          const headerIcon = (finalData.recordsWritten > 0) ? '✓' : '⚠';
+          const headerText = (finalData.recordsWritten > 0)
+            ? 'Ingestion complete'
+            : 'Ingestion finished, but nothing was written';
+          const errBlock = errList.length
+            ? '<div style="margin-top:10px;padding:10px;background:#fff3cd;border:1px solid #f0c36d;border-radius:6px;color:#8a5a00;font-size:13px">' +
+              '<div style="font-weight:600;margin-bottom:4px">' + errList.length + ' error(s)</div>' +
+              errList.map((e) => '<div style="font-family:monospace;font-size:12px">• ' + escapeHtml(e.error) + '</div>').join('') +
+              '</div>'
+            : '';
           progEl.innerHTML =
             '<div class="card" style="padding:12px">' +
-            '<div style="font-weight:600;color:#4ade80">✓ Ingestion complete · ' + elapsed + 's</div>' +
+            '<div style="font-weight:600;color:' + headerColor + '">' + headerIcon + ' ' + headerText + ' · ' + elapsed + 's</div>' +
             '<div>Records in: ' + finalData.recordsIn + '</div>' +
             '<div>Records written: ' + finalData.recordsWritten + '</div>' +
             '<div>Records skipped: ' + finalData.recordsSkipped + '</div>' +
             '<div>Records failed: ' + finalData.recordsFailed + '</div>' +
             (finalData.overviewNotePath ? '<div style="margin-top:8px">Overview note: <code>' + escapeHtml(finalData.overviewNotePath) + '</code></div>' : '') +
+            errBlock +
             '</div>';
         }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clementine-agent",
-  "version": "1.0.69",
+  "version": "1.0.71",
   "description": "Clementine — Personal AI Assistant (TypeScript)",
   "type": "module",
   "main": "dist/index.js",