clementine-agent 1.0.70 → 1.0.71

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,8 +1,14 @@
1
1
  /**
2
- * Clementine — PDF adapter (text layer only).
2
+ * Clementine — PDF adapter.
3
3
  *
4
- * Yields one RawRecord per PDF page. pdf-parse concatenates pages with
5
- * a form-feed separator (\f), so we split on that after extraction.
4
+ * Fast path: pdf-parse extracts the text layer. Yields one RawRecord per
5
+ * page (pdf-parse concatenates pages with \f).
6
+ *
7
+ * OCR fallback: for image-only / scanned PDFs, pdf-parse returns empty
8
+ * text. We then ask Claude Code to read the PDF itself — its built-in
9
+ * Read tool handles PDFs natively (including vision for scanned pages),
10
+ * and the call goes through the Agent SDK so it works with the user's
11
+ * OAuth session (no separate ANTHROPIC_API_KEY required).
6
12
  */
7
13
  import type { RawRecord } from '../../types.js';
8
14
  export declare function parsePdf(filePath: string): AsyncIterable<RawRecord>;
@@ -1,13 +1,20 @@
1
1
  /**
2
- * Clementine — PDF adapter (text layer only).
2
+ * Clementine — PDF adapter.
3
3
  *
4
- * Yields one RawRecord per PDF page. pdf-parse concatenates pages with
5
- * a form-feed separator (\f), so we split on that after extraction.
4
+ * Fast path: pdf-parse extracts the text layer. Yields one RawRecord per
5
+ * page (pdf-parse concatenates pages with \f).
6
+ *
7
+ * OCR fallback: for image-only / scanned PDFs, pdf-parse returns empty
8
+ * text. We then ask Claude Code to read the PDF itself — its built-in
9
+ * Read tool handles PDFs natively (including vision for scanned pages),
10
+ * and the call goes through the Agent SDK so it works with the user's
11
+ * OAuth session (no separate ANTHROPIC_API_KEY required).
6
12
  */
7
13
  import { readFileSync } from 'node:fs';
8
14
  import path from 'node:path';
9
15
  import pdfParse from 'pdf-parse';
10
16
  import { contentHash } from './common.js';
17
+ import { MODELS } from '../../config.js';
11
18
  export async function* parsePdf(filePath) {
12
19
  let buf;
13
20
  try {
@@ -29,7 +36,30 @@ export async function* parsePdf(filePath) {
29
36
  const pages = splitPages(result.text);
30
37
  const hasAnyText = pages.some((p) => p.trim().length > 0);
31
38
  if (!hasAnyText) {
32
- throw new Error(`PDF ${path.basename(filePath)} has no extractable text — likely image-only (OCR is not supported). Re-export with a text layer or transcribe it first.`);
39
+ // Image-only / scanned PDF: fall back to Claude's native PDF reading.
40
+ const ocrPages = await ocrPdfViaClaude(filePath);
41
+ if (ocrPages.length === 0) {
42
+ throw new Error(`PDF ${path.basename(filePath)} has no extractable text and OCR returned nothing. The file may be corrupt or empty.`);
43
+ }
44
+ for (let i = 0; i < ocrPages.length; i++) {
45
+ const pageText = ocrPages[i].trim();
46
+ if (!pageText)
47
+ continue;
48
+ yield {
49
+ externalId: `pdf-ocr-${hint}-p${i + 1}-${contentHash(pageText)}`,
50
+ content: pageText,
51
+ rawPayload: pageText,
52
+ metadata: {
53
+ adapter: 'pdf',
54
+ extraction: 'claude-ocr',
55
+ source_file: filePath,
56
+ page: i + 1,
57
+ total_pages: ocrPages.length,
58
+ content_hash: contentHash(pageText),
59
+ },
60
+ };
61
+ }
62
+ return;
33
63
  }
34
64
  for (let i = 0; i < pages.length; i++) {
35
65
  const pageText = pages[i].trim();
@@ -41,6 +71,7 @@ export async function* parsePdf(filePath) {
41
71
  rawPayload: pageText,
42
72
  metadata: {
43
73
  adapter: 'pdf',
74
+ extraction: 'text-layer',
44
75
  source_file: filePath,
45
76
  page: i + 1,
46
77
  total_pages: result.numpages,
@@ -50,6 +81,50 @@ export async function* parsePdf(filePath) {
50
81
  };
51
82
  }
52
83
  }
84
+ /**
85
+ * OCR fallback via the Claude Agent SDK. Asks Claude Code to Read the PDF
86
+ * and transcribe every page verbatim, separated by \f. Returns one string
87
+ * per page. Returns an empty array on failure (the caller decides how to handle it).
88
+ */
89
+ async function ocrPdfViaClaude(filePath) {
90
+ try {
91
+ const { query } = await import('@anthropic-ai/claude-agent-sdk');
92
+ const stream = query({
93
+ prompt: `Read the PDF at ${JSON.stringify(filePath)} using the Read tool. Transcribe every page's text verbatim — preserve the reading order, headings, lists, and paragraphs exactly as they appear. Separate pages with the form-feed character (\\f). Do NOT summarize, paraphrase, add commentary, or wrap in code fences. Output only the transcribed text.`,
94
+ options: {
95
+ model: MODELS.haiku,
96
+ maxTurns: 4, // Read tool call + response (a few turns of thinking is fine)
97
+ systemPrompt: 'You are a faithful OCR transcriber. Copy text exactly as written. When the PDF has images or scans, read the text from them using vision. Never invent content.',
98
+ // Claude Code's built-in Read tool handles PDFs (text + vision)
99
+ allowedTools: ['Read'],
100
+ permissionMode: 'bypassPermissions',
101
+ settingSources: [],
102
+ },
103
+ });
104
+ let text = '';
105
+ for await (const message of stream) {
106
+ if (message.type === 'assistant') {
107
+ const content = message
108
+ .message?.content ?? [];
109
+ for (const block of content) {
110
+ if (block.type === 'text' && typeof block.text === 'string') {
111
+ text += block.text;
112
+ }
113
+ }
114
+ }
115
+ else if (message.type === 'result') {
116
+ break;
117
+ }
118
+ }
119
+ const cleaned = text.trim();
120
+ if (cleaned.length < 20)
121
+ return [];
122
+ return splitPages(cleaned);
123
+ }
124
+ catch {
125
+ return [];
126
+ }
127
+ }
53
128
  /** pdf-parse inserts \f between pages. Fall back to paragraph-size chunks if not. */
54
129
  function splitPages(text) {
55
130
  if (text.includes('\f'))
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clementine-agent",
3
- "version": "1.0.70",
3
+ "version": "1.0.71",
4
4
  "description": "Clementine — Personal AI Assistant (TypeScript)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",