@echofiles/echo-pdf 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -0
- package/bin/echo-pdf.js +55 -1
- package/dist/local/formulas.d.ts +2 -0
- package/dist/local/formulas.js +71 -0
- package/dist/local/index.d.ts +4 -1
- package/dist/local/index.js +3 -0
- package/dist/local/semantic.js +104 -0
- package/dist/local/tables.d.ts +2 -0
- package/dist/local/tables.js +71 -0
- package/dist/local/types.d.ts +76 -0
- package/dist/local/understanding.d.ts +2 -0
- package/dist/local/understanding.js +136 -0
- package/dist/pdf-types.d.ts +1 -0
- package/echo-pdf.config.json +2 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -85,6 +85,9 @@ What these commands map to:
|
|
|
85
85
|
- `semantic` -> `get_semantic_document_structure`
|
|
86
86
|
- `page` -> `get_page_content`
|
|
87
87
|
- `render` -> `get_page_render`
|
|
88
|
+
- `tables` -> `get_page_tables_latex`
|
|
89
|
+
- `formulas` -> `get_page_formulas_latex`
|
|
90
|
+
- `understanding` -> `get_page_understanding`
|
|
88
91
|
|
|
89
92
|
By default, `echo-pdf` writes reusable artifacts into a local workspace:
|
|
90
93
|
|
|
@@ -101,6 +104,12 @@ By default, `echo-pdf` writes reusable artifacts into a local workspace:
|
|
|
101
104
|
renders/
|
|
102
105
|
0001.scale-2.json
|
|
103
106
|
0001.scale-2.png
|
|
107
|
+
tables/
|
|
108
|
+
0001.scale-2.provider-openai.model-gpt-4.1-mini.prompt-<hash>.json
|
|
109
|
+
formulas/
|
|
110
|
+
0001.scale-2.provider-openai.model-gpt-4.1-mini.prompt-<hash>.json
|
|
111
|
+
understanding/
|
|
112
|
+
0001.scale-2.provider-openai.model-gpt-4.1-mini.prompt-<hash>.json
|
|
104
113
|
```
|
|
105
114
|
|
|
106
115
|
These artifacts are meant to be inspected, cached, and reused by downstream local tools.
|
|
@@ -115,6 +124,9 @@ import {
|
|
|
115
124
|
get_semantic_document_structure,
|
|
116
125
|
get_page_content,
|
|
117
126
|
get_page_render,
|
|
127
|
+
get_page_tables_latex,
|
|
128
|
+
get_page_formulas_latex,
|
|
129
|
+
get_page_understanding,
|
|
118
130
|
} from "@echofiles/echo-pdf/local"
|
|
119
131
|
|
|
120
132
|
const document = await get_document({ pdfPath: "./sample.pdf" })
|
|
@@ -126,6 +138,9 @@ const semantic = await get_semantic_document_structure({
|
|
|
126
138
|
})
|
|
127
139
|
const page1 = await get_page_content({ pdfPath: "./sample.pdf", pageNumber: 1 })
|
|
128
140
|
const render1 = await get_page_render({ pdfPath: "./sample.pdf", pageNumber: 1, scale: 2 })
|
|
141
|
+
const tables = await get_page_tables_latex({ pdfPath: "./sample.pdf", pageNumber: 1, provider: "openai", model: "gpt-4.1-mini" })
|
|
142
|
+
const formulas = await get_page_formulas_latex({ pdfPath: "./sample.pdf", pageNumber: 1, provider: "openai", model: "gpt-4.1-mini" })
|
|
143
|
+
const understanding = await get_page_understanding({ pdfPath: "./sample.pdf", pageNumber: 1, provider: "openai", model: "gpt-4.1-mini" })
|
|
129
144
|
```
|
|
130
145
|
|
|
131
146
|
Notes:
|
package/bin/echo-pdf.js
CHANGED
|
@@ -215,7 +215,7 @@ const loadLocalDocumentApi = async () => {
|
|
|
215
215
|
return import(LOCAL_DOCUMENT_DIST_ENTRY.href)
|
|
216
216
|
}
|
|
217
217
|
|
|
218
|
-
const LOCAL_PRIMITIVE_COMMANDS = ["document", "structure", "semantic", "page", "render"]
|
|
218
|
+
const LOCAL_PRIMITIVE_COMMANDS = ["document", "structure", "semantic", "page", "render", "tables", "formulas", "understanding"]
|
|
219
219
|
const REMOVED_DOCUMENT_ALIAS_TO_PRIMITIVE = {
|
|
220
220
|
index: "document",
|
|
221
221
|
get: "document",
|
|
@@ -302,6 +302,57 @@ const runLocalPrimitiveCommand = async (command, subcommand, rest, flags) => {
|
|
|
302
302
|
return
|
|
303
303
|
}
|
|
304
304
|
|
|
305
|
+
if (primitive === "tables") {
|
|
306
|
+
const semanticContext = resolveLocalSemanticContext(flags)
|
|
307
|
+
const local = await loadLocalDocumentApi()
|
|
308
|
+
print(await local.get_page_tables_latex({
|
|
309
|
+
pdfPath,
|
|
310
|
+
workspaceDir,
|
|
311
|
+
forceRefresh,
|
|
312
|
+
pageNumber,
|
|
313
|
+
renderScale,
|
|
314
|
+
provider: semanticContext.provider,
|
|
315
|
+
model: semanticContext.model,
|
|
316
|
+
providerApiKeys: semanticContext.providerApiKeys,
|
|
317
|
+
prompt: typeof flags.prompt === "string" ? flags.prompt : undefined,
|
|
318
|
+
}))
|
|
319
|
+
return
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
if (primitive === "formulas") {
|
|
323
|
+
const semanticContext = resolveLocalSemanticContext(flags)
|
|
324
|
+
const local = await loadLocalDocumentApi()
|
|
325
|
+
print(await local.get_page_formulas_latex({
|
|
326
|
+
pdfPath,
|
|
327
|
+
workspaceDir,
|
|
328
|
+
forceRefresh,
|
|
329
|
+
pageNumber,
|
|
330
|
+
renderScale,
|
|
331
|
+
provider: semanticContext.provider,
|
|
332
|
+
model: semanticContext.model,
|
|
333
|
+
providerApiKeys: semanticContext.providerApiKeys,
|
|
334
|
+
prompt: typeof flags.prompt === "string" ? flags.prompt : undefined,
|
|
335
|
+
}))
|
|
336
|
+
return
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
if (primitive === "understanding") {
|
|
340
|
+
const semanticContext = resolveLocalSemanticContext(flags)
|
|
341
|
+
const local = await loadLocalDocumentApi()
|
|
342
|
+
print(await local.get_page_understanding({
|
|
343
|
+
pdfPath,
|
|
344
|
+
workspaceDir,
|
|
345
|
+
forceRefresh,
|
|
346
|
+
pageNumber,
|
|
347
|
+
renderScale,
|
|
348
|
+
provider: semanticContext.provider,
|
|
349
|
+
model: semanticContext.model,
|
|
350
|
+
providerApiKeys: semanticContext.providerApiKeys,
|
|
351
|
+
prompt: typeof flags.prompt === "string" ? flags.prompt : undefined,
|
|
352
|
+
}))
|
|
353
|
+
return
|
|
354
|
+
}
|
|
355
|
+
|
|
305
356
|
throw new Error(`Unsupported local primitive command: ${primitive}`)
|
|
306
357
|
}
|
|
307
358
|
|
|
@@ -313,6 +364,9 @@ const usage = () => {
|
|
|
313
364
|
process.stdout.write(` semantic <file.pdf> [--provider alias] [--model model] [--profile name] [--workspace DIR] [--force-refresh]\n`)
|
|
314
365
|
process.stdout.write(` page <file.pdf> --page <N> [--workspace DIR] [--force-refresh]\n`)
|
|
315
366
|
process.stdout.write(` render <file.pdf> --page <N> [--scale N] [--workspace DIR] [--force-refresh]\n`)
|
|
367
|
+
process.stdout.write(` tables <file.pdf> --page <N> [--provider alias] [--model model] [--scale N] [--prompt text] [--workspace DIR] [--force-refresh]\n`)
|
|
368
|
+
process.stdout.write(` formulas <file.pdf> --page <N> [--provider alias] [--model model] [--scale N] [--prompt text] [--workspace DIR] [--force-refresh]\n`)
|
|
369
|
+
process.stdout.write(` understanding <file.pdf> --page <N> [--provider alias] [--model model] [--scale N] [--prompt text] [--workspace DIR] [--force-refresh]\n`)
|
|
316
370
|
process.stdout.write(`\nLocal config commands:\n`)
|
|
317
371
|
process.stdout.write(` provider set --provider <${getProviderSetNames().join("|")}> --api-key <KEY> [--profile name]\n`)
|
|
318
372
|
process.stdout.write(` provider use --provider <${getProviderAliases().join("|")}> [--profile name]\n`)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/// <reference path="../node/compat.d.ts" />
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { toDataUrl } from "../file-utils.js";
|
|
5
|
+
import { visionRecognize } from "../provider-client.js";
|
|
6
|
+
import { ensureRenderArtifact, indexDocumentInternal } from "./document.js";
|
|
7
|
+
import { buildStructuredArtifactPath, ensurePageNumber, fileExists, matchesSourceSnapshot, normalizeFormulaItems, pageLabel, parseJsonObject, readJson, resolveAgentSelection, resolveConfig, resolveEnv, resolveRenderScale, writeJson, } from "./shared.js";
|
|
8
|
+
const DEFAULT_FORMULA_PROMPT = "Detect all displayed mathematical formulas from this PDF page image. " +
|
|
9
|
+
"Return JSON only. Schema: " +
|
|
10
|
+
'{ "formulas": [{ "latexMath": "LaTeX math expression", "label": "optional equation label", "evidenceText": "optional" }] }. ' +
|
|
11
|
+
"Use LaTeX math notation. Do not include inline prose math or trivial single-symbol expressions. " +
|
|
12
|
+
"If no displayed formulas are found, return {\"formulas\":[]}.";
|
|
13
|
+
export const get_page_formulas_latex = async (request) => {
|
|
14
|
+
const env = resolveEnv(request.env);
|
|
15
|
+
const config = resolveConfig(request.config, env);
|
|
16
|
+
const { record } = await indexDocumentInternal(request);
|
|
17
|
+
ensurePageNumber(record.pageCount, request.pageNumber);
|
|
18
|
+
const { provider, model } = resolveAgentSelection(config, request);
|
|
19
|
+
const renderScale = resolveRenderScale(config, request.renderScale);
|
|
20
|
+
const prompt = typeof request.prompt === "string" && request.prompt.trim().length > 0
|
|
21
|
+
? request.prompt.trim()
|
|
22
|
+
: DEFAULT_FORMULA_PROMPT;
|
|
23
|
+
const formulasDir = path.join(record.artifactPaths.documentDir, "formulas");
|
|
24
|
+
const artifactPath = buildStructuredArtifactPath(formulasDir, request.pageNumber, renderScale, provider, model, prompt);
|
|
25
|
+
if (!request.forceRefresh && await fileExists(artifactPath)) {
|
|
26
|
+
const cached = await readJson(artifactPath);
|
|
27
|
+
if (matchesSourceSnapshot(cached, record)) {
|
|
28
|
+
return { ...cached, cacheStatus: "reused" };
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
const renderArtifact = await ensureRenderArtifact({
|
|
32
|
+
pdfPath: request.pdfPath,
|
|
33
|
+
workspaceDir: request.workspaceDir,
|
|
34
|
+
forceRefresh: request.forceRefresh,
|
|
35
|
+
config,
|
|
36
|
+
pageNumber: request.pageNumber,
|
|
37
|
+
renderScale: request.renderScale,
|
|
38
|
+
});
|
|
39
|
+
const imageBytes = new Uint8Array(await readFile(renderArtifact.imagePath));
|
|
40
|
+
const imageDataUrl = toDataUrl(imageBytes, renderArtifact.mimeType);
|
|
41
|
+
const response = await visionRecognize({
|
|
42
|
+
config,
|
|
43
|
+
env,
|
|
44
|
+
providerAlias: provider,
|
|
45
|
+
model,
|
|
46
|
+
prompt,
|
|
47
|
+
imageDataUrl,
|
|
48
|
+
runtimeApiKeys: request.providerApiKeys,
|
|
49
|
+
});
|
|
50
|
+
const parsed = parseJsonObject(response);
|
|
51
|
+
const formulas = normalizeFormulaItems(parsed?.formulas);
|
|
52
|
+
const pageArtifactPath = path.join(record.artifactPaths.pagesDir, `${pageLabel(request.pageNumber)}.json`);
|
|
53
|
+
const artifact = {
|
|
54
|
+
documentId: record.documentId,
|
|
55
|
+
pageNumber: request.pageNumber,
|
|
56
|
+
renderScale,
|
|
57
|
+
sourceSizeBytes: record.sizeBytes,
|
|
58
|
+
sourceMtimeMs: record.mtimeMs,
|
|
59
|
+
provider,
|
|
60
|
+
model,
|
|
61
|
+
prompt,
|
|
62
|
+
imagePath: renderArtifact.imagePath,
|
|
63
|
+
pageArtifactPath,
|
|
64
|
+
renderArtifactPath: renderArtifact.artifactPath,
|
|
65
|
+
artifactPath,
|
|
66
|
+
generatedAt: new Date().toISOString(),
|
|
67
|
+
formulas,
|
|
68
|
+
};
|
|
69
|
+
await writeJson(artifactPath, artifact);
|
|
70
|
+
return { ...artifact, cacheStatus: "fresh" };
|
|
71
|
+
};
|
package/dist/local/index.d.ts
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
1
|
-
export type { LocalDocumentArtifactPaths, LocalDocumentMetadata, LocalDocumentRequest, LocalDocumentStructure, LocalDocumentStructureNode, LocalPageContent, LocalPageContentRequest, LocalPageRenderArtifact, LocalPageRenderRequest, LocalSemanticDocumentRequest, LocalSemanticDocumentStructure, LocalSemanticStructureNode, } from "./types.js";
|
|
1
|
+
export type { LocalDocumentArtifactPaths, LocalDocumentMetadata, LocalDocumentRequest, LocalDocumentStructure, LocalDocumentStructureNode, LocalFigureArtifactItem, LocalFormulaArtifactItem, LocalPageContent, LocalPageContentRequest, LocalPageFormulasArtifact, LocalPageFormulasRequest, LocalPageRenderArtifact, LocalPageRenderRequest, LocalPageTablesArtifact, LocalPageTablesRequest, LocalPageUnderstandingArtifact, LocalPageUnderstandingRequest, LocalSemanticDocumentRequest, LocalSemanticDocumentStructure, LocalSemanticStructureNode, LocalTableArtifactItem, MergedFigureItem, MergedFormulaItem, MergedTableItem, } from "./types.js";
|
|
2
2
|
export { get_document, get_document_structure, get_page_content, get_page_render } from "./document.js";
|
|
3
|
+
export { get_page_formulas_latex } from "./formulas.js";
|
|
3
4
|
export { get_semantic_document_structure } from "./semantic.js";
|
|
5
|
+
export { get_page_tables_latex } from "./tables.js";
|
|
6
|
+
export { get_page_understanding } from "./understanding.js";
|
package/dist/local/index.js
CHANGED
|
@@ -1,2 +1,5 @@
|
|
|
1
1
|
export { get_document, get_document_structure, get_page_content, get_page_render } from "./document.js";
|
|
2
|
+
export { get_page_formulas_latex } from "./formulas.js";
|
|
2
3
|
export { get_semantic_document_structure } from "./semantic.js";
|
|
4
|
+
export { get_page_tables_latex } from "./tables.js";
|
|
5
|
+
export { get_page_understanding } from "./understanding.js";
|
package/dist/local/semantic.js
CHANGED
|
@@ -6,6 +6,7 @@ import { toDataUrl } from "../file-utils.js";
|
|
|
6
6
|
import { generateText, visionRecognize } from "../provider-client.js";
|
|
7
7
|
import { ensureRenderArtifact, indexDocumentInternal } from "./document.js";
|
|
8
8
|
import { fileExists, matchesSourceSnapshot, matchesStrategyKey, pageLabel, parseJsonObject, readJson, resolveConfig, resolveEnv, writeJson, } from "./shared.js";
|
|
9
|
+
import { get_page_understanding } from "./understanding.js";
|
|
9
10
|
const resolveSemanticExtractionBudget = (input) => ({
|
|
10
11
|
pageSelection: "all",
|
|
11
12
|
chunkMaxChars: typeof input?.chunkMaxChars === "number" && Number.isFinite(input.chunkMaxChars) && input.chunkMaxChars > 400
|
|
@@ -154,6 +155,88 @@ const extractSemanticCandidatesFromRenderedPage = async (input) => {
|
|
|
154
155
|
.map((candidate) => normalizeSemanticAgentCandidate(candidate, input.page.pageNumber))
|
|
155
156
|
.filter((candidate) => candidate !== null);
|
|
156
157
|
};
|
|
158
|
+
const mergeCrossPageTables = (understandings) => {
|
|
159
|
+
const merged = [];
|
|
160
|
+
let nextId = 1;
|
|
161
|
+
for (const pu of understandings) {
|
|
162
|
+
for (const table of pu.tables) {
|
|
163
|
+
const prev = merged[merged.length - 1];
|
|
164
|
+
if (prev?.crossPageHint && table.truncatedTop) {
|
|
165
|
+
merged[merged.length - 1] = {
|
|
166
|
+
...prev,
|
|
167
|
+
latexTabular: prev.latexTabular + "\n" + table.latexTabular,
|
|
168
|
+
endPage: pu.pageNumber,
|
|
169
|
+
};
|
|
170
|
+
}
|
|
171
|
+
else {
|
|
172
|
+
merged.push({
|
|
173
|
+
id: `merged-table-${nextId++}`,
|
|
174
|
+
latexTabular: table.latexTabular,
|
|
175
|
+
caption: table.caption,
|
|
176
|
+
startPage: pu.pageNumber,
|
|
177
|
+
endPage: pu.pageNumber,
|
|
178
|
+
crossPageHint: table.truncatedBottom === true ? true : undefined,
|
|
179
|
+
});
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
return merged;
|
|
184
|
+
};
|
|
185
|
+
const mergeCrossPageFormulas = (understandings) => {
|
|
186
|
+
const merged = [];
|
|
187
|
+
let nextId = 1;
|
|
188
|
+
for (const pu of understandings) {
|
|
189
|
+
for (const formula of pu.formulas) {
|
|
190
|
+
const prev = merged[merged.length - 1];
|
|
191
|
+
if (prev?.crossPageHint && formula.truncatedTop) {
|
|
192
|
+
merged[merged.length - 1] = {
|
|
193
|
+
...prev,
|
|
194
|
+
latexMath: prev.latexMath + " " + formula.latexMath,
|
|
195
|
+
endPage: pu.pageNumber,
|
|
196
|
+
};
|
|
197
|
+
}
|
|
198
|
+
else {
|
|
199
|
+
merged.push({
|
|
200
|
+
id: `merged-formula-${nextId++}`,
|
|
201
|
+
latexMath: formula.latexMath,
|
|
202
|
+
label: formula.label,
|
|
203
|
+
startPage: pu.pageNumber,
|
|
204
|
+
endPage: pu.pageNumber,
|
|
205
|
+
crossPageHint: formula.truncatedBottom === true ? true : undefined,
|
|
206
|
+
});
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
return merged;
|
|
211
|
+
};
|
|
212
|
+
const mergeCrossPageFigures = (understandings) => {
|
|
213
|
+
const merged = [];
|
|
214
|
+
let nextId = 1;
|
|
215
|
+
for (const pu of understandings) {
|
|
216
|
+
for (const figure of pu.figures) {
|
|
217
|
+
const prev = merged[merged.length - 1];
|
|
218
|
+
if (prev?.crossPageHint && figure.truncatedTop) {
|
|
219
|
+
merged[merged.length - 1] = {
|
|
220
|
+
...prev,
|
|
221
|
+
description: [prev.description, figure.description].filter(Boolean).join(" "),
|
|
222
|
+
endPage: pu.pageNumber,
|
|
223
|
+
};
|
|
224
|
+
}
|
|
225
|
+
else {
|
|
226
|
+
merged.push({
|
|
227
|
+
id: `merged-figure-${nextId++}`,
|
|
228
|
+
figureType: figure.figureType,
|
|
229
|
+
caption: figure.caption,
|
|
230
|
+
description: figure.description,
|
|
231
|
+
startPage: pu.pageNumber,
|
|
232
|
+
endPage: pu.pageNumber,
|
|
233
|
+
crossPageHint: figure.truncatedBottom === true ? true : undefined,
|
|
234
|
+
});
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
return merged;
|
|
239
|
+
};
|
|
157
240
|
const ensureSemanticStructureArtifact = async (request) => {
|
|
158
241
|
const env = resolveEnv(request.env);
|
|
159
242
|
const config = resolveConfig(request.config, env);
|
|
@@ -196,6 +279,21 @@ const ensureSemanticStructureArtifact = async (request) => {
|
|
|
196
279
|
}
|
|
197
280
|
}
|
|
198
281
|
}
|
|
282
|
+
const understandings = [];
|
|
283
|
+
for (const page of pages) {
|
|
284
|
+
const pu = await get_page_understanding({
|
|
285
|
+
pdfPath: request.pdfPath,
|
|
286
|
+
workspaceDir: request.workspaceDir,
|
|
287
|
+
forceRefresh: request.forceRefresh,
|
|
288
|
+
config,
|
|
289
|
+
pageNumber: page.pageNumber,
|
|
290
|
+
provider,
|
|
291
|
+
model,
|
|
292
|
+
env,
|
|
293
|
+
providerApiKeys: request.providerApiKeys,
|
|
294
|
+
});
|
|
295
|
+
understandings.push(pu);
|
|
296
|
+
}
|
|
199
297
|
const aggregated = await generateText({
|
|
200
298
|
config,
|
|
201
299
|
env,
|
|
@@ -206,6 +304,9 @@ const ensureSemanticStructureArtifact = async (request) => {
|
|
|
206
304
|
});
|
|
207
305
|
const parsed = parseJsonObject(aggregated);
|
|
208
306
|
const sections = toSemanticTree(parsed?.sections, pageArtifactPaths);
|
|
307
|
+
const mergedTables = mergeCrossPageTables(understandings);
|
|
308
|
+
const mergedFormulas = mergeCrossPageFormulas(understandings);
|
|
309
|
+
const mergedFigures = mergeCrossPageFigures(understandings);
|
|
209
310
|
const artifact = {
|
|
210
311
|
documentId: record.documentId,
|
|
211
312
|
generatedAt: new Date().toISOString(),
|
|
@@ -221,6 +322,9 @@ const ensureSemanticStructureArtifact = async (request) => {
|
|
|
221
322
|
title: record.filename,
|
|
222
323
|
children: sections,
|
|
223
324
|
},
|
|
325
|
+
...(mergedTables.length > 0 ? { tables: mergedTables } : {}),
|
|
326
|
+
...(mergedFormulas.length > 0 ? { formulas: mergedFormulas } : {}),
|
|
327
|
+
...(mergedFigures.length > 0 ? { figures: mergedFigures } : {}),
|
|
224
328
|
};
|
|
225
329
|
await writeJson(artifactPath, artifact);
|
|
226
330
|
return {
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/// <reference path="../node/compat.d.ts" />
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { toDataUrl } from "../file-utils.js";
|
|
5
|
+
import { visionRecognize } from "../provider-client.js";
|
|
6
|
+
import { ensureRenderArtifact, indexDocumentInternal } from "./document.js";
|
|
7
|
+
import { buildStructuredArtifactPath, ensurePageNumber, fileExists, matchesSourceSnapshot, normalizeTableItems, pageLabel, parseJsonObject, readJson, resolveAgentSelection, resolveConfig, resolveEnv, resolveRenderScale, writeJson, } from "./shared.js";
|
|
8
|
+
const DEFAULT_TABLE_PROMPT = "Detect all tabular structures from this PDF page image. " +
|
|
9
|
+
"Return JSON only. Schema: " +
|
|
10
|
+
'{ "tables": [{ "latexTabular": "\\\\begin{tabular}...\\\\end{tabular}", "caption": "optional", "evidenceText": "optional" }] }. ' +
|
|
11
|
+
"Each table must be a complete LaTeX tabular environment. " +
|
|
12
|
+
"If no tables are found, return {\"tables\":[]}.";
|
|
13
|
+
export const get_page_tables_latex = async (request) => {
|
|
14
|
+
const env = resolveEnv(request.env);
|
|
15
|
+
const config = resolveConfig(request.config, env);
|
|
16
|
+
const { record } = await indexDocumentInternal(request);
|
|
17
|
+
ensurePageNumber(record.pageCount, request.pageNumber);
|
|
18
|
+
const { provider, model } = resolveAgentSelection(config, request);
|
|
19
|
+
const renderScale = resolveRenderScale(config, request.renderScale);
|
|
20
|
+
const prompt = typeof request.prompt === "string" && request.prompt.trim().length > 0
|
|
21
|
+
? request.prompt.trim()
|
|
22
|
+
: (config.agent.tablePrompt || DEFAULT_TABLE_PROMPT);
|
|
23
|
+
const tablesDir = path.join(record.artifactPaths.documentDir, "tables");
|
|
24
|
+
const artifactPath = buildStructuredArtifactPath(tablesDir, request.pageNumber, renderScale, provider, model, prompt);
|
|
25
|
+
if (!request.forceRefresh && await fileExists(artifactPath)) {
|
|
26
|
+
const cached = await readJson(artifactPath);
|
|
27
|
+
if (matchesSourceSnapshot(cached, record)) {
|
|
28
|
+
return { ...cached, cacheStatus: "reused" };
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
const renderArtifact = await ensureRenderArtifact({
|
|
32
|
+
pdfPath: request.pdfPath,
|
|
33
|
+
workspaceDir: request.workspaceDir,
|
|
34
|
+
forceRefresh: request.forceRefresh,
|
|
35
|
+
config,
|
|
36
|
+
pageNumber: request.pageNumber,
|
|
37
|
+
renderScale: request.renderScale,
|
|
38
|
+
});
|
|
39
|
+
const imageBytes = new Uint8Array(await readFile(renderArtifact.imagePath));
|
|
40
|
+
const imageDataUrl = toDataUrl(imageBytes, renderArtifact.mimeType);
|
|
41
|
+
const response = await visionRecognize({
|
|
42
|
+
config,
|
|
43
|
+
env,
|
|
44
|
+
providerAlias: provider,
|
|
45
|
+
model,
|
|
46
|
+
prompt,
|
|
47
|
+
imageDataUrl,
|
|
48
|
+
runtimeApiKeys: request.providerApiKeys,
|
|
49
|
+
});
|
|
50
|
+
const parsed = parseJsonObject(response);
|
|
51
|
+
const tables = normalizeTableItems(parsed?.tables);
|
|
52
|
+
const pageArtifactPath = path.join(record.artifactPaths.pagesDir, `${pageLabel(request.pageNumber)}.json`);
|
|
53
|
+
const artifact = {
|
|
54
|
+
documentId: record.documentId,
|
|
55
|
+
pageNumber: request.pageNumber,
|
|
56
|
+
renderScale,
|
|
57
|
+
sourceSizeBytes: record.sizeBytes,
|
|
58
|
+
sourceMtimeMs: record.mtimeMs,
|
|
59
|
+
provider,
|
|
60
|
+
model,
|
|
61
|
+
prompt,
|
|
62
|
+
imagePath: renderArtifact.imagePath,
|
|
63
|
+
pageArtifactPath,
|
|
64
|
+
renderArtifactPath: renderArtifact.artifactPath,
|
|
65
|
+
artifactPath,
|
|
66
|
+
generatedAt: new Date().toISOString(),
|
|
67
|
+
tables,
|
|
68
|
+
};
|
|
69
|
+
await writeJson(artifactPath, artifact);
|
|
70
|
+
return { ...artifact, cacheStatus: "fresh" };
|
|
71
|
+
};
|
package/dist/local/types.d.ts
CHANGED
|
@@ -56,6 +56,9 @@ export interface LocalSemanticDocumentStructure {
|
|
|
56
56
|
readonly pageIndexArtifactPath: string;
|
|
57
57
|
readonly artifactPath: string;
|
|
58
58
|
readonly root: LocalSemanticStructureNode;
|
|
59
|
+
readonly tables?: ReadonlyArray<MergedTableItem>;
|
|
60
|
+
readonly formulas?: ReadonlyArray<MergedFormulaItem>;
|
|
61
|
+
readonly figures?: ReadonlyArray<MergedFigureItem>;
|
|
59
62
|
readonly cacheStatus: "fresh" | "reused";
|
|
60
63
|
}
|
|
61
64
|
export interface LocalPageContent {
|
|
@@ -127,6 +130,79 @@ export interface LocalPageFormulasArtifact {
|
|
|
127
130
|
readonly formulas: ReadonlyArray<LocalFormulaArtifactItem>;
|
|
128
131
|
readonly cacheStatus: "fresh" | "reused";
|
|
129
132
|
}
|
|
133
|
+
export interface LocalFigureArtifactItem {
|
|
134
|
+
readonly id: string;
|
|
135
|
+
readonly figureType: "schematic" | "chart" | "photo" | "diagram" | "other";
|
|
136
|
+
readonly caption?: string;
|
|
137
|
+
readonly description?: string;
|
|
138
|
+
readonly truncatedTop?: boolean;
|
|
139
|
+
readonly truncatedBottom?: boolean;
|
|
140
|
+
}
|
|
141
|
+
export interface LocalPageUnderstandingTableItem {
|
|
142
|
+
readonly id: string;
|
|
143
|
+
readonly latexTabular: string;
|
|
144
|
+
readonly caption?: string;
|
|
145
|
+
readonly truncatedTop?: boolean;
|
|
146
|
+
readonly truncatedBottom?: boolean;
|
|
147
|
+
}
|
|
148
|
+
export interface LocalPageUnderstandingFormulaItem {
|
|
149
|
+
readonly id: string;
|
|
150
|
+
readonly latexMath: string;
|
|
151
|
+
readonly label?: string;
|
|
152
|
+
readonly truncatedTop?: boolean;
|
|
153
|
+
readonly truncatedBottom?: boolean;
|
|
154
|
+
}
|
|
155
|
+
export interface LocalPageUnderstandingArtifact {
|
|
156
|
+
readonly documentId: string;
|
|
157
|
+
readonly pageNumber: number;
|
|
158
|
+
readonly renderScale: number;
|
|
159
|
+
readonly sourceSizeBytes: number;
|
|
160
|
+
readonly sourceMtimeMs: number;
|
|
161
|
+
readonly provider: string;
|
|
162
|
+
readonly model: string;
|
|
163
|
+
readonly prompt: string;
|
|
164
|
+
readonly imagePath: string;
|
|
165
|
+
readonly pageArtifactPath: string;
|
|
166
|
+
readonly renderArtifactPath: string;
|
|
167
|
+
readonly artifactPath: string;
|
|
168
|
+
readonly generatedAt: string;
|
|
169
|
+
readonly tables: ReadonlyArray<LocalPageUnderstandingTableItem>;
|
|
170
|
+
readonly formulas: ReadonlyArray<LocalPageUnderstandingFormulaItem>;
|
|
171
|
+
readonly figures: ReadonlyArray<LocalFigureArtifactItem>;
|
|
172
|
+
readonly cacheStatus: "fresh" | "reused";
|
|
173
|
+
}
|
|
174
|
+
export interface LocalPageUnderstandingRequest extends LocalPageRenderRequest {
|
|
175
|
+
readonly provider?: string;
|
|
176
|
+
readonly model?: string;
|
|
177
|
+
readonly prompt?: string;
|
|
178
|
+
readonly env?: Env;
|
|
179
|
+
readonly providerApiKeys?: Record<string, string>;
|
|
180
|
+
}
|
|
181
|
+
export interface MergedTableItem {
|
|
182
|
+
readonly id: string;
|
|
183
|
+
readonly latexTabular: string;
|
|
184
|
+
readonly caption?: string;
|
|
185
|
+
readonly startPage: number;
|
|
186
|
+
readonly endPage: number;
|
|
187
|
+
readonly crossPageHint?: boolean;
|
|
188
|
+
}
|
|
189
|
+
export interface MergedFormulaItem {
|
|
190
|
+
readonly id: string;
|
|
191
|
+
readonly latexMath: string;
|
|
192
|
+
readonly label?: string;
|
|
193
|
+
readonly startPage: number;
|
|
194
|
+
readonly endPage: number;
|
|
195
|
+
readonly crossPageHint?: boolean;
|
|
196
|
+
}
|
|
197
|
+
export interface MergedFigureItem {
|
|
198
|
+
readonly id: string;
|
|
199
|
+
readonly figureType: "schematic" | "chart" | "photo" | "diagram" | "other";
|
|
200
|
+
readonly caption?: string;
|
|
201
|
+
readonly description?: string;
|
|
202
|
+
readonly startPage: number;
|
|
203
|
+
readonly endPage: number;
|
|
204
|
+
readonly crossPageHint?: boolean;
|
|
205
|
+
}
|
|
130
206
|
export interface LocalDocumentRequest {
|
|
131
207
|
readonly pdfPath: string;
|
|
132
208
|
readonly workspaceDir?: string;
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/// <reference path="../node/compat.d.ts" />
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { toDataUrl } from "../file-utils.js";
|
|
5
|
+
import { visionRecognize } from "../provider-client.js";
|
|
6
|
+
import { ensureRenderArtifact, indexDocumentInternal } from "./document.js";
|
|
7
|
+
import { buildStructuredArtifactPath, ensurePageNumber, fileExists, matchesSourceSnapshot, pageLabel, parseJsonObject, readJson, resolveAgentSelection, resolveConfig, resolveEnv, resolveRenderScale, stripCodeFences, writeJson, } from "./shared.js";
|
|
8
|
+
const DEFAULT_UNDERSTANDING_PROMPT = [
|
|
9
|
+
"Analyze this rendered PDF page image. Extract all tables, displayed formulas, and figures.",
|
|
10
|
+
"Return JSON only. Schema:",
|
|
11
|
+
"{",
|
|
12
|
+
' "tables": [{ "latexTabular": "\\\\begin{tabular}...\\\\end{tabular}", "caption": "optional", "truncatedTop": false, "truncatedBottom": false }],',
|
|
13
|
+
' "formulas": [{ "latexMath": "LaTeX expression", "label": "optional", "truncatedTop": false, "truncatedBottom": false }],',
|
|
14
|
+
' "figures": [{ "figureType": "schematic|chart|photo|diagram|other", "caption": "optional", "description": "brief visual description", "truncatedTop": false, "truncatedBottom": false }]',
|
|
15
|
+
"}",
|
|
16
|
+
"Rules:",
|
|
17
|
+
"- Tables must be complete LaTeX tabular environments.",
|
|
18
|
+
"- Formulas must use LaTeX math notation. Skip trivial inline math or single symbols.",
|
|
19
|
+
"- Figures should be described by type, caption, and a brief visual description. Do not crop or encode images.",
|
|
20
|
+
"- Set truncatedTop/truncatedBottom to true if the element appears cut off at the page boundary.",
|
|
21
|
+
'- If nothing is found for a category, return an empty array for that key.',
|
|
22
|
+
].join("\n");
|
|
23
|
+
const normalizeFigureItems = (value) => {
|
|
24
|
+
if (!Array.isArray(value))
|
|
25
|
+
return [];
|
|
26
|
+
return value.flatMap((item, index) => {
|
|
27
|
+
const figure = item;
|
|
28
|
+
const figureType = typeof figure.figureType === "string" ? figure.figureType.trim() : "other";
|
|
29
|
+
const validTypes = new Set(["schematic", "chart", "photo", "diagram", "other"]);
|
|
30
|
+
return [{
|
|
31
|
+
id: `figure-${index + 1}`,
|
|
32
|
+
figureType: validTypes.has(figureType) ? figureType : "other",
|
|
33
|
+
caption: typeof figure.caption === "string" ? figure.caption.trim() : undefined,
|
|
34
|
+
description: typeof figure.description === "string" ? figure.description.trim() : undefined,
|
|
35
|
+
truncatedTop: figure.truncatedTop === true,
|
|
36
|
+
truncatedBottom: figure.truncatedBottom === true,
|
|
37
|
+
}];
|
|
38
|
+
});
|
|
39
|
+
};
|
|
40
|
+
const normalizeUnderstandingTables = (value) => {
|
|
41
|
+
if (!Array.isArray(value))
|
|
42
|
+
return [];
|
|
43
|
+
return value.flatMap((item, index) => {
|
|
44
|
+
const table = item;
|
|
45
|
+
const latexTabular = typeof table.latexTabular === "string" ? stripCodeFences(table.latexTabular).trim() : "";
|
|
46
|
+
if (!latexTabular.includes("\\begin{tabular}") || !latexTabular.includes("\\end{tabular}"))
|
|
47
|
+
return [];
|
|
48
|
+
return [{
|
|
49
|
+
id: `table-${index + 1}`,
|
|
50
|
+
latexTabular,
|
|
51
|
+
caption: typeof table.caption === "string" ? table.caption.trim() : undefined,
|
|
52
|
+
truncatedTop: table.truncatedTop === true,
|
|
53
|
+
truncatedBottom: table.truncatedBottom === true,
|
|
54
|
+
}];
|
|
55
|
+
});
|
|
56
|
+
};
|
|
57
|
+
const normalizeUnderstandingFormulas = (value) => {
|
|
58
|
+
if (!Array.isArray(value))
|
|
59
|
+
return [];
|
|
60
|
+
return value.flatMap((item, index) => {
|
|
61
|
+
const formula = item;
|
|
62
|
+
const latexMath = typeof formula.latexMath === "string" ? stripCodeFences(formula.latexMath).trim() : "";
|
|
63
|
+
if (!latexMath)
|
|
64
|
+
return [];
|
|
65
|
+
return [{
|
|
66
|
+
id: `formula-${index + 1}`,
|
|
67
|
+
latexMath,
|
|
68
|
+
label: typeof formula.label === "string" ? formula.label.trim() : undefined,
|
|
69
|
+
truncatedTop: formula.truncatedTop === true,
|
|
70
|
+
truncatedBottom: formula.truncatedBottom === true,
|
|
71
|
+
}];
|
|
72
|
+
});
|
|
73
|
+
};
|
|
74
|
+
export const get_page_understanding = async (request) => {
|
|
75
|
+
const env = resolveEnv(request.env);
|
|
76
|
+
const config = resolveConfig(request.config, env);
|
|
77
|
+
const { record } = await indexDocumentInternal(request);
|
|
78
|
+
ensurePageNumber(record.pageCount, request.pageNumber);
|
|
79
|
+
const { provider, model } = resolveAgentSelection(config, request);
|
|
80
|
+
const renderScale = resolveRenderScale(config, request.renderScale);
|
|
81
|
+
const prompt = typeof request.prompt === "string" && request.prompt.trim().length > 0
|
|
82
|
+
? request.prompt.trim()
|
|
83
|
+
: DEFAULT_UNDERSTANDING_PROMPT;
|
|
84
|
+
const understandingDir = path.join(record.artifactPaths.documentDir, "understanding");
|
|
85
|
+
const artifactPath = buildStructuredArtifactPath(understandingDir, request.pageNumber, renderScale, provider, model, prompt);
|
|
86
|
+
if (!request.forceRefresh && await fileExists(artifactPath)) {
|
|
87
|
+
const cached = await readJson(artifactPath);
|
|
88
|
+
if (matchesSourceSnapshot(cached, record)) {
|
|
89
|
+
return { ...cached, cacheStatus: "reused" };
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
const renderArtifact = await ensureRenderArtifact({
|
|
93
|
+
pdfPath: request.pdfPath,
|
|
94
|
+
workspaceDir: request.workspaceDir,
|
|
95
|
+
forceRefresh: request.forceRefresh,
|
|
96
|
+
config,
|
|
97
|
+
pageNumber: request.pageNumber,
|
|
98
|
+
renderScale: request.renderScale,
|
|
99
|
+
});
|
|
100
|
+
const imageBytes = new Uint8Array(await readFile(renderArtifact.imagePath));
|
|
101
|
+
const imageDataUrl = toDataUrl(imageBytes, renderArtifact.mimeType);
|
|
102
|
+
const response = await visionRecognize({
|
|
103
|
+
config,
|
|
104
|
+
env,
|
|
105
|
+
providerAlias: provider,
|
|
106
|
+
model,
|
|
107
|
+
prompt,
|
|
108
|
+
imageDataUrl,
|
|
109
|
+
runtimeApiKeys: request.providerApiKeys,
|
|
110
|
+
});
|
|
111
|
+
const parsed = parseJsonObject(response);
|
|
112
|
+
const tables = normalizeUnderstandingTables(parsed?.tables);
|
|
113
|
+
const formulas = normalizeUnderstandingFormulas(parsed?.formulas);
|
|
114
|
+
const figures = normalizeFigureItems(parsed?.figures);
|
|
115
|
+
const pageArtifactPath = path.join(record.artifactPaths.pagesDir, `${pageLabel(request.pageNumber)}.json`);
|
|
116
|
+
const artifact = {
|
|
117
|
+
documentId: record.documentId,
|
|
118
|
+
pageNumber: request.pageNumber,
|
|
119
|
+
renderScale,
|
|
120
|
+
sourceSizeBytes: record.sizeBytes,
|
|
121
|
+
sourceMtimeMs: record.mtimeMs,
|
|
122
|
+
provider,
|
|
123
|
+
model,
|
|
124
|
+
prompt,
|
|
125
|
+
imagePath: renderArtifact.imagePath,
|
|
126
|
+
pageArtifactPath,
|
|
127
|
+
renderArtifactPath: renderArtifact.artifactPath,
|
|
128
|
+
artifactPath,
|
|
129
|
+
generatedAt: new Date().toISOString(),
|
|
130
|
+
tables,
|
|
131
|
+
formulas,
|
|
132
|
+
figures,
|
|
133
|
+
};
|
|
134
|
+
await writeJson(artifactPath, artifact);
|
|
135
|
+
return { ...artifact, cacheStatus: "fresh" };
|
|
136
|
+
};
|
package/dist/pdf-types.d.ts
CHANGED
package/echo-pdf.config.json
CHANGED
|
@@ -8,7 +8,8 @@
|
|
|
8
8
|
"agent": {
|
|
9
9
|
"defaultProvider": "openai",
|
|
10
10
|
"defaultModel": "",
|
|
11
|
-
"tablePrompt": "Detect all tabular structures from this PDF page image. Output only valid LaTeX tabular environments, no explanations, no markdown fences."
|
|
11
|
+
"tablePrompt": "Detect all tabular structures from this PDF page image. Output only valid LaTeX tabular environments, no explanations, no markdown fences.",
|
|
12
|
+
"formulaPrompt": ""
|
|
12
13
|
},
|
|
13
14
|
"providers": {
|
|
14
15
|
"openai": {
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@echofiles/echo-pdf",
|
|
3
3
|
"description": "Local-first PDF document component core with CLI, workspace artifacts, and reusable page primitives.",
|
|
4
|
-
"version": "0.7.0",
|
|
4
|
+
"version": "0.9.0",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"homepage": "https://pdf.echofile.ai/",
|
|
7
7
|
"repository": {
|