@echofiles/echo-pdf 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +85 -562
- package/bin/echo-pdf.js +130 -525
- package/dist/file-utils.d.ts +0 -3
- package/dist/file-utils.js +0 -18
- package/dist/local/document.d.ts +10 -0
- package/dist/local/document.js +133 -0
- package/dist/local/index.d.ts +3 -135
- package/dist/local/index.js +2 -555
- package/dist/local/semantic.d.ts +2 -0
- package/dist/local/semantic.js +231 -0
- package/dist/local/shared.d.ts +50 -0
- package/dist/local/shared.js +173 -0
- package/dist/local/types.d.ts +183 -0
- package/dist/local/types.js +2 -0
- package/dist/node/pdfium-local.js +30 -6
- package/dist/pdf-config.js +2 -65
- package/dist/pdf-types.d.ts +1 -58
- package/dist/types.d.ts +1 -87
- package/echo-pdf.config.json +1 -21
- package/package.json +25 -22
- package/bin/lib/http.js +0 -97
- package/bin/lib/mcp-stdio.js +0 -99
- package/dist/auth.d.ts +0 -18
- package/dist/auth.js +0 -36
- package/dist/core/index.d.ts +0 -50
- package/dist/core/index.js +0 -7
- package/dist/file-ops.d.ts +0 -11
- package/dist/file-ops.js +0 -36
- package/dist/file-store-do.d.ts +0 -36
- package/dist/file-store-do.js +0 -298
- package/dist/http-error.d.ts +0 -9
- package/dist/http-error.js +0 -14
- package/dist/index.d.ts +0 -1
- package/dist/index.js +0 -1
- package/dist/mcp-server.d.ts +0 -3
- package/dist/mcp-server.js +0 -124
- package/dist/node/semantic-local.d.ts +0 -16
- package/dist/node/semantic-local.js +0 -113
- package/dist/pdf-agent.d.ts +0 -18
- package/dist/pdf-agent.js +0 -217
- package/dist/pdf-storage.d.ts +0 -8
- package/dist/pdf-storage.js +0 -86
- package/dist/pdfium-engine.d.ts +0 -9
- package/dist/pdfium-engine.js +0 -180
- package/dist/r2-file-store.d.ts +0 -20
- package/dist/r2-file-store.js +0 -176
- package/dist/response-schema.d.ts +0 -15
- package/dist/response-schema.js +0 -159
- package/dist/tool-registry.d.ts +0 -16
- package/dist/tool-registry.js +0 -175
- package/dist/worker.d.ts +0 -7
- package/dist/worker.js +0 -386
- package/scripts/export-fixtures.sh +0 -204
- package/wrangler.toml +0 -19
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
/// <reference path="../node/compat.d.ts" />
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { resolveModelForProvider, resolveProviderAlias } from "../agent-defaults.js";
|
|
5
|
+
import { toDataUrl } from "../file-utils.js";
|
|
6
|
+
import { generateText, visionRecognize } from "../provider-client.js";
|
|
7
|
+
import { ensureRenderArtifact, indexDocumentInternal } from "./document.js";
|
|
8
|
+
import { fileExists, matchesSourceSnapshot, matchesStrategyKey, pageLabel, parseJsonObject, readJson, resolveConfig, resolveEnv, writeJson, } from "./shared.js";
|
|
9
|
+
/**
 * Expands the optional `semanticExtraction` request options into a complete budget.
 *
 * - `pageSelection` is always "all".
 * - `chunkMaxChars` is honoured only when it is a finite number greater than 400
 *   (floored to an integer); otherwise 4000 is used.
 * - `chunkOverlapChars` is honoured only when it is a finite number >= 0
 *   (floored to an integer); otherwise 300 is used.
 *
 * @param input Optional caller-supplied budget overrides; may be undefined.
 * @returns An object with `pageSelection`, `chunkMaxChars` and `chunkOverlapChars`.
 */
const resolveSemanticExtractionBudget = (input) => {
    const requestedMax = input?.chunkMaxChars;
    const requestedOverlap = input?.chunkOverlapChars;
    let chunkMaxChars = 4000;
    if (typeof requestedMax === "number" && Number.isFinite(requestedMax) && requestedMax > 400) {
        chunkMaxChars = Math.floor(requestedMax);
    }
    let chunkOverlapChars = 300;
    if (typeof requestedOverlap === "number" && Number.isFinite(requestedOverlap) && requestedOverlap >= 0) {
        chunkOverlapChars = Math.floor(requestedOverlap);
    }
    return { pageSelection: "all", chunkMaxChars, chunkOverlapChars };
};
|
|
18
|
+
/**
 * Validates one heading candidate returned by the page-understanding model.
 *
 * A candidate is accepted only when it has a non-blank string title, a positive
 * integer level, and a finite numeric confidence that is at least 0.6 after being
 * clamped into [0, 1]. Anything else yields `null`.
 *
 * @param value Raw candidate from the model output (may be null or malformed).
 * @param pageNumber Page the candidate was extracted from; copied onto the result.
 * @returns A normalized candidate `{ title, level, pageNumber, excerpt, confidence }` or `null`.
 */
const normalizeSemanticAgentCandidate = (value, pageNumber) => {
    const rawTitle = value?.title;
    const rawLevel = value?.level;
    const rawConfidence = value?.confidence;
    const rawExcerpt = value?.excerpt;

    const title = typeof rawTitle === "string" ? rawTitle.trim() : "";
    if (title.length === 0) {
        return null;
    }

    const hasValidLevel = typeof rawLevel === "number" && Number.isInteger(rawLevel) && rawLevel > 0;
    if (!hasValidLevel) {
        return null;
    }

    // Non-numeric / non-finite confidence counts as 0 and is therefore rejected below.
    let confidence = 0;
    if (typeof rawConfidence === "number" && Number.isFinite(rawConfidence)) {
        confidence = Math.max(0, Math.min(1, rawConfidence));
    }
    if (confidence < 0.6) {
        return null;
    }

    const excerpt = typeof rawExcerpt === "string" ? rawExcerpt.trim() : undefined;
    return { title, level: rawLevel, pageNumber, excerpt, confidence };
};
|
|
37
|
+
/**
 * Converts the model's `sections` array into a tree of semantic section nodes.
 *
 * Entries are kept only when they have a non-blank string title, a positive integer
 * `level` and a positive integer `pageNumber`; everything else is dropped. Children
 * are converted recursively with the same rules.
 *
 * The input comes from parsed model output, so array entries may be `null`,
 * `undefined` or primitives. Those are skipped instead of being dereferenced.
 *
 * @param value Candidate array of section objects; any non-array yields `[]`.
 * @param pageArtifactPaths Map from page number to that page's artifact path.
 * @returns Array of section nodes (`type: "section"`), possibly empty.
 */
const toSemanticTree = (value, pageArtifactPaths) => {
    if (!Array.isArray(value))
        return [];
    const nodes = [];
    value.forEach((item, index) => {
        // Guard against null/undefined/primitive entries before reading properties.
        if (item === null || typeof item !== "object")
            return;
        const node = item;
        const title = typeof node.title === "string" ? node.title.trim() : "";
        const level = typeof node.level === "number" && Number.isInteger(node.level) && node.level > 0 ? node.level : undefined;
        const pageNumber = typeof node.pageNumber === "number" && Number.isInteger(node.pageNumber) && node.pageNumber > 0 ? node.pageNumber : undefined;
        if (!title || typeof level !== "number" || typeof pageNumber !== "number")
            return;
        nodes.push({
            // `index` is the position in the original array, so skipped entries leave gaps.
            id: `semantic-node-${index + 1}-${pageNumber}-${level}`,
            type: "section",
            title,
            level,
            pageNumber,
            pageArtifactPath: pageArtifactPaths.get(pageNumber),
            excerpt: typeof node.excerpt === "string" ? node.excerpt.trim() : undefined,
            children: toSemanticTree(node.children, pageArtifactPaths),
        });
    });
    return nodes;
};
|
|
61
|
+
/**
 * Builds the per-page prompt sent to the vision model for heading-candidate extraction.
 *
 * The prompt is a newline-joined sequence of: task preamble, the expected JSON schema,
 * extraction rules, the page number and render scale, and finally the page's extracted text.
 *
 * @param page Page record; `pageNumber` and `text` are read.
 * @param renderScale Scale the page image was rendered at (reported in the prompt).
 * @returns The full prompt string.
 */
const buildSemanticPageUnderstandingPrompt = (page, renderScale) => {
    const preamble = [
        "You extract semantic heading candidates from one rendered PDF page.",
        "Primary evidence is the page image layout. Use the extracted page text only as supporting context.",
        "Return JSON only.",
    ];
    const schema = [
        "Schema:",
        "{",
        ' "candidates": [',
        " {",
        ' "title": "string",',
        ' "level": 1,',
        ' "excerpt": "short evidence string",',
        ' "confidence": 0.0',
        " }",
        " ]",
        "}",
    ];
    const rules = [
        "Rules:",
        "- Use only true document headings/sections that are clearly supported by page layout plus text.",
        "- Prefer conservative extraction over guessing.",
        "- Do not include table column headers, field labels, figure labels, unit/value rows, worksheet fragments, or prose sentences.",
        "- Do not infer hierarchy beyond the explicit heading numbering or structure visible on the page.",
        "- Confidence should reflect how likely the candidate is to be a real navigational section heading in the document.",
        '- If no reliable semantic structure is detectable, return {"candidates":[]}.',
    ];
    const pageContext = [
        `Page number: ${page.pageNumber}`,
        `Render scale: ${renderScale}`,
        "",
        "Extracted page text:",
        page.text,
    ];
    return [...preamble, ...schema, ...rules, ...pageContext].join("\n");
};
|
|
91
|
+
/**
 * Builds the document-level prompt that asks the model to assemble page-level heading
 * candidates into a nested section structure.
 *
 * The prompt is a newline-joined sequence of: task preamble, the expected JSON schema,
 * aggregation rules, the document filename and page count, and a pretty-printed JSON
 * dump of `{ candidates }`.
 *
 * @param record Document record; `filename` and `pageCount` are read.
 * @param candidates Normalized heading candidates collected from all pages.
 * @returns The full prompt string.
 */
const buildSemanticAggregationPrompt = (record, candidates) => {
    const preamble = [
        "You assemble semantic document structure from page-understanding heading candidates.",
        "Return JSON only.",
    ];
    const schema = [
        "Schema:",
        "{",
        ' "sections": [',
        " {",
        ' "title": "string",',
        ' "level": 1,',
        ' "pageNumber": 1,',
        ' "excerpt": "short evidence string",',
        ' "children": []',
        " }",
        " ]",
        "}",
    ];
    const rules = [
        "Rules:",
        "- Preserve hierarchy with nested children.",
        "- Use only candidate headings that form a reliable document structure.",
        "- Favor candidates with strong confidence and clear section semantics; drop visually prominent noise.",
        "- Deduplicate repeated headings from overlapping segments.",
        "- Do not invent sections not present in the candidates.",
        '- If no reliable semantic structure is detectable, return {"sections":[]}.',
    ];
    const documentContext = [
        `Document filename: ${record.filename}`,
        `Page count: ${record.pageCount}`,
        "",
        JSON.stringify({ candidates }, null, 2),
    ];
    return [...preamble, ...schema, ...rules, ...documentContext].join("\n");
};
|
|
121
|
+
/**
 * Resolves which provider alias and model to use for semantic extraction.
 *
 * Delegates to `resolveProviderAlias` and `resolveModelForProvider`, passing the
 * request's optional `provider` / `model` overrides.
 *
 * @param config Resolved echo-pdf configuration.
 * @param request Semantic request; `provider` and `model` are read.
 * @returns `{ provider, model }` when both resolve to truthy values.
 * @throws Error when either the provider or the model cannot be resolved.
 */
const resolveSemanticAgentContext = (config, request) => {
    const provider = resolveProviderAlias(config, request.provider);
    const model = resolveModelForProvider(config, provider, request.model);
    if (provider && model) {
        return { provider, model };
    }
    const problem = "semantic extraction requires a configured provider and model.";
    const perCallFix = "Pass `provider` and `model` to `get_semantic_document_structure()`";
    const configFix = "or configure them first with `echo-pdf provider use --provider <alias>` and `echo-pdf model set --provider <alias> --model <model-id>`.";
    throw new Error(`${problem} ${perCallFix} ${configFix}`);
};
|
|
133
|
+
/**
 * Runs page-understanding on a single page: ensures the page render exists, sends the
 * rendered image plus a prompt to the vision model, and normalizes the returned candidates.
 *
 * @param input Object with `request` (pdfPath, workspaceDir, forceRefresh, providerApiKeys),
 *   `config`, `env`, `page` (pageNumber, text), `provider` and `model`.
 * @returns Array of accepted candidates for this page; candidates rejected by
 *   `normalizeSemanticAgentCandidate` are dropped.
 */
const extractSemanticCandidatesFromRenderedPage = async (input) => {
    const { request, config, env, page, provider, model } = input;
    const pageNumber = page.pageNumber;

    const renderArtifact = await ensureRenderArtifact({
        pdfPath: request.pdfPath,
        workspaceDir: request.workspaceDir,
        forceRefresh: request.forceRefresh,
        config,
        pageNumber,
    });

    // The rendered image is handed to the model inline as a data URL.
    const rawImage = await readFile(renderArtifact.imagePath);
    const imageDataUrl = toDataUrl(new Uint8Array(rawImage), renderArtifact.mimeType);
    const prompt = buildSemanticPageUnderstandingPrompt(page, renderArtifact.renderScale);

    const response = await visionRecognize({
        config,
        env,
        providerAlias: provider,
        model,
        prompt,
        imageDataUrl,
        runtimeApiKeys: request.providerApiKeys,
    });

    const parsed = parseJsonObject(response);
    const rawCandidates = Array.isArray(parsed?.candidates) ? parsed.candidates : [];
    const accepted = [];
    for (const rawCandidate of rawCandidates) {
        const normalized = normalizeSemanticAgentCandidate(rawCandidate, pageNumber);
        if (normalized !== null) {
            accepted.push(normalized);
        }
    }
    return accepted;
};
|
|
157
|
+
/**
 * Produces (or reuses) the semantic-structure artifact for a document.
 *
 * Steps:
 *  1. Resolve env/config, index the document, and resolve provider + model.
 *  2. Unless `request.forceRefresh` is set, reuse the artifact on disk when its source
 *     snapshot (size + mtime) and strategy key both match.
 *  3. Otherwise read every page artifact, extract heading candidates page by page,
 *     keep the highest-confidence candidate per `page:level:title` key, ask the model
 *     to aggregate them into sections, and write the resulting artifact.
 *
 * A cached artifact that cannot be read or parsed, or that is not an object, is
 * treated as a cache miss and regenerated. Previously such a file made every call
 * throw until the caller passed `forceRefresh`.
 *
 * @param request Semantic document request (pdfPath, workspaceDir, forceRefresh, config,
 *   env, provider, model, semanticExtraction, providerApiKeys).
 * @returns The artifact plus `cacheStatus` of "reused" or "fresh".
 * @throws Error when provider/model cannot be resolved, or when a downstream call fails.
 */
const ensureSemanticStructureArtifact = async (request) => {
    const env = resolveEnv(request.env);
    const config = resolveConfig(request.config, env);
    const { record } = await indexDocumentInternal(request);
    const artifactPath = record.artifactPaths.semanticStructureJsonPath;
    const semanticBudget = resolveSemanticExtractionBudget(request.semanticExtraction);
    const { provider, model } = resolveSemanticAgentContext(config, request);
    // Any change to provider, model, render scale or budget invalidates the cached artifact.
    const strategyKey = `agent::page-understanding-v1::${provider}::${model}::${config.service.defaultRenderScale}::${semanticBudget.pageSelection}::${semanticBudget.chunkMaxChars}::${semanticBudget.chunkOverlapChars}`;
    if (!request.forceRefresh && await fileExists(artifactPath)) {
        let cached = null;
        try {
            cached = await readJson(artifactPath);
        }
        catch {
            // Unreadable or corrupt cache file: fall through and regenerate it.
            cached = null;
        }
        if (cached !== null
            && typeof cached === "object"
            && matchesSourceSnapshot(cached, record)
            && matchesStrategyKey(cached, strategyKey)) {
            return {
                ...cached,
                cacheStatus: "reused",
            };
        }
    }
    const pages = [];
    for (let pageNumber = 1; pageNumber <= record.pageCount; pageNumber += 1) {
        const pagePath = path.join(record.artifactPaths.pagesDir, `${pageLabel(pageNumber)}.json`);
        const page = await readJson(pagePath);
        pages.push(page);
    }
    const pageArtifactPaths = new Map(pages.map((page) => [page.pageNumber, page.artifactPath]));
    const candidateMap = new Map();
    // Pages are processed one at a time, in page order.
    for (const page of pages) {
        const candidates = await extractSemanticCandidatesFromRenderedPage({
            page,
            request,
            config,
            env,
            provider,
            model,
        });
        for (const candidate of candidates) {
            // De-duplicate identical headings, keeping the most confident one.
            const key = `${candidate.pageNumber}:${candidate.level}:${candidate.title}`;
            const existing = candidateMap.get(key);
            if (!existing || candidate.confidence > existing.confidence) {
                candidateMap.set(key, candidate);
            }
        }
    }
    const aggregated = await generateText({
        config,
        env,
        providerAlias: provider,
        model,
        prompt: buildSemanticAggregationPrompt(record, [...candidateMap.values()]),
        runtimeApiKeys: request.providerApiKeys,
    });
    const parsed = parseJsonObject(aggregated);
    const sections = toSemanticTree(parsed?.sections, pageArtifactPaths);
    const artifact = {
        documentId: record.documentId,
        generatedAt: new Date().toISOString(),
        detector: "agent-structured-v1",
        strategyKey,
        sourceSizeBytes: record.sizeBytes,
        sourceMtimeMs: record.mtimeMs,
        pageIndexArtifactPath: record.artifactPaths.structureJsonPath,
        artifactPath,
        root: {
            id: `semantic-${record.documentId}`,
            type: "document",
            title: record.filename,
            children: sections,
        },
    };
    await writeJson(artifactPath, artifact);
    return {
        ...artifact,
        cacheStatus: "fresh",
    };
};
|
|
231
|
+
/**
 * Public entry point for semantic document structure extraction.
 *
 * Thin wrapper that forwards the request to `ensureSemanticStructureArtifact`.
 *
 * @param request Semantic document request.
 * @returns Promise of the semantic structure artifact including its `cacheStatus`.
 */
export const get_semantic_document_structure = async (request) => {
    return ensureSemanticStructureArtifact(request);
};
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import type { EchoPdfConfig } from "../pdf-types.js";
|
|
2
|
+
import type { Env } from "../types.js";
|
|
3
|
+
import type { InternalDocumentArtifactPaths, LocalDocumentArtifactPaths, LocalFormulaArtifactItem, LocalTableArtifactItem, StoredDocumentRecord } from "./types.js";
|
|
4
|
+
export declare const defaultWorkspaceDir: () => string;
|
|
5
|
+
export declare const resolveWorkspaceDir: (workspaceDir?: string) => string;
|
|
6
|
+
export declare const toDocumentId: (absolutePdfPath: string) => string;
|
|
7
|
+
export declare const hashFragment: (value: string, length?: number) => string;
|
|
8
|
+
export declare const sanitizeSegment: (value: string) => string;
|
|
9
|
+
export declare const scaleLabel: (value: number) => string;
|
|
10
|
+
export declare const pageLabel: (pageNumber: number) => string;
|
|
11
|
+
export declare const buildArtifactPaths: (workspaceDir: string, documentId: string) => InternalDocumentArtifactPaths;
|
|
12
|
+
export declare const toPublicArtifactPaths: (paths: InternalDocumentArtifactPaths) => LocalDocumentArtifactPaths;
|
|
13
|
+
export declare const buildRenderArtifactPaths: (paths: LocalDocumentArtifactPaths, pageNumber: number, renderScale: number) => {
|
|
14
|
+
artifactPath: string;
|
|
15
|
+
imagePath: string;
|
|
16
|
+
};
|
|
17
|
+
export declare const buildStructuredArtifactPath: (baseDir: string, pageNumber: number, renderScale: number, provider: string, model: string, prompt: string) => string;
|
|
18
|
+
export declare const createPreview: (text: string) => string;
|
|
19
|
+
export declare const createPageTitle: (pageNumber: number, text: string) => string;
|
|
20
|
+
export declare const stripCodeFences: (value: string) => string;
|
|
21
|
+
export declare const parseJsonObject: (value: string) => unknown;
|
|
22
|
+
export declare const normalizeTableItems: (value: unknown) => LocalTableArtifactItem[];
|
|
23
|
+
export declare const normalizeFormulaItems: (value: unknown) => LocalFormulaArtifactItem[];
|
|
24
|
+
export declare const resolveEnv: (env?: Env) => Env;
|
|
25
|
+
export declare const resolveConfig: (config?: EchoPdfConfig, env?: Env) => EchoPdfConfig;
|
|
26
|
+
export declare const resolveAgentSelection: (config: EchoPdfConfig, input: {
|
|
27
|
+
provider?: string;
|
|
28
|
+
model?: string;
|
|
29
|
+
}) => {
|
|
30
|
+
provider: string;
|
|
31
|
+
model: string;
|
|
32
|
+
};
|
|
33
|
+
export declare const resolveRenderScale: (config: EchoPdfConfig, requestedScale?: number) => number;
|
|
34
|
+
export declare const fileExists: (targetPath: string) => Promise<boolean>;
|
|
35
|
+
export declare const readJson: <T>(targetPath: string) => Promise<T>;
|
|
36
|
+
export declare const loadStoredDocument: (paths: InternalDocumentArtifactPaths) => Promise<StoredDocumentRecord | null>;
|
|
37
|
+
export declare const isReusableRecord: (record: StoredDocumentRecord, sourceStats: {
|
|
38
|
+
sizeBytes: number;
|
|
39
|
+
mtimeMs: number;
|
|
40
|
+
}, paths: InternalDocumentArtifactPaths) => Promise<boolean>;
|
|
41
|
+
export declare const writeJson: (targetPath: string, data: unknown) => Promise<void>;
|
|
42
|
+
export declare const readSourceBytes: (sourcePath: string) => Promise<Uint8Array>;
|
|
43
|
+
export declare const matchesSourceSnapshot: (artifact: {
|
|
44
|
+
sourceSizeBytes?: unknown;
|
|
45
|
+
sourceMtimeMs?: unknown;
|
|
46
|
+
}, record: StoredDocumentRecord) => boolean;
|
|
47
|
+
export declare const matchesStrategyKey: (artifact: {
|
|
48
|
+
strategyKey?: unknown;
|
|
49
|
+
}, strategyKey: string) => boolean;
|
|
50
|
+
export declare const ensurePageNumber: (pageCount: number, pageNumber: number) => void;
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { mkdir, readFile, stat, writeFile } from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { resolveModelForProvider, resolveProviderAlias } from "../agent-defaults.js";
|
|
5
|
+
import { loadEchoPdfConfig } from "../pdf-config.js";
|
|
6
|
+
/**
 * Returns the default workspace directory: `.echo-pdf-workspace` resolved against
 * the current working directory at call time.
 */
export const defaultWorkspaceDir = () => {
    const workingDir = process.cwd();
    return path.resolve(workingDir, ".echo-pdf-workspace");
};
|
|
7
|
+
/**
 * Resolves the workspace directory to an absolute path.
 *
 * @param workspaceDir Optional directory; surrounding whitespace is trimmed. When it is
 *   missing or blank, `defaultWorkspaceDir()` is used instead.
 * @returns The chosen directory resolved against the current working directory.
 */
export const resolveWorkspaceDir = (workspaceDir) => {
    const requested = workspaceDir?.trim();
    const target = requested || defaultWorkspaceDir();
    return path.resolve(process.cwd(), target);
};
|
|
8
|
+
/**
 * Derives a document id from a PDF path: the first 16 hex characters of the
 * SHA-256 digest of the given string.
 *
 * @param absolutePdfPath Path string to hash.
 * @returns 16-character lowercase hex id.
 */
export const toDocumentId = (absolutePdfPath) => {
    const digest = createHash("sha256").update(absolutePdfPath).digest("hex");
    return digest.slice(0, 16);
};
|
|
9
|
+
/**
 * Returns a prefix of the SHA-256 hex digest of `value`.
 *
 * @param value String to hash.
 * @param length Number of hex characters to keep (default 12).
 * @returns The leading `length` characters of the hex digest.
 */
export const hashFragment = (value, length = 12) => {
    const hexDigest = createHash("sha256").update(value).digest("hex");
    return hexDigest.slice(0, length);
};
|
|
10
|
+
/**
 * Makes a string safe for use as a file-name segment by replacing every run of
 * characters outside `a-z A-Z 0-9 . _ -` with a single underscore.
 *
 * @param value Raw segment text.
 * @returns The sanitized segment.
 */
export const sanitizeSegment = (value) => {
    const disallowedRun = /[^a-zA-Z0-9._-]+/g;
    return value.replace(disallowedRun, "_");
};
|
|
11
|
+
/**
 * Turns a render scale into a file-name-safe label by stringifying it and passing
 * the result through `sanitizeSegment`.
 *
 * @param value Render scale value.
 * @returns Sanitized string form of the scale.
 */
export const scaleLabel = (value) => {
    const asText = String(value);
    return sanitizeSegment(asText);
};
|
|
12
|
+
/**
 * Formats a page number as a label left-padded with zeros to at least four characters.
 *
 * @param pageNumber Page number to format.
 * @returns e.g. `"0007"` for 7; longer numbers are returned unpadded.
 */
export const pageLabel = (pageNumber) => {
    const digits = String(pageNumber);
    return digits.padStart(4, "0");
};
|
|
13
|
+
/**
 * Computes every artifact location for one document inside a workspace.
 *
 * All paths live under `<workspaceDir>/documents/<documentId>/`.
 *
 * @param workspaceDir Workspace root directory.
 * @param documentId Document identifier used as the directory name.
 * @returns Object with `workspaceDir`, `documentDir`, the three JSON artifact paths,
 *   and the `pagesDir` / `rendersDir` directories.
 */
export const buildArtifactPaths = (workspaceDir, documentId) => {
    const documentDir = path.join(workspaceDir, "documents", documentId);
    const inDocumentDir = (entryName) => path.join(documentDir, entryName);
    return {
        workspaceDir,
        documentDir,
        documentJsonPath: inDocumentDir("document.json"),
        structureJsonPath: inDocumentDir("structure.json"),
        semanticStructureJsonPath: inDocumentDir("semantic-structure.json"),
        pagesDir: inDocumentDir("pages"),
        rendersDir: inDocumentDir("renders"),
    };
};
|
|
25
|
+
/**
 * Copies the artifact-path fields into a fresh object.
 *
 * Only the seven known path fields are carried over; any other properties on the
 * input are not included in the result.
 *
 * @param paths Internal artifact paths.
 * @returns New object containing the same seven path fields.
 */
export const toPublicArtifactPaths = (paths) => {
    const {
        workspaceDir,
        documentDir,
        documentJsonPath,
        structureJsonPath,
        semanticStructureJsonPath,
        pagesDir,
        rendersDir,
    } = paths;
    return {
        workspaceDir,
        documentDir,
        documentJsonPath,
        structureJsonPath,
        semanticStructureJsonPath,
        pagesDir,
        rendersDir,
    };
};
|
|
34
|
+
/**
 * Computes where a page render's metadata and image are stored.
 *
 * Both files share the base name `<pageLabel>.scale-<scaleLabel>` inside
 * `paths.rendersDir`, with `.json` and `.png` extensions respectively.
 *
 * @param paths Document artifact paths; `rendersDir` is read.
 * @param pageNumber Page number of the render.
 * @param renderScale Scale the page is rendered at.
 * @returns `{ artifactPath, imagePath }`.
 */
export const buildRenderArtifactPaths = (paths, pageNumber, renderScale) => {
    const baseName = `${pageLabel(pageNumber)}.scale-${scaleLabel(renderScale)}`;
    const withExtension = (extension) => path.join(paths.rendersDir, `${baseName}.${extension}`);
    const artifactPath = withExtension("json");
    const imagePath = withExtension("png");
    return { artifactPath, imagePath };
};
|
|
41
|
+
/**
 * Computes the file path for a structured (model-generated) page artifact.
 *
 * The file name is dot-separated and encodes the page label, render scale, sanitized
 * provider and model names, and a 10-character hash of the prompt, followed by `.json`.
 *
 * @param baseDir Directory the artifact is stored in.
 * @param pageNumber Page number.
 * @param renderScale Render scale used for the page image.
 * @param provider Provider alias.
 * @param model Model identifier.
 * @param prompt Prompt text; only its hash appears in the name.
 * @returns Full path of the artifact JSON file.
 */
export const buildStructuredArtifactPath = (baseDir, pageNumber, renderScale, provider, model, prompt) => {
    const pagePart = pageLabel(pageNumber);
    const scalePart = `scale-${scaleLabel(renderScale)}`;
    const providerPart = `provider-${sanitizeSegment(provider)}`;
    const modelPart = `model-${sanitizeSegment(model)}`;
    const promptPart = `prompt-${hashFragment(prompt, 10)}`;
    const fileName = `${pagePart}.${scalePart}.${providerPart}.${modelPart}.${promptPart}.json`;
    return path.join(baseDir, fileName);
};
|
|
51
|
+
/**
 * Builds a short single-line preview of page text: whitespace runs are collapsed to
 * one space, the ends are trimmed, and the result is cut to at most 160 characters.
 *
 * @param text Source text.
 * @returns The preview string.
 */
export const createPreview = (text) => {
    const singleLine = text.replace(/\s+/g, " ");
    return singleLine.trim().slice(0, 160);
};
|
|
52
|
+
/**
 * Builds a display title for a page from its first non-blank line.
 *
 * @param pageNumber Page number shown in the title.
 * @param text Page text; split on LF or CRLF.
 * @returns `Page N: <first non-blank line, max 80 chars>` or just `Page N` when the
 *   text has no non-blank line.
 */
export const createPageTitle = (pageNumber, text) => {
    for (const rawLine of text.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line.length > 0) {
            return `Page ${pageNumber}: ${line.slice(0, 80)}`;
        }
    }
    return `Page ${pageNumber}`;
};
|
|
59
|
+
/**
 * Removes a single surrounding Markdown code fence from a string.
 *
 * When the trimmed input is exactly one fenced block (an opening fence with an
 * optional language tag, content, and a closing fence), the trimmed inner content is
 * returned. Otherwise the trimmed input is returned unchanged.
 *
 * Both LF and CRLF line endings are accepted around the fences. The original pattern
 * required a bare `\n`, so CRLF-terminated output kept its fences.
 *
 * @param value Text that may be wrapped in a code fence.
 * @returns The unwrapped (or merely trimmed) text.
 */
export const stripCodeFences = (value) => {
    const text = value.trim();
    const fenced = text.match(/^```[a-zA-Z0-9_-]*\r?\n([\s\S]*?)\r?\n```$/);
    return typeof fenced?.[1] === "string" ? fenced[1].trim() : text;
};
|
|
64
|
+
/**
 * Parses JSON out of model output.
 *
 * The input is first unwrapped with `stripCodeFences`. If the whole text parses as
 * JSON, that value is returned (it may be any JSON value, not only an object). If not,
 * the span from the first `{` to the last `}` is tried as a fallback, which tolerates
 * prose around a JSON object.
 *
 * Every parse failure surfaces as the same `Error("model output was not valid JSON")`.
 * Previously a failing fallback parse leaked a raw `SyntaxError`.
 *
 * @param value Raw model output.
 * @returns The parsed value, or `null` when the input is blank.
 * @throws Error when no valid JSON can be extracted.
 */
export const parseJsonObject = (value) => {
    const trimmed = stripCodeFences(value).trim();
    if (!trimmed)
        return null;
    try {
        return JSON.parse(trimmed);
    }
    catch {
        // Not pure JSON; try the embedded-object fallback below.
    }
    const start = trimmed.indexOf("{");
    const end = trimmed.lastIndexOf("}");
    if (start >= 0 && end > start) {
        try {
            return JSON.parse(trimmed.slice(start, end + 1));
        }
        catch {
            // Fall through to the uniform error.
        }
    }
    throw new Error("model output was not valid JSON");
};
|
|
80
|
+
/**
 * Normalizes model-produced table items.
 *
 * An item is kept only when its `latexTabular` is a string that, after fence stripping
 * and trimming, contains both `\begin{tabular}` and `\end{tabular}`. Kept items get an
 * id of `table-<position>` based on their position in the input array.
 *
 * Array entries that are `null`, `undefined` or primitives are skipped rather than
 * dereferenced, since the array comes from parsed model output.
 *
 * @param value Candidate array of table items; any non-array yields `[]`.
 * @returns Array of `{ id, latexTabular, caption, evidenceText }`.
 */
export const normalizeTableItems = (value) => {
    if (!Array.isArray(value))
        return [];
    return value.flatMap((item, index) => {
        const table = item;
        // Optional chaining guards null/undefined entries.
        const rawLatex = table?.latexTabular;
        const latexTabular = typeof rawLatex === "string" ? stripCodeFences(rawLatex).trim() : "";
        if (!latexTabular.includes("\\begin{tabular}") || !latexTabular.includes("\\end{tabular}"))
            return [];
        return [{
                id: `table-${index + 1}`,
                latexTabular,
                caption: typeof table.caption === "string" ? table.caption.trim() : undefined,
                evidenceText: typeof table.evidenceText === "string" ? table.evidenceText.trim() : undefined,
            }];
    });
};
|
|
96
|
+
/**
 * Normalizes model-produced formula items.
 *
 * An item is kept only when its `latexMath` is a string that is non-empty after fence
 * stripping and trimming. Kept items get an id of `formula-<position>` based on their
 * position in the input array.
 *
 * Array entries that are `null`, `undefined` or primitives are skipped rather than
 * dereferenced, since the array comes from parsed model output.
 *
 * @param value Candidate array of formula items; any non-array yields `[]`.
 * @returns Array of `{ id, latexMath, label, evidenceText }`.
 */
export const normalizeFormulaItems = (value) => {
    if (!Array.isArray(value))
        return [];
    return value.flatMap((item, index) => {
        const formula = item;
        // Optional chaining guards null/undefined entries.
        const rawLatex = formula?.latexMath;
        const latexMath = typeof rawLatex === "string" ? stripCodeFences(rawLatex).trim() : "";
        if (!latexMath)
            return [];
        return [{
                id: `formula-${index + 1}`,
                latexMath,
                label: typeof formula.label === "string" ? formula.label.trim() : undefined,
                evidenceText: typeof formula.evidenceText === "string" ? formula.evidenceText.trim() : undefined,
            }];
    });
};
|
|
112
|
+
/**
 * Returns the environment to use: the supplied `env`, or `process.env` when the
 * argument is `null` or `undefined`.
 *
 * @param env Optional environment object.
 * @returns `env` itself when provided, otherwise `process.env`.
 */
export const resolveEnv = (env) => {
    if (env === undefined || env === null) {
        return process.env;
    }
    return env;
};
|
|
113
|
+
/**
 * Returns the configuration to use: the supplied `config`, or one loaded via
 * `loadEchoPdfConfig` from the resolved environment when `config` is `null`/`undefined`.
 *
 * @param config Optional explicit configuration.
 * @param env Optional environment passed through `resolveEnv` when loading.
 * @returns The effective configuration.
 */
export const resolveConfig = (config, env) => {
    if (config !== undefined && config !== null) {
        return config;
    }
    return loadEchoPdfConfig(resolveEnv(env));
};
|
|
114
|
+
/**
 * Resolves the provider alias and model for structured (vision-language) artifacts.
 *
 * Delegates to `resolveProviderAlias` and `resolveModelForProvider` using the optional
 * `provider` / `model` overrides on `input`.
 *
 * @param config Resolved echo-pdf configuration.
 * @param input Object with optional `provider` and `model`.
 * @returns `{ provider, model }`.
 * @throws Error when no model can be resolved for the provider.
 */
export const resolveAgentSelection = (config, input) => {
    const provider = resolveProviderAlias(config, input.provider);
    const model = resolveModelForProvider(config, provider, input.model);
    if (model) {
        return { provider, model };
    }
    throw new Error(`model is required for VL-first structured artifacts; pass \`model\` or set agent.defaultModel for provider "${provider}"`);
};
|
|
122
|
+
/**
 * Chooses the render scale for a page.
 *
 * @param config Configuration; `service.defaultRenderScale` is the fallback.
 * @param requestedScale Optional caller-requested scale.
 * @returns `requestedScale` when it is a finite number greater than 0, otherwise the
 *   configured default.
 */
export const resolveRenderScale = (config, requestedScale) => {
    const isUsable = typeof requestedScale === "number"
        && Number.isFinite(requestedScale)
        && requestedScale > 0;
    return isUsable ? requestedScale : config.service.defaultRenderScale;
};
|
|
128
|
+
/**
 * Reports whether `stat` succeeds for a path.
 *
 * Any `stat` failure, not only "not found", is reported as `false`.
 *
 * @param targetPath Path to probe.
 * @returns Promise of `true` when `stat` succeeds, `false` otherwise.
 */
export const fileExists = async (targetPath) => {
    let exists = true;
    try {
        await stat(targetPath);
    }
    catch {
        exists = false;
    }
    return exists;
};
|
|
137
|
+
/**
 * Reads a UTF-8 file and parses its contents as JSON.
 *
 * @param targetPath File to read.
 * @returns Promise of the parsed value. Rejects when the file cannot be read or its
 *   contents are not valid JSON.
 */
export const readJson = async (targetPath) => JSON.parse(await readFile(targetPath, "utf-8"));
|
|
141
|
+
/**
 * Loads the stored document record, if one exists.
 *
 * @param paths Artifact paths; `documentJsonPath` is the file that is read.
 * @returns Promise of the stored record with `artifactPaths` replaced by `paths`,
 *   or `null` when the document JSON file does not exist.
 */
export const loadStoredDocument = async (paths) => {
    const recordPath = paths.documentJsonPath;
    const hasRecord = await fileExists(recordPath);
    if (!hasRecord) {
        return null;
    }
    const stored = await readJson(recordPath);
    // Paths from the caller win over whatever was persisted.
    return { ...stored, artifactPaths: paths };
};
|
|
150
|
+
/**
 * Decides whether a previously stored document record can be reused.
 *
 * The record is reusable only when its size and mtime equal the current source stats,
 * the structure JSON exists, and a page JSON file exists for every page from 1 to
 * `record.pageCount`.
 *
 * @param record Stored record (`sizeBytes`, `mtimeMs`, `pageCount` are read).
 * @param sourceStats Current `{ sizeBytes, mtimeMs }` of the source file.
 * @param paths Artifact paths (`structureJsonPath`, `pagesDir` are read).
 * @returns Promise of `true` when everything checks out, otherwise `false`.
 */
export const isReusableRecord = async (record, sourceStats, paths) => {
    const sameSize = record.sizeBytes === sourceStats.sizeBytes;
    const sameMtime = record.mtimeMs === sourceStats.mtimeMs;
    if (!(sameSize && sameMtime)) {
        return false;
    }
    const hasStructure = await fileExists(paths.structureJsonPath);
    if (!hasStructure) {
        return false;
    }
    let pageNumber = 1;
    while (pageNumber <= record.pageCount) {
        const pageFile = path.join(paths.pagesDir, `${pageLabel(pageNumber)}.json`);
        const hasPage = await fileExists(pageFile);
        if (!hasPage) {
            return false;
        }
        pageNumber += 1;
    }
    return true;
};
|
|
162
|
+
/**
 * Writes a value to disk as pretty-printed (2-space) JSON with a trailing newline,
 * creating the parent directory first if needed.
 *
 * @param targetPath Destination file path.
 * @param data Value to serialize.
 * @returns Promise that resolves once the file is written.
 */
export const writeJson = async (targetPath, data) => {
    const parentDir = path.dirname(targetPath);
    await mkdir(parentDir, { recursive: true });
    const serialized = JSON.stringify(data, null, 2);
    await writeFile(targetPath, `${serialized}\n`, "utf-8");
};
|
|
166
|
+
/**
 * Reads a file and returns its contents as a new `Uint8Array`.
 *
 * @param sourcePath File to read.
 * @returns Promise of the file's bytes.
 */
export const readSourceBytes = async (sourcePath) => {
    const contents = await readFile(sourcePath);
    return new Uint8Array(contents);
};
|
|
167
|
+
/**
 * Checks whether an artifact was generated from the same source file snapshot as the
 * given record, by strict equality of size and mtime.
 *
 * @param artifact Artifact carrying `sourceSizeBytes` and `sourceMtimeMs`.
 * @param record Document record carrying `sizeBytes` and `mtimeMs`.
 * @returns `true` only when both values match.
 */
export const matchesSourceSnapshot = (artifact, record) => {
    if (artifact.sourceSizeBytes !== record.sizeBytes) {
        return false;
    }
    return artifact.sourceMtimeMs === record.mtimeMs;
};
|
|
168
|
+
/**
 * Checks whether an artifact was produced with the given strategy key (strict equality).
 *
 * @param artifact Artifact carrying a `strategyKey`.
 * @param strategyKey Expected strategy key.
 * @returns `true` when the keys are strictly equal.
 */
export const matchesStrategyKey = (artifact, strategyKey) => {
    const storedKey = artifact.strategyKey;
    return storedKey === strategyKey;
};
|
|
169
|
+
/**
 * Validates that a page number is an integer within `1..pageCount`.
 *
 * @param pageCount Number of pages in the document.
 * @param pageNumber Page number to validate.
 * @throws Error when `pageNumber` is not an integer, is below 1, or exceeds `pageCount`.
 */
export const ensurePageNumber = (pageCount, pageNumber) => {
    const isWithinRange = Number.isInteger(pageNumber)
        && !(pageNumber < 1)
        && !(pageNumber > pageCount);
    if (isWithinRange) {
        return;
    }
    throw new Error(`pageNumber must be within 1..${pageCount}`);
};
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
import type { EchoPdfConfig } from "../pdf-types.js";
|
|
2
|
+
import type { Env } from "../types.js";
|
|
3
|
+
export interface LocalDocumentArtifactPaths {
|
|
4
|
+
readonly workspaceDir: string;
|
|
5
|
+
readonly documentDir: string;
|
|
6
|
+
readonly documentJsonPath: string;
|
|
7
|
+
readonly structureJsonPath: string;
|
|
8
|
+
readonly semanticStructureJsonPath: string;
|
|
9
|
+
readonly pagesDir: string;
|
|
10
|
+
readonly rendersDir: string;
|
|
11
|
+
}
|
|
12
|
+
export interface InternalDocumentArtifactPaths extends LocalDocumentArtifactPaths {
|
|
13
|
+
}
|
|
14
|
+
export interface LocalDocumentMetadata {
|
|
15
|
+
readonly documentId: string;
|
|
16
|
+
readonly sourcePath: string;
|
|
17
|
+
readonly filename: string;
|
|
18
|
+
readonly sizeBytes: number;
|
|
19
|
+
readonly mtimeMs: number;
|
|
20
|
+
readonly pageCount: number;
|
|
21
|
+
readonly indexedAt: string;
|
|
22
|
+
readonly cacheStatus: "fresh" | "reused";
|
|
23
|
+
readonly artifactPaths: LocalDocumentArtifactPaths;
|
|
24
|
+
}
|
|
25
|
+
export interface LocalDocumentStructureNode {
|
|
26
|
+
readonly id: string;
|
|
27
|
+
readonly type: "document" | "page";
|
|
28
|
+
readonly title: string;
|
|
29
|
+
readonly pageNumber?: number;
|
|
30
|
+
readonly preview?: string;
|
|
31
|
+
readonly artifactPath?: string;
|
|
32
|
+
readonly children?: ReadonlyArray<LocalDocumentStructureNode>;
|
|
33
|
+
}
|
|
34
|
+
export interface LocalDocumentStructure {
|
|
35
|
+
readonly documentId: string;
|
|
36
|
+
readonly generatedAt: string;
|
|
37
|
+
readonly root: LocalDocumentStructureNode;
|
|
38
|
+
}
|
|
39
|
+
export interface LocalSemanticStructureNode {
|
|
40
|
+
readonly id: string;
|
|
41
|
+
readonly type: "document" | "section";
|
|
42
|
+
readonly title: string;
|
|
43
|
+
readonly level?: number;
|
|
44
|
+
readonly pageNumber?: number;
|
|
45
|
+
readonly pageArtifactPath?: string;
|
|
46
|
+
readonly excerpt?: string;
|
|
47
|
+
readonly children?: ReadonlyArray<LocalSemanticStructureNode>;
|
|
48
|
+
}
|
|
49
|
+
export interface LocalSemanticDocumentStructure {
|
|
50
|
+
readonly documentId: string;
|
|
51
|
+
readonly generatedAt: string;
|
|
52
|
+
readonly detector: "agent-structured-v1";
|
|
53
|
+
readonly strategyKey: string;
|
|
54
|
+
readonly sourceSizeBytes: number;
|
|
55
|
+
readonly sourceMtimeMs: number;
|
|
56
|
+
readonly pageIndexArtifactPath: string;
|
|
57
|
+
readonly artifactPath: string;
|
|
58
|
+
readonly root: LocalSemanticStructureNode;
|
|
59
|
+
readonly cacheStatus: "fresh" | "reused";
|
|
60
|
+
}
|
|
61
|
+
export interface LocalPageContent {
|
|
62
|
+
readonly documentId: string;
|
|
63
|
+
readonly pageNumber: number;
|
|
64
|
+
readonly title: string;
|
|
65
|
+
readonly preview: string;
|
|
66
|
+
readonly text: string;
|
|
67
|
+
readonly chars: number;
|
|
68
|
+
readonly artifactPath: string;
|
|
69
|
+
}
|
|
70
|
+
export interface LocalPageRenderArtifact {
|
|
71
|
+
readonly documentId: string;
|
|
72
|
+
readonly pageNumber: number;
|
|
73
|
+
readonly renderScale: number;
|
|
74
|
+
readonly sourceSizeBytes: number;
|
|
75
|
+
readonly sourceMtimeMs: number;
|
|
76
|
+
readonly width: number;
|
|
77
|
+
readonly height: number;
|
|
78
|
+
readonly mimeType: "image/png";
|
|
79
|
+
readonly imagePath: string;
|
|
80
|
+
readonly artifactPath: string;
|
|
81
|
+
readonly generatedAt: string;
|
|
82
|
+
readonly cacheStatus: "fresh" | "reused";
|
|
83
|
+
}
|
|
84
|
+
export interface LocalTableArtifactItem {
|
|
85
|
+
readonly id: string;
|
|
86
|
+
readonly latexTabular: string;
|
|
87
|
+
readonly caption?: string;
|
|
88
|
+
readonly evidenceText?: string;
|
|
89
|
+
}
|
|
90
|
+
export interface LocalFormulaArtifactItem {
|
|
91
|
+
readonly id: string;
|
|
92
|
+
readonly latexMath: string;
|
|
93
|
+
readonly label?: string;
|
|
94
|
+
readonly evidenceText?: string;
|
|
95
|
+
}
|
|
96
|
+
export interface LocalPageTablesArtifact {
|
|
97
|
+
readonly documentId: string;
|
|
98
|
+
readonly pageNumber: number;
|
|
99
|
+
readonly renderScale: number;
|
|
100
|
+
readonly sourceSizeBytes: number;
|
|
101
|
+
readonly sourceMtimeMs: number;
|
|
102
|
+
readonly provider: string;
|
|
103
|
+
readonly model: string;
|
|
104
|
+
readonly prompt: string;
|
|
105
|
+
readonly imagePath: string;
|
|
106
|
+
readonly pageArtifactPath: string;
|
|
107
|
+
readonly renderArtifactPath: string;
|
|
108
|
+
readonly artifactPath: string;
|
|
109
|
+
readonly generatedAt: string;
|
|
110
|
+
readonly tables: ReadonlyArray<LocalTableArtifactItem>;
|
|
111
|
+
readonly cacheStatus: "fresh" | "reused";
|
|
112
|
+
}
|
|
113
|
+
export interface LocalPageFormulasArtifact {
|
|
114
|
+
readonly documentId: string;
|
|
115
|
+
readonly pageNumber: number;
|
|
116
|
+
readonly renderScale: number;
|
|
117
|
+
readonly sourceSizeBytes: number;
|
|
118
|
+
readonly sourceMtimeMs: number;
|
|
119
|
+
readonly provider: string;
|
|
120
|
+
readonly model: string;
|
|
121
|
+
readonly prompt: string;
|
|
122
|
+
readonly imagePath: string;
|
|
123
|
+
readonly pageArtifactPath: string;
|
|
124
|
+
readonly renderArtifactPath: string;
|
|
125
|
+
readonly artifactPath: string;
|
|
126
|
+
readonly generatedAt: string;
|
|
127
|
+
readonly formulas: ReadonlyArray<LocalFormulaArtifactItem>;
|
|
128
|
+
readonly cacheStatus: "fresh" | "reused";
|
|
129
|
+
}
|
|
130
|
+
export interface LocalDocumentRequest {
|
|
131
|
+
readonly pdfPath: string;
|
|
132
|
+
readonly workspaceDir?: string;
|
|
133
|
+
readonly forceRefresh?: boolean;
|
|
134
|
+
readonly config?: EchoPdfConfig;
|
|
135
|
+
}
|
|
136
|
+
export interface LocalPageContentRequest extends LocalDocumentRequest {
|
|
137
|
+
readonly pageNumber: number;
|
|
138
|
+
}
|
|
139
|
+
export interface LocalSemanticDocumentRequest extends LocalDocumentRequest {
|
|
140
|
+
readonly provider?: string;
|
|
141
|
+
readonly model?: string;
|
|
142
|
+
readonly semanticExtraction?: {
|
|
143
|
+
readonly pageSelection?: "all";
|
|
144
|
+
readonly chunkMaxChars?: number;
|
|
145
|
+
readonly chunkOverlapChars?: number;
|
|
146
|
+
};
|
|
147
|
+
readonly env?: Env;
|
|
148
|
+
readonly providerApiKeys?: Record<string, string>;
|
|
149
|
+
}
|
|
150
|
+
export interface LocalPageRenderRequest extends LocalPageContentRequest {
|
|
151
|
+
readonly renderScale?: number;
|
|
152
|
+
}
|
|
153
|
+
export interface LocalPageTablesRequest extends LocalPageRenderRequest {
|
|
154
|
+
readonly provider?: string;
|
|
155
|
+
readonly model?: string;
|
|
156
|
+
readonly prompt?: string;
|
|
157
|
+
readonly env?: Env;
|
|
158
|
+
readonly providerApiKeys?: Record<string, string>;
|
|
159
|
+
}
|
|
160
|
+
export interface LocalPageFormulasRequest extends LocalPageRenderRequest {
|
|
161
|
+
readonly provider?: string;
|
|
162
|
+
readonly model?: string;
|
|
163
|
+
readonly prompt?: string;
|
|
164
|
+
readonly env?: Env;
|
|
165
|
+
readonly providerApiKeys?: Record<string, string>;
|
|
166
|
+
}
|
|
167
|
+
export interface StoredDocumentRecord {
|
|
168
|
+
readonly documentId: string;
|
|
169
|
+
readonly sourcePath: string;
|
|
170
|
+
readonly filename: string;
|
|
171
|
+
readonly sizeBytes: number;
|
|
172
|
+
readonly mtimeMs: number;
|
|
173
|
+
readonly pageCount: number;
|
|
174
|
+
readonly indexedAt: string;
|
|
175
|
+
readonly artifactPaths: InternalDocumentArtifactPaths;
|
|
176
|
+
}
|
|
177
|
+
export type SemanticAgentCandidate = {
|
|
178
|
+
title: string;
|
|
179
|
+
level: number;
|
|
180
|
+
pageNumber: number;
|
|
181
|
+
excerpt?: string;
|
|
182
|
+
confidence: number;
|
|
183
|
+
};
|