@echofiles/echo-pdf 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +302 -11
- package/bin/echo-pdf.js +176 -8
- package/bin/lib/http.js +26 -1
- package/dist/agent-defaults.d.ts +3 -0
- package/dist/agent-defaults.js +18 -0
- package/dist/auth.d.ts +18 -0
- package/dist/auth.js +36 -0
- package/dist/core/index.d.ts +50 -0
- package/dist/core/index.js +7 -0
- package/dist/file-ops.d.ts +11 -0
- package/dist/file-ops.js +36 -0
- package/dist/file-store-do.d.ts +36 -0
- package/dist/file-store-do.js +298 -0
- package/dist/file-utils.d.ts +6 -0
- package/dist/file-utils.js +36 -0
- package/dist/http-error.d.ts +9 -0
- package/dist/http-error.js +14 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/local/index.d.ts +135 -0
- package/dist/local/index.js +555 -0
- package/dist/mcp-server.d.ts +3 -0
- package/dist/mcp-server.js +124 -0
- package/dist/node/pdfium-local.d.ts +8 -0
- package/dist/node/pdfium-local.js +147 -0
- package/dist/node/semantic-local.d.ts +16 -0
- package/dist/node/semantic-local.js +113 -0
- package/dist/pdf-agent.d.ts +18 -0
- package/dist/pdf-agent.js +217 -0
- package/dist/pdf-config.d.ts +4 -0
- package/dist/pdf-config.js +140 -0
- package/dist/pdf-storage.d.ts +8 -0
- package/dist/pdf-storage.js +86 -0
- package/dist/pdf-types.d.ts +83 -0
- package/dist/pdf-types.js +1 -0
- package/dist/pdfium-engine.d.ts +9 -0
- package/dist/pdfium-engine.js +180 -0
- package/dist/provider-client.d.ts +20 -0
- package/dist/provider-client.js +173 -0
- package/dist/provider-keys.d.ts +10 -0
- package/dist/provider-keys.js +27 -0
- package/dist/r2-file-store.d.ts +20 -0
- package/dist/r2-file-store.js +176 -0
- package/dist/response-schema.d.ts +15 -0
- package/dist/response-schema.js +159 -0
- package/dist/tool-registry.d.ts +16 -0
- package/dist/tool-registry.js +175 -0
- package/dist/types.d.ts +91 -0
- package/dist/types.js +1 -0
- package/dist/worker.d.ts +7 -0
- package/dist/worker.js +386 -0
- package/package.json +34 -5
- package/wrangler.toml +1 -1
- package/src/agent-defaults.ts +0 -25
- package/src/file-ops.ts +0 -50
- package/src/file-store-do.ts +0 -349
- package/src/file-utils.ts +0 -43
- package/src/http-error.ts +0 -21
- package/src/index.ts +0 -415
- package/src/mcp-server.ts +0 -171
- package/src/pdf-agent.ts +0 -252
- package/src/pdf-config.ts +0 -143
- package/src/pdf-storage.ts +0 -109
- package/src/pdf-types.ts +0 -85
- package/src/pdfium-engine.ts +0 -207
- package/src/provider-client.ts +0 -176
- package/src/provider-keys.ts +0 -44
- package/src/r2-file-store.ts +0 -195
- package/src/response-schema.ts +0 -182
- package/src/tool-registry.ts +0 -203
- package/src/types.ts +0 -40
- package/src/wasm.d.ts +0 -4
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import type { EchoPdfConfig } from "../pdf-types.js";
|
|
2
|
+
import type { Env } from "../types.js";
|
|
3
|
+
/**
 * Filesystem locations of every artifact kept for one indexed document.
 */
export interface LocalDocumentArtifactPaths {
    /** Workspace root under which all document directories live. */
    readonly workspaceDir: string;
    /** Directory that holds the artifacts of this one document. */
    readonly documentDir: string;
    /** Path of the document metadata record (`document.json`). */
    readonly documentJsonPath: string;
    /** Path of the page-index structure (`structure.json`). */
    readonly structureJsonPath: string;
    /** Path of the semantic structure artifact (`semantic-structure.json`). */
    readonly semanticStructureJsonPath: string;
    /** Directory containing one JSON artifact per page. */
    readonly pagesDir: string;
    /** Directory containing rendered page images and their JSON records. */
    readonly rendersDir: string;
    /** Directory containing OCR result artifacts. */
    readonly ocrDir: string;
}
|
|
13
|
+
/**
 * Metadata describing one indexed PDF together with the snapshot of the
 * source file it was built from.
 */
export interface LocalDocumentMetadata {
    /** Identifier of the document inside the workspace. */
    readonly documentId: string;
    /** Path of the source PDF. */
    readonly sourcePath: string;
    /** Base name of the source PDF. */
    readonly filename: string;
    /** Size of the source file, in bytes, when it was indexed. */
    readonly sizeBytes: number;
    /** Modification time of the source file (milliseconds) when it was indexed. */
    readonly mtimeMs: number;
    /** Number of pages in the document. */
    readonly pageCount: number;
    /** Timestamp string recorded when the index was generated. */
    readonly indexedAt: string;
    /** Whether this result was freshly built or served from existing artifacts. */
    readonly cacheStatus: "fresh" | "reused";
    /** Locations of the artifacts belonging to this document. */
    readonly artifactPaths: LocalDocumentArtifactPaths;
}
|
|
24
|
+
/**
 * One node of the page-index tree: either the document root or a single page.
 */
export interface LocalDocumentStructureNode {
    /** Node identifier. */
    readonly id: string;
    /** Kind of node. */
    readonly type: "document" | "page";
    /** Display title of the node. */
    readonly title: string;
    /** Page number, when the node refers to a page. */
    readonly pageNumber?: number;
    /** Short text preview, when available. */
    readonly preview?: string;
    /** Path of the JSON artifact backing this node, when available. */
    readonly artifactPath?: string;
    /** Child nodes, when present. */
    readonly children?: readonly LocalDocumentStructureNode[];
}
|
|
33
|
+
/**
 * Page-index structure of a document: a root node plus generation metadata.
 */
export interface LocalDocumentStructure {
    /** Identifier of the document this structure belongs to. */
    readonly documentId: string;
    /** Timestamp string recorded when the structure was generated. */
    readonly generatedAt: string;
    /** Root node of the tree. */
    readonly root: LocalDocumentStructureNode;
}
|
|
38
|
+
/**
 * One node of the semantic (heading/section) tree of a document.
 */
export interface LocalSemanticStructureNode {
    /** Node identifier. */
    readonly id: string;
    /** Kind of node: the document root or a section. */
    readonly type: "document" | "section";
    /** Heading or document title. */
    readonly title: string;
    /** Heading level, when the node is a section. */
    readonly level?: number;
    /** Page the section was found on, when known. */
    readonly pageNumber?: number;
    /** Path of the page artifact the section refers to, when known. */
    readonly pageArtifactPath?: string;
    /** Short evidence text for the section, when available. */
    readonly excerpt?: string;
    /** Nested sections, when present. */
    readonly children?: readonly LocalSemanticStructureNode[];
}
|
|
48
|
+
/**
 * Semantic structure artifact of a document, including the information used
 * to decide whether a stored copy can be reused.
 */
export interface LocalSemanticDocumentStructure {
    /** Identifier of the document this structure belongs to. */
    readonly documentId: string;
    /** Timestamp string recorded when the artifact was generated. */
    readonly generatedAt: string;
    /** Which detector produced the structure. */
    readonly detector: "agent-structured-v1" | "heading-heuristic-v1";
    /** Key identifying the extraction strategy that produced the artifact. */
    readonly strategyKey: string;
    /** Size of the source file, in bytes, the artifact was built from. */
    readonly sourceSizeBytes: number;
    /** Modification time of the source file the artifact was built from. */
    readonly sourceMtimeMs: number;
    /** Path of the page-index structure artifact. */
    readonly pageIndexArtifactPath: string;
    /** Path of this semantic structure artifact. */
    readonly artifactPath: string;
    /** Root node of the semantic tree. */
    readonly root: LocalSemanticStructureNode;
    /** Whether this result was freshly built or served from an existing artifact. */
    readonly cacheStatus: "fresh" | "reused";
}
|
|
60
|
+
/**
 * Extracted text content of a single page.
 */
export interface LocalPageContent {
    /** Identifier of the owning document. */
    readonly documentId: string;
    /** Page number of this page. */
    readonly pageNumber: number;
    /** Display title of the page. */
    readonly title: string;
    /** Short preview of the page text. */
    readonly preview: string;
    /** Full extracted page text. */
    readonly text: string;
    /** Character count of `text`. */
    readonly chars: number;
    /** Path of the JSON artifact storing this page. */
    readonly artifactPath: string;
}
|
|
69
|
+
/**
 * Record describing a rendered PNG image of one page.
 */
export interface LocalPageRenderArtifact {
    /** Identifier of the owning document. */
    readonly documentId: string;
    /** Page number that was rendered. */
    readonly pageNumber: number;
    /** Scale factor used for rendering. */
    readonly renderScale: number;
    /** Size of the source file, in bytes, the render was built from. */
    readonly sourceSizeBytes: number;
    /** Modification time of the source file the render was built from. */
    readonly sourceMtimeMs: number;
    /** Width of the rendered image. */
    readonly width: number;
    /** Height of the rendered image. */
    readonly height: number;
    /** MIME type of the image; always PNG. */
    readonly mimeType: "image/png";
    /** Path of the rendered image file. */
    readonly imagePath: string;
    /** Path of the JSON record describing the render. */
    readonly artifactPath: string;
    /** Timestamp string recorded when the render was generated. */
    readonly generatedAt: string;
    /** Whether this result was freshly built or served from an existing artifact. */
    readonly cacheStatus: "fresh" | "reused";
}
|
|
83
|
+
/**
 * Record describing the OCR result for one rendered page.
 */
export interface LocalPageOcrArtifact {
    /** Identifier of the owning document. */
    readonly documentId: string;
    /** Page number that was recognized. */
    readonly pageNumber: number;
    /** Scale factor of the render that was recognized. */
    readonly renderScale: number;
    /** Size of the source file, in bytes, the result was built from. */
    readonly sourceSizeBytes: number;
    /** Modification time of the source file the result was built from. */
    readonly sourceMtimeMs: number;
    /** Provider used for recognition. */
    readonly provider: string;
    /** Model used for recognition. */
    readonly model: string;
    /** Prompt sent with the image. */
    readonly prompt: string;
    /** Recognized text. */
    readonly text: string;
    /** Character count of `text`. */
    readonly chars: number;
    /** Path of the image that was recognized. */
    readonly imagePath: string;
    /** Path of the JSON record of the underlying render. */
    readonly renderArtifactPath: string;
    /** Path of the JSON record storing this OCR result. */
    readonly artifactPath: string;
    /** Timestamp string recorded when the result was generated. */
    readonly generatedAt: string;
    /** Whether this result was freshly built or served from an existing artifact. */
    readonly cacheStatus: "fresh" | "reused";
}
|
|
100
|
+
/**
 * Options shared by every local document operation.
 */
export interface LocalDocumentRequest {
    /** Path of the PDF to operate on. */
    readonly pdfPath: string;
    /** Directory in which artifacts are stored; optional. */
    readonly workspaceDir?: string;
    /** When true, existing artifacts are not reused. */
    readonly forceRefresh?: boolean;
    /** Configuration to use instead of the default one; optional. */
    readonly config?: EchoPdfConfig;
}
|
|
106
|
+
/**
 * Request addressing a single page of a document.
 */
export interface LocalPageContentRequest extends LocalDocumentRequest {
    /** Page number of the requested page. */
    readonly pageNumber: number;
}
|
|
109
|
+
/**
 * Request for the semantic (heading/section) structure of a document.
 */
export interface LocalSemanticDocumentRequest extends LocalDocumentRequest {
    /** Provider to use for model-based extraction; optional. */
    readonly provider?: string;
    /** Model to use for model-based extraction; optional. */
    readonly model?: string;
    /** Tuning of how page text is selected and chunked; optional. */
    readonly semanticExtraction?: {
        /** Which pages to process; only `"all"` is accepted. */
        readonly pageSelection?: "all";
        /** Maximum number of characters per text chunk. */
        readonly chunkMaxChars?: number;
        /** Number of characters shared between consecutive chunks. */
        readonly chunkOverlapChars?: number;
    };
    /** Environment bindings to use; optional. */
    readonly env?: Env;
    /** API keys supplied at call time, keyed by string; optional. */
    readonly providerApiKeys?: Record<string, string>;
}
|
|
120
|
+
/**
 * Request for a rendered image of a single page.
 */
export interface LocalPageRenderRequest extends LocalPageContentRequest {
    /** Scale factor for rendering; optional. */
    readonly renderScale?: number;
}
|
|
123
|
+
/**
 * Request for OCR of a single rendered page.
 */
export interface LocalPageOcrRequest extends LocalPageRenderRequest {
    /** Provider to use for recognition; optional. */
    readonly provider?: string;
    /** Model to use for recognition; optional. */
    readonly model?: string;
    /** Prompt to send with the image; optional. */
    readonly prompt?: string;
    /** Environment bindings to use; optional. */
    readonly env?: Env;
    /** API keys supplied at call time, keyed by string; optional. */
    readonly providerApiKeys?: Record<string, string>;
}
|
|
130
|
+
/** Resolves the metadata of the requested PDF document. */
export declare const get_document: (request: LocalDocumentRequest) => Promise<LocalDocumentMetadata>;
|
|
131
|
+
/** Resolves the page-index structure of the requested PDF document. */
export declare const get_document_structure: (request: LocalDocumentRequest) => Promise<LocalDocumentStructure>;
|
|
132
|
+
/** Resolves the semantic (heading/section) structure of the requested PDF document. */
export declare const get_semantic_document_structure: (request: LocalSemanticDocumentRequest) => Promise<LocalSemanticDocumentStructure>;
|
|
133
|
+
/** Resolves the text content of one page of the requested PDF document. */
export declare const get_page_content: (request: LocalPageContentRequest) => Promise<LocalPageContent>;
|
|
134
|
+
/** Resolves the PNG render record of one page of the requested PDF document. */
export declare const get_page_render: (request: LocalPageRenderRequest) => Promise<LocalPageRenderArtifact>;
|
|
135
|
+
/** Resolves the OCR result record of one page of the requested PDF document. */
export declare const get_page_ocr: (request: LocalPageOcrRequest) => Promise<LocalPageOcrArtifact>;
|
|
@@ -0,0 +1,555 @@
|
|
|
1
|
+
/// <reference path="../node/compat.d.ts" />
|
|
2
|
+
import { createHash } from "node:crypto";
|
|
3
|
+
import { mkdir, readFile, stat, writeFile } from "node:fs/promises";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { resolveModelForProvider, resolveProviderAlias } from "../agent-defaults.js";
|
|
6
|
+
import { toDataUrl } from "../file-utils.js";
|
|
7
|
+
import { loadEchoPdfConfig } from "../pdf-config.js";
|
|
8
|
+
import { generateText, visionRecognize } from "../provider-client.js";
|
|
9
|
+
import { extractLocalPdfPageText, getLocalPdfPageCount, renderLocalPdfPageToPng } from "../node/pdfium-local.js";
|
|
10
|
+
import { buildSemanticSectionTree } from "../node/semantic-local.js";
|
|
11
|
+
/** Returns the default workspace: `.echo-pdf-workspace` under the current working directory. */
const defaultWorkspaceDir = () => {
    const cwd = process.cwd();
    return path.resolve(cwd, ".echo-pdf-workspace");
};
|
|
12
|
+
/**
 * Resolves the workspace directory to an absolute path.
 * A missing or whitespace-only value falls back to the default workspace.
 */
const resolveWorkspaceDir = (workspaceDir) => {
    const requested = workspaceDir?.trim();
    const chosen = requested || defaultWorkspaceDir();
    return path.resolve(process.cwd(), chosen);
};
|
|
13
|
+
/** Derives a document id: the first 16 hex characters of the SHA-256 of the given path. */
const toDocumentId = (absolutePdfPath) => {
    const digest = createHash("sha256").update(absolutePdfPath).digest("hex");
    return digest.slice(0, 16);
};
|
|
14
|
+
/** Returns the first `length` (default 12) hex characters of the SHA-256 of `value`. */
const hashFragment = (value, length = 12) => {
    const hex = createHash("sha256").update(value).digest("hex");
    return hex.slice(0, length);
};
|
|
15
|
+
/** Replaces every run of characters outside `a-z A-Z 0-9 . _ -` with a single underscore. */
const sanitizeSegment = (value) => {
    const unsafeRun = /[^a-zA-Z0-9._-]+/g;
    return value.replace(unsafeRun, "_");
};
|
|
16
|
+
const scaleLabel = (value) => sanitizeSegment(String(value));
|
|
17
|
+
/** Formats a page number as text, left-padded with zeros to at least four characters. */
const pageLabel = (pageNumber) => {
    const digits = String(pageNumber);
    return digits.padStart(4, "0");
};
|
|
18
|
+
/**
 * Lays out the artifact locations of one document inside a workspace.
 * Everything lives under `<workspaceDir>/documents/<documentId>`.
 */
const buildArtifactPaths = (workspaceDir, documentId) => {
    const documentDir = path.join(workspaceDir, "documents", documentId);
    // Small helper so each entry only names its own file or directory.
    const inDocumentDir = (entry) => path.join(documentDir, entry);
    return {
        workspaceDir,
        documentDir,
        documentJsonPath: inDocumentDir("document.json"),
        structureJsonPath: inDocumentDir("structure.json"),
        semanticStructureJsonPath: inDocumentDir("semantic-structure.json"),
        pagesDir: inDocumentDir("pages"),
        rendersDir: inDocumentDir("renders"),
        ocrDir: inDocumentDir("ocr"),
    };
};
|
|
31
|
+
const buildRenderArtifactPaths = (paths, pageNumber, renderScale) => {
|
|
32
|
+
const key = `${pageLabel(pageNumber)}.scale-${scaleLabel(renderScale)}`;
|
|
33
|
+
return {
|
|
34
|
+
artifactPath: path.join(paths.rendersDir, `${key}.json`),
|
|
35
|
+
imagePath: path.join(paths.rendersDir, `${key}.png`),
|
|
36
|
+
};
|
|
37
|
+
};
|
|
38
|
+
const buildOcrArtifactPath = (paths, pageNumber, renderScale, provider, model, prompt) => {
|
|
39
|
+
const key = [
|
|
40
|
+
pageLabel(pageNumber),
|
|
41
|
+
`scale-${scaleLabel(renderScale)}`,
|
|
42
|
+
`provider-${sanitizeSegment(provider)}`,
|
|
43
|
+
`model-${sanitizeSegment(model)}`,
|
|
44
|
+
`prompt-${hashFragment(prompt, 10)}`,
|
|
45
|
+
].join(".");
|
|
46
|
+
return path.join(paths.ocrDir, `${key}.json`);
|
|
47
|
+
};
|
|
48
|
+
/** Builds a one-line preview: whitespace runs collapsed to single spaces, trimmed, capped at 160 characters. */
const createPreview = (text) => {
    const singleLine = text.replace(/\s+/g, " ").trim();
    return singleLine.slice(0, 160);
};
|
|
49
|
+
/**
 * Builds a page title from the first non-blank line of the page text
 * (trimmed, capped at 80 characters). Falls back to `Page N` when the
 * text has no non-blank line.
 */
const createPageTitle = (pageNumber, text) => {
    for (const rawLine of text.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line.length > 0) {
            return `Page ${pageNumber}: ${line.slice(0, 80)}`;
        }
    }
    return `Page ${pageNumber}`;
};
|
|
56
|
+
/**
 * Removes a single Markdown code fence wrapping the whole value.
 *
 * The input is trimmed first. If it starts with ``` (optionally followed by
 * a language tag) and ends with ```, the fenced content is returned trimmed;
 * otherwise the trimmed input is returned unchanged.
 *
 * Both LF and CRLF line endings are accepted, and a fence with an empty body
 * yields an empty string rather than the literal fence markers.
 *
 * @param value Text that may be wrapped in a code fence.
 * @returns The unwrapped, trimmed text.
 */
const stripCodeFences = (value) => {
    const text = value.trim();
    // Opening fence + optional language tag, then the body up to a closing
    // fence that ends the text. `\r?` tolerates Windows line endings.
    const fenced = text.match(/^```[a-zA-Z0-9_-]*[ \t]*\r?\n([\s\S]*?)\r?\n?```$/);
    return typeof fenced?.[1] === "string" ? fenced[1].trim() : text;
};
|
|
61
|
+
const parseJsonObject = (value) => {
|
|
62
|
+
const trimmed = stripCodeFences(value).trim();
|
|
63
|
+
if (!trimmed)
|
|
64
|
+
return null;
|
|
65
|
+
try {
|
|
66
|
+
return JSON.parse(trimmed);
|
|
67
|
+
}
|
|
68
|
+
catch {
|
|
69
|
+
const start = trimmed.indexOf("{");
|
|
70
|
+
const end = trimmed.lastIndexOf("}");
|
|
71
|
+
if (start >= 0 && end > start) {
|
|
72
|
+
return JSON.parse(trimmed.slice(start, end + 1));
|
|
73
|
+
}
|
|
74
|
+
throw new Error("semantic structure model output was not valid JSON");
|
|
75
|
+
}
|
|
76
|
+
};
|
|
77
|
+
const resolveConfig = (config) => config ?? loadEchoPdfConfig({});
|
|
78
|
+
/** Returns the given environment, or `process.env` when it is null or undefined. */
const resolveEnv = (env) => {
    if (env === null || env === undefined) {
        return process.env;
    }
    return env;
};
|
|
79
|
+
/**
 * Reports whether `stat` succeeds for the path.
 * Any `stat` failure is reported as `false`.
 */
const fileExists = async (targetPath) => stat(targetPath).then(() => true, () => false);
|
|
88
|
+
/** Reads a UTF-8 file and parses its contents as JSON. Read and parse errors propagate to the caller. */
const readJson = async (targetPath) => JSON.parse(await readFile(targetPath, "utf-8"));
|
|
92
|
+
const loadStoredDocument = async (paths) => {
|
|
93
|
+
if (!await fileExists(paths.documentJsonPath))
|
|
94
|
+
return null;
|
|
95
|
+
const raw = await readJson(paths.documentJsonPath);
|
|
96
|
+
return {
|
|
97
|
+
...raw,
|
|
98
|
+
artifactPaths: paths,
|
|
99
|
+
};
|
|
100
|
+
};
|
|
101
|
+
const isReusableRecord = async (record, sourceStats, paths) => {
|
|
102
|
+
if (record.sizeBytes !== sourceStats.sizeBytes || record.mtimeMs !== sourceStats.mtimeMs)
|
|
103
|
+
return false;
|
|
104
|
+
if (!await fileExists(paths.structureJsonPath))
|
|
105
|
+
return false;
|
|
106
|
+
for (let pageNumber = 1; pageNumber <= record.pageCount; pageNumber += 1) {
|
|
107
|
+
const pagePath = path.join(paths.pagesDir, `${pageLabel(pageNumber)}.json`);
|
|
108
|
+
if (!await fileExists(pagePath))
|
|
109
|
+
return false;
|
|
110
|
+
}
|
|
111
|
+
return true;
|
|
112
|
+
};
|
|
113
|
+
/**
 * Writes `data` as JSON (2-space indent, trailing newline) in UTF-8,
 * creating the parent directory first when needed.
 */
const writeJson = async (targetPath, data) => {
    const parentDir = path.dirname(targetPath);
    await mkdir(parentDir, { recursive: true });
    const serialized = JSON.stringify(data, null, 2);
    await writeFile(targetPath, `${serialized}\n`, "utf-8");
};
|
|
117
|
+
/** Reads the whole file and returns its contents as a new `Uint8Array`. */
const readSourceBytes = async (sourcePath) => {
    const contents = await readFile(sourcePath);
    return new Uint8Array(contents);
};
|
|
118
|
+
/** True when the artifact was built from the same source size and modification time as the record. */
const matchesSourceSnapshot = (artifact, record) => {
    if (artifact.sourceSizeBytes !== record.sizeBytes) {
        return false;
    }
    return artifact.sourceMtimeMs === record.mtimeMs;
};
|
|
119
|
+
/** True when the artifact was produced with exactly the given strategy key. */
const matchesStrategyKey = (artifact, strategyKey) => {
    const storedKey = artifact.strategyKey;
    return storedKey === strategyKey;
};
|
|
120
|
+
/**
 * Normalizes the semantic extraction settings.
 *
 * - `pageSelection` is always `"all"`.
 * - `chunkMaxChars` is floored when it is a finite number above 400; otherwise 4000.
 * - `chunkOverlapChars` is floored when it is a finite number of at least 0; otherwise 300.
 */
const resolveSemanticExtractionBudget = (input) => {
    // Shared rule: accept a finite number that passes `accept`, floored; else use the fallback.
    const flooredOr = (candidate, accept, fallback) => typeof candidate === "number" && Number.isFinite(candidate) && accept(candidate)
        ? Math.floor(candidate)
        : fallback;
    return {
        pageSelection: "all",
        chunkMaxChars: flooredOr(input?.chunkMaxChars, (n) => n > 400, 4000),
        chunkOverlapChars: flooredOr(input?.chunkOverlapChars, (n) => n >= 0, 300),
    };
};
|
|
129
|
+
const splitSemanticTextIntoChunks = (text, budget) => {
|
|
130
|
+
const normalized = text.trim();
|
|
131
|
+
if (!normalized)
|
|
132
|
+
return [];
|
|
133
|
+
if (normalized.length <= budget.chunkMaxChars)
|
|
134
|
+
return [normalized];
|
|
135
|
+
const chunks = [];
|
|
136
|
+
let start = 0;
|
|
137
|
+
while (start < normalized.length) {
|
|
138
|
+
const idealEnd = Math.min(normalized.length, start + budget.chunkMaxChars);
|
|
139
|
+
let end = idealEnd;
|
|
140
|
+
if (idealEnd < normalized.length) {
|
|
141
|
+
const newlineBreak = normalized.lastIndexOf("\n", idealEnd);
|
|
142
|
+
const sentenceBreak = normalized.lastIndexOf("。", idealEnd);
|
|
143
|
+
const whitespaceBreak = normalized.lastIndexOf(" ", idealEnd);
|
|
144
|
+
end = Math.max(newlineBreak, sentenceBreak, whitespaceBreak, start + Math.floor(budget.chunkMaxChars * 0.7));
|
|
145
|
+
if (end <= start)
|
|
146
|
+
end = idealEnd;
|
|
147
|
+
}
|
|
148
|
+
const chunk = normalized.slice(start, end).trim();
|
|
149
|
+
if (chunk)
|
|
150
|
+
chunks.push(chunk);
|
|
151
|
+
if (end >= normalized.length)
|
|
152
|
+
break;
|
|
153
|
+
start = Math.max(end - budget.chunkOverlapChars, start + 1);
|
|
154
|
+
}
|
|
155
|
+
return chunks;
|
|
156
|
+
};
|
|
157
|
+
/**
 * Converts untrusted, model-produced section data into semantic tree nodes.
 *
 * Entries are kept only when they are objects with a non-blank string
 * `title` and positive integer `level` and `pageNumber`. All other entries,
 * including `null` and primitives, are skipped so that one malformed entry
 * does not invalidate the rest of the tree. Children are converted
 * recursively.
 *
 * @param value Candidate array of sections; any non-array yields `[]`.
 * @param pageArtifactPaths Map from page number to that page's artifact path.
 * @returns The valid section nodes, in input order.
 */
const toSemanticTree = (value, pageArtifactPaths) => {
    if (!Array.isArray(value))
        return [];
    const nodes = [];
    value.forEach((item, index) => {
        // Reading properties of null/undefined would throw; skip such entries.
        if (item === null || typeof item !== "object")
            return;
        const node = item;
        const title = typeof node.title === "string" ? node.title.trim() : "";
        const level = typeof node.level === "number" && Number.isInteger(node.level) && node.level > 0 ? node.level : undefined;
        const pageNumber = typeof node.pageNumber === "number" && Number.isInteger(node.pageNumber) && node.pageNumber > 0 ? node.pageNumber : undefined;
        if (!title || typeof level !== "number" || typeof pageNumber !== "number")
            return;
        nodes.push({
            // The id uses the position in the input array, so skipped entries leave gaps.
            id: `semantic-node-${index + 1}-${pageNumber}-${level}`,
            type: "section",
            title,
            level,
            pageNumber,
            pageArtifactPath: pageArtifactPaths.get(pageNumber),
            excerpt: typeof node.excerpt === "string" ? node.excerpt.trim() : undefined,
            children: toSemanticTree(node.children, pageArtifactPaths),
        });
    });
    return nodes;
};
|
|
181
|
+
/**
 * Builds the prompt asking a model for heading/section candidates in one
 * text chunk.
 *
 * The prompt holds the task, a JSON schema example, the extraction rules,
 * the page number and chunk index, a blank line, and finally the chunk text.
 * The schema example is valid JSON: the model is told to return JSON only,
 * so an example with a trailing comma would invite unparseable answers.
 *
 * @param pageNumber Page the chunk comes from.
 * @param chunkIndex Index of the chunk, as it should appear in the prompt.
 * @param chunkText Text of the chunk.
 * @returns The prompt text.
 */
const buildSemanticPrompt = (pageNumber, chunkIndex, chunkText) => {
    return [
        "You extract heading/section candidates from one document text segment.",
        "Return JSON only.",
        "Schema:",
        "{",
        ' "candidates": [',
        " {",
        ' "title": "string",',
        ' "level": 1,',
        ' "excerpt": "short evidence string"',
        " }",
        " ]",
        "}",
        "Rules:",
        "- Use only headings/sections that are clearly supported by the text segment.",
        "- Prefer conservative extraction over guessing.",
        "- Do not include page index entries, table rows, figure labels, or prose sentences.",
        "- Do not infer hierarchy beyond the explicit heading numbering or structure visible in the segment.",
        "- If no reliable semantic structure is detectable, return {\"candidates\":[]}.",
        `Page number: ${pageNumber}`,
        `Chunk index: ${chunkIndex}`,
        "",
        chunkText,
    ].join("\n");
};
|
|
207
|
+
/**
 * Builds the prompt asking a model to assemble heading candidates into a
 * nested section structure.
 *
 * The prompt holds the task, a JSON schema example, the assembly rules, the
 * document filename and page count, a blank line, and finally the candidates
 * serialized as pretty-printed JSON under a `candidates` key.
 */
const buildSemanticAggregationPrompt = (record, candidates) => {
    const candidateDump = JSON.stringify({ candidates }, null, 2);
    const task = [
        "You assemble semantic document structure from heading candidates.",
        "Return JSON only.",
    ];
    const schema = [
        "Schema:",
        "{",
        ' "sections": [',
        " {",
        ' "title": "string",',
        ' "level": 1,',
        ' "pageNumber": 1,',
        ' "excerpt": "short evidence string",',
        ' "children": []',
        " }",
        " ]",
        "}",
    ];
    const rules = [
        "Rules:",
        "- Preserve hierarchy with nested children.",
        "- Use only candidate headings that form a reliable document structure.",
        "- Deduplicate repeated headings from overlapping segments.",
        "- Do not invent sections not present in the candidates.",
        "- If no reliable semantic structure is detectable, return {\"sections\":[]}.",
    ];
    const documentContext = [
        `Document filename: ${record.filename}`,
        `Page count: ${record.pageCount}`,
        "",
    ];
    return [...task, ...schema, ...rules, ...documentContext, candidateDump].join("\n");
};
|
|
236
|
+
/**
 * Assembles the semantic structure artifact for the heading-heuristic
 * detector. The artifact records the source snapshot of `record`, is stamped
 * with the current time, and wraps `sections` under a document root node
 * titled with the file name.
 */
const buildHeuristicSemanticArtifact = (record, artifactPath, sections) => {
    const { documentId, filename } = record;
    return {
        documentId,
        generatedAt: new Date().toISOString(),
        detector: "heading-heuristic-v1",
        strategyKey: "heuristic::heading-heuristic-v1",
        sourceSizeBytes: record.sizeBytes,
        sourceMtimeMs: record.mtimeMs,
        pageIndexArtifactPath: record.artifactPaths.structureJsonPath,
        artifactPath,
        root: {
            id: `semantic-${documentId}`,
            type: "document",
            title: filename,
            children: sections,
        },
    };
};
|
|
252
|
+
/**
 * Produces the semantic (heading/section) structure of a document, reusing
 * the stored artifact when it is still valid.
 *
 * Steps:
 * 1. Index the document and work out the extraction strategy. When a model
 *    can be resolved the strategy is model-based; otherwise it is the heading
 *    heuristic.
 * 2. Unless `forceRefresh` is set, return the stored artifact if it matches
 *    both the current source snapshot and the strategy key.
 * 3. Model-based: ask the model for heading candidates per text chunk of
 *    every page, de-duplicate them by page, level and title, then ask the
 *    model to assemble them into a tree. Any failure on this path falls back
 *    to the heuristic.
 * 4. Write the artifact and return it marked as fresh.
 *
 * When no candidates were collected the aggregation request is skipped and
 * the section list is empty. The aggregation prompt forbids inventing
 * sections, so a request with no candidates has nothing to assemble.
 *
 * @param request Semantic document request.
 * @returns The semantic structure with `cacheStatus` of "fresh" or "reused".
 */
const ensureSemanticStructureArtifact = async (request) => {
    const config = resolveConfig(request.config);
    const env = resolveEnv(request.env);
    const { record } = await indexDocumentInternal(request);
    const artifactPath = record.artifactPaths.semanticStructureJsonPath;
    const semanticBudget = resolveSemanticExtractionBudget(request.semanticExtraction);
    let provider = "";
    let model = "";
    try {
        provider = resolveProviderAlias(config, request.provider);
        model = provider ? resolveModelForProvider(config, provider) : "";
    }
    catch {
        // No usable provider/model: an empty model selects the heuristic below.
        provider = "";
        model = "";
    }
    if (provider) {
        model = resolveModelForProvider(config, provider, request.model);
    }
    // The key captures everything that influences the output, so a stored
    // artifact made with different settings is not reused.
    const strategyKey = model
        ? `agent::agent-structured-v1::${provider}::${model}::${semanticBudget.pageSelection}::${semanticBudget.chunkMaxChars}::${semanticBudget.chunkOverlapChars}`
        : "heuristic::heading-heuristic-v1";
    if (!request.forceRefresh && await fileExists(artifactPath)) {
        const cached = await readJson(artifactPath);
        if (matchesSourceSnapshot(cached, record) && matchesStrategyKey(cached, strategyKey)) {
            return {
                ...cached,
                cacheStatus: "reused",
            };
        }
    }
    const pages = [];
    for (let pageNumber = 1; pageNumber <= record.pageCount; pageNumber += 1) {
        const pagePath = path.join(record.artifactPaths.pagesDir, `${pageLabel(pageNumber)}.json`);
        const page = await readJson(pagePath);
        pages.push(page);
    }
    const pageArtifactPaths = new Map(pages.map((page) => [page.pageNumber, page.artifactPath]));
    let artifact;
    if (model) {
        try {
            // Keyed by page, level and title so overlapping chunks do not produce duplicates.
            const candidateMap = new Map();
            for (const page of pages) {
                const chunks = splitSemanticTextIntoChunks(page.text, semanticBudget);
                for (const [chunkIndex, chunkText] of chunks.entries()) {
                    const response = await generateText({
                        config,
                        env,
                        providerAlias: provider,
                        model,
                        prompt: buildSemanticPrompt(page.pageNumber, chunkIndex + 1, chunkText),
                        runtimeApiKeys: request.providerApiKeys,
                    });
                    const parsed = parseJsonObject(response);
                    for (const candidate of Array.isArray(parsed?.candidates) ? parsed.candidates : []) {
                        const title = typeof candidate?.title === "string" ? candidate.title.trim() : "";
                        const level = typeof candidate?.level === "number" && Number.isInteger(candidate.level) && candidate.level > 0 ? candidate.level : 0;
                        if (!title || level <= 0)
                            continue;
                        const key = `${page.pageNumber}:${level}:${title}`;
                        if (!candidateMap.has(key)) {
                            candidateMap.set(key, {
                                title,
                                level,
                                pageNumber: page.pageNumber,
                                excerpt: typeof candidate?.excerpt === "string" ? candidate.excerpt.trim() : undefined,
                            });
                        }
                    }
                }
            }
            const candidates = [...candidateMap.values()];
            let sections = [];
            if (candidates.length > 0) {
                const aggregated = await generateText({
                    config,
                    env,
                    providerAlias: provider,
                    model,
                    prompt: buildSemanticAggregationPrompt(record, candidates),
                    runtimeApiKeys: request.providerApiKeys,
                });
                const parsed = parseJsonObject(aggregated);
                sections = toSemanticTree(parsed?.sections, pageArtifactPaths);
            }
            artifact = {
                documentId: record.documentId,
                generatedAt: new Date().toISOString(),
                detector: "agent-structured-v1",
                strategyKey,
                sourceSizeBytes: record.sizeBytes,
                sourceMtimeMs: record.mtimeMs,
                pageIndexArtifactPath: record.artifactPaths.structureJsonPath,
                artifactPath,
                root: {
                    id: `semantic-${record.documentId}`,
                    type: "document",
                    title: record.filename,
                    children: sections,
                },
            };
        }
        catch {
            // Best effort: any model or parsing failure falls back to the heuristic.
            artifact = buildHeuristicSemanticArtifact(record, artifactPath, buildSemanticSectionTree(pages));
        }
    }
    else {
        artifact = buildHeuristicSemanticArtifact(record, artifactPath, buildSemanticSectionTree(pages));
    }
    await writeJson(artifactPath, artifact);
    return {
        ...artifact,
        cacheStatus: "fresh",
    };
};
|
|
363
|
+
/**
 * Validates a page number.
 *
 * @throws {Error} When `pageNumber` is not an integer, is below 1, or is
 *   above `pageCount`.
 */
const ensurePageNumber = (pageCount, pageNumber) => {
    const outOfRange = !Number.isInteger(pageNumber) || pageNumber < 1 || pageNumber > pageCount;
    if (!outOfRange) {
        return;
    }
    throw new Error(`pageNumber must be within 1..${pageCount}`);
};
|
|
368
|
+
/**
 * Picks the render scale: the requested one when it is a finite number
 * above zero, otherwise `config.service.defaultRenderScale`.
 */
const resolveRenderScale = (config, requestedScale) => {
    const usable = typeof requestedScale === "number" && Number.isFinite(requestedScale) && requestedScale > 0;
    return usable ? requestedScale : config.service.defaultRenderScale;
};
|
|
374
|
+
/**
 * Indexes a PDF into the workspace, or reuses an existing index.
 *
 * The stored record is reused when `forceRefresh` is not set, a record
 * exists, and it passes `isReusableRecord` against the current size and
 * modification time of the source file. Otherwise every page's text is
 * extracted and written as a page artifact, followed by the page-index
 * structure and finally the document record.
 *
 * @param request Document request; `pdfPath` is resolved against the current
 *   working directory.
 * @returns `{ record, reused }`, where `reused` tells whether the stored
 *   record was used as is.
 */
const indexDocumentInternal = async (request) => {
    const config = resolveConfig(request.config);
    const sourcePath = path.resolve(process.cwd(), request.pdfPath);
    const workspaceDir = resolveWorkspaceDir(request.workspaceDir);
    const documentId = toDocumentId(sourcePath);
    const artifactPaths = buildArtifactPaths(workspaceDir, documentId);
    const { size: sizeBytes, mtimeMs } = await stat(sourcePath);
    const stored = await loadStoredDocument(artifactPaths);
    const canReuse = !request.forceRefresh && stored && await isReusableRecord(stored, { sizeBytes, mtimeMs }, artifactPaths);
    if (canReuse) {
        return { record: stored, reused: true };
    }
    await mkdir(artifactPaths.pagesDir, { recursive: true });
    const bytes = await readSourceBytes(sourcePath);
    const pageCount = await getLocalPdfPageCount(config, bytes);
    const pageNodes = [];
    for (let pageIndex = 0; pageIndex < pageCount; pageIndex += 1) {
        // Artifacts use 1-based page numbers; the extractor takes a 0-based index.
        const pageNumber = pageIndex + 1;
        const text = await extractLocalPdfPageText(config, bytes, pageIndex);
        const preview = createPreview(text);
        const title = createPageTitle(pageNumber, text);
        const artifactPath = path.join(artifactPaths.pagesDir, `${pageLabel(pageNumber)}.json`);
        await writeJson(artifactPath, {
            documentId,
            pageNumber,
            title,
            preview,
            text,
            chars: text.length,
            artifactPath,
        });
        pageNodes.push({
            id: `page-${pageNumber}`,
            type: "page",
            title,
            pageNumber,
            preview,
            artifactPath,
        });
    }
    const filename = path.basename(sourcePath);
    const generatedAt = new Date().toISOString();
    await writeJson(artifactPaths.structureJsonPath, {
        documentId,
        generatedAt,
        root: {
            id: documentId,
            type: "document",
            title: filename,
            children: pageNodes,
        },
    });
    // The document record is written last, after all artifacts it describes.
    const documentRecord = {
        documentId,
        sourcePath,
        filename,
        sizeBytes,
        mtimeMs,
        pageCount,
        indexedAt: generatedAt,
        artifactPaths,
    };
    await writeJson(artifactPaths.documentJsonPath, documentRecord);
    return { record: documentRecord, reused: false };
};
|
|
441
|
+
/** Returns a copy of the record with `cacheStatus` set to the given value. */
const toMetadata = (record, cacheStatus) => {
    const metadata = { ...record, cacheStatus };
    return metadata;
};
|
|
445
|
+
/**
 * Produces the PNG render of one page, reusing a stored render when possible.
 *
 * A stored render is reused when `forceRefresh` is not set, both its JSON
 * record and its image file exist, and the record matches the current source
 * snapshot. Otherwise the page is rendered again and both files are written.
 *
 * @param request Page render request.
 * @returns The render record with `cacheStatus` of "fresh" or "reused".
 * @throws {Error} When the page number is outside the document.
 */
const ensureRenderArtifact = async (request) => {
    const config = resolveConfig(request.config);
    const { record } = await indexDocumentInternal(request);
    ensurePageNumber(record.pageCount, request.pageNumber);
    const renderScale = resolveRenderScale(config, request.renderScale);
    const { artifactPath, imagePath } = buildRenderArtifactPaths(record.artifactPaths, request.pageNumber, renderScale);
    const mayReuse = !request.forceRefresh && await fileExists(artifactPath) && await fileExists(imagePath);
    if (mayReuse) {
        const cached = await readJson(artifactPath);
        if (matchesSourceSnapshot(cached, record)) {
            return { ...cached, cacheStatus: "reused" };
        }
    }
    const sourceBytes = await readSourceBytes(record.sourcePath);
    // The renderer takes a 0-based page index.
    const rendered = await renderLocalPdfPageToPng(config, sourceBytes, request.pageNumber - 1, renderScale);
    await mkdir(path.dirname(imagePath), { recursive: true });
    await writeFile(imagePath, rendered.png);
    const artifact = {
        documentId: record.documentId,
        pageNumber: request.pageNumber,
        renderScale,
        sourceSizeBytes: record.sizeBytes,
        sourceMtimeMs: record.mtimeMs,
        width: rendered.width,
        height: rendered.height,
        mimeType: "image/png",
        imagePath,
        artifactPath,
        generatedAt: new Date().toISOString(),
    };
    await writeJson(artifactPath, artifact);
    return { ...artifact, cacheStatus: "fresh" };
};
|
|
483
|
+
export const get_document = async (request) => {
|
|
484
|
+
const { record, reused } = await indexDocumentInternal(request);
|
|
485
|
+
return toMetadata(record, reused ? "reused" : "fresh");
|
|
486
|
+
};
|
|
487
|
+
export const get_document_structure = async (request) => {
|
|
488
|
+
const { record } = await indexDocumentInternal(request);
|
|
489
|
+
return readJson(record.artifactPaths.structureJsonPath);
|
|
490
|
+
};
|
|
491
|
+
export const get_semantic_document_structure = async (request) => ensureSemanticStructureArtifact(request);
|
|
492
|
+
export const get_page_content = async (request) => {
|
|
493
|
+
const { record } = await indexDocumentInternal(request);
|
|
494
|
+
ensurePageNumber(record.pageCount, request.pageNumber);
|
|
495
|
+
const pagePath = path.join(record.artifactPaths.pagesDir, `${pageLabel(request.pageNumber)}.json`);
|
|
496
|
+
return readJson(pagePath);
|
|
497
|
+
};
|
|
498
|
+
export const get_page_render = async (request) => ensureRenderArtifact(request);
|
|
499
|
+
/**
 * Produces the OCR result of one page, reusing a stored result when possible.
 *
 * Steps:
 * 1. Render the page (or reuse its render). That call indexes the document,
 *    honours `forceRefresh`, and validates the page number.
 * 2. Resolve provider, model and prompt, which determine the result's path.
 * 3. Unless `forceRefresh` is set, return the stored result if it matches the
 *    current source snapshot.
 * 4. Send the rendered image to the vision model. If the model returns
 *    nothing, use the page's extracted text instead.
 * 5. Write the result and return it marked as fresh.
 *
 * The document is indexed with `forceRefresh` at most once per call. The
 * record is then fetched without forcing, and the fallback text is read
 * straight from the page artifact, so a forced refresh does not re-extract
 * every page several times.
 *
 * @param request Page OCR request.
 * @returns The OCR record with `cacheStatus` of "fresh" or "reused".
 * @throws {Error} When the page number is outside the document, or when no
 *   model can be resolved.
 */
export const get_page_ocr = async (request) => {
    const config = resolveConfig(request.config);
    const env = resolveEnv(request.env);
    const renderArtifact = await ensureRenderArtifact(request);
    // The render step has just indexed the document, so this picks up the stored record.
    const { record } = await indexDocumentInternal({ ...request, forceRefresh: false });
    ensurePageNumber(record.pageCount, request.pageNumber);
    const provider = resolveProviderAlias(config, request.provider);
    const model = resolveModelForProvider(config, provider, request.model);
    if (!model) {
        throw new Error("model is required for local OCR artifacts; pass `model` or set agent.defaultModel");
    }
    const prompt = request.prompt?.trim() || config.agent.ocrPrompt;
    const artifactPath = buildOcrArtifactPath(record.artifactPaths, request.pageNumber, renderArtifact.renderScale, provider, model, prompt);
    if (!request.forceRefresh && await fileExists(artifactPath)) {
        const cached = await readJson(artifactPath);
        if (matchesSourceSnapshot(cached, record)) {
            return {
                ...cached,
                cacheStatus: "reused",
            };
        }
    }
    const imageBytes = new Uint8Array(await readFile(renderArtifact.imagePath));
    const imageDataUrl = toDataUrl(imageBytes, renderArtifact.mimeType);
    const recognized = await visionRecognize({
        config,
        env,
        providerAlias: provider,
        model,
        prompt,
        imageDataUrl,
        runtimeApiKeys: request.providerApiKeys,
    });
    let rawText = recognized;
    if (!rawText) {
        // Fall back to the text extracted during indexing; read only when needed.
        const pagePath = path.join(record.artifactPaths.pagesDir, `${pageLabel(request.pageNumber)}.json`);
        const page = await readJson(pagePath);
        rawText = page.text || "";
    }
    const text = stripCodeFences(rawText);
    const artifact = {
        documentId: record.documentId,
        pageNumber: request.pageNumber,
        renderScale: renderArtifact.renderScale,
        sourceSizeBytes: record.sizeBytes,
        sourceMtimeMs: record.mtimeMs,
        provider,
        model,
        prompt,
        text,
        chars: text.length,
        imagePath: renderArtifact.imagePath,
        renderArtifactPath: renderArtifact.artifactPath,
        artifactPath,
        generatedAt: new Date().toISOString(),
    };
    await writeJson(artifactPath, artifact);
    return {
        ...artifact,
        cacheStatus: "fresh",
    };
};
|