@echofiles/echo-pdf 0.4.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +85 -562
- package/bin/echo-pdf.js +130 -525
- package/dist/file-utils.d.ts +0 -3
- package/dist/file-utils.js +0 -18
- package/dist/local/document.d.ts +10 -0
- package/dist/local/document.js +133 -0
- package/dist/local/index.d.ts +3 -135
- package/dist/local/index.js +2 -555
- package/dist/local/semantic.d.ts +2 -0
- package/dist/local/semantic.js +231 -0
- package/dist/local/shared.d.ts +50 -0
- package/dist/local/shared.js +173 -0
- package/dist/local/types.d.ts +183 -0
- package/dist/local/types.js +2 -0
- package/dist/node/pdfium-local.js +30 -6
- package/dist/pdf-config.js +2 -65
- package/dist/pdf-types.d.ts +1 -58
- package/dist/types.d.ts +1 -87
- package/echo-pdf.config.json +1 -21
- package/package.json +25 -22
- package/bin/lib/http.js +0 -97
- package/bin/lib/mcp-stdio.js +0 -99
- package/dist/auth.d.ts +0 -18
- package/dist/auth.js +0 -36
- package/dist/core/index.d.ts +0 -50
- package/dist/core/index.js +0 -7
- package/dist/file-ops.d.ts +0 -11
- package/dist/file-ops.js +0 -36
- package/dist/file-store-do.d.ts +0 -36
- package/dist/file-store-do.js +0 -298
- package/dist/http-error.d.ts +0 -9
- package/dist/http-error.js +0 -14
- package/dist/index.d.ts +0 -1
- package/dist/index.js +0 -1
- package/dist/mcp-server.d.ts +0 -3
- package/dist/mcp-server.js +0 -124
- package/dist/node/semantic-local.d.ts +0 -16
- package/dist/node/semantic-local.js +0 -113
- package/dist/pdf-agent.d.ts +0 -18
- package/dist/pdf-agent.js +0 -217
- package/dist/pdf-storage.d.ts +0 -8
- package/dist/pdf-storage.js +0 -86
- package/dist/pdfium-engine.d.ts +0 -9
- package/dist/pdfium-engine.js +0 -180
- package/dist/r2-file-store.d.ts +0 -20
- package/dist/r2-file-store.js +0 -176
- package/dist/response-schema.d.ts +0 -15
- package/dist/response-schema.js +0 -159
- package/dist/tool-registry.d.ts +0 -16
- package/dist/tool-registry.js +0 -175
- package/dist/worker.d.ts +0 -7
- package/dist/worker.js +0 -386
- package/scripts/export-fixtures.sh +0 -204
- package/wrangler.toml +0 -19
package/dist/local/index.js
CHANGED
|
@@ -1,555 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
import { mkdir, readFile, stat, writeFile } from "node:fs/promises";
|
|
4
|
-
import path from "node:path";
|
|
5
|
-
import { resolveModelForProvider, resolveProviderAlias } from "../agent-defaults.js";
|
|
6
|
-
import { toDataUrl } from "../file-utils.js";
|
|
7
|
-
import { loadEchoPdfConfig } from "../pdf-config.js";
|
|
8
|
-
import { generateText, visionRecognize } from "../provider-client.js";
|
|
9
|
-
import { extractLocalPdfPageText, getLocalPdfPageCount, renderLocalPdfPageToPng } from "../node/pdfium-local.js";
|
|
10
|
-
import { buildSemanticSectionTree } from "../node/semantic-local.js";
|
|
11
|
-
/**
 * Default location of the artifact workspace.
 *
 * @returns {string} Absolute path of `.echo-pdf-workspace` under the current working directory.
 */
const defaultWorkspaceDir = () => {
    const workingDirectory = process.cwd();
    return path.resolve(workingDirectory, ".echo-pdf-workspace");
};
|
|
12
|
-
/**
 * Resolves the workspace directory to an absolute path.
 *
 * @param {string} [workspaceDir] Requested directory; relative values are resolved
 *   against the current working directory.
 * @returns {string} Absolute workspace path. An omitted, empty or whitespace-only
 *   value selects `defaultWorkspaceDir()`.
 */
const resolveWorkspaceDir = (workspaceDir) => {
    const requested = workspaceDir?.trim();
    const target = requested || defaultWorkspaceDir();
    return path.resolve(process.cwd(), target);
};
|
|
13
|
-
/**
 * Derives a document id from a PDF path.
 *
 * @param {string} absolutePdfPath Path string to hash.
 * @returns {string} First 16 hex characters of the SHA-256 digest of the path.
 */
const toDocumentId = (absolutePdfPath) => {
    const hasher = createHash("sha256");
    hasher.update(absolutePdfPath);
    const hexDigest = hasher.digest("hex");
    return hexDigest.slice(0, 16);
};
|
|
14
|
-
/**
 * Produces a short hexadecimal fingerprint of a value.
 *
 * @param {string} value Text to hash with SHA-256.
 * @param {number} [length=12] Number of leading hex characters to keep.
 * @returns {string} The leading `length` characters of the hex digest.
 */
const hashFragment = (value, length = 12) => {
    const hexDigest = createHash("sha256").update(value).digest("hex");
    return hexDigest.slice(0, length);
};
|
|
15
|
-
/**
 * Replaces every run of characters outside `[a-zA-Z0-9._-]` with a single underscore.
 *
 * @param {string} value Raw text.
 * @returns {string} The text with each disallowed run collapsed to `_`.
 */
const sanitizeSegment = (value) => {
    const disallowedRun = /[^a-zA-Z0-9._-]+/g;
    return value.replace(disallowedRun, "_");
};
|
|
16
|
-
/**
 * Turns a render scale into text and passes it through `sanitizeSegment`.
 *
 * @param {*} value Render scale; converted with `String(value)`.
 * @returns {string} The sanitized string form of the scale.
 */
const scaleLabel = (value) => {
    const asText = String(value);
    return sanitizeSegment(asText);
};
|
|
17
|
-
/**
 * Formats a page number as a zero-padded label.
 *
 * @param {number} pageNumber Page number to format.
 * @returns {string} The number as text, left-padded with `0` to at least four characters.
 */
const pageLabel = (pageNumber) => {
    const digits = String(pageNumber);
    return digits.padStart(4, "0");
};
|
|
18
|
-
const buildArtifactPaths = (workspaceDir, documentId) => {
|
|
19
|
-
const documentDir = path.join(workspaceDir, "documents", documentId);
|
|
20
|
-
return {
|
|
21
|
-
workspaceDir,
|
|
22
|
-
documentDir,
|
|
23
|
-
documentJsonPath: path.join(documentDir, "document.json"),
|
|
24
|
-
structureJsonPath: path.join(documentDir, "structure.json"),
|
|
25
|
-
semanticStructureJsonPath: path.join(documentDir, "semantic-structure.json"),
|
|
26
|
-
pagesDir: path.join(documentDir, "pages"),
|
|
27
|
-
rendersDir: path.join(documentDir, "renders"),
|
|
28
|
-
ocrDir: path.join(documentDir, "ocr"),
|
|
29
|
-
};
|
|
30
|
-
};
|
|
31
|
-
const buildRenderArtifactPaths = (paths, pageNumber, renderScale) => {
|
|
32
|
-
const key = `${pageLabel(pageNumber)}.scale-${scaleLabel(renderScale)}`;
|
|
33
|
-
return {
|
|
34
|
-
artifactPath: path.join(paths.rendersDir, `${key}.json`),
|
|
35
|
-
imagePath: path.join(paths.rendersDir, `${key}.png`),
|
|
36
|
-
};
|
|
37
|
-
};
|
|
38
|
-
const buildOcrArtifactPath = (paths, pageNumber, renderScale, provider, model, prompt) => {
|
|
39
|
-
const key = [
|
|
40
|
-
pageLabel(pageNumber),
|
|
41
|
-
`scale-${scaleLabel(renderScale)}`,
|
|
42
|
-
`provider-${sanitizeSegment(provider)}`,
|
|
43
|
-
`model-${sanitizeSegment(model)}`,
|
|
44
|
-
`prompt-${hashFragment(prompt, 10)}`,
|
|
45
|
-
].join(".");
|
|
46
|
-
return path.join(paths.ocrDir, `${key}.json`);
|
|
47
|
-
};
|
|
48
|
-
/**
 * Builds a one-line preview of page text.
 *
 * @param {string} text Page text.
 * @returns {string} The text with every whitespace run collapsed to one space,
 *   trimmed, and cut to at most 160 characters.
 */
const createPreview = (text) => {
    const singleSpaced = text.replace(/\s+/g, " ");
    const trimmed = singleSpaced.trim();
    return trimmed.slice(0, 160);
};
|
|
49
|
-
const createPageTitle = (pageNumber, text) => {
|
|
50
|
-
const firstLine = text
|
|
51
|
-
.split(/\r?\n/)
|
|
52
|
-
.map((line) => line.trim())
|
|
53
|
-
.find((line) => line.length > 0);
|
|
54
|
-
return firstLine ? `Page ${pageNumber}: ${firstLine.slice(0, 80)}` : `Page ${pageNumber}`;
|
|
55
|
-
};
|
|
56
|
-
/**
 * Removes a single Markdown code fence that wraps the whole value.
 *
 * Both LF and CRLF line endings are accepted after the opening fence and before
 * the closing fence. Text that is not entirely wrapped in one fence is returned
 * trimmed but otherwise unchanged.
 *
 * @param {string} value Raw text, e.g. model output.
 * @returns {string} The trimmed fenced content, or the trimmed input when no fence wraps it.
 */
const stripCodeFences = (value) => {
    const text = value.trim();
    const fenced = text.match(/^```[a-zA-Z0-9_-]*\r?\n([\s\S]*?)\r?\n```$/);
    return typeof fenced?.[1] === "string" ? fenced[1].trim() : text;
};
/**
 * Parses JSON out of model output.
 *
 * The value is first stripped of a wrapping code fence. If it does not parse as
 * a whole, the span from the first `{` to the last `}` is tried, which tolerates
 * prose around the JSON.
 *
 * @param {string} value Raw model output.
 * @returns {*} The parsed JSON value, or `null` when the input is empty after trimming.
 * @throws {Error} "semantic structure model output was not valid JSON" when neither
 *   attempt parses; the first parse failure is attached as `cause`.
 */
const parseJsonObject = (value) => {
    const trimmed = stripCodeFences(value).trim();
    if (!trimmed) {
        return null;
    }
    try {
        return JSON.parse(trimmed);
    }
    catch (parseError) {
        const start = trimmed.indexOf("{");
        const end = trimmed.lastIndexOf("}");
        if (start >= 0 && end > start) {
            try {
                return JSON.parse(trimmed.slice(start, end + 1));
            }
            catch {
                // The brace-delimited span is not JSON either; report it with the
                // descriptive error below instead of leaking a bare SyntaxError.
            }
        }
        throw new Error("semantic structure model output was not valid JSON", { cause: parseError });
    }
};
|
|
77
|
-
/**
 * Picks the configuration to use for a request.
 *
 * @param {object} [config] Caller-supplied configuration.
 * @returns {object} `config` when it is neither `null` nor `undefined`; otherwise
 *   the result of `loadEchoPdfConfig({})`.
 */
const resolveConfig = (config) => {
    if (config !== null && config !== undefined) {
        return config;
    }
    return loadEchoPdfConfig({});
};
|
|
78
|
-
/**
 * Picks the environment map to use for a request.
 *
 * @param {object} [env] Caller-supplied environment.
 * @returns {object} `env` when it is neither `null` nor `undefined`; otherwise `process.env`.
 */
const resolveEnv = (env) => {
    if (env === null || env === undefined) {
        return process.env;
    }
    return env;
};
|
|
79
|
-
const fileExists = async (targetPath) => {
|
|
80
|
-
try {
|
|
81
|
-
await stat(targetPath);
|
|
82
|
-
return true;
|
|
83
|
-
}
|
|
84
|
-
catch {
|
|
85
|
-
return false;
|
|
86
|
-
}
|
|
87
|
-
};
|
|
88
|
-
const readJson = async (targetPath) => {
|
|
89
|
-
const raw = await readFile(targetPath, "utf-8");
|
|
90
|
-
return JSON.parse(raw);
|
|
91
|
-
};
|
|
92
|
-
/**
 * Loads the stored document record for a workspace document, if one is usable.
 *
 * @param {object} paths Artifact paths; `paths.documentJsonPath` is the record file.
 * @returns {Promise<object|null>} The parsed record with `artifactPaths` set to
 *   `paths`, or `null` when the record file is missing, is not valid JSON, or
 *   does not contain a JSON object.
 * @throws Read failures other than JSON syntax errors are propagated.
 */
const loadStoredDocument = async (paths) => {
    if (!await fileExists(paths.documentJsonPath)) {
        return null;
    }
    let raw;
    try {
        raw = await readJson(paths.documentJsonPath);
    }
    catch (error) {
        // A truncated or hand-edited record is reported as "nothing stored" so it
        // cannot make every later call fail; other read errors still surface.
        if (error instanceof SyntaxError) {
            return null;
        }
        throw error;
    }
    if (raw === null || typeof raw !== "object" || Array.isArray(raw)) {
        return null;
    }
    return {
        ...raw,
        artifactPaths: paths,
    };
};
|
|
101
|
-
const isReusableRecord = async (record, sourceStats, paths) => {
|
|
102
|
-
if (record.sizeBytes !== sourceStats.sizeBytes || record.mtimeMs !== sourceStats.mtimeMs)
|
|
103
|
-
return false;
|
|
104
|
-
if (!await fileExists(paths.structureJsonPath))
|
|
105
|
-
return false;
|
|
106
|
-
for (let pageNumber = 1; pageNumber <= record.pageCount; pageNumber += 1) {
|
|
107
|
-
const pagePath = path.join(paths.pagesDir, `${pageLabel(pageNumber)}.json`);
|
|
108
|
-
if (!await fileExists(pagePath))
|
|
109
|
-
return false;
|
|
110
|
-
}
|
|
111
|
-
return true;
|
|
112
|
-
};
|
|
113
|
-
const writeJson = async (targetPath, data) => {
|
|
114
|
-
await mkdir(path.dirname(targetPath), { recursive: true });
|
|
115
|
-
await writeFile(targetPath, `${JSON.stringify(data, null, 2)}\n`, "utf-8");
|
|
116
|
-
};
|
|
117
|
-
/**
 * Reads a file fully into memory.
 *
 * @param {string|URL} sourcePath Location passed straight to `readFile`.
 * @returns {Promise<Uint8Array>} A new `Uint8Array` built from the file contents.
 */
const readSourceBytes = async (sourcePath) => {
    const fileBuffer = await readFile(sourcePath);
    return new Uint8Array(fileBuffer);
};
|
|
118
|
-
/**
 * Checks whether a cached artifact was produced from the same source file state.
 *
 * @param {object} artifact Cached artifact carrying `sourceSizeBytes` and `sourceMtimeMs`.
 * @param {object} record Document record carrying `sizeBytes` and `mtimeMs`.
 * @returns {boolean} `true` only when both the size and the modification time are strictly equal.
 */
const matchesSourceSnapshot = (artifact, record) => {
    if (artifact.sourceSizeBytes !== record.sizeBytes) {
        return false;
    }
    return artifact.sourceMtimeMs === record.mtimeMs;
};
|
|
119
|
-
/**
 * Checks whether a cached artifact was produced with the given strategy.
 *
 * @param {object} artifact Cached artifact carrying `strategyKey`.
 * @param {string} strategyKey Strategy key expected for the current request.
 * @returns {boolean} `true` when the stored key is strictly equal to `strategyKey`.
 */
const matchesStrategyKey = (artifact, strategyKey) => {
    const { strategyKey: storedKey } = artifact;
    return storedKey === strategyKey;
};
|
|
120
|
-
const resolveSemanticExtractionBudget = (input) => ({
|
|
121
|
-
pageSelection: "all",
|
|
122
|
-
chunkMaxChars: typeof input?.chunkMaxChars === "number" && Number.isFinite(input.chunkMaxChars) && input.chunkMaxChars > 400
|
|
123
|
-
? Math.floor(input.chunkMaxChars)
|
|
124
|
-
: 4000,
|
|
125
|
-
chunkOverlapChars: typeof input?.chunkOverlapChars === "number" && Number.isFinite(input.chunkOverlapChars) && input.chunkOverlapChars >= 0
|
|
126
|
-
? Math.floor(input.chunkOverlapChars)
|
|
127
|
-
: 300,
|
|
128
|
-
});
|
|
129
|
-
const splitSemanticTextIntoChunks = (text, budget) => {
|
|
130
|
-
const normalized = text.trim();
|
|
131
|
-
if (!normalized)
|
|
132
|
-
return [];
|
|
133
|
-
if (normalized.length <= budget.chunkMaxChars)
|
|
134
|
-
return [normalized];
|
|
135
|
-
const chunks = [];
|
|
136
|
-
let start = 0;
|
|
137
|
-
while (start < normalized.length) {
|
|
138
|
-
const idealEnd = Math.min(normalized.length, start + budget.chunkMaxChars);
|
|
139
|
-
let end = idealEnd;
|
|
140
|
-
if (idealEnd < normalized.length) {
|
|
141
|
-
const newlineBreak = normalized.lastIndexOf("\n", idealEnd);
|
|
142
|
-
const sentenceBreak = normalized.lastIndexOf("。", idealEnd);
|
|
143
|
-
const whitespaceBreak = normalized.lastIndexOf(" ", idealEnd);
|
|
144
|
-
end = Math.max(newlineBreak, sentenceBreak, whitespaceBreak, start + Math.floor(budget.chunkMaxChars * 0.7));
|
|
145
|
-
if (end <= start)
|
|
146
|
-
end = idealEnd;
|
|
147
|
-
}
|
|
148
|
-
const chunk = normalized.slice(start, end).trim();
|
|
149
|
-
if (chunk)
|
|
150
|
-
chunks.push(chunk);
|
|
151
|
-
if (end >= normalized.length)
|
|
152
|
-
break;
|
|
153
|
-
start = Math.max(end - budget.chunkOverlapChars, start + 1);
|
|
154
|
-
}
|
|
155
|
-
return chunks;
|
|
156
|
-
};
|
|
157
|
-
/**
 * Converts model-produced section data into validated semantic tree nodes.
 *
 * Entries are kept only when they are objects with a non-empty string `title`
 * and positive integer `level` and `pageNumber`. Anything else, including
 * `null` or primitive entries, is skipped so one malformed entry does not
 * abort the conversion. `children` are converted recursively.
 *
 * @param {*} value Candidate section list; any non-array yields an empty list.
 * @param {Map<number, string>} pageArtifactPaths Page number to page artifact path.
 * @returns {object[]} Section nodes. Each id embeds the entry's 1-based position
 *   in its own list, its page number and its level.
 */
const toSemanticTree = (value, pageArtifactPaths) => {
    if (!Array.isArray(value)) {
        return [];
    }
    const isPositiveInteger = (candidate) => typeof candidate === "number" && Number.isInteger(candidate) && candidate > 0;
    const nodes = [];
    for (const [index, node] of value.entries()) {
        if (node === null || typeof node !== "object") {
            continue;
        }
        const title = typeof node.title === "string" ? node.title.trim() : "";
        if (!title || !isPositiveInteger(node.level) || !isPositiveInteger(node.pageNumber)) {
            continue;
        }
        const { level, pageNumber } = node;
        nodes.push({
            id: `semantic-node-${index + 1}-${pageNumber}-${level}`,
            type: "section",
            title,
            level,
            pageNumber,
            pageArtifactPath: pageArtifactPaths.get(pageNumber),
            excerpt: typeof node.excerpt === "string" ? node.excerpt.trim() : undefined,
            children: toSemanticTree(node.children, pageArtifactPaths),
        });
    }
    return nodes;
};
|
|
181
|
-
/**
 * Builds the per-chunk prompt asking a model for heading/section candidates.
 *
 * The schema example embedded in the prompt is kept valid JSON (no trailing
 * comma), since the prompt instructs the model to return JSON only.
 *
 * @param {number} pageNumber Page the chunk was taken from.
 * @param {number} chunkIndex Position of the chunk within that page, as supplied by the caller.
 * @param {string} chunkText Text segment to analyse; appended after a blank line.
 * @returns {string} The prompt lines joined with `\n`.
 */
const buildSemanticPrompt = (pageNumber, chunkIndex, chunkText) => {
    return [
        "You extract heading/section candidates from one document text segment.",
        "Return JSON only.",
        "Schema:",
        "{",
        ' "candidates": [',
        " {",
        ' "title": "string",',
        ' "level": 1,',
        ' "excerpt": "short evidence string"',
        " }",
        " ]",
        "}",
        "Rules:",
        "- Use only headings/sections that are clearly supported by the text segment.",
        "- Prefer conservative extraction over guessing.",
        "- Do not include page index entries, table rows, figure labels, or prose sentences.",
        "- Do not infer hierarchy beyond the explicit heading numbering or structure visible in the segment.",
        "- If no reliable semantic structure is detectable, return {\"candidates\":[]}.",
        `Page number: ${pageNumber}`,
        `Chunk index: ${chunkIndex}`,
        "",
        chunkText,
    ].join("\n");
};
|
|
207
|
-
const buildSemanticAggregationPrompt = (record, candidates) => {
|
|
208
|
-
const candidateDump = JSON.stringify({ candidates }, null, 2);
|
|
209
|
-
return [
|
|
210
|
-
"You assemble semantic document structure from heading candidates.",
|
|
211
|
-
"Return JSON only.",
|
|
212
|
-
"Schema:",
|
|
213
|
-
"{",
|
|
214
|
-
' "sections": [',
|
|
215
|
-
" {",
|
|
216
|
-
' "title": "string",',
|
|
217
|
-
' "level": 1,',
|
|
218
|
-
' "pageNumber": 1,',
|
|
219
|
-
' "excerpt": "short evidence string",',
|
|
220
|
-
' "children": []',
|
|
221
|
-
" }",
|
|
222
|
-
" ]",
|
|
223
|
-
"}",
|
|
224
|
-
"Rules:",
|
|
225
|
-
"- Preserve hierarchy with nested children.",
|
|
226
|
-
"- Use only candidate headings that form a reliable document structure.",
|
|
227
|
-
"- Deduplicate repeated headings from overlapping segments.",
|
|
228
|
-
"- Do not invent sections not present in the candidates.",
|
|
229
|
-
"- If no reliable semantic structure is detectable, return {\"sections\":[]}.",
|
|
230
|
-
`Document filename: ${record.filename}`,
|
|
231
|
-
`Page count: ${record.pageCount}`,
|
|
232
|
-
"",
|
|
233
|
-
candidateDump,
|
|
234
|
-
].join("\n");
|
|
235
|
-
};
|
|
236
|
-
const buildHeuristicSemanticArtifact = (record, artifactPath, sections) => ({
|
|
237
|
-
documentId: record.documentId,
|
|
238
|
-
generatedAt: new Date().toISOString(),
|
|
239
|
-
detector: "heading-heuristic-v1",
|
|
240
|
-
strategyKey: "heuristic::heading-heuristic-v1",
|
|
241
|
-
sourceSizeBytes: record.sizeBytes,
|
|
242
|
-
sourceMtimeMs: record.mtimeMs,
|
|
243
|
-
pageIndexArtifactPath: record.artifactPaths.structureJsonPath,
|
|
244
|
-
artifactPath,
|
|
245
|
-
root: {
|
|
246
|
-
id: `semantic-${record.documentId}`,
|
|
247
|
-
type: "document",
|
|
248
|
-
title: record.filename,
|
|
249
|
-
children: sections,
|
|
250
|
-
},
|
|
251
|
-
});
|
|
252
|
-
const ensureSemanticStructureArtifact = async (request) => {
|
|
253
|
-
const config = resolveConfig(request.config);
|
|
254
|
-
const env = resolveEnv(request.env);
|
|
255
|
-
const { record } = await indexDocumentInternal(request);
|
|
256
|
-
const artifactPath = record.artifactPaths.semanticStructureJsonPath;
|
|
257
|
-
const semanticBudget = resolveSemanticExtractionBudget(request.semanticExtraction);
|
|
258
|
-
let provider = "";
|
|
259
|
-
let model = "";
|
|
260
|
-
try {
|
|
261
|
-
provider = resolveProviderAlias(config, request.provider);
|
|
262
|
-
model = provider ? resolveModelForProvider(config, provider) : "";
|
|
263
|
-
}
|
|
264
|
-
catch {
|
|
265
|
-
provider = "";
|
|
266
|
-
model = "";
|
|
267
|
-
}
|
|
268
|
-
if (provider) {
|
|
269
|
-
model = resolveModelForProvider(config, provider, request.model);
|
|
270
|
-
}
|
|
271
|
-
const strategyKey = model
|
|
272
|
-
? `agent::agent-structured-v1::${provider}::${model}::${semanticBudget.pageSelection}::${semanticBudget.chunkMaxChars}::${semanticBudget.chunkOverlapChars}`
|
|
273
|
-
: "heuristic::heading-heuristic-v1";
|
|
274
|
-
if (!request.forceRefresh && await fileExists(artifactPath)) {
|
|
275
|
-
const cached = await readJson(artifactPath);
|
|
276
|
-
if (matchesSourceSnapshot(cached, record) && matchesStrategyKey(cached, strategyKey)) {
|
|
277
|
-
return {
|
|
278
|
-
...cached,
|
|
279
|
-
cacheStatus: "reused",
|
|
280
|
-
};
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
const pages = [];
|
|
284
|
-
for (let pageNumber = 1; pageNumber <= record.pageCount; pageNumber += 1) {
|
|
285
|
-
const pagePath = path.join(record.artifactPaths.pagesDir, `${pageLabel(pageNumber)}.json`);
|
|
286
|
-
const page = await readJson(pagePath);
|
|
287
|
-
pages.push(page);
|
|
288
|
-
}
|
|
289
|
-
const pageArtifactPaths = new Map(pages.map((page) => [page.pageNumber, page.artifactPath]));
|
|
290
|
-
let artifact;
|
|
291
|
-
if (model) {
|
|
292
|
-
try {
|
|
293
|
-
const candidateMap = new Map();
|
|
294
|
-
for (const page of pages) {
|
|
295
|
-
const chunks = splitSemanticTextIntoChunks(page.text, semanticBudget);
|
|
296
|
-
for (const [chunkIndex, chunkText] of chunks.entries()) {
|
|
297
|
-
const response = await generateText({
|
|
298
|
-
config,
|
|
299
|
-
env,
|
|
300
|
-
providerAlias: provider,
|
|
301
|
-
model,
|
|
302
|
-
prompt: buildSemanticPrompt(page.pageNumber, chunkIndex + 1, chunkText),
|
|
303
|
-
runtimeApiKeys: request.providerApiKeys,
|
|
304
|
-
});
|
|
305
|
-
const parsed = parseJsonObject(response);
|
|
306
|
-
for (const candidate of Array.isArray(parsed?.candidates) ? parsed.candidates : []) {
|
|
307
|
-
const title = typeof candidate?.title === "string" ? candidate.title.trim() : "";
|
|
308
|
-
const level = typeof candidate?.level === "number" && Number.isInteger(candidate.level) && candidate.level > 0 ? candidate.level : 0;
|
|
309
|
-
if (!title || level <= 0)
|
|
310
|
-
continue;
|
|
311
|
-
const key = `${page.pageNumber}:${level}:${title}`;
|
|
312
|
-
if (!candidateMap.has(key)) {
|
|
313
|
-
candidateMap.set(key, {
|
|
314
|
-
title,
|
|
315
|
-
level,
|
|
316
|
-
pageNumber: page.pageNumber,
|
|
317
|
-
excerpt: typeof candidate?.excerpt === "string" ? candidate.excerpt.trim() : undefined,
|
|
318
|
-
});
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
|
-
}
|
|
323
|
-
const aggregated = await generateText({
|
|
324
|
-
config,
|
|
325
|
-
env,
|
|
326
|
-
providerAlias: provider,
|
|
327
|
-
model,
|
|
328
|
-
prompt: buildSemanticAggregationPrompt(record, [...candidateMap.values()]),
|
|
329
|
-
runtimeApiKeys: request.providerApiKeys,
|
|
330
|
-
});
|
|
331
|
-
const parsed = parseJsonObject(aggregated);
|
|
332
|
-
const sections = toSemanticTree(parsed?.sections, pageArtifactPaths);
|
|
333
|
-
artifact = {
|
|
334
|
-
documentId: record.documentId,
|
|
335
|
-
generatedAt: new Date().toISOString(),
|
|
336
|
-
detector: "agent-structured-v1",
|
|
337
|
-
strategyKey,
|
|
338
|
-
sourceSizeBytes: record.sizeBytes,
|
|
339
|
-
sourceMtimeMs: record.mtimeMs,
|
|
340
|
-
pageIndexArtifactPath: record.artifactPaths.structureJsonPath,
|
|
341
|
-
artifactPath,
|
|
342
|
-
root: {
|
|
343
|
-
id: `semantic-${record.documentId}`,
|
|
344
|
-
type: "document",
|
|
345
|
-
title: record.filename,
|
|
346
|
-
children: sections,
|
|
347
|
-
},
|
|
348
|
-
};
|
|
349
|
-
}
|
|
350
|
-
catch {
|
|
351
|
-
artifact = buildHeuristicSemanticArtifact(record, artifactPath, buildSemanticSectionTree(pages));
|
|
352
|
-
}
|
|
353
|
-
}
|
|
354
|
-
else {
|
|
355
|
-
artifact = buildHeuristicSemanticArtifact(record, artifactPath, buildSemanticSectionTree(pages));
|
|
356
|
-
}
|
|
357
|
-
await writeJson(artifactPath, artifact);
|
|
358
|
-
return {
|
|
359
|
-
...artifact,
|
|
360
|
-
cacheStatus: "fresh",
|
|
361
|
-
};
|
|
362
|
-
};
|
|
363
|
-
const ensurePageNumber = (pageCount, pageNumber) => {
|
|
364
|
-
if (!Number.isInteger(pageNumber) || pageNumber < 1 || pageNumber > pageCount) {
|
|
365
|
-
throw new Error(`pageNumber must be within 1..${pageCount}`);
|
|
366
|
-
}
|
|
367
|
-
};
|
|
368
|
-
const resolveRenderScale = (config, requestedScale) => {
|
|
369
|
-
if (typeof requestedScale === "number" && Number.isFinite(requestedScale) && requestedScale > 0) {
|
|
370
|
-
return requestedScale;
|
|
371
|
-
}
|
|
372
|
-
return config.service.defaultRenderScale;
|
|
373
|
-
};
|
|
374
|
-
const indexDocumentInternal = async (request) => {
|
|
375
|
-
const config = resolveConfig(request.config);
|
|
376
|
-
const sourcePath = path.resolve(process.cwd(), request.pdfPath);
|
|
377
|
-
const workspaceDir = resolveWorkspaceDir(request.workspaceDir);
|
|
378
|
-
const documentId = toDocumentId(sourcePath);
|
|
379
|
-
const artifactPaths = buildArtifactPaths(workspaceDir, documentId);
|
|
380
|
-
const sourceStats = await stat(sourcePath);
|
|
381
|
-
const stored = await loadStoredDocument(artifactPaths);
|
|
382
|
-
const sourceMeta = {
|
|
383
|
-
sizeBytes: sourceStats.size,
|
|
384
|
-
mtimeMs: sourceStats.mtimeMs,
|
|
385
|
-
};
|
|
386
|
-
if (!request.forceRefresh && stored && await isReusableRecord(stored, sourceMeta, artifactPaths)) {
|
|
387
|
-
return { record: stored, reused: true };
|
|
388
|
-
}
|
|
389
|
-
await mkdir(artifactPaths.pagesDir, { recursive: true });
|
|
390
|
-
const bytes = await readSourceBytes(sourcePath);
|
|
391
|
-
const pageCount = await getLocalPdfPageCount(config, bytes);
|
|
392
|
-
const pageNodes = [];
|
|
393
|
-
for (let pageNumber = 1; pageNumber <= pageCount; pageNumber += 1) {
|
|
394
|
-
const text = await extractLocalPdfPageText(config, bytes, pageNumber - 1);
|
|
395
|
-
const preview = createPreview(text);
|
|
396
|
-
const title = createPageTitle(pageNumber, text);
|
|
397
|
-
const artifactPath = path.join(artifactPaths.pagesDir, `${pageLabel(pageNumber)}.json`);
|
|
398
|
-
const pageArtifact = {
|
|
399
|
-
documentId,
|
|
400
|
-
pageNumber,
|
|
401
|
-
title,
|
|
402
|
-
preview,
|
|
403
|
-
text,
|
|
404
|
-
chars: text.length,
|
|
405
|
-
artifactPath,
|
|
406
|
-
};
|
|
407
|
-
await writeJson(artifactPath, pageArtifact);
|
|
408
|
-
pageNodes.push({
|
|
409
|
-
id: `page-${pageNumber}`,
|
|
410
|
-
type: "page",
|
|
411
|
-
title,
|
|
412
|
-
pageNumber,
|
|
413
|
-
preview,
|
|
414
|
-
artifactPath,
|
|
415
|
-
});
|
|
416
|
-
}
|
|
417
|
-
const structure = {
|
|
418
|
-
documentId,
|
|
419
|
-
generatedAt: new Date().toISOString(),
|
|
420
|
-
root: {
|
|
421
|
-
id: documentId,
|
|
422
|
-
type: "document",
|
|
423
|
-
title: path.basename(sourcePath),
|
|
424
|
-
children: pageNodes,
|
|
425
|
-
},
|
|
426
|
-
};
|
|
427
|
-
await writeJson(artifactPaths.structureJsonPath, structure);
|
|
428
|
-
const documentRecord = {
|
|
429
|
-
documentId,
|
|
430
|
-
sourcePath,
|
|
431
|
-
filename: path.basename(sourcePath),
|
|
432
|
-
sizeBytes: sourceMeta.sizeBytes,
|
|
433
|
-
mtimeMs: sourceMeta.mtimeMs,
|
|
434
|
-
pageCount,
|
|
435
|
-
indexedAt: structure.generatedAt,
|
|
436
|
-
artifactPaths,
|
|
437
|
-
};
|
|
438
|
-
await writeJson(artifactPaths.documentJsonPath, documentRecord);
|
|
439
|
-
return { record: documentRecord, reused: false };
|
|
440
|
-
};
|
|
441
|
-
const toMetadata = (record, cacheStatus) => ({
|
|
442
|
-
...record,
|
|
443
|
-
cacheStatus,
|
|
444
|
-
});
|
|
445
|
-
const ensureRenderArtifact = async (request) => {
|
|
446
|
-
const config = resolveConfig(request.config);
|
|
447
|
-
const { record } = await indexDocumentInternal(request);
|
|
448
|
-
ensurePageNumber(record.pageCount, request.pageNumber);
|
|
449
|
-
const renderScale = resolveRenderScale(config, request.renderScale);
|
|
450
|
-
const renderPaths = buildRenderArtifactPaths(record.artifactPaths, request.pageNumber, renderScale);
|
|
451
|
-
if (!request.forceRefresh && await fileExists(renderPaths.artifactPath) && await fileExists(renderPaths.imagePath)) {
|
|
452
|
-
const cached = await readJson(renderPaths.artifactPath);
|
|
453
|
-
if (matchesSourceSnapshot(cached, record)) {
|
|
454
|
-
return {
|
|
455
|
-
...cached,
|
|
456
|
-
cacheStatus: "reused",
|
|
457
|
-
};
|
|
458
|
-
}
|
|
459
|
-
}
|
|
460
|
-
const bytes = await readSourceBytes(record.sourcePath);
|
|
461
|
-
const rendered = await renderLocalPdfPageToPng(config, bytes, request.pageNumber - 1, renderScale);
|
|
462
|
-
await mkdir(path.dirname(renderPaths.imagePath), { recursive: true });
|
|
463
|
-
await writeFile(renderPaths.imagePath, rendered.png);
|
|
464
|
-
const artifact = {
|
|
465
|
-
documentId: record.documentId,
|
|
466
|
-
pageNumber: request.pageNumber,
|
|
467
|
-
renderScale,
|
|
468
|
-
sourceSizeBytes: record.sizeBytes,
|
|
469
|
-
sourceMtimeMs: record.mtimeMs,
|
|
470
|
-
width: rendered.width,
|
|
471
|
-
height: rendered.height,
|
|
472
|
-
mimeType: "image/png",
|
|
473
|
-
imagePath: renderPaths.imagePath,
|
|
474
|
-
artifactPath: renderPaths.artifactPath,
|
|
475
|
-
generatedAt: new Date().toISOString(),
|
|
476
|
-
};
|
|
477
|
-
await writeJson(renderPaths.artifactPath, artifact);
|
|
478
|
-
return {
|
|
479
|
-
...artifact,
|
|
480
|
-
cacheStatus: "fresh",
|
|
481
|
-
};
|
|
482
|
-
};
|
|
483
|
-
export const get_document = async (request) => {
|
|
484
|
-
const { record, reused } = await indexDocumentInternal(request);
|
|
485
|
-
return toMetadata(record, reused ? "reused" : "fresh");
|
|
486
|
-
};
|
|
487
|
-
export const get_document_structure = async (request) => {
|
|
488
|
-
const { record } = await indexDocumentInternal(request);
|
|
489
|
-
return readJson(record.artifactPaths.structureJsonPath);
|
|
490
|
-
};
|
|
491
|
-
/**
 * Public entry point for the semantic structure artifact of a document.
 *
 * @param {object} request Request forwarded unchanged to `ensureSemanticStructureArtifact`.
 * @returns {Promise<object>} Whatever `ensureSemanticStructureArtifact` resolves to.
 */
export const get_semantic_document_structure = async (request) => {
    const semanticArtifact = await ensureSemanticStructureArtifact(request);
    return semanticArtifact;
};
|
|
492
|
-
export const get_page_content = async (request) => {
|
|
493
|
-
const { record } = await indexDocumentInternal(request);
|
|
494
|
-
ensurePageNumber(record.pageCount, request.pageNumber);
|
|
495
|
-
const pagePath = path.join(record.artifactPaths.pagesDir, `${pageLabel(request.pageNumber)}.json`);
|
|
496
|
-
return readJson(pagePath);
|
|
497
|
-
};
|
|
498
|
-
/**
 * Public entry point for the render artifact of a page.
 *
 * @param {object} request Request forwarded unchanged to `ensureRenderArtifact`.
 * @returns {Promise<object>} Whatever `ensureRenderArtifact` resolves to.
 */
export const get_page_render = async (request) => {
    const renderArtifact = await ensureRenderArtifact(request);
    return renderArtifact;
};
|
|
499
|
-
export const get_page_ocr = async (request) => {
|
|
500
|
-
const config = resolveConfig(request.config);
|
|
501
|
-
const env = resolveEnv(request.env);
|
|
502
|
-
const { record } = await indexDocumentInternal(request);
|
|
503
|
-
ensurePageNumber(record.pageCount, request.pageNumber);
|
|
504
|
-
const renderArtifact = await ensureRenderArtifact(request);
|
|
505
|
-
const provider = resolveProviderAlias(config, request.provider);
|
|
506
|
-
const model = resolveModelForProvider(config, provider, request.model);
|
|
507
|
-
if (!model) {
|
|
508
|
-
throw new Error("model is required for local OCR artifacts; pass `model` or set agent.defaultModel");
|
|
509
|
-
}
|
|
510
|
-
const prompt = request.prompt?.trim() || config.agent.ocrPrompt;
|
|
511
|
-
const artifactPath = buildOcrArtifactPath(record.artifactPaths, request.pageNumber, renderArtifact.renderScale, provider, model, prompt);
|
|
512
|
-
if (!request.forceRefresh && await fileExists(artifactPath)) {
|
|
513
|
-
const cached = await readJson(artifactPath);
|
|
514
|
-
if (matchesSourceSnapshot(cached, record)) {
|
|
515
|
-
return {
|
|
516
|
-
...cached,
|
|
517
|
-
cacheStatus: "reused",
|
|
518
|
-
};
|
|
519
|
-
}
|
|
520
|
-
}
|
|
521
|
-
const imageBytes = new Uint8Array(await readFile(renderArtifact.imagePath));
|
|
522
|
-
const imageDataUrl = toDataUrl(imageBytes, renderArtifact.mimeType);
|
|
523
|
-
const fallbackText = (await get_page_content(request)).text;
|
|
524
|
-
const recognized = await visionRecognize({
|
|
525
|
-
config,
|
|
526
|
-
env,
|
|
527
|
-
providerAlias: provider,
|
|
528
|
-
model,
|
|
529
|
-
prompt,
|
|
530
|
-
imageDataUrl,
|
|
531
|
-
runtimeApiKeys: request.providerApiKeys,
|
|
532
|
-
});
|
|
533
|
-
const text = stripCodeFences(recognized || fallbackText || "");
|
|
534
|
-
const artifact = {
|
|
535
|
-
documentId: record.documentId,
|
|
536
|
-
pageNumber: request.pageNumber,
|
|
537
|
-
renderScale: renderArtifact.renderScale,
|
|
538
|
-
sourceSizeBytes: record.sizeBytes,
|
|
539
|
-
sourceMtimeMs: record.mtimeMs,
|
|
540
|
-
provider,
|
|
541
|
-
model,
|
|
542
|
-
prompt,
|
|
543
|
-
text,
|
|
544
|
-
chars: text.length,
|
|
545
|
-
imagePath: renderArtifact.imagePath,
|
|
546
|
-
renderArtifactPath: renderArtifact.artifactPath,
|
|
547
|
-
artifactPath,
|
|
548
|
-
generatedAt: new Date().toISOString(),
|
|
549
|
-
};
|
|
550
|
-
await writeJson(artifactPath, artifact);
|
|
551
|
-
return {
|
|
552
|
-
...artifact,
|
|
553
|
-
cacheStatus: "fresh",
|
|
554
|
-
};
|
|
555
|
-
};
|
|
1
|
+
export { get_document, get_document_structure, get_page_content, get_page_render } from "./document.js";
|
|
2
|
+
export { get_semantic_document_structure } from "./semantic.js";
|