@echofiles/echo-pdf 0.4.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +85 -562
- package/bin/echo-pdf.js +130 -525
- package/dist/file-utils.d.ts +0 -3
- package/dist/file-utils.js +0 -18
- package/dist/local/document.d.ts +10 -0
- package/dist/local/document.js +133 -0
- package/dist/local/index.d.ts +3 -135
- package/dist/local/index.js +2 -555
- package/dist/local/semantic.d.ts +2 -0
- package/dist/local/semantic.js +231 -0
- package/dist/local/shared.d.ts +50 -0
- package/dist/local/shared.js +173 -0
- package/dist/local/types.d.ts +183 -0
- package/dist/local/types.js +2 -0
- package/dist/node/pdfium-local.js +30 -6
- package/dist/pdf-config.js +2 -65
- package/dist/pdf-types.d.ts +1 -58
- package/dist/types.d.ts +1 -87
- package/echo-pdf.config.json +1 -21
- package/package.json +25 -22
- package/bin/lib/http.js +0 -97
- package/bin/lib/mcp-stdio.js +0 -99
- package/dist/auth.d.ts +0 -18
- package/dist/auth.js +0 -36
- package/dist/core/index.d.ts +0 -50
- package/dist/core/index.js +0 -7
- package/dist/file-ops.d.ts +0 -11
- package/dist/file-ops.js +0 -36
- package/dist/file-store-do.d.ts +0 -36
- package/dist/file-store-do.js +0 -298
- package/dist/http-error.d.ts +0 -9
- package/dist/http-error.js +0 -14
- package/dist/index.d.ts +0 -1
- package/dist/index.js +0 -1
- package/dist/mcp-server.d.ts +0 -3
- package/dist/mcp-server.js +0 -124
- package/dist/node/semantic-local.d.ts +0 -16
- package/dist/node/semantic-local.js +0 -113
- package/dist/pdf-agent.d.ts +0 -18
- package/dist/pdf-agent.js +0 -217
- package/dist/pdf-storage.d.ts +0 -8
- package/dist/pdf-storage.js +0 -86
- package/dist/pdfium-engine.d.ts +0 -9
- package/dist/pdfium-engine.js +0 -180
- package/dist/r2-file-store.d.ts +0 -20
- package/dist/r2-file-store.js +0 -176
- package/dist/response-schema.d.ts +0 -15
- package/dist/response-schema.js +0 -159
- package/dist/tool-registry.d.ts +0 -16
- package/dist/tool-registry.js +0 -175
- package/dist/worker.d.ts +0 -7
- package/dist/worker.js +0 -386
- package/scripts/export-fixtures.sh +0 -204
- package/wrangler.toml +0 -19
|
@@ -1,113 +0,0 @@
|
|
|
1
|
-
/**
 * Collapses every run of whitespace in `value` into a single space and removes
 * leading and trailing whitespace.
 *
 * @param value Text to normalize.
 * @returns The normalized single-line string.
 */
const normalizeLine = (value) => value.split(/\s+/).filter(Boolean).join(" ");
|
|
2
|
-
/**
 * Builds a short excerpt: the whitespace-normalized text cut to at most 160 characters.
 *
 * @param value Source text.
 * @returns The first 160 characters of the normalized text.
 */
const excerptFor = (value) => {
    const normalized = normalizeLine(value);
    return normalized.substring(0, 160);
};
|
|
3
|
-
/**
 * Reports whether `value` ends like a table-of-contents row: digits preceded by
 * a dot leader (two or more dots), two or more whitespace characters, or a tab.
 *
 * @param value Line to inspect.
 * @returns `true` when the line has such a suffix.
 */
const hasTocSuffix = (value) => {
    const tocSuffixPattern = /(?:\.{2,}|\s{2,}|\t)\d+$/;
    return tocSuffixPattern.test(value);
};
|
|
4
|
-
/**
 * Reports whether `value` ends with a whitespace character followed by digits.
 *
 * @param value Line to inspect.
 * @returns `true` when the line ends with a whitespace-separated number.
 */
const hasTrailingPageNumber = (value) => {
    const trailingNumberPattern = /\s\d+$/;
    return trailingNumberPattern.test(value);
};
|
|
5
|
-
/**
 * Reports whether `value` is a table-of-contents title: after whitespace
 * normalization and lower-casing it must equal "contents",
 * "table of contents" or "目录".
 *
 * @param value Line to inspect.
 * @returns `true` when the line is a contents heading.
 */
const isContentsHeading = (value) => {
    const candidate = normalizeLine(value).toLowerCase();
    return ["contents", "table of contents", "目录"].includes(candidate);
};
|
|
9
|
-
/**
 * Classifies a single line of page text as a section heading.
 *
 * Three shapes are recognized, tried in this order:
 *   1. numbered headings such as "2.1 Overview" (level = number of dotted parts);
 *   2. Chinese headings such as "第3章 ..." (节 is level 2, 章 and 部分 are level 1);
 *   3. English headings starting with Chapter / Section / Part / Appendix
 *      (Section is level 2, the others are level 1).
 *
 * A line that matches the numbered shape but fails any of its plausibility
 * checks is rejected outright; it is not retried against the other shapes.
 *
 * @param line Raw line of text.
 * @returns `{ title, level }` for a heading, or `null` when the line is not one.
 */
const detectHeading = (line) => {
    const text = normalizeLine(line);
    // Empty lines, lines over 120 characters and table-of-contents rows are never headings.
    if (text.length === 0 || text.length > 120 || hasTocSuffix(text)) {
        return null;
    }
    const numberedMatch = /^(\d+(?:\.\d+){0,3})\s+(.+)$/.exec(text);
    if (numberedMatch !== null) {
        const numberPath = numberedMatch[1] || "";
        const segments = numberPath.split(".");
        const level = segments.length;
        const topLevelNumber = Number.parseInt(segments[0] || "", 10);
        const title = normalizeLine(numberedMatch[2] || "");
        // Heuristics that separate real headings from numbered body text,
        // measurements, list items and table cells.
        const rejected = title.length < 2
            || hasTrailingPageNumber(text)
            // Title must start with a Latin letter, a CJK ideograph or an opening parenthesis.
            || !/^[A-Za-z\u4E00-\u9FFF第((]/.test(title)
            // "2 GHz", "5 V", ... are quantities, not headings.
            || /^(GHz|MHz|Kbps|Mbps|Hz|kHz|mA|V|W)\b/i.test(title)
            // Sentence-like endings.
            || /[。;;::]$/.test(title)
            || (Number.isFinite(topLevelNumber) && topLevelNumber > 20)
            // Bare identifiers such as "AB12".
            || /^[A-Z]+\d+$/.test(title)
            || (level === 1 && title.length > 40)
            || (level === 1 && /[,,×—]/.test(title));
        if (rejected) {
            return null;
        }
        return {
            title: `${numberPath} ${title}`.trim(),
            level,
        };
    }
    const chineseMatch = /^(第[0-9一二三四五六七八九十百]+)(章|节|部分)\s+(.+)$/.exec(text);
    if (chineseMatch !== null) {
        const unit = chineseMatch[2] || "";
        return {
            title: text,
            level: unit === "节" ? 2 : 1,
        };
    }
    const englishMatch = /^(Chapter|Section|Part|Appendix)\b[:\s-]*(.+)?$/i.exec(text);
    if (englishMatch !== null) {
        const keyword = englishMatch[1] || "";
        return {
            title: text,
            level: /section/i.test(keyword) ? 2 : 1,
        };
    }
    return null;
};
|
|
63
|
-
/**
 * Recursively shallow-copies a section node and all of its descendants, so the
 * returned tree shares no node objects with the tree used while building.
 *
 * @param node Node with a `children` array.
 * @returns A copy of `node` whose `children` are copies as well.
 */
const toReadonlyTree = (node) => {
    const children = node.children.map((child) => toReadonlyTree(child));
    return { ...node, children };
};
|
|
67
|
-
/**
 * Builds a nested section outline from per-page text.
 *
 * Each page's text is split into lines and every line is run through
 * `detectHeading`. Detected headings become section nodes that are nested by
 * heading level. A page whose first non-empty line is a contents heading
 * contributes no sections, and a heading (same level and title) is emitted
 * only the first time it is seen.
 *
 * @param pages Iterable of `{ text, pageNumber, artifactPath }` records.
 * @returns The top-level section nodes, each with nested `children`.
 */
export const buildSemanticSectionTree = (pages) => {
    const roots = [];
    // Chain of sections that are still "open", outermost first.
    const openSections = [];
    const seenHeadings = new Set();
    let sectionCounter = 0;
    for (const page of pages) {
        const lines = page.text
            .split(/\r?\n/)
            .map((raw) => normalizeLine(raw))
            .filter((entry) => entry.length > 0);
        // Skip blank pages and table-of-contents pages entirely.
        if (lines.length === 0 || isContentsHeading(lines[0] || "")) {
            continue;
        }
        for (const line of lines) {
            const heading = detectHeading(line);
            if (!heading) {
                continue;
            }
            const headingKey = `${heading.level}:${heading.title}`;
            if (seenHeadings.has(headingKey)) {
                continue;
            }
            seenHeadings.add(headingKey);
            sectionCounter += 1;
            const section = {
                id: `section-${sectionCounter}`,
                type: "section",
                title: heading.title,
                level: heading.level,
                pageNumber: page.pageNumber,
                pageArtifactPath: page.artifactPath,
                excerpt: excerptFor(line),
                children: [],
            };
            // Close every open section at the same or a deeper level; whatever
            // remains on top (if anything) is the parent of the new section.
            let parent = openSections[openSections.length - 1];
            while (parent && (parent.level || 0) >= heading.level) {
                openSections.pop();
                parent = openSections[openSections.length - 1];
            }
            const siblings = parent ? parent.children : roots;
            siblings.push(section);
            openSections.push(section);
        }
    }
    return roots.map((root) => toReadonlyTree(root));
};
|
package/dist/pdf-agent.d.ts
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import type { Env, FileStore } from "./types.js";
|
|
2
|
-
import type { AgentTraceEvent, EchoPdfConfig, PdfOperationRequest } from "./pdf-types.js";
|
|
3
|
-
/** Per-call runtime dependencies handed to the PDF agent entry points. */
interface RuntimeOptions {
    /** Optional callback that receives trace events emitted while an operation runs. */
    readonly trace?: (event: AgentTraceEvent) => void;
    /** Store used to look up previously stored files and to persist new ones. */
    readonly fileStore: FileStore;
}
|
|
7
|
-
/**
 * Resolves the PDF referenced by `input` (an existing `fileId`, a `url` to
 * fetch, or inline `base64` data) and returns its id, filename and raw bytes.
 *
 * @param config Service configuration.
 * @param input Source of the PDF; `filename` is an optional name for new uploads.
 * @param opts Runtime dependencies (file store and optional trace callback).
 */
export declare const ingestPdfFromPayload: (config: EchoPdfConfig, input: {
    readonly fileId?: string;
    readonly url?: string;
    readonly base64?: string;
    readonly filename?: string;
}, opts: RuntimeOptions) => Promise<{
    id: string;
    filename: string;
    bytes: Uint8Array;
}>;
|
|
17
|
-
/** Runs the PDF operation described by `request` and resolves with an operation-specific result object. */
export declare const runPdfAgent: (config: EchoPdfConfig, env: Env, request: PdfOperationRequest, opts: RuntimeOptions) => Promise<unknown>;
|
|
18
|
-
export {};
|
package/dist/pdf-agent.js
DELETED
|
@@ -1,217 +0,0 @@
|
|
|
1
|
-
import { resolveModelForProvider, resolveProviderAlias } from "./agent-defaults.js";
|
|
2
|
-
import { fromBase64, normalizeReturnMode, toDataUrl } from "./file-utils.js";
|
|
3
|
-
import { badRequest, notFound, unprocessable } from "./http-error.js";
|
|
4
|
-
import { extractPdfPageText, getPdfPageCount, renderPdfPageToPng, toBytes } from "./pdfium-engine.js";
|
|
5
|
-
import { visionRecognize } from "./provider-client.js";
|
|
6
|
-
const traceStep = (opts, phase, name, payload, level) => {
|
|
7
|
-
if (!opts.trace)
|
|
8
|
-
return;
|
|
9
|
-
opts.trace({ kind: "step", phase, name, payload, level });
|
|
10
|
-
};
|
|
11
|
-
const ensurePages = (pages, pageCount, maxPages) => {
|
|
12
|
-
if (pages.length === 0)
|
|
13
|
-
throw badRequest("PAGES_REQUIRED", "At least one page is required");
|
|
14
|
-
if (pages.length > maxPages) {
|
|
15
|
-
throw badRequest("TOO_MANY_PAGES", `Page count exceeds maxPagesPerRequest (${maxPages})`, {
|
|
16
|
-
maxPagesPerRequest: maxPages,
|
|
17
|
-
providedPages: pages.length,
|
|
18
|
-
});
|
|
19
|
-
}
|
|
20
|
-
for (const page of pages) {
|
|
21
|
-
if (!Number.isInteger(page) || page < 1 || page > pageCount) {
|
|
22
|
-
throw badRequest("PAGE_OUT_OF_RANGE", `Page ${page} out of range 1..${pageCount}`, {
|
|
23
|
-
page,
|
|
24
|
-
min: 1,
|
|
25
|
-
max: pageCount,
|
|
26
|
-
});
|
|
27
|
-
}
|
|
28
|
-
}
|
|
29
|
-
return [...new Set(pages)].sort((a, b) => a - b);
|
|
30
|
-
};
|
|
31
|
-
export const ingestPdfFromPayload = async (config, input, opts) => {
|
|
32
|
-
if (input.fileId) {
|
|
33
|
-
const existing = await opts.fileStore.get(input.fileId);
|
|
34
|
-
if (!existing) {
|
|
35
|
-
throw notFound("FILE_NOT_FOUND", `File not found: ${input.fileId}`, { fileId: input.fileId });
|
|
36
|
-
}
|
|
37
|
-
return {
|
|
38
|
-
id: existing.id,
|
|
39
|
-
filename: existing.filename,
|
|
40
|
-
bytes: existing.bytes,
|
|
41
|
-
};
|
|
42
|
-
}
|
|
43
|
-
let bytes = null;
|
|
44
|
-
let filename = input.filename ?? "document.pdf";
|
|
45
|
-
if (input.url) {
|
|
46
|
-
traceStep(opts, "start", "file.fetch.url", { url: input.url });
|
|
47
|
-
try {
|
|
48
|
-
bytes = await toBytes(input.url);
|
|
49
|
-
}
|
|
50
|
-
catch (error) {
|
|
51
|
-
throw badRequest("URL_FETCH_FAILED", `Unable to fetch PDF from url: ${error instanceof Error ? error.message : String(error)}`);
|
|
52
|
-
}
|
|
53
|
-
try {
|
|
54
|
-
const u = new URL(input.url);
|
|
55
|
-
filename = decodeURIComponent(u.pathname.split("/").pop() || filename);
|
|
56
|
-
}
|
|
57
|
-
catch {
|
|
58
|
-
// ignore URL parse failure
|
|
59
|
-
}
|
|
60
|
-
traceStep(opts, "end", "file.fetch.url", { sizeBytes: bytes.byteLength });
|
|
61
|
-
}
|
|
62
|
-
else if (input.base64) {
|
|
63
|
-
traceStep(opts, "start", "file.decode.base64");
|
|
64
|
-
bytes = fromBase64(input.base64);
|
|
65
|
-
traceStep(opts, "end", "file.decode.base64", { sizeBytes: bytes.byteLength });
|
|
66
|
-
}
|
|
67
|
-
if (!bytes) {
|
|
68
|
-
throw badRequest("MISSING_FILE_INPUT", "Missing file input. Provide fileId, url or base64");
|
|
69
|
-
}
|
|
70
|
-
if (bytes.byteLength > config.service.maxPdfBytes) {
|
|
71
|
-
throw badRequest("PDF_TOO_LARGE", `PDF exceeds max size (${config.service.maxPdfBytes} bytes)`, {
|
|
72
|
-
maxPdfBytes: config.service.maxPdfBytes,
|
|
73
|
-
sizeBytes: bytes.byteLength,
|
|
74
|
-
});
|
|
75
|
-
}
|
|
76
|
-
const meta = await opts.fileStore.put({
|
|
77
|
-
filename,
|
|
78
|
-
mimeType: "application/pdf",
|
|
79
|
-
bytes,
|
|
80
|
-
});
|
|
81
|
-
traceStep(opts, "end", "file.stored", { fileId: meta.id, sizeBytes: meta.sizeBytes });
|
|
82
|
-
return {
|
|
83
|
-
id: meta.id,
|
|
84
|
-
filename: meta.filename,
|
|
85
|
-
bytes,
|
|
86
|
-
};
|
|
87
|
-
};
|
|
88
|
-
/** Maps the request's `returnMode` value through `normalizeReturnMode`. */
const resolveReturnMode = (value) => {
    return normalizeReturnMode(value);
};
|
|
89
|
-
/**
 * Removes a single Markdown code fence that wraps the whole text.
 *
 * The text is trimmed first. If it then consists of exactly one fenced block
 * (an opening fence with an optional language tag and a closing fence), the
 * trimmed fence body is returned; otherwise the trimmed text is returned unchanged.
 * LF and CRLF line endings are both accepted, as are blanks after the language tag.
 *
 * @param value Text that may be wrapped in a code fence.
 * @returns The unwrapped, trimmed text.
 */
const stripCodeFences = (value) => {
    const text = value.trim();
    // `[ \t]*` and `\r?\n` let fences with trailing blanks or Windows line endings match too.
    const fenced = text.match(/^```[a-zA-Z0-9_-]*[ \t]*\r?\n([\s\S]*?)\r?\n```$/);
    return typeof fenced?.[1] === "string" ? fenced[1].trim() : text;
};
|
|
94
|
-
/**
 * Extracts every `\begin{tabular} ... \end{tabular}` block from model output.
 *
 * A wrapping code fence is removed first. The blocks are trimmed and joined
 * with a blank line between them.
 *
 * @param value Raw model output.
 * @returns The joined tabular blocks, or an empty string when there are none.
 */
const extractTabularLatex = (value) => {
    const tabularBlocks = stripCodeFences(value).match(/\\begin\{tabular\}[\s\S]*?\\end\{tabular\}/g) ?? [];
    return tabularBlocks.map((block) => block.trim()).join("\n\n");
};
|
|
101
|
-
/**
 * Turns one rendered page into the image entry returned by `extract_pages`.
 *
 * For the "file_id" and "url" return modes the PNG is stored in
 * `opts.fileStore` and referenced by id (plus a download URL for "url").
 * Any other mode inlines the PNG as a data URL.
 */
const toPageImage = async (opts, file, page, png, returnMode) => {
    if (returnMode !== "file_id" && returnMode !== "url") {
        return { page, mimeType: "image/png", data: toDataUrl(png, "image/png") };
    }
    const stored = await opts.fileStore.put({
        filename: `${file.filename}-p${page}.png`,
        mimeType: "image/png",
        bytes: png,
    });
    const image = { page, mimeType: "image/png", fileId: stored.id };
    return returnMode === "url"
        ? { ...image, url: `/api/files/get?fileId=${encodeURIComponent(stored.id)}` }
        : image;
};
/**
 * Executes one PDF operation end to end: ingests the document, validates the
 * requested pages and then runs the operation page by page.
 *
 * - "extract_pages": renders each page to PNG and returns
 *   `{ fileId, pageCount, returnMode, images }`.
 * - "ocr_pages": renders each page, asks the vision model for its text and
 *   returns `{ fileId, pageCount, provider, model, pages: [{ page, text }] }`.
 *   PDF text extraction is used only as a fallback when the model returns nothing.
 * - any other operation value: treated as table extraction, returning
 *   `{ fileId, pageCount, provider, model, pages: [{ page, latex }] }`.
 *
 * @param config Service configuration.
 * @param env Runtime environment, forwarded to the provider client.
 * @param request Operation request (file source, pages, operation, optional
 *   provider / model / prompt / renderScale / returnMode / providerApiKeys).
 * @param opts Runtime options (`fileStore`, optional `trace`).
 * @throws MODEL_REQUIRED when OCR or table extraction has no model,
 *   TABLE_LATEX_MISSING when a page yields no LaTeX tabular, plus any error
 *   raised while ingesting the file or validating the pages.
 */
export const runPdfAgent = async (config, env, request, opts) => {
    traceStep(opts, "start", "pdf.operation", { operation: request.operation });
    const file = await ingestPdfFromPayload(config, request, opts);
    const pageCount = await getPdfPageCount(config, file.bytes);
    traceStep(opts, "log", "pdf.meta", { fileId: file.id, pageCount });
    const pages = ensurePages(request.pages, pageCount, config.service.maxPagesPerRequest);
    const scale = request.renderScale ?? config.service.defaultRenderScale;
    const returnMode = resolveReturnMode(request.returnMode);
    // `pages` are 1-based; the PDFium helpers take 0-based page indexes.
    const renderPage = (page) => renderPdfPageToPng(config, file.bytes, page - 1, scale);
    if (request.operation === "extract_pages") {
        const images = [];
        for (const page of pages) {
            traceStep(opts, "start", "render.page", { page });
            const rendered = await renderPage(page);
            images.push(await toPageImage(opts, file, page, rendered.png, returnMode));
            traceStep(opts, "end", "render.page", { page, width: rendered.width, height: rendered.height });
        }
        const result = { fileId: file.id, pageCount, returnMode, images };
        traceStep(opts, "end", "pdf.operation", { operation: request.operation });
        return result;
    }
    const providerAlias = resolveProviderAlias(config, request.provider);
    const model = resolveModelForProvider(config, providerAlias, request.model);
    if (!model) {
        throw badRequest("MODEL_REQUIRED", "model is required for OCR or table extraction; set agent.defaultModel");
    }
    const recognize = (prompt, imageDataUrl) => visionRecognize({
        config,
        env,
        providerAlias,
        model,
        prompt,
        imageDataUrl,
        runtimeApiKeys: request.providerApiKeys,
    });
    if (request.operation === "ocr_pages") {
        // The prompt does not depend on the page, so resolve it once.
        const prompt = request.prompt?.trim() || config.agent.ocrPrompt;
        const results = [];
        for (const page of pages) {
            traceStep(opts, "start", "ocr.page", { page });
            const rendered = await renderPage(page);
            const llmText = await recognize(prompt, toDataUrl(rendered.png, "image/png"));
            // Only fall back to the PDF's embedded text when the model returned nothing;
            // this avoids re-parsing the document for every page that the model handled.
            const text = stripCodeFences(llmText || (await extractPdfPageText(config, file.bytes, page - 1)) || "");
            results.push({ page, text });
            traceStep(opts, "end", "ocr.page", { page, chars: text.length });
        }
        const result = {
            fileId: file.id,
            pageCount,
            provider: providerAlias,
            model,
            pages: results,
        };
        traceStep(opts, "end", "pdf.operation", { operation: request.operation });
        return result;
    }
    // NOTE: every operation other than "extract_pages" and "ocr_pages" reaches this
    // table-extraction path; the operation value is not checked again here.
    const prompt = request.prompt?.trim() || config.agent.tablePrompt;
    const tables = [];
    for (const page of pages) {
        traceStep(opts, "start", "table.page", { page });
        const rendered = await renderPage(page);
        const rawLatex = await recognize(prompt, toDataUrl(rendered.png, "image/png"));
        const latex = extractTabularLatex(rawLatex);
        if (!latex) {
            throw unprocessable("TABLE_LATEX_MISSING", `table extraction did not return valid LaTeX tabular for page ${page}`, {
                page,
            });
        }
        tables.push({ page, latex });
        traceStep(opts, "end", "table.page", { page, chars: latex.length });
    }
    const result = {
        fileId: file.id,
        pageCount,
        provider: providerAlias,
        model,
        pages: tables,
    };
    traceStep(opts, "end", "pdf.operation", { operation: request.operation });
    return result;
};
|
package/dist/pdf-storage.d.ts
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
import type { EchoPdfConfig } from "./pdf-types.js";
|
|
2
|
-
import type { Env, FileStore } from "./types.js";
|
|
3
|
-
/** A file store together with its maintenance operations, as returned by `getRuntimeFileStore`. */
export interface RuntimeFileStoreBundle {
    /** The store used to put, get, list and delete files. */
    readonly store: FileStore;
    /** Resolves with a backend-specific usage report. */
    stats: () => Promise<unknown>;
    /** Runs the backend's cleanup and resolves with a backend-specific report. */
    cleanup: () => Promise<unknown>;
}
|
|
8
|
-
/** Selects the file store backend for the given environment bindings and wraps it with `stats` and `cleanup`. */
export declare const getRuntimeFileStore: (env: Env, config: EchoPdfConfig) => RuntimeFileStoreBundle;
|
package/dist/pdf-storage.js
DELETED
|
@@ -1,86 +0,0 @@
|
|
|
1
|
-
import { DurableObjectFileStore } from "./file-store-do.js";
|
|
2
|
-
import { R2FileStore } from "./r2-file-store.js";
|
|
3
|
-
class InMemoryFileStore {
|
|
4
|
-
store = new Map();
|
|
5
|
-
async put(input) {
|
|
6
|
-
const id = crypto.randomUUID();
|
|
7
|
-
const record = {
|
|
8
|
-
id,
|
|
9
|
-
filename: input.filename,
|
|
10
|
-
mimeType: input.mimeType,
|
|
11
|
-
sizeBytes: input.bytes.byteLength,
|
|
12
|
-
createdAt: new Date().toISOString(),
|
|
13
|
-
bytes: input.bytes,
|
|
14
|
-
};
|
|
15
|
-
this.store.set(id, record);
|
|
16
|
-
return this.toMeta(record);
|
|
17
|
-
}
|
|
18
|
-
async get(fileId) {
|
|
19
|
-
return this.store.get(fileId) ?? null;
|
|
20
|
-
}
|
|
21
|
-
async list() {
|
|
22
|
-
return [...this.store.values()].map((record) => this.toMeta(record));
|
|
23
|
-
}
|
|
24
|
-
async delete(fileId) {
|
|
25
|
-
return this.store.delete(fileId);
|
|
26
|
-
}
|
|
27
|
-
toMeta(record) {
|
|
28
|
-
return {
|
|
29
|
-
id: record.id,
|
|
30
|
-
filename: record.filename,
|
|
31
|
-
mimeType: record.mimeType,
|
|
32
|
-
sizeBytes: record.sizeBytes,
|
|
33
|
-
createdAt: record.createdAt,
|
|
34
|
-
};
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
// Module-wide in-memory store, returned by getRuntimeFileStore when neither FILE_STORE_BUCKET nor FILE_STORE_DO is bound.
const fallbackStore = new InMemoryFileStore();
|
|
38
|
-
// Largest service.storage.maxFileBytes (in bytes) that getRuntimeFileStore accepts for the Durable Object backend.
const DO_SAFE_MAX_FILE_BYTES = 1_200_000;
|
|
39
|
-
/**
 * Picks the file store backend for the current environment.
 *
 * Preference order: the R2 bucket binding (`env.FILE_STORE_BUCKET`), then the
 * Durable Object binding (`env.FILE_STORE_DO`), then the module-wide in-memory
 * fallback store.
 *
 * @param env Environment bindings.
 * @param config Service configuration (reads `service.storage`).
 * @returns `{ store, stats, cleanup }` for the selected backend.
 * @throws Error when the Durable Object backend is selected and
 *   `service.storage.maxFileBytes` exceeds `DO_SAFE_MAX_FILE_BYTES`.
 */
export const getRuntimeFileStore = (env, config) => {
    // R2 and Durable Object stores implement stats/cleanup themselves.
    const delegatingBundle = (store) => ({
        store,
        stats: async () => store.stats(),
        cleanup: async () => store.cleanup(),
    });
    if (env.FILE_STORE_BUCKET) {
        return delegatingBundle(new R2FileStore(env.FILE_STORE_BUCKET, config.service.storage));
    }
    if (env.FILE_STORE_DO) {
        const { maxFileBytes } = config.service.storage;
        if (maxFileBytes > DO_SAFE_MAX_FILE_BYTES) {
            throw new Error(`service.storage.maxFileBytes=${maxFileBytes} exceeds DO backend limit ${DO_SAFE_MAX_FILE_BYTES}; bind FILE_STORE_BUCKET (R2) or reduce maxFileBytes`);
        }
        return delegatingBundle(new DurableObjectFileStore(env.FILE_STORE_DO, config.service.storage));
    }
    // Usage summary of the in-memory fallback store.
    const memoryUsage = async () => {
        const files = await fallbackStore.list();
        let totalBytes = 0;
        for (const file of files) {
            totalBytes += file.sizeBytes;
        }
        return { fileCount: files.length, totalBytes };
    };
    return {
        store: fallbackStore,
        stats: async () => {
            const usage = await memoryUsage();
            return {
                backend: "memory",
                policy: config.service.storage,
                stats: usage,
            };
        },
        // The in-memory store never expires or evicts files, so cleanup only reports usage.
        cleanup: async () => {
            const usage = await memoryUsage();
            return {
                backend: "memory",
                deletedExpired: 0,
                deletedEvicted: 0,
                stats: usage,
            };
        },
    };
};
|
package/dist/pdfium-engine.d.ts
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
import type { EchoPdfConfig } from "./pdf-types.js";
|
|
2
|
-
/** Loads the PDF in `bytes` with PDFium and resolves with its number of pages. */
export declare const getPdfPageCount: (config: EchoPdfConfig, bytes: Uint8Array) => Promise<number>;
|
|
3
|
-
/**
 * Renders one page of the PDF in `bytes` to a PNG image.
 *
 * @param config Service configuration.
 * @param bytes Raw PDF bytes.
 * @param pageIndex Zero-based page index.
 * @param scale Multiplier applied to the page width and height; defaults to
 *   `config.service.defaultRenderScale`.
 * @returns The pixel dimensions of the rendered image and the PNG bytes.
 */
export declare const renderPdfPageToPng: (config: EchoPdfConfig, bytes: Uint8Array, pageIndex: number, scale?: number) => Promise<{
    width: number;
    height: number;
    png: Uint8Array;
}>;
|
|
8
|
-
/** Resolves with the trimmed text PDFium extracts from the page at zero-based `pageIndex`, or "" when the page has no text. */
export declare const extractPdfPageText: (config: EchoPdfConfig, bytes: Uint8Array, pageIndex: number) => Promise<string>;
|
|
9
|
-
/** Fetches the URL in `value` and resolves with PDF bytes, following the first `.pdf` link when the response is not itself a PDF. */
export declare const toBytes: (value: string) => Promise<Uint8Array>;
|
package/dist/pdfium-engine.js
DELETED
|
@@ -1,180 +0,0 @@
|
|
|
1
|
-
import { init } from "@embedpdf/pdfium";
|
|
2
|
-
import { encode as encodePng } from "@cf-wasm/png";
|
|
3
|
-
// Cached PDFium module returned by init(); set by ensurePdfium on first use and reused afterwards.
let moduleInstance = null;
|
|
4
|
-
// Set to true by ensurePdfium once FPDF_InitLibrary() has been called, so it is not called again.
let libraryInitialized = false;
|
|
5
|
-
/**
 * Wraps `value` in a `Uint8Array` using the `Uint8Array` constructor.
 * In this file it is called with the `ArrayBuffer` of a fetch response.
 *
 * @param value Argument passed to the `Uint8Array` constructor.
 * @returns The resulting `Uint8Array`.
 */
const toUint8 = (value) => {
    return new Uint8Array(value);
};
|
|
6
|
-
// Shared decoder with the default (UTF-8) encoding; toBytes uses it to read the "%PDF-" signature and to decode HTML pages.
const textDecoder = new TextDecoder();
|
|
7
|
-
const isWorkerdRuntime = () => typeof globalThis.WebSocketPair === "function";
|
|
8
|
-
const ensureWasmFunctionShim = () => {
|
|
9
|
-
const wasmApi = WebAssembly;
|
|
10
|
-
if (typeof wasmApi.Function === "function")
|
|
11
|
-
return;
|
|
12
|
-
wasmApi.Function = (_sig, fn) => fn;
|
|
13
|
-
};
|
|
14
|
-
// In-flight (or completed) module load, shared by concurrent ensurePdfium calls.
let pdfiumInitPromise = null;
/**
 * Loads the PDFium wasm module.
 *
 * On workerd the wasm is imported as a precompiled `WebAssembly.Module` and
 * instantiated synchronously. Everywhere else, or when that import does not
 * yield a module, the binary is fetched from `config.pdfium.wasmUrl`.
 */
const loadPdfiumModule = async (config) => {
    if (isWorkerdRuntime()) {
        const wasmModuleImport = await import("@embedpdf/pdfium/pdfium.wasm");
        const maybeModule = wasmModuleImport.default ?? wasmModuleImport;
        if (maybeModule instanceof WebAssembly.Module) {
            const loaded = await init({
                instantiateWasm: (imports, successCallback) => {
                    const instance = new WebAssembly.Instance(maybeModule, imports);
                    successCallback(instance, maybeModule);
                    return instance.exports;
                },
            });
            if (loaded) {
                return loaded;
            }
        }
    }
    const response = await fetch(config.pdfium.wasmUrl);
    if (!response.ok) {
        // Fail with a clear message instead of handing an error page to the wasm loader.
        throw new Error(`Failed to fetch PDFium wasm: HTTP ${response.status}`);
    }
    return init({ wasmBinary: await response.arrayBuffer() });
};
/**
 * Returns the shared, initialized PDFium module, loading it on first use.
 *
 * Concurrent first calls share a single load, so the module is created once
 * and `FPDF_InitLibrary()` runs exactly once on the instance every caller
 * receives. A failed load is not cached; the next call retries.
 *
 * @param config Service configuration (reads `pdfium.wasmUrl` when fetching).
 * @returns The initialized PDFium module.
 */
const ensurePdfium = async (config) => {
    ensureWasmFunctionShim();
    if (!moduleInstance) {
        if (!pdfiumInitPromise) {
            pdfiumInitPromise = (async () => {
                try {
                    moduleInstance = await loadPdfiumModule(config);
                }
                catch (error) {
                    // Allow a later call to retry after a failed load.
                    pdfiumInitPromise = null;
                    throw error;
                }
            })();
        }
        await pdfiumInitPromise;
    }
    if (!libraryInitialized) {
        moduleInstance.FPDF_InitLibrary();
        libraryInitialized = true;
    }
    return moduleInstance;
};
|
|
41
|
-
const makeDoc = (pdfium, bytes) => {
|
|
42
|
-
const memPtr = pdfium.pdfium.wasmExports.malloc(bytes.length);
|
|
43
|
-
pdfium.pdfium.HEAPU8.set(bytes, memPtr);
|
|
44
|
-
const doc = pdfium.FPDF_LoadMemDocument(memPtr, bytes.length, "");
|
|
45
|
-
if (!doc) {
|
|
46
|
-
pdfium.pdfium.wasmExports.free(memPtr);
|
|
47
|
-
throw new Error("Failed to load PDF document");
|
|
48
|
-
}
|
|
49
|
-
return { doc, memPtr };
|
|
50
|
-
};
|
|
51
|
-
const closeDoc = (pdfium, doc, memPtr) => {
|
|
52
|
-
pdfium.FPDF_CloseDocument(doc);
|
|
53
|
-
pdfium.pdfium.wasmExports.free(memPtr);
|
|
54
|
-
};
|
|
55
|
-
/**
 * Converts 4-byte pixels from B,G,R,A order to R,G,B,A order.
 *
 * @param bgra Pixel bytes in BGRA order.
 * @returns A new `Uint8Array` of the same length in RGBA order. Channels
 *   missing from a trailing partial pixel are read as 0 (alpha as 255).
 */
const bgraToRgba = (bgra) => {
    const rgba = new Uint8Array(bgra.length);
    for (let offset = 0; offset < bgra.length; offset += 4) {
        const blue = bgra[offset] ?? 0;
        const green = bgra[offset + 1] ?? 0;
        const red = bgra[offset + 2] ?? 0;
        const alpha = bgra[offset + 3] ?? 255;
        rgba[offset] = red;
        rgba[offset + 1] = green;
        rgba[offset + 2] = blue;
        rgba[offset + 3] = alpha;
    }
    return rgba;
};
|
|
65
|
-
const decodeUtf16Le = (buf) => {
|
|
66
|
-
const view = new Uint16Array(buf.buffer, buf.byteOffset, Math.floor(buf.byteLength / 2));
|
|
67
|
-
const chars = [];
|
|
68
|
-
for (const code of view) {
|
|
69
|
-
if (code === 0)
|
|
70
|
-
break;
|
|
71
|
-
chars.push(code);
|
|
72
|
-
}
|
|
73
|
-
return String.fromCharCode(...chars);
|
|
74
|
-
};
|
|
75
|
-
/**
 * Opens the PDF and returns its page count; the document is closed again
 * before the promise settles.
 *
 * @param config Service configuration, used to load PDFium.
 * @param bytes Raw PDF bytes.
 * @returns The number of pages reported by `FPDF_GetPageCount`.
 */
export const getPdfPageCount = async (config, bytes) => {
    const pdfium = await ensurePdfium(config);
    const handle = makeDoc(pdfium, bytes);
    try {
        return pdfium.FPDF_GetPageCount(handle.doc);
    }
    finally {
        closeDoc(pdfium, handle.doc, handle.memPtr);
    }
};
|
|
85
|
-
/**
 * Renders one page of a PDF to a PNG image.
 *
 * The page size reported by PDFium is multiplied by `scale`, rounded, and
 * clamped to at least 1 pixel per side. The bitmap, page and document are
 * released again whether or not rendering succeeds.
 *
 * @param config Service configuration, used to load PDFium.
 * @param bytes Raw PDF bytes.
 * @param pageIndex Page index passed to `FPDF_LoadPage`.
 * @param scale Size multiplier; defaults to `config.service.defaultRenderScale`.
 * @returns `{ width, height, png }` - pixel dimensions and the encoded PNG.
 * @throws Error when the page cannot be loaded or the bitmap cannot be created.
 */
export const renderPdfPageToPng = async (config, bytes, pageIndex, scale = config.service.defaultRenderScale) => {
    const pdfium = await ensurePdfium(config);
    const { doc, memPtr } = makeDoc(pdfium, bytes);
    const toPixels = (pageUnits) => Math.max(1, Math.round(pageUnits * scale));
    // 0 means "not acquired yet"; the finally block only releases non-zero handles.
    let pageHandle = 0;
    let bitmapHandle = 0;
    try {
        pageHandle = pdfium.FPDF_LoadPage(doc, pageIndex);
        if (!pageHandle) {
            throw new Error(`Failed to load page ${pageIndex}`);
        }
        const width = toPixels(pdfium.FPDF_GetPageWidthF(pageHandle));
        const height = toPixels(pdfium.FPDF_GetPageHeightF(pageHandle));
        bitmapHandle = pdfium.FPDFBitmap_Create(width, height, 1);
        if (!bitmapHandle) {
            throw new Error("Failed to create bitmap");
        }
        // Fill the whole bitmap with 0xffffffff (presumably opaque white - confirm
        // against the PDFium colour format) before drawing the page on top.
        pdfium.FPDFBitmap_FillRect(bitmapHandle, 0, 0, width, height, 0xffffffff);
        pdfium.FPDF_RenderPageBitmap(bitmapHandle, pageHandle, 0, 0, width, height, 0, 0);
        const stride = pdfium.FPDFBitmap_GetStride(bitmapHandle);
        const bufferPtr = pdfium.FPDFBitmap_GetBuffer(bitmapHandle);
        // slice() copies the pixels out of the wasm heap before the bitmap is destroyed.
        // NOTE(review): the copy is handed to the encoder as width x height RGBA,
        // which assumes stride === width * 4 - confirm.
        const bgraPixels = pdfium.pdfium.HEAPU8.slice(bufferPtr, bufferPtr + stride * height);
        const png = encodePng(bgraToRgba(bgraPixels), width, height);
        return { width, height, png };
    }
    finally {
        if (bitmapHandle) {
            pdfium.FPDFBitmap_Destroy(bitmapHandle);
        }
        if (pageHandle) {
            pdfium.FPDF_ClosePage(pageHandle);
        }
        closeDoc(pdfium, doc, memPtr);
    }
};
|
|
119
|
-
/**
 * Extracts the text of one PDF page using PDFium's text API.
 *
 * The text buffer, text page, page and document are released again whether or
 * not extraction succeeds.
 *
 * @param config Service configuration, used to load PDFium.
 * @param bytes Raw PDF bytes.
 * @param pageIndex Page index passed to `FPDF_LoadPage`.
 * @returns The trimmed page text, or "" when the page has no text page or no characters.
 * @throws Error when the page cannot be loaded.
 */
export const extractPdfPageText = async (config, bytes, pageIndex) => {
    const pdfium = await ensurePdfium(config);
    const { doc, memPtr } = makeDoc(pdfium, bytes);
    // 0 means "not acquired yet"; the finally block only releases non-zero handles.
    let pageHandle = 0;
    let textHandle = 0;
    let textBufferPtr = 0;
    try {
        pageHandle = pdfium.FPDF_LoadPage(doc, pageIndex);
        if (!pageHandle) {
            throw new Error(`Failed to load page ${pageIndex}`);
        }
        textHandle = pdfium.FPDFText_LoadPage(pageHandle);
        const charCount = textHandle ? pdfium.FPDFText_CountChars(textHandle) : 0;
        if (charCount <= 0) {
            return "";
        }
        // Two bytes per UTF-16 code unit, plus room for one extra (terminating) unit.
        const bufferBytes = (charCount + 1) * 2;
        textBufferPtr = pdfium.pdfium.wasmExports.malloc(bufferBytes);
        pdfium.FPDFText_GetText(textHandle, 0, charCount, textBufferPtr);
        // slice() copies the text out of the wasm heap before the buffer is freed.
        const utf16Bytes = pdfium.pdfium.HEAPU8.slice(textBufferPtr, textBufferPtr + bufferBytes);
        return decodeUtf16Le(utf16Bytes).trim();
    }
    finally {
        if (textBufferPtr) {
            pdfium.pdfium.wasmExports.free(textBufferPtr);
        }
        if (textHandle) {
            pdfium.FPDFText_ClosePage(textHandle);
        }
        if (pageHandle) {
            pdfium.FPDF_ClosePage(pageHandle);
        }
        closeDoc(pdfium, doc, memPtr);
    }
};
|
|
153
|
-
/**
 * Downloads a PDF from a URL.
 *
 * If the response is a PDF (by `content-type` header or by the "%PDF-"
 * signature) its bytes are returned directly. Otherwise the body is treated as
 * HTML, the first absolute http(s) link containing ".pdf" is extracted and
 * fetched, and that second response must start with the "%PDF-" signature.
 *
 * NOTE(review): both the given URL and the link found in the page are fetched
 * without any host restriction visible here - confirm that callers guard
 * against requests to internal addresses.
 *
 * @param value URL to fetch.
 * @returns The PDF bytes.
 * @throws Error when a fetch returns a non-2xx status, no PDF link is found,
 *   or the linked file is not a PDF.
 */
export const toBytes = async (value) => {
    const response = await fetch(value);
    if (!response.ok) {
        throw new Error(`Failed to fetch source: HTTP ${response.status}`);
    }
    const contentType = (response.headers.get("content-type") ?? "").toLowerCase();
    const bytes = toUint8(await response.arrayBuffer());
    const signature = textDecoder.decode(bytes.subarray(0, Math.min(8, bytes.length)));
    if (contentType.includes("application/pdf") || signature.startsWith("%PDF-")) {
        return bytes;
    }
    // Not a PDF: look for a PDF link inside the (presumably HTML) page.
    const html = textDecoder.decode(bytes);
    const pdfMatch = html.match(/https?:\/\/[^"' )]+\.pdf[^"' )]*/i);
    if (!pdfMatch || pdfMatch.length === 0) {
        throw new Error("URL does not point to a PDF and no PDF link was found in the page");
    }
    // Links copied out of HTML attributes carry "&amp;" for "&" in query strings;
    // decode it so the request uses the real URL.
    const resolvedUrl = pdfMatch[0].replace(/&amp;/g, "&");
    const pdfResponse = await fetch(resolvedUrl);
    if (!pdfResponse.ok) {
        throw new Error(`Failed to fetch resolved PDF url: HTTP ${pdfResponse.status}`);
    }
    const pdfBytes = toUint8(await pdfResponse.arrayBuffer());
    const pdfSignature = textDecoder.decode(pdfBytes.subarray(0, Math.min(8, pdfBytes.length)));
    if (!pdfSignature.startsWith("%PDF-")) {
        throw new Error("Resolved file is not a valid PDF");
    }
    return pdfBytes;
};
|
package/dist/r2-file-store.d.ts
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
import type { StoragePolicy } from "./pdf-types.js";
|
|
2
|
-
import type { FileStore, R2Bucket, StoredFileMeta, StoredFileRecord } from "./types.js";
|
|
3
|
-
/** `FileStore` implementation constructed from an R2 bucket binding and a storage policy. */
export declare class R2FileStore implements FileStore {
    private readonly bucket;
    private readonly policy;
    /**
     * @param bucket R2 bucket binding.
     * @param policy Storage policy for this store.
     */
    constructor(bucket: R2Bucket, policy: StoragePolicy);
    /** Stores a file and resolves with its metadata. */
    put(input: {
        readonly filename: string;
        readonly mimeType: string;
        readonly bytes: Uint8Array;
    }): Promise<StoredFileMeta>;
    /** Resolves with the stored record for `fileId`, or `null`. */
    get(fileId: string): Promise<StoredFileRecord | null>;
    /** Resolves with the metadata of the stored files. */
    list(): Promise<readonly StoredFileMeta[]>;
    /** Deletes `fileId` and resolves with a boolean result. */
    delete(fileId: string): Promise<boolean>;
    /** Resolves with a usage report; its shape is not declared here. */
    stats(): Promise<unknown>;
    /** Runs cleanup and resolves with a report; its shape is not declared here. */
    cleanup(): Promise<unknown>;
    private cleanupInternal;
    private pickEvictions;
    private listAllFiles;
}
|