@echofiles/echo-pdf 0.11.2 → 0.11.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/bin/echo-pdf.js +2 -2
- package/dist/local/semantic.js +70 -3
- package/dist/local/shared.d.ts +4 -0
- package/dist/local/shared.js +90 -0
- package/package.json +9 -8
- package/scripts/check-runtime.sh +2 -2
- package/scripts/smoke.sh +1 -1
package/README.md
CHANGED
|
@@ -187,12 +187,12 @@ Published docs site:
|
|
|
187
187
|
## Development
|
|
188
188
|
|
|
189
189
|
```bash
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
190
|
+
bun install --frozen-lockfile
|
|
191
|
+
bun run build
|
|
192
|
+
bun run typecheck
|
|
193
|
+
bun run test:unit
|
|
194
|
+
bun run test:acceptance
|
|
195
|
+
bun run test:integration
|
|
196
196
|
```
|
|
197
197
|
|
|
198
198
|
For source-checkout CLI development and repo-local workflows, see [docs/DEVELOPMENT.md](./docs/DEVELOPMENT.md).
|
package/bin/echo-pdf.js
CHANGED
|
@@ -203,13 +203,13 @@ const loadLocalDocumentApi = async () => {
|
|
|
203
203
|
}
|
|
204
204
|
throw new Error(
|
|
205
205
|
"Internal source-checkout CLI dev mode requires Bun and src/local/index.ts. " +
|
|
206
|
-
"Use `
|
|
206
|
+
"Use `bun run cli:dev -- <primitive> ...` only from a source checkout."
|
|
207
207
|
)
|
|
208
208
|
}
|
|
209
209
|
if (!fs.existsSync(LOCAL_DOCUMENT_DIST_PATH)) {
|
|
210
210
|
throw new Error(
|
|
211
211
|
"Local primitive commands require built artifacts in a source checkout. " +
|
|
212
|
-
"Run `
|
|
212
|
+
"Run `bun run build` first, use the internal `bun run cli:dev -- <primitive> ...` path in this repo, or install the published package."
|
|
213
213
|
)
|
|
214
214
|
}
|
|
215
215
|
return import(LOCAL_DOCUMENT_DIST_ENTRY.href)
|
package/dist/local/semantic.js
CHANGED
|
@@ -5,7 +5,7 @@ import { resolveModelForProvider, resolveProviderAlias } from "../provider-defau
|
|
|
5
5
|
import { toDataUrl } from "../file-utils.js";
|
|
6
6
|
import { generateText, visionRecognize } from "../provider-client.js";
|
|
7
7
|
import { ensureRenderArtifact, indexDocumentInternal } from "./document.js";
|
|
8
|
-
import { fileExists, matchesSourceSnapshot, matchesStrategyKey, pageLabel, parseJsonObject, readJson, resolveConfig, resolveEnv, writeJson, } from "./shared.js";
|
|
8
|
+
import { fileExists, matchesSourceSnapshot, matchesStrategyKey, pageLabel, parseJsonObject, parseJsonObjectWithRepair, readJson, resolveConfig, resolveEnv, writeJson, } from "./shared.js";
|
|
9
9
|
import { normalizeFigureItems, normalizeUnderstandingFormulas, normalizeUnderstandingTables } from "./understanding.js";
|
|
10
10
|
const resolveSemanticExtractionBudget = (input) => ({
|
|
11
11
|
pageSelection: "all",
|
|
@@ -134,6 +134,15 @@ const resolveSemanticAgentContext = (config, request) => {
|
|
|
134
134
|
}
|
|
135
135
|
return { provider, model };
|
|
136
136
|
};
|
|
137
|
+
/**
 * Error type carrying the machine-readable code
 * "SEMANTIC_AGGREGATION_INVALID_JSON" plus a caller-supplied diagnostic
 * payload.
 *
 * `detail` is stored exactly as given; this class does not inspect it.
 */
class SemanticAggregationModelOutputError extends Error {
    detail;
    code = "SEMANTIC_AGGREGATION_INVALID_JSON";
    /**
     * @param {string} message Human-readable description of the failure.
     * @param {unknown} detail Diagnostic payload exposed as `error.detail`.
     * @param {{ cause?: unknown }} [options] Optional standard Error options;
     *   pass `{ cause }` to keep the underlying error (and its stack) attached.
     */
    constructor(message, detail, options) {
        // Forwarding `options` lets callers chain the original failure via
        // `cause`; when omitted, no `cause` property is created at all.
        super(message, options);
        this.detail = detail;
        this.name = "SemanticAggregationModelOutputError";
    }
}
|
|
137
146
|
const extractCombinedPageData = async (input) => {
|
|
138
147
|
const renderArtifact = await ensureRenderArtifact({
|
|
139
148
|
pdfPath: input.request.pdfPath,
|
|
@@ -170,6 +179,55 @@ const extractCombinedPageData = async (input) => {
|
|
|
170
179
|
},
|
|
171
180
|
};
|
|
172
181
|
};
|
|
182
|
+
/**
 * Builds the follow-up prompt used after a response that was not strict JSON.
 *
 * The result is the regular aggregation prompt for `record` / `candidates`,
 * a blank line, and then a fixed list of corrective instructions, all joined
 * with newlines.
 *
 * @param record Forwarded unchanged to `buildSemanticAggregationPrompt`.
 * @param candidates Forwarded unchanged to `buildSemanticAggregationPrompt`.
 * @returns {string} The combined retry prompt.
 */
const buildSemanticAggregationRetryPrompt = (record, candidates) => {
    const basePrompt = buildSemanticAggregationPrompt(record, candidates);
    const strictJsonInstructions = [
        "Your previous response was not strict JSON.",
        "Return the same semantic structure again, but this time produce strict RFC 8259 JSON only.",
        "Do not wrap in markdown fences.",
        "Do not use invalid backslash escapes such as \\(, \\), \\_, or \\- inside JSON strings.",
    ];
    // The empty entry yields the blank separator line between the two parts.
    const lines = [basePrompt, "", ...strictJsonInstructions];
    return lines.join("\n");
};
|
|
192
|
+
/**
 * Parses the aggregation model's text output, asking the model once more if
 * the first output cannot be parsed even after escape repair.
 *
 * Flow:
 *   1. Parse `input.aggregated` with `parseJsonObjectWithRepair`.
 *   2. If that throws, call `generateText` with the retry prompt and parse
 *      the new output the same way.
 *   3. If the second parse throws as well, raise
 *      `SemanticAggregationModelOutputError`.
 *
 * @param input Object with `aggregated` (first model output), `record`,
 *   `candidates`, `config`, `env`, `provider`, `model` and `runtimeApiKeys`;
 *   everything except `aggregated` is only used to issue the retry call.
 * @returns {Promise<{ sections: unknown, repaired: boolean, retried: boolean }>}
 *   `sections` is the parsed value's `sections` property (undefined when
 *   absent), `repaired` reports whether escape repair was needed, and
 *   `retried` reports whether the second model call produced the result.
 * @throws {SemanticAggregationModelOutputError} When both outputs are
 *   unparseable. The retry parse error is attached as `cause`.
 * @throws Whatever `generateText` throws; that failure is not wrapped here.
 */
const parseSemanticAggregationResponse = async (input) => {
    const messageOf = (error) => (error instanceof Error ? error.message : String(error));
    // Both attempts shape their result identically; only `retried` differs.
    const toResult = (text, retried) => {
        const outcome = parseJsonObjectWithRepair(text);
        return {
            sections: outcome.parsed?.sections,
            repaired: outcome.repaired,
            retried,
        };
    };
    let firstCauseMessage;
    try {
        return toResult(input.aggregated, false);
    }
    catch (firstError) {
        firstCauseMessage = messageOf(firstError);
    }
    const retriedText = await generateText({
        config: input.config,
        env: input.env,
        providerAlias: input.provider,
        model: input.model,
        prompt: buildSemanticAggregationRetryPrompt(input.record, input.candidates),
        runtimeApiKeys: input.runtimeApiKeys,
    });
    try {
        return toResult(retriedText, true);
    }
    catch (retryError) {
        const failure = new SemanticAggregationModelOutputError("semantic aggregation returned invalid JSON after repair and retry", {
            provider: input.provider,
            model: input.model,
            repaired: false,
            retried: true,
            causeMessage: `${firstCauseMessage}; retry=${messageOf(retryError)}`,
        });
        // Keep the underlying parse failure (with its stack) reachable instead
        // of reducing it to a message string only.
        failure.cause = retryError;
        throw failure;
    }
};
|
|
173
231
|
const mergeCrossPageTables = (understandings) => {
|
|
174
232
|
const merged = [];
|
|
175
233
|
let nextId = 1;
|
|
@@ -304,8 +362,17 @@ const ensureSemanticStructureArtifact = async (request) => {
|
|
|
304
362
|
prompt: buildSemanticAggregationPrompt(record, [...candidateMap.values()]),
|
|
305
363
|
runtimeApiKeys: request.providerApiKeys,
|
|
306
364
|
});
|
|
307
|
-
const parsed =
|
|
308
|
-
|
|
365
|
+
const parsed = await parseSemanticAggregationResponse({
|
|
366
|
+
aggregated,
|
|
367
|
+
record,
|
|
368
|
+
candidates: [...candidateMap.values()],
|
|
369
|
+
config,
|
|
370
|
+
env,
|
|
371
|
+
provider,
|
|
372
|
+
model,
|
|
373
|
+
runtimeApiKeys: request.providerApiKeys,
|
|
374
|
+
});
|
|
375
|
+
const sections = toSemanticTree(parsed.sections, pageArtifactPaths);
|
|
309
376
|
const mergedTables = mergeCrossPageTables(pageElements);
|
|
310
377
|
const mergedFormulas = mergeCrossPageFormulas(pageElements);
|
|
311
378
|
const mergedFigures = mergeCrossPageFigures(pageElements);
|
package/dist/local/shared.d.ts
CHANGED
|
@@ -19,6 +19,10 @@ export declare const createPreview: (text: string) => string;
|
|
|
19
19
|
export declare const createPageTitle: (pageNumber: number, text: string) => string;
|
|
20
20
|
export declare const stripCodeFences: (value: string) => string;
|
|
21
21
|
export declare const parseJsonObject: (value: string) => unknown;
|
|
22
|
+
export declare const parseJsonObjectWithRepair: (value: string) => {
|
|
23
|
+
parsed: unknown;
|
|
24
|
+
repaired: boolean;
|
|
25
|
+
};
|
|
22
26
|
export declare const normalizeTableItems: (value: unknown) => LocalTableArtifactItem[];
|
|
23
27
|
export declare const normalizeFormulaItems: (value: unknown) => LocalFormulaArtifactItem[];
|
|
24
28
|
export declare const resolveEnv: (env?: Env) => Env;
|
package/dist/local/shared.js
CHANGED
|
@@ -77,6 +77,96 @@ export const parseJsonObject = (value) => {
|
|
|
77
77
|
throw new Error("model output was not valid JSON");
|
|
78
78
|
}
|
|
79
79
|
};
|
|
80
|
+
/**
 * Tests `value` against the single-character JSON escape set
 * (`"`, `\`, `/`, `b`, `f`, `n`, `r`, `t`). The `u` of a `\uXXXX` escape is
 * deliberately not part of this set.
 *
 * The pattern is unanchored, so any string containing one of these
 * characters matches.
 */
const validJsonEscape = (value) => {
    const simpleEscapeChars = /["\\/bfnrt]/;
    return simpleEscapeChars.test(value);
};
|
|
81
|
+
/**
 * Rewrites backslash sequences that are illegal inside JSON string literals
 * so the text has a chance of passing `JSON.parse`.
 *
 * Only characters between double quotes are examined; text outside string
 * literals is copied through untouched. Inside a string:
 *   - `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t` are kept as-is;
 *   - `\u` followed by exactly four hex digits is kept as-is;
 *   - any other backslash (including a malformed `\u` and a backslash at the
 *     very end of the input) is doubled, turning it into a literal backslash.
 *
 * A malformed `\u` escape is repaired by look-ahead rather than by consuming
 * the following characters, so a closing quote directly after a truncated
 * escape still terminates the string.
 *
 * @param {string} value Raw JSON-like text.
 * @returns {{ repairedText: string, repaired: boolean }} The rewritten text
 *   and whether at least one backslash was doubled.
 */
const repairInvalidJsonEscapes = (value) => {
    // Kept local so this scanner does not depend on any other helper.
    const simpleEscapes = new Set(["\"", "\\", "/", "b", "f", "n", "r", "t"]);
    const fourHexDigits = /^[0-9a-fA-F]{4}$/;
    let repaired = false;
    let inString = false;
    let output = "";
    let index = 0;
    while (index < value.length) {
        const char = value[index];
        if (!inString) {
            output += char;
            if (char === "\"") {
                inString = true;
            }
            index += 1;
            continue;
        }
        if (char !== "\\") {
            output += char;
            if (char === "\"") {
                inString = false;
            }
            index += 1;
            continue;
        }
        const next = value[index + 1];
        if (next !== undefined && simpleEscapes.has(next)) {
            // Legal two-character escape: copy both characters verbatim.
            output += char + next;
            index += 2;
            continue;
        }
        if (next === "u" && fourHexDigits.test(value.slice(index + 2, index + 6))) {
            // Legal \uXXXX escape: copy all six characters verbatim.
            output += value.slice(index, index + 6);
            index += 6;
            continue;
        }
        // Illegal or dangling backslash: emit an escaped backslash and let the
        // following character (if any) be scanned as ordinary string content.
        output += "\\\\";
        repaired = true;
        index += 1;
    }
    return { repairedText: output, repaired };
};
|
|
136
|
+
/**
 * Parses model output as JSON, falling back to escape repair when needed.
 *
 * The input is passed through `stripCodeFences` and trimmed. Empty text
 * yields `{ parsed: null, repaired: false }`. Otherwise up to two candidates
 * are tried in order: the whole text, then (if different) the slice from the
 * first `{` to the last `}`. Each candidate is parsed as-is first and, only
 * if `repairInvalidJsonEscapes` reports a change, parsed again in repaired
 * form.
 *
 * @param {string} value Raw model output.
 * @returns {{ parsed: unknown, repaired: boolean }} The parsed value and
 *   whether the successful parse used repaired text.
 * @throws {Error} The most recent parse error when no candidate parses.
 */
export const parseJsonObjectWithRepair = (value) => {
    const text = stripCodeFences(value).trim();
    if (text.length === 0) {
        return { parsed: null, repaired: false };
    }
    const asError = (thrown) => (thrown instanceof Error ? thrown : new Error(String(thrown)));
    const firstBrace = text.indexOf("{");
    const lastBrace = text.lastIndexOf("}");
    const hasBracePair = firstBrace >= 0 && lastBrace > firstBrace;
    const braceSlice = hasBracePair ? text.slice(firstBrace, lastBrace + 1) : text;
    // Skip the second attempt when the slice is the whole text anyway.
    const attempts = braceSlice === text ? [text] : [text, braceSlice];
    let failure = null;
    for (const attempt of attempts) {
        try {
            return { parsed: JSON.parse(attempt), repaired: false };
        }
        catch (thrown) {
            failure = asError(thrown);
        }
        const { repairedText, repaired } = repairInvalidJsonEscapes(attempt);
        if (repaired) {
            try {
                return { parsed: JSON.parse(repairedText), repaired: true };
            }
            catch (thrown) {
                failure = asError(thrown);
            }
        }
    }
    throw failure ?? new Error("model output was not valid JSON");
};
|
|
80
170
|
export const normalizeTableItems = (value) => {
|
|
81
171
|
if (!Array.isArray(value))
|
|
82
172
|
return [];
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@echofiles/echo-pdf",
|
|
3
3
|
"description": "Local-first PDF document component core with CLI, workspace artifacts, and reusable page primitives.",
|
|
4
|
-
"version": "0.11.
|
|
4
|
+
"version": "0.11.4",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"homepage": "https://pdf.echofile.ai/",
|
|
7
7
|
"repository": {
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
"cli",
|
|
19
19
|
"vision-language"
|
|
20
20
|
],
|
|
21
|
+
"packageManager": "bun@1.3.5",
|
|
21
22
|
"publishConfig": {
|
|
22
23
|
"access": "public"
|
|
23
24
|
},
|
|
@@ -53,14 +54,14 @@
|
|
|
53
54
|
"eval:stress": "node ./eval/run-local.mjs --suite stress",
|
|
54
55
|
"eval:known-bad": "node ./eval/run-local.mjs --suite known-bad",
|
|
55
56
|
"eval:fetch-public-samples": "node ./eval/fetch-public-samples.mjs",
|
|
56
|
-
"typecheck": "
|
|
57
|
-
"test:unit": "
|
|
58
|
-
"test:acceptance": "
|
|
59
|
-
"test:import-smoke": "
|
|
60
|
-
"test:integration": "
|
|
61
|
-
"test": "
|
|
57
|
+
"typecheck": "bun run check:runtime && tsc --noEmit",
|
|
58
|
+
"test:unit": "bun run check:runtime && vitest run tests/unit",
|
|
59
|
+
"test:acceptance": "bun run check:runtime && bun run build && vitest run tests/acceptance",
|
|
60
|
+
"test:import-smoke": "bun run check:runtime && bun run build && vitest run tests/integration/npm-pack-import.integration.test.ts tests/integration/ts-nodenext-consumer.integration.test.ts",
|
|
61
|
+
"test:integration": "bun run check:runtime && bun run build && vitest run tests/integration/local-document-cli.integration.test.ts tests/integration/local-document.integration.test.ts tests/integration/local-provider-stability.integration.test.ts tests/integration/local-semantic-structure.integration.test.ts tests/integration/npm-pack-import.integration.test.ts tests/integration/ts-nodenext-consumer.integration.test.ts",
|
|
62
|
+
"test": "bun run test:unit && bun run test:acceptance && bun run test:integration",
|
|
62
63
|
"smoke": "bash ./scripts/smoke.sh",
|
|
63
|
-
"prepublishOnly": "
|
|
64
|
+
"prepublishOnly": "bun run build && bun run typecheck && bun run test"
|
|
64
65
|
},
|
|
65
66
|
"engines": {
|
|
66
67
|
"node": ">=20.0.0"
|
package/scripts/check-runtime.sh
CHANGED
|
@@ -9,7 +9,7 @@ if [[ -z "${current_node_major}" ]] || (( current_node_major < required_node_maj
|
|
|
9
9
|
exit 1
|
|
10
10
|
fi
|
|
11
11
|
|
|
12
|
-
for cmd in
|
|
12
|
+
for cmd in bun curl grep sed; do
|
|
13
13
|
if ! command -v "${cmd}" >/dev/null 2>&1; then
|
|
14
14
|
echo "Missing required command: ${cmd}"
|
|
15
15
|
exit 1
|
|
@@ -23,4 +23,4 @@ if [[ "${CHECK_LLM_KEYS:-0}" == "1" ]]; then
|
|
|
23
23
|
fi
|
|
24
24
|
fi
|
|
25
25
|
|
|
26
|
-
echo "runtime check passed: node=$(node -v),
|
|
26
|
+
echo "runtime check passed: node=$(node -v), bun=$(bun -v)"
|
package/scripts/smoke.sh
CHANGED