aiex-cli 0.0.5-beta.5 → 0.0.5-beta.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/cli.mjs +638 -377
- package/dist/{doctor-collector-NTNBFeBw.mjs → doctor-collector-BpqhXNcO.mjs} +26 -91
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-sVI4PTNB.js +264 -0
- package/dist/web/assets/{DataBrowser-GAA-pGq0.js → DataBrowser-BGkZb9FV.js} +1 -1
- package/dist/web/assets/{ExtractionViewer-BhhWrBs2.js → ExtractionViewer-DNrkSECj.js} +1 -1
- package/dist/web/assets/{api-client-b4ZBXpNH.js → api-client-gQAAOw0v.js} +1 -1
- package/dist/web/assets/{index-CKV2X6sS.js → index-BQKZKzzP.js} +3 -3
- package/dist/web/assets/index-BU58oIRd.css +2 -0
- package/dist/web/index.html +3 -3
- package/dist/{zh-CN-Ca-Dv775.mjs → zh-CN-DkillGHx.mjs} +10 -23
- package/package.json +1 -1
- package/dist/web/assets/AISettings-BlyTFIIy.js +0 -272
- package/dist/web/assets/index-Csdgio76.css +0 -2
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { C as name, D as doctorDiagnosticsTableRows, O as formatDoctorDiagnosticsJson, S as description, T as version, _ as PLACEHOLDER_SCHEMA, a as parseJsonSchema, b as createConfig, c as recognizeImageText, d as getDefaultAIConfig, f as readAIConfig, g as DEFAULT_PROMPT_CONFIG, h as DEFAULT_MINERU_CONFIG, i as JsonSchemaDefinitionSchema, l as initI18n, m as DEFAULT_MINERU_API_CONFIG, n as createMigrationConfig, o as toSnakeCase, p as writeAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as t, v as PLACEHOLDER_TEXT, w as package_default, x as seedConfig, y as AIConfigSchema } from "./doctor-collector-BpqhXNcO.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -21,7 +21,6 @@ import { getEncoding } from "js-tiktoken";
|
|
|
21
21
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
22
22
|
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
23
23
|
import pRetry from "p-retry";
|
|
24
|
-
import mime from "mime";
|
|
25
24
|
import { jsonrepair } from "jsonrepair";
|
|
26
25
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
27
26
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
@@ -12861,28 +12860,6 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
|
|
|
12861
12860
|
});
|
|
12862
12861
|
}
|
|
12863
12862
|
|
|
12864
|
-
//#endregion
|
|
12865
|
-
//#region src/core/ai-extraction/file-utils.ts
|
|
12866
|
-
function detectMimeType(filePath) {
|
|
12867
|
-
return mime.getType(filePath) ?? "application/octet-stream";
|
|
12868
|
-
}
|
|
12869
|
-
async function readFilePart(filePath) {
|
|
12870
|
-
const mimeStr = detectMimeType(filePath);
|
|
12871
|
-
const buffer = await fs.readFile(filePath);
|
|
12872
|
-
const name$1 = path.basename(filePath);
|
|
12873
|
-
if (mimeStr.startsWith("image/")) return {
|
|
12874
|
-
type: "image",
|
|
12875
|
-
image: buffer,
|
|
12876
|
-
mimeType: mimeStr
|
|
12877
|
-
};
|
|
12878
|
-
return {
|
|
12879
|
-
type: "file",
|
|
12880
|
-
data: buffer,
|
|
12881
|
-
mediaType: mimeStr,
|
|
12882
|
-
filename: name$1
|
|
12883
|
-
};
|
|
12884
|
-
}
|
|
12885
|
-
|
|
12886
12863
|
//#endregion
|
|
12887
12864
|
//#region src/core/ai-extraction/json-utils.ts
|
|
12888
12865
|
function parseJsonLike(text$1) {
|
|
@@ -12943,25 +12920,10 @@ function filterCompatible(models, inputTokens, outputTokens) {
|
|
|
12943
12920
|
});
|
|
12944
12921
|
}
|
|
12945
12922
|
function selectModel(input) {
|
|
12946
|
-
const { models,
|
|
12923
|
+
const { models, inputTokens, outputTokens } = input;
|
|
12947
12924
|
if (models.length === 0) throw new Error(t("errors.ai.noModels"));
|
|
12948
12925
|
let candidates = filterCompatible(models, inputTokens, outputTokens);
|
|
12949
12926
|
if (candidates.length === 0) candidates = models;
|
|
12950
|
-
if (isImage) {
|
|
12951
|
-
const visionModel = candidates.find((m) => m.capabilities.vision);
|
|
12952
|
-
if (!visionModel) {
|
|
12953
|
-
const hint = fileName ? ` (${fileName})` : "";
|
|
12954
|
-
const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
|
|
12955
|
-
tokens: inputTokens,
|
|
12956
|
-
hint
|
|
12957
|
-
}) : t("errors.ai.noVisionModel", { hint });
|
|
12958
|
-
throw new Error(msg + t("errors.ai.addSuitableModel"));
|
|
12959
|
-
}
|
|
12960
|
-
return {
|
|
12961
|
-
name: visionModel.name,
|
|
12962
|
-
capabilities: visionModel.capabilities
|
|
12963
|
-
};
|
|
12964
|
-
}
|
|
12965
12927
|
const soModel = candidates.find((m) => m.capabilities.structuredOutput);
|
|
12966
12928
|
if (soModel) return {
|
|
12967
12929
|
name: soModel.name,
|
|
@@ -12975,36 +12937,46 @@ function selectModel(input) {
|
|
|
12975
12937
|
|
|
12976
12938
|
//#endregion
|
|
12977
12939
|
//#region src/core/ai-extraction/prompt-generator.ts
|
|
12978
|
-
|
|
12940
|
+
const CAMEL_CASE_BOUNDARY_RE = /([a-z0-9])([A-Z])/g;
|
|
12941
|
+
const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
|
|
12942
|
+
function splitIdentifier(name$1) {
|
|
12943
|
+
return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
|
|
12944
|
+
}
|
|
12945
|
+
function propertyToDescription(name$1, prop, indent = "", required = false) {
|
|
12979
12946
|
const lines = [];
|
|
12980
12947
|
let typeStr = prop.type;
|
|
12981
12948
|
if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
|
|
12982
|
-
lines.push(`${indent}- ${name$1}: ${typeStr}`);
|
|
12949
|
+
lines.push(`${indent}- ${name$1}: ${typeStr}${required ? " (required)" : ""}`);
|
|
12950
|
+
const terms = splitIdentifier(name$1);
|
|
12951
|
+
if (terms.length > 1) lines.push(`${indent} search terms: ${terms.join(", ")}`);
|
|
12952
|
+
if (prop.description) lines.push(`${indent} description: ${prop.description}`);
|
|
12983
12953
|
if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent} length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
|
|
12954
|
+
if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent} range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
|
|
12984
12955
|
if (prop.format) lines.push(`${indent} format: ${prop.format}`);
|
|
12985
12956
|
if (prop.unique) lines.push(`${indent} unique: true`);
|
|
12986
12957
|
if (prop.default !== void 0) lines.push(`${indent} default: ${JSON.stringify(prop.default)}`);
|
|
12987
12958
|
return lines.join("\n");
|
|
12988
12959
|
}
|
|
12989
|
-
function nestedPropertyToDescription(name$1, prop, indent = "") {
|
|
12960
|
+
function nestedPropertyToDescription(name$1, prop, indent = "", requiredFields = []) {
|
|
12990
12961
|
const lines = [];
|
|
12962
|
+
const isRequired = requiredFields.includes(name$1);
|
|
12991
12963
|
if (prop.nested?.enabled && prop.type === "object") {
|
|
12992
12964
|
const relation = prop.nested.relation || "has-one";
|
|
12993
|
-
lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
|
|
12994
|
-
if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12965
|
+
lines.push(`${indent}- ${name$1}: object (related table, ${relation})${isRequired ? " (required)" : ""}`);
|
|
12966
|
+
if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
|
|
12995
12967
|
return lines.join("\n");
|
|
12996
12968
|
}
|
|
12997
12969
|
if (prop.type === "array" && prop.items?.nested?.enabled) {
|
|
12998
12970
|
const relation = prop.items.nested.relation || "has-many";
|
|
12999
|
-
lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
|
|
13000
|
-
if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12971
|
+
lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})${isRequired ? " (required)" : ""}`);
|
|
12972
|
+
if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
|
|
13001
12973
|
return lines.join("\n");
|
|
13002
12974
|
}
|
|
13003
|
-
lines.push(propertyToDescription(name$1, prop, indent));
|
|
13004
|
-
if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12975
|
+
lines.push(propertyToDescription(name$1, prop, indent, isRequired));
|
|
12976
|
+
if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
|
|
13005
12977
|
if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
|
|
13006
12978
|
lines.push(`${indent} item fields:`);
|
|
13007
|
-
for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12979
|
+
for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
|
|
13008
12980
|
}
|
|
13009
12981
|
return lines.join("\n");
|
|
13010
12982
|
}
|
|
@@ -13016,7 +12988,7 @@ function schemaToDescription(schema) {
|
|
|
13016
12988
|
lines.push("Fields:");
|
|
13017
12989
|
for (const [name$1, prop] of Object.entries(schema.properties)) {
|
|
13018
12990
|
const property = prop;
|
|
13019
|
-
lines.push(nestedPropertyToDescription(name$1, property));
|
|
12991
|
+
lines.push(nestedPropertyToDescription(name$1, property, "", schema.required ?? []));
|
|
13020
12992
|
}
|
|
13021
12993
|
if (schema.examples && schema.examples.length > 0) {
|
|
13022
12994
|
lines.push("");
|
|
@@ -13061,33 +13033,6 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
|
|
|
13061
13033
|
].join("\n");
|
|
13062
13034
|
}
|
|
13063
13035
|
|
|
13064
|
-
//#endregion
|
|
13065
|
-
//#region src/core/ai-extraction/snapshot.ts
|
|
13066
|
-
const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
|
|
13067
|
-
const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
|
|
13068
|
-
async function loadPromptSnapshot(aiexDir, tableName) {
|
|
13069
|
-
const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
|
|
13070
|
-
try {
|
|
13071
|
-
const content = await fs.readFile(snapshotPath, "utf-8");
|
|
13072
|
-
const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
|
|
13073
|
-
const userMatch = content.match(USER_PROMPT_REGEX);
|
|
13074
|
-
if (systemMatch && userMatch) return {
|
|
13075
|
-
system: systemMatch[1].trim(),
|
|
13076
|
-
user: userMatch[1].trim()
|
|
13077
|
-
};
|
|
13078
|
-
} catch {}
|
|
13079
|
-
return null;
|
|
13080
|
-
}
|
|
13081
|
-
async function savePromptSnapshot(schema, aiexDir) {
|
|
13082
|
-
const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13083
|
-
const outputDir = path.join(aiexDir, "extracted");
|
|
13084
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
13085
|
-
const fileName = `${schema.table.name}.prompt.md`;
|
|
13086
|
-
const outputPath = path.join(outputDir, fileName);
|
|
13087
|
-
await fs.writeFile(outputPath, content);
|
|
13088
|
-
return outputPath;
|
|
13089
|
-
}
|
|
13090
|
-
|
|
13091
13036
|
//#endregion
|
|
13092
13037
|
//#region src/core/ai-extraction/telemetry.ts
|
|
13093
13038
|
let langfuseInitialized = false;
|
|
@@ -13130,7 +13075,7 @@ function propertyToExtractionSchema(property) {
|
|
|
13130
13075
|
}
|
|
13131
13076
|
return { type: nullableType(property.type) };
|
|
13132
13077
|
}
|
|
13133
|
-
function isRecord$
|
|
13078
|
+
function isRecord$2(value) {
|
|
13134
13079
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13135
13080
|
}
|
|
13136
13081
|
function schemaToExtractionOutputSchema(schema) {
|
|
@@ -13168,7 +13113,7 @@ function validatePropertyValue(path$1, property, value, issues) {
|
|
|
13168
13113
|
}
|
|
13169
13114
|
return;
|
|
13170
13115
|
case "object":
|
|
13171
|
-
if (!isRecord$
|
|
13116
|
+
if (!isRecord$2(value)) {
|
|
13172
13117
|
issues.push(`${path$1}: expected object or null`);
|
|
13173
13118
|
return;
|
|
13174
13119
|
}
|
|
@@ -13191,7 +13136,7 @@ function validateProperties(basePath, properties, data, issues) {
|
|
|
13191
13136
|
}
|
|
13192
13137
|
}
|
|
13193
13138
|
function validateExtractedData(schema, data) {
|
|
13194
|
-
if (!isRecord$
|
|
13139
|
+
if (!isRecord$2(data)) return {
|
|
13195
13140
|
success: false,
|
|
13196
13141
|
error: "Extracted data must be a JSON object."
|
|
13197
13142
|
};
|
|
@@ -13208,13 +13153,11 @@ function validateExtractedData(schema, data) {
|
|
|
13208
13153
|
//#region src/core/ai-extraction/extractor.ts
|
|
13209
13154
|
const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
|
|
13210
13155
|
async function extractStructuredData(input) {
|
|
13211
|
-
const { config, schema, text: text$1,
|
|
13156
|
+
const { config, schema, text: text$1, modelOverride } = input;
|
|
13212
13157
|
if (!config.provider.apiKey) return {
|
|
13213
13158
|
success: false,
|
|
13214
13159
|
error: t("errors.ai.apiKeyMissing")
|
|
13215
13160
|
};
|
|
13216
|
-
const useFileContent = !!file;
|
|
13217
|
-
const isImageFile = useFileContent && detectMimeType(file).startsWith("image/");
|
|
13218
13161
|
const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
|
|
13219
13162
|
const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
|
|
13220
13163
|
const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
|
|
@@ -13222,8 +13165,6 @@ async function extractStructuredData(input) {
|
|
|
13222
13165
|
try {
|
|
13223
13166
|
selected = modelOverride ?? selectModel({
|
|
13224
13167
|
models: config.provider.models,
|
|
13225
|
-
isImage: isImageFile,
|
|
13226
|
-
fileName: file,
|
|
13227
13168
|
inputTokens,
|
|
13228
13169
|
outputTokens
|
|
13229
13170
|
});
|
|
@@ -13243,18 +13184,7 @@ async function extractStructuredData(input) {
|
|
|
13243
13184
|
apiKey: config.provider.apiKey,
|
|
13244
13185
|
supportsStructuredOutputs: useStructuredOutput
|
|
13245
13186
|
});
|
|
13246
|
-
|
|
13247
|
-
let user;
|
|
13248
|
-
const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
|
|
13249
|
-
const promptText = file ? PLACEHOLDER_TEXT : text$1;
|
|
13250
|
-
if (snapshot) {
|
|
13251
|
-
system = snapshot.system;
|
|
13252
|
-
user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
|
|
13253
|
-
} else {
|
|
13254
|
-
const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13255
|
-
system = generated.system;
|
|
13256
|
-
user = generated.user;
|
|
13257
|
-
}
|
|
13187
|
+
const { system, user } = generateExtractionPrompt(schema, text$1, config.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13258
13188
|
const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
|
|
13259
13189
|
const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
|
|
13260
13190
|
let systemPrompt = system;
|
|
@@ -13269,38 +13199,16 @@ async function extractStructuredData(input) {
|
|
|
13269
13199
|
let parseError;
|
|
13270
13200
|
let validationError;
|
|
13271
13201
|
try {
|
|
13272
|
-
|
|
13273
|
-
|
|
13274
|
-
|
|
13275
|
-
|
|
13276
|
-
|
|
13277
|
-
|
|
13278
|
-
}
|
|
13279
|
-
|
|
13280
|
-
|
|
13281
|
-
|
|
13282
|
-
messages: [{
|
|
13283
|
-
role: "user",
|
|
13284
|
-
content: contentParts
|
|
13285
|
-
}],
|
|
13286
|
-
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13287
|
-
maxRetries: 0,
|
|
13288
|
-
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13289
|
-
};
|
|
13290
|
-
if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
|
|
13291
|
-
result = await withRetry(() => generateText(fileOpts), input.onRetry);
|
|
13292
|
-
} else {
|
|
13293
|
-
const textOpts = {
|
|
13294
|
-
model: provider.chatModel(selected.name),
|
|
13295
|
-
system: systemPrompt,
|
|
13296
|
-
prompt: userPrompt,
|
|
13297
|
-
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13298
|
-
maxRetries: 0,
|
|
13299
|
-
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13300
|
-
};
|
|
13301
|
-
if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
|
|
13302
|
-
result = await withRetry(() => generateText(textOpts), input.onRetry);
|
|
13303
|
-
}
|
|
13202
|
+
const textOpts = {
|
|
13203
|
+
model: provider.chatModel(selected.name),
|
|
13204
|
+
system: systemPrompt,
|
|
13205
|
+
prompt: userPrompt,
|
|
13206
|
+
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13207
|
+
maxRetries: 0,
|
|
13208
|
+
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13209
|
+
};
|
|
13210
|
+
if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
|
|
13211
|
+
result = await withRetry(() => generateText(textOpts), input.onRetry);
|
|
13304
13212
|
if (result.usage) {
|
|
13305
13213
|
totalPromptTokens += result.usage.inputTokens ?? 0;
|
|
13306
13214
|
totalCompletionTokens += result.usage.outputTokens ?? 0;
|
|
@@ -13316,27 +13224,16 @@ async function extractStructuredData(input) {
|
|
|
13316
13224
|
}
|
|
13317
13225
|
if (!parseError && data !== void 0) {
|
|
13318
13226
|
const validation = validateExtractedData(schema, data);
|
|
13319
|
-
if (validation.success) {
|
|
13320
|
-
|
|
13321
|
-
|
|
13322
|
-
|
|
13323
|
-
|
|
13324
|
-
|
|
13325
|
-
|
|
13326
|
-
|
|
13327
|
-
|
|
13328
|
-
|
|
13329
|
-
return {
|
|
13330
|
-
success: true,
|
|
13331
|
-
outputPath,
|
|
13332
|
-
data,
|
|
13333
|
-
tokensUsed: {
|
|
13334
|
-
prompt: totalPromptTokens,
|
|
13335
|
-
completion: totalCompletionTokens,
|
|
13336
|
-
total: totalPromptTokens + totalCompletionTokens
|
|
13337
|
-
}
|
|
13338
|
-
};
|
|
13339
|
-
} else validationError = validation.error;
|
|
13227
|
+
if (validation.success) return {
|
|
13228
|
+
success: true,
|
|
13229
|
+
data,
|
|
13230
|
+
tokensUsed: {
|
|
13231
|
+
prompt: totalPromptTokens,
|
|
13232
|
+
completion: totalCompletionTokens,
|
|
13233
|
+
total: totalPromptTokens + totalCompletionTokens
|
|
13234
|
+
}
|
|
13235
|
+
};
|
|
13236
|
+
else validationError = validation.error;
|
|
13340
13237
|
}
|
|
13341
13238
|
const errorMsg = parseError || validationError || "Unknown validation error";
|
|
13342
13239
|
lastError = errorMsg;
|
|
@@ -13347,11 +13244,14 @@ async function extractStructuredData(input) {
|
|
|
13347
13244
|
CRITICAL RULES:
|
|
13348
13245
|
1. Only correct the fields that failed validation.
|
|
13349
13246
|
2. Preserve all other correctly extracted fields and their values exactly.
|
|
13350
|
-
3.
|
|
13247
|
+
3. Use only values supported by the original text. If a value cannot be confirmed, set it to null.
|
|
13248
|
+
4. Remove any fields not defined by the JSON Schema.
|
|
13249
|
+
5. Normalize values to the expected JSON type without changing the intended meaning.
|
|
13250
|
+
6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
|
|
13351
13251
|
userPrompt = `The JSON data you generated previously failed validation. Please correct it.
|
|
13352
13252
|
|
|
13353
13253
|
[Original Text]
|
|
13354
|
-
${text$1 || "
|
|
13254
|
+
${text$1 || "Original text is empty."}
|
|
13355
13255
|
|
|
13356
13256
|
[JSON Schema Definition]
|
|
13357
13257
|
${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
|
|
@@ -13362,6 +13262,11 @@ ${invalidJson}
|
|
|
13362
13262
|
[Validation Error Details]
|
|
13363
13263
|
${errorMsg}
|
|
13364
13264
|
|
|
13265
|
+
Correction checklist:
|
|
13266
|
+
- Fix each field path mentioned in the validation error.
|
|
13267
|
+
- Keep schema-valid fields unchanged.
|
|
13268
|
+
- Do not invent missing facts; use null when the original text does not support a value.
|
|
13269
|
+
|
|
13365
13270
|
Please output the corrected JSON object now:`;
|
|
13366
13271
|
}
|
|
13367
13272
|
}
|
|
@@ -13516,33 +13421,60 @@ function insertExtractedData(db, schema, data) {
|
|
|
13516
13421
|
|
|
13517
13422
|
//#endregion
|
|
13518
13423
|
//#region src/core/ai-extraction/json-merger.ts
|
|
13519
|
-
function isRecord(value) {
|
|
13424
|
+
function isRecord$1(value) {
|
|
13520
13425
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13521
13426
|
}
|
|
13427
|
+
function stableKey(value) {
|
|
13428
|
+
if (!isRecord$1(value)) return JSON.stringify(value);
|
|
13429
|
+
return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
|
|
13430
|
+
acc[key] = value[key];
|
|
13431
|
+
return acc;
|
|
13432
|
+
}, {}));
|
|
13433
|
+
}
|
|
13434
|
+
function isBlankString(value) {
|
|
13435
|
+
return typeof value === "string" && value.trim() === "";
|
|
13436
|
+
}
|
|
13437
|
+
function isPlaceholderString$1(value) {
|
|
13438
|
+
if (typeof value !== "string") return false;
|
|
13439
|
+
const normalized = value.trim().toLowerCase();
|
|
13440
|
+
return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
|
|
13441
|
+
}
|
|
13442
|
+
function pickPrimitiveValue(values) {
|
|
13443
|
+
const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
|
|
13444
|
+
if (meaningful.length === 0) return null;
|
|
13445
|
+
if (typeof meaningful[0] === "boolean") {
|
|
13446
|
+
const trueCount = meaningful.filter(Boolean).length;
|
|
13447
|
+
return trueCount >= meaningful.length - trueCount;
|
|
13448
|
+
}
|
|
13449
|
+
return meaningful[0];
|
|
13450
|
+
}
|
|
13522
13451
|
function mergePropertyValue(property, values) {
|
|
13523
13452
|
const nonNullValues = values.filter((v) => v !== null && v !== void 0);
|
|
13524
13453
|
if (nonNullValues.length === 0) return null;
|
|
13525
13454
|
if (property.type === "array") {
|
|
13526
13455
|
const concatenated = [];
|
|
13527
|
-
|
|
13456
|
+
const seen = /* @__PURE__ */ new Set();
|
|
13457
|
+
for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
|
|
13458
|
+
const key = stableKey(item);
|
|
13459
|
+
if (!seen.has(key)) {
|
|
13460
|
+
seen.add(key);
|
|
13461
|
+
concatenated.push(item);
|
|
13462
|
+
}
|
|
13463
|
+
}
|
|
13528
13464
|
return concatenated;
|
|
13529
13465
|
}
|
|
13530
13466
|
if (property.type === "object") {
|
|
13531
13467
|
const childProperties = property.properties;
|
|
13532
13468
|
if (!childProperties) {
|
|
13533
13469
|
const mergedObj$1 = {};
|
|
13534
|
-
for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
|
|
13470
|
+
for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
|
|
13535
13471
|
return mergedObj$1;
|
|
13536
13472
|
}
|
|
13537
13473
|
const mergedObj = {};
|
|
13538
|
-
for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
|
|
13474
|
+
for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
|
|
13539
13475
|
return mergedObj;
|
|
13540
13476
|
}
|
|
13541
|
-
|
|
13542
|
-
if (typeof v === "string") return v.trim() !== "";
|
|
13543
|
-
return true;
|
|
13544
|
-
});
|
|
13545
|
-
return bestValue !== void 0 ? bestValue : null;
|
|
13477
|
+
return pickPrimitiveValue(nonNullValues);
|
|
13546
13478
|
}
|
|
13547
13479
|
/**
|
|
13548
13480
|
* Merges structured extraction outputs from multiple document chunks
|
|
@@ -13559,12 +13491,39 @@ function mergeExtractionResults(schema, results) {
|
|
|
13559
13491
|
return merged;
|
|
13560
13492
|
}
|
|
13561
13493
|
|
|
13494
|
+
//#endregion
|
|
13495
|
+
//#region src/core/ai-extraction/snapshot.ts
|
|
13496
|
+
async function savePromptSnapshot(schema, aiexDir) {
|
|
13497
|
+
const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13498
|
+
const outputDir = path.join(aiexDir, "extracted");
|
|
13499
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
13500
|
+
const fileName = `${schema.table.name}.prompt.md`;
|
|
13501
|
+
const outputPath = path.join(outputDir, fileName);
|
|
13502
|
+
await fs.writeFile(outputPath, content);
|
|
13503
|
+
return outputPath;
|
|
13504
|
+
}
|
|
13505
|
+
|
|
13562
13506
|
//#endregion
|
|
13563
13507
|
//#region src/core/ai-extraction/text-splitter.ts
|
|
13564
13508
|
const encoding$1 = getEncoding("cl100k_base");
|
|
13509
|
+
const MAX_OVERLAP_RATIO = .15;
|
|
13510
|
+
const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
|
|
13511
|
+
const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
|
|
13512
|
+
const LEADING_TABLE_PIPE_RE = /^\|/;
|
|
13513
|
+
const TRAILING_TABLE_PIPE_RE = /\|$/;
|
|
13565
13514
|
function countTokens(text$1) {
|
|
13566
13515
|
return encoding$1.encode(text$1).length;
|
|
13567
13516
|
}
|
|
13517
|
+
function calculateChunkTokenBudget(options = {}) {
|
|
13518
|
+
const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
|
|
13519
|
+
const modelMaxTokens = options.modelMaxTokens;
|
|
13520
|
+
if (!modelMaxTokens) return configuredMaxTokens;
|
|
13521
|
+
const outputReserveTokens = options.outputReserveTokens ?? 2e3;
|
|
13522
|
+
const promptReserveTokens = options.promptReserveTokens ?? 1200;
|
|
13523
|
+
const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
|
|
13524
|
+
const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
|
|
13525
|
+
return Math.max(512, Math.min(configuredMaxTokens, available));
|
|
13526
|
+
}
|
|
13568
13527
|
function formatHeadingContext(headings) {
|
|
13569
13528
|
const active = headings.filter(Boolean);
|
|
13570
13529
|
if (active.length === 0) return "";
|
|
@@ -13578,6 +13537,71 @@ function getMetadata(headings) {
|
|
|
13578
13537
|
h4: headings[3] || void 0
|
|
13579
13538
|
};
|
|
13580
13539
|
}
|
|
13540
|
+
function getHeadingPath(metadata) {
|
|
13541
|
+
return [
|
|
13542
|
+
metadata.h1,
|
|
13543
|
+
metadata.h2,
|
|
13544
|
+
metadata.h3,
|
|
13545
|
+
metadata.h4
|
|
13546
|
+
].filter(Boolean);
|
|
13547
|
+
}
|
|
13548
|
+
function finalizeChunks(chunks, sourceText) {
|
|
13549
|
+
let searchStart = 0;
|
|
13550
|
+
const totalChunks = chunks.length;
|
|
13551
|
+
return chunks.map((chunk, index) => {
|
|
13552
|
+
const tokenCount = countTokens(chunk.pageContent);
|
|
13553
|
+
let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
|
|
13554
|
+
if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
|
|
13555
|
+
const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
|
|
13556
|
+
if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
|
|
13557
|
+
return {
|
|
13558
|
+
...chunk,
|
|
13559
|
+
chunkIndex: index,
|
|
13560
|
+
totalChunks,
|
|
13561
|
+
tokenCount,
|
|
13562
|
+
headingPath: getHeadingPath(chunk.metadata),
|
|
13563
|
+
charStart: charStart >= 0 ? charStart : void 0,
|
|
13564
|
+
charEnd
|
|
13565
|
+
};
|
|
13566
|
+
});
|
|
13567
|
+
}
|
|
13568
|
+
function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
|
|
13569
|
+
return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
|
|
13570
|
+
}
|
|
13571
|
+
function splitMarkdownTable(tableText, maxTokens) {
|
|
13572
|
+
if (countTokens(tableText) <= maxTokens) return [tableText];
|
|
13573
|
+
const lines = tableText.split("\n");
|
|
13574
|
+
const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
|
|
13575
|
+
const separatorIndex = lines.findIndex((line, index) => {
|
|
13576
|
+
if (index <= headerIndex) return false;
|
|
13577
|
+
const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
|
|
13578
|
+
return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
|
|
13579
|
+
});
|
|
13580
|
+
if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
|
|
13581
|
+
const prefix = lines.slice(0, headerIndex);
|
|
13582
|
+
const header = lines[headerIndex];
|
|
13583
|
+
const separator = lines[separatorIndex];
|
|
13584
|
+
const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
|
|
13585
|
+
const chunks = [];
|
|
13586
|
+
let currentRows = [];
|
|
13587
|
+
const buildTable = (tableRows) => {
|
|
13588
|
+
return [
|
|
13589
|
+
...prefix,
|
|
13590
|
+
header,
|
|
13591
|
+
separator,
|
|
13592
|
+
...tableRows
|
|
13593
|
+
].join("\n");
|
|
13594
|
+
};
|
|
13595
|
+
for (const row of rows) {
|
|
13596
|
+
const candidateRows = [...currentRows, row];
|
|
13597
|
+
if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
|
|
13598
|
+
chunks.push(buildTable(currentRows));
|
|
13599
|
+
currentRows = [row];
|
|
13600
|
+
} else currentRows = candidateRows;
|
|
13601
|
+
}
|
|
13602
|
+
if (currentRows.length > 0) chunks.push(buildTable(currentRows));
|
|
13603
|
+
return chunks.length > 0 ? chunks : [tableText];
|
|
13604
|
+
}
|
|
13581
13605
|
/**
|
|
13582
13606
|
* Splits text recursively using a list of separators.
|
|
13583
13607
|
* Preserves the separators when re-joining.
|
|
@@ -13640,6 +13664,7 @@ function splitTextRecursively(text$1, maxTokens, separators = [
|
|
|
13640
13664
|
function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
|
|
13641
13665
|
const tokens = marked.lexer(text$1);
|
|
13642
13666
|
const chunks = [];
|
|
13667
|
+
const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
|
|
13643
13668
|
let currentHeadings = [];
|
|
13644
13669
|
let currentChunkList = [];
|
|
13645
13670
|
let accumulatedTokens = 0;
|
|
@@ -13651,7 +13676,7 @@ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
|
|
|
13651
13676
|
pageContent,
|
|
13652
13677
|
metadata: getMetadata(firstHeadings)
|
|
13653
13678
|
});
|
|
13654
|
-
if (isHeadingChange ||
|
|
13679
|
+
if (isHeadingChange || effectiveOverlapTokens <= 0) {
|
|
13655
13680
|
currentChunkList = [];
|
|
13656
13681
|
accumulatedTokens = 0;
|
|
13657
13682
|
} else {
|
|
@@ -13660,7 +13685,7 @@ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
|
|
|
13660
13685
|
for (let i = currentChunkList.length - 1; i >= 0; i--) {
|
|
13661
13686
|
const item = currentChunkList[i];
|
|
13662
13687
|
const itemTokens = countTokens(item.text);
|
|
13663
|
-
if (currentOverlapTokens + itemTokens >
|
|
13688
|
+
if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
|
|
13664
13689
|
overlapItems.unshift(item);
|
|
13665
13690
|
currentOverlapTokens += itemTokens;
|
|
13666
13691
|
}
|
|
@@ -13691,7 +13716,7 @@ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
|
|
|
13691
13716
|
}
|
|
13692
13717
|
}
|
|
13693
13718
|
flushCurrentChunk(true);
|
|
13694
|
-
return chunks;
|
|
13719
|
+
return finalizeChunks(chunks, text$1);
|
|
13695
13720
|
function processTextBlock(blockText, headings, isAtomic = false) {
|
|
13696
13721
|
const blockTokens = countTokens(blockText);
|
|
13697
13722
|
const contextTokens = countTokens(formatHeadingContext(headings));
|
|
@@ -13699,12 +13724,15 @@ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
|
|
|
13699
13724
|
const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
|
|
13700
13725
|
if (blockTokens > budgetLimit) if (isAtomic) {
|
|
13701
13726
|
flushCurrentChunk(false);
|
|
13702
|
-
|
|
13703
|
-
|
|
13704
|
-
|
|
13705
|
-
|
|
13706
|
-
|
|
13707
|
-
|
|
13727
|
+
const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
|
|
13728
|
+
for (const block of atomicBlocks) {
|
|
13729
|
+
currentChunkList.push({
|
|
13730
|
+
text: block,
|
|
13731
|
+
headings: [...headings]
|
|
13732
|
+
});
|
|
13733
|
+
accumulatedTokens = countTokens(block);
|
|
13734
|
+
flushCurrentChunk(false);
|
|
13735
|
+
}
|
|
13708
13736
|
} else {
|
|
13709
13737
|
flushCurrentChunk(false);
|
|
13710
13738
|
const subBlocks = splitTextRecursively(blockText, budgetLimit);
|
|
@@ -13878,6 +13906,276 @@ function getFileHash(filePath) {
|
|
|
13878
13906
|
});
|
|
13879
13907
|
}
|
|
13880
13908
|
|
|
13909
|
+
//#endregion
|
|
13910
|
+
//#region src/core/ai-extraction/evidence.ts
|
|
13911
|
+
// Matches a trailing ".json" file extension, case-insensitively.
const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
|
|
13912
|
+
// Matches the leading "$." root marker at the start of a field path.
const FIELD_PATH_PREFIX_RE = /^\$\./;
|
|
13913
|
+
/**
 * Type guard for record-like values.
 *
 * @param {unknown} value - Any value.
 * @returns {boolean} `true` when `value` is a non-null object that is not an array.
 */
function isRecord(value) {
	return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
|
13916
|
+
/**
 * Serializes a value into a key suitable for de-duplicating candidate values.
 *
 * Object keys are emitted in sorted order so that two objects with the same
 * entries produce the same key regardless of insertion order. Primitives and
 * arrays serialize exactly as plain `JSON.stringify` would.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string | undefined} JSON text, or `undefined` when `value` is not
 *   JSON-serializable (e.g. `undefined` itself).
 */
function stableValueKey(value) {
	return JSON.stringify(value, (_key, nested) => {
		if (typeof nested !== "object" || nested === null || Array.isArray(nested)) return nested;
		// Object.fromEntries defines own data properties, so a "__proto__" key is kept as data.
		return Object.fromEntries(Object.keys(nested).sort().map((key) => [key, nested[key]]));
	});
}
|
|
13919
|
+
/**
 * Detects strings that stand in for "no value".
 *
 * The comparison is done on the trimmed, lower-cased string.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` only for strings that are empty or one of the
 *   known placeholder tokens; `false` for every non-string.
 */
function isPlaceholderString(value) {
	if (typeof value !== "string") return false;
	switch (value.trim().toLowerCase()) {
		case "":
		case "n/a":
		case "na":
		case "none":
		case "null":
		case "unknown":
		case "tbd":
		case "-":
		case "--":
			return true;
		default:
			return false;
	}
}
|
|
13924
|
+
/**
 * Converts a primitive extraction value into searchable text.
 *
 * @param {unknown} value - Value to convert.
 * @returns {string | null} The trimmed string (or `null` when it is blank),
 *   the `String(...)` form of a number or boolean, and `null` for
 *   `null`, `undefined` and every other type.
 */
function primitiveToText(value) {
	if (value === null || value === void 0) return null;
	switch (typeof value) {
		case "string":
			// A blank string carries no text to search for.
			return value.trim() || null;
		case "number":
		case "boolean":
			return String(value);
		default:
			return null;
	}
}
|
|
13930
|
+
/**
 * Decides whether an extracted value is worth treating as a candidate.
 *
 * @param {unknown} value - Extracted value.
 * @returns {boolean} `true` when `primitiveToText` yields text for the value
 *   and `isPlaceholderString` does not classify it as a placeholder.
 */
function isMeaningfulValue(value) {
	if (primitiveToText(value) === null) return false;
	return !isPlaceholderString(value);
}
|
|
13933
|
+
/**
 * Normalizes text for loose comparison: lower-cases it, collapses every run
 * of whitespace into a single space and trims both ends.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeText(value) {
	const lowered = value.toLowerCase();
	return lowered.replace(/\s+/g, " ").trim();
}
|
|
13936
|
+
/**
 * Cuts a quote out of `text$1` around a match, with some surrounding context,
 * and collapses whitespace runs in the result.
 *
 * @param {string} text$1 - Source text.
 * @param {number} start - Offset of the match inside `text$1`.
 * @param {number} length - Length of the match.
 * @param {number} [contextChars=80] - Characters of context kept on each side.
 * @returns {string} The trimmed, whitespace-collapsed excerpt.
 */
function quoteAround(text$1, start, length, contextChars = 80) {
	// Clamp the window to the bounds of the text.
	const from = Math.max(0, start - contextChars);
	const to = Math.min(text$1.length, start + length + contextChars);
	return text$1.slice(from, to).replace(/\s+/g, " ").trim();
}
|
|
13941
|
+
/**
 * Looks for the first chunk whose text contains `value` (compared after
 * `normalizeText`) and returns where it was found.
 *
 * When the value only matches after whitespace normalization (for example it
 * spans a line break in the source), the quote is still taken from the real
 * match position instead of from the beginning of the chunk.
 *
 * @param {unknown} value - Extracted value; converted with `primitiveToText`.
 * @param {Array<{ text: string, chunkIndex: number, headingPath?: string[] }>} chunks
 *   Source chunks, searched in order.
 * @returns {{ chunkIndex: number, headingPath: string[] | undefined, quote: string } | null}
 *   Location and excerpt of the first match, or `null` when nothing matches.
 */
function findEvidence(value, chunks) {
	const searchText = primitiveToText(value);
	if (!searchText) return null;
	const normalizedSearchText = normalizeText(searchText);
	if (!normalizedSearchText) return null;
	const loweredSearchText = searchText.toLowerCase();
	// Built lazily: only needed when the verbatim lookup misses.
	let whitespaceTolerantPattern = null;
	for (const chunk of chunks) {
		if (!normalizeText(chunk.text).includes(normalizedSearchText)) continue;
		let quoteIndex = chunk.text.toLowerCase().indexOf(loweredSearchText);
		let quoteLength = searchText.length;
		if (quoteIndex < 0) {
			if (whitespaceTolerantPattern === null) {
				const tokens = normalizedSearchText.split(" ").map((token) => token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
				whitespaceTolerantPattern = new RegExp(tokens.join("\\s+"), "i");
			}
			const match = whitespaceTolerantPattern.exec(chunk.text);
			if (match) {
				quoteIndex = match.index;
				quoteLength = match[0].length;
			} else {
				// Last resort, same as before: quote from the start of the chunk.
				quoteIndex = 0;
			}
		}
		return {
			chunkIndex: chunk.chunkIndex,
			headingPath: chunk.headingPath,
			quote: quoteAround(chunk.text, quoteIndex, quoteLength)
		};
	}
	return null;
}
|
|
13958
|
+
/**
 * Walks one schema property and appends an evidence entry to `fields` for
 * every leaf value underneath it.
 *
 * - Objects with `properties` are descended into (`path.child`).
 * - Arrays produce one "missing" entry when empty or not an array; otherwise
 *   each item is visited (`path[i]` or `path[i].child` for object items).
 * - Everything else is handed to `addPrimitiveEvidence`.
 *
 * @param {object[]} fields - Output list, mutated in place.
 * @param {string} path$1 - Field path of this property (e.g. `$.name`).
 * @param {object} property - Schema definition of the property.
 * @param {unknown} value - Extracted value for the property.
 * @param {object[]} chunks - Source chunks passed through to the evidence search.
 * @returns {void}
 */
function addEvidenceForProperty(fields, path$1, property, value, chunks) {
	if (property.type === "object" && property.properties) {
		// A non-record value is treated as an empty record so every child is still reported.
		const record = isRecord(value) ? value : {};
		for (const [childName, childProperty] of Object.entries(property.properties)) {
			addEvidenceForProperty(fields, `${path$1}.${childName}`, childProperty, record[childName], chunks);
		}
		return;
	}
	if (property.type !== "array") {
		addPrimitiveEvidence(fields, path$1, value, chunks);
		return;
	}
	if (!Array.isArray(value) || value.length === 0) {
		fields.push({
			fieldPath: path$1,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Array field is empty or missing."
		});
		return;
	}
	const itemProperties = property.items?.type === "object" ? property.items.properties : void 0;
	value.forEach((item, index) => {
		const itemPath = `${path$1}[${index}]`;
		if (!itemProperties) {
			addPrimitiveEvidence(fields, itemPath, item, chunks);
			return;
		}
		const record = isRecord(item) ? item : {};
		for (const [childName, childProperty] of Object.entries(itemProperties)) {
			addEvidenceForProperty(fields, `${itemPath}.${childName}`, childProperty, record[childName], chunks);
		}
	});
}
|
|
13985
|
+
/**
 * Appends exactly one evidence entry for a leaf value.
 *
 * - `null`, `undefined` and blank strings (including whitespace-only ones)
 *   are recorded as "missing".
 * - Values located by `findEvidence` are recorded as "found" (confidence 0.8)
 *   together with the chunk index, heading path and quote.
 * - Anything else is recorded as "inferred" (confidence 0.35).
 *
 * @param {object[]} fields - Output list, mutated in place.
 * @param {string} fieldPath - Path of the field being reported.
 * @param {unknown} value - Final extracted value.
 * @param {object[]} chunks - Source chunks to search.
 * @returns {void}
 */
function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
	const isBlankString = typeof value === "string" && value.trim() === "";
	if (value === null || value === void 0 || isBlankString) {
		fields.push({
			fieldPath,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Field is null or empty in final extraction."
		});
		return;
	}
	const found = findEvidence(value, chunks);
	if (!found) {
		fields.push({
			fieldPath,
			status: "inferred",
			value,
			confidence: .35,
			note: "Final value was not found verbatim in the available source text."
		});
		return;
	}
	fields.push({
		fieldPath,
		status: "found",
		value,
		confidence: .8,
		...found
	});
}
|
|
14015
|
+
/**
 * Wraps a whole source text as a single source chunk.
 *
 * @param {string} text$1 - Full source text.
 * @returns {Array<{ text: string, chunkIndex: number, headingPath: string[] }>}
 *   A one-element list (chunk index 0, no headings), or an empty list when
 *   `text$1` is empty or otherwise falsy.
 */
function sourceChunksFromText(text$1) {
	if (!text$1) return [];
	return [{
		text: text$1,
		chunkIndex: 0,
		headingPath: []
	}];
}
|
|
14022
|
+
/**
 * Converts markdown splitter chunks into the source-chunk shape used by the
 * evidence search.
 *
 * @param {Array<{ pageContent: string, chunkIndex?: number, headingPath?: string[] }>} chunks
 *   Chunks to convert.
 * @returns {Array<{ text: string, chunkIndex: number, headingPath: string[] }>}
 *   One entry per input chunk; a missing `chunkIndex` falls back to the array
 *   position and a missing `headingPath` to an empty list.
 */
function sourceChunksFromMarkdownChunks(chunks) {
	return chunks.map(({ pageContent, chunkIndex, headingPath }, position) => ({
		text: pageContent,
		chunkIndex: chunkIndex ?? position,
		headingPath: headingPath ?? []
	}));
}
|
|
14029
|
+
/**
 * Splits a dotted field path into its segments.
 *
 * The leading `$.` root marker is removed first and empty segments are
 * dropped. Bracketed array indices are not interpreted; they stay part of
 * their segment.
 *
 * @param {string} fieldPath - Path such as `$.customer.name`.
 * @returns {string[]} The non-empty path segments.
 */
function getPathParts(fieldPath) {
	const withoutRoot = fieldPath.replace(FIELD_PATH_PREFIX_RE, "");
	return withoutRoot.split(".").filter(Boolean);
}
|
|
14032
|
+
/**
 * Reads the value stored at a dotted field path.
 *
 * Only own properties of record-like objects are followed, so inherited
 * members such as `constructor` are never returned as data.
 *
 * @param {unknown} data - Root object.
 * @param {string} fieldPath - Path such as `$.customer.name`.
 * @returns {unknown} The value at the path, or `undefined` when any step is
 *   missing or not a record.
 */
function getValueAtPath$1(data, fieldPath) {
	let current = data;
	for (const part of getPathParts(fieldPath)) {
		if (!isRecord(current) || !Object.hasOwn(current, part)) return void 0;
		current = current[part];
	}
	return current;
}
|
|
14040
|
+
/**
 * Writes `value` at a dotted field path, creating intermediate objects where
 * a step is missing or not a record.
 *
 * Paths that are empty or contain a `__proto__` segment are ignored, so the
 * call can neither create an "undefined" key nor reach `Object.prototype`.
 *
 * @param {object} data - Root object, mutated in place.
 * @param {string} fieldPath - Path such as `$.customer.name`.
 * @param {unknown} value - Value to store.
 * @returns {void}
 */
function setValueAtPath(data, fieldPath, value) {
	const parts = getPathParts(fieldPath);
	if (parts.length === 0 || parts.includes("__proto__")) return;
	const leafKey = parts[parts.length - 1];
	let current = data;
	for (const part of parts.slice(0, -1)) {
		// Only follow own record values; anything else is replaced by a fresh object.
		if (!Object.hasOwn(current, part) || !isRecord(current[part])) current[part] = {};
		current = current[part];
	}
	current[leafKey] = value;
}
|
|
14050
|
+
/**
 * Collects every non-array leaf property reachable from `property`.
 *
 * Objects that declare `properties` are descended into; arrays are skipped
 * entirely; every other property is recorded with its path.
 *
 * @param {Array<{ fieldPath: string, property: object }>} fields - Output
 *   list, mutated in place.
 * @param {string} fieldPath - Path of `property`.
 * @param {object} property - Schema definition to walk.
 * @returns {void}
 */
function collectScalarFields(fields, fieldPath, property) {
	if (property.type === "object" && property.properties) {
		for (const [childName, childProperty] of Object.entries(property.properties)) {
			collectScalarFields(fields, `${fieldPath}.${childName}`, childProperty);
		}
		return;
	}
	if (property.type === "array") return;
	fields.push({
		fieldPath,
		property
	});
}
|
|
14060
|
+
/**
 * Ranks a merge candidate; higher scores win.
 *
 * Ranking is lexicographic: a "found" status beats any other status, then a
 * higher confidence (rounded to tenths) wins, and the chunk index is only a
 * tiebreaker that prefers later chunks. The weights keep each criterion from
 * being outvoted by the next one, however many chunks a document has.
 *
 * @param {{ status: string, confidence: number, chunkIndex?: number }} candidate
 *   Candidate to score.
 * @returns {number} Sort key for descending ordering.
 */
function candidateScore(candidate) {
	const statusRank = candidate.status === "found" ? 1 : 0;
	const confidenceRank = Math.round(candidate.confidence * 10);
	// A missing or invalid chunk index must not turn the whole score into NaN.
	const chunkRank = Number.isFinite(candidate.chunkIndex) ? candidate.chunkIndex : 0;
	return statusRank * 1e12 + confidenceRank * 1e9 + chunkRank;
}
|
|
14063
|
+
/**
 * Picks the winning candidate for one field and reports a conflict when the
 * candidates disagree.
 *
 * The list is sorted in place by descending `candidateScore`; the first entry
 * gets `selected = true`, every other entry gets `selected = false` and a
 * rejection reason.
 *
 * @param {object[]} candidates - Candidates of a single field, mutated in place.
 * @returns {{ fieldPath: string, selectedValue: unknown, rejectedValues: unknown[], candidates: object[] } | null}
 *   A conflict record when more than one distinct value was proposed,
 *   otherwise `null`. `rejectedValues` lists each losing value once and never
 *   contains the selected value.
 */
function selectCandidatesForField(candidates) {
	if (candidates.length === 0) return null;
	candidates.sort((a, b) => candidateScore(b) - candidateScore(a));
	const [selected, ...rejected] = candidates;
	selected.selected = true;
	for (const candidate of rejected) {
		candidate.selected = false;
		candidate.rejectionReason = "Lower evidence score or earlier chunk position.";
	}
	const distinctValues = /* @__PURE__ */ new Map();
	for (const candidate of candidates) distinctValues.set(stableValueKey(candidate.value), candidate.value);
	if (distinctValues.size <= 1) return null;
	// Candidates that agree with the winner were not really rejected values.
	distinctValues.delete(stableValueKey(selected.value));
	return {
		fieldPath: selected.fieldPath,
		selectedValue: selected.value,
		rejectedValues: [...distinctValues.values()],
		candidates: [...candidates]
	};
}
|
|
14082
|
+
/**
 * Builds the per-field candidate list and the conflicts between chunks.
 *
 * For every scalar schema field (auto-increment primary keys are skipped) the
 * value proposed by each chunk result is collected, tagged as "found" or
 * "inferred" depending on whether `findEvidence` locates it in that chunk's
 * text, and then ranked by `selectCandidatesForField`.
 *
 * `input.chunkResults[i]` is matched with `input.chunks[i]` by array
 * position; a result without a chunk at the same position is searched
 * against empty text.
 *
 * @param {{ schema: { properties: object }, chunkResults: unknown[], chunks: object[] }} input
 *   Schema, per-chunk extraction results and the chunks they came from.
 * @returns {{ candidates: object[], conflicts: object[] }} All candidates
 *   (with their `selected` flag set) and one conflict per disputed field.
 */
function buildCandidateMergeReport(input) {
	const scalarFields = [];
	for (const [propertyName, property] of Object.entries(input.schema.properties)) {
		if (property.primary && property.autoIncrement) continue;
		collectScalarFields(scalarFields, `$.${propertyName}`, property);
	}
	const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
	const candidatesByPath = /* @__PURE__ */ new Map();
	for (const { fieldPath } of scalarFields) {
		for (let chunkIndex = 0; chunkIndex < input.chunkResults.length; chunkIndex++) {
			const value = getValueAtPath$1(input.chunkResults[chunkIndex], fieldPath);
			if (!isMeaningfulValue(value)) continue;
			const sourceChunk = sourceChunks[chunkIndex] ?? {
				text: "",
				chunkIndex
			};
			// Evidence is only looked up in the chunk that produced this value.
			const found = findEvidence(value, [sourceChunk]);
			const candidate = {
				fieldPath,
				value,
				chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
				headingPath: sourceChunk.headingPath,
				status: found ? "found" : "inferred",
				quote: found?.quote,
				confidence: found ? .85 : .35
			};
			const fieldCandidates = candidatesByPath.get(fieldPath);
			if (fieldCandidates) fieldCandidates.push(candidate);
			else candidatesByPath.set(fieldPath, [candidate]);
		}
	}
	const allCandidates = [];
	const conflicts = [];
	for (const fieldCandidates of candidatesByPath.values()) {
		// Sorts and flags the candidates in place before they are collected.
		const conflict = selectCandidatesForField(fieldCandidates);
		allCandidates.push(...fieldCandidates);
		if (conflict) conflicts.push(conflict);
	}
	return {
		candidates: allCandidates,
		conflicts
	};
}
|
|
14123
|
+
/**
 * Returns a deep copy of `data` in which every selected candidate of the
 * report has been written to its field path. `data` itself is not modified.
 *
 * @param {object} data - Merged extraction result.
 * @param {{ candidates: Array<{ selected?: boolean, fieldPath: string, value: unknown }> }} report
 *   Candidate report whose selected entries are applied.
 * @returns {object} The updated copy.
 */
function applySelectedCandidates(data, report) {
	const merged = structuredClone(data);
	for (const candidate of report.candidates) {
		if (!candidate.selected) continue;
		setValueAtPath(merged, candidate.fieldPath, candidate.value);
	}
	return merged;
}
|
|
14128
|
+
/**
 * Builds the evidence report for a finished extraction.
 *
 * Every schema property (auto-increment primary keys are skipped) is walked
 * with `addEvidenceForProperty`; the resulting field entries are counted by
 * status and combined with the conflicts of the optional candidate report.
 *
 * @param {object} input
 * @param {{ properties: object }} input.schema - Extraction schema.
 * @param {unknown} input.data - Final extracted data; non-records are treated as empty.
 * @param {object[]} [input.chunks] - Markdown chunks; takes precedence over `text`.
 * @param {string} [input.text] - Full source text, used when `chunks` is absent.
 * @param {string} [input.outputPath] - Extraction output path, used to derive `coverage.path`.
 * @param {{ candidates: object[], conflicts: object[] }} [input.candidateReport]
 *   Result of the candidate merge, if any.
 * @returns {{ coverage: object, fields: object[], candidates: object[] | undefined, conflicts: object[] | undefined, issues: object[] }}
 *   The report; `issues` holds one entry per inferred field and per conflict.
 */
function buildExtractionEvidence(input) {
	const data = isRecord(input.data) ? input.data : {};
	const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
	const fields = [];
	for (const [propertyName, property] of Object.entries(input.schema.properties)) {
		if (property.primary && property.autoIncrement) continue;
		addEvidenceForProperty(fields, `$.${propertyName}`, property, data[propertyName], chunks);
	}
	// Single pass over the fields instead of one filter per status.
	const statusCounts = {
		found: 0,
		missing: 0,
		inferred: 0
	};
	const inferredIssues = [];
	for (const field of fields) {
		if (Object.hasOwn(statusCounts, field.status)) statusCounts[field.status] += 1;
		if (field.status === "inferred") inferredIssues.push({
			fieldPath: field.fieldPath,
			message: field.note ?? "Field value lacks source evidence."
		});
	}
	const reportedConflicts = input.candidateReport?.conflicts ?? [];
	const conflictIssues = reportedConflicts.map((conflict) => ({
		fieldPath: conflict.fieldPath,
		message: "Multiple chunk candidates disagree for this field."
	}));
	const issues = [...inferredIssues, ...conflictIssues];
	return {
		coverage: {
			path: input.outputPath ? evidencePathForOutput(input.outputPath) : void 0,
			fieldCount: fields.length,
			// evidenceCount and foundCount are the same number under two names.
			evidenceCount: statusCounts.found,
			foundCount: statusCounts.found,
			missingCount: statusCounts.missing,
			inferredCount: statusCounts.inferred,
			conflictCount: reportedConflicts.length,
			issueCount: issues.length
		},
		fields,
		candidates: input.candidateReport?.candidates,
		conflicts: input.candidateReport?.conflicts,
		issues
	};
}
|
|
14162
|
+
/**
 * Derives the evidence file path that belongs to an extraction output file.
 *
 * A trailing `.json` (any letter case) is replaced by `.evidence.json`. When
 * the output path has no such suffix, `.evidence.json` is appended so the
 * result never equals the output path itself.
 *
 * @param {string} outputPath - Path of the extraction output file.
 * @returns {string} Path of the sibling evidence file.
 */
function evidencePathForOutput(outputPath) {
	if (JSON_FILE_SUFFIX_RE$1.test(outputPath)) return outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
	return `${outputPath}.evidence.json`;
}
|
|
14165
|
+
/**
 * Builds the evidence report for an extraction and writes it next to the
 * extraction output.
 *
 * @param {object} input - Same shape as for `buildExtractionEvidence`;
 *   `input.outputPath` is used to derive the evidence file path.
 * @returns {Promise<object>} The report's coverage summary, with `path`
 *   replaced by the resolved path of the written evidence file.
 */
async function writeExtractionEvidence(input) {
	const evidencePath = evidencePathForOutput(input.outputPath);
	const report = buildExtractionEvidence(input);
	// The file stores the path as derived; callers receive the resolved one.
	report.coverage.path = evidencePath;
	await writeFile(evidencePath, report, {
		spaces: 2,
		EOL: "\n"
	});
	const summary = { ...report.coverage };
	summary.path = path.resolve(evidencePath);
	return summary;
}
|
|
14178
|
+
|
|
13881
14179
|
//#endregion
|
|
13882
14180
|
//#region src/core/notion-sink.ts
|
|
13883
14181
|
const RICH_TEXT_LIMIT = 2e3;
|
|
@@ -14163,6 +14461,36 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
|
|
|
14163
14461
|
}
|
|
14164
14462
|
}
|
|
14165
14463
|
|
|
14464
|
+
//#endregion
|
|
14465
|
+
//#region src/core/ai-extraction/transcriber.ts
|
|
14466
|
+
// Instruction text sent together with the image when asking a vision model for a transcription.
const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
|
|
14467
|
+
/**
 * Transcribes the text visible in an image by sending it to a chat model
 * behind an OpenAI-compatible endpoint.
 *
 * @param {string} imagePath - Path of the image file to read.
 * @param {string} baseURL - Base URL of the OpenAI-compatible API.
 * @param {string} apiKey - API key for that endpoint.
 * @param {string} modelName - Chat model to use.
 * @param {number} [timeoutMs] - Request timeout in milliseconds. Missing,
 *   non-finite or non-positive values fall back to 300000 (5 minutes), since
 *   a zero timeout would abort the request immediately.
 * @returns {Promise<{ text: string, modelName: string }>} The transcription
 *   and the model that produced it.
 */
async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
	const provider = createOpenAICompatible({
		baseURL,
		name: "openai-compatible",
		apiKey
	});
	const buffer = await fs.readFile(imagePath);
	const effectiveTimeout = Number.isFinite(timeoutMs) && timeoutMs > 0 ? timeoutMs : 3e5;
	const response = await generateText({
		model: provider.chatModel(modelName),
		messages: [{
			role: "user",
			content: [{
				type: "text",
				text: TRANSCRIPTION_PROMPT
			}, {
				type: "image",
				image: buffer
			}]
		}],
		abortSignal: AbortSignal.timeout(effectiveTimeout)
	});
	return {
		text: response.text,
		modelName
	};
}
|
|
14493
|
+
|
|
14166
14494
|
//#endregion
|
|
14167
14495
|
//#region src/core/file-constants.ts
|
|
14168
14496
|
const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
|
|
@@ -14496,14 +14824,6 @@ function createPdfConverter(config) {
|
|
|
14496
14824
|
return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
|
|
14497
14825
|
}
|
|
14498
14826
|
if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
|
|
14499
|
-
if (config.converter === "markitdown") {
|
|
14500
|
-
const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
|
|
14501
|
-
return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
|
|
14502
|
-
}
|
|
14503
|
-
if (config.converter === "marker") {
|
|
14504
|
-
const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
|
|
14505
|
-
return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
|
|
14506
|
-
}
|
|
14507
14827
|
if (config.converter === "external") {
|
|
14508
14828
|
if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
|
|
14509
14829
|
return new ExternalCommandPdfConverter("external", config.external);
|
|
@@ -14531,7 +14851,7 @@ const FILE_PART_EXTENSIONS = new Set([
|
|
|
14531
14851
|
"svg"
|
|
14532
14852
|
]);
|
|
14533
14853
|
const PDF_EXT_RE = /\.pdf$/i;
|
|
14534
|
-
async function readExtractFileInput(filePath, aiConfig
|
|
14854
|
+
async function readExtractFileInput(filePath, aiConfig) {
|
|
14535
14855
|
const stat = fs$1.statSync(filePath);
|
|
14536
14856
|
if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
|
|
14537
14857
|
size: bytesToMB(stat.size).toFixed(1),
|
|
@@ -14540,15 +14860,22 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
|
14540
14860
|
}));
|
|
14541
14861
|
const ext = path.extname(filePath).toLowerCase().replace(".", "");
|
|
14542
14862
|
if (FILE_PART_EXTENSIONS.has(ext)) {
|
|
14543
|
-
|
|
14544
|
-
|
|
14545
|
-
|
|
14546
|
-
|
|
14863
|
+
const image = aiConfig?.image;
|
|
14864
|
+
if (image?.imageConversion === "vision" && image.imageModelName && aiConfig) {
|
|
14865
|
+
const baseURL = image.visionBaseURL || aiConfig.provider.baseURL;
|
|
14866
|
+
const apiKey = image.visionApiKey || aiConfig.provider.apiKey;
|
|
14867
|
+
const timeout = (aiConfig.provider.timeout ?? 300) * 1e3;
|
|
14868
|
+
try {
|
|
14869
|
+
const result$1 = await transcribeImageWithVision(filePath, baseURL, apiKey, image.imageModelName, timeout);
|
|
14870
|
+
consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
|
|
14871
|
+
return { text: result$1.text };
|
|
14872
|
+
} catch {
|
|
14873
|
+
consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
|
|
14874
|
+
}
|
|
14547
14875
|
}
|
|
14548
|
-
|
|
14549
|
-
|
|
14550
|
-
|
|
14551
|
-
};
|
|
14876
|
+
const result = await recognizeImageText(filePath, aiConfig?.image);
|
|
14877
|
+
consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
|
|
14878
|
+
return { text: result.text };
|
|
14552
14879
|
}
|
|
14553
14880
|
if (ext === "pdf") {
|
|
14554
14881
|
const buffer = await fs.readFile(filePath);
|
|
@@ -14684,29 +15011,6 @@ async function limitConcurrency(concurrency, items, fn) {
|
|
|
14684
15011
|
await Promise.all(workers);
|
|
14685
15012
|
return results;
|
|
14686
15013
|
}
|
|
14687
|
-
function getSchemaKeywords(schema) {
|
|
14688
|
-
const keywords = /* @__PURE__ */ new Set();
|
|
14689
|
-
function walk(properties) {
|
|
14690
|
-
if (!properties) return;
|
|
14691
|
-
for (const [name$1, prop] of Object.entries(properties)) {
|
|
14692
|
-
keywords.add(name$1.toLowerCase());
|
|
14693
|
-
const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
|
|
14694
|
-
for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
|
|
14695
|
-
if (prop && typeof prop === "object") {
|
|
14696
|
-
const p = prop;
|
|
14697
|
-
if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
|
|
14698
|
-
if (typeof p.description === "string") {
|
|
14699
|
-
const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
|
|
14700
|
-
for (const d of descParts) if (d.length > 2) keywords.add(d);
|
|
14701
|
-
}
|
|
14702
|
-
if (p.type === "object") walk(p.properties);
|
|
14703
|
-
if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
|
|
14704
|
-
}
|
|
14705
|
-
}
|
|
14706
|
-
}
|
|
14707
|
-
walk(schema.properties);
|
|
14708
|
-
return Array.from(keywords);
|
|
14709
|
-
}
|
|
14710
15014
|
async function ensureDatabaseReady(dbPath, schema) {
|
|
14711
15015
|
try {
|
|
14712
15016
|
await fs.access(dbPath);
|
|
@@ -14778,184 +15082,145 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14778
15082
|
}
|
|
14779
15083
|
const s = spinner();
|
|
14780
15084
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
14781
|
-
const maxTokens =
|
|
15085
|
+
const maxTokens = calculateChunkTokenBudget({
|
|
15086
|
+
configuredMaxTokens: aiConfig.extraction?.maxTokens ?? 8e3,
|
|
15087
|
+
modelMaxTokens: modelOverride?.capabilities.maxTokens
|
|
15088
|
+
});
|
|
14782
15089
|
const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
|
|
14783
|
-
let result;
|
|
14784
15090
|
const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
|
|
14785
|
-
if (text$1 && totalTokens > maxTokens) {
|
|
14786
|
-
|
|
14787
|
-
|
|
14788
|
-
|
|
14789
|
-
|
|
14790
|
-
|
|
14791
|
-
|
|
14792
|
-
|
|
14793
|
-
|
|
14794
|
-
|
|
14795
|
-
|
|
14796
|
-
|
|
14797
|
-
|
|
14798
|
-
|
|
14799
|
-
|
|
14800
|
-
|
|
14801
|
-
|
|
14802
|
-
|
|
14803
|
-
|
|
14804
|
-
|
|
14805
|
-
|
|
14806
|
-
|
|
14807
|
-
|
|
14808
|
-
|
|
14809
|
-
|
|
14810
|
-
|
|
14811
|
-
|
|
14812
|
-
|
|
14813
|
-
|
|
14814
|
-
|
|
14815
|
-
|
|
14816
|
-
|
|
14817
|
-
|
|
14818
|
-
|
|
14819
|
-
|
|
14820
|
-
|
|
14821
|
-
|
|
14822
|
-
|
|
14823
|
-
|
|
14824
|
-
|
|
14825
|
-
|
|
14826
|
-
|
|
14827
|
-
|
|
14828
|
-
|
|
14829
|
-
|
|
14830
|
-
|
|
14831
|
-
|
|
14832
|
-
|
|
14833
|
-
|
|
14834
|
-
let errorMsg = "";
|
|
14835
|
-
const extractionTasks = processedDocs.map((doc, i) => {
|
|
14836
|
-
return async () => {
|
|
14837
|
-
if (!success) return;
|
|
14838
|
-
const headings = [];
|
|
14839
|
-
if (doc.metadata) {
|
|
14840
|
-
if (doc.metadata.h1) headings.push(doc.metadata.h1);
|
|
14841
|
-
if (doc.metadata.h2) headings.push(doc.metadata.h2);
|
|
14842
|
-
if (doc.metadata.h3) headings.push(doc.metadata.h3);
|
|
14843
|
-
if (doc.metadata.h4) headings.push(doc.metadata.h4);
|
|
14844
|
-
}
|
|
14845
|
-
let chunkText = doc.pageContent;
|
|
14846
|
-
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
14847
|
-
const chunkResult = await extractStructuredData({
|
|
14848
|
-
config: aiConfig,
|
|
14849
|
-
schema: schemaLoad.schema,
|
|
14850
|
-
text: chunkText,
|
|
14851
|
-
aiexDir,
|
|
14852
|
-
modelOverride,
|
|
14853
|
-
onRetry(info) {
|
|
14854
|
-
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
14855
|
-
current: i + 1,
|
|
14856
|
-
total: processedDocs.length,
|
|
14857
|
-
code: info.statusCode,
|
|
14858
|
-
delay: info.delayMs / 1e3,
|
|
14859
|
-
attempt: info.attempt,
|
|
14860
|
-
max: info.maxRetries
|
|
14861
|
-
}));
|
|
14862
|
-
}
|
|
14863
|
-
});
|
|
14864
|
-
if (!chunkResult.success) {
|
|
14865
|
-
success = false;
|
|
14866
|
-
errorMsg = chunkResult.error || t("common.unknownError");
|
|
14867
|
-
if (!options?.quiet) {
|
|
14868
|
-
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
14869
|
-
consola.error(errorMsg);
|
|
14870
|
-
}
|
|
14871
|
-
return;
|
|
14872
|
-
}
|
|
14873
|
-
if (chunkResult.data) chunkResults.push(chunkResult.data);
|
|
14874
|
-
if (chunkResult.tokensUsed) {
|
|
14875
|
-
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
14876
|
-
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
14877
|
-
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
15091
|
+
if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
15092
|
+
length: totalTokens,
|
|
15093
|
+
limit: maxTokens
|
|
15094
|
+
}));
|
|
15095
|
+
const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
|
|
15096
|
+
pageContent: text$1 ?? "",
|
|
15097
|
+
metadata: {},
|
|
15098
|
+
chunkIndex: 0,
|
|
15099
|
+
totalChunks: 1,
|
|
15100
|
+
tokenCount: totalTokens,
|
|
15101
|
+
headingPath: [],
|
|
15102
|
+
charStart: 0,
|
|
15103
|
+
charEnd: text$1?.length ?? 0
|
|
15104
|
+
}];
|
|
15105
|
+
if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
|
|
15106
|
+
const chunkResults = Array.from({ length: processedDocs.length });
|
|
15107
|
+
const accumulatedTokens = {
|
|
15108
|
+
prompt: 0,
|
|
15109
|
+
completion: 0,
|
|
15110
|
+
total: 0
|
|
15111
|
+
};
|
|
15112
|
+
let success = true;
|
|
15113
|
+
let errorMsg = "";
|
|
15114
|
+
const extractionTasks = processedDocs.map((doc, i) => {
|
|
15115
|
+
return async () => {
|
|
15116
|
+
if (!success) return;
|
|
15117
|
+
const headings = doc.headingPath?.length ? doc.headingPath : [
|
|
15118
|
+
doc.metadata.h1,
|
|
15119
|
+
doc.metadata.h2,
|
|
15120
|
+
doc.metadata.h3,
|
|
15121
|
+
doc.metadata.h4
|
|
15122
|
+
].filter(Boolean);
|
|
15123
|
+
let chunkText = doc.pageContent;
|
|
15124
|
+
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
15125
|
+
const chunkResult = await extractStructuredData({
|
|
15126
|
+
config: aiConfig,
|
|
15127
|
+
schema: schemaLoad.schema,
|
|
15128
|
+
text: chunkText,
|
|
15129
|
+
aiexDir,
|
|
15130
|
+
modelOverride,
|
|
15131
|
+
onRetry(info) {
|
|
15132
|
+
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
15133
|
+
current: i + 1,
|
|
15134
|
+
total: processedDocs.length,
|
|
15135
|
+
code: info.statusCode,
|
|
15136
|
+
delay: info.delayMs / 1e3,
|
|
15137
|
+
attempt: info.attempt,
|
|
15138
|
+
max: info.maxRetries
|
|
15139
|
+
}));
|
|
14878
15140
|
}
|
|
14879
|
-
};
|
|
14880
|
-
});
|
|
14881
|
-
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
14882
|
-
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
14883
|
-
current: 1,
|
|
14884
|
-
total: processedDocs.length
|
|
14885
|
-
}));
|
|
14886
|
-
try {
|
|
14887
|
-
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
14888
|
-
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
14889
|
-
current: idx + 1,
|
|
14890
|
-
total: processedDocs.length
|
|
14891
|
-
}));
|
|
14892
|
-
await task();
|
|
14893
15141
|
});
|
|
14894
|
-
|
|
14895
|
-
|
|
14896
|
-
|
|
14897
|
-
|
|
14898
|
-
|
|
14899
|
-
|
|
14900
|
-
|
|
14901
|
-
|
|
14902
|
-
|
|
14903
|
-
|
|
14904
|
-
|
|
14905
|
-
|
|
14906
|
-
|
|
14907
|
-
|
|
14908
|
-
consola.error(valError);
|
|
15142
|
+
if (!chunkResult.success) {
|
|
15143
|
+
success = false;
|
|
15144
|
+
errorMsg = chunkResult.error || t("common.unknownError");
|
|
15145
|
+
if (!options?.quiet) {
|
|
15146
|
+
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
15147
|
+
consola.error(errorMsg);
|
|
15148
|
+
}
|
|
15149
|
+
return;
|
|
15150
|
+
}
|
|
15151
|
+
if (chunkResult.data) chunkResults[i] = chunkResult.data;
|
|
15152
|
+
if (chunkResult.tokensUsed) {
|
|
15153
|
+
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
15154
|
+
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
15155
|
+
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
14909
15156
|
}
|
|
14910
|
-
return {
|
|
14911
|
-
success: false,
|
|
14912
|
-
error: valError
|
|
14913
|
-
};
|
|
14914
|
-
}
|
|
14915
|
-
const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
14916
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
14917
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
14918
|
-
const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
|
|
14919
|
-
const finalMergedOutputPath = path.join(outputDir, outputFileName);
|
|
14920
|
-
await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
|
|
14921
|
-
result = {
|
|
14922
|
-
success: true,
|
|
14923
|
-
data: mergedData,
|
|
14924
|
-
tokensUsed: accumulatedTokens,
|
|
14925
|
-
outputPath: finalMergedOutputPath
|
|
14926
15157
|
};
|
|
14927
|
-
}
|
|
14928
|
-
|
|
14929
|
-
|
|
14930
|
-
|
|
14931
|
-
|
|
14932
|
-
|
|
14933
|
-
|
|
14934
|
-
|
|
14935
|
-
if (!options?.quiet) s.message(t("command.extract.file.
|
|
14936
|
-
|
|
14937
|
-
|
|
14938
|
-
attempt: info.attempt,
|
|
14939
|
-
max: info.maxRetries
|
|
15158
|
+
});
|
|
15159
|
+
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
15160
|
+
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
15161
|
+
current: 1,
|
|
15162
|
+
total: processedDocs.length
|
|
15163
|
+
}));
|
|
15164
|
+
try {
|
|
15165
|
+
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
15166
|
+
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
15167
|
+
current: idx + 1,
|
|
15168
|
+
total: processedDocs.length
|
|
14940
15169
|
}));
|
|
14941
|
-
|
|
15170
|
+
await task();
|
|
15171
|
+
});
|
|
15172
|
+
} catch (e) {
|
|
15173
|
+
success = false;
|
|
15174
|
+
errorMsg = e instanceof Error ? e.message : String(e);
|
|
15175
|
+
}
|
|
15176
|
+
if (!success) return {
|
|
15177
|
+
success: false,
|
|
15178
|
+
error: errorMsg
|
|
15179
|
+
};
|
|
15180
|
+
const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
|
|
15181
|
+
const candidateReport = buildCandidateMergeReport({
|
|
15182
|
+
schema: schemaLoad.schema,
|
|
15183
|
+
chunkResults: successfulChunkResults,
|
|
15184
|
+
chunks: processedDocs
|
|
14942
15185
|
});
|
|
14943
|
-
|
|
15186
|
+
const mergedData = applySelectedCandidates(mergeExtractionResults(schemaLoad.schema, successfulChunkResults), candidateReport);
|
|
15187
|
+
const validation = validateExtractedData(schemaLoad.schema, mergedData);
|
|
15188
|
+
if (!validation.success) {
|
|
15189
|
+
const valError = validation.error || "Merged data validation failed";
|
|
14944
15190
|
if (!options?.quiet) {
|
|
14945
|
-
s.stop(t("command.extract.file.
|
|
14946
|
-
consola.error(
|
|
15191
|
+
s.stop(t("command.extract.file.validationFail"));
|
|
15192
|
+
consola.error(valError);
|
|
14947
15193
|
}
|
|
14948
15194
|
return {
|
|
14949
15195
|
success: false,
|
|
14950
|
-
error:
|
|
15196
|
+
error: valError
|
|
14951
15197
|
};
|
|
14952
15198
|
}
|
|
15199
|
+
const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
15200
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
15201
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
15202
|
+
const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
|
|
15203
|
+
const outputPath = path.join(outputDir, outputFileName);
|
|
15204
|
+
await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
|
|
15205
|
+
const result = {
|
|
15206
|
+
success: true,
|
|
15207
|
+
data: mergedData,
|
|
15208
|
+
tokensUsed: accumulatedTokens,
|
|
15209
|
+
outputPath,
|
|
15210
|
+
evidenceSummary: await writeExtractionEvidence({
|
|
15211
|
+
schema: schemaLoad.schema,
|
|
15212
|
+
data: mergedData,
|
|
15213
|
+
outputPath,
|
|
15214
|
+
chunks: processedDocs,
|
|
15215
|
+
candidateReport
|
|
15216
|
+
})
|
|
15217
|
+
};
|
|
14953
15218
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
14954
15219
|
if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
|
|
14955
15220
|
if (result.evidenceSummary && !options?.quiet) {
|
|
14956
15221
|
const summary = result.evidenceSummary;
|
|
14957
15222
|
const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
|
|
14958
|
-
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
|
|
15223
|
+
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
|
|
14959
15224
|
}
|
|
14960
15225
|
if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
|
|
14961
15226
|
prompt: result.tokensUsed.prompt,
|
|
@@ -15077,13 +15342,9 @@ async function runAuditedExtraction(options) {
|
|
|
15077
15342
|
});
|
|
15078
15343
|
try {
|
|
15079
15344
|
let text$1 = "";
|
|
15080
|
-
|
|
15081
|
-
|
|
15082
|
-
|
|
15083
|
-
text$1 = input.text;
|
|
15084
|
-
filePath = input.filePath;
|
|
15085
|
-
} else text$1 = source.text;
|
|
15086
|
-
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
|
|
15345
|
+
if (source.type === "file") text$1 = (await readExtractFileInput(source.filePath, aiConfig)).text;
|
|
15346
|
+
else text$1 = source.text;
|
|
15347
|
+
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, source.type === "file" ? source.filePath : void 0, modelOverride, {
|
|
15087
15348
|
quiet,
|
|
15088
15349
|
insert
|
|
15089
15350
|
});
|