aiex-cli 0.0.5-beta.4 → 0.0.5-beta.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/cli.mjs +785 -460
- package/dist/{doctor-collector-Cv7RArla.mjs → doctor-collector-BpqhXNcO.mjs} +30 -92
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-sVI4PTNB.js +264 -0
- package/dist/web/assets/{DataBrowser-GAA-pGq0.js → DataBrowser-BGkZb9FV.js} +1 -1
- package/dist/web/assets/{ExtractionViewer-BhhWrBs2.js → ExtractionViewer-DNrkSECj.js} +1 -1
- package/dist/web/assets/{api-client-b4ZBXpNH.js → api-client-gQAAOw0v.js} +1 -1
- package/dist/web/assets/{index-CKV2X6sS.js → index-BQKZKzzP.js} +3 -3
- package/dist/web/assets/index-BU58oIRd.css +2 -0
- package/dist/web/index.html +3 -3
- package/dist/{zh-CN-CyL-61Ow.mjs → zh-CN-DkillGHx.mjs} +11 -24
- package/package.json +3 -1
- package/dist/web/assets/AISettings-BlyTFIIy.js +0 -272
- package/dist/web/assets/index-Csdgio76.css +0 -2
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { C as name, D as doctorDiagnosticsTableRows, O as formatDoctorDiagnosticsJson, S as description, T as version, _ as PLACEHOLDER_SCHEMA, a as parseJsonSchema, b as createConfig, c as recognizeImageText, d as getDefaultAIConfig, f as readAIConfig, g as DEFAULT_PROMPT_CONFIG, h as DEFAULT_MINERU_CONFIG, i as JsonSchemaDefinitionSchema, l as initI18n, m as DEFAULT_MINERU_API_CONFIG, n as createMigrationConfig, o as toSnakeCase, p as writeAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as t, v as PLACEHOLDER_TEXT, w as package_default, x as seedConfig, y as AIConfigSchema } from "./doctor-collector-BpqhXNcO.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -17,13 +17,14 @@ import Database from "better-sqlite3";
|
|
|
17
17
|
import pc from "picocolors";
|
|
18
18
|
import { Buffer } from "node:buffer";
|
|
19
19
|
import * as XLSX from "xlsx";
|
|
20
|
+
import { getEncoding } from "js-tiktoken";
|
|
20
21
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
21
22
|
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
22
23
|
import pRetry from "p-retry";
|
|
23
|
-
import mime from "mime";
|
|
24
24
|
import { jsonrepair } from "jsonrepair";
|
|
25
25
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
26
26
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
27
|
+
import { marked } from "marked";
|
|
27
28
|
import crypto from "node:crypto";
|
|
28
29
|
import { Client, extractNotionId } from "@notionhq/client";
|
|
29
30
|
import { execa } from "execa";
|
|
@@ -12859,28 +12860,6 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
|
|
|
12859
12860
|
});
|
|
12860
12861
|
}
|
|
12861
12862
|
|
|
12862
|
-
//#endregion
|
|
12863
|
-
//#region src/core/ai-extraction/file-utils.ts
|
|
12864
|
-
function detectMimeType(filePath) {
|
|
12865
|
-
return mime.getType(filePath) ?? "application/octet-stream";
|
|
12866
|
-
}
|
|
12867
|
-
async function readFilePart(filePath) {
|
|
12868
|
-
const mimeStr = detectMimeType(filePath);
|
|
12869
|
-
const buffer = await fs.readFile(filePath);
|
|
12870
|
-
const name$1 = path.basename(filePath);
|
|
12871
|
-
if (mimeStr.startsWith("image/")) return {
|
|
12872
|
-
type: "image",
|
|
12873
|
-
image: buffer,
|
|
12874
|
-
mimeType: mimeStr
|
|
12875
|
-
};
|
|
12876
|
-
return {
|
|
12877
|
-
type: "file",
|
|
12878
|
-
data: buffer,
|
|
12879
|
-
mediaType: mimeStr,
|
|
12880
|
-
filename: name$1
|
|
12881
|
-
};
|
|
12882
|
-
}
|
|
12883
|
-
|
|
12884
12863
|
//#endregion
|
|
12885
12864
|
//#region src/core/ai-extraction/json-utils.ts
|
|
12886
12865
|
function parseJsonLike(text$1) {
|
|
@@ -12941,25 +12920,10 @@ function filterCompatible(models, inputTokens, outputTokens) {
|
|
|
12941
12920
|
});
|
|
12942
12921
|
}
|
|
12943
12922
|
function selectModel(input) {
|
|
12944
|
-
const { models,
|
|
12923
|
+
const { models, inputTokens, outputTokens } = input;
|
|
12945
12924
|
if (models.length === 0) throw new Error(t("errors.ai.noModels"));
|
|
12946
12925
|
let candidates = filterCompatible(models, inputTokens, outputTokens);
|
|
12947
12926
|
if (candidates.length === 0) candidates = models;
|
|
12948
|
-
if (isImage) {
|
|
12949
|
-
const visionModel = candidates.find((m) => m.capabilities.vision);
|
|
12950
|
-
if (!visionModel) {
|
|
12951
|
-
const hint = fileName ? ` (${fileName})` : "";
|
|
12952
|
-
const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
|
|
12953
|
-
tokens: inputTokens,
|
|
12954
|
-
hint
|
|
12955
|
-
}) : t("errors.ai.noVisionModel", { hint });
|
|
12956
|
-
throw new Error(msg + t("errors.ai.addSuitableModel"));
|
|
12957
|
-
}
|
|
12958
|
-
return {
|
|
12959
|
-
name: visionModel.name,
|
|
12960
|
-
capabilities: visionModel.capabilities
|
|
12961
|
-
};
|
|
12962
|
-
}
|
|
12963
12927
|
const soModel = candidates.find((m) => m.capabilities.structuredOutput);
|
|
12964
12928
|
if (soModel) return {
|
|
12965
12929
|
name: soModel.name,
|
|
@@ -12973,36 +12937,46 @@ function selectModel(input) {
|
|
|
12973
12937
|
|
|
12974
12938
|
//#endregion
|
|
12975
12939
|
//#region src/core/ai-extraction/prompt-generator.ts
|
|
12976
|
-
|
|
12940
|
+
const CAMEL_CASE_BOUNDARY_RE = /([a-z0-9])([A-Z])/g;
|
|
12941
|
+
const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
|
|
12942
|
+
function splitIdentifier(name$1) {
|
|
12943
|
+
return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
|
|
12944
|
+
}
|
|
12945
|
+
function propertyToDescription(name$1, prop, indent = "", required = false) {
|
|
12977
12946
|
const lines = [];
|
|
12978
12947
|
let typeStr = prop.type;
|
|
12979
12948
|
if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
|
|
12980
|
-
lines.push(`${indent}- ${name$1}: ${typeStr}`);
|
|
12949
|
+
lines.push(`${indent}- ${name$1}: ${typeStr}${required ? " (required)" : ""}`);
|
|
12950
|
+
const terms = splitIdentifier(name$1);
|
|
12951
|
+
if (terms.length > 1) lines.push(`${indent} search terms: ${terms.join(", ")}`);
|
|
12952
|
+
if (prop.description) lines.push(`${indent} description: ${prop.description}`);
|
|
12981
12953
|
if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent} length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
|
|
12954
|
+
if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent} range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
|
|
12982
12955
|
if (prop.format) lines.push(`${indent} format: ${prop.format}`);
|
|
12983
12956
|
if (prop.unique) lines.push(`${indent} unique: true`);
|
|
12984
12957
|
if (prop.default !== void 0) lines.push(`${indent} default: ${JSON.stringify(prop.default)}`);
|
|
12985
12958
|
return lines.join("\n");
|
|
12986
12959
|
}
|
|
12987
|
-
function nestedPropertyToDescription(name$1, prop, indent = "") {
|
|
12960
|
+
function nestedPropertyToDescription(name$1, prop, indent = "", requiredFields = []) {
|
|
12988
12961
|
const lines = [];
|
|
12962
|
+
const isRequired = requiredFields.includes(name$1);
|
|
12989
12963
|
if (prop.nested?.enabled && prop.type === "object") {
|
|
12990
12964
|
const relation = prop.nested.relation || "has-one";
|
|
12991
|
-
lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
|
|
12992
|
-
if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12965
|
+
lines.push(`${indent}- ${name$1}: object (related table, ${relation})${isRequired ? " (required)" : ""}`);
|
|
12966
|
+
if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
|
|
12993
12967
|
return lines.join("\n");
|
|
12994
12968
|
}
|
|
12995
12969
|
if (prop.type === "array" && prop.items?.nested?.enabled) {
|
|
12996
12970
|
const relation = prop.items.nested.relation || "has-many";
|
|
12997
|
-
lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
|
|
12998
|
-
if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12971
|
+
lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})${isRequired ? " (required)" : ""}`);
|
|
12972
|
+
if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
|
|
12999
12973
|
return lines.join("\n");
|
|
13000
12974
|
}
|
|
13001
|
-
lines.push(propertyToDescription(name$1, prop, indent));
|
|
13002
|
-
if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12975
|
+
lines.push(propertyToDescription(name$1, prop, indent, isRequired));
|
|
12976
|
+
if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
|
|
13003
12977
|
if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
|
|
13004
12978
|
lines.push(`${indent} item fields:`);
|
|
13005
|
-
for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12979
|
+
for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
|
|
13006
12980
|
}
|
|
13007
12981
|
return lines.join("\n");
|
|
13008
12982
|
}
|
|
@@ -13014,7 +12988,7 @@ function schemaToDescription(schema) {
|
|
|
13014
12988
|
lines.push("Fields:");
|
|
13015
12989
|
for (const [name$1, prop] of Object.entries(schema.properties)) {
|
|
13016
12990
|
const property = prop;
|
|
13017
|
-
lines.push(nestedPropertyToDescription(name$1, property));
|
|
12991
|
+
lines.push(nestedPropertyToDescription(name$1, property, "", schema.required ?? []));
|
|
13018
12992
|
}
|
|
13019
12993
|
if (schema.examples && schema.examples.length > 0) {
|
|
13020
12994
|
lines.push("");
|
|
@@ -13059,33 +13033,6 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
|
|
|
13059
13033
|
].join("\n");
|
|
13060
13034
|
}
|
|
13061
13035
|
|
|
13062
|
-
//#endregion
|
|
13063
|
-
//#region src/core/ai-extraction/snapshot.ts
|
|
13064
|
-
const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
|
|
13065
|
-
const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
|
|
13066
|
-
async function loadPromptSnapshot(aiexDir, tableName) {
|
|
13067
|
-
const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
|
|
13068
|
-
try {
|
|
13069
|
-
const content = await fs.readFile(snapshotPath, "utf-8");
|
|
13070
|
-
const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
|
|
13071
|
-
const userMatch = content.match(USER_PROMPT_REGEX);
|
|
13072
|
-
if (systemMatch && userMatch) return {
|
|
13073
|
-
system: systemMatch[1].trim(),
|
|
13074
|
-
user: userMatch[1].trim()
|
|
13075
|
-
};
|
|
13076
|
-
} catch {}
|
|
13077
|
-
return null;
|
|
13078
|
-
}
|
|
13079
|
-
async function savePromptSnapshot(schema, aiexDir) {
|
|
13080
|
-
const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13081
|
-
const outputDir = path.join(aiexDir, "extracted");
|
|
13082
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
13083
|
-
const fileName = `${schema.table.name}.prompt.md`;
|
|
13084
|
-
const outputPath = path.join(outputDir, fileName);
|
|
13085
|
-
await fs.writeFile(outputPath, content);
|
|
13086
|
-
return outputPath;
|
|
13087
|
-
}
|
|
13088
|
-
|
|
13089
13036
|
//#endregion
|
|
13090
13037
|
//#region src/core/ai-extraction/telemetry.ts
|
|
13091
13038
|
let langfuseInitialized = false;
|
|
@@ -13128,7 +13075,7 @@ function propertyToExtractionSchema(property) {
|
|
|
13128
13075
|
}
|
|
13129
13076
|
return { type: nullableType(property.type) };
|
|
13130
13077
|
}
|
|
13131
|
-
function isRecord$
|
|
13078
|
+
function isRecord$2(value) {
|
|
13132
13079
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13133
13080
|
}
|
|
13134
13081
|
function schemaToExtractionOutputSchema(schema) {
|
|
@@ -13166,7 +13113,7 @@ function validatePropertyValue(path$1, property, value, issues) {
|
|
|
13166
13113
|
}
|
|
13167
13114
|
return;
|
|
13168
13115
|
case "object":
|
|
13169
|
-
if (!isRecord$
|
|
13116
|
+
if (!isRecord$2(value)) {
|
|
13170
13117
|
issues.push(`${path$1}: expected object or null`);
|
|
13171
13118
|
return;
|
|
13172
13119
|
}
|
|
@@ -13189,7 +13136,7 @@ function validateProperties(basePath, properties, data, issues) {
|
|
|
13189
13136
|
}
|
|
13190
13137
|
}
|
|
13191
13138
|
function validateExtractedData(schema, data) {
|
|
13192
|
-
if (!isRecord$
|
|
13139
|
+
if (!isRecord$2(data)) return {
|
|
13193
13140
|
success: false,
|
|
13194
13141
|
error: "Extracted data must be a JSON object."
|
|
13195
13142
|
};
|
|
@@ -13206,13 +13153,11 @@ function validateExtractedData(schema, data) {
|
|
|
13206
13153
|
//#region src/core/ai-extraction/extractor.ts
|
|
13207
13154
|
const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
|
|
13208
13155
|
async function extractStructuredData(input) {
|
|
13209
|
-
const { config, schema, text: text$1,
|
|
13156
|
+
const { config, schema, text: text$1, modelOverride } = input;
|
|
13210
13157
|
if (!config.provider.apiKey) return {
|
|
13211
13158
|
success: false,
|
|
13212
13159
|
error: t("errors.ai.apiKeyMissing")
|
|
13213
13160
|
};
|
|
13214
|
-
const useFileContent = !!file;
|
|
13215
|
-
const isImageFile = useFileContent && detectMimeType(file).startsWith("image/");
|
|
13216
13161
|
const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
|
|
13217
13162
|
const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
|
|
13218
13163
|
const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
|
|
@@ -13220,8 +13165,6 @@ async function extractStructuredData(input) {
|
|
|
13220
13165
|
try {
|
|
13221
13166
|
selected = modelOverride ?? selectModel({
|
|
13222
13167
|
models: config.provider.models,
|
|
13223
|
-
isImage: isImageFile,
|
|
13224
|
-
fileName: file,
|
|
13225
13168
|
inputTokens,
|
|
13226
13169
|
outputTokens
|
|
13227
13170
|
});
|
|
@@ -13241,18 +13184,7 @@ async function extractStructuredData(input) {
|
|
|
13241
13184
|
apiKey: config.provider.apiKey,
|
|
13242
13185
|
supportsStructuredOutputs: useStructuredOutput
|
|
13243
13186
|
});
|
|
13244
|
-
|
|
13245
|
-
let user;
|
|
13246
|
-
const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
|
|
13247
|
-
const promptText = file ? PLACEHOLDER_TEXT : text$1;
|
|
13248
|
-
if (snapshot) {
|
|
13249
|
-
system = snapshot.system;
|
|
13250
|
-
user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
|
|
13251
|
-
} else {
|
|
13252
|
-
const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13253
|
-
system = generated.system;
|
|
13254
|
-
user = generated.user;
|
|
13255
|
-
}
|
|
13187
|
+
const { system, user } = generateExtractionPrompt(schema, text$1, config.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13256
13188
|
const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
|
|
13257
13189
|
const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
|
|
13258
13190
|
let systemPrompt = system;
|
|
@@ -13267,38 +13199,16 @@ async function extractStructuredData(input) {
|
|
|
13267
13199
|
let parseError;
|
|
13268
13200
|
let validationError;
|
|
13269
13201
|
try {
|
|
13270
|
-
|
|
13271
|
-
|
|
13272
|
-
|
|
13273
|
-
|
|
13274
|
-
|
|
13275
|
-
|
|
13276
|
-
}
|
|
13277
|
-
|
|
13278
|
-
|
|
13279
|
-
|
|
13280
|
-
messages: [{
|
|
13281
|
-
role: "user",
|
|
13282
|
-
content: contentParts
|
|
13283
|
-
}],
|
|
13284
|
-
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13285
|
-
maxRetries: 0,
|
|
13286
|
-
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13287
|
-
};
|
|
13288
|
-
if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
|
|
13289
|
-
result = await withRetry(() => generateText(fileOpts), input.onRetry);
|
|
13290
|
-
} else {
|
|
13291
|
-
const textOpts = {
|
|
13292
|
-
model: provider.chatModel(selected.name),
|
|
13293
|
-
system: systemPrompt,
|
|
13294
|
-
prompt: userPrompt,
|
|
13295
|
-
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13296
|
-
maxRetries: 0,
|
|
13297
|
-
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13298
|
-
};
|
|
13299
|
-
if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
|
|
13300
|
-
result = await withRetry(() => generateText(textOpts), input.onRetry);
|
|
13301
|
-
}
|
|
13202
|
+
const textOpts = {
|
|
13203
|
+
model: provider.chatModel(selected.name),
|
|
13204
|
+
system: systemPrompt,
|
|
13205
|
+
prompt: userPrompt,
|
|
13206
|
+
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13207
|
+
maxRetries: 0,
|
|
13208
|
+
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13209
|
+
};
|
|
13210
|
+
if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
|
|
13211
|
+
result = await withRetry(() => generateText(textOpts), input.onRetry);
|
|
13302
13212
|
if (result.usage) {
|
|
13303
13213
|
totalPromptTokens += result.usage.inputTokens ?? 0;
|
|
13304
13214
|
totalCompletionTokens += result.usage.outputTokens ?? 0;
|
|
@@ -13314,27 +13224,16 @@ async function extractStructuredData(input) {
|
|
|
13314
13224
|
}
|
|
13315
13225
|
if (!parseError && data !== void 0) {
|
|
13316
13226
|
const validation = validateExtractedData(schema, data);
|
|
13317
|
-
if (validation.success) {
|
|
13318
|
-
|
|
13319
|
-
|
|
13320
|
-
|
|
13321
|
-
|
|
13322
|
-
|
|
13323
|
-
|
|
13324
|
-
|
|
13325
|
-
|
|
13326
|
-
|
|
13327
|
-
return {
|
|
13328
|
-
success: true,
|
|
13329
|
-
outputPath,
|
|
13330
|
-
data,
|
|
13331
|
-
tokensUsed: {
|
|
13332
|
-
prompt: totalPromptTokens,
|
|
13333
|
-
completion: totalCompletionTokens,
|
|
13334
|
-
total: totalPromptTokens + totalCompletionTokens
|
|
13335
|
-
}
|
|
13336
|
-
};
|
|
13337
|
-
} else validationError = validation.error;
|
|
13227
|
+
if (validation.success) return {
|
|
13228
|
+
success: true,
|
|
13229
|
+
data,
|
|
13230
|
+
tokensUsed: {
|
|
13231
|
+
prompt: totalPromptTokens,
|
|
13232
|
+
completion: totalCompletionTokens,
|
|
13233
|
+
total: totalPromptTokens + totalCompletionTokens
|
|
13234
|
+
}
|
|
13235
|
+
};
|
|
13236
|
+
else validationError = validation.error;
|
|
13338
13237
|
}
|
|
13339
13238
|
const errorMsg = parseError || validationError || "Unknown validation error";
|
|
13340
13239
|
lastError = errorMsg;
|
|
@@ -13345,11 +13244,14 @@ async function extractStructuredData(input) {
|
|
|
13345
13244
|
CRITICAL RULES:
|
|
13346
13245
|
1. Only correct the fields that failed validation.
|
|
13347
13246
|
2. Preserve all other correctly extracted fields and their values exactly.
|
|
13348
|
-
3.
|
|
13247
|
+
3. Use only values supported by the original text. If a value cannot be confirmed, set it to null.
|
|
13248
|
+
4. Remove any fields not defined by the JSON Schema.
|
|
13249
|
+
5. Normalize values to the expected JSON type without changing the intended meaning.
|
|
13250
|
+
6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
|
|
13349
13251
|
userPrompt = `The JSON data you generated previously failed validation. Please correct it.
|
|
13350
13252
|
|
|
13351
13253
|
[Original Text]
|
|
13352
|
-
${text$1 || "
|
|
13254
|
+
${text$1 || "Original text is empty."}
|
|
13353
13255
|
|
|
13354
13256
|
[JSON Schema Definition]
|
|
13355
13257
|
${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
|
|
@@ -13360,6 +13262,11 @@ ${invalidJson}
|
|
|
13360
13262
|
[Validation Error Details]
|
|
13361
13263
|
${errorMsg}
|
|
13362
13264
|
|
|
13265
|
+
Correction checklist:
|
|
13266
|
+
- Fix each field path mentioned in the validation error.
|
|
13267
|
+
- Keep schema-valid fields unchanged.
|
|
13268
|
+
- Do not invent missing facts; use null when the original text does not support a value.
|
|
13269
|
+
|
|
13363
13270
|
Please output the corrected JSON object now:`;
|
|
13364
13271
|
}
|
|
13365
13272
|
}
|
|
@@ -13514,33 +13421,60 @@ function insertExtractedData(db, schema, data) {
|
|
|
13514
13421
|
|
|
13515
13422
|
//#endregion
|
|
13516
13423
|
//#region src/core/ai-extraction/json-merger.ts
|
|
13517
|
-
function isRecord(value) {
|
|
13424
|
+
function isRecord$1(value) {
|
|
13518
13425
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13519
13426
|
}
|
|
13427
|
+
function stableKey(value) {
|
|
13428
|
+
if (!isRecord$1(value)) return JSON.stringify(value);
|
|
13429
|
+
return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
|
|
13430
|
+
acc[key] = value[key];
|
|
13431
|
+
return acc;
|
|
13432
|
+
}, {}));
|
|
13433
|
+
}
|
|
13434
|
+
function isBlankString(value) {
|
|
13435
|
+
return typeof value === "string" && value.trim() === "";
|
|
13436
|
+
}
|
|
13437
|
+
function isPlaceholderString$1(value) {
|
|
13438
|
+
if (typeof value !== "string") return false;
|
|
13439
|
+
const normalized = value.trim().toLowerCase();
|
|
13440
|
+
return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
|
|
13441
|
+
}
|
|
13442
|
+
function pickPrimitiveValue(values) {
|
|
13443
|
+
const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
|
|
13444
|
+
if (meaningful.length === 0) return null;
|
|
13445
|
+
if (typeof meaningful[0] === "boolean") {
|
|
13446
|
+
const trueCount = meaningful.filter(Boolean).length;
|
|
13447
|
+
return trueCount >= meaningful.length - trueCount;
|
|
13448
|
+
}
|
|
13449
|
+
return meaningful[0];
|
|
13450
|
+
}
|
|
13520
13451
|
function mergePropertyValue(property, values) {
|
|
13521
13452
|
const nonNullValues = values.filter((v) => v !== null && v !== void 0);
|
|
13522
13453
|
if (nonNullValues.length === 0) return null;
|
|
13523
13454
|
if (property.type === "array") {
|
|
13524
13455
|
const concatenated = [];
|
|
13525
|
-
|
|
13456
|
+
const seen = /* @__PURE__ */ new Set();
|
|
13457
|
+
for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
|
|
13458
|
+
const key = stableKey(item);
|
|
13459
|
+
if (!seen.has(key)) {
|
|
13460
|
+
seen.add(key);
|
|
13461
|
+
concatenated.push(item);
|
|
13462
|
+
}
|
|
13463
|
+
}
|
|
13526
13464
|
return concatenated;
|
|
13527
13465
|
}
|
|
13528
13466
|
if (property.type === "object") {
|
|
13529
13467
|
const childProperties = property.properties;
|
|
13530
13468
|
if (!childProperties) {
|
|
13531
13469
|
const mergedObj$1 = {};
|
|
13532
|
-
for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
|
|
13470
|
+
for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
|
|
13533
13471
|
return mergedObj$1;
|
|
13534
13472
|
}
|
|
13535
13473
|
const mergedObj = {};
|
|
13536
|
-
for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
|
|
13474
|
+
for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
|
|
13537
13475
|
return mergedObj;
|
|
13538
13476
|
}
|
|
13539
|
-
|
|
13540
|
-
if (typeof v === "string") return v.trim() !== "";
|
|
13541
|
-
return true;
|
|
13542
|
-
});
|
|
13543
|
-
return bestValue !== void 0 ? bestValue : null;
|
|
13477
|
+
return pickPrimitiveValue(nonNullValues);
|
|
13544
13478
|
}
|
|
13545
13479
|
/**
|
|
13546
13480
|
* Merges structured extraction outputs from multiple document chunks
|
|
@@ -13557,114 +13491,269 @@ function mergeExtractionResults(schema, results) {
|
|
|
13557
13491
|
return merged;
|
|
13558
13492
|
}
|
|
13559
13493
|
|
|
13494
|
+
//#endregion
|
|
13495
|
+
//#region src/core/ai-extraction/snapshot.ts
|
|
13496
|
+
async function savePromptSnapshot(schema, aiexDir) {
|
|
13497
|
+
const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13498
|
+
const outputDir = path.join(aiexDir, "extracted");
|
|
13499
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
13500
|
+
const fileName = `${schema.table.name}.prompt.md`;
|
|
13501
|
+
const outputPath = path.join(outputDir, fileName);
|
|
13502
|
+
await fs.writeFile(outputPath, content);
|
|
13503
|
+
return outputPath;
|
|
13504
|
+
}
|
|
13505
|
+
|
|
13560
13506
|
//#endregion
|
|
13561
13507
|
//#region src/core/ai-extraction/text-splitter.ts
|
|
13562
|
-
const
|
|
13563
|
-
|
|
13564
|
-
|
|
13565
|
-
|
|
13566
|
-
|
|
13567
|
-
|
|
13568
|
-
function
|
|
13569
|
-
|
|
13570
|
-
|
|
13571
|
-
|
|
13572
|
-
|
|
13573
|
-
|
|
13574
|
-
|
|
13575
|
-
const
|
|
13508
|
+
const encoding$1 = getEncoding("cl100k_base");
|
|
13509
|
+
const MAX_OVERLAP_RATIO = .15;
|
|
13510
|
+
const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
|
|
13511
|
+
const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
|
|
13512
|
+
const LEADING_TABLE_PIPE_RE = /^\|/;
|
|
13513
|
+
const TRAILING_TABLE_PIPE_RE = /\|$/;
|
|
13514
|
+
function countTokens(text$1) {
|
|
13515
|
+
return encoding$1.encode(text$1).length;
|
|
13516
|
+
}
|
|
13517
|
+
function calculateChunkTokenBudget(options = {}) {
|
|
13518
|
+
const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
|
|
13519
|
+
const modelMaxTokens = options.modelMaxTokens;
|
|
13520
|
+
if (!modelMaxTokens) return configuredMaxTokens;
|
|
13521
|
+
const outputReserveTokens = options.outputReserveTokens ?? 2e3;
|
|
13522
|
+
const promptReserveTokens = options.promptReserveTokens ?? 1200;
|
|
13523
|
+
const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
|
|
13524
|
+
const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
|
|
13525
|
+
return Math.max(512, Math.min(configuredMaxTokens, available));
|
|
13526
|
+
}
|
|
13527
|
+
function formatHeadingContext(headings) {
|
|
13528
|
+
const active = headings.filter(Boolean);
|
|
13529
|
+
if (active.length === 0) return "";
|
|
13530
|
+
return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
|
|
13531
|
+
}
|
|
13532
|
+
function getMetadata(headings) {
|
|
13533
|
+
return {
|
|
13534
|
+
h1: headings[0] || void 0,
|
|
13535
|
+
h2: headings[1] || void 0,
|
|
13536
|
+
h3: headings[2] || void 0,
|
|
13537
|
+
h4: headings[3] || void 0
|
|
13538
|
+
};
|
|
13539
|
+
}
|
|
13540
|
+
function getHeadingPath(metadata) {
|
|
13541
|
+
return [
|
|
13542
|
+
metadata.h1,
|
|
13543
|
+
metadata.h2,
|
|
13544
|
+
metadata.h3,
|
|
13545
|
+
metadata.h4
|
|
13546
|
+
].filter(Boolean);
|
|
13547
|
+
}
|
|
13548
|
+
function finalizeChunks(chunks, sourceText) {
|
|
13549
|
+
let searchStart = 0;
|
|
13550
|
+
const totalChunks = chunks.length;
|
|
13551
|
+
return chunks.map((chunk, index) => {
|
|
13552
|
+
const tokenCount = countTokens(chunk.pageContent);
|
|
13553
|
+
let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
|
|
13554
|
+
if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
|
|
13555
|
+
const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
|
|
13556
|
+
if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
|
|
13576
13557
|
return {
|
|
13577
|
-
|
|
13578
|
-
|
|
13579
|
-
|
|
13580
|
-
|
|
13558
|
+
...chunk,
|
|
13559
|
+
chunkIndex: index,
|
|
13560
|
+
totalChunks,
|
|
13561
|
+
tokenCount,
|
|
13562
|
+
headingPath: getHeadingPath(chunk.metadata),
|
|
13563
|
+
charStart: charStart >= 0 ? charStart : void 0,
|
|
13564
|
+
charEnd
|
|
13581
13565
|
};
|
|
13566
|
+
});
|
|
13567
|
+
}
|
|
13568
|
+
function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
|
|
13569
|
+
return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
|
|
13570
|
+
}
|
|
13571
|
+
function splitMarkdownTable(tableText, maxTokens) {
|
|
13572
|
+
if (countTokens(tableText) <= maxTokens) return [tableText];
|
|
13573
|
+
const lines = tableText.split("\n");
|
|
13574
|
+
const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
|
|
13575
|
+
const separatorIndex = lines.findIndex((line, index) => {
|
|
13576
|
+
if (index <= headerIndex) return false;
|
|
13577
|
+
const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
|
|
13578
|
+
return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
|
|
13579
|
+
});
|
|
13580
|
+
if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
|
|
13581
|
+
const prefix = lines.slice(0, headerIndex);
|
|
13582
|
+
const header = lines[headerIndex];
|
|
13583
|
+
const separator = lines[separatorIndex];
|
|
13584
|
+
const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
|
|
13585
|
+
const chunks = [];
|
|
13586
|
+
let currentRows = [];
|
|
13587
|
+
const buildTable = (tableRows) => {
|
|
13588
|
+
return [
|
|
13589
|
+
...prefix,
|
|
13590
|
+
header,
|
|
13591
|
+
separator,
|
|
13592
|
+
...tableRows
|
|
13593
|
+
].join("\n");
|
|
13582
13594
|
};
|
|
13583
|
-
const
|
|
13584
|
-
|
|
13585
|
-
|
|
13586
|
-
|
|
13587
|
-
|
|
13588
|
-
|
|
13589
|
-
|
|
13590
|
-
|
|
13591
|
-
|
|
13592
|
-
|
|
13593
|
-
|
|
13594
|
-
|
|
13595
|
-
|
|
13596
|
-
|
|
13597
|
-
|
|
13598
|
-
|
|
13599
|
-
|
|
13600
|
-
|
|
13601
|
-
|
|
13602
|
-
|
|
13603
|
-
|
|
13604
|
-
|
|
13605
|
-
|
|
13606
|
-
|
|
13607
|
-
|
|
13608
|
-
|
|
13609
|
-
|
|
13610
|
-
|
|
13611
|
-
|
|
13612
|
-
|
|
13613
|
-
|
|
13614
|
-
|
|
13615
|
-
|
|
13616
|
-
|
|
13595
|
+
for (const row of rows) {
|
|
13596
|
+
const candidateRows = [...currentRows, row];
|
|
13597
|
+
if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
|
|
13598
|
+
chunks.push(buildTable(currentRows));
|
|
13599
|
+
currentRows = [row];
|
|
13600
|
+
} else currentRows = candidateRows;
|
|
13601
|
+
}
|
|
13602
|
+
if (currentRows.length > 0) chunks.push(buildTable(currentRows));
|
|
13603
|
+
return chunks.length > 0 ? chunks : [tableText];
|
|
13604
|
+
}
|
|
13605
|
+
/**
|
|
13606
|
+
* Splits text recursively using a list of separators.
|
|
13607
|
+
* Preserves the separators when re-joining.
|
|
13608
|
+
*/
|
|
13609
|
+
function splitTextRecursively(text$1, maxTokens, separators = [
|
|
13610
|
+
"\n\n",
|
|
13611
|
+
"\n",
|
|
13612
|
+
"。",
|
|
13613
|
+
". ",
|
|
13614
|
+
" "
|
|
13615
|
+
]) {
|
|
13616
|
+
if (countTokens(text$1) <= maxTokens) return [text$1];
|
|
13617
|
+
if (separators.length === 0) {
|
|
13618
|
+
const chunks = [];
|
|
13619
|
+
let current = "";
|
|
13620
|
+
for (const char of text$1) if (countTokens(current + char) > maxTokens) {
|
|
13621
|
+
chunks.push(current);
|
|
13622
|
+
current = char;
|
|
13623
|
+
} else current += char;
|
|
13624
|
+
if (current) chunks.push(current);
|
|
13625
|
+
return chunks;
|
|
13626
|
+
}
|
|
13627
|
+
const separator = separators[0];
|
|
13628
|
+
const nextSeparators = separators.slice(1);
|
|
13629
|
+
const parts = text$1.split(separator);
|
|
13630
|
+
const result = [];
|
|
13631
|
+
let currentChunk = [];
|
|
13632
|
+
let currentChunkTokens = 0;
|
|
13633
|
+
for (let i = 0; i < parts.length; i++) {
|
|
13634
|
+
const part = parts[i];
|
|
13635
|
+
const itemText = part + (i < parts.length - 1 ? separator : "");
|
|
13636
|
+
const partTokens = countTokens(itemText);
|
|
13637
|
+
if (partTokens > maxTokens) {
|
|
13638
|
+
if (currentChunk.length > 0) {
|
|
13639
|
+
result.push(currentChunk.join(""));
|
|
13640
|
+
currentChunk = [];
|
|
13641
|
+
currentChunkTokens = 0;
|
|
13617
13642
|
}
|
|
13618
|
-
|
|
13619
|
-
|
|
13620
|
-
|
|
13621
|
-
|
|
13622
|
-
metadata: getMetadata(currentHeadings)
|
|
13623
|
-
});
|
|
13624
|
-
lastChunkContent = content;
|
|
13643
|
+
const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
|
|
13644
|
+
for (let j = 0; j < subParts.length; j++) {
|
|
13645
|
+
const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
|
|
13646
|
+
result.push(finalSub);
|
|
13625
13647
|
}
|
|
13648
|
+
} else if (currentChunkTokens + partTokens > maxTokens) {
|
|
13649
|
+
result.push(currentChunk.join(""));
|
|
13650
|
+
currentChunk = [itemText];
|
|
13651
|
+
currentChunkTokens = partTokens;
|
|
13626
13652
|
} else {
|
|
13627
|
-
|
|
13628
|
-
|
|
13629
|
-
metadata: getMetadata(currentHeadings)
|
|
13630
|
-
});
|
|
13631
|
-
lastChunkContent = pageContent;
|
|
13653
|
+
currentChunk.push(itemText);
|
|
13654
|
+
currentChunkTokens += partTokens;
|
|
13632
13655
|
}
|
|
13633
|
-
|
|
13634
|
-
|
|
13635
|
-
|
|
13636
|
-
|
|
13637
|
-
|
|
13638
|
-
|
|
13639
|
-
|
|
13640
|
-
|
|
13641
|
-
|
|
13642
|
-
|
|
13643
|
-
|
|
13644
|
-
|
|
13645
|
-
|
|
13656
|
+
}
|
|
13657
|
+
if (currentChunk.length > 0) result.push(currentChunk.join(""));
|
|
13658
|
+
return result;
|
|
13659
|
+
}
|
|
13660
|
+
/**
|
|
13661
|
+
* Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
|
|
13662
|
+
* Protects tables, list items, and code blocks from being broken.
|
|
13663
|
+
*/
|
|
13664
|
+
function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
|
|
13665
|
+
const tokens = marked.lexer(text$1);
|
|
13666
|
+
const chunks = [];
|
|
13667
|
+
const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
|
|
13668
|
+
let currentHeadings = [];
|
|
13669
|
+
let currentChunkList = [];
|
|
13670
|
+
let accumulatedTokens = 0;
|
|
13671
|
+
const flushCurrentChunk = (isHeadingChange = false) => {
|
|
13672
|
+
if (currentChunkList.length === 0) return;
|
|
13673
|
+
const pageContent = currentChunkList.map((item) => item.text).join("");
|
|
13674
|
+
const firstHeadings = currentChunkList[0].headings;
|
|
13675
|
+
chunks.push({
|
|
13676
|
+
pageContent,
|
|
13677
|
+
metadata: getMetadata(firstHeadings)
|
|
13678
|
+
});
|
|
13679
|
+
if (isHeadingChange || effectiveOverlapTokens <= 0) {
|
|
13680
|
+
currentChunkList = [];
|
|
13681
|
+
accumulatedTokens = 0;
|
|
13646
13682
|
} else {
|
|
13647
|
-
|
|
13648
|
-
|
|
13683
|
+
const overlapItems = [];
|
|
13684
|
+
let currentOverlapTokens = 0;
|
|
13685
|
+
for (let i = currentChunkList.length - 1; i >= 0; i--) {
|
|
13686
|
+
const item = currentChunkList[i];
|
|
13687
|
+
const itemTokens = countTokens(item.text);
|
|
13688
|
+
if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
|
|
13689
|
+
overlapItems.unshift(item);
|
|
13690
|
+
currentOverlapTokens += itemTokens;
|
|
13691
|
+
}
|
|
13692
|
+
currentChunkList = [...overlapItems];
|
|
13693
|
+
accumulatedTokens = currentOverlapTokens;
|
|
13649
13694
|
}
|
|
13650
|
-
hasNewLines = false;
|
|
13651
13695
|
};
|
|
13652
|
-
for (const
|
|
13653
|
-
|
|
13654
|
-
|
|
13655
|
-
|
|
13656
|
-
|
|
13657
|
-
|
|
13696
|
+
for (const token of tokens) {
|
|
13697
|
+
if (token.type === "space") {
|
|
13698
|
+
if (currentChunkList.length > 0) {
|
|
13699
|
+
currentChunkList[currentChunkList.length - 1].text += token.raw;
|
|
13700
|
+
accumulatedTokens += countTokens(token.raw);
|
|
13701
|
+
}
|
|
13702
|
+
continue;
|
|
13703
|
+
}
|
|
13704
|
+
if (token.type === "heading") {
|
|
13705
|
+
flushCurrentChunk(true);
|
|
13706
|
+
const depth = token.depth;
|
|
13707
|
+
const title = token.text.trim();
|
|
13658
13708
|
currentHeadings = currentHeadings.slice(0, depth - 1);
|
|
13659
13709
|
currentHeadings[depth - 1] = title;
|
|
13660
13710
|
}
|
|
13661
|
-
|
|
13662
|
-
|
|
13663
|
-
|
|
13664
|
-
|
|
13711
|
+
const rawText = token.raw;
|
|
13712
|
+
if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
|
|
13713
|
+
else {
|
|
13714
|
+
const isAtomic = token.type === "table" || token.type === "code";
|
|
13715
|
+
processTextBlock(rawText, currentHeadings, isAtomic);
|
|
13716
|
+
}
|
|
13717
|
+
}
|
|
13718
|
+
flushCurrentChunk(true);
|
|
13719
|
+
return finalizeChunks(chunks, text$1);
|
|
13720
|
+
function processTextBlock(blockText, headings, isAtomic = false) {
|
|
13721
|
+
const blockTokens = countTokens(blockText);
|
|
13722
|
+
const contextTokens = countTokens(formatHeadingContext(headings));
|
|
13723
|
+
const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
|
|
13724
|
+
const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
|
|
13725
|
+
if (blockTokens > budgetLimit) if (isAtomic) {
|
|
13726
|
+
flushCurrentChunk(false);
|
|
13727
|
+
const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
|
|
13728
|
+
for (const block of atomicBlocks) {
|
|
13729
|
+
currentChunkList.push({
|
|
13730
|
+
text: block,
|
|
13731
|
+
headings: [...headings]
|
|
13732
|
+
});
|
|
13733
|
+
accumulatedTokens = countTokens(block);
|
|
13734
|
+
flushCurrentChunk(false);
|
|
13735
|
+
}
|
|
13736
|
+
} else {
|
|
13737
|
+
flushCurrentChunk(false);
|
|
13738
|
+
const subBlocks = splitTextRecursively(blockText, budgetLimit);
|
|
13739
|
+
for (const sub of subBlocks) {
|
|
13740
|
+
currentChunkList.push({
|
|
13741
|
+
text: sub,
|
|
13742
|
+
headings: [...headings]
|
|
13743
|
+
});
|
|
13744
|
+
accumulatedTokens += countTokens(sub);
|
|
13745
|
+
if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
|
|
13746
|
+
}
|
|
13747
|
+
}
|
|
13748
|
+
else {
|
|
13749
|
+
if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
|
|
13750
|
+
currentChunkList.push({
|
|
13751
|
+
text: blockText,
|
|
13752
|
+
headings: [...headings]
|
|
13753
|
+
});
|
|
13754
|
+
accumulatedTokens += blockTokens;
|
|
13755
|
+
}
|
|
13665
13756
|
}
|
|
13666
|
-
flushChunk(true);
|
|
13667
|
-
return chunks;
|
|
13668
13757
|
}
|
|
13669
13758
|
|
|
13670
13759
|
//#endregion
|
|
@@ -13817,6 +13906,276 @@ function getFileHash(filePath) {
|
|
|
13817
13906
|
});
|
|
13818
13907
|
}
|
|
13819
13908
|
|
|
13909
|
+
//#endregion
|
|
13910
|
+
//#region src/core/ai-extraction/evidence.ts
|
|
13911
|
+
const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
|
|
13912
|
+
const FIELD_PATH_PREFIX_RE = /^\$\./;
|
|
13913
|
+
/**
 * Tells whether `value` is a keyed object container.
 *
 * @param {unknown} value - Any value.
 * @returns {boolean} `true` for non-null objects that are not arrays, `false` for everything else.
 */
function isRecord(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
|
|
13916
|
+
/**
 * Builds the comparison key for a candidate value.
 * The key is the value's JSON serialization, so `1` and `"1"` produce different keys.
 *
 * @param {unknown} value - Value to key.
 * @returns {string | undefined} Whatever `JSON.stringify` yields for `value`.
 */
function stableValueKey(value) {
	const serialized = JSON.stringify(value);
	return serialized;
}
|
|
13919
|
+
/**
 * Detects strings that stand in for "no value" (empty, "n/a", "unknown", dashes, ...).
 * Comparison ignores surrounding whitespace and letter case.
 *
 * @param {unknown} value - Value to inspect; non-strings are never placeholders.
 * @returns {boolean} `true` when `value` is one of the known placeholder strings.
 */
function isPlaceholderString(value) {
	if (typeof value !== "string") return false;
	switch (value.trim().toLowerCase()) {
		case "":
		case "n/a":
		case "na":
		case "none":
		case "null":
		case "unknown":
		case "tbd":
		case "-":
		case "--":
			return true;
		default:
			return false;
	}
}
|
|
13924
|
+
/**
 * Converts a primitive into the text used for searching source chunks.
 *
 * @param {unknown} value - Value to convert.
 * @returns {string | null} The trimmed string (or `null` when it trims to empty),
 *   `String(value)` for numbers and booleans, and `null` for every other type.
 */
function primitiveToText(value) {
	switch (typeof value) {
		case "string": {
			const trimmed = value.trim();
			return trimmed === "" ? null : trimmed;
		}
		case "number":
		case "boolean":
			return String(value);
		default:
			// Covers null, undefined, objects, arrays, functions, symbols and bigints.
			return null;
	}
}
|
|
13930
|
+
/**
 * Decides whether an extracted value counts as real content: it must convert
 * to text via `primitiveToText` and must not be a placeholder string.
 *
 * @param {unknown} value - Extracted value.
 * @returns {boolean} `true` when the value is usable as a candidate.
 */
function isMeaningfulValue(value) {
	if (primitiveToText(value) === null) return false;
	return !isPlaceholderString(value);
}
|
|
13933
|
+
/**
 * Canonicalizes text for comparison: lower-cases it, collapses every run of
 * whitespace into a single space, and trims both ends.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeText(value) {
	const lowered = value.toLowerCase();
	const collapsed = lowered.replace(/\s+/g, " ");
	return collapsed.trim();
}
|
|
13936
|
+
/**
 * Cuts an excerpt out of `text$1` around a match, keeping up to 80 characters
 * of context on each side, then collapses whitespace runs and trims.
 *
 * @param {string} text$1 - Full text the match was found in.
 * @param {number} start - Index where the match begins.
 * @param {number} length - Length of the match.
 * @returns {string} Single-line excerpt surrounding the match.
 */
function quoteAround(text$1, start, length) {
	const contextChars = 80;
	const sliceStart = Math.max(0, start - contextChars);
	const sliceEnd = Math.min(text$1.length, start + length + contextChars);
	const excerpt = text$1.slice(sliceStart, sliceEnd);
	return excerpt.replace(/\s+/g, " ").trim();
}
|
|
13941
|
+
/**
 * Looks for a primitive value inside the source chunks and returns where it was seen.
 *
 * Matching is done on normalized text (lower-cased, whitespace collapsed), so a
 * value can match even when the source breaks it across lines. The first chunk,
 * in array order, whose normalized text contains the normalized value wins.
 *
 * @param {unknown} value - Extracted value; anything `primitiveToText` maps to `null` yields `null`.
 * @param {Array<{ text: string, chunkIndex: number, headingPath?: string[] }>} chunks - Chunks to search.
 * @returns {{ chunkIndex: number, headingPath: string[] | undefined, quote: string } | null}
 *   Location plus a surrounding excerpt, or `null` when no chunk contains the value.
 */
function findEvidence(value, chunks) {
	const searchText = primitiveToText(value);
	if (!searchText) return null;
	const normalizedSearchText = normalizeText(searchText);
	if (!normalizedSearchText) return null;
	// Pattern that tolerates any whitespace run between the words of the value.
	// It is used to position the quote when the value only matches after
	// normalization (e.g. the source wraps it across lines).
	const escapedWords = normalizedSearchText.split(" ").map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
	const tolerantPattern = new RegExp(escapedWords.join("\\s+"), "i");
	for (const chunk of chunks) {
		if (!normalizeText(chunk.text).includes(normalizedSearchText)) continue;
		let quoteIndex = chunk.text.toLowerCase().indexOf(searchText.toLowerCase());
		let quoteLength = searchText.length;
		if (quoteIndex < 0) {
			// Previously this case quoted from offset 0, producing an excerpt that
			// might not contain the value at all.
			const match = tolerantPattern.exec(chunk.text);
			if (match) {
				quoteIndex = match.index;
				quoteLength = match[0].length;
			} else quoteIndex = 0; // last resort: keep the old fallback
		}
		return {
			chunkIndex: chunk.chunkIndex,
			headingPath: chunk.headingPath,
			quote: quoteAround(chunk.text, quoteIndex, quoteLength)
		};
	}
	return null;
}
|
|
13958
|
+
/**
 * Walks one schema property and appends an evidence entry for every leaf under it.
 *
 * - Object properties with declared `properties` recurse into each child.
 * - Array properties record a single "missing" entry when the value is not a
 *   non-empty array; otherwise each element is visited (recursing for object
 *   items with declared `properties`, treating everything else as a primitive).
 * - Any other property is handed to `addPrimitiveEvidence`.
 *
 * @param {object[]} fields - Output list, appended to in place.
 * @param {string} path$1 - Field path of this property (e.g. `$.a.b`).
 * @param {object} property - Schema definition of the property.
 * @param {unknown} value - Extracted value at this path.
 * @param {object[]} chunks - Source chunks to search for evidence.
 */
function addEvidenceForProperty(fields, path$1, property, value, chunks) {
	const { type } = property;
	if (type === "object" && property.properties) {
		// A non-object value is treated as an empty record so every child is still reported.
		const source = isRecord(value) ? value : {};
		Object.entries(property.properties).forEach(([childName, childProperty]) => {
			addEvidenceForProperty(fields, `${path$1}.${childName}`, childProperty, source[childName], chunks);
		});
		return;
	}
	if (type !== "array") {
		addPrimitiveEvidence(fields, path$1, value, chunks);
		return;
	}
	if (!Array.isArray(value) || value.length === 0) {
		fields.push({
			fieldPath: path$1,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Array field is empty or missing."
		});
		return;
	}
	value.forEach((item, index) => {
		const itemPath = `${path$1}[${index}]`;
		const hasObjectItems = property.items?.type === "object" && property.items.properties;
		if (!hasObjectItems) {
			addPrimitiveEvidence(fields, itemPath, item, chunks);
			return;
		}
		const itemRecord = isRecord(item) ? item : {};
		for (const [childName, childProperty] of Object.entries(property.items.properties)) {
			addEvidenceForProperty(fields, `${itemPath}.${childName}`, childProperty, itemRecord[childName], chunks);
		}
	});
}
|
|
13985
|
+
/**
 * Appends exactly one evidence entry for a leaf value:
 * - "missing" (confidence 0) when the value is `null`, `undefined` or `""`;
 * - "found" (confidence 0.8, plus the location returned by `findEvidence`) when
 *   the value appears in the source chunks;
 * - "inferred" (confidence 0.35) otherwise.
 *
 * @param {object[]} fields - Output list, appended to in place.
 * @param {string} fieldPath - Path of the leaf field.
 * @param {unknown} value - Extracted value.
 * @param {object[]} chunks - Source chunks to search.
 */
function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
	const isEmpty = value === null || value === void 0 || value === "";
	let entry;
	if (isEmpty) entry = {
		fieldPath,
		status: "missing",
		value: null,
		confidence: 0,
		note: "Field is null or empty in final extraction."
	};
	else {
		const found = findEvidence(value, chunks);
		entry = found ? {
			fieldPath,
			status: "found",
			value,
			confidence: .8,
			...found
		} : {
			fieldPath,
			status: "inferred",
			value,
			confidence: .35,
			note: "Final value was not found verbatim in the available source text."
		};
	}
	fields.push(entry);
}
|
|
14015
|
+
/**
 * Wraps plain text as a single source chunk (index 0, no heading path).
 *
 * @param {string} text$1 - Source text.
 * @returns {Array<{ text: string, chunkIndex: number, headingPath: string[] }>}
 *   A one-element list, or an empty list when `text$1` is falsy.
 */
function sourceChunksFromText(text$1) {
	if (!text$1) return [];
	const onlyChunk = {
		text: text$1,
		chunkIndex: 0,
		headingPath: []
	};
	return [onlyChunk];
}
|
|
14022
|
+
/**
 * Converts markdown chunks into the source-chunk shape used for evidence lookup.
 * A chunk without `chunkIndex` gets its array position; a chunk without
 * `headingPath` gets an empty path.
 *
 * @param {Array<{ pageContent: string, chunkIndex?: number, headingPath?: string[] }>} chunks
 * @returns {Array<{ text: string, chunkIndex: number, headingPath: string[] }>}
 */
function sourceChunksFromMarkdownChunks(chunks) {
	const toSourceChunk = (chunk, position) => {
		const { pageContent, chunkIndex, headingPath } = chunk;
		return {
			text: pageContent,
			chunkIndex: chunkIndex ?? position,
			headingPath: headingPath ?? []
		};
	};
	return chunks.map(toSourceChunk);
}
|
|
14029
|
+
/**
 * Splits a field path such as `$.a.b` into its segments (`["a", "b"]`).
 * The leading `$.` is removed and empty segments are dropped.
 *
 * @param {string} fieldPath - Dotted field path.
 * @returns {string[]} Non-empty path segments.
 */
function getPathParts(fieldPath) {
	const withoutPrefix = fieldPath.replace(FIELD_PATH_PREFIX_RE, "");
	return withoutPrefix.split(".").filter((segment) => segment !== "");
}
|
|
14032
|
+
/**
 * Reads the value stored at a dotted field path.
 *
 * @param {unknown} data - Root object to read from.
 * @param {string} fieldPath - Path such as `$.a.b`.
 * @returns {unknown} The value at the path, or `undefined` as soon as a
 *   non-record is encountered before the path is exhausted.
 */
function getValueAtPath$1(data, fieldPath) {
	// Once a step yields a non-record, every later step keeps returning undefined.
	return getPathParts(fieldPath).reduce((node, part) => isRecord(node) ? node[part] : void 0, data);
}
|
|
14040
|
+
/**
 * Writes `value` at a dotted field path inside `data`, creating intermediate
 * objects as needed. Any intermediate value that is not a record is replaced
 * by a fresh object.
 *
 * Two inputs are ignored instead of being written:
 * - a path with no segments (it used to write to a key literally named "undefined");
 * - a path containing a `__proto__` segment (walking into it would reach
 *   `Object.prototype` and modify every object in the process).
 *
 * @param {Record<string, unknown>} data - Object to mutate.
 * @param {string} fieldPath - Path such as `$.a.b`.
 * @param {unknown} value - Value to store.
 * @returns {void}
 */
function setValueAtPath(data, fieldPath, value) {
	const parts = getPathParts(fieldPath);
	if (parts.length === 0) return;
	if (parts.includes("__proto__")) return;
	let current = data;
	for (const part of parts.slice(0, -1)) {
		if (!isRecord(current[part])) current[part] = {};
		current = current[part];
	}
	current[parts[parts.length - 1]] = value;
}
|
|
14050
|
+
/**
 * Collects every non-array leaf property reachable from `property`.
 * Objects with declared `properties` are descended into; array properties are
 * skipped; everything else is recorded as `{ fieldPath, property }`.
 *
 * @param {Array<{ fieldPath: string, property: object }>} fields - Output list, appended to in place.
 * @param {string} fieldPath - Path of `property`.
 * @param {object} property - Schema definition to walk.
 */
function collectScalarFields(fields, fieldPath, property) {
	const children = property.type === "object" ? property.properties : void 0;
	if (!children) {
		if (property.type !== "array") fields.push({
			fieldPath,
			property
		});
		return;
	}
	Object.entries(children).forEach(([name$1, childProperty]) => {
		collectScalarFields(fields, `${fieldPath}.${name$1}`, childProperty);
	});
}
|
|
14060
|
+
/**
 * Ranks a candidate; a higher score wins.
 *
 * Criteria are applied in strict priority order:
 * 1. status: "found" beats anything else;
 * 2. confidence, rounded to one decimal;
 * 3. chunk position: later chunks win ties.
 *
 * The earlier formula added `chunkIndex` directly to a 0-110 range, so in a
 * long document a candidate without source evidence from chunk 106 or later
 * outranked a verified candidate from an early chunk. Each criterion now has a
 * weight larger than the full range of the criteria below it.
 *
 * @param {{ status: string, confidence: number, chunkIndex: number }} candidate
 * @returns {number} Sortable score.
 */
function candidateScore(candidate) {
	// Leaves room for chunk indexes below 1e9 and confidence ranks of 0-10.
	const confidenceWeight = 1e9;
	const statusWeight = 1e12;
	const statusRank = candidate.status === "found" ? 1 : 0;
	const confidenceRank = Math.round(candidate.confidence * 10);
	return statusRank * statusWeight + confidenceRank * confidenceWeight + candidate.chunkIndex;
}
|
|
14063
|
+
/**
 * Picks the winning candidate for one field and reports a conflict when the
 * candidates disagree.
 *
 * Side effects: `candidates` is sorted in place (best first), the winner gets
 * `selected = true`, and every other candidate gets `selected = false` plus a
 * `rejectionReason`.
 *
 * `rejectedValues` contains each distinct losing value once. It used to list
 * the value of every non-selected candidate, so it repeated duplicates and
 * could even contain the selected value when several chunks agreed with the
 * winner.
 *
 * @param {object[]} candidates - Candidates for a single field path.
 * @returns {{ fieldPath: string, selectedValue: unknown, rejectedValues: unknown[], candidates: object[] } | null}
 *   Conflict description, or `null` when there are no candidates or they all
 *   carry the same value.
 */
function selectCandidatesForField(candidates) {
	if (candidates.length === 0) return null;
	candidates.sort((a, b) => candidateScore(b) - candidateScore(a));
	const [selected, ...others] = candidates;
	selected.selected = true;
	for (const candidate of others) {
		candidate.selected = false;
		candidate.rejectionReason = "Lower evidence score or earlier chunk position.";
	}
	// Keyed by serialized value so equal values from different chunks collapse.
	const distinctValues = /* @__PURE__ */ new Map();
	for (const candidate of candidates) distinctValues.set(stableValueKey(candidate.value), candidate.value);
	if (distinctValues.size <= 1) return null;
	distinctValues.delete(stableValueKey(selected.value));
	return {
		fieldPath: selected.fieldPath,
		selectedValue: selected.value,
		rejectedValues: [...distinctValues.values()],
		candidates: [...candidates]
	};
}
|
|
14082
|
+
/**
 * Gathers, per scalar schema field, the value each chunk extraction produced,
 * checks each value against its own source chunk, and selects a winner per field.
 *
 * Properties flagged both `primary` and `autoIncrement` are skipped. Values
 * rejected by `isMeaningfulValue` produce no candidate. A candidate whose value
 * is located in its chunk is marked "found" (confidence 0.85), otherwise
 * "inferred" (confidence 0.35).
 *
 * @param {{ schema: { properties: object }, chunks: object[], chunkResults: unknown[] }} input
 * @returns {{ candidates: object[], conflicts: object[] }} All candidates (sorted
 *   per field by `selectCandidatesForField`) and the conflicts it reported.
 */
function buildCandidateMergeReport(input) {
	const scalarFields = [];
	Object.entries(input.schema.properties).forEach(([name$1, property]) => {
		const isGeneratedKey = property.primary && property.autoIncrement;
		if (!isGeneratedKey) collectScalarFields(scalarFields, `$.${name$1}`, property);
	});
	const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
	const candidatesByPath = /* @__PURE__ */ new Map();
	for (const { fieldPath } of scalarFields) {
		for (const [chunkIndex, chunkResult] of input.chunkResults.entries()) {
			const value = getValueAtPath$1(chunkResult, fieldPath);
			if (!isMeaningfulValue(value)) continue;
			// Results without a matching source chunk are checked against empty text.
			const sourceChunk = sourceChunks[chunkIndex] ?? {
				text: "",
				chunkIndex
			};
			const found = findEvidence(value, [sourceChunk]);
			const candidate = {
				fieldPath,
				value,
				chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
				headingPath: sourceChunk.headingPath,
				status: found ? "found" : "inferred",
				quote: found?.quote,
				confidence: found ? .85 : .35
			};
			let bucket = candidatesByPath.get(fieldPath);
			if (bucket === void 0) {
				bucket = [];
				candidatesByPath.set(fieldPath, bucket);
			}
			bucket.push(candidate);
		}
	}
	const allCandidates = [];
	const conflicts = [];
	candidatesByPath.forEach((candidates) => {
		// Selection sorts `candidates` in place, so it must run before they are copied out.
		const conflict = selectCandidatesForField(candidates);
		allCandidates.push(...candidates);
		if (conflict) conflicts.push(conflict);
	});
	return {
		candidates: allCandidates,
		conflicts
	};
}
|
|
14123
|
+
/**
 * Returns a deep copy of `data` in which every selected candidate's value has
 * been written at its field path. `data` itself is not modified.
 *
 * @param {object} data - Merged extraction result.
 * @param {{ candidates: Array<{ selected?: boolean, fieldPath: string, value: unknown }> }} report
 * @returns {object} The patched copy.
 */
function applySelectedCandidates(data, report) {
	const merged = structuredClone(data);
	report.candidates.filter((candidate) => candidate.selected).forEach((candidate) => {
		setValueAtPath(merged, candidate.fieldPath, candidate.value);
	});
	return merged;
}
|
|
14128
|
+
/**
 * Builds the evidence report for a finished extraction: one entry per schema
 * leaf, coverage counters, and a list of issues.
 *
 * Source text comes from `input.chunks` when present, otherwise from
 * `input.text`. Properties flagged both `primary` and `autoIncrement` are
 * skipped. Issues are the "inferred" fields followed by the conflicts carried
 * in `input.candidateReport`.
 *
 * @param {{ data: unknown, schema: { properties: object }, chunks?: object[], text?: string,
 *   outputPath?: string, candidateReport?: { candidates: object[], conflicts: object[] } }} input
 * @returns {{ coverage: object, fields: object[], candidates: object[] | undefined,
 *   conflicts: object[] | undefined, issues: object[] }}
 */
function buildExtractionEvidence(input) {
	const data = isRecord(input.data) ? input.data : {};
	const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
	const fields = [];
	Object.entries(input.schema.properties).forEach(([name$1, property]) => {
		const isGeneratedKey = property.primary && property.autoIncrement;
		if (!isGeneratedKey) addEvidenceForProperty(fields, `$.${name$1}`, property, data[name$1], chunks);
	});
	const countWithStatus = (status) => fields.filter((field) => field.status === status).length;
	const issues = [];
	for (const field of fields) {
		if (field.status !== "inferred") continue;
		issues.push({
			fieldPath: field.fieldPath,
			message: field.note ?? "Field value lacks source evidence."
		});
	}
	for (const conflict of input.candidateReport?.conflicts ?? []) issues.push({
		fieldPath: conflict.fieldPath,
		message: "Multiple chunk candidates disagree for this field."
	});
	// `evidenceCount` and `foundCount` report the same number.
	const foundCount = countWithStatus("found");
	const coverage = {
		path: input.outputPath ? evidencePathForOutput(input.outputPath) : void 0,
		fieldCount: fields.length,
		evidenceCount: foundCount,
		foundCount,
		missingCount: countWithStatus("missing"),
		inferredCount: countWithStatus("inferred"),
		conflictCount: input.candidateReport?.conflicts.length ?? 0,
		issueCount: issues.length
	};
	return {
		coverage,
		fields,
		candidates: input.candidateReport?.candidates,
		conflicts: input.candidateReport?.conflicts,
		issues
	};
}
|
|
14162
|
+
/**
 * Derives the evidence report path that sits next to an extraction output file.
 *
 * `result.json` becomes `result.evidence.json` (suffix match is case-insensitive).
 * A path without a `.json` suffix gets `.evidence.json` appended; it used to be
 * returned unchanged, which made the evidence report overwrite the extraction
 * output itself.
 *
 * @param {string} outputPath - Path of the extraction output file.
 * @returns {string} Path for the evidence report, always different from `outputPath`.
 */
function evidencePathForOutput(outputPath) {
	if (JSON_FILE_SUFFIX_RE$1.test(outputPath)) return outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
	return `${outputPath}.evidence.json`;
}
|
|
14165
|
+
/**
 * Builds the evidence report for `input` and writes it next to the extraction
 * output (path derived by `evidencePathForOutput`).
 *
 * @param {object} input - Same input as `buildExtractionEvidence`; `outputPath` is read here.
 * @returns {Promise<object>} The report's coverage summary, with `path` replaced
 *   by the resolved absolute path of the written file.
 */
async function writeExtractionEvidence(input) {
	const report = buildExtractionEvidence(input);
	const evidencePath = evidencePathForOutput(input.outputPath);
	// The file itself stores the path as derived; only the returned summary is made absolute.
	report.coverage.path = evidencePath;
	const writeOptions = {
		spaces: 2,
		EOL: "\n"
	};
	await writeFile(evidencePath, report, writeOptions);
	const absolutePath = path.resolve(evidencePath);
	return {
		...report.coverage,
		path: absolutePath
	};
}
|
|
14178
|
+
|
|
13820
14179
|
//#endregion
|
|
13821
14180
|
//#region src/core/notion-sink.ts
|
|
13822
14181
|
const RICH_TEXT_LIMIT = 2e3;
|
|
@@ -14102,6 +14461,36 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
|
|
|
14102
14461
|
}
|
|
14103
14462
|
}
|
|
14104
14463
|
|
|
14464
|
+
//#endregion
|
|
14465
|
+
//#region src/core/ai-extraction/transcriber.ts
|
|
14466
|
+
const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
|
|
14467
|
+
/**
 * Sends an image file to a chat model through an OpenAI-compatible endpoint
 * and returns the text the model produced for `TRANSCRIPTION_PROMPT`.
 *
 * @param {string} imagePath - Path of the image file; read fully into memory.
 * @param {string} baseURL - Base URL of the endpoint.
 * @param {string} apiKey - API key for the endpoint.
 * @param {string} modelName - Model to call.
 * @param {number} [timeoutMs] - Abort timeout in milliseconds; 300000 when nullish.
 * @returns {Promise<{ text: string, modelName: string }>} Model output and the model used.
 */
async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
	const provider = createOpenAICompatible({
		baseURL,
		name: "openai-compatible",
		apiKey
	});
	const buffer = await fs.readFile(imagePath);
	const effectiveTimeout = timeoutMs ?? 3e5;
	// One user turn carrying the instruction followed by the raw image bytes.
	const userMessage = {
		role: "user",
		content: [{
			type: "text",
			text: TRANSCRIPTION_PROMPT
		}, {
			type: "image",
			image: buffer
		}]
	};
	const response = await generateText({
		model: provider.chatModel(modelName),
		messages: [userMessage],
		abortSignal: AbortSignal.timeout(effectiveTimeout)
	});
	return {
		text: response.text,
		modelName
	};
}
|
|
14493
|
+
|
|
14105
14494
|
//#endregion
|
|
14106
14495
|
//#region src/core/file-constants.ts
|
|
14107
14496
|
const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
|
|
@@ -14435,14 +14824,6 @@ function createPdfConverter(config) {
|
|
|
14435
14824
|
return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
|
|
14436
14825
|
}
|
|
14437
14826
|
if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
|
|
14438
|
-
if (config.converter === "markitdown") {
|
|
14439
|
-
const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
|
|
14440
|
-
return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
|
|
14441
|
-
}
|
|
14442
|
-
if (config.converter === "marker") {
|
|
14443
|
-
const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
|
|
14444
|
-
return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
|
|
14445
|
-
}
|
|
14446
14827
|
if (config.converter === "external") {
|
|
14447
14828
|
if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
|
|
14448
14829
|
return new ExternalCommandPdfConverter("external", config.external);
|
|
@@ -14470,7 +14851,7 @@ const FILE_PART_EXTENSIONS = new Set([
|
|
|
14470
14851
|
"svg"
|
|
14471
14852
|
]);
|
|
14472
14853
|
const PDF_EXT_RE = /\.pdf$/i;
|
|
14473
|
-
async function readExtractFileInput(filePath, aiConfig
|
|
14854
|
+
async function readExtractFileInput(filePath, aiConfig) {
|
|
14474
14855
|
const stat = fs$1.statSync(filePath);
|
|
14475
14856
|
if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
|
|
14476
14857
|
size: bytesToMB(stat.size).toFixed(1),
|
|
@@ -14479,15 +14860,22 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
|
14479
14860
|
}));
|
|
14480
14861
|
const ext = path.extname(filePath).toLowerCase().replace(".", "");
|
|
14481
14862
|
if (FILE_PART_EXTENSIONS.has(ext)) {
|
|
14482
|
-
|
|
14483
|
-
|
|
14484
|
-
|
|
14485
|
-
|
|
14863
|
+
const image = aiConfig?.image;
|
|
14864
|
+
if (image?.imageConversion === "vision" && image.imageModelName && aiConfig) {
|
|
14865
|
+
const baseURL = image.visionBaseURL || aiConfig.provider.baseURL;
|
|
14866
|
+
const apiKey = image.visionApiKey || aiConfig.provider.apiKey;
|
|
14867
|
+
const timeout = (aiConfig.provider.timeout ?? 300) * 1e3;
|
|
14868
|
+
try {
|
|
14869
|
+
const result$1 = await transcribeImageWithVision(filePath, baseURL, apiKey, image.imageModelName, timeout);
|
|
14870
|
+
consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
|
|
14871
|
+
return { text: result$1.text };
|
|
14872
|
+
} catch {
|
|
14873
|
+
consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
|
|
14874
|
+
}
|
|
14486
14875
|
}
|
|
14487
|
-
|
|
14488
|
-
|
|
14489
|
-
|
|
14490
|
-
};
|
|
14876
|
+
const result = await recognizeImageText(filePath, aiConfig?.image);
|
|
14877
|
+
consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
|
|
14878
|
+
return { text: result.text };
|
|
14491
14879
|
}
|
|
14492
14880
|
if (ext === "pdf") {
|
|
14493
14881
|
const buffer = await fs.readFile(filePath);
|
|
@@ -14608,6 +14996,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14608
14996
|
|
|
14609
14997
|
//#endregion
|
|
14610
14998
|
//#region src/core/extract-runner.ts
|
|
14999
|
+
const encoding = getEncoding("cl100k_base");
|
|
14611
15000
|
const JSON_EXT_RE$1 = /\.json$/;
|
|
14612
15001
|
async function limitConcurrency(concurrency, items, fn) {
|
|
14613
15002
|
const results = Array.from({ length: items.length });
|
|
@@ -14622,29 +15011,6 @@ async function limitConcurrency(concurrency, items, fn) {
|
|
|
14622
15011
|
await Promise.all(workers);
|
|
14623
15012
|
return results;
|
|
14624
15013
|
}
|
|
14625
|
-
function getSchemaKeywords(schema) {
|
|
14626
|
-
const keywords = /* @__PURE__ */ new Set();
|
|
14627
|
-
function walk(properties) {
|
|
14628
|
-
if (!properties) return;
|
|
14629
|
-
for (const [name$1, prop] of Object.entries(properties)) {
|
|
14630
|
-
keywords.add(name$1.toLowerCase());
|
|
14631
|
-
const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
|
|
14632
|
-
for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
|
|
14633
|
-
if (prop && typeof prop === "object") {
|
|
14634
|
-
const p = prop;
|
|
14635
|
-
if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
|
|
14636
|
-
if (typeof p.description === "string") {
|
|
14637
|
-
const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
|
|
14638
|
-
for (const d of descParts) if (d.length > 2) keywords.add(d);
|
|
14639
|
-
}
|
|
14640
|
-
if (p.type === "object") walk(p.properties);
|
|
14641
|
-
if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
|
|
14642
|
-
}
|
|
14643
|
-
}
|
|
14644
|
-
}
|
|
14645
|
-
walk(schema.properties);
|
|
14646
|
-
return Array.from(keywords);
|
|
14647
|
-
}
|
|
14648
15014
|
async function ensureDatabaseReady(dbPath, schema) {
|
|
14649
15015
|
try {
|
|
14650
15016
|
await fs.access(dbPath);
|
|
@@ -14716,182 +15082,145 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14716
15082
|
}
|
|
14717
15083
|
const s = spinner();
|
|
14718
15084
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
14719
|
-
const
|
|
14720
|
-
|
|
14721
|
-
|
|
14722
|
-
|
|
14723
|
-
|
|
14724
|
-
|
|
14725
|
-
|
|
14726
|
-
|
|
14727
|
-
|
|
14728
|
-
|
|
14729
|
-
|
|
14730
|
-
|
|
14731
|
-
|
|
14732
|
-
|
|
14733
|
-
|
|
14734
|
-
|
|
14735
|
-
|
|
14736
|
-
|
|
14737
|
-
|
|
14738
|
-
|
|
14739
|
-
|
|
14740
|
-
|
|
14741
|
-
|
|
14742
|
-
|
|
14743
|
-
|
|
14744
|
-
|
|
14745
|
-
|
|
14746
|
-
|
|
14747
|
-
|
|
14748
|
-
|
|
14749
|
-
|
|
14750
|
-
|
|
14751
|
-
const
|
|
14752
|
-
|
|
14753
|
-
|
|
14754
|
-
|
|
14755
|
-
|
|
14756
|
-
|
|
14757
|
-
|
|
14758
|
-
if (
|
|
14759
|
-
|
|
14760
|
-
|
|
14761
|
-
|
|
14762
|
-
|
|
14763
|
-
|
|
14764
|
-
|
|
14765
|
-
|
|
14766
|
-
|
|
14767
|
-
|
|
14768
|
-
|
|
14769
|
-
|
|
14770
|
-
|
|
14771
|
-
|
|
14772
|
-
|
|
14773
|
-
|
|
14774
|
-
const headings = [];
|
|
14775
|
-
if (doc.metadata) {
|
|
14776
|
-
if (doc.metadata.h1) headings.push(doc.metadata.h1);
|
|
14777
|
-
if (doc.metadata.h2) headings.push(doc.metadata.h2);
|
|
14778
|
-
if (doc.metadata.h3) headings.push(doc.metadata.h3);
|
|
14779
|
-
if (doc.metadata.h4) headings.push(doc.metadata.h4);
|
|
14780
|
-
}
|
|
14781
|
-
let chunkText = doc.pageContent;
|
|
14782
|
-
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
14783
|
-
const chunkResult = await extractStructuredData({
|
|
14784
|
-
config: aiConfig,
|
|
14785
|
-
schema: schemaLoad.schema,
|
|
14786
|
-
text: chunkText,
|
|
14787
|
-
aiexDir,
|
|
14788
|
-
modelOverride,
|
|
14789
|
-
onRetry(info) {
|
|
14790
|
-
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
14791
|
-
current: i + 1,
|
|
14792
|
-
total: processedDocs.length,
|
|
14793
|
-
code: info.statusCode,
|
|
14794
|
-
delay: info.delayMs / 1e3,
|
|
14795
|
-
attempt: info.attempt,
|
|
14796
|
-
max: info.maxRetries
|
|
14797
|
-
}));
|
|
14798
|
-
}
|
|
14799
|
-
});
|
|
14800
|
-
if (!chunkResult.success) {
|
|
14801
|
-
success = false;
|
|
14802
|
-
errorMsg = chunkResult.error || t("common.unknownError");
|
|
14803
|
-
if (!options?.quiet) {
|
|
14804
|
-
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
14805
|
-
consola.error(errorMsg);
|
|
14806
|
-
}
|
|
14807
|
-
return;
|
|
14808
|
-
}
|
|
14809
|
-
if (chunkResult.data) chunkResults.push(chunkResult.data);
|
|
14810
|
-
if (chunkResult.tokensUsed) {
|
|
14811
|
-
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
14812
|
-
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
14813
|
-
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
15085
|
+
const maxTokens = calculateChunkTokenBudget({
|
|
15086
|
+
configuredMaxTokens: aiConfig.extraction?.maxTokens ?? 8e3,
|
|
15087
|
+
modelMaxTokens: modelOverride?.capabilities.maxTokens
|
|
15088
|
+
});
|
|
15089
|
+
const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
|
|
15090
|
+
const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
|
|
15091
|
+
if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
15092
|
+
length: totalTokens,
|
|
15093
|
+
limit: maxTokens
|
|
15094
|
+
}));
|
|
15095
|
+
const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
|
|
15096
|
+
pageContent: text$1 ?? "",
|
|
15097
|
+
metadata: {},
|
|
15098
|
+
chunkIndex: 0,
|
|
15099
|
+
totalChunks: 1,
|
|
15100
|
+
tokenCount: totalTokens,
|
|
15101
|
+
headingPath: [],
|
|
15102
|
+
charStart: 0,
|
|
15103
|
+
charEnd: text$1?.length ?? 0
|
|
15104
|
+
}];
|
|
15105
|
+
if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
|
|
15106
|
+
const chunkResults = Array.from({ length: processedDocs.length });
|
|
15107
|
+
const accumulatedTokens = {
|
|
15108
|
+
prompt: 0,
|
|
15109
|
+
completion: 0,
|
|
15110
|
+
total: 0
|
|
15111
|
+
};
|
|
15112
|
+
let success = true;
|
|
15113
|
+
let errorMsg = "";
|
|
15114
|
+
const extractionTasks = processedDocs.map((doc, i) => {
|
|
15115
|
+
return async () => {
|
|
15116
|
+
if (!success) return;
|
|
15117
|
+
const headings = doc.headingPath?.length ? doc.headingPath : [
|
|
15118
|
+
doc.metadata.h1,
|
|
15119
|
+
doc.metadata.h2,
|
|
15120
|
+
doc.metadata.h3,
|
|
15121
|
+
doc.metadata.h4
|
|
15122
|
+
].filter(Boolean);
|
|
15123
|
+
let chunkText = doc.pageContent;
|
|
15124
|
+
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
15125
|
+
const chunkResult = await extractStructuredData({
|
|
15126
|
+
config: aiConfig,
|
|
15127
|
+
schema: schemaLoad.schema,
|
|
15128
|
+
text: chunkText,
|
|
15129
|
+
aiexDir,
|
|
15130
|
+
modelOverride,
|
|
15131
|
+
onRetry(info) {
|
|
15132
|
+
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
15133
|
+
current: i + 1,
|
|
15134
|
+
total: processedDocs.length,
|
|
15135
|
+
code: info.statusCode,
|
|
15136
|
+
delay: info.delayMs / 1e3,
|
|
15137
|
+
attempt: info.attempt,
|
|
15138
|
+
max: info.maxRetries
|
|
15139
|
+
}));
|
|
14814
15140
|
}
|
|
14815
|
-
};
|
|
14816
|
-
});
|
|
14817
|
-
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
14818
|
-
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
14819
|
-
current: 1,
|
|
14820
|
-
total: processedDocs.length
|
|
14821
|
-
}));
|
|
14822
|
-
try {
|
|
14823
|
-
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
14824
|
-
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
14825
|
-
current: idx + 1,
|
|
14826
|
-
total: processedDocs.length
|
|
14827
|
-
}));
|
|
14828
|
-
await task();
|
|
14829
15141
|
});
|
|
14830
|
-
|
|
14831
|
-
|
|
14832
|
-
|
|
14833
|
-
|
|
14834
|
-
|
|
14835
|
-
|
|
14836
|
-
|
|
14837
|
-
|
|
14838
|
-
|
|
14839
|
-
|
|
14840
|
-
|
|
14841
|
-
|
|
14842
|
-
|
|
14843
|
-
|
|
14844
|
-
consola.error(valError);
|
|
15142
|
+
if (!chunkResult.success) {
|
|
15143
|
+
success = false;
|
|
15144
|
+
errorMsg = chunkResult.error || t("common.unknownError");
|
|
15145
|
+
if (!options?.quiet) {
|
|
15146
|
+
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
15147
|
+
consola.error(errorMsg);
|
|
15148
|
+
}
|
|
15149
|
+
return;
|
|
15150
|
+
}
|
|
15151
|
+
if (chunkResult.data) chunkResults[i] = chunkResult.data;
|
|
15152
|
+
if (chunkResult.tokensUsed) {
|
|
15153
|
+
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
15154
|
+
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
15155
|
+
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
14845
15156
|
}
|
|
14846
|
-
return {
|
|
14847
|
-
success: false,
|
|
14848
|
-
error: valError
|
|
14849
|
-
};
|
|
14850
|
-
}
|
|
14851
|
-
const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
14852
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
14853
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
14854
|
-
const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
|
|
14855
|
-
const finalMergedOutputPath = path.join(outputDir, outputFileName);
|
|
14856
|
-
await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
|
|
14857
|
-
result = {
|
|
14858
|
-
success: true,
|
|
14859
|
-
data: mergedData,
|
|
14860
|
-
tokensUsed: accumulatedTokens,
|
|
14861
|
-
outputPath: finalMergedOutputPath
|
|
14862
15157
|
};
|
|
14863
|
-
}
|
|
14864
|
-
|
|
14865
|
-
|
|
14866
|
-
|
|
14867
|
-
|
|
14868
|
-
|
|
14869
|
-
|
|
14870
|
-
|
|
14871
|
-
if (!options?.quiet) s.message(t("command.extract.file.
|
|
14872
|
-
|
|
14873
|
-
|
|
14874
|
-
attempt: info.attempt,
|
|
14875
|
-
max: info.maxRetries
|
|
15158
|
+
});
|
|
15159
|
+
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
15160
|
+
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
15161
|
+
current: 1,
|
|
15162
|
+
total: processedDocs.length
|
|
15163
|
+
}));
|
|
15164
|
+
try {
|
|
15165
|
+
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
15166
|
+
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
15167
|
+
current: idx + 1,
|
|
15168
|
+
total: processedDocs.length
|
|
14876
15169
|
}));
|
|
14877
|
-
|
|
15170
|
+
await task();
|
|
15171
|
+
});
|
|
15172
|
+
} catch (e) {
|
|
15173
|
+
success = false;
|
|
15174
|
+
errorMsg = e instanceof Error ? e.message : String(e);
|
|
15175
|
+
}
|
|
15176
|
+
if (!success) return {
|
|
15177
|
+
success: false,
|
|
15178
|
+
error: errorMsg
|
|
15179
|
+
};
|
|
15180
|
+
const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
|
|
15181
|
+
const candidateReport = buildCandidateMergeReport({
|
|
15182
|
+
schema: schemaLoad.schema,
|
|
15183
|
+
chunkResults: successfulChunkResults,
|
|
15184
|
+
chunks: processedDocs
|
|
14878
15185
|
});
|
|
14879
|
-
|
|
15186
|
+
const mergedData = applySelectedCandidates(mergeExtractionResults(schemaLoad.schema, successfulChunkResults), candidateReport);
|
|
15187
|
+
const validation = validateExtractedData(schemaLoad.schema, mergedData);
|
|
15188
|
+
if (!validation.success) {
|
|
15189
|
+
const valError = validation.error || "Merged data validation failed";
|
|
14880
15190
|
if (!options?.quiet) {
|
|
14881
|
-
s.stop(t("command.extract.file.
|
|
14882
|
-
consola.error(
|
|
15191
|
+
s.stop(t("command.extract.file.validationFail"));
|
|
15192
|
+
consola.error(valError);
|
|
14883
15193
|
}
|
|
14884
15194
|
return {
|
|
14885
15195
|
success: false,
|
|
14886
|
-
error:
|
|
15196
|
+
error: valError
|
|
14887
15197
|
};
|
|
14888
15198
|
}
|
|
15199
|
+
const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
15200
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
15201
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
15202
|
+
const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
|
|
15203
|
+
const outputPath = path.join(outputDir, outputFileName);
|
|
15204
|
+
await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
|
|
15205
|
+
const result = {
|
|
15206
|
+
success: true,
|
|
15207
|
+
data: mergedData,
|
|
15208
|
+
tokensUsed: accumulatedTokens,
|
|
15209
|
+
outputPath,
|
|
15210
|
+
evidenceSummary: await writeExtractionEvidence({
|
|
15211
|
+
schema: schemaLoad.schema,
|
|
15212
|
+
data: mergedData,
|
|
15213
|
+
outputPath,
|
|
15214
|
+
chunks: processedDocs,
|
|
15215
|
+
candidateReport
|
|
15216
|
+
})
|
|
15217
|
+
};
|
|
14889
15218
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
14890
15219
|
if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
|
|
14891
15220
|
if (result.evidenceSummary && !options?.quiet) {
|
|
14892
15221
|
const summary = result.evidenceSummary;
|
|
14893
15222
|
const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
|
|
14894
|
-
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
|
|
15223
|
+
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
|
|
14895
15224
|
}
|
|
14896
15225
|
if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
|
|
14897
15226
|
prompt: result.tokensUsed.prompt,
|
|
@@ -15013,13 +15342,9 @@ async function runAuditedExtraction(options) {
|
|
|
15013
15342
|
});
|
|
15014
15343
|
try {
|
|
15015
15344
|
let text$1 = "";
|
|
15016
|
-
|
|
15017
|
-
|
|
15018
|
-
|
|
15019
|
-
text$1 = input.text;
|
|
15020
|
-
filePath = input.filePath;
|
|
15021
|
-
} else text$1 = source.text;
|
|
15022
|
-
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
|
|
15345
|
+
if (source.type === "file") text$1 = (await readExtractFileInput(source.filePath, aiConfig)).text;
|
|
15346
|
+
else text$1 = source.text;
|
|
15347
|
+
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, source.type === "file" ? source.filePath : void 0, modelOverride, {
|
|
15023
15348
|
quiet,
|
|
15024
15349
|
insert
|
|
15025
15350
|
});
|