aiex-cli 0.0.5-beta.6 → 0.0.6-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -11
- package/dist/cli.mjs +197 -900
- package/dist/{doctor-collector-BpqhXNcO.mjs → doctor-collector-hWEvJ4lw.mjs} +89 -44
- package/dist/index.d.mts +88 -91
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-BlyTFIIy.js +272 -0
- package/dist/web/assets/{DataBrowser-BGkZb9FV.js → DataBrowser-GAA-pGq0.js} +1 -1
- package/dist/web/assets/ExtractionViewer-DqIrBGNK.js +1 -0
- package/dist/web/assets/{api-client-gQAAOw0v.js → api-client-b4ZBXpNH.js} +1 -1
- package/dist/web/assets/index-CvY9TGny.css +2 -0
- package/dist/web/assets/{index-BQKZKzzP.js → index-Dlze68g1.js} +3 -3
- package/dist/web/index.html +3 -3
- package/dist/{zh-CN-DkillGHx.mjs → zh-CN-Qcn0DHFh.mjs} +22 -16
- package/package.json +1 -3
- package/dist/web/assets/AISettings-sVI4PTNB.js +0 -264
- package/dist/web/assets/ExtractionViewer-DNrkSECj.js +0 -1
- package/dist/web/assets/index-BU58oIRd.css +0 -2
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as
|
|
1
|
+
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-hWEvJ4lw.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -17,14 +17,13 @@ import Database from "better-sqlite3";
|
|
|
17
17
|
import pc from "picocolors";
|
|
18
18
|
import { Buffer } from "node:buffer";
|
|
19
19
|
import * as XLSX from "xlsx";
|
|
20
|
-
import { getEncoding } from "js-tiktoken";
|
|
21
20
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
22
21
|
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
23
22
|
import pRetry from "p-retry";
|
|
23
|
+
import mime from "mime";
|
|
24
24
|
import { jsonrepair } from "jsonrepair";
|
|
25
25
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
26
26
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
27
|
-
import { marked } from "marked";
|
|
28
27
|
import crypto from "node:crypto";
|
|
29
28
|
import { Client, extractNotionId } from "@notionhq/client";
|
|
30
29
|
import { execa } from "execa";
|
|
@@ -12860,6 +12859,28 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
|
|
|
12860
12859
|
});
|
|
12861
12860
|
}
|
|
12862
12861
|
|
|
12862
|
+
//#endregion
|
|
12863
|
+
//#region src/core/ai-extraction/file-utils.ts
|
|
12864
|
+
function detectMimeType(filePath) {
|
|
12865
|
+
return mime.getType(filePath) ?? "application/octet-stream";
|
|
12866
|
+
}
|
|
12867
|
+
async function readFilePart(filePath) {
|
|
12868
|
+
const mimeStr = detectMimeType(filePath);
|
|
12869
|
+
const buffer = await fs.readFile(filePath);
|
|
12870
|
+
const name$1 = path.basename(filePath);
|
|
12871
|
+
if (mimeStr.startsWith("image/")) return {
|
|
12872
|
+
type: "image",
|
|
12873
|
+
image: buffer,
|
|
12874
|
+
mimeType: mimeStr
|
|
12875
|
+
};
|
|
12876
|
+
return {
|
|
12877
|
+
type: "file",
|
|
12878
|
+
data: buffer,
|
|
12879
|
+
mediaType: mimeStr,
|
|
12880
|
+
filename: name$1
|
|
12881
|
+
};
|
|
12882
|
+
}
|
|
12883
|
+
|
|
12863
12884
|
//#endregion
|
|
12864
12885
|
//#region src/core/ai-extraction/json-utils.ts
|
|
12865
12886
|
function parseJsonLike(text$1) {
|
|
@@ -12920,10 +12941,25 @@ function filterCompatible(models, inputTokens, outputTokens) {
|
|
|
12920
12941
|
});
|
|
12921
12942
|
}
|
|
12922
12943
|
function selectModel(input) {
|
|
12923
|
-
const { models, inputTokens, outputTokens } = input;
|
|
12944
|
+
const { models, isImage, fileName, inputTokens, outputTokens } = input;
|
|
12924
12945
|
if (models.length === 0) throw new Error(t("errors.ai.noModels"));
|
|
12925
12946
|
let candidates = filterCompatible(models, inputTokens, outputTokens);
|
|
12926
12947
|
if (candidates.length === 0) candidates = models;
|
|
12948
|
+
if (isImage) {
|
|
12949
|
+
const visionModel = candidates.find((m) => m.capabilities.vision);
|
|
12950
|
+
if (!visionModel) {
|
|
12951
|
+
const hint = fileName ? ` (${fileName})` : "";
|
|
12952
|
+
const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
|
|
12953
|
+
tokens: inputTokens,
|
|
12954
|
+
hint
|
|
12955
|
+
}) : t("errors.ai.noVisionModel", { hint });
|
|
12956
|
+
throw new Error(msg + t("errors.ai.addSuitableModel"));
|
|
12957
|
+
}
|
|
12958
|
+
return {
|
|
12959
|
+
name: visionModel.name,
|
|
12960
|
+
capabilities: visionModel.capabilities
|
|
12961
|
+
};
|
|
12962
|
+
}
|
|
12927
12963
|
const soModel = candidates.find((m) => m.capabilities.structuredOutput);
|
|
12928
12964
|
if (soModel) return {
|
|
12929
12965
|
name: soModel.name,
|
|
@@ -12937,46 +12973,36 @@ function selectModel(input) {
|
|
|
12937
12973
|
|
|
12938
12974
|
//#endregion
|
|
12939
12975
|
//#region src/core/ai-extraction/prompt-generator.ts
|
|
12940
|
-
|
|
12941
|
-
const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
|
|
12942
|
-
function splitIdentifier(name$1) {
|
|
12943
|
-
return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
|
|
12944
|
-
}
|
|
12945
|
-
function propertyToDescription(name$1, prop, indent = "", required = false) {
|
|
12976
|
+
function propertyToDescription(name$1, prop, indent = "") {
|
|
12946
12977
|
const lines = [];
|
|
12947
12978
|
let typeStr = prop.type;
|
|
12948
12979
|
if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
|
|
12949
|
-
lines.push(`${indent}- ${name$1}: ${typeStr}
|
|
12950
|
-
const terms = splitIdentifier(name$1);
|
|
12951
|
-
if (terms.length > 1) lines.push(`${indent} search terms: ${terms.join(", ")}`);
|
|
12952
|
-
if (prop.description) lines.push(`${indent} description: ${prop.description}`);
|
|
12980
|
+
lines.push(`${indent}- ${name$1}: ${typeStr}`);
|
|
12953
12981
|
if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent} length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
|
|
12954
|
-
if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent} range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
|
|
12955
12982
|
if (prop.format) lines.push(`${indent} format: ${prop.format}`);
|
|
12956
12983
|
if (prop.unique) lines.push(`${indent} unique: true`);
|
|
12957
12984
|
if (prop.default !== void 0) lines.push(`${indent} default: ${JSON.stringify(prop.default)}`);
|
|
12958
12985
|
return lines.join("\n");
|
|
12959
12986
|
}
|
|
12960
|
-
function nestedPropertyToDescription(name$1, prop, indent = ""
|
|
12987
|
+
function nestedPropertyToDescription(name$1, prop, indent = "") {
|
|
12961
12988
|
const lines = [];
|
|
12962
|
-
const isRequired = requiredFields.includes(name$1);
|
|
12963
12989
|
if (prop.nested?.enabled && prop.type === "object") {
|
|
12964
12990
|
const relation = prop.nested.relation || "has-one";
|
|
12965
|
-
lines.push(`${indent}- ${name$1}: object (related table, ${relation})
|
|
12966
|
-
if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12991
|
+
lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
|
|
12992
|
+
if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
|
|
12967
12993
|
return lines.join("\n");
|
|
12968
12994
|
}
|
|
12969
12995
|
if (prop.type === "array" && prop.items?.nested?.enabled) {
|
|
12970
12996
|
const relation = prop.items.nested.relation || "has-many";
|
|
12971
|
-
lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})
|
|
12972
|
-
if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
12997
|
+
lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
|
|
12998
|
+
if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
|
|
12973
12999
|
return lines.join("\n");
|
|
12974
13000
|
}
|
|
12975
|
-
lines.push(propertyToDescription(name$1, prop, indent
|
|
12976
|
-
if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
13001
|
+
lines.push(propertyToDescription(name$1, prop, indent));
|
|
13002
|
+
if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
|
|
12977
13003
|
if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
|
|
12978
13004
|
lines.push(`${indent} item fields:`);
|
|
12979
|
-
for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
13005
|
+
for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
|
|
12980
13006
|
}
|
|
12981
13007
|
return lines.join("\n");
|
|
12982
13008
|
}
|
|
@@ -12988,7 +13014,7 @@ function schemaToDescription(schema) {
|
|
|
12988
13014
|
lines.push("Fields:");
|
|
12989
13015
|
for (const [name$1, prop] of Object.entries(schema.properties)) {
|
|
12990
13016
|
const property = prop;
|
|
12991
|
-
lines.push(nestedPropertyToDescription(name$1, property
|
|
13017
|
+
lines.push(nestedPropertyToDescription(name$1, property));
|
|
12992
13018
|
}
|
|
12993
13019
|
if (schema.examples && schema.examples.length > 0) {
|
|
12994
13020
|
lines.push("");
|
|
@@ -13033,6 +13059,33 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
|
|
|
13033
13059
|
].join("\n");
|
|
13034
13060
|
}
|
|
13035
13061
|
|
|
13062
|
+
//#endregion
|
|
13063
|
+
//#region src/core/ai-extraction/snapshot.ts
|
|
13064
|
+
const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
|
|
13065
|
+
const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
|
|
13066
|
+
async function loadPromptSnapshot(aiexDir, tableName) {
|
|
13067
|
+
const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
|
|
13068
|
+
try {
|
|
13069
|
+
const content = await fs.readFile(snapshotPath, "utf-8");
|
|
13070
|
+
const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
|
|
13071
|
+
const userMatch = content.match(USER_PROMPT_REGEX);
|
|
13072
|
+
if (systemMatch && userMatch) return {
|
|
13073
|
+
system: systemMatch[1].trim(),
|
|
13074
|
+
user: userMatch[1].trim()
|
|
13075
|
+
};
|
|
13076
|
+
} catch {}
|
|
13077
|
+
return null;
|
|
13078
|
+
}
|
|
13079
|
+
async function savePromptSnapshot(schema, aiexDir) {
|
|
13080
|
+
const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13081
|
+
const outputDir = path.join(aiexDir, "extracted");
|
|
13082
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
13083
|
+
const fileName = `${schema.table.name}.prompt.md`;
|
|
13084
|
+
const outputPath = path.join(outputDir, fileName);
|
|
13085
|
+
await fs.writeFile(outputPath, content);
|
|
13086
|
+
return outputPath;
|
|
13087
|
+
}
|
|
13088
|
+
|
|
13036
13089
|
//#endregion
|
|
13037
13090
|
//#region src/core/ai-extraction/telemetry.ts
|
|
13038
13091
|
let langfuseInitialized = false;
|
|
@@ -13075,7 +13128,7 @@ function propertyToExtractionSchema(property) {
|
|
|
13075
13128
|
}
|
|
13076
13129
|
return { type: nullableType(property.type) };
|
|
13077
13130
|
}
|
|
13078
|
-
function isRecord
|
|
13131
|
+
function isRecord(value) {
|
|
13079
13132
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13080
13133
|
}
|
|
13081
13134
|
function schemaToExtractionOutputSchema(schema) {
|
|
@@ -13113,7 +13166,7 @@ function validatePropertyValue(path$1, property, value, issues) {
|
|
|
13113
13166
|
}
|
|
13114
13167
|
return;
|
|
13115
13168
|
case "object":
|
|
13116
|
-
if (!isRecord
|
|
13169
|
+
if (!isRecord(value)) {
|
|
13117
13170
|
issues.push(`${path$1}: expected object or null`);
|
|
13118
13171
|
return;
|
|
13119
13172
|
}
|
|
@@ -13136,7 +13189,7 @@ function validateProperties(basePath, properties, data, issues) {
|
|
|
13136
13189
|
}
|
|
13137
13190
|
}
|
|
13138
13191
|
function validateExtractedData(schema, data) {
|
|
13139
|
-
if (!isRecord
|
|
13192
|
+
if (!isRecord(data)) return {
|
|
13140
13193
|
success: false,
|
|
13141
13194
|
error: "Extracted data must be a JSON object."
|
|
13142
13195
|
};
|
|
@@ -13153,11 +13206,13 @@ function validateExtractedData(schema, data) {
|
|
|
13153
13206
|
//#region src/core/ai-extraction/extractor.ts
|
|
13154
13207
|
const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
|
|
13155
13208
|
async function extractStructuredData(input) {
|
|
13156
|
-
const { config, schema, text: text$1, modelOverride } = input;
|
|
13209
|
+
const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
|
|
13157
13210
|
if (!config.provider.apiKey) return {
|
|
13158
13211
|
success: false,
|
|
13159
13212
|
error: t("errors.ai.apiKeyMissing")
|
|
13160
13213
|
};
|
|
13214
|
+
const useFileContent = !!file;
|
|
13215
|
+
const isImageFile = useFileContent && detectMimeType(file).startsWith("image/");
|
|
13161
13216
|
const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
|
|
13162
13217
|
const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
|
|
13163
13218
|
const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
|
|
@@ -13165,6 +13220,8 @@ async function extractStructuredData(input) {
|
|
|
13165
13220
|
try {
|
|
13166
13221
|
selected = modelOverride ?? selectModel({
|
|
13167
13222
|
models: config.provider.models,
|
|
13223
|
+
isImage: isImageFile,
|
|
13224
|
+
fileName: file,
|
|
13168
13225
|
inputTokens,
|
|
13169
13226
|
outputTokens
|
|
13170
13227
|
});
|
|
@@ -13184,7 +13241,18 @@ async function extractStructuredData(input) {
|
|
|
13184
13241
|
apiKey: config.provider.apiKey,
|
|
13185
13242
|
supportsStructuredOutputs: useStructuredOutput
|
|
13186
13243
|
});
|
|
13187
|
-
|
|
13244
|
+
let system;
|
|
13245
|
+
let user;
|
|
13246
|
+
const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
|
|
13247
|
+
const promptText = file ? PLACEHOLDER_TEXT : text$1;
|
|
13248
|
+
if (snapshot) {
|
|
13249
|
+
system = snapshot.system;
|
|
13250
|
+
user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
|
|
13251
|
+
} else {
|
|
13252
|
+
const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13253
|
+
system = generated.system;
|
|
13254
|
+
user = generated.user;
|
|
13255
|
+
}
|
|
13188
13256
|
const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
|
|
13189
13257
|
const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
|
|
13190
13258
|
let systemPrompt = system;
|
|
@@ -13199,16 +13267,38 @@ async function extractStructuredData(input) {
|
|
|
13199
13267
|
let parseError;
|
|
13200
13268
|
let validationError;
|
|
13201
13269
|
try {
|
|
13202
|
-
|
|
13203
|
-
|
|
13204
|
-
|
|
13205
|
-
|
|
13206
|
-
|
|
13207
|
-
|
|
13208
|
-
|
|
13209
|
-
|
|
13210
|
-
|
|
13211
|
-
|
|
13270
|
+
if (useFileContent) {
|
|
13271
|
+
const filePart = await readFilePart(file);
|
|
13272
|
+
const fileName = filePart.type === "file" ? filePart.filename : path.basename(file);
|
|
13273
|
+
const contentParts = [{
|
|
13274
|
+
type: "text",
|
|
13275
|
+
text: userPrompt.includes(PLACEHOLDER_TEXT) ? userPrompt.replaceAll(PLACEHOLDER_TEXT, text$1 || `Data is contained in the attached file: ${fileName}`) : userPrompt
|
|
13276
|
+
}, filePart];
|
|
13277
|
+
const fileOpts = {
|
|
13278
|
+
model: provider.chatModel(selected.name),
|
|
13279
|
+
system: systemPrompt,
|
|
13280
|
+
messages: [{
|
|
13281
|
+
role: "user",
|
|
13282
|
+
content: contentParts
|
|
13283
|
+
}],
|
|
13284
|
+
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13285
|
+
maxRetries: 0,
|
|
13286
|
+
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13287
|
+
};
|
|
13288
|
+
if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
|
|
13289
|
+
result = await withRetry(() => generateText(fileOpts), input.onRetry);
|
|
13290
|
+
} else {
|
|
13291
|
+
const textOpts = {
|
|
13292
|
+
model: provider.chatModel(selected.name),
|
|
13293
|
+
system: systemPrompt,
|
|
13294
|
+
prompt: userPrompt,
|
|
13295
|
+
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13296
|
+
maxRetries: 0,
|
|
13297
|
+
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13298
|
+
};
|
|
13299
|
+
if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
|
|
13300
|
+
result = await withRetry(() => generateText(textOpts), input.onRetry);
|
|
13301
|
+
}
|
|
13212
13302
|
if (result.usage) {
|
|
13213
13303
|
totalPromptTokens += result.usage.inputTokens ?? 0;
|
|
13214
13304
|
totalCompletionTokens += result.usage.outputTokens ?? 0;
|
|
@@ -13224,16 +13314,27 @@ async function extractStructuredData(input) {
|
|
|
13224
13314
|
}
|
|
13225
13315
|
if (!parseError && data !== void 0) {
|
|
13226
13316
|
const validation = validateExtractedData(schema, data);
|
|
13227
|
-
if (validation.success)
|
|
13228
|
-
|
|
13229
|
-
|
|
13230
|
-
|
|
13231
|
-
|
|
13232
|
-
|
|
13233
|
-
|
|
13234
|
-
|
|
13235
|
-
|
|
13236
|
-
|
|
13317
|
+
if (validation.success) {
|
|
13318
|
+
const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
|
|
13319
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
13320
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
13321
|
+
const outputFileName = `${schema.table.name}-${timestamp}.json`;
|
|
13322
|
+
const outputPath = path.join(outputDir, outputFileName);
|
|
13323
|
+
await writeFile(outputPath, data, {
|
|
13324
|
+
spaces: 2,
|
|
13325
|
+
EOL: "\n"
|
|
13326
|
+
});
|
|
13327
|
+
return {
|
|
13328
|
+
success: true,
|
|
13329
|
+
outputPath,
|
|
13330
|
+
data,
|
|
13331
|
+
tokensUsed: {
|
|
13332
|
+
prompt: totalPromptTokens,
|
|
13333
|
+
completion: totalCompletionTokens,
|
|
13334
|
+
total: totalPromptTokens + totalCompletionTokens
|
|
13335
|
+
}
|
|
13336
|
+
};
|
|
13337
|
+
} else validationError = validation.error;
|
|
13237
13338
|
}
|
|
13238
13339
|
const errorMsg = parseError || validationError || "Unknown validation error";
|
|
13239
13340
|
lastError = errorMsg;
|
|
@@ -13244,14 +13345,11 @@ async function extractStructuredData(input) {
|
|
|
13244
13345
|
CRITICAL RULES:
|
|
13245
13346
|
1. Only correct the fields that failed validation.
|
|
13246
13347
|
2. Preserve all other correctly extracted fields and their values exactly.
|
|
13247
|
-
3.
|
|
13248
|
-
4. Remove any fields not defined by the JSON Schema.
|
|
13249
|
-
5. Normalize values to the expected JSON type without changing the intended meaning.
|
|
13250
|
-
6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
|
|
13348
|
+
3. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
|
|
13251
13349
|
userPrompt = `The JSON data you generated previously failed validation. Please correct it.
|
|
13252
13350
|
|
|
13253
13351
|
[Original Text]
|
|
13254
|
-
${text$1 || "
|
|
13352
|
+
${text$1 || "Data is contained in the attached file."}
|
|
13255
13353
|
|
|
13256
13354
|
[JSON Schema Definition]
|
|
13257
13355
|
${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
|
|
@@ -13262,11 +13360,6 @@ ${invalidJson}
|
|
|
13262
13360
|
[Validation Error Details]
|
|
13263
13361
|
${errorMsg}
|
|
13264
13362
|
|
|
13265
|
-
Correction checklist:
|
|
13266
|
-
- Fix each field path mentioned in the validation error.
|
|
13267
|
-
- Keep schema-valid fields unchanged.
|
|
13268
|
-
- Do not invent missing facts; use null when the original text does not support a value.
|
|
13269
|
-
|
|
13270
13363
|
Please output the corrected JSON object now:`;
|
|
13271
13364
|
}
|
|
13272
13365
|
}
|
|
@@ -13419,343 +13512,6 @@ function insertExtractedData(db, schema, data) {
|
|
|
13419
13512
|
}
|
|
13420
13513
|
}
|
|
13421
13514
|
|
|
13422
|
-
//#endregion
|
|
13423
|
-
//#region src/core/ai-extraction/json-merger.ts
|
|
13424
|
-
function isRecord$1(value) {
|
|
13425
|
-
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13426
|
-
}
|
|
13427
|
-
function stableKey(value) {
|
|
13428
|
-
if (!isRecord$1(value)) return JSON.stringify(value);
|
|
13429
|
-
return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
|
|
13430
|
-
acc[key] = value[key];
|
|
13431
|
-
return acc;
|
|
13432
|
-
}, {}));
|
|
13433
|
-
}
|
|
13434
|
-
function isBlankString(value) {
|
|
13435
|
-
return typeof value === "string" && value.trim() === "";
|
|
13436
|
-
}
|
|
13437
|
-
function isPlaceholderString$1(value) {
|
|
13438
|
-
if (typeof value !== "string") return false;
|
|
13439
|
-
const normalized = value.trim().toLowerCase();
|
|
13440
|
-
return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
|
|
13441
|
-
}
|
|
13442
|
-
function pickPrimitiveValue(values) {
|
|
13443
|
-
const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
|
|
13444
|
-
if (meaningful.length === 0) return null;
|
|
13445
|
-
if (typeof meaningful[0] === "boolean") {
|
|
13446
|
-
const trueCount = meaningful.filter(Boolean).length;
|
|
13447
|
-
return trueCount >= meaningful.length - trueCount;
|
|
13448
|
-
}
|
|
13449
|
-
return meaningful[0];
|
|
13450
|
-
}
|
|
13451
|
-
function mergePropertyValue(property, values) {
|
|
13452
|
-
const nonNullValues = values.filter((v) => v !== null && v !== void 0);
|
|
13453
|
-
if (nonNullValues.length === 0) return null;
|
|
13454
|
-
if (property.type === "array") {
|
|
13455
|
-
const concatenated = [];
|
|
13456
|
-
const seen = /* @__PURE__ */ new Set();
|
|
13457
|
-
for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
|
|
13458
|
-
const key = stableKey(item);
|
|
13459
|
-
if (!seen.has(key)) {
|
|
13460
|
-
seen.add(key);
|
|
13461
|
-
concatenated.push(item);
|
|
13462
|
-
}
|
|
13463
|
-
}
|
|
13464
|
-
return concatenated;
|
|
13465
|
-
}
|
|
13466
|
-
if (property.type === "object") {
|
|
13467
|
-
const childProperties = property.properties;
|
|
13468
|
-
if (!childProperties) {
|
|
13469
|
-
const mergedObj$1 = {};
|
|
13470
|
-
for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
|
|
13471
|
-
return mergedObj$1;
|
|
13472
|
-
}
|
|
13473
|
-
const mergedObj = {};
|
|
13474
|
-
for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
|
|
13475
|
-
return mergedObj;
|
|
13476
|
-
}
|
|
13477
|
-
return pickPrimitiveValue(nonNullValues);
|
|
13478
|
-
}
|
|
13479
|
-
/**
|
|
13480
|
-
* Merges structured extraction outputs from multiple document chunks
|
|
13481
|
-
* according to the schema properties.
|
|
13482
|
-
*/
|
|
13483
|
-
function mergeExtractionResults(schema, results) {
|
|
13484
|
-
if (results.length === 0) return {};
|
|
13485
|
-
if (results.length === 1) return results[0];
|
|
13486
|
-
const merged = {};
|
|
13487
|
-
for (const [propName, propDef] of Object.entries(schema.properties)) {
|
|
13488
|
-
if (propDef.primary && propDef.autoIncrement) continue;
|
|
13489
|
-
merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
|
|
13490
|
-
}
|
|
13491
|
-
return merged;
|
|
13492
|
-
}
|
|
13493
|
-
|
|
13494
|
-
//#endregion
|
|
13495
|
-
//#region src/core/ai-extraction/snapshot.ts
|
|
13496
|
-
async function savePromptSnapshot(schema, aiexDir) {
|
|
13497
|
-
const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13498
|
-
const outputDir = path.join(aiexDir, "extracted");
|
|
13499
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
13500
|
-
const fileName = `${schema.table.name}.prompt.md`;
|
|
13501
|
-
const outputPath = path.join(outputDir, fileName);
|
|
13502
|
-
await fs.writeFile(outputPath, content);
|
|
13503
|
-
return outputPath;
|
|
13504
|
-
}
|
|
13505
|
-
|
|
13506
|
-
//#endregion
|
|
13507
|
-
//#region src/core/ai-extraction/text-splitter.ts
|
|
13508
|
-
const encoding$1 = getEncoding("cl100k_base");
|
|
13509
|
-
const MAX_OVERLAP_RATIO = .15;
|
|
13510
|
-
const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
|
|
13511
|
-
const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
|
|
13512
|
-
const LEADING_TABLE_PIPE_RE = /^\|/;
|
|
13513
|
-
const TRAILING_TABLE_PIPE_RE = /\|$/;
|
|
13514
|
-
function countTokens(text$1) {
|
|
13515
|
-
return encoding$1.encode(text$1).length;
|
|
13516
|
-
}
|
|
13517
|
-
function calculateChunkTokenBudget(options = {}) {
|
|
13518
|
-
const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
|
|
13519
|
-
const modelMaxTokens = options.modelMaxTokens;
|
|
13520
|
-
if (!modelMaxTokens) return configuredMaxTokens;
|
|
13521
|
-
const outputReserveTokens = options.outputReserveTokens ?? 2e3;
|
|
13522
|
-
const promptReserveTokens = options.promptReserveTokens ?? 1200;
|
|
13523
|
-
const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
|
|
13524
|
-
const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
|
|
13525
|
-
return Math.max(512, Math.min(configuredMaxTokens, available));
|
|
13526
|
-
}
|
|
13527
|
-
function formatHeadingContext(headings) {
|
|
13528
|
-
const active = headings.filter(Boolean);
|
|
13529
|
-
if (active.length === 0) return "";
|
|
13530
|
-
return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
|
|
13531
|
-
}
|
|
13532
|
-
function getMetadata(headings) {
|
|
13533
|
-
return {
|
|
13534
|
-
h1: headings[0] || void 0,
|
|
13535
|
-
h2: headings[1] || void 0,
|
|
13536
|
-
h3: headings[2] || void 0,
|
|
13537
|
-
h4: headings[3] || void 0
|
|
13538
|
-
};
|
|
13539
|
-
}
|
|
13540
|
-
function getHeadingPath(metadata) {
|
|
13541
|
-
return [
|
|
13542
|
-
metadata.h1,
|
|
13543
|
-
metadata.h2,
|
|
13544
|
-
metadata.h3,
|
|
13545
|
-
metadata.h4
|
|
13546
|
-
].filter(Boolean);
|
|
13547
|
-
}
|
|
13548
|
-
function finalizeChunks(chunks, sourceText) {
|
|
13549
|
-
let searchStart = 0;
|
|
13550
|
-
const totalChunks = chunks.length;
|
|
13551
|
-
return chunks.map((chunk, index) => {
|
|
13552
|
-
const tokenCount = countTokens(chunk.pageContent);
|
|
13553
|
-
let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
|
|
13554
|
-
if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
|
|
13555
|
-
const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
|
|
13556
|
-
if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
|
|
13557
|
-
return {
|
|
13558
|
-
...chunk,
|
|
13559
|
-
chunkIndex: index,
|
|
13560
|
-
totalChunks,
|
|
13561
|
-
tokenCount,
|
|
13562
|
-
headingPath: getHeadingPath(chunk.metadata),
|
|
13563
|
-
charStart: charStart >= 0 ? charStart : void 0,
|
|
13564
|
-
charEnd
|
|
13565
|
-
};
|
|
13566
|
-
});
|
|
13567
|
-
}
|
|
13568
|
-
function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
|
|
13569
|
-
return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
|
|
13570
|
-
}
|
|
13571
|
-
function splitMarkdownTable(tableText, maxTokens) {
|
|
13572
|
-
if (countTokens(tableText) <= maxTokens) return [tableText];
|
|
13573
|
-
const lines = tableText.split("\n");
|
|
13574
|
-
const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
|
|
13575
|
-
const separatorIndex = lines.findIndex((line, index) => {
|
|
13576
|
-
if (index <= headerIndex) return false;
|
|
13577
|
-
const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
|
|
13578
|
-
return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
|
|
13579
|
-
});
|
|
13580
|
-
if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
|
|
13581
|
-
const prefix = lines.slice(0, headerIndex);
|
|
13582
|
-
const header = lines[headerIndex];
|
|
13583
|
-
const separator = lines[separatorIndex];
|
|
13584
|
-
const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
|
|
13585
|
-
const chunks = [];
|
|
13586
|
-
let currentRows = [];
|
|
13587
|
-
const buildTable = (tableRows) => {
|
|
13588
|
-
return [
|
|
13589
|
-
...prefix,
|
|
13590
|
-
header,
|
|
13591
|
-
separator,
|
|
13592
|
-
...tableRows
|
|
13593
|
-
].join("\n");
|
|
13594
|
-
};
|
|
13595
|
-
for (const row of rows) {
|
|
13596
|
-
const candidateRows = [...currentRows, row];
|
|
13597
|
-
if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
|
|
13598
|
-
chunks.push(buildTable(currentRows));
|
|
13599
|
-
currentRows = [row];
|
|
13600
|
-
} else currentRows = candidateRows;
|
|
13601
|
-
}
|
|
13602
|
-
if (currentRows.length > 0) chunks.push(buildTable(currentRows));
|
|
13603
|
-
return chunks.length > 0 ? chunks : [tableText];
|
|
13604
|
-
}
|
|
13605
|
-
/**
 * Breaks `text$1` into pieces that each fit within `maxTokens` (as measured by
 * `countTokens`), trying the given separators in order and falling back to a
 * character-by-character split once no separators remain.
 * Separators are kept: each piece retains the separator that followed it.
 *
 * @param {string} text$1 - Text to split.
 * @param {number} maxTokens - Token budget for a single piece.
 * @param {string[]} [separators] - Separators to try, coarsest first.
 * @returns {string[]} Pieces whose concatenation reproduces `text$1`.
 */
function splitTextRecursively(text$1, maxTokens, separators = [
	"\n\n",
	"\n",
	"。",
	". ",
	" "
]) {
	if (countTokens(text$1) <= maxTokens) return [text$1];
	if (separators.length === 0) {
		// No separators left: grow a buffer one character at a time.
		const pieces = [];
		let buffer = "";
		for (const char of text$1) {
			if (countTokens(buffer + char) > maxTokens) {
				pieces.push(buffer);
				buffer = char;
			} else {
				buffer += char;
			}
		}
		if (buffer) pieces.push(buffer);
		return pieces;
	}
	const [separator, ...nextSeparators] = separators;
	const parts = text$1.split(separator);
	const lastIndex = parts.length - 1;
	const output = [];
	let pending = [];
	let pendingTokens = 0;
	const emitPending = () => {
		output.push(pending.join(""));
		pending = [];
		pendingTokens = 0;
	};
	for (let i = 0; i <= lastIndex; i++) {
		const part = parts[i];
		// Every part except the last gets its separator back.
		const trailing = i < lastIndex ? separator : "";
		const itemText = part + trailing;
		const itemTokens = countTokens(itemText);
		if (itemTokens > maxTokens) {
			// The part alone is too large: emit what is pending, then split the part with finer separators.
			if (pending.length > 0) emitPending();
			const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
			subParts.forEach((sub, j) => {
				output.push(j === subParts.length - 1 ? sub + trailing : sub);
			});
			continue;
		}
		if (pendingTokens + itemTokens > maxTokens) emitPending();
		pending.push(itemText);
		pendingTokens += itemTokens;
	}
	if (pending.length > 0) output.push(pending.join(""));
	return output;
}
|
|
13660
|
-
/**
 * Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
 * Tables and code blocks are treated as atomic blocks; a list larger than `maxTokens` is processed item by item.
 *
 * @param {string} text$1 - Markdown source; tokenized with `marked.lexer`.
 * @param {number} [maxTokens=8e3] - Token budget per chunk (measured with `countTokens`).
 * @param {number} [overlapTokens=1e3] - Requested overlap; the value actually used comes from `getEffectiveOverlapTokens`.
 * @returns The value produced by `finalizeChunks(chunks, text$1)`.
 */
function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
	const tokens = marked.lexer(text$1);
	const chunks = [];
	const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
	// Heading titles indexed by depth - 1; truncated whenever a heading is seen.
	let currentHeadings = [];
	// Items ({ text, headings }) collected for the chunk currently being built.
	let currentChunkList = [];
	let accumulatedTokens = 0;
	// Emits the collected items as one chunk, then either clears the list or keeps trailing items as overlap.
	const flushCurrentChunk = (isHeadingChange = false) => {
		if (currentChunkList.length === 0) return;
		const pageContent = currentChunkList.map((item) => item.text).join("");
		// Chunk metadata is derived from the headings of the first item only.
		const firstHeadings = currentChunkList[0].headings;
		chunks.push({
			pageContent,
			metadata: getMetadata(firstHeadings)
		});
		if (isHeadingChange || effectiveOverlapTokens <= 0) {
			currentChunkList = [];
			accumulatedTokens = 0;
		} else {
			// Walk backwards collecting items for the overlap; the last item is always kept,
			// even when it alone exceeds the overlap budget.
			// NOTE(review): this means a just-flushed block is carried into the next chunk — confirm this is intended for atomic blocks.
			const overlapItems = [];
			let currentOverlapTokens = 0;
			for (let i = currentChunkList.length - 1; i >= 0; i--) {
				const item = currentChunkList[i];
				const itemTokens = countTokens(item.text);
				if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
				overlapItems.unshift(item);
				currentOverlapTokens += itemTokens;
			}
			currentChunkList = [...overlapItems];
			accumulatedTokens = currentOverlapTokens;
		}
	};
	for (const token of tokens) {
		if (token.type === "space") {
			// Whitespace tokens are appended to the previous item rather than becoming items themselves.
			if (currentChunkList.length > 0) {
				currentChunkList[currentChunkList.length - 1].text += token.raw;
				accumulatedTokens += countTokens(token.raw);
			}
			continue;
		}
		if (token.type === "heading") {
			// A heading closes the current chunk without overlap and updates the heading path.
			flushCurrentChunk(true);
			const depth = token.depth;
			const title = token.text.trim();
			currentHeadings = currentHeadings.slice(0, depth - 1);
			currentHeadings[depth - 1] = title;
		}
		// The heading token itself falls through and is added as a text block below.
		const rawText = token.raw;
		if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
		else {
			const isAtomic = token.type === "table" || token.type === "code";
			processTextBlock(rawText, currentHeadings, isAtomic);
		}
	}
	flushCurrentChunk(true);
	return finalizeChunks(chunks, text$1);
	// Adds one block to the current chunk, flushing and/or splitting it when it does not fit.
	// Declared after the return statement; available above through function hoisting.
	function processTextBlock(blockText, headings, isAtomic = false) {
		const blockTokens = countTokens(blockText);
		const contextTokens = countTokens(formatHeadingContext(headings));
		// Reserve 10% of maxTokens, clamped to the range [2, 100].
		const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
		// Budget left for content after heading context and safety buffer; never below 5.
		const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
		if (blockTokens > budgetLimit) if (isAtomic) {
			flushCurrentChunk(false);
			// Keep the atomic block whole when it still fits maxTokens; otherwise split by table rows
			// (when the text contains "|") or by lines.
			const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
			for (const block of atomicBlocks) {
				currentChunkList.push({
					text: block,
					headings: [...headings]
				});
				accumulatedTokens = countTokens(block);
				flushCurrentChunk(false);
			}
		} else {
			// Oversized non-atomic block: split it and flush whenever the running total exceeds the budget.
			flushCurrentChunk(false);
			const subBlocks = splitTextRecursively(blockText, budgetLimit);
			for (const sub of subBlocks) {
				currentChunkList.push({
					text: sub,
					headings: [...headings]
				});
				accumulatedTokens += countTokens(sub);
				if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
			}
		}
		else {
			// Block fits the budget: flush first only if adding it would exceed maxTokens.
			if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
			currentChunkList.push({
				text: blockText,
				headings: [...headings]
			});
			accumulatedTokens += blockTokens;
		}
	}
}
|
|
13758
|
-
|
|
13759
13515
|
//#endregion
|
|
13760
13516
|
//#region src/core/extraction-audit.ts
|
|
13761
13517
|
const AUDIT_ID_RE = /^[\w.-]+$/;
|
|
@@ -13906,276 +13662,6 @@ function getFileHash(filePath) {
|
|
|
13906
13662
|
});
|
|
13907
13663
|
}
|
|
13908
13664
|
|
|
13909
|
-
//#endregion
|
|
13910
|
-
//#region src/core/ai-extraction/evidence.ts
|
|
13911
|
-
const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
|
|
13912
|
-
const FIELD_PATH_PREFIX_RE = /^\$\./;
|
|
13913
|
-
/**
 * Tells whether `value` is a non-null, non-array object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` for objects other than `null` and arrays.
 */
function isRecord(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
|
|
13916
|
-
/**
 * Produces the key used to compare candidate values: the JSON serialization of `value`.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} `JSON.stringify(value)` (which is `undefined` for `undefined`).
 */
function stableValueKey(value) {
	const serialized = JSON.stringify(value);
	return serialized;
}
|
|
13919
|
-
/**
 * Detects strings that stand in for "no value" (empty, "n/a", "none", "tbd", dashes, ...).
 * The comparison ignores case and surrounding whitespace.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` only for strings matching one of the placeholder spellings.
 */
function isPlaceholderString(value) {
	if (typeof value !== "string") return false;
	const placeholders = [
		"",
		"n/a",
		"na",
		"none",
		"null",
		"unknown",
		"tbd",
		"-",
		"--"
	];
	return placeholders.includes(value.trim().toLowerCase());
}
|
|
13924
|
-
/**
 * Converts a scalar value to the text used when searching the source document.
 *
 * @param {unknown} value - Extracted value.
 * @returns {string | null} Trimmed string, or the string form of a number/boolean;
 *   `null` for blank strings and every other kind of value.
 */
function primitiveToText(value) {
	switch (typeof value) {
		case "string": {
			const trimmed = value.trim();
			return trimmed === "" ? null : trimmed;
		}
		case "number":
		case "boolean":
			return String(value);
		default:
			return null;
	}
}
|
|
13930
|
-
/**
 * A value is meaningful when it has a text form (`primitiveToText`) and is not a
 * placeholder string (`isPlaceholderString`).
 *
 * @param {unknown} value - Extracted value.
 * @returns {boolean} Whether the value should be considered as a candidate.
 */
function isMeaningfulValue(value) {
	if (primitiveToText(value) === null) return false;
	return !isPlaceholderString(value);
}
|
|
13933
|
-
/**
 * Normalizes text for comparison: lower-cases it, collapses every whitespace run
 * to a single space and trims both ends.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} Normalized text.
 */
function normalizeText(value) {
	const lowered = value.toLowerCase();
	const collapsed = lowered.replace(/\s+/g, " ");
	return collapsed.trim();
}
|
|
13936
|
-
function quoteAround(text$1, start, length) {
|
|
13937
|
-
const before = Math.max(0, start - 80);
|
|
13938
|
-
const after = Math.min(text$1.length, start + length + 80);
|
|
13939
|
-
return text$1.slice(before, after).replace(/\s+/g, " ").trim();
|
|
13940
|
-
}
|
|
13941
|
-
/**
 * Looks for the text form of `value` in the given source chunks and returns the
 * first chunk that contains it, together with a quote around the match.
 *
 * Matching is case-insensitive and whitespace-insensitive (both sides go through
 * `normalizeText`). When the match only exists after whitespace normalization
 * (e.g. the value spans a line break in the source), the match position is
 * located with a whitespace-tolerant pattern so the quote is taken around the
 * actual match instead of from the start of the chunk.
 *
 * @param {unknown} value - Extracted value to look up.
 * @param {Array<{ text: string, chunkIndex: number, headingPath?: string[] }>} chunks - Source chunks.
 * @returns {{ chunkIndex: number, headingPath: string[] | undefined, quote: string } | null}
 *   Evidence for the first matching chunk, or `null` when nothing matches.
 */
function findEvidence(value, chunks) {
	const searchText = primitiveToText(value);
	if (!searchText) return null;
	const normalizedSearchText = normalizeText(searchText);
	if (!normalizedSearchText) return null;
	const needle = searchText.toLowerCase();
	// Same words as the normalized search text, but allowing any whitespace run between them.
	const loosePattern = new RegExp(normalizedSearchText.split(" ").map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("\\s+"));
	for (const chunk of chunks) {
		if (!normalizeText(chunk.text).includes(normalizedSearchText)) continue;
		const haystack = chunk.text.toLowerCase();
		let quoteIndex = haystack.indexOf(needle);
		let matchLength = searchText.length;
		if (quoteIndex < 0) {
			// Verbatim search failed, so the match differs only in whitespace.
			const loose = loosePattern.exec(haystack);
			if (loose) {
				quoteIndex = loose.index;
				matchLength = loose[0].length;
			} else {
				// Last resort: keep the previous behavior and quote the start of the chunk.
				quoteIndex = 0;
			}
		}
		return {
			chunkIndex: chunk.chunkIndex,
			headingPath: chunk.headingPath,
			quote: quoteAround(chunk.text, quoteIndex, matchLength)
		};
	}
	return null;
}
|
|
13958
|
-
/**
 * Walks one schema property and appends an evidence entry to `fields` for every
 * leaf value underneath it.
 *
 * - Object properties with `properties` are descended into (non-record values are treated as `{}`).
 * - Array properties yield a single "missing" entry when the value is not a non-empty array;
 *   otherwise each item is handled at `path[index]`.
 * - Everything else is delegated to `addPrimitiveEvidence`.
 *
 * @param {object[]} fields - Output list; mutated in place.
 * @param {string} path$1 - Field path of `property`.
 * @param {object} property - Schema definition of the property.
 * @param {unknown} value - Extracted value at `path$1`.
 * @param {object[]} chunks - Source chunks searched for evidence.
 */
function addEvidenceForProperty(fields, path$1, property, value, chunks) {
	// Recurses into every child schema of an object-shaped property.
	const visitChildren = (basePath, source, getChildSchemas) => {
		const record = isRecord(source) ? source : {};
		for (const [childName, childProperty] of Object.entries(getChildSchemas())) {
			addEvidenceForProperty(fields, `${basePath}.${childName}`, childProperty, record[childName], chunks);
		}
	};
	if (property.type === "object" && property.properties) {
		visitChildren(path$1, value, () => property.properties);
		return;
	}
	if (property.type !== "array") {
		addPrimitiveEvidence(fields, path$1, value, chunks);
		return;
	}
	const hasItems = Array.isArray(value) && value.length > 0;
	if (!hasItems) {
		fields.push({
			fieldPath: path$1,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Array field is empty or missing."
		});
		return;
	}
	value.forEach((item, index) => {
		const itemPath = `${path$1}[${index}]`;
		if (property.items?.type === "object" && property.items.properties) {
			visitChildren(itemPath, item, () => property.items.properties);
		} else {
			addPrimitiveEvidence(fields, itemPath, item, chunks);
		}
	});
}
|
|
13985
|
-
/**
 * Appends the evidence entry for a single leaf value.
 *
 * - `null`, `undefined` and `""` produce a "missing" entry (confidence 0).
 * - A value located by `findEvidence` produces a "found" entry (confidence 0.8) with the evidence attached.
 * - Any other value produces an "inferred" entry (confidence 0.35).
 *
 * @param {object[]} fields - Output list; mutated in place.
 * @param {string} fieldPath - Path of the field being reported.
 * @param {unknown} value - Extracted value.
 * @param {object[]} chunks - Source chunks searched for evidence.
 */
function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
	const isEmpty = value === null || value === void 0 || value === "";
	if (isEmpty) {
		fields.push({
			fieldPath,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Field is null or empty in final extraction."
		});
		return;
	}
	const found = findEvidence(value, chunks);
	const entry = found ? {
		fieldPath,
		status: "found",
		value,
		confidence: .8,
		...found
	} : {
		fieldPath,
		status: "inferred",
		value,
		confidence: .35,
		note: "Final value was not found verbatim in the available source text."
	};
	fields.push(entry);
}
|
|
14015
|
-
/**
 * Wraps a whole document in a single source chunk.
 *
 * @param {string} text$1 - Document text.
 * @returns {Array<{ text: string, chunkIndex: number, headingPath: string[] }>}
 *   One chunk with index 0 and an empty heading path, or an empty list for falsy text.
 */
function sourceChunksFromText(text$1) {
	if (!text$1) return [];
	return [{
		text: text$1,
		chunkIndex: 0,
		headingPath: []
	}];
}
|
|
14022
|
-
/**
 * Maps Markdown chunks to the source-chunk shape used for evidence lookup.
 *
 * @param {Array<{ pageContent: string, chunkIndex?: number, headingPath?: string[] }>} chunks - Markdown chunks.
 * @returns {Array<{ text: string, chunkIndex: number, headingPath: string[] }>}
 *   One entry per chunk; a missing `chunkIndex` falls back to the array position and a
 *   missing `headingPath` to an empty list.
 */
function sourceChunksFromMarkdownChunks(chunks) {
	return chunks.map(({ pageContent, chunkIndex, headingPath }, position) => {
		return {
			text: pageContent,
			chunkIndex: chunkIndex ?? position,
			headingPath: headingPath ?? []
		};
	});
}
|
|
14029
|
-
/**
 * Splits a field path such as `$.a.b` into its segments (`["a", "b"]`).
 * The prefix matched by `FIELD_PATH_PREFIX_RE` is removed and empty segments are dropped.
 *
 * @param {string} fieldPath - Dotted field path.
 * @returns {string[]} Non-empty path segments.
 */
function getPathParts(fieldPath) {
	const withoutRoot = fieldPath.replace(FIELD_PATH_PREFIX_RE, "");
	return withoutRoot.split(".").filter((segment) => segment !== "");
}
|
|
14032
|
-
/**
 * Reads the value at `fieldPath` inside `data`.
 *
 * @param {unknown} data - Root value.
 * @param {string} fieldPath - Dotted field path (see `getPathParts`).
 * @returns {unknown} The value found, or `undefined` as soon as a non-record is
 *   reached before the path is exhausted.
 */
function getValueAtPath$1(data, fieldPath) {
	const parts = getPathParts(fieldPath);
	let cursor = data;
	for (let i = 0; i < parts.length; i += 1) {
		if (!isRecord(cursor)) return void 0;
		cursor = cursor[parts[i]];
	}
	return cursor;
}
|
|
14040
|
-
/**
 * Writes `value` at `fieldPath` inside `data`, creating intermediate objects as needed.
 * Intermediate values that are not own record properties are replaced by `{}`.
 *
 * The call is a no-op when the path has no segments (previously this wrote a key
 * literally named "undefined") or when any segment is `__proto__` (walking into or
 * assigning `__proto__` would modify an object's prototype instead of its data).
 *
 * @param {object} data - Object to modify in place.
 * @param {string} fieldPath - Dotted field path (see `getPathParts`).
 * @param {unknown} value - Value to store.
 */
function setValueAtPath(data, fieldPath, value) {
	const parts = getPathParts(fieldPath);
	if (parts.length === 0 || parts.includes("__proto__")) return;
	let current = data;
	for (let i = 0; i < parts.length - 1; i++) {
		const part = parts[i];
		// Only descend into own records so inherited members are never mutated.
		if (!Object.hasOwn(current, part) || !isRecord(current[part])) current[part] = {};
		current = current[part];
	}
	current[parts[parts.length - 1]] = value;
}
|
|
14050
|
-
/**
 * Collects every non-array leaf of a schema property into `fields`.
 * Object properties with `properties` are descended into; array properties are skipped.
 *
 * @param {Array<{ fieldPath: string, property: object }>} fields - Output list; mutated in place.
 * @param {string} fieldPath - Path of `property`.
 * @param {object} property - Schema definition to walk.
 */
function collectScalarFields(fields, fieldPath, property) {
	const isNestedObject = property.type === "object" && property.properties;
	if (isNestedObject) {
		Object.entries(property.properties).forEach(([name$1, childProperty]) => {
			collectScalarFields(fields, `${fieldPath}.${name$1}`, childProperty);
		});
		return;
	}
	if (property.type === "array") return;
	fields.push({
		fieldPath,
		property
	});
}
|
|
14060
|
-
/**
 * Ranks a candidate value: higher scores win.
 *
 * Evidence status dominates (+100 for "found"), then confidence (rounded to tenths,
 * 0–10), and the chunk position only breaks ties in favor of later chunks. The
 * position is mapped into [0, 1) so that, for documents with many chunks, a late
 * "inferred" candidate can never outrank a "found" one.
 *
 * @param {{ status: string, confidence: number, chunkIndex: number }} candidate - Candidate to score.
 * @returns {number} Score used for descending sort.
 */
function candidateScore(candidate) {
	const statusScore = candidate.status === "found" ? 100 : 0;
	const confidenceScore = Math.round(candidate.confidence * 10);
	const position = Math.max(0, candidate.chunkIndex);
	// Strictly increasing in the chunk index but always below 1.
	const positionTieBreak = position / (position + 1);
	return statusScore + confidenceScore + positionTieBreak;
}
|
|
14063
|
-
/**
 * Picks the winning candidate for one field and reports a conflict when the
 * candidates disagree.
 *
 * The list is sorted in place by descending `candidateScore`; the first entry is
 * flagged `selected = true`, all others `selected = false` with a rejection reason.
 *
 * @param {object[]} candidates - Candidates for a single field; mutated in place.
 * @returns {object | null} A conflict description when more than one distinct value
 *   (by `stableValueKey`) is present, otherwise `null`.
 */
function selectCandidatesForField(candidates) {
	if (candidates.length === 0) return null;
	candidates.sort((a, b) => candidateScore(b) - candidateScore(a));
	const [selected, ...rejected] = candidates;
	selected.selected = true;
	rejected.forEach((candidate) => {
		candidate.selected = false;
		candidate.rejectionReason = "Lower evidence score or earlier chunk position.";
	});
	const distinctKeys = new Set(candidates.map((candidate) => stableValueKey(candidate.value)));
	if (distinctKeys.size <= 1) return null;
	return {
		fieldPath: selected.fieldPath,
		selectedValue: selected.value,
		rejectedValues: rejected.map((candidate) => candidate.value),
		candidates: [...candidates]
	};
}
|
|
14082
|
-
/**
 * Builds the per-field candidate report used to merge chunk-level extraction results.
 *
 * For every scalar schema field (auto-increment primary keys excluded) and every
 * entry of `input.chunkResults`, a candidate is recorded when the chunk produced a
 * meaningful value. Evidence is searched only in the source chunk at the same index.
 *
 * @param {{ schema: { properties: object }, chunks: object[], chunkResults: object[] }} input
 * @returns {{ candidates: object[], conflicts: object[] }} All candidates (with their
 *   `selected` flags set) and the conflicts reported by `selectCandidatesForField`.
 */
function buildCandidateMergeReport(input) {
	const scalarFields = [];
	Object.entries(input.schema.properties).forEach(([name$1, property]) => {
		const isGeneratedKey = property.primary && property.autoIncrement;
		if (!isGeneratedKey) collectScalarFields(scalarFields, `$.${name$1}`, property);
	});
	const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
	const candidatesByPath = /* @__PURE__ */ new Map();
	for (const { fieldPath } of scalarFields) {
		for (let chunkIndex = 0; chunkIndex < input.chunkResults.length; chunkIndex++) {
			const value = getValueAtPath$1(input.chunkResults[chunkIndex], fieldPath);
			if (!isMeaningfulValue(value)) continue;
			// Fall back to an empty chunk when there is no source chunk at this index.
			const sourceChunk = sourceChunks[chunkIndex] ?? {
				text: "",
				chunkIndex
			};
			const found = findEvidence(value, [sourceChunk]);
			let bucket = candidatesByPath.get(fieldPath);
			if (bucket == null) {
				bucket = [];
				candidatesByPath.set(fieldPath, bucket);
			}
			bucket.push({
				fieldPath,
				value,
				chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
				headingPath: sourceChunk.headingPath,
				status: found ? "found" : "inferred",
				quote: found?.quote,
				confidence: found ? .85 : .35
			});
		}
	}
	const allCandidates = [];
	const conflicts = [];
	candidatesByPath.forEach((candidates) => {
		// Selection sorts the list in place, so collect the candidates afterwards.
		const conflict = selectCandidatesForField(candidates);
		allCandidates.push(...candidates);
		if (conflict) conflicts.push(conflict);
	});
	return {
		candidates: allCandidates,
		conflicts
	};
}
|
|
14123
|
-
/**
 * Returns a deep copy of `data` in which every selected candidate of `report`
 * has been written at its field path. `data` itself is left untouched.
 *
 * @param {object} data - Merged extraction result.
 * @param {{ candidates: object[] }} report - Candidate report.
 * @returns {object} Copy of `data` with the selected values applied.
 */
function applySelectedCandidates(data, report) {
	const merged = structuredClone(data);
	report.candidates.filter((candidate) => candidate.selected).forEach((candidate) => {
		setValueAtPath(merged, candidate.fieldPath, candidate.value);
	});
	return merged;
}
|
|
14128
|
-
/**
 * Builds the evidence report for a finished extraction.
 *
 * Every schema property (auto-increment primary keys excluded) is walked with
 * `addEvidenceForProperty`; "inferred" fields and candidate conflicts are listed
 * as issues, and the coverage block summarizes the counts per status.
 *
 * @param {object} input - Needs `schema` and `data`; uses `chunks` when present,
 *   otherwise `text`; `outputPath` and `candidateReport` are optional.
 * @returns {{ coverage: object, fields: object[], candidates: object[] | undefined, conflicts: object[] | undefined, issues: object[] }}
 */
function buildExtractionEvidence(input) {
	const data = isRecord(input.data) ? input.data : {};
	const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
	const fields = [];
	Object.entries(input.schema.properties).forEach(([name$1, property]) => {
		const isGeneratedKey = property.primary && property.autoIncrement;
		if (!isGeneratedKey) addEvidenceForProperty(fields, `$.${name$1}`, property, data[name$1], chunks);
	});
	const withStatus = (status) => fields.filter((field) => field.status === status);
	const inferredFields = withStatus("inferred");
	const foundCount = withStatus("found").length;
	const issues = [];
	for (const field of inferredFields) {
		issues.push({
			fieldPath: field.fieldPath,
			message: field.note ?? "Field value lacks source evidence."
		});
	}
	for (const conflict of input.candidateReport?.conflicts ?? []) {
		issues.push({
			fieldPath: conflict.fieldPath,
			message: "Multiple chunk candidates disagree for this field."
		});
	}
	const coverage = {
		path: input.outputPath ? evidencePathForOutput(input.outputPath) : void 0,
		fieldCount: fields.length,
		// evidenceCount and foundCount are both the number of "found" fields.
		evidenceCount: foundCount,
		foundCount,
		missingCount: withStatus("missing").length,
		inferredCount: inferredFields.length,
		conflictCount: input.candidateReport?.conflicts.length ?? 0,
		issueCount: issues.length
	};
	return {
		coverage,
		fields,
		candidates: input.candidateReport?.candidates,
		conflicts: input.candidateReport?.conflicts,
		issues
	};
}
|
|
14162
|
-
/**
 * Derives the evidence-report path from an extraction output path by turning a
 * trailing `.json` (any case) into `.evidence.json`.
 *
 * When the output path has no `.json` suffix, `.evidence.json` is appended instead,
 * so the result never equals the output path and the evidence report can never
 * overwrite the extraction result.
 *
 * @param {string} outputPath - Path of the extraction output file.
 * @returns {string} Path of the companion evidence file.
 */
function evidencePathForOutput(outputPath) {
	const replaced = outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
	return replaced === outputPath ? `${outputPath}.evidence.json` : replaced;
}
|
|
14165
|
-
/**
 * Builds the evidence report for `input` and writes it next to the extraction output.
 *
 * @param {object} input - Same input as `buildExtractionEvidence`; `outputPath` is required here.
 * @returns {Promise<object>} The report's coverage summary, with `path` resolved to an absolute path.
 */
async function writeExtractionEvidence(input) {
	const report = buildExtractionEvidence(input);
	const evidencePath = evidencePathForOutput(input.outputPath);
	const { coverage } = report;
	// The stored report carries the evidence path as derived; the returned summary carries the resolved one.
	coverage.path = evidencePath;
	const writeOptions = {
		spaces: 2,
		EOL: "\n"
	};
	await writeFile(evidencePath, report, writeOptions);
	return {
		...coverage,
		path: path.resolve(evidencePath)
	};
}
|
|
14178
|
-
|
|
14179
13665
|
//#endregion
|
|
14180
13666
|
//#region src/core/notion-sink.ts
|
|
14181
13667
|
const RICH_TEXT_LIMIT = 2e3;
|
|
@@ -14461,36 +13947,6 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
|
|
|
14461
13947
|
}
|
|
14462
13948
|
}
|
|
14463
13949
|
|
|
14464
|
-
//#endregion
|
|
14465
|
-
//#region src/core/ai-extraction/transcriber.ts
|
|
14466
|
-
const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
|
|
14467
|
-
/**
 * Sends an image to a chat model through an OpenAI-compatible provider and returns
 * the text the model produced for `TRANSCRIPTION_PROMPT`.
 *
 * @param {string} imagePath - Path of the image file to read.
 * @param {string} baseURL - Base URL passed to the provider.
 * @param {string} apiKey - API key passed to the provider.
 * @param {string} modelName - Chat model identifier.
 * @param {number} [timeoutMs] - Abort timeout in milliseconds; 300000 when omitted.
 * @returns {Promise<{ text: string, modelName: string }>} Model output and the model used.
 */
async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
	const provider = createOpenAICompatible({
		baseURL,
		name: "openai-compatible",
		apiKey
	});
	const buffer = await fs.readFile(imagePath);
	const effectiveTimeout = timeoutMs ?? 3e5;
	const promptPart = {
		type: "text",
		text: TRANSCRIPTION_PROMPT
	};
	const imagePart = {
		type: "image",
		image: buffer
	};
	const response = await generateText({
		model: provider.chatModel(modelName),
		messages: [{
			role: "user",
			content: [promptPart, imagePart]
		}],
		abortSignal: AbortSignal.timeout(effectiveTimeout)
	});
	return {
		text: response.text,
		modelName
	};
}
|
|
14493
|
-
|
|
14494
13950
|
//#endregion
|
|
14495
13951
|
//#region src/core/file-constants.ts
|
|
14496
13952
|
const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
|
|
@@ -14824,6 +14280,14 @@ function createPdfConverter(config) {
|
|
|
14824
14280
|
return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
|
|
14825
14281
|
}
|
|
14826
14282
|
if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
|
|
14283
|
+
if (config.converter === "markitdown") {
|
|
14284
|
+
const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
|
|
14285
|
+
return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
|
|
14286
|
+
}
|
|
14287
|
+
if (config.converter === "marker") {
|
|
14288
|
+
const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
|
|
14289
|
+
return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
|
|
14290
|
+
}
|
|
14827
14291
|
if (config.converter === "external") {
|
|
14828
14292
|
if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
|
|
14829
14293
|
return new ExternalCommandPdfConverter("external", config.external);
|
|
@@ -14851,7 +14315,7 @@ const FILE_PART_EXTENSIONS = new Set([
|
|
|
14851
14315
|
"svg"
|
|
14852
14316
|
]);
|
|
14853
14317
|
const PDF_EXT_RE = /\.pdf$/i;
|
|
14854
|
-
async function readExtractFileInput(filePath, aiConfig) {
|
|
14318
|
+
async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
14855
14319
|
const stat = fs$1.statSync(filePath);
|
|
14856
14320
|
if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
|
|
14857
14321
|
size: bytesToMB(stat.size).toFixed(1),
|
|
@@ -14860,22 +14324,15 @@ async function readExtractFileInput(filePath, aiConfig) {
|
|
|
14860
14324
|
}));
|
|
14861
14325
|
const ext = path.extname(filePath).toLowerCase().replace(".", "");
|
|
14862
14326
|
if (FILE_PART_EXTENSIONS.has(ext)) {
|
|
14863
|
-
|
|
14864
|
-
|
|
14865
|
-
|
|
14866
|
-
|
|
14867
|
-
const timeout = (aiConfig.provider.timeout ?? 300) * 1e3;
|
|
14868
|
-
try {
|
|
14869
|
-
const result$1 = await transcribeImageWithVision(filePath, baseURL, apiKey, image.imageModelName, timeout);
|
|
14870
|
-
consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
|
|
14871
|
-
return { text: result$1.text };
|
|
14872
|
-
} catch {
|
|
14873
|
-
consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
|
|
14874
|
-
}
|
|
14327
|
+
if (shouldUseImageOcrFallback(aiConfig, modelOverride)) {
|
|
14328
|
+
const result = await recognizeImageText(filePath, aiConfig?.image);
|
|
14329
|
+
consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
|
|
14330
|
+
return { text: result.text };
|
|
14875
14331
|
}
|
|
14876
|
-
|
|
14877
|
-
|
|
14878
|
-
|
|
14332
|
+
return {
|
|
14333
|
+
text: "",
|
|
14334
|
+
filePath
|
|
14335
|
+
};
|
|
14879
14336
|
}
|
|
14880
14337
|
if (ext === "pdf") {
|
|
14881
14338
|
const buffer = await fs.readFile(filePath);
|
|
@@ -14996,21 +14453,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14996
14453
|
|
|
14997
14454
|
//#endregion
|
|
14998
14455
|
//#region src/core/extract-runner.ts
|
|
14999
|
-
const encoding = getEncoding("cl100k_base");
|
|
15000
14456
|
const JSON_EXT_RE$1 = /\.json$/;
|
|
15001
|
-
/**
 * Runs `fn` over `items` with at most `concurrency` calls in flight and returns
 * the results in input order.
 *
 * A concurrency below 1 (or one that is not a number) is treated as 1, so every
 * item is always processed; previously such values started no workers and the
 * function silently resolved to an array of `undefined`.
 *
 * @param {number} concurrency - Maximum number of simultaneous calls.
 * @param {Array} items - Items to process.
 * @param {(item: any, index: number) => Promise<any>} fn - Async worker function.
 * @returns {Promise<Array>} Results positioned like their items. Rejects with the first error thrown by `fn`.
 */
async function limitConcurrency(concurrency, items, fn) {
	const results = Array.from({ length: items.length });
	const requested = Number(concurrency);
	const limit = requested >= 1 ? Math.floor(requested) : 1;
	let nextIndex = 0;
	// Each worker keeps claiming the next unprocessed index until none are left.
	const worker = async () => {
		while (nextIndex < items.length) {
			const currentIndex = nextIndex++;
			results[currentIndex] = await fn(items[currentIndex], currentIndex);
		}
	};
	const workers = Array.from({ length: Math.min(limit, items.length) }, () => worker());
	await Promise.all(workers);
	return results;
}
|
|
15014
14457
|
async function ensureDatabaseReady(dbPath, schema) {
|
|
15015
14458
|
try {
|
|
15016
14459
|
await fs.access(dbPath);
|
|
@@ -15082,146 +14525,34 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15082
14525
|
}
|
|
15083
14526
|
const s = spinner();
|
|
15084
14527
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
15085
|
-
const
|
|
15086
|
-
|
|
15087
|
-
modelMaxTokens: modelOverride?.capabilities.maxTokens
|
|
15088
|
-
});
|
|
15089
|
-
const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
|
|
15090
|
-
const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
|
|
15091
|
-
if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
15092
|
-
length: totalTokens,
|
|
15093
|
-
limit: maxTokens
|
|
15094
|
-
}));
|
|
15095
|
-
const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
|
|
15096
|
-
pageContent: text$1 ?? "",
|
|
15097
|
-
metadata: {},
|
|
15098
|
-
chunkIndex: 0,
|
|
15099
|
-
totalChunks: 1,
|
|
15100
|
-
tokenCount: totalTokens,
|
|
15101
|
-
headingPath: [],
|
|
15102
|
-
charStart: 0,
|
|
15103
|
-
charEnd: text$1?.length ?? 0
|
|
15104
|
-
}];
|
|
15105
|
-
if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
|
|
15106
|
-
const chunkResults = Array.from({ length: processedDocs.length });
|
|
15107
|
-
const accumulatedTokens = {
|
|
15108
|
-
prompt: 0,
|
|
15109
|
-
completion: 0,
|
|
15110
|
-
total: 0
|
|
15111
|
-
};
|
|
15112
|
-
let success = true;
|
|
15113
|
-
let errorMsg = "";
|
|
15114
|
-
const extractionTasks = processedDocs.map((doc, i) => {
|
|
15115
|
-
return async () => {
|
|
15116
|
-
if (!success) return;
|
|
15117
|
-
const headings = doc.headingPath?.length ? doc.headingPath : [
|
|
15118
|
-
doc.metadata.h1,
|
|
15119
|
-
doc.metadata.h2,
|
|
15120
|
-
doc.metadata.h3,
|
|
15121
|
-
doc.metadata.h4
|
|
15122
|
-
].filter(Boolean);
|
|
15123
|
-
let chunkText = doc.pageContent;
|
|
15124
|
-
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
15125
|
-
const chunkResult = await extractStructuredData({
|
|
15126
|
-
config: aiConfig,
|
|
15127
|
-
schema: schemaLoad.schema,
|
|
15128
|
-
text: chunkText,
|
|
15129
|
-
aiexDir,
|
|
15130
|
-
modelOverride,
|
|
15131
|
-
onRetry(info) {
|
|
15132
|
-
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
15133
|
-
current: i + 1,
|
|
15134
|
-
total: processedDocs.length,
|
|
15135
|
-
code: info.statusCode,
|
|
15136
|
-
delay: info.delayMs / 1e3,
|
|
15137
|
-
attempt: info.attempt,
|
|
15138
|
-
max: info.maxRetries
|
|
15139
|
-
}));
|
|
15140
|
-
}
|
|
15141
|
-
});
|
|
15142
|
-
if (!chunkResult.success) {
|
|
15143
|
-
success = false;
|
|
15144
|
-
errorMsg = chunkResult.error || t("common.unknownError");
|
|
15145
|
-
if (!options?.quiet) {
|
|
15146
|
-
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
15147
|
-
consola.error(errorMsg);
|
|
15148
|
-
}
|
|
15149
|
-
return;
|
|
15150
|
-
}
|
|
15151
|
-
if (chunkResult.data) chunkResults[i] = chunkResult.data;
|
|
15152
|
-
if (chunkResult.tokensUsed) {
|
|
15153
|
-
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
15154
|
-
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
15155
|
-
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
15156
|
-
}
|
|
15157
|
-
};
|
|
15158
|
-
});
|
|
15159
|
-
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
15160
|
-
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
15161
|
-
current: 1,
|
|
15162
|
-
total: processedDocs.length
|
|
15163
|
-
}));
|
|
15164
|
-
try {
|
|
15165
|
-
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
15166
|
-
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
15167
|
-
current: idx + 1,
|
|
15168
|
-
total: processedDocs.length
|
|
15169
|
-
}));
|
|
15170
|
-
await task();
|
|
15171
|
-
});
|
|
15172
|
-
} catch (e) {
|
|
15173
|
-
success = false;
|
|
15174
|
-
errorMsg = e instanceof Error ? e.message : String(e);
|
|
15175
|
-
}
|
|
15176
|
-
if (!success) return {
|
|
15177
|
-
success: false,
|
|
15178
|
-
error: errorMsg
|
|
15179
|
-
};
|
|
15180
|
-
const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
|
|
15181
|
-
const candidateReport = buildCandidateMergeReport({
|
|
14528
|
+
const result = await extractStructuredData({
|
|
14529
|
+
config: aiConfig,
|
|
15182
14530
|
schema: schemaLoad.schema,
|
|
15183
|
-
|
|
15184
|
-
|
|
14531
|
+
text: text$1 ?? "",
|
|
14532
|
+
aiexDir,
|
|
14533
|
+
file: filePath,
|
|
14534
|
+
modelOverride,
|
|
14535
|
+
onRetry(info) {
|
|
14536
|
+
if (!options?.quiet) s.message(t("command.extract.file.extractRetry", {
|
|
14537
|
+
code: info.statusCode,
|
|
14538
|
+
delay: info.delayMs / 1e3,
|
|
14539
|
+
attempt: info.attempt,
|
|
14540
|
+
max: info.maxRetries
|
|
14541
|
+
}));
|
|
14542
|
+
}
|
|
15185
14543
|
});
|
|
15186
|
-
|
|
15187
|
-
const validation = validateExtractedData(schemaLoad.schema, mergedData);
|
|
15188
|
-
if (!validation.success) {
|
|
15189
|
-
const valError = validation.error || "Merged data validation failed";
|
|
14544
|
+
if (!result.success) {
|
|
15190
14545
|
if (!options?.quiet) {
|
|
15191
|
-
s.stop(t("command.extract.file.
|
|
15192
|
-
consola.error(
|
|
14546
|
+
s.stop(t("command.extract.file.extractFail"));
|
|
14547
|
+
consola.error(result.error || t("common.unknownError"));
|
|
15193
14548
|
}
|
|
15194
14549
|
return {
|
|
15195
14550
|
success: false,
|
|
15196
|
-
error:
|
|
14551
|
+
error: result.error || t("common.unknownError")
|
|
15197
14552
|
};
|
|
15198
14553
|
}
|
|
15199
|
-
const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
15200
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
15201
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
15202
|
-
const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
|
|
15203
|
-
const outputPath = path.join(outputDir, outputFileName);
|
|
15204
|
-
await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
|
|
15205
|
-
const result = {
|
|
15206
|
-
success: true,
|
|
15207
|
-
data: mergedData,
|
|
15208
|
-
tokensUsed: accumulatedTokens,
|
|
15209
|
-
outputPath,
|
|
15210
|
-
evidenceSummary: await writeExtractionEvidence({
|
|
15211
|
-
schema: schemaLoad.schema,
|
|
15212
|
-
data: mergedData,
|
|
15213
|
-
outputPath,
|
|
15214
|
-
chunks: processedDocs,
|
|
15215
|
-
candidateReport
|
|
15216
|
-
})
|
|
15217
|
-
};
|
|
15218
14554
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
15219
14555
|
if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
|
|
15220
|
-
if (result.evidenceSummary && !options?.quiet) {
|
|
15221
|
-
const summary = result.evidenceSummary;
|
|
15222
|
-
const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
|
|
15223
|
-
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
|
|
15224
|
-
}
|
|
15225
14556
|
if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
|
|
15226
14557
|
prompt: result.tokensUsed.prompt,
|
|
15227
14558
|
completion: result.tokensUsed.completion,
|
|
@@ -15250,7 +14581,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15250
14581
|
outputPath: result.outputPath,
|
|
15251
14582
|
data: result.data,
|
|
15252
14583
|
tablesInserted: insertResult.tablesInserted,
|
|
15253
|
-
evidenceSummary: result.evidenceSummary,
|
|
15254
14584
|
tokensUsed: result.tokensUsed
|
|
15255
14585
|
};
|
|
15256
14586
|
} else {
|
|
@@ -15277,7 +14607,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15277
14607
|
success: true,
|
|
15278
14608
|
outputPath: result.outputPath,
|
|
15279
14609
|
data: result.data,
|
|
15280
|
-
evidenceSummary: result.evidenceSummary,
|
|
15281
14610
|
tokensUsed: result.tokensUsed
|
|
15282
14611
|
};
|
|
15283
14612
|
}
|
|
@@ -15342,9 +14671,13 @@ async function runAuditedExtraction(options) {
|
|
|
15342
14671
|
});
|
|
15343
14672
|
try {
|
|
15344
14673
|
let text$1 = "";
|
|
15345
|
-
|
|
15346
|
-
|
|
15347
|
-
|
|
14674
|
+
let filePath;
|
|
14675
|
+
if (source.type === "file") {
|
|
14676
|
+
const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
|
|
14677
|
+
text$1 = input.text;
|
|
14678
|
+
filePath = input.filePath;
|
|
14679
|
+
} else text$1 = source.text;
|
|
14680
|
+
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
|
|
15348
14681
|
quiet,
|
|
15349
14682
|
insert
|
|
15350
14683
|
});
|
|
@@ -15386,7 +14719,6 @@ async function runAuditedExtraction(options) {
|
|
|
15386
14719
|
outputName: updated.outputName,
|
|
15387
14720
|
tablesInserted: updated.tablesInserted,
|
|
15388
14721
|
notionPages: updated.notionPages,
|
|
15389
|
-
evidenceSummary: r.evidenceSummary,
|
|
15390
14722
|
tokensUsed: updated.tokensUsed,
|
|
15391
14723
|
auditId: updated.id,
|
|
15392
14724
|
fileHash
|
|
@@ -16514,7 +15846,6 @@ function aiRoutes(config) {
|
|
|
16514
15846
|
//#endregion
|
|
16515
15847
|
//#region src/core/data-service.ts
|
|
16516
15848
|
const FILE_REGEX = /\.json$/;
|
|
16517
|
-
const EVIDENCE_FILE_SUFFIX = ".evidence.json";
|
|
16518
15849
|
const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
|
|
16519
15850
|
const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
|
|
16520
15851
|
const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
|
|
@@ -16530,24 +15861,6 @@ function getAuditNotionStatus(record) {
|
|
|
16530
15861
|
if (record.status === "failed") return "failed";
|
|
16531
15862
|
return "not_synced";
|
|
16532
15863
|
}
|
|
16533
|
-
async function readEvidenceSummary(extractedDir, outputName) {
|
|
16534
|
-
const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
|
|
16535
|
-
try {
|
|
16536
|
-
const coverage = (await readFile(evidencePath))?.coverage;
|
|
16537
|
-
if (!coverage || typeof coverage !== "object") return void 0;
|
|
16538
|
-
return {
|
|
16539
|
-
path: evidencePath,
|
|
16540
|
-
fieldCount: Number(coverage.fieldCount) || 0,
|
|
16541
|
-
evidenceCount: Number(coverage.evidenceCount) || 0,
|
|
16542
|
-
foundCount: Number(coverage.foundCount) || 0,
|
|
16543
|
-
missingCount: Number(coverage.missingCount) || 0,
|
|
16544
|
-
inferredCount: Number(coverage.inferredCount) || 0,
|
|
16545
|
-
issueCount: Number(coverage.issueCount) || 0
|
|
16546
|
-
};
|
|
16547
|
-
} catch {
|
|
16548
|
-
return;
|
|
16549
|
-
}
|
|
16550
|
-
}
|
|
16551
15864
|
async function getRowExtractionActions(aiexDir, tableName) {
|
|
16552
15865
|
const actions = /* @__PURE__ */ new Map();
|
|
16553
15866
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
@@ -16575,7 +15888,7 @@ async function listExtractions(config) {
|
|
|
16575
15888
|
const aiexDir = path.dirname(config.schemaPath);
|
|
16576
15889
|
const extractedDir = path.join(aiexDir, "extracted");
|
|
16577
15890
|
await fs.mkdir(extractedDir, { recursive: true });
|
|
16578
|
-
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md")
|
|
15891
|
+
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
|
|
16579
15892
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
16580
15893
|
const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
|
|
16581
15894
|
const records = [];
|
|
@@ -16594,7 +15907,6 @@ async function listExtractions(config) {
|
|
|
16594
15907
|
timestamp,
|
|
16595
15908
|
fileSize: stat.size,
|
|
16596
15909
|
modifiedAt: stat.mtime.toISOString(),
|
|
16597
|
-
evidenceSummary: await readEvidenceSummary(extractedDir, file),
|
|
16598
15910
|
notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
|
|
16599
15911
|
notionPages,
|
|
16600
15912
|
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
|
|
@@ -16774,7 +16086,6 @@ async function retryNotionSync(config, fileName) {
|
|
|
16774
16086
|
|
|
16775
16087
|
//#endregion
|
|
16776
16088
|
//#region src/server/routes/data.ts
|
|
16777
|
-
const JSON_FILE_SUFFIX_RE = /\.json$/;
|
|
16778
16089
|
const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
|
|
16779
16090
|
const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
|
|
16780
16091
|
const tableQuerySchema = z.object({
|
|
@@ -16827,22 +16138,10 @@ function dataRoutes(config) {
|
|
|
16827
16138
|
const filePath = path.join(extractedDir, name$1);
|
|
16828
16139
|
try {
|
|
16829
16140
|
const content = await fs.readFile(filePath, "utf-8");
|
|
16830
|
-
const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
|
|
16831
|
-
let evidenceSummary;
|
|
16832
|
-
try {
|
|
16833
|
-
const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
|
|
16834
|
-
evidenceSummary = evidence?.coverage ? {
|
|
16835
|
-
...evidence.coverage,
|
|
16836
|
-
path: evidencePath
|
|
16837
|
-
} : void 0;
|
|
16838
|
-
} catch {
|
|
16839
|
-
evidenceSummary = void 0;
|
|
16840
|
-
}
|
|
16841
16141
|
return c.json({
|
|
16842
16142
|
success: true,
|
|
16843
16143
|
content,
|
|
16844
|
-
name: name$1
|
|
16845
|
-
evidenceSummary
|
|
16144
|
+
name: name$1
|
|
16846
16145
|
});
|
|
16847
16146
|
} catch {
|
|
16848
16147
|
return c.json({ error: t("server.extractionNotFound") }, 404);
|
|
@@ -16986,7 +16285,6 @@ function extractRoutes(config) {
|
|
|
16986
16285
|
outputName: result.outputName,
|
|
16987
16286
|
tablesInserted: result.tablesInserted,
|
|
16988
16287
|
notionPages: result.notionPages,
|
|
16989
|
-
evidenceSummary: result.evidenceSummary,
|
|
16990
16288
|
tokensUsed: result.tokensUsed,
|
|
16991
16289
|
auditId: result.auditId
|
|
16992
16290
|
}, 200);
|
|
@@ -17054,7 +16352,6 @@ function extractRoutes(config) {
|
|
|
17054
16352
|
outputName: result.outputName,
|
|
17055
16353
|
tablesInserted: result.tablesInserted,
|
|
17056
16354
|
notionPages: result.notionPages,
|
|
17057
|
-
evidenceSummary: result.evidenceSummary,
|
|
17058
16355
|
tokensUsed: result.tokensUsed,
|
|
17059
16356
|
auditId: result.auditId
|
|
17060
16357
|
}, 200);
|