aiex-cli 0.0.5-beta.6 → 0.0.6-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -11
- package/dist/cli.mjs +322 -951
- package/dist/{doctor-collector-BpqhXNcO.mjs → doctor-collector-CGo5dgHm.mjs} +70 -52
- package/dist/index.d.mts +88 -91
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-Dbma0Oku.js +264 -0
- package/dist/web/assets/{DataBrowser-BGkZb9FV.js → DataBrowser-GAA-pGq0.js} +1 -1
- package/dist/web/assets/ExtractionViewer-CrQMLtX7.js +1 -0
- package/dist/web/assets/{api-client-gQAAOw0v.js → api-client-b4ZBXpNH.js} +1 -1
- package/dist/web/assets/{index-BQKZKzzP.js → index-CdQgz6dJ.js} +8 -8
- package/dist/web/assets/index-D0So2rJE.css +2 -0
- package/dist/web/index.html +3 -3
- package/dist/{zh-CN-DkillGHx.mjs → zh-CN-wEUNhuHM.mjs} +18 -18
- package/package.json +2 -3
- package/dist/web/assets/AISettings-sVI4PTNB.js +0 -264
- package/dist/web/assets/ExtractionViewer-DNrkSECj.js +0 -1
- package/dist/web/assets/index-BU58oIRd.css +0 -2
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { C as
|
|
1
|
+
import { C as description, E as version, O as doctorDiagnosticsTableRows, S as seedConfig, T as package_default, _ as DEFAULT_PROMPT_CONFIG, a as parseJsonSchema, b as AIConfigSchema, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MINERU_CONFIG, h as DEFAULT_MINERU_API_CONFIG, i as JsonSchemaDefinitionSchema, k as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as PLACEHOLDER_SCHEMA, w as name, x as createConfig, y as PLACEHOLDER_TEXT } from "./doctor-collector-CGo5dgHm.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -17,14 +17,15 @@ import Database from "better-sqlite3";
|
|
|
17
17
|
import pc from "picocolors";
|
|
18
18
|
import { Buffer } from "node:buffer";
|
|
19
19
|
import * as XLSX from "xlsx";
|
|
20
|
-
import { getEncoding } from "js-tiktoken";
|
|
21
20
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
22
21
|
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
23
22
|
import pRetry from "p-retry";
|
|
23
|
+
import mime from "mime";
|
|
24
|
+
import { TextDecoder, promisify } from "node:util";
|
|
25
|
+
import { fileTypeFromBuffer, fileTypeFromFile } from "file-type";
|
|
24
26
|
import { jsonrepair } from "jsonrepair";
|
|
25
27
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
26
28
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
27
|
-
import { marked } from "marked";
|
|
28
29
|
import crypto from "node:crypto";
|
|
29
30
|
import { Client, extractNotionId } from "@notionhq/client";
|
|
30
31
|
import { execa } from "execa";
|
|
@@ -32,7 +33,6 @@ import { glob, globSync } from "tinyglobby";
|
|
|
32
33
|
import { extractText, getDocumentProxy, getMeta } from "unpdf";
|
|
33
34
|
import AdmZip from "adm-zip";
|
|
34
35
|
import { execFile } from "node:child_process";
|
|
35
|
-
import { promisify } from "node:util";
|
|
36
36
|
import * as chokidar from "chokidar";
|
|
37
37
|
import { serve } from "@hono/node-server";
|
|
38
38
|
import open from "open";
|
|
@@ -12860,6 +12860,80 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
|
|
|
12860
12860
|
});
|
|
12861
12861
|
}
|
|
12862
12862
|
|
|
12863
|
+
//#endregion
|
|
12864
|
+
//#region src/core/input-file-kind.ts
|
|
12865
|
+
const UTF8_DECODER = new TextDecoder("utf-8", { fatal: true });
|
|
12866
|
+
const SVG_START_RE = /^\s*<svg[\s>]/i;
|
|
12867
|
+
const SVG_ANY_RE = /<svg[\s>]/i;
|
|
12868
|
+
function isSupportedImageMime(mime$1) {
|
|
12869
|
+
return !!mime$1 && [
|
|
12870
|
+
"image/png",
|
|
12871
|
+
"image/jpeg",
|
|
12872
|
+
"image/webp"
|
|
12873
|
+
].includes(mime$1);
|
|
12874
|
+
}
|
|
12875
|
+
function detectTextKind(buffer) {
|
|
12876
|
+
try {
|
|
12877
|
+
const text$1 = UTF8_DECODER.decode(buffer);
|
|
12878
|
+
if (SVG_START_RE.test(text$1) || SVG_ANY_RE.test(text$1.slice(0, 4096))) return {
|
|
12879
|
+
kind: "unsupported",
|
|
12880
|
+
mime: "image/svg+xml"
|
|
12881
|
+
};
|
|
12882
|
+
return {
|
|
12883
|
+
kind: "text",
|
|
12884
|
+
mime: "text/plain"
|
|
12885
|
+
};
|
|
12886
|
+
} catch {
|
|
12887
|
+
return { kind: "unsupported" };
|
|
12888
|
+
}
|
|
12889
|
+
}
|
|
12890
|
+
async function detectInputFileKind(filePath) {
|
|
12891
|
+
const detected = await fileTypeFromFile(filePath);
|
|
12892
|
+
if (detected?.mime === "application/pdf") return {
|
|
12893
|
+
kind: "pdf",
|
|
12894
|
+
mime: detected.mime
|
|
12895
|
+
};
|
|
12896
|
+
if (isSupportedImageMime(detected?.mime)) return {
|
|
12897
|
+
kind: "image",
|
|
12898
|
+
mime: detected?.mime
|
|
12899
|
+
};
|
|
12900
|
+
return detectTextKind(await fs.readFile(filePath));
|
|
12901
|
+
}
|
|
12902
|
+
async function detectInputBufferKind(buffer) {
|
|
12903
|
+
const detected = await fileTypeFromBuffer(buffer);
|
|
12904
|
+
if (detected?.mime === "application/pdf") return {
|
|
12905
|
+
kind: "pdf",
|
|
12906
|
+
mime: detected.mime
|
|
12907
|
+
};
|
|
12908
|
+
if (isSupportedImageMime(detected?.mime)) return {
|
|
12909
|
+
kind: "image",
|
|
12910
|
+
mime: detected?.mime
|
|
12911
|
+
};
|
|
12912
|
+
return detectTextKind(buffer);
|
|
12913
|
+
}
|
|
12914
|
+
|
|
12915
|
+
//#endregion
|
|
12916
|
+
//#region src/core/ai-extraction/file-utils.ts
|
|
12917
|
+
async function detectMimeType(filePath) {
|
|
12918
|
+
return (await detectInputFileKind(filePath)).mime ?? mime.getType(filePath) ?? "application/octet-stream";
|
|
12919
|
+
}
|
|
12920
|
+
async function readFilePart(filePath) {
|
|
12921
|
+
const mimeStr = await detectMimeType(filePath);
|
|
12922
|
+
const buffer = await fs.readFile(filePath);
|
|
12923
|
+
const name$1 = path.basename(filePath);
|
|
12924
|
+
if (mimeStr.startsWith("image/")) return {
|
|
12925
|
+
type: "image",
|
|
12926
|
+
image: buffer,
|
|
12927
|
+
mimeType: mimeStr
|
|
12928
|
+
};
|
|
12929
|
+
return {
|
|
12930
|
+
type: "file",
|
|
12931
|
+
data: buffer,
|
|
12932
|
+
mediaType: mimeStr,
|
|
12933
|
+
filename: name$1
|
|
12934
|
+
};
|
|
12935
|
+
}
|
|
12936
|
+
|
|
12863
12937
|
//#endregion
|
|
12864
12938
|
//#region src/core/ai-extraction/json-utils.ts
|
|
12865
12939
|
function parseJsonLike(text$1) {
|
|
@@ -12920,10 +12994,25 @@ function filterCompatible(models, inputTokens, outputTokens) {
|
|
|
12920
12994
|
});
|
|
12921
12995
|
}
|
|
12922
12996
|
function selectModel(input) {
|
|
12923
|
-
const { models, inputTokens, outputTokens } = input;
|
|
12997
|
+
const { models, isImage, fileName, inputTokens, outputTokens } = input;
|
|
12924
12998
|
if (models.length === 0) throw new Error(t("errors.ai.noModels"));
|
|
12925
12999
|
let candidates = filterCompatible(models, inputTokens, outputTokens);
|
|
12926
13000
|
if (candidates.length === 0) candidates = models;
|
|
13001
|
+
if (isImage) {
|
|
13002
|
+
const visionModel = candidates.find((m) => m.capabilities.vision);
|
|
13003
|
+
if (!visionModel) {
|
|
13004
|
+
const hint = fileName ? ` (${fileName})` : "";
|
|
13005
|
+
const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
|
|
13006
|
+
tokens: inputTokens,
|
|
13007
|
+
hint
|
|
13008
|
+
}) : t("errors.ai.noVisionModel", { hint });
|
|
13009
|
+
throw new Error(msg + t("errors.ai.addSuitableModel"));
|
|
13010
|
+
}
|
|
13011
|
+
return {
|
|
13012
|
+
name: visionModel.name,
|
|
13013
|
+
capabilities: visionModel.capabilities
|
|
13014
|
+
};
|
|
13015
|
+
}
|
|
12927
13016
|
const soModel = candidates.find((m) => m.capabilities.structuredOutput);
|
|
12928
13017
|
if (soModel) return {
|
|
12929
13018
|
name: soModel.name,
|
|
@@ -12937,46 +13026,36 @@ function selectModel(input) {
|
|
|
12937
13026
|
|
|
12938
13027
|
//#endregion
|
|
12939
13028
|
//#region src/core/ai-extraction/prompt-generator.ts
|
|
12940
|
-
|
|
12941
|
-
const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
|
|
12942
|
-
function splitIdentifier(name$1) {
|
|
12943
|
-
return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
|
|
12944
|
-
}
|
|
12945
|
-
function propertyToDescription(name$1, prop, indent = "", required = false) {
|
|
13029
|
+
function propertyToDescription(name$1, prop, indent = "") {
|
|
12946
13030
|
const lines = [];
|
|
12947
13031
|
let typeStr = prop.type;
|
|
12948
13032
|
if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
|
|
12949
|
-
lines.push(`${indent}- ${name$1}: ${typeStr}
|
|
12950
|
-
const terms = splitIdentifier(name$1);
|
|
12951
|
-
if (terms.length > 1) lines.push(`${indent} search terms: ${terms.join(", ")}`);
|
|
12952
|
-
if (prop.description) lines.push(`${indent} description: ${prop.description}`);
|
|
13033
|
+
lines.push(`${indent}- ${name$1}: ${typeStr}`);
|
|
12953
13034
|
if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent} length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
|
|
12954
|
-
if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent} range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
|
|
12955
13035
|
if (prop.format) lines.push(`${indent} format: ${prop.format}`);
|
|
12956
13036
|
if (prop.unique) lines.push(`${indent} unique: true`);
|
|
12957
13037
|
if (prop.default !== void 0) lines.push(`${indent} default: ${JSON.stringify(prop.default)}`);
|
|
12958
13038
|
return lines.join("\n");
|
|
12959
13039
|
}
|
|
12960
|
-
function nestedPropertyToDescription(name$1, prop, indent = ""
|
|
13040
|
+
function nestedPropertyToDescription(name$1, prop, indent = "") {
|
|
12961
13041
|
const lines = [];
|
|
12962
|
-
const isRequired = requiredFields.includes(name$1);
|
|
12963
13042
|
if (prop.nested?.enabled && prop.type === "object") {
|
|
12964
13043
|
const relation = prop.nested.relation || "has-one";
|
|
12965
|
-
lines.push(`${indent}- ${name$1}: object (related table, ${relation})
|
|
12966
|
-
if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
13044
|
+
lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
|
|
13045
|
+
if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
|
|
12967
13046
|
return lines.join("\n");
|
|
12968
13047
|
}
|
|
12969
13048
|
if (prop.type === "array" && prop.items?.nested?.enabled) {
|
|
12970
13049
|
const relation = prop.items.nested.relation || "has-many";
|
|
12971
|
-
lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})
|
|
12972
|
-
if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
13050
|
+
lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
|
|
13051
|
+
if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
|
|
12973
13052
|
return lines.join("\n");
|
|
12974
13053
|
}
|
|
12975
|
-
lines.push(propertyToDescription(name$1, prop, indent
|
|
12976
|
-
if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
13054
|
+
lines.push(propertyToDescription(name$1, prop, indent));
|
|
13055
|
+
if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
|
|
12977
13056
|
if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
|
|
12978
13057
|
lines.push(`${indent} item fields:`);
|
|
12979
|
-
for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}
|
|
13058
|
+
for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
|
|
12980
13059
|
}
|
|
12981
13060
|
return lines.join("\n");
|
|
12982
13061
|
}
|
|
@@ -12988,7 +13067,7 @@ function schemaToDescription(schema) {
|
|
|
12988
13067
|
lines.push("Fields:");
|
|
12989
13068
|
for (const [name$1, prop] of Object.entries(schema.properties)) {
|
|
12990
13069
|
const property = prop;
|
|
12991
|
-
lines.push(nestedPropertyToDescription(name$1, property
|
|
13070
|
+
lines.push(nestedPropertyToDescription(name$1, property));
|
|
12992
13071
|
}
|
|
12993
13072
|
if (schema.examples && schema.examples.length > 0) {
|
|
12994
13073
|
lines.push("");
|
|
@@ -13033,6 +13112,33 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
|
|
|
13033
13112
|
].join("\n");
|
|
13034
13113
|
}
|
|
13035
13114
|
|
|
13115
|
+
//#endregion
|
|
13116
|
+
//#region src/core/ai-extraction/snapshot.ts
|
|
13117
|
+
const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
|
|
13118
|
+
const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
|
|
13119
|
+
async function loadPromptSnapshot(aiexDir, tableName) {
|
|
13120
|
+
const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
|
|
13121
|
+
try {
|
|
13122
|
+
const content = await fs.readFile(snapshotPath, "utf-8");
|
|
13123
|
+
const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
|
|
13124
|
+
const userMatch = content.match(USER_PROMPT_REGEX);
|
|
13125
|
+
if (systemMatch && userMatch) return {
|
|
13126
|
+
system: systemMatch[1].trim(),
|
|
13127
|
+
user: userMatch[1].trim()
|
|
13128
|
+
};
|
|
13129
|
+
} catch {}
|
|
13130
|
+
return null;
|
|
13131
|
+
}
|
|
13132
|
+
async function savePromptSnapshot(schema, aiexDir) {
|
|
13133
|
+
const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13134
|
+
const outputDir = path.join(aiexDir, "extracted");
|
|
13135
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
13136
|
+
const fileName = `${schema.table.name}.prompt.md`;
|
|
13137
|
+
const outputPath = path.join(outputDir, fileName);
|
|
13138
|
+
await fs.writeFile(outputPath, content);
|
|
13139
|
+
return outputPath;
|
|
13140
|
+
}
|
|
13141
|
+
|
|
13036
13142
|
//#endregion
|
|
13037
13143
|
//#region src/core/ai-extraction/telemetry.ts
|
|
13038
13144
|
let langfuseInitialized = false;
|
|
@@ -13075,7 +13181,7 @@ function propertyToExtractionSchema(property) {
|
|
|
13075
13181
|
}
|
|
13076
13182
|
return { type: nullableType(property.type) };
|
|
13077
13183
|
}
|
|
13078
|
-
function isRecord
|
|
13184
|
+
function isRecord(value) {
|
|
13079
13185
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13080
13186
|
}
|
|
13081
13187
|
function schemaToExtractionOutputSchema(schema) {
|
|
@@ -13113,7 +13219,7 @@ function validatePropertyValue(path$1, property, value, issues) {
|
|
|
13113
13219
|
}
|
|
13114
13220
|
return;
|
|
13115
13221
|
case "object":
|
|
13116
|
-
if (!isRecord
|
|
13222
|
+
if (!isRecord(value)) {
|
|
13117
13223
|
issues.push(`${path$1}: expected object or null`);
|
|
13118
13224
|
return;
|
|
13119
13225
|
}
|
|
@@ -13136,7 +13242,7 @@ function validateProperties(basePath, properties, data, issues) {
|
|
|
13136
13242
|
}
|
|
13137
13243
|
}
|
|
13138
13244
|
function validateExtractedData(schema, data) {
|
|
13139
|
-
if (!isRecord
|
|
13245
|
+
if (!isRecord(data)) return {
|
|
13140
13246
|
success: false,
|
|
13141
13247
|
error: "Extracted data must be a JSON object."
|
|
13142
13248
|
};
|
|
@@ -13153,11 +13259,13 @@ function validateExtractedData(schema, data) {
|
|
|
13153
13259
|
//#region src/core/ai-extraction/extractor.ts
|
|
13154
13260
|
const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
|
|
13155
13261
|
async function extractStructuredData(input) {
|
|
13156
|
-
const { config, schema, text: text$1, modelOverride } = input;
|
|
13262
|
+
const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
|
|
13157
13263
|
if (!config.provider.apiKey) return {
|
|
13158
13264
|
success: false,
|
|
13159
13265
|
error: t("errors.ai.apiKeyMissing")
|
|
13160
13266
|
};
|
|
13267
|
+
const useFileContent = !!file;
|
|
13268
|
+
const isImageFile = (useFileContent ? await detectMimeType(file) : "").startsWith("image/");
|
|
13161
13269
|
const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
|
|
13162
13270
|
const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
|
|
13163
13271
|
const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
|
|
@@ -13165,6 +13273,8 @@ async function extractStructuredData(input) {
|
|
|
13165
13273
|
try {
|
|
13166
13274
|
selected = modelOverride ?? selectModel({
|
|
13167
13275
|
models: config.provider.models,
|
|
13276
|
+
isImage: isImageFile,
|
|
13277
|
+
fileName: file,
|
|
13168
13278
|
inputTokens,
|
|
13169
13279
|
outputTokens
|
|
13170
13280
|
});
|
|
@@ -13184,7 +13294,18 @@ async function extractStructuredData(input) {
|
|
|
13184
13294
|
apiKey: config.provider.apiKey,
|
|
13185
13295
|
supportsStructuredOutputs: useStructuredOutput
|
|
13186
13296
|
});
|
|
13187
|
-
|
|
13297
|
+
let system;
|
|
13298
|
+
let user;
|
|
13299
|
+
const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
|
|
13300
|
+
const promptText = file ? PLACEHOLDER_TEXT : text$1;
|
|
13301
|
+
if (snapshot) {
|
|
13302
|
+
system = snapshot.system;
|
|
13303
|
+
user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
|
|
13304
|
+
} else {
|
|
13305
|
+
const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13306
|
+
system = generated.system;
|
|
13307
|
+
user = generated.user;
|
|
13308
|
+
}
|
|
13188
13309
|
const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
|
|
13189
13310
|
const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
|
|
13190
13311
|
let systemPrompt = system;
|
|
@@ -13199,16 +13320,38 @@ async function extractStructuredData(input) {
|
|
|
13199
13320
|
let parseError;
|
|
13200
13321
|
let validationError;
|
|
13201
13322
|
try {
|
|
13202
|
-
|
|
13203
|
-
|
|
13204
|
-
|
|
13205
|
-
|
|
13206
|
-
|
|
13207
|
-
|
|
13208
|
-
|
|
13209
|
-
|
|
13210
|
-
|
|
13211
|
-
|
|
13323
|
+
if (useFileContent) {
|
|
13324
|
+
const filePart = await readFilePart(file);
|
|
13325
|
+
const fileName = filePart.type === "file" ? filePart.filename : path.basename(file);
|
|
13326
|
+
const contentParts = [{
|
|
13327
|
+
type: "text",
|
|
13328
|
+
text: userPrompt.includes(PLACEHOLDER_TEXT) ? userPrompt.replaceAll(PLACEHOLDER_TEXT, text$1 || `Data is contained in the attached file: ${fileName}`) : userPrompt
|
|
13329
|
+
}, filePart];
|
|
13330
|
+
const fileOpts = {
|
|
13331
|
+
model: provider.chatModel(selected.name),
|
|
13332
|
+
system: systemPrompt,
|
|
13333
|
+
messages: [{
|
|
13334
|
+
role: "user",
|
|
13335
|
+
content: contentParts
|
|
13336
|
+
}],
|
|
13337
|
+
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13338
|
+
maxRetries: 0,
|
|
13339
|
+
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13340
|
+
};
|
|
13341
|
+
if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
|
|
13342
|
+
result = await withRetry(() => generateText(fileOpts), input.onRetry);
|
|
13343
|
+
} else {
|
|
13344
|
+
const textOpts = {
|
|
13345
|
+
model: provider.chatModel(selected.name),
|
|
13346
|
+
system: systemPrompt,
|
|
13347
|
+
prompt: userPrompt,
|
|
13348
|
+
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13349
|
+
maxRetries: 0,
|
|
13350
|
+
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13351
|
+
};
|
|
13352
|
+
if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
|
|
13353
|
+
result = await withRetry(() => generateText(textOpts), input.onRetry);
|
|
13354
|
+
}
|
|
13212
13355
|
if (result.usage) {
|
|
13213
13356
|
totalPromptTokens += result.usage.inputTokens ?? 0;
|
|
13214
13357
|
totalCompletionTokens += result.usage.outputTokens ?? 0;
|
|
@@ -13224,16 +13367,27 @@ async function extractStructuredData(input) {
|
|
|
13224
13367
|
}
|
|
13225
13368
|
if (!parseError && data !== void 0) {
|
|
13226
13369
|
const validation = validateExtractedData(schema, data);
|
|
13227
|
-
if (validation.success)
|
|
13228
|
-
|
|
13229
|
-
|
|
13230
|
-
|
|
13231
|
-
|
|
13232
|
-
|
|
13233
|
-
|
|
13234
|
-
|
|
13235
|
-
|
|
13236
|
-
|
|
13370
|
+
if (validation.success) {
|
|
13371
|
+
const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
|
|
13372
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
13373
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
13374
|
+
const outputFileName = `${schema.table.name}-${timestamp}.json`;
|
|
13375
|
+
const outputPath = path.join(outputDir, outputFileName);
|
|
13376
|
+
await writeFile(outputPath, data, {
|
|
13377
|
+
spaces: 2,
|
|
13378
|
+
EOL: "\n"
|
|
13379
|
+
});
|
|
13380
|
+
return {
|
|
13381
|
+
success: true,
|
|
13382
|
+
outputPath,
|
|
13383
|
+
data,
|
|
13384
|
+
tokensUsed: {
|
|
13385
|
+
prompt: totalPromptTokens,
|
|
13386
|
+
completion: totalCompletionTokens,
|
|
13387
|
+
total: totalPromptTokens + totalCompletionTokens
|
|
13388
|
+
}
|
|
13389
|
+
};
|
|
13390
|
+
} else validationError = validation.error;
|
|
13237
13391
|
}
|
|
13238
13392
|
const errorMsg = parseError || validationError || "Unknown validation error";
|
|
13239
13393
|
lastError = errorMsg;
|
|
@@ -13244,14 +13398,11 @@ async function extractStructuredData(input) {
|
|
|
13244
13398
|
CRITICAL RULES:
|
|
13245
13399
|
1. Only correct the fields that failed validation.
|
|
13246
13400
|
2. Preserve all other correctly extracted fields and their values exactly.
|
|
13247
|
-
3.
|
|
13248
|
-
4. Remove any fields not defined by the JSON Schema.
|
|
13249
|
-
5. Normalize values to the expected JSON type without changing the intended meaning.
|
|
13250
|
-
6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
|
|
13401
|
+
3. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
|
|
13251
13402
|
userPrompt = `The JSON data you generated previously failed validation. Please correct it.
|
|
13252
13403
|
|
|
13253
13404
|
[Original Text]
|
|
13254
|
-
${text$1 || "
|
|
13405
|
+
${text$1 || "Data is contained in the attached file."}
|
|
13255
13406
|
|
|
13256
13407
|
[JSON Schema Definition]
|
|
13257
13408
|
${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
|
|
@@ -13262,11 +13413,6 @@ ${invalidJson}
|
|
|
13262
13413
|
[Validation Error Details]
|
|
13263
13414
|
${errorMsg}
|
|
13264
13415
|
|
|
13265
|
-
Correction checklist:
|
|
13266
|
-
- Fix each field path mentioned in the validation error.
|
|
13267
|
-
- Keep schema-valid fields unchanged.
|
|
13268
|
-
- Do not invent missing facts; use null when the original text does not support a value.
|
|
13269
|
-
|
|
13270
13416
|
Please output the corrected JSON object now:`;
|
|
13271
13417
|
}
|
|
13272
13418
|
}
|
|
@@ -13419,343 +13565,6 @@ function insertExtractedData(db, schema, data) {
|
|
|
13419
13565
|
}
|
|
13420
13566
|
}
|
|
13421
13567
|
|
|
13422
|
-
//#endregion
|
|
13423
|
-
//#region src/core/ai-extraction/json-merger.ts
|
|
13424
|
-
function isRecord$1(value) {
|
|
13425
|
-
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13426
|
-
}
|
|
13427
|
-
function stableKey(value) {
|
|
13428
|
-
if (!isRecord$1(value)) return JSON.stringify(value);
|
|
13429
|
-
return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
|
|
13430
|
-
acc[key] = value[key];
|
|
13431
|
-
return acc;
|
|
13432
|
-
}, {}));
|
|
13433
|
-
}
|
|
13434
|
-
function isBlankString(value) {
|
|
13435
|
-
return typeof value === "string" && value.trim() === "";
|
|
13436
|
-
}
|
|
13437
|
-
function isPlaceholderString$1(value) {
|
|
13438
|
-
if (typeof value !== "string") return false;
|
|
13439
|
-
const normalized = value.trim().toLowerCase();
|
|
13440
|
-
return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
|
|
13441
|
-
}
|
|
13442
|
-
function pickPrimitiveValue(values) {
|
|
13443
|
-
const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
|
|
13444
|
-
if (meaningful.length === 0) return null;
|
|
13445
|
-
if (typeof meaningful[0] === "boolean") {
|
|
13446
|
-
const trueCount = meaningful.filter(Boolean).length;
|
|
13447
|
-
return trueCount >= meaningful.length - trueCount;
|
|
13448
|
-
}
|
|
13449
|
-
return meaningful[0];
|
|
13450
|
-
}
|
|
13451
|
-
function mergePropertyValue(property, values) {
|
|
13452
|
-
const nonNullValues = values.filter((v) => v !== null && v !== void 0);
|
|
13453
|
-
if (nonNullValues.length === 0) return null;
|
|
13454
|
-
if (property.type === "array") {
|
|
13455
|
-
const concatenated = [];
|
|
13456
|
-
const seen = /* @__PURE__ */ new Set();
|
|
13457
|
-
for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
|
|
13458
|
-
const key = stableKey(item);
|
|
13459
|
-
if (!seen.has(key)) {
|
|
13460
|
-
seen.add(key);
|
|
13461
|
-
concatenated.push(item);
|
|
13462
|
-
}
|
|
13463
|
-
}
|
|
13464
|
-
return concatenated;
|
|
13465
|
-
}
|
|
13466
|
-
if (property.type === "object") {
|
|
13467
|
-
const childProperties = property.properties;
|
|
13468
|
-
if (!childProperties) {
|
|
13469
|
-
const mergedObj$1 = {};
|
|
13470
|
-
for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
|
|
13471
|
-
return mergedObj$1;
|
|
13472
|
-
}
|
|
13473
|
-
const mergedObj = {};
|
|
13474
|
-
for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
|
|
13475
|
-
return mergedObj;
|
|
13476
|
-
}
|
|
13477
|
-
return pickPrimitiveValue(nonNullValues);
|
|
13478
|
-
}
|
|
13479
|
-
/**
|
|
13480
|
-
* Merges structured extraction outputs from multiple document chunks
|
|
13481
|
-
* according to the schema properties.
|
|
13482
|
-
*/
|
|
13483
|
-
function mergeExtractionResults(schema, results) {
|
|
13484
|
-
if (results.length === 0) return {};
|
|
13485
|
-
if (results.length === 1) return results[0];
|
|
13486
|
-
const merged = {};
|
|
13487
|
-
for (const [propName, propDef] of Object.entries(schema.properties)) {
|
|
13488
|
-
if (propDef.primary && propDef.autoIncrement) continue;
|
|
13489
|
-
merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
|
|
13490
|
-
}
|
|
13491
|
-
return merged;
|
|
13492
|
-
}
|
|
13493
|
-
|
|
13494
|
-
//#endregion
|
|
13495
|
-
//#region src/core/ai-extraction/snapshot.ts
|
|
13496
|
-
async function savePromptSnapshot(schema, aiexDir) {
|
|
13497
|
-
const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
|
|
13498
|
-
const outputDir = path.join(aiexDir, "extracted");
|
|
13499
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
13500
|
-
const fileName = `${schema.table.name}.prompt.md`;
|
|
13501
|
-
const outputPath = path.join(outputDir, fileName);
|
|
13502
|
-
await fs.writeFile(outputPath, content);
|
|
13503
|
-
return outputPath;
|
|
13504
|
-
}
|
|
13505
|
-
|
|
13506
|
-
//#endregion
|
|
13507
|
-
//#region src/core/ai-extraction/text-splitter.ts
|
|
13508
|
-
const encoding$1 = getEncoding("cl100k_base");
|
|
13509
|
-
const MAX_OVERLAP_RATIO = .15;
|
|
13510
|
-
const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
|
|
13511
|
-
const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
|
|
13512
|
-
const LEADING_TABLE_PIPE_RE = /^\|/;
|
|
13513
|
-
const TRAILING_TABLE_PIPE_RE = /\|$/;
|
|
13514
|
-
function countTokens(text$1) {
|
|
13515
|
-
return encoding$1.encode(text$1).length;
|
|
13516
|
-
}
|
|
13517
|
-
function calculateChunkTokenBudget(options = {}) {
|
|
13518
|
-
const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
|
|
13519
|
-
const modelMaxTokens = options.modelMaxTokens;
|
|
13520
|
-
if (!modelMaxTokens) return configuredMaxTokens;
|
|
13521
|
-
const outputReserveTokens = options.outputReserveTokens ?? 2e3;
|
|
13522
|
-
const promptReserveTokens = options.promptReserveTokens ?? 1200;
|
|
13523
|
-
const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
|
|
13524
|
-
const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
|
|
13525
|
-
return Math.max(512, Math.min(configuredMaxTokens, available));
|
|
13526
|
-
}
|
|
13527
|
-
function formatHeadingContext(headings) {
|
|
13528
|
-
const active = headings.filter(Boolean);
|
|
13529
|
-
if (active.length === 0) return "";
|
|
13530
|
-
return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
|
|
13531
|
-
}
|
|
13532
|
-
function getMetadata(headings) {
|
|
13533
|
-
return {
|
|
13534
|
-
h1: headings[0] || void 0,
|
|
13535
|
-
h2: headings[1] || void 0,
|
|
13536
|
-
h3: headings[2] || void 0,
|
|
13537
|
-
h4: headings[3] || void 0
|
|
13538
|
-
};
|
|
13539
|
-
}
|
|
13540
|
-
function getHeadingPath(metadata) {
|
|
13541
|
-
return [
|
|
13542
|
-
metadata.h1,
|
|
13543
|
-
metadata.h2,
|
|
13544
|
-
metadata.h3,
|
|
13545
|
-
metadata.h4
|
|
13546
|
-
].filter(Boolean);
|
|
13547
|
-
}
|
|
13548
|
-
function finalizeChunks(chunks, sourceText) {
|
|
13549
|
-
let searchStart = 0;
|
|
13550
|
-
const totalChunks = chunks.length;
|
|
13551
|
-
return chunks.map((chunk, index) => {
|
|
13552
|
-
const tokenCount = countTokens(chunk.pageContent);
|
|
13553
|
-
let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
|
|
13554
|
-
if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
|
|
13555
|
-
const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
|
|
13556
|
-
if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
|
|
13557
|
-
return {
|
|
13558
|
-
...chunk,
|
|
13559
|
-
chunkIndex: index,
|
|
13560
|
-
totalChunks,
|
|
13561
|
-
tokenCount,
|
|
13562
|
-
headingPath: getHeadingPath(chunk.metadata),
|
|
13563
|
-
charStart: charStart >= 0 ? charStart : void 0,
|
|
13564
|
-
charEnd
|
|
13565
|
-
};
|
|
13566
|
-
});
|
|
13567
|
-
}
|
|
13568
|
-
function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
|
|
13569
|
-
return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
|
|
13570
|
-
}
|
|
13571
|
-
function splitMarkdownTable(tableText, maxTokens) {
|
|
13572
|
-
if (countTokens(tableText) <= maxTokens) return [tableText];
|
|
13573
|
-
const lines = tableText.split("\n");
|
|
13574
|
-
const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
|
|
13575
|
-
const separatorIndex = lines.findIndex((line, index) => {
|
|
13576
|
-
if (index <= headerIndex) return false;
|
|
13577
|
-
const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
|
|
13578
|
-
return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
|
|
13579
|
-
});
|
|
13580
|
-
if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
|
|
13581
|
-
const prefix = lines.slice(0, headerIndex);
|
|
13582
|
-
const header = lines[headerIndex];
|
|
13583
|
-
const separator = lines[separatorIndex];
|
|
13584
|
-
const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
|
|
13585
|
-
const chunks = [];
|
|
13586
|
-
let currentRows = [];
|
|
13587
|
-
const buildTable = (tableRows) => {
|
|
13588
|
-
return [
|
|
13589
|
-
...prefix,
|
|
13590
|
-
header,
|
|
13591
|
-
separator,
|
|
13592
|
-
...tableRows
|
|
13593
|
-
].join("\n");
|
|
13594
|
-
};
|
|
13595
|
-
for (const row of rows) {
|
|
13596
|
-
const candidateRows = [...currentRows, row];
|
|
13597
|
-
if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
|
|
13598
|
-
chunks.push(buildTable(currentRows));
|
|
13599
|
-
currentRows = [row];
|
|
13600
|
-
} else currentRows = candidateRows;
|
|
13601
|
-
}
|
|
13602
|
-
if (currentRows.length > 0) chunks.push(buildTable(currentRows));
|
|
13603
|
-
return chunks.length > 0 ? chunks : [tableText];
|
|
13604
|
-
}
|
|
13605
|
-
/**
|
|
13606
|
-
* Splits text recursively using a list of separators.
|
|
13607
|
-
* Preserves the separators when re-joining.
|
|
13608
|
-
*/
|
|
13609
|
-
function splitTextRecursively(text$1, maxTokens, separators = [
|
|
13610
|
-
"\n\n",
|
|
13611
|
-
"\n",
|
|
13612
|
-
"。",
|
|
13613
|
-
". ",
|
|
13614
|
-
" "
|
|
13615
|
-
]) {
|
|
13616
|
-
if (countTokens(text$1) <= maxTokens) return [text$1];
|
|
13617
|
-
if (separators.length === 0) {
|
|
13618
|
-
const chunks = [];
|
|
13619
|
-
let current = "";
|
|
13620
|
-
for (const char of text$1) if (countTokens(current + char) > maxTokens) {
|
|
13621
|
-
chunks.push(current);
|
|
13622
|
-
current = char;
|
|
13623
|
-
} else current += char;
|
|
13624
|
-
if (current) chunks.push(current);
|
|
13625
|
-
return chunks;
|
|
13626
|
-
}
|
|
13627
|
-
const separator = separators[0];
|
|
13628
|
-
const nextSeparators = separators.slice(1);
|
|
13629
|
-
const parts = text$1.split(separator);
|
|
13630
|
-
const result = [];
|
|
13631
|
-
let currentChunk = [];
|
|
13632
|
-
let currentChunkTokens = 0;
|
|
13633
|
-
for (let i = 0; i < parts.length; i++) {
|
|
13634
|
-
const part = parts[i];
|
|
13635
|
-
const itemText = part + (i < parts.length - 1 ? separator : "");
|
|
13636
|
-
const partTokens = countTokens(itemText);
|
|
13637
|
-
if (partTokens > maxTokens) {
|
|
13638
|
-
if (currentChunk.length > 0) {
|
|
13639
|
-
result.push(currentChunk.join(""));
|
|
13640
|
-
currentChunk = [];
|
|
13641
|
-
currentChunkTokens = 0;
|
|
13642
|
-
}
|
|
13643
|
-
const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
|
|
13644
|
-
for (let j = 0; j < subParts.length; j++) {
|
|
13645
|
-
const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
|
|
13646
|
-
result.push(finalSub);
|
|
13647
|
-
}
|
|
13648
|
-
} else if (currentChunkTokens + partTokens > maxTokens) {
|
|
13649
|
-
result.push(currentChunk.join(""));
|
|
13650
|
-
currentChunk = [itemText];
|
|
13651
|
-
currentChunkTokens = partTokens;
|
|
13652
|
-
} else {
|
|
13653
|
-
currentChunk.push(itemText);
|
|
13654
|
-
currentChunkTokens += partTokens;
|
|
13655
|
-
}
|
|
13656
|
-
}
|
|
13657
|
-
if (currentChunk.length > 0) result.push(currentChunk.join(""));
|
|
13658
|
-
return result;
|
|
13659
|
-
}
|
|
13660
|
-
/**
|
|
13661
|
-
* Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
|
|
13662
|
-
* Protects tables, list items, and code blocks from being broken.
|
|
13663
|
-
*/
|
|
13664
|
-
function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
|
|
13665
|
-
const tokens = marked.lexer(text$1);
|
|
13666
|
-
const chunks = [];
|
|
13667
|
-
const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
|
|
13668
|
-
let currentHeadings = [];
|
|
13669
|
-
let currentChunkList = [];
|
|
13670
|
-
let accumulatedTokens = 0;
|
|
13671
|
-
const flushCurrentChunk = (isHeadingChange = false) => {
|
|
13672
|
-
if (currentChunkList.length === 0) return;
|
|
13673
|
-
const pageContent = currentChunkList.map((item) => item.text).join("");
|
|
13674
|
-
const firstHeadings = currentChunkList[0].headings;
|
|
13675
|
-
chunks.push({
|
|
13676
|
-
pageContent,
|
|
13677
|
-
metadata: getMetadata(firstHeadings)
|
|
13678
|
-
});
|
|
13679
|
-
if (isHeadingChange || effectiveOverlapTokens <= 0) {
|
|
13680
|
-
currentChunkList = [];
|
|
13681
|
-
accumulatedTokens = 0;
|
|
13682
|
-
} else {
|
|
13683
|
-
const overlapItems = [];
|
|
13684
|
-
let currentOverlapTokens = 0;
|
|
13685
|
-
for (let i = currentChunkList.length - 1; i >= 0; i--) {
|
|
13686
|
-
const item = currentChunkList[i];
|
|
13687
|
-
const itemTokens = countTokens(item.text);
|
|
13688
|
-
if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
|
|
13689
|
-
overlapItems.unshift(item);
|
|
13690
|
-
currentOverlapTokens += itemTokens;
|
|
13691
|
-
}
|
|
13692
|
-
currentChunkList = [...overlapItems];
|
|
13693
|
-
accumulatedTokens = currentOverlapTokens;
|
|
13694
|
-
}
|
|
13695
|
-
};
|
|
13696
|
-
for (const token of tokens) {
|
|
13697
|
-
if (token.type === "space") {
|
|
13698
|
-
if (currentChunkList.length > 0) {
|
|
13699
|
-
currentChunkList[currentChunkList.length - 1].text += token.raw;
|
|
13700
|
-
accumulatedTokens += countTokens(token.raw);
|
|
13701
|
-
}
|
|
13702
|
-
continue;
|
|
13703
|
-
}
|
|
13704
|
-
if (token.type === "heading") {
|
|
13705
|
-
flushCurrentChunk(true);
|
|
13706
|
-
const depth = token.depth;
|
|
13707
|
-
const title = token.text.trim();
|
|
13708
|
-
currentHeadings = currentHeadings.slice(0, depth - 1);
|
|
13709
|
-
currentHeadings[depth - 1] = title;
|
|
13710
|
-
}
|
|
13711
|
-
const rawText = token.raw;
|
|
13712
|
-
if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
|
|
13713
|
-
else {
|
|
13714
|
-
const isAtomic = token.type === "table" || token.type === "code";
|
|
13715
|
-
processTextBlock(rawText, currentHeadings, isAtomic);
|
|
13716
|
-
}
|
|
13717
|
-
}
|
|
13718
|
-
flushCurrentChunk(true);
|
|
13719
|
-
return finalizeChunks(chunks, text$1);
|
|
13720
|
-
function processTextBlock(blockText, headings, isAtomic = false) {
|
|
13721
|
-
const blockTokens = countTokens(blockText);
|
|
13722
|
-
const contextTokens = countTokens(formatHeadingContext(headings));
|
|
13723
|
-
const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
|
|
13724
|
-
const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
|
|
13725
|
-
if (blockTokens > budgetLimit) if (isAtomic) {
|
|
13726
|
-
flushCurrentChunk(false);
|
|
13727
|
-
const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
|
|
13728
|
-
for (const block of atomicBlocks) {
|
|
13729
|
-
currentChunkList.push({
|
|
13730
|
-
text: block,
|
|
13731
|
-
headings: [...headings]
|
|
13732
|
-
});
|
|
13733
|
-
accumulatedTokens = countTokens(block);
|
|
13734
|
-
flushCurrentChunk(false);
|
|
13735
|
-
}
|
|
13736
|
-
} else {
|
|
13737
|
-
flushCurrentChunk(false);
|
|
13738
|
-
const subBlocks = splitTextRecursively(blockText, budgetLimit);
|
|
13739
|
-
for (const sub of subBlocks) {
|
|
13740
|
-
currentChunkList.push({
|
|
13741
|
-
text: sub,
|
|
13742
|
-
headings: [...headings]
|
|
13743
|
-
});
|
|
13744
|
-
accumulatedTokens += countTokens(sub);
|
|
13745
|
-
if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
|
|
13746
|
-
}
|
|
13747
|
-
}
|
|
13748
|
-
else {
|
|
13749
|
-
if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
|
|
13750
|
-
currentChunkList.push({
|
|
13751
|
-
text: blockText,
|
|
13752
|
-
headings: [...headings]
|
|
13753
|
-
});
|
|
13754
|
-
accumulatedTokens += blockTokens;
|
|
13755
|
-
}
|
|
13756
|
-
}
|
|
13757
|
-
}
|
|
13758
|
-
|
|
13759
13568
|
//#endregion
|
|
13760
13569
|
//#region src/core/extraction-audit.ts
|
|
13761
13570
|
const AUDIT_ID_RE = /^[\w.-]+$/;
|
|
@@ -13906,276 +13715,6 @@ function getFileHash(filePath) {
|
|
|
13906
13715
|
});
|
|
13907
13716
|
}
|
|
13908
13717
|
|
|
13909
|
-
//#endregion
|
|
13910
|
-
//#region src/core/ai-extraction/evidence.ts
|
|
13911
|
-
const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
|
|
13912
|
-
const FIELD_PATH_PREFIX_RE = /^\$\./;
|
|
13913
|
-
function isRecord(value) {
|
|
13914
|
-
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13915
|
-
}
|
|
13916
|
-
function stableValueKey(value) {
|
|
13917
|
-
return JSON.stringify(value);
|
|
13918
|
-
}
|
|
13919
|
-
function isPlaceholderString(value) {
|
|
13920
|
-
if (typeof value !== "string") return false;
|
|
13921
|
-
const normalized = value.trim().toLowerCase();
|
|
13922
|
-
return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
|
|
13923
|
-
}
|
|
13924
|
-
function primitiveToText(value) {
|
|
13925
|
-
if (value === null || value === void 0) return null;
|
|
13926
|
-
if (typeof value === "string") return value.trim() || null;
|
|
13927
|
-
if (typeof value === "number" || typeof value === "boolean") return String(value);
|
|
13928
|
-
return null;
|
|
13929
|
-
}
|
|
13930
|
-
function isMeaningfulValue(value) {
|
|
13931
|
-
return primitiveToText(value) !== null && !isPlaceholderString(value);
|
|
13932
|
-
}
|
|
13933
|
-
function normalizeText(value) {
|
|
13934
|
-
return value.toLowerCase().replace(/\s+/g, " ").trim();
|
|
13935
|
-
}
|
|
13936
|
-
function quoteAround(text$1, start, length) {
|
|
13937
|
-
const before = Math.max(0, start - 80);
|
|
13938
|
-
const after = Math.min(text$1.length, start + length + 80);
|
|
13939
|
-
return text$1.slice(before, after).replace(/\s+/g, " ").trim();
|
|
13940
|
-
}
|
|
13941
|
-
function findEvidence(value, chunks) {
|
|
13942
|
-
const searchText = primitiveToText(value);
|
|
13943
|
-
if (!searchText) return null;
|
|
13944
|
-
const normalizedSearchText = normalizeText(searchText);
|
|
13945
|
-
if (!normalizedSearchText) return null;
|
|
13946
|
-
for (const chunk of chunks) {
|
|
13947
|
-
if (normalizeText(chunk.text).indexOf(normalizedSearchText) === -1) continue;
|
|
13948
|
-
const rawIndex = chunk.text.toLowerCase().indexOf(searchText.toLowerCase());
|
|
13949
|
-
const quoteIndex = rawIndex >= 0 ? rawIndex : 0;
|
|
13950
|
-
return {
|
|
13951
|
-
chunkIndex: chunk.chunkIndex,
|
|
13952
|
-
headingPath: chunk.headingPath,
|
|
13953
|
-
quote: quoteAround(chunk.text, quoteIndex, searchText.length)
|
|
13954
|
-
};
|
|
13955
|
-
}
|
|
13956
|
-
return null;
|
|
13957
|
-
}
|
|
13958
|
-
function addEvidenceForProperty(fields, path$1, property, value, chunks) {
|
|
13959
|
-
if (property.type === "object" && property.properties) {
|
|
13960
|
-
const record = isRecord(value) ? value : {};
|
|
13961
|
-
for (const [childName, childProperty] of Object.entries(property.properties)) addEvidenceForProperty(fields, `${path$1}.${childName}`, childProperty, record[childName], chunks);
|
|
13962
|
-
return;
|
|
13963
|
-
}
|
|
13964
|
-
if (property.type === "array") {
|
|
13965
|
-
if (!Array.isArray(value) || value.length === 0) {
|
|
13966
|
-
fields.push({
|
|
13967
|
-
fieldPath: path$1,
|
|
13968
|
-
status: "missing",
|
|
13969
|
-
value: null,
|
|
13970
|
-
confidence: 0,
|
|
13971
|
-
note: "Array field is empty or missing."
|
|
13972
|
-
});
|
|
13973
|
-
return;
|
|
13974
|
-
}
|
|
13975
|
-
value.forEach((item, index) => {
|
|
13976
|
-
if (property.items?.type === "object" && property.items.properties) {
|
|
13977
|
-
const record = isRecord(item) ? item : {};
|
|
13978
|
-
for (const [childName, childProperty] of Object.entries(property.items.properties)) addEvidenceForProperty(fields, `${path$1}[${index}].${childName}`, childProperty, record[childName], chunks);
|
|
13979
|
-
} else addPrimitiveEvidence(fields, `${path$1}[${index}]`, item, chunks);
|
|
13980
|
-
});
|
|
13981
|
-
return;
|
|
13982
|
-
}
|
|
13983
|
-
addPrimitiveEvidence(fields, path$1, value, chunks);
|
|
13984
|
-
}
|
|
13985
|
-
function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
|
|
13986
|
-
if (value === null || value === void 0 || value === "") {
|
|
13987
|
-
fields.push({
|
|
13988
|
-
fieldPath,
|
|
13989
|
-
status: "missing",
|
|
13990
|
-
value: null,
|
|
13991
|
-
confidence: 0,
|
|
13992
|
-
note: "Field is null or empty in final extraction."
|
|
13993
|
-
});
|
|
13994
|
-
return;
|
|
13995
|
-
}
|
|
13996
|
-
const found = findEvidence(value, chunks);
|
|
13997
|
-
if (found) {
|
|
13998
|
-
fields.push({
|
|
13999
|
-
fieldPath,
|
|
14000
|
-
status: "found",
|
|
14001
|
-
value,
|
|
14002
|
-
confidence: .8,
|
|
14003
|
-
...found
|
|
14004
|
-
});
|
|
14005
|
-
return;
|
|
14006
|
-
}
|
|
14007
|
-
fields.push({
|
|
14008
|
-
fieldPath,
|
|
14009
|
-
status: "inferred",
|
|
14010
|
-
value,
|
|
14011
|
-
confidence: .35,
|
|
14012
|
-
note: "Final value was not found verbatim in the available source text."
|
|
14013
|
-
});
|
|
14014
|
-
}
|
|
14015
|
-
function sourceChunksFromText(text$1) {
|
|
14016
|
-
return text$1 ? [{
|
|
14017
|
-
text: text$1,
|
|
14018
|
-
chunkIndex: 0,
|
|
14019
|
-
headingPath: []
|
|
14020
|
-
}] : [];
|
|
14021
|
-
}
|
|
14022
|
-
function sourceChunksFromMarkdownChunks(chunks) {
|
|
14023
|
-
return chunks.map((chunk, index) => ({
|
|
14024
|
-
text: chunk.pageContent,
|
|
14025
|
-
chunkIndex: chunk.chunkIndex ?? index,
|
|
14026
|
-
headingPath: chunk.headingPath ?? []
|
|
14027
|
-
}));
|
|
14028
|
-
}
|
|
14029
|
-
function getPathParts(fieldPath) {
|
|
14030
|
-
return fieldPath.replace(FIELD_PATH_PREFIX_RE, "").split(".").filter(Boolean);
|
|
14031
|
-
}
|
|
14032
|
-
function getValueAtPath$1(data, fieldPath) {
|
|
14033
|
-
let current = data;
|
|
14034
|
-
for (const part of getPathParts(fieldPath)) {
|
|
14035
|
-
if (!isRecord(current)) return void 0;
|
|
14036
|
-
current = current[part];
|
|
14037
|
-
}
|
|
14038
|
-
return current;
|
|
14039
|
-
}
|
|
14040
|
-
function setValueAtPath(data, fieldPath, value) {
|
|
14041
|
-
const parts = getPathParts(fieldPath);
|
|
14042
|
-
let current = data;
|
|
14043
|
-
for (let i = 0; i < parts.length - 1; i++) {
|
|
14044
|
-
const part = parts[i];
|
|
14045
|
-
if (!isRecord(current[part])) current[part] = {};
|
|
14046
|
-
current = current[part];
|
|
14047
|
-
}
|
|
14048
|
-
current[parts[parts.length - 1]] = value;
|
|
14049
|
-
}
|
|
14050
|
-
function collectScalarFields(fields, fieldPath, property) {
|
|
14051
|
-
if (property.type === "object" && property.properties) {
|
|
14052
|
-
for (const [name$1, childProperty] of Object.entries(property.properties)) collectScalarFields(fields, `${fieldPath}.${name$1}`, childProperty);
|
|
14053
|
-
return;
|
|
14054
|
-
}
|
|
14055
|
-
if (property.type !== "array") fields.push({
|
|
14056
|
-
fieldPath,
|
|
14057
|
-
property
|
|
14058
|
-
});
|
|
14059
|
-
}
|
|
14060
|
-
function candidateScore(candidate) {
|
|
14061
|
-
return (candidate.status === "found" ? 100 : 0) + Math.round(candidate.confidence * 10) + candidate.chunkIndex;
|
|
14062
|
-
}
|
|
14063
|
-
function selectCandidatesForField(candidates) {
|
|
14064
|
-
if (candidates.length === 0) return null;
|
|
14065
|
-
candidates.sort((a, b) => candidateScore(b) - candidateScore(a));
|
|
14066
|
-
const selected = candidates[0];
|
|
14067
|
-
selected.selected = true;
|
|
14068
|
-
for (const candidate of candidates.slice(1)) {
|
|
14069
|
-
candidate.selected = false;
|
|
14070
|
-
candidate.rejectionReason = "Lower evidence score or earlier chunk position.";
|
|
14071
|
-
}
|
|
14072
|
-
const distinctValues = /* @__PURE__ */ new Map();
|
|
14073
|
-
for (const candidate of candidates) distinctValues.set(stableValueKey(candidate.value), candidate.value);
|
|
14074
|
-
if (distinctValues.size <= 1) return null;
|
|
14075
|
-
return {
|
|
14076
|
-
fieldPath: selected.fieldPath,
|
|
14077
|
-
selectedValue: selected.value,
|
|
14078
|
-
rejectedValues: candidates.slice(1).map((candidate) => candidate.value),
|
|
14079
|
-
candidates: [...candidates]
|
|
14080
|
-
};
|
|
14081
|
-
}
|
|
14082
|
-
function buildCandidateMergeReport(input) {
|
|
14083
|
-
const scalarFields = [];
|
|
14084
|
-
for (const [name$1, property] of Object.entries(input.schema.properties)) {
|
|
14085
|
-
if (property.primary && property.autoIncrement) continue;
|
|
14086
|
-
collectScalarFields(scalarFields, `$.${name$1}`, property);
|
|
14087
|
-
}
|
|
14088
|
-
const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
|
|
14089
|
-
const candidatesByPath = /* @__PURE__ */ new Map();
|
|
14090
|
-
for (const { fieldPath } of scalarFields) for (let chunkIndex = 0; chunkIndex < input.chunkResults.length; chunkIndex++) {
|
|
14091
|
-
const value = getValueAtPath$1(input.chunkResults[chunkIndex], fieldPath);
|
|
14092
|
-
if (!isMeaningfulValue(value)) continue;
|
|
14093
|
-
const sourceChunk = sourceChunks[chunkIndex] ?? {
|
|
14094
|
-
text: "",
|
|
14095
|
-
chunkIndex
|
|
14096
|
-
};
|
|
14097
|
-
const found = findEvidence(value, [sourceChunk]);
|
|
14098
|
-
const candidate = {
|
|
14099
|
-
fieldPath,
|
|
14100
|
-
value,
|
|
14101
|
-
chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
|
|
14102
|
-
headingPath: sourceChunk.headingPath,
|
|
14103
|
-
status: found ? "found" : "inferred",
|
|
14104
|
-
quote: found?.quote,
|
|
14105
|
-
confidence: found ? .85 : .35
|
|
14106
|
-
};
|
|
14107
|
-
const candidates = candidatesByPath.get(fieldPath) ?? [];
|
|
14108
|
-
candidates.push(candidate);
|
|
14109
|
-
candidatesByPath.set(fieldPath, candidates);
|
|
14110
|
-
}
|
|
14111
|
-
const allCandidates = [];
|
|
14112
|
-
const conflicts = [];
|
|
14113
|
-
for (const candidates of candidatesByPath.values()) {
|
|
14114
|
-
const conflict = selectCandidatesForField(candidates);
|
|
14115
|
-
allCandidates.push(...candidates);
|
|
14116
|
-
if (conflict) conflicts.push(conflict);
|
|
14117
|
-
}
|
|
14118
|
-
return {
|
|
14119
|
-
candidates: allCandidates,
|
|
14120
|
-
conflicts
|
|
14121
|
-
};
|
|
14122
|
-
}
|
|
14123
|
-
function applySelectedCandidates(data, report) {
|
|
14124
|
-
const merged = structuredClone(data);
|
|
14125
|
-
for (const candidate of report.candidates) if (candidate.selected) setValueAtPath(merged, candidate.fieldPath, candidate.value);
|
|
14126
|
-
return merged;
|
|
14127
|
-
}
|
|
14128
|
-
function buildExtractionEvidence(input) {
|
|
14129
|
-
const data = isRecord(input.data) ? input.data : {};
|
|
14130
|
-
const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
|
|
14131
|
-
const fields = [];
|
|
14132
|
-
for (const [name$1, property] of Object.entries(input.schema.properties)) {
|
|
14133
|
-
if (property.primary && property.autoIncrement) continue;
|
|
14134
|
-
addEvidenceForProperty(fields, `$.${name$1}`, property, data[name$1], chunks);
|
|
14135
|
-
}
|
|
14136
|
-
const inferredIssues = fields.filter((field) => field.status === "inferred").map((field) => ({
|
|
14137
|
-
fieldPath: field.fieldPath,
|
|
14138
|
-
message: field.note ?? "Field value lacks source evidence."
|
|
14139
|
-
}));
|
|
14140
|
-
const conflictIssues = (input.candidateReport?.conflicts ?? []).map((conflict) => ({
|
|
14141
|
-
fieldPath: conflict.fieldPath,
|
|
14142
|
-
message: "Multiple chunk candidates disagree for this field."
|
|
14143
|
-
}));
|
|
14144
|
-
const issues = [...inferredIssues, ...conflictIssues];
|
|
14145
|
-
return {
|
|
14146
|
-
coverage: {
|
|
14147
|
-
path: input.outputPath ? evidencePathForOutput(input.outputPath) : void 0,
|
|
14148
|
-
fieldCount: fields.length,
|
|
14149
|
-
evidenceCount: fields.filter((field) => field.status === "found").length,
|
|
14150
|
-
foundCount: fields.filter((field) => field.status === "found").length,
|
|
14151
|
-
missingCount: fields.filter((field) => field.status === "missing").length,
|
|
14152
|
-
inferredCount: fields.filter((field) => field.status === "inferred").length,
|
|
14153
|
-
conflictCount: input.candidateReport?.conflicts.length ?? 0,
|
|
14154
|
-
issueCount: issues.length
|
|
14155
|
-
},
|
|
14156
|
-
fields,
|
|
14157
|
-
candidates: input.candidateReport?.candidates,
|
|
14158
|
-
conflicts: input.candidateReport?.conflicts,
|
|
14159
|
-
issues
|
|
14160
|
-
};
|
|
14161
|
-
}
|
|
14162
|
-
function evidencePathForOutput(outputPath) {
|
|
14163
|
-
return outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
|
|
14164
|
-
}
|
|
14165
|
-
async function writeExtractionEvidence(input) {
|
|
14166
|
-
const report = buildExtractionEvidence(input);
|
|
14167
|
-
const evidencePath = evidencePathForOutput(input.outputPath);
|
|
14168
|
-
report.coverage.path = evidencePath;
|
|
14169
|
-
await writeFile(evidencePath, report, {
|
|
14170
|
-
spaces: 2,
|
|
14171
|
-
EOL: "\n"
|
|
14172
|
-
});
|
|
14173
|
-
return {
|
|
14174
|
-
...report.coverage,
|
|
14175
|
-
path: path.resolve(evidencePath)
|
|
14176
|
-
};
|
|
14177
|
-
}
|
|
14178
|
-
|
|
14179
13718
|
//#endregion
|
|
14180
13719
|
//#region src/core/notion-sink.ts
|
|
14181
13720
|
const RICH_TEXT_LIMIT = 2e3;
|
|
@@ -14461,66 +14000,16 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
|
|
|
14461
14000
|
}
|
|
14462
14001
|
}
|
|
14463
14002
|
|
|
14464
|
-
//#endregion
|
|
14465
|
-
//#region src/core/ai-extraction/transcriber.ts
|
|
14466
|
-
const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
|
|
14467
|
-
async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
|
|
14468
|
-
const provider = createOpenAICompatible({
|
|
14469
|
-
baseURL,
|
|
14470
|
-
name: "openai-compatible",
|
|
14471
|
-
apiKey
|
|
14472
|
-
});
|
|
14473
|
-
const buffer = await fs.readFile(imagePath);
|
|
14474
|
-
const effectiveTimeout = timeoutMs ?? 3e5;
|
|
14475
|
-
return {
|
|
14476
|
-
text: (await generateText({
|
|
14477
|
-
model: provider.chatModel(modelName),
|
|
14478
|
-
messages: [{
|
|
14479
|
-
role: "user",
|
|
14480
|
-
content: [{
|
|
14481
|
-
type: "text",
|
|
14482
|
-
text: TRANSCRIPTION_PROMPT
|
|
14483
|
-
}, {
|
|
14484
|
-
type: "image",
|
|
14485
|
-
image: buffer
|
|
14486
|
-
}]
|
|
14487
|
-
}],
|
|
14488
|
-
abortSignal: AbortSignal.timeout(effectiveTimeout)
|
|
14489
|
-
})).text,
|
|
14490
|
-
modelName
|
|
14491
|
-
};
|
|
14492
|
-
}
|
|
14493
|
-
|
|
14494
14003
|
//#endregion
|
|
14495
14004
|
//#region src/core/file-constants.ts
|
|
14496
14005
|
const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
|
|
14497
14006
|
const MAX_UPLOAD_SIZE_TEXT = "30MB";
|
|
14498
14007
|
const SUPPORTED_FILE_TYPES_TEXT = "images, PDF, text, markdown, CSV, JSON, HTML, XML, YAML";
|
|
14499
14008
|
const MISSING_UPLOAD_FILE_TEXT = t("errors.file.missingUpload");
|
|
14500
|
-
const SUPPORTED_MIME_TYPES = new Set([
|
|
14501
|
-
"image/png",
|
|
14502
|
-
"image/jpeg",
|
|
14503
|
-
"image/gif",
|
|
14504
|
-
"image/webp",
|
|
14505
|
-
"image/bmp",
|
|
14506
|
-
"image/svg+xml",
|
|
14507
|
-
"application/pdf",
|
|
14508
|
-
"text/plain",
|
|
14509
|
-
"text/markdown",
|
|
14510
|
-
"text/csv",
|
|
14511
|
-
"application/json",
|
|
14512
|
-
"text/html",
|
|
14513
|
-
"text/xml",
|
|
14514
|
-
"application/x-yaml",
|
|
14515
|
-
"text/yaml"
|
|
14516
|
-
]);
|
|
14517
14009
|
const MIME_TO_EXT = {
|
|
14518
14010
|
"image/png": "png",
|
|
14519
14011
|
"image/jpeg": "jpg",
|
|
14520
|
-
"image/gif": "gif",
|
|
14521
14012
|
"image/webp": "webp",
|
|
14522
|
-
"image/bmp": "bmp",
|
|
14523
|
-
"image/svg+xml": "svg",
|
|
14524
14013
|
"application/pdf": "pdf",
|
|
14525
14014
|
"text/plain": "txt",
|
|
14526
14015
|
"text/markdown": "md",
|
|
@@ -14537,8 +14026,8 @@ function bytesToMB(bytes) {
|
|
|
14537
14026
|
function getExtensionFromMime(mimeType) {
|
|
14538
14027
|
return MIME_TO_EXT[mimeType];
|
|
14539
14028
|
}
|
|
14540
|
-
function
|
|
14541
|
-
return
|
|
14029
|
+
function getExtensionForDetectedFile(mimeType) {
|
|
14030
|
+
return mimeType ? getExtensionFromMime(mimeType) ?? "txt" : "txt";
|
|
14542
14031
|
}
|
|
14543
14032
|
function unsupportedFileTypeMessage(mimeType) {
|
|
14544
14033
|
return t("errors.file.unsupportedType", {
|
|
@@ -14555,14 +14044,16 @@ var FileValidationError = class extends Error {
|
|
|
14555
14044
|
this.name = "FileValidationError";
|
|
14556
14045
|
}
|
|
14557
14046
|
};
|
|
14558
|
-
function
|
|
14047
|
+
async function validateFileUploadContent(file, buffer) {
|
|
14559
14048
|
if (file.size === 0) throw new FileValidationError(t("errors.file.empty"));
|
|
14560
14049
|
if (file.size > MAX_UPLOAD_SIZE) throw new FileValidationError(t("errors.file.sizeExceeded", {
|
|
14561
14050
|
size: bytesToMB(file.size).toFixed(1),
|
|
14562
14051
|
limit: MAX_UPLOAD_SIZE_TEXT,
|
|
14563
14052
|
file: file.name
|
|
14564
14053
|
}));
|
|
14565
|
-
|
|
14054
|
+
const detected = await detectInputBufferKind(buffer);
|
|
14055
|
+
if (detected.kind === "unsupported") throw new FileValidationError(unsupportedFileTypeMessage(detected.mime ?? (file.type || "application/octet-stream")));
|
|
14056
|
+
return detected.mime ?? "text/plain";
|
|
14566
14057
|
}
|
|
14567
14058
|
|
|
14568
14059
|
//#endregion
|
|
@@ -14845,39 +14336,56 @@ const FILE_PART_EXTENSIONS = new Set([
|
|
|
14845
14336
|
"png",
|
|
14846
14337
|
"jpg",
|
|
14847
14338
|
"jpeg",
|
|
14848
|
-
"
|
|
14849
|
-
"webp",
|
|
14850
|
-
"bmp",
|
|
14851
|
-
"svg"
|
|
14339
|
+
"webp"
|
|
14852
14340
|
]);
|
|
14853
14341
|
const PDF_EXT_RE = /\.pdf$/i;
|
|
14854
|
-
async function
|
|
14342
|
+
async function describeExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
14343
|
+
const detected = await detectInputFileKind(filePath);
|
|
14344
|
+
if (detected.kind === "image") return {
|
|
14345
|
+
kind: "image",
|
|
14346
|
+
mime: detected.mime,
|
|
14347
|
+
handler: shouldUseImageOcrFallback(aiConfig, modelOverride) ? "image_local_ocr" : "image_vision"
|
|
14348
|
+
};
|
|
14349
|
+
if (detected.kind === "pdf") {
|
|
14350
|
+
const converter = createPdfConverter(aiConfig?.pdf);
|
|
14351
|
+
return {
|
|
14352
|
+
kind: "pdf",
|
|
14353
|
+
mime: detected.mime,
|
|
14354
|
+
handler: "pdf_converter",
|
|
14355
|
+
converter: converter.name
|
|
14356
|
+
};
|
|
14357
|
+
}
|
|
14358
|
+
if (detected.kind === "text") return {
|
|
14359
|
+
kind: "text",
|
|
14360
|
+
mime: detected.mime,
|
|
14361
|
+
handler: "text"
|
|
14362
|
+
};
|
|
14363
|
+
throw new Error(unsupportedFileTypeMessage(detected.mime ?? "application/octet-stream"));
|
|
14364
|
+
}
|
|
14365
|
+
async function readExtractFileInput(filePath, aiConfig, modelOverride) {
|
|
14855
14366
|
const stat = fs$1.statSync(filePath);
|
|
14856
14367
|
if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
|
|
14857
14368
|
size: bytesToMB(stat.size).toFixed(1),
|
|
14858
14369
|
limit: MAX_UPLOAD_SIZE_TEXT,
|
|
14859
14370
|
file: filePath
|
|
14860
14371
|
}));
|
|
14861
|
-
const
|
|
14862
|
-
if (
|
|
14863
|
-
|
|
14864
|
-
|
|
14865
|
-
|
|
14866
|
-
|
|
14867
|
-
|
|
14868
|
-
|
|
14869
|
-
|
|
14870
|
-
consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
|
|
14871
|
-
return { text: result$1.text };
|
|
14872
|
-
} catch {
|
|
14873
|
-
consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
|
|
14874
|
-
}
|
|
14372
|
+
const inputProcessing = await describeExtractFileInput(filePath, aiConfig, modelOverride);
|
|
14373
|
+
if (inputProcessing.kind === "image") {
|
|
14374
|
+
if (inputProcessing.handler === "image_local_ocr") {
|
|
14375
|
+
const result = await recognizeImageText(filePath);
|
|
14376
|
+
consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
|
|
14377
|
+
return {
|
|
14378
|
+
text: result.text,
|
|
14379
|
+
inputProcessing
|
|
14380
|
+
};
|
|
14875
14381
|
}
|
|
14876
|
-
|
|
14877
|
-
|
|
14878
|
-
|
|
14382
|
+
return {
|
|
14383
|
+
text: "",
|
|
14384
|
+
filePath,
|
|
14385
|
+
inputProcessing
|
|
14386
|
+
};
|
|
14879
14387
|
}
|
|
14880
|
-
if (
|
|
14388
|
+
if (inputProcessing.kind === "pdf") {
|
|
14881
14389
|
const buffer = await fs.readFile(filePath);
|
|
14882
14390
|
const converter = createPdfConverter(aiConfig?.pdf);
|
|
14883
14391
|
const result = await converter.convert(buffer, filePath);
|
|
@@ -14895,9 +14403,16 @@ async function readExtractFileInput(filePath, aiConfig) {
|
|
|
14895
14403
|
await fs.writeFile(fallbackMd, result.text);
|
|
14896
14404
|
consola.info(t("command.extract.file.markdownSaved", { path: fallbackMd }));
|
|
14897
14405
|
}
|
|
14898
|
-
return {
|
|
14406
|
+
return {
|
|
14407
|
+
text: result.text,
|
|
14408
|
+
inputProcessing
|
|
14409
|
+
};
|
|
14899
14410
|
}
|
|
14900
|
-
|
|
14411
|
+
if (inputProcessing.kind === "text") return {
|
|
14412
|
+
text: await fs.readFile(filePath, "utf-8"),
|
|
14413
|
+
inputProcessing
|
|
14414
|
+
};
|
|
14415
|
+
throw new Error(unsupportedFileTypeMessage(inputProcessing.mime ?? "application/octet-stream"));
|
|
14901
14416
|
}
|
|
14902
14417
|
|
|
14903
14418
|
//#endregion
|
|
@@ -14996,21 +14511,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14996
14511
|
|
|
14997
14512
|
//#endregion
|
|
14998
14513
|
//#region src/core/extract-runner.ts
|
|
14999
|
-
const encoding = getEncoding("cl100k_base");
|
|
15000
14514
|
const JSON_EXT_RE$1 = /\.json$/;
|
|
15001
|
-
async function limitConcurrency(concurrency, items, fn) {
|
|
15002
|
-
const results = Array.from({ length: items.length });
|
|
15003
|
-
let nextIndex = 0;
|
|
15004
|
-
async function worker() {
|
|
15005
|
-
while (nextIndex < items.length) {
|
|
15006
|
-
const currentIndex = nextIndex++;
|
|
15007
|
-
results[currentIndex] = await fn(items[currentIndex], currentIndex);
|
|
15008
|
-
}
|
|
15009
|
-
}
|
|
15010
|
-
const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
|
|
15011
|
-
await Promise.all(workers);
|
|
15012
|
-
return results;
|
|
15013
|
-
}
|
|
15014
14515
|
async function ensureDatabaseReady(dbPath, schema) {
|
|
15015
14516
|
try {
|
|
15016
14517
|
await fs.access(dbPath);
|
|
@@ -15082,146 +14583,34 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15082
14583
|
}
|
|
15083
14584
|
const s = spinner();
|
|
15084
14585
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
15085
|
-
const
|
|
15086
|
-
|
|
15087
|
-
modelMaxTokens: modelOverride?.capabilities.maxTokens
|
|
15088
|
-
});
|
|
15089
|
-
const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
|
|
15090
|
-
const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
|
|
15091
|
-
if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
15092
|
-
length: totalTokens,
|
|
15093
|
-
limit: maxTokens
|
|
15094
|
-
}));
|
|
15095
|
-
const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
|
|
15096
|
-
pageContent: text$1 ?? "",
|
|
15097
|
-
metadata: {},
|
|
15098
|
-
chunkIndex: 0,
|
|
15099
|
-
totalChunks: 1,
|
|
15100
|
-
tokenCount: totalTokens,
|
|
15101
|
-
headingPath: [],
|
|
15102
|
-
charStart: 0,
|
|
15103
|
-
charEnd: text$1?.length ?? 0
|
|
15104
|
-
}];
|
|
15105
|
-
if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
|
|
15106
|
-
const chunkResults = Array.from({ length: processedDocs.length });
|
|
15107
|
-
const accumulatedTokens = {
|
|
15108
|
-
prompt: 0,
|
|
15109
|
-
completion: 0,
|
|
15110
|
-
total: 0
|
|
15111
|
-
};
|
|
15112
|
-
let success = true;
|
|
15113
|
-
let errorMsg = "";
|
|
15114
|
-
const extractionTasks = processedDocs.map((doc, i) => {
|
|
15115
|
-
return async () => {
|
|
15116
|
-
if (!success) return;
|
|
15117
|
-
const headings = doc.headingPath?.length ? doc.headingPath : [
|
|
15118
|
-
doc.metadata.h1,
|
|
15119
|
-
doc.metadata.h2,
|
|
15120
|
-
doc.metadata.h3,
|
|
15121
|
-
doc.metadata.h4
|
|
15122
|
-
].filter(Boolean);
|
|
15123
|
-
let chunkText = doc.pageContent;
|
|
15124
|
-
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
15125
|
-
const chunkResult = await extractStructuredData({
|
|
15126
|
-
config: aiConfig,
|
|
15127
|
-
schema: schemaLoad.schema,
|
|
15128
|
-
text: chunkText,
|
|
15129
|
-
aiexDir,
|
|
15130
|
-
modelOverride,
|
|
15131
|
-
onRetry(info) {
|
|
15132
|
-
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
15133
|
-
current: i + 1,
|
|
15134
|
-
total: processedDocs.length,
|
|
15135
|
-
code: info.statusCode,
|
|
15136
|
-
delay: info.delayMs / 1e3,
|
|
15137
|
-
attempt: info.attempt,
|
|
15138
|
-
max: info.maxRetries
|
|
15139
|
-
}));
|
|
15140
|
-
}
|
|
15141
|
-
});
|
|
15142
|
-
if (!chunkResult.success) {
|
|
15143
|
-
success = false;
|
|
15144
|
-
errorMsg = chunkResult.error || t("common.unknownError");
|
|
15145
|
-
if (!options?.quiet) {
|
|
15146
|
-
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
15147
|
-
consola.error(errorMsg);
|
|
15148
|
-
}
|
|
15149
|
-
return;
|
|
15150
|
-
}
|
|
15151
|
-
if (chunkResult.data) chunkResults[i] = chunkResult.data;
|
|
15152
|
-
if (chunkResult.tokensUsed) {
|
|
15153
|
-
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
15154
|
-
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
15155
|
-
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
15156
|
-
}
|
|
15157
|
-
};
|
|
15158
|
-
});
|
|
15159
|
-
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
15160
|
-
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
15161
|
-
current: 1,
|
|
15162
|
-
total: processedDocs.length
|
|
15163
|
-
}));
|
|
15164
|
-
try {
|
|
15165
|
-
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
15166
|
-
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
15167
|
-
current: idx + 1,
|
|
15168
|
-
total: processedDocs.length
|
|
15169
|
-
}));
|
|
15170
|
-
await task();
|
|
15171
|
-
});
|
|
15172
|
-
} catch (e) {
|
|
15173
|
-
success = false;
|
|
15174
|
-
errorMsg = e instanceof Error ? e.message : String(e);
|
|
15175
|
-
}
|
|
15176
|
-
if (!success) return {
|
|
15177
|
-
success: false,
|
|
15178
|
-
error: errorMsg
|
|
15179
|
-
};
|
|
15180
|
-
const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
|
|
15181
|
-
const candidateReport = buildCandidateMergeReport({
|
|
14586
|
+
const result = await extractStructuredData({
|
|
14587
|
+
config: aiConfig,
|
|
15182
14588
|
schema: schemaLoad.schema,
|
|
15183
|
-
|
|
15184
|
-
|
|
14589
|
+
text: text$1 ?? "",
|
|
14590
|
+
aiexDir,
|
|
14591
|
+
file: filePath,
|
|
14592
|
+
modelOverride,
|
|
14593
|
+
onRetry(info) {
|
|
14594
|
+
if (!options?.quiet) s.message(t("command.extract.file.extractRetry", {
|
|
14595
|
+
code: info.statusCode,
|
|
14596
|
+
delay: info.delayMs / 1e3,
|
|
14597
|
+
attempt: info.attempt,
|
|
14598
|
+
max: info.maxRetries
|
|
14599
|
+
}));
|
|
14600
|
+
}
|
|
15185
14601
|
});
|
|
15186
|
-
|
|
15187
|
-
const validation = validateExtractedData(schemaLoad.schema, mergedData);
|
|
15188
|
-
if (!validation.success) {
|
|
15189
|
-
const valError = validation.error || "Merged data validation failed";
|
|
14602
|
+
if (!result.success) {
|
|
15190
14603
|
if (!options?.quiet) {
|
|
15191
|
-
s.stop(t("command.extract.file.
|
|
15192
|
-
consola.error(
|
|
14604
|
+
s.stop(t("command.extract.file.extractFail"));
|
|
14605
|
+
consola.error(result.error || t("common.unknownError"));
|
|
15193
14606
|
}
|
|
15194
14607
|
return {
|
|
15195
14608
|
success: false,
|
|
15196
|
-
error:
|
|
14609
|
+
error: result.error || t("common.unknownError")
|
|
15197
14610
|
};
|
|
15198
14611
|
}
|
|
15199
|
-
const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
15200
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
15201
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
15202
|
-
const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
|
|
15203
|
-
const outputPath = path.join(outputDir, outputFileName);
|
|
15204
|
-
await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
|
|
15205
|
-
const result = {
|
|
15206
|
-
success: true,
|
|
15207
|
-
data: mergedData,
|
|
15208
|
-
tokensUsed: accumulatedTokens,
|
|
15209
|
-
outputPath,
|
|
15210
|
-
evidenceSummary: await writeExtractionEvidence({
|
|
15211
|
-
schema: schemaLoad.schema,
|
|
15212
|
-
data: mergedData,
|
|
15213
|
-
outputPath,
|
|
15214
|
-
chunks: processedDocs,
|
|
15215
|
-
candidateReport
|
|
15216
|
-
})
|
|
15217
|
-
};
|
|
15218
14612
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
15219
14613
|
if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
|
|
15220
|
-
if (result.evidenceSummary && !options?.quiet) {
|
|
15221
|
-
const summary = result.evidenceSummary;
|
|
15222
|
-
const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
|
|
15223
|
-
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
|
|
15224
|
-
}
|
|
15225
14614
|
if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
|
|
15226
14615
|
prompt: result.tokensUsed.prompt,
|
|
15227
14616
|
completion: result.tokensUsed.completion,
|
|
@@ -15250,7 +14639,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15250
14639
|
outputPath: result.outputPath,
|
|
15251
14640
|
data: result.data,
|
|
15252
14641
|
tablesInserted: insertResult.tablesInserted,
|
|
15253
|
-
evidenceSummary: result.evidenceSummary,
|
|
15254
14642
|
tokensUsed: result.tokensUsed
|
|
15255
14643
|
};
|
|
15256
14644
|
} else {
|
|
@@ -15277,10 +14665,13 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15277
14665
|
success: true,
|
|
15278
14666
|
outputPath: result.outputPath,
|
|
15279
14667
|
data: result.data,
|
|
15280
|
-
evidenceSummary: result.evidenceSummary,
|
|
15281
14668
|
tokensUsed: result.tokensUsed
|
|
15282
14669
|
};
|
|
15283
14670
|
}
|
|
14671
|
+
function formatInputProcessing$1(input) {
|
|
14672
|
+
const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
|
|
14673
|
+
return `${input.mime ?? input.kind} -> ${handler}`;
|
|
14674
|
+
}
|
|
15284
14675
|
async function runAuditedExtraction(options) {
|
|
15285
14676
|
const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
|
|
15286
14677
|
let fileHash;
|
|
@@ -15321,7 +14712,8 @@ async function runAuditedExtraction(options) {
|
|
|
15321
14712
|
outputName: existing.outputName,
|
|
15322
14713
|
tablesInserted: existing.tablesInserted,
|
|
15323
14714
|
notionPages: existing.notionPages,
|
|
15324
|
-
tokensUsed: existing.tokensUsed
|
|
14715
|
+
tokensUsed: existing.tokensUsed,
|
|
14716
|
+
inputProcessing: existing.inputProcessing
|
|
15325
14717
|
};
|
|
15326
14718
|
}
|
|
15327
14719
|
}
|
|
@@ -15342,9 +14734,17 @@ async function runAuditedExtraction(options) {
|
|
|
15342
14734
|
});
|
|
15343
14735
|
try {
|
|
15344
14736
|
let text$1 = "";
|
|
15345
|
-
|
|
15346
|
-
|
|
15347
|
-
|
|
14737
|
+
let filePath;
|
|
14738
|
+
let inputProcessing;
|
|
14739
|
+
if (source.type === "file") {
|
|
14740
|
+
const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
|
|
14741
|
+
text$1 = input.text;
|
|
14742
|
+
filePath = input.filePath;
|
|
14743
|
+
inputProcessing = input.inputProcessing;
|
|
14744
|
+
if (!quiet) consola.info(`Input: ${formatInputProcessing$1(inputProcessing)}`);
|
|
14745
|
+
await updateExtractionAuditRecord(aiexDir, audit.id, { inputProcessing });
|
|
14746
|
+
} else text$1 = source.text;
|
|
14747
|
+
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
|
|
15348
14748
|
quiet,
|
|
15349
14749
|
insert
|
|
15350
14750
|
});
|
|
@@ -15368,7 +14768,8 @@ async function runAuditedExtraction(options) {
|
|
|
15368
14768
|
success: false,
|
|
15369
14769
|
error: error instanceof Error ? error.message : String(error),
|
|
15370
14770
|
auditId: audit.id,
|
|
15371
|
-
fileHash
|
|
14771
|
+
fileHash,
|
|
14772
|
+
inputProcessing
|
|
15372
14773
|
};
|
|
15373
14774
|
}
|
|
15374
14775
|
const updated = await updateExtractionAuditRecord(aiexDir, audit.id, {
|
|
@@ -15386,10 +14787,10 @@ async function runAuditedExtraction(options) {
|
|
|
15386
14787
|
outputName: updated.outputName,
|
|
15387
14788
|
tablesInserted: updated.tablesInserted,
|
|
15388
14789
|
notionPages: updated.notionPages,
|
|
15389
|
-
evidenceSummary: r.evidenceSummary,
|
|
15390
14790
|
tokensUsed: updated.tokensUsed,
|
|
15391
14791
|
auditId: updated.id,
|
|
15392
|
-
fileHash
|
|
14792
|
+
fileHash,
|
|
14793
|
+
inputProcessing: updated.inputProcessing
|
|
15393
14794
|
};
|
|
15394
14795
|
} else {
|
|
15395
14796
|
await updateExtractionAuditRecord(aiexDir, audit.id, {
|
|
@@ -15402,7 +14803,8 @@ async function runAuditedExtraction(options) {
|
|
|
15402
14803
|
success: false,
|
|
15403
14804
|
error: r.error,
|
|
15404
14805
|
auditId: audit.id,
|
|
15405
|
-
fileHash
|
|
14806
|
+
fileHash,
|
|
14807
|
+
inputProcessing
|
|
15406
14808
|
};
|
|
15407
14809
|
}
|
|
15408
14810
|
} catch (e) {
|
|
@@ -15606,6 +15008,11 @@ function isExtractSubCommand(rawArgs) {
|
|
|
15606
15008
|
function formatSource(source) {
|
|
15607
15009
|
return source.type === "file" ? source.fileName || "file" : "unknown";
|
|
15608
15010
|
}
|
|
15011
|
+
function formatInputProcessing(input) {
|
|
15012
|
+
if (!input) return "";
|
|
15013
|
+
const handler = input.converter ? `${input.handler}(${input.converter})` : input.handler;
|
|
15014
|
+
return ` [${input.mime ?? input.kind} -> ${handler}]`;
|
|
15015
|
+
}
|
|
15609
15016
|
async function loadConfiguredAI(aiexDir) {
|
|
15610
15017
|
const aiConfig = await readAIConfig(aiexDir);
|
|
15611
15018
|
if (!aiConfig) {
|
|
@@ -15648,7 +15055,7 @@ const historyCommand = defineCommand({
|
|
|
15648
15055
|
}
|
|
15649
15056
|
for (const record of records) {
|
|
15650
15057
|
const suffix = record.error ? ` — ${record.error}` : record.outputName ? ` — ${record.outputName}` : "";
|
|
15651
|
-
consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${suffix}`);
|
|
15058
|
+
consola.info(`${record.status.padEnd(9)} ${record.id} ${record.schemaName} ${formatSource(record.source)}${formatInputProcessing(record.inputProcessing)}${suffix}`);
|
|
15652
15059
|
}
|
|
15653
15060
|
}
|
|
15654
15061
|
});
|
|
@@ -16161,10 +15568,7 @@ const SUPPORTED_EXTENSIONS = new Set([
|
|
|
16161
15568
|
"png",
|
|
16162
15569
|
"jpg",
|
|
16163
15570
|
"jpeg",
|
|
16164
|
-
"gif",
|
|
16165
15571
|
"webp",
|
|
16166
|
-
"bmp",
|
|
16167
|
-
"svg",
|
|
16168
15572
|
"pdf",
|
|
16169
15573
|
"txt",
|
|
16170
15574
|
"md",
|
|
@@ -16514,7 +15918,6 @@ function aiRoutes(config) {
|
|
|
16514
15918
|
//#endregion
|
|
16515
15919
|
//#region src/core/data-service.ts
|
|
16516
15920
|
const FILE_REGEX = /\.json$/;
|
|
16517
|
-
const EVIDENCE_FILE_SUFFIX = ".evidence.json";
|
|
16518
15921
|
const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
|
|
16519
15922
|
const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
|
|
16520
15923
|
const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
|
|
@@ -16530,24 +15933,6 @@ function getAuditNotionStatus(record) {
|
|
|
16530
15933
|
if (record.status === "failed") return "failed";
|
|
16531
15934
|
return "not_synced";
|
|
16532
15935
|
}
|
|
16533
|
-
async function readEvidenceSummary(extractedDir, outputName) {
|
|
16534
|
-
const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
|
|
16535
|
-
try {
|
|
16536
|
-
const coverage = (await readFile(evidencePath))?.coverage;
|
|
16537
|
-
if (!coverage || typeof coverage !== "object") return void 0;
|
|
16538
|
-
return {
|
|
16539
|
-
path: evidencePath,
|
|
16540
|
-
fieldCount: Number(coverage.fieldCount) || 0,
|
|
16541
|
-
evidenceCount: Number(coverage.evidenceCount) || 0,
|
|
16542
|
-
foundCount: Number(coverage.foundCount) || 0,
|
|
16543
|
-
missingCount: Number(coverage.missingCount) || 0,
|
|
16544
|
-
inferredCount: Number(coverage.inferredCount) || 0,
|
|
16545
|
-
issueCount: Number(coverage.issueCount) || 0
|
|
16546
|
-
};
|
|
16547
|
-
} catch {
|
|
16548
|
-
return;
|
|
16549
|
-
}
|
|
16550
|
-
}
|
|
16551
15936
|
async function getRowExtractionActions(aiexDir, tableName) {
|
|
16552
15937
|
const actions = /* @__PURE__ */ new Map();
|
|
16553
15938
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
@@ -16575,7 +15960,7 @@ async function listExtractions(config) {
|
|
|
16575
15960
|
const aiexDir = path.dirname(config.schemaPath);
|
|
16576
15961
|
const extractedDir = path.join(aiexDir, "extracted");
|
|
16577
15962
|
await fs.mkdir(extractedDir, { recursive: true });
|
|
16578
|
-
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md")
|
|
15963
|
+
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
|
|
16579
15964
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
16580
15965
|
const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
|
|
16581
15966
|
const records = [];
|
|
@@ -16594,10 +15979,10 @@ async function listExtractions(config) {
|
|
|
16594
15979
|
timestamp,
|
|
16595
15980
|
fileSize: stat.size,
|
|
16596
15981
|
modifiedAt: stat.mtime.toISOString(),
|
|
16597
|
-
evidenceSummary: await readEvidenceSummary(extractedDir, file),
|
|
16598
15982
|
notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
|
|
16599
15983
|
notionPages,
|
|
16600
|
-
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
|
|
15984
|
+
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0,
|
|
15985
|
+
inputProcessing: audit?.inputProcessing
|
|
16601
15986
|
});
|
|
16602
15987
|
} catch {
|
|
16603
15988
|
continue;
|
|
@@ -16774,7 +16159,6 @@ async function retryNotionSync(config, fileName) {
|
|
|
16774
16159
|
|
|
16775
16160
|
//#endregion
|
|
16776
16161
|
//#region src/server/routes/data.ts
|
|
16777
|
-
const JSON_FILE_SUFFIX_RE = /\.json$/;
|
|
16778
16162
|
const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
|
|
16779
16163
|
const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
|
|
16780
16164
|
const tableQuerySchema = z.object({
|
|
@@ -16827,22 +16211,10 @@ function dataRoutes(config) {
|
|
|
16827
16211
|
const filePath = path.join(extractedDir, name$1);
|
|
16828
16212
|
try {
|
|
16829
16213
|
const content = await fs.readFile(filePath, "utf-8");
|
|
16830
|
-
const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
|
|
16831
|
-
let evidenceSummary;
|
|
16832
|
-
try {
|
|
16833
|
-
const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
|
|
16834
|
-
evidenceSummary = evidence?.coverage ? {
|
|
16835
|
-
...evidence.coverage,
|
|
16836
|
-
path: evidencePath
|
|
16837
|
-
} : void 0;
|
|
16838
|
-
} catch {
|
|
16839
|
-
evidenceSummary = void 0;
|
|
16840
|
-
}
|
|
16841
16214
|
return c.json({
|
|
16842
16215
|
success: true,
|
|
16843
16216
|
content,
|
|
16844
|
-
name: name$1
|
|
16845
|
-
evidenceSummary
|
|
16217
|
+
name: name$1
|
|
16846
16218
|
});
|
|
16847
16219
|
} catch {
|
|
16848
16220
|
return c.json({ error: t("server.extractionNotFound") }, 404);
|
|
@@ -16881,10 +16253,9 @@ function getFormFile(value) {
|
|
|
16881
16253
|
function safeUploadName(name$1) {
|
|
16882
16254
|
return path.basename(name$1).replace(/[^\w.-]/g, "_") || "upload.txt";
|
|
16883
16255
|
}
|
|
16884
|
-
function safeUploadNameForMime(file) {
|
|
16256
|
+
function safeUploadNameForMime(file, mimeType) {
|
|
16885
16257
|
const safeName = safeUploadName(file.name);
|
|
16886
|
-
const ext =
|
|
16887
|
-
if (!ext) throw new FileValidationError(unsupportedFileTypeMessage(file.type));
|
|
16258
|
+
const ext = getExtensionForDetectedFile(mimeType);
|
|
16888
16259
|
return `${path.parse(safeName).name || "upload"}.${ext}`;
|
|
16889
16260
|
}
|
|
16890
16261
|
function jsonResponse(body, status) {
|
|
@@ -16894,10 +16265,10 @@ function jsonResponse(body, status) {
|
|
|
16894
16265
|
});
|
|
16895
16266
|
}
|
|
16896
16267
|
async function saveUploadToFile(file, uploadsDir, id) {
|
|
16897
|
-
validateFileUpload(file);
|
|
16898
|
-
await fs.mkdir(uploadsDir, { recursive: true });
|
|
16899
|
-
const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file)}`);
|
|
16900
16268
|
const buffer = Buffer.from(await file.arrayBuffer());
|
|
16269
|
+
const mimeType = await validateFileUploadContent(file, buffer);
|
|
16270
|
+
await fs.mkdir(uploadsDir, { recursive: true });
|
|
16271
|
+
const filePath = path.join(uploadsDir, `${id}-${safeUploadNameForMime(file, mimeType)}`);
|
|
16901
16272
|
await fs.writeFile(filePath, buffer);
|
|
16902
16273
|
return filePath;
|
|
16903
16274
|
}
|
|
@@ -16986,9 +16357,9 @@ function extractRoutes(config) {
|
|
|
16986
16357
|
outputName: result.outputName,
|
|
16987
16358
|
tablesInserted: result.tablesInserted,
|
|
16988
16359
|
notionPages: result.notionPages,
|
|
16989
|
-
evidenceSummary: result.evidenceSummary,
|
|
16990
16360
|
tokensUsed: result.tokensUsed,
|
|
16991
|
-
auditId: result.auditId
|
|
16361
|
+
auditId: result.auditId,
|
|
16362
|
+
inputProcessing: result.inputProcessing
|
|
16992
16363
|
}, 200);
|
|
16993
16364
|
} catch (error) {
|
|
16994
16365
|
if (isMissingUploadFileError(error)) return c.json({
|
|
@@ -17054,9 +16425,9 @@ function extractRoutes(config) {
|
|
|
17054
16425
|
outputName: result.outputName,
|
|
17055
16426
|
tablesInserted: result.tablesInserted,
|
|
17056
16427
|
notionPages: result.notionPages,
|
|
17057
|
-
evidenceSummary: result.evidenceSummary,
|
|
17058
16428
|
tokensUsed: result.tokensUsed,
|
|
17059
|
-
auditId: result.auditId
|
|
16429
|
+
auditId: result.auditId,
|
|
16430
|
+
inputProcessing: result.inputProcessing
|
|
17060
16431
|
}, 200);
|
|
17061
16432
|
});
|
|
17062
16433
|
app.delete("/extract/records/:id", async (c) => {
|