aiex-cli 0.0.5-beta.5 → 0.0.5-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-NTNBFeBw.mjs";
1
+ import { C as name, D as doctorDiagnosticsTableRows, O as formatDoctorDiagnosticsJson, S as description, T as version, _ as PLACEHOLDER_SCHEMA, a as parseJsonSchema, b as createConfig, c as recognizeImageText, d as getDefaultAIConfig, f as readAIConfig, g as DEFAULT_PROMPT_CONFIG, h as DEFAULT_MINERU_CONFIG, i as JsonSchemaDefinitionSchema, l as initI18n, m as DEFAULT_MINERU_API_CONFIG, n as createMigrationConfig, o as toSnakeCase, p as writeAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as t, v as PLACEHOLDER_TEXT, w as package_default, x as seedConfig, y as AIConfigSchema } from "./doctor-collector-BpqhXNcO.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -21,7 +21,6 @@ import { getEncoding } from "js-tiktoken";
21
21
  import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
22
22
  import { APICallError, Output, generateText, jsonSchema } from "ai";
23
23
  import pRetry from "p-retry";
24
- import mime from "mime";
25
24
  import { jsonrepair } from "jsonrepair";
26
25
  import { LangfuseSpanProcessor } from "@langfuse/otel";
27
26
  import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
@@ -12861,28 +12860,6 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
12861
12860
  });
12862
12861
  }
12863
12862
 
12864
- //#endregion
12865
- //#region src/core/ai-extraction/file-utils.ts
12866
- function detectMimeType(filePath) {
12867
- return mime.getType(filePath) ?? "application/octet-stream";
12868
- }
12869
- async function readFilePart(filePath) {
12870
- const mimeStr = detectMimeType(filePath);
12871
- const buffer = await fs.readFile(filePath);
12872
- const name$1 = path.basename(filePath);
12873
- if (mimeStr.startsWith("image/")) return {
12874
- type: "image",
12875
- image: buffer,
12876
- mimeType: mimeStr
12877
- };
12878
- return {
12879
- type: "file",
12880
- data: buffer,
12881
- mediaType: mimeStr,
12882
- filename: name$1
12883
- };
12884
- }
12885
-
12886
12863
  //#endregion
12887
12864
  //#region src/core/ai-extraction/json-utils.ts
12888
12865
  function parseJsonLike(text$1) {
@@ -12943,25 +12920,10 @@ function filterCompatible(models, inputTokens, outputTokens) {
12943
12920
  });
12944
12921
  }
12945
12922
  function selectModel(input) {
12946
- const { models, isImage, fileName, inputTokens, outputTokens } = input;
12923
+ const { models, inputTokens, outputTokens } = input;
12947
12924
  if (models.length === 0) throw new Error(t("errors.ai.noModels"));
12948
12925
  let candidates = filterCompatible(models, inputTokens, outputTokens);
12949
12926
  if (candidates.length === 0) candidates = models;
12950
- if (isImage) {
12951
- const visionModel = candidates.find((m) => m.capabilities.vision);
12952
- if (!visionModel) {
12953
- const hint = fileName ? ` (${fileName})` : "";
12954
- const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
12955
- tokens: inputTokens,
12956
- hint
12957
- }) : t("errors.ai.noVisionModel", { hint });
12958
- throw new Error(msg + t("errors.ai.addSuitableModel"));
12959
- }
12960
- return {
12961
- name: visionModel.name,
12962
- capabilities: visionModel.capabilities
12963
- };
12964
- }
12965
12927
  const soModel = candidates.find((m) => m.capabilities.structuredOutput);
12966
12928
  if (soModel) return {
12967
12929
  name: soModel.name,
@@ -12975,36 +12937,46 @@ function selectModel(input) {
12975
12937
 
12976
12938
  //#endregion
12977
12939
  //#region src/core/ai-extraction/prompt-generator.ts
12978
- function propertyToDescription(name$1, prop, indent = "") {
12940
+ const CAMEL_CASE_BOUNDARY_RE = /([a-z0-9])([A-Z])/g;
12941
+ const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
12942
+ function splitIdentifier(name$1) {
12943
+ return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
12944
+ }
12945
+ function propertyToDescription(name$1, prop, indent = "", required = false) {
12979
12946
  const lines = [];
12980
12947
  let typeStr = prop.type;
12981
12948
  if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
12982
- lines.push(`${indent}- ${name$1}: ${typeStr}`);
12949
+ lines.push(`${indent}- ${name$1}: ${typeStr}${required ? " (required)" : ""}`);
12950
+ const terms = splitIdentifier(name$1);
12951
+ if (terms.length > 1) lines.push(`${indent} search terms: ${terms.join(", ")}`);
12952
+ if (prop.description) lines.push(`${indent} description: ${prop.description}`);
12983
12953
  if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent} length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
12954
+ if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent} range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
12984
12955
  if (prop.format) lines.push(`${indent} format: ${prop.format}`);
12985
12956
  if (prop.unique) lines.push(`${indent} unique: true`);
12986
12957
  if (prop.default !== void 0) lines.push(`${indent} default: ${JSON.stringify(prop.default)}`);
12987
12958
  return lines.join("\n");
12988
12959
  }
12989
- function nestedPropertyToDescription(name$1, prop, indent = "") {
12960
+ function nestedPropertyToDescription(name$1, prop, indent = "", requiredFields = []) {
12990
12961
  const lines = [];
12962
+ const isRequired = requiredFields.includes(name$1);
12991
12963
  if (prop.nested?.enabled && prop.type === "object") {
12992
12964
  const relation = prop.nested.relation || "has-one";
12993
- lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
12994
- if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12965
+ lines.push(`${indent}- ${name$1}: object (related table, ${relation})${isRequired ? " (required)" : ""}`);
12966
+ if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
12995
12967
  return lines.join("\n");
12996
12968
  }
12997
12969
  if (prop.type === "array" && prop.items?.nested?.enabled) {
12998
12970
  const relation = prop.items.nested.relation || "has-many";
12999
- lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
13000
- if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12971
+ lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})${isRequired ? " (required)" : ""}`);
12972
+ if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
13001
12973
  return lines.join("\n");
13002
12974
  }
13003
- lines.push(propertyToDescription(name$1, prop, indent));
13004
- if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12975
+ lines.push(propertyToDescription(name$1, prop, indent, isRequired));
12976
+ if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
13005
12977
  if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
13006
12978
  lines.push(`${indent} item fields:`);
13007
- for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12979
+ for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
13008
12980
  }
13009
12981
  return lines.join("\n");
13010
12982
  }
@@ -13016,7 +12988,7 @@ function schemaToDescription(schema) {
13016
12988
  lines.push("Fields:");
13017
12989
  for (const [name$1, prop] of Object.entries(schema.properties)) {
13018
12990
  const property = prop;
13019
- lines.push(nestedPropertyToDescription(name$1, property));
12991
+ lines.push(nestedPropertyToDescription(name$1, property, "", schema.required ?? []));
13020
12992
  }
13021
12993
  if (schema.examples && schema.examples.length > 0) {
13022
12994
  lines.push("");
@@ -13061,33 +13033,6 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
13061
13033
  ].join("\n");
13062
13034
  }
13063
13035
 
13064
- //#endregion
13065
- //#region src/core/ai-extraction/snapshot.ts
13066
- const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
13067
- const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
13068
- async function loadPromptSnapshot(aiexDir, tableName) {
13069
- const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
13070
- try {
13071
- const content = await fs.readFile(snapshotPath, "utf-8");
13072
- const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
13073
- const userMatch = content.match(USER_PROMPT_REGEX);
13074
- if (systemMatch && userMatch) return {
13075
- system: systemMatch[1].trim(),
13076
- user: userMatch[1].trim()
13077
- };
13078
- } catch {}
13079
- return null;
13080
- }
13081
- async function savePromptSnapshot(schema, aiexDir) {
13082
- const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
13083
- const outputDir = path.join(aiexDir, "extracted");
13084
- await fs.mkdir(outputDir, { recursive: true });
13085
- const fileName = `${schema.table.name}.prompt.md`;
13086
- const outputPath = path.join(outputDir, fileName);
13087
- await fs.writeFile(outputPath, content);
13088
- return outputPath;
13089
- }
13090
-
13091
13036
  //#endregion
13092
13037
  //#region src/core/ai-extraction/telemetry.ts
13093
13038
  let langfuseInitialized = false;
@@ -13130,7 +13075,7 @@ function propertyToExtractionSchema(property) {
13130
13075
  }
13131
13076
  return { type: nullableType(property.type) };
13132
13077
  }
13133
- function isRecord$1(value) {
13078
+ function isRecord$2(value) {
13134
13079
  return typeof value === "object" && value !== null && !Array.isArray(value);
13135
13080
  }
13136
13081
  function schemaToExtractionOutputSchema(schema) {
@@ -13168,7 +13113,7 @@ function validatePropertyValue(path$1, property, value, issues) {
13168
13113
  }
13169
13114
  return;
13170
13115
  case "object":
13171
- if (!isRecord$1(value)) {
13116
+ if (!isRecord$2(value)) {
13172
13117
  issues.push(`${path$1}: expected object or null`);
13173
13118
  return;
13174
13119
  }
@@ -13191,7 +13136,7 @@ function validateProperties(basePath, properties, data, issues) {
13191
13136
  }
13192
13137
  }
13193
13138
  function validateExtractedData(schema, data) {
13194
- if (!isRecord$1(data)) return {
13139
+ if (!isRecord$2(data)) return {
13195
13140
  success: false,
13196
13141
  error: "Extracted data must be a JSON object."
13197
13142
  };
@@ -13208,13 +13153,11 @@ function validateExtractedData(schema, data) {
13208
13153
  //#region src/core/ai-extraction/extractor.ts
13209
13154
  const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
13210
13155
  async function extractStructuredData(input) {
13211
- const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
13156
+ const { config, schema, text: text$1, modelOverride } = input;
13212
13157
  if (!config.provider.apiKey) return {
13213
13158
  success: false,
13214
13159
  error: t("errors.ai.apiKeyMissing")
13215
13160
  };
13216
- const useFileContent = !!file;
13217
- const isImageFile = useFileContent && detectMimeType(file).startsWith("image/");
13218
13161
  const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
13219
13162
  const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
13220
13163
  const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
@@ -13222,8 +13165,6 @@ async function extractStructuredData(input) {
13222
13165
  try {
13223
13166
  selected = modelOverride ?? selectModel({
13224
13167
  models: config.provider.models,
13225
- isImage: isImageFile,
13226
- fileName: file,
13227
13168
  inputTokens,
13228
13169
  outputTokens
13229
13170
  });
@@ -13243,18 +13184,7 @@ async function extractStructuredData(input) {
13243
13184
  apiKey: config.provider.apiKey,
13244
13185
  supportsStructuredOutputs: useStructuredOutput
13245
13186
  });
13246
- let system;
13247
- let user;
13248
- const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
13249
- const promptText = file ? PLACEHOLDER_TEXT : text$1;
13250
- if (snapshot) {
13251
- system = snapshot.system;
13252
- user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
13253
- } else {
13254
- const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
13255
- system = generated.system;
13256
- user = generated.user;
13257
- }
13187
+ const { system, user } = generateExtractionPrompt(schema, text$1, config.prompt ?? DEFAULT_PROMPT_CONFIG);
13258
13188
  const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
13259
13189
  const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
13260
13190
  let systemPrompt = system;
@@ -13269,38 +13199,16 @@ async function extractStructuredData(input) {
13269
13199
  let parseError;
13270
13200
  let validationError;
13271
13201
  try {
13272
- if (useFileContent) {
13273
- const filePart = await readFilePart(file);
13274
- const fileName = filePart.type === "file" ? filePart.filename : path.basename(file);
13275
- const contentParts = [{
13276
- type: "text",
13277
- text: userPrompt.includes(PLACEHOLDER_TEXT) ? userPrompt.replaceAll(PLACEHOLDER_TEXT, text$1 || `Data is contained in the attached file: ${fileName}`) : userPrompt
13278
- }, filePart];
13279
- const fileOpts = {
13280
- model: provider.chatModel(selected.name),
13281
- system: systemPrompt,
13282
- messages: [{
13283
- role: "user",
13284
- content: contentParts
13285
- }],
13286
- abortSignal: AbortSignal.timeout(timeoutMs),
13287
- maxRetries: 0,
13288
- experimental_telemetry: { isEnabled: useTelemetry }
13289
- };
13290
- if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
13291
- result = await withRetry(() => generateText(fileOpts), input.onRetry);
13292
- } else {
13293
- const textOpts = {
13294
- model: provider.chatModel(selected.name),
13295
- system: systemPrompt,
13296
- prompt: userPrompt,
13297
- abortSignal: AbortSignal.timeout(timeoutMs),
13298
- maxRetries: 0,
13299
- experimental_telemetry: { isEnabled: useTelemetry }
13300
- };
13301
- if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13302
- result = await withRetry(() => generateText(textOpts), input.onRetry);
13303
- }
13202
+ const textOpts = {
13203
+ model: provider.chatModel(selected.name),
13204
+ system: systemPrompt,
13205
+ prompt: userPrompt,
13206
+ abortSignal: AbortSignal.timeout(timeoutMs),
13207
+ maxRetries: 0,
13208
+ experimental_telemetry: { isEnabled: useTelemetry }
13209
+ };
13210
+ if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13211
+ result = await withRetry(() => generateText(textOpts), input.onRetry);
13304
13212
  if (result.usage) {
13305
13213
  totalPromptTokens += result.usage.inputTokens ?? 0;
13306
13214
  totalCompletionTokens += result.usage.outputTokens ?? 0;
@@ -13316,27 +13224,16 @@ async function extractStructuredData(input) {
13316
13224
  }
13317
13225
  if (!parseError && data !== void 0) {
13318
13226
  const validation = validateExtractedData(schema, data);
13319
- if (validation.success) {
13320
- const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
13321
- await fs.mkdir(outputDir, { recursive: true });
13322
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
13323
- const outputFileName = `${schema.table.name}-${timestamp}.json`;
13324
- const outputPath = path.join(outputDir, outputFileName);
13325
- await writeFile(outputPath, data, {
13326
- spaces: 2,
13327
- EOL: "\n"
13328
- });
13329
- return {
13330
- success: true,
13331
- outputPath,
13332
- data,
13333
- tokensUsed: {
13334
- prompt: totalPromptTokens,
13335
- completion: totalCompletionTokens,
13336
- total: totalPromptTokens + totalCompletionTokens
13337
- }
13338
- };
13339
- } else validationError = validation.error;
13227
+ if (validation.success) return {
13228
+ success: true,
13229
+ data,
13230
+ tokensUsed: {
13231
+ prompt: totalPromptTokens,
13232
+ completion: totalCompletionTokens,
13233
+ total: totalPromptTokens + totalCompletionTokens
13234
+ }
13235
+ };
13236
+ else validationError = validation.error;
13340
13237
  }
13341
13238
  const errorMsg = parseError || validationError || "Unknown validation error";
13342
13239
  lastError = errorMsg;
@@ -13347,11 +13244,14 @@ async function extractStructuredData(input) {
13347
13244
  CRITICAL RULES:
13348
13245
  1. Only correct the fields that failed validation.
13349
13246
  2. Preserve all other correctly extracted fields and their values exactly.
13350
- 3. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
13247
+ 3. Use only values supported by the original text. If a value cannot be confirmed, set it to null.
13248
+ 4. Remove any fields not defined by the JSON Schema.
13249
+ 5. Normalize values to the expected JSON type without changing the intended meaning.
13250
+ 6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
13351
13251
  userPrompt = `The JSON data you generated previously failed validation. Please correct it.
13352
13252
 
13353
13253
  [Original Text]
13354
- ${text$1 || "Data is contained in the attached file."}
13254
+ ${text$1 || "Original text is empty."}
13355
13255
 
13356
13256
  [JSON Schema Definition]
13357
13257
  ${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
@@ -13362,6 +13262,11 @@ ${invalidJson}
13362
13262
  [Validation Error Details]
13363
13263
  ${errorMsg}
13364
13264
 
13265
+ Correction checklist:
13266
+ - Fix each field path mentioned in the validation error.
13267
+ - Keep schema-valid fields unchanged.
13268
+ - Do not invent missing facts; use null when the original text does not support a value.
13269
+
13365
13270
  Please output the corrected JSON object now:`;
13366
13271
  }
13367
13272
  }
@@ -13516,33 +13421,60 @@ function insertExtractedData(db, schema, data) {
13516
13421
 
13517
13422
  //#endregion
13518
13423
  //#region src/core/ai-extraction/json-merger.ts
13519
- function isRecord(value) {
13424
+ function isRecord$1(value) {
13520
13425
  return typeof value === "object" && value !== null && !Array.isArray(value);
13521
13426
  }
13427
+ function stableKey(value) {
13428
+ if (!isRecord$1(value)) return JSON.stringify(value);
13429
+ return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
13430
+ acc[key] = value[key];
13431
+ return acc;
13432
+ }, {}));
13433
+ }
13434
+ function isBlankString(value) {
13435
+ return typeof value === "string" && value.trim() === "";
13436
+ }
13437
+ function isPlaceholderString$1(value) {
13438
+ if (typeof value !== "string") return false;
13439
+ const normalized = value.trim().toLowerCase();
13440
+ return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
13441
+ }
13442
+ function pickPrimitiveValue(values) {
13443
+ const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
13444
+ if (meaningful.length === 0) return null;
13445
+ if (typeof meaningful[0] === "boolean") {
13446
+ const trueCount = meaningful.filter(Boolean).length;
13447
+ return trueCount >= meaningful.length - trueCount;
13448
+ }
13449
+ return meaningful[0];
13450
+ }
13522
13451
  function mergePropertyValue(property, values) {
13523
13452
  const nonNullValues = values.filter((v) => v !== null && v !== void 0);
13524
13453
  if (nonNullValues.length === 0) return null;
13525
13454
  if (property.type === "array") {
13526
13455
  const concatenated = [];
13527
- for (const val of nonNullValues) if (Array.isArray(val)) concatenated.push(...val);
13456
+ const seen = /* @__PURE__ */ new Set();
13457
+ for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
13458
+ const key = stableKey(item);
13459
+ if (!seen.has(key)) {
13460
+ seen.add(key);
13461
+ concatenated.push(item);
13462
+ }
13463
+ }
13528
13464
  return concatenated;
13529
13465
  }
13530
13466
  if (property.type === "object") {
13531
13467
  const childProperties = property.properties;
13532
13468
  if (!childProperties) {
13533
13469
  const mergedObj$1 = {};
13534
- for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
13470
+ for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
13535
13471
  return mergedObj$1;
13536
13472
  }
13537
13473
  const mergedObj = {};
13538
- for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
13474
+ for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
13539
13475
  return mergedObj;
13540
13476
  }
13541
- const bestValue = nonNullValues.find((v) => {
13542
- if (typeof v === "string") return v.trim() !== "";
13543
- return true;
13544
- });
13545
- return bestValue !== void 0 ? bestValue : null;
13477
+ return pickPrimitiveValue(nonNullValues);
13546
13478
  }
13547
13479
  /**
13548
13480
  * Merges structured extraction outputs from multiple document chunks
@@ -13559,12 +13491,39 @@ function mergeExtractionResults(schema, results) {
13559
13491
  return merged;
13560
13492
  }
13561
13493
 
13494
+ //#endregion
13495
+ //#region src/core/ai-extraction/snapshot.ts
13496
+ async function savePromptSnapshot(schema, aiexDir) {
13497
+ const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
13498
+ const outputDir = path.join(aiexDir, "extracted");
13499
+ await fs.mkdir(outputDir, { recursive: true });
13500
+ const fileName = `${schema.table.name}.prompt.md`;
13501
+ const outputPath = path.join(outputDir, fileName);
13502
+ await fs.writeFile(outputPath, content);
13503
+ return outputPath;
13504
+ }
13505
+
13562
13506
  //#endregion
13563
13507
  //#region src/core/ai-extraction/text-splitter.ts
13564
13508
  const encoding$1 = getEncoding("cl100k_base");
13509
+ const MAX_OVERLAP_RATIO = .15;
13510
+ const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
13511
+ const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
13512
+ const LEADING_TABLE_PIPE_RE = /^\|/;
13513
+ const TRAILING_TABLE_PIPE_RE = /\|$/;
13565
13514
  function countTokens(text$1) {
13566
13515
  return encoding$1.encode(text$1).length;
13567
13516
  }
13517
+ function calculateChunkTokenBudget(options = {}) {
13518
+ const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
13519
+ const modelMaxTokens = options.modelMaxTokens;
13520
+ if (!modelMaxTokens) return configuredMaxTokens;
13521
+ const outputReserveTokens = options.outputReserveTokens ?? 2e3;
13522
+ const promptReserveTokens = options.promptReserveTokens ?? 1200;
13523
+ const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
13524
+ const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
13525
+ return Math.max(512, Math.min(configuredMaxTokens, available));
13526
+ }
13568
13527
  function formatHeadingContext(headings) {
13569
13528
  const active = headings.filter(Boolean);
13570
13529
  if (active.length === 0) return "";
@@ -13578,6 +13537,71 @@ function getMetadata(headings) {
13578
13537
  h4: headings[3] || void 0
13579
13538
  };
13580
13539
  }
13540
+ function getHeadingPath(metadata) {
13541
+ return [
13542
+ metadata.h1,
13543
+ metadata.h2,
13544
+ metadata.h3,
13545
+ metadata.h4
13546
+ ].filter(Boolean);
13547
+ }
13548
+ function finalizeChunks(chunks, sourceText) {
13549
+ let searchStart = 0;
13550
+ const totalChunks = chunks.length;
13551
+ return chunks.map((chunk, index) => {
13552
+ const tokenCount = countTokens(chunk.pageContent);
13553
+ let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
13554
+ if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
13555
+ const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
13556
+ if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
13557
+ return {
13558
+ ...chunk,
13559
+ chunkIndex: index,
13560
+ totalChunks,
13561
+ tokenCount,
13562
+ headingPath: getHeadingPath(chunk.metadata),
13563
+ charStart: charStart >= 0 ? charStart : void 0,
13564
+ charEnd
13565
+ };
13566
+ });
13567
+ }
13568
+ function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
13569
+ return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
13570
+ }
13571
+ function splitMarkdownTable(tableText, maxTokens) {
13572
+ if (countTokens(tableText) <= maxTokens) return [tableText];
13573
+ const lines = tableText.split("\n");
13574
+ const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
13575
+ const separatorIndex = lines.findIndex((line, index) => {
13576
+ if (index <= headerIndex) return false;
13577
+ const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
13578
+ return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
13579
+ });
13580
+ if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
13581
+ const prefix = lines.slice(0, headerIndex);
13582
+ const header = lines[headerIndex];
13583
+ const separator = lines[separatorIndex];
13584
+ const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
13585
+ const chunks = [];
13586
+ let currentRows = [];
13587
+ const buildTable = (tableRows) => {
13588
+ return [
13589
+ ...prefix,
13590
+ header,
13591
+ separator,
13592
+ ...tableRows
13593
+ ].join("\n");
13594
+ };
13595
+ for (const row of rows) {
13596
+ const candidateRows = [...currentRows, row];
13597
+ if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
13598
+ chunks.push(buildTable(currentRows));
13599
+ currentRows = [row];
13600
+ } else currentRows = candidateRows;
13601
+ }
13602
+ if (currentRows.length > 0) chunks.push(buildTable(currentRows));
13603
+ return chunks.length > 0 ? chunks : [tableText];
13604
+ }
13581
13605
  /**
13582
13606
  * Splits text recursively using a list of separators.
13583
13607
  * Preserves the separators when re-joining.
@@ -13640,6 +13664,7 @@ function splitTextRecursively(text$1, maxTokens, separators = [
13640
13664
  function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13641
13665
  const tokens = marked.lexer(text$1);
13642
13666
  const chunks = [];
13667
+ const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
13643
13668
  let currentHeadings = [];
13644
13669
  let currentChunkList = [];
13645
13670
  let accumulatedTokens = 0;
@@ -13651,7 +13676,7 @@ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13651
13676
  pageContent,
13652
13677
  metadata: getMetadata(firstHeadings)
13653
13678
  });
13654
- if (isHeadingChange || overlapTokens <= 0) {
13679
+ if (isHeadingChange || effectiveOverlapTokens <= 0) {
13655
13680
  currentChunkList = [];
13656
13681
  accumulatedTokens = 0;
13657
13682
  } else {
@@ -13660,7 +13685,7 @@ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13660
13685
  for (let i = currentChunkList.length - 1; i >= 0; i--) {
13661
13686
  const item = currentChunkList[i];
13662
13687
  const itemTokens = countTokens(item.text);
13663
- if (currentOverlapTokens + itemTokens > overlapTokens && overlapItems.length > 0) break;
13688
+ if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
13664
13689
  overlapItems.unshift(item);
13665
13690
  currentOverlapTokens += itemTokens;
13666
13691
  }
@@ -13691,7 +13716,7 @@ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13691
13716
  }
13692
13717
  }
13693
13718
  flushCurrentChunk(true);
13694
- return chunks;
13719
+ return finalizeChunks(chunks, text$1);
13695
13720
  function processTextBlock(blockText, headings, isAtomic = false) {
13696
13721
  const blockTokens = countTokens(blockText);
13697
13722
  const contextTokens = countTokens(formatHeadingContext(headings));
@@ -13699,12 +13724,15 @@ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13699
13724
  const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
13700
13725
  if (blockTokens > budgetLimit) if (isAtomic) {
13701
13726
  flushCurrentChunk(false);
13702
- currentChunkList.push({
13703
- text: blockText,
13704
- headings: [...headings]
13705
- });
13706
- accumulatedTokens = blockTokens;
13707
- flushCurrentChunk(false);
13727
+ const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
13728
+ for (const block of atomicBlocks) {
13729
+ currentChunkList.push({
13730
+ text: block,
13731
+ headings: [...headings]
13732
+ });
13733
+ accumulatedTokens = countTokens(block);
13734
+ flushCurrentChunk(false);
13735
+ }
13708
13736
  } else {
13709
13737
  flushCurrentChunk(false);
13710
13738
  const subBlocks = splitTextRecursively(blockText, budgetLimit);
@@ -13878,6 +13906,276 @@ function getFileHash(filePath) {
13878
13906
  });
13879
13907
  }
13880
13908
 
13909
+ //#endregion
13910
+ //#region src/core/ai-extraction/evidence.ts
13911
+ const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
13912
+ const FIELD_PATH_PREFIX_RE = /^\$\./;
13913
/**
 * Reports whether a value is a non-null object that is not an array.
 *
 * @param {unknown} value - Value to classify.
 * @returns {boolean} `true` for objects other than `null` and arrays.
 */
function isRecord(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
13916
/**
 * Produces a lookup key for a value by serializing it with `JSON.stringify`.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string | undefined} The JSON text, or `undefined` when
 *   `JSON.stringify` yields no output (e.g. for `undefined`).
 */
function stableValueKey(value) {
	const serialized = JSON.stringify(value);
	return serialized;
}
13919
/**
 * Detects strings that stand in for "no value".
 *
 * The comparison ignores surrounding whitespace and letter case.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` only for strings equal to one of the known
 *   placeholders; non-strings are never placeholders.
 */
function isPlaceholderString(value) {
	if (typeof value !== "string") return false;
	const placeholders = [
		"",
		"n/a",
		"na",
		"none",
		"null",
		"unknown",
		"tbd",
		"-",
		"--"
	];
	return placeholders.includes(value.trim().toLowerCase());
}
13924
/**
 * Converts a primitive value to the text used when searching source chunks.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | null} The trimmed string (or `null` when it is blank),
 *   the `String()` form of a number or boolean, and `null` for everything
 *   else, including `null`, `undefined`, objects and arrays.
 */
function primitiveToText(value) {
	switch (typeof value) {
		case "string": {
			const trimmed = value.trim();
			return trimmed || null;
		}
		case "number":
		case "boolean":
			return String(value);
		default:
			return null;
	}
}
13930
/**
 * Decides whether a value carries real content.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `primitiveToText` yields text for the value
 *   and `isPlaceholderString` does not classify it as a placeholder.
 */
function isMeaningfulValue(value) {
	if (primitiveToText(value) === null) return false;
	return !isPlaceholderString(value);
}
13933
/**
 * Normalizes text for comparison: lower-cases it, collapses every run of
 * whitespace into a single space and strips leading/trailing whitespace.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeText(value) {
	const lowered = value.toLowerCase();
	const collapsed = lowered.replace(/\s+/g, " ");
	return collapsed.trim();
}
13936
/**
 * Cuts an excerpt around a match, keeping up to 80 characters of context on
 * each side, and flattens its whitespace to single spaces.
 *
 * @param {string} text$1 - Text the match was found in.
 * @param {number} start - Offset of the match within `text$1`.
 * @param {number} length - Length of the match.
 * @returns {string} The trimmed, single-line excerpt.
 */
function quoteAround(text$1, start, length) {
	const contextRadius = 80;
	const sliceStart = Math.max(0, start - contextRadius);
	const sliceEnd = Math.min(text$1.length, start + length + contextRadius);
	const excerpt = text$1.slice(sliceStart, sliceEnd);
	return excerpt.replace(/\s+/g, " ").trim();
}
13941
/**
 * Looks for a textual occurrence of an extracted value in the source chunks.
 *
 * A chunk qualifies when its normalized text (lower-cased, whitespace
 * collapsed) contains the normalized value. The first qualifying chunk wins.
 *
 * @param {unknown} value - Extracted value; only values that
 *   `primitiveToText` can turn into text are searchable.
 * @param {Array<{ text: string, chunkIndex?: number, headingPath?: string[] }>} chunks
 *   Source chunks to search, in order.
 * @returns {{ chunkIndex: number | undefined, headingPath: string[] | undefined, quote: string } | null}
 *   Location and excerpt of the first match, or `null` when nothing matches.
 */
function findEvidence(value, chunks) {
	const searchText = primitiveToText(value);
	if (!searchText) return null;
	const normalizedSearchText = normalizeText(searchText);
	if (!normalizedSearchText) return null;
	const loweredSearchText = searchText.toLowerCase();
	for (const chunk of chunks) {
		if (!normalizeText(chunk.text).includes(normalizedSearchText)) continue;
		let quoteIndex = chunk.text.toLowerCase().indexOf(loweredSearchText);
		let quoteLength = searchText.length;
		if (quoteIndex < 0) {
			// The chunk matched only after whitespace normalization (for example the
			// value is split across a line break). Locate it with a pattern that
			// accepts any whitespace run between words, so the quote is cut around
			// the real occurrence instead of the start of the chunk.
			const tolerantSource = normalizedSearchText.split(" ").map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("\\s+");
			const match = new RegExp(tolerantSource, "i").exec(chunk.text);
			if (match) {
				quoteIndex = match.index;
				quoteLength = match[0].length;
			} else {
				// Case folding differs between the regex and toLowerCase for a few
				// characters; keep the previous fallback rather than dropping the hit.
				quoteIndex = 0;
			}
		}
		return {
			chunkIndex: chunk.chunkIndex,
			headingPath: chunk.headingPath,
			quote: quoteAround(chunk.text, quoteIndex, quoteLength)
		};
	}
	return null;
}
13958
/**
 * Walks one schema property and appends evidence entries for every leaf
 * value beneath it.
 *
 * - Object properties with a `properties` map recurse into each child, using
 *   `path.child` paths; a non-record value is treated as an empty object.
 * - Array properties record a single "missing" entry when the value is not a
 *   non-empty array; otherwise each element is visited under `path[index]`,
 *   recursing when the item schema is an object with `properties`.
 * - Everything else is handed to `addPrimitiveEvidence`.
 *
 * @param {object[]} fields - Output list; entries are appended in place.
 * @param {string} path$1 - Field path of `property`.
 * @param {object} property - Schema definition of the property.
 * @param {unknown} value - Extracted value found at `path$1`.
 * @param {object[]} chunks - Source chunks used for evidence lookup.
 * @returns {void}
 */
function addEvidenceForProperty(fields, path$1, property, value, chunks) {
	const isObjectProperty = property.type === "object" && property.properties;
	if (isObjectProperty) {
		const source = isRecord(value) ? value : {};
		Object.entries(property.properties).forEach(([childName, childProperty]) => {
			addEvidenceForProperty(fields, `${path$1}.${childName}`, childProperty, source[childName], chunks);
		});
		return;
	}
	if (property.type !== "array") {
		addPrimitiveEvidence(fields, path$1, value, chunks);
		return;
	}
	const hasItems = Array.isArray(value) && value.length > 0;
	if (!hasItems) {
		fields.push({
			fieldPath: path$1,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Array field is empty or missing."
		});
		return;
	}
	value.forEach((item, index) => {
		const itemPath = `${path$1}[${index}]`;
		const itemSchema = property.items;
		if (!(itemSchema?.type === "object" && itemSchema.properties)) {
			addPrimitiveEvidence(fields, itemPath, item, chunks);
			return;
		}
		const source = isRecord(item) ? item : {};
		for (const [childName, childProperty] of Object.entries(itemSchema.properties)) {
			addEvidenceForProperty(fields, `${itemPath}.${childName}`, childProperty, source[childName], chunks);
		}
	});
}
13985
/**
 * Appends the evidence entry for a single leaf value.
 *
 * `null`, `undefined` and the empty string are recorded as "missing". Any
 * other value is looked up with `findEvidence`: a hit is recorded as "found"
 * (confidence 0.8, plus the hit's fields), a miss as "inferred"
 * (confidence 0.35).
 *
 * @param {object[]} fields - Output list; one entry is appended in place.
 * @param {string} fieldPath - Path of the value being described.
 * @param {unknown} value - Final extracted value.
 * @param {object[]} chunks - Source chunks searched for the value.
 * @returns {void}
 */
function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
	const isEmpty = value === null || value === void 0 || value === "";
	if (isEmpty) {
		fields.push({
			fieldPath,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Field is null or empty in final extraction."
		});
		return;
	}
	const evidence = findEvidence(value, chunks);
	const entry = evidence ? {
		fieldPath,
		status: "found",
		value,
		confidence: .8,
		...evidence
	} : {
		fieldPath,
		status: "inferred",
		value,
		confidence: .35,
		note: "Final value was not found verbatim in the available source text."
	};
	fields.push(entry);
}
14015
/**
 * Wraps a whole document in a one-element source-chunk list.
 *
 * @param {string} text$1 - Full source text.
 * @returns {Array<{ text: string, chunkIndex: number, headingPath: string[] }>}
 *   A single chunk with index 0 and no headings, or an empty list when the
 *   text is falsy.
 */
function sourceChunksFromText(text$1) {
	if (!text$1) return [];
	const wholeDocument = {
		text: text$1,
		chunkIndex: 0,
		headingPath: []
	};
	return [wholeDocument];
}
14022
/**
 * Converts markdown chunks into the source-chunk shape used for evidence
 * lookup.
 *
 * @param {Array<{ pageContent: string, chunkIndex?: number, headingPath?: string[] }>} chunks
 *   Chunks to convert.
 * @returns {Array<{ text: string, chunkIndex: number, headingPath: string[] }>}
 *   One entry per chunk; a missing `chunkIndex` falls back to the array
 *   position and a missing `headingPath` to an empty list.
 */
function sourceChunksFromMarkdownChunks(chunks) {
	const toSourceChunk = (chunk, position) => {
		const { pageContent, chunkIndex, headingPath } = chunk;
		return {
			text: pageContent,
			chunkIndex: chunkIndex ?? position,
			headingPath: headingPath ?? []
		};
	};
	return chunks.map(toSourceChunk);
}
14029
/**
 * Splits a field path into its property-name segments.
 *
 * The prefix matched by `FIELD_PATH_PREFIX_RE` is removed first, then the
 * remainder is split on "." and empty segments are discarded.
 *
 * @param {string} fieldPath - Path such as `$.a.b`.
 * @returns {string[]} The non-empty segments, in order.
 */
function getPathParts(fieldPath) {
	const withoutRoot = fieldPath.replace(FIELD_PATH_PREFIX_RE, "");
	return withoutRoot.split(".").filter((segment) => Boolean(segment));
}
14032
/**
 * Reads the value stored at a field path.
 *
 * @param {unknown} data - Root value to read from.
 * @param {string} fieldPath - Path understood by `getPathParts`.
 * @returns {unknown} The value at the path, or `undefined` as soon as a
 *   segment has to be read from something `isRecord` rejects.
 */
function getValueAtPath$1(data, fieldPath) {
	const segments = getPathParts(fieldPath);
	let cursor = data;
	for (let depth = 0; depth < segments.length; depth++) {
		if (!isRecord(cursor)) return void 0;
		cursor = cursor[segments[depth]];
	}
	return cursor;
}
14040
/**
 * Writes a value at a field path, creating intermediate objects as needed.
 *
 * Intermediate segments that do not currently hold a record are replaced by
 * a fresh empty object. The call is a no-op when the path has no segments or
 * contains a `__proto__` segment.
 *
 * @param {object} data - Root object; mutated in place.
 * @param {string} fieldPath - Path understood by `getPathParts`.
 * @param {unknown} value - Value to store at the final segment.
 * @returns {void}
 */
function setValueAtPath(data, fieldPath, value) {
	const parts = getPathParts(fieldPath);
	// An empty path would otherwise write a key literally named "undefined".
	if (parts.length === 0) return;
	// `obj["__proto__"]` resolves to Object.prototype, which passes isRecord, so
	// walking through it would write onto the prototype shared by every object.
	if (parts.includes("__proto__")) return;
	let current = data;
	for (const part of parts.slice(0, -1)) {
		if (!isRecord(current[part])) current[part] = {};
		current = current[part];
	}
	current[parts[parts.length - 1]] = value;
}
14050
/**
 * Collects every non-array leaf property reachable from a schema property.
 *
 * Object properties that declare `properties` are descended into, array
 * properties are skipped, and anything else is appended as
 * `{ fieldPath, property }`.
 *
 * @param {Array<{ fieldPath: string, property: object }>} fields - Output
 *   list; entries are appended in place.
 * @param {string} fieldPath - Path of `property`.
 * @param {object} property - Schema definition to inspect.
 * @returns {void}
 */
function collectScalarFields(fields, fieldPath, property) {
	const { type, properties } = property;
	if (type === "object" && properties) {
		Object.entries(properties).forEach(([name$1, childProperty]) => {
			collectScalarFields(fields, `${fieldPath}.${name$1}`, childProperty);
		});
		return;
	}
	if (type === "array") return;
	fields.push({
		fieldPath,
		property
	});
}
14060
/**
 * Ranks a candidate value; higher scores are preferred.
 *
 * The score is tiered so that each criterion only breaks ties of the one
 * before it: evidence status ("found" beats anything else), then confidence
 * rounded to tenths, then chunk position (later chunks win).
 *
 * @param {{ status: string, confidence: number, chunkIndex: number }} candidate
 *   Candidate to score.
 * @returns {number} The ranking score.
 */
function candidateScore(candidate) {
	const statusTier = candidate.status === "found" ? 1 : 0;
	const confidenceTier = Math.round(candidate.confidence * 10);
	// Cap the position so it can never spill into the confidence tier. Adding
	// the raw index to small status/confidence weights let a late "inferred"
	// candidate outrank an early "found" one in long documents.
	const position = Math.min(candidate.chunkIndex, 999999);
	return statusTier * 1e9 + confidenceTier * 1e6 + position;
}
14063
/**
 * Picks the winning candidate for one field and reports disagreements.
 *
 * The list is sorted in place by descending `candidateScore`. The first
 * entry is flagged `selected: true`; every other entry is flagged
 * `selected: false` and given a `rejectionReason`.
 *
 * @param {object[]} candidates - Candidates for a single field path; sorted
 *   and annotated in place.
 * @returns {{ fieldPath: string, selectedValue: unknown, rejectedValues: unknown[], candidates: object[] } | null}
 *   A conflict record when at least one candidate holds a value different
 *   from the selected one; `null` when the list is empty or all values agree.
 */
function selectCandidatesForField(candidates) {
	if (candidates.length === 0) return null;
	candidates.sort((a, b) => candidateScore(b) - candidateScore(a));
	const [selected, ...rejected] = candidates;
	selected.selected = true;
	for (const candidate of rejected) {
		candidate.selected = false;
		candidate.rejectionReason = "Lower evidence score or earlier chunk position.";
	}
	// Only values that actually differ from the winner count as rejected, and
	// each distinct value is listed once. Previously the list repeated values
	// and could contain the selected value itself.
	const selectedKey = stableValueKey(selected.value);
	const rejectedByKey = /* @__PURE__ */ new Map();
	for (const candidate of rejected) {
		const key = stableValueKey(candidate.value);
		if (key !== selectedKey && !rejectedByKey.has(key)) rejectedByKey.set(key, candidate.value);
	}
	if (rejectedByKey.size === 0) return null;
	return {
		fieldPath: selected.fieldPath,
		selectedValue: selected.value,
		rejectedValues: [...rejectedByKey.values()],
		candidates: [...candidates]
	};
}
14082
/**
 * Builds the per-field candidate report used to merge chunk-level results.
 *
 * For every scalar schema field (properties flagged both `primary` and
 * `autoIncrement` are skipped) each chunk result is inspected; meaningful
 * values become candidates annotated with the chunk they came from and
 * whether `findEvidence` located them in that chunk's text. Candidates are
 * then ranked per field by `selectCandidatesForField`.
 *
 * `chunkResults[i]` is paired with `chunks[i]`; when no chunk exists at that
 * position an empty-text placeholder is used instead.
 *
 * @param {{ schema: { properties: object }, chunkResults: unknown[], chunks: object[] }} input
 *   Schema, per-chunk extraction results and the chunks they were read from.
 * @returns {{ candidates: object[], conflicts: object[] }} All candidates
 *   (grouped by field, in ranked order) and the detected conflicts.
 */
function buildCandidateMergeReport(input) {
	const scalarFields = [];
	Object.entries(input.schema.properties).forEach(([name$1, property]) => {
		const isGeneratedKey = property.primary && property.autoIncrement;
		if (!isGeneratedKey) collectScalarFields(scalarFields, `$.${name$1}`, property);
	});
	const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
	const candidatesByPath = /* @__PURE__ */ new Map();
	for (const { fieldPath } of scalarFields) {
		for (const [chunkIndex, chunkResult] of input.chunkResults.entries()) {
			const value = getValueAtPath$1(chunkResult, fieldPath);
			if (!isMeaningfulValue(value)) continue;
			const sourceChunk = sourceChunks[chunkIndex] ?? {
				text: "",
				chunkIndex
			};
			const evidence = findEvidence(value, [sourceChunk]);
			if (!candidatesByPath.has(fieldPath)) candidatesByPath.set(fieldPath, []);
			candidatesByPath.get(fieldPath).push({
				fieldPath,
				value,
				chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
				headingPath: sourceChunk.headingPath,
				status: evidence ? "found" : "inferred",
				quote: evidence?.quote,
				confidence: evidence ? .85 : .35
			});
		}
	}
	const allCandidates = [];
	const conflicts = [];
	candidatesByPath.forEach((candidates) => {
		const conflict = selectCandidatesForField(candidates);
		allCandidates.push(...candidates);
		if (conflict) conflicts.push(conflict);
	});
	return {
		candidates: allCandidates,
		conflicts
	};
}
14123
/**
 * Returns a deep copy of `data` with every selected candidate written back
 * at its field path. The input object is left untouched.
 *
 * @param {object} data - Merged extraction result.
 * @param {{ candidates: Array<{ selected?: boolean, fieldPath: string, value: unknown }> }} report
 *   Candidate report whose `selected` entries are applied.
 * @returns {object} The patched copy.
 */
function applySelectedCandidates(data, report) {
	const merged = structuredClone(data);
	report.candidates.filter((candidate) => candidate.selected).forEach((candidate) => {
		setValueAtPath(merged, candidate.fieldPath, candidate.value);
	});
	return merged;
}
14128
/**
 * Builds the evidence report for a final extraction result.
 *
 * Every schema property (except those flagged both `primary` and
 * `autoIncrement`) is walked with `addEvidenceForProperty`. Source text comes
 * from `input.chunks` when provided, otherwise from `input.text`. Each
 * "inferred" field and each candidate conflict contributes one issue.
 *
 * @param {object} input
 * @param {{ properties: object }} input.schema - Schema describing the data.
 * @param {unknown} input.data - Final extraction; non-records count as empty.
 * @param {object[]} [input.chunks] - Markdown chunks the data came from.
 * @param {string} [input.text] - Whole source text, used without `chunks`.
 * @param {string} [input.outputPath] - Extraction output path; when set, the
 *   derived evidence path is stored in `coverage.path`.
 * @param {{ candidates?: object[], conflicts?: object[] }} [input.candidateReport]
 *   Optional candidate report; either list may be absent.
 * @returns {{ coverage: object, fields: object[], candidates: object[] | undefined, conflicts: object[] | undefined, issues: object[] }}
 *   Coverage counters, per-field evidence, the candidate report lists as
 *   given, and the collected issues.
 */
function buildExtractionEvidence(input) {
	const data = isRecord(input.data) ? input.data : {};
	const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
	const fields = [];
	for (const [name$1, property] of Object.entries(input.schema.properties)) {
		if (property.primary && property.autoIncrement) continue;
		addEvidenceForProperty(fields, `$.${name$1}`, property, data[name$1], chunks);
	}
	// Read the conflict list once so the issue list and the counter agree. The
	// counter used to dereference `conflicts.length` directly and threw when a
	// report carried no `conflicts` array.
	const conflicts = input.candidateReport?.conflicts;
	const inferredIssues = fields.filter((field) => field.status === "inferred").map((field) => ({
		fieldPath: field.fieldPath,
		message: field.note ?? "Field value lacks source evidence."
	}));
	const conflictIssues = (conflicts ?? []).map((conflict) => ({
		fieldPath: conflict.fieldPath,
		message: "Multiple chunk candidates disagree for this field."
	}));
	const issues = [...inferredIssues, ...conflictIssues];
	const countByStatus = (status) => fields.filter((field) => field.status === status).length;
	const foundCount = countByStatus("found");
	return {
		coverage: {
			path: input.outputPath ? evidencePathForOutput(input.outputPath) : void 0,
			fieldCount: fields.length,
			evidenceCount: foundCount,
			foundCount,
			missingCount: countByStatus("missing"),
			inferredCount: inferredIssues.length,
			conflictCount: conflicts?.length ?? 0,
			issueCount: issues.length
		},
		fields,
		candidates: input.candidateReport?.candidates,
		conflicts,
		issues
	};
}
14162
/**
 * Derives the evidence-report path that belongs to an extraction output file.
 *
 * A trailing `.json` (any letter case) is replaced by `.evidence.json`. When
 * the path has no such suffix, `.evidence.json` is appended instead, so the
 * result never equals the output path itself.
 *
 * @param {string} outputPath - Path of the extraction output file.
 * @returns {string} Path for the accompanying evidence report.
 */
function evidencePathForOutput(outputPath) {
	if (JSON_FILE_SUFFIX_RE$1.test(outputPath)) return outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
	// Returning the path unchanged would make the evidence writer overwrite the
	// extraction output.
	return `${outputPath}.evidence.json`;
}
14165
/**
 * Builds the evidence report for an extraction and writes it next to the
 * extraction output.
 *
 * The report is produced by `buildExtractionEvidence`, its `coverage.path`
 * is set to the evidence path derived from `input.outputPath`, and the
 * report is passed to `writeFile` with two-space / "\n" formatting options.
 * NOTE(review): `writeFile` receives the report object itself, so it is
 * presumably a JSON-serializing writer — confirm against its import.
 *
 * @param {object} input - Same input as `buildExtractionEvidence`;
 *   `outputPath` is required here.
 * @returns {Promise<object>} The coverage summary, with `path` resolved to an
 *   absolute path.
 */
async function writeExtractionEvidence(input) {
	const report = buildExtractionEvidence(input);
	const evidencePath = evidencePathForOutput(input.outputPath);
	report.coverage.path = evidencePath;
	const writeOptions = {
		spaces: 2,
		EOL: "\n"
	};
	await writeFile(evidencePath, report, writeOptions);
	const absolutePath = path.resolve(evidencePath);
	return {
		...report.coverage,
		path: absolutePath
	};
}
14178
+
13881
14179
  //#endregion
13882
14180
  //#region src/core/notion-sink.ts
13883
14181
  const RICH_TEXT_LIMIT = 2e3;
@@ -14163,6 +14461,36 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
14163
14461
  }
14164
14462
  }
14165
14463
 
14464
+ //#endregion
14465
+ //#region src/core/ai-extraction/transcriber.ts
14466
+ const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
14467
/**
 * Asks a chat model served through an OpenAI-compatible provider to
 * transcribe the text visible in an image file.
 *
 * The image is read from disk and sent, together with
 * `TRANSCRIPTION_PROMPT`, as a single user message. The request is aborted
 * once the timeout elapses.
 *
 * @param {string} imagePath - Path of the image to transcribe.
 * @param {string} baseURL - Base URL of the provider endpoint.
 * @param {string} apiKey - API key for the provider.
 * @param {string} modelName - Name of the chat model to call.
 * @param {number} [timeoutMs] - Request timeout in milliseconds; defaults to
 *   300000 when `null` or `undefined`.
 * @returns {Promise<{ text: string, modelName: string }>} The generated text
 *   and the model name that was requested.
 */
async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
	const provider = createOpenAICompatible({
		baseURL,
		name: "openai-compatible",
		apiKey
	});
	const buffer = await fs.readFile(imagePath);
	const effectiveTimeout = timeoutMs ?? 3e5;
	const promptPart = {
		type: "text",
		text: TRANSCRIPTION_PROMPT
	};
	const imagePart = {
		type: "image",
		image: buffer
	};
	const response = await generateText({
		model: provider.chatModel(modelName),
		messages: [{
			role: "user",
			content: [promptPart, imagePart]
		}],
		abortSignal: AbortSignal.timeout(effectiveTimeout)
	});
	return {
		text: response.text,
		modelName
	};
}
14493
+
14166
14494
  //#endregion
14167
14495
  //#region src/core/file-constants.ts
14168
14496
  const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
@@ -14496,14 +14824,6 @@ function createPdfConverter(config) {
14496
14824
  return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
14497
14825
  }
14498
14826
  if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
14499
- if (config.converter === "markitdown") {
14500
- const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
14501
- return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
14502
- }
14503
- if (config.converter === "marker") {
14504
- const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
14505
- return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
14506
- }
14507
14827
  if (config.converter === "external") {
14508
14828
  if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
14509
14829
  return new ExternalCommandPdfConverter("external", config.external);
@@ -14531,7 +14851,7 @@ const FILE_PART_EXTENSIONS = new Set([
14531
14851
  "svg"
14532
14852
  ]);
14533
14853
  const PDF_EXT_RE = /\.pdf$/i;
14534
- async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14854
+ async function readExtractFileInput(filePath, aiConfig) {
14535
14855
  const stat = fs$1.statSync(filePath);
14536
14856
  if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
14537
14857
  size: bytesToMB(stat.size).toFixed(1),
@@ -14540,15 +14860,22 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14540
14860
  }));
14541
14861
  const ext = path.extname(filePath).toLowerCase().replace(".", "");
14542
14862
  if (FILE_PART_EXTENSIONS.has(ext)) {
14543
- if (shouldUseImageOcrFallback(aiConfig, modelOverride)) {
14544
- const result = await recognizeImageText(filePath, aiConfig?.image);
14545
- consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14546
- return { text: result.text };
14863
+ const image = aiConfig?.image;
14864
+ if (image?.imageConversion === "vision" && image.imageModelName && aiConfig) {
14865
+ const baseURL = image.visionBaseURL || aiConfig.provider.baseURL;
14866
+ const apiKey = image.visionApiKey || aiConfig.provider.apiKey;
14867
+ const timeout = (aiConfig.provider.timeout ?? 300) * 1e3;
14868
+ try {
14869
+ const result$1 = await transcribeImageWithVision(filePath, baseURL, apiKey, image.imageModelName, timeout);
14870
+ consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
14871
+ return { text: result$1.text };
14872
+ } catch {
14873
+ consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
14874
+ }
14547
14875
  }
14548
- return {
14549
- text: "",
14550
- filePath
14551
- };
14876
+ const result = await recognizeImageText(filePath, aiConfig?.image);
14877
+ consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14878
+ return { text: result.text };
14552
14879
  }
14553
14880
  if (ext === "pdf") {
14554
14881
  const buffer = await fs.readFile(filePath);
@@ -14684,29 +15011,6 @@ async function limitConcurrency(concurrency, items, fn) {
14684
15011
  await Promise.all(workers);
14685
15012
  return results;
14686
15013
  }
14687
- function getSchemaKeywords(schema) {
14688
- const keywords = /* @__PURE__ */ new Set();
14689
- function walk(properties) {
14690
- if (!properties) return;
14691
- for (const [name$1, prop] of Object.entries(properties)) {
14692
- keywords.add(name$1.toLowerCase());
14693
- const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
14694
- for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
14695
- if (prop && typeof prop === "object") {
14696
- const p = prop;
14697
- if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
14698
- if (typeof p.description === "string") {
14699
- const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
14700
- for (const d of descParts) if (d.length > 2) keywords.add(d);
14701
- }
14702
- if (p.type === "object") walk(p.properties);
14703
- if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
14704
- }
14705
- }
14706
- }
14707
- walk(schema.properties);
14708
- return Array.from(keywords);
14709
- }
14710
15014
  async function ensureDatabaseReady(dbPath, schema) {
14711
15015
  try {
14712
15016
  await fs.access(dbPath);
@@ -14778,184 +15082,145 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14778
15082
  }
14779
15083
  const s = spinner();
14780
15084
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
14781
- const maxTokens = aiConfig.extraction?.maxTokens ?? 8e3;
15085
+ const maxTokens = calculateChunkTokenBudget({
15086
+ configuredMaxTokens: aiConfig.extraction?.maxTokens ?? 8e3,
15087
+ modelMaxTokens: modelOverride?.capabilities.maxTokens
15088
+ });
14782
15089
  const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
14783
- let result;
14784
15090
  const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
14785
- if (text$1 && totalTokens > maxTokens) {
14786
- if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
14787
- length: totalTokens,
14788
- limit: maxTokens
14789
- }));
14790
- const finalDocs = splitMarkdown(text$1, maxTokens, overlapTokens);
14791
- if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
14792
- let processedDocs = finalDocs;
14793
- if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
14794
- const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
14795
- const keywords = getSchemaKeywords(schemaLoad.schema);
14796
- const scoredChunks = finalDocs.map((doc, idx) => {
14797
- if (idx === 0) return {
14798
- index: idx,
14799
- score: Number.POSITIVE_INFINITY
14800
- };
14801
- let score = 0;
14802
- const docTextLower = doc.pageContent.toLowerCase();
14803
- for (const kw of keywords) {
14804
- let pos = docTextLower.indexOf(kw);
14805
- while (pos !== -1) {
14806
- score++;
14807
- pos = docTextLower.indexOf(kw, pos + kw.length);
14808
- }
14809
- }
14810
- return {
14811
- index: idx,
14812
- score
14813
- };
14814
- }).slice(1).sort((a, b) => b.score - a.score);
14815
- const selectedIndices = new Set([0]);
14816
- let keptCount = 0;
14817
- for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
14818
- selectedIndices.add(sc.index);
14819
- keptCount++;
14820
- }
14821
- processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
14822
- if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
14823
- original: finalDocs.length,
14824
- filtered: processedDocs.length
14825
- }));
14826
- }
14827
- const chunkResults = [];
14828
- const accumulatedTokens = {
14829
- prompt: 0,
14830
- completion: 0,
14831
- total: 0
14832
- };
14833
- let success = true;
14834
- let errorMsg = "";
14835
- const extractionTasks = processedDocs.map((doc, i) => {
14836
- return async () => {
14837
- if (!success) return;
14838
- const headings = [];
14839
- if (doc.metadata) {
14840
- if (doc.metadata.h1) headings.push(doc.metadata.h1);
14841
- if (doc.metadata.h2) headings.push(doc.metadata.h2);
14842
- if (doc.metadata.h3) headings.push(doc.metadata.h3);
14843
- if (doc.metadata.h4) headings.push(doc.metadata.h4);
14844
- }
14845
- let chunkText = doc.pageContent;
14846
- if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14847
- const chunkResult = await extractStructuredData({
14848
- config: aiConfig,
14849
- schema: schemaLoad.schema,
14850
- text: chunkText,
14851
- aiexDir,
14852
- modelOverride,
14853
- onRetry(info) {
14854
- if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14855
- current: i + 1,
14856
- total: processedDocs.length,
14857
- code: info.statusCode,
14858
- delay: info.delayMs / 1e3,
14859
- attempt: info.attempt,
14860
- max: info.maxRetries
14861
- }));
14862
- }
14863
- });
14864
- if (!chunkResult.success) {
14865
- success = false;
14866
- errorMsg = chunkResult.error || t("common.unknownError");
14867
- if (!options?.quiet) {
14868
- s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14869
- consola.error(errorMsg);
14870
- }
14871
- return;
14872
- }
14873
- if (chunkResult.data) chunkResults.push(chunkResult.data);
14874
- if (chunkResult.tokensUsed) {
14875
- accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14876
- accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14877
- accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
15091
+ if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
15092
+ length: totalTokens,
15093
+ limit: maxTokens
15094
+ }));
15095
+ const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
15096
+ pageContent: text$1 ?? "",
15097
+ metadata: {},
15098
+ chunkIndex: 0,
15099
+ totalChunks: 1,
15100
+ tokenCount: totalTokens,
15101
+ headingPath: [],
15102
+ charStart: 0,
15103
+ charEnd: text$1?.length ?? 0
15104
+ }];
15105
+ if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
15106
+ const chunkResults = Array.from({ length: processedDocs.length });
15107
+ const accumulatedTokens = {
15108
+ prompt: 0,
15109
+ completion: 0,
15110
+ total: 0
15111
+ };
15112
+ let success = true;
15113
+ let errorMsg = "";
15114
+ const extractionTasks = processedDocs.map((doc, i) => {
15115
+ return async () => {
15116
+ if (!success) return;
15117
+ const headings = doc.headingPath?.length ? doc.headingPath : [
15118
+ doc.metadata.h1,
15119
+ doc.metadata.h2,
15120
+ doc.metadata.h3,
15121
+ doc.metadata.h4
15122
+ ].filter(Boolean);
15123
+ let chunkText = doc.pageContent;
15124
+ if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
15125
+ const chunkResult = await extractStructuredData({
15126
+ config: aiConfig,
15127
+ schema: schemaLoad.schema,
15128
+ text: chunkText,
15129
+ aiexDir,
15130
+ modelOverride,
15131
+ onRetry(info) {
15132
+ if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
15133
+ current: i + 1,
15134
+ total: processedDocs.length,
15135
+ code: info.statusCode,
15136
+ delay: info.delayMs / 1e3,
15137
+ attempt: info.attempt,
15138
+ max: info.maxRetries
15139
+ }));
14878
15140
  }
14879
- };
14880
- });
14881
- const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
14882
- if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
14883
- current: 1,
14884
- total: processedDocs.length
14885
- }));
14886
- try {
14887
- await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
14888
- if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
14889
- current: idx + 1,
14890
- total: processedDocs.length
14891
- }));
14892
- await task();
14893
15141
  });
14894
- } catch (e) {
14895
- success = false;
14896
- errorMsg = e instanceof Error ? e.message : String(e);
14897
- }
14898
- if (!success) return {
14899
- success: false,
14900
- error: errorMsg
14901
- };
14902
- const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
14903
- const validation = validateExtractedData(schemaLoad.schema, mergedData);
14904
- if (!validation.success) {
14905
- const valError = validation.error || "Merged data validation failed";
14906
- if (!options?.quiet) {
14907
- s.stop(t("command.extract.file.validationFail"));
14908
- consola.error(valError);
15142
+ if (!chunkResult.success) {
15143
+ success = false;
15144
+ errorMsg = chunkResult.error || t("common.unknownError");
15145
+ if (!options?.quiet) {
15146
+ s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
15147
+ consola.error(errorMsg);
15148
+ }
15149
+ return;
15150
+ }
15151
+ if (chunkResult.data) chunkResults[i] = chunkResult.data;
15152
+ if (chunkResult.tokensUsed) {
15153
+ accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
15154
+ accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
15155
+ accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14909
15156
  }
14910
- return {
14911
- success: false,
14912
- error: valError
14913
- };
14914
- }
14915
- const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
14916
- await fs.mkdir(outputDir, { recursive: true });
14917
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
14918
- const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
14919
- const finalMergedOutputPath = path.join(outputDir, outputFileName);
14920
- await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
14921
- result = {
14922
- success: true,
14923
- data: mergedData,
14924
- tokensUsed: accumulatedTokens,
14925
- outputPath: finalMergedOutputPath
14926
15157
  };
14927
- } else result = await extractStructuredData({
14928
- config: aiConfig,
14929
- schema: schemaLoad.schema,
14930
- text: text$1 ?? "",
14931
- aiexDir,
14932
- file: filePath,
14933
- modelOverride,
14934
- onRetry(info) {
14935
- if (!options?.quiet) s.message(t("command.extract.file.extractRetry", {
14936
- code: info.statusCode,
14937
- delay: info.delayMs / 1e3,
14938
- attempt: info.attempt,
14939
- max: info.maxRetries
15158
+ });
15159
+ const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
15160
+ if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
15161
+ current: 1,
15162
+ total: processedDocs.length
15163
+ }));
15164
+ try {
15165
+ await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
15166
+ if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
15167
+ current: idx + 1,
15168
+ total: processedDocs.length
14940
15169
  }));
14941
- }
15170
+ await task();
15171
+ });
15172
+ } catch (e) {
15173
+ success = false;
15174
+ errorMsg = e instanceof Error ? e.message : String(e);
15175
+ }
15176
+ if (!success) return {
15177
+ success: false,
15178
+ error: errorMsg
15179
+ };
15180
+ const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
15181
+ const candidateReport = buildCandidateMergeReport({
15182
+ schema: schemaLoad.schema,
15183
+ chunkResults: successfulChunkResults,
15184
+ chunks: processedDocs
14942
15185
  });
14943
- if (!result.success) {
15186
+ const mergedData = applySelectedCandidates(mergeExtractionResults(schemaLoad.schema, successfulChunkResults), candidateReport);
15187
+ const validation = validateExtractedData(schemaLoad.schema, mergedData);
15188
+ if (!validation.success) {
15189
+ const valError = validation.error || "Merged data validation failed";
14944
15190
  if (!options?.quiet) {
14945
- s.stop(t("command.extract.file.extractFail"));
14946
- consola.error(result.error || t("common.unknownError"));
15191
+ s.stop(t("command.extract.file.validationFail"));
15192
+ consola.error(valError);
14947
15193
  }
14948
15194
  return {
14949
15195
  success: false,
14950
- error: result.error || t("common.unknownError")
15196
+ error: valError
14951
15197
  };
14952
15198
  }
15199
+ const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
15200
+ await fs.mkdir(outputDir, { recursive: true });
15201
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
15202
+ const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
15203
+ const outputPath = path.join(outputDir, outputFileName);
15204
+ await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
15205
+ const result = {
15206
+ success: true,
15207
+ data: mergedData,
15208
+ tokensUsed: accumulatedTokens,
15209
+ outputPath,
15210
+ evidenceSummary: await writeExtractionEvidence({
15211
+ schema: schemaLoad.schema,
15212
+ data: mergedData,
15213
+ outputPath,
15214
+ chunks: processedDocs,
15215
+ candidateReport
15216
+ })
15217
+ };
14953
15218
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
14954
15219
  if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
14955
15220
  if (result.evidenceSummary && !options?.quiet) {
14956
15221
  const summary = result.evidenceSummary;
14957
15222
  const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
14958
- consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
15223
+ consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
14959
15224
  }
14960
15225
  if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
14961
15226
  prompt: result.tokensUsed.prompt,
@@ -15077,13 +15342,9 @@ async function runAuditedExtraction(options) {
15077
15342
  });
15078
15343
  try {
15079
15344
  let text$1 = "";
15080
- let filePath;
15081
- if (source.type === "file") {
15082
- const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
15083
- text$1 = input.text;
15084
- filePath = input.filePath;
15085
- } else text$1 = source.text;
15086
- const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
15345
+ if (source.type === "file") text$1 = (await readExtractFileInput(source.filePath, aiConfig)).text;
15346
+ else text$1 = source.text;
15347
+ const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, source.type === "file" ? source.filePath : void 0, modelOverride, {
15087
15348
  quiet,
15088
15349
  insert
15089
15350
  });