aiex-cli 0.0.5-beta.4 → 0.0.5-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-Cv7RArla.mjs";
1
+ import { C as name, D as doctorDiagnosticsTableRows, O as formatDoctorDiagnosticsJson, S as description, T as version, _ as PLACEHOLDER_SCHEMA, a as parseJsonSchema, b as createConfig, c as recognizeImageText, d as getDefaultAIConfig, f as readAIConfig, g as DEFAULT_PROMPT_CONFIG, h as DEFAULT_MINERU_CONFIG, i as JsonSchemaDefinitionSchema, l as initI18n, m as DEFAULT_MINERU_API_CONFIG, n as createMigrationConfig, o as toSnakeCase, p as writeAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as t, v as PLACEHOLDER_TEXT, w as package_default, x as seedConfig, y as AIConfigSchema } from "./doctor-collector-BpqhXNcO.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -17,13 +17,14 @@ import Database from "better-sqlite3";
17
17
  import pc from "picocolors";
18
18
  import { Buffer } from "node:buffer";
19
19
  import * as XLSX from "xlsx";
20
+ import { getEncoding } from "js-tiktoken";
20
21
  import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
21
22
  import { APICallError, Output, generateText, jsonSchema } from "ai";
22
23
  import pRetry from "p-retry";
23
- import mime from "mime";
24
24
  import { jsonrepair } from "jsonrepair";
25
25
  import { LangfuseSpanProcessor } from "@langfuse/otel";
26
26
  import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
27
+ import { marked } from "marked";
27
28
  import crypto from "node:crypto";
28
29
  import { Client, extractNotionId } from "@notionhq/client";
29
30
  import { execa } from "execa";
@@ -12859,28 +12860,6 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
12859
12860
  });
12860
12861
  }
12861
12862
 
12862
- //#endregion
12863
- //#region src/core/ai-extraction/file-utils.ts
12864
- function detectMimeType(filePath) {
12865
- return mime.getType(filePath) ?? "application/octet-stream";
12866
- }
12867
- async function readFilePart(filePath) {
12868
- const mimeStr = detectMimeType(filePath);
12869
- const buffer = await fs.readFile(filePath);
12870
- const name$1 = path.basename(filePath);
12871
- if (mimeStr.startsWith("image/")) return {
12872
- type: "image",
12873
- image: buffer,
12874
- mimeType: mimeStr
12875
- };
12876
- return {
12877
- type: "file",
12878
- data: buffer,
12879
- mediaType: mimeStr,
12880
- filename: name$1
12881
- };
12882
- }
12883
-
12884
12863
  //#endregion
12885
12864
  //#region src/core/ai-extraction/json-utils.ts
12886
12865
  function parseJsonLike(text$1) {
@@ -12941,25 +12920,10 @@ function filterCompatible(models, inputTokens, outputTokens) {
12941
12920
  });
12942
12921
  }
12943
12922
  function selectModel(input) {
12944
- const { models, isImage, fileName, inputTokens, outputTokens } = input;
12923
+ const { models, inputTokens, outputTokens } = input;
12945
12924
  if (models.length === 0) throw new Error(t("errors.ai.noModels"));
12946
12925
  let candidates = filterCompatible(models, inputTokens, outputTokens);
12947
12926
  if (candidates.length === 0) candidates = models;
12948
- if (isImage) {
12949
- const visionModel = candidates.find((m) => m.capabilities.vision);
12950
- if (!visionModel) {
12951
- const hint = fileName ? ` (${fileName})` : "";
12952
- const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
12953
- tokens: inputTokens,
12954
- hint
12955
- }) : t("errors.ai.noVisionModel", { hint });
12956
- throw new Error(msg + t("errors.ai.addSuitableModel"));
12957
- }
12958
- return {
12959
- name: visionModel.name,
12960
- capabilities: visionModel.capabilities
12961
- };
12962
- }
12963
12927
  const soModel = candidates.find((m) => m.capabilities.structuredOutput);
12964
12928
  if (soModel) return {
12965
12929
  name: soModel.name,
@@ -12973,36 +12937,46 @@ function selectModel(input) {
12973
12937
 
12974
12938
  //#endregion
12975
12939
  //#region src/core/ai-extraction/prompt-generator.ts
12976
- function propertyToDescription(name$1, prop, indent = "") {
12940
+ const CAMEL_CASE_BOUNDARY_RE = /([a-z0-9])([A-Z])/g;
12941
+ const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
12942
+ function splitIdentifier(name$1) {
12943
+ return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
12944
+ }
12945
+ function propertyToDescription(name$1, prop, indent = "", required = false) {
12977
12946
  const lines = [];
12978
12947
  let typeStr = prop.type;
12979
12948
  if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
12980
- lines.push(`${indent}- ${name$1}: ${typeStr}`);
12949
+ lines.push(`${indent}- ${name$1}: ${typeStr}${required ? " (required)" : ""}`);
12950
+ const terms = splitIdentifier(name$1);
12951
+ if (terms.length > 1) lines.push(`${indent} search terms: ${terms.join(", ")}`);
12952
+ if (prop.description) lines.push(`${indent} description: ${prop.description}`);
12981
12953
  if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent} length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
12954
+ if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent} range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
12982
12955
  if (prop.format) lines.push(`${indent} format: ${prop.format}`);
12983
12956
  if (prop.unique) lines.push(`${indent} unique: true`);
12984
12957
  if (prop.default !== void 0) lines.push(`${indent} default: ${JSON.stringify(prop.default)}`);
12985
12958
  return lines.join("\n");
12986
12959
  }
12987
- function nestedPropertyToDescription(name$1, prop, indent = "") {
12960
+ function nestedPropertyToDescription(name$1, prop, indent = "", requiredFields = []) {
12988
12961
  const lines = [];
12962
+ const isRequired = requiredFields.includes(name$1);
12989
12963
  if (prop.nested?.enabled && prop.type === "object") {
12990
12964
  const relation = prop.nested.relation || "has-one";
12991
- lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
12992
- if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12965
+ lines.push(`${indent}- ${name$1}: object (related table, ${relation})${isRequired ? " (required)" : ""}`);
12966
+ if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
12993
12967
  return lines.join("\n");
12994
12968
  }
12995
12969
  if (prop.type === "array" && prop.items?.nested?.enabled) {
12996
12970
  const relation = prop.items.nested.relation || "has-many";
12997
- lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
12998
- if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12971
+ lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})${isRequired ? " (required)" : ""}`);
12972
+ if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
12999
12973
  return lines.join("\n");
13000
12974
  }
13001
- lines.push(propertyToDescription(name$1, prop, indent));
13002
- if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12975
+ lines.push(propertyToDescription(name$1, prop, indent, isRequired));
12976
+ if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
13003
12977
  if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
13004
12978
  lines.push(`${indent} item fields:`);
13005
- for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12979
+ for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
13006
12980
  }
13007
12981
  return lines.join("\n");
13008
12982
  }
@@ -13014,7 +12988,7 @@ function schemaToDescription(schema) {
13014
12988
  lines.push("Fields:");
13015
12989
  for (const [name$1, prop] of Object.entries(schema.properties)) {
13016
12990
  const property = prop;
13017
- lines.push(nestedPropertyToDescription(name$1, property));
12991
+ lines.push(nestedPropertyToDescription(name$1, property, "", schema.required ?? []));
13018
12992
  }
13019
12993
  if (schema.examples && schema.examples.length > 0) {
13020
12994
  lines.push("");
@@ -13059,33 +13033,6 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
13059
13033
  ].join("\n");
13060
13034
  }
13061
13035
 
13062
- //#endregion
13063
- //#region src/core/ai-extraction/snapshot.ts
13064
- const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
13065
- const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
13066
- async function loadPromptSnapshot(aiexDir, tableName) {
13067
- const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
13068
- try {
13069
- const content = await fs.readFile(snapshotPath, "utf-8");
13070
- const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
13071
- const userMatch = content.match(USER_PROMPT_REGEX);
13072
- if (systemMatch && userMatch) return {
13073
- system: systemMatch[1].trim(),
13074
- user: userMatch[1].trim()
13075
- };
13076
- } catch {}
13077
- return null;
13078
- }
13079
- async function savePromptSnapshot(schema, aiexDir) {
13080
- const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
13081
- const outputDir = path.join(aiexDir, "extracted");
13082
- await fs.mkdir(outputDir, { recursive: true });
13083
- const fileName = `${schema.table.name}.prompt.md`;
13084
- const outputPath = path.join(outputDir, fileName);
13085
- await fs.writeFile(outputPath, content);
13086
- return outputPath;
13087
- }
13088
-
13089
13036
  //#endregion
13090
13037
  //#region src/core/ai-extraction/telemetry.ts
13091
13038
  let langfuseInitialized = false;
@@ -13128,7 +13075,7 @@ function propertyToExtractionSchema(property) {
13128
13075
  }
13129
13076
  return { type: nullableType(property.type) };
13130
13077
  }
13131
- function isRecord$1(value) {
13078
+ function isRecord$2(value) {
13132
13079
  return typeof value === "object" && value !== null && !Array.isArray(value);
13133
13080
  }
13134
13081
  function schemaToExtractionOutputSchema(schema) {
@@ -13166,7 +13113,7 @@ function validatePropertyValue(path$1, property, value, issues) {
13166
13113
  }
13167
13114
  return;
13168
13115
  case "object":
13169
- if (!isRecord$1(value)) {
13116
+ if (!isRecord$2(value)) {
13170
13117
  issues.push(`${path$1}: expected object or null`);
13171
13118
  return;
13172
13119
  }
@@ -13189,7 +13136,7 @@ function validateProperties(basePath, properties, data, issues) {
13189
13136
  }
13190
13137
  }
13191
13138
  function validateExtractedData(schema, data) {
13192
- if (!isRecord$1(data)) return {
13139
+ if (!isRecord$2(data)) return {
13193
13140
  success: false,
13194
13141
  error: "Extracted data must be a JSON object."
13195
13142
  };
@@ -13206,13 +13153,11 @@ function validateExtractedData(schema, data) {
13206
13153
  //#region src/core/ai-extraction/extractor.ts
13207
13154
  const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
13208
13155
  async function extractStructuredData(input) {
13209
- const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
13156
+ const { config, schema, text: text$1, modelOverride } = input;
13210
13157
  if (!config.provider.apiKey) return {
13211
13158
  success: false,
13212
13159
  error: t("errors.ai.apiKeyMissing")
13213
13160
  };
13214
- const useFileContent = !!file;
13215
- const isImageFile = useFileContent && detectMimeType(file).startsWith("image/");
13216
13161
  const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
13217
13162
  const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
13218
13163
  const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
@@ -13220,8 +13165,6 @@ async function extractStructuredData(input) {
13220
13165
  try {
13221
13166
  selected = modelOverride ?? selectModel({
13222
13167
  models: config.provider.models,
13223
- isImage: isImageFile,
13224
- fileName: file,
13225
13168
  inputTokens,
13226
13169
  outputTokens
13227
13170
  });
@@ -13241,18 +13184,7 @@ async function extractStructuredData(input) {
13241
13184
  apiKey: config.provider.apiKey,
13242
13185
  supportsStructuredOutputs: useStructuredOutput
13243
13186
  });
13244
- let system;
13245
- let user;
13246
- const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
13247
- const promptText = file ? PLACEHOLDER_TEXT : text$1;
13248
- if (snapshot) {
13249
- system = snapshot.system;
13250
- user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
13251
- } else {
13252
- const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
13253
- system = generated.system;
13254
- user = generated.user;
13255
- }
13187
+ const { system, user } = generateExtractionPrompt(schema, text$1, config.prompt ?? DEFAULT_PROMPT_CONFIG);
13256
13188
  const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
13257
13189
  const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
13258
13190
  let systemPrompt = system;
@@ -13267,38 +13199,16 @@ async function extractStructuredData(input) {
13267
13199
  let parseError;
13268
13200
  let validationError;
13269
13201
  try {
13270
- if (useFileContent) {
13271
- const filePart = await readFilePart(file);
13272
- const fileName = filePart.type === "file" ? filePart.filename : path.basename(file);
13273
- const contentParts = [{
13274
- type: "text",
13275
- text: userPrompt.includes(PLACEHOLDER_TEXT) ? userPrompt.replaceAll(PLACEHOLDER_TEXT, text$1 || `Data is contained in the attached file: ${fileName}`) : userPrompt
13276
- }, filePart];
13277
- const fileOpts = {
13278
- model: provider.chatModel(selected.name),
13279
- system: systemPrompt,
13280
- messages: [{
13281
- role: "user",
13282
- content: contentParts
13283
- }],
13284
- abortSignal: AbortSignal.timeout(timeoutMs),
13285
- maxRetries: 0,
13286
- experimental_telemetry: { isEnabled: useTelemetry }
13287
- };
13288
- if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
13289
- result = await withRetry(() => generateText(fileOpts), input.onRetry);
13290
- } else {
13291
- const textOpts = {
13292
- model: provider.chatModel(selected.name),
13293
- system: systemPrompt,
13294
- prompt: userPrompt,
13295
- abortSignal: AbortSignal.timeout(timeoutMs),
13296
- maxRetries: 0,
13297
- experimental_telemetry: { isEnabled: useTelemetry }
13298
- };
13299
- if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13300
- result = await withRetry(() => generateText(textOpts), input.onRetry);
13301
- }
13202
+ const textOpts = {
13203
+ model: provider.chatModel(selected.name),
13204
+ system: systemPrompt,
13205
+ prompt: userPrompt,
13206
+ abortSignal: AbortSignal.timeout(timeoutMs),
13207
+ maxRetries: 0,
13208
+ experimental_telemetry: { isEnabled: useTelemetry }
13209
+ };
13210
+ if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13211
+ result = await withRetry(() => generateText(textOpts), input.onRetry);
13302
13212
  if (result.usage) {
13303
13213
  totalPromptTokens += result.usage.inputTokens ?? 0;
13304
13214
  totalCompletionTokens += result.usage.outputTokens ?? 0;
@@ -13314,27 +13224,16 @@ async function extractStructuredData(input) {
13314
13224
  }
13315
13225
  if (!parseError && data !== void 0) {
13316
13226
  const validation = validateExtractedData(schema, data);
13317
- if (validation.success) {
13318
- const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
13319
- await fs.mkdir(outputDir, { recursive: true });
13320
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
13321
- const outputFileName = `${schema.table.name}-${timestamp}.json`;
13322
- const outputPath = path.join(outputDir, outputFileName);
13323
- await writeFile(outputPath, data, {
13324
- spaces: 2,
13325
- EOL: "\n"
13326
- });
13327
- return {
13328
- success: true,
13329
- outputPath,
13330
- data,
13331
- tokensUsed: {
13332
- prompt: totalPromptTokens,
13333
- completion: totalCompletionTokens,
13334
- total: totalPromptTokens + totalCompletionTokens
13335
- }
13336
- };
13337
- } else validationError = validation.error;
13227
+ if (validation.success) return {
13228
+ success: true,
13229
+ data,
13230
+ tokensUsed: {
13231
+ prompt: totalPromptTokens,
13232
+ completion: totalCompletionTokens,
13233
+ total: totalPromptTokens + totalCompletionTokens
13234
+ }
13235
+ };
13236
+ else validationError = validation.error;
13338
13237
  }
13339
13238
  const errorMsg = parseError || validationError || "Unknown validation error";
13340
13239
  lastError = errorMsg;
@@ -13345,11 +13244,14 @@ async function extractStructuredData(input) {
13345
13244
  CRITICAL RULES:
13346
13245
  1. Only correct the fields that failed validation.
13347
13246
  2. Preserve all other correctly extracted fields and their values exactly.
13348
- 3. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
13247
+ 3. Use only values supported by the original text. If a value cannot be confirmed, set it to null.
13248
+ 4. Remove any fields not defined by the JSON Schema.
13249
+ 5. Normalize values to the expected JSON type without changing the intended meaning.
13250
+ 6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
13349
13251
  userPrompt = `The JSON data you generated previously failed validation. Please correct it.
13350
13252
 
13351
13253
  [Original Text]
13352
- ${text$1 || "Data is contained in the attached file."}
13254
+ ${text$1 || "Original text is empty."}
13353
13255
 
13354
13256
  [JSON Schema Definition]
13355
13257
  ${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
@@ -13360,6 +13262,11 @@ ${invalidJson}
13360
13262
  [Validation Error Details]
13361
13263
  ${errorMsg}
13362
13264
 
13265
+ Correction checklist:
13266
+ - Fix each field path mentioned in the validation error.
13267
+ - Keep schema-valid fields unchanged.
13268
+ - Do not invent missing facts; use null when the original text does not support a value.
13269
+
13363
13270
  Please output the corrected JSON object now:`;
13364
13271
  }
13365
13272
  }
@@ -13514,33 +13421,60 @@ function insertExtractedData(db, schema, data) {
13514
13421
 
13515
13422
  //#endregion
13516
13423
  //#region src/core/ai-extraction/json-merger.ts
13517
- function isRecord(value) {
13424
+ function isRecord$1(value) {
13518
13425
  return typeof value === "object" && value !== null && !Array.isArray(value);
13519
13426
  }
13427
+ function stableKey(value) {
13428
+ if (!isRecord$1(value)) return JSON.stringify(value);
13429
+ return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
13430
+ acc[key] = value[key];
13431
+ return acc;
13432
+ }, {}));
13433
+ }
13434
+ function isBlankString(value) {
13435
+ return typeof value === "string" && value.trim() === "";
13436
+ }
13437
+ function isPlaceholderString$1(value) {
13438
+ if (typeof value !== "string") return false;
13439
+ const normalized = value.trim().toLowerCase();
13440
+ return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
13441
+ }
13442
+ function pickPrimitiveValue(values) {
13443
+ const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
13444
+ if (meaningful.length === 0) return null;
13445
+ if (typeof meaningful[0] === "boolean") {
13446
+ const trueCount = meaningful.filter(Boolean).length;
13447
+ return trueCount >= meaningful.length - trueCount;
13448
+ }
13449
+ return meaningful[0];
13450
+ }
13520
13451
  function mergePropertyValue(property, values) {
13521
13452
  const nonNullValues = values.filter((v) => v !== null && v !== void 0);
13522
13453
  if (nonNullValues.length === 0) return null;
13523
13454
  if (property.type === "array") {
13524
13455
  const concatenated = [];
13525
- for (const val of nonNullValues) if (Array.isArray(val)) concatenated.push(...val);
13456
+ const seen = /* @__PURE__ */ new Set();
13457
+ for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
13458
+ const key = stableKey(item);
13459
+ if (!seen.has(key)) {
13460
+ seen.add(key);
13461
+ concatenated.push(item);
13462
+ }
13463
+ }
13526
13464
  return concatenated;
13527
13465
  }
13528
13466
  if (property.type === "object") {
13529
13467
  const childProperties = property.properties;
13530
13468
  if (!childProperties) {
13531
13469
  const mergedObj$1 = {};
13532
- for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
13470
+ for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
13533
13471
  return mergedObj$1;
13534
13472
  }
13535
13473
  const mergedObj = {};
13536
- for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
13474
+ for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
13537
13475
  return mergedObj;
13538
13476
  }
13539
- const bestValue = nonNullValues.find((v) => {
13540
- if (typeof v === "string") return v.trim() !== "";
13541
- return true;
13542
- });
13543
- return bestValue !== void 0 ? bestValue : null;
13477
+ return pickPrimitiveValue(nonNullValues);
13544
13478
  }
13545
13479
  /**
13546
13480
  * Merges structured extraction outputs from multiple document chunks
@@ -13557,114 +13491,269 @@ function mergeExtractionResults(schema, results) {
13557
13491
  return merged;
13558
13492
  }
13559
13493
 
13494
+ //#endregion
13495
+ //#region src/core/ai-extraction/snapshot.ts
13496
+ async function savePromptSnapshot(schema, aiexDir) {
13497
+ const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
13498
+ const outputDir = path.join(aiexDir, "extracted");
13499
+ await fs.mkdir(outputDir, { recursive: true });
13500
+ const fileName = `${schema.table.name}.prompt.md`;
13501
+ const outputPath = path.join(outputDir, fileName);
13502
+ await fs.writeFile(outputPath, content);
13503
+ return outputPath;
13504
+ }
13505
+
13560
13506
  //#endregion
13561
13507
  //#region src/core/ai-extraction/text-splitter.ts
13562
- const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
13563
- /**
13564
- * Splits a Markdown document into chunks based on header hierarchy.
13565
- * Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
13566
- * when a section exceeds the maxSize limit.
13567
- */
13568
- function splitMarkdown(text$1, maxSize = 4e4, overlapSize = 0) {
13569
- const lines = text$1.split("\n");
13570
- const chunks = [];
13571
- let currentHeadings = [];
13572
- let currentChunkLines = [];
13573
- let currentSize = 0;
13574
- let hasNewLines = false;
13575
- const getMetadata = (headings) => {
13508
+ const encoding$1 = getEncoding("cl100k_base");
13509
+ const MAX_OVERLAP_RATIO = .15;
13510
+ const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
13511
+ const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
13512
+ const LEADING_TABLE_PIPE_RE = /^\|/;
13513
+ const TRAILING_TABLE_PIPE_RE = /\|$/;
13514
+ function countTokens(text$1) {
13515
+ return encoding$1.encode(text$1).length;
13516
+ }
13517
+ function calculateChunkTokenBudget(options = {}) {
13518
+ const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
13519
+ const modelMaxTokens = options.modelMaxTokens;
13520
+ if (!modelMaxTokens) return configuredMaxTokens;
13521
+ const outputReserveTokens = options.outputReserveTokens ?? 2e3;
13522
+ const promptReserveTokens = options.promptReserveTokens ?? 1200;
13523
+ const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
13524
+ const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
13525
+ return Math.max(512, Math.min(configuredMaxTokens, available));
13526
+ }
13527
+ function formatHeadingContext(headings) {
13528
+ const active = headings.filter(Boolean);
13529
+ if (active.length === 0) return "";
13530
+ return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
13531
+ }
13532
+ function getMetadata(headings) {
13533
+ return {
13534
+ h1: headings[0] || void 0,
13535
+ h2: headings[1] || void 0,
13536
+ h3: headings[2] || void 0,
13537
+ h4: headings[3] || void 0
13538
+ };
13539
+ }
13540
+ function getHeadingPath(metadata) {
13541
+ return [
13542
+ metadata.h1,
13543
+ metadata.h2,
13544
+ metadata.h3,
13545
+ metadata.h4
13546
+ ].filter(Boolean);
13547
+ }
13548
+ function finalizeChunks(chunks, sourceText) {
13549
+ let searchStart = 0;
13550
+ const totalChunks = chunks.length;
13551
+ return chunks.map((chunk, index) => {
13552
+ const tokenCount = countTokens(chunk.pageContent);
13553
+ let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
13554
+ if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
13555
+ const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
13556
+ if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
13576
13557
  return {
13577
- h1: headings[0] || void 0,
13578
- h2: headings[1] || void 0,
13579
- h3: headings[2] || void 0,
13580
- h4: headings[3] || void 0
13558
+ ...chunk,
13559
+ chunkIndex: index,
13560
+ totalChunks,
13561
+ tokenCount,
13562
+ headingPath: getHeadingPath(chunk.metadata),
13563
+ charStart: charStart >= 0 ? charStart : void 0,
13564
+ charEnd
13581
13565
  };
13566
+ });
13567
+ }
13568
+ function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
13569
+ return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
13570
+ }
13571
+ function splitMarkdownTable(tableText, maxTokens) {
13572
+ if (countTokens(tableText) <= maxTokens) return [tableText];
13573
+ const lines = tableText.split("\n");
13574
+ const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
13575
+ const separatorIndex = lines.findIndex((line, index) => {
13576
+ if (index <= headerIndex) return false;
13577
+ const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
13578
+ return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
13579
+ });
13580
+ if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
13581
+ const prefix = lines.slice(0, headerIndex);
13582
+ const header = lines[headerIndex];
13583
+ const separator = lines[separatorIndex];
13584
+ const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
13585
+ const chunks = [];
13586
+ let currentRows = [];
13587
+ const buildTable = (tableRows) => {
13588
+ return [
13589
+ ...prefix,
13590
+ header,
13591
+ separator,
13592
+ ...tableRows
13593
+ ].join("\n");
13582
13594
  };
13583
- const flushChunk = (isHeadingChange = false) => {
13584
- if (currentChunkLines.length === 0 || !hasNewLines) {
13585
- currentChunkLines = [];
13586
- currentSize = 0;
13587
- hasNewLines = false;
13588
- return;
13589
- }
13590
- const pageContent = currentChunkLines.join("\n");
13591
- let lastChunkContent = "";
13592
- if (pageContent.length > maxSize) {
13593
- const paragraphs = pageContent.split("\n\n");
13594
- let subLines = [];
13595
- let subSize = 0;
13596
- for (const para of paragraphs) {
13597
- const paraSize = para.length;
13598
- if (subSize + paraSize > maxSize && subLines.length > 0) {
13599
- const content = subLines.join("\n\n");
13600
- chunks.push({
13601
- pageContent: content,
13602
- metadata: getMetadata(currentHeadings)
13603
- });
13604
- const overlapParas = [];
13605
- let currentOverlapSize = 0;
13606
- for (let j = subLines.length - 1; j >= 0; j--) {
13607
- const p = subLines[j];
13608
- if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
13609
- overlapParas.unshift(p);
13610
- currentOverlapSize += p.length + 2;
13611
- }
13612
- subLines = [...overlapParas];
13613
- subSize = currentOverlapSize;
13614
- }
13615
- subLines.push(para);
13616
- subSize += paraSize + 2;
13595
+ for (const row of rows) {
13596
+ const candidateRows = [...currentRows, row];
13597
+ if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
13598
+ chunks.push(buildTable(currentRows));
13599
+ currentRows = [row];
13600
+ } else currentRows = candidateRows;
13601
+ }
13602
+ if (currentRows.length > 0) chunks.push(buildTable(currentRows));
13603
+ return chunks.length > 0 ? chunks : [tableText];
13604
+ }
13605
/**
 * Splits text into pieces that aim to stay within `maxTokens`, trying each
 * separator in order and falling back to per-character splitting once the
 * separator list is exhausted.
 *
 * Separators stay attached to the end of the piece that preceded them, so
 * joining the returned pieces reproduces the input text.
 *
 * @param {string} text$1 - Text to split.
 * @param {number} maxTokens - Token limit per piece, measured with `countTokens`.
 * @param {string[]} [separators] - Separators to try, coarsest first.
 * @returns {string[]} Pieces in document order; never contains empty strings
 *   produced by the per-character fallback.
 */
function splitTextRecursively(text$1, maxTokens, separators = [
	"\n\n",
	"\n",
	"。",
	". ",
	" "
]) {
	if (countTokens(text$1) <= maxTokens) return [text$1];
	if (separators.length === 0) {
		const chunks = [];
		let current = "";
		// A single character cannot be split further, so it is always accepted
		// into an empty buffer; this avoids emitting "" when one character alone
		// is already over the limit.
		for (const char of text$1) if (current && countTokens(current + char) > maxTokens) {
			chunks.push(current);
			current = char;
		} else current += char;
		if (current) chunks.push(current);
		return chunks;
	}
	const separator = separators[0];
	const nextSeparators = separators.slice(1);
	const parts = text$1.split(separator);
	const result = [];
	let currentChunk = [];
	let currentChunkTokens = 0;
	for (let i = 0; i < parts.length; i++) {
		const part = parts[i];
		// Every part except the last gets its separator back.
		const itemText = part + (i < parts.length - 1 ? separator : "");
		const partTokens = countTokens(itemText);
		if (partTokens > maxTokens) {
			// Flush what has been gathered so far, then split the oversized part
			// with the finer separators.
			if (currentChunk.length > 0) {
				result.push(currentChunk.join(""));
				currentChunk = [];
				currentChunkTokens = 0;
			}
			const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
			for (let j = 0; j < subParts.length; j++) {
				const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
				result.push(finalSub);
			}
		} else if (currentChunkTokens + partTokens > maxTokens) {
			result.push(currentChunk.join(""));
			currentChunk = [itemText];
			currentChunkTokens = partTokens;
		} else {
			currentChunk.push(itemText);
			currentChunkTokens += partTokens;
		}
	}
	if (currentChunk.length > 0) result.push(currentChunk.join(""));
	return result;
}
13660
+ /**
13661
+ * Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
13662
+ * Protects tables, list items, and code blocks from being broken.
13663
+ */
13664
+ function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13665
+ const tokens = marked.lexer(text$1);
13666
+ const chunks = [];
13667
+ const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
13668
+ let currentHeadings = [];
13669
+ let currentChunkList = [];
13670
+ let accumulatedTokens = 0;
13671
+ const flushCurrentChunk = (isHeadingChange = false) => {
13672
+ if (currentChunkList.length === 0) return;
13673
+ const pageContent = currentChunkList.map((item) => item.text).join("");
13674
+ const firstHeadings = currentChunkList[0].headings;
13675
+ chunks.push({
13676
+ pageContent,
13677
+ metadata: getMetadata(firstHeadings)
13678
+ });
13679
+ if (isHeadingChange || effectiveOverlapTokens <= 0) {
13680
+ currentChunkList = [];
13681
+ accumulatedTokens = 0;
13646
13682
  } else {
13647
- currentChunkLines = [];
13648
- currentSize = 0;
13683
+ const overlapItems = [];
13684
+ let currentOverlapTokens = 0;
13685
+ for (let i = currentChunkList.length - 1; i >= 0; i--) {
13686
+ const item = currentChunkList[i];
13687
+ const itemTokens = countTokens(item.text);
13688
+ if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
13689
+ overlapItems.unshift(item);
13690
+ currentOverlapTokens += itemTokens;
13691
+ }
13692
+ currentChunkList = [...overlapItems];
13693
+ accumulatedTokens = currentOverlapTokens;
13649
13694
  }
13650
- hasNewLines = false;
13651
13695
  };
13652
- for (const line of lines) {
13653
- const headingMatch = line.match(HEADING_RE);
13654
- if (headingMatch) {
13655
- flushChunk(true);
13656
- const depth = headingMatch[1].length;
13657
- const title = headingMatch[2].trim();
13696
+ for (const token of tokens) {
13697
+ if (token.type === "space") {
13698
+ if (currentChunkList.length > 0) {
13699
+ currentChunkList[currentChunkList.length - 1].text += token.raw;
13700
+ accumulatedTokens += countTokens(token.raw);
13701
+ }
13702
+ continue;
13703
+ }
13704
+ if (token.type === "heading") {
13705
+ flushCurrentChunk(true);
13706
+ const depth = token.depth;
13707
+ const title = token.text.trim();
13658
13708
  currentHeadings = currentHeadings.slice(0, depth - 1);
13659
13709
  currentHeadings[depth - 1] = title;
13660
13710
  }
13661
- currentChunkLines.push(line);
13662
- currentSize += line.length + 1;
13663
- hasNewLines = true;
13664
- if (currentSize > maxSize) flushChunk(false);
13711
+ const rawText = token.raw;
13712
+ if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
13713
+ else {
13714
+ const isAtomic = token.type === "table" || token.type === "code";
13715
+ processTextBlock(rawText, currentHeadings, isAtomic);
13716
+ }
13717
+ }
13718
+ flushCurrentChunk(true);
13719
+ return finalizeChunks(chunks, text$1);
13720
+ function processTextBlock(blockText, headings, isAtomic = false) {
13721
+ const blockTokens = countTokens(blockText);
13722
+ const contextTokens = countTokens(formatHeadingContext(headings));
13723
+ const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
13724
+ const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
13725
+ if (blockTokens > budgetLimit) if (isAtomic) {
13726
+ flushCurrentChunk(false);
13727
+ const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
13728
+ for (const block of atomicBlocks) {
13729
+ currentChunkList.push({
13730
+ text: block,
13731
+ headings: [...headings]
13732
+ });
13733
+ accumulatedTokens = countTokens(block);
13734
+ flushCurrentChunk(false);
13735
+ }
13736
+ } else {
13737
+ flushCurrentChunk(false);
13738
+ const subBlocks = splitTextRecursively(blockText, budgetLimit);
13739
+ for (const sub of subBlocks) {
13740
+ currentChunkList.push({
13741
+ text: sub,
13742
+ headings: [...headings]
13743
+ });
13744
+ accumulatedTokens += countTokens(sub);
13745
+ if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
13746
+ }
13747
+ }
13748
+ else {
13749
+ if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
13750
+ currentChunkList.push({
13751
+ text: blockText,
13752
+ headings: [...headings]
13753
+ });
13754
+ accumulatedTokens += blockTokens;
13755
+ }
13665
13756
  }
13666
- flushChunk(true);
13667
- return chunks;
13668
13757
  }
13669
13758
 
13670
13759
  //#endregion
@@ -13817,6 +13906,276 @@ function getFileHash(filePath) {
13817
13906
  });
13818
13907
  }
13819
13908
 
13909
+ //#endregion
13910
+ //#region src/core/ai-extraction/evidence.ts
13911
+ const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
13912
+ const FIELD_PATH_PREFIX_RE = /^\$\./;
13913
/**
 * Tells whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True for objects other than `null` and arrays.
 */
function isRecord(value) {
	if (value === null || Array.isArray(value)) return false;
	return typeof value === "object";
}
13916
/**
 * Serializes a value into a string key for de-duplicating candidate values.
 *
 * Plain-object keys are sorted at every nesting level so that two objects
 * with the same content but different key insertion order get the same key.
 * Primitives and arrays serialize exactly as `JSON.stringify` would.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string | undefined} JSON text, or `undefined` when the value is
 *   not JSON-serializable (same as `JSON.stringify`).
 */
function stableValueKey(value) {
	return JSON.stringify(value, (_key, nested) => {
		if (typeof nested !== "object" || nested === null || Array.isArray(nested)) return nested;
		// Object.fromEntries defines own properties, so a "__proto__" key is
		// copied as data rather than changing the prototype.
		return Object.fromEntries(Object.keys(nested).sort().map((key) => [key, nested[key]]));
	});
}
13919
/**
 * Tells whether a value is a string that only stands in for "no data".
 *
 * The comparison ignores surrounding whitespace and letter case.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True for empty strings and known placeholder words.
 */
function isPlaceholderString(value) {
	if (typeof value !== "string") return false;
	const placeholders = [
		"",
		"n/a",
		"na",
		"none",
		"null",
		"unknown",
		"tbd",
		"-",
		"--"
	];
	return placeholders.includes(value.trim().toLowerCase());
}
13924
/**
 * Converts a primitive value to the text used when searching for evidence.
 *
 * @param {unknown} value - Value to convert.
 * @returns {string | null} Trimmed string, or the string form of a number or
 *   boolean; `null` for blank strings and every other kind of value.
 */
function primitiveToText(value) {
	switch (typeof value) {
		case "string": {
			const trimmed = value.trim();
			return trimmed === "" ? null : trimmed;
		}
		case "number":
		case "boolean": return String(value);
		default: return null;
	}
}
13930
/**
 * Tells whether a value carries real content: it must convert to text via
 * `primitiveToText` and must not be a placeholder string.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True when the value is usable as an extraction candidate.
 */
function isMeaningfulValue(value) {
	if (primitiveToText(value) === null) return false;
	return !isPlaceholderString(value);
}
13933
/**
 * Normalizes text for comparison: lowercases it, collapses every whitespace
 * run to a single space, and trims both ends.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} Normalized text.
 */
function normalizeText(value) {
	const lowered = value.toLowerCase();
	const collapsed = lowered.replace(/\s+/g, " ");
	return collapsed.trim();
}
13936
/**
 * Cuts a quote out of `text$1` around a match, with up to 80 characters of
 * context on each side, whitespace runs collapsed and both ends trimmed.
 *
 * @param {string} text$1 - Source text.
 * @param {number} start - Index where the match begins.
 * @param {number} length - Length of the match.
 * @returns {string} The quote.
 */
function quoteAround(text$1, start, length) {
	const contextChars = 80;
	const from = Math.max(0, start - contextChars);
	const to = Math.min(text$1.length, start + length + contextChars);
	const excerpt = text$1.slice(from, to);
	return excerpt.replace(/\s+/g, " ").trim();
}
13941
/**
 * Looks for a value's text in the source chunks and returns the first hit.
 *
 * A chunk matches when its normalized text (lowercased, whitespace collapsed)
 * contains the normalized search text. The quote is taken around the match
 * position in the raw chunk text. If the value only matches after whitespace
 * normalization (for example it spans a line break in the source), the raw
 * position is located with a whitespace-tolerant pattern, so the quote is not
 * taken from the start of the chunk.
 *
 * @param {unknown} value - Extracted value; non-primitive values never match.
 * @param {Array<{text: string, chunkIndex: number, headingPath?: string[]}>} chunks
 *   Source chunks to search, in order.
 * @returns {{chunkIndex: number, headingPath: (string[]|undefined), quote: string} | null}
 *   Location and quote of the first matching chunk, or `null`.
 */
function findEvidence(value, chunks) {
	const searchText = primitiveToText(value);
	if (!searchText) return null;
	const normalizedSearchText = normalizeText(searchText);
	if (!normalizedSearchText) return null;
	const loweredSearchText = searchText.toLowerCase();
	// Built lazily: only needed when the verbatim lookup misses.
	let flexiblePattern;
	for (const chunk of chunks) {
		if (!normalizeText(chunk.text).includes(normalizedSearchText)) continue;
		let matchStart = chunk.text.toLowerCase().indexOf(loweredSearchText);
		let matchLength = searchText.length;
		if (matchStart === -1) {
			flexiblePattern ??= new RegExp(normalizedSearchText.split(" ").map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("\\s+"), "i");
			const match = flexiblePattern.exec(chunk.text);
			if (match) {
				matchStart = match.index;
				matchLength = match[0].length;
			} else matchStart = 0; // last resort: quote the beginning of the chunk
		}
		return {
			chunkIndex: chunk.chunkIndex,
			headingPath: chunk.headingPath,
			quote: quoteAround(chunk.text, matchStart, matchLength)
		};
	}
	return null;
}
13958
/**
 * Walks one schema property and appends an evidence entry to `fields` for
 * every leaf value beneath it.
 *
 * Object properties with declared `properties` are descended into; array
 * properties are visited element by element (an empty or non-array value is
 * recorded as a single "missing" entry); everything else is handed to
 * `addPrimitiveEvidence`.
 *
 * @param {object[]} fields - Output list, appended to in place.
 * @param {string} path$1 - Field path of this property (e.g. `$.a.b[0]`).
 * @param {object} property - Schema definition of the property.
 * @param {unknown} value - Extracted value at this path.
 * @param {object[]} chunks - Source chunks passed through to the evidence lookup.
 * @returns {void}
 */
function addEvidenceForProperty(fields, path$1, property, value, chunks) {
	// Recurses into each declared child, reading values from `container` when
	// it is a record and treating them as absent otherwise.
	const visitChildren = (basePath, childProperties, container) => {
		const record = isRecord(container) ? container : {};
		for (const childName of Object.keys(childProperties)) addEvidenceForProperty(fields, `${basePath}.${childName}`, childProperties[childName], record[childName], chunks);
	};
	if (property.type === "object" && property.properties) {
		visitChildren(path$1, property.properties, value);
		return;
	}
	if (property.type !== "array") {
		addPrimitiveEvidence(fields, path$1, value, chunks);
		return;
	}
	const hasItems = Array.isArray(value) && value.length > 0;
	if (!hasItems) {
		fields.push({
			fieldPath: path$1,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Array field is empty or missing."
		});
		return;
	}
	value.forEach((item, index) => {
		const itemPath = `${path$1}[${index}]`;
		if (property.items?.type === "object" && property.items.properties) visitChildren(itemPath, property.items.properties, item);
		else addPrimitiveEvidence(fields, itemPath, item, chunks);
	});
}
13985
/**
 * Appends the evidence entry for a single leaf value.
 *
 * - `null`, `undefined` and blank strings (empty or whitespace-only) are
 *   recorded as "missing". Whitespace-only strings are included because they
 *   carry no text to look up.
 * - A value located in the source chunks is recorded as "found" together with
 *   the location returned by `findEvidence`.
 * - Anything else is recorded as "inferred".
 *
 * @param {object[]} fields - Output list, appended to in place.
 * @param {string} fieldPath - Path of the field being reported.
 * @param {unknown} value - Extracted value.
 * @param {object[]} chunks - Source chunks to search.
 * @returns {void}
 */
function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
	const isBlankString = typeof value === "string" && value.trim() === "";
	if (value === null || value === void 0 || isBlankString) {
		fields.push({
			fieldPath,
			status: "missing",
			value: null,
			confidence: 0,
			note: "Field is null or empty in final extraction."
		});
		return;
	}
	const found = findEvidence(value, chunks);
	if (found) {
		fields.push({
			fieldPath,
			status: "found",
			value,
			confidence: .8,
			...found
		});
		return;
	}
	fields.push({
		fieldPath,
		status: "inferred",
		value,
		confidence: .35,
		note: "Final value was not found verbatim in the available source text."
	});
}
14015
/**
 * Wraps a whole document's text as a single source chunk.
 *
 * @param {string} text$1 - Document text.
 * @returns {Array<{text: string, chunkIndex: number, headingPath: string[]}>}
 *   A one-element list, or an empty list when the text is empty.
 */
function sourceChunksFromText(text$1) {
	if (!text$1) return [];
	return [{
		text: text$1,
		chunkIndex: 0,
		headingPath: []
	}];
}
14022
/**
 * Converts Markdown chunks into the source-chunk shape used for evidence
 * lookup.
 *
 * @param {Array<{pageContent: string, chunkIndex?: number, headingPath?: string[]}>} chunks
 *   Chunks to convert.
 * @returns {Array<{text: string, chunkIndex: number, headingPath: string[]}>}
 *   One source chunk per input chunk; a missing `chunkIndex` falls back to the
 *   array position and a missing `headingPath` to an empty list.
 */
function sourceChunksFromMarkdownChunks(chunks) {
	return chunks.map(({ pageContent, chunkIndex, headingPath }, position) => ({
		text: pageContent,
		chunkIndex: chunkIndex ?? position,
		headingPath: headingPath ?? []
	}));
}
14029
/**
 * Breaks a field path such as `$.a.b` into its segments.
 *
 * @param {string} fieldPath - Path, optionally starting with `$.`.
 * @returns {string[]} Non-empty, dot-separated segments.
 */
function getPathParts(fieldPath) {
	const withoutRoot = fieldPath.replace(FIELD_PATH_PREFIX_RE, "");
	return withoutRoot.split(".").filter((segment) => segment !== "");
}
14032
/**
 * Reads the value stored at a field path.
 *
 * @param {unknown} data - Root value to read from.
 * @param {string} fieldPath - Path such as `$.a.b`.
 * @returns {unknown} The value at the path, or `undefined` as soon as a
 *   segment has to be read from something that is not a record.
 */
function getValueAtPath$1(data, fieldPath) {
	const segments = getPathParts(fieldPath);
	let cursor = data;
	let depth = 0;
	while (depth < segments.length) {
		if (!isRecord(cursor)) return;
		cursor = cursor[segments[depth]];
		depth += 1;
	}
	return cursor;
}
14040
/**
 * Writes a value at a field path, creating intermediate objects as needed.
 * Intermediate values that are not records are replaced with fresh objects.
 *
 * The call is a no-op when the path has no segments (otherwise the value
 * would land under the literal key "undefined") or when any segment is
 * `__proto__` (otherwise the walk would descend into, and write onto,
 * `Object.prototype`).
 *
 * @param {object} data - Record to mutate.
 * @param {string} fieldPath - Path such as `$.a.b`.
 * @param {unknown} value - Value to store.
 * @returns {void}
 */
function setValueAtPath(data, fieldPath, value) {
	const parts = getPathParts(fieldPath);
	if (parts.length === 0 || parts.includes("__proto__")) return;
	let current = data;
	for (const part of parts.slice(0, -1)) {
		if (!isRecord(current[part])) current[part] = {};
		current = current[part];
	}
	current[parts[parts.length - 1]] = value;
}
14050
/**
 * Collects the non-array leaf fields beneath a schema property.
 *
 * Object properties with declared `properties` are descended into; array
 * properties are skipped; every other property is appended as a leaf.
 *
 * @param {Array<{fieldPath: string, property: object}>} fields - Output list,
 *   appended to in place.
 * @param {string} fieldPath - Path of `property`.
 * @param {object} property - Schema definition to walk.
 * @returns {void}
 */
function collectScalarFields(fields, fieldPath, property) {
	const hasChildren = property.type === "object" && property.properties;
	if (!hasChildren) {
		if (property.type === "array") return;
		fields.push({
			fieldPath,
			property
		});
		return;
	}
	Object.keys(property.properties).forEach((name$1) => {
		collectScalarFields(fields, `${fieldPath}.${name$1}`, property.properties[name$1]);
	});
}
14060
/**
 * Ranks a candidate; higher scores win.
 *
 * Evidence status dominates: a candidate found in the source always
 * outranks an inferred one, whatever its chunk position. Within the same
 * status the score is the rounded confidence (×10) plus the chunk index, so
 * later chunks win ties. Missing `confidence` or `chunkIndex` count as 0
 * rather than turning the score into NaN.
 *
 * @param {{status: string, confidence?: number, chunkIndex?: number}} candidate
 *   Candidate to score.
 * @returns {number} Comparable score.
 */
function candidateScore(candidate) {
	// Large enough that confidence and chunk position cannot bridge the gap
	// between "found" and "inferred".
	const foundTier = 1e9;
	const statusScore = candidate.status === "found" ? foundTier : 0;
	const confidenceScore = Math.round((candidate.confidence ?? 0) * 10);
	return statusScore + confidenceScore + (candidate.chunkIndex ?? 0);
}
14063
/**
 * Picks the winning candidate for one field and reports a conflict when the
 * candidates disagree.
 *
 * Side effects: `candidates` is sorted in place by descending score, the
 * first entry gets `selected = true`, and every other entry gets
 * `selected = false` plus a `rejectionReason`.
 *
 * @param {object[]} candidates - Candidates for a single field path.
 * @returns {{fieldPath: string, selectedValue: unknown, rejectedValues: unknown[], candidates: object[]} | null}
 *   Conflict description, or `null` when there are no candidates or all of
 *   them carry the same value. `rejectedValues` lists each distinct losing
 *   value once and never repeats the selected value.
 */
function selectCandidatesForField(candidates) {
	if (candidates.length === 0) return null;
	candidates.sort((a, b) => candidateScore(b) - candidateScore(a));
	const [selected, ...rejected] = candidates;
	selected.selected = true;
	for (const candidate of rejected) {
		candidate.selected = false;
		candidate.rejectionReason = "Lower evidence score or earlier chunk position.";
	}
	const distinctValues = /* @__PURE__ */ new Map();
	for (const candidate of candidates) distinctValues.set(stableValueKey(candidate.value), candidate.value);
	if (distinctValues.size <= 1) return null;
	// Candidates that agree with the winner are not rejected values.
	distinctValues.delete(stableValueKey(selected.value));
	return {
		fieldPath: selected.fieldPath,
		selectedValue: selected.value,
		rejectedValues: [...distinctValues.values()],
		candidates: [...candidates]
	};
}
14082
/**
 * Builds the per-field candidate report for a chunked extraction.
 *
 * For every scalar schema field (auto-increment primary keys are skipped)
 * and every chunk result, a meaningful value becomes a candidate annotated
 * with whether it was found in that chunk's own source text. Candidates are
 * then ranked per field by `selectCandidatesForField`.
 *
 * @param {{schema: {properties: object}, chunks: object[], chunkResults: unknown[]}} input
 *   Schema, Markdown chunks and the extraction result of each chunk
 *   (`chunkResults[i]` belongs to `chunks[i]`).
 * @returns {{candidates: object[], conflicts: object[]}} All candidates,
 *   grouped by field, plus one conflict entry per field whose candidates
 *   disagree.
 */
function buildCandidateMergeReport(input) {
	const scalarFields = [];
	Object.entries(input.schema.properties).forEach(([name$1, property]) => {
		const isGeneratedKey = property.primary && property.autoIncrement;
		if (!isGeneratedKey) collectScalarFields(scalarFields, `$.${name$1}`, property);
	});
	const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
	const candidatesByPath = /* @__PURE__ */ new Map();
	for (const { fieldPath } of scalarFields) {
		for (let chunkIndex = 0; chunkIndex < input.chunkResults.length; chunkIndex++) {
			const value = getValueAtPath$1(input.chunkResults[chunkIndex], fieldPath);
			if (!isMeaningfulValue(value)) continue;
			// Fall back to an empty chunk when there is no source text for this index.
			const sourceChunk = sourceChunks[chunkIndex] ?? {
				text: "",
				chunkIndex
			};
			const found = findEvidence(value, [sourceChunk]);
			let bucket = candidatesByPath.get(fieldPath);
			if (!bucket) {
				bucket = [];
				candidatesByPath.set(fieldPath, bucket);
			}
			bucket.push({
				fieldPath,
				value,
				chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
				headingPath: sourceChunk.headingPath,
				status: found ? "found" : "inferred",
				quote: found?.quote,
				confidence: found ? .85 : .35
			});
		}
	}
	const allCandidates = [];
	const conflicts = [];
	candidatesByPath.forEach((bucket) => {
		// Ranking sorts and flags the bucket in place before it is collected.
		const conflict = selectCandidatesForField(bucket);
		allCandidates.push(...bucket);
		if (conflict) conflicts.push(conflict);
	});
	return {
		candidates: allCandidates,
		conflicts
	};
}
14123
/**
 * Returns a deep copy of `data` with every selected candidate's value written
 * at its field path. The input is left untouched.
 *
 * @param {object} data - Merged extraction result.
 * @param {{candidates: object[]}} report - Candidate report whose entries may
 *   carry a `selected` flag.
 * @returns {object} The patched copy.
 */
function applySelectedCandidates(data, report) {
	const merged = structuredClone(data);
	report.candidates.filter((candidate) => candidate.selected).forEach((candidate) => {
		setValueAtPath(merged, candidate.fieldPath, candidate.value);
	});
	return merged;
}
14128
/**
 * Builds the evidence report for a final extraction result.
 *
 * Each schema property (auto-increment primary keys are skipped) is walked
 * with `addEvidenceForProperty` against either the supplied Markdown chunks
 * or, when there are none, the whole text as one chunk. Inferred fields and
 * candidate conflicts are listed as issues, and per-status counts are
 * summarized under `coverage`.
 *
 * @param {{schema: {properties: object}, data?: unknown, chunks?: object[], text?: string, outputPath?: string, candidateReport?: {candidates: object[], conflicts: object[]}}} input
 *   Extraction result and its sources.
 * @returns {{coverage: object, fields: object[], candidates: (object[]|undefined), conflicts: (object[]|undefined), issues: object[]}}
 *   The evidence report.
 */
function buildExtractionEvidence(input) {
	const { schema, candidateReport } = input;
	const data = isRecord(input.data) ? input.data : {};
	const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
	const fields = [];
	for (const name$1 of Object.keys(schema.properties)) {
		const property = schema.properties[name$1];
		if (property.primary && property.autoIncrement) continue;
		addEvidenceForProperty(fields, `$.${name$1}`, property, data[name$1], chunks);
	}
	// Single pass: count fields per status and gather the inferred-field issues.
	const tally = {
		found: 0,
		missing: 0,
		inferred: 0
	};
	const issues = [];
	for (const field of fields) {
		if (Object.hasOwn(tally, field.status)) tally[field.status] += 1;
		if (field.status === "inferred") issues.push({
			fieldPath: field.fieldPath,
			message: field.note ?? "Field value lacks source evidence."
		});
	}
	// Conflict issues are listed after the inferred ones.
	for (const conflict of candidateReport?.conflicts ?? []) issues.push({
		fieldPath: conflict.fieldPath,
		message: "Multiple chunk candidates disagree for this field."
	});
	return {
		coverage: {
			path: input.outputPath ? evidencePathForOutput(input.outputPath) : void 0,
			fieldCount: fields.length,
			evidenceCount: tally.found,
			foundCount: tally.found,
			missingCount: tally.missing,
			inferredCount: tally.inferred,
			conflictCount: input.candidateReport?.conflicts.length ?? 0,
			issueCount: issues.length
		},
		fields,
		candidates: candidateReport?.candidates,
		conflicts: candidateReport?.conflicts,
		issues
	};
}
14162
/**
 * Derives the evidence-report path that sits next to an extraction output.
 *
 * A trailing `.json` (any letter case) is replaced with `.evidence.json`.
 * When the output path has no `.json` suffix, `.evidence.json` is appended,
 * so the result never equals the output path and the evidence report cannot
 * overwrite the extraction output.
 *
 * @param {string} outputPath - Path of the extraction output file.
 * @returns {string} Path of the evidence report.
 */
function evidencePathForOutput(outputPath) {
	if (JSON_FILE_SUFFIX_RE$1.test(outputPath)) return outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
	return `${outputPath}.evidence.json`;
}
14165
/**
 * Builds the evidence report for an extraction and writes it next to the
 * extraction output.
 *
 * @param {object} input - Same input as `buildExtractionEvidence`; its
 *   `outputPath` determines where the report is written.
 * @returns {Promise<object>} The report's coverage summary, with `path`
 *   resolved to an absolute path.
 */
async function writeExtractionEvidence(input) {
	const report = buildExtractionEvidence(input);
	const { coverage } = report;
	const evidencePath = evidencePathForOutput(input.outputPath);
	// The written report records the path as derived; the returned summary
	// carries the resolved one.
	coverage.path = evidencePath;
	const jsonOptions = {
		spaces: 2,
		EOL: "\n"
	};
	await writeFile(evidencePath, report, jsonOptions);
	return Object.assign({}, coverage, { path: path.resolve(evidencePath) });
}
14178
+
13820
14179
  //#endregion
13821
14180
  //#region src/core/notion-sink.ts
13822
14181
  const RICH_TEXT_LIMIT = 2e3;
@@ -14102,6 +14461,36 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
14102
14461
  }
14103
14462
  }
14104
14463
 
14464
+ //#endregion
14465
+ //#region src/core/ai-extraction/transcriber.ts
14466
+ const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
14467
/**
 * Sends an image to an OpenAI-compatible chat model and returns the text the
 * model transcribes from it.
 *
 * @param {string} imagePath - Path of the image file to read.
 * @param {string} baseURL - Base URL of the OpenAI-compatible endpoint.
 * @param {string} apiKey - API key for the endpoint.
 * @param {string} modelName - Chat model to use.
 * @param {number} [timeoutMs] - Abort timeout in milliseconds; defaults to
 *   300000 when null or undefined.
 * @returns {Promise<{text: string, modelName: string}>} Transcribed text and
 *   the model that produced it.
 */
async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
	const defaultTimeoutMs = 3e5;
	const provider = createOpenAICompatible({
		baseURL,
		name: "openai-compatible",
		apiKey
	});
	const imageBytes = await fs.readFile(imagePath);
	const userMessage = {
		role: "user",
		content: [{
			type: "text",
			text: TRANSCRIPTION_PROMPT
		}, {
			type: "image",
			image: imageBytes
		}]
	};
	const response = await generateText({
		model: provider.chatModel(modelName),
		messages: [userMessage],
		abortSignal: AbortSignal.timeout(timeoutMs ?? defaultTimeoutMs)
	});
	return {
		text: response.text,
		modelName
	};
}
14493
+
14105
14494
  //#endregion
14106
14495
  //#region src/core/file-constants.ts
14107
14496
  const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
@@ -14435,14 +14824,6 @@ function createPdfConverter(config) {
14435
14824
  return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
14436
14825
  }
14437
14826
  if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
14438
- if (config.converter === "markitdown") {
14439
- const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
14440
- return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
14441
- }
14442
- if (config.converter === "marker") {
14443
- const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
14444
- return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
14445
- }
14446
14827
  if (config.converter === "external") {
14447
14828
  if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
14448
14829
  return new ExternalCommandPdfConverter("external", config.external);
@@ -14470,7 +14851,7 @@ const FILE_PART_EXTENSIONS = new Set([
14470
14851
  "svg"
14471
14852
  ]);
14472
14853
  const PDF_EXT_RE = /\.pdf$/i;
14473
- async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14854
+ async function readExtractFileInput(filePath, aiConfig) {
14474
14855
  const stat = fs$1.statSync(filePath);
14475
14856
  if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
14476
14857
  size: bytesToMB(stat.size).toFixed(1),
@@ -14479,15 +14860,22 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14479
14860
  }));
14480
14861
  const ext = path.extname(filePath).toLowerCase().replace(".", "");
14481
14862
  if (FILE_PART_EXTENSIONS.has(ext)) {
14482
- if (shouldUseImageOcrFallback(aiConfig, modelOverride)) {
14483
- const result = await recognizeImageText(filePath, aiConfig?.image);
14484
- consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14485
- return { text: result.text };
14863
+ const image = aiConfig?.image;
14864
+ if (image?.imageConversion === "vision" && image.imageModelName && aiConfig) {
14865
+ const baseURL = image.visionBaseURL || aiConfig.provider.baseURL;
14866
+ const apiKey = image.visionApiKey || aiConfig.provider.apiKey;
14867
+ const timeout = (aiConfig.provider.timeout ?? 300) * 1e3;
14868
+ try {
14869
+ const result$1 = await transcribeImageWithVision(filePath, baseURL, apiKey, image.imageModelName, timeout);
14870
+ consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
14871
+ return { text: result$1.text };
14872
+ } catch {
14873
+ consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
14874
+ }
14486
14875
  }
14487
- return {
14488
- text: "",
14489
- filePath
14490
- };
14876
+ const result = await recognizeImageText(filePath, aiConfig?.image);
14877
+ consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14878
+ return { text: result.text };
14491
14879
  }
14492
14880
  if (ext === "pdf") {
14493
14881
  const buffer = await fs.readFile(filePath);
@@ -14608,6 +14996,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14608
14996
 
14609
14997
  //#endregion
14610
14998
  //#region src/core/extract-runner.ts
14999
+ const encoding = getEncoding("cl100k_base");
14611
15000
  const JSON_EXT_RE$1 = /\.json$/;
14612
15001
  async function limitConcurrency(concurrency, items, fn) {
14613
15002
  const results = Array.from({ length: items.length });
@@ -14622,29 +15011,6 @@ async function limitConcurrency(concurrency, items, fn) {
14622
15011
  await Promise.all(workers);
14623
15012
  return results;
14624
15013
  }
14625
- function getSchemaKeywords(schema) {
14626
- const keywords = /* @__PURE__ */ new Set();
14627
- function walk(properties) {
14628
- if (!properties) return;
14629
- for (const [name$1, prop] of Object.entries(properties)) {
14630
- keywords.add(name$1.toLowerCase());
14631
- const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
14632
- for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
14633
- if (prop && typeof prop === "object") {
14634
- const p = prop;
14635
- if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
14636
- if (typeof p.description === "string") {
14637
- const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
14638
- for (const d of descParts) if (d.length > 2) keywords.add(d);
14639
- }
14640
- if (p.type === "object") walk(p.properties);
14641
- if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
14642
- }
14643
- }
14644
- }
14645
- walk(schema.properties);
14646
- return Array.from(keywords);
14647
- }
14648
15014
  async function ensureDatabaseReady(dbPath, schema) {
14649
15015
  try {
14650
15016
  await fs.access(dbPath);
@@ -14716,182 +15082,145 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14716
15082
  }
14717
15083
  const s = spinner();
14718
15084
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
14719
- const CHUNK_LIMIT = 4e4;
14720
- let result;
14721
- if (text$1 && text$1.length > CHUNK_LIMIT) {
14722
- if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
14723
- length: text$1.length,
14724
- limit: CHUNK_LIMIT
14725
- }));
14726
- const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT, aiConfig.extraction?.overlapSize ?? 2e3);
14727
- if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
14728
- let processedDocs = finalDocs;
14729
- if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
14730
- const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
14731
- const keywords = getSchemaKeywords(schemaLoad.schema);
14732
- const scoredChunks = finalDocs.map((doc, idx) => {
14733
- if (idx === 0) return {
14734
- index: idx,
14735
- score: Number.POSITIVE_INFINITY
14736
- };
14737
- let score = 0;
14738
- const docTextLower = doc.pageContent.toLowerCase();
14739
- for (const kw of keywords) {
14740
- let pos = docTextLower.indexOf(kw);
14741
- while (pos !== -1) {
14742
- score++;
14743
- pos = docTextLower.indexOf(kw, pos + kw.length);
14744
- }
14745
- }
14746
- return {
14747
- index: idx,
14748
- score
14749
- };
14750
- }).slice(1).sort((a, b) => b.score - a.score);
14751
- const selectedIndices = new Set([0]);
14752
- let keptCount = 0;
14753
- for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
14754
- selectedIndices.add(sc.index);
14755
- keptCount++;
14756
- }
14757
- processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
14758
- if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
14759
- original: finalDocs.length,
14760
- filtered: processedDocs.length
14761
- }));
14762
- }
14763
- const chunkResults = [];
14764
- const accumulatedTokens = {
14765
- prompt: 0,
14766
- completion: 0,
14767
- total: 0
14768
- };
14769
- let success = true;
14770
- let errorMsg = "";
14771
- const extractionTasks = processedDocs.map((doc, i) => {
14772
- return async () => {
14773
- if (!success) return;
14774
- const headings = [];
14775
- if (doc.metadata) {
14776
- if (doc.metadata.h1) headings.push(doc.metadata.h1);
14777
- if (doc.metadata.h2) headings.push(doc.metadata.h2);
14778
- if (doc.metadata.h3) headings.push(doc.metadata.h3);
14779
- if (doc.metadata.h4) headings.push(doc.metadata.h4);
14780
- }
14781
- let chunkText = doc.pageContent;
14782
- if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14783
- const chunkResult = await extractStructuredData({
14784
- config: aiConfig,
14785
- schema: schemaLoad.schema,
14786
- text: chunkText,
14787
- aiexDir,
14788
- modelOverride,
14789
- onRetry(info) {
14790
- if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14791
- current: i + 1,
14792
- total: processedDocs.length,
14793
- code: info.statusCode,
14794
- delay: info.delayMs / 1e3,
14795
- attempt: info.attempt,
14796
- max: info.maxRetries
14797
- }));
14798
- }
14799
- });
14800
- if (!chunkResult.success) {
14801
- success = false;
14802
- errorMsg = chunkResult.error || t("common.unknownError");
14803
- if (!options?.quiet) {
14804
- s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14805
- consola.error(errorMsg);
14806
- }
14807
- return;
14808
- }
14809
- if (chunkResult.data) chunkResults.push(chunkResult.data);
14810
- if (chunkResult.tokensUsed) {
14811
- accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14812
- accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14813
- accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
15085
+ const maxTokens = calculateChunkTokenBudget({
15086
+ configuredMaxTokens: aiConfig.extraction?.maxTokens ?? 8e3,
15087
+ modelMaxTokens: modelOverride?.capabilities.maxTokens
15088
+ });
15089
+ const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
15090
+ const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
15091
+ if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
15092
+ length: totalTokens,
15093
+ limit: maxTokens
15094
+ }));
15095
+ const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
15096
+ pageContent: text$1 ?? "",
15097
+ metadata: {},
15098
+ chunkIndex: 0,
15099
+ totalChunks: 1,
15100
+ tokenCount: totalTokens,
15101
+ headingPath: [],
15102
+ charStart: 0,
15103
+ charEnd: text$1?.length ?? 0
15104
+ }];
15105
+ if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
15106
+ const chunkResults = Array.from({ length: processedDocs.length });
15107
+ const accumulatedTokens = {
15108
+ prompt: 0,
15109
+ completion: 0,
15110
+ total: 0
15111
+ };
15112
+ let success = true;
15113
+ let errorMsg = "";
15114
+ const extractionTasks = processedDocs.map((doc, i) => {
15115
+ return async () => {
15116
+ if (!success) return;
15117
+ const headings = doc.headingPath?.length ? doc.headingPath : [
15118
+ doc.metadata.h1,
15119
+ doc.metadata.h2,
15120
+ doc.metadata.h3,
15121
+ doc.metadata.h4
15122
+ ].filter(Boolean);
15123
+ let chunkText = doc.pageContent;
15124
+ if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
15125
+ const chunkResult = await extractStructuredData({
15126
+ config: aiConfig,
15127
+ schema: schemaLoad.schema,
15128
+ text: chunkText,
15129
+ aiexDir,
15130
+ modelOverride,
15131
+ onRetry(info) {
15132
+ if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
15133
+ current: i + 1,
15134
+ total: processedDocs.length,
15135
+ code: info.statusCode,
15136
+ delay: info.delayMs / 1e3,
15137
+ attempt: info.attempt,
15138
+ max: info.maxRetries
15139
+ }));
14814
15140
  }
14815
- };
14816
- });
14817
- const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
14818
- if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
14819
- current: 1,
14820
- total: processedDocs.length
14821
- }));
14822
- try {
14823
- await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
14824
- if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
14825
- current: idx + 1,
14826
- total: processedDocs.length
14827
- }));
14828
- await task();
14829
15141
  });
14830
- } catch (e) {
14831
- success = false;
14832
- errorMsg = e instanceof Error ? e.message : String(e);
14833
- }
14834
- if (!success) return {
14835
- success: false,
14836
- error: errorMsg
14837
- };
14838
- const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
14839
- const validation = validateExtractedData(schemaLoad.schema, mergedData);
14840
- if (!validation.success) {
14841
- const valError = validation.error || "Merged data validation failed";
14842
- if (!options?.quiet) {
14843
- s.stop(t("command.extract.file.validationFail"));
14844
- consola.error(valError);
15142
+ if (!chunkResult.success) {
15143
+ success = false;
15144
+ errorMsg = chunkResult.error || t("common.unknownError");
15145
+ if (!options?.quiet) {
15146
+ s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
15147
+ consola.error(errorMsg);
15148
+ }
15149
+ return;
15150
+ }
15151
+ if (chunkResult.data) chunkResults[i] = chunkResult.data;
15152
+ if (chunkResult.tokensUsed) {
15153
+ accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
15154
+ accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
15155
+ accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14845
15156
  }
14846
- return {
14847
- success: false,
14848
- error: valError
14849
- };
14850
- }
14851
- const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
14852
- await fs.mkdir(outputDir, { recursive: true });
14853
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
14854
- const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
14855
- const finalMergedOutputPath = path.join(outputDir, outputFileName);
14856
- await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
14857
- result = {
14858
- success: true,
14859
- data: mergedData,
14860
- tokensUsed: accumulatedTokens,
14861
- outputPath: finalMergedOutputPath
14862
15157
  };
14863
- } else result = await extractStructuredData({
14864
- config: aiConfig,
14865
- schema: schemaLoad.schema,
14866
- text: text$1 ?? "",
14867
- aiexDir,
14868
- file: filePath,
14869
- modelOverride,
14870
- onRetry(info) {
14871
- if (!options?.quiet) s.message(t("command.extract.file.extractRetry", {
14872
- code: info.statusCode,
14873
- delay: info.delayMs / 1e3,
14874
- attempt: info.attempt,
14875
- max: info.maxRetries
15158
+ });
15159
+ const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
15160
+ if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
15161
+ current: 1,
15162
+ total: processedDocs.length
15163
+ }));
15164
+ try {
15165
+ await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
15166
+ if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
15167
+ current: idx + 1,
15168
+ total: processedDocs.length
14876
15169
  }));
14877
- }
15170
+ await task();
15171
+ });
15172
+ } catch (e) {
15173
+ success = false;
15174
+ errorMsg = e instanceof Error ? e.message : String(e);
15175
+ }
15176
+ if (!success) return {
15177
+ success: false,
15178
+ error: errorMsg
15179
+ };
15180
+ const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
15181
+ const candidateReport = buildCandidateMergeReport({
15182
+ schema: schemaLoad.schema,
15183
+ chunkResults: successfulChunkResults,
15184
+ chunks: processedDocs
14878
15185
  });
14879
- if (!result.success) {
15186
+ const mergedData = applySelectedCandidates(mergeExtractionResults(schemaLoad.schema, successfulChunkResults), candidateReport);
15187
+ const validation = validateExtractedData(schemaLoad.schema, mergedData);
15188
+ if (!validation.success) {
15189
+ const valError = validation.error || "Merged data validation failed";
14880
15190
  if (!options?.quiet) {
14881
- s.stop(t("command.extract.file.extractFail"));
14882
- consola.error(result.error || t("common.unknownError"));
15191
+ s.stop(t("command.extract.file.validationFail"));
15192
+ consola.error(valError);
14883
15193
  }
14884
15194
  return {
14885
15195
  success: false,
14886
- error: result.error || t("common.unknownError")
15196
+ error: valError
14887
15197
  };
14888
15198
  }
15199
+ const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
15200
+ await fs.mkdir(outputDir, { recursive: true });
15201
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
15202
+ const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
15203
+ const outputPath = path.join(outputDir, outputFileName);
15204
+ await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
15205
+ const result = {
15206
+ success: true,
15207
+ data: mergedData,
15208
+ tokensUsed: accumulatedTokens,
15209
+ outputPath,
15210
+ evidenceSummary: await writeExtractionEvidence({
15211
+ schema: schemaLoad.schema,
15212
+ data: mergedData,
15213
+ outputPath,
15214
+ chunks: processedDocs,
15215
+ candidateReport
15216
+ })
15217
+ };
14889
15218
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
14890
15219
  if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
14891
15220
  if (result.evidenceSummary && !options?.quiet) {
14892
15221
  const summary = result.evidenceSummary;
14893
15222
  const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
14894
- consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
15223
+ consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
14895
15224
  }
14896
15225
  if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
14897
15226
  prompt: result.tokensUsed.prompt,
@@ -15013,13 +15342,9 @@ async function runAuditedExtraction(options) {
15013
15342
  });
15014
15343
  try {
15015
15344
  let text$1 = "";
15016
- let filePath;
15017
- if (source.type === "file") {
15018
- const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
15019
- text$1 = input.text;
15020
- filePath = input.filePath;
15021
- } else text$1 = source.text;
15022
- const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
15345
+ if (source.type === "file") text$1 = (await readExtractFileInput(source.filePath, aiConfig)).text;
15346
+ else text$1 = source.text;
15347
+ const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, source.type === "file" ? source.filePath : void 0, modelOverride, {
15023
15348
  quiet,
15024
15349
  insert
15025
15350
  });