aiex-cli 0.0.5-beta.6 → 0.0.6-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { C as name, D as doctorDiagnosticsTableRows, O as formatDoctorDiagnosticsJson, S as description, T as version, _ as PLACEHOLDER_SCHEMA, a as parseJsonSchema, b as createConfig, c as recognizeImageText, d as getDefaultAIConfig, f as readAIConfig, g as DEFAULT_PROMPT_CONFIG, h as DEFAULT_MINERU_CONFIG, i as JsonSchemaDefinitionSchema, l as initI18n, m as DEFAULT_MINERU_API_CONFIG, n as createMigrationConfig, o as toSnakeCase, p as writeAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as t, v as PLACEHOLDER_TEXT, w as package_default, x as seedConfig, y as AIConfigSchema } from "./doctor-collector-BpqhXNcO.mjs";
1
+ import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-hWEvJ4lw.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -17,14 +17,13 @@ import Database from "better-sqlite3";
17
17
  import pc from "picocolors";
18
18
  import { Buffer } from "node:buffer";
19
19
  import * as XLSX from "xlsx";
20
- import { getEncoding } from "js-tiktoken";
21
20
  import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
22
21
  import { APICallError, Output, generateText, jsonSchema } from "ai";
23
22
  import pRetry from "p-retry";
23
+ import mime from "mime";
24
24
  import { jsonrepair } from "jsonrepair";
25
25
  import { LangfuseSpanProcessor } from "@langfuse/otel";
26
26
  import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
27
- import { marked } from "marked";
28
27
  import crypto from "node:crypto";
29
28
  import { Client, extractNotionId } from "@notionhq/client";
30
29
  import { execa } from "execa";
@@ -12860,6 +12859,28 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
12860
12859
  });
12861
12860
  }
12862
12861
 
12862
+ //#endregion
12863
+ //#region src/core/ai-extraction/file-utils.ts
12864
+ function detectMimeType(filePath) {
12865
+ return mime.getType(filePath) ?? "application/octet-stream";
12866
+ }
12867
+ async function readFilePart(filePath) {
12868
+ const mimeStr = detectMimeType(filePath);
12869
+ const buffer = await fs.readFile(filePath);
12870
+ const name$1 = path.basename(filePath);
12871
+ if (mimeStr.startsWith("image/")) return {
12872
+ type: "image",
12873
+ image: buffer,
12874
+ mimeType: mimeStr
12875
+ };
12876
+ return {
12877
+ type: "file",
12878
+ data: buffer,
12879
+ mediaType: mimeStr,
12880
+ filename: name$1
12881
+ };
12882
+ }
12883
+
12863
12884
  //#endregion
12864
12885
  //#region src/core/ai-extraction/json-utils.ts
12865
12886
  function parseJsonLike(text$1) {
@@ -12920,10 +12941,25 @@ function filterCompatible(models, inputTokens, outputTokens) {
12920
12941
  });
12921
12942
  }
12922
12943
  function selectModel(input) {
12923
- const { models, inputTokens, outputTokens } = input;
12944
+ const { models, isImage, fileName, inputTokens, outputTokens } = input;
12924
12945
  if (models.length === 0) throw new Error(t("errors.ai.noModels"));
12925
12946
  let candidates = filterCompatible(models, inputTokens, outputTokens);
12926
12947
  if (candidates.length === 0) candidates = models;
12948
+ if (isImage) {
12949
+ const visionModel = candidates.find((m) => m.capabilities.vision);
12950
+ if (!visionModel) {
12951
+ const hint = fileName ? ` (${fileName})` : "";
12952
+ const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
12953
+ tokens: inputTokens,
12954
+ hint
12955
+ }) : t("errors.ai.noVisionModel", { hint });
12956
+ throw new Error(msg + t("errors.ai.addSuitableModel"));
12957
+ }
12958
+ return {
12959
+ name: visionModel.name,
12960
+ capabilities: visionModel.capabilities
12961
+ };
12962
+ }
12927
12963
  const soModel = candidates.find((m) => m.capabilities.structuredOutput);
12928
12964
  if (soModel) return {
12929
12965
  name: soModel.name,
@@ -12937,46 +12973,36 @@ function selectModel(input) {
12937
12973
 
12938
12974
  //#endregion
12939
12975
  //#region src/core/ai-extraction/prompt-generator.ts
12940
- const CAMEL_CASE_BOUNDARY_RE = /([a-z0-9])([A-Z])/g;
12941
- const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
12942
- function splitIdentifier(name$1) {
12943
- return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
12944
- }
12945
- function propertyToDescription(name$1, prop, indent = "", required = false) {
12976
+ function propertyToDescription(name$1, prop, indent = "") {
12946
12977
  const lines = [];
12947
12978
  let typeStr = prop.type;
12948
12979
  if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
12949
- lines.push(`${indent}- ${name$1}: ${typeStr}${required ? " (required)" : ""}`);
12950
- const terms = splitIdentifier(name$1);
12951
- if (terms.length > 1) lines.push(`${indent} search terms: ${terms.join(", ")}`);
12952
- if (prop.description) lines.push(`${indent} description: ${prop.description}`);
12980
+ lines.push(`${indent}- ${name$1}: ${typeStr}`);
12953
12981
  if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent} length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
12954
- if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent} range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
12955
12982
  if (prop.format) lines.push(`${indent} format: ${prop.format}`);
12956
12983
  if (prop.unique) lines.push(`${indent} unique: true`);
12957
12984
  if (prop.default !== void 0) lines.push(`${indent} default: ${JSON.stringify(prop.default)}`);
12958
12985
  return lines.join("\n");
12959
12986
  }
12960
- function nestedPropertyToDescription(name$1, prop, indent = "", requiredFields = []) {
12987
+ function nestedPropertyToDescription(name$1, prop, indent = "") {
12961
12988
  const lines = [];
12962
- const isRequired = requiredFields.includes(name$1);
12963
12989
  if (prop.nested?.enabled && prop.type === "object") {
12964
12990
  const relation = prop.nested.relation || "has-one";
12965
- lines.push(`${indent}- ${name$1}: object (related table, ${relation})${isRequired ? " (required)" : ""}`);
12966
- if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
12991
+ lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
12992
+ if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12967
12993
  return lines.join("\n");
12968
12994
  }
12969
12995
  if (prop.type === "array" && prop.items?.nested?.enabled) {
12970
12996
  const relation = prop.items.nested.relation || "has-many";
12971
- lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})${isRequired ? " (required)" : ""}`);
12972
- if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
12997
+ lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
12998
+ if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12973
12999
  return lines.join("\n");
12974
13000
  }
12975
- lines.push(propertyToDescription(name$1, prop, indent, isRequired));
12976
- if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.required ?? []));
13001
+ lines.push(propertyToDescription(name$1, prop, indent));
13002
+ if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12977
13003
  if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
12978
13004
  lines.push(`${indent} item fields:`);
12979
- for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `, prop.items.required ?? []));
13005
+ for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent} `));
12980
13006
  }
12981
13007
  return lines.join("\n");
12982
13008
  }
@@ -12988,7 +13014,7 @@ function schemaToDescription(schema) {
12988
13014
  lines.push("Fields:");
12989
13015
  for (const [name$1, prop] of Object.entries(schema.properties)) {
12990
13016
  const property = prop;
12991
- lines.push(nestedPropertyToDescription(name$1, property, "", schema.required ?? []));
13017
+ lines.push(nestedPropertyToDescription(name$1, property));
12992
13018
  }
12993
13019
  if (schema.examples && schema.examples.length > 0) {
12994
13020
  lines.push("");
@@ -13033,6 +13059,33 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
13033
13059
  ].join("\n");
13034
13060
  }
13035
13061
 
13062
+ //#endregion
13063
+ //#region src/core/ai-extraction/snapshot.ts
13064
+ const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
13065
+ const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
13066
+ async function loadPromptSnapshot(aiexDir, tableName) {
13067
+ const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
13068
+ try {
13069
+ const content = await fs.readFile(snapshotPath, "utf-8");
13070
+ const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
13071
+ const userMatch = content.match(USER_PROMPT_REGEX);
13072
+ if (systemMatch && userMatch) return {
13073
+ system: systemMatch[1].trim(),
13074
+ user: userMatch[1].trim()
13075
+ };
13076
+ } catch {}
13077
+ return null;
13078
+ }
13079
+ async function savePromptSnapshot(schema, aiexDir) {
13080
+ const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
13081
+ const outputDir = path.join(aiexDir, "extracted");
13082
+ await fs.mkdir(outputDir, { recursive: true });
13083
+ const fileName = `${schema.table.name}.prompt.md`;
13084
+ const outputPath = path.join(outputDir, fileName);
13085
+ await fs.writeFile(outputPath, content);
13086
+ return outputPath;
13087
+ }
13088
+
13036
13089
  //#endregion
13037
13090
  //#region src/core/ai-extraction/telemetry.ts
13038
13091
  let langfuseInitialized = false;
@@ -13075,7 +13128,7 @@ function propertyToExtractionSchema(property) {
13075
13128
  }
13076
13129
  return { type: nullableType(property.type) };
13077
13130
  }
13078
- function isRecord$2(value) {
13131
+ function isRecord(value) {
13079
13132
  return typeof value === "object" && value !== null && !Array.isArray(value);
13080
13133
  }
13081
13134
  function schemaToExtractionOutputSchema(schema) {
@@ -13113,7 +13166,7 @@ function validatePropertyValue(path$1, property, value, issues) {
13113
13166
  }
13114
13167
  return;
13115
13168
  case "object":
13116
- if (!isRecord$2(value)) {
13169
+ if (!isRecord(value)) {
13117
13170
  issues.push(`${path$1}: expected object or null`);
13118
13171
  return;
13119
13172
  }
@@ -13136,7 +13189,7 @@ function validateProperties(basePath, properties, data, issues) {
13136
13189
  }
13137
13190
  }
13138
13191
  function validateExtractedData(schema, data) {
13139
- if (!isRecord$2(data)) return {
13192
+ if (!isRecord(data)) return {
13140
13193
  success: false,
13141
13194
  error: "Extracted data must be a JSON object."
13142
13195
  };
@@ -13153,11 +13206,13 @@ function validateExtractedData(schema, data) {
13153
13206
  //#region src/core/ai-extraction/extractor.ts
13154
13207
  const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
13155
13208
  async function extractStructuredData(input) {
13156
- const { config, schema, text: text$1, modelOverride } = input;
13209
+ const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
13157
13210
  if (!config.provider.apiKey) return {
13158
13211
  success: false,
13159
13212
  error: t("errors.ai.apiKeyMissing")
13160
13213
  };
13214
+ const useFileContent = !!file;
13215
+ const isImageFile = useFileContent && detectMimeType(file).startsWith("image/");
13161
13216
  const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
13162
13217
  const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
13163
13218
  const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
@@ -13165,6 +13220,8 @@ async function extractStructuredData(input) {
13165
13220
  try {
13166
13221
  selected = modelOverride ?? selectModel({
13167
13222
  models: config.provider.models,
13223
+ isImage: isImageFile,
13224
+ fileName: file,
13168
13225
  inputTokens,
13169
13226
  outputTokens
13170
13227
  });
@@ -13184,7 +13241,18 @@ async function extractStructuredData(input) {
13184
13241
  apiKey: config.provider.apiKey,
13185
13242
  supportsStructuredOutputs: useStructuredOutput
13186
13243
  });
13187
- const { system, user } = generateExtractionPrompt(schema, text$1, config.prompt ?? DEFAULT_PROMPT_CONFIG);
13244
+ let system;
13245
+ let user;
13246
+ const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
13247
+ const promptText = file ? PLACEHOLDER_TEXT : text$1;
13248
+ if (snapshot) {
13249
+ system = snapshot.system;
13250
+ user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
13251
+ } else {
13252
+ const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
13253
+ system = generated.system;
13254
+ user = generated.user;
13255
+ }
13188
13256
  const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
13189
13257
  const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
13190
13258
  let systemPrompt = system;
@@ -13199,16 +13267,38 @@ async function extractStructuredData(input) {
13199
13267
  let parseError;
13200
13268
  let validationError;
13201
13269
  try {
13202
- const textOpts = {
13203
- model: provider.chatModel(selected.name),
13204
- system: systemPrompt,
13205
- prompt: userPrompt,
13206
- abortSignal: AbortSignal.timeout(timeoutMs),
13207
- maxRetries: 0,
13208
- experimental_telemetry: { isEnabled: useTelemetry }
13209
- };
13210
- if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13211
- result = await withRetry(() => generateText(textOpts), input.onRetry);
13270
+ if (useFileContent) {
13271
+ const filePart = await readFilePart(file);
13272
+ const fileName = filePart.type === "file" ? filePart.filename : path.basename(file);
13273
+ const contentParts = [{
13274
+ type: "text",
13275
+ text: userPrompt.includes(PLACEHOLDER_TEXT) ? userPrompt.replaceAll(PLACEHOLDER_TEXT, text$1 || `Data is contained in the attached file: ${fileName}`) : userPrompt
13276
+ }, filePart];
13277
+ const fileOpts = {
13278
+ model: provider.chatModel(selected.name),
13279
+ system: systemPrompt,
13280
+ messages: [{
13281
+ role: "user",
13282
+ content: contentParts
13283
+ }],
13284
+ abortSignal: AbortSignal.timeout(timeoutMs),
13285
+ maxRetries: 0,
13286
+ experimental_telemetry: { isEnabled: useTelemetry }
13287
+ };
13288
+ if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
13289
+ result = await withRetry(() => generateText(fileOpts), input.onRetry);
13290
+ } else {
13291
+ const textOpts = {
13292
+ model: provider.chatModel(selected.name),
13293
+ system: systemPrompt,
13294
+ prompt: userPrompt,
13295
+ abortSignal: AbortSignal.timeout(timeoutMs),
13296
+ maxRetries: 0,
13297
+ experimental_telemetry: { isEnabled: useTelemetry }
13298
+ };
13299
+ if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
13300
+ result = await withRetry(() => generateText(textOpts), input.onRetry);
13301
+ }
13212
13302
  if (result.usage) {
13213
13303
  totalPromptTokens += result.usage.inputTokens ?? 0;
13214
13304
  totalCompletionTokens += result.usage.outputTokens ?? 0;
@@ -13224,16 +13314,27 @@ async function extractStructuredData(input) {
13224
13314
  }
13225
13315
  if (!parseError && data !== void 0) {
13226
13316
  const validation = validateExtractedData(schema, data);
13227
- if (validation.success) return {
13228
- success: true,
13229
- data,
13230
- tokensUsed: {
13231
- prompt: totalPromptTokens,
13232
- completion: totalCompletionTokens,
13233
- total: totalPromptTokens + totalCompletionTokens
13234
- }
13235
- };
13236
- else validationError = validation.error;
13317
+ if (validation.success) {
13318
+ const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
13319
+ await fs.mkdir(outputDir, { recursive: true });
13320
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
13321
+ const outputFileName = `${schema.table.name}-${timestamp}.json`;
13322
+ const outputPath = path.join(outputDir, outputFileName);
13323
+ await writeFile(outputPath, data, {
13324
+ spaces: 2,
13325
+ EOL: "\n"
13326
+ });
13327
+ return {
13328
+ success: true,
13329
+ outputPath,
13330
+ data,
13331
+ tokensUsed: {
13332
+ prompt: totalPromptTokens,
13333
+ completion: totalCompletionTokens,
13334
+ total: totalPromptTokens + totalCompletionTokens
13335
+ }
13336
+ };
13337
+ } else validationError = validation.error;
13237
13338
  }
13238
13339
  const errorMsg = parseError || validationError || "Unknown validation error";
13239
13340
  lastError = errorMsg;
@@ -13244,14 +13345,11 @@ async function extractStructuredData(input) {
13244
13345
  CRITICAL RULES:
13245
13346
  1. Only correct the fields that failed validation.
13246
13347
  2. Preserve all other correctly extracted fields and their values exactly.
13247
- 3. Use only values supported by the original text. If a value cannot be confirmed, set it to null.
13248
- 4. Remove any fields not defined by the JSON Schema.
13249
- 5. Normalize values to the expected JSON type without changing the intended meaning.
13250
- 6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
13348
+ 3. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
13251
13349
  userPrompt = `The JSON data you generated previously failed validation. Please correct it.
13252
13350
 
13253
13351
  [Original Text]
13254
- ${text$1 || "Original text is empty."}
13352
+ ${text$1 || "Data is contained in the attached file."}
13255
13353
 
13256
13354
  [JSON Schema Definition]
13257
13355
  ${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
@@ -13262,11 +13360,6 @@ ${invalidJson}
13262
13360
  [Validation Error Details]
13263
13361
  ${errorMsg}
13264
13362
 
13265
- Correction checklist:
13266
- - Fix each field path mentioned in the validation error.
13267
- - Keep schema-valid fields unchanged.
13268
- - Do not invent missing facts; use null when the original text does not support a value.
13269
-
13270
13363
  Please output the corrected JSON object now:`;
13271
13364
  }
13272
13365
  }
@@ -13419,343 +13512,6 @@ function insertExtractedData(db, schema, data) {
13419
13512
  }
13420
13513
  }
13421
13514
 
13422
- //#endregion
13423
- //#region src/core/ai-extraction/json-merger.ts
13424
- function isRecord$1(value) {
13425
- return typeof value === "object" && value !== null && !Array.isArray(value);
13426
- }
13427
- function stableKey(value) {
13428
- if (!isRecord$1(value)) return JSON.stringify(value);
13429
- return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
13430
- acc[key] = value[key];
13431
- return acc;
13432
- }, {}));
13433
- }
13434
- function isBlankString(value) {
13435
- return typeof value === "string" && value.trim() === "";
13436
- }
13437
- function isPlaceholderString$1(value) {
13438
- if (typeof value !== "string") return false;
13439
- const normalized = value.trim().toLowerCase();
13440
- return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
13441
- }
13442
- function pickPrimitiveValue(values) {
13443
- const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
13444
- if (meaningful.length === 0) return null;
13445
- if (typeof meaningful[0] === "boolean") {
13446
- const trueCount = meaningful.filter(Boolean).length;
13447
- return trueCount >= meaningful.length - trueCount;
13448
- }
13449
- return meaningful[0];
13450
- }
13451
- function mergePropertyValue(property, values) {
13452
- const nonNullValues = values.filter((v) => v !== null && v !== void 0);
13453
- if (nonNullValues.length === 0) return null;
13454
- if (property.type === "array") {
13455
- const concatenated = [];
13456
- const seen = /* @__PURE__ */ new Set();
13457
- for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
13458
- const key = stableKey(item);
13459
- if (!seen.has(key)) {
13460
- seen.add(key);
13461
- concatenated.push(item);
13462
- }
13463
- }
13464
- return concatenated;
13465
- }
13466
- if (property.type === "object") {
13467
- const childProperties = property.properties;
13468
- if (!childProperties) {
13469
- const mergedObj$1 = {};
13470
- for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
13471
- return mergedObj$1;
13472
- }
13473
- const mergedObj = {};
13474
- for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
13475
- return mergedObj;
13476
- }
13477
- return pickPrimitiveValue(nonNullValues);
13478
- }
13479
- /**
13480
- * Merges structured extraction outputs from multiple document chunks
13481
- * according to the schema properties.
13482
- */
13483
- function mergeExtractionResults(schema, results) {
13484
- if (results.length === 0) return {};
13485
- if (results.length === 1) return results[0];
13486
- const merged = {};
13487
- for (const [propName, propDef] of Object.entries(schema.properties)) {
13488
- if (propDef.primary && propDef.autoIncrement) continue;
13489
- merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
13490
- }
13491
- return merged;
13492
- }
13493
-
13494
- //#endregion
13495
- //#region src/core/ai-extraction/snapshot.ts
13496
- async function savePromptSnapshot(schema, aiexDir) {
13497
- const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
13498
- const outputDir = path.join(aiexDir, "extracted");
13499
- await fs.mkdir(outputDir, { recursive: true });
13500
- const fileName = `${schema.table.name}.prompt.md`;
13501
- const outputPath = path.join(outputDir, fileName);
13502
- await fs.writeFile(outputPath, content);
13503
- return outputPath;
13504
- }
13505
-
13506
- //#endregion
13507
- //#region src/core/ai-extraction/text-splitter.ts
13508
- const encoding$1 = getEncoding("cl100k_base");
13509
- const MAX_OVERLAP_RATIO = .15;
13510
- const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
13511
- const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
13512
- const LEADING_TABLE_PIPE_RE = /^\|/;
13513
- const TRAILING_TABLE_PIPE_RE = /\|$/;
13514
- function countTokens(text$1) {
13515
- return encoding$1.encode(text$1).length;
13516
- }
13517
- function calculateChunkTokenBudget(options = {}) {
13518
- const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
13519
- const modelMaxTokens = options.modelMaxTokens;
13520
- if (!modelMaxTokens) return configuredMaxTokens;
13521
- const outputReserveTokens = options.outputReserveTokens ?? 2e3;
13522
- const promptReserveTokens = options.promptReserveTokens ?? 1200;
13523
- const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
13524
- const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
13525
- return Math.max(512, Math.min(configuredMaxTokens, available));
13526
- }
13527
- function formatHeadingContext(headings) {
13528
- const active = headings.filter(Boolean);
13529
- if (active.length === 0) return "";
13530
- return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
13531
- }
13532
- function getMetadata(headings) {
13533
- return {
13534
- h1: headings[0] || void 0,
13535
- h2: headings[1] || void 0,
13536
- h3: headings[2] || void 0,
13537
- h4: headings[3] || void 0
13538
- };
13539
- }
13540
- function getHeadingPath(metadata) {
13541
- return [
13542
- metadata.h1,
13543
- metadata.h2,
13544
- metadata.h3,
13545
- metadata.h4
13546
- ].filter(Boolean);
13547
- }
13548
- function finalizeChunks(chunks, sourceText) {
13549
- let searchStart = 0;
13550
- const totalChunks = chunks.length;
13551
- return chunks.map((chunk, index) => {
13552
- const tokenCount = countTokens(chunk.pageContent);
13553
- let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
13554
- if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
13555
- const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
13556
- if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
13557
- return {
13558
- ...chunk,
13559
- chunkIndex: index,
13560
- totalChunks,
13561
- tokenCount,
13562
- headingPath: getHeadingPath(chunk.metadata),
13563
- charStart: charStart >= 0 ? charStart : void 0,
13564
- charEnd
13565
- };
13566
- });
13567
- }
13568
- function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
13569
- return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
13570
- }
13571
- function splitMarkdownTable(tableText, maxTokens) {
13572
- if (countTokens(tableText) <= maxTokens) return [tableText];
13573
- const lines = tableText.split("\n");
13574
- const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
13575
- const separatorIndex = lines.findIndex((line, index) => {
13576
- if (index <= headerIndex) return false;
13577
- const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
13578
- return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
13579
- });
13580
- if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
13581
- const prefix = lines.slice(0, headerIndex);
13582
- const header = lines[headerIndex];
13583
- const separator = lines[separatorIndex];
13584
- const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
13585
- const chunks = [];
13586
- let currentRows = [];
13587
- const buildTable = (tableRows) => {
13588
- return [
13589
- ...prefix,
13590
- header,
13591
- separator,
13592
- ...tableRows
13593
- ].join("\n");
13594
- };
13595
- for (const row of rows) {
13596
- const candidateRows = [...currentRows, row];
13597
- if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
13598
- chunks.push(buildTable(currentRows));
13599
- currentRows = [row];
13600
- } else currentRows = candidateRows;
13601
- }
13602
- if (currentRows.length > 0) chunks.push(buildTable(currentRows));
13603
- return chunks.length > 0 ? chunks : [tableText];
13604
- }
13605
- /**
13606
- * Splits text recursively using a list of separators.
13607
- * Preserves the separators when re-joining.
13608
- */
13609
- function splitTextRecursively(text$1, maxTokens, separators = [
13610
- "\n\n",
13611
- "\n",
13612
- "。",
13613
- ". ",
13614
- " "
13615
- ]) {
13616
- if (countTokens(text$1) <= maxTokens) return [text$1];
13617
- if (separators.length === 0) {
13618
- const chunks = [];
13619
- let current = "";
13620
- for (const char of text$1) if (countTokens(current + char) > maxTokens) {
13621
- chunks.push(current);
13622
- current = char;
13623
- } else current += char;
13624
- if (current) chunks.push(current);
13625
- return chunks;
13626
- }
13627
- const separator = separators[0];
13628
- const nextSeparators = separators.slice(1);
13629
- const parts = text$1.split(separator);
13630
- const result = [];
13631
- let currentChunk = [];
13632
- let currentChunkTokens = 0;
13633
- for (let i = 0; i < parts.length; i++) {
13634
- const part = parts[i];
13635
- const itemText = part + (i < parts.length - 1 ? separator : "");
13636
- const partTokens = countTokens(itemText);
13637
- if (partTokens > maxTokens) {
13638
- if (currentChunk.length > 0) {
13639
- result.push(currentChunk.join(""));
13640
- currentChunk = [];
13641
- currentChunkTokens = 0;
13642
- }
13643
- const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
13644
- for (let j = 0; j < subParts.length; j++) {
13645
- const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
13646
- result.push(finalSub);
13647
- }
13648
- } else if (currentChunkTokens + partTokens > maxTokens) {
13649
- result.push(currentChunk.join(""));
13650
- currentChunk = [itemText];
13651
- currentChunkTokens = partTokens;
13652
- } else {
13653
- currentChunk.push(itemText);
13654
- currentChunkTokens += partTokens;
13655
- }
13656
- }
13657
- if (currentChunk.length > 0) result.push(currentChunk.join(""));
13658
- return result;
13659
- }
13660
- /**
13661
- * Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
13662
- * Protects tables, list items, and code blocks from being broken.
13663
- */
13664
- function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13665
- const tokens = marked.lexer(text$1);
13666
- const chunks = [];
13667
- const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
13668
- let currentHeadings = [];
13669
- let currentChunkList = [];
13670
- let accumulatedTokens = 0;
13671
- const flushCurrentChunk = (isHeadingChange = false) => {
13672
- if (currentChunkList.length === 0) return;
13673
- const pageContent = currentChunkList.map((item) => item.text).join("");
13674
- const firstHeadings = currentChunkList[0].headings;
13675
- chunks.push({
13676
- pageContent,
13677
- metadata: getMetadata(firstHeadings)
13678
- });
13679
- if (isHeadingChange || effectiveOverlapTokens <= 0) {
13680
- currentChunkList = [];
13681
- accumulatedTokens = 0;
13682
- } else {
13683
- const overlapItems = [];
13684
- let currentOverlapTokens = 0;
13685
- for (let i = currentChunkList.length - 1; i >= 0; i--) {
13686
- const item = currentChunkList[i];
13687
- const itemTokens = countTokens(item.text);
13688
- if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
13689
- overlapItems.unshift(item);
13690
- currentOverlapTokens += itemTokens;
13691
- }
13692
- currentChunkList = [...overlapItems];
13693
- accumulatedTokens = currentOverlapTokens;
13694
- }
13695
- };
13696
- for (const token of tokens) {
13697
- if (token.type === "space") {
13698
- if (currentChunkList.length > 0) {
13699
- currentChunkList[currentChunkList.length - 1].text += token.raw;
13700
- accumulatedTokens += countTokens(token.raw);
13701
- }
13702
- continue;
13703
- }
13704
- if (token.type === "heading") {
13705
- flushCurrentChunk(true);
13706
- const depth = token.depth;
13707
- const title = token.text.trim();
13708
- currentHeadings = currentHeadings.slice(0, depth - 1);
13709
- currentHeadings[depth - 1] = title;
13710
- }
13711
- const rawText = token.raw;
13712
- if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
13713
- else {
13714
- const isAtomic = token.type === "table" || token.type === "code";
13715
- processTextBlock(rawText, currentHeadings, isAtomic);
13716
- }
13717
- }
13718
- flushCurrentChunk(true);
13719
- return finalizeChunks(chunks, text$1);
13720
- function processTextBlock(blockText, headings, isAtomic = false) {
13721
- const blockTokens = countTokens(blockText);
13722
- const contextTokens = countTokens(formatHeadingContext(headings));
13723
- const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
13724
- const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
13725
- if (blockTokens > budgetLimit) if (isAtomic) {
13726
- flushCurrentChunk(false);
13727
- const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
13728
- for (const block of atomicBlocks) {
13729
- currentChunkList.push({
13730
- text: block,
13731
- headings: [...headings]
13732
- });
13733
- accumulatedTokens = countTokens(block);
13734
- flushCurrentChunk(false);
13735
- }
13736
- } else {
13737
- flushCurrentChunk(false);
13738
- const subBlocks = splitTextRecursively(blockText, budgetLimit);
13739
- for (const sub of subBlocks) {
13740
- currentChunkList.push({
13741
- text: sub,
13742
- headings: [...headings]
13743
- });
13744
- accumulatedTokens += countTokens(sub);
13745
- if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
13746
- }
13747
- }
13748
- else {
13749
- if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
13750
- currentChunkList.push({
13751
- text: blockText,
13752
- headings: [...headings]
13753
- });
13754
- accumulatedTokens += blockTokens;
13755
- }
13756
- }
13757
- }
13758
-
13759
13515
  //#endregion
13760
13516
  //#region src/core/extraction-audit.ts
13761
13517
  const AUDIT_ID_RE = /^[\w.-]+$/;
@@ -13906,276 +13662,6 @@ function getFileHash(filePath) {
13906
13662
  });
13907
13663
  }
13908
13664
 
13909
- //#endregion
13910
- //#region src/core/ai-extraction/evidence.ts
13911
- const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
13912
- const FIELD_PATH_PREFIX_RE = /^\$\./;
13913
- function isRecord(value) {
13914
- return typeof value === "object" && value !== null && !Array.isArray(value);
13915
- }
13916
- function stableValueKey(value) {
13917
- return JSON.stringify(value);
13918
- }
13919
- function isPlaceholderString(value) {
13920
- if (typeof value !== "string") return false;
13921
- const normalized = value.trim().toLowerCase();
13922
- return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
13923
- }
13924
- function primitiveToText(value) {
13925
- if (value === null || value === void 0) return null;
13926
- if (typeof value === "string") return value.trim() || null;
13927
- if (typeof value === "number" || typeof value === "boolean") return String(value);
13928
- return null;
13929
- }
13930
- function isMeaningfulValue(value) {
13931
- return primitiveToText(value) !== null && !isPlaceholderString(value);
13932
- }
13933
- function normalizeText(value) {
13934
- return value.toLowerCase().replace(/\s+/g, " ").trim();
13935
- }
13936
- function quoteAround(text$1, start, length) {
13937
- const before = Math.max(0, start - 80);
13938
- const after = Math.min(text$1.length, start + length + 80);
13939
- return text$1.slice(before, after).replace(/\s+/g, " ").trim();
13940
- }
13941
- function findEvidence(value, chunks) {
13942
- const searchText = primitiveToText(value);
13943
- if (!searchText) return null;
13944
- const normalizedSearchText = normalizeText(searchText);
13945
- if (!normalizedSearchText) return null;
13946
- for (const chunk of chunks) {
13947
- if (normalizeText(chunk.text).indexOf(normalizedSearchText) === -1) continue;
13948
- const rawIndex = chunk.text.toLowerCase().indexOf(searchText.toLowerCase());
13949
- const quoteIndex = rawIndex >= 0 ? rawIndex : 0;
13950
- return {
13951
- chunkIndex: chunk.chunkIndex,
13952
- headingPath: chunk.headingPath,
13953
- quote: quoteAround(chunk.text, quoteIndex, searchText.length)
13954
- };
13955
- }
13956
- return null;
13957
- }
13958
- function addEvidenceForProperty(fields, path$1, property, value, chunks) {
13959
- if (property.type === "object" && property.properties) {
13960
- const record = isRecord(value) ? value : {};
13961
- for (const [childName, childProperty] of Object.entries(property.properties)) addEvidenceForProperty(fields, `${path$1}.${childName}`, childProperty, record[childName], chunks);
13962
- return;
13963
- }
13964
- if (property.type === "array") {
13965
- if (!Array.isArray(value) || value.length === 0) {
13966
- fields.push({
13967
- fieldPath: path$1,
13968
- status: "missing",
13969
- value: null,
13970
- confidence: 0,
13971
- note: "Array field is empty or missing."
13972
- });
13973
- return;
13974
- }
13975
- value.forEach((item, index) => {
13976
- if (property.items?.type === "object" && property.items.properties) {
13977
- const record = isRecord(item) ? item : {};
13978
- for (const [childName, childProperty] of Object.entries(property.items.properties)) addEvidenceForProperty(fields, `${path$1}[${index}].${childName}`, childProperty, record[childName], chunks);
13979
- } else addPrimitiveEvidence(fields, `${path$1}[${index}]`, item, chunks);
13980
- });
13981
- return;
13982
- }
13983
- addPrimitiveEvidence(fields, path$1, value, chunks);
13984
- }
13985
- function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
13986
- if (value === null || value === void 0 || value === "") {
13987
- fields.push({
13988
- fieldPath,
13989
- status: "missing",
13990
- value: null,
13991
- confidence: 0,
13992
- note: "Field is null or empty in final extraction."
13993
- });
13994
- return;
13995
- }
13996
- const found = findEvidence(value, chunks);
13997
- if (found) {
13998
- fields.push({
13999
- fieldPath,
14000
- status: "found",
14001
- value,
14002
- confidence: .8,
14003
- ...found
14004
- });
14005
- return;
14006
- }
14007
- fields.push({
14008
- fieldPath,
14009
- status: "inferred",
14010
- value,
14011
- confidence: .35,
14012
- note: "Final value was not found verbatim in the available source text."
14013
- });
14014
- }
14015
- function sourceChunksFromText(text$1) {
14016
- return text$1 ? [{
14017
- text: text$1,
14018
- chunkIndex: 0,
14019
- headingPath: []
14020
- }] : [];
14021
- }
14022
- function sourceChunksFromMarkdownChunks(chunks) {
14023
- return chunks.map((chunk, index) => ({
14024
- text: chunk.pageContent,
14025
- chunkIndex: chunk.chunkIndex ?? index,
14026
- headingPath: chunk.headingPath ?? []
14027
- }));
14028
- }
14029
- function getPathParts(fieldPath) {
14030
- return fieldPath.replace(FIELD_PATH_PREFIX_RE, "").split(".").filter(Boolean);
14031
- }
14032
- function getValueAtPath$1(data, fieldPath) {
14033
- let current = data;
14034
- for (const part of getPathParts(fieldPath)) {
14035
- if (!isRecord(current)) return void 0;
14036
- current = current[part];
14037
- }
14038
- return current;
14039
- }
14040
- function setValueAtPath(data, fieldPath, value) {
14041
- const parts = getPathParts(fieldPath);
14042
- let current = data;
14043
- for (let i = 0; i < parts.length - 1; i++) {
14044
- const part = parts[i];
14045
- if (!isRecord(current[part])) current[part] = {};
14046
- current = current[part];
14047
- }
14048
- current[parts[parts.length - 1]] = value;
14049
- }
14050
- function collectScalarFields(fields, fieldPath, property) {
14051
- if (property.type === "object" && property.properties) {
14052
- for (const [name$1, childProperty] of Object.entries(property.properties)) collectScalarFields(fields, `${fieldPath}.${name$1}`, childProperty);
14053
- return;
14054
- }
14055
- if (property.type !== "array") fields.push({
14056
- fieldPath,
14057
- property
14058
- });
14059
- }
14060
- function candidateScore(candidate) {
14061
- return (candidate.status === "found" ? 100 : 0) + Math.round(candidate.confidence * 10) + candidate.chunkIndex;
14062
- }
14063
- function selectCandidatesForField(candidates) {
14064
- if (candidates.length === 0) return null;
14065
- candidates.sort((a, b) => candidateScore(b) - candidateScore(a));
14066
- const selected = candidates[0];
14067
- selected.selected = true;
14068
- for (const candidate of candidates.slice(1)) {
14069
- candidate.selected = false;
14070
- candidate.rejectionReason = "Lower evidence score or earlier chunk position.";
14071
- }
14072
- const distinctValues = /* @__PURE__ */ new Map();
14073
- for (const candidate of candidates) distinctValues.set(stableValueKey(candidate.value), candidate.value);
14074
- if (distinctValues.size <= 1) return null;
14075
- return {
14076
- fieldPath: selected.fieldPath,
14077
- selectedValue: selected.value,
14078
- rejectedValues: candidates.slice(1).map((candidate) => candidate.value),
14079
- candidates: [...candidates]
14080
- };
14081
- }
14082
- function buildCandidateMergeReport(input) {
14083
- const scalarFields = [];
14084
- for (const [name$1, property] of Object.entries(input.schema.properties)) {
14085
- if (property.primary && property.autoIncrement) continue;
14086
- collectScalarFields(scalarFields, `$.${name$1}`, property);
14087
- }
14088
- const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
14089
- const candidatesByPath = /* @__PURE__ */ new Map();
14090
- for (const { fieldPath } of scalarFields) for (let chunkIndex = 0; chunkIndex < input.chunkResults.length; chunkIndex++) {
14091
- const value = getValueAtPath$1(input.chunkResults[chunkIndex], fieldPath);
14092
- if (!isMeaningfulValue(value)) continue;
14093
- const sourceChunk = sourceChunks[chunkIndex] ?? {
14094
- text: "",
14095
- chunkIndex
14096
- };
14097
- const found = findEvidence(value, [sourceChunk]);
14098
- const candidate = {
14099
- fieldPath,
14100
- value,
14101
- chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
14102
- headingPath: sourceChunk.headingPath,
14103
- status: found ? "found" : "inferred",
14104
- quote: found?.quote,
14105
- confidence: found ? .85 : .35
14106
- };
14107
- const candidates = candidatesByPath.get(fieldPath) ?? [];
14108
- candidates.push(candidate);
14109
- candidatesByPath.set(fieldPath, candidates);
14110
- }
14111
- const allCandidates = [];
14112
- const conflicts = [];
14113
- for (const candidates of candidatesByPath.values()) {
14114
- const conflict = selectCandidatesForField(candidates);
14115
- allCandidates.push(...candidates);
14116
- if (conflict) conflicts.push(conflict);
14117
- }
14118
- return {
14119
- candidates: allCandidates,
14120
- conflicts
14121
- };
14122
- }
14123
- function applySelectedCandidates(data, report) {
14124
- const merged = structuredClone(data);
14125
- for (const candidate of report.candidates) if (candidate.selected) setValueAtPath(merged, candidate.fieldPath, candidate.value);
14126
- return merged;
14127
- }
14128
- function buildExtractionEvidence(input) {
14129
- const data = isRecord(input.data) ? input.data : {};
14130
- const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
14131
- const fields = [];
14132
- for (const [name$1, property] of Object.entries(input.schema.properties)) {
14133
- if (property.primary && property.autoIncrement) continue;
14134
- addEvidenceForProperty(fields, `$.${name$1}`, property, data[name$1], chunks);
14135
- }
14136
- const inferredIssues = fields.filter((field) => field.status === "inferred").map((field) => ({
14137
- fieldPath: field.fieldPath,
14138
- message: field.note ?? "Field value lacks source evidence."
14139
- }));
14140
- const conflictIssues = (input.candidateReport?.conflicts ?? []).map((conflict) => ({
14141
- fieldPath: conflict.fieldPath,
14142
- message: "Multiple chunk candidates disagree for this field."
14143
- }));
14144
- const issues = [...inferredIssues, ...conflictIssues];
14145
- return {
14146
- coverage: {
14147
- path: input.outputPath ? evidencePathForOutput(input.outputPath) : void 0,
14148
- fieldCount: fields.length,
14149
- evidenceCount: fields.filter((field) => field.status === "found").length,
14150
- foundCount: fields.filter((field) => field.status === "found").length,
14151
- missingCount: fields.filter((field) => field.status === "missing").length,
14152
- inferredCount: fields.filter((field) => field.status === "inferred").length,
14153
- conflictCount: input.candidateReport?.conflicts.length ?? 0,
14154
- issueCount: issues.length
14155
- },
14156
- fields,
14157
- candidates: input.candidateReport?.candidates,
14158
- conflicts: input.candidateReport?.conflicts,
14159
- issues
14160
- };
14161
- }
14162
- function evidencePathForOutput(outputPath) {
14163
- return outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
14164
- }
14165
- async function writeExtractionEvidence(input) {
14166
- const report = buildExtractionEvidence(input);
14167
- const evidencePath = evidencePathForOutput(input.outputPath);
14168
- report.coverage.path = evidencePath;
14169
- await writeFile(evidencePath, report, {
14170
- spaces: 2,
14171
- EOL: "\n"
14172
- });
14173
- return {
14174
- ...report.coverage,
14175
- path: path.resolve(evidencePath)
14176
- };
14177
- }
14178
-
14179
13665
  //#endregion
14180
13666
  //#region src/core/notion-sink.ts
14181
13667
  const RICH_TEXT_LIMIT = 2e3;
@@ -14461,36 +13947,6 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
14461
13947
  }
14462
13948
  }
14463
13949
 
14464
- //#endregion
14465
- //#region src/core/ai-extraction/transcriber.ts
14466
- const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
14467
- async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
14468
- const provider = createOpenAICompatible({
14469
- baseURL,
14470
- name: "openai-compatible",
14471
- apiKey
14472
- });
14473
- const buffer = await fs.readFile(imagePath);
14474
- const effectiveTimeout = timeoutMs ?? 3e5;
14475
- return {
14476
- text: (await generateText({
14477
- model: provider.chatModel(modelName),
14478
- messages: [{
14479
- role: "user",
14480
- content: [{
14481
- type: "text",
14482
- text: TRANSCRIPTION_PROMPT
14483
- }, {
14484
- type: "image",
14485
- image: buffer
14486
- }]
14487
- }],
14488
- abortSignal: AbortSignal.timeout(effectiveTimeout)
14489
- })).text,
14490
- modelName
14491
- };
14492
- }
14493
-
14494
13950
  //#endregion
14495
13951
  //#region src/core/file-constants.ts
14496
13952
  const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
@@ -14824,6 +14280,14 @@ function createPdfConverter(config) {
14824
14280
  return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
14825
14281
  }
14826
14282
  if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
14283
+ if (config.converter === "markitdown") {
14284
+ const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
14285
+ return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
14286
+ }
14287
+ if (config.converter === "marker") {
14288
+ const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
14289
+ return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
14290
+ }
14827
14291
  if (config.converter === "external") {
14828
14292
  if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
14829
14293
  return new ExternalCommandPdfConverter("external", config.external);
@@ -14851,7 +14315,7 @@ const FILE_PART_EXTENSIONS = new Set([
14851
14315
  "svg"
14852
14316
  ]);
14853
14317
  const PDF_EXT_RE = /\.pdf$/i;
14854
- async function readExtractFileInput(filePath, aiConfig) {
14318
+ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
14855
14319
  const stat = fs$1.statSync(filePath);
14856
14320
  if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
14857
14321
  size: bytesToMB(stat.size).toFixed(1),
@@ -14860,22 +14324,15 @@ async function readExtractFileInput(filePath, aiConfig) {
14860
14324
  }));
14861
14325
  const ext = path.extname(filePath).toLowerCase().replace(".", "");
14862
14326
  if (FILE_PART_EXTENSIONS.has(ext)) {
14863
- const image = aiConfig?.image;
14864
- if (image?.imageConversion === "vision" && image.imageModelName && aiConfig) {
14865
- const baseURL = image.visionBaseURL || aiConfig.provider.baseURL;
14866
- const apiKey = image.visionApiKey || aiConfig.provider.apiKey;
14867
- const timeout = (aiConfig.provider.timeout ?? 300) * 1e3;
14868
- try {
14869
- const result$1 = await transcribeImageWithVision(filePath, baseURL, apiKey, image.imageModelName, timeout);
14870
- consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
14871
- return { text: result$1.text };
14872
- } catch {
14873
- consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
14874
- }
14327
+ if (shouldUseImageOcrFallback(aiConfig, modelOverride)) {
14328
+ const result = await recognizeImageText(filePath, aiConfig?.image);
14329
+ consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14330
+ return { text: result.text };
14875
14331
  }
14876
- const result = await recognizeImageText(filePath, aiConfig?.image);
14877
- consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
14878
- return { text: result.text };
14332
+ return {
14333
+ text: "",
14334
+ filePath
14335
+ };
14879
14336
  }
14880
14337
  if (ext === "pdf") {
14881
14338
  const buffer = await fs.readFile(filePath);
@@ -14996,21 +14453,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14996
14453
 
14997
14454
  //#endregion
14998
14455
  //#region src/core/extract-runner.ts
14999
- const encoding = getEncoding("cl100k_base");
15000
14456
  const JSON_EXT_RE$1 = /\.json$/;
15001
- async function limitConcurrency(concurrency, items, fn) {
15002
- const results = Array.from({ length: items.length });
15003
- let nextIndex = 0;
15004
- async function worker() {
15005
- while (nextIndex < items.length) {
15006
- const currentIndex = nextIndex++;
15007
- results[currentIndex] = await fn(items[currentIndex], currentIndex);
15008
- }
15009
- }
15010
- const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
15011
- await Promise.all(workers);
15012
- return results;
15013
- }
15014
14457
  async function ensureDatabaseReady(dbPath, schema) {
15015
14458
  try {
15016
14459
  await fs.access(dbPath);
@@ -15082,146 +14525,34 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15082
14525
  }
15083
14526
  const s = spinner();
15084
14527
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
15085
- const maxTokens = calculateChunkTokenBudget({
15086
- configuredMaxTokens: aiConfig.extraction?.maxTokens ?? 8e3,
15087
- modelMaxTokens: modelOverride?.capabilities.maxTokens
15088
- });
15089
- const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
15090
- const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
15091
- if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
15092
- length: totalTokens,
15093
- limit: maxTokens
15094
- }));
15095
- const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
15096
- pageContent: text$1 ?? "",
15097
- metadata: {},
15098
- chunkIndex: 0,
15099
- totalChunks: 1,
15100
- tokenCount: totalTokens,
15101
- headingPath: [],
15102
- charStart: 0,
15103
- charEnd: text$1?.length ?? 0
15104
- }];
15105
- if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
15106
- const chunkResults = Array.from({ length: processedDocs.length });
15107
- const accumulatedTokens = {
15108
- prompt: 0,
15109
- completion: 0,
15110
- total: 0
15111
- };
15112
- let success = true;
15113
- let errorMsg = "";
15114
- const extractionTasks = processedDocs.map((doc, i) => {
15115
- return async () => {
15116
- if (!success) return;
15117
- const headings = doc.headingPath?.length ? doc.headingPath : [
15118
- doc.metadata.h1,
15119
- doc.metadata.h2,
15120
- doc.metadata.h3,
15121
- doc.metadata.h4
15122
- ].filter(Boolean);
15123
- let chunkText = doc.pageContent;
15124
- if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
15125
- const chunkResult = await extractStructuredData({
15126
- config: aiConfig,
15127
- schema: schemaLoad.schema,
15128
- text: chunkText,
15129
- aiexDir,
15130
- modelOverride,
15131
- onRetry(info) {
15132
- if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
15133
- current: i + 1,
15134
- total: processedDocs.length,
15135
- code: info.statusCode,
15136
- delay: info.delayMs / 1e3,
15137
- attempt: info.attempt,
15138
- max: info.maxRetries
15139
- }));
15140
- }
15141
- });
15142
- if (!chunkResult.success) {
15143
- success = false;
15144
- errorMsg = chunkResult.error || t("common.unknownError");
15145
- if (!options?.quiet) {
15146
- s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
15147
- consola.error(errorMsg);
15148
- }
15149
- return;
15150
- }
15151
- if (chunkResult.data) chunkResults[i] = chunkResult.data;
15152
- if (chunkResult.tokensUsed) {
15153
- accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
15154
- accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
15155
- accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
15156
- }
15157
- };
15158
- });
15159
- const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
15160
- if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
15161
- current: 1,
15162
- total: processedDocs.length
15163
- }));
15164
- try {
15165
- await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
15166
- if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
15167
- current: idx + 1,
15168
- total: processedDocs.length
15169
- }));
15170
- await task();
15171
- });
15172
- } catch (e) {
15173
- success = false;
15174
- errorMsg = e instanceof Error ? e.message : String(e);
15175
- }
15176
- if (!success) return {
15177
- success: false,
15178
- error: errorMsg
15179
- };
15180
- const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
15181
- const candidateReport = buildCandidateMergeReport({
14528
+ const result = await extractStructuredData({
14529
+ config: aiConfig,
15182
14530
  schema: schemaLoad.schema,
15183
- chunkResults: successfulChunkResults,
15184
- chunks: processedDocs
14531
+ text: text$1 ?? "",
14532
+ aiexDir,
14533
+ file: filePath,
14534
+ modelOverride,
14535
+ onRetry(info) {
14536
+ if (!options?.quiet) s.message(t("command.extract.file.extractRetry", {
14537
+ code: info.statusCode,
14538
+ delay: info.delayMs / 1e3,
14539
+ attempt: info.attempt,
14540
+ max: info.maxRetries
14541
+ }));
14542
+ }
15185
14543
  });
15186
- const mergedData = applySelectedCandidates(mergeExtractionResults(schemaLoad.schema, successfulChunkResults), candidateReport);
15187
- const validation = validateExtractedData(schemaLoad.schema, mergedData);
15188
- if (!validation.success) {
15189
- const valError = validation.error || "Merged data validation failed";
14544
+ if (!result.success) {
15190
14545
  if (!options?.quiet) {
15191
- s.stop(t("command.extract.file.validationFail"));
15192
- consola.error(valError);
14546
+ s.stop(t("command.extract.file.extractFail"));
14547
+ consola.error(result.error || t("common.unknownError"));
15193
14548
  }
15194
14549
  return {
15195
14550
  success: false,
15196
- error: valError
14551
+ error: result.error || t("common.unknownError")
15197
14552
  };
15198
14553
  }
15199
- const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
15200
- await fs.mkdir(outputDir, { recursive: true });
15201
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
15202
- const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
15203
- const outputPath = path.join(outputDir, outputFileName);
15204
- await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
15205
- const result = {
15206
- success: true,
15207
- data: mergedData,
15208
- tokensUsed: accumulatedTokens,
15209
- outputPath,
15210
- evidenceSummary: await writeExtractionEvidence({
15211
- schema: schemaLoad.schema,
15212
- data: mergedData,
15213
- outputPath,
15214
- chunks: processedDocs,
15215
- candidateReport
15216
- })
15217
- };
15218
14554
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
15219
14555
  if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
15220
- if (result.evidenceSummary && !options?.quiet) {
15221
- const summary = result.evidenceSummary;
15222
- const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
15223
- consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
15224
- }
15225
14556
  if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
15226
14557
  prompt: result.tokensUsed.prompt,
15227
14558
  completion: result.tokensUsed.completion,
@@ -15250,7 +14581,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15250
14581
  outputPath: result.outputPath,
15251
14582
  data: result.data,
15252
14583
  tablesInserted: insertResult.tablesInserted,
15253
- evidenceSummary: result.evidenceSummary,
15254
14584
  tokensUsed: result.tokensUsed
15255
14585
  };
15256
14586
  } else {
@@ -15277,7 +14607,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15277
14607
  success: true,
15278
14608
  outputPath: result.outputPath,
15279
14609
  data: result.data,
15280
- evidenceSummary: result.evidenceSummary,
15281
14610
  tokensUsed: result.tokensUsed
15282
14611
  };
15283
14612
  }
@@ -15342,9 +14671,13 @@ async function runAuditedExtraction(options) {
15342
14671
  });
15343
14672
  try {
15344
14673
  let text$1 = "";
15345
- if (source.type === "file") text$1 = (await readExtractFileInput(source.filePath, aiConfig)).text;
15346
- else text$1 = source.text;
15347
- const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, source.type === "file" ? source.filePath : void 0, modelOverride, {
14674
+ let filePath;
14675
+ if (source.type === "file") {
14676
+ const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
14677
+ text$1 = input.text;
14678
+ filePath = input.filePath;
14679
+ } else text$1 = source.text;
14680
+ const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
15348
14681
  quiet,
15349
14682
  insert
15350
14683
  });
@@ -15386,7 +14719,6 @@ async function runAuditedExtraction(options) {
15386
14719
  outputName: updated.outputName,
15387
14720
  tablesInserted: updated.tablesInserted,
15388
14721
  notionPages: updated.notionPages,
15389
- evidenceSummary: r.evidenceSummary,
15390
14722
  tokensUsed: updated.tokensUsed,
15391
14723
  auditId: updated.id,
15392
14724
  fileHash
@@ -16514,7 +15846,6 @@ function aiRoutes(config) {
16514
15846
  //#endregion
16515
15847
  //#region src/core/data-service.ts
16516
15848
  const FILE_REGEX = /\.json$/;
16517
- const EVIDENCE_FILE_SUFFIX = ".evidence.json";
16518
15849
  const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
16519
15850
  const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
16520
15851
  const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
@@ -16530,24 +15861,6 @@ function getAuditNotionStatus(record) {
16530
15861
  if (record.status === "failed") return "failed";
16531
15862
  return "not_synced";
16532
15863
  }
16533
- async function readEvidenceSummary(extractedDir, outputName) {
16534
- const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
16535
- try {
16536
- const coverage = (await readFile(evidencePath))?.coverage;
16537
- if (!coverage || typeof coverage !== "object") return void 0;
16538
- return {
16539
- path: evidencePath,
16540
- fieldCount: Number(coverage.fieldCount) || 0,
16541
- evidenceCount: Number(coverage.evidenceCount) || 0,
16542
- foundCount: Number(coverage.foundCount) || 0,
16543
- missingCount: Number(coverage.missingCount) || 0,
16544
- inferredCount: Number(coverage.inferredCount) || 0,
16545
- issueCount: Number(coverage.issueCount) || 0
16546
- };
16547
- } catch {
16548
- return;
16549
- }
16550
- }
16551
15864
  async function getRowExtractionActions(aiexDir, tableName) {
16552
15865
  const actions = /* @__PURE__ */ new Map();
16553
15866
  const auditRecords = await listExtractionAuditRecords(aiexDir);
@@ -16575,7 +15888,7 @@ async function listExtractions(config) {
16575
15888
  const aiexDir = path.dirname(config.schemaPath);
16576
15889
  const extractedDir = path.join(aiexDir, "extracted");
16577
15890
  await fs.mkdir(extractedDir, { recursive: true });
16578
- const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
15891
+ const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
16579
15892
  const auditRecords = await listExtractionAuditRecords(aiexDir);
16580
15893
  const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
16581
15894
  const records = [];
@@ -16594,7 +15907,6 @@ async function listExtractions(config) {
16594
15907
  timestamp,
16595
15908
  fileSize: stat.size,
16596
15909
  modifiedAt: stat.mtime.toISOString(),
16597
- evidenceSummary: await readEvidenceSummary(extractedDir, file),
16598
15910
  notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
16599
15911
  notionPages,
16600
15912
  notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
@@ -16774,7 +16086,6 @@ async function retryNotionSync(config, fileName) {
16774
16086
 
16775
16087
  //#endregion
16776
16088
  //#region src/server/routes/data.ts
16777
- const JSON_FILE_SUFFIX_RE = /\.json$/;
16778
16089
  const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
16779
16090
  const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
16780
16091
  const tableQuerySchema = z.object({
@@ -16827,22 +16138,10 @@ function dataRoutes(config) {
16827
16138
  const filePath = path.join(extractedDir, name$1);
16828
16139
  try {
16829
16140
  const content = await fs.readFile(filePath, "utf-8");
16830
- const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
16831
- let evidenceSummary;
16832
- try {
16833
- const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
16834
- evidenceSummary = evidence?.coverage ? {
16835
- ...evidence.coverage,
16836
- path: evidencePath
16837
- } : void 0;
16838
- } catch {
16839
- evidenceSummary = void 0;
16840
- }
16841
16141
  return c.json({
16842
16142
  success: true,
16843
16143
  content,
16844
- name: name$1,
16845
- evidenceSummary
16144
+ name: name$1
16846
16145
  });
16847
16146
  } catch {
16848
16147
  return c.json({ error: t("server.extractionNotFound") }, 404);
@@ -16986,7 +16285,6 @@ function extractRoutes(config) {
16986
16285
  outputName: result.outputName,
16987
16286
  tablesInserted: result.tablesInserted,
16988
16287
  notionPages: result.notionPages,
16989
- evidenceSummary: result.evidenceSummary,
16990
16288
  tokensUsed: result.tokensUsed,
16991
16289
  auditId: result.auditId
16992
16290
  }, 200);
@@ -17054,7 +16352,6 @@ function extractRoutes(config) {
17054
16352
  outputName: result.outputName,
17055
16353
  tablesInserted: result.tablesInserted,
17056
16354
  notionPages: result.notionPages,
17057
- evidenceSummary: result.evidenceSummary,
17058
16355
  tokensUsed: result.tokensUsed,
17059
16356
  auditId: result.auditId
17060
16357
  }, 200);