npm - aiex-cli - Versions diffs - 0.0.5-beta.4 → 0.0.5-beta.6 - Mend

aiex-cli 0.0.5-beta.4 → 0.0.5-beta.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/README.md +4 -4
package/dist/cli.mjs +785 -460
package/dist/{doctor-collector-Cv7RArla.mjs → doctor-collector-BpqhXNcO.mjs} +30 -92
package/dist/index.mjs +1 -1
package/dist/web/assets/AISettings-sVI4PTNB.js +264 -0
package/dist/web/assets/{DataBrowser-GAA-pGq0.js → DataBrowser-BGkZb9FV.js} +1 -1
package/dist/web/assets/{ExtractionViewer-BhhWrBs2.js → ExtractionViewer-DNrkSECj.js} +1 -1
package/dist/web/assets/{api-client-b4ZBXpNH.js → api-client-gQAAOw0v.js} +1 -1
package/dist/web/assets/{index-CKV2X6sS.js → index-BQKZKzzP.js} +3 -3
package/dist/web/assets/index-BU58oIRd.css +2 -0
package/dist/web/index.html +3 -3
package/dist/{zh-CN-CyL-61Ow.mjs → zh-CN-DkillGHx.mjs} +11 -24
package/package.json +3 -1
package/dist/web/assets/AISettings-BlyTFIIy.js +0 -272
package/dist/web/assets/index-Csdgio76.css +0 -2

package/dist/cli.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-Cv7RArla.mjs";
+import { C as name, D as doctorDiagnosticsTableRows, O as formatDoctorDiagnosticsJson, S as description, T as version, _ as PLACEHOLDER_SCHEMA, a as parseJsonSchema, b as createConfig, c as recognizeImageText, d as getDefaultAIConfig, f as readAIConfig, g as DEFAULT_PROMPT_CONFIG, h as DEFAULT_MINERU_CONFIG, i as JsonSchemaDefinitionSchema, l as initI18n, m as DEFAULT_MINERU_API_CONFIG, n as createMigrationConfig, o as toSnakeCase, p as writeAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as t, v as PLACEHOLDER_TEXT, w as package_default, x as seedConfig, y as AIConfigSchema } from "./doctor-collector-BpqhXNcO.mjs";
 import { createRequire } from "node:module";
 import fs from "node:fs/promises";
 import os from "node:os";
@@ -17,13 +17,14 @@ import Database from "better-sqlite3";
 import pc from "picocolors";
 import { Buffer } from "node:buffer";
 import * as XLSX from "xlsx";
+import { getEncoding } from "js-tiktoken";
 import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
 import { APICallError, Output, generateText, jsonSchema } from "ai";
 import pRetry from "p-retry";
-import mime from "mime";
 import { jsonrepair } from "jsonrepair";
 import { LangfuseSpanProcessor } from "@langfuse/otel";
 import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { marked } from "marked";
 import crypto from "node:crypto";
 import { Client, extractNotionId } from "@notionhq/client";
 import { execa } from "execa";
@@ -12859,28 +12860,6 @@ async function withRetry(fn, onRetry, maxRetries = 5) {
 	});
 }
-//#endregion
-//#region src/core/ai-extraction/file-utils.ts
-function detectMimeType(filePath) {
-	return mime.getType(filePath) ?? "application/octet-stream";
-}
-async function readFilePart(filePath) {
-	const mimeStr = detectMimeType(filePath);
-	const buffer = await fs.readFile(filePath);
-	const name$1 = path.basename(filePath);
-	if (mimeStr.startsWith("image/")) return {
-		type: "image",
-		image: buffer,
-		mimeType: mimeStr
-	};
-	return {
-		type: "file",
-		data: buffer,
-		mediaType: mimeStr,
-		filename: name$1
-	};
-}
 //#endregion
 //#region src/core/ai-extraction/json-utils.ts
 function parseJsonLike(text$1) {
@@ -12941,25 +12920,10 @@ function filterCompatible(models, inputTokens, outputTokens) {
 	});
 }
 function selectModel(input) {
-	const { models, isImage, fileName, inputTokens, outputTokens } = input;
+	const { models, inputTokens, outputTokens } = input;
 	if (models.length === 0) throw new Error(t("errors.ai.noModels"));
 	let candidates = filterCompatible(models, inputTokens, outputTokens);
 	if (candidates.length === 0) candidates = models;
-	if (isImage) {
-		const visionModel = candidates.find((m) => m.capabilities.vision);
-		if (!visionModel) {
-			const hint = fileName ? ` (${fileName})` : "";
-			const msg = inputTokens ? t("errors.ai.noVisionModelContext", {
-				tokens: inputTokens,
-				hint
-			}) : t("errors.ai.noVisionModel", { hint });
-			throw new Error(msg + t("errors.ai.addSuitableModel"));
-		}
-		return {
-			name: visionModel.name,
-			capabilities: visionModel.capabilities
-		};
-	}
 	const soModel = candidates.find((m) => m.capabilities.structuredOutput);
 	if (soModel) return {
 		name: soModel.name,
@@ -12973,36 +12937,46 @@ function selectModel(input) {
 //#endregion
 //#region src/core/ai-extraction/prompt-generator.ts
-function propertyToDescription(name$1, prop, indent = "") {
+const CAMEL_CASE_BOUNDARY_RE = /([a-z0-9])([A-Z])/g;
+const IDENTIFIER_SEPARATOR_RE = /[\s_-]+/;
+function splitIdentifier(name$1) {
+	return name$1.replace(CAMEL_CASE_BOUNDARY_RE, "$1 $2").split(IDENTIFIER_SEPARATOR_RE).map((part) => part.trim().toLowerCase()).filter(Boolean);
+}
+function propertyToDescription(name$1, prop, indent = "", required = false) {
 	const lines = [];
 	let typeStr = prop.type;
 	if (prop.type === "array" && prop.items) typeStr = `array of ${prop.items.type}`;
-	lines.push(`${indent}- ${name$1}: ${typeStr}`);
+	lines.push(`${indent}- ${name$1}: ${typeStr}${required ? " (required)" : ""}`);
+	const terms = splitIdentifier(name$1);
+	if (terms.length > 1) lines.push(`${indent}  search terms: ${terms.join(", ")}`);
+	if (prop.description) lines.push(`${indent}  description: ${prop.description}`);
 	if (prop.minLength !== void 0 || prop.maxLength !== void 0) lines.push(`${indent}  length: ${prop.minLength ?? 0} - ${prop.maxLength ?? "unlimited"}`);
+	if (prop.minimum !== void 0 || prop.maximum !== void 0) lines.push(`${indent}  range: ${prop.minimum ?? "-∞"} - ${prop.maximum ?? "+∞"}`);
 	if (prop.format) lines.push(`${indent}  format: ${prop.format}`);
 	if (prop.unique) lines.push(`${indent}  unique: true`);
 	if (prop.default !== void 0) lines.push(`${indent}  default: ${JSON.stringify(prop.default)}`);
 	return lines.join("\n");
 }
-function nestedPropertyToDescription(name$1, prop, indent = "") {
+function nestedPropertyToDescription(name$1, prop, indent = "", requiredFields = []) {
 	const lines = [];
+	const isRequired = requiredFields.includes(name$1);
 	if (prop.nested?.enabled && prop.type === "object") {
 		const relation = prop.nested.relation || "has-one";
-		lines.push(`${indent}- ${name$1}: object (related table, ${relation})`);
-		if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}  `));
+		lines.push(`${indent}- ${name$1}: object (related table, ${relation})${isRequired ? " (required)" : ""}`);
+		if (prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}  `, prop.required ?? []));
 		return lines.join("\n");
 	}
 	if (prop.type === "array" && prop.items?.nested?.enabled) {
 		const relation = prop.items.nested.relation || "has-many";
-		lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})`);
-		if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}  `));
+		lines.push(`${indent}- ${name$1}: array of object (related table, ${relation})${isRequired ? " (required)" : ""}`);
+		if (prop.items.properties) for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}  `, prop.items.required ?? []));
 		return lines.join("\n");
 	}
-	lines.push(propertyToDescription(name$1, prop, indent));
-	if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}  `));
+	lines.push(propertyToDescription(name$1, prop, indent, isRequired));
+	if (prop.type === "object" && prop.properties) for (const [childName, childProp] of Object.entries(prop.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}  `, prop.required ?? []));
 	if (prop.type === "array" && prop.items?.properties && !prop.items?.nested?.enabled) {
 		lines.push(`${indent}  item fields:`);
-		for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}    `));
+		for (const [childName, childProp] of Object.entries(prop.items.properties)) lines.push(nestedPropertyToDescription(childName, childProp, `${indent}    `, prop.items.required ?? []));
 	}
 	return lines.join("\n");
 }
@@ -13014,7 +12988,7 @@ function schemaToDescription(schema) {
 	lines.push("Fields:");
 	for (const [name$1, prop] of Object.entries(schema.properties)) {
 		const property = prop;
-		lines.push(nestedPropertyToDescription(name$1, property));
+		lines.push(nestedPropertyToDescription(name$1, property, "", schema.required ?? []));
 	}
 	if (schema.examples && schema.examples.length > 0) {
 		lines.push("");
@@ -13059,33 +13033,6 @@ function generatePromptSnapshot(schema, promptConfig = DEFAULT_PROMPT_CONFIG) {
 	].join("\n");
 }
-//#endregion
-//#region src/core/ai-extraction/snapshot.ts
-const SYSTEM_PROMPT_REGEX = /## System Prompt\n([\s\S]*?)(?=## User Prompt|$)/;
-const USER_PROMPT_REGEX = /## User Prompt Template\n([\s\S]*)$/;
-async function loadPromptSnapshot(aiexDir, tableName) {
-	const snapshotPath = path.join(aiexDir, "extracted", `${tableName}.prompt.md`);
-	try {
-		const content = await fs.readFile(snapshotPath, "utf-8");
-		const systemMatch = content.match(SYSTEM_PROMPT_REGEX);
-		const userMatch = content.match(USER_PROMPT_REGEX);
-		if (systemMatch && userMatch) return {
-			system: systemMatch[1].trim(),
-			user: userMatch[1].trim()
-		};
-	} catch {}
-	return null;
-}
-async function savePromptSnapshot(schema, aiexDir) {
-	const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
-	const outputDir = path.join(aiexDir, "extracted");
-	await fs.mkdir(outputDir, { recursive: true });
-	const fileName = `${schema.table.name}.prompt.md`;
-	const outputPath = path.join(outputDir, fileName);
-	await fs.writeFile(outputPath, content);
-	return outputPath;
-}
 //#endregion
 //#region src/core/ai-extraction/telemetry.ts
 let langfuseInitialized = false;
@@ -13128,7 +13075,7 @@ function propertyToExtractionSchema(property) {
 	}
 	return { type: nullableType(property.type) };
 }
-function isRecord$1(value) {
+function isRecord$2(value) {
 	return typeof value === "object" && value !== null && !Array.isArray(value);
 }
 function schemaToExtractionOutputSchema(schema) {
@@ -13166,7 +13113,7 @@ function validatePropertyValue(path$1, property, value, issues) {
 			}
 			return;
 		case "object":
-			if (!isRecord$1(value)) {
+			if (!isRecord$2(value)) {
 				issues.push(`${path$1}: expected object or null`);
 				return;
 			}
@@ -13189,7 +13136,7 @@ function validateProperties(basePath, properties, data, issues) {
 	}
 }
 function validateExtractedData(schema, data) {
-	if (!isRecord$1(data)) return {
+	if (!isRecord$2(data)) return {
 		success: false,
 		error: "Extracted data must be a JSON object."
 	};
@@ -13206,13 +13153,11 @@ function validateExtractedData(schema, data) {
 //#region src/core/ai-extraction/extractor.ts
 const OPENAI_COMPATIBLE_PROVIDER_NAME = "openai-compatible";
 async function extractStructuredData(input) {
-	const { config, schema, text: text$1, aiexDir, file, modelOverride } = input;
+	const { config, schema, text: text$1, modelOverride } = input;
 	if (!config.provider.apiKey) return {
 		success: false,
 		error: t("errors.ai.apiKeyMissing")
 	};
-	const useFileContent = !!file;
-	const isImageFile = useFileContent && detectMimeType(file).startsWith("image/");
 	const inputTokens = text$1 ? Math.ceil(text$1.length / 2) : void 0;
 	const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
 	const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
@@ -13220,8 +13165,6 @@ async function extractStructuredData(input) {
 	try {
 		selected = modelOverride ?? selectModel({
 			models: config.provider.models,
-			isImage: isImageFile,
-			fileName: file,
 			inputTokens,
 			outputTokens
 		});
@@ -13241,18 +13184,7 @@ async function extractStructuredData(input) {
 			apiKey: config.provider.apiKey,
 			supportsStructuredOutputs: useStructuredOutput
 		});
-		let system;
-		let user;
-		const snapshot = await loadPromptSnapshot(aiexDir, schema.table.name);
-		const promptText = file ? PLACEHOLDER_TEXT : text$1;
-		if (snapshot) {
-			system = snapshot.system;
-			user = snapshot.user.replaceAll(PLACEHOLDER_TEXT, promptText);
-		} else {
-			const generated = generateExtractionPrompt(schema, promptText, config.prompt ?? DEFAULT_PROMPT_CONFIG);
-			system = generated.system;
-			user = generated.user;
-		}
+		const { system, user } = generateExtractionPrompt(schema, text$1, config.prompt ?? DEFAULT_PROMPT_CONFIG);
 		const outputSchema = jsonSchema(schemaToExtractionOutputSchema(schema));
 		const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
 		let systemPrompt = system;
@@ -13267,38 +13199,16 @@ async function extractStructuredData(input) {
 			let parseError;
 			let validationError;
 			try {
-				if (useFileContent) {
-					const filePart = await readFilePart(file);
-					const fileName = filePart.type === "file" ? filePart.filename : path.basename(file);
-					const contentParts = [{
-						type: "text",
-						text: userPrompt.includes(PLACEHOLDER_TEXT) ? userPrompt.replaceAll(PLACEHOLDER_TEXT, text$1 || `Data is contained in the attached file: ${fileName}`) : userPrompt
-					}, filePart];
-					const fileOpts = {
-						model: provider.chatModel(selected.name),
-						system: systemPrompt,
-						messages: [{
-							role: "user",
-							content: contentParts
-						}],
-						abortSignal: AbortSignal.timeout(timeoutMs),
-						maxRetries: 0,
-						experimental_telemetry: { isEnabled: useTelemetry }
-					};
-					if (useStructuredOutput) fileOpts.output = Output.object({ schema: outputSchema });
-					result = await withRetry(() => generateText(fileOpts), input.onRetry);
-				} else {
-					const textOpts = {
-						model: provider.chatModel(selected.name),
-						system: systemPrompt,
-						prompt: userPrompt,
-						abortSignal: AbortSignal.timeout(timeoutMs),
-						maxRetries: 0,
-						experimental_telemetry: { isEnabled: useTelemetry }
-					};
-					if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
-					result = await withRetry(() => generateText(textOpts), input.onRetry);
-				}
+				const textOpts = {
+					model: provider.chatModel(selected.name),
+					system: systemPrompt,
+					prompt: userPrompt,
+					abortSignal: AbortSignal.timeout(timeoutMs),
+					maxRetries: 0,
+					experimental_telemetry: { isEnabled: useTelemetry }
+				};
+				if (useStructuredOutput) textOpts.output = Output.object({ schema: outputSchema });
+				result = await withRetry(() => generateText(textOpts), input.onRetry);
 				if (result.usage) {
 					totalPromptTokens += result.usage.inputTokens ?? 0;
 					totalCompletionTokens += result.usage.outputTokens ?? 0;
@@ -13314,27 +13224,16 @@ async function extractStructuredData(input) {
 			}
 			if (!parseError && data !== void 0) {
 				const validation = validateExtractedData(schema, data);
-				if (validation.success) {
-					const outputDir = path.resolve(aiexDir, config.extraction.outputDir.replace(".aiex/", ""));
-					await fs.mkdir(outputDir, { recursive: true });
-					const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-					const outputFileName = `${schema.table.name}-${timestamp}.json`;
-					const outputPath = path.join(outputDir, outputFileName);
-					await writeFile(outputPath, data, {
-						spaces: 2,
-						EOL: "\n"
-					});
-					return {
-						success: true,
-						outputPath,
-						data,
-						tokensUsed: {
-							prompt: totalPromptTokens,
-							completion: totalCompletionTokens,
-							total: totalPromptTokens + totalCompletionTokens
-						}
-					};
-				} else validationError = validation.error;
+				if (validation.success) return {
+					success: true,
+					data,
+					tokensUsed: {
+						prompt: totalPromptTokens,
+						completion: totalCompletionTokens,
+						total: totalPromptTokens + totalCompletionTokens
+					}
+				};
+				else validationError = validation.error;
 			}
 			const errorMsg = parseError || validationError || "Unknown validation error";
 			lastError = errorMsg;
@@ -13345,11 +13244,14 @@ async function extractStructuredData(input) {
 CRITICAL RULES:
 1. Only correct the fields that failed validation.
 2. Preserve all other correctly extracted fields and their values exactly.
-3. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
+3. Use only values supported by the original text. If a value cannot be confirmed, set it to null.
+4. Remove any fields not defined by the JSON Schema.
+5. Normalize values to the expected JSON type without changing the intended meaning.
+6. Return ONLY the corrected JSON object. No explanations, no markdown blocks other than JSON.`;
 				userPrompt = `The JSON data you generated previously failed validation. Please correct it.
 [Original Text]
-${text$1 || "Data is contained in the attached file."}
+${text$1 || "Original text is empty."}
 [JSON Schema Definition]
 ${JSON.stringify(schemaToExtractionOutputSchema(schema), null, 2)}
@@ -13360,6 +13262,11 @@ ${invalidJson}
 [Validation Error Details]
 ${errorMsg}
+Correction checklist:
+- Fix each field path mentioned in the validation error.
+- Keep schema-valid fields unchanged.
+- Do not invent missing facts; use null when the original text does not support a value.
 Please output the corrected JSON object now:`;
 			}
 		}
@@ -13514,33 +13421,60 @@ function insertExtractedData(db, schema, data) {
 //#endregion
 //#region src/core/ai-extraction/json-merger.ts
-function isRecord(value) {
+function isRecord$1(value) {
 	return typeof value === "object" && value !== null && !Array.isArray(value);
 }
+function stableKey(value) {
+	if (!isRecord$1(value)) return JSON.stringify(value);
+	return JSON.stringify(Object.keys(value).sort().reduce((acc, key) => {
+		acc[key] = value[key];
+		return acc;
+	}, {}));
+}
+function isBlankString(value) {
+	return typeof value === "string" && value.trim() === "";
+}
+function isPlaceholderString$1(value) {
+	if (typeof value !== "string") return false;
+	const normalized = value.trim().toLowerCase();
+	return normalized === "" || normalized === "n/a" || normalized === "na" || normalized === "none" || normalized === "null" || normalized === "unknown" || normalized === "tbd" || normalized === "-" || normalized === "--";
+}
+function pickPrimitiveValue(values) {
+	const meaningful = values.filter((v) => !isBlankString(v) && !isPlaceholderString$1(v));
+	if (meaningful.length === 0) return null;
+	if (typeof meaningful[0] === "boolean") {
+		const trueCount = meaningful.filter(Boolean).length;
+		return trueCount >= meaningful.length - trueCount;
+	}
+	return meaningful[0];
+}
 function mergePropertyValue(property, values) {
 	const nonNullValues = values.filter((v) => v !== null && v !== void 0);
 	if (nonNullValues.length === 0) return null;
 	if (property.type === "array") {
 		const concatenated = [];
-		for (const val of nonNullValues) if (Array.isArray(val)) concatenated.push(...val);
+		const seen = /* @__PURE__ */ new Set();
+		for (const val of nonNullValues) if (Array.isArray(val)) for (const item of val) {
+			const key = stableKey(item);
+			if (!seen.has(key)) {
+				seen.add(key);
+				concatenated.push(item);
+			}
+		}
 		return concatenated;
 	}
 	if (property.type === "object") {
 		const childProperties = property.properties;
 		if (!childProperties) {
 			const mergedObj$1 = {};
-			for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
+			for (const val of nonNullValues) if (isRecord$1(val)) Object.assign(mergedObj$1, val);
 			return mergedObj$1;
 		}
 		const mergedObj = {};
-		for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
+		for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord$1(v) ? v[propName] : void 0));
 		return mergedObj;
 	}
-	const bestValue = nonNullValues.find((v) => {
-		if (typeof v === "string") return v.trim() !== "";
-		return true;
-	});
-	return bestValue !== void 0 ? bestValue : null;
+	return pickPrimitiveValue(nonNullValues);
 }
 /**
 * Merges structured extraction outputs from multiple document chunks
@@ -13557,114 +13491,269 @@ function mergeExtractionResults(schema, results) {
 	return merged;
 }
+//#endregion
+//#region src/core/ai-extraction/snapshot.ts
+async function savePromptSnapshot(schema, aiexDir) {
+	const content = generatePromptSnapshot(schema, (await readAIConfig(aiexDir))?.prompt ?? DEFAULT_PROMPT_CONFIG);
+	const outputDir = path.join(aiexDir, "extracted");
+	await fs.mkdir(outputDir, { recursive: true });
+	const fileName = `${schema.table.name}.prompt.md`;
+	const outputPath = path.join(outputDir, fileName);
+	await fs.writeFile(outputPath, content);
+	return outputPath;
+}
 //#endregion
 //#region src/core/ai-extraction/text-splitter.ts
-const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
-/**
-* Splits a Markdown document into chunks based on header hierarchy.
-* Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
-* when a section exceeds the maxSize limit.
-*/
-function splitMarkdown(text$1, maxSize = 4e4, overlapSize = 0) {
-	const lines = text$1.split("\n");
-	const chunks = [];
-	let currentHeadings = [];
-	let currentChunkLines = [];
-	let currentSize = 0;
-	let hasNewLines = false;
-	const getMetadata = (headings) => {
+const encoding$1 = getEncoding("cl100k_base");
+const MAX_OVERLAP_RATIO = .15;
+const MAX_EFFECTIVE_OVERLAP_TOKENS = 1200;
+const TABLE_SEPARATOR_CELL_RE = /^:?-{3,}:?$/;
+const LEADING_TABLE_PIPE_RE = /^\|/;
+const TRAILING_TABLE_PIPE_RE = /\|$/;
+function countTokens(text$1) {
+	return encoding$1.encode(text$1).length;
+}
+function calculateChunkTokenBudget(options = {}) {
+	const configuredMaxTokens = options.configuredMaxTokens ?? 8e3;
+	const modelMaxTokens = options.modelMaxTokens;
+	if (!modelMaxTokens) return configuredMaxTokens;
+	const outputReserveTokens = options.outputReserveTokens ?? 2e3;
+	const promptReserveTokens = options.promptReserveTokens ?? 1200;
+	const safetyBufferTokens = options.safetyBufferTokens ?? Math.min(1e3, Math.floor(modelMaxTokens * .1));
+	const available = modelMaxTokens - outputReserveTokens - promptReserveTokens - safetyBufferTokens;
+	return Math.max(512, Math.min(configuredMaxTokens, available));
+}
+function formatHeadingContext(headings) {
+	const active = headings.filter(Boolean);
+	if (active.length === 0) return "";
+	return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
+}
+function getMetadata(headings) {
+	return {
+		h1: headings[0] || void 0,
+		h2: headings[1] || void 0,
+		h3: headings[2] || void 0,
+		h4: headings[3] || void 0
+	};
+}
+function getHeadingPath(metadata) {
+	return [
+		metadata.h1,
+		metadata.h2,
+		metadata.h3,
+		metadata.h4
+	].filter(Boolean);
+}
+function finalizeChunks(chunks, sourceText) {
+	let searchStart = 0;
+	const totalChunks = chunks.length;
+	return chunks.map((chunk, index) => {
+		const tokenCount = countTokens(chunk.pageContent);
+		let charStart = sourceText.indexOf(chunk.pageContent, searchStart);
+		if (charStart === -1) charStart = sourceText.indexOf(chunk.pageContent);
+		const charEnd = charStart >= 0 ? charStart + chunk.pageContent.length : void 0;
+		if (charStart >= 0 && charEnd !== void 0) searchStart = charEnd;
 		return {
-			h1: headings[0] || void 0,
-			h2: headings[1] || void 0,
-			h3: headings[2] || void 0,
-			h4: headings[3] || void 0
+			...chunk,
+			chunkIndex: index,
+			totalChunks,
+			tokenCount,
+			headingPath: getHeadingPath(chunk.metadata),
+			charStart: charStart >= 0 ? charStart : void 0,
+			charEnd
 		};
+	});
+}
+function getEffectiveOverlapTokens(maxTokens, overlapTokens) {
+	return Math.floor(Math.min(overlapTokens, Math.max(64, maxTokens * MAX_OVERLAP_RATIO), MAX_EFFECTIVE_OVERLAP_TOKENS));
+}
+function splitMarkdownTable(tableText, maxTokens) {
+	if (countTokens(tableText) <= maxTokens) return [tableText];
+	const lines = tableText.split("\n");
+	const headerIndex = lines.findIndex((line) => line.trim().startsWith("|"));
+	const separatorIndex = lines.findIndex((line, index) => {
+		if (index <= headerIndex) return false;
+		const cells = line.trim().replace(LEADING_TABLE_PIPE_RE, "").replace(TRAILING_TABLE_PIPE_RE, "").split("|").map((cell) => cell.trim());
+		return cells.length > 0 && cells.every((cell) => TABLE_SEPARATOR_CELL_RE.test(cell));
+	});
+	if (headerIndex === -1 || separatorIndex === -1) return splitTextRecursively(tableText, maxTokens, ["\n"]);
+	const prefix = lines.slice(0, headerIndex);
+	const header = lines[headerIndex];
+	const separator = lines[separatorIndex];
+	const rows = lines.slice(separatorIndex + 1).filter((line) => line.trim() !== "");
+	const chunks = [];
+	let currentRows = [];
+	const buildTable = (tableRows) => {
+		return [
+			...prefix,
+			header,
+			separator,
+			...tableRows
+		].join("\n");
 	};
-	const flushChunk = (isHeadingChange = false) => {
-		if (currentChunkLines.length === 0 || !hasNewLines) {
-			currentChunkLines = [];
-			currentSize = 0;
-			hasNewLines = false;
-			return;
-		}
-		const pageContent = currentChunkLines.join("\n");
-		let lastChunkContent = "";
-		if (pageContent.length > maxSize) {
-			const paragraphs = pageContent.split("\n\n");
-			let subLines = [];
-			let subSize = 0;
-			for (const para of paragraphs) {
-				const paraSize = para.length;
-				if (subSize + paraSize > maxSize && subLines.length > 0) {
-					const content = subLines.join("\n\n");
-					chunks.push({
-						pageContent: content,
-						metadata: getMetadata(currentHeadings)
-					});
-					const overlapParas = [];
-					let currentOverlapSize = 0;
-					for (let j = subLines.length - 1; j >= 0; j--) {
-						const p = subLines[j];
-						if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
-						overlapParas.unshift(p);
-						currentOverlapSize += p.length + 2;
-					}
-					subLines = [...overlapParas];
-					subSize = currentOverlapSize;
-				}
-				subLines.push(para);
-				subSize += paraSize + 2;
+	for (const row of rows) {
+		const candidateRows = [...currentRows, row];
+		if (currentRows.length > 0 && countTokens(buildTable(candidateRows)) > maxTokens) {
+			chunks.push(buildTable(currentRows));
+			currentRows = [row];
+		} else currentRows = candidateRows;
+	}
+	if (currentRows.length > 0) chunks.push(buildTable(currentRows));
+	return chunks.length > 0 ? chunks : [tableText];
+}
+/**
+* Splits text recursively using a list of separators.
+* Preserves the separators when re-joining.
+*/
+function splitTextRecursively(text$1, maxTokens, separators = [
+	"\n\n",
+	"\n",
+	"。",
+	". ",
+	" "
+]) {
+	if (countTokens(text$1) <= maxTokens) return [text$1];
+	if (separators.length === 0) {
+		const chunks = [];
+		let current = "";
+		for (const char of text$1) if (countTokens(current + char) > maxTokens) {
+			chunks.push(current);
+			current = char;
+		} else current += char;
+		if (current) chunks.push(current);
+		return chunks;
+	}
+	const separator = separators[0];
+	const nextSeparators = separators.slice(1);
+	const parts = text$1.split(separator);
+	const result = [];
+	let currentChunk = [];
+	let currentChunkTokens = 0;
+	for (let i = 0; i < parts.length; i++) {
+		const part = parts[i];
+		const itemText = part + (i < parts.length - 1 ? separator : "");
+		const partTokens = countTokens(itemText);
+		if (partTokens > maxTokens) {
+			if (currentChunk.length > 0) {
+				result.push(currentChunk.join(""));
+				currentChunk = [];
+				currentChunkTokens = 0;
 			}
-			if (subLines.length > 0) {
-				const content = subLines.join("\n\n");
-				chunks.push({
-					pageContent: content,
-					metadata: getMetadata(currentHeadings)
-				});
-				lastChunkContent = content;
+			const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
+			for (let j = 0; j < subParts.length; j++) {
+				const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
+				result.push(finalSub);
 			}
+		} else if (currentChunkTokens + partTokens > maxTokens) {
+			result.push(currentChunk.join(""));
+			currentChunk = [itemText];
+			currentChunkTokens = partTokens;
 		} else {
-			chunks.push({
-				pageContent,
-				metadata: getMetadata(currentHeadings)
-			});
-			lastChunkContent = pageContent;
+			currentChunk.push(itemText);
+			currentChunkTokens += partTokens;
 		}
-		if (!isHeadingChange && lastChunkContent && overlapSize > 0) {
-			const paragraphs = lastChunkContent.split("\n\n");
-			const overlapParas = [];
-			let currentOverlapSize = 0;
-			for (let j = paragraphs.length - 1; j >= 0; j--) {
-				const p = paragraphs[j];
-				if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
-				overlapParas.unshift(p);
-				currentOverlapSize += p.length + 2;
-			}
-			const overlapText = overlapParas.join("\n\n");
-			currentChunkLines = overlapText.split("\n");
-			currentSize = overlapText.length;
+	}
+	if (currentChunk.length > 0) result.push(currentChunk.join(""));
+	return result;
+}
+/**
+* Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
+* Protects tables, list items, and code blocks from being broken.
+*/
+function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
+	const tokens = marked.lexer(text$1);
+	const chunks = [];
+	const effectiveOverlapTokens = getEffectiveOverlapTokens(maxTokens, overlapTokens);
+	let currentHeadings = [];
+	let currentChunkList = [];
+	let accumulatedTokens = 0;
+	const flushCurrentChunk = (isHeadingChange = false) => {
+		if (currentChunkList.length === 0) return;
+		const pageContent = currentChunkList.map((item) => item.text).join("");
+		const firstHeadings = currentChunkList[0].headings;
+		chunks.push({
+			pageContent,
+			metadata: getMetadata(firstHeadings)
+		});
+		if (isHeadingChange || effectiveOverlapTokens <= 0) {
+			currentChunkList = [];
+			accumulatedTokens = 0;
 		} else {
-			currentChunkLines = [];
-			currentSize = 0;
+			const overlapItems = [];
+			let currentOverlapTokens = 0;
+			for (let i = currentChunkList.length - 1; i >= 0; i--) {
+				const item = currentChunkList[i];
+				const itemTokens = countTokens(item.text);
+				if (currentOverlapTokens + itemTokens > effectiveOverlapTokens && overlapItems.length > 0) break;
+				overlapItems.unshift(item);
+				currentOverlapTokens += itemTokens;
+			}
+			currentChunkList = [...overlapItems];
+			accumulatedTokens = currentOverlapTokens;
 		}
-		hasNewLines = false;
 	};
-	for (const line of lines) {
-		const headingMatch = line.match(HEADING_RE);
-		if (headingMatch) {
-			flushChunk(true);
-			const depth = headingMatch[1].length;
-			const title = headingMatch[2].trim();
+	for (const token of tokens) {
+		if (token.type === "space") {
+			if (currentChunkList.length > 0) {
+				currentChunkList[currentChunkList.length - 1].text += token.raw;
+				accumulatedTokens += countTokens(token.raw);
+			}
+			continue;
+		}
+		if (token.type === "heading") {
+			flushCurrentChunk(true);
+			const depth = token.depth;
+			const title = token.text.trim();
 			currentHeadings = currentHeadings.slice(0, depth - 1);
 			currentHeadings[depth - 1] = title;
 		}
-		currentChunkLines.push(line);
-		currentSize += line.length + 1;
-		hasNewLines = true;
-		if (currentSize > maxSize) flushChunk(false);
+		const rawText = token.raw;
+		if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
+		else {
+			const isAtomic = token.type === "table" || token.type === "code";
+			processTextBlock(rawText, currentHeadings, isAtomic);
+		}
+	}
+	flushCurrentChunk(true);
+	return finalizeChunks(chunks, text$1);
+	function processTextBlock(blockText, headings, isAtomic = false) {
+		const blockTokens = countTokens(blockText);
+		const contextTokens = countTokens(formatHeadingContext(headings));
+		const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
+		const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
+		if (blockTokens > budgetLimit) if (isAtomic) {
+			flushCurrentChunk(false);
+			const atomicBlocks = blockTokens <= maxTokens ? [blockText] : blockText.includes("|") ? splitMarkdownTable(blockText, budgetLimit) : splitTextRecursively(blockText, budgetLimit, ["\n"]);
+			for (const block of atomicBlocks) {
+				currentChunkList.push({
+					text: block,
+					headings: [...headings]
+				});
+				accumulatedTokens = countTokens(block);
+				flushCurrentChunk(false);
+			}
+		} else {
+			flushCurrentChunk(false);
+			const subBlocks = splitTextRecursively(blockText, budgetLimit);
+			for (const sub of subBlocks) {
+				currentChunkList.push({
+					text: sub,
+					headings: [...headings]
+				});
+				accumulatedTokens += countTokens(sub);
+				if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
+			}
+		}
+		else {
+			if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
+			currentChunkList.push({
+				text: blockText,
+				headings: [...headings]
+			});
+			accumulatedTokens += blockTokens;
+		}
 	}
-	flushChunk(true);
-	return chunks;
 }
 //#endregion
@@ -13817,6 +13906,276 @@ function getFileHash(filePath) {
 	});
 }
+//#endregion
+//#region src/core/ai-extraction/evidence.ts
+// Matches a trailing ".json" extension, case-insensitively.
+const JSON_FILE_SUFFIX_RE$1 = /\.json$/i;
+// Matches the leading "$." of a field path.
+const FIELD_PATH_PREFIX_RE = /^\$\./;
+/** Reports whether `value` is a non-null object that is not an array. */
+function isRecord(value) {
+	if (value === null || Array.isArray(value)) return false;
+	return typeof value === "object";
+}
+/**
+ * Produces a comparison key for `value` by serializing it with JSON.stringify.
+ * Object key order is not normalized, and `undefined` yields `undefined`.
+ */
+function stableValueKey(value) {
+	const serialized = JSON.stringify(value);
+	return serialized;
+}
+/**
+ * Reports whether `value` is a string that, once trimmed and lower-cased, is empty
+ * or one of the known "no value" markers. Non-strings are never placeholders.
+ */
+function isPlaceholderString(value) {
+	if (typeof value !== "string") return false;
+	const placeholders = [
+		"",
+		"n/a",
+		"na",
+		"none",
+		"null",
+		"unknown",
+		"tbd",
+		"-",
+		"--"
+	];
+	return placeholders.includes(value.trim().toLowerCase());
+}
+/**
+ * Converts a primitive to searchable text.
+ * Strings are trimmed (blank strings become null); numbers and booleans are
+ * stringified; every other value, including null and undefined, yields null.
+ */
+function primitiveToText(value) {
+	switch (typeof value) {
+		case "string": {
+			const trimmed = value.trim();
+			return trimmed === "" ? null : trimmed;
+		}
+		case "number":
+		case "boolean":
+			return String(value);
+		default:
+			return null;
+	}
+}
+/** Reports whether `value` converts to non-empty text and is not a placeholder string. */
+function isMeaningfulValue(value) {
+	if (primitiveToText(value) === null) return false;
+	return !isPlaceholderString(value);
+}
+/** Lower-cases `value`, collapses each whitespace run to one space, and trims the ends. */
+function normalizeText(value) {
+	const lowered = value.toLowerCase();
+	const collapsed = lowered.replace(/\s+/g, " ");
+	return collapsed.trim();
+}
+/**
+ * Extracts the span `[start, start + length)` of `text$1` together with up to 80
+ * characters on either side, then collapses whitespace runs and trims the result.
+ */
+function quoteAround(text$1, start, length) {
+	const contextChars = 80;
+	const from = Math.max(0, start - contextChars);
+	const to = Math.min(text$1.length, start + length + contextChars);
+	const excerpt = text$1.slice(from, to);
+	return excerpt.replace(/\s+/g, " ").trim();
+}
+/**
+ * Finds the first chunk whose text contains `value` and returns a quote around the match.
+ *
+ * Matching is case-insensitive and ignores differences in whitespace runs. The quote is
+ * centred on the exact (case-insensitive) occurrence when there is one; when the value
+ * only matches after whitespace normalization, a whitespace-tolerant search locates the
+ * occurrence so the quote still surrounds the matched text instead of the chunk start.
+ *
+ * @param value Extracted value; non-primitive or blank values never match.
+ * @param chunks Source chunks with `text`, `chunkIndex` and `headingPath`.
+ * @returns `{ chunkIndex, headingPath, quote }` for the first matching chunk, or null.
+ */
+function findEvidence(value, chunks) {
+	const searchText = primitiveToText(value);
+	if (!searchText) return null;
+	const normalizedSearchText = normalizeText(searchText);
+	if (!normalizedSearchText) return null;
+	const loweredSearchText = searchText.toLowerCase();
+	// Built lazily: only needed when the exact lookup misses.
+	let flexiblePattern;
+	for (const chunk of chunks) {
+		if (!normalizeText(chunk.text).includes(normalizedSearchText)) continue;
+		let quoteIndex = chunk.text.toLowerCase().indexOf(loweredSearchText);
+		let quoteLength = searchText.length;
+		if (quoteIndex < 0) {
+			// The words match but the whitespace between them differs: search again,
+			// letting any whitespace run stand in for each single space.
+			flexiblePattern ??= new RegExp(normalizedSearchText.split(" ").map((word) => word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("\\s+"), "i");
+			const match = flexiblePattern.exec(chunk.text);
+			quoteIndex = match ? match.index : 0;
+			if (match) quoteLength = match[0].length;
+		}
+		return {
+			chunkIndex: chunk.chunkIndex,
+			headingPath: chunk.headingPath,
+			quote: quoteAround(chunk.text, quoteIndex, quoteLength)
+		};
+	}
+	return null;
+}
+/**
+ * Appends evidence entries for one schema property to `fields`, descending into
+ * objects and arrays.
+ *
+ * Objects with declared properties recurse per child (a non-record value is treated
+ * as an empty object). Arrays that are absent or empty produce one "missing" entry;
+ * otherwise each item is handled per index, either per declared child property or
+ * as a primitive. Everything else is handled as a primitive.
+ */
+function addEvidenceForProperty(fields, path$1, property, value, chunks) {
+	if (property.type === "object" && property.properties) {
+		const source = isRecord(value) ? value : {};
+		Object.entries(property.properties).forEach(([childName, childProperty]) => {
+			addEvidenceForProperty(fields, `${path$1}.${childName}`, childProperty, source[childName], chunks);
+		});
+		return;
+	}
+	if (property.type !== "array") {
+		addPrimitiveEvidence(fields, path$1, value, chunks);
+		return;
+	}
+	if (!Array.isArray(value) || value.length === 0) {
+		fields.push({
+			fieldPath: path$1,
+			status: "missing",
+			value: null,
+			confidence: 0,
+			note: "Array field is empty or missing."
+		});
+		return;
+	}
+	value.forEach((item, index) => {
+		const itemPath = `${path$1}[${index}]`;
+		const itemProperties = property.items?.type === "object" && property.items.properties ? property.items.properties : null;
+		if (!itemProperties) {
+			addPrimitiveEvidence(fields, itemPath, item, chunks);
+			return;
+		}
+		const source = isRecord(item) ? item : {};
+		for (const [childName, childProperty] of Object.entries(itemProperties)) addEvidenceForProperty(fields, `${itemPath}.${childName}`, childProperty, source[childName], chunks);
+	});
+}
+/**
+ * Appends exactly one evidence entry for a primitive field to `fields`:
+ * "missing" for null, undefined or "", "found" when the value occurs in the
+ * source chunks, and "inferred" otherwise.
+ */
+function addPrimitiveEvidence(fields, fieldPath, value, chunks) {
+	const isEmpty = value === null || value === void 0 || value === "";
+	let entry;
+	if (isEmpty) entry = {
+		fieldPath,
+		status: "missing",
+		value: null,
+		confidence: 0,
+		note: "Field is null or empty in final extraction."
+	};
+	else {
+		const evidence = findEvidence(value, chunks);
+		entry = evidence ? {
+			fieldPath,
+			status: "found",
+			value,
+			confidence: .8,
+			...evidence
+		} : {
+			fieldPath,
+			status: "inferred",
+			value,
+			confidence: .35,
+			note: "Final value was not found verbatim in the available source text."
+		};
+	}
+	fields.push(entry);
+}
+/** Wraps non-empty text as a single source chunk (index 0, no headings); empty text yields no chunks. */
+function sourceChunksFromText(text$1) {
+	if (!text$1) return [];
+	const onlyChunk = {
+		text: text$1,
+		chunkIndex: 0,
+		headingPath: []
+	};
+	return [onlyChunk];
+}
+/**
+ * Maps Markdown chunks to source chunks, taking `pageContent` as the text and
+ * defaulting `chunkIndex` to the array position and `headingPath` to an empty list.
+ */
+function sourceChunksFromMarkdownChunks(chunks) {
+	return chunks.map((chunk, index) => {
+		const { pageContent, chunkIndex, headingPath } = chunk;
+		return {
+			text: pageContent,
+			chunkIndex: chunkIndex ?? index,
+			headingPath: headingPath ?? []
+		};
+	});
+}
+/** Splits a field path on "." after dropping a leading "$." and discards empty segments. */
+function getPathParts(fieldPath) {
+	const withoutPrefix = fieldPath.replace(FIELD_PATH_PREFIX_RE, "");
+	return withoutPrefix.split(".").filter((segment) => segment !== "");
+}
+/**
+ * Reads the value at `fieldPath` inside `data`, returning undefined as soon as a
+ * segment has to be read from something that is not a record.
+ */
+function getValueAtPath$1(data, fieldPath) {
+	const parts = getPathParts(fieldPath);
+	let cursor = data;
+	for (let i = 0; i < parts.length; i++) {
+		if (!isRecord(cursor)) return void 0;
+		cursor = cursor[parts[i]];
+	}
+	return cursor;
+}
+/**
+ * Writes `value` into `data` at `fieldPath`, replacing any intermediate segment
+ * that is not a record with a fresh object.
+ *
+ * Paths that resolve to no segments, or that contain a "__proto__" segment, are
+ * ignored and leave `data` untouched.
+ *
+ * @param data Object to mutate.
+ * @param fieldPath Dot-separated path, optionally prefixed with "$.".
+ * @param value Value to store at the final segment.
+ */
+function setValueAtPath(data, fieldPath, value) {
+	const parts = getPathParts(fieldPath);
+	// With no segments the final write would target the literal key "undefined".
+	if (parts.length === 0) return;
+	// Assigning through "__proto__" swaps or mutates prototypes instead of storing data.
+	if (parts.includes("__proto__")) return;
+	let current = data;
+	for (const part of parts.slice(0, -1)) {
+		if (!isRecord(current[part])) current[part] = {};
+		current = current[part];
+	}
+	current[parts[parts.length - 1]] = value;
+}
+/**
+ * Collects `{ fieldPath, property }` for every non-array leaf under `property`.
+ * Objects with declared properties are descended into; arrays are skipped.
+ */
+function collectScalarFields(fields, fieldPath, property) {
+	const children = property.type === "object" ? property.properties : void 0;
+	if (children) {
+		Object.entries(children).forEach(([name$1, childProperty]) => {
+			collectScalarFields(fields, `${fieldPath}.${name$1}`, childProperty);
+		});
+		return;
+	}
+	if (property.type === "array") return;
+	fields.push({
+		fieldPath,
+		property
+	});
+}
+/**
+ * Ranks a merge candidate: higher wins.
+ *
+ * A "found" candidate always outranks one that is not found; within the same
+ * status, rounded confidence and a later chunk position raise the score.
+ *
+ * @param candidate Object with `status`, `confidence` and `chunkIndex`.
+ * @returns Numeric score used to order candidates.
+ */
+function candidateScore(candidate) {
+	// The status weight must exceed any realistic chunk index; the previous weight
+	// of 100 let an unverified value from a late chunk beat a verified one.
+	const statusWeight = candidate.status === "found" ? 1e9 : 0;
+	return statusWeight + Math.round(candidate.confidence * 10) + candidate.chunkIndex;
+}
+/**
+ * Sorts `candidates` in place by descending score, marks the first as selected
+ * and the rest as rejected, and reports a conflict when their values differ.
+ *
+ * @returns A conflict record when more than one distinct value is present;
+ * null when the list is empty or all candidates agree.
+ */
+function selectCandidatesForField(candidates) {
+	if (candidates.length === 0) return null;
+	candidates.sort((left, right) => candidateScore(right) - candidateScore(left));
+	const [winner, ...losers] = candidates;
+	winner.selected = true;
+	losers.forEach((loser) => {
+		loser.selected = false;
+		loser.rejectionReason = "Lower evidence score or earlier chunk position.";
+	});
+	const distinctKeys = new Set(candidates.map((candidate) => stableValueKey(candidate.value)));
+	if (distinctKeys.size <= 1) return null;
+	return {
+		fieldPath: winner.fieldPath,
+		selectedValue: winner.value,
+		rejectedValues: losers.map((loser) => loser.value),
+		candidates: [...candidates]
+	};
+}
+/**
+ * Builds the per-field candidate list and conflict list from per-chunk results.
+ *
+ * For every scalar schema field (auto-increment primary keys excluded), each
+ * chunk result that holds a meaningful value contributes one candidate, checked
+ * against the source chunk at the same index. Candidates are then ranked per field.
+ *
+ * @param input Object with `schema`, `chunkResults` and `chunks`.
+ * @returns `{ candidates, conflicts }`.
+ */
+function buildCandidateMergeReport(input) {
+	const scalarFields = [];
+	Object.entries(input.schema.properties).forEach(([name$1, property]) => {
+		const isGeneratedKey = property.primary && property.autoIncrement;
+		if (!isGeneratedKey) collectScalarFields(scalarFields, `$.${name$1}`, property);
+	});
+	const sourceChunks = sourceChunksFromMarkdownChunks(input.chunks);
+	const candidatesByPath = /* @__PURE__ */ new Map();
+	for (const { fieldPath } of scalarFields) {
+		for (let chunkIndex = 0; chunkIndex < input.chunkResults.length; chunkIndex++) {
+			const value = getValueAtPath$1(input.chunkResults[chunkIndex], fieldPath);
+			if (!isMeaningfulValue(value)) continue;
+			// Fall back to an empty chunk when there is no source chunk at this index.
+			const sourceChunk = sourceChunks[chunkIndex] ?? {
+				text: "",
+				chunkIndex
+			};
+			const evidence = findEvidence(value, [sourceChunk]);
+			let group = candidatesByPath.get(fieldPath);
+			if (group === void 0) {
+				group = [];
+				candidatesByPath.set(fieldPath, group);
+			}
+			group.push({
+				fieldPath,
+				value,
+				chunkIndex: sourceChunk.chunkIndex ?? chunkIndex,
+				headingPath: sourceChunk.headingPath,
+				status: evidence ? "found" : "inferred",
+				quote: evidence?.quote,
+				confidence: evidence ? .85 : .35
+			});
+		}
+	}
+	const allCandidates = [];
+	const conflicts = [];
+	for (const group of candidatesByPath.values()) {
+		// Ranking sorts the group in place, so collect it only afterwards.
+		const conflict = selectCandidatesForField(group);
+		allCandidates.push(...group);
+		if (conflict) conflicts.push(conflict);
+	}
+	return {
+		candidates: allCandidates,
+		conflicts
+	};
+}
+/**
+ * Returns a deep copy of `data` with every selected candidate's value written
+ * at its field path. `data` itself is not modified.
+ */
+function applySelectedCandidates(data, report) {
+	const merged = structuredClone(data);
+	report.candidates.filter((candidate) => candidate.selected).forEach((candidate) => {
+		setValueAtPath(merged, candidate.fieldPath, candidate.value);
+	});
+	return merged;
+}
+/**
+ * Builds the evidence report for a final extraction result.
+ *
+ * Each schema property (auto-increment primary keys excluded) is checked against
+ * the source chunks, or against `input.text` when no chunks are given. Issues are
+ * the inferred fields followed by the conflicts from `input.candidateReport`.
+ *
+ * @param input Object with `schema`, `data`, and optionally `chunks`, `text`,
+ * `outputPath` and `candidateReport`.
+ * @returns `{ coverage, fields, candidates, conflicts, issues }`.
+ */
+function buildExtractionEvidence(input) {
+	const { schema, candidateReport, outputPath } = input;
+	const data = isRecord(input.data) ? input.data : {};
+	const chunks = input.chunks ? sourceChunksFromMarkdownChunks(input.chunks) : sourceChunksFromText(input.text ?? "");
+	const fields = [];
+	Object.entries(schema.properties).forEach(([name$1, property]) => {
+		if (property.primary && property.autoIncrement) return;
+		addEvidenceForProperty(fields, `$.${name$1}`, property, data[name$1], chunks);
+	});
+	const countByStatus = (status) => fields.filter((field) => field.status === status).length;
+	const issues = [];
+	for (const field of fields) {
+		if (field.status !== "inferred") continue;
+		issues.push({
+			fieldPath: field.fieldPath,
+			message: field.note ?? "Field value lacks source evidence."
+		});
+	}
+	for (const conflict of candidateReport?.conflicts ?? []) issues.push({
+		fieldPath: conflict.fieldPath,
+		message: "Multiple chunk candidates disagree for this field."
+	});
+	const foundCount = countByStatus("found");
+	return {
+		coverage: {
+			path: outputPath ? evidencePathForOutput(outputPath) : void 0,
+			fieldCount: fields.length,
+			// evidenceCount and foundCount both report the number of "found" fields.
+			evidenceCount: foundCount,
+			foundCount,
+			missingCount: countByStatus("missing"),
+			inferredCount: countByStatus("inferred"),
+			conflictCount: candidateReport?.conflicts.length ?? 0,
+			issueCount: issues.length
+		},
+		fields,
+		candidates: candidateReport?.candidates,
+		conflicts: candidateReport?.conflicts,
+		issues
+	};
+}
+/**
+ * Derives the evidence report path for an extraction output path.
+ *
+ * A trailing ".json" (any case) is replaced with ".evidence.json". When the path
+ * has no such suffix, ".evidence.json" is appended, so the result never equals
+ * the output path and the evidence file cannot overwrite the extraction result.
+ *
+ * @param outputPath Path of the extraction output file.
+ * @returns Path of the matching evidence file.
+ */
+function evidencePathForOutput(outputPath) {
+	if (JSON_FILE_SUFFIX_RE$1.test(outputPath)) return outputPath.replace(JSON_FILE_SUFFIX_RE$1, ".evidence.json");
+	return `${outputPath}.evidence.json`;
+}
+/**
+ * Builds the evidence report for `input` and writes it next to the extraction
+ * output, at the path derived from `input.outputPath`.
+ *
+ * @returns The report's coverage summary with `path` resolved to an absolute path.
+ */
+async function writeExtractionEvidence(input) {
+	const evidencePath = evidencePathForOutput(input.outputPath);
+	const report = buildExtractionEvidence(input);
+	const { coverage } = report;
+	// The written report records the path as derived; only the returned summary is made absolute.
+	coverage.path = evidencePath;
+	const writeOptions = {
+		spaces: 2,
+		EOL: "\n"
+	};
+	await writeFile(evidencePath, report, writeOptions);
+	return {
+		...coverage,
+		path: path.resolve(evidencePath)
+	};
+}
 //#endregion
 //#region src/core/notion-sink.ts
 const RICH_TEXT_LIMIT = 2e3;
@@ -14102,6 +14461,36 @@ async function triggerWebhook(aiConfig, auditId, schemaName, event, source, data
 	}
 }
+//#endregion
+//#region src/core/ai-extraction/transcriber.ts
+// Instruction sent together with the image when asking the model for a transcription.
+const TRANSCRIPTION_PROMPT = "Transcribe all visible text from this image accurately. Preserve the layout and line breaks as much as possible.";
+/**
+ * Reads an image file and asks a chat model, through an "openai-compatible"
+ * provider, to transcribe the text it contains.
+ *
+ * @param imagePath Path of the image file to read.
+ * @param baseURL Base URL passed to the provider.
+ * @param apiKey API key passed to the provider.
+ * @param modelName Chat model to use; echoed back in the result.
+ * @param timeoutMs Abort timeout in milliseconds; 300000 when null or undefined.
+ * @returns `{ text, modelName }` with the generated text.
+ */
+async function transcribeImageWithVision(imagePath, baseURL, apiKey, modelName, timeoutMs) {
+	const provider = createOpenAICompatible({
+		baseURL,
+		name: "openai-compatible",
+		apiKey
+	});
+	const imageBytes = await fs.readFile(imagePath);
+	const messages = [{
+		role: "user",
+		content: [{
+			type: "text",
+			text: TRANSCRIPTION_PROMPT
+		}, {
+			type: "image",
+			image: imageBytes
+		}]
+	}];
+	// The timeout signal is created only once the file has been read, so the
+	// timeout covers the model call alone.
+	const { text: transcript } = await generateText({
+		model: provider.chatModel(modelName),
+		messages,
+		abortSignal: AbortSignal.timeout(timeoutMs ?? 3e5)
+	});
+	return {
+		text: transcript,
+		modelName
+	};
+}
 //#endregion
 //#region src/core/file-constants.ts
 const MAX_UPLOAD_SIZE = 30 * 1024 * 1024;
@@ -14435,14 +14824,6 @@ function createPdfConverter(config) {
 			return withFallback(new ExternalCommandPdfConverter("mineru", mineruConfig), mineruConfig);
 		}
 		if (config.converter === "mineru_api") return new MineruApiPdfConverter(config.mineruApi ?? DEFAULT_MINERU_API_CONFIG);
-		if (config.converter === "markitdown") {
-			const markitdownConfig = config.markitdown ?? DEFAULT_MARKITDOWN_CONFIG;
-			return withFallback(new ExternalCommandPdfConverter("markitdown", markitdownConfig), markitdownConfig);
-		}
-		if (config.converter === "marker") {
-			const markerConfig = config.marker ?? DEFAULT_MARKER_CONFIG;
-			return withFallback(new ExternalCommandPdfConverter("marker", markerConfig), markerConfig);
-		}
 		if (config.converter === "external") {
 			if (!config.external) throw new Error(t("errors.pdf.externalNotConfigured"));
 			return new ExternalCommandPdfConverter("external", config.external);
@@ -14470,7 +14851,7 @@ const FILE_PART_EXTENSIONS = new Set([
 	"svg"
 ]);
 const PDF_EXT_RE = /\.pdf$/i;
-async function readExtractFileInput(filePath, aiConfig, modelOverride) {
+async function readExtractFileInput(filePath, aiConfig) {
 	const stat = fs$1.statSync(filePath);
 	if (stat.size > MAX_UPLOAD_SIZE) throw new Error(t("errors.file.sizeExceeded", {
 		size: bytesToMB(stat.size).toFixed(1),
@@ -14479,15 +14860,22 @@ async function readExtractFileInput(filePath, aiConfig, modelOverride) {
 	}));
 	const ext = path.extname(filePath).toLowerCase().replace(".", "");
 	if (FILE_PART_EXTENSIONS.has(ext)) {
-		if (shouldUseImageOcrFallback(aiConfig, modelOverride)) {
-			const result = await recognizeImageText(filePath, aiConfig?.image);
-			consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
-			return { text: result.text };
+		const image = aiConfig?.image;
+		if (image?.imageConversion === "vision" && image.imageModelName && aiConfig) {
+			const baseURL = image.visionBaseURL || aiConfig.provider.baseURL;
+			const apiKey = image.visionApiKey || aiConfig.provider.apiKey;
+			const timeout = (aiConfig.provider.timeout ?? 300) * 1e3;
+			try {
+				const result$1 = await transcribeImageWithVision(filePath, baseURL, apiKey, image.imageModelName, timeout);
+				consola.info(t("command.extract.file.visionTranscribed", { model: result$1.modelName }));
+				return { text: result$1.text };
+			} catch {
+				consola.warn(t("command.extract.file.visionTranscribeFailed", { model: image.imageModelName }));
+			}
 		}
-		return {
-			text: "",
-			filePath
-		};
+		const result = await recognizeImageText(filePath, aiConfig?.image);
+		consola.info(t("command.extract.file.ocrText", { confidence: (result.confidence * 100).toFixed(1) }));
+		return { text: result.text };
 	}
 	if (ext === "pdf") {
 		const buffer = await fs.readFile(filePath);
@@ -14608,6 +14996,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
 //#endregion
 //#region src/core/extract-runner.ts
+const encoding = getEncoding("cl100k_base");
 const JSON_EXT_RE$1 = /\.json$/;
 async function limitConcurrency(concurrency, items, fn) {
 	const results = Array.from({ length: items.length });
@@ -14622,29 +15011,6 @@ async function limitConcurrency(concurrency, items, fn) {
 	await Promise.all(workers);
 	return results;
 }
-function getSchemaKeywords(schema) {
-	const keywords = /* @__PURE__ */ new Set();
-	function walk(properties) {
-		if (!properties) return;
-		for (const [name$1, prop] of Object.entries(properties)) {
-			keywords.add(name$1.toLowerCase());
-			const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
-			for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
-			if (prop && typeof prop === "object") {
-				const p = prop;
-				if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
-				if (typeof p.description === "string") {
-					const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
-					for (const d of descParts) if (d.length > 2) keywords.add(d);
-				}
-				if (p.type === "object") walk(p.properties);
-				if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
-			}
-		}
-	}
-	walk(schema.properties);
-	return Array.from(keywords);
-}
 async function ensureDatabaseReady(dbPath, schema) {
 	try {
 		await fs.access(dbPath);
@@ -14716,182 +15082,145 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 	}
 	const s = spinner();
 	if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
-	const CHUNK_LIMIT = 4e4;
-	let result;
-	if (text$1 && text$1.length > CHUNK_LIMIT) {
-		if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
-			length: text$1.length,
-			limit: CHUNK_LIMIT
-		}));
-		const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT, aiConfig.extraction?.overlapSize ?? 2e3);
-		if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
-		let processedDocs = finalDocs;
-		if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
-			const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
-			const keywords = getSchemaKeywords(schemaLoad.schema);
-			const scoredChunks = finalDocs.map((doc, idx) => {
-				if (idx === 0) return {
-					index: idx,
-					score: Number.POSITIVE_INFINITY
-				};
-				let score = 0;
-				const docTextLower = doc.pageContent.toLowerCase();
-				for (const kw of keywords) {
-					let pos = docTextLower.indexOf(kw);
-					while (pos !== -1) {
-						score++;
-						pos = docTextLower.indexOf(kw, pos + kw.length);
-					}
-				}
-				return {
-					index: idx,
-					score
-				};
-			}).slice(1).sort((a, b) => b.score - a.score);
-			const selectedIndices = new Set([0]);
-			let keptCount = 0;
-			for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
-				selectedIndices.add(sc.index);
-				keptCount++;
-			}
-			processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
-			if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
-				original: finalDocs.length,
-				filtered: processedDocs.length
-			}));
-		}
-		const chunkResults = [];
-		const accumulatedTokens = {
-			prompt: 0,
-			completion: 0,
-			total: 0
-		};
-		let success = true;
-		let errorMsg = "";
-		const extractionTasks = processedDocs.map((doc, i) => {
-			return async () => {
-				if (!success) return;
-				const headings = [];
-				if (doc.metadata) {
-					if (doc.metadata.h1) headings.push(doc.metadata.h1);
-					if (doc.metadata.h2) headings.push(doc.metadata.h2);
-					if (doc.metadata.h3) headings.push(doc.metadata.h3);
-					if (doc.metadata.h4) headings.push(doc.metadata.h4);
-				}
-				let chunkText = doc.pageContent;
-				if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
-				const chunkResult = await extractStructuredData({
-					config: aiConfig,
-					schema: schemaLoad.schema,
-					text: chunkText,
-					aiexDir,
-					modelOverride,
-					onRetry(info) {
-						if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
-							current: i + 1,
-							total: processedDocs.length,
-							code: info.statusCode,
-							delay: info.delayMs / 1e3,
-							attempt: info.attempt,
-							max: info.maxRetries
-						}));
-					}
-				});
-				if (!chunkResult.success) {
-					success = false;
-					errorMsg = chunkResult.error || t("common.unknownError");
-					if (!options?.quiet) {
-						s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
-						consola.error(errorMsg);
-					}
-					return;
-				}
-				if (chunkResult.data) chunkResults.push(chunkResult.data);
-				if (chunkResult.tokensUsed) {
-					accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
-					accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
-					accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
+	const maxTokens = calculateChunkTokenBudget({
+		configuredMaxTokens: aiConfig.extraction?.maxTokens ?? 8e3,
+		modelMaxTokens: modelOverride?.capabilities.maxTokens
+	});
+	const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
+	const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
+	if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunking", {
+		length: totalTokens,
+		limit: maxTokens
+	}));
+	const processedDocs = text$1 && totalTokens > maxTokens ? splitMarkdown(text$1, maxTokens, overlapTokens) : [{
+		pageContent: text$1 ?? "",
+		metadata: {},
+		chunkIndex: 0,
+		totalChunks: 1,
+		tokenCount: totalTokens,
+		headingPath: [],
+		charStart: 0,
+		charEnd: text$1?.length ?? 0
+	}];
+	if (text$1 && totalTokens > maxTokens && !options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: processedDocs.length }));
+	const chunkResults = Array.from({ length: processedDocs.length });
+	const accumulatedTokens = {
+		prompt: 0,
+		completion: 0,
+		total: 0
+	};
+	let success = true;
+	let errorMsg = "";
+	const extractionTasks = processedDocs.map((doc, i) => {
+		return async () => {
+			if (!success) return;
+			const headings = doc.headingPath?.length ? doc.headingPath : [
+				doc.metadata.h1,
+				doc.metadata.h2,
+				doc.metadata.h3,
+				doc.metadata.h4
+			].filter(Boolean);
+			let chunkText = doc.pageContent;
+			if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
+			const chunkResult = await extractStructuredData({
+				config: aiConfig,
+				schema: schemaLoad.schema,
+				text: chunkText,
+				aiexDir,
+				modelOverride,
+				onRetry(info) {
+					if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
+						current: i + 1,
+						total: processedDocs.length,
+						code: info.statusCode,
+						delay: info.delayMs / 1e3,
+						attempt: info.attempt,
+						max: info.maxRetries
+					}));
 				}
-			};
-		});
-		const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
-		if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
-			current: 1,
-			total: processedDocs.length
-		}));
-		try {
-			await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
-				if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
-					current: idx + 1,
-					total: processedDocs.length
-				}));
-				await task();
 			});
-		} catch (e) {
-			success = false;
-			errorMsg = e instanceof Error ? e.message : String(e);
-		}
-		if (!success) return {
-			success: false,
-			error: errorMsg
-		};
-		const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
-		const validation = validateExtractedData(schemaLoad.schema, mergedData);
-		if (!validation.success) {
-			const valError = validation.error || "Merged data validation failed";
-			if (!options?.quiet) {
-				s.stop(t("command.extract.file.validationFail"));
-				consola.error(valError);
+			if (!chunkResult.success) {
+				success = false;
+				errorMsg = chunkResult.error || t("common.unknownError");
+				if (!options?.quiet) {
+					s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
+					consola.error(errorMsg);
+				}
+				return;
+			}
+			if (chunkResult.data) chunkResults[i] = chunkResult.data;
+			if (chunkResult.tokensUsed) {
+				accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
+				accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
+				accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
 			}
-			return {
-				success: false,
-				error: valError
-			};
-		}
-		const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
-		await fs.mkdir(outputDir, { recursive: true });
-		const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-		const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
-		const finalMergedOutputPath = path.join(outputDir, outputFileName);
-		await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
-		result = {
-			success: true,
-			data: mergedData,
-			tokensUsed: accumulatedTokens,
-			outputPath: finalMergedOutputPath
 		};
-	} else result = await extractStructuredData({
-		config: aiConfig,
-		schema: schemaLoad.schema,
-		text: text$1 ?? "",
-		aiexDir,
-		file: filePath,
-		modelOverride,
-		onRetry(info) {
-			if (!options?.quiet) s.message(t("command.extract.file.extractRetry", {
-				code: info.statusCode,
-				delay: info.delayMs / 1e3,
-				attempt: info.attempt,
-				max: info.maxRetries
+	});
+	const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
+	if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
+		current: 1,
+		total: processedDocs.length
+	}));
+	try {
+		await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
+			if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
+				current: idx + 1,
+				total: processedDocs.length
 			}));
-		}
+			await task();
+		});
+	} catch (e) {
+		success = false;
+		errorMsg = e instanceof Error ? e.message : String(e);
+	}
+	if (!success) return {
+		success: false,
+		error: errorMsg
+	};
+	const successfulChunkResults = chunkResults.filter((chunkResult) => !!chunkResult);
+	const candidateReport = buildCandidateMergeReport({
+		schema: schemaLoad.schema,
+		chunkResults: successfulChunkResults,
+		chunks: processedDocs
 	});
-	if (!result.success) {
+	const mergedData = applySelectedCandidates(mergeExtractionResults(schemaLoad.schema, successfulChunkResults), candidateReport);
+	const validation = validateExtractedData(schemaLoad.schema, mergedData);
+	if (!validation.success) {
+		const valError = validation.error || "Merged data validation failed";
 		if (!options?.quiet) {
-			s.stop(t("command.extract.file.extractFail"));
-			consola.error(result.error || t("common.unknownError"));
+			s.stop(t("command.extract.file.validationFail"));
+			consola.error(valError);
 		}
 		return {
 			success: false,
-			error: result.error || t("common.unknownError")
+			error: valError
 		};
 	}
+	const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
+	await fs.mkdir(outputDir, { recursive: true });
+	const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+	const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
+	const outputPath = path.join(outputDir, outputFileName);
+	await fs.writeFile(outputPath, JSON.stringify(mergedData, null, 2));
+	const result = {
+		success: true,
+		data: mergedData,
+		tokensUsed: accumulatedTokens,
+		outputPath,
+		evidenceSummary: await writeExtractionEvidence({
+			schema: schemaLoad.schema,
+			data: mergedData,
+			outputPath,
+			chunks: processedDocs,
+			candidateReport
+		})
+	};
 	if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
 	if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
 	if (result.evidenceSummary && !options?.quiet) {
 		const summary = result.evidenceSummary;
 		const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
-		consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
+		consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, conflicts ${summary.conflictCount ?? 0}, issues ${issueText}`));
 	}
 	if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
 		prompt: result.tokensUsed.prompt,
@@ -15013,13 +15342,9 @@ async function runAuditedExtraction(options) {
 	});
 	try {
 		let text$1 = "";
-		let filePath;
-		if (source.type === "file") {
-			const input = await readExtractFileInput(source.filePath, aiConfig, modelOverride);
-			text$1 = input.text;
-			filePath = input.filePath;
-		} else text$1 = source.text;
-		const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
+		if (source.type === "file") text$1 = (await readExtractFileInput(source.filePath, aiConfig)).text;
+		else text$1 = source.text;
+		const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, source.type === "file" ? source.filePath : void 0, modelOverride, {
 			quiet,
 			insert
 		});