npm - aiex-cli - Versions diffs - 0.0.5-beta.3 → 0.0.5-beta.5 - Mend

aiex-cli 0.0.5-beta.3 → 0.0.5-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +6 -16
package/dist/cli.mjs +340 -355
package/dist/{doctor-collector-CQPDBVTw.mjs → doctor-collector-NTNBFeBw.mjs} +12 -6
package/dist/index.mjs +1 -1
package/dist/web/assets/AISettings-BlyTFIIy.js +272 -0
package/dist/web/assets/ExtractionViewer-BhhWrBs2.js +1 -0
package/dist/web/assets/{index-BWm_fhNt.js → index-CKV2X6sS.js} +2 -2
package/dist/web/assets/index-Csdgio76.css +2 -0
package/dist/web/index.html +2 -2
package/dist/{zh-CN-CKxdpj8c.mjs → zh-CN-Ca-Dv775.mjs} +2 -3
package/package.json +3 -1
package/dist/web/assets/AISettings-DoDVYWfb.js +0 -272
package/dist/web/assets/ExtractionViewer-DqIrBGNK.js +0 -1
package/dist/web/assets/index-CvY9TGny.css +0 -2

package/dist/cli.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-CQPDBVTw.mjs";
+import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-NTNBFeBw.mjs";
 import { createRequire } from "node:module";
 import fs from "node:fs/promises";
 import os from "node:os";
@@ -17,13 +17,15 @@ import Database from "better-sqlite3";
 import pc from "picocolors";
 import { Buffer } from "node:buffer";
 import * as XLSX from "xlsx";
+import { getEncoding } from "js-tiktoken";
 import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
-import { APICallError, Output, generateText, jsonSchema, tool } from "ai";
+import { APICallError, Output, generateText, jsonSchema } from "ai";
 import pRetry from "p-retry";
 import mime from "mime";
 import { jsonrepair } from "jsonrepair";
 import { LangfuseSpanProcessor } from "@langfuse/otel";
 import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { marked } from "marked";
 import crypto from "node:crypto";
 import { Client, extractNotionId } from "@notionhq/client";
 import { execa } from "execa";
@@ -13559,279 +13561,170 @@ function mergeExtractionResults(schema, results) {
 //#endregion
 //#region src/core/ai-extraction/text-splitter.ts
-const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
+const encoding$1 = getEncoding("cl100k_base");
+function countTokens(text$1) {
+	return encoding$1.encode(text$1).length;
+}
+function formatHeadingContext(headings) {
+	const active = headings.filter(Boolean);
+	if (active.length === 0) return "";
+	return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
+}
+function getMetadata(headings) {
+	return {
+		h1: headings[0] || void 0,
+		h2: headings[1] || void 0,
+		h3: headings[2] || void 0,
+		h4: headings[3] || void 0
+	};
+}
 /**
-* Splits a Markdown document into chunks based on header hierarchy.
-* Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
-* when a section exceeds the maxSize limit.
+* Splits text recursively using a list of separators.
+* Preserves the separators when re-joining.
 */
-function splitMarkdown(text$1, maxSize = 4e4) {
-	const lines = text$1.split("\n");
+function splitTextRecursively(text$1, maxTokens, separators = [
+	"\n\n",
+	"\n",
+	"。",
+	". ",
+	" "
+]) {
+	if (countTokens(text$1) <= maxTokens) return [text$1];
+	if (separators.length === 0) {
+		const chunks = [];
+		let current = "";
+		for (const char of text$1) if (countTokens(current + char) > maxTokens) {
+			chunks.push(current);
+			current = char;
+		} else current += char;
+		if (current) chunks.push(current);
+		return chunks;
+	}
+	const separator = separators[0];
+	const nextSeparators = separators.slice(1);
+	const parts = text$1.split(separator);
+	const result = [];
+	let currentChunk = [];
+	let currentChunkTokens = 0;
+	for (let i = 0; i < parts.length; i++) {
+		const part = parts[i];
+		const itemText = part + (i < parts.length - 1 ? separator : "");
+		const partTokens = countTokens(itemText);
+		if (partTokens > maxTokens) {
+			if (currentChunk.length > 0) {
+				result.push(currentChunk.join(""));
+				currentChunk = [];
+				currentChunkTokens = 0;
+			}
+			const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
+			for (let j = 0; j < subParts.length; j++) {
+				const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
+				result.push(finalSub);
+			}
+		} else if (currentChunkTokens + partTokens > maxTokens) {
+			result.push(currentChunk.join(""));
+			currentChunk = [itemText];
+			currentChunkTokens = partTokens;
+		} else {
+			currentChunk.push(itemText);
+			currentChunkTokens += partTokens;
+		}
+	}
+	if (currentChunk.length > 0) result.push(currentChunk.join(""));
+	return result;
+}
+/**
+* Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
+* Protects tables, list items, and code blocks from being broken.
+*/
+function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
+	const tokens = marked.lexer(text$1);
 	const chunks = [];
 	let currentHeadings = [];
-	let currentChunkLines = [];
-	let currentSize = 0;
-	const getMetadata = (headings) => {
-		return {
-			h1: headings[0] || void 0,
-			h2: headings[1] || void 0,
-			h3: headings[2] || void 0,
-			h4: headings[3] || void 0
-		};
-	};
-	const flushChunk = () => {
-		if (currentChunkLines.length === 0) return;
-		const pageContent = currentChunkLines.join("\n");
-		if (pageContent.length > maxSize) {
-			const paragraphs = pageContent.split("\n\n");
-			let subLines = [];
-			let subSize = 0;
-			for (const para of paragraphs) {
-				const paraSize = para.length;
-				if (subSize + paraSize > maxSize && subLines.length > 0) {
-					chunks.push({
-						pageContent: subLines.join("\n\n"),
-						metadata: getMetadata(currentHeadings)
-					});
-					subLines = [];
-					subSize = 0;
-				}
-				subLines.push(para);
-				subSize += paraSize + 2;
-			}
-			if (subLines.length > 0) chunks.push({
-				pageContent: subLines.join("\n\n"),
-				metadata: getMetadata(currentHeadings)
-			});
-		} else chunks.push({
+	let currentChunkList = [];
+	let accumulatedTokens = 0;
+	const flushCurrentChunk = (isHeadingChange = false) => {
+		if (currentChunkList.length === 0) return;
+		const pageContent = currentChunkList.map((item) => item.text).join("");
+		const firstHeadings = currentChunkList[0].headings;
+		chunks.push({
 			pageContent,
-			metadata: getMetadata(currentHeadings)
+			metadata: getMetadata(firstHeadings)
 		});
-		currentChunkLines = [];
-		currentSize = 0;
+		if (isHeadingChange || overlapTokens <= 0) {
+			currentChunkList = [];
+			accumulatedTokens = 0;
+		} else {
+			const overlapItems = [];
+			let currentOverlapTokens = 0;
+			for (let i = currentChunkList.length - 1; i >= 0; i--) {
+				const item = currentChunkList[i];
+				const itemTokens = countTokens(item.text);
+				if (currentOverlapTokens + itemTokens > overlapTokens && overlapItems.length > 0) break;
+				overlapItems.unshift(item);
+				currentOverlapTokens += itemTokens;
+			}
+			currentChunkList = [...overlapItems];
+			accumulatedTokens = currentOverlapTokens;
+		}
 	};
-	for (const line of lines) {
-		const headingMatch = line.match(HEADING_RE);
-		if (headingMatch) {
-			flushChunk();
-			const depth = headingMatch[1].length;
-			const title = headingMatch[2].trim();
+	for (const token of tokens) {
+		if (token.type === "space") {
+			if (currentChunkList.length > 0) {
+				currentChunkList[currentChunkList.length - 1].text += token.raw;
+				accumulatedTokens += countTokens(token.raw);
+			}
+			continue;
+		}
+		if (token.type === "heading") {
+			flushCurrentChunk(true);
+			const depth = token.depth;
+			const title = token.text.trim();
 			currentHeadings = currentHeadings.slice(0, depth - 1);
 			currentHeadings[depth - 1] = title;
 		}
-		currentChunkLines.push(line);
-		currentSize += line.length + 1;
-		if (currentSize > maxSize) flushChunk();
+		const rawText = token.raw;
+		if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
+		else {
+			const isAtomic = token.type === "table" || token.type === "code";
+			processTextBlock(rawText, currentHeadings, isAtomic);
+		}
 	}
-	flushChunk();
+	flushCurrentChunk(true);
 	return chunks;
-}
-//#endregion
-//#region src/core/ai-extraction/react-agent.ts
-async function extractStructuredDataWithAgent(input) {
-	const { config, schema, text: text$1, aiexDir, modelOverride, onAgentStep } = input;
-	if (!config.provider.apiKey) return {
-		success: false,
-		error: t("errors.ai.apiKeyMissing")
-	};
-	const chunks = splitMarkdown(text$1, 15e3);
-	const inputTokens = Math.ceil(text$1.length / 2);
-	const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
-	const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
-	let selected;
-	try {
-		selected = modelOverride ?? selectModel({
-			models: config.provider.models,
-			isImage: false,
-			inputTokens,
-			outputTokens
-		});
-	} catch (e) {
-		return {
-			success: false,
-			error: e.message
-		};
-	}
-	const useTelemetry = !!(config.langfuse?.publicKey && config.langfuse.secretKey);
-	try {
-		if (useTelemetry) initLangfuse(config);
-		const provider = createOpenAICompatible({
-			baseURL: config.provider.baseURL,
-			name: "openai-compatible",
-			apiKey: config.provider.apiKey,
-			supportsStructuredOutputs: false
-		});
-		let finalExtractedData = null;
-		const tools = {
-			listChunks: tool({
-				description: "Get a list of all text chunks in the document, showing their chunk index ID, character size, and markdown heading hierarchy (metadata). Use this as a Table of Contents to locate sections of interest.",
-				parameters: z.object({}),
-				execute: async () => {
-					return chunks.map((c, idx) => ({
-						id: idx + 1,
-						size: c.pageContent.length,
-						headings: c.metadata
-					}));
-				}
-			}),
-			readChunk: tool({
-				description: "Read the full text content of a specific chunk by its ID.",
-				parameters: z.object({ chunkId: z.number().int().describe("The ID (1-based index) of the chunk to read.") }),
-				execute: async ({ chunkId }) => {
-					const index = chunkId - 1;
-					if (index < 0 || index >= chunks.length) return { error: `Invalid chunkId: ${chunkId}. Valid IDs are 1 to ${chunks.length}.` };
-					const chunk = chunks[index];
-					const headings = [];
-					if (chunk.metadata) {
-						if (chunk.metadata.h1) headings.push(chunk.metadata.h1);
-						if (chunk.metadata.h2) headings.push(chunk.metadata.h2);
-						if (chunk.metadata.h3) headings.push(chunk.metadata.h3);
-						if (chunk.metadata.h4) headings.push(chunk.metadata.h4);
-					}
-					return {
-						chunkId,
-						headings: headings.join(" > "),
-						content: chunk.pageContent
-					};
-				}
-			}),
-			searchChunks: tool({
-				description: "Search all chunks in the document for specific keywords or search terms. Returns matching chunk IDs and small matching context snippets.",
-				parameters: z.object({ query: z.string().describe("The keyword or search phrase to search for.") }),
-				execute: async ({ query }) => {
-					const results = [];
-					const lowercaseQuery = query.toLowerCase();
-					for (let i = 0; i < chunks.length; i++) {
-						const chunkText = chunks[i].pageContent;
-						const idx = chunkText.toLowerCase().indexOf(lowercaseQuery);
-						if (idx !== -1) {
-							const start = Math.max(0, idx - 60);
-							const end = Math.min(chunkText.length, idx + lowercaseQuery.length + 60);
-							const snippet = `...${chunkText.slice(start, end).replace(/\n/g, " ")}...`;
-							results.push({
-								chunkId: i + 1,
-								headings: chunks[i].metadata,
-								snippet
-							});
-						}
-					}
-					return results.slice(0, 10);
-				}
-			}),
-			submitExtraction: tool({
-				description: "Submit the final extracted JSON object conforming to the schema definition. Call this ONLY after you have gathered all necessary information.",
-				parameters: z.object({ data: z.any().describe("The extracted JSON object conforming to the target schema.") }),
-				execute: async ({ data }) => {
-					finalExtractedData = data;
-					return {
-						status: "success",
-						message: "Data submitted successfully. The extraction is now complete."
-					};
-				}
-			})
-		};
-		const outputSchema = schemaToExtractionOutputSchema(schema);
-		const systemPrompt = `You are a precise data extraction agent. Your goal is to extract structured information from a document to populate the target JSON schema.
-Target JSON Schema structure to populate:
-${JSON.stringify(outputSchema, null, 2)}
-You are equipped with tools to browse the document dynamically:
-1. First, call listChunks to understand the document layout and what sections exist.
-2. Based on the schema fields, call readChunk or searchChunks to locate and read relevant content.
-3. You can make multiple tool calls. Do not guess. Check the text carefully.
-4. Once you have located and read all the necessary information, call the submitExtraction tool with the fully extracted JSON object.
-5. After calling submitExtraction, you should stop.
-CRITICAL RULES:
-1. Extract data strictly conforming to the types and properties of the Target JSON Schema.
-2. If a field's value cannot be found in the document after thorough search, set it to null.
-3. Do not invent any values.
-4. Call submitExtraction exactly once with the final JSON result.`;
-		const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
-		const result = await generateText({
-			model: provider.chatModel(selected.name),
-			system: systemPrompt,
-			prompt: "Please start by listing the chunks to understand the document structure, then gather the required facts and submit the final JSON extraction.",
-			tools,
-			maxSteps: 12,
-			abortSignal: AbortSignal.timeout(timeoutMs),
-			experimental_telemetry: { isEnabled: useTelemetry },
-			onStepFinish({ text: text$2, toolCalls }) {
-				if (onAgentStep) onAgentStep({
-					thought: text$2,
-					toolCalls
+	function processTextBlock(blockText, headings, isAtomic = false) {
+		const blockTokens = countTokens(blockText);
+		const contextTokens = countTokens(formatHeadingContext(headings));
+		const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
+		const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
+		if (blockTokens > budgetLimit) if (isAtomic) {
+			flushCurrentChunk(false);
+			currentChunkList.push({
+				text: blockText,
+				headings: [...headings]
+			});
+			accumulatedTokens = blockTokens;
+			flushCurrentChunk(false);
+		} else {
+			flushCurrentChunk(false);
+			const subBlocks = splitTextRecursively(blockText, budgetLimit);
+			for (const sub of subBlocks) {
+				currentChunkList.push({
+					text: sub,
+					headings: [...headings]
 				});
+				accumulatedTokens += countTokens(sub);
+				if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
 			}
-		});
-		if (!finalExtractedData) {
-			if (result.text) try {
-				finalExtractedData = safeParseJSON(result.text);
-			} catch {}
-		}
-		if (!finalExtractedData) return {
-			success: false,
-			error: "Agent finished without submitting structured data."
-		};
-		const validation = validateExtractedData(schema, finalExtractedData);
-		if (!validation.success) {
-			const correctionSystemPrompt = `You are a precise data correction assistant. Your task is to correct validation errors in a previously generated JSON object to make it comply with the JSON Schema.
-JSON Schema Definition:
-${JSON.stringify(outputSchema, null, 2)}
-Validation Errors:
-${validation.error}
-Original Incorrect JSON:
-${JSON.stringify(finalExtractedData, null, 2)}
-Please output the corrected JSON object. Return ONLY the corrected JSON object, with no markdown tags or explanations.`;
-			const correctedData = safeParseJSON((await generateText({
-				model: provider.chatModel(selected.name),
-				system: correctionSystemPrompt,
-				prompt: "Please correct the JSON output now.",
-				abortSignal: AbortSignal.timeout(timeoutMs),
-				experimental_telemetry: { isEnabled: useTelemetry }
-			})).text);
-			const secondValidation = validateExtractedData(schema, correctedData);
-			if (!secondValidation.success) return {
-				success: false,
-				error: `Agent output validation failed: ${secondValidation.error}`
-			};
-			finalExtractedData = correctedData;
 		}
-		const outputDir = path.resolve(aiexDir, config.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
-		await fs.mkdir(outputDir, { recursive: true });
-		const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-		const outputFileName = `${schema.table.name}-${timestamp}.json`;
-		const outputPath = path.join(outputDir, outputFileName);
-		await writeFile(outputPath, finalExtractedData, {
-			spaces: 2,
-			EOL: "\n"
-		});
-		let totalPromptTokens = 0;
-		let totalCompletionTokens = 0;
-		if (result.usage) {
-			totalPromptTokens = result.usage.inputTokens ?? 0;
-			totalCompletionTokens = result.usage.outputTokens ?? 0;
+		else {
+			if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
+			currentChunkList.push({
+				text: blockText,
+				headings: [...headings]
+			});
+			accumulatedTokens += blockTokens;
 		}
-		return {
-			success: true,
-			outputPath,
-			data: finalExtractedData,
-			tokensUsed: {
-				prompt: totalPromptTokens,
-				completion: totalCompletionTokens,
-				total: totalPromptTokens + totalCompletionTokens
-			}
-		};
-	} catch (error) {
-		return {
-			success: false,
-			error: getErrorMessage(error)
-		};
 	}
 }
@@ -14719,7 +14612,6 @@ async function processOneFile(aiexDir, config, aiConfig, schemaName, filePath, m
 		modelOverride,
 		insert: options?.insert,
 		force: options?.force,
-		agent: options?.agent,
 		quiet: false
 	});
 	if (result.success) {
@@ -14759,8 +14651,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
 		})}`);
 		if (await processOneFile(aiexDir, config, aiConfig, schemaName, file, modelOverride, {
 			insert: options?.insert,
-			force: options?.force,
-			agent: options?.agent
+			force: options?.force
 		})) successCount++;
 		else failCount++;
 	}
@@ -14778,7 +14669,44 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
 //#endregion
 //#region src/core/extract-runner.ts
+const encoding = getEncoding("cl100k_base");
 const JSON_EXT_RE$1 = /\.json$/;
+async function limitConcurrency(concurrency, items, fn) {
+	const results = Array.from({ length: items.length });
+	let nextIndex = 0;
+	async function worker() {
+		while (nextIndex < items.length) {
+			const currentIndex = nextIndex++;
+			results[currentIndex] = await fn(items[currentIndex], currentIndex);
+		}
+	}
+	const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
+	await Promise.all(workers);
+	return results;
+}
+function getSchemaKeywords(schema) {
+	const keywords = /* @__PURE__ */ new Set();
+	function walk(properties) {
+		if (!properties) return;
+		for (const [name$1, prop] of Object.entries(properties)) {
+			keywords.add(name$1.toLowerCase());
+			const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
+			for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
+			if (prop && typeof prop === "object") {
+				const p = prop;
+				if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
+				if (typeof p.description === "string") {
+					const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
+					for (const d of descParts) if (d.length > 2) keywords.add(d);
+				}
+				if (p.type === "object") walk(p.properties);
+				if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
+			}
+		}
+	}
+	walk(schema.properties);
+	return Array.from(keywords);
+}
 async function ensureDatabaseReady(dbPath, schema) {
 	try {
 		await fs.access(dbPath);
@@ -14850,44 +14778,52 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 	}
 	const s = spinner();
 	if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
-	const CHUNK_LIMIT = 4e4;
+	const maxTokens = aiConfig.extraction?.maxTokens ?? 8e3;
+	const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
 	let result;
-	if (options?.agent || aiConfig.extraction?.mode === "react") {
-		if (!options?.quiet) consola.info(t("command.extract.file.reactAgentMode"));
-		const agentResult = await extractStructuredDataWithAgent({
-			config: aiConfig,
-			schema: schemaLoad.schema,
-			text: text$1 ?? "",
-			aiexDir,
-			modelOverride,
-			onAgentStep(step) {
-				if (!options?.quiet) {
-					if (step.thought) {
-						const thoughtPreview = step.thought.length > 100 ? `${step.thought.slice(0, 100)}...` : step.thought;
-						s.message(`${pc.cyan(t("command.extract.file.agentThought"))}: ${thoughtPreview.replace(/\n/g, " ")}`);
+	const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
+	if (text$1 && totalTokens > maxTokens) {
+		if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
+			length: totalTokens,
+			limit: maxTokens
+		}));
+		const finalDocs = splitMarkdown(text$1, maxTokens, overlapTokens);
+		if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
+		let processedDocs = finalDocs;
+		if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
+			const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
+			const keywords = getSchemaKeywords(schemaLoad.schema);
+			const scoredChunks = finalDocs.map((doc, idx) => {
+				if (idx === 0) return {
+					index: idx,
+					score: Number.POSITIVE_INFINITY
+				};
+				let score = 0;
+				const docTextLower = doc.pageContent.toLowerCase();
+				for (const kw of keywords) {
+					let pos = docTextLower.indexOf(kw);
+					while (pos !== -1) {
+						score++;
+						pos = docTextLower.indexOf(kw, pos + kw.length);
 					}
-					if (step.toolCalls && step.toolCalls.length > 0) for (const call of step.toolCalls) consola.info(`[Agent Action] Calling tool: ${pc.green(call.toolName)}`);
 				}
+				return {
+					index: idx,
+					score
+				};
+			}).slice(1).sort((a, b) => b.score - a.score);
+			const selectedIndices = new Set([0]);
+			let keptCount = 0;
+			for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
+				selectedIndices.add(sc.index);
+				keptCount++;
 			}
-		});
-		if (!agentResult.success) {
-			if (!options?.quiet) {
-				s.stop(t("command.extract.file.extractFail"));
-				consola.error(agentResult.error);
-			}
-			return {
-				success: false,
-				error: agentResult.error
-			};
+			processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
+			if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
+				original: finalDocs.length,
+				filtered: processedDocs.length
+			}));
 		}
-		result = agentResult;
-	} else if (text$1 && text$1.length > CHUNK_LIMIT) {
-		if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
-			length: text$1.length,
-			limit: CHUNK_LIMIT
-		}));
-		const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT);
-		if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
 		const chunkResults = [];
 		const accumulatedTokens = {
 			prompt: 0,
@@ -14896,53 +14832,68 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 		};
 		let success = true;
 		let errorMsg = "";
-		for (let i = 0; i < finalDocs.length; i++) {
-			const doc = finalDocs[i];
-			if (!options?.quiet) s.message(t("command.extract.file.extractingChunk", {
-				current: i + 1,
-				total: finalDocs.length
-			}));
-			const headings = [];
-			if (doc.metadata) {
-				if (doc.metadata.h1) headings.push(doc.metadata.h1);
-				if (doc.metadata.h2) headings.push(doc.metadata.h2);
-				if (doc.metadata.h3) headings.push(doc.metadata.h3);
-				if (doc.metadata.h4) headings.push(doc.metadata.h4);
-			}
-			let chunkText = doc.pageContent;
-			if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
-			const chunkResult = await extractStructuredData({
-				config: aiConfig,
-				schema: schemaLoad.schema,
-				text: chunkText,
-				aiexDir,
-				modelOverride,
-				onRetry(info) {
-					if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
-						current: i + 1,
-						total: finalDocs.length,
-						code: info.statusCode,
-						delay: info.delayMs / 1e3,
-						attempt: info.attempt,
-						max: info.maxRetries
-					}));
+		const extractionTasks = processedDocs.map((doc, i) => {
+			return async () => {
+				if (!success) return;
+				const headings = [];
+				if (doc.metadata) {
+					if (doc.metadata.h1) headings.push(doc.metadata.h1);
+					if (doc.metadata.h2) headings.push(doc.metadata.h2);
+					if (doc.metadata.h3) headings.push(doc.metadata.h3);
+					if (doc.metadata.h4) headings.push(doc.metadata.h4);
 				}
-			});
-			if (!chunkResult.success) {
-				success = false;
-				errorMsg = chunkResult.error || t("common.unknownError");
-				if (!options?.quiet) {
-					s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
-					consola.error(errorMsg);
+				let chunkText = doc.pageContent;
+				if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
+				const chunkResult = await extractStructuredData({
+					config: aiConfig,
+					schema: schemaLoad.schema,
+					text: chunkText,
+					aiexDir,
+					modelOverride,
+					onRetry(info) {
+						if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
+							current: i + 1,
+							total: processedDocs.length,
+							code: info.statusCode,
+							delay: info.delayMs / 1e3,
+							attempt: info.attempt,
+							max: info.maxRetries
+						}));
+					}
+				});
+				if (!chunkResult.success) {
+					success = false;
+					errorMsg = chunkResult.error || t("common.unknownError");
+					if (!options?.quiet) {
+						s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
+						consola.error(errorMsg);
+					}
+					return;
 				}
-				break;
-			}
-			if (chunkResult.data) chunkResults.push(chunkResult.data);
-			if (chunkResult.tokensUsed) {
-				accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
-				accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
-				accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
-			}
+				if (chunkResult.data) chunkResults.push(chunkResult.data);
+				if (chunkResult.tokensUsed) {
+					accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
+					accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
+					accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
+				}
+			};
+		});
+		const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
+		if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
+			current: 1,
+			total: processedDocs.length
+		}));
+		try {
+			await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
+				if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
+					current: idx + 1,
+					total: processedDocs.length
+				}));
+				await task();
+			});
+		} catch (e) {
+			success = false;
+			errorMsg = e instanceof Error ? e.message : String(e);
 		}
 		if (!success) return {
 			success: false,
@@ -15001,6 +14952,11 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 	}
 	if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
 	if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
+	if (result.evidenceSummary && !options?.quiet) {
+		const summary = result.evidenceSummary;
+		const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
+		consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
+	}
 	if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
 		prompt: result.tokensUsed.prompt,
 		completion: result.tokensUsed.completion,
@@ -15029,6 +14985,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 						outputPath: result.outputPath,
 						data: result.data,
 						tablesInserted: insertResult.tablesInserted,
+						evidenceSummary: result.evidenceSummary,
 						tokensUsed: result.tokensUsed
 					};
 				} else {
@@ -15055,11 +15012,12 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 		success: true,
 		outputPath: result.outputPath,
 		data: result.data,
+		evidenceSummary: result.evidenceSummary,
 		tokensUsed: result.tokensUsed
 	};
 }
 async function runAuditedExtraction(options) {
-	const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false, agent = false } = options;
+	const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
 	let fileHash;
 	let isPlainTextFile = false;
 	if (source.type === "file") {
@@ -15127,8 +15085,7 @@ async function runAuditedExtraction(options) {
 		} else text$1 = source.text;
 		const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
 			quiet,
-			insert,
-			agent
+			insert
 		});
 		if (r.success) {
 			let notionPages;
@@ -15168,6 +15125,7 @@ async function runAuditedExtraction(options) {
 				outputName: updated.outputName,
 				tablesInserted: updated.tablesInserted,
 				notionPages: updated.notionPages,
+				evidenceSummary: r.evidenceSummary,
 				tokensUsed: updated.tokensUsed,
 				auditId: updated.id,
 				fileHash
@@ -15587,12 +15545,6 @@ const extractCommand = defineCommand({
 			type: "boolean",
 			description: t("command.extract.args.force"),
 			default: false
-		},
-		agent: {
-			type: "boolean",
-			alias: "a",
-			description: "Enable ReAct agent extraction mode",
-			default: false
 		}
 	},
 	async run({ args, rawArgs }) {
@@ -15620,8 +15572,7 @@ const extractCommand = defineCommand({
 			}
 			const result$1 = await runBatchExtraction(aiexDir, config, aiConfig, args.schema, args.dir, args.glob, modelOverride, {
 				insert: !args.noInsert,
-				force: args.force,
-				agent: args.agent
+				force: args.force
 			});
 			if (!result$1.ok) {
 				failCommand(result$1.error);
@@ -15652,8 +15603,7 @@ const extractCommand = defineCommand({
 			modelOverride,
 			insert: !args.noInsert,
 			force: args.force,
-			quiet: false,
-			agent: args.agent
+			quiet: false
 		});
 		if (!result.success) {
 			failCommand(result.error);
@@ -16303,6 +16253,7 @@ function aiRoutes(config) {
 //#endregion
 //#region src/core/data-service.ts
 const FILE_REGEX = /\.json$/;
+const EVIDENCE_FILE_SUFFIX = ".evidence.json";
 const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
 const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
 const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
@@ -16318,6 +16269,24 @@ function getAuditNotionStatus(record) {
 	if (record.status === "failed") return "failed";
 	return "not_synced";
 }
+async function readEvidenceSummary(extractedDir, outputName) { // Builds a numeric coverage summary from the ".evidence.json" file paired with outputName; resolves to undefined when it cannot.
+	const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX)); // swaps a trailing ".json" for ".evidence.json"; a name without that ending is left unchanged
+	try {
+		const coverage = (await readFile(evidencePath))?.coverage; // NOTE(review): readFile is defined outside this view; this access assumes it resolves to a parsed object, not raw bytes — confirm
+		if (!coverage || typeof coverage !== "object") return void 0; // absent or non-object coverage is reported as "no summary"
+		return {
+			path: evidencePath, // joined path of the evidence file that was read
+			fieldCount: Number(coverage.fieldCount) || 0, // `Number(x) || 0` turns missing or non-numeric values (NaN) into 0; the same applies to every count below
+			evidenceCount: Number(coverage.evidenceCount) || 0,
+			foundCount: Number(coverage.foundCount) || 0,
+			missingCount: Number(coverage.missingCount) || 0,
+			inferredCount: Number(coverage.inferredCount) || 0,
+			issueCount: Number(coverage.issueCount) || 0
+		};
+	} catch { // any error thrown inside the try (by readFile or the property reads) is swallowed without logging
+		return; // resolves to undefined, the same result as the missing-coverage branch
+	}
+}
 async function getRowExtractionActions(aiexDir, tableName) {
 	const actions = /* @__PURE__ */ new Map();
 	const auditRecords = await listExtractionAuditRecords(aiexDir);
@@ -16345,7 +16314,7 @@ async function listExtractions(config) {
 	const aiexDir = path.dirname(config.schemaPath);
 	const extractedDir = path.join(aiexDir, "extracted");
 	await fs.mkdir(extractedDir, { recursive: true });
-	const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
+	const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
 	const auditRecords = await listExtractionAuditRecords(aiexDir);
 	const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
 	const records = [];
@@ -16364,6 +16333,7 @@ async function listExtractions(config) {
 				timestamp,
 				fileSize: stat.size,
 				modifiedAt: stat.mtime.toISOString(),
+				evidenceSummary: await readEvidenceSummary(extractedDir, file),
 				notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
 				notionPages,
 				notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
@@ -16543,6 +16513,7 @@ async function retryNotionSync(config, fileName) {
 //#endregion
 //#region src/server/routes/data.ts
+const JSON_FILE_SUFFIX_RE = /\.json$/;
 const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
 const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
 const tableQuerySchema = z.object({
@@ -16595,10 +16566,22 @@ function dataRoutes(config) {
 		const filePath = path.join(extractedDir, name$1);
 		try {
 			const content = await fs.readFile(filePath, "utf-8");
+			const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
+			let evidenceSummary;
+			try {
+				const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
+				evidenceSummary = evidence?.coverage ? {
+					...evidence.coverage,
+					path: evidencePath
+				} : void 0;
+			} catch {
+				evidenceSummary = void 0;
+			}
 			return c.json({
 				success: true,
 				content,
-				name: name$1
+				name: name$1,
+				evidenceSummary
 			});
 		} catch {
 			return c.json({ error: t("server.extractionNotFound") }, 404);
@@ -16742,6 +16725,7 @@ function extractRoutes(config) {
 				outputName: result.outputName,
 				tablesInserted: result.tablesInserted,
 				notionPages: result.notionPages,
+				evidenceSummary: result.evidenceSummary,
 				tokensUsed: result.tokensUsed,
 				auditId: result.auditId
 			}, 200);
@@ -16809,6 +16793,7 @@ function extractRoutes(config) {
 			outputName: result.outputName,
 			tablesInserted: result.tablesInserted,
 			notionPages: result.notionPages,
+			evidenceSummary: result.evidenceSummary,
 			tokensUsed: result.tokensUsed,
 			auditId: result.auditId
 		}, 200);