npm - aiex-cli - Versions diffs - 0.0.5-beta.2 → 0.0.5-beta.3 - Mend

aiex-cli 0.0.5-beta.2 → 0.0.5-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/README.md +21 -0
package/dist/cli.mjs +468 -11
package/dist/{doctor-collector-DZyLrpqA.mjs → doctor-collector-CQPDBVTw.mjs} +18 -4
package/dist/index.d.mts +91 -88
package/dist/index.mjs +1 -1
package/dist/web/assets/AISettings-DoDVYWfb.js +272 -0
package/dist/web/assets/{index-Dlze68g1.js → index-BWm_fhNt.js} +2 -2
package/dist/web/index.html +1 -1
package/dist/{zh-CN-Qcn0DHFh.mjs → zh-CN-CKxdpj8c.mjs} +8 -0
package/package.json +1 -1
package/dist/web/assets/AISettings-BlyTFIIy.js +0 -272

package/README.md CHANGED Viewed

@@ -70,6 +70,7 @@ aiex extract -s <schema> -f <file>        # from file (txt, pdf, png, jpg, ...)
 aiex extract -s <schema> -f <file> -m <model>      # specify AI model (overrides auto-selection)
 aiex extract -s <schema> -f <file> --no-insert     # extract and save JSON without inserting into SQLite
 aiex extract -s <schema> -f <file> --force         # force re-extraction even if already processed
+aiex extract -s <schema> -f <file> --agent         # run ReAct agent mode (ideal for large documents)
 aiex extract -s <schema> -d <directory>            # batch extract all supported files in a directory
 aiex extract -s <schema> -d <dir> -g "*.pdf"       # batch with glob filter
 aiex extract history                               # list extraction audit records
@@ -128,6 +129,7 @@ Dumps all extracted data for a given schema (or table) from the SQLite database
 | `aiex extract -s <name> -f <file> -m <model>` | Extract with a specific AI model |
 | `aiex extract -s <name> -f <file> --no-insert` | Extract and save JSON without inserting into SQLite |
 | `aiex extract -s <name> -f <file> --force` | Force re-extraction even if the file has already been processed |
+| `aiex extract -s <name> -f <file> --agent` | Extract data in ReAct agent mode (using tool navigation) |
 | `aiex extract -s <name> -d <dir>` | Batch extract all supported files in a directory |
 | `aiex extract -s <name> -d <dir> -g "*.pdf"` | Batch extract with glob filter |
 | `aiex extract history` | List extraction audit records |
@@ -202,6 +204,25 @@ aiex completion fish | source
 <br>
+## 📄 Large Document Processing (Pipeline vs. ReAct Agent)
+When processing very large documents (exceeding `40,000` characters), `aiex` provides two separate modes to handle context window limits and cost:
+### 1. Pipeline Mode (Default)
+- **Mechanism**: Splits the document at Markdown headings, falling back to paragraph boundaries when a section is still too large. Each chunk is sent to the LLM sequentially, prefixed with its active heading path (e.g. `Chapter > Section`) so the model keeps track of where the chunk sits in the document. Finally, the per-chunk outputs are merged according to the schema: arrays are concatenated, nested objects are merged recursively, and scalar fields keep the first non-empty value.
+- **Best for**: Small-to-medium documents, or content in which every section must be scanned completely (e.g. log files).
+### 2. ReAct Agent Mode
+- **Mechanism**: Spawns an agent equipped with document navigation tools:
+  - `listChunks()`: Returns a Table of Contents (headings, sizes, indices).
+  - `readChunk(chunkId)`: Fetches a specific section.
+  - `searchChunks(query)`: Runs a case-insensitive keyword search across all chunks and returns up to 10 matching chunks with short context snippets.
+  - `submitExtraction(data)`: Submits the final structured JSON payload.
+  The agent uses these tools to dynamically browse and retrieve only the relevant parts, drastically reducing API token costs for giant documents.
+- **How to run**: Pass `--agent` / `-a` via the CLI, or toggle **Extraction Mode** under the **Prompts** tab in the Web UI.
+<br>
 ## 🔧 AI Configuration
 aiex works with any OpenAI-compatible API provider. Configure in the Web UI (AI Settings panel):

package/dist/cli.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-DZyLrpqA.mjs";
+import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-CQPDBVTw.mjs";
 import { createRequire } from "node:module";
 import fs from "node:fs/promises";
 import os from "node:os";
@@ -18,7 +18,7 @@ import pc from "picocolors";
 import { Buffer } from "node:buffer";
 import * as XLSX from "xlsx";
 import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
-import { APICallError, Output, generateText, jsonSchema } from "ai";
+import { APICallError, Output, generateText, jsonSchema, tool } from "ai";
 import pRetry from "p-retry";
 import mime from "mime";
 import { jsonrepair } from "jsonrepair";
@@ -13128,7 +13128,7 @@ function propertyToExtractionSchema(property) {
 	}
 	return { type: nullableType(property.type) };
 }
-function isRecord(value) {
+function isRecord$1(value) {
 	return typeof value === "object" && value !== null && !Array.isArray(value);
 }
 function schemaToExtractionOutputSchema(schema) {
@@ -13166,7 +13166,7 @@ function validatePropertyValue(path$1, property, value, issues) {
 			}
 			return;
 		case "object":
-			if (!isRecord(value)) {
+			if (!isRecord$1(value)) {
 				issues.push(`${path$1}: expected object or null`);
 				return;
 			}
@@ -13189,7 +13189,7 @@ function validateProperties(basePath, properties, data, issues) {
 	}
 }
 function validateExtractedData(schema, data) {
-	if (!isRecord(data)) return {
+	if (!isRecord$1(data)) return {
 		success: false,
 		error: "Extracted data must be a JSON object."
 	};
@@ -13512,6 +13512,329 @@ function insertExtractedData(db, schema, data) {
 	}
 }
+//#endregion
+//#region src/core/ai-extraction/json-merger.ts
+/**
+* Reports whether `value` is a non-null, non-array object.
+* Primitives, functions, `null`, `undefined` and arrays all yield `false`.
+*/
+function isRecord(value) {
+	if (value === null || Array.isArray(value)) return false;
+	return typeof value === "object";
+}
+/**
+* Folds the values that several chunks produced for one schema property into
+* a single value.
+*
+* - `null` / `undefined` candidates are ignored; if nothing remains the result is `null`.
+* - `array` properties: every array candidate is concatenated in chunk order
+*   (non-array candidates are dropped).
+* - `object` properties without declared child properties: record candidates are
+*   shallow-merged, later chunks overriding earlier keys.
+* - `object` properties with declared child properties: each child is merged
+*   recursively with these same rules.
+* - anything else: the first candidate that is not a blank string wins.
+*/
+function mergePropertyValue(property, values) {
+	const present = values.filter((candidate) => candidate !== null && candidate !== void 0);
+	if (present.length === 0) return null;
+	switch (property.type) {
+		case "array":
+			return present.flatMap((candidate) => Array.isArray(candidate) ? [...candidate] : []);
+		case "object": {
+			const records = present.filter((candidate) => isRecord(candidate));
+			const childDefs = property.properties;
+			if (!childDefs) return Object.assign({}, ...records);
+			const combined = {};
+			for (const [childName, childDef] of Object.entries(childDefs)) {
+				combined[childName] = mergePropertyValue(childDef, records.map((record) => record[childName]));
+			}
+			return combined;
+		}
+		default: {
+			const firstMeaningful = present.find((candidate) => typeof candidate !== "string" || candidate.trim() !== "");
+			return firstMeaningful === void 0 ? null : firstMeaningful;
+		}
+	}
+}
+/**
+* Merges structured extraction outputs from multiple document chunks
+* according to the schema properties.
+*
+* - No results yields an empty object; a single result is returned as-is
+*   (same object reference, no normalisation).
+* - With two or more results, every schema property is merged through
+*   `mergePropertyValue`, except auto-increment primary keys, which are left
+*   out of the merged object.
+* - A schema without `properties` produces an empty object instead of throwing.
+*/
+function mergeExtractionResults(schema, results) {
+	if (results.length === 0) return {};
+	if (results.length === 1) return results[0];
+	const merged = {};
+	for (const [propName, propDef] of Object.entries(schema.properties ?? {})) {
+		if (propDef.primary && propDef.autoIncrement) continue;
+		merged[propName] = mergePropertyValue(propDef, results.map((r) => r?.[propName]));
+	}
+	return merged;
+}
+//#endregion
+//#region src/core/ai-extraction/text-splitter.ts
+const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
+/**
+* Splits a Markdown document into chunks based on header hierarchy.
+* Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
+* when a section exceeds the maxSize limit.
+*
+* Details:
+* - Line endings are normalised to "\n" first. Without this, a trailing "\r"
+*   stops HEADING_RE from matching and "\r\n\r\n" is not seen as a paragraph break.
+* - Lines inside fenced code blocks (``` or ~~~) are never treated as headings,
+*   so a shell comment such as "# install" does not start a new chunk.
+* - Whitespace-only chunks are dropped.
+* - A single paragraph longer than maxSize is NOT split further, so a chunk can
+*   still exceed maxSize in that case.
+*
+* Returns an array of `{ pageContent, metadata }`, where `metadata` holds the
+* heading titles (`h1`..`h4`) active for the chunk, or `undefined` for levels
+* that are not set.
+*/
+function splitMarkdown(text$1, maxSize = 4e4) {
+	const fenceRe = /^ {0,3}(`{3,}|~{3,})/;
+	const lines = text$1.split(/\r?\n/);
+	const chunks = [];
+	let currentHeadings = [];
+	let currentChunkLines = [];
+	let currentSize = 0;
+	// Marker (e.g. "```") of the code fence we are currently inside, or null.
+	let openFence = null;
+	const getMetadata = (headings) => {
+		return {
+			h1: headings[0] || void 0,
+			h2: headings[1] || void 0,
+			h3: headings[2] || void 0,
+			h4: headings[3] || void 0
+		};
+	};
+	const pushChunk = (pageContent) => {
+		// A blank chunk carries nothing to extract and would only cost an LLM call.
+		if (pageContent.trim() === "") return;
+		chunks.push({
+			pageContent,
+			metadata: getMetadata(currentHeadings)
+		});
+	};
+	const flushChunk = () => {
+		if (currentChunkLines.length === 0) return;
+		const pageContent = currentChunkLines.join("\n");
+		currentChunkLines = [];
+		currentSize = 0;
+		if (pageContent.length <= maxSize) {
+			pushChunk(pageContent);
+			return;
+		}
+		// Oversized section: regroup whole paragraphs into sub-chunks of at most maxSize.
+		let subParagraphs = [];
+		let subSize = 0;
+		for (const para of pageContent.split("\n\n")) {
+			if (subSize + para.length > maxSize && subParagraphs.length > 0) {
+				pushChunk(subParagraphs.join("\n\n"));
+				subParagraphs = [];
+				subSize = 0;
+			}
+			subParagraphs.push(para);
+			// +2 accounts for the "\n\n" separator restored by join().
+			subSize += para.length + 2;
+		}
+		if (subParagraphs.length > 0) pushChunk(subParagraphs.join("\n\n"));
+	};
+	for (const line of lines) {
+		const fenceMatch = line.match(fenceRe);
+		if (fenceMatch) {
+			const marker = fenceMatch[1];
+			if (openFence === null) openFence = marker;
+			else if (marker[0] === openFence[0] && marker.length >= openFence.length && line.slice(fenceMatch[0].length).trim() === "") openFence = null;
+		}
+		const headingMatch = openFence === null ? line.match(HEADING_RE) : null;
+		if (headingMatch) {
+			// Flush first so the finished chunk keeps the headings it was written under.
+			flushChunk();
+			const depth = headingMatch[1].length;
+			const title = headingMatch[2].trim();
+			// A heading at depth N invalidates every deeper heading recorded so far.
+			currentHeadings = currentHeadings.slice(0, depth - 1);
+			currentHeadings[depth - 1] = title;
+		}
+		currentChunkLines.push(line);
+		currentSize += line.length + 1;
+		if (currentSize > maxSize) flushChunk();
+	}
+	flushChunk();
+	return chunks;
+}
+//#endregion
+//#region src/core/ai-extraction/react-agent.ts
+/**
+* Runs schema-driven extraction in ReAct agent mode: the document is split
+* into chunks and the model is given tools to list, read and search those
+* chunks before submitting one final JSON object.
+*
+* Flow: pick a model -> run the tool-calling loop -> if `submitExtraction` was
+* never called, try to parse the model's final text as JSON -> validate against
+* the schema (one LLM correction pass on failure) -> write the JSON file to the
+* extraction output directory.
+*
+* `input` fields read here: `config` (`provider`, optional `langfuse` and
+* `extraction`), `schema` (`properties`, `table.name`), `text`, `aiexDir`,
+* optional `modelOverride`, optional `onAgentStep({ thought, toolCalls })`.
+*
+* Returns `{ success: true, outputPath, data, tokensUsed }` on success and
+* `{ success: false, error }` otherwise; errors thrown inside the main try
+* block are converted into the failure shape.
+*/
+async function extractStructuredDataWithAgent(input) {
+	const { config, schema, text: text$1, aiexDir, modelOverride, onAgentStep } = input;
+	if (!config.provider.apiKey) return {
+		success: false,
+		error: t("errors.ai.apiKeyMissing")
+	};
+	const maxAgentSteps = 12;
+	const maxSearchHits = 10;
+	const chunks = splitMarkdown(text$1, 15e3);
+	// Rough size estimates used only for model selection.
+	const inputTokens = Math.ceil(text$1.length / 2);
+	const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
+	const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
+	let selected;
+	try {
+		selected = modelOverride ?? selectModel({
+			models: config.provider.models,
+			isImage: false,
+			inputTokens,
+			outputTokens
+		});
+	} catch (e) {
+		return {
+			success: false,
+			error: e.message
+		};
+	}
+	const useTelemetry = !!(config.langfuse?.publicKey && config.langfuse.secretKey);
+	// Running token total across the agent loop and the optional correction call.
+	const tokensUsed = {
+		prompt: 0,
+		completion: 0,
+		total: 0
+	};
+	const addUsage = (usage) => {
+		if (!usage) return;
+		// Accept both usage field namings (inputTokens/outputTokens and promptTokens/completionTokens).
+		tokensUsed.prompt += usage.inputTokens ?? usage.promptTokens ?? 0;
+		tokensUsed.completion += usage.outputTokens ?? usage.completionTokens ?? 0;
+		tokensUsed.total = tokensUsed.prompt + tokensUsed.completion;
+	};
+	const headingPath = (metadata) => [
+		metadata?.h1,
+		metadata?.h2,
+		metadata?.h3,
+		metadata?.h4
+	].filter(Boolean).join(" > ");
+	// NOTE(review): the installed `ai` major version is not visible from here.
+	// AI SDK 4 reads a tool's schema from `parameters`, AI SDK 5 from `inputSchema`;
+	// both keys are supplied so the tools carry a schema either way. Confirm and drop one.
+	const defineTool = (description, inputSchema, execute) => tool({
+		description,
+		parameters: inputSchema,
+		inputSchema,
+		execute
+	});
+	try {
+		if (useTelemetry) initLangfuse(config);
+		const provider = createOpenAICompatible({
+			baseURL: config.provider.baseURL,
+			name: "openai-compatible",
+			apiKey: config.provider.apiKey,
+			supportsStructuredOutputs: false
+		});
+		// Filled in by the submitExtraction tool when the agent hands in its result.
+		let finalExtractedData = null;
+		const tools = {
+			listChunks: defineTool("Get a list of all text chunks in the document, showing their chunk index ID, character size, and markdown heading hierarchy (metadata). Use this as a Table of Contents to locate sections of interest.", z.object({}), async () => {
+				return chunks.map((c, idx) => ({
+					id: idx + 1,
+					size: c.pageContent.length,
+					headings: c.metadata
+				}));
+			}),
+			readChunk: defineTool("Read the full text content of a specific chunk by its ID.", z.object({ chunkId: z.number().int().describe("The ID (1-based index) of the chunk to read.") }), async ({ chunkId }) => {
+				const index = chunkId - 1;
+				if (index < 0 || index >= chunks.length) return { error: `Invalid chunkId: ${chunkId}. Valid IDs are 1 to ${chunks.length}.` };
+				const chunk = chunks[index];
+				return {
+					chunkId,
+					headings: headingPath(chunk.metadata),
+					content: chunk.pageContent
+				};
+			}),
+			searchChunks: defineTool("Search all chunks in the document for specific keywords or search terms. Returns matching chunk IDs and small matching context snippets.", z.object({ query: z.string().describe("The keyword or search phrase to search for.") }), async ({ query }) => {
+				const lowercaseQuery = query.toLowerCase();
+				// A blank query would match every chunk at offset 0; treat it as "no hits".
+				if (lowercaseQuery.trim() === "") return [];
+				const results = [];
+				for (let i = 0; i < chunks.length && results.length < maxSearchHits; i++) {
+					const chunkText = chunks[i].pageContent;
+					const idx = chunkText.toLowerCase().indexOf(lowercaseQuery);
+					if (idx === -1) continue;
+					// Up to 60 characters of context on each side of the first match.
+					const start = Math.max(0, idx - 60);
+					const end = Math.min(chunkText.length, idx + lowercaseQuery.length + 60);
+					const snippet = `...${chunkText.slice(start, end).replace(/\n/g, " ")}...`;
+					results.push({
+						chunkId: i + 1,
+						headings: chunks[i].metadata,
+						snippet
+					});
+				}
+				return results;
+			}),
+			submitExtraction: defineTool("Submit the final extracted JSON object conforming to the schema definition. Call this ONLY after you have gathered all necessary information.", z.object({ data: z.any().describe("The extracted JSON object conforming to the target schema.") }), async ({ data }) => {
+				finalExtractedData = data;
+				return {
+					status: "success",
+					message: "Data submitted successfully. The extraction is now complete."
+				};
+			})
+		};
+		const outputSchema = schemaToExtractionOutputSchema(schema);
+		const systemPrompt = `You are a precise data extraction agent. Your goal is to extract structured information from a document to populate the target JSON schema.
+Target JSON Schema structure to populate:
+${JSON.stringify(outputSchema, null, 2)}
+You are equipped with tools to browse the document dynamically:
+1. First, call listChunks to understand the document layout and what sections exist.
+2. Based on the schema fields, call readChunk or searchChunks to locate and read relevant content.
+3. You can make multiple tool calls. Do not guess. Check the text carefully.
+4. Once you have located and read all the necessary information, call the submitExtraction tool with the fully extracted JSON object.
+5. After calling submitExtraction, you should stop.
+CRITICAL RULES:
+1. Extract data strictly conforming to the types and properties of the Target JSON Schema.
+2. If a field's value cannot be found in the document after thorough search, set it to null.
+3. Do not invent any values.
+4. Call submitExtraction exactly once with the final JSON result.`;
+		const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
+		const result = await generateText({
+			model: provider.chatModel(selected.name),
+			system: systemPrompt,
+			prompt: "Please start by listing the chunks to understand the document structure, then gather the required facts and submit the final JSON extraction.",
+			tools,
+			// NOTE(review): `maxSteps` is the AI SDK 4 step budget; AI SDK 5 uses `stopWhen`
+			// and otherwise stops after a single step. Both are passed; confirm and drop one.
+			maxSteps: maxAgentSteps,
+			stopWhen: ({ steps }) => steps.length >= maxAgentSteps,
+			abortSignal: AbortSignal.timeout(timeoutMs),
+			experimental_telemetry: { isEnabled: useTelemetry },
+			onStepFinish({ text: text$2, toolCalls }) {
+				if (onAgentStep) onAgentStep({
+					thought: text$2,
+					toolCalls
+				});
+			}
+		});
+		// Prefer the all-steps total when the SDK exposes it; `usage` alone may cover only one step.
+		addUsage(result.totalUsage ?? result.usage);
+		if (!finalExtractedData) {
+			// Best-effort fallback: the model may have answered with JSON text
+			// instead of calling submitExtraction. A parse failure is not an error here.
+			if (result.text) try {
+				finalExtractedData = safeParseJSON(result.text);
+			} catch {}
+		}
+		if (!finalExtractedData) return {
+			success: false,
+			error: "Agent finished without submitting structured data."
+		};
+		const validation = validateExtractedData(schema, finalExtractedData);
+		if (!validation.success) {
+			// One correction round-trip: show the model its own output plus the validation errors.
+			const correctionSystemPrompt = `You are a precise data correction assistant. Your task is to correct validation errors in a previously generated JSON object to make it comply with the JSON Schema.
+JSON Schema Definition:
+${JSON.stringify(outputSchema, null, 2)}
+Validation Errors:
+${validation.error}
+Original Incorrect JSON:
+${JSON.stringify(finalExtractedData, null, 2)}
+Please output the corrected JSON object. Return ONLY the corrected JSON object, with no markdown tags or explanations.`;
+			const correction = await generateText({
+				model: provider.chatModel(selected.name),
+				system: correctionSystemPrompt,
+				prompt: "Please correct the JSON output now.",
+				abortSignal: AbortSignal.timeout(timeoutMs),
+				experimental_telemetry: { isEnabled: useTelemetry }
+			});
+			addUsage(correction.totalUsage ?? correction.usage);
+			const correctedData = safeParseJSON(correction.text);
+			const secondValidation = validateExtractedData(schema, correctedData);
+			if (!secondValidation.success) return {
+				success: false,
+				error: `Agent output validation failed: ${secondValidation.error}`
+			};
+			finalExtractedData = correctedData;
+		}
+		const outputDir = path.resolve(aiexDir, config.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
+		await fs.mkdir(outputDir, { recursive: true });
+		// ":" and "." are replaced so the ISO timestamp is safe inside a file name.
+		const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+		const outputFileName = `${schema.table.name}-${timestamp}.json`;
+		const outputPath = path.join(outputDir, outputFileName);
+		await writeFile(outputPath, finalExtractedData, {
+			spaces: 2,
+			EOL: "\n"
+		});
+		return {
+			success: true,
+			outputPath,
+			data: finalExtractedData,
+			tokensUsed
+		};
+	} catch (error) {
+		return {
+			success: false,
+			error: getErrorMessage(error)
+		};
+	}
+}
 //#endregion
 //#region src/core/extraction-audit.ts
 const AUDIT_ID_RE = /^[\w.-]+$/;
@@ -14396,6 +14719,7 @@ async function processOneFile(aiexDir, config, aiConfig, schemaName, filePath, m
 		modelOverride,
 		insert: options?.insert,
 		force: options?.force,
+		agent: options?.agent,
 		quiet: false
 	});
 	if (result.success) {
@@ -14435,7 +14759,8 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
 		})}`);
 		if (await processOneFile(aiexDir, config, aiConfig, schemaName, file, modelOverride, {
 			insert: options?.insert,
-			force: options?.force
+			force: options?.force,
+			agent: options?.agent
 		})) successCount++;
 		else failCount++;
 	}
@@ -14525,7 +14850,130 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 	}
 	const s = spinner();
 	if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
-	const result = await extractStructuredData({
+	const CHUNK_LIMIT = 4e4;
+	let result;
+	if (options?.agent || aiConfig.extraction?.mode === "react") {
+		if (!options?.quiet) consola.info(t("command.extract.file.reactAgentMode"));
+		const agentResult = await extractStructuredDataWithAgent({
+			config: aiConfig,
+			schema: schemaLoad.schema,
+			text: text$1 ?? "",
+			aiexDir,
+			modelOverride,
+			onAgentStep(step) {
+				if (!options?.quiet) {
+					if (step.thought) {
+						const thoughtPreview = step.thought.length > 100 ? `${step.thought.slice(0, 100)}...` : step.thought;
+						s.message(`${pc.cyan(t("command.extract.file.agentThought"))}: ${thoughtPreview.replace(/\n/g, " ")}`);
+					}
+					if (step.toolCalls && step.toolCalls.length > 0) for (const call of step.toolCalls) consola.info(`[Agent Action] Calling tool: ${pc.green(call.toolName)}`);
+				}
+			}
+		});
+		if (!agentResult.success) {
+			if (!options?.quiet) {
+				s.stop(t("command.extract.file.extractFail"));
+				consola.error(agentResult.error);
+			}
+			return {
+				success: false,
+				error: agentResult.error
+			};
+		}
+		result = agentResult;
+	} else if (text$1 && text$1.length > CHUNK_LIMIT) {
+		if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
+			length: text$1.length,
+			limit: CHUNK_LIMIT
+		}));
+		const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT);
+		if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
+		const chunkResults = [];
+		const accumulatedTokens = {
+			prompt: 0,
+			completion: 0,
+			total: 0
+		};
+		let success = true;
+		let errorMsg = "";
+		for (let i = 0; i < finalDocs.length; i++) {
+			const doc = finalDocs[i];
+			if (!options?.quiet) s.message(t("command.extract.file.extractingChunk", {
+				current: i + 1,
+				total: finalDocs.length
+			}));
+			const headings = [];
+			if (doc.metadata) {
+				if (doc.metadata.h1) headings.push(doc.metadata.h1);
+				if (doc.metadata.h2) headings.push(doc.metadata.h2);
+				if (doc.metadata.h3) headings.push(doc.metadata.h3);
+				if (doc.metadata.h4) headings.push(doc.metadata.h4);
+			}
+			let chunkText = doc.pageContent;
+			if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
+			const chunkResult = await extractStructuredData({
+				config: aiConfig,
+				schema: schemaLoad.schema,
+				text: chunkText,
+				aiexDir,
+				modelOverride,
+				onRetry(info) {
+					if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
+						current: i + 1,
+						total: finalDocs.length,
+						code: info.statusCode,
+						delay: info.delayMs / 1e3,
+						attempt: info.attempt,
+						max: info.maxRetries
+					}));
+				}
+			});
+			if (!chunkResult.success) {
+				success = false;
+				errorMsg = chunkResult.error || t("common.unknownError");
+				if (!options?.quiet) {
+					s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
+					consola.error(errorMsg);
+				}
+				break;
+			}
+			if (chunkResult.data) chunkResults.push(chunkResult.data);
+			if (chunkResult.tokensUsed) {
+				accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
+				accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
+				accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
+			}
+		}
+		if (!success) return {
+			success: false,
+			error: errorMsg
+		};
+		const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
+		const validation = validateExtractedData(schemaLoad.schema, mergedData);
+		if (!validation.success) {
+			const valError = validation.error || "Merged data validation failed";
+			if (!options?.quiet) {
+				s.stop(t("command.extract.file.validationFail"));
+				consola.error(valError);
+			}
+			return {
+				success: false,
+				error: valError
+			};
+		}
+		const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
+		await fs.mkdir(outputDir, { recursive: true });
+		const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+		const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
+		const finalMergedOutputPath = path.join(outputDir, outputFileName);
+		await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
+		result = {
+			success: true,
+			data: mergedData,
+			tokensUsed: accumulatedTokens,
+			outputPath: finalMergedOutputPath
+		};
+	} else result = await extractStructuredData({
 		config: aiConfig,
 		schema: schemaLoad.schema,
 		text: text$1 ?? "",
@@ -14611,7 +15059,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 	};
 }
 async function runAuditedExtraction(options) {
-	const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
+	const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false, agent = false } = options;
 	let fileHash;
 	let isPlainTextFile = false;
 	if (source.type === "file") {
@@ -14679,7 +15127,8 @@ async function runAuditedExtraction(options) {
 		} else text$1 = source.text;
 		const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
 			quiet,
-			insert
+			insert,
+			agent
 		});
 		if (r.success) {
 			let notionPages;
@@ -15138,6 +15587,12 @@ const extractCommand = defineCommand({
 			type: "boolean",
 			description: t("command.extract.args.force"),
 			default: false
+		},
+		agent: {
+			type: "boolean",
+			alias: "a",
+			description: "Enable ReAct agent extraction mode",
+			default: false
 		}
 	},
 	async run({ args, rawArgs }) {
@@ -15165,7 +15620,8 @@ const extractCommand = defineCommand({
 			}
 			const result$1 = await runBatchExtraction(aiexDir, config, aiConfig, args.schema, args.dir, args.glob, modelOverride, {
 				insert: !args.noInsert,
-				force: args.force
+				force: args.force,
+				agent: args.agent
 			});
 			if (!result$1.ok) {
 				failCommand(result$1.error);
@@ -15196,7 +15652,8 @@ const extractCommand = defineCommand({
 			modelOverride,
 			insert: !args.noInsert,
 			force: args.force,
-			quiet: false
+			quiet: false,
+			agent: args.agent
 		});
 		if (!result.success) {
 			failCommand(result.error);

package/dist/{doctor-collector-DZyLrpqA.mjs → doctor-collector-CQPDBVTw.mjs} RENAMED Viewed

@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
 //#endregion
 //#region package.json
 var name = "aiex-cli";
-var version = "0.0.5-beta.2";
+var version = "0.0.5-beta.3";
 var description = "JSON Schema → SQLite with AI-powered data extraction";
 var package_default = {
 	name,
@@ -228,7 +228,10 @@ const PromptConfigSchema = z.object({
 	systemTemplate: z.string().min(1),
 	userTemplate: z.string().min(1)
 });
-const ExtractionConfigSchema = z.object({ outputDir: z.string().min(1) });
+const ExtractionConfigSchema = z.object({
+	outputDir: z.string().min(1),
+	mode: z.enum(["pipeline", "react"]).default("pipeline").optional()
+});
 const ImageOcrConfigSchema = z.object({
 	ocrFallback: z.enum([
 		"auto",
@@ -335,7 +338,10 @@ Extraction requirements:
 	userTemplate: `Please extract data from the following text:
 {text}`
 };
-const DEFAULT_EXTRACTION_CONFIG = { outputDir: ".aiex/extracted" };
+const DEFAULT_EXTRACTION_CONFIG = {
+	outputDir: ".aiex/extracted",
+	mode: "pipeline"
+};
 const DEFAULT_IMAGE_OCR_CONFIG = {
 	ocrFallback: "auto",
 	ocrLanguages: "en-US, zh-Hans",
@@ -564,9 +570,17 @@ const en = {
 				errorProcessing: "Error processing {{name}}: {{error}}",
 				extractedFrom: "Extracting from {{file}}...",
 				extracting: "Extracting data...",
+				reactAgentMode: "Starting ReAct Agent extraction...",
+				agentThought: "Agent Thought",
 				extractFail: "Extraction failed",
 				extractComplete: "Extraction complete",
 				extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
+				chunking: "Input text length ({{length}} chars) exceeds limit ({{limit}} chars). Splitting into chunks...",
+				chunksCount: "Split into {{count}} chunk(s).",
+				extractingChunk: "Extracting chunk {{current}}/{{total}}...",
+				extractRetryChunk: "Chunk {{current}}/{{total}} API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
+				extractFailChunk: "Extraction failed for chunk {{current}}/{{total}}",
+				validationFail: "Merged data validation failed",
 				resultSaved: "Result saved: {{path}}",
 				tokenUsage: "Token usage: prompt={{prompt}}, completion={{completion}}, total={{total}}",
 				insertingDb: "Inserting into database...",
@@ -956,7 +970,7 @@ async function initI18n(lng) {
 			fallbackLng: "en",
 			resources: {
 				"en": { translation: en },
-				"zh-CN": { translation: await import("./zh-CN-Qcn0DHFh.mjs").then((m) => m.zhCN) }
+				"zh-CN": { translation: await import("./zh-CN-CKxdpj8c.mjs").then((m) => m.zhCN) }
 			},
 			interpolation: { escapeValue: false },
 			returnNull: false