aiex-cli 0.0.5-beta.2 → 0.0.5-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -70,6 +70,7 @@ aiex extract -s <schema> -f <file> # from file (txt, pdf, png, jpg, ...)
70
70
  aiex extract -s <schema> -f <file> -m <model> # specify AI model (overrides auto-selection)
71
71
  aiex extract -s <schema> -f <file> --no-insert # extract and save JSON without inserting into SQLite
72
72
  aiex extract -s <schema> -f <file> --force # force re-extraction even if already processed
73
+ aiex extract -s <schema> -f <file> --agent # run ReAct agent mode (ideal for large documents)
73
74
  aiex extract -s <schema> -d <directory> # batch extract all supported files in a directory
74
75
  aiex extract -s <schema> -d <dir> -g "*.pdf" # batch with glob filter
75
76
  aiex extract history # list extraction audit records
@@ -128,6 +129,7 @@ Dumps all extracted data for a given schema (or table) from the SQLite database
128
129
  | `aiex extract -s <name> -f <file> -m <model>` | Extract with a specific AI model |
129
130
  | `aiex extract -s <name> -f <file> --no-insert` | Extract and save JSON without inserting into SQLite |
130
131
  | `aiex extract -s <name> -f <file> --force` | Force re-extraction even if the file has already been processed |
132
+ | `aiex extract -s <name> -f <file> --agent` | Extract data in ReAct agent mode (using tool navigation) |
131
133
  | `aiex extract -s <name> -d <dir>` | Batch extract all supported files in a directory |
132
134
  | `aiex extract -s <name> -d <dir> -g "*.pdf"` | Batch extract with glob filter |
133
135
  | `aiex extract history` | List extraction audit records |
@@ -202,6 +204,25 @@ aiex completion fish | source
202
204
 
203
205
  <br>
204
206
 
207
+ ## 📄 Large Document Processing (Pipeline vs. ReAct Agent)
208
+
209
+ When processing very large documents (exceeding `40,000` characters), `aiex` provides two separate modes to handle context window limits and cost:
210
+
211
+ ### 1. Pipeline Mode (Default)
212
+ - **Mechanism**: Splits the document at Markdown headings, falling back to paragraph boundaries when a section is still too large. Each chunk is sent to the LLM sequentially, with its active heading path (e.g. `H1 > H2 > H3`) prepended as context so the document structure is not lost. Finally, the per-chunk outputs are merged recursively, field by field, according to the schema.
213
+ - **Best for**: Small-to-medium files, or documents in which every section must be scanned completely (e.g. log files).
214
+
215
+ ### 2. ReAct Agent Mode
216
+ - **Mechanism**: Spawns an agent equipped with document navigation tools:
217
+ - `listChunks()`: Returns a Table of Contents (headings, sizes, indices).
218
+ - `readChunk(chunkId)`: Fetches a specific section.
219
+ - `searchChunks(query)`: Matches keywords across all chunks.
220
+ - `submitExtraction(data)`: Submits the final structured JSON payload.
221
+ The agent uses these tools to dynamically browse and retrieve only the relevant parts, drastically reducing API token costs for giant documents.
222
+ - **How to run**: Pass `--agent` / `-a` via the CLI, or toggle **Extraction Mode** under the **Prompts** tab in the Web UI.
223
+
224
+ <br>
225
+
205
226
  ## 🔧 AI Configuration
206
227
 
207
228
  aiex works with any OpenAI-compatible API provider. Configure in the Web UI (AI Settings panel):
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-DZyLrpqA.mjs";
1
+ import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-CQPDBVTw.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -18,7 +18,7 @@ import pc from "picocolors";
18
18
  import { Buffer } from "node:buffer";
19
19
  import * as XLSX from "xlsx";
20
20
  import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
21
- import { APICallError, Output, generateText, jsonSchema } from "ai";
21
+ import { APICallError, Output, generateText, jsonSchema, tool } from "ai";
22
22
  import pRetry from "p-retry";
23
23
  import mime from "mime";
24
24
  import { jsonrepair } from "jsonrepair";
@@ -13128,7 +13128,7 @@ function propertyToExtractionSchema(property) {
13128
13128
  }
13129
13129
  return { type: nullableType(property.type) };
13130
13130
  }
13131
- function isRecord(value) {
13131
+ function isRecord$1(value) {
13132
13132
  return typeof value === "object" && value !== null && !Array.isArray(value);
13133
13133
  }
13134
13134
  function schemaToExtractionOutputSchema(schema) {
@@ -13166,7 +13166,7 @@ function validatePropertyValue(path$1, property, value, issues) {
13166
13166
  }
13167
13167
  return;
13168
13168
  case "object":
13169
- if (!isRecord(value)) {
13169
+ if (!isRecord$1(value)) {
13170
13170
  issues.push(`${path$1}: expected object or null`);
13171
13171
  return;
13172
13172
  }
@@ -13189,7 +13189,7 @@ function validateProperties(basePath, properties, data, issues) {
13189
13189
  }
13190
13190
  }
13191
13191
  function validateExtractedData(schema, data) {
13192
- if (!isRecord(data)) return {
13192
+ if (!isRecord$1(data)) return {
13193
13193
  success: false,
13194
13194
  error: "Extracted data must be a JSON object."
13195
13195
  };
@@ -13512,6 +13512,329 @@ function insertExtractedData(db, schema, data) {
13512
13512
  }
13513
13513
  }
13514
13514
 
13515
+ //#endregion
13516
+ //#region src/core/ai-extraction/json-merger.ts
13517
+ function isRecord(value) {
13518
+ return typeof value === "object" && value !== null && !Array.isArray(value);
13519
+ }
13520
+ function mergePropertyValue(property, values) {
13521
+ const nonNullValues = values.filter((v) => v !== null && v !== void 0);
13522
+ if (nonNullValues.length === 0) return null;
13523
+ if (property.type === "array") {
13524
+ const concatenated = [];
13525
+ for (const val of nonNullValues) if (Array.isArray(val)) concatenated.push(...val);
13526
+ return concatenated;
13527
+ }
13528
+ if (property.type === "object") {
13529
+ const childProperties = property.properties;
13530
+ if (!childProperties) {
13531
+ const mergedObj$1 = {};
13532
+ for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
13533
+ return mergedObj$1;
13534
+ }
13535
+ const mergedObj = {};
13536
+ for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
13537
+ return mergedObj;
13538
+ }
13539
+ const bestValue = nonNullValues.find((v) => {
13540
+ if (typeof v === "string") return v.trim() !== "";
13541
+ return true;
13542
+ });
13543
+ return bestValue !== void 0 ? bestValue : null;
13544
+ }
13545
+ /**
13546
+ * Merges structured extraction outputs from multiple document chunks
13547
+ * according to the schema properties.
13548
+ */
13549
+ function mergeExtractionResults(schema, results) {
13550
+ if (results.length === 0) return {};
13551
+ if (results.length === 1) return results[0];
13552
+ const merged = {};
13553
+ for (const [propName, propDef] of Object.entries(schema.properties)) {
13554
+ if (propDef.primary && propDef.autoIncrement) continue;
13555
+ merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
13556
+ }
13557
+ return merged;
13558
+ }
13559
+
13560
+ //#endregion
13561
+ //#region src/core/ai-extraction/text-splitter.ts
13562
+ const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
13563
+ /**
13564
+ * Splits a Markdown document into chunks based on header hierarchy.
13565
+ * Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
13566
+ * when a section exceeds the maxSize limit.
13567
+ */
13568
+ function splitMarkdown(text$1, maxSize = 4e4) {
13569
+ const lines = text$1.split("\n");
13570
+ const chunks = [];
13571
+ let currentHeadings = [];
13572
+ let currentChunkLines = [];
13573
+ let currentSize = 0;
13574
+ const getMetadata = (headings) => {
13575
+ return {
13576
+ h1: headings[0] || void 0,
13577
+ h2: headings[1] || void 0,
13578
+ h3: headings[2] || void 0,
13579
+ h4: headings[3] || void 0
13580
+ };
13581
+ };
13582
+ const flushChunk = () => {
13583
+ if (currentChunkLines.length === 0) return;
13584
+ const pageContent = currentChunkLines.join("\n");
13585
+ if (pageContent.length > maxSize) {
13586
+ const paragraphs = pageContent.split("\n\n");
13587
+ let subLines = [];
13588
+ let subSize = 0;
13589
+ for (const para of paragraphs) {
13590
+ const paraSize = para.length;
13591
+ if (subSize + paraSize > maxSize && subLines.length > 0) {
13592
+ chunks.push({
13593
+ pageContent: subLines.join("\n\n"),
13594
+ metadata: getMetadata(currentHeadings)
13595
+ });
13596
+ subLines = [];
13597
+ subSize = 0;
13598
+ }
13599
+ subLines.push(para);
13600
+ subSize += paraSize + 2;
13601
+ }
13602
+ if (subLines.length > 0) chunks.push({
13603
+ pageContent: subLines.join("\n\n"),
13604
+ metadata: getMetadata(currentHeadings)
13605
+ });
13606
+ } else chunks.push({
13607
+ pageContent,
13608
+ metadata: getMetadata(currentHeadings)
13609
+ });
13610
+ currentChunkLines = [];
13611
+ currentSize = 0;
13612
+ };
13613
+ for (const line of lines) {
13614
+ const headingMatch = line.match(HEADING_RE);
13615
+ if (headingMatch) {
13616
+ flushChunk();
13617
+ const depth = headingMatch[1].length;
13618
+ const title = headingMatch[2].trim();
13619
+ currentHeadings = currentHeadings.slice(0, depth - 1);
13620
+ currentHeadings[depth - 1] = title;
13621
+ }
13622
+ currentChunkLines.push(line);
13623
+ currentSize += line.length + 1;
13624
+ if (currentSize > maxSize) flushChunk();
13625
+ }
13626
+ flushChunk();
13627
+ return chunks;
13628
+ }
13629
+
13630
+ //#endregion
13631
+ //#region src/core/ai-extraction/react-agent.ts
13632
+ async function extractStructuredDataWithAgent(input) {
13633
+ const { config, schema, text: text$1, aiexDir, modelOverride, onAgentStep } = input;
13634
+ if (!config.provider.apiKey) return {
13635
+ success: false,
13636
+ error: t("errors.ai.apiKeyMissing")
13637
+ };
13638
+ const chunks = splitMarkdown(text$1, 15e3);
13639
+ const inputTokens = Math.ceil(text$1.length / 2);
13640
+ const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
13641
+ const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
13642
+ let selected;
13643
+ try {
13644
+ selected = modelOverride ?? selectModel({
13645
+ models: config.provider.models,
13646
+ isImage: false,
13647
+ inputTokens,
13648
+ outputTokens
13649
+ });
13650
+ } catch (e) {
13651
+ return {
13652
+ success: false,
13653
+ error: e.message
13654
+ };
13655
+ }
13656
+ const useTelemetry = !!(config.langfuse?.publicKey && config.langfuse.secretKey);
13657
+ try {
13658
+ if (useTelemetry) initLangfuse(config);
13659
+ const provider = createOpenAICompatible({
13660
+ baseURL: config.provider.baseURL,
13661
+ name: "openai-compatible",
13662
+ apiKey: config.provider.apiKey,
13663
+ supportsStructuredOutputs: false
13664
+ });
13665
+ let finalExtractedData = null;
13666
+ const tools = {
13667
+ listChunks: tool({
13668
+ description: "Get a list of all text chunks in the document, showing their chunk index ID, character size, and markdown heading hierarchy (metadata). Use this as a Table of Contents to locate sections of interest.",
13669
+ parameters: z.object({}),
13670
+ execute: async () => {
13671
+ return chunks.map((c, idx) => ({
13672
+ id: idx + 1,
13673
+ size: c.pageContent.length,
13674
+ headings: c.metadata
13675
+ }));
13676
+ }
13677
+ }),
13678
+ readChunk: tool({
13679
+ description: "Read the full text content of a specific chunk by its ID.",
13680
+ parameters: z.object({ chunkId: z.number().int().describe("The ID (1-based index) of the chunk to read.") }),
13681
+ execute: async ({ chunkId }) => {
13682
+ const index = chunkId - 1;
13683
+ if (index < 0 || index >= chunks.length) return { error: `Invalid chunkId: ${chunkId}. Valid IDs are 1 to ${chunks.length}.` };
13684
+ const chunk = chunks[index];
13685
+ const headings = [];
13686
+ if (chunk.metadata) {
13687
+ if (chunk.metadata.h1) headings.push(chunk.metadata.h1);
13688
+ if (chunk.metadata.h2) headings.push(chunk.metadata.h2);
13689
+ if (chunk.metadata.h3) headings.push(chunk.metadata.h3);
13690
+ if (chunk.metadata.h4) headings.push(chunk.metadata.h4);
13691
+ }
13692
+ return {
13693
+ chunkId,
13694
+ headings: headings.join(" > "),
13695
+ content: chunk.pageContent
13696
+ };
13697
+ }
13698
+ }),
13699
+ searchChunks: tool({
13700
+ description: "Search all chunks in the document for specific keywords or search terms. Returns matching chunk IDs and small matching context snippets.",
13701
+ parameters: z.object({ query: z.string().describe("The keyword or search phrase to search for.") }),
13702
+ execute: async ({ query }) => {
13703
+ const results = [];
13704
+ const lowercaseQuery = query.toLowerCase();
13705
+ for (let i = 0; i < chunks.length; i++) {
13706
+ const chunkText = chunks[i].pageContent;
13707
+ const idx = chunkText.toLowerCase().indexOf(lowercaseQuery);
13708
+ if (idx !== -1) {
13709
+ const start = Math.max(0, idx - 60);
13710
+ const end = Math.min(chunkText.length, idx + lowercaseQuery.length + 60);
13711
+ const snippet = `...${chunkText.slice(start, end).replace(/\n/g, " ")}...`;
13712
+ results.push({
13713
+ chunkId: i + 1,
13714
+ headings: chunks[i].metadata,
13715
+ snippet
13716
+ });
13717
+ }
13718
+ }
13719
+ return results.slice(0, 10);
13720
+ }
13721
+ }),
13722
+ submitExtraction: tool({
13723
+ description: "Submit the final extracted JSON object conforming to the schema definition. Call this ONLY after you have gathered all necessary information.",
13724
+ parameters: z.object({ data: z.any().describe("The extracted JSON object conforming to the target schema.") }),
13725
+ execute: async ({ data }) => {
13726
+ finalExtractedData = data;
13727
+ return {
13728
+ status: "success",
13729
+ message: "Data submitted successfully. The extraction is now complete."
13730
+ };
13731
+ }
13732
+ })
13733
+ };
13734
+ const outputSchema = schemaToExtractionOutputSchema(schema);
13735
+ const systemPrompt = `You are a precise data extraction agent. Your goal is to extract structured information from a document to populate the target JSON schema.
13736
+
13737
+ Target JSON Schema structure to populate:
13738
+ ${JSON.stringify(outputSchema, null, 2)}
13739
+
13740
+ You are equipped with tools to browse the document dynamically:
13741
+ 1. First, call listChunks to understand the document layout and what sections exist.
13742
+ 2. Based on the schema fields, call readChunk or searchChunks to locate and read relevant content.
13743
+ 3. You can make multiple tool calls. Do not guess. Check the text carefully.
13744
+ 4. Once you have located and read all the necessary information, call the submitExtraction tool with the fully extracted JSON object.
13745
+ 5. After calling submitExtraction, you should stop.
13746
+
13747
+ CRITICAL RULES:
13748
+ 1. Extract data strictly conforming to the types and properties of the Target JSON Schema.
13749
+ 2. If a field's value cannot be found in the document after thorough search, set it to null.
13750
+ 3. Do not invent any values.
13751
+ 4. Call submitExtraction exactly once with the final JSON result.`;
13752
+ const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
13753
+ const result = await generateText({
13754
+ model: provider.chatModel(selected.name),
13755
+ system: systemPrompt,
13756
+ prompt: "Please start by listing the chunks to understand the document structure, then gather the required facts and submit the final JSON extraction.",
13757
+ tools,
13758
+ maxSteps: 12,
13759
+ abortSignal: AbortSignal.timeout(timeoutMs),
13760
+ experimental_telemetry: { isEnabled: useTelemetry },
13761
+ onStepFinish({ text: text$2, toolCalls }) {
13762
+ if (onAgentStep) onAgentStep({
13763
+ thought: text$2,
13764
+ toolCalls
13765
+ });
13766
+ }
13767
+ });
13768
+ if (!finalExtractedData) {
13769
+ if (result.text) try {
13770
+ finalExtractedData = safeParseJSON(result.text);
13771
+ } catch {}
13772
+ }
13773
+ if (!finalExtractedData) return {
13774
+ success: false,
13775
+ error: "Agent finished without submitting structured data."
13776
+ };
13777
+ const validation = validateExtractedData(schema, finalExtractedData);
13778
+ if (!validation.success) {
13779
+ const correctionSystemPrompt = `You are a precise data correction assistant. Your task is to correct validation errors in a previously generated JSON object to make it comply with the JSON Schema.
13780
+
13781
+ JSON Schema Definition:
13782
+ ${JSON.stringify(outputSchema, null, 2)}
13783
+
13784
+ Validation Errors:
13785
+ ${validation.error}
13786
+
13787
+ Original Incorrect JSON:
13788
+ ${JSON.stringify(finalExtractedData, null, 2)}
13789
+
13790
+ Please output the corrected JSON object. Return ONLY the corrected JSON object, with no markdown tags or explanations.`;
13791
+ const correctedData = safeParseJSON((await generateText({
13792
+ model: provider.chatModel(selected.name),
13793
+ system: correctionSystemPrompt,
13794
+ prompt: "Please correct the JSON output now.",
13795
+ abortSignal: AbortSignal.timeout(timeoutMs),
13796
+ experimental_telemetry: { isEnabled: useTelemetry }
13797
+ })).text);
13798
+ const secondValidation = validateExtractedData(schema, correctedData);
13799
+ if (!secondValidation.success) return {
13800
+ success: false,
13801
+ error: `Agent output validation failed: ${secondValidation.error}`
13802
+ };
13803
+ finalExtractedData = correctedData;
13804
+ }
13805
+ const outputDir = path.resolve(aiexDir, config.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
13806
+ await fs.mkdir(outputDir, { recursive: true });
13807
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
13808
+ const outputFileName = `${schema.table.name}-${timestamp}.json`;
13809
+ const outputPath = path.join(outputDir, outputFileName);
13810
+ await writeFile(outputPath, finalExtractedData, {
13811
+ spaces: 2,
13812
+ EOL: "\n"
13813
+ });
13814
+ let totalPromptTokens = 0;
13815
+ let totalCompletionTokens = 0;
13816
+ if (result.usage) {
13817
+ totalPromptTokens = result.usage.inputTokens ?? 0;
13818
+ totalCompletionTokens = result.usage.outputTokens ?? 0;
13819
+ }
13820
+ return {
13821
+ success: true,
13822
+ outputPath,
13823
+ data: finalExtractedData,
13824
+ tokensUsed: {
13825
+ prompt: totalPromptTokens,
13826
+ completion: totalCompletionTokens,
13827
+ total: totalPromptTokens + totalCompletionTokens
13828
+ }
13829
+ };
13830
+ } catch (error) {
13831
+ return {
13832
+ success: false,
13833
+ error: getErrorMessage(error)
13834
+ };
13835
+ }
13836
+ }
13837
+
13515
13838
  //#endregion
13516
13839
  //#region src/core/extraction-audit.ts
13517
13840
  const AUDIT_ID_RE = /^[\w.-]+$/;
@@ -14396,6 +14719,7 @@ async function processOneFile(aiexDir, config, aiConfig, schemaName, filePath, m
14396
14719
  modelOverride,
14397
14720
  insert: options?.insert,
14398
14721
  force: options?.force,
14722
+ agent: options?.agent,
14399
14723
  quiet: false
14400
14724
  });
14401
14725
  if (result.success) {
@@ -14435,7 +14759,8 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14435
14759
  })}`);
14436
14760
  if (await processOneFile(aiexDir, config, aiConfig, schemaName, file, modelOverride, {
14437
14761
  insert: options?.insert,
14438
- force: options?.force
14762
+ force: options?.force,
14763
+ agent: options?.agent
14439
14764
  })) successCount++;
14440
14765
  else failCount++;
14441
14766
  }
@@ -14525,7 +14850,130 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14525
14850
  }
14526
14851
  const s = spinner();
14527
14852
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
14528
- const result = await extractStructuredData({
14853
+ const CHUNK_LIMIT = 4e4;
14854
+ let result;
14855
+ if (options?.agent || aiConfig.extraction?.mode === "react") {
14856
+ if (!options?.quiet) consola.info(t("command.extract.file.reactAgentMode"));
14857
+ const agentResult = await extractStructuredDataWithAgent({
14858
+ config: aiConfig,
14859
+ schema: schemaLoad.schema,
14860
+ text: text$1 ?? "",
14861
+ aiexDir,
14862
+ modelOverride,
14863
+ onAgentStep(step) {
14864
+ if (!options?.quiet) {
14865
+ if (step.thought) {
14866
+ const thoughtPreview = step.thought.length > 100 ? `${step.thought.slice(0, 100)}...` : step.thought;
14867
+ s.message(`${pc.cyan(t("command.extract.file.agentThought"))}: ${thoughtPreview.replace(/\n/g, " ")}`);
14868
+ }
14869
+ if (step.toolCalls && step.toolCalls.length > 0) for (const call of step.toolCalls) consola.info(`[Agent Action] Calling tool: ${pc.green(call.toolName)}`);
14870
+ }
14871
+ }
14872
+ });
14873
+ if (!agentResult.success) {
14874
+ if (!options?.quiet) {
14875
+ s.stop(t("command.extract.file.extractFail"));
14876
+ consola.error(agentResult.error);
14877
+ }
14878
+ return {
14879
+ success: false,
14880
+ error: agentResult.error
14881
+ };
14882
+ }
14883
+ result = agentResult;
14884
+ } else if (text$1 && text$1.length > CHUNK_LIMIT) {
14885
+ if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
14886
+ length: text$1.length,
14887
+ limit: CHUNK_LIMIT
14888
+ }));
14889
+ const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT);
14890
+ if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
14891
+ const chunkResults = [];
14892
+ const accumulatedTokens = {
14893
+ prompt: 0,
14894
+ completion: 0,
14895
+ total: 0
14896
+ };
14897
+ let success = true;
14898
+ let errorMsg = "";
14899
+ for (let i = 0; i < finalDocs.length; i++) {
14900
+ const doc = finalDocs[i];
14901
+ if (!options?.quiet) s.message(t("command.extract.file.extractingChunk", {
14902
+ current: i + 1,
14903
+ total: finalDocs.length
14904
+ }));
14905
+ const headings = [];
14906
+ if (doc.metadata) {
14907
+ if (doc.metadata.h1) headings.push(doc.metadata.h1);
14908
+ if (doc.metadata.h2) headings.push(doc.metadata.h2);
14909
+ if (doc.metadata.h3) headings.push(doc.metadata.h3);
14910
+ if (doc.metadata.h4) headings.push(doc.metadata.h4);
14911
+ }
14912
+ let chunkText = doc.pageContent;
14913
+ if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14914
+ const chunkResult = await extractStructuredData({
14915
+ config: aiConfig,
14916
+ schema: schemaLoad.schema,
14917
+ text: chunkText,
14918
+ aiexDir,
14919
+ modelOverride,
14920
+ onRetry(info) {
14921
+ if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14922
+ current: i + 1,
14923
+ total: finalDocs.length,
14924
+ code: info.statusCode,
14925
+ delay: info.delayMs / 1e3,
14926
+ attempt: info.attempt,
14927
+ max: info.maxRetries
14928
+ }));
14929
+ }
14930
+ });
14931
+ if (!chunkResult.success) {
14932
+ success = false;
14933
+ errorMsg = chunkResult.error || t("common.unknownError");
14934
+ if (!options?.quiet) {
14935
+ s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14936
+ consola.error(errorMsg);
14937
+ }
14938
+ break;
14939
+ }
14940
+ if (chunkResult.data) chunkResults.push(chunkResult.data);
14941
+ if (chunkResult.tokensUsed) {
14942
+ accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14943
+ accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14944
+ accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14945
+ }
14946
+ }
14947
+ if (!success) return {
14948
+ success: false,
14949
+ error: errorMsg
14950
+ };
14951
+ const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
14952
+ const validation = validateExtractedData(schemaLoad.schema, mergedData);
14953
+ if (!validation.success) {
14954
+ const valError = validation.error || "Merged data validation failed";
14955
+ if (!options?.quiet) {
14956
+ s.stop(t("command.extract.file.validationFail"));
14957
+ consola.error(valError);
14958
+ }
14959
+ return {
14960
+ success: false,
14961
+ error: valError
14962
+ };
14963
+ }
14964
+ const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
14965
+ await fs.mkdir(outputDir, { recursive: true });
14966
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
14967
+ const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
14968
+ const finalMergedOutputPath = path.join(outputDir, outputFileName);
14969
+ await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
14970
+ result = {
14971
+ success: true,
14972
+ data: mergedData,
14973
+ tokensUsed: accumulatedTokens,
14974
+ outputPath: finalMergedOutputPath
14975
+ };
14976
+ } else result = await extractStructuredData({
14529
14977
  config: aiConfig,
14530
14978
  schema: schemaLoad.schema,
14531
14979
  text: text$1 ?? "",
@@ -14611,7 +15059,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14611
15059
  };
14612
15060
  }
14613
15061
  async function runAuditedExtraction(options) {
14614
- const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
15062
+ const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false, agent = false } = options;
14615
15063
  let fileHash;
14616
15064
  let isPlainTextFile = false;
14617
15065
  if (source.type === "file") {
@@ -14679,7 +15127,8 @@ async function runAuditedExtraction(options) {
14679
15127
  } else text$1 = source.text;
14680
15128
  const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
14681
15129
  quiet,
14682
- insert
15130
+ insert,
15131
+ agent
14683
15132
  });
14684
15133
  if (r.success) {
14685
15134
  let notionPages;
@@ -15138,6 +15587,12 @@ const extractCommand = defineCommand({
15138
15587
  type: "boolean",
15139
15588
  description: t("command.extract.args.force"),
15140
15589
  default: false
15590
+ },
15591
+ agent: {
15592
+ type: "boolean",
15593
+ alias: "a",
15594
+ description: "Enable ReAct agent extraction mode",
15595
+ default: false
15141
15596
  }
15142
15597
  },
15143
15598
  async run({ args, rawArgs }) {
@@ -15165,7 +15620,8 @@ const extractCommand = defineCommand({
15165
15620
  }
15166
15621
  const result$1 = await runBatchExtraction(aiexDir, config, aiConfig, args.schema, args.dir, args.glob, modelOverride, {
15167
15622
  insert: !args.noInsert,
15168
- force: args.force
15623
+ force: args.force,
15624
+ agent: args.agent
15169
15625
  });
15170
15626
  if (!result$1.ok) {
15171
15627
  failCommand(result$1.error);
@@ -15196,7 +15652,8 @@ const extractCommand = defineCommand({
15196
15652
  modelOverride,
15197
15653
  insert: !args.noInsert,
15198
15654
  force: args.force,
15199
- quiet: false
15655
+ quiet: false,
15656
+ agent: args.agent
15200
15657
  });
15201
15658
  if (!result.success) {
15202
15659
  failCommand(result.error);
@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
74
74
  //#endregion
75
75
  //#region package.json
76
76
  var name = "aiex-cli";
77
- var version = "0.0.5-beta.2";
77
+ var version = "0.0.5-beta.3";
78
78
  var description = "JSON Schema → SQLite with AI-powered data extraction";
79
79
  var package_default = {
80
80
  name,
@@ -228,7 +228,10 @@ const PromptConfigSchema = z.object({
228
228
  systemTemplate: z.string().min(1),
229
229
  userTemplate: z.string().min(1)
230
230
  });
231
- const ExtractionConfigSchema = z.object({ outputDir: z.string().min(1) });
231
+ const ExtractionConfigSchema = z.object({
232
+ outputDir: z.string().min(1),
233
+ mode: z.enum(["pipeline", "react"]).default("pipeline").optional()
234
+ });
232
235
  const ImageOcrConfigSchema = z.object({
233
236
  ocrFallback: z.enum([
234
237
  "auto",
@@ -335,7 +338,10 @@ Extraction requirements:
335
338
  userTemplate: `Please extract data from the following text:
336
339
  {text}`
337
340
  };
338
- const DEFAULT_EXTRACTION_CONFIG = { outputDir: ".aiex/extracted" };
341
+ const DEFAULT_EXTRACTION_CONFIG = {
342
+ outputDir: ".aiex/extracted",
343
+ mode: "pipeline"
344
+ };
339
345
  const DEFAULT_IMAGE_OCR_CONFIG = {
340
346
  ocrFallback: "auto",
341
347
  ocrLanguages: "en-US, zh-Hans",
@@ -564,9 +570,17 @@ const en = {
564
570
  errorProcessing: "Error processing {{name}}: {{error}}",
565
571
  extractedFrom: "Extracting from {{file}}...",
566
572
  extracting: "Extracting data...",
573
+ reactAgentMode: "Starting ReAct Agent extraction...",
574
+ agentThought: "Agent Thought",
567
575
  extractFail: "Extraction failed",
568
576
  extractComplete: "Extraction complete",
569
577
  extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
578
+ chunking: "Input text length ({{length}} chars) exceeds limit ({{limit}} chars). Splitting into chunks...",
579
+ chunksCount: "Split into {{count}} chunk(s).",
580
+ extractingChunk: "Extracting chunk {{current}}/{{total}}...",
581
+ extractRetryChunk: "Chunk {{current}}/{{total}} API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
582
+ extractFailChunk: "Extraction failed for chunk {{current}}/{{total}}",
583
+ validationFail: "Merged data validation failed",
570
584
  resultSaved: "Result saved: {{path}}",
571
585
  tokenUsage: "Token usage: prompt={{prompt}}, completion={{completion}}, total={{total}}",
572
586
  insertingDb: "Inserting into database...",
@@ -956,7 +970,7 @@ async function initI18n(lng) {
956
970
  fallbackLng: "en",
957
971
  resources: {
958
972
  "en": { translation: en },
959
- "zh-CN": { translation: await import("./zh-CN-Qcn0DHFh.mjs").then((m) => m.zhCN) }
973
+ "zh-CN": { translation: await import("./zh-CN-CKxdpj8c.mjs").then((m) => m.zhCN) }
960
974
  },
961
975
  interpolation: { escapeValue: false },
962
976
  returnNull: false