aiex-cli 0.0.5-beta.3 → 0.0.5-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-CQPDBVTw.mjs";
1
+ import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-Cv7RArla.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -18,7 +18,7 @@ import pc from "picocolors";
18
18
  import { Buffer } from "node:buffer";
19
19
  import * as XLSX from "xlsx";
20
20
  import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
21
- import { APICallError, Output, generateText, jsonSchema, tool } from "ai";
21
+ import { APICallError, Output, generateText, jsonSchema } from "ai";
22
22
  import pRetry from "p-retry";
23
23
  import mime from "mime";
24
24
  import { jsonrepair } from "jsonrepair";
@@ -13565,12 +13565,13 @@ const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
13565
13565
  * Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
13566
13566
  * when a section exceeds the maxSize limit.
13567
13567
  */
13568
- function splitMarkdown(text$1, maxSize = 4e4) {
13568
+ function splitMarkdown(text$1, maxSize = 4e4, overlapSize = 0) {
13569
13569
  const lines = text$1.split("\n");
13570
13570
  const chunks = [];
13571
13571
  let currentHeadings = [];
13572
13572
  let currentChunkLines = [];
13573
13573
  let currentSize = 0;
13574
+ let hasNewLines = false;
13574
13575
  const getMetadata = (headings) => {
13575
13576
  return {
13576
13577
  h1: headings[0] || void 0,
@@ -13579,9 +13580,15 @@ function splitMarkdown(text$1, maxSize = 4e4) {
13579
13580
  h4: headings[3] || void 0
13580
13581
  };
13581
13582
  };
13582
- const flushChunk = () => {
13583
- if (currentChunkLines.length === 0) return;
13583
+ const flushChunk = (isHeadingChange = false) => {
13584
+ if (currentChunkLines.length === 0 || !hasNewLines) {
13585
+ currentChunkLines = [];
13586
+ currentSize = 0;
13587
+ hasNewLines = false;
13588
+ return;
13589
+ }
13584
13590
  const pageContent = currentChunkLines.join("\n");
13591
+ let lastChunkContent = "";
13585
13592
  if (pageContent.length > maxSize) {
13586
13593
  const paragraphs = pageContent.split("\n\n");
13587
13594
  let subLines = [];
@@ -13589,31 +13596,63 @@ function splitMarkdown(text$1, maxSize = 4e4) {
13589
13596
  for (const para of paragraphs) {
13590
13597
  const paraSize = para.length;
13591
13598
  if (subSize + paraSize > maxSize && subLines.length > 0) {
13599
+ const content = subLines.join("\n\n");
13592
13600
  chunks.push({
13593
- pageContent: subLines.join("\n\n"),
13601
+ pageContent: content,
13594
13602
  metadata: getMetadata(currentHeadings)
13595
13603
  });
13596
- subLines = [];
13597
- subSize = 0;
13604
+ const overlapParas = [];
13605
+ let currentOverlapSize = 0;
13606
+ for (let j = subLines.length - 1; j >= 0; j--) {
13607
+ const p = subLines[j];
13608
+ if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
13609
+ overlapParas.unshift(p);
13610
+ currentOverlapSize += p.length + 2;
13611
+ }
13612
+ subLines = [...overlapParas];
13613
+ subSize = currentOverlapSize;
13598
13614
  }
13599
13615
  subLines.push(para);
13600
13616
  subSize += paraSize + 2;
13601
13617
  }
13602
- if (subLines.length > 0) chunks.push({
13603
- pageContent: subLines.join("\n\n"),
13618
+ if (subLines.length > 0) {
13619
+ const content = subLines.join("\n\n");
13620
+ chunks.push({
13621
+ pageContent: content,
13622
+ metadata: getMetadata(currentHeadings)
13623
+ });
13624
+ lastChunkContent = content;
13625
+ }
13626
+ } else {
13627
+ chunks.push({
13628
+ pageContent,
13604
13629
  metadata: getMetadata(currentHeadings)
13605
13630
  });
13606
- } else chunks.push({
13607
- pageContent,
13608
- metadata: getMetadata(currentHeadings)
13609
- });
13610
- currentChunkLines = [];
13611
- currentSize = 0;
13631
+ lastChunkContent = pageContent;
13632
+ }
13633
+ if (!isHeadingChange && lastChunkContent && overlapSize > 0) {
13634
+ const paragraphs = lastChunkContent.split("\n\n");
13635
+ const overlapParas = [];
13636
+ let currentOverlapSize = 0;
13637
+ for (let j = paragraphs.length - 1; j >= 0; j--) {
13638
+ const p = paragraphs[j];
13639
+ if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
13640
+ overlapParas.unshift(p);
13641
+ currentOverlapSize += p.length + 2;
13642
+ }
13643
+ const overlapText = overlapParas.join("\n\n");
13644
+ currentChunkLines = overlapText.split("\n");
13645
+ currentSize = overlapText.length;
13646
+ } else {
13647
+ currentChunkLines = [];
13648
+ currentSize = 0;
13649
+ }
13650
+ hasNewLines = false;
13612
13651
  };
13613
13652
  for (const line of lines) {
13614
13653
  const headingMatch = line.match(HEADING_RE);
13615
13654
  if (headingMatch) {
13616
- flushChunk();
13655
+ flushChunk(true);
13617
13656
  const depth = headingMatch[1].length;
13618
13657
  const title = headingMatch[2].trim();
13619
13658
  currentHeadings = currentHeadings.slice(0, depth - 1);
@@ -13621,220 +13660,13 @@ function splitMarkdown(text$1, maxSize = 4e4) {
13621
13660
  }
13622
13661
  currentChunkLines.push(line);
13623
13662
  currentSize += line.length + 1;
13624
- if (currentSize > maxSize) flushChunk();
13663
+ hasNewLines = true;
13664
+ if (currentSize > maxSize) flushChunk(false);
13625
13665
  }
13626
- flushChunk();
13666
+ flushChunk(true);
13627
13667
  return chunks;
13628
13668
  }
13629
13669
 
13630
- //#endregion
13631
- //#region src/core/ai-extraction/react-agent.ts
13632
- async function extractStructuredDataWithAgent(input) {
13633
- const { config, schema, text: text$1, aiexDir, modelOverride, onAgentStep } = input;
13634
- if (!config.provider.apiKey) return {
13635
- success: false,
13636
- error: t("errors.ai.apiKeyMissing")
13637
- };
13638
- const chunks = splitMarkdown(text$1, 15e3);
13639
- const inputTokens = Math.ceil(text$1.length / 2);
13640
- const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
13641
- const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
13642
- let selected;
13643
- try {
13644
- selected = modelOverride ?? selectModel({
13645
- models: config.provider.models,
13646
- isImage: false,
13647
- inputTokens,
13648
- outputTokens
13649
- });
13650
- } catch (e) {
13651
- return {
13652
- success: false,
13653
- error: e.message
13654
- };
13655
- }
13656
- const useTelemetry = !!(config.langfuse?.publicKey && config.langfuse.secretKey);
13657
- try {
13658
- if (useTelemetry) initLangfuse(config);
13659
- const provider = createOpenAICompatible({
13660
- baseURL: config.provider.baseURL,
13661
- name: "openai-compatible",
13662
- apiKey: config.provider.apiKey,
13663
- supportsStructuredOutputs: false
13664
- });
13665
- let finalExtractedData = null;
13666
- const tools = {
13667
- listChunks: tool({
13668
- description: "Get a list of all text chunks in the document, showing their chunk index ID, character size, and markdown heading hierarchy (metadata). Use this as a Table of Contents to locate sections of interest.",
13669
- parameters: z.object({}),
13670
- execute: async () => {
13671
- return chunks.map((c, idx) => ({
13672
- id: idx + 1,
13673
- size: c.pageContent.length,
13674
- headings: c.metadata
13675
- }));
13676
- }
13677
- }),
13678
- readChunk: tool({
13679
- description: "Read the full text content of a specific chunk by its ID.",
13680
- parameters: z.object({ chunkId: z.number().int().describe("The ID (1-based index) of the chunk to read.") }),
13681
- execute: async ({ chunkId }) => {
13682
- const index = chunkId - 1;
13683
- if (index < 0 || index >= chunks.length) return { error: `Invalid chunkId: ${chunkId}. Valid IDs are 1 to ${chunks.length}.` };
13684
- const chunk = chunks[index];
13685
- const headings = [];
13686
- if (chunk.metadata) {
13687
- if (chunk.metadata.h1) headings.push(chunk.metadata.h1);
13688
- if (chunk.metadata.h2) headings.push(chunk.metadata.h2);
13689
- if (chunk.metadata.h3) headings.push(chunk.metadata.h3);
13690
- if (chunk.metadata.h4) headings.push(chunk.metadata.h4);
13691
- }
13692
- return {
13693
- chunkId,
13694
- headings: headings.join(" > "),
13695
- content: chunk.pageContent
13696
- };
13697
- }
13698
- }),
13699
- searchChunks: tool({
13700
- description: "Search all chunks in the document for specific keywords or search terms. Returns matching chunk IDs and small matching context snippets.",
13701
- parameters: z.object({ query: z.string().describe("The keyword or search phrase to search for.") }),
13702
- execute: async ({ query }) => {
13703
- const results = [];
13704
- const lowercaseQuery = query.toLowerCase();
13705
- for (let i = 0; i < chunks.length; i++) {
13706
- const chunkText = chunks[i].pageContent;
13707
- const idx = chunkText.toLowerCase().indexOf(lowercaseQuery);
13708
- if (idx !== -1) {
13709
- const start = Math.max(0, idx - 60);
13710
- const end = Math.min(chunkText.length, idx + lowercaseQuery.length + 60);
13711
- const snippet = `...${chunkText.slice(start, end).replace(/\n/g, " ")}...`;
13712
- results.push({
13713
- chunkId: i + 1,
13714
- headings: chunks[i].metadata,
13715
- snippet
13716
- });
13717
- }
13718
- }
13719
- return results.slice(0, 10);
13720
- }
13721
- }),
13722
- submitExtraction: tool({
13723
- description: "Submit the final extracted JSON object conforming to the schema definition. Call this ONLY after you have gathered all necessary information.",
13724
- parameters: z.object({ data: z.any().describe("The extracted JSON object conforming to the target schema.") }),
13725
- execute: async ({ data }) => {
13726
- finalExtractedData = data;
13727
- return {
13728
- status: "success",
13729
- message: "Data submitted successfully. The extraction is now complete."
13730
- };
13731
- }
13732
- })
13733
- };
13734
- const outputSchema = schemaToExtractionOutputSchema(schema);
13735
- const systemPrompt = `You are a precise data extraction agent. Your goal is to extract structured information from a document to populate the target JSON schema.
13736
-
13737
- Target JSON Schema structure to populate:
13738
- ${JSON.stringify(outputSchema, null, 2)}
13739
-
13740
- You are equipped with tools to browse the document dynamically:
13741
- 1. First, call listChunks to understand the document layout and what sections exist.
13742
- 2. Based on the schema fields, call readChunk or searchChunks to locate and read relevant content.
13743
- 3. You can make multiple tool calls. Do not guess. Check the text carefully.
13744
- 4. Once you have located and read all the necessary information, call the submitExtraction tool with the fully extracted JSON object.
13745
- 5. After calling submitExtraction, you should stop.
13746
-
13747
- CRITICAL RULES:
13748
- 1. Extract data strictly conforming to the types and properties of the Target JSON Schema.
13749
- 2. If a field's value cannot be found in the document after thorough search, set it to null.
13750
- 3. Do not invent any values.
13751
- 4. Call submitExtraction exactly once with the final JSON result.`;
13752
- const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
13753
- const result = await generateText({
13754
- model: provider.chatModel(selected.name),
13755
- system: systemPrompt,
13756
- prompt: "Please start by listing the chunks to understand the document structure, then gather the required facts and submit the final JSON extraction.",
13757
- tools,
13758
- maxSteps: 12,
13759
- abortSignal: AbortSignal.timeout(timeoutMs),
13760
- experimental_telemetry: { isEnabled: useTelemetry },
13761
- onStepFinish({ text: text$2, toolCalls }) {
13762
- if (onAgentStep) onAgentStep({
13763
- thought: text$2,
13764
- toolCalls
13765
- });
13766
- }
13767
- });
13768
- if (!finalExtractedData) {
13769
- if (result.text) try {
13770
- finalExtractedData = safeParseJSON(result.text);
13771
- } catch {}
13772
- }
13773
- if (!finalExtractedData) return {
13774
- success: false,
13775
- error: "Agent finished without submitting structured data."
13776
- };
13777
- const validation = validateExtractedData(schema, finalExtractedData);
13778
- if (!validation.success) {
13779
- const correctionSystemPrompt = `You are a precise data correction assistant. Your task is to correct validation errors in a previously generated JSON object to make it comply with the JSON Schema.
13780
-
13781
- JSON Schema Definition:
13782
- ${JSON.stringify(outputSchema, null, 2)}
13783
-
13784
- Validation Errors:
13785
- ${validation.error}
13786
-
13787
- Original Incorrect JSON:
13788
- ${JSON.stringify(finalExtractedData, null, 2)}
13789
-
13790
- Please output the corrected JSON object. Return ONLY the corrected JSON object, with no markdown tags or explanations.`;
13791
- const correctedData = safeParseJSON((await generateText({
13792
- model: provider.chatModel(selected.name),
13793
- system: correctionSystemPrompt,
13794
- prompt: "Please correct the JSON output now.",
13795
- abortSignal: AbortSignal.timeout(timeoutMs),
13796
- experimental_telemetry: { isEnabled: useTelemetry }
13797
- })).text);
13798
- const secondValidation = validateExtractedData(schema, correctedData);
13799
- if (!secondValidation.success) return {
13800
- success: false,
13801
- error: `Agent output validation failed: ${secondValidation.error}`
13802
- };
13803
- finalExtractedData = correctedData;
13804
- }
13805
- const outputDir = path.resolve(aiexDir, config.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
13806
- await fs.mkdir(outputDir, { recursive: true });
13807
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
13808
- const outputFileName = `${schema.table.name}-${timestamp}.json`;
13809
- const outputPath = path.join(outputDir, outputFileName);
13810
- await writeFile(outputPath, finalExtractedData, {
13811
- spaces: 2,
13812
- EOL: "\n"
13813
- });
13814
- let totalPromptTokens = 0;
13815
- let totalCompletionTokens = 0;
13816
- if (result.usage) {
13817
- totalPromptTokens = result.usage.inputTokens ?? 0;
13818
- totalCompletionTokens = result.usage.outputTokens ?? 0;
13819
- }
13820
- return {
13821
- success: true,
13822
- outputPath,
13823
- data: finalExtractedData,
13824
- tokensUsed: {
13825
- prompt: totalPromptTokens,
13826
- completion: totalCompletionTokens,
13827
- total: totalPromptTokens + totalCompletionTokens
13828
- }
13829
- };
13830
- } catch (error) {
13831
- return {
13832
- success: false,
13833
- error: getErrorMessage(error)
13834
- };
13835
- }
13836
- }
13837
-
13838
13670
  //#endregion
13839
13671
  //#region src/core/extraction-audit.ts
13840
13672
  const AUDIT_ID_RE = /^[\w.-]+$/;
@@ -14719,7 +14551,6 @@ async function processOneFile(aiexDir, config, aiConfig, schemaName, filePath, m
14719
14551
  modelOverride,
14720
14552
  insert: options?.insert,
14721
14553
  force: options?.force,
14722
- agent: options?.agent,
14723
14554
  quiet: false
14724
14555
  });
14725
14556
  if (result.success) {
@@ -14759,8 +14590,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14759
14590
  })}`);
14760
14591
  if (await processOneFile(aiexDir, config, aiConfig, schemaName, file, modelOverride, {
14761
14592
  insert: options?.insert,
14762
- force: options?.force,
14763
- agent: options?.agent
14593
+ force: options?.force
14764
14594
  })) successCount++;
14765
14595
  else failCount++;
14766
14596
  }
@@ -14779,6 +14609,42 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14779
14609
  //#endregion
14780
14610
  //#region src/core/extract-runner.ts
14781
14611
  const JSON_EXT_RE$1 = /\.json$/;
14612
+ async function limitConcurrency(concurrency, items, fn) {
14613
+ const results = Array.from({ length: items.length });
14614
+ let nextIndex = 0;
14615
+ async function worker() {
14616
+ while (nextIndex < items.length) {
14617
+ const currentIndex = nextIndex++;
14618
+ results[currentIndex] = await fn(items[currentIndex], currentIndex);
14619
+ }
14620
+ }
14621
+ const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
14622
+ await Promise.all(workers);
14623
+ return results;
14624
+ }
14625
+ function getSchemaKeywords(schema) {
14626
+ const keywords = /* @__PURE__ */ new Set();
14627
+ function walk(properties) {
14628
+ if (!properties) return;
14629
+ for (const [name$1, prop] of Object.entries(properties)) {
14630
+ keywords.add(name$1.toLowerCase());
14631
+ const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
14632
+ for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
14633
+ if (prop && typeof prop === "object") {
14634
+ const p = prop;
14635
+ if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
14636
+ if (typeof p.description === "string") {
14637
+ const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
14638
+ for (const d of descParts) if (d.length > 2) keywords.add(d);
14639
+ }
14640
+ if (p.type === "object") walk(p.properties);
14641
+ if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
14642
+ }
14643
+ }
14644
+ }
14645
+ walk(schema.properties);
14646
+ return Array.from(keywords);
14647
+ }
14782
14648
  async function ensureDatabaseReady(dbPath, schema) {
14783
14649
  try {
14784
14650
  await fs.access(dbPath);
@@ -14852,42 +14718,48 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14852
14718
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
14853
14719
  const CHUNK_LIMIT = 4e4;
14854
14720
  let result;
14855
- if (options?.agent || aiConfig.extraction?.mode === "react") {
14856
- if (!options?.quiet) consola.info(t("command.extract.file.reactAgentMode"));
14857
- const agentResult = await extractStructuredDataWithAgent({
14858
- config: aiConfig,
14859
- schema: schemaLoad.schema,
14860
- text: text$1 ?? "",
14861
- aiexDir,
14862
- modelOverride,
14863
- onAgentStep(step) {
14864
- if (!options?.quiet) {
14865
- if (step.thought) {
14866
- const thoughtPreview = step.thought.length > 100 ? `${step.thought.slice(0, 100)}...` : step.thought;
14867
- s.message(`${pc.cyan(t("command.extract.file.agentThought"))}: ${thoughtPreview.replace(/\n/g, " ")}`);
14868
- }
14869
- if (step.toolCalls && step.toolCalls.length > 0) for (const call of step.toolCalls) consola.info(`[Agent Action] Calling tool: ${pc.green(call.toolName)}`);
14870
- }
14871
- }
14872
- });
14873
- if (!agentResult.success) {
14874
- if (!options?.quiet) {
14875
- s.stop(t("command.extract.file.extractFail"));
14876
- consola.error(agentResult.error);
14877
- }
14878
- return {
14879
- success: false,
14880
- error: agentResult.error
14881
- };
14882
- }
14883
- result = agentResult;
14884
- } else if (text$1 && text$1.length > CHUNK_LIMIT) {
14721
+ if (text$1 && text$1.length > CHUNK_LIMIT) {
14885
14722
  if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
14886
14723
  length: text$1.length,
14887
14724
  limit: CHUNK_LIMIT
14888
14725
  }));
14889
- const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT);
14726
+ const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT, aiConfig.extraction?.overlapSize ?? 2e3);
14890
14727
  if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
14728
+ let processedDocs = finalDocs;
14729
+ if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
14730
+ const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
14731
+ const keywords = getSchemaKeywords(schemaLoad.schema);
14732
+ const scoredChunks = finalDocs.map((doc, idx) => {
14733
+ if (idx === 0) return {
14734
+ index: idx,
14735
+ score: Number.POSITIVE_INFINITY
14736
+ };
14737
+ let score = 0;
14738
+ const docTextLower = doc.pageContent.toLowerCase();
14739
+ for (const kw of keywords) {
14740
+ let pos = docTextLower.indexOf(kw);
14741
+ while (pos !== -1) {
14742
+ score++;
14743
+ pos = docTextLower.indexOf(kw, pos + kw.length);
14744
+ }
14745
+ }
14746
+ return {
14747
+ index: idx,
14748
+ score
14749
+ };
14750
+ }).slice(1).sort((a, b) => b.score - a.score);
14751
+ const selectedIndices = new Set([0]);
14752
+ let keptCount = 0;
14753
+ for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
14754
+ selectedIndices.add(sc.index);
14755
+ keptCount++;
14756
+ }
14757
+ processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
14758
+ if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
14759
+ original: finalDocs.length,
14760
+ filtered: processedDocs.length
14761
+ }));
14762
+ }
14891
14763
  const chunkResults = [];
14892
14764
  const accumulatedTokens = {
14893
14765
  prompt: 0,
@@ -14896,53 +14768,68 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14896
14768
  };
14897
14769
  let success = true;
14898
14770
  let errorMsg = "";
14899
- for (let i = 0; i < finalDocs.length; i++) {
14900
- const doc = finalDocs[i];
14901
- if (!options?.quiet) s.message(t("command.extract.file.extractingChunk", {
14902
- current: i + 1,
14903
- total: finalDocs.length
14904
- }));
14905
- const headings = [];
14906
- if (doc.metadata) {
14907
- if (doc.metadata.h1) headings.push(doc.metadata.h1);
14908
- if (doc.metadata.h2) headings.push(doc.metadata.h2);
14909
- if (doc.metadata.h3) headings.push(doc.metadata.h3);
14910
- if (doc.metadata.h4) headings.push(doc.metadata.h4);
14911
- }
14912
- let chunkText = doc.pageContent;
14913
- if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14914
- const chunkResult = await extractStructuredData({
14915
- config: aiConfig,
14916
- schema: schemaLoad.schema,
14917
- text: chunkText,
14918
- aiexDir,
14919
- modelOverride,
14920
- onRetry(info) {
14921
- if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14922
- current: i + 1,
14923
- total: finalDocs.length,
14924
- code: info.statusCode,
14925
- delay: info.delayMs / 1e3,
14926
- attempt: info.attempt,
14927
- max: info.maxRetries
14928
- }));
14771
+ const extractionTasks = processedDocs.map((doc, i) => {
14772
+ return async () => {
14773
+ if (!success) return;
14774
+ const headings = [];
14775
+ if (doc.metadata) {
14776
+ if (doc.metadata.h1) headings.push(doc.metadata.h1);
14777
+ if (doc.metadata.h2) headings.push(doc.metadata.h2);
14778
+ if (doc.metadata.h3) headings.push(doc.metadata.h3);
14779
+ if (doc.metadata.h4) headings.push(doc.metadata.h4);
14929
14780
  }
14930
- });
14931
- if (!chunkResult.success) {
14932
- success = false;
14933
- errorMsg = chunkResult.error || t("common.unknownError");
14934
- if (!options?.quiet) {
14935
- s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14936
- consola.error(errorMsg);
14781
+ let chunkText = doc.pageContent;
14782
+ if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14783
+ const chunkResult = await extractStructuredData({
14784
+ config: aiConfig,
14785
+ schema: schemaLoad.schema,
14786
+ text: chunkText,
14787
+ aiexDir,
14788
+ modelOverride,
14789
+ onRetry(info) {
14790
+ if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14791
+ current: i + 1,
14792
+ total: processedDocs.length,
14793
+ code: info.statusCode,
14794
+ delay: info.delayMs / 1e3,
14795
+ attempt: info.attempt,
14796
+ max: info.maxRetries
14797
+ }));
14798
+ }
14799
+ });
14800
+ if (!chunkResult.success) {
14801
+ success = false;
14802
+ errorMsg = chunkResult.error || t("common.unknownError");
14803
+ if (!options?.quiet) {
14804
+ s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14805
+ consola.error(errorMsg);
14806
+ }
14807
+ return;
14937
14808
  }
14938
- break;
14939
- }
14940
- if (chunkResult.data) chunkResults.push(chunkResult.data);
14941
- if (chunkResult.tokensUsed) {
14942
- accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14943
- accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14944
- accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14945
- }
14809
+ if (chunkResult.data) chunkResults.push(chunkResult.data);
14810
+ if (chunkResult.tokensUsed) {
14811
+ accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14812
+ accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14813
+ accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14814
+ }
14815
+ };
14816
+ });
14817
+ const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
14818
+ if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
14819
+ current: 1,
14820
+ total: processedDocs.length
14821
+ }));
14822
+ try {
14823
+ await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
14824
+ if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
14825
+ current: idx + 1,
14826
+ total: processedDocs.length
14827
+ }));
14828
+ await task();
14829
+ });
14830
+ } catch (e) {
14831
+ success = false;
14832
+ errorMsg = e instanceof Error ? e.message : String(e);
14946
14833
  }
14947
14834
  if (!success) return {
14948
14835
  success: false,
@@ -15001,6 +14888,11 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15001
14888
  }
15002
14889
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
15003
14890
  if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
14891
+ if (result.evidenceSummary && !options?.quiet) {
14892
+ const summary = result.evidenceSummary;
14893
+ const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
14894
+ consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
14895
+ }
15004
14896
  if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
15005
14897
  prompt: result.tokensUsed.prompt,
15006
14898
  completion: result.tokensUsed.completion,
@@ -15029,6 +14921,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15029
14921
  outputPath: result.outputPath,
15030
14922
  data: result.data,
15031
14923
  tablesInserted: insertResult.tablesInserted,
14924
+ evidenceSummary: result.evidenceSummary,
15032
14925
  tokensUsed: result.tokensUsed
15033
14926
  };
15034
14927
  } else {
@@ -15055,11 +14948,12 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15055
14948
  success: true,
15056
14949
  outputPath: result.outputPath,
15057
14950
  data: result.data,
14951
+ evidenceSummary: result.evidenceSummary,
15058
14952
  tokensUsed: result.tokensUsed
15059
14953
  };
15060
14954
  }
15061
14955
  async function runAuditedExtraction(options) {
15062
- const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false, agent = false } = options;
14956
+ const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
15063
14957
  let fileHash;
15064
14958
  let isPlainTextFile = false;
15065
14959
  if (source.type === "file") {
@@ -15127,8 +15021,7 @@ async function runAuditedExtraction(options) {
15127
15021
  } else text$1 = source.text;
15128
15022
  const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
15129
15023
  quiet,
15130
- insert,
15131
- agent
15024
+ insert
15132
15025
  });
15133
15026
  if (r.success) {
15134
15027
  let notionPages;
@@ -15168,6 +15061,7 @@ async function runAuditedExtraction(options) {
15168
15061
  outputName: updated.outputName,
15169
15062
  tablesInserted: updated.tablesInserted,
15170
15063
  notionPages: updated.notionPages,
15064
+ evidenceSummary: r.evidenceSummary,
15171
15065
  tokensUsed: updated.tokensUsed,
15172
15066
  auditId: updated.id,
15173
15067
  fileHash
@@ -15587,12 +15481,6 @@ const extractCommand = defineCommand({
15587
15481
  type: "boolean",
15588
15482
  description: t("command.extract.args.force"),
15589
15483
  default: false
15590
- },
15591
- agent: {
15592
- type: "boolean",
15593
- alias: "a",
15594
- description: "Enable ReAct agent extraction mode",
15595
- default: false
15596
15484
  }
15597
15485
  },
15598
15486
  async run({ args, rawArgs }) {
@@ -15620,8 +15508,7 @@ const extractCommand = defineCommand({
15620
15508
  }
15621
15509
  const result$1 = await runBatchExtraction(aiexDir, config, aiConfig, args.schema, args.dir, args.glob, modelOverride, {
15622
15510
  insert: !args.noInsert,
15623
- force: args.force,
15624
- agent: args.agent
15511
+ force: args.force
15625
15512
  });
15626
15513
  if (!result$1.ok) {
15627
15514
  failCommand(result$1.error);
@@ -15652,8 +15539,7 @@ const extractCommand = defineCommand({
15652
15539
  modelOverride,
15653
15540
  insert: !args.noInsert,
15654
15541
  force: args.force,
15655
- quiet: false,
15656
- agent: args.agent
15542
+ quiet: false
15657
15543
  });
15658
15544
  if (!result.success) {
15659
15545
  failCommand(result.error);
@@ -16303,6 +16189,7 @@ function aiRoutes(config) {
16303
16189
  //#endregion
16304
16190
  //#region src/core/data-service.ts
16305
16191
  const FILE_REGEX = /\.json$/;
16192
+ const EVIDENCE_FILE_SUFFIX = ".evidence.json";
16306
16193
  const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
16307
16194
  const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
16308
16195
  const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
@@ -16318,6 +16205,24 @@ function getAuditNotionStatus(record) {
16318
16205
  if (record.status === "failed") return "failed";
16319
16206
  return "not_synced";
16320
16207
  }
16208
+ async function readEvidenceSummary(extractedDir, outputName) {
16209
+ const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
16210
+ try {
16211
+ const coverage = (await readFile(evidencePath))?.coverage;
16212
+ if (!coverage || typeof coverage !== "object") return void 0;
16213
+ return {
16214
+ path: evidencePath,
16215
+ fieldCount: Number(coverage.fieldCount) || 0,
16216
+ evidenceCount: Number(coverage.evidenceCount) || 0,
16217
+ foundCount: Number(coverage.foundCount) || 0,
16218
+ missingCount: Number(coverage.missingCount) || 0,
16219
+ inferredCount: Number(coverage.inferredCount) || 0,
16220
+ issueCount: Number(coverage.issueCount) || 0
16221
+ };
16222
+ } catch {
16223
+ return;
16224
+ }
16225
+ }
16321
16226
  async function getRowExtractionActions(aiexDir, tableName) {
16322
16227
  const actions = /* @__PURE__ */ new Map();
16323
16228
  const auditRecords = await listExtractionAuditRecords(aiexDir);
@@ -16345,7 +16250,7 @@ async function listExtractions(config) {
16345
16250
  const aiexDir = path.dirname(config.schemaPath);
16346
16251
  const extractedDir = path.join(aiexDir, "extracted");
16347
16252
  await fs.mkdir(extractedDir, { recursive: true });
16348
- const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
16253
+ const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
16349
16254
  const auditRecords = await listExtractionAuditRecords(aiexDir);
16350
16255
  const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
16351
16256
  const records = [];
@@ -16364,6 +16269,7 @@ async function listExtractions(config) {
16364
16269
  timestamp,
16365
16270
  fileSize: stat.size,
16366
16271
  modifiedAt: stat.mtime.toISOString(),
16272
+ evidenceSummary: await readEvidenceSummary(extractedDir, file),
16367
16273
  notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
16368
16274
  notionPages,
16369
16275
  notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
@@ -16543,6 +16449,7 @@ async function retryNotionSync(config, fileName) {
16543
16449
 
16544
16450
  //#endregion
16545
16451
  //#region src/server/routes/data.ts
16452
+ const JSON_FILE_SUFFIX_RE = /\.json$/;
16546
16453
  const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
16547
16454
  const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
16548
16455
  const tableQuerySchema = z.object({
@@ -16595,10 +16502,22 @@ function dataRoutes(config) {
16595
16502
  const filePath = path.join(extractedDir, name$1);
16596
16503
  try {
16597
16504
  const content = await fs.readFile(filePath, "utf-8");
16505
+ const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
16506
+ let evidenceSummary;
16507
+ try {
16508
+ const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
16509
+ evidenceSummary = evidence?.coverage ? {
16510
+ ...evidence.coverage,
16511
+ path: evidencePath
16512
+ } : void 0;
16513
+ } catch {
16514
+ evidenceSummary = void 0;
16515
+ }
16598
16516
  return c.json({
16599
16517
  success: true,
16600
16518
  content,
16601
- name: name$1
16519
+ name: name$1,
16520
+ evidenceSummary
16602
16521
  });
16603
16522
  } catch {
16604
16523
  return c.json({ error: t("server.extractionNotFound") }, 404);
@@ -16742,6 +16661,7 @@ function extractRoutes(config) {
16742
16661
  outputName: result.outputName,
16743
16662
  tablesInserted: result.tablesInserted,
16744
16663
  notionPages: result.notionPages,
16664
+ evidenceSummary: result.evidenceSummary,
16745
16665
  tokensUsed: result.tokensUsed,
16746
16666
  auditId: result.auditId
16747
16667
  }, 200);
@@ -16809,6 +16729,7 @@ function extractRoutes(config) {
16809
16729
  outputName: result.outputName,
16810
16730
  tablesInserted: result.tablesInserted,
16811
16731
  notionPages: result.notionPages,
16732
+ evidenceSummary: result.evidenceSummary,
16812
16733
  tokensUsed: result.tokensUsed,
16813
16734
  auditId: result.auditId
16814
16735
  }, 200);