aiex-cli 0.0.5-beta.3 → 0.0.5-beta.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-CQPDBVTw.mjs";
1
+ import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-NTNBFeBw.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -17,13 +17,15 @@ import Database from "better-sqlite3";
17
17
  import pc from "picocolors";
18
18
  import { Buffer } from "node:buffer";
19
19
  import * as XLSX from "xlsx";
20
+ import { getEncoding } from "js-tiktoken";
20
21
  import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
21
- import { APICallError, Output, generateText, jsonSchema, tool } from "ai";
22
+ import { APICallError, Output, generateText, jsonSchema } from "ai";
22
23
  import pRetry from "p-retry";
23
24
  import mime from "mime";
24
25
  import { jsonrepair } from "jsonrepair";
25
26
  import { LangfuseSpanProcessor } from "@langfuse/otel";
26
27
  import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
28
+ import { marked } from "marked";
27
29
  import crypto from "node:crypto";
28
30
  import { Client, extractNotionId } from "@notionhq/client";
29
31
  import { execa } from "execa";
@@ -13559,279 +13561,170 @@ function mergeExtractionResults(schema, results) {
13559
13561
 
13560
13562
  //#endregion
13561
13563
  //#region src/core/ai-extraction/text-splitter.ts
13562
- const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
13564
+ const encoding$1 = getEncoding("cl100k_base");
13565
/**
 * Returns how many tokens the module-level `encoding$1` encoder produces for the given text.
 *
 * @param {string} text$1 - Text to measure.
 * @returns {number} Length of the encoded token sequence.
 */
function countTokens(text$1) {
	const { length: tokenCount } = encoding$1.encode(text$1);
	return tokenCount;
}
13568
/**
 * Builds the Markdown blockquote line that tells a reader which headings a chunk belongs to.
 *
 * @param {Array<string | undefined>} headings - Heading titles by depth; falsy entries are skipped.
 * @returns {string} The context line followed by a blank line, or "" when no heading is set.
 */
function formatHeadingContext(headings) {
	const trail = [];
	for (const heading of headings) {
		if (heading) trail.push(heading);
	}
	return trail.length > 0 ? `> **[Context]** Belong to: ${trail.join(" > ")}\n\n` : "";
}
13573
/**
 * Maps the first four heading levels onto an `{ h1, h2, h3, h4 }` metadata record.
 *
 * @param {Array<string | undefined>} headings - Heading titles indexed by depth - 1.
 * @returns {{h1: (string|undefined), h2: (string|undefined), h3: (string|undefined), h4: (string|undefined)}}
 *   One key per level; a missing or falsy title becomes `undefined`.
 */
function getMetadata(headings) {
	const levelKeys = ["h1", "h2", "h3", "h4"];
	return Object.fromEntries(levelKeys.map((key, depth) => [key, headings[depth] || undefined]));
}
13563
13581
  /**
13564
- * Splits a Markdown document into chunks based on header hierarchy.
13565
- * Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
13566
- * when a section exceeds the maxSize limit.
13582
+ * Splits text recursively using a list of separators.
13583
+ * Preserves the separators when re-joining.
13567
13584
  */
13568
- function splitMarkdown(text$1, maxSize = 4e4) {
13569
- const lines = text$1.split("\n");
13585
/**
 * Splits text into pieces of at most `maxTokens` tokens (as measured by `countTokens`),
 * trying each separator in order and falling back to a character-by-character split
 * once the separator list is exhausted.
 *
 * Separators are re-attached to the end of the part they followed, so joining the
 * returned pieces reproduces the input text.
 *
 * @param {string} text$1 - Text to split.
 * @param {number} maxTokens - Token budget for a single piece.
 * @param {string[]} [separators] - Separators to try, coarsest first.
 * @returns {string[]} The pieces, in document order. A piece can still exceed the budget
 *   slightly when a separator is re-attached to a part that only just fit, or when a
 *   single character alone is over budget.
 */
function splitTextRecursively(text$1, maxTokens, separators = [
	"\n\n",
	"\n",
	"。",
	". ",
	" "
]) {
	if (countTokens(text$1) <= maxTokens) return [text$1];
	if (separators.length === 0) {
		const chunks = [];
		let current = "";
		for (const char of text$1) {
			// Only close a chunk that has content: when a single character alone is over
			// budget there is nothing to flush yet, and pushing "" would emit an empty chunk.
			if (current !== "" && countTokens(current + char) > maxTokens) {
				chunks.push(current);
				current = char;
			} else current += char;
		}
		if (current) chunks.push(current);
		return chunks;
	}
	const separator = separators[0];
	const nextSeparators = separators.slice(1);
	const parts = text$1.split(separator);
	const result = [];
	let currentChunk = [];
	let currentChunkTokens = 0;
	for (let i = 0; i < parts.length; i++) {
		const part = parts[i];
		const isLastPart = i === parts.length - 1;
		// Every part except the last one was followed by the separator in the input.
		const itemText = part + (isLastPart ? "" : separator);
		const partTokens = countTokens(itemText);
		if (partTokens > maxTokens) {
			// The part cannot share a chunk with anything: close the running chunk first,
			// then break the part down with the finer separators.
			if (currentChunk.length > 0) {
				result.push(currentChunk.join(""));
				currentChunk = [];
				currentChunkTokens = 0;
			}
			const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
			for (let j = 0; j < subParts.length; j++) {
				const closesPart = j === subParts.length - 1;
				result.push(subParts[j] + (closesPart && !isLastPart ? separator : ""));
			}
		} else if (currentChunkTokens + partTokens > maxTokens) {
			// Per-part token counts are summed, which approximates the count of the joined text.
			result.push(currentChunk.join(""));
			currentChunk = [itemText];
			currentChunkTokens = partTokens;
		} else {
			currentChunk.push(itemText);
			currentChunkTokens += partTokens;
		}
	}
	if (currentChunk.length > 0) result.push(currentChunk.join(""));
	return result;
}
13636
/**
 * Splits a Markdown document into chunks using the block tokens produced by `marked.lexer`.
 *
 * - A heading always starts a new chunk and updates the heading trail used for metadata.
 * - Tables and code blocks are never split; one that is over budget becomes its own chunk.
 * - A list larger than `maxTokens` is processed item by item.
 * - Any other over-budget block is broken down with `splitTextRecursively`.
 * - When a chunk is closed because the next block does not fit, trailing blocks worth at
 *   most `overlapTokens` tokens are repeated at the start of the next chunk.
 *
 * The budget for block text is `maxTokens` minus the tokens of the heading context line
 * (`formatHeadingContext`) and a small safety buffer.
 *
 * @param {string} text$1 - Markdown source.
 * @param {number} [maxTokens=8000] - Token budget per chunk.
 * @param {number} [overlapTokens=1000] - Maximum tokens repeated between consecutive chunks; <= 0 disables overlap.
 * @returns {Array<{pageContent: string, metadata: object}>} Chunks in document order;
 *   `metadata` is the `getMetadata` record of the chunk's heading trail.
 */
function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
	const chunks = [];
	let currentHeadings = [];
	// Blocks queued for the next chunk. Blocks carried over as overlap sit at the front.
	let currentChunkList = [];
	let accumulatedTokens = 0;
	// Queued blocks that have not been part of any emitted chunk yet.
	let freshCount = 0;
	const resetQueue = () => {
		currentChunkList = [];
		accumulatedTokens = 0;
		freshCount = 0;
	};
	const flushCurrentChunk = (isHeadingChange = false) => {
		if (freshCount > 0) chunks.push({
			pageContent: currentChunkList.map((item) => item.text).join(""),
			metadata: getMetadata(currentChunkList[0].headings)
		});
		// Only overlap is queued: emitting it would duplicate the previous chunk's tail.
		// Keep it for the next block unless the section is ending.
		else if (!isHeadingChange) return;
		if (isHeadingChange || overlapTokens <= 0) {
			resetQueue();
			return;
		}
		const overlapItems = [];
		let currentOverlapTokens = 0;
		for (let i = currentChunkList.length - 1; i >= 0; i--) {
			const item = currentChunkList[i];
			const itemTokens = countTokens(item.text);
			// Never carry a block that does not fit the overlap budget; carrying it anyway
			// re-emits oversized blocks in every following chunk.
			if (currentOverlapTokens + itemTokens > overlapTokens) break;
			overlapItems.unshift(item);
			currentOverlapTokens += itemTokens;
		}
		currentChunkList = overlapItems;
		accumulatedTokens = currentOverlapTokens;
		freshCount = 0;
	};
	// Queues one block, closing the running chunk BEFORE the block would push it over `limit`.
	const queueBlock = (blockText, headings, blockTokens, limit) => {
		if (currentChunkList.length > 0 && accumulatedTokens + blockTokens > limit) {
			flushCurrentChunk(false);
			// Overlap is optional context: drop it when it does not fit next to this block.
			if (accumulatedTokens + blockTokens > limit) resetQueue();
		}
		currentChunkList.push({
			text: blockText,
			headings: [...headings]
		});
		accumulatedTokens += blockTokens;
		freshCount++;
	};
	const processTextBlock = (blockText, headings, isAtomic = false) => {
		const blockTokens = countTokens(blockText);
		const contextTokens = countTokens(formatHeadingContext(headings));
		const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
		const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
		if (blockTokens <= budgetLimit) {
			queueBlock(blockText, headings, blockTokens, maxTokens - contextTokens);
			return;
		}
		flushCurrentChunk(false);
		if (isAtomic) {
			// An over-budget table/code block is emitted alone and is never carried as overlap.
			resetQueue();
			queueBlock(blockText, headings, blockTokens, Number.POSITIVE_INFINITY);
			flushCurrentChunk(true);
			return;
		}
		for (const sub of splitTextRecursively(blockText, budgetLimit)) queueBlock(sub, headings, countTokens(sub), budgetLimit);
	};
	for (const token of marked.lexer(text$1)) {
		if (token.type === "space") {
			// Blank lines are attached to the preceding block so that joined chunks keep their spacing.
			if (currentChunkList.length > 0) {
				currentChunkList[currentChunkList.length - 1].text += token.raw;
				accumulatedTokens += countTokens(token.raw);
			}
			continue;
		}
		if (token.type === "heading") {
			flushCurrentChunk(true);
			// Drop deeper levels of the previous trail, then record this heading at its depth.
			currentHeadings = currentHeadings.slice(0, token.depth - 1);
			currentHeadings[token.depth - 1] = token.text.trim();
		}
		if (token.type === "list" && countTokens(token.raw) > maxTokens) {
			for (const item of token.items) processTextBlock(item.raw, currentHeadings);
		} else {
			const isAtomic = token.type === "table" || token.type === "code";
			processTextBlock(token.raw, currentHeadings, isAtomic);
		}
	}
	flushCurrentChunk(true);
	return chunks;
}
13837
13730
 
@@ -14719,7 +14612,6 @@ async function processOneFile(aiexDir, config, aiConfig, schemaName, filePath, m
14719
14612
  modelOverride,
14720
14613
  insert: options?.insert,
14721
14614
  force: options?.force,
14722
- agent: options?.agent,
14723
14615
  quiet: false
14724
14616
  });
14725
14617
  if (result.success) {
@@ -14759,8 +14651,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14759
14651
  })}`);
14760
14652
  if (await processOneFile(aiexDir, config, aiConfig, schemaName, file, modelOverride, {
14761
14653
  insert: options?.insert,
14762
- force: options?.force,
14763
- agent: options?.agent
14654
+ force: options?.force
14764
14655
  })) successCount++;
14765
14656
  else failCount++;
14766
14657
  }
@@ -14778,7 +14669,44 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14778
14669
 
14779
14670
  //#endregion
14780
14671
  //#region src/core/extract-runner.ts
14672
+ const encoding = getEncoding("cl100k_base");
14781
14673
  const JSON_EXT_RE$1 = /\.json$/;
14674
/**
 * Maps `fn` over `items` with at most `concurrency` calls in flight at once.
 *
 * @param {number} concurrency - Maximum number of simultaneous `fn` calls. Values below 1
 *   (or NaN) are treated as 1 so that every item is still processed.
 * @param {Array} items - Inputs, processed in index order.
 * @param {(item: any, index: number) => Promise<any>} fn - Async mapper.
 * @returns {Promise<Array>} Results in the same order as `items`.
 * @throws Rejects with the first error thrown by `fn`; no further items are started after that.
 */
async function limitConcurrency(concurrency, items, fn) {
	const results = Array.from({ length: items.length });
	const requested = Math.floor(Number(concurrency));
	// A non-positive or NaN limit would spawn zero workers and silently skip every item.
	const workerCount = Math.min(Number.isNaN(requested) || requested < 1 ? 1 : requested, items.length);
	let nextIndex = 0;
	let failed = false;
	const worker = async () => {
		// Index claiming is synchronous between awaits, so two workers never take the same item.
		while (!failed && nextIndex < items.length) {
			const currentIndex = nextIndex++;
			try {
				results[currentIndex] = await fn(items[currentIndex], currentIndex);
			} catch (error) {
				// Stop the sibling workers from picking up more work once the run has failed.
				failed = true;
				throw error;
			}
		}
	};
	await Promise.all(Array.from({ length: workerCount }, () => worker()));
	return results;
}
14687
/**
 * Collects lower-cased search keywords from a JSON-schema-like definition.
 *
 * For every property (recursing into nested objects and arrays of objects) it adds:
 * the full property name, the name's camelCase / separator-delimited parts longer than
 * one character, the `title`, and each word of the `description` longer than two characters.
 *
 * Empty or whitespace-only keywords are never returned: an empty keyword matches at every
 * position of a text, so a substring-counting loop that advances by the keyword length
 * would not terminate on it.
 *
 * @param {{properties?: object}} schema - Schema whose `properties` are walked.
 * @returns {string[]} Unique keywords in first-seen order.
 */
function getSchemaKeywords(schema) {
	const keywords = new Set();
	const addKeyword = (value) => {
		const keyword = value.trim().toLowerCase();
		if (keyword !== "") keywords.add(keyword);
	};
	const walk = (properties) => {
		if (!properties) return;
		for (const [name$1, prop] of Object.entries(properties)) {
			addKeyword(name$1);
			// "invoiceNumber" -> "invoice Number"; then split on whitespace and common separators.
			const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
			for (const part of parts) {
				if (part.length > 1) addKeyword(part);
			}
			if (!prop || typeof prop !== "object") continue;
			if (typeof prop.title === "string") addKeyword(prop.title);
			if (typeof prop.description === "string") {
				const descParts = prop.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
				for (const word of descParts) {
					if (word.length > 2) addKeyword(word);
				}
			}
			if (prop.type === "object") walk(prop.properties);
			if (prop.type === "array" && prop.items?.type === "object") walk(prop.items.properties);
		}
	};
	walk(schema.properties);
	return Array.from(keywords);
}
14782
14710
  async function ensureDatabaseReady(dbPath, schema) {
14783
14711
  try {
14784
14712
  await fs.access(dbPath);
@@ -14850,44 +14778,52 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14850
14778
  }
14851
14779
  const s = spinner();
14852
14780
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
14853
- const CHUNK_LIMIT = 4e4;
14781
+ const maxTokens = aiConfig.extraction?.maxTokens ?? 8e3;
14782
+ const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
14854
14783
  let result;
14855
- if (options?.agent || aiConfig.extraction?.mode === "react") {
14856
- if (!options?.quiet) consola.info(t("command.extract.file.reactAgentMode"));
14857
- const agentResult = await extractStructuredDataWithAgent({
14858
- config: aiConfig,
14859
- schema: schemaLoad.schema,
14860
- text: text$1 ?? "",
14861
- aiexDir,
14862
- modelOverride,
14863
- onAgentStep(step) {
14864
- if (!options?.quiet) {
14865
- if (step.thought) {
14866
- const thoughtPreview = step.thought.length > 100 ? `${step.thought.slice(0, 100)}...` : step.thought;
14867
- s.message(`${pc.cyan(t("command.extract.file.agentThought"))}: ${thoughtPreview.replace(/\n/g, " ")}`);
14784
+ const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
14785
+ if (text$1 && totalTokens > maxTokens) {
14786
+ if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
14787
+ length: totalTokens,
14788
+ limit: maxTokens
14789
+ }));
14790
+ const finalDocs = splitMarkdown(text$1, maxTokens, overlapTokens);
14791
+ if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
14792
+ let processedDocs = finalDocs;
14793
+ if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
14794
+ const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
14795
+ const keywords = getSchemaKeywords(schemaLoad.schema);
14796
+ const scoredChunks = finalDocs.map((doc, idx) => {
14797
+ if (idx === 0) return {
14798
+ index: idx,
14799
+ score: Number.POSITIVE_INFINITY
14800
+ };
14801
+ let score = 0;
14802
+ const docTextLower = doc.pageContent.toLowerCase();
14803
+ for (const kw of keywords) {
14804
+ let pos = docTextLower.indexOf(kw);
14805
+ while (pos !== -1) {
14806
+ score++;
14807
+ pos = docTextLower.indexOf(kw, pos + kw.length);
14868
14808
  }
14869
- if (step.toolCalls && step.toolCalls.length > 0) for (const call of step.toolCalls) consola.info(`[Agent Action] Calling tool: ${pc.green(call.toolName)}`);
14870
14809
  }
14810
+ return {
14811
+ index: idx,
14812
+ score
14813
+ };
14814
+ }).slice(1).sort((a, b) => b.score - a.score);
14815
+ const selectedIndices = new Set([0]);
14816
+ let keptCount = 0;
14817
+ for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
14818
+ selectedIndices.add(sc.index);
14819
+ keptCount++;
14871
14820
  }
14872
- });
14873
- if (!agentResult.success) {
14874
- if (!options?.quiet) {
14875
- s.stop(t("command.extract.file.extractFail"));
14876
- consola.error(agentResult.error);
14877
- }
14878
- return {
14879
- success: false,
14880
- error: agentResult.error
14881
- };
14821
+ processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
14822
+ if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
14823
+ original: finalDocs.length,
14824
+ filtered: processedDocs.length
14825
+ }));
14882
14826
  }
14883
- result = agentResult;
14884
- } else if (text$1 && text$1.length > CHUNK_LIMIT) {
14885
- if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
14886
- length: text$1.length,
14887
- limit: CHUNK_LIMIT
14888
- }));
14889
- const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT);
14890
- if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
14891
14827
  const chunkResults = [];
14892
14828
  const accumulatedTokens = {
14893
14829
  prompt: 0,
@@ -14896,53 +14832,68 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14896
14832
  };
14897
14833
  let success = true;
14898
14834
  let errorMsg = "";
14899
- for (let i = 0; i < finalDocs.length; i++) {
14900
- const doc = finalDocs[i];
14901
- if (!options?.quiet) s.message(t("command.extract.file.extractingChunk", {
14902
- current: i + 1,
14903
- total: finalDocs.length
14904
- }));
14905
- const headings = [];
14906
- if (doc.metadata) {
14907
- if (doc.metadata.h1) headings.push(doc.metadata.h1);
14908
- if (doc.metadata.h2) headings.push(doc.metadata.h2);
14909
- if (doc.metadata.h3) headings.push(doc.metadata.h3);
14910
- if (doc.metadata.h4) headings.push(doc.metadata.h4);
14911
- }
14912
- let chunkText = doc.pageContent;
14913
- if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14914
- const chunkResult = await extractStructuredData({
14915
- config: aiConfig,
14916
- schema: schemaLoad.schema,
14917
- text: chunkText,
14918
- aiexDir,
14919
- modelOverride,
14920
- onRetry(info) {
14921
- if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14922
- current: i + 1,
14923
- total: finalDocs.length,
14924
- code: info.statusCode,
14925
- delay: info.delayMs / 1e3,
14926
- attempt: info.attempt,
14927
- max: info.maxRetries
14928
- }));
14835
+ const extractionTasks = processedDocs.map((doc, i) => {
14836
+ return async () => {
14837
+ if (!success) return;
14838
+ const headings = [];
14839
+ if (doc.metadata) {
14840
+ if (doc.metadata.h1) headings.push(doc.metadata.h1);
14841
+ if (doc.metadata.h2) headings.push(doc.metadata.h2);
14842
+ if (doc.metadata.h3) headings.push(doc.metadata.h3);
14843
+ if (doc.metadata.h4) headings.push(doc.metadata.h4);
14929
14844
  }
14930
- });
14931
- if (!chunkResult.success) {
14932
- success = false;
14933
- errorMsg = chunkResult.error || t("common.unknownError");
14934
- if (!options?.quiet) {
14935
- s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14936
- consola.error(errorMsg);
14845
+ let chunkText = doc.pageContent;
14846
+ if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14847
+ const chunkResult = await extractStructuredData({
14848
+ config: aiConfig,
14849
+ schema: schemaLoad.schema,
14850
+ text: chunkText,
14851
+ aiexDir,
14852
+ modelOverride,
14853
+ onRetry(info) {
14854
+ if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14855
+ current: i + 1,
14856
+ total: processedDocs.length,
14857
+ code: info.statusCode,
14858
+ delay: info.delayMs / 1e3,
14859
+ attempt: info.attempt,
14860
+ max: info.maxRetries
14861
+ }));
14862
+ }
14863
+ });
14864
+ if (!chunkResult.success) {
14865
+ success = false;
14866
+ errorMsg = chunkResult.error || t("common.unknownError");
14867
+ if (!options?.quiet) {
14868
+ s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14869
+ consola.error(errorMsg);
14870
+ }
14871
+ return;
14937
14872
  }
14938
- break;
14939
- }
14940
- if (chunkResult.data) chunkResults.push(chunkResult.data);
14941
- if (chunkResult.tokensUsed) {
14942
- accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14943
- accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14944
- accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14945
- }
14873
+ if (chunkResult.data) chunkResults.push(chunkResult.data);
14874
+ if (chunkResult.tokensUsed) {
14875
+ accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14876
+ accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14877
+ accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14878
+ }
14879
+ };
14880
+ });
14881
+ const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
14882
+ if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
14883
+ current: 1,
14884
+ total: processedDocs.length
14885
+ }));
14886
+ try {
14887
+ await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
14888
+ if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
14889
+ current: idx + 1,
14890
+ total: processedDocs.length
14891
+ }));
14892
+ await task();
14893
+ });
14894
+ } catch (e) {
14895
+ success = false;
14896
+ errorMsg = e instanceof Error ? e.message : String(e);
14946
14897
  }
14947
14898
  if (!success) return {
14948
14899
  success: false,
@@ -15001,6 +14952,11 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15001
14952
  }
15002
14953
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
15003
14954
  if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
14955
+ if (result.evidenceSummary && !options?.quiet) {
14956
+ const summary = result.evidenceSummary;
14957
+ const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
14958
+ consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
14959
+ }
15004
14960
  if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
15005
14961
  prompt: result.tokensUsed.prompt,
15006
14962
  completion: result.tokensUsed.completion,
@@ -15029,6 +14985,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15029
14985
  outputPath: result.outputPath,
15030
14986
  data: result.data,
15031
14987
  tablesInserted: insertResult.tablesInserted,
14988
+ evidenceSummary: result.evidenceSummary,
15032
14989
  tokensUsed: result.tokensUsed
15033
14990
  };
15034
14991
  } else {
@@ -15055,11 +15012,12 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15055
15012
  success: true,
15056
15013
  outputPath: result.outputPath,
15057
15014
  data: result.data,
15015
+ evidenceSummary: result.evidenceSummary,
15058
15016
  tokensUsed: result.tokensUsed
15059
15017
  };
15060
15018
  }
15061
15019
  async function runAuditedExtraction(options) {
15062
- const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false, agent = false } = options;
15020
+ const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
15063
15021
  let fileHash;
15064
15022
  let isPlainTextFile = false;
15065
15023
  if (source.type === "file") {
@@ -15127,8 +15085,7 @@ async function runAuditedExtraction(options) {
15127
15085
  } else text$1 = source.text;
15128
15086
  const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
15129
15087
  quiet,
15130
- insert,
15131
- agent
15088
+ insert
15132
15089
  });
15133
15090
  if (r.success) {
15134
15091
  let notionPages;
@@ -15168,6 +15125,7 @@ async function runAuditedExtraction(options) {
15168
15125
  outputName: updated.outputName,
15169
15126
  tablesInserted: updated.tablesInserted,
15170
15127
  notionPages: updated.notionPages,
15128
+ evidenceSummary: r.evidenceSummary,
15171
15129
  tokensUsed: updated.tokensUsed,
15172
15130
  auditId: updated.id,
15173
15131
  fileHash
@@ -15587,12 +15545,6 @@ const extractCommand = defineCommand({
15587
15545
  type: "boolean",
15588
15546
  description: t("command.extract.args.force"),
15589
15547
  default: false
15590
- },
15591
- agent: {
15592
- type: "boolean",
15593
- alias: "a",
15594
- description: "Enable ReAct agent extraction mode",
15595
- default: false
15596
15548
  }
15597
15549
  },
15598
15550
  async run({ args, rawArgs }) {
@@ -15620,8 +15572,7 @@ const extractCommand = defineCommand({
15620
15572
  }
15621
15573
  const result$1 = await runBatchExtraction(aiexDir, config, aiConfig, args.schema, args.dir, args.glob, modelOverride, {
15622
15574
  insert: !args.noInsert,
15623
- force: args.force,
15624
- agent: args.agent
15575
+ force: args.force
15625
15576
  });
15626
15577
  if (!result$1.ok) {
15627
15578
  failCommand(result$1.error);
@@ -15652,8 +15603,7 @@ const extractCommand = defineCommand({
15652
15603
  modelOverride,
15653
15604
  insert: !args.noInsert,
15654
15605
  force: args.force,
15655
- quiet: false,
15656
- agent: args.agent
15606
+ quiet: false
15657
15607
  });
15658
15608
  if (!result.success) {
15659
15609
  failCommand(result.error);
@@ -16303,6 +16253,7 @@ function aiRoutes(config) {
16303
16253
  //#endregion
16304
16254
  //#region src/core/data-service.ts
16305
16255
  const FILE_REGEX = /\.json$/;
16256
+ const EVIDENCE_FILE_SUFFIX = ".evidence.json";
16306
16257
  const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
16307
16258
  const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
16308
16259
  const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
@@ -16318,6 +16269,24 @@ function getAuditNotionStatus(record) {
16318
16269
  if (record.status === "failed") return "failed";
16319
16270
  return "not_synced";
16320
16271
  }
16272
+ async function readEvidenceSummary(extractedDir, outputName) {
16273
+ const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
16274
+ try {
16275
+ const coverage = (await readFile(evidencePath))?.coverage;
16276
+ if (!coverage || typeof coverage !== "object") return void 0;
16277
+ return {
16278
+ path: evidencePath,
16279
+ fieldCount: Number(coverage.fieldCount) || 0,
16280
+ evidenceCount: Number(coverage.evidenceCount) || 0,
16281
+ foundCount: Number(coverage.foundCount) || 0,
16282
+ missingCount: Number(coverage.missingCount) || 0,
16283
+ inferredCount: Number(coverage.inferredCount) || 0,
16284
+ issueCount: Number(coverage.issueCount) || 0
16285
+ };
16286
+ } catch {
16287
+ return;
16288
+ }
16289
+ }
16321
16290
  async function getRowExtractionActions(aiexDir, tableName) {
16322
16291
  const actions = /* @__PURE__ */ new Map();
16323
16292
  const auditRecords = await listExtractionAuditRecords(aiexDir);
@@ -16345,7 +16314,7 @@ async function listExtractions(config) {
16345
16314
  const aiexDir = path.dirname(config.schemaPath);
16346
16315
  const extractedDir = path.join(aiexDir, "extracted");
16347
16316
  await fs.mkdir(extractedDir, { recursive: true });
16348
- const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
16317
+ const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
16349
16318
  const auditRecords = await listExtractionAuditRecords(aiexDir);
16350
16319
  const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
16351
16320
  const records = [];
@@ -16364,6 +16333,7 @@ async function listExtractions(config) {
16364
16333
  timestamp,
16365
16334
  fileSize: stat.size,
16366
16335
  modifiedAt: stat.mtime.toISOString(),
16336
+ evidenceSummary: await readEvidenceSummary(extractedDir, file),
16367
16337
  notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
16368
16338
  notionPages,
16369
16339
  notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
@@ -16543,6 +16513,7 @@ async function retryNotionSync(config, fileName) {
16543
16513
 
16544
16514
  //#endregion
16545
16515
  //#region src/server/routes/data.ts
16516
+ const JSON_FILE_SUFFIX_RE = /\.json$/;
16546
16517
  const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
16547
16518
  const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
16548
16519
  const tableQuerySchema = z.object({
@@ -16595,10 +16566,22 @@ function dataRoutes(config) {
16595
16566
  const filePath = path.join(extractedDir, name$1);
16596
16567
  try {
16597
16568
  const content = await fs.readFile(filePath, "utf-8");
16569
+ const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
16570
+ let evidenceSummary;
16571
+ try {
16572
+ const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
16573
+ evidenceSummary = evidence?.coverage ? {
16574
+ ...evidence.coverage,
16575
+ path: evidencePath
16576
+ } : void 0;
16577
+ } catch {
16578
+ evidenceSummary = void 0;
16579
+ }
16598
16580
  return c.json({
16599
16581
  success: true,
16600
16582
  content,
16601
- name: name$1
16583
+ name: name$1,
16584
+ evidenceSummary
16602
16585
  });
16603
16586
  } catch {
16604
16587
  return c.json({ error: t("server.extractionNotFound") }, 404);
@@ -16742,6 +16725,7 @@ function extractRoutes(config) {
16742
16725
  outputName: result.outputName,
16743
16726
  tablesInserted: result.tablesInserted,
16744
16727
  notionPages: result.notionPages,
16728
+ evidenceSummary: result.evidenceSummary,
16745
16729
  tokensUsed: result.tokensUsed,
16746
16730
  auditId: result.auditId
16747
16731
  }, 200);
@@ -16809,6 +16793,7 @@ function extractRoutes(config) {
16809
16793
  outputName: result.outputName,
16810
16794
  tablesInserted: result.tablesInserted,
16811
16795
  notionPages: result.notionPages,
16796
+ evidenceSummary: result.evidenceSummary,
16812
16797
  tokensUsed: result.tokensUsed,
16813
16798
  auditId: result.auditId
16814
16799
  }, 200);