aiex-cli 0.0.5-beta.2 → 0.0.5-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -202,6 +202,17 @@ aiex completion fish | source
202
202
 
203
203
  <br>
204
204
 
205
+ ## 📄 Large Document Processing
206
+
207
+ When processing very large documents (exceeding `40,000` characters), `aiex` runs an optimized **Pipeline Mode** to handle context window limits and control API costs:
208
+
209
+ - **Sliding Window & Overlapping Slices**: Splits the document logically at Markdown headings or paragraph boundaries. It uses an overlapping sliding window to ensure contextual continuity at slice boundaries. Active heading hierarchies are tracked and prepended to each chunk as context.
210
+ - **Concurrency Limiting**: To respect strict model rate limits, chunk extractions are processed in parallel with a strict concurrency limit (capped at 2 concurrent requests).
211
+ - **Pre-filtering**: Integrates hybrid search-based pre-filtering to score and select only the most relevant document chunks based on schema queries, preventing unnecessary token usage on unrelated sections.
212
+ - **Recursive Merging**: The final extracted JSON objects from each chunk are recursively merged, concatenating lists and deduplicating primitive fields.
213
+
214
+ <br>
215
+
205
216
  ## 🔧 AI Configuration
206
217
 
207
218
  aiex works with any OpenAI-compatible API provider. Configure in the Web UI (AI Settings panel):
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-DZyLrpqA.mjs";
1
+ import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-Cv7RArla.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -13128,7 +13128,7 @@ function propertyToExtractionSchema(property) {
13128
13128
  }
13129
13129
  return { type: nullableType(property.type) };
13130
13130
  }
13131
- function isRecord(value) {
13131
+ function isRecord$1(value) {
13132
13132
  return typeof value === "object" && value !== null && !Array.isArray(value);
13133
13133
  }
13134
13134
  function schemaToExtractionOutputSchema(schema) {
@@ -13166,7 +13166,7 @@ function validatePropertyValue(path$1, property, value, issues) {
13166
13166
  }
13167
13167
  return;
13168
13168
  case "object":
13169
- if (!isRecord(value)) {
13169
+ if (!isRecord$1(value)) {
13170
13170
  issues.push(`${path$1}: expected object or null`);
13171
13171
  return;
13172
13172
  }
@@ -13189,7 +13189,7 @@ function validateProperties(basePath, properties, data, issues) {
13189
13189
  }
13190
13190
  }
13191
13191
  function validateExtractedData(schema, data) {
13192
- if (!isRecord(data)) return {
13192
+ if (!isRecord$1(data)) return {
13193
13193
  success: false,
13194
13194
  error: "Extracted data must be a JSON object."
13195
13195
  };
@@ -13512,6 +13512,161 @@ function insertExtractedData(db, schema, data) {
13512
13512
  }
13513
13513
  }
13514
13514
 
13515
+ //#endregion
13516
+ //#region src/core/ai-extraction/json-merger.ts
13517
+ function isRecord(value) {
13518
+ return typeof value === "object" && value !== null && !Array.isArray(value);
13519
+ }
13520
+ function mergePropertyValue(property, values) {
13521
+ const nonNullValues = values.filter((v) => v !== null && v !== void 0);
13522
+ if (nonNullValues.length === 0) return null;
13523
+ if (property.type === "array") {
13524
+ const concatenated = [];
13525
+ for (const val of nonNullValues) if (Array.isArray(val)) concatenated.push(...val);
13526
+ return concatenated;
13527
+ }
13528
+ if (property.type === "object") {
13529
+ const childProperties = property.properties;
13530
+ if (!childProperties) {
13531
+ const mergedObj$1 = {};
13532
+ for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
13533
+ return mergedObj$1;
13534
+ }
13535
+ const mergedObj = {};
13536
+ for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
13537
+ return mergedObj;
13538
+ }
13539
+ const bestValue = nonNullValues.find((v) => {
13540
+ if (typeof v === "string") return v.trim() !== "";
13541
+ return true;
13542
+ });
13543
+ return bestValue !== void 0 ? bestValue : null;
13544
+ }
13545
+ /**
13546
+ * Merges structured extraction outputs from multiple document chunks
13547
+ * according to the schema properties.
13548
+ */
13549
+ function mergeExtractionResults(schema, results) {
13550
+ if (results.length === 0) return {};
13551
+ if (results.length === 1) return results[0];
13552
+ const merged = {};
13553
+ for (const [propName, propDef] of Object.entries(schema.properties)) {
13554
+ if (propDef.primary && propDef.autoIncrement) continue;
13555
+ merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
13556
+ }
13557
+ return merged;
13558
+ }
13559
+
13560
+ //#endregion
13561
+ //#region src/core/ai-extraction/text-splitter.ts
13562
+ const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
13563
+ /**
13564
+ * Splits a Markdown document into chunks based on header hierarchy.
13565
+ * Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
13566
+ * when a section exceeds the maxSize limit.
13567
+ */
13568
+ function splitMarkdown(text$1, maxSize = 4e4, overlapSize = 0) {
13569
+ const lines = text$1.split("\n");
13570
+ const chunks = [];
13571
+ let currentHeadings = [];
13572
+ let currentChunkLines = [];
13573
+ let currentSize = 0;
13574
+ let hasNewLines = false;
13575
+ const getMetadata = (headings) => {
13576
+ return {
13577
+ h1: headings[0] || void 0,
13578
+ h2: headings[1] || void 0,
13579
+ h3: headings[2] || void 0,
13580
+ h4: headings[3] || void 0
13581
+ };
13582
+ };
13583
+ const flushChunk = (isHeadingChange = false) => {
13584
+ if (currentChunkLines.length === 0 || !hasNewLines) {
13585
+ currentChunkLines = [];
13586
+ currentSize = 0;
13587
+ hasNewLines = false;
13588
+ return;
13589
+ }
13590
+ const pageContent = currentChunkLines.join("\n");
13591
+ let lastChunkContent = "";
13592
+ if (pageContent.length > maxSize) {
13593
+ const paragraphs = pageContent.split("\n\n");
13594
+ let subLines = [];
13595
+ let subSize = 0;
13596
+ for (const para of paragraphs) {
13597
+ const paraSize = para.length;
13598
+ if (subSize + paraSize > maxSize && subLines.length > 0) {
13599
+ const content = subLines.join("\n\n");
13600
+ chunks.push({
13601
+ pageContent: content,
13602
+ metadata: getMetadata(currentHeadings)
13603
+ });
13604
+ const overlapParas = [];
13605
+ let currentOverlapSize = 0;
13606
+ for (let j = subLines.length - 1; j >= 0; j--) {
13607
+ const p = subLines[j];
13608
+ if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
13609
+ overlapParas.unshift(p);
13610
+ currentOverlapSize += p.length + 2;
13611
+ }
13612
+ subLines = [...overlapParas];
13613
+ subSize = currentOverlapSize;
13614
+ }
13615
+ subLines.push(para);
13616
+ subSize += paraSize + 2;
13617
+ }
13618
+ if (subLines.length > 0) {
13619
+ const content = subLines.join("\n\n");
13620
+ chunks.push({
13621
+ pageContent: content,
13622
+ metadata: getMetadata(currentHeadings)
13623
+ });
13624
+ lastChunkContent = content;
13625
+ }
13626
+ } else {
13627
+ chunks.push({
13628
+ pageContent,
13629
+ metadata: getMetadata(currentHeadings)
13630
+ });
13631
+ lastChunkContent = pageContent;
13632
+ }
13633
+ if (!isHeadingChange && lastChunkContent && overlapSize > 0) {
13634
+ const paragraphs = lastChunkContent.split("\n\n");
13635
+ const overlapParas = [];
13636
+ let currentOverlapSize = 0;
13637
+ for (let j = paragraphs.length - 1; j >= 0; j--) {
13638
+ const p = paragraphs[j];
13639
+ if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
13640
+ overlapParas.unshift(p);
13641
+ currentOverlapSize += p.length + 2;
13642
+ }
13643
+ const overlapText = overlapParas.join("\n\n");
13644
+ currentChunkLines = overlapText.split("\n");
13645
+ currentSize = overlapText.length;
13646
+ } else {
13647
+ currentChunkLines = [];
13648
+ currentSize = 0;
13649
+ }
13650
+ hasNewLines = false;
13651
+ };
13652
+ for (const line of lines) {
13653
+ const headingMatch = line.match(HEADING_RE);
13654
+ if (headingMatch) {
13655
+ flushChunk(true);
13656
+ const depth = headingMatch[1].length;
13657
+ const title = headingMatch[2].trim();
13658
+ currentHeadings = currentHeadings.slice(0, depth - 1);
13659
+ currentHeadings[depth - 1] = title;
13660
+ }
13661
+ currentChunkLines.push(line);
13662
+ currentSize += line.length + 1;
13663
+ hasNewLines = true;
13664
+ if (currentSize > maxSize) flushChunk(false);
13665
+ }
13666
+ flushChunk(true);
13667
+ return chunks;
13668
+ }
13669
+
13515
13670
  //#endregion
13516
13671
  //#region src/core/extraction-audit.ts
13517
13672
  const AUDIT_ID_RE = /^[\w.-]+$/;
@@ -14454,6 +14609,42 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14454
14609
  //#endregion
14455
14610
  //#region src/core/extract-runner.ts
14456
14611
  const JSON_EXT_RE$1 = /\.json$/;
14612
+ async function limitConcurrency(concurrency, items, fn) {
14613
+ const results = Array.from({ length: items.length });
14614
+ let nextIndex = 0;
14615
+ async function worker() {
14616
+ while (nextIndex < items.length) {
14617
+ const currentIndex = nextIndex++;
14618
+ results[currentIndex] = await fn(items[currentIndex], currentIndex);
14619
+ }
14620
+ }
14621
+ const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
14622
+ await Promise.all(workers);
14623
+ return results;
14624
+ }
14625
+ function getSchemaKeywords(schema) {
14626
+ const keywords = /* @__PURE__ */ new Set();
14627
+ function walk(properties) {
14628
+ if (!properties) return;
14629
+ for (const [name$1, prop] of Object.entries(properties)) {
14630
+ keywords.add(name$1.toLowerCase());
14631
+ const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
14632
+ for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
14633
+ if (prop && typeof prop === "object") {
14634
+ const p = prop;
14635
+ if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
14636
+ if (typeof p.description === "string") {
14637
+ const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
14638
+ for (const d of descParts) if (d.length > 2) keywords.add(d);
14639
+ }
14640
+ if (p.type === "object") walk(p.properties);
14641
+ if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
14642
+ }
14643
+ }
14644
+ }
14645
+ walk(schema.properties);
14646
+ return Array.from(keywords);
14647
+ }
14457
14648
  async function ensureDatabaseReady(dbPath, schema) {
14458
14649
  try {
14459
14650
  await fs.access(dbPath);
@@ -14525,7 +14716,151 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14525
14716
  }
14526
14717
  const s = spinner();
14527
14718
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
14528
- const result = await extractStructuredData({
14719
+ const CHUNK_LIMIT = 4e4;
14720
+ let result;
14721
+ if (text$1 && text$1.length > CHUNK_LIMIT) {
14722
+ if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
14723
+ length: text$1.length,
14724
+ limit: CHUNK_LIMIT
14725
+ }));
14726
+ const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT, aiConfig.extraction?.overlapSize ?? 2e3);
14727
+ if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
14728
+ let processedDocs = finalDocs;
14729
+ if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
14730
+ const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
14731
+ const keywords = getSchemaKeywords(schemaLoad.schema);
14732
+ const scoredChunks = finalDocs.map((doc, idx) => {
14733
+ if (idx === 0) return {
14734
+ index: idx,
14735
+ score: Number.POSITIVE_INFINITY
14736
+ };
14737
+ let score = 0;
14738
+ const docTextLower = doc.pageContent.toLowerCase();
14739
+ for (const kw of keywords) {
14740
+ let pos = docTextLower.indexOf(kw);
14741
+ while (pos !== -1) {
14742
+ score++;
14743
+ pos = docTextLower.indexOf(kw, pos + kw.length);
14744
+ }
14745
+ }
14746
+ return {
14747
+ index: idx,
14748
+ score
14749
+ };
14750
+ }).slice(1).sort((a, b) => b.score - a.score);
14751
+ const selectedIndices = new Set([0]);
14752
+ let keptCount = 0;
14753
+ for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
14754
+ selectedIndices.add(sc.index);
14755
+ keptCount++;
14756
+ }
14757
+ processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
14758
+ if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
14759
+ original: finalDocs.length,
14760
+ filtered: processedDocs.length
14761
+ }));
14762
+ }
14763
+ const chunkResults = [];
14764
+ const accumulatedTokens = {
14765
+ prompt: 0,
14766
+ completion: 0,
14767
+ total: 0
14768
+ };
14769
+ let success = true;
14770
+ let errorMsg = "";
14771
+ const extractionTasks = processedDocs.map((doc, i) => {
14772
+ return async () => {
14773
+ if (!success) return;
14774
+ const headings = [];
14775
+ if (doc.metadata) {
14776
+ if (doc.metadata.h1) headings.push(doc.metadata.h1);
14777
+ if (doc.metadata.h2) headings.push(doc.metadata.h2);
14778
+ if (doc.metadata.h3) headings.push(doc.metadata.h3);
14779
+ if (doc.metadata.h4) headings.push(doc.metadata.h4);
14780
+ }
14781
+ let chunkText = doc.pageContent;
14782
+ if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14783
+ const chunkResult = await extractStructuredData({
14784
+ config: aiConfig,
14785
+ schema: schemaLoad.schema,
14786
+ text: chunkText,
14787
+ aiexDir,
14788
+ modelOverride,
14789
+ onRetry(info) {
14790
+ if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14791
+ current: i + 1,
14792
+ total: processedDocs.length,
14793
+ code: info.statusCode,
14794
+ delay: info.delayMs / 1e3,
14795
+ attempt: info.attempt,
14796
+ max: info.maxRetries
14797
+ }));
14798
+ }
14799
+ });
14800
+ if (!chunkResult.success) {
14801
+ success = false;
14802
+ errorMsg = chunkResult.error || t("common.unknownError");
14803
+ if (!options?.quiet) {
14804
+ s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14805
+ consola.error(errorMsg);
14806
+ }
14807
+ return;
14808
+ }
14809
+ if (chunkResult.data) chunkResults.push(chunkResult.data);
14810
+ if (chunkResult.tokensUsed) {
14811
+ accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14812
+ accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14813
+ accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14814
+ }
14815
+ };
14816
+ });
14817
+ const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
14818
+ if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
14819
+ current: 1,
14820
+ total: processedDocs.length
14821
+ }));
14822
+ try {
14823
+ await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
14824
+ if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
14825
+ current: idx + 1,
14826
+ total: processedDocs.length
14827
+ }));
14828
+ await task();
14829
+ });
14830
+ } catch (e) {
14831
+ success = false;
14832
+ errorMsg = e instanceof Error ? e.message : String(e);
14833
+ }
14834
+ if (!success) return {
14835
+ success: false,
14836
+ error: errorMsg
14837
+ };
14838
+ const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
14839
+ const validation = validateExtractedData(schemaLoad.schema, mergedData);
14840
+ if (!validation.success) {
14841
+ const valError = validation.error || "Merged data validation failed";
14842
+ if (!options?.quiet) {
14843
+ s.stop(t("command.extract.file.validationFail"));
14844
+ consola.error(valError);
14845
+ }
14846
+ return {
14847
+ success: false,
14848
+ error: valError
14849
+ };
14850
+ }
14851
+ const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
14852
+ await fs.mkdir(outputDir, { recursive: true });
14853
+ const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
14854
+ const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
14855
+ const finalMergedOutputPath = path.join(outputDir, outputFileName);
14856
+ await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
14857
+ result = {
14858
+ success: true,
14859
+ data: mergedData,
14860
+ tokensUsed: accumulatedTokens,
14861
+ outputPath: finalMergedOutputPath
14862
+ };
14863
+ } else result = await extractStructuredData({
14529
14864
  config: aiConfig,
14530
14865
  schema: schemaLoad.schema,
14531
14866
  text: text$1 ?? "",
@@ -14553,6 +14888,11 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14553
14888
  }
14554
14889
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
14555
14890
  if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
14891
+ if (result.evidenceSummary && !options?.quiet) {
14892
+ const summary = result.evidenceSummary;
14893
+ const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
14894
+ consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
14895
+ }
14556
14896
  if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
14557
14897
  prompt: result.tokensUsed.prompt,
14558
14898
  completion: result.tokensUsed.completion,
@@ -14581,6 +14921,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14581
14921
  outputPath: result.outputPath,
14582
14922
  data: result.data,
14583
14923
  tablesInserted: insertResult.tablesInserted,
14924
+ evidenceSummary: result.evidenceSummary,
14584
14925
  tokensUsed: result.tokensUsed
14585
14926
  };
14586
14927
  } else {
@@ -14607,6 +14948,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14607
14948
  success: true,
14608
14949
  outputPath: result.outputPath,
14609
14950
  data: result.data,
14951
+ evidenceSummary: result.evidenceSummary,
14610
14952
  tokensUsed: result.tokensUsed
14611
14953
  };
14612
14954
  }
@@ -14719,6 +15061,7 @@ async function runAuditedExtraction(options) {
14719
15061
  outputName: updated.outputName,
14720
15062
  tablesInserted: updated.tablesInserted,
14721
15063
  notionPages: updated.notionPages,
15064
+ evidenceSummary: r.evidenceSummary,
14722
15065
  tokensUsed: updated.tokensUsed,
14723
15066
  auditId: updated.id,
14724
15067
  fileHash
@@ -15846,6 +16189,7 @@ function aiRoutes(config) {
15846
16189
  //#endregion
15847
16190
  //#region src/core/data-service.ts
15848
16191
  const FILE_REGEX = /\.json$/;
16192
+ const EVIDENCE_FILE_SUFFIX = ".evidence.json";
15849
16193
  const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
15850
16194
  const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
15851
16195
  const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
@@ -15861,6 +16205,24 @@ function getAuditNotionStatus(record) {
15861
16205
  if (record.status === "failed") return "failed";
15862
16206
  return "not_synced";
15863
16207
  }
16208
+ async function readEvidenceSummary(extractedDir, outputName) {
16209
+ const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
16210
+ try {
16211
+ const coverage = (await readFile(evidencePath))?.coverage;
16212
+ if (!coverage || typeof coverage !== "object") return void 0;
16213
+ return {
16214
+ path: evidencePath,
16215
+ fieldCount: Number(coverage.fieldCount) || 0,
16216
+ evidenceCount: Number(coverage.evidenceCount) || 0,
16217
+ foundCount: Number(coverage.foundCount) || 0,
16218
+ missingCount: Number(coverage.missingCount) || 0,
16219
+ inferredCount: Number(coverage.inferredCount) || 0,
16220
+ issueCount: Number(coverage.issueCount) || 0
16221
+ };
16222
+ } catch {
16223
+ return;
16224
+ }
16225
+ }
15864
16226
  async function getRowExtractionActions(aiexDir, tableName) {
15865
16227
  const actions = /* @__PURE__ */ new Map();
15866
16228
  const auditRecords = await listExtractionAuditRecords(aiexDir);
@@ -15888,7 +16250,7 @@ async function listExtractions(config) {
15888
16250
  const aiexDir = path.dirname(config.schemaPath);
15889
16251
  const extractedDir = path.join(aiexDir, "extracted");
15890
16252
  await fs.mkdir(extractedDir, { recursive: true });
15891
- const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
16253
+ const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
15892
16254
  const auditRecords = await listExtractionAuditRecords(aiexDir);
15893
16255
  const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
15894
16256
  const records = [];
@@ -15907,6 +16269,7 @@ async function listExtractions(config) {
15907
16269
  timestamp,
15908
16270
  fileSize: stat.size,
15909
16271
  modifiedAt: stat.mtime.toISOString(),
16272
+ evidenceSummary: await readEvidenceSummary(extractedDir, file),
15910
16273
  notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
15911
16274
  notionPages,
15912
16275
  notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
@@ -16086,6 +16449,7 @@ async function retryNotionSync(config, fileName) {
16086
16449
 
16087
16450
  //#endregion
16088
16451
  //#region src/server/routes/data.ts
16452
+ const JSON_FILE_SUFFIX_RE = /\.json$/;
16089
16453
  const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
16090
16454
  const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
16091
16455
  const tableQuerySchema = z.object({
@@ -16138,10 +16502,22 @@ function dataRoutes(config) {
16138
16502
  const filePath = path.join(extractedDir, name$1);
16139
16503
  try {
16140
16504
  const content = await fs.readFile(filePath, "utf-8");
16505
+ const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
16506
+ let evidenceSummary;
16507
+ try {
16508
+ const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
16509
+ evidenceSummary = evidence?.coverage ? {
16510
+ ...evidence.coverage,
16511
+ path: evidencePath
16512
+ } : void 0;
16513
+ } catch {
16514
+ evidenceSummary = void 0;
16515
+ }
16141
16516
  return c.json({
16142
16517
  success: true,
16143
16518
  content,
16144
- name: name$1
16519
+ name: name$1,
16520
+ evidenceSummary
16145
16521
  });
16146
16522
  } catch {
16147
16523
  return c.json({ error: t("server.extractionNotFound") }, 404);
@@ -16285,6 +16661,7 @@ function extractRoutes(config) {
16285
16661
  outputName: result.outputName,
16286
16662
  tablesInserted: result.tablesInserted,
16287
16663
  notionPages: result.notionPages,
16664
+ evidenceSummary: result.evidenceSummary,
16288
16665
  tokensUsed: result.tokensUsed,
16289
16666
  auditId: result.auditId
16290
16667
  }, 200);
@@ -16352,6 +16729,7 @@ function extractRoutes(config) {
16352
16729
  outputName: result.outputName,
16353
16730
  tablesInserted: result.tablesInserted,
16354
16731
  notionPages: result.notionPages,
16732
+ evidenceSummary: result.evidenceSummary,
16355
16733
  tokensUsed: result.tokensUsed,
16356
16734
  auditId: result.auditId
16357
16735
  }, 200);
@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
74
74
  //#endregion
75
75
  //#region package.json
76
76
  var name = "aiex-cli";
77
- var version = "0.0.5-beta.2";
77
+ var version = "0.0.5-beta.4";
78
78
  var description = "JSON Schema → SQLite with AI-powered data extraction";
79
79
  var package_default = {
80
80
  name,
@@ -228,7 +228,14 @@ const PromptConfigSchema = z.object({
228
228
  systemTemplate: z.string().min(1),
229
229
  userTemplate: z.string().min(1)
230
230
  });
231
- const ExtractionConfigSchema = z.object({ outputDir: z.string().min(1) });
231
+ const ExtractionConfigSchema = z.object({
232
+ outputDir: z.string().min(1),
233
+ mode: z.enum(["pipeline"]).default("pipeline").optional(),
234
+ concurrency: z.number().int().min(1).optional(),
235
+ overlapSize: z.number().int().nonnegative().optional(),
236
+ preFiltering: z.boolean().optional(),
237
+ preFilteringLimit: z.number().int().min(1).optional()
238
+ });
232
239
  const ImageOcrConfigSchema = z.object({
233
240
  ocrFallback: z.enum([
234
241
  "auto",
@@ -335,7 +342,10 @@ Extraction requirements:
335
342
  userTemplate: `Please extract data from the following text:
336
343
  {text}`
337
344
  };
338
- const DEFAULT_EXTRACTION_CONFIG = { outputDir: ".aiex/extracted" };
345
+ const DEFAULT_EXTRACTION_CONFIG = {
346
+ outputDir: ".aiex/extracted",
347
+ mode: "pipeline"
348
+ };
339
349
  const DEFAULT_IMAGE_OCR_CONFIG = {
340
350
  ocrFallback: "auto",
341
351
  ocrLanguages: "en-US, zh-Hans",
@@ -567,6 +577,13 @@ const en = {
567
577
  extractFail: "Extraction failed",
568
578
  extractComplete: "Extraction complete",
569
579
  extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
580
+ chunking: "Input text length ({{length}} chars) exceeds limit ({{limit}} chars). Splitting into chunks...",
581
+ chunksCount: "Split into {{count}} chunk(s).",
582
+ preFiltering: "Hybrid pre-filtering: selected {{filtered}} out of {{original}} chunks based on schema relevance.",
583
+ extractingChunk: "Extracting chunk {{current}}/{{total}}...",
584
+ extractRetryChunk: "Chunk {{current}}/{{total}} API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
585
+ extractFailChunk: "Extraction failed for chunk {{current}}/{{total}}",
586
+ validationFail: "Merged data validation failed",
570
587
  resultSaved: "Result saved: {{path}}",
571
588
  tokenUsage: "Token usage: prompt={{prompt}}, completion={{completion}}, total={{total}}",
572
589
  insertingDb: "Inserting into database...",
@@ -956,7 +973,7 @@ async function initI18n(lng) {
956
973
  fallbackLng: "en",
957
974
  resources: {
958
975
  "en": { translation: en },
959
- "zh-CN": { translation: await import("./zh-CN-Qcn0DHFh.mjs").then((m) => m.zhCN) }
976
+ "zh-CN": { translation: await import("./zh-CN-CyL-61Ow.mjs").then((m) => m.zhCN) }
960
977
  },
961
978
  interpolation: { escapeValue: false },
962
979
  returnNull: false