aiex-cli 0.0.5-beta.5 → 0.0.6-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -202,17 +202,6 @@ aiex completion fish | source
202
202
 
203
203
  <br>
204
204
 
205
- ## 📄 Large Document Processing
206
-
207
- When processing very large documents (exceeding `40,000` characters), `aiex` runs an optimized **Pipeline Mode** to handle context window limits and control API costs:
208
-
209
- - **Token-Aware AST Splitting**: Parses structural Markdown elements (headings, paragraphs, lists) using an AST-based parser (`marked.lexer`) and splits them using precise token counters (`js-tiktoken`). Active heading hierarchies are tracked and prepended to each chunk as context. Tables and code blocks are kept intact (atomic blocks) to avoid syntax corruption.
210
- - **Concurrency Limiting**: To respect strict model rate limits, chunk extractions are processed in parallel with a strict concurrency limit (capped at 2 concurrent requests).
211
- - **Pre-filtering**: Integrates hybrid search-based pre-filtering to score and select only the most relevant document chunks based on schema queries, preventing unnecessary token usage on unrelated sections.
212
- - **Recursive Merging**: The final extracted JSON objects from each chunk are recursively merged, concatenating lists and deduplicating primitive fields.
213
-
214
- <br>
215
-
216
205
  ## 🔧 AI Configuration
217
206
 
218
207
  aiex works with any OpenAI-compatible API provider. Configure in the Web UI (AI Settings panel):
package/dist/cli.mjs CHANGED
@@ -1,4 +1,4 @@
1
- import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-NTNBFeBw.mjs";
1
+ import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-hWEvJ4lw.mjs";
2
2
  import { createRequire } from "node:module";
3
3
  import fs from "node:fs/promises";
4
4
  import os from "node:os";
@@ -17,7 +17,6 @@ import Database from "better-sqlite3";
17
17
  import pc from "picocolors";
18
18
  import { Buffer } from "node:buffer";
19
19
  import * as XLSX from "xlsx";
20
- import { getEncoding } from "js-tiktoken";
21
20
  import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
22
21
  import { APICallError, Output, generateText, jsonSchema } from "ai";
23
22
  import pRetry from "p-retry";
@@ -25,7 +24,6 @@ import mime from "mime";
25
24
  import { jsonrepair } from "jsonrepair";
26
25
  import { LangfuseSpanProcessor } from "@langfuse/otel";
27
26
  import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
28
- import { marked } from "marked";
29
27
  import crypto from "node:crypto";
30
28
  import { Client, extractNotionId } from "@notionhq/client";
31
29
  import { execa } from "execa";
@@ -13130,7 +13128,7 @@ function propertyToExtractionSchema(property) {
13130
13128
  }
13131
13129
  return { type: nullableType(property.type) };
13132
13130
  }
13133
- function isRecord$1(value) {
13131
+ function isRecord(value) {
13134
13132
  return typeof value === "object" && value !== null && !Array.isArray(value);
13135
13133
  }
13136
13134
  function schemaToExtractionOutputSchema(schema) {
@@ -13168,7 +13166,7 @@ function validatePropertyValue(path$1, property, value, issues) {
13168
13166
  }
13169
13167
  return;
13170
13168
  case "object":
13171
- if (!isRecord$1(value)) {
13169
+ if (!isRecord(value)) {
13172
13170
  issues.push(`${path$1}: expected object or null`);
13173
13171
  return;
13174
13172
  }
@@ -13191,7 +13189,7 @@ function validateProperties(basePath, properties, data, issues) {
13191
13189
  }
13192
13190
  }
13193
13191
  function validateExtractedData(schema, data) {
13194
- if (!isRecord$1(data)) return {
13192
+ if (!isRecord(data)) return {
13195
13193
  success: false,
13196
13194
  error: "Extracted data must be a JSON object."
13197
13195
  };
@@ -13514,220 +13512,6 @@ function insertExtractedData(db, schema, data) {
13514
13512
  }
13515
13513
  }
13516
13514
 
13517
- //#endregion
13518
- //#region src/core/ai-extraction/json-merger.ts
13519
- function isRecord(value) {
13520
- return typeof value === "object" && value !== null && !Array.isArray(value);
13521
- }
13522
- function mergePropertyValue(property, values) {
13523
- const nonNullValues = values.filter((v) => v !== null && v !== void 0);
13524
- if (nonNullValues.length === 0) return null;
13525
- if (property.type === "array") {
13526
- const concatenated = [];
13527
- for (const val of nonNullValues) if (Array.isArray(val)) concatenated.push(...val);
13528
- return concatenated;
13529
- }
13530
- if (property.type === "object") {
13531
- const childProperties = property.properties;
13532
- if (!childProperties) {
13533
- const mergedObj$1 = {};
13534
- for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
13535
- return mergedObj$1;
13536
- }
13537
- const mergedObj = {};
13538
- for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
13539
- return mergedObj;
13540
- }
13541
- const bestValue = nonNullValues.find((v) => {
13542
- if (typeof v === "string") return v.trim() !== "";
13543
- return true;
13544
- });
13545
- return bestValue !== void 0 ? bestValue : null;
13546
- }
13547
- /**
13548
- * Merges structured extraction outputs from multiple document chunks
13549
- * according to the schema properties.
13550
- */
13551
- function mergeExtractionResults(schema, results) {
13552
- if (results.length === 0) return {};
13553
- if (results.length === 1) return results[0];
13554
- const merged = {};
13555
- for (const [propName, propDef] of Object.entries(schema.properties)) {
13556
- if (propDef.primary && propDef.autoIncrement) continue;
13557
- merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
13558
- }
13559
- return merged;
13560
- }
13561
-
13562
- //#endregion
13563
- //#region src/core/ai-extraction/text-splitter.ts
13564
- const encoding$1 = getEncoding("cl100k_base");
13565
- function countTokens(text$1) {
13566
- return encoding$1.encode(text$1).length;
13567
- }
13568
- function formatHeadingContext(headings) {
13569
- const active = headings.filter(Boolean);
13570
- if (active.length === 0) return "";
13571
- return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
13572
- }
13573
- function getMetadata(headings) {
13574
- return {
13575
- h1: headings[0] || void 0,
13576
- h2: headings[1] || void 0,
13577
- h3: headings[2] || void 0,
13578
- h4: headings[3] || void 0
13579
- };
13580
- }
13581
- /**
13582
- * Splits text recursively using a list of separators.
13583
- * Preserves the separators when re-joining.
13584
- */
13585
- function splitTextRecursively(text$1, maxTokens, separators = [
13586
- "\n\n",
13587
- "\n",
13588
- "。",
13589
- ". ",
13590
- " "
13591
- ]) {
13592
- if (countTokens(text$1) <= maxTokens) return [text$1];
13593
- if (separators.length === 0) {
13594
- const chunks = [];
13595
- let current = "";
13596
- for (const char of text$1) if (countTokens(current + char) > maxTokens) {
13597
- chunks.push(current);
13598
- current = char;
13599
- } else current += char;
13600
- if (current) chunks.push(current);
13601
- return chunks;
13602
- }
13603
- const separator = separators[0];
13604
- const nextSeparators = separators.slice(1);
13605
- const parts = text$1.split(separator);
13606
- const result = [];
13607
- let currentChunk = [];
13608
- let currentChunkTokens = 0;
13609
- for (let i = 0; i < parts.length; i++) {
13610
- const part = parts[i];
13611
- const itemText = part + (i < parts.length - 1 ? separator : "");
13612
- const partTokens = countTokens(itemText);
13613
- if (partTokens > maxTokens) {
13614
- if (currentChunk.length > 0) {
13615
- result.push(currentChunk.join(""));
13616
- currentChunk = [];
13617
- currentChunkTokens = 0;
13618
- }
13619
- const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
13620
- for (let j = 0; j < subParts.length; j++) {
13621
- const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
13622
- result.push(finalSub);
13623
- }
13624
- } else if (currentChunkTokens + partTokens > maxTokens) {
13625
- result.push(currentChunk.join(""));
13626
- currentChunk = [itemText];
13627
- currentChunkTokens = partTokens;
13628
- } else {
13629
- currentChunk.push(itemText);
13630
- currentChunkTokens += partTokens;
13631
- }
13632
- }
13633
- if (currentChunk.length > 0) result.push(currentChunk.join(""));
13634
- return result;
13635
- }
13636
- /**
13637
- * Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
13638
- * Protects tables, list items, and code blocks from being broken.
13639
- */
13640
- function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
13641
- const tokens = marked.lexer(text$1);
13642
- const chunks = [];
13643
- let currentHeadings = [];
13644
- let currentChunkList = [];
13645
- let accumulatedTokens = 0;
13646
- const flushCurrentChunk = (isHeadingChange = false) => {
13647
- if (currentChunkList.length === 0) return;
13648
- const pageContent = currentChunkList.map((item) => item.text).join("");
13649
- const firstHeadings = currentChunkList[0].headings;
13650
- chunks.push({
13651
- pageContent,
13652
- metadata: getMetadata(firstHeadings)
13653
- });
13654
- if (isHeadingChange || overlapTokens <= 0) {
13655
- currentChunkList = [];
13656
- accumulatedTokens = 0;
13657
- } else {
13658
- const overlapItems = [];
13659
- let currentOverlapTokens = 0;
13660
- for (let i = currentChunkList.length - 1; i >= 0; i--) {
13661
- const item = currentChunkList[i];
13662
- const itemTokens = countTokens(item.text);
13663
- if (currentOverlapTokens + itemTokens > overlapTokens && overlapItems.length > 0) break;
13664
- overlapItems.unshift(item);
13665
- currentOverlapTokens += itemTokens;
13666
- }
13667
- currentChunkList = [...overlapItems];
13668
- accumulatedTokens = currentOverlapTokens;
13669
- }
13670
- };
13671
- for (const token of tokens) {
13672
- if (token.type === "space") {
13673
- if (currentChunkList.length > 0) {
13674
- currentChunkList[currentChunkList.length - 1].text += token.raw;
13675
- accumulatedTokens += countTokens(token.raw);
13676
- }
13677
- continue;
13678
- }
13679
- if (token.type === "heading") {
13680
- flushCurrentChunk(true);
13681
- const depth = token.depth;
13682
- const title = token.text.trim();
13683
- currentHeadings = currentHeadings.slice(0, depth - 1);
13684
- currentHeadings[depth - 1] = title;
13685
- }
13686
- const rawText = token.raw;
13687
- if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
13688
- else {
13689
- const isAtomic = token.type === "table" || token.type === "code";
13690
- processTextBlock(rawText, currentHeadings, isAtomic);
13691
- }
13692
- }
13693
- flushCurrentChunk(true);
13694
- return chunks;
13695
- function processTextBlock(blockText, headings, isAtomic = false) {
13696
- const blockTokens = countTokens(blockText);
13697
- const contextTokens = countTokens(formatHeadingContext(headings));
13698
- const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
13699
- const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
13700
- if (blockTokens > budgetLimit) if (isAtomic) {
13701
- flushCurrentChunk(false);
13702
- currentChunkList.push({
13703
- text: blockText,
13704
- headings: [...headings]
13705
- });
13706
- accumulatedTokens = blockTokens;
13707
- flushCurrentChunk(false);
13708
- } else {
13709
- flushCurrentChunk(false);
13710
- const subBlocks = splitTextRecursively(blockText, budgetLimit);
13711
- for (const sub of subBlocks) {
13712
- currentChunkList.push({
13713
- text: sub,
13714
- headings: [...headings]
13715
- });
13716
- accumulatedTokens += countTokens(sub);
13717
- if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
13718
- }
13719
- }
13720
- else {
13721
- if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
13722
- currentChunkList.push({
13723
- text: blockText,
13724
- headings: [...headings]
13725
- });
13726
- accumulatedTokens += blockTokens;
13727
- }
13728
- }
13729
- }
13730
-
13731
13515
  //#endregion
13732
13516
  //#region src/core/extraction-audit.ts
13733
13517
  const AUDIT_ID_RE = /^[\w.-]+$/;
@@ -14669,44 +14453,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
14669
14453
 
14670
14454
  //#endregion
14671
14455
  //#region src/core/extract-runner.ts
14672
- const encoding = getEncoding("cl100k_base");
14673
14456
  const JSON_EXT_RE$1 = /\.json$/;
14674
- async function limitConcurrency(concurrency, items, fn) {
14675
- const results = Array.from({ length: items.length });
14676
- let nextIndex = 0;
14677
- async function worker() {
14678
- while (nextIndex < items.length) {
14679
- const currentIndex = nextIndex++;
14680
- results[currentIndex] = await fn(items[currentIndex], currentIndex);
14681
- }
14682
- }
14683
- const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
14684
- await Promise.all(workers);
14685
- return results;
14686
- }
14687
- function getSchemaKeywords(schema) {
14688
- const keywords = /* @__PURE__ */ new Set();
14689
- function walk(properties) {
14690
- if (!properties) return;
14691
- for (const [name$1, prop] of Object.entries(properties)) {
14692
- keywords.add(name$1.toLowerCase());
14693
- const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
14694
- for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
14695
- if (prop && typeof prop === "object") {
14696
- const p = prop;
14697
- if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
14698
- if (typeof p.description === "string") {
14699
- const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
14700
- for (const d of descParts) if (d.length > 2) keywords.add(d);
14701
- }
14702
- if (p.type === "object") walk(p.properties);
14703
- if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
14704
- }
14705
- }
14706
- }
14707
- walk(schema.properties);
14708
- return Array.from(keywords);
14709
- }
14710
14457
  async function ensureDatabaseReady(dbPath, schema) {
14711
14458
  try {
14712
14459
  await fs.access(dbPath);
@@ -14778,153 +14525,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14778
14525
  }
14779
14526
  const s = spinner();
14780
14527
  if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
14781
- const maxTokens = aiConfig.extraction?.maxTokens ?? 8e3;
14782
- const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
14783
- let result;
14784
- const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
14785
- if (text$1 && totalTokens > maxTokens) {
14786
- if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
14787
- length: totalTokens,
14788
- limit: maxTokens
14789
- }));
14790
- const finalDocs = splitMarkdown(text$1, maxTokens, overlapTokens);
14791
- if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
14792
- let processedDocs = finalDocs;
14793
- if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
14794
- const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
14795
- const keywords = getSchemaKeywords(schemaLoad.schema);
14796
- const scoredChunks = finalDocs.map((doc, idx) => {
14797
- if (idx === 0) return {
14798
- index: idx,
14799
- score: Number.POSITIVE_INFINITY
14800
- };
14801
- let score = 0;
14802
- const docTextLower = doc.pageContent.toLowerCase();
14803
- for (const kw of keywords) {
14804
- let pos = docTextLower.indexOf(kw);
14805
- while (pos !== -1) {
14806
- score++;
14807
- pos = docTextLower.indexOf(kw, pos + kw.length);
14808
- }
14809
- }
14810
- return {
14811
- index: idx,
14812
- score
14813
- };
14814
- }).slice(1).sort((a, b) => b.score - a.score);
14815
- const selectedIndices = new Set([0]);
14816
- let keptCount = 0;
14817
- for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
14818
- selectedIndices.add(sc.index);
14819
- keptCount++;
14820
- }
14821
- processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
14822
- if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
14823
- original: finalDocs.length,
14824
- filtered: processedDocs.length
14825
- }));
14826
- }
14827
- const chunkResults = [];
14828
- const accumulatedTokens = {
14829
- prompt: 0,
14830
- completion: 0,
14831
- total: 0
14832
- };
14833
- let success = true;
14834
- let errorMsg = "";
14835
- const extractionTasks = processedDocs.map((doc, i) => {
14836
- return async () => {
14837
- if (!success) return;
14838
- const headings = [];
14839
- if (doc.metadata) {
14840
- if (doc.metadata.h1) headings.push(doc.metadata.h1);
14841
- if (doc.metadata.h2) headings.push(doc.metadata.h2);
14842
- if (doc.metadata.h3) headings.push(doc.metadata.h3);
14843
- if (doc.metadata.h4) headings.push(doc.metadata.h4);
14844
- }
14845
- let chunkText = doc.pageContent;
14846
- if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
14847
- const chunkResult = await extractStructuredData({
14848
- config: aiConfig,
14849
- schema: schemaLoad.schema,
14850
- text: chunkText,
14851
- aiexDir,
14852
- modelOverride,
14853
- onRetry(info) {
14854
- if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
14855
- current: i + 1,
14856
- total: processedDocs.length,
14857
- code: info.statusCode,
14858
- delay: info.delayMs / 1e3,
14859
- attempt: info.attempt,
14860
- max: info.maxRetries
14861
- }));
14862
- }
14863
- });
14864
- if (!chunkResult.success) {
14865
- success = false;
14866
- errorMsg = chunkResult.error || t("common.unknownError");
14867
- if (!options?.quiet) {
14868
- s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
14869
- consola.error(errorMsg);
14870
- }
14871
- return;
14872
- }
14873
- if (chunkResult.data) chunkResults.push(chunkResult.data);
14874
- if (chunkResult.tokensUsed) {
14875
- accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
14876
- accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
14877
- accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
14878
- }
14879
- };
14880
- });
14881
- const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
14882
- if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
14883
- current: 1,
14884
- total: processedDocs.length
14885
- }));
14886
- try {
14887
- await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
14888
- if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
14889
- current: idx + 1,
14890
- total: processedDocs.length
14891
- }));
14892
- await task();
14893
- });
14894
- } catch (e) {
14895
- success = false;
14896
- errorMsg = e instanceof Error ? e.message : String(e);
14897
- }
14898
- if (!success) return {
14899
- success: false,
14900
- error: errorMsg
14901
- };
14902
- const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
14903
- const validation = validateExtractedData(schemaLoad.schema, mergedData);
14904
- if (!validation.success) {
14905
- const valError = validation.error || "Merged data validation failed";
14906
- if (!options?.quiet) {
14907
- s.stop(t("command.extract.file.validationFail"));
14908
- consola.error(valError);
14909
- }
14910
- return {
14911
- success: false,
14912
- error: valError
14913
- };
14914
- }
14915
- const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
14916
- await fs.mkdir(outputDir, { recursive: true });
14917
- const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
14918
- const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
14919
- const finalMergedOutputPath = path.join(outputDir, outputFileName);
14920
- await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
14921
- result = {
14922
- success: true,
14923
- data: mergedData,
14924
- tokensUsed: accumulatedTokens,
14925
- outputPath: finalMergedOutputPath
14926
- };
14927
- } else result = await extractStructuredData({
14528
+ const result = await extractStructuredData({
14928
14529
  config: aiConfig,
14929
14530
  schema: schemaLoad.schema,
14930
14531
  text: text$1 ?? "",
@@ -14952,11 +14553,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14952
14553
  }
14953
14554
  if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
14954
14555
  if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
14955
- if (result.evidenceSummary && !options?.quiet) {
14956
- const summary = result.evidenceSummary;
14957
- const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
14958
- consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
14959
- }
14960
14556
  if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
14961
14557
  prompt: result.tokensUsed.prompt,
14962
14558
  completion: result.tokensUsed.completion,
@@ -14985,7 +14581,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
14985
14581
  outputPath: result.outputPath,
14986
14582
  data: result.data,
14987
14583
  tablesInserted: insertResult.tablesInserted,
14988
- evidenceSummary: result.evidenceSummary,
14989
14584
  tokensUsed: result.tokensUsed
14990
14585
  };
14991
14586
  } else {
@@ -15012,7 +14607,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
15012
14607
  success: true,
15013
14608
  outputPath: result.outputPath,
15014
14609
  data: result.data,
15015
- evidenceSummary: result.evidenceSummary,
15016
14610
  tokensUsed: result.tokensUsed
15017
14611
  };
15018
14612
  }
@@ -15125,7 +14719,6 @@ async function runAuditedExtraction(options) {
15125
14719
  outputName: updated.outputName,
15126
14720
  tablesInserted: updated.tablesInserted,
15127
14721
  notionPages: updated.notionPages,
15128
- evidenceSummary: r.evidenceSummary,
15129
14722
  tokensUsed: updated.tokensUsed,
15130
14723
  auditId: updated.id,
15131
14724
  fileHash
@@ -16253,7 +15846,6 @@ function aiRoutes(config) {
16253
15846
  //#endregion
16254
15847
  //#region src/core/data-service.ts
16255
15848
  const FILE_REGEX = /\.json$/;
16256
- const EVIDENCE_FILE_SUFFIX = ".evidence.json";
16257
15849
  const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
16258
15850
  const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
16259
15851
  const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
@@ -16269,24 +15861,6 @@ function getAuditNotionStatus(record) {
16269
15861
  if (record.status === "failed") return "failed";
16270
15862
  return "not_synced";
16271
15863
  }
16272
- async function readEvidenceSummary(extractedDir, outputName) {
16273
- const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
16274
- try {
16275
- const coverage = (await readFile(evidencePath))?.coverage;
16276
- if (!coverage || typeof coverage !== "object") return void 0;
16277
- return {
16278
- path: evidencePath,
16279
- fieldCount: Number(coverage.fieldCount) || 0,
16280
- evidenceCount: Number(coverage.evidenceCount) || 0,
16281
- foundCount: Number(coverage.foundCount) || 0,
16282
- missingCount: Number(coverage.missingCount) || 0,
16283
- inferredCount: Number(coverage.inferredCount) || 0,
16284
- issueCount: Number(coverage.issueCount) || 0
16285
- };
16286
- } catch {
16287
- return;
16288
- }
16289
- }
16290
15864
  async function getRowExtractionActions(aiexDir, tableName) {
16291
15865
  const actions = /* @__PURE__ */ new Map();
16292
15866
  const auditRecords = await listExtractionAuditRecords(aiexDir);
@@ -16314,7 +15888,7 @@ async function listExtractions(config) {
16314
15888
  const aiexDir = path.dirname(config.schemaPath);
16315
15889
  const extractedDir = path.join(aiexDir, "extracted");
16316
15890
  await fs.mkdir(extractedDir, { recursive: true });
16317
- const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
15891
+ const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
16318
15892
  const auditRecords = await listExtractionAuditRecords(aiexDir);
16319
15893
  const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
16320
15894
  const records = [];
@@ -16333,7 +15907,6 @@ async function listExtractions(config) {
16333
15907
  timestamp,
16334
15908
  fileSize: stat.size,
16335
15909
  modifiedAt: stat.mtime.toISOString(),
16336
- evidenceSummary: await readEvidenceSummary(extractedDir, file),
16337
15910
  notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
16338
15911
  notionPages,
16339
15912
  notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
@@ -16513,7 +16086,6 @@ async function retryNotionSync(config, fileName) {
16513
16086
 
16514
16087
  //#endregion
16515
16088
  //#region src/server/routes/data.ts
16516
- const JSON_FILE_SUFFIX_RE = /\.json$/;
16517
16089
  const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
16518
16090
  const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
16519
16091
  const tableQuerySchema = z.object({
@@ -16566,22 +16138,10 @@ function dataRoutes(config) {
16566
16138
  const filePath = path.join(extractedDir, name$1);
16567
16139
  try {
16568
16140
  const content = await fs.readFile(filePath, "utf-8");
16569
- const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
16570
- let evidenceSummary;
16571
- try {
16572
- const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
16573
- evidenceSummary = evidence?.coverage ? {
16574
- ...evidence.coverage,
16575
- path: evidencePath
16576
- } : void 0;
16577
- } catch {
16578
- evidenceSummary = void 0;
16579
- }
16580
16141
  return c.json({
16581
16142
  success: true,
16582
16143
  content,
16583
- name: name$1,
16584
- evidenceSummary
16144
+ name: name$1
16585
16145
  });
16586
16146
  } catch {
16587
16147
  return c.json({ error: t("server.extractionNotFound") }, 404);
@@ -16725,7 +16285,6 @@ function extractRoutes(config) {
16725
16285
  outputName: result.outputName,
16726
16286
  tablesInserted: result.tablesInserted,
16727
16287
  notionPages: result.notionPages,
16728
- evidenceSummary: result.evidenceSummary,
16729
16288
  tokensUsed: result.tokensUsed,
16730
16289
  auditId: result.auditId
16731
16290
  }, 200);
@@ -16793,7 +16352,6 @@ function extractRoutes(config) {
16793
16352
  outputName: result.outputName,
16794
16353
  tablesInserted: result.tablesInserted,
16795
16354
  notionPages: result.notionPages,
16796
- evidenceSummary: result.evidenceSummary,
16797
16355
  tokensUsed: result.tokensUsed,
16798
16356
  auditId: result.auditId
16799
16357
  }, 200);
@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
74
74
  //#endregion
75
75
  //#region package.json
76
76
  var name = "aiex-cli";
77
- var version = "0.0.5-beta.5";
77
+ var version = "0.0.6-beta.1";
78
78
  var description = "JSON Schema → SQLite with AI-powered data extraction";
79
79
  var package_default = {
80
80
  name,
@@ -158,11 +158,9 @@ var package_default = {
158
158
  "hono": "catalog:",
159
159
  "i18next": "catalog:",
160
160
  "i18next-fs-backend": "catalog:",
161
- "js-tiktoken": "catalog:",
162
161
  "jsonfile": "catalog:",
163
162
  "jsonrepair": "catalog:",
164
163
  "kysely": "catalog:",
165
- "marked": "catalog:",
166
164
  "mime": "catalog:",
167
165
  "open": "catalog:",
168
166
  "p-retry": "catalog:",
@@ -230,15 +228,7 @@ const PromptConfigSchema = z.object({
230
228
  systemTemplate: z.string().min(1),
231
229
  userTemplate: z.string().min(1)
232
230
  });
233
- const ExtractionConfigSchema = z.object({
234
- outputDir: z.string().min(1),
235
- mode: z.enum(["pipeline"]).default("pipeline").optional(),
236
- concurrency: z.number().int().min(1).optional(),
237
- maxTokens: z.number().int().positive().default(8e3).optional(),
238
- overlapSize: z.number().int().nonnegative().optional(),
239
- preFiltering: z.boolean().optional(),
240
- preFilteringLimit: z.number().int().min(1).optional()
241
- });
231
+ const ExtractionConfigSchema = z.object({ outputDir: z.string().min(1) });
242
232
  const ImageOcrConfigSchema = z.object({
243
233
  ocrFallback: z.enum([
244
234
  "auto",
@@ -345,10 +335,7 @@ Extraction requirements:
345
335
  userTemplate: `Please extract data from the following text:
346
336
  {text}`
347
337
  };
348
- const DEFAULT_EXTRACTION_CONFIG = {
349
- outputDir: ".aiex/extracted",
350
- mode: "pipeline"
351
- };
338
+ const DEFAULT_EXTRACTION_CONFIG = { outputDir: ".aiex/extracted" };
352
339
  const DEFAULT_IMAGE_OCR_CONFIG = {
353
340
  ocrFallback: "auto",
354
341
  ocrLanguages: "en-US, zh-Hans",
@@ -580,13 +567,6 @@ const en = {
580
567
  extractFail: "Extraction failed",
581
568
  extractComplete: "Extraction complete",
582
569
  extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
583
- chunking: "Input text ({{length}} tokens) exceeds limit ({{limit}} tokens). Splitting into chunks...",
584
- chunksCount: "Split into {{count}} chunk(s).",
585
- preFiltering: "Hybrid pre-filtering: selected {{filtered}} out of {{original}} chunks based on schema relevance.",
586
- extractingChunk: "Extracting chunk {{current}}/{{total}}...",
587
- extractRetryChunk: "Chunk {{current}}/{{total}} API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
588
- extractFailChunk: "Extraction failed for chunk {{current}}/{{total}}",
589
- validationFail: "Merged data validation failed",
590
570
  resultSaved: "Result saved: {{path}}",
591
571
  tokenUsage: "Token usage: prompt={{prompt}}, completion={{completion}}, total={{total}}",
592
572
  insertingDb: "Inserting into database...",
@@ -976,7 +956,7 @@ async function initI18n(lng) {
976
956
  fallbackLng: "en",
977
957
  resources: {
978
958
  "en": { translation: en },
979
- "zh-CN": { translation: await import("./zh-CN-Ca-Dv775.mjs").then((m) => m.zhCN) }
959
+ "zh-CN": { translation: await import("./zh-CN-Qcn0DHFh.mjs").then((m) => m.zhCN) }
980
960
  },
981
961
  interpolation: { escapeValue: false },
982
962
  returnNull: false