npm - aiex-cli - Versions diffs - 0.0.5-beta.4 → 0.0.5-beta.5 - Mend

aiex-cli 0.0.5-beta.4 → 0.0.5-beta.5

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (6) hide show

package/README.md +1 -1
package/dist/cli.mjs +162 -98
package/dist/{doctor-collector-Cv7RArla.mjs → doctor-collector-NTNBFeBw.mjs} +6 -3
package/dist/index.mjs +1 -1
package/dist/{zh-CN-CyL-61Ow.mjs → zh-CN-Ca-Dv775.mjs} +1 -1
package/package.json +3 -1

package/README.md CHANGED Viewed

@@ -206,7 +206,7 @@ aiex completion fish | source
 When processing very large documents (exceeding `40,000` characters), `aiex` runs an optimized **Pipeline Mode** to handle context window limits and control API costs:
-- **Sliding Window & Overlapping Slices**: Splits the document logically at Markdown headings or paragraph boundaries. It uses an overlapping sliding window to ensure contextual continuity at slice boundaries. Active heading hierarchies are tracked and prepended to each chunk as context.
+- **Token-Aware AST Splitting**: Parses structural Markdown elements (headings, paragraphs, lists) using an AST-based parser (`marked.lexer`) and splits them using precise token counters (`js-tiktoken`). Active heading hierarchies are tracked and prepended to each chunk as context. Tables and code blocks are kept intact (atomic blocks) to avoid syntax corruption.
 - **Concurrency Limiting**: To respect strict model rate limits, chunk extractions are processed in parallel with a strict concurrency limit (capped at 2 concurrent requests).
 - **Pre-filtering**: Integrates hybrid search-based pre-filtering to score and select only the most relevant document chunks based on schema queries, preventing unnecessary token usage on unrelated sections.
 - **Recursive Merging**: The final extracted JSON objects from each chunk are recursively merged, concatenating lists and deduplicating primitive fields.

package/dist/cli.mjs CHANGED Viewed

@@ -1,4 +1,4 @@
-import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-Cv7RArla.mjs";
+import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-NTNBFeBw.mjs";
 import { createRequire } from "node:module";
 import fs from "node:fs/promises";
 import os from "node:os";
@@ -17,6 +17,7 @@ import Database from "better-sqlite3";
 import pc from "picocolors";
 import { Buffer } from "node:buffer";
 import * as XLSX from "xlsx";
+import { getEncoding } from "js-tiktoken";
 import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
 import { APICallError, Output, generateText, jsonSchema } from "ai";
 import pRetry from "p-retry";
@@ -24,6 +25,7 @@ import mime from "mime";
 import { jsonrepair } from "jsonrepair";
 import { LangfuseSpanProcessor } from "@langfuse/otel";
 import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { marked } from "marked";
 import crypto from "node:crypto";
 import { Client, extractNotionId } from "@notionhq/client";
 import { execa } from "execa";
@@ -13559,112 +13561,171 @@ function mergeExtractionResults(schema, results) {
 //#endregion
 //#region src/core/ai-extraction/text-splitter.ts
-const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
+const encoding$1 = getEncoding("cl100k_base");
+function countTokens(text$1) {
+	return encoding$1.encode(text$1).length;
+}
+function formatHeadingContext(headings) {
+	const active = headings.filter(Boolean);
+	if (active.length === 0) return "";
+	return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
+}
+function getMetadata(headings) {
+	return {
+		h1: headings[0] || void 0,
+		h2: headings[1] || void 0,
+		h3: headings[2] || void 0,
+		h4: headings[3] || void 0
+	};
+}
 /**
-* Splits a Markdown document into chunks based on header hierarchy.
-* Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
-* when a section exceeds the maxSize limit.
+* Splits text recursively using a list of separators.
+* Preserves the separators when re-joining.
 */
-function splitMarkdown(text$1, maxSize = 4e4, overlapSize = 0) {
-	const lines = text$1.split("\n");
-	const chunks = [];
-	let currentHeadings = [];
-	let currentChunkLines = [];
-	let currentSize = 0;
-	let hasNewLines = false;
-	const getMetadata = (headings) => {
-		return {
-			h1: headings[0] || void 0,
-			h2: headings[1] || void 0,
-			h3: headings[2] || void 0,
-			h4: headings[3] || void 0
-		};
-	};
-	const flushChunk = (isHeadingChange = false) => {
-		if (currentChunkLines.length === 0 || !hasNewLines) {
-			currentChunkLines = [];
-			currentSize = 0;
-			hasNewLines = false;
-			return;
-		}
-		const pageContent = currentChunkLines.join("\n");
-		let lastChunkContent = "";
-		if (pageContent.length > maxSize) {
-			const paragraphs = pageContent.split("\n\n");
-			let subLines = [];
-			let subSize = 0;
-			for (const para of paragraphs) {
-				const paraSize = para.length;
-				if (subSize + paraSize > maxSize && subLines.length > 0) {
-					const content = subLines.join("\n\n");
-					chunks.push({
-						pageContent: content,
-						metadata: getMetadata(currentHeadings)
-					});
-					const overlapParas = [];
-					let currentOverlapSize = 0;
-					for (let j = subLines.length - 1; j >= 0; j--) {
-						const p = subLines[j];
-						if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
-						overlapParas.unshift(p);
-						currentOverlapSize += p.length + 2;
-					}
-					subLines = [...overlapParas];
-					subSize = currentOverlapSize;
-				}
-				subLines.push(para);
-				subSize += paraSize + 2;
+function splitTextRecursively(text$1, maxTokens, separators = [
+	"\n\n",
+	"\n",
+	"。",
+	". ",
+	" "
+]) {
+	if (countTokens(text$1) <= maxTokens) return [text$1];
+	if (separators.length === 0) {
+		const chunks = [];
+		let current = "";
+		for (const char of text$1) if (countTokens(current + char) > maxTokens) {
+			chunks.push(current);
+			current = char;
+		} else current += char;
+		if (current) chunks.push(current);
+		return chunks;
+	}
+	const separator = separators[0];
+	const nextSeparators = separators.slice(1);
+	const parts = text$1.split(separator);
+	const result = [];
+	let currentChunk = [];
+	let currentChunkTokens = 0;
+	for (let i = 0; i < parts.length; i++) {
+		const part = parts[i];
+		const itemText = part + (i < parts.length - 1 ? separator : "");
+		const partTokens = countTokens(itemText);
+		if (partTokens > maxTokens) {
+			if (currentChunk.length > 0) {
+				result.push(currentChunk.join(""));
+				currentChunk = [];
+				currentChunkTokens = 0;
 			}
-			if (subLines.length > 0) {
-				const content = subLines.join("\n\n");
-				chunks.push({
-					pageContent: content,
-					metadata: getMetadata(currentHeadings)
-				});
-				lastChunkContent = content;
+			const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
+			for (let j = 0; j < subParts.length; j++) {
+				const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
+				result.push(finalSub);
 			}
+		} else if (currentChunkTokens + partTokens > maxTokens) {
+			result.push(currentChunk.join(""));
+			currentChunk = [itemText];
+			currentChunkTokens = partTokens;
 		} else {
-			chunks.push({
-				pageContent,
-				metadata: getMetadata(currentHeadings)
-			});
-			lastChunkContent = pageContent;
+			currentChunk.push(itemText);
+			currentChunkTokens += partTokens;
 		}
-		if (!isHeadingChange && lastChunkContent && overlapSize > 0) {
-			const paragraphs = lastChunkContent.split("\n\n");
-			const overlapParas = [];
-			let currentOverlapSize = 0;
-			for (let j = paragraphs.length - 1; j >= 0; j--) {
-				const p = paragraphs[j];
-				if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
-				overlapParas.unshift(p);
-				currentOverlapSize += p.length + 2;
-			}
-			const overlapText = overlapParas.join("\n\n");
-			currentChunkLines = overlapText.split("\n");
-			currentSize = overlapText.length;
+	}
+	if (currentChunk.length > 0) result.push(currentChunk.join(""));
+	return result;
+}
+/**
+* Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
+* Protects tables, list items, and code blocks from being broken.
+*/
+function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
+	const tokens = marked.lexer(text$1);
+	const chunks = [];
+	let currentHeadings = [];
+	let currentChunkList = [];
+	let accumulatedTokens = 0;
+	const flushCurrentChunk = (isHeadingChange = false) => {
+		if (currentChunkList.length === 0) return;
+		const pageContent = currentChunkList.map((item) => item.text).join("");
+		const firstHeadings = currentChunkList[0].headings;
+		chunks.push({
+			pageContent,
+			metadata: getMetadata(firstHeadings)
+		});
+		if (isHeadingChange || overlapTokens <= 0) {
+			currentChunkList = [];
+			accumulatedTokens = 0;
 		} else {
-			currentChunkLines = [];
-			currentSize = 0;
+			const overlapItems = [];
+			let currentOverlapTokens = 0;
+			for (let i = currentChunkList.length - 1; i >= 0; i--) {
+				const item = currentChunkList[i];
+				const itemTokens = countTokens(item.text);
+				if (currentOverlapTokens + itemTokens > overlapTokens && overlapItems.length > 0) break;
+				overlapItems.unshift(item);
+				currentOverlapTokens += itemTokens;
+			}
+			currentChunkList = [...overlapItems];
+			accumulatedTokens = currentOverlapTokens;
 		}
-		hasNewLines = false;
 	};
-	for (const line of lines) {
-		const headingMatch = line.match(HEADING_RE);
-		if (headingMatch) {
-			flushChunk(true);
-			const depth = headingMatch[1].length;
-			const title = headingMatch[2].trim();
+	for (const token of tokens) {
+		if (token.type === "space") {
+			if (currentChunkList.length > 0) {
+				currentChunkList[currentChunkList.length - 1].text += token.raw;
+				accumulatedTokens += countTokens(token.raw);
+			}
+			continue;
+		}
+		if (token.type === "heading") {
+			flushCurrentChunk(true);
+			const depth = token.depth;
+			const title = token.text.trim();
 			currentHeadings = currentHeadings.slice(0, depth - 1);
 			currentHeadings[depth - 1] = title;
 		}
-		currentChunkLines.push(line);
-		currentSize += line.length + 1;
-		hasNewLines = true;
-		if (currentSize > maxSize) flushChunk(false);
+		const rawText = token.raw;
+		if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
+		else {
+			const isAtomic = token.type === "table" || token.type === "code";
+			processTextBlock(rawText, currentHeadings, isAtomic);
+		}
 	}
-	flushChunk(true);
+	flushCurrentChunk(true);
 	return chunks;
+	function processTextBlock(blockText, headings, isAtomic = false) {
+		const blockTokens = countTokens(blockText);
+		const contextTokens = countTokens(formatHeadingContext(headings));
+		const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
+		const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
+		if (blockTokens > budgetLimit) if (isAtomic) {
+			flushCurrentChunk(false);
+			currentChunkList.push({
+				text: blockText,
+				headings: [...headings]
+			});
+			accumulatedTokens = blockTokens;
+			flushCurrentChunk(false);
+		} else {
+			flushCurrentChunk(false);
+			const subBlocks = splitTextRecursively(blockText, budgetLimit);
+			for (const sub of subBlocks) {
+				currentChunkList.push({
+					text: sub,
+					headings: [...headings]
+				});
+				accumulatedTokens += countTokens(sub);
+				if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
+			}
+		}
+		else {
+			if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
+			currentChunkList.push({
+				text: blockText,
+				headings: [...headings]
+			});
+			accumulatedTokens += blockTokens;
+		}
+	}
 }
 //#endregion
@@ -14608,6 +14669,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
 //#endregion
 //#region src/core/extract-runner.ts
+const encoding = getEncoding("cl100k_base");
 const JSON_EXT_RE$1 = /\.json$/;
 async function limitConcurrency(concurrency, items, fn) {
 	const results = Array.from({ length: items.length });
@@ -14716,14 +14778,16 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
 	}
 	const s = spinner();
 	if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
-	const CHUNK_LIMIT = 4e4;
+	const maxTokens = aiConfig.extraction?.maxTokens ?? 8e3;
+	const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
 	let result;
-	if (text$1 && text$1.length > CHUNK_LIMIT) {
+	const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
+	if (text$1 && totalTokens > maxTokens) {
 		if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
-			length: text$1.length,
-			limit: CHUNK_LIMIT
+			length: totalTokens,
+			limit: maxTokens
 		}));
-		const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT, aiConfig.extraction?.overlapSize ?? 2e3);
+		const finalDocs = splitMarkdown(text$1, maxTokens, overlapTokens);
 		if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
 		let processedDocs = finalDocs;
 		if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {

package/dist/{doctor-collector-Cv7RArla.mjs → doctor-collector-NTNBFeBw.mjs} RENAMED Viewed

@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
 //#endregion
 //#region package.json
 var name = "aiex-cli";
-var version = "0.0.5-beta.4";
+var version = "0.0.5-beta.5";
 var description = "JSON Schema → SQLite with AI-powered data extraction";
 var package_default = {
 	name,
@@ -158,9 +158,11 @@ var package_default = {
 		"hono": "catalog:",
 		"i18next": "catalog:",
 		"i18next-fs-backend": "catalog:",
+		"js-tiktoken": "catalog:",
 		"jsonfile": "catalog:",
 		"jsonrepair": "catalog:",
 		"kysely": "catalog:",
+		"marked": "catalog:",
 		"mime": "catalog:",
 		"open": "catalog:",
 		"p-retry": "catalog:",
@@ -232,6 +234,7 @@ const ExtractionConfigSchema = z.object({
 	outputDir: z.string().min(1),
 	mode: z.enum(["pipeline"]).default("pipeline").optional(),
 	concurrency: z.number().int().min(1).optional(),
+	maxTokens: z.number().int().positive().default(8e3).optional(),
 	overlapSize: z.number().int().nonnegative().optional(),
 	preFiltering: z.boolean().optional(),
 	preFilteringLimit: z.number().int().min(1).optional()
@@ -577,7 +580,7 @@ const en = {
 				extractFail: "Extraction failed",
 				extractComplete: "Extraction complete",
 				extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
-				chunking: "Input text length ({{length}} chars) exceeds limit ({{limit}} chars). Splitting into chunks...",
+				chunking: "Input text ({{length}} tokens) exceeds limit ({{limit}} tokens). Splitting into chunks...",
 				chunksCount: "Split into {{count}} chunk(s).",
 				preFiltering: "Hybrid pre-filtering: selected {{filtered}} out of {{original}} chunks based on schema relevance.",
 				extractingChunk: "Extracting chunk {{current}}/{{total}}...",
@@ -973,7 +976,7 @@ async function initI18n(lng) {
 			fallbackLng: "en",
 			resources: {
 				"en": { translation: en },
-				"zh-CN": { translation: await import("./zh-CN-CyL-61Ow.mjs").then((m) => m.zhCN) }
+				"zh-CN": { translation: await import("./zh-CN-Ca-Dv775.mjs").then((m) => m.zhCN) }
 			},
 			interpolation: { escapeValue: false },
 			returnNull: false

package/dist/index.mjs CHANGED Viewed

@@ -1,3 +1,3 @@
-import { A as doctorDiagnosticsTableRows, a as parseJsonSchema, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, k as buildDoctorDiagnostics, n as createMigrationConfig, r as generateDrizzleConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics } from "./doctor-collector-Cv7RArla.mjs";
+import { A as doctorDiagnosticsTableRows, a as parseJsonSchema, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, k as buildDoctorDiagnostics, n as createMigrationConfig, r as generateDrizzleConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics } from "./doctor-collector-NTNBFeBw.mjs";
 export { JsonSchemaDefinitionSchema, buildDoctorDiagnostics, collectDoctorDiagnostics, createMigrationConfig, doctorDiagnosticsTableRows, formatDoctorDiagnosticsJson, generateDrizzleConfig, generateDrizzleSchema, parseJsonSchema };

package/dist/{zh-CN-CyL-61Ow.mjs → zh-CN-Ca-Dv775.mjs} RENAMED Viewed

@@ -126,7 +126,7 @@ const zhCN = {
 				extractFail: "抽取失败",
 				extractComplete: "抽取完成",
 				extractRetry: "API 返回 {{code}}，{{delay}} 秒后重试（{{attempt}}/{{max}}）",
-				chunking: "输入文本长度 ({{length}} 字符) 超过限制 ({{limit}} 字符)。正在拆分为多个切片...",
+				chunking: "输入文本 ({{length}} tokens) 超过限制 ({{limit}} tokens)。正在拆分为多个切片...",
 				chunksCount: "已拆分为 {{count}} 个切片。",
 				preFiltering: "混合预过滤：根据 Schema 相关性筛选保留了 {{filtered}} / {{original}} 个切片。",
 				extractingChunk: "正在提取切片 {{current}}/{{total}}...",

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "aiex-cli",
   "type": "module",
-  "version": "0.0.5-beta.4",
+  "version": "0.0.5-beta.5",
   "description": "JSON Schema → SQLite with AI-powered data extraction",
   "author": "OSpoon <zxin088@gmail.com>",
   "license": "MIT",
@@ -68,9 +68,11 @@
     "hono": "^4.0.0",
     "i18next": "^26.2.0",
     "i18next-fs-backend": "^2.6.6",
+    "js-tiktoken": "^1.0.21",
     "jsonfile": "^6.2.1",
     "jsonrepair": "^3.14.0",
     "kysely": "^0.29.2",
+    "marked": "^12.0.1",
     "mime": "^4.1.0",
     "open": "^11.0.0",
     "p-retry": "^7.1.0",