aiex-cli 0.0.5-beta.4 → 0.0.5-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -206,7 +206,7 @@ aiex completion fish | source
|
|
|
206
206
|
|
|
207
207
|
When processing very large documents (exceeding the configured `maxTokens` limit, `8,000` tokens by default), `aiex` runs an optimized **Pipeline Mode** to handle context window limits and control API costs:
|
|
208
208
|
|
|
209
|
-
- **
|
|
209
|
+
- **Token-Aware AST Splitting**: Parses structural Markdown elements (headings, paragraphs, lists) using an AST-based parser (`marked.lexer`) and splits them using precise token counters (`js-tiktoken`). Active heading hierarchies are tracked and prepended to each chunk as context. Tables and code blocks are kept intact (atomic blocks) to avoid syntax corruption.
|
|
210
210
|
- **Concurrency Limiting**: To respect strict model rate limits, chunk extractions are processed in parallel with a strict concurrency limit (capped at 2 concurrent requests).
|
|
211
211
|
- **Pre-filtering**: Integrates hybrid search-based pre-filtering to score and select only the most relevant document chunks based on schema queries, preventing unnecessary token usage on unrelated sections.
|
|
212
212
|
- **Recursive Merging**: The final extracted JSON objects from each chunk are recursively merged, concatenating lists and deduplicating primitive fields.
|
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-
|
|
1
|
+
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-NTNBFeBw.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -17,6 +17,7 @@ import Database from "better-sqlite3";
|
|
|
17
17
|
import pc from "picocolors";
|
|
18
18
|
import { Buffer } from "node:buffer";
|
|
19
19
|
import * as XLSX from "xlsx";
|
|
20
|
+
import { getEncoding } from "js-tiktoken";
|
|
20
21
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
21
22
|
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
22
23
|
import pRetry from "p-retry";
|
|
@@ -24,6 +25,7 @@ import mime from "mime";
|
|
|
24
25
|
import { jsonrepair } from "jsonrepair";
|
|
25
26
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
26
27
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
28
|
+
import { marked } from "marked";
|
|
27
29
|
import crypto from "node:crypto";
|
|
28
30
|
import { Client, extractNotionId } from "@notionhq/client";
|
|
29
31
|
import { execa } from "execa";
|
|
@@ -13559,112 +13561,171 @@ function mergeExtractionResults(schema, results) {
|
|
|
13559
13561
|
|
|
13560
13562
|
//#endregion
|
|
13561
13563
|
//#region src/core/ai-extraction/text-splitter.ts
|
|
13562
|
-
const
|
|
13564
|
+
const encoding$1 = getEncoding("cl100k_base");
|
|
13565
|
+
/**
 * Counts how many tokens `text$1` occupies under the module-level
 * `encoding$1` tokenizer.
 *
 * The two empty lists are passed explicitly as `allowedSpecial` and
 * `disallowedSpecial`: with the library defaults, `encode()` throws as soon as
 * the input contains a special-token literal such as `<|endoftext|>`, which
 * can legitimately appear in user documents. With both lists empty such
 * literals are simply tokenized as ordinary text.
 *
 * @param {string} text$1 - Text to measure.
 * @returns {number} Number of tokens in the text.
 */
function countTokens(text$1) {
	return encoding$1.encode(text$1, [], []).length;
}
|
|
13568
|
+
/**
 * Builds the Markdown blockquote line that names the heading trail a chunk
 * belongs to.
 *
 * @param {Array<string|undefined>} headings - Heading titles indexed by depth; empty or missing levels are skipped.
 * @returns {string} The context line followed by a blank line, or "" when no heading is set.
 */
function formatHeadingContext(headings) {
	const titles = headings.filter((title) => Boolean(title));
	if (titles.length > 0) {
		return "> **[Context]** Belong to: " + titles.join(" > ") + "\n\n";
	}
	return "";
}
|
|
13573
|
+
/**
 * Maps the first four heading levels onto the `h1`..`h4` metadata fields.
 *
 * @param {Array<string|undefined>} headings - Heading titles indexed by depth (index 0 is level 1).
 * @returns {{h1: (string|undefined), h2: (string|undefined), h3: (string|undefined), h4: (string|undefined)}}
 *   Falsy or missing levels are reported as `undefined`.
 */
function getMetadata(headings) {
	const titleAt = (level) => {
		const title = headings[level - 1];
		return title ? title : undefined;
	};
	return {
		h1: titleAt(1),
		h2: titleAt(2),
		h3: titleAt(3),
		h4: titleAt(4)
	};
}
|
|
13563
13581
|
/**
|
|
13564
|
-
* Splits
|
|
13565
|
-
*
|
|
13566
|
-
* when a section exceeds the maxSize limit.
|
|
13582
|
+
* Splits text recursively using a list of separators.
|
|
13583
|
+
* Preserves the separators when re-joining.
|
|
13567
13584
|
*/
|
|
13568
|
-
function
|
|
13569
|
-
|
|
13570
|
-
|
|
13571
|
-
|
|
13572
|
-
|
|
13573
|
-
|
|
13574
|
-
|
|
13575
|
-
|
|
13576
|
-
|
|
13577
|
-
|
|
13578
|
-
|
|
13579
|
-
|
|
13580
|
-
|
|
13581
|
-
|
|
13582
|
-
|
|
13583
|
-
|
|
13584
|
-
|
|
13585
|
-
|
|
13586
|
-
|
|
13587
|
-
|
|
13588
|
-
|
|
13589
|
-
|
|
13590
|
-
|
|
13591
|
-
|
|
13592
|
-
|
|
13593
|
-
|
|
13594
|
-
|
|
13595
|
-
|
|
13596
|
-
|
|
13597
|
-
|
|
13598
|
-
|
|
13599
|
-
|
|
13600
|
-
|
|
13601
|
-
pageContent: content,
|
|
13602
|
-
metadata: getMetadata(currentHeadings)
|
|
13603
|
-
});
|
|
13604
|
-
const overlapParas = [];
|
|
13605
|
-
let currentOverlapSize = 0;
|
|
13606
|
-
for (let j = subLines.length - 1; j >= 0; j--) {
|
|
13607
|
-
const p = subLines[j];
|
|
13608
|
-
if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
|
|
13609
|
-
overlapParas.unshift(p);
|
|
13610
|
-
currentOverlapSize += p.length + 2;
|
|
13611
|
-
}
|
|
13612
|
-
subLines = [...overlapParas];
|
|
13613
|
-
subSize = currentOverlapSize;
|
|
13614
|
-
}
|
|
13615
|
-
subLines.push(para);
|
|
13616
|
-
subSize += paraSize + 2;
|
|
13585
|
+
/**
 * Last-resort splitter used when no separator is left: cuts `text` into runs
 * of whole code points that each fit into `maxTokens`.
 *
 * The end of every run is located with a binary search over the prefix length
 * instead of re-tokenizing the growing buffer once per character.
 *
 * @param {string} text - Text that contains none of the usable separators.
 * @param {number} maxTokens - Token budget per piece.
 * @returns {string[]} Non-empty pieces whose concatenation equals `text`.
 */
function sliceByTokenBudget(text, maxTokens) {
	const codePoints = Array.from(text);
	const pieces = [];
	let start = 0;
	while (start < codePoints.length) {
		// Always consume at least one code point so the loop makes progress even
		// when a single character alone is over budget.
		let low = start + 1;
		let high = codePoints.length;
		while (low < high) {
			const middle = Math.ceil((low + high) / 2);
			if (countTokens(codePoints.slice(start, middle).join("")) <= maxTokens) low = middle;
			else high = middle - 1;
		}
		pieces.push(codePoints.slice(start, low).join(""));
		start = low;
	}
	return pieces;
}
/**
 * Splits text recursively using a list of separators, from coarsest to finest.
 * Separators stay attached to the end of the part they terminate, so joining
 * the returned pieces with "" reproduces the input.
 *
 * Neighbouring parts are packed greedily into one piece while the sum of their
 * token counts stays within `maxTokens`. A part that is too large on its own
 * is split again with the remaining separators; once none are left it is cut
 * by code points.
 *
 * @param {string} text$1 - Text to split.
 * @param {number} maxTokens - Token budget per returned piece.
 * @param {string[]} [separators] - Separators to try, in order.
 * @returns {string[]} Pieces in document order. Text that already fits is
 *   returned unchanged as a single-element array.
 */
function splitTextRecursively(text$1, maxTokens, separators = [
	"\n\n",
	"\n",
	"。",
	". ",
	" "
]) {
	if (countTokens(text$1) <= maxTokens) return [text$1];
	if (separators.length === 0) return sliceByTokenBudget(text$1, maxTokens);
	const [separator, ...nextSeparators] = separators;
	const parts = text$1.split(separator);
	const result = [];
	let currentChunk = [];
	let currentChunkTokens = 0;
	const flushPending = () => {
		if (currentChunk.length === 0) return;
		result.push(currentChunk.join(""));
		currentChunk = [];
		currentChunkTokens = 0;
	};
	for (let i = 0; i < parts.length; i++) {
		const itemText = i < parts.length - 1 ? parts[i] + separator : parts[i];
		// A trailing separator leaves an empty final part; emitting it would only
		// create an empty piece.
		if (itemText === "") continue;
		const partTokens = countTokens(itemText);
		if (partTokens > maxTokens) {
			flushPending();
			// Recurse on the part *including* its separator so the separator counts
			// against the budget instead of being glued on afterwards.
			for (const piece of splitTextRecursively(itemText, maxTokens, nextSeparators)) result.push(piece);
			continue;
		}
		if (currentChunkTokens + partTokens > maxTokens) flushPending();
		currentChunk.push(itemText);
		currentChunkTokens += partTokens;
	}
	flushPending();
	return result;
}
|
|
13636
|
+
/**
 * Splits a Markdown document into chunks based on heading contexts, AST block
 * parsing (`marked.lexer`) and token limits.
 *
 * - A heading always closes the current chunk; its title becomes part of the
 *   heading trail recorded in each following chunk's metadata.
 * - Tables and code blocks are atomic: when one does not fit the budget it is
 *   emitted unsplit as a chunk of its own.
 * - Any other block that does not fit is cut with `splitTextRecursively`; a
 *   list larger than `maxTokens` is processed item by item.
 * - When a chunk is closed because it is full, trailing blocks totalling at
 *   most `overlapTokens` are repeated at the start of the next chunk. A block
 *   larger than that budget is never repeated, and the overlap is dropped if
 *   it would leave no room for the next block.
 *
 * @param {string} text$1 - Markdown source.
 * @param {number} [maxTokens=8000] - Token budget per chunk (heading context included).
 * @param {number} [overlapTokens=1000] - Maximum tokens repeated between consecutive chunks; 0 disables overlap.
 * @returns {Array<{pageContent: string, metadata: object}>} Chunks in document order.
 */
function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
	const tokens = marked.lexer(text$1);
	const chunks = [];
	let currentHeadings = [];
	let currentChunkList = [];
	let accumulatedTokens = 0;
	// Number of leading items in currentChunkList that were carried over as
	// overlap, i.e. that are already part of the previously emitted chunk.
	let carriedCount = 0;
	const resetCurrentChunk = () => {
		currentChunkList = [];
		accumulatedTokens = 0;
		carriedCount = 0;
	};
	const flushCurrentChunk = (isHeadingChange = false) => {
		if (currentChunkList.length > carriedCount) chunks.push({
			pageContent: currentChunkList.map((item) => item.text).join(""),
			metadata: getMetadata(currentChunkList[0].headings)
		});
		// Only carried overlap is pending: emitting it would duplicate content.
		// Keep it for the next block unless a new section starts.
		else if (!isHeadingChange) return;
		if (isHeadingChange || overlapTokens <= 0) {
			resetCurrentChunk();
			return;
		}
		const overlapItems = [];
		let currentOverlapTokens = 0;
		for (let i = currentChunkList.length - 1; i >= 0; i--) {
			const item = currentChunkList[i];
			const itemTokens = countTokens(item.text);
			// Strict budget: an item that does not fit is never carried, otherwise an
			// over-sized block would be repeated in full inside the next chunk.
			if (currentOverlapTokens + itemTokens > overlapTokens) break;
			overlapItems.unshift(item);
			currentOverlapTokens += itemTokens;
		}
		currentChunkList = overlapItems;
		accumulatedTokens = currentOverlapTokens;
		carriedCount = overlapItems.length;
	};
	// Adds one item to the pending chunk, closing the chunk first when the item
	// would push it past maxTokens.
	const appendItem = (itemText, headings, itemTokens, contextTokens) => {
		const overflows = () => accumulatedTokens + itemTokens + contextTokens > maxTokens;
		if (currentChunkList.length > 0 && overflows()) {
			flushCurrentChunk(false);
			// Even the carried overlap leaves no room: drop it rather than exceed the limit.
			if (currentChunkList.length > 0 && overflows()) resetCurrentChunk();
		}
		currentChunkList.push({
			text: itemText,
			headings: [...headings]
		});
		accumulatedTokens += itemTokens;
	};
	const processTextBlock = (blockText, headings, isAtomic = false) => {
		const blockTokens = countTokens(blockText);
		const contextTokens = countTokens(formatHeadingContext(headings));
		const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
		const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
		if (blockTokens <= budgetLimit) {
			appendItem(blockText, headings, blockTokens, contextTokens);
			return;
		}
		flushCurrentChunk(false);
		if (isAtomic) {
			// The block is over budget by itself: give it its own chunk, with no
			// overlap in front of it and without carrying it into the next chunk.
			resetCurrentChunk();
			chunks.push({
				pageContent: blockText,
				metadata: getMetadata(headings)
			});
			return;
		}
		for (const sub of splitTextRecursively(blockText, budgetLimit)) appendItem(sub, headings, countTokens(sub), contextTokens);
	};
	for (const token of tokens) {
		if (token.type === "space") {
			// Blank lines are attached to the preceding block so the text stays lossless.
			const lastItem = currentChunkList[currentChunkList.length - 1];
			if (lastItem) {
				lastItem.text += token.raw;
				accumulatedTokens += countTokens(token.raw);
			}
			continue;
		}
		if (token.type === "heading") {
			flushCurrentChunk(true);
			const depth = token.depth;
			// Entering a heading discards every deeper level of the previous trail.
			currentHeadings = currentHeadings.slice(0, depth - 1);
			currentHeadings[depth - 1] = token.text.trim();
		}
		const rawText = token.raw;
		if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
		else {
			const isAtomic = token.type === "table" || token.type === "code";
			processTextBlock(rawText, currentHeadings, isAtomic);
		}
	}
	flushCurrentChunk(true);
	return chunks;
}
|
|
13669
13730
|
|
|
13670
13731
|
//#endregion
|
|
@@ -14608,6 +14669,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14608
14669
|
|
|
14609
14670
|
//#endregion
|
|
14610
14671
|
//#region src/core/extract-runner.ts
|
|
14672
|
+
const encoding = getEncoding("cl100k_base");
|
|
14611
14673
|
const JSON_EXT_RE$1 = /\.json$/;
|
|
14612
14674
|
async function limitConcurrency(concurrency, items, fn) {
|
|
14613
14675
|
const results = Array.from({ length: items.length });
|
|
@@ -14716,14 +14778,16 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14716
14778
|
}
|
|
14717
14779
|
const s = spinner();
|
|
14718
14780
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
14719
|
-
const
|
|
14781
|
+
const maxTokens = aiConfig.extraction?.maxTokens ?? 8e3;
|
|
14782
|
+
const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
|
|
14720
14783
|
let result;
|
|
14721
|
-
|
|
14784
|
+
const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
|
|
14785
|
+
if (text$1 && totalTokens > maxTokens) {
|
|
14722
14786
|
if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
14723
|
-
length:
|
|
14724
|
-
limit:
|
|
14787
|
+
length: totalTokens,
|
|
14788
|
+
limit: maxTokens
|
|
14725
14789
|
}));
|
|
14726
|
-
const finalDocs = splitMarkdown(text$1,
|
|
14790
|
+
const finalDocs = splitMarkdown(text$1, maxTokens, overlapTokens);
|
|
14727
14791
|
if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
|
|
14728
14792
|
let processedDocs = finalDocs;
|
|
14729
14793
|
if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
|
|
@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
|
|
|
74
74
|
//#endregion
|
|
75
75
|
//#region package.json
|
|
76
76
|
var name = "aiex-cli";
|
|
77
|
-
var version = "0.0.5-beta.
|
|
77
|
+
var version = "0.0.5-beta.5";
|
|
78
78
|
var description = "JSON Schema → SQLite with AI-powered data extraction";
|
|
79
79
|
var package_default = {
|
|
80
80
|
name,
|
|
@@ -158,9 +158,11 @@ var package_default = {
|
|
|
158
158
|
"hono": "catalog:",
|
|
159
159
|
"i18next": "catalog:",
|
|
160
160
|
"i18next-fs-backend": "catalog:",
|
|
161
|
+
"js-tiktoken": "catalog:",
|
|
161
162
|
"jsonfile": "catalog:",
|
|
162
163
|
"jsonrepair": "catalog:",
|
|
163
164
|
"kysely": "catalog:",
|
|
165
|
+
"marked": "catalog:",
|
|
164
166
|
"mime": "catalog:",
|
|
165
167
|
"open": "catalog:",
|
|
166
168
|
"p-retry": "catalog:",
|
|
@@ -232,6 +234,7 @@ const ExtractionConfigSchema = z.object({
|
|
|
232
234
|
outputDir: z.string().min(1),
|
|
233
235
|
mode: z.enum(["pipeline"]).default("pipeline").optional(),
|
|
234
236
|
concurrency: z.number().int().min(1).optional(),
|
|
237
|
+
maxTokens: z.number().int().positive().default(8e3).optional(),
|
|
235
238
|
overlapSize: z.number().int().nonnegative().optional(),
|
|
236
239
|
preFiltering: z.boolean().optional(),
|
|
237
240
|
preFilteringLimit: z.number().int().min(1).optional()
|
|
@@ -577,7 +580,7 @@ const en = {
|
|
|
577
580
|
extractFail: "Extraction failed",
|
|
578
581
|
extractComplete: "Extraction complete",
|
|
579
582
|
extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
|
|
580
|
-
chunking: "Input text
|
|
583
|
+
chunking: "Input text ({{length}} tokens) exceeds limit ({{limit}} tokens). Splitting into chunks...",
|
|
581
584
|
chunksCount: "Split into {{count}} chunk(s).",
|
|
582
585
|
preFiltering: "Hybrid pre-filtering: selected {{filtered}} out of {{original}} chunks based on schema relevance.",
|
|
583
586
|
extractingChunk: "Extracting chunk {{current}}/{{total}}...",
|
|
@@ -973,7 +976,7 @@ async function initI18n(lng) {
|
|
|
973
976
|
fallbackLng: "en",
|
|
974
977
|
resources: {
|
|
975
978
|
"en": { translation: en },
|
|
976
|
-
"zh-CN": { translation: await import("./zh-CN-
|
|
979
|
+
"zh-CN": { translation: await import("./zh-CN-Ca-Dv775.mjs").then((m) => m.zhCN) }
|
|
977
980
|
},
|
|
978
981
|
interpolation: { escapeValue: false },
|
|
979
982
|
returnNull: false
|
package/dist/index.mjs
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import { A as doctorDiagnosticsTableRows, a as parseJsonSchema, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, k as buildDoctorDiagnostics, n as createMigrationConfig, r as generateDrizzleConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics } from "./doctor-collector-
|
|
1
|
+
import { A as doctorDiagnosticsTableRows, a as parseJsonSchema, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, k as buildDoctorDiagnostics, n as createMigrationConfig, r as generateDrizzleConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics } from "./doctor-collector-NTNBFeBw.mjs";
|
|
2
2
|
|
|
3
3
|
export { JsonSchemaDefinitionSchema, buildDoctorDiagnostics, collectDoctorDiagnostics, createMigrationConfig, doctorDiagnosticsTableRows, formatDoctorDiagnosticsJson, generateDrizzleConfig, generateDrizzleSchema, parseJsonSchema };
|
|
@@ -126,7 +126,7 @@ const zhCN = {
|
|
|
126
126
|
extractFail: "抽取失败",
|
|
127
127
|
extractComplete: "抽取完成",
|
|
128
128
|
extractRetry: "API 返回 {{code}},{{delay}} 秒后重试({{attempt}}/{{max}})",
|
|
129
|
-
chunking: "
|
|
129
|
+
chunking: "输入文本 ({{length}} tokens) 超过限制 ({{limit}} tokens)。正在拆分为多个切片...",
|
|
130
130
|
chunksCount: "已拆分为 {{count}} 个切片。",
|
|
131
131
|
preFiltering: "混合预过滤:根据 Schema 相关性筛选保留了 {{filtered}} / {{original}} 个切片。",
|
|
132
132
|
extractingChunk: "正在提取切片 {{current}}/{{total}}...",
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "aiex-cli",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.0.5-beta.
|
|
4
|
+
"version": "0.0.5-beta.5",
|
|
5
5
|
"description": "JSON Schema → SQLite with AI-powered data extraction",
|
|
6
6
|
"author": "OSpoon <zxin088@gmail.com>",
|
|
7
7
|
"license": "MIT",
|
|
@@ -68,9 +68,11 @@
|
|
|
68
68
|
"hono": "^4.0.0",
|
|
69
69
|
"i18next": "^26.2.0",
|
|
70
70
|
"i18next-fs-backend": "^2.6.6",
|
|
71
|
+
"js-tiktoken": "^1.0.21",
|
|
71
72
|
"jsonfile": "^6.2.1",
|
|
72
73
|
"jsonrepair": "^3.14.0",
|
|
73
74
|
"kysely": "^0.29.2",
|
|
75
|
+
"marked": "^12.0.1",
|
|
74
76
|
"mime": "^4.1.0",
|
|
75
77
|
"open": "^11.0.0",
|
|
76
78
|
"p-retry": "^7.1.0",
|