aiex-cli 0.0.5-beta.5 → 0.0.6-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -11
- package/dist/cli.mjs +7 -449
- package/dist/{doctor-collector-NTNBFeBw.mjs → doctor-collector-hWEvJ4lw.mjs} +4 -24
- package/dist/index.d.mts +88 -91
- package/dist/index.mjs +1 -1
- package/dist/web/assets/ExtractionViewer-DqIrBGNK.js +1 -0
- package/dist/web/assets/index-CvY9TGny.css +2 -0
- package/dist/web/assets/{index-CKV2X6sS.js → index-Dlze68g1.js} +2 -2
- package/dist/web/index.html +2 -2
- package/dist/{zh-CN-Ca-Dv775.mjs → zh-CN-Qcn0DHFh.mjs} +0 -7
- package/package.json +1 -3
- package/dist/web/assets/ExtractionViewer-BhhWrBs2.js +0 -1
- package/dist/web/assets/index-Csdgio76.css +0 -2
package/README.md
CHANGED
|
@@ -202,17 +202,6 @@ aiex completion fish | source
|
|
|
202
202
|
|
|
203
203
|
<br>
|
|
204
204
|
|
|
205
|
-
## 📄 Large Document Processing
|
|
206
|
-
|
|
207
|
-
When processing very large documents (exceeding `40,000` characters), `aiex` runs an optimized **Pipeline Mode** to handle context window limits and control API costs:
|
|
208
|
-
|
|
209
|
-
- **Token-Aware AST Splitting**: Parses structural Markdown elements (headings, paragraphs, lists) using an AST-based parser (`marked.lexer`) and splits them using precise token counters (`js-tiktoken`). Active heading hierarchies are tracked and prepended to each chunk as context. Tables and code blocks are kept intact (atomic blocks) to avoid syntax corruption.
|
|
210
|
-
- **Concurrency Limiting**: To respect strict model rate limits, chunk extractions are processed in parallel with a strict concurrency limit (capped at 2 concurrent requests).
|
|
211
|
-
- **Pre-filtering**: Integrates hybrid search-based pre-filtering to score and select only the most relevant document chunks based on schema queries, preventing unnecessary token usage on unrelated sections.
|
|
212
|
-
- **Recursive Merging**: The final extracted JSON objects from each chunk are recursively merged, concatenating lists and deduplicating primitive fields.
|
|
213
|
-
|
|
214
|
-
<br>
|
|
215
|
-
|
|
216
205
|
## 🔧 AI Configuration
|
|
217
206
|
|
|
218
207
|
aiex works with any OpenAI-compatible API provider. Configure in the Web UI (AI Settings panel):
|
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-NTNBFeBw.mjs";
|
|
1
|
+
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-hWEvJ4lw.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -17,7 +17,6 @@ import Database from "better-sqlite3";
|
|
|
17
17
|
import pc from "picocolors";
|
|
18
18
|
import { Buffer } from "node:buffer";
|
|
19
19
|
import * as XLSX from "xlsx";
|
|
20
|
-
import { getEncoding } from "js-tiktoken";
|
|
21
20
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
22
21
|
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
23
22
|
import pRetry from "p-retry";
|
|
@@ -25,7 +24,6 @@ import mime from "mime";
|
|
|
25
24
|
import { jsonrepair } from "jsonrepair";
|
|
26
25
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
27
26
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
28
|
-
import { marked } from "marked";
|
|
29
27
|
import crypto from "node:crypto";
|
|
30
28
|
import { Client, extractNotionId } from "@notionhq/client";
|
|
31
29
|
import { execa } from "execa";
|
|
@@ -13130,7 +13128,7 @@ function propertyToExtractionSchema(property) {
|
|
|
13130
13128
|
}
|
|
13131
13129
|
return { type: nullableType(property.type) };
|
|
13132
13130
|
}
|
|
13133
|
-
function isRecord$1(value) {
|
|
13131
|
+
function isRecord(value) {
|
|
13134
13132
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13135
13133
|
}
|
|
13136
13134
|
function schemaToExtractionOutputSchema(schema) {
|
|
@@ -13168,7 +13166,7 @@ function validatePropertyValue(path$1, property, value, issues) {
|
|
|
13168
13166
|
}
|
|
13169
13167
|
return;
|
|
13170
13168
|
case "object":
|
|
13171
|
-
if (!isRecord$1(value)) {
|
|
13169
|
+
if (!isRecord(value)) {
|
|
13172
13170
|
issues.push(`${path$1}: expected object or null`);
|
|
13173
13171
|
return;
|
|
13174
13172
|
}
|
|
@@ -13191,7 +13189,7 @@ function validateProperties(basePath, properties, data, issues) {
|
|
|
13191
13189
|
}
|
|
13192
13190
|
}
|
|
13193
13191
|
function validateExtractedData(schema, data) {
|
|
13194
|
-
if (!isRecord$1(data)) return {
|
|
13192
|
+
if (!isRecord(data)) return {
|
|
13195
13193
|
success: false,
|
|
13196
13194
|
error: "Extracted data must be a JSON object."
|
|
13197
13195
|
};
|
|
@@ -13514,220 +13512,6 @@ function insertExtractedData(db, schema, data) {
|
|
|
13514
13512
|
}
|
|
13515
13513
|
}
|
|
13516
13514
|
|
|
13517
|
-
//#endregion
|
|
13518
|
-
//#region src/core/ai-extraction/json-merger.ts
|
|
13519
|
-
function isRecord(value) {
|
|
13520
|
-
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13521
|
-
}
|
|
13522
|
-
function mergePropertyValue(property, values) {
|
|
13523
|
-
const nonNullValues = values.filter((v) => v !== null && v !== void 0);
|
|
13524
|
-
if (nonNullValues.length === 0) return null;
|
|
13525
|
-
if (property.type === "array") {
|
|
13526
|
-
const concatenated = [];
|
|
13527
|
-
for (const val of nonNullValues) if (Array.isArray(val)) concatenated.push(...val);
|
|
13528
|
-
return concatenated;
|
|
13529
|
-
}
|
|
13530
|
-
if (property.type === "object") {
|
|
13531
|
-
const childProperties = property.properties;
|
|
13532
|
-
if (!childProperties) {
|
|
13533
|
-
const mergedObj$1 = {};
|
|
13534
|
-
for (const val of nonNullValues) if (isRecord(val)) Object.assign(mergedObj$1, val);
|
|
13535
|
-
return mergedObj$1;
|
|
13536
|
-
}
|
|
13537
|
-
const mergedObj = {};
|
|
13538
|
-
for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
|
|
13539
|
-
return mergedObj;
|
|
13540
|
-
}
|
|
13541
|
-
const bestValue = nonNullValues.find((v) => {
|
|
13542
|
-
if (typeof v === "string") return v.trim() !== "";
|
|
13543
|
-
return true;
|
|
13544
|
-
});
|
|
13545
|
-
return bestValue !== void 0 ? bestValue : null;
|
|
13546
|
-
}
|
|
13547
|
-
/**
|
|
13548
|
-
* Merges structured extraction outputs from multiple document chunks
|
|
13549
|
-
* according to the schema properties.
|
|
13550
|
-
*/
|
|
13551
|
-
function mergeExtractionResults(schema, results) {
|
|
13552
|
-
if (results.length === 0) return {};
|
|
13553
|
-
if (results.length === 1) return results[0];
|
|
13554
|
-
const merged = {};
|
|
13555
|
-
for (const [propName, propDef] of Object.entries(schema.properties)) {
|
|
13556
|
-
if (propDef.primary && propDef.autoIncrement) continue;
|
|
13557
|
-
merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
|
|
13558
|
-
}
|
|
13559
|
-
return merged;
|
|
13560
|
-
}
|
|
13561
|
-
|
|
13562
|
-
//#endregion
|
|
13563
|
-
//#region src/core/ai-extraction/text-splitter.ts
|
|
13564
|
-
const encoding$1 = getEncoding("cl100k_base");
|
|
13565
|
-
function countTokens(text$1) {
|
|
13566
|
-
return encoding$1.encode(text$1).length;
|
|
13567
|
-
}
|
|
13568
|
-
function formatHeadingContext(headings) {
|
|
13569
|
-
const active = headings.filter(Boolean);
|
|
13570
|
-
if (active.length === 0) return "";
|
|
13571
|
-
return `> **[Context]** Belong to: ${active.join(" > ")}\n\n`;
|
|
13572
|
-
}
|
|
13573
|
-
function getMetadata(headings) {
|
|
13574
|
-
return {
|
|
13575
|
-
h1: headings[0] || void 0,
|
|
13576
|
-
h2: headings[1] || void 0,
|
|
13577
|
-
h3: headings[2] || void 0,
|
|
13578
|
-
h4: headings[3] || void 0
|
|
13579
|
-
};
|
|
13580
|
-
}
|
|
13581
|
-
/**
|
|
13582
|
-
* Splits text recursively using a list of separators.
|
|
13583
|
-
* Preserves the separators when re-joining.
|
|
13584
|
-
*/
|
|
13585
|
-
function splitTextRecursively(text$1, maxTokens, separators = [
|
|
13586
|
-
"\n\n",
|
|
13587
|
-
"\n",
|
|
13588
|
-
"。",
|
|
13589
|
-
". ",
|
|
13590
|
-
" "
|
|
13591
|
-
]) {
|
|
13592
|
-
if (countTokens(text$1) <= maxTokens) return [text$1];
|
|
13593
|
-
if (separators.length === 0) {
|
|
13594
|
-
const chunks = [];
|
|
13595
|
-
let current = "";
|
|
13596
|
-
for (const char of text$1) if (countTokens(current + char) > maxTokens) {
|
|
13597
|
-
chunks.push(current);
|
|
13598
|
-
current = char;
|
|
13599
|
-
} else current += char;
|
|
13600
|
-
if (current) chunks.push(current);
|
|
13601
|
-
return chunks;
|
|
13602
|
-
}
|
|
13603
|
-
const separator = separators[0];
|
|
13604
|
-
const nextSeparators = separators.slice(1);
|
|
13605
|
-
const parts = text$1.split(separator);
|
|
13606
|
-
const result = [];
|
|
13607
|
-
let currentChunk = [];
|
|
13608
|
-
let currentChunkTokens = 0;
|
|
13609
|
-
for (let i = 0; i < parts.length; i++) {
|
|
13610
|
-
const part = parts[i];
|
|
13611
|
-
const itemText = part + (i < parts.length - 1 ? separator : "");
|
|
13612
|
-
const partTokens = countTokens(itemText);
|
|
13613
|
-
if (partTokens > maxTokens) {
|
|
13614
|
-
if (currentChunk.length > 0) {
|
|
13615
|
-
result.push(currentChunk.join(""));
|
|
13616
|
-
currentChunk = [];
|
|
13617
|
-
currentChunkTokens = 0;
|
|
13618
|
-
}
|
|
13619
|
-
const subParts = splitTextRecursively(part, maxTokens, nextSeparators);
|
|
13620
|
-
for (let j = 0; j < subParts.length; j++) {
|
|
13621
|
-
const finalSub = subParts[j] + (j === subParts.length - 1 && i < parts.length - 1 ? separator : "");
|
|
13622
|
-
result.push(finalSub);
|
|
13623
|
-
}
|
|
13624
|
-
} else if (currentChunkTokens + partTokens > maxTokens) {
|
|
13625
|
-
result.push(currentChunk.join(""));
|
|
13626
|
-
currentChunk = [itemText];
|
|
13627
|
-
currentChunkTokens = partTokens;
|
|
13628
|
-
} else {
|
|
13629
|
-
currentChunk.push(itemText);
|
|
13630
|
-
currentChunkTokens += partTokens;
|
|
13631
|
-
}
|
|
13632
|
-
}
|
|
13633
|
-
if (currentChunk.length > 0) result.push(currentChunk.join(""));
|
|
13634
|
-
return result;
|
|
13635
|
-
}
|
|
13636
|
-
/**
|
|
13637
|
-
* Splits a Markdown document into chunks based on heading contexts, AST block parsing, and token limits.
|
|
13638
|
-
* Protects tables, list items, and code blocks from being broken.
|
|
13639
|
-
*/
|
|
13640
|
-
function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
|
|
13641
|
-
const tokens = marked.lexer(text$1);
|
|
13642
|
-
const chunks = [];
|
|
13643
|
-
let currentHeadings = [];
|
|
13644
|
-
let currentChunkList = [];
|
|
13645
|
-
let accumulatedTokens = 0;
|
|
13646
|
-
const flushCurrentChunk = (isHeadingChange = false) => {
|
|
13647
|
-
if (currentChunkList.length === 0) return;
|
|
13648
|
-
const pageContent = currentChunkList.map((item) => item.text).join("");
|
|
13649
|
-
const firstHeadings = currentChunkList[0].headings;
|
|
13650
|
-
chunks.push({
|
|
13651
|
-
pageContent,
|
|
13652
|
-
metadata: getMetadata(firstHeadings)
|
|
13653
|
-
});
|
|
13654
|
-
if (isHeadingChange || overlapTokens <= 0) {
|
|
13655
|
-
currentChunkList = [];
|
|
13656
|
-
accumulatedTokens = 0;
|
|
13657
|
-
} else {
|
|
13658
|
-
const overlapItems = [];
|
|
13659
|
-
let currentOverlapTokens = 0;
|
|
13660
|
-
for (let i = currentChunkList.length - 1; i >= 0; i--) {
|
|
13661
|
-
const item = currentChunkList[i];
|
|
13662
|
-
const itemTokens = countTokens(item.text);
|
|
13663
|
-
if (currentOverlapTokens + itemTokens > overlapTokens && overlapItems.length > 0) break;
|
|
13664
|
-
overlapItems.unshift(item);
|
|
13665
|
-
currentOverlapTokens += itemTokens;
|
|
13666
|
-
}
|
|
13667
|
-
currentChunkList = [...overlapItems];
|
|
13668
|
-
accumulatedTokens = currentOverlapTokens;
|
|
13669
|
-
}
|
|
13670
|
-
};
|
|
13671
|
-
for (const token of tokens) {
|
|
13672
|
-
if (token.type === "space") {
|
|
13673
|
-
if (currentChunkList.length > 0) {
|
|
13674
|
-
currentChunkList[currentChunkList.length - 1].text += token.raw;
|
|
13675
|
-
accumulatedTokens += countTokens(token.raw);
|
|
13676
|
-
}
|
|
13677
|
-
continue;
|
|
13678
|
-
}
|
|
13679
|
-
if (token.type === "heading") {
|
|
13680
|
-
flushCurrentChunk(true);
|
|
13681
|
-
const depth = token.depth;
|
|
13682
|
-
const title = token.text.trim();
|
|
13683
|
-
currentHeadings = currentHeadings.slice(0, depth - 1);
|
|
13684
|
-
currentHeadings[depth - 1] = title;
|
|
13685
|
-
}
|
|
13686
|
-
const rawText = token.raw;
|
|
13687
|
-
if (token.type === "list" && countTokens(rawText) > maxTokens) for (const item of token.items) processTextBlock(item.raw, currentHeadings);
|
|
13688
|
-
else {
|
|
13689
|
-
const isAtomic = token.type === "table" || token.type === "code";
|
|
13690
|
-
processTextBlock(rawText, currentHeadings, isAtomic);
|
|
13691
|
-
}
|
|
13692
|
-
}
|
|
13693
|
-
flushCurrentChunk(true);
|
|
13694
|
-
return chunks;
|
|
13695
|
-
function processTextBlock(blockText, headings, isAtomic = false) {
|
|
13696
|
-
const blockTokens = countTokens(blockText);
|
|
13697
|
-
const contextTokens = countTokens(formatHeadingContext(headings));
|
|
13698
|
-
const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
|
|
13699
|
-
const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
|
|
13700
|
-
if (blockTokens > budgetLimit) if (isAtomic) {
|
|
13701
|
-
flushCurrentChunk(false);
|
|
13702
|
-
currentChunkList.push({
|
|
13703
|
-
text: blockText,
|
|
13704
|
-
headings: [...headings]
|
|
13705
|
-
});
|
|
13706
|
-
accumulatedTokens = blockTokens;
|
|
13707
|
-
flushCurrentChunk(false);
|
|
13708
|
-
} else {
|
|
13709
|
-
flushCurrentChunk(false);
|
|
13710
|
-
const subBlocks = splitTextRecursively(blockText, budgetLimit);
|
|
13711
|
-
for (const sub of subBlocks) {
|
|
13712
|
-
currentChunkList.push({
|
|
13713
|
-
text: sub,
|
|
13714
|
-
headings: [...headings]
|
|
13715
|
-
});
|
|
13716
|
-
accumulatedTokens += countTokens(sub);
|
|
13717
|
-
if (accumulatedTokens > budgetLimit) flushCurrentChunk(false);
|
|
13718
|
-
}
|
|
13719
|
-
}
|
|
13720
|
-
else {
|
|
13721
|
-
if (accumulatedTokens + blockTokens + contextTokens > maxTokens && currentChunkList.length > 0) flushCurrentChunk(false);
|
|
13722
|
-
currentChunkList.push({
|
|
13723
|
-
text: blockText,
|
|
13724
|
-
headings: [...headings]
|
|
13725
|
-
});
|
|
13726
|
-
accumulatedTokens += blockTokens;
|
|
13727
|
-
}
|
|
13728
|
-
}
|
|
13729
|
-
}
|
|
13730
|
-
|
|
13731
13515
|
//#endregion
|
|
13732
13516
|
//#region src/core/extraction-audit.ts
|
|
13733
13517
|
const AUDIT_ID_RE = /^[\w.-]+$/;
|
|
@@ -14669,44 +14453,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14669
14453
|
|
|
14670
14454
|
//#endregion
|
|
14671
14455
|
//#region src/core/extract-runner.ts
|
|
14672
|
-
const encoding = getEncoding("cl100k_base");
|
|
14673
14456
|
const JSON_EXT_RE$1 = /\.json$/;
|
|
14674
|
-
async function limitConcurrency(concurrency, items, fn) {
|
|
14675
|
-
const results = Array.from({ length: items.length });
|
|
14676
|
-
let nextIndex = 0;
|
|
14677
|
-
async function worker() {
|
|
14678
|
-
while (nextIndex < items.length) {
|
|
14679
|
-
const currentIndex = nextIndex++;
|
|
14680
|
-
results[currentIndex] = await fn(items[currentIndex], currentIndex);
|
|
14681
|
-
}
|
|
14682
|
-
}
|
|
14683
|
-
const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
|
|
14684
|
-
await Promise.all(workers);
|
|
14685
|
-
return results;
|
|
14686
|
-
}
|
|
14687
|
-
function getSchemaKeywords(schema) {
|
|
14688
|
-
const keywords = /* @__PURE__ */ new Set();
|
|
14689
|
-
function walk(properties) {
|
|
14690
|
-
if (!properties) return;
|
|
14691
|
-
for (const [name$1, prop] of Object.entries(properties)) {
|
|
14692
|
-
keywords.add(name$1.toLowerCase());
|
|
14693
|
-
const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
|
|
14694
|
-
for (const part of parts) if (part.length > 1) keywords.add(part.toLowerCase());
|
|
14695
|
-
if (prop && typeof prop === "object") {
|
|
14696
|
-
const p = prop;
|
|
14697
|
-
if (typeof p.title === "string") keywords.add(p.title.toLowerCase());
|
|
14698
|
-
if (typeof p.description === "string") {
|
|
14699
|
-
const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
|
|
14700
|
-
for (const d of descParts) if (d.length > 2) keywords.add(d);
|
|
14701
|
-
}
|
|
14702
|
-
if (p.type === "object") walk(p.properties);
|
|
14703
|
-
if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
|
|
14704
|
-
}
|
|
14705
|
-
}
|
|
14706
|
-
}
|
|
14707
|
-
walk(schema.properties);
|
|
14708
|
-
return Array.from(keywords);
|
|
14709
|
-
}
|
|
14710
14457
|
async function ensureDatabaseReady(dbPath, schema) {
|
|
14711
14458
|
try {
|
|
14712
14459
|
await fs.access(dbPath);
|
|
@@ -14778,153 +14525,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14778
14525
|
}
|
|
14779
14526
|
const s = spinner();
|
|
14780
14527
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
14781
|
-
const
|
|
14782
|
-
const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
|
|
14783
|
-
let result;
|
|
14784
|
-
const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
|
|
14785
|
-
if (text$1 && totalTokens > maxTokens) {
|
|
14786
|
-
if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
14787
|
-
length: totalTokens,
|
|
14788
|
-
limit: maxTokens
|
|
14789
|
-
}));
|
|
14790
|
-
const finalDocs = splitMarkdown(text$1, maxTokens, overlapTokens);
|
|
14791
|
-
if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
|
|
14792
|
-
let processedDocs = finalDocs;
|
|
14793
|
-
if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
|
|
14794
|
-
const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
|
|
14795
|
-
const keywords = getSchemaKeywords(schemaLoad.schema);
|
|
14796
|
-
const scoredChunks = finalDocs.map((doc, idx) => {
|
|
14797
|
-
if (idx === 0) return {
|
|
14798
|
-
index: idx,
|
|
14799
|
-
score: Number.POSITIVE_INFINITY
|
|
14800
|
-
};
|
|
14801
|
-
let score = 0;
|
|
14802
|
-
const docTextLower = doc.pageContent.toLowerCase();
|
|
14803
|
-
for (const kw of keywords) {
|
|
14804
|
-
let pos = docTextLower.indexOf(kw);
|
|
14805
|
-
while (pos !== -1) {
|
|
14806
|
-
score++;
|
|
14807
|
-
pos = docTextLower.indexOf(kw, pos + kw.length);
|
|
14808
|
-
}
|
|
14809
|
-
}
|
|
14810
|
-
return {
|
|
14811
|
-
index: idx,
|
|
14812
|
-
score
|
|
14813
|
-
};
|
|
14814
|
-
}).slice(1).sort((a, b) => b.score - a.score);
|
|
14815
|
-
const selectedIndices = new Set([0]);
|
|
14816
|
-
let keptCount = 0;
|
|
14817
|
-
for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
|
|
14818
|
-
selectedIndices.add(sc.index);
|
|
14819
|
-
keptCount++;
|
|
14820
|
-
}
|
|
14821
|
-
processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
|
|
14822
|
-
if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
|
|
14823
|
-
original: finalDocs.length,
|
|
14824
|
-
filtered: processedDocs.length
|
|
14825
|
-
}));
|
|
14826
|
-
}
|
|
14827
|
-
const chunkResults = [];
|
|
14828
|
-
const accumulatedTokens = {
|
|
14829
|
-
prompt: 0,
|
|
14830
|
-
completion: 0,
|
|
14831
|
-
total: 0
|
|
14832
|
-
};
|
|
14833
|
-
let success = true;
|
|
14834
|
-
let errorMsg = "";
|
|
14835
|
-
const extractionTasks = processedDocs.map((doc, i) => {
|
|
14836
|
-
return async () => {
|
|
14837
|
-
if (!success) return;
|
|
14838
|
-
const headings = [];
|
|
14839
|
-
if (doc.metadata) {
|
|
14840
|
-
if (doc.metadata.h1) headings.push(doc.metadata.h1);
|
|
14841
|
-
if (doc.metadata.h2) headings.push(doc.metadata.h2);
|
|
14842
|
-
if (doc.metadata.h3) headings.push(doc.metadata.h3);
|
|
14843
|
-
if (doc.metadata.h4) headings.push(doc.metadata.h4);
|
|
14844
|
-
}
|
|
14845
|
-
let chunkText = doc.pageContent;
|
|
14846
|
-
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
14847
|
-
const chunkResult = await extractStructuredData({
|
|
14848
|
-
config: aiConfig,
|
|
14849
|
-
schema: schemaLoad.schema,
|
|
14850
|
-
text: chunkText,
|
|
14851
|
-
aiexDir,
|
|
14852
|
-
modelOverride,
|
|
14853
|
-
onRetry(info) {
|
|
14854
|
-
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
14855
|
-
current: i + 1,
|
|
14856
|
-
total: processedDocs.length,
|
|
14857
|
-
code: info.statusCode,
|
|
14858
|
-
delay: info.delayMs / 1e3,
|
|
14859
|
-
attempt: info.attempt,
|
|
14860
|
-
max: info.maxRetries
|
|
14861
|
-
}));
|
|
14862
|
-
}
|
|
14863
|
-
});
|
|
14864
|
-
if (!chunkResult.success) {
|
|
14865
|
-
success = false;
|
|
14866
|
-
errorMsg = chunkResult.error || t("common.unknownError");
|
|
14867
|
-
if (!options?.quiet) {
|
|
14868
|
-
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
14869
|
-
consola.error(errorMsg);
|
|
14870
|
-
}
|
|
14871
|
-
return;
|
|
14872
|
-
}
|
|
14873
|
-
if (chunkResult.data) chunkResults.push(chunkResult.data);
|
|
14874
|
-
if (chunkResult.tokensUsed) {
|
|
14875
|
-
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
14876
|
-
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
14877
|
-
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
14878
|
-
}
|
|
14879
|
-
};
|
|
14880
|
-
});
|
|
14881
|
-
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
14882
|
-
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
14883
|
-
current: 1,
|
|
14884
|
-
total: processedDocs.length
|
|
14885
|
-
}));
|
|
14886
|
-
try {
|
|
14887
|
-
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
14888
|
-
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
14889
|
-
current: idx + 1,
|
|
14890
|
-
total: processedDocs.length
|
|
14891
|
-
}));
|
|
14892
|
-
await task();
|
|
14893
|
-
});
|
|
14894
|
-
} catch (e) {
|
|
14895
|
-
success = false;
|
|
14896
|
-
errorMsg = e instanceof Error ? e.message : String(e);
|
|
14897
|
-
}
|
|
14898
|
-
if (!success) return {
|
|
14899
|
-
success: false,
|
|
14900
|
-
error: errorMsg
|
|
14901
|
-
};
|
|
14902
|
-
const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
|
|
14903
|
-
const validation = validateExtractedData(schemaLoad.schema, mergedData);
|
|
14904
|
-
if (!validation.success) {
|
|
14905
|
-
const valError = validation.error || "Merged data validation failed";
|
|
14906
|
-
if (!options?.quiet) {
|
|
14907
|
-
s.stop(t("command.extract.file.validationFail"));
|
|
14908
|
-
consola.error(valError);
|
|
14909
|
-
}
|
|
14910
|
-
return {
|
|
14911
|
-
success: false,
|
|
14912
|
-
error: valError
|
|
14913
|
-
};
|
|
14914
|
-
}
|
|
14915
|
-
const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
14916
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
14917
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
14918
|
-
const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
|
|
14919
|
-
const finalMergedOutputPath = path.join(outputDir, outputFileName);
|
|
14920
|
-
await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
|
|
14921
|
-
result = {
|
|
14922
|
-
success: true,
|
|
14923
|
-
data: mergedData,
|
|
14924
|
-
tokensUsed: accumulatedTokens,
|
|
14925
|
-
outputPath: finalMergedOutputPath
|
|
14926
|
-
};
|
|
14927
|
-
} else result = await extractStructuredData({
|
|
14528
|
+
const result = await extractStructuredData({
|
|
14928
14529
|
config: aiConfig,
|
|
14929
14530
|
schema: schemaLoad.schema,
|
|
14930
14531
|
text: text$1 ?? "",
|
|
@@ -14952,11 +14553,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14952
14553
|
}
|
|
14953
14554
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
14954
14555
|
if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
|
|
14955
|
-
if (result.evidenceSummary && !options?.quiet) {
|
|
14956
|
-
const summary = result.evidenceSummary;
|
|
14957
|
-
const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
|
|
14958
|
-
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
|
|
14959
|
-
}
|
|
14960
14556
|
if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
|
|
14961
14557
|
prompt: result.tokensUsed.prompt,
|
|
14962
14558
|
completion: result.tokensUsed.completion,
|
|
@@ -14985,7 +14581,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14985
14581
|
outputPath: result.outputPath,
|
|
14986
14582
|
data: result.data,
|
|
14987
14583
|
tablesInserted: insertResult.tablesInserted,
|
|
14988
|
-
evidenceSummary: result.evidenceSummary,
|
|
14989
14584
|
tokensUsed: result.tokensUsed
|
|
14990
14585
|
};
|
|
14991
14586
|
} else {
|
|
@@ -15012,7 +14607,6 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15012
14607
|
success: true,
|
|
15013
14608
|
outputPath: result.outputPath,
|
|
15014
14609
|
data: result.data,
|
|
15015
|
-
evidenceSummary: result.evidenceSummary,
|
|
15016
14610
|
tokensUsed: result.tokensUsed
|
|
15017
14611
|
};
|
|
15018
14612
|
}
|
|
@@ -15125,7 +14719,6 @@ async function runAuditedExtraction(options) {
|
|
|
15125
14719
|
outputName: updated.outputName,
|
|
15126
14720
|
tablesInserted: updated.tablesInserted,
|
|
15127
14721
|
notionPages: updated.notionPages,
|
|
15128
|
-
evidenceSummary: r.evidenceSummary,
|
|
15129
14722
|
tokensUsed: updated.tokensUsed,
|
|
15130
14723
|
auditId: updated.id,
|
|
15131
14724
|
fileHash
|
|
@@ -16253,7 +15846,6 @@ function aiRoutes(config) {
|
|
|
16253
15846
|
//#endregion
|
|
16254
15847
|
//#region src/core/data-service.ts
|
|
16255
15848
|
const FILE_REGEX = /\.json$/;
|
|
16256
|
-
const EVIDENCE_FILE_SUFFIX = ".evidence.json";
|
|
16257
15849
|
const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
|
|
16258
15850
|
const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
|
|
16259
15851
|
const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
|
|
@@ -16269,24 +15861,6 @@ function getAuditNotionStatus(record) {
|
|
|
16269
15861
|
if (record.status === "failed") return "failed";
|
|
16270
15862
|
return "not_synced";
|
|
16271
15863
|
}
|
|
16272
|
-
async function readEvidenceSummary(extractedDir, outputName) {
|
|
16273
|
-
const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
|
|
16274
|
-
try {
|
|
16275
|
-
const coverage = (await readFile(evidencePath))?.coverage;
|
|
16276
|
-
if (!coverage || typeof coverage !== "object") return void 0;
|
|
16277
|
-
return {
|
|
16278
|
-
path: evidencePath,
|
|
16279
|
-
fieldCount: Number(coverage.fieldCount) || 0,
|
|
16280
|
-
evidenceCount: Number(coverage.evidenceCount) || 0,
|
|
16281
|
-
foundCount: Number(coverage.foundCount) || 0,
|
|
16282
|
-
missingCount: Number(coverage.missingCount) || 0,
|
|
16283
|
-
inferredCount: Number(coverage.inferredCount) || 0,
|
|
16284
|
-
issueCount: Number(coverage.issueCount) || 0
|
|
16285
|
-
};
|
|
16286
|
-
} catch {
|
|
16287
|
-
return;
|
|
16288
|
-
}
|
|
16289
|
-
}
|
|
16290
15864
|
async function getRowExtractionActions(aiexDir, tableName) {
|
|
16291
15865
|
const actions = /* @__PURE__ */ new Map();
|
|
16292
15866
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
@@ -16314,7 +15888,7 @@ async function listExtractions(config) {
|
|
|
16314
15888
|
const aiexDir = path.dirname(config.schemaPath);
|
|
16315
15889
|
const extractedDir = path.join(aiexDir, "extracted");
|
|
16316
15890
|
await fs.mkdir(extractedDir, { recursive: true });
|
|
16317
|
-
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md")
|
|
15891
|
+
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
|
|
16318
15892
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
16319
15893
|
const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
|
|
16320
15894
|
const records = [];
|
|
@@ -16333,7 +15907,6 @@ async function listExtractions(config) {
|
|
|
16333
15907
|
timestamp,
|
|
16334
15908
|
fileSize: stat.size,
|
|
16335
15909
|
modifiedAt: stat.mtime.toISOString(),
|
|
16336
|
-
evidenceSummary: await readEvidenceSummary(extractedDir, file),
|
|
16337
15910
|
notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
|
|
16338
15911
|
notionPages,
|
|
16339
15912
|
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
|
|
@@ -16513,7 +16086,6 @@ async function retryNotionSync(config, fileName) {
|
|
|
16513
16086
|
|
|
16514
16087
|
//#endregion
|
|
16515
16088
|
//#region src/server/routes/data.ts
|
|
16516
|
-
const JSON_FILE_SUFFIX_RE = /\.json$/;
|
|
16517
16089
|
const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
|
|
16518
16090
|
const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
|
|
16519
16091
|
const tableQuerySchema = z.object({
|
|
@@ -16566,22 +16138,10 @@ function dataRoutes(config) {
|
|
|
16566
16138
|
const filePath = path.join(extractedDir, name$1);
|
|
16567
16139
|
try {
|
|
16568
16140
|
const content = await fs.readFile(filePath, "utf-8");
|
|
16569
|
-
const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
|
|
16570
|
-
let evidenceSummary;
|
|
16571
|
-
try {
|
|
16572
|
-
const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
|
|
16573
|
-
evidenceSummary = evidence?.coverage ? {
|
|
16574
|
-
...evidence.coverage,
|
|
16575
|
-
path: evidencePath
|
|
16576
|
-
} : void 0;
|
|
16577
|
-
} catch {
|
|
16578
|
-
evidenceSummary = void 0;
|
|
16579
|
-
}
|
|
16580
16141
|
return c.json({
|
|
16581
16142
|
success: true,
|
|
16582
16143
|
content,
|
|
16583
|
-
name: name$1
|
|
16584
|
-
evidenceSummary
|
|
16144
|
+
name: name$1
|
|
16585
16145
|
});
|
|
16586
16146
|
} catch {
|
|
16587
16147
|
return c.json({ error: t("server.extractionNotFound") }, 404);
|
|
@@ -16725,7 +16285,6 @@ function extractRoutes(config) {
|
|
|
16725
16285
|
outputName: result.outputName,
|
|
16726
16286
|
tablesInserted: result.tablesInserted,
|
|
16727
16287
|
notionPages: result.notionPages,
|
|
16728
|
-
evidenceSummary: result.evidenceSummary,
|
|
16729
16288
|
tokensUsed: result.tokensUsed,
|
|
16730
16289
|
auditId: result.auditId
|
|
16731
16290
|
}, 200);
|
|
@@ -16793,7 +16352,6 @@ function extractRoutes(config) {
|
|
|
16793
16352
|
outputName: result.outputName,
|
|
16794
16353
|
tablesInserted: result.tablesInserted,
|
|
16795
16354
|
notionPages: result.notionPages,
|
|
16796
|
-
evidenceSummary: result.evidenceSummary,
|
|
16797
16355
|
tokensUsed: result.tokensUsed,
|
|
16798
16356
|
auditId: result.auditId
|
|
16799
16357
|
}, 200);
|
|
@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
|
|
|
74
74
|
//#endregion
|
|
75
75
|
//#region package.json
|
|
76
76
|
var name = "aiex-cli";
|
|
77
|
-
var version = "0.0.5-beta.5";
|
|
77
|
+
var version = "0.0.6-beta.1";
|
|
78
78
|
var description = "JSON Schema → SQLite with AI-powered data extraction";
|
|
79
79
|
var package_default = {
|
|
80
80
|
name,
|
|
@@ -158,11 +158,9 @@ var package_default = {
|
|
|
158
158
|
"hono": "catalog:",
|
|
159
159
|
"i18next": "catalog:",
|
|
160
160
|
"i18next-fs-backend": "catalog:",
|
|
161
|
-
"js-tiktoken": "catalog:",
|
|
162
161
|
"jsonfile": "catalog:",
|
|
163
162
|
"jsonrepair": "catalog:",
|
|
164
163
|
"kysely": "catalog:",
|
|
165
|
-
"marked": "catalog:",
|
|
166
164
|
"mime": "catalog:",
|
|
167
165
|
"open": "catalog:",
|
|
168
166
|
"p-retry": "catalog:",
|
|
@@ -230,15 +228,7 @@ const PromptConfigSchema = z.object({
|
|
|
230
228
|
systemTemplate: z.string().min(1),
|
|
231
229
|
userTemplate: z.string().min(1)
|
|
232
230
|
});
|
|
233
|
-
const ExtractionConfigSchema = z.object({
|
|
234
|
-
outputDir: z.string().min(1),
|
|
235
|
-
mode: z.enum(["pipeline"]).default("pipeline").optional(),
|
|
236
|
-
concurrency: z.number().int().min(1).optional(),
|
|
237
|
-
maxTokens: z.number().int().positive().default(8e3).optional(),
|
|
238
|
-
overlapSize: z.number().int().nonnegative().optional(),
|
|
239
|
-
preFiltering: z.boolean().optional(),
|
|
240
|
-
preFilteringLimit: z.number().int().min(1).optional()
|
|
241
|
-
});
|
|
231
|
+
const ExtractionConfigSchema = z.object({ outputDir: z.string().min(1) });
|
|
242
232
|
const ImageOcrConfigSchema = z.object({
|
|
243
233
|
ocrFallback: z.enum([
|
|
244
234
|
"auto",
|
|
@@ -345,10 +335,7 @@ Extraction requirements:
|
|
|
345
335
|
userTemplate: `Please extract data from the following text:
|
|
346
336
|
{text}`
|
|
347
337
|
};
|
|
348
|
-
const DEFAULT_EXTRACTION_CONFIG = {
|
|
349
|
-
outputDir: ".aiex/extracted",
|
|
350
|
-
mode: "pipeline"
|
|
351
|
-
};
|
|
338
|
+
const DEFAULT_EXTRACTION_CONFIG = { outputDir: ".aiex/extracted" };
|
|
352
339
|
const DEFAULT_IMAGE_OCR_CONFIG = {
|
|
353
340
|
ocrFallback: "auto",
|
|
354
341
|
ocrLanguages: "en-US, zh-Hans",
|
|
@@ -580,13 +567,6 @@ const en = {
|
|
|
580
567
|
extractFail: "Extraction failed",
|
|
581
568
|
extractComplete: "Extraction complete",
|
|
582
569
|
extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
|
|
583
|
-
chunking: "Input text ({{length}} tokens) exceeds limit ({{limit}} tokens). Splitting into chunks...",
|
|
584
|
-
chunksCount: "Split into {{count}} chunk(s).",
|
|
585
|
-
preFiltering: "Hybrid pre-filtering: selected {{filtered}} out of {{original}} chunks based on schema relevance.",
|
|
586
|
-
extractingChunk: "Extracting chunk {{current}}/{{total}}...",
|
|
587
|
-
extractRetryChunk: "Chunk {{current}}/{{total}} API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
|
|
588
|
-
extractFailChunk: "Extraction failed for chunk {{current}}/{{total}}",
|
|
589
|
-
validationFail: "Merged data validation failed",
|
|
590
570
|
resultSaved: "Result saved: {{path}}",
|
|
591
571
|
tokenUsage: "Token usage: prompt={{prompt}}, completion={{completion}}, total={{total}}",
|
|
592
572
|
insertingDb: "Inserting into database...",
|
|
@@ -976,7 +956,7 @@ async function initI18n(lng) {
|
|
|
976
956
|
fallbackLng: "en",
|
|
977
957
|
resources: {
|
|
978
958
|
"en": { translation: en },
|
|
979
|
-
"zh-CN": { translation: await import("./zh-CN-Ca-Dv775.mjs").then((m) => m.zhCN) }
|
|
959
|
+
"zh-CN": { translation: await import("./zh-CN-Qcn0DHFh.mjs").then((m) => m.zhCN) }
|
|
980
960
|
},
|
|
981
961
|
interpolation: { escapeValue: false },
|
|
982
962
|
returnNull: false
|