aiex-cli 0.0.5-beta.2 → 0.0.5-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -0
- package/dist/cli.mjs +385 -7
- package/dist/{doctor-collector-DZyLrpqA.mjs → doctor-collector-Cv7RArla.mjs} +21 -4
- package/dist/index.d.mts +91 -88
- package/dist/index.mjs +1 -1
- package/dist/web/assets/ExtractionViewer-BhhWrBs2.js +1 -0
- package/dist/web/assets/{index-Dlze68g1.js → index-CKV2X6sS.js} +2 -2
- package/dist/web/assets/index-Csdgio76.css +2 -0
- package/dist/web/index.html +2 -2
- package/dist/{zh-CN-Qcn0DHFh.mjs → zh-CN-CyL-61Ow.mjs} +7 -0
- package/package.json +1 -1
- package/dist/web/assets/ExtractionViewer-DqIrBGNK.js +0 -1
- package/dist/web/assets/index-CvY9TGny.css +0 -2
package/README.md
CHANGED
|
@@ -202,6 +202,17 @@ aiex completion fish | source
|
|
|
202
202
|
|
|
203
203
|
<br>
|
|
204
204
|
|
|
205
|
+
## 📄 Large Document Processing
|
|
206
|
+
|
|
207
|
+
When processing very large documents (exceeding `40,000` characters), `aiex` runs an optimized **Pipeline Mode** to handle context window limits and control API costs:
|
|
208
|
+
|
|
209
|
+
- **Sliding Window & Overlapping Slices**: Splits the document logically at Markdown headings or paragraph boundaries. It uses an overlapping sliding window to ensure contextual continuity at slice boundaries. Active heading hierarchies are tracked and prepended to each chunk as context.
|
|
210
|
+
- **Concurrency Limiting**: To respect strict model rate limits, chunk extractions are processed in parallel with a strict concurrency limit (capped at 2 concurrent requests).
|
|
211
|
+
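- **Pre-filtering** (opt-in via `extraction.preFiltering` in the AI config): Scores each chunk by how often keywords derived from the schema (property names, titles, and descriptions) occur in it, then keeps the first chunk plus the highest-scoring chunks (up to `extraction.preFilteringLimit`, default 5), preventing unnecessary token usage on unrelated sections.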
|
|
212
|
+
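- **Recursive Merging**: The final extracted JSON objects from each chunk are recursively merged according to the schema: lists are concatenated, nested objects are merged field by field, and each primitive field keeps the first non-empty value found across chunks.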
|
|
213
|
+
|
|
214
|
+
<br>
|
|
215
|
+
|
|
205
216
|
## 🔧 AI Configuration
|
|
206
217
|
|
|
207
218
|
aiex works with any OpenAI-compatible API provider. Configure in the Web UI (AI Settings panel):
|
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-
|
|
1
|
+
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-Cv7RArla.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -13128,7 +13128,7 @@ function propertyToExtractionSchema(property) {
|
|
|
13128
13128
|
}
|
|
13129
13129
|
return { type: nullableType(property.type) };
|
|
13130
13130
|
}
|
|
13131
|
-
function isRecord(value) {
|
|
13131
|
+
function isRecord$1(value) {
|
|
13132
13132
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
13133
13133
|
}
|
|
13134
13134
|
function schemaToExtractionOutputSchema(schema) {
|
|
@@ -13166,7 +13166,7 @@ function validatePropertyValue(path$1, property, value, issues) {
|
|
|
13166
13166
|
}
|
|
13167
13167
|
return;
|
|
13168
13168
|
case "object":
|
|
13169
|
-
if (!isRecord(value)) {
|
|
13169
|
+
if (!isRecord$1(value)) {
|
|
13170
13170
|
issues.push(`${path$1}: expected object or null`);
|
|
13171
13171
|
return;
|
|
13172
13172
|
}
|
|
@@ -13189,7 +13189,7 @@ function validateProperties(basePath, properties, data, issues) {
|
|
|
13189
13189
|
}
|
|
13190
13190
|
}
|
|
13191
13191
|
function validateExtractedData(schema, data) {
|
|
13192
|
-
if (!isRecord(data)) return {
|
|
13192
|
+
if (!isRecord$1(data)) return {
|
|
13193
13193
|
success: false,
|
|
13194
13194
|
error: "Extracted data must be a JSON object."
|
|
13195
13195
|
};
|
|
@@ -13512,6 +13512,161 @@ function insertExtractedData(db, schema, data) {
|
|
|
13512
13512
|
}
|
|
13513
13513
|
}
|
|
13514
13514
|
|
|
13515
|
+
//#endregion
|
|
13516
|
+
//#region src/core/ai-extraction/json-merger.ts
|
|
13517
|
+
/**
 * Tells whether a value is a plain record-like object.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` for non-null, non-array objects.
 */
function isRecord(value) {
	return typeof value === "object" && value !== null && !Array.isArray(value);
}
/**
 * Merges the values extracted for one schema property across several chunks.
 *
 * - `array` properties: every array value is concatenated in order
 *   (non-array values are ignored).
 * - `object` properties with declared child properties: merged recursively,
 *   one child property at a time.
 * - `object` properties without declared child properties: shallow merge,
 *   later chunks overwrite earlier keys.
 * - anything else: the first value that is not null/undefined and, for
 *   strings, not blank.
 *
 * @param {{ type?: string, properties?: Record<string, object> }} property - Schema definition of the property.
 * @param {unknown[]} values - One candidate value per chunk, in chunk order.
 * @returns {unknown} The merged value, or `null` when no chunk supplied one.
 */
function mergePropertyValue(property, values) {
	const nonNullValues = values.filter((v) => v !== null && v !== void 0);
	if (nonNullValues.length === 0) return null;
	if (property.type === "array") {
		const concatenated = [];
		for (const val of nonNullValues) {
			if (!Array.isArray(val)) continue;
			// Item-by-item push: spreading a very large array into push() can
			// exceed the engine's argument limit.
			for (const item of val) concatenated.push(item);
		}
		return concatenated;
	}
	if (property.type === "object") {
		const childProperties = property.properties;
		if (!childProperties) {
			let mergedObj$1 = {};
			// Object spread defines own data properties. Object.assign would
			// route an own "__proto__" key (possible in parsed model output)
			// through the prototype setter, silently rewiring the merged
			// object's prototype and dropping the key from the result.
			for (const val of nonNullValues) if (isRecord(val)) mergedObj$1 = {
				...mergedObj$1,
				...val
			};
			return mergedObj$1;
		}
		const mergedObj = {};
		for (const [propName, propDef] of Object.entries(childProperties)) mergedObj[propName] = mergePropertyValue(propDef, nonNullValues.map((v) => isRecord(v) ? v[propName] : void 0));
		return mergedObj;
	}
	const bestValue = nonNullValues.find((v) => {
		// Blank strings do not count as an extracted value.
		if (typeof v === "string") return v.trim() !== "";
		return true;
	});
	return bestValue !== void 0 ? bestValue : null;
}
/**
 * Merges structured extraction outputs from multiple document chunks
 * according to the schema properties.
 *
 * @param {{ properties: Record<string, object> }} schema - Schema whose `properties` drive the merge.
 * @param {Array<Record<string, unknown>>} results - Per-chunk extraction outputs.
 * @returns {Record<string, unknown>} `{}` for no results, the single result
 *   unchanged when there is exactly one, otherwise a new object holding the
 *   merged value of every schema property except auto-increment primary keys.
 */
function mergeExtractionResults(schema, results) {
	if (results.length === 0) return {};
	if (results.length === 1) return results[0];
	const merged = {};
	for (const [propName, propDef] of Object.entries(schema.properties)) {
		// Auto-increment primary keys are left out of the merged output.
		if (propDef.primary && propDef.autoIncrement) continue;
		merged[propName] = mergePropertyValue(propDef, results.map((r) => r[propName]));
	}
	return merged;
}
|
|
13559
|
+
|
|
13560
|
+
//#endregion
|
|
13561
|
+
//#region src/core/ai-extraction/text-splitter.ts
|
|
13562
|
+
const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
/**
 * Splits a Markdown document into chunks based on header hierarchy.
 * Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
 * when a section exceeds the maxSize limit.
 *
 * A chunk is closed whenever a heading line is met or the accumulated text
 * grows past `maxSize`. Chunks closed because of size (not because of a
 * heading) hand their trailing text over to the next chunk as overlap.
 *
 * The overlap is bounded: it never exceeds `overlapSize`, and `overlapSize`
 * itself is capped at half of `maxSize`. Without that bound, a long run of
 * text with no blank line made the "overlap" equal to the whole previous
 * chunk, so every following line produced another, ever-growing chunk.
 *
 * @param {string} text$1 - Markdown source.
 * @param {number} [maxSize=40000] - Size (in characters) after which a chunk is closed.
 * @param {number} [overlapSize=0] - Maximum number of trailing characters repeated
 *   at the start of the next chunk; `0` disables overlap.
 * @returns {Array<{ pageContent: string, metadata: { h1?: string, h2?: string, h3?: string, h4?: string } }>}
 *   Chunks in document order with the heading trail that was active for them.
 */
function splitMarkdown(text$1, maxSize = 4e4, overlapSize = 0) {
	const lines = text$1.split("\n");
	const chunks = [];
	// Capping at half a chunk guarantees each size-triggered flush is followed
	// by at least maxSize / 2 characters of new text. NaN or negative sizes
	// disable overlap.
	const overlapBudget = overlapSize > 0 ? Math.min(overlapSize, Math.floor(maxSize / 2)) : 0;
	let currentHeadings = [];
	let currentChunkLines = [];
	let currentSize = 0;
	// True once the pending chunk holds text beyond carried-over overlap.
	let hasNewLines = false;
	const getMetadata = (headings) => {
		return {
			h1: headings[0] || void 0,
			h2: headings[1] || void 0,
			h3: headings[2] || void 0,
			h4: headings[3] || void 0
		};
	};
	// Picks the trailing paragraphs that fit within the overlap budget. When
	// even the last paragraph is too large, falls back to its trailing lines,
	// and returns nothing if a single line already exceeds the budget.
	const takeOverlap = (paragraphs) => {
		if (overlapBudget <= 0 || paragraphs.length === 0) return [];
		const picked = [];
		let used = 0;
		for (let j = paragraphs.length - 1; j >= 0; j--) {
			const p = paragraphs[j];
			if (used + p.length > overlapBudget) break;
			picked.unshift(p);
			used += p.length + 2;
		}
		if (picked.length > 0) return picked;
		const tailLines = paragraphs[paragraphs.length - 1].split("\n");
		const kept = [];
		let tailSize = 0;
		for (let j = tailLines.length - 1; j >= 0; j--) {
			const tailLine = tailLines[j];
			if (tailSize + tailLine.length > overlapBudget) break;
			kept.unshift(tailLine);
			tailSize += tailLine.length + 1;
		}
		return kept.length > 0 ? [kept.join("\n")] : [];
	};
	const resetPending = () => {
		currentChunkLines = [];
		currentSize = 0;
	};
	const flushChunk = (isHeadingChange = false) => {
		// Nothing to emit when the pending text is only carried-over overlap.
		if (currentChunkLines.length === 0 || !hasNewLines) {
			resetPending();
			hasNewLines = false;
			return;
		}
		const metadata = getMetadata(currentHeadings);
		const pageContent = currentChunkLines.join("\n");
		let lastChunkContent = "";
		if (pageContent.length > maxSize) {
			// Oversized section: regroup it paragraph by paragraph.
			let subParas = [];
			let subSize = 0;
			for (const para of pageContent.split("\n\n")) {
				if (subSize + para.length > maxSize && subParas.length > 0) {
					chunks.push({
						pageContent: subParas.join("\n\n"),
						metadata
					});
					subParas = takeOverlap(subParas);
					subSize = subParas.reduce((sum, p) => sum + p.length + 2, 0);
				}
				subParas.push(para);
				subSize += para.length + 2;
			}
			if (subParas.length > 0) {
				const content = subParas.join("\n\n");
				chunks.push({
					pageContent: content,
					metadata
				});
				lastChunkContent = content;
			}
		} else {
			chunks.push({
				pageContent,
				metadata
			});
			lastChunkContent = pageContent;
		}
		// A heading starts a fresh section, so no text is carried across it.
		const overlapText = !isHeadingChange && lastChunkContent ? takeOverlap(lastChunkContent.split("\n\n")).join("\n\n") : "";
		if (overlapText !== "") {
			currentChunkLines = overlapText.split("\n");
			currentSize = overlapText.length;
		} else resetPending();
		hasNewLines = false;
	};
	for (const line of lines) {
		const headingMatch = line.match(HEADING_RE);
		if (headingMatch) {
			// Close the previous section under its own heading trail first.
			flushChunk(true);
			const depth = headingMatch[1].length;
			const title = headingMatch[2].trim();
			currentHeadings = currentHeadings.slice(0, depth - 1);
			currentHeadings[depth - 1] = title;
		}
		currentChunkLines.push(line);
		currentSize += line.length + 1;
		hasNewLines = true;
		if (currentSize > maxSize) flushChunk(false);
	}
	flushChunk(true);
	return chunks;
}
|
|
13669
|
+
|
|
13515
13670
|
//#endregion
|
|
13516
13671
|
//#region src/core/extraction-audit.ts
|
|
13517
13672
|
const AUDIT_ID_RE = /^[\w.-]+$/;
|
|
@@ -14454,6 +14609,42 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14454
14609
|
//#endregion
|
|
14455
14610
|
//#region src/core/extract-runner.ts
|
|
14456
14611
|
const JSON_EXT_RE$1 = /\.json$/;
|
|
14612
|
+
/**
 * Runs `fn` over `items` with at most `concurrency` calls in flight.
 *
 * @param {number} concurrency - Maximum number of simultaneous calls. Values
 *   below 1 or NaN are treated as 1, so the items are still processed
 *   instead of being silently skipped.
 * @param {Array<unknown>} items - Inputs to process.
 * @param {(item: unknown, index: number) => Promise<unknown>} fn - Async handler
 *   called with each item and its index.
 * @returns {Promise<Array<unknown>>} Results in the same order as `items`.
 *   Rejects with the first error thrown by `fn`; once a call has failed no
 *   further items are started (calls already in flight run to completion).
 */
async function limitConcurrency(concurrency, items, fn) {
	const results = Array.from({ length: items.length });
	const requested = Math.floor(Number(concurrency));
	const workerCount = Math.min(requested >= 1 ? requested : 1, items.length);
	let nextIndex = 0;
	let failed = false;
	async function worker() {
		while (!failed && nextIndex < items.length) {
			// Claiming the index is synchronous, so no two workers share an item.
			const currentIndex = nextIndex++;
			try {
				results[currentIndex] = await fn(items[currentIndex], currentIndex);
			} catch (error) {
				// Stop the other workers from picking up new items: the overall
				// promise is about to reject and their results would be discarded.
				failed = true;
				throw error;
			}
		}
	}
	await Promise.all(Array.from({ length: workerCount }, () => worker()));
	return results;
}
|
|
14625
|
+
/**
 * Collects lowercase keywords describing a schema: property names, the words
 * making up each name (camelCase and separator-delimited), titles, and the
 * words of descriptions. Nested object properties and object-typed array
 * items are walked recursively.
 *
 * Keywords are trimmed and empty ones are never returned. A caller that
 * counts occurrences with `indexOf(kw, pos + kw.length)` would loop forever
 * on an empty keyword (e.g. coming from `title: ""`).
 *
 * @param {{ properties?: Record<string, object> }} schema - Schema to scan.
 * @returns {string[]} Unique keywords in discovery order.
 */
function getSchemaKeywords(schema) {
	const keywords = /* @__PURE__ */ new Set();
	const addKeyword = (raw) => {
		const keyword = raw.trim().toLowerCase();
		if (keyword !== "") keywords.add(keyword);
	};
	function walk(properties) {
		if (!properties) return;
		for (const [name$1, prop] of Object.entries(properties)) {
			addKeyword(name$1);
			// Break "invoiceNumber" / "unit_price" style names into words.
			const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
			for (const part of parts) if (part.length > 1) addKeyword(part);
			if (!prop || typeof prop !== "object") continue;
			const p = prop;
			if (typeof p.title === "string") addKeyword(p.title);
			if (typeof p.description === "string") {
				const descParts = p.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
				// Very short description words are skipped.
				for (const d of descParts) if (d.length > 2) addKeyword(d);
			}
			if (p.type === "object") walk(p.properties);
			if (p.type === "array" && p.items?.type === "object") walk(p.items.properties);
		}
	}
	walk(schema.properties);
	return Array.from(keywords);
}
|
|
14457
14648
|
async function ensureDatabaseReady(dbPath, schema) {
|
|
14458
14649
|
try {
|
|
14459
14650
|
await fs.access(dbPath);
|
|
@@ -14525,7 +14716,151 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14525
14716
|
}
|
|
14526
14717
|
const s = spinner();
|
|
14527
14718
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
14528
|
-
const
|
|
14719
|
+
const CHUNK_LIMIT = 4e4;
|
|
14720
|
+
let result;
|
|
14721
|
+
if (text$1 && text$1.length > CHUNK_LIMIT) {
|
|
14722
|
+
if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
14723
|
+
length: text$1.length,
|
|
14724
|
+
limit: CHUNK_LIMIT
|
|
14725
|
+
}));
|
|
14726
|
+
const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT, aiConfig.extraction?.overlapSize ?? 2e3);
|
|
14727
|
+
if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
|
|
14728
|
+
let processedDocs = finalDocs;
|
|
14729
|
+
if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
|
|
14730
|
+
const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
|
|
14731
|
+
const keywords = getSchemaKeywords(schemaLoad.schema);
|
|
14732
|
+
const scoredChunks = finalDocs.map((doc, idx) => {
|
|
14733
|
+
if (idx === 0) return {
|
|
14734
|
+
index: idx,
|
|
14735
|
+
score: Number.POSITIVE_INFINITY
|
|
14736
|
+
};
|
|
14737
|
+
let score = 0;
|
|
14738
|
+
const docTextLower = doc.pageContent.toLowerCase();
|
|
14739
|
+
for (const kw of keywords) {
|
|
14740
|
+
let pos = docTextLower.indexOf(kw);
|
|
14741
|
+
while (pos !== -1) {
|
|
14742
|
+
score++;
|
|
14743
|
+
pos = docTextLower.indexOf(kw, pos + kw.length);
|
|
14744
|
+
}
|
|
14745
|
+
}
|
|
14746
|
+
return {
|
|
14747
|
+
index: idx,
|
|
14748
|
+
score
|
|
14749
|
+
};
|
|
14750
|
+
}).slice(1).sort((a, b) => b.score - a.score);
|
|
14751
|
+
const selectedIndices = new Set([0]);
|
|
14752
|
+
let keptCount = 0;
|
|
14753
|
+
for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
|
|
14754
|
+
selectedIndices.add(sc.index);
|
|
14755
|
+
keptCount++;
|
|
14756
|
+
}
|
|
14757
|
+
processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
|
|
14758
|
+
if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
|
|
14759
|
+
original: finalDocs.length,
|
|
14760
|
+
filtered: processedDocs.length
|
|
14761
|
+
}));
|
|
14762
|
+
}
|
|
14763
|
+
const chunkResults = [];
|
|
14764
|
+
const accumulatedTokens = {
|
|
14765
|
+
prompt: 0,
|
|
14766
|
+
completion: 0,
|
|
14767
|
+
total: 0
|
|
14768
|
+
};
|
|
14769
|
+
let success = true;
|
|
14770
|
+
let errorMsg = "";
|
|
14771
|
+
const extractionTasks = processedDocs.map((doc, i) => {
|
|
14772
|
+
return async () => {
|
|
14773
|
+
if (!success) return;
|
|
14774
|
+
const headings = [];
|
|
14775
|
+
if (doc.metadata) {
|
|
14776
|
+
if (doc.metadata.h1) headings.push(doc.metadata.h1);
|
|
14777
|
+
if (doc.metadata.h2) headings.push(doc.metadata.h2);
|
|
14778
|
+
if (doc.metadata.h3) headings.push(doc.metadata.h3);
|
|
14779
|
+
if (doc.metadata.h4) headings.push(doc.metadata.h4);
|
|
14780
|
+
}
|
|
14781
|
+
let chunkText = doc.pageContent;
|
|
14782
|
+
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
14783
|
+
const chunkResult = await extractStructuredData({
|
|
14784
|
+
config: aiConfig,
|
|
14785
|
+
schema: schemaLoad.schema,
|
|
14786
|
+
text: chunkText,
|
|
14787
|
+
aiexDir,
|
|
14788
|
+
modelOverride,
|
|
14789
|
+
onRetry(info) {
|
|
14790
|
+
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
14791
|
+
current: i + 1,
|
|
14792
|
+
total: processedDocs.length,
|
|
14793
|
+
code: info.statusCode,
|
|
14794
|
+
delay: info.delayMs / 1e3,
|
|
14795
|
+
attempt: info.attempt,
|
|
14796
|
+
max: info.maxRetries
|
|
14797
|
+
}));
|
|
14798
|
+
}
|
|
14799
|
+
});
|
|
14800
|
+
if (!chunkResult.success) {
|
|
14801
|
+
success = false;
|
|
14802
|
+
errorMsg = chunkResult.error || t("common.unknownError");
|
|
14803
|
+
if (!options?.quiet) {
|
|
14804
|
+
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
14805
|
+
consola.error(errorMsg);
|
|
14806
|
+
}
|
|
14807
|
+
return;
|
|
14808
|
+
}
|
|
14809
|
+
if (chunkResult.data) chunkResults.push(chunkResult.data);
|
|
14810
|
+
if (chunkResult.tokensUsed) {
|
|
14811
|
+
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
14812
|
+
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
14813
|
+
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
14814
|
+
}
|
|
14815
|
+
};
|
|
14816
|
+
});
|
|
14817
|
+
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
14818
|
+
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
14819
|
+
current: 1,
|
|
14820
|
+
total: processedDocs.length
|
|
14821
|
+
}));
|
|
14822
|
+
try {
|
|
14823
|
+
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
14824
|
+
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
14825
|
+
current: idx + 1,
|
|
14826
|
+
total: processedDocs.length
|
|
14827
|
+
}));
|
|
14828
|
+
await task();
|
|
14829
|
+
});
|
|
14830
|
+
} catch (e) {
|
|
14831
|
+
success = false;
|
|
14832
|
+
errorMsg = e instanceof Error ? e.message : String(e);
|
|
14833
|
+
}
|
|
14834
|
+
if (!success) return {
|
|
14835
|
+
success: false,
|
|
14836
|
+
error: errorMsg
|
|
14837
|
+
};
|
|
14838
|
+
const mergedData = mergeExtractionResults(schemaLoad.schema, chunkResults);
|
|
14839
|
+
const validation = validateExtractedData(schemaLoad.schema, mergedData);
|
|
14840
|
+
if (!validation.success) {
|
|
14841
|
+
const valError = validation.error || "Merged data validation failed";
|
|
14842
|
+
if (!options?.quiet) {
|
|
14843
|
+
s.stop(t("command.extract.file.validationFail"));
|
|
14844
|
+
consola.error(valError);
|
|
14845
|
+
}
|
|
14846
|
+
return {
|
|
14847
|
+
success: false,
|
|
14848
|
+
error: valError
|
|
14849
|
+
};
|
|
14850
|
+
}
|
|
14851
|
+
const outputDir = path.resolve(aiexDir, aiConfig.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
14852
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
14853
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
14854
|
+
const outputFileName = `${schemaLoad.schema.table.name}-${timestamp}.json`;
|
|
14855
|
+
const finalMergedOutputPath = path.join(outputDir, outputFileName);
|
|
14856
|
+
await fs.writeFile(finalMergedOutputPath, JSON.stringify(mergedData, null, 2));
|
|
14857
|
+
result = {
|
|
14858
|
+
success: true,
|
|
14859
|
+
data: mergedData,
|
|
14860
|
+
tokensUsed: accumulatedTokens,
|
|
14861
|
+
outputPath: finalMergedOutputPath
|
|
14862
|
+
};
|
|
14863
|
+
} else result = await extractStructuredData({
|
|
14529
14864
|
config: aiConfig,
|
|
14530
14865
|
schema: schemaLoad.schema,
|
|
14531
14866
|
text: text$1 ?? "",
|
|
@@ -14553,6 +14888,11 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14553
14888
|
}
|
|
14554
14889
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
14555
14890
|
if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
|
|
14891
|
+
if (result.evidenceSummary && !options?.quiet) {
|
|
14892
|
+
const summary = result.evidenceSummary;
|
|
14893
|
+
const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
|
|
14894
|
+
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
|
|
14895
|
+
}
|
|
14556
14896
|
if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
|
|
14557
14897
|
prompt: result.tokensUsed.prompt,
|
|
14558
14898
|
completion: result.tokensUsed.completion,
|
|
@@ -14581,6 +14921,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14581
14921
|
outputPath: result.outputPath,
|
|
14582
14922
|
data: result.data,
|
|
14583
14923
|
tablesInserted: insertResult.tablesInserted,
|
|
14924
|
+
evidenceSummary: result.evidenceSummary,
|
|
14584
14925
|
tokensUsed: result.tokensUsed
|
|
14585
14926
|
};
|
|
14586
14927
|
} else {
|
|
@@ -14607,6 +14948,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14607
14948
|
success: true,
|
|
14608
14949
|
outputPath: result.outputPath,
|
|
14609
14950
|
data: result.data,
|
|
14951
|
+
evidenceSummary: result.evidenceSummary,
|
|
14610
14952
|
tokensUsed: result.tokensUsed
|
|
14611
14953
|
};
|
|
14612
14954
|
}
|
|
@@ -14719,6 +15061,7 @@ async function runAuditedExtraction(options) {
|
|
|
14719
15061
|
outputName: updated.outputName,
|
|
14720
15062
|
tablesInserted: updated.tablesInserted,
|
|
14721
15063
|
notionPages: updated.notionPages,
|
|
15064
|
+
evidenceSummary: r.evidenceSummary,
|
|
14722
15065
|
tokensUsed: updated.tokensUsed,
|
|
14723
15066
|
auditId: updated.id,
|
|
14724
15067
|
fileHash
|
|
@@ -15846,6 +16189,7 @@ function aiRoutes(config) {
|
|
|
15846
16189
|
//#endregion
|
|
15847
16190
|
//#region src/core/data-service.ts
|
|
15848
16191
|
const FILE_REGEX = /\.json$/;
|
|
16192
|
+
const EVIDENCE_FILE_SUFFIX = ".evidence.json";
|
|
15849
16193
|
const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
|
|
15850
16194
|
const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
|
|
15851
16195
|
const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
|
|
@@ -15861,6 +16205,24 @@ function getAuditNotionStatus(record) {
|
|
|
15861
16205
|
if (record.status === "failed") return "failed";
|
|
15862
16206
|
return "not_synced";
|
|
15863
16207
|
}
|
|
16208
|
+
/**
 * Builds the evidence summary for an extraction output file.
 *
 * The evidence file sits next to the output in `extractedDir`, with the
 * `.json` suffix of `outputName` replaced by `EVIDENCE_FILE_SUFFIX`. Its
 * `coverage` object supplies the counters; each counter is coerced with
 * `Number()` and falls back to 0 when missing or not numeric.
 *
 * NOTE(review): `readFile` is a helper defined elsewhere in this file;
 * presumably it returns the parsed JSON content — confirm.
 *
 * @param {string} extractedDir - Directory holding extraction outputs.
 * @param {string} outputName - File name of the extraction output.
 * @returns {Promise<object | undefined>} The summary (evidence file path plus
 *   counters), or `undefined` when the file cannot be read or has no
 *   `coverage` object.
 */
async function readEvidenceSummary(extractedDir, outputName) {
	const evidenceName = outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX);
	const evidencePath = path.join(extractedDir, evidenceName);
	const counterNames = [
		"fieldCount",
		"evidenceCount",
		"foundCount",
		"missingCount",
		"inferredCount",
		"issueCount"
	];
	try {
		const evidence = await readFile(evidencePath);
		const coverage = evidence?.coverage;
		if (!coverage || typeof coverage !== "object") return void 0;
		const summary = { path: evidencePath };
		for (const counterName of counterNames) summary[counterName] = Number(coverage[counterName]) || 0;
		return summary;
	} catch {
		// Best effort: a missing or unreadable evidence file means "no summary".
		return void 0;
	}
}
|
|
15864
16226
|
async function getRowExtractionActions(aiexDir, tableName) {
|
|
15865
16227
|
const actions = /* @__PURE__ */ new Map();
|
|
15866
16228
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
@@ -15888,7 +16250,7 @@ async function listExtractions(config) {
|
|
|
15888
16250
|
const aiexDir = path.dirname(config.schemaPath);
|
|
15889
16251
|
const extractedDir = path.join(aiexDir, "extracted");
|
|
15890
16252
|
await fs.mkdir(extractedDir, { recursive: true });
|
|
15891
|
-
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
|
|
16253
|
+
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
|
|
15892
16254
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
15893
16255
|
const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
|
|
15894
16256
|
const records = [];
|
|
@@ -15907,6 +16269,7 @@ async function listExtractions(config) {
|
|
|
15907
16269
|
timestamp,
|
|
15908
16270
|
fileSize: stat.size,
|
|
15909
16271
|
modifiedAt: stat.mtime.toISOString(),
|
|
16272
|
+
evidenceSummary: await readEvidenceSummary(extractedDir, file),
|
|
15910
16273
|
notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
|
|
15911
16274
|
notionPages,
|
|
15912
16275
|
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
|
|
@@ -16086,6 +16449,7 @@ async function retryNotionSync(config, fileName) {
|
|
|
16086
16449
|
|
|
16087
16450
|
//#endregion
|
|
16088
16451
|
//#region src/server/routes/data.ts
|
|
16452
|
+
const JSON_FILE_SUFFIX_RE = /\.json$/;
|
|
16089
16453
|
const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
|
|
16090
16454
|
const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
|
|
16091
16455
|
const tableQuerySchema = z.object({
|
|
@@ -16138,10 +16502,22 @@ function dataRoutes(config) {
|
|
|
16138
16502
|
const filePath = path.join(extractedDir, name$1);
|
|
16139
16503
|
try {
|
|
16140
16504
|
const content = await fs.readFile(filePath, "utf-8");
|
|
16505
|
+
const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
|
|
16506
|
+
let evidenceSummary;
|
|
16507
|
+
try {
|
|
16508
|
+
const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
|
|
16509
|
+
evidenceSummary = evidence?.coverage ? {
|
|
16510
|
+
...evidence.coverage,
|
|
16511
|
+
path: evidencePath
|
|
16512
|
+
} : void 0;
|
|
16513
|
+
} catch {
|
|
16514
|
+
evidenceSummary = void 0;
|
|
16515
|
+
}
|
|
16141
16516
|
return c.json({
|
|
16142
16517
|
success: true,
|
|
16143
16518
|
content,
|
|
16144
|
-
name: name$1
|
|
16519
|
+
name: name$1,
|
|
16520
|
+
evidenceSummary
|
|
16145
16521
|
});
|
|
16146
16522
|
} catch {
|
|
16147
16523
|
return c.json({ error: t("server.extractionNotFound") }, 404);
|
|
@@ -16285,6 +16661,7 @@ function extractRoutes(config) {
|
|
|
16285
16661
|
outputName: result.outputName,
|
|
16286
16662
|
tablesInserted: result.tablesInserted,
|
|
16287
16663
|
notionPages: result.notionPages,
|
|
16664
|
+
evidenceSummary: result.evidenceSummary,
|
|
16288
16665
|
tokensUsed: result.tokensUsed,
|
|
16289
16666
|
auditId: result.auditId
|
|
16290
16667
|
}, 200);
|
|
@@ -16352,6 +16729,7 @@ function extractRoutes(config) {
|
|
|
16352
16729
|
outputName: result.outputName,
|
|
16353
16730
|
tablesInserted: result.tablesInserted,
|
|
16354
16731
|
notionPages: result.notionPages,
|
|
16732
|
+
evidenceSummary: result.evidenceSummary,
|
|
16355
16733
|
tokensUsed: result.tokensUsed,
|
|
16356
16734
|
auditId: result.auditId
|
|
16357
16735
|
}, 200);
|
|
@@ -74,7 +74,7 @@ function doctorDiagnosticsTableRows(d) {
|
|
|
74
74
|
//#endregion
|
|
75
75
|
//#region package.json
|
|
76
76
|
var name = "aiex-cli";
|
|
77
|
-
var version = "0.0.5-beta.
|
|
77
|
+
var version = "0.0.5-beta.4";
|
|
78
78
|
var description = "JSON Schema → SQLite with AI-powered data extraction";
|
|
79
79
|
var package_default = {
|
|
80
80
|
name,
|
|
@@ -228,7 +228,14 @@ const PromptConfigSchema = z.object({
|
|
|
228
228
|
systemTemplate: z.string().min(1),
|
|
229
229
|
userTemplate: z.string().min(1)
|
|
230
230
|
});
|
|
231
|
-
const ExtractionConfigSchema = z.object({
|
|
231
|
+
const ExtractionConfigSchema = z.object({
|
|
232
|
+
outputDir: z.string().min(1),
|
|
233
|
+
mode: z.enum(["pipeline"]).default("pipeline").optional(),
|
|
234
|
+
concurrency: z.number().int().min(1).optional(),
|
|
235
|
+
overlapSize: z.number().int().nonnegative().optional(),
|
|
236
|
+
preFiltering: z.boolean().optional(),
|
|
237
|
+
preFilteringLimit: z.number().int().min(1).optional()
|
|
238
|
+
});
|
|
232
239
|
const ImageOcrConfigSchema = z.object({
|
|
233
240
|
ocrFallback: z.enum([
|
|
234
241
|
"auto",
|
|
@@ -335,7 +342,10 @@ Extraction requirements:
|
|
|
335
342
|
userTemplate: `Please extract data from the following text:
|
|
336
343
|
{text}`
|
|
337
344
|
};
|
|
338
|
-
const DEFAULT_EXTRACTION_CONFIG = {
|
|
345
|
+
const DEFAULT_EXTRACTION_CONFIG = {
|
|
346
|
+
outputDir: ".aiex/extracted",
|
|
347
|
+
mode: "pipeline"
|
|
348
|
+
};
|
|
339
349
|
const DEFAULT_IMAGE_OCR_CONFIG = {
|
|
340
350
|
ocrFallback: "auto",
|
|
341
351
|
ocrLanguages: "en-US, zh-Hans",
|
|
@@ -567,6 +577,13 @@ const en = {
|
|
|
567
577
|
extractFail: "Extraction failed",
|
|
568
578
|
extractComplete: "Extraction complete",
|
|
569
579
|
extractRetry: "API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
|
|
580
|
+
chunking: "Input text length ({{length}} chars) exceeds limit ({{limit}} chars). Splitting into chunks...",
|
|
581
|
+
chunksCount: "Split into {{count}} chunk(s).",
|
|
582
|
+
preFiltering: "Hybrid pre-filtering: selected {{filtered}} out of {{original}} chunks based on schema relevance.",
|
|
583
|
+
extractingChunk: "Extracting chunk {{current}}/{{total}}...",
|
|
584
|
+
extractRetryChunk: "Chunk {{current}}/{{total}} API responded with {{code}}, retrying in {{delay}}s ({{attempt}}/{{max}})",
|
|
585
|
+
extractFailChunk: "Extraction failed for chunk {{current}}/{{total}}",
|
|
586
|
+
validationFail: "Merged data validation failed",
|
|
570
587
|
resultSaved: "Result saved: {{path}}",
|
|
571
588
|
tokenUsage: "Token usage: prompt={{prompt}}, completion={{completion}}, total={{total}}",
|
|
572
589
|
insertingDb: "Inserting into database...",
|
|
@@ -956,7 +973,7 @@ async function initI18n(lng) {
|
|
|
956
973
|
fallbackLng: "en",
|
|
957
974
|
resources: {
|
|
958
975
|
"en": { translation: en },
|
|
959
|
-
"zh-CN": { translation: await import("./zh-CN-
|
|
976
|
+
"zh-CN": { translation: await import("./zh-CN-CyL-61Ow.mjs").then((m) => m.zhCN) }
|
|
960
977
|
},
|
|
961
978
|
interpolation: { escapeValue: false },
|
|
962
979
|
returnNull: false
|