aiex-cli 0.0.5-beta.3 → 0.0.5-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -16
- package/dist/cli.mjs +340 -355
- package/dist/{doctor-collector-CQPDBVTw.mjs → doctor-collector-NTNBFeBw.mjs} +12 -6
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-BlyTFIIy.js +272 -0
- package/dist/web/assets/ExtractionViewer-BhhWrBs2.js +1 -0
- package/dist/web/assets/{index-BWm_fhNt.js → index-CKV2X6sS.js} +2 -2
- package/dist/web/assets/index-Csdgio76.css +2 -0
- package/dist/web/index.html +2 -2
- package/dist/{zh-CN-CKxdpj8c.mjs → zh-CN-Ca-Dv775.mjs} +2 -3
- package/package.json +3 -1
- package/dist/web/assets/AISettings-DoDVYWfb.js +0 -272
- package/dist/web/assets/ExtractionViewer-DqIrBGNK.js +0 -1
- package/dist/web/assets/index-CvY9TGny.css +0 -2
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-
|
|
1
|
+
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-NTNBFeBw.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -17,13 +17,15 @@ import Database from "better-sqlite3";
|
|
|
17
17
|
import pc from "picocolors";
|
|
18
18
|
import { Buffer } from "node:buffer";
|
|
19
19
|
import * as XLSX from "xlsx";
|
|
20
|
+
import { getEncoding } from "js-tiktoken";
|
|
20
21
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
21
|
-
import { APICallError, Output, generateText, jsonSchema
|
|
22
|
+
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
22
23
|
import pRetry from "p-retry";
|
|
23
24
|
import mime from "mime";
|
|
24
25
|
import { jsonrepair } from "jsonrepair";
|
|
25
26
|
import { LangfuseSpanProcessor } from "@langfuse/otel";
|
|
26
27
|
import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
|
|
28
|
+
import { marked } from "marked";
|
|
27
29
|
import crypto from "node:crypto";
|
|
28
30
|
import { Client, extractNotionId } from "@notionhq/client";
|
|
29
31
|
import { execa } from "execa";
|
|
@@ -13559,279 +13561,170 @@ function mergeExtractionResults(schema, results) {
|
|
|
13559
13561
|
|
|
13560
13562
|
//#endregion
|
|
13561
13563
|
//#region src/core/ai-extraction/text-splitter.ts
|
|
13562
|
-
const
|
|
13564
|
+
const encoding$1 = getEncoding("cl100k_base");
|
|
13565
|
+
/**
 * Counts how many tokens `text$1` encodes to with the shared `encoding$1` encoder.
 *
 * Both special-token lists are passed as empty arrays so that special-token
 * markers appearing literally in a document (for example `<|endoftext|>`) are
 * encoded as ordinary text. With the encoder's default of disallowing every
 * special token, such input makes `encode` throw, which would abort chunking
 * for any document that merely mentions one of those markers.
 *
 * @param {string} text$1 - Text to measure.
 * @returns {number} Number of tokens produced by the encoder.
 */
function countTokens(text$1) {
	return encoding$1.encode(text$1, [], []).length;
}
|
|
13568
|
+
/**
 * Renders the heading breadcrumb that is placed in front of a chunk's text.
 *
 * @param {Array<string|undefined>} headings - Heading titles ordered by depth; falsy entries are skipped.
 * @returns {string} A Markdown blockquote line followed by a blank line, or "" when no heading is set.
 */
function formatHeadingContext(headings) {
	const titles = headings.filter((title) => !!title);
	return titles.length > 0 ? `> **[Context]** Belong to: ${titles.join(" > ")}\n\n` : "";
}
|
|
13573
|
+
/**
 * Maps the first four heading levels onto a chunk metadata record.
 *
 * @param {Array<string|undefined>} headings - Heading titles indexed by depth - 1.
 * @returns {{h1: (string|undefined), h2: (string|undefined), h3: (string|undefined), h4: (string|undefined)}}
 *   One key per level; a missing or falsy title becomes `undefined`.
 */
function getMetadata(headings) {
	const titleAt = (level) => headings[level - 1] || void 0;
	return {
		h1: titleAt(1),
		h2: titleAt(2),
		h3: titleAt(3),
		h4: titleAt(4)
	};
}
|
|
13563
13581
|
/**
 * Splits text into pieces that each fit within `maxTokens`, trying the given
 * separators from coarsest to finest and keeping every separator attached to
 * the piece it terminates, so that joining the pieces reproduces the input.
 *
 * A part that is still too large is split again, together with its trailing
 * separator, using the remaining separators. Once no separator is left the
 * text is packed code point by code point.
 *
 * @param {string} text$1 - Text to split.
 * @param {number} maxTokens - Upper bound, as measured by `countTokens`, for each piece.
 * @param {string[]} [separators] - Separators to try, in order of preference.
 * @returns {string[]} The pieces in document order. Text that already fits is
 *   returned as a single piece; otherwise no empty piece is produced.
 */
function splitTextRecursively(text$1, maxTokens, separators = [
	"\n\n",
	"\n",
	"。",
	". ",
	" "
]) {
	if (countTokens(text$1) <= maxTokens) return [text$1];
	if (separators.length === 0) {
		const pieces = [];
		let current = "";
		for (const char of text$1) {
			// Never emit an empty piece: a code point that alone exceeds the
			// limit cannot be split further and becomes a piece of its own.
			if (current !== "" && countTokens(current + char) > maxTokens) {
				pieces.push(current);
				current = char;
			} else current += char;
		}
		if (current !== "") pieces.push(current);
		return pieces;
	}
	const [separator, ...nextSeparators] = separators;
	const parts = text$1.split(separator);
	const result = [];
	let pending = "";
	let pendingTokens = 0;
	const flushPending = () => {
		if (pending === "") return;
		result.push(pending);
		pending = "";
		pendingTokens = 0;
	};
	for (let i = 0; i < parts.length; i++) {
		const itemText = i < parts.length - 1 ? parts[i] + separator : parts[i];
		// `split` yields an empty last part when the text ends with the separator.
		if (itemText === "") continue;
		const itemTokens = countTokens(itemText);
		if (itemTokens > maxTokens) {
			flushPending();
			// Recurse on the part *including* its separator so the separator is
			// budgeted too, instead of being appended to an already full piece.
			for (const piece of splitTextRecursively(itemText, maxTokens, nextSeparators)) result.push(piece);
			continue;
		}
		if (pendingTokens + itemTokens > maxTokens) flushPending();
		pending += itemText;
		pendingTokens += itemTokens;
	}
	flushPending();
	return result;
}
|
|
13636
|
+
/**
 * Splits a Markdown document into chunks that respect heading boundaries and a
 * token budget.
 *
 * The text is lexed into block tokens with `marked.lexer`. Blocks are packed
 * into a pending chunk until adding another one (plus the heading breadcrumb
 * that will later be prefixed) would exceed `maxTokens`. A heading always
 * starts a new chunk. When a chunk is emitted for size reasons, whole trailing
 * blocks totalling at most `overlapTokens` are carried into the next chunk.
 *
 * Tables and code blocks are never split: one that exceeds the budget is
 * emitted as its own (oversized) chunk. A list larger than `maxTokens` is
 * processed item by item, and any other oversized block is split with
 * `splitTextRecursively`.
 *
 * @param {string} text$1 - Markdown source.
 * @param {number} [maxTokens=8000] - Token budget per chunk, as measured by `countTokens`.
 * @param {number} [overlapTokens=1000] - Maximum tokens repeated from the previous chunk; `<= 0` disables overlap.
 * @returns {Array<{pageContent: string, metadata: object}>} Chunks in document
 *   order; `metadata` is `getMetadata` applied to the headings of the chunk's first block.
 */
function splitMarkdown(text$1, maxTokens = 8e3, overlapTokens = 1e3) {
	const tokens = marked.lexer(text$1);
	const chunks = [];
	let currentHeadings = [];
	let currentChunkList = [];
	let accumulatedTokens = 0;
	// True once the pending list holds a block that has not been emitted yet.
	// A list made only of carried-over overlap must not become a chunk of its
	// own, because that chunk would merely repeat content already emitted.
	let hasUnflushedContent = false;

	const resetCurrentChunk = () => {
		currentChunkList = [];
		accumulatedTokens = 0;
		hasUnflushedContent = false;
	};

	/**
	 * Emits the pending blocks as one chunk.
	 * @param {boolean} discardOverlap - When true (heading change, end of
	 *   input, oversized atomic block) nothing is carried into the next chunk.
	 */
	const flushCurrentChunk = (discardOverlap = false) => {
		if (currentChunkList.length === 0) return;
		if (!hasUnflushedContent) {
			if (discardOverlap) resetCurrentChunk();
			return;
		}
		chunks.push({
			pageContent: currentChunkList.map((item) => item.text).join(""),
			metadata: getMetadata(currentChunkList[0].headings)
		});
		if (discardOverlap || overlapTokens <= 0) {
			resetCurrentChunk();
			return;
		}
		// Carry whole trailing blocks while they fit into the overlap budget.
		// A block larger than the budget is not carried: doing so would repeat
		// it in full and push the next chunk far past `maxTokens`.
		const overlapItems = [];
		let carriedTokens = 0;
		for (let i = currentChunkList.length - 1; i >= 0; i--) {
			const item = currentChunkList[i];
			const itemTokens = countTokens(item.text);
			if (carriedTokens + itemTokens > overlapTokens) break;
			overlapItems.unshift(item);
			carriedTokens += itemTokens;
		}
		currentChunkList = overlapItems;
		accumulatedTokens = carriedTokens;
		hasUnflushedContent = false;
	};

	// Adds one block to the pending chunk, emitting the pending chunk first
	// when the block would not fit next to it.
	const appendItem = (text, headings, tokenCount, contextTokens) => {
		const overflows = () => currentChunkList.length > 0 && accumulatedTokens + tokenCount + contextTokens > maxTokens;
		if (overflows()) {
			flushCurrentChunk(false);
			// Even the carried overlap may leave too little room; drop it then.
			if (overflows()) resetCurrentChunk();
		}
		currentChunkList.push({
			text,
			headings: [...headings]
		});
		accumulatedTokens += tokenCount;
		hasUnflushedContent = true;
	};

	const processTextBlock = (blockText, headings, isAtomic = false) => {
		const blockTokens = countTokens(blockText);
		const contextTokens = countTokens(formatHeadingContext(headings));
		// Head-room for the difference between summed per-block counts and the
		// token count of the joined chunk text.
		const safetyBuffer = Math.min(100, Math.max(2, Math.floor(maxTokens * .1)));
		const budgetLimit = Math.max(5, maxTokens - contextTokens - safetyBuffer);
		if (blockTokens <= budgetLimit) {
			appendItem(blockText, headings, blockTokens, contextTokens);
			return;
		}
		flushCurrentChunk(false);
		if (isAtomic) {
			// Keep the block intact: emit it (after any carried overlap) as a
			// chunk of its own and carry nothing forward from it.
			currentChunkList.push({
				text: blockText,
				headings: [...headings]
			});
			accumulatedTokens += blockTokens;
			hasUnflushedContent = true;
			flushCurrentChunk(true);
			return;
		}
		for (const sub of splitTextRecursively(blockText, budgetLimit)) appendItem(sub, headings, countTokens(sub), contextTokens);
	};

	for (const token of tokens) {
		if (token.type === "space") {
			// Blank lines stay attached to the block they follow.
			if (currentChunkList.length > 0) {
				currentChunkList[currentChunkList.length - 1].text += token.raw;
				accumulatedTokens += countTokens(token.raw);
			}
			continue;
		}
		if (token.type === "heading") {
			flushCurrentChunk(true);
			const depth = token.depth;
			const title = token.text.trim();
			// Entering a heading invalidates every deeper level.
			currentHeadings = currentHeadings.slice(0, depth - 1);
			currentHeadings[depth - 1] = title;
		}
		const rawText = token.raw;
		if (token.type === "list" && countTokens(rawText) > maxTokens) {
			for (const item of token.items) processTextBlock(item.raw, currentHeadings);
		} else {
			const isAtomic = token.type === "table" || token.type === "code";
			processTextBlock(rawText, currentHeadings, isAtomic);
		}
	}
	flushCurrentChunk(true);
	return chunks;
}
|
|
13837
13730
|
|
|
@@ -14719,7 +14612,6 @@ async function processOneFile(aiexDir, config, aiConfig, schemaName, filePath, m
|
|
|
14719
14612
|
modelOverride,
|
|
14720
14613
|
insert: options?.insert,
|
|
14721
14614
|
force: options?.force,
|
|
14722
|
-
agent: options?.agent,
|
|
14723
14615
|
quiet: false
|
|
14724
14616
|
});
|
|
14725
14617
|
if (result.success) {
|
|
@@ -14759,8 +14651,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14759
14651
|
})}`);
|
|
14760
14652
|
if (await processOneFile(aiexDir, config, aiConfig, schemaName, file, modelOverride, {
|
|
14761
14653
|
insert: options?.insert,
|
|
14762
|
-
force: options?.force
|
|
14763
|
-
agent: options?.agent
|
|
14654
|
+
force: options?.force
|
|
14764
14655
|
})) successCount++;
|
|
14765
14656
|
else failCount++;
|
|
14766
14657
|
}
|
|
@@ -14778,7 +14669,44 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14778
14669
|
|
|
14779
14670
|
//#endregion
|
|
14780
14671
|
//#region src/core/extract-runner.ts
|
|
14672
|
+
const encoding = getEncoding("cl100k_base");
|
|
14781
14673
|
const JSON_EXT_RE$1 = /\.json$/;
|
|
14674
|
+
/**
 * Maps `fn` over `items` with at most `concurrency` calls in flight.
 *
 * Workers pull the next unprocessed index from a shared counter, so results
 * land at the index of the item that produced them regardless of completion
 * order.
 *
 * @param {number} concurrency - Desired number of parallel workers. Values
 *   below 1 (including `0`, negatives and `NaN`) are treated as 1 so that the
 *   items are still processed instead of being silently skipped.
 * @param {Array} items - Inputs to process.
 * @param {(item: any, index: number) => Promise<any>} fn - Async mapper.
 * @returns {Promise<Array>} Results in input order. Rejects with the first
 *   error thrown by `fn`.
 */
async function limitConcurrency(concurrency, items, fn) {
	const results = Array.from({ length: items.length });
	const requested = Math.floor(Number(concurrency));
	const workerCount = Math.min(items.length, requested >= 1 ? requested : 1);
	let nextIndex = 0;
	const worker = async () => {
		while (nextIndex < items.length) {
			// Claim the index synchronously, before the first await, so that no
			// two workers ever process the same item.
			const currentIndex = nextIndex++;
			results[currentIndex] = await fn(items[currentIndex], currentIndex);
		}
	};
	await Promise.all(Array.from({ length: workerCount }, () => worker()));
	return results;
}
|
|
14687
|
+
/**
 * Collects lower-cased search keywords from a JSON-schema-like definition.
 *
 * For every property (recursing into `object` properties and into arrays
 * whose `items` are objects) the following are collected:
 *  - the full property name,
 *  - its camelCase / delimiter-separated parts longer than one character,
 *  - its `title`,
 *  - the words of its `description` longer than two characters.
 *
 * Empty or whitespace-only names and titles are skipped. An empty keyword
 * matches at every index of any text, so an occurrence-counting `indexOf`
 * loop that advances by the keyword's length would never make progress.
 *
 * @param {{properties?: Record<string, unknown>}} schema - Schema whose `properties` are walked.
 * @returns {string[]} Unique keywords in first-seen order.
 */
function getSchemaKeywords(schema) {
	const keywords = new Set();
	const addKeyword = (value) => {
		const keyword = value.trim().toLowerCase();
		if (keyword !== "") keywords.add(keyword);
	};
	function walk(properties) {
		if (!properties) return;
		for (const [propName, prop] of Object.entries(properties)) {
			addKeyword(propName);
			// Insert a space at each camelCase boundary, then split on common delimiters.
			const parts = propName.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/g);
			for (const part of parts) if (part.length > 1) addKeyword(part);
			if (!prop || typeof prop !== "object") continue;
			if (typeof prop.title === "string") addKeyword(prop.title);
			if (typeof prop.description === "string") {
				const words = prop.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
				for (const word of words) if (word.length > 2) keywords.add(word);
			}
			if (prop.type === "object") walk(prop.properties);
			if (prop.type === "array" && prop.items?.type === "object") walk(prop.items.properties);
		}
	}
	walk(schema.properties);
	return Array.from(keywords);
}
|
|
14782
14710
|
async function ensureDatabaseReady(dbPath, schema) {
|
|
14783
14711
|
try {
|
|
14784
14712
|
await fs.access(dbPath);
|
|
@@ -14850,44 +14778,52 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14850
14778
|
}
|
|
14851
14779
|
const s = spinner();
|
|
14852
14780
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
14853
|
-
const
|
|
14781
|
+
const maxTokens = aiConfig.extraction?.maxTokens ?? 8e3;
|
|
14782
|
+
const overlapTokens = aiConfig.extraction?.overlapSize ?? 1e3;
|
|
14854
14783
|
let result;
|
|
14855
|
-
|
|
14856
|
-
|
|
14857
|
-
|
|
14858
|
-
|
|
14859
|
-
|
|
14860
|
-
|
|
14861
|
-
|
|
14862
|
-
|
|
14863
|
-
|
|
14864
|
-
|
|
14865
|
-
|
|
14866
|
-
|
|
14867
|
-
|
|
14784
|
+
const totalTokens = text$1 ? encoding.encode(text$1).length : 0;
|
|
14785
|
+
if (text$1 && totalTokens > maxTokens) {
|
|
14786
|
+
if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
14787
|
+
length: totalTokens,
|
|
14788
|
+
limit: maxTokens
|
|
14789
|
+
}));
|
|
14790
|
+
const finalDocs = splitMarkdown(text$1, maxTokens, overlapTokens);
|
|
14791
|
+
if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
|
|
14792
|
+
let processedDocs = finalDocs;
|
|
14793
|
+
if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
|
|
14794
|
+
const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
|
|
14795
|
+
const keywords = getSchemaKeywords(schemaLoad.schema);
|
|
14796
|
+
const scoredChunks = finalDocs.map((doc, idx) => {
|
|
14797
|
+
if (idx === 0) return {
|
|
14798
|
+
index: idx,
|
|
14799
|
+
score: Number.POSITIVE_INFINITY
|
|
14800
|
+
};
|
|
14801
|
+
let score = 0;
|
|
14802
|
+
const docTextLower = doc.pageContent.toLowerCase();
|
|
14803
|
+
for (const kw of keywords) {
|
|
14804
|
+
let pos = docTextLower.indexOf(kw);
|
|
14805
|
+
while (pos !== -1) {
|
|
14806
|
+
score++;
|
|
14807
|
+
pos = docTextLower.indexOf(kw, pos + kw.length);
|
|
14868
14808
|
}
|
|
14869
|
-
if (step.toolCalls && step.toolCalls.length > 0) for (const call of step.toolCalls) consola.info(`[Agent Action] Calling tool: ${pc.green(call.toolName)}`);
|
|
14870
14809
|
}
|
|
14810
|
+
return {
|
|
14811
|
+
index: idx,
|
|
14812
|
+
score
|
|
14813
|
+
};
|
|
14814
|
+
}).slice(1).sort((a, b) => b.score - a.score);
|
|
14815
|
+
const selectedIndices = new Set([0]);
|
|
14816
|
+
let keptCount = 0;
|
|
14817
|
+
for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
|
|
14818
|
+
selectedIndices.add(sc.index);
|
|
14819
|
+
keptCount++;
|
|
14871
14820
|
}
|
|
14872
|
-
|
|
14873
|
-
|
|
14874
|
-
|
|
14875
|
-
|
|
14876
|
-
|
|
14877
|
-
}
|
|
14878
|
-
return {
|
|
14879
|
-
success: false,
|
|
14880
|
-
error: agentResult.error
|
|
14881
|
-
};
|
|
14821
|
+
processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
|
|
14822
|
+
if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
|
|
14823
|
+
original: finalDocs.length,
|
|
14824
|
+
filtered: processedDocs.length
|
|
14825
|
+
}));
|
|
14882
14826
|
}
|
|
14883
|
-
result = agentResult;
|
|
14884
|
-
} else if (text$1 && text$1.length > CHUNK_LIMIT) {
|
|
14885
|
-
if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
14886
|
-
length: text$1.length,
|
|
14887
|
-
limit: CHUNK_LIMIT
|
|
14888
|
-
}));
|
|
14889
|
-
const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT);
|
|
14890
|
-
if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
|
|
14891
14827
|
const chunkResults = [];
|
|
14892
14828
|
const accumulatedTokens = {
|
|
14893
14829
|
prompt: 0,
|
|
@@ -14896,53 +14832,68 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14896
14832
|
};
|
|
14897
14833
|
let success = true;
|
|
14898
14834
|
let errorMsg = "";
|
|
14899
|
-
|
|
14900
|
-
|
|
14901
|
-
|
|
14902
|
-
|
|
14903
|
-
|
|
14904
|
-
|
|
14905
|
-
|
|
14906
|
-
|
|
14907
|
-
|
|
14908
|
-
if (doc.metadata.h2) headings.push(doc.metadata.h2);
|
|
14909
|
-
if (doc.metadata.h3) headings.push(doc.metadata.h3);
|
|
14910
|
-
if (doc.metadata.h4) headings.push(doc.metadata.h4);
|
|
14911
|
-
}
|
|
14912
|
-
let chunkText = doc.pageContent;
|
|
14913
|
-
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
14914
|
-
const chunkResult = await extractStructuredData({
|
|
14915
|
-
config: aiConfig,
|
|
14916
|
-
schema: schemaLoad.schema,
|
|
14917
|
-
text: chunkText,
|
|
14918
|
-
aiexDir,
|
|
14919
|
-
modelOverride,
|
|
14920
|
-
onRetry(info) {
|
|
14921
|
-
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
14922
|
-
current: i + 1,
|
|
14923
|
-
total: finalDocs.length,
|
|
14924
|
-
code: info.statusCode,
|
|
14925
|
-
delay: info.delayMs / 1e3,
|
|
14926
|
-
attempt: info.attempt,
|
|
14927
|
-
max: info.maxRetries
|
|
14928
|
-
}));
|
|
14835
|
+
const extractionTasks = processedDocs.map((doc, i) => {
|
|
14836
|
+
return async () => {
|
|
14837
|
+
if (!success) return;
|
|
14838
|
+
const headings = [];
|
|
14839
|
+
if (doc.metadata) {
|
|
14840
|
+
if (doc.metadata.h1) headings.push(doc.metadata.h1);
|
|
14841
|
+
if (doc.metadata.h2) headings.push(doc.metadata.h2);
|
|
14842
|
+
if (doc.metadata.h3) headings.push(doc.metadata.h3);
|
|
14843
|
+
if (doc.metadata.h4) headings.push(doc.metadata.h4);
|
|
14929
14844
|
}
|
|
14930
|
-
|
|
14931
|
-
|
|
14932
|
-
|
|
14933
|
-
|
|
14934
|
-
|
|
14935
|
-
|
|
14936
|
-
|
|
14845
|
+
let chunkText = doc.pageContent;
|
|
14846
|
+
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
14847
|
+
const chunkResult = await extractStructuredData({
|
|
14848
|
+
config: aiConfig,
|
|
14849
|
+
schema: schemaLoad.schema,
|
|
14850
|
+
text: chunkText,
|
|
14851
|
+
aiexDir,
|
|
14852
|
+
modelOverride,
|
|
14853
|
+
onRetry(info) {
|
|
14854
|
+
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
14855
|
+
current: i + 1,
|
|
14856
|
+
total: processedDocs.length,
|
|
14857
|
+
code: info.statusCode,
|
|
14858
|
+
delay: info.delayMs / 1e3,
|
|
14859
|
+
attempt: info.attempt,
|
|
14860
|
+
max: info.maxRetries
|
|
14861
|
+
}));
|
|
14862
|
+
}
|
|
14863
|
+
});
|
|
14864
|
+
if (!chunkResult.success) {
|
|
14865
|
+
success = false;
|
|
14866
|
+
errorMsg = chunkResult.error || t("common.unknownError");
|
|
14867
|
+
if (!options?.quiet) {
|
|
14868
|
+
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
14869
|
+
consola.error(errorMsg);
|
|
14870
|
+
}
|
|
14871
|
+
return;
|
|
14937
14872
|
}
|
|
14938
|
-
|
|
14939
|
-
|
|
14940
|
-
|
|
14941
|
-
|
|
14942
|
-
|
|
14943
|
-
|
|
14944
|
-
|
|
14945
|
-
|
|
14873
|
+
if (chunkResult.data) chunkResults.push(chunkResult.data);
|
|
14874
|
+
if (chunkResult.tokensUsed) {
|
|
14875
|
+
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
14876
|
+
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
14877
|
+
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
14878
|
+
}
|
|
14879
|
+
};
|
|
14880
|
+
});
|
|
14881
|
+
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
14882
|
+
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
14883
|
+
current: 1,
|
|
14884
|
+
total: processedDocs.length
|
|
14885
|
+
}));
|
|
14886
|
+
try {
|
|
14887
|
+
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
14888
|
+
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
14889
|
+
current: idx + 1,
|
|
14890
|
+
total: processedDocs.length
|
|
14891
|
+
}));
|
|
14892
|
+
await task();
|
|
14893
|
+
});
|
|
14894
|
+
} catch (e) {
|
|
14895
|
+
success = false;
|
|
14896
|
+
errorMsg = e instanceof Error ? e.message : String(e);
|
|
14946
14897
|
}
|
|
14947
14898
|
if (!success) return {
|
|
14948
14899
|
success: false,
|
|
@@ -15001,6 +14952,11 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15001
14952
|
}
|
|
15002
14953
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
15003
14954
|
if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
|
|
14955
|
+
if (result.evidenceSummary && !options?.quiet) {
|
|
14956
|
+
const summary = result.evidenceSummary;
|
|
14957
|
+
const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
|
|
14958
|
+
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
|
|
14959
|
+
}
|
|
15004
14960
|
if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
|
|
15005
14961
|
prompt: result.tokensUsed.prompt,
|
|
15006
14962
|
completion: result.tokensUsed.completion,
|
|
@@ -15029,6 +14985,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15029
14985
|
outputPath: result.outputPath,
|
|
15030
14986
|
data: result.data,
|
|
15031
14987
|
tablesInserted: insertResult.tablesInserted,
|
|
14988
|
+
evidenceSummary: result.evidenceSummary,
|
|
15032
14989
|
tokensUsed: result.tokensUsed
|
|
15033
14990
|
};
|
|
15034
14991
|
} else {
|
|
@@ -15055,11 +15012,12 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15055
15012
|
success: true,
|
|
15056
15013
|
outputPath: result.outputPath,
|
|
15057
15014
|
data: result.data,
|
|
15015
|
+
evidenceSummary: result.evidenceSummary,
|
|
15058
15016
|
tokensUsed: result.tokensUsed
|
|
15059
15017
|
};
|
|
15060
15018
|
}
|
|
15061
15019
|
async function runAuditedExtraction(options) {
|
|
15062
|
-
const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false
|
|
15020
|
+
const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
|
|
15063
15021
|
let fileHash;
|
|
15064
15022
|
let isPlainTextFile = false;
|
|
15065
15023
|
if (source.type === "file") {
|
|
@@ -15127,8 +15085,7 @@ async function runAuditedExtraction(options) {
|
|
|
15127
15085
|
} else text$1 = source.text;
|
|
15128
15086
|
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
|
|
15129
15087
|
quiet,
|
|
15130
|
-
insert
|
|
15131
|
-
agent
|
|
15088
|
+
insert
|
|
15132
15089
|
});
|
|
15133
15090
|
if (r.success) {
|
|
15134
15091
|
let notionPages;
|
|
@@ -15168,6 +15125,7 @@ async function runAuditedExtraction(options) {
|
|
|
15168
15125
|
outputName: updated.outputName,
|
|
15169
15126
|
tablesInserted: updated.tablesInserted,
|
|
15170
15127
|
notionPages: updated.notionPages,
|
|
15128
|
+
evidenceSummary: r.evidenceSummary,
|
|
15171
15129
|
tokensUsed: updated.tokensUsed,
|
|
15172
15130
|
auditId: updated.id,
|
|
15173
15131
|
fileHash
|
|
@@ -15587,12 +15545,6 @@ const extractCommand = defineCommand({
|
|
|
15587
15545
|
type: "boolean",
|
|
15588
15546
|
description: t("command.extract.args.force"),
|
|
15589
15547
|
default: false
|
|
15590
|
-
},
|
|
15591
|
-
agent: {
|
|
15592
|
-
type: "boolean",
|
|
15593
|
-
alias: "a",
|
|
15594
|
-
description: "Enable ReAct agent extraction mode",
|
|
15595
|
-
default: false
|
|
15596
15548
|
}
|
|
15597
15549
|
},
|
|
15598
15550
|
async run({ args, rawArgs }) {
|
|
@@ -15620,8 +15572,7 @@ const extractCommand = defineCommand({
|
|
|
15620
15572
|
}
|
|
15621
15573
|
const result$1 = await runBatchExtraction(aiexDir, config, aiConfig, args.schema, args.dir, args.glob, modelOverride, {
|
|
15622
15574
|
insert: !args.noInsert,
|
|
15623
|
-
force: args.force
|
|
15624
|
-
agent: args.agent
|
|
15575
|
+
force: args.force
|
|
15625
15576
|
});
|
|
15626
15577
|
if (!result$1.ok) {
|
|
15627
15578
|
failCommand(result$1.error);
|
|
@@ -15652,8 +15603,7 @@ const extractCommand = defineCommand({
|
|
|
15652
15603
|
modelOverride,
|
|
15653
15604
|
insert: !args.noInsert,
|
|
15654
15605
|
force: args.force,
|
|
15655
|
-
quiet: false
|
|
15656
|
-
agent: args.agent
|
|
15606
|
+
quiet: false
|
|
15657
15607
|
});
|
|
15658
15608
|
if (!result.success) {
|
|
15659
15609
|
failCommand(result.error);
|
|
@@ -16303,6 +16253,7 @@ function aiRoutes(config) {
|
|
|
16303
16253
|
//#endregion
|
|
16304
16254
|
//#region src/core/data-service.ts
|
|
16305
16255
|
const FILE_REGEX = /\.json$/;
|
|
16256
|
+
const EVIDENCE_FILE_SUFFIX = ".evidence.json";
|
|
16306
16257
|
const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
|
|
16307
16258
|
const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
|
|
16308
16259
|
const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
|
|
@@ -16318,6 +16269,24 @@ function getAuditNotionStatus(record) {
|
|
|
16318
16269
|
if (record.status === "failed") return "failed";
|
|
16319
16270
|
return "not_synced";
|
|
16320
16271
|
}
|
|
16272
|
+
async function readEvidenceSummary(extractedDir, outputName) {
|
|
16273
|
+
const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
|
|
16274
|
+
try {
|
|
16275
|
+
const coverage = (await readFile(evidencePath))?.coverage;
|
|
16276
|
+
if (!coverage || typeof coverage !== "object") return void 0;
|
|
16277
|
+
return {
|
|
16278
|
+
path: evidencePath,
|
|
16279
|
+
fieldCount: Number(coverage.fieldCount) || 0,
|
|
16280
|
+
evidenceCount: Number(coverage.evidenceCount) || 0,
|
|
16281
|
+
foundCount: Number(coverage.foundCount) || 0,
|
|
16282
|
+
missingCount: Number(coverage.missingCount) || 0,
|
|
16283
|
+
inferredCount: Number(coverage.inferredCount) || 0,
|
|
16284
|
+
issueCount: Number(coverage.issueCount) || 0
|
|
16285
|
+
};
|
|
16286
|
+
} catch {
|
|
16287
|
+
return;
|
|
16288
|
+
}
|
|
16289
|
+
}
|
|
16321
16290
|
async function getRowExtractionActions(aiexDir, tableName) {
|
|
16322
16291
|
const actions = /* @__PURE__ */ new Map();
|
|
16323
16292
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
@@ -16345,7 +16314,7 @@ async function listExtractions(config) {
|
|
|
16345
16314
|
const aiexDir = path.dirname(config.schemaPath);
|
|
16346
16315
|
const extractedDir = path.join(aiexDir, "extracted");
|
|
16347
16316
|
await fs.mkdir(extractedDir, { recursive: true });
|
|
16348
|
-
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
|
|
16317
|
+
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
|
|
16349
16318
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
16350
16319
|
const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
|
|
16351
16320
|
const records = [];
|
|
@@ -16364,6 +16333,7 @@ async function listExtractions(config) {
|
|
|
16364
16333
|
timestamp,
|
|
16365
16334
|
fileSize: stat.size,
|
|
16366
16335
|
modifiedAt: stat.mtime.toISOString(),
|
|
16336
|
+
evidenceSummary: await readEvidenceSummary(extractedDir, file),
|
|
16367
16337
|
notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
|
|
16368
16338
|
notionPages,
|
|
16369
16339
|
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
|
|
@@ -16543,6 +16513,7 @@ async function retryNotionSync(config, fileName) {
|
|
|
16543
16513
|
|
|
16544
16514
|
//#endregion
|
|
16545
16515
|
//#region src/server/routes/data.ts
|
|
16516
|
+
const JSON_FILE_SUFFIX_RE = /\.json$/;
|
|
16546
16517
|
const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
|
|
16547
16518
|
const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
|
|
16548
16519
|
const tableQuerySchema = z.object({
|
|
@@ -16595,10 +16566,22 @@ function dataRoutes(config) {
|
|
|
16595
16566
|
const filePath = path.join(extractedDir, name$1);
|
|
16596
16567
|
try {
|
|
16597
16568
|
const content = await fs.readFile(filePath, "utf-8");
|
|
16569
|
+
const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
|
|
16570
|
+
let evidenceSummary;
|
|
16571
|
+
try {
|
|
16572
|
+
const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
|
|
16573
|
+
evidenceSummary = evidence?.coverage ? {
|
|
16574
|
+
...evidence.coverage,
|
|
16575
|
+
path: evidencePath
|
|
16576
|
+
} : void 0;
|
|
16577
|
+
} catch {
|
|
16578
|
+
evidenceSummary = void 0;
|
|
16579
|
+
}
|
|
16598
16580
|
return c.json({
|
|
16599
16581
|
success: true,
|
|
16600
16582
|
content,
|
|
16601
|
-
name: name$1
|
|
16583
|
+
name: name$1,
|
|
16584
|
+
evidenceSummary
|
|
16602
16585
|
});
|
|
16603
16586
|
} catch {
|
|
16604
16587
|
return c.json({ error: t("server.extractionNotFound") }, 404);
|
|
@@ -16742,6 +16725,7 @@ function extractRoutes(config) {
|
|
|
16742
16725
|
outputName: result.outputName,
|
|
16743
16726
|
tablesInserted: result.tablesInserted,
|
|
16744
16727
|
notionPages: result.notionPages,
|
|
16728
|
+
evidenceSummary: result.evidenceSummary,
|
|
16745
16729
|
tokensUsed: result.tokensUsed,
|
|
16746
16730
|
auditId: result.auditId
|
|
16747
16731
|
}, 200);
|
|
@@ -16809,6 +16793,7 @@ function extractRoutes(config) {
|
|
|
16809
16793
|
outputName: result.outputName,
|
|
16810
16794
|
tablesInserted: result.tablesInserted,
|
|
16811
16795
|
notionPages: result.notionPages,
|
|
16796
|
+
evidenceSummary: result.evidenceSummary,
|
|
16812
16797
|
tokensUsed: result.tokensUsed,
|
|
16813
16798
|
auditId: result.auditId
|
|
16814
16799
|
}, 200);
|