aiex-cli 0.0.5-beta.3 → 0.0.5-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -16
- package/dist/cli.mjs +242 -321
- package/dist/{doctor-collector-CQPDBVTw.mjs → doctor-collector-Cv7RArla.mjs} +8 -5
- package/dist/index.mjs +1 -1
- package/dist/web/assets/AISettings-BlyTFIIy.js +272 -0
- package/dist/web/assets/ExtractionViewer-BhhWrBs2.js +1 -0
- package/dist/web/assets/{index-BWm_fhNt.js → index-CKV2X6sS.js} +2 -2
- package/dist/web/assets/index-Csdgio76.css +2 -0
- package/dist/web/index.html +2 -2
- package/dist/{zh-CN-CKxdpj8c.mjs → zh-CN-CyL-61Ow.mjs} +1 -2
- package/package.json +1 -1
- package/dist/web/assets/AISettings-DoDVYWfb.js +0 -272
- package/dist/web/assets/ExtractionViewer-DqIrBGNK.js +0 -1
- package/dist/web/assets/index-CvY9TGny.css +0 -2
package/dist/cli.mjs
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-
|
|
1
|
+
import { A as doctorDiagnosticsTableRows, C as createConfig, D as package_default, E as name, O as version, S as AIConfigSchema, T as description, _ as DEFAULT_MINERU_API_CONFIG, a as parseJsonSchema, b as PLACEHOLDER_SCHEMA, c as recognizeImageText, d as t, f as getDefaultAIConfig, g as DEFAULT_MARKITDOWN_CONFIG, h as DEFAULT_MARKER_CONFIG, i as JsonSchemaDefinitionSchema, j as formatDoctorDiagnosticsJson, l as shouldUseImageOcrFallback, m as writeAIConfig, n as createMigrationConfig, o as toSnakeCase, p as readAIConfig, s as generateDrizzleSchema, t as collectDoctorDiagnostics, u as initI18n, v as DEFAULT_MINERU_CONFIG, w as seedConfig, x as PLACEHOLDER_TEXT, y as DEFAULT_PROMPT_CONFIG } from "./doctor-collector-Cv7RArla.mjs";
|
|
2
2
|
import { createRequire } from "node:module";
|
|
3
3
|
import fs from "node:fs/promises";
|
|
4
4
|
import os from "node:os";
|
|
@@ -18,7 +18,7 @@ import pc from "picocolors";
|
|
|
18
18
|
import { Buffer } from "node:buffer";
|
|
19
19
|
import * as XLSX from "xlsx";
|
|
20
20
|
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
|
|
21
|
-
import { APICallError, Output, generateText, jsonSchema
|
|
21
|
+
import { APICallError, Output, generateText, jsonSchema } from "ai";
|
|
22
22
|
import pRetry from "p-retry";
|
|
23
23
|
import mime from "mime";
|
|
24
24
|
import { jsonrepair } from "jsonrepair";
|
|
@@ -13565,12 +13565,13 @@ const HEADING_RE = /^(#{1,6})\s+(\S.*)$/;
|
|
|
13565
13565
|
* Keeps tables and list blocks intact by splitting along paragraphs (\n\n)
|
|
13566
13566
|
* when a section exceeds the maxSize limit.
|
|
13567
13567
|
*/
|
|
13568
|
-
function splitMarkdown(text$1, maxSize = 4e4) {
|
|
13568
|
+
function splitMarkdown(text$1, maxSize = 4e4, overlapSize = 0) {
|
|
13569
13569
|
const lines = text$1.split("\n");
|
|
13570
13570
|
const chunks = [];
|
|
13571
13571
|
let currentHeadings = [];
|
|
13572
13572
|
let currentChunkLines = [];
|
|
13573
13573
|
let currentSize = 0;
|
|
13574
|
+
let hasNewLines = false;
|
|
13574
13575
|
const getMetadata = (headings) => {
|
|
13575
13576
|
return {
|
|
13576
13577
|
h1: headings[0] || void 0,
|
|
@@ -13579,9 +13580,15 @@ function splitMarkdown(text$1, maxSize = 4e4) {
|
|
|
13579
13580
|
h4: headings[3] || void 0
|
|
13580
13581
|
};
|
|
13581
13582
|
};
|
|
13582
|
-
const flushChunk = () => {
|
|
13583
|
-
if (currentChunkLines.length === 0)
|
|
13583
|
+
const flushChunk = (isHeadingChange = false) => {
|
|
13584
|
+
if (currentChunkLines.length === 0 || !hasNewLines) {
|
|
13585
|
+
currentChunkLines = [];
|
|
13586
|
+
currentSize = 0;
|
|
13587
|
+
hasNewLines = false;
|
|
13588
|
+
return;
|
|
13589
|
+
}
|
|
13584
13590
|
const pageContent = currentChunkLines.join("\n");
|
|
13591
|
+
let lastChunkContent = "";
|
|
13585
13592
|
if (pageContent.length > maxSize) {
|
|
13586
13593
|
const paragraphs = pageContent.split("\n\n");
|
|
13587
13594
|
let subLines = [];
|
|
@@ -13589,31 +13596,63 @@ function splitMarkdown(text$1, maxSize = 4e4) {
|
|
|
13589
13596
|
for (const para of paragraphs) {
|
|
13590
13597
|
const paraSize = para.length;
|
|
13591
13598
|
if (subSize + paraSize > maxSize && subLines.length > 0) {
|
|
13599
|
+
const content = subLines.join("\n\n");
|
|
13592
13600
|
chunks.push({
|
|
13593
|
-
pageContent:
|
|
13601
|
+
pageContent: content,
|
|
13594
13602
|
metadata: getMetadata(currentHeadings)
|
|
13595
13603
|
});
|
|
13596
|
-
|
|
13597
|
-
|
|
13604
|
+
const overlapParas = [];
|
|
13605
|
+
let currentOverlapSize = 0;
|
|
13606
|
+
for (let j = subLines.length - 1; j >= 0; j--) {
|
|
13607
|
+
const p = subLines[j];
|
|
13608
|
+
if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
|
|
13609
|
+
overlapParas.unshift(p);
|
|
13610
|
+
currentOverlapSize += p.length + 2;
|
|
13611
|
+
}
|
|
13612
|
+
subLines = [...overlapParas];
|
|
13613
|
+
subSize = currentOverlapSize;
|
|
13598
13614
|
}
|
|
13599
13615
|
subLines.push(para);
|
|
13600
13616
|
subSize += paraSize + 2;
|
|
13601
13617
|
}
|
|
13602
|
-
if (subLines.length > 0)
|
|
13603
|
-
|
|
13618
|
+
if (subLines.length > 0) {
|
|
13619
|
+
const content = subLines.join("\n\n");
|
|
13620
|
+
chunks.push({
|
|
13621
|
+
pageContent: content,
|
|
13622
|
+
metadata: getMetadata(currentHeadings)
|
|
13623
|
+
});
|
|
13624
|
+
lastChunkContent = content;
|
|
13625
|
+
}
|
|
13626
|
+
} else {
|
|
13627
|
+
chunks.push({
|
|
13628
|
+
pageContent,
|
|
13604
13629
|
metadata: getMetadata(currentHeadings)
|
|
13605
13630
|
});
|
|
13606
|
-
|
|
13607
|
-
|
|
13608
|
-
|
|
13609
|
-
|
|
13610
|
-
|
|
13611
|
-
|
|
13631
|
+
lastChunkContent = pageContent;
|
|
13632
|
+
}
|
|
13633
|
+
if (!isHeadingChange && lastChunkContent && overlapSize > 0) {
|
|
13634
|
+
const paragraphs = lastChunkContent.split("\n\n");
|
|
13635
|
+
const overlapParas = [];
|
|
13636
|
+
let currentOverlapSize = 0;
|
|
13637
|
+
for (let j = paragraphs.length - 1; j >= 0; j--) {
|
|
13638
|
+
const p = paragraphs[j];
|
|
13639
|
+
if (currentOverlapSize + p.length > overlapSize && overlapParas.length > 0) break;
|
|
13640
|
+
overlapParas.unshift(p);
|
|
13641
|
+
currentOverlapSize += p.length + 2;
|
|
13642
|
+
}
|
|
13643
|
+
const overlapText = overlapParas.join("\n\n");
|
|
13644
|
+
currentChunkLines = overlapText.split("\n");
|
|
13645
|
+
currentSize = overlapText.length;
|
|
13646
|
+
} else {
|
|
13647
|
+
currentChunkLines = [];
|
|
13648
|
+
currentSize = 0;
|
|
13649
|
+
}
|
|
13650
|
+
hasNewLines = false;
|
|
13612
13651
|
};
|
|
13613
13652
|
for (const line of lines) {
|
|
13614
13653
|
const headingMatch = line.match(HEADING_RE);
|
|
13615
13654
|
if (headingMatch) {
|
|
13616
|
-
flushChunk();
|
|
13655
|
+
flushChunk(true);
|
|
13617
13656
|
const depth = headingMatch[1].length;
|
|
13618
13657
|
const title = headingMatch[2].trim();
|
|
13619
13658
|
currentHeadings = currentHeadings.slice(0, depth - 1);
|
|
@@ -13621,220 +13660,13 @@ function splitMarkdown(text$1, maxSize = 4e4) {
|
|
|
13621
13660
|
}
|
|
13622
13661
|
currentChunkLines.push(line);
|
|
13623
13662
|
currentSize += line.length + 1;
|
|
13624
|
-
|
|
13663
|
+
hasNewLines = true;
|
|
13664
|
+
if (currentSize > maxSize) flushChunk(false);
|
|
13625
13665
|
}
|
|
13626
|
-
flushChunk();
|
|
13666
|
+
flushChunk(true);
|
|
13627
13667
|
return chunks;
|
|
13628
13668
|
}
|
|
13629
13669
|
|
|
13630
|
-
//#endregion
|
|
13631
|
-
//#region src/core/ai-extraction/react-agent.ts
|
|
13632
|
-
async function extractStructuredDataWithAgent(input) {
|
|
13633
|
-
const { config, schema, text: text$1, aiexDir, modelOverride, onAgentStep } = input;
|
|
13634
|
-
if (!config.provider.apiKey) return {
|
|
13635
|
-
success: false,
|
|
13636
|
-
error: t("errors.ai.apiKeyMissing")
|
|
13637
|
-
};
|
|
13638
|
-
const chunks = splitMarkdown(text$1, 15e3);
|
|
13639
|
-
const inputTokens = Math.ceil(text$1.length / 2);
|
|
13640
|
-
const fieldCount = schema.properties ? Object.keys(schema.properties).length : 0;
|
|
13641
|
-
const outputTokens = fieldCount > 0 ? fieldCount * 80 : void 0;
|
|
13642
|
-
let selected;
|
|
13643
|
-
try {
|
|
13644
|
-
selected = modelOverride ?? selectModel({
|
|
13645
|
-
models: config.provider.models,
|
|
13646
|
-
isImage: false,
|
|
13647
|
-
inputTokens,
|
|
13648
|
-
outputTokens
|
|
13649
|
-
});
|
|
13650
|
-
} catch (e) {
|
|
13651
|
-
return {
|
|
13652
|
-
success: false,
|
|
13653
|
-
error: e.message
|
|
13654
|
-
};
|
|
13655
|
-
}
|
|
13656
|
-
const useTelemetry = !!(config.langfuse?.publicKey && config.langfuse.secretKey);
|
|
13657
|
-
try {
|
|
13658
|
-
if (useTelemetry) initLangfuse(config);
|
|
13659
|
-
const provider = createOpenAICompatible({
|
|
13660
|
-
baseURL: config.provider.baseURL,
|
|
13661
|
-
name: "openai-compatible",
|
|
13662
|
-
apiKey: config.provider.apiKey,
|
|
13663
|
-
supportsStructuredOutputs: false
|
|
13664
|
-
});
|
|
13665
|
-
let finalExtractedData = null;
|
|
13666
|
-
const tools = {
|
|
13667
|
-
listChunks: tool({
|
|
13668
|
-
description: "Get a list of all text chunks in the document, showing their chunk index ID, character size, and markdown heading hierarchy (metadata). Use this as a Table of Contents to locate sections of interest.",
|
|
13669
|
-
parameters: z.object({}),
|
|
13670
|
-
execute: async () => {
|
|
13671
|
-
return chunks.map((c, idx) => ({
|
|
13672
|
-
id: idx + 1,
|
|
13673
|
-
size: c.pageContent.length,
|
|
13674
|
-
headings: c.metadata
|
|
13675
|
-
}));
|
|
13676
|
-
}
|
|
13677
|
-
}),
|
|
13678
|
-
readChunk: tool({
|
|
13679
|
-
description: "Read the full text content of a specific chunk by its ID.",
|
|
13680
|
-
parameters: z.object({ chunkId: z.number().int().describe("The ID (1-based index) of the chunk to read.") }),
|
|
13681
|
-
execute: async ({ chunkId }) => {
|
|
13682
|
-
const index = chunkId - 1;
|
|
13683
|
-
if (index < 0 || index >= chunks.length) return { error: `Invalid chunkId: ${chunkId}. Valid IDs are 1 to ${chunks.length}.` };
|
|
13684
|
-
const chunk = chunks[index];
|
|
13685
|
-
const headings = [];
|
|
13686
|
-
if (chunk.metadata) {
|
|
13687
|
-
if (chunk.metadata.h1) headings.push(chunk.metadata.h1);
|
|
13688
|
-
if (chunk.metadata.h2) headings.push(chunk.metadata.h2);
|
|
13689
|
-
if (chunk.metadata.h3) headings.push(chunk.metadata.h3);
|
|
13690
|
-
if (chunk.metadata.h4) headings.push(chunk.metadata.h4);
|
|
13691
|
-
}
|
|
13692
|
-
return {
|
|
13693
|
-
chunkId,
|
|
13694
|
-
headings: headings.join(" > "),
|
|
13695
|
-
content: chunk.pageContent
|
|
13696
|
-
};
|
|
13697
|
-
}
|
|
13698
|
-
}),
|
|
13699
|
-
searchChunks: tool({
|
|
13700
|
-
description: "Search all chunks in the document for specific keywords or search terms. Returns matching chunk IDs and small matching context snippets.",
|
|
13701
|
-
parameters: z.object({ query: z.string().describe("The keyword or search phrase to search for.") }),
|
|
13702
|
-
execute: async ({ query }) => {
|
|
13703
|
-
const results = [];
|
|
13704
|
-
const lowercaseQuery = query.toLowerCase();
|
|
13705
|
-
for (let i = 0; i < chunks.length; i++) {
|
|
13706
|
-
const chunkText = chunks[i].pageContent;
|
|
13707
|
-
const idx = chunkText.toLowerCase().indexOf(lowercaseQuery);
|
|
13708
|
-
if (idx !== -1) {
|
|
13709
|
-
const start = Math.max(0, idx - 60);
|
|
13710
|
-
const end = Math.min(chunkText.length, idx + lowercaseQuery.length + 60);
|
|
13711
|
-
const snippet = `...${chunkText.slice(start, end).replace(/\n/g, " ")}...`;
|
|
13712
|
-
results.push({
|
|
13713
|
-
chunkId: i + 1,
|
|
13714
|
-
headings: chunks[i].metadata,
|
|
13715
|
-
snippet
|
|
13716
|
-
});
|
|
13717
|
-
}
|
|
13718
|
-
}
|
|
13719
|
-
return results.slice(0, 10);
|
|
13720
|
-
}
|
|
13721
|
-
}),
|
|
13722
|
-
submitExtraction: tool({
|
|
13723
|
-
description: "Submit the final extracted JSON object conforming to the schema definition. Call this ONLY after you have gathered all necessary information.",
|
|
13724
|
-
parameters: z.object({ data: z.any().describe("The extracted JSON object conforming to the target schema.") }),
|
|
13725
|
-
execute: async ({ data }) => {
|
|
13726
|
-
finalExtractedData = data;
|
|
13727
|
-
return {
|
|
13728
|
-
status: "success",
|
|
13729
|
-
message: "Data submitted successfully. The extraction is now complete."
|
|
13730
|
-
};
|
|
13731
|
-
}
|
|
13732
|
-
})
|
|
13733
|
-
};
|
|
13734
|
-
const outputSchema = schemaToExtractionOutputSchema(schema);
|
|
13735
|
-
const systemPrompt = `You are a precise data extraction agent. Your goal is to extract structured information from a document to populate the target JSON schema.
|
|
13736
|
-
|
|
13737
|
-
Target JSON Schema structure to populate:
|
|
13738
|
-
${JSON.stringify(outputSchema, null, 2)}
|
|
13739
|
-
|
|
13740
|
-
You are equipped with tools to browse the document dynamically:
|
|
13741
|
-
1. First, call listChunks to understand the document layout and what sections exist.
|
|
13742
|
-
2. Based on the schema fields, call readChunk or searchChunks to locate and read relevant content.
|
|
13743
|
-
3. You can make multiple tool calls. Do not guess. Check the text carefully.
|
|
13744
|
-
4. Once you have located and read all the necessary information, call the submitExtraction tool with the fully extracted JSON object.
|
|
13745
|
-
5. After calling submitExtraction, you should stop.
|
|
13746
|
-
|
|
13747
|
-
CRITICAL RULES:
|
|
13748
|
-
1. Extract data strictly conforming to the types and properties of the Target JSON Schema.
|
|
13749
|
-
2. If a field's value cannot be found in the document after thorough search, set it to null.
|
|
13750
|
-
3. Do not invent any values.
|
|
13751
|
-
4. Call submitExtraction exactly once with the final JSON result.`;
|
|
13752
|
-
const timeoutMs = (config.provider.timeout ?? 300) * 1e3;
|
|
13753
|
-
const result = await generateText({
|
|
13754
|
-
model: provider.chatModel(selected.name),
|
|
13755
|
-
system: systemPrompt,
|
|
13756
|
-
prompt: "Please start by listing the chunks to understand the document structure, then gather the required facts and submit the final JSON extraction.",
|
|
13757
|
-
tools,
|
|
13758
|
-
maxSteps: 12,
|
|
13759
|
-
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13760
|
-
experimental_telemetry: { isEnabled: useTelemetry },
|
|
13761
|
-
onStepFinish({ text: text$2, toolCalls }) {
|
|
13762
|
-
if (onAgentStep) onAgentStep({
|
|
13763
|
-
thought: text$2,
|
|
13764
|
-
toolCalls
|
|
13765
|
-
});
|
|
13766
|
-
}
|
|
13767
|
-
});
|
|
13768
|
-
if (!finalExtractedData) {
|
|
13769
|
-
if (result.text) try {
|
|
13770
|
-
finalExtractedData = safeParseJSON(result.text);
|
|
13771
|
-
} catch {}
|
|
13772
|
-
}
|
|
13773
|
-
if (!finalExtractedData) return {
|
|
13774
|
-
success: false,
|
|
13775
|
-
error: "Agent finished without submitting structured data."
|
|
13776
|
-
};
|
|
13777
|
-
const validation = validateExtractedData(schema, finalExtractedData);
|
|
13778
|
-
if (!validation.success) {
|
|
13779
|
-
const correctionSystemPrompt = `You are a precise data correction assistant. Your task is to correct validation errors in a previously generated JSON object to make it comply with the JSON Schema.
|
|
13780
|
-
|
|
13781
|
-
JSON Schema Definition:
|
|
13782
|
-
${JSON.stringify(outputSchema, null, 2)}
|
|
13783
|
-
|
|
13784
|
-
Validation Errors:
|
|
13785
|
-
${validation.error}
|
|
13786
|
-
|
|
13787
|
-
Original Incorrect JSON:
|
|
13788
|
-
${JSON.stringify(finalExtractedData, null, 2)}
|
|
13789
|
-
|
|
13790
|
-
Please output the corrected JSON object. Return ONLY the corrected JSON object, with no markdown tags or explanations.`;
|
|
13791
|
-
const correctedData = safeParseJSON((await generateText({
|
|
13792
|
-
model: provider.chatModel(selected.name),
|
|
13793
|
-
system: correctionSystemPrompt,
|
|
13794
|
-
prompt: "Please correct the JSON output now.",
|
|
13795
|
-
abortSignal: AbortSignal.timeout(timeoutMs),
|
|
13796
|
-
experimental_telemetry: { isEnabled: useTelemetry }
|
|
13797
|
-
})).text);
|
|
13798
|
-
const secondValidation = validateExtractedData(schema, correctedData);
|
|
13799
|
-
if (!secondValidation.success) return {
|
|
13800
|
-
success: false,
|
|
13801
|
-
error: `Agent output validation failed: ${secondValidation.error}`
|
|
13802
|
-
};
|
|
13803
|
-
finalExtractedData = correctedData;
|
|
13804
|
-
}
|
|
13805
|
-
const outputDir = path.resolve(aiexDir, config.extraction?.outputDir?.replace(".aiex/", "") ?? "extracted");
|
|
13806
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
13807
|
-
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
13808
|
-
const outputFileName = `${schema.table.name}-${timestamp}.json`;
|
|
13809
|
-
const outputPath = path.join(outputDir, outputFileName);
|
|
13810
|
-
await writeFile(outputPath, finalExtractedData, {
|
|
13811
|
-
spaces: 2,
|
|
13812
|
-
EOL: "\n"
|
|
13813
|
-
});
|
|
13814
|
-
let totalPromptTokens = 0;
|
|
13815
|
-
let totalCompletionTokens = 0;
|
|
13816
|
-
if (result.usage) {
|
|
13817
|
-
totalPromptTokens = result.usage.inputTokens ?? 0;
|
|
13818
|
-
totalCompletionTokens = result.usage.outputTokens ?? 0;
|
|
13819
|
-
}
|
|
13820
|
-
return {
|
|
13821
|
-
success: true,
|
|
13822
|
-
outputPath,
|
|
13823
|
-
data: finalExtractedData,
|
|
13824
|
-
tokensUsed: {
|
|
13825
|
-
prompt: totalPromptTokens,
|
|
13826
|
-
completion: totalCompletionTokens,
|
|
13827
|
-
total: totalPromptTokens + totalCompletionTokens
|
|
13828
|
-
}
|
|
13829
|
-
};
|
|
13830
|
-
} catch (error) {
|
|
13831
|
-
return {
|
|
13832
|
-
success: false,
|
|
13833
|
-
error: getErrorMessage(error)
|
|
13834
|
-
};
|
|
13835
|
-
}
|
|
13836
|
-
}
|
|
13837
|
-
|
|
13838
13670
|
//#endregion
|
|
13839
13671
|
//#region src/core/extraction-audit.ts
|
|
13840
13672
|
const AUDIT_ID_RE = /^[\w.-]+$/;
|
|
@@ -14719,7 +14551,6 @@ async function processOneFile(aiexDir, config, aiConfig, schemaName, filePath, m
|
|
|
14719
14551
|
modelOverride,
|
|
14720
14552
|
insert: options?.insert,
|
|
14721
14553
|
force: options?.force,
|
|
14722
|
-
agent: options?.agent,
|
|
14723
14554
|
quiet: false
|
|
14724
14555
|
});
|
|
14725
14556
|
if (result.success) {
|
|
@@ -14759,8 +14590,7 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14759
14590
|
})}`);
|
|
14760
14591
|
if (await processOneFile(aiexDir, config, aiConfig, schemaName, file, modelOverride, {
|
|
14761
14592
|
insert: options?.insert,
|
|
14762
|
-
force: options?.force
|
|
14763
|
-
agent: options?.agent
|
|
14593
|
+
force: options?.force
|
|
14764
14594
|
})) successCount++;
|
|
14765
14595
|
else failCount++;
|
|
14766
14596
|
}
|
|
@@ -14779,6 +14609,42 @@ async function runBatchExtraction(aiexDir, config, aiConfig, schemaName, dir, gl
|
|
|
14779
14609
|
//#endregion
|
|
14780
14610
|
//#region src/core/extract-runner.ts
|
|
14781
14611
|
const JSON_EXT_RE$1 = /\.json$/;
|
|
14612
|
+
/**
 * Runs `fn` over every entry of `items` with a bounded number of calls in flight.
 *
 * Each worker repeatedly claims the next unprocessed index, so results are
 * stored at the same index as their input regardless of completion order.
 *
 * @param {number} concurrency - Maximum number of simultaneous `fn` calls.
 *   Values below 1 (or NaN) are treated as 1 so that work is never silently
 *   skipped; fractional values are rounded down.
 * @param {Array} items - Inputs to process.
 * @param {(item: any, index: number) => any} fn - Callback invoked once per item;
 *   its (awaited) return value becomes the result for that index.
 * @returns {Promise<Array>} Results in input order.
 * @throws Rejects with the first error raised by `fn`. Once a call has failed,
 *   no further items are started (calls already in flight are not cancelled).
 */
async function limitConcurrency(concurrency, items, fn) {
	const results = Array.from({ length: items.length });
	// A zero, negative or NaN limit would spawn no workers at all and resolve
	// with an array of unfilled slots, so clamp to at least one worker.
	const requested = Math.floor(Number(concurrency));
	const workerCount = Math.min(requested >= 1 ? requested : 1, items.length);
	let nextIndex = 0;
	let failed = false;
	async function worker() {
		while (!failed && nextIndex < items.length) {
			// Claim the index synchronously so no two workers process the same item.
			const currentIndex = nextIndex++;
			try {
				results[currentIndex] = await fn(items[currentIndex], currentIndex);
			} catch (error) {
				// The overall promise is already going to reject; tell the
				// other workers to stop picking up new items.
				failed = true;
				throw error;
			}
		}
	}
	await Promise.all(Array.from({ length: workerCount }, () => worker()));
	return results;
}
|
|
14625
|
+
/**
 * Collects lower-cased search keywords from a JSON-schema-like definition.
 *
 * For every property (recursing into `type: "object"` properties and into
 * arrays whose `items` are objects) the following are collected:
 *  - the full property name;
 *  - its camelCase / separator-delimited parts longer than one character;
 *  - the property's `title`, as a single keyword;
 *  - words longer than two characters from the property's `description`.
 *
 * Keywords are trimmed and empty ones are dropped. An empty keyword is
 * dangerous for consumers that count occurrences with
 * `indexOf(kw, pos + kw.length)`: the position never advances and the loop
 * never terminates.
 *
 * @param {{ properties?: Record<string, unknown> }} schema - Schema whose
 *   `properties` map is walked. A missing schema or `properties` yields `[]`.
 * @returns {string[]} Unique keywords in first-seen order.
 */
function getSchemaKeywords(schema) {
	const keywords = /* @__PURE__ */ new Set();
	const addKeyword = (raw) => {
		const keyword = raw.trim().toLowerCase();
		if (keyword.length > 0) keywords.add(keyword);
	};
	function walk(properties) {
		if (!properties) return;
		for (const [name$1, prop] of Object.entries(properties)) {
			addKeyword(name$1);
			// Split "firstName" -> "first Name", then break on whitespace and separators.
			const parts = name$1.replace(/([a-z0-9])([A-Z])/g, "$1 $2").split(/[\s._:/\\-]+/);
			for (const part of parts) if (part.length > 1) addKeyword(part);
			if (!prop || typeof prop !== "object") continue;
			if (typeof prop.title === "string") addKeyword(prop.title);
			if (typeof prop.description === "string") {
				const descParts = prop.description.toLowerCase().match(/[\p{L}\p{N}_-]+/gu) ?? [];
				for (const word of descParts) if (word.length > 2) addKeyword(word);
			}
			if (prop.type === "object") walk(prop.properties);
			if (prop.type === "array" && prop.items?.type === "object") walk(prop.items.properties);
		}
	}
	walk(schema?.properties);
	return Array.from(keywords);
}
|
|
14782
14648
|
async function ensureDatabaseReady(dbPath, schema) {
|
|
14783
14649
|
try {
|
|
14784
14650
|
await fs.access(dbPath);
|
|
@@ -14852,42 +14718,48 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14852
14718
|
if (!options?.quiet) s.start(filePath ? t("command.extract.file.extractedFrom", { file: path.basename(filePath) }) : t("command.extract.file.extracting"));
|
|
14853
14719
|
const CHUNK_LIMIT = 4e4;
|
|
14854
14720
|
let result;
|
|
14855
|
-
if (
|
|
14856
|
-
if (!options?.quiet) consola.info(t("command.extract.file.reactAgentMode"));
|
|
14857
|
-
const agentResult = await extractStructuredDataWithAgent({
|
|
14858
|
-
config: aiConfig,
|
|
14859
|
-
schema: schemaLoad.schema,
|
|
14860
|
-
text: text$1 ?? "",
|
|
14861
|
-
aiexDir,
|
|
14862
|
-
modelOverride,
|
|
14863
|
-
onAgentStep(step) {
|
|
14864
|
-
if (!options?.quiet) {
|
|
14865
|
-
if (step.thought) {
|
|
14866
|
-
const thoughtPreview = step.thought.length > 100 ? `${step.thought.slice(0, 100)}...` : step.thought;
|
|
14867
|
-
s.message(`${pc.cyan(t("command.extract.file.agentThought"))}: ${thoughtPreview.replace(/\n/g, " ")}`);
|
|
14868
|
-
}
|
|
14869
|
-
if (step.toolCalls && step.toolCalls.length > 0) for (const call of step.toolCalls) consola.info(`[Agent Action] Calling tool: ${pc.green(call.toolName)}`);
|
|
14870
|
-
}
|
|
14871
|
-
}
|
|
14872
|
-
});
|
|
14873
|
-
if (!agentResult.success) {
|
|
14874
|
-
if (!options?.quiet) {
|
|
14875
|
-
s.stop(t("command.extract.file.extractFail"));
|
|
14876
|
-
consola.error(agentResult.error);
|
|
14877
|
-
}
|
|
14878
|
-
return {
|
|
14879
|
-
success: false,
|
|
14880
|
-
error: agentResult.error
|
|
14881
|
-
};
|
|
14882
|
-
}
|
|
14883
|
-
result = agentResult;
|
|
14884
|
-
} else if (text$1 && text$1.length > CHUNK_LIMIT) {
|
|
14721
|
+
if (text$1 && text$1.length > CHUNK_LIMIT) {
|
|
14885
14722
|
if (!options?.quiet) consola.info(t("command.extract.file.chunking", {
|
|
14886
14723
|
length: text$1.length,
|
|
14887
14724
|
limit: CHUNK_LIMIT
|
|
14888
14725
|
}));
|
|
14889
|
-
const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT);
|
|
14726
|
+
const finalDocs = splitMarkdown(text$1, CHUNK_LIMIT, aiConfig.extraction?.overlapSize ?? 2e3);
|
|
14890
14727
|
if (!options?.quiet) consola.info(t("command.extract.file.chunksCount", { count: finalDocs.length }));
|
|
14728
|
+
let processedDocs = finalDocs;
|
|
14729
|
+
if (!!aiConfig.extraction?.preFiltering && finalDocs.length > 1) {
|
|
14730
|
+
const preFilteringLimit = aiConfig.extraction?.preFilteringLimit ?? 5;
|
|
14731
|
+
const keywords = getSchemaKeywords(schemaLoad.schema);
|
|
14732
|
+
const scoredChunks = finalDocs.map((doc, idx) => {
|
|
14733
|
+
if (idx === 0) return {
|
|
14734
|
+
index: idx,
|
|
14735
|
+
score: Number.POSITIVE_INFINITY
|
|
14736
|
+
};
|
|
14737
|
+
let score = 0;
|
|
14738
|
+
const docTextLower = doc.pageContent.toLowerCase();
|
|
14739
|
+
for (const kw of keywords) {
|
|
14740
|
+
let pos = docTextLower.indexOf(kw);
|
|
14741
|
+
while (pos !== -1) {
|
|
14742
|
+
score++;
|
|
14743
|
+
pos = docTextLower.indexOf(kw, pos + kw.length);
|
|
14744
|
+
}
|
|
14745
|
+
}
|
|
14746
|
+
return {
|
|
14747
|
+
index: idx,
|
|
14748
|
+
score
|
|
14749
|
+
};
|
|
14750
|
+
}).slice(1).sort((a, b) => b.score - a.score);
|
|
14751
|
+
const selectedIndices = new Set([0]);
|
|
14752
|
+
let keptCount = 0;
|
|
14753
|
+
for (const sc of scoredChunks) if (sc.score > 0 && keptCount < preFilteringLimit) {
|
|
14754
|
+
selectedIndices.add(sc.index);
|
|
14755
|
+
keptCount++;
|
|
14756
|
+
}
|
|
14757
|
+
processedDocs = finalDocs.filter((_, idx) => selectedIndices.has(idx));
|
|
14758
|
+
if (!options?.quiet) consola.info(t("command.extract.file.preFiltering", {
|
|
14759
|
+
original: finalDocs.length,
|
|
14760
|
+
filtered: processedDocs.length
|
|
14761
|
+
}));
|
|
14762
|
+
}
|
|
14891
14763
|
const chunkResults = [];
|
|
14892
14764
|
const accumulatedTokens = {
|
|
14893
14765
|
prompt: 0,
|
|
@@ -14896,53 +14768,68 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
14896
14768
|
};
|
|
14897
14769
|
let success = true;
|
|
14898
14770
|
let errorMsg = "";
|
|
14899
|
-
|
|
14900
|
-
|
|
14901
|
-
|
|
14902
|
-
|
|
14903
|
-
|
|
14904
|
-
|
|
14905
|
-
|
|
14906
|
-
|
|
14907
|
-
|
|
14908
|
-
if (doc.metadata.h2) headings.push(doc.metadata.h2);
|
|
14909
|
-
if (doc.metadata.h3) headings.push(doc.metadata.h3);
|
|
14910
|
-
if (doc.metadata.h4) headings.push(doc.metadata.h4);
|
|
14911
|
-
}
|
|
14912
|
-
let chunkText = doc.pageContent;
|
|
14913
|
-
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
14914
|
-
const chunkResult = await extractStructuredData({
|
|
14915
|
-
config: aiConfig,
|
|
14916
|
-
schema: schemaLoad.schema,
|
|
14917
|
-
text: chunkText,
|
|
14918
|
-
aiexDir,
|
|
14919
|
-
modelOverride,
|
|
14920
|
-
onRetry(info) {
|
|
14921
|
-
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
14922
|
-
current: i + 1,
|
|
14923
|
-
total: finalDocs.length,
|
|
14924
|
-
code: info.statusCode,
|
|
14925
|
-
delay: info.delayMs / 1e3,
|
|
14926
|
-
attempt: info.attempt,
|
|
14927
|
-
max: info.maxRetries
|
|
14928
|
-
}));
|
|
14771
|
+
const extractionTasks = processedDocs.map((doc, i) => {
|
|
14772
|
+
return async () => {
|
|
14773
|
+
if (!success) return;
|
|
14774
|
+
const headings = [];
|
|
14775
|
+
if (doc.metadata) {
|
|
14776
|
+
if (doc.metadata.h1) headings.push(doc.metadata.h1);
|
|
14777
|
+
if (doc.metadata.h2) headings.push(doc.metadata.h2);
|
|
14778
|
+
if (doc.metadata.h3) headings.push(doc.metadata.h3);
|
|
14779
|
+
if (doc.metadata.h4) headings.push(doc.metadata.h4);
|
|
14929
14780
|
}
|
|
14930
|
-
|
|
14931
|
-
|
|
14932
|
-
|
|
14933
|
-
|
|
14934
|
-
|
|
14935
|
-
|
|
14936
|
-
|
|
14781
|
+
let chunkText = doc.pageContent;
|
|
14782
|
+
if (headings.length > 0) chunkText = `> **[Context]** Belong to: ${headings.join(" > ")}\n\n${chunkText}`;
|
|
14783
|
+
const chunkResult = await extractStructuredData({
|
|
14784
|
+
config: aiConfig,
|
|
14785
|
+
schema: schemaLoad.schema,
|
|
14786
|
+
text: chunkText,
|
|
14787
|
+
aiexDir,
|
|
14788
|
+
modelOverride,
|
|
14789
|
+
onRetry(info) {
|
|
14790
|
+
if (!options?.quiet) s.message(t("command.extract.file.extractRetryChunk", {
|
|
14791
|
+
current: i + 1,
|
|
14792
|
+
total: processedDocs.length,
|
|
14793
|
+
code: info.statusCode,
|
|
14794
|
+
delay: info.delayMs / 1e3,
|
|
14795
|
+
attempt: info.attempt,
|
|
14796
|
+
max: info.maxRetries
|
|
14797
|
+
}));
|
|
14798
|
+
}
|
|
14799
|
+
});
|
|
14800
|
+
if (!chunkResult.success) {
|
|
14801
|
+
success = false;
|
|
14802
|
+
errorMsg = chunkResult.error || t("common.unknownError");
|
|
14803
|
+
if (!options?.quiet) {
|
|
14804
|
+
s.stop(t("command.extract.file.extractFailChunk", { current: i + 1 }));
|
|
14805
|
+
consola.error(errorMsg);
|
|
14806
|
+
}
|
|
14807
|
+
return;
|
|
14937
14808
|
}
|
|
14938
|
-
|
|
14939
|
-
|
|
14940
|
-
|
|
14941
|
-
|
|
14942
|
-
|
|
14943
|
-
|
|
14944
|
-
|
|
14945
|
-
|
|
14809
|
+
if (chunkResult.data) chunkResults.push(chunkResult.data);
|
|
14810
|
+
if (chunkResult.tokensUsed) {
|
|
14811
|
+
accumulatedTokens.prompt += chunkResult.tokensUsed.prompt ?? 0;
|
|
14812
|
+
accumulatedTokens.completion += chunkResult.tokensUsed.completion ?? 0;
|
|
14813
|
+
accumulatedTokens.total += chunkResult.tokensUsed.total ?? 0;
|
|
14814
|
+
}
|
|
14815
|
+
};
|
|
14816
|
+
});
|
|
14817
|
+
const concurrency = Math.min(aiConfig.extraction?.concurrency ?? 2, 2);
|
|
14818
|
+
if (!options?.quiet && processedDocs.length > 0) s.message(t("command.extract.file.extractingChunk", {
|
|
14819
|
+
current: 1,
|
|
14820
|
+
total: processedDocs.length
|
|
14821
|
+
}));
|
|
14822
|
+
try {
|
|
14823
|
+
await limitConcurrency(concurrency, extractionTasks, async (task, idx) => {
|
|
14824
|
+
if (!options?.quiet && success) s.message(t("command.extract.file.extractingChunk", {
|
|
14825
|
+
current: idx + 1,
|
|
14826
|
+
total: processedDocs.length
|
|
14827
|
+
}));
|
|
14828
|
+
await task();
|
|
14829
|
+
});
|
|
14830
|
+
} catch (e) {
|
|
14831
|
+
success = false;
|
|
14832
|
+
errorMsg = e instanceof Error ? e.message : String(e);
|
|
14946
14833
|
}
|
|
14947
14834
|
if (!success) return {
|
|
14948
14835
|
success: false,
|
|
@@ -15001,6 +14888,11 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15001
14888
|
}
|
|
15002
14889
|
if (!options?.quiet) s.stop(t("command.extract.file.extractComplete"));
|
|
15003
14890
|
if (result.outputPath && !options?.quiet) consola.success(t("command.extract.file.resultSaved", { path: pc.cyan(result.outputPath) }));
|
|
14891
|
+
if (result.evidenceSummary && !options?.quiet) {
|
|
14892
|
+
const summary = result.evidenceSummary;
|
|
14893
|
+
const issueText = summary.issueCount > 0 ? pc.yellow(String(summary.issueCount)) : pc.green("0");
|
|
14894
|
+
consola.info(pc.gray(`Evidence coverage: ${summary.evidenceCount}/${summary.fieldCount} fields, found ${summary.foundCount}, inferred ${summary.inferredCount}, missing ${summary.missingCount}, issues ${issueText}`));
|
|
14895
|
+
}
|
|
15004
14896
|
if (result.tokensUsed && !options?.quiet) consola.info(pc.gray(t("command.extract.file.tokenUsage", {
|
|
15005
14897
|
prompt: result.tokensUsed.prompt,
|
|
15006
14898
|
completion: result.tokensUsed.completion,
|
|
@@ -15029,6 +14921,7 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15029
14921
|
outputPath: result.outputPath,
|
|
15030
14922
|
data: result.data,
|
|
15031
14923
|
tablesInserted: insertResult.tablesInserted,
|
|
14924
|
+
evidenceSummary: result.evidenceSummary,
|
|
15032
14925
|
tokensUsed: result.tokensUsed
|
|
15033
14926
|
};
|
|
15034
14927
|
} else {
|
|
@@ -15055,11 +14948,12 @@ async function extractSingle(aiexDir, config, aiConfig, schemaName, text$1, file
|
|
|
15055
14948
|
success: true,
|
|
15056
14949
|
outputPath: result.outputPath,
|
|
15057
14950
|
data: result.data,
|
|
14951
|
+
evidenceSummary: result.evidenceSummary,
|
|
15058
14952
|
tokensUsed: result.tokensUsed
|
|
15059
14953
|
};
|
|
15060
14954
|
}
|
|
15061
14955
|
async function runAuditedExtraction(options) {
|
|
15062
|
-
const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false
|
|
14956
|
+
const { aiexDir, config, aiConfig, schemaName, source, modelOverride, retryOf, insert, force, quiet = false } = options;
|
|
15063
14957
|
let fileHash;
|
|
15064
14958
|
let isPlainTextFile = false;
|
|
15065
14959
|
if (source.type === "file") {
|
|
@@ -15127,8 +15021,7 @@ async function runAuditedExtraction(options) {
|
|
|
15127
15021
|
} else text$1 = source.text;
|
|
15128
15022
|
const r = await extractSingle(aiexDir, config, aiConfig, schemaName, text$1, filePath, modelOverride, {
|
|
15129
15023
|
quiet,
|
|
15130
|
-
insert
|
|
15131
|
-
agent
|
|
15024
|
+
insert
|
|
15132
15025
|
});
|
|
15133
15026
|
if (r.success) {
|
|
15134
15027
|
let notionPages;
|
|
@@ -15168,6 +15061,7 @@ async function runAuditedExtraction(options) {
|
|
|
15168
15061
|
outputName: updated.outputName,
|
|
15169
15062
|
tablesInserted: updated.tablesInserted,
|
|
15170
15063
|
notionPages: updated.notionPages,
|
|
15064
|
+
evidenceSummary: r.evidenceSummary,
|
|
15171
15065
|
tokensUsed: updated.tokensUsed,
|
|
15172
15066
|
auditId: updated.id,
|
|
15173
15067
|
fileHash
|
|
@@ -15587,12 +15481,6 @@ const extractCommand = defineCommand({
|
|
|
15587
15481
|
type: "boolean",
|
|
15588
15482
|
description: t("command.extract.args.force"),
|
|
15589
15483
|
default: false
|
|
15590
|
-
},
|
|
15591
|
-
agent: {
|
|
15592
|
-
type: "boolean",
|
|
15593
|
-
alias: "a",
|
|
15594
|
-
description: "Enable ReAct agent extraction mode",
|
|
15595
|
-
default: false
|
|
15596
15484
|
}
|
|
15597
15485
|
},
|
|
15598
15486
|
async run({ args, rawArgs }) {
|
|
@@ -15620,8 +15508,7 @@ const extractCommand = defineCommand({
|
|
|
15620
15508
|
}
|
|
15621
15509
|
const result$1 = await runBatchExtraction(aiexDir, config, aiConfig, args.schema, args.dir, args.glob, modelOverride, {
|
|
15622
15510
|
insert: !args.noInsert,
|
|
15623
|
-
force: args.force
|
|
15624
|
-
agent: args.agent
|
|
15511
|
+
force: args.force
|
|
15625
15512
|
});
|
|
15626
15513
|
if (!result$1.ok) {
|
|
15627
15514
|
failCommand(result$1.error);
|
|
@@ -15652,8 +15539,7 @@ const extractCommand = defineCommand({
|
|
|
15652
15539
|
modelOverride,
|
|
15653
15540
|
insert: !args.noInsert,
|
|
15654
15541
|
force: args.force,
|
|
15655
|
-
quiet: false
|
|
15656
|
-
agent: args.agent
|
|
15542
|
+
quiet: false
|
|
15657
15543
|
});
|
|
15658
15544
|
if (!result.success) {
|
|
15659
15545
|
failCommand(result.error);
|
|
@@ -16303,6 +16189,7 @@ function aiRoutes(config) {
|
|
|
16303
16189
|
//#endregion
|
|
16304
16190
|
//#region src/core/data-service.ts
|
|
16305
16191
|
const FILE_REGEX = /\.json$/;
|
|
16192
|
+
const EVIDENCE_FILE_SUFFIX = ".evidence.json";
|
|
16306
16193
|
const EXTRACTION_TIMESTAMP_RE = /-\d{4}-\d{2}-\d{2}T/;
|
|
16307
16194
|
const INTERNAL_ROWID_COLUMN = "__aiex_rowid";
|
|
16308
16195
|
const TIMESTAMP_CLEANUP = /(\d{2})-(\d{2})-(\d{2})/;
|
|
@@ -16318,6 +16205,24 @@ function getAuditNotionStatus(record) {
|
|
|
16318
16205
|
if (record.status === "failed") return "failed";
|
|
16319
16206
|
return "not_synced";
|
|
16320
16207
|
}
|
|
16208
|
+
async function readEvidenceSummary(extractedDir, outputName) {
|
|
16209
|
+
const evidencePath = path.join(extractedDir, outputName.replace(FILE_REGEX, EVIDENCE_FILE_SUFFIX));
|
|
16210
|
+
try {
|
|
16211
|
+
const coverage = (await readFile(evidencePath))?.coverage;
|
|
16212
|
+
if (!coverage || typeof coverage !== "object") return void 0;
|
|
16213
|
+
return {
|
|
16214
|
+
path: evidencePath,
|
|
16215
|
+
fieldCount: Number(coverage.fieldCount) || 0,
|
|
16216
|
+
evidenceCount: Number(coverage.evidenceCount) || 0,
|
|
16217
|
+
foundCount: Number(coverage.foundCount) || 0,
|
|
16218
|
+
missingCount: Number(coverage.missingCount) || 0,
|
|
16219
|
+
inferredCount: Number(coverage.inferredCount) || 0,
|
|
16220
|
+
issueCount: Number(coverage.issueCount) || 0
|
|
16221
|
+
};
|
|
16222
|
+
} catch {
|
|
16223
|
+
return;
|
|
16224
|
+
}
|
|
16225
|
+
}
|
|
16321
16226
|
async function getRowExtractionActions(aiexDir, tableName) {
|
|
16322
16227
|
const actions = /* @__PURE__ */ new Map();
|
|
16323
16228
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
@@ -16345,7 +16250,7 @@ async function listExtractions(config) {
|
|
|
16345
16250
|
const aiexDir = path.dirname(config.schemaPath);
|
|
16346
16251
|
const extractedDir = path.join(aiexDir, "extracted");
|
|
16347
16252
|
await fs.mkdir(extractedDir, { recursive: true });
|
|
16348
|
-
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md"));
|
|
16253
|
+
const jsonFiles = (await fs.readdir(extractedDir)).filter((f) => f.endsWith(".json") && !f.endsWith(".prompt.md") && !f.endsWith(EVIDENCE_FILE_SUFFIX));
|
|
16349
16254
|
const auditRecords = await listExtractionAuditRecords(aiexDir);
|
|
16350
16255
|
const auditByOutputName = new Map(auditRecords.map((record) => [record.outputName, record]));
|
|
16351
16256
|
const records = [];
|
|
@@ -16364,6 +16269,7 @@ async function listExtractions(config) {
|
|
|
16364
16269
|
timestamp,
|
|
16365
16270
|
fileSize: stat.size,
|
|
16366
16271
|
modifiedAt: stat.mtime.toISOString(),
|
|
16272
|
+
evidenceSummary: await readEvidenceSummary(extractedDir, file),
|
|
16367
16273
|
notionStatus: notionPages ? "synced" : audit?.status === "failed" ? "failed" : "not_synced",
|
|
16368
16274
|
notionPages,
|
|
16369
16275
|
notionError: !notionPages && audit?.status === "failed" ? audit.error : void 0
|
|
@@ -16543,6 +16449,7 @@ async function retryNotionSync(config, fileName) {
|
|
|
16543
16449
|
|
|
16544
16450
|
//#endregion
|
|
16545
16451
|
//#region src/server/routes/data.ts
|
|
16452
|
+
const JSON_FILE_SUFFIX_RE = /\.json$/;
|
|
16546
16453
|
const tableParamSchema = z.object({ name: z.string().regex(/^[a-z][a-z0-9_]*$/) });
|
|
16547
16454
|
const extractionFileParamSchema = z.object({ name: z.string().regex(/^[\w.-]+\.json$/).refine((name$1) => name$1 === path.basename(name$1) && !name$1.includes("..")) });
|
|
16548
16455
|
const tableQuerySchema = z.object({
|
|
@@ -16595,10 +16502,22 @@ function dataRoutes(config) {
|
|
|
16595
16502
|
const filePath = path.join(extractedDir, name$1);
|
|
16596
16503
|
try {
|
|
16597
16504
|
const content = await fs.readFile(filePath, "utf-8");
|
|
16505
|
+
const evidencePath = path.join(extractedDir, name$1.replace(JSON_FILE_SUFFIX_RE, ".evidence.json"));
|
|
16506
|
+
let evidenceSummary;
|
|
16507
|
+
try {
|
|
16508
|
+
const evidence = JSON.parse(await fs.readFile(evidencePath, "utf-8"));
|
|
16509
|
+
evidenceSummary = evidence?.coverage ? {
|
|
16510
|
+
...evidence.coverage,
|
|
16511
|
+
path: evidencePath
|
|
16512
|
+
} : void 0;
|
|
16513
|
+
} catch {
|
|
16514
|
+
evidenceSummary = void 0;
|
|
16515
|
+
}
|
|
16598
16516
|
return c.json({
|
|
16599
16517
|
success: true,
|
|
16600
16518
|
content,
|
|
16601
|
-
name: name$1
|
|
16519
|
+
name: name$1,
|
|
16520
|
+
evidenceSummary
|
|
16602
16521
|
});
|
|
16603
16522
|
} catch {
|
|
16604
16523
|
return c.json({ error: t("server.extractionNotFound") }, 404);
|
|
@@ -16742,6 +16661,7 @@ function extractRoutes(config) {
|
|
|
16742
16661
|
outputName: result.outputName,
|
|
16743
16662
|
tablesInserted: result.tablesInserted,
|
|
16744
16663
|
notionPages: result.notionPages,
|
|
16664
|
+
evidenceSummary: result.evidenceSummary,
|
|
16745
16665
|
tokensUsed: result.tokensUsed,
|
|
16746
16666
|
auditId: result.auditId
|
|
16747
16667
|
}, 200);
|
|
@@ -16809,6 +16729,7 @@ function extractRoutes(config) {
|
|
|
16809
16729
|
outputName: result.outputName,
|
|
16810
16730
|
tablesInserted: result.tablesInserted,
|
|
16811
16731
|
notionPages: result.notionPages,
|
|
16732
|
+
evidenceSummary: result.evidenceSummary,
|
|
16812
16733
|
tokensUsed: result.tokensUsed,
|
|
16813
16734
|
auditId: result.auditId
|
|
16814
16735
|
}, 200);
|