@exulu/backend 1.53.1 → 1.54.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3404 -2389
- package/dist/index.d.cts +66 -4
- package/dist/index.d.ts +66 -4
- package/dist/index.js +4926 -3918
- package/ee/agentic-retrieval/ANALYSIS.md +658 -0
- package/ee/agentic-retrieval/logs/README.md +198 -0
- package/ee/agentic-retrieval/v2.ts +1628 -0
- package/ee/agentic-retrieval/v3/agent-loop.ts +242 -0
- package/ee/agentic-retrieval/v3/classifier.ts +73 -0
- package/ee/agentic-retrieval/v3/context-sampler.ts +70 -0
- package/ee/agentic-retrieval/v3/dynamic-tools.ts +115 -0
- package/ee/agentic-retrieval/v3/index.ts +281 -0
- package/ee/agentic-retrieval/v3/strategies.ts +167 -0
- package/ee/agentic-retrieval/v3/tools.ts +435 -0
- package/ee/agentic-retrieval/v3/trajectory.ts +96 -0
- package/ee/agentic-retrieval/v3/types.ts +59 -0
- package/ee/agentic-retrieval/v4/agent-loop.ts +121 -0
- package/ee/agentic-retrieval/v4/embed-preprocessor.ts +76 -0
- package/ee/agentic-retrieval/v4/index.ts +181 -0
- package/ee/agentic-retrieval/v4/system-prompt.ts +248 -0
- package/ee/agentic-retrieval/v4/tools.ts +241 -0
- package/ee/agentic-retrieval/v4/types.ts +29 -0
- package/ee/chunking/markdown.ts +4 -2
- package/ee/workers.ts +1 -1
- package/package.json +6 -3
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import * as fs from "fs/promises";
|
|
2
|
+
import * as path from "path";
|
|
3
|
+
import { exec } from "child_process";
|
|
4
|
+
import { promisify } from "util";
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
import { tool } from "ai";
|
|
7
|
+
import { postgresClient } from "@SRC/postgres/client";
|
|
8
|
+
import type { ExuluContext } from "@SRC/exulu/context";
|
|
9
|
+
import type { User } from "@EXULU_TYPES/models/user";
|
|
10
|
+
import { preprocessEmbedCalls } from "./embed-preprocessor";
|
|
11
|
+
import type { ChunkResult } from "./types";
|
|
12
|
+
|
|
13
|
+
const execAsync = promisify(exec);
|
|
14
|
+
|
|
15
|
+
// Query results whose JSON serialization exceeds this many characters are
// written to the session directory instead of being returned inline.
const MAX_INLINE_CHARS = 20_000;
// Hard cap on grep output returned to the agent; longer output is truncated.
const MAX_GREP_OUTPUT_CHARS = 5_000;
|
|
17
|
+
|
|
18
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
19
|
+
// SQL safety: only allow read-only statements
|
|
20
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
21
|
+
|
|
22
|
+
const WRITE_PATTERN =
|
|
23
|
+
/^\s*(INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|GRANT|REVOKE|VACUUM|ANALYZE|EXPLAIN\s+ANALYZE)\b/i;
|
|
24
|
+
|
|
25
|
+
function assertReadOnly(sql: string): void {
|
|
26
|
+
if (WRITE_PATTERN.test(sql)) {
|
|
27
|
+
throw new Error(
|
|
28
|
+
"Only SELECT queries are allowed. Write operations (INSERT, UPDATE, DELETE, DROP, etc.) are not permitted.",
|
|
29
|
+
);
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
34
|
+
// Chunk harvesting: extract ChunkResult objects from raw SQL result rows
|
|
35
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Tries to interpret a raw DB row as a ChunkResult.
|
|
39
|
+
* The system prompt instructs the agent to use standard aliases, so we look for
|
|
40
|
+
* those first and fall back to common alternative column names.
|
|
41
|
+
*/
|
|
42
|
+
export function rowToChunkResult(row: Record<string, any>): ChunkResult | null {
|
|
43
|
+
const chunkId = row.chunk_id ?? row.id;
|
|
44
|
+
const chunkContent = row.chunk_content ?? row.content;
|
|
45
|
+
const itemId = row.item_id ?? row.source;
|
|
46
|
+
const context = row.context ?? row.context_id;
|
|
47
|
+
const itemName = row.item_name ?? row.name;
|
|
48
|
+
|
|
49
|
+
// Require at minimum a chunk identifier and either content or an item reference
|
|
50
|
+
if (!chunkId || (!chunkContent && !itemId)) return null;
|
|
51
|
+
|
|
52
|
+
return {
|
|
53
|
+
item_name: itemName ?? "",
|
|
54
|
+
item_id: itemId ?? "",
|
|
55
|
+
context: context ?? "",
|
|
56
|
+
chunk_id: chunkId,
|
|
57
|
+
chunk_index: row.chunk_index ?? undefined,
|
|
58
|
+
chunk_content: chunkContent ?? undefined,
|
|
59
|
+
metadata: row.metadata ?? row.chunk_metadata ?? undefined,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
64
|
+
// Tool factory
|
|
65
|
+
// ──────────────────────────────────────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
/** Dependencies shared by every tool built in createTools. */
export type ToolFactoryParams = {
  /** Knowledge-base contexts forwarded to embed() preprocessing. */
  contexts: ExuluContext[];
  /** Optional user forwarded to embed() preprocessing. */
  user?: User;
  /** Optional role forwarded to embed() preprocessing. */
  role?: string;
  /** Directory where oversized query results are stored for the grep tool. */
  sessionDir: string;
};
|
|
73
|
+
|
|
74
|
+
// eslint-disable-next-line @typescript-eslint/explicit-function-return-type
|
|
75
|
+
export function createTools(params: ToolFactoryParams) {
|
|
76
|
+
const { contexts, user, role, sessionDir } = params;
|
|
77
|
+
let queryCount = 0;
|
|
78
|
+
|
|
79
|
+
// ── execute_query ────────────────────────────────────────────────────────────
|
|
80
|
+
|
|
81
|
+
const execute_query = tool({
|
|
82
|
+
description: `Execute a read-only PostgreSQL SELECT query against the knowledge base.
|
|
83
|
+
|
|
84
|
+
Use this to search, filter, aggregate, and explore content. The database contains items
|
|
85
|
+
and chunks tables for each knowledge base (see schema in the system prompt).
|
|
86
|
+
|
|
87
|
+
Use embed('your text') anywhere in the query to generate a semantic search vector:
|
|
88
|
+
embedding <=> embed('machine learning') AS distance
|
|
89
|
+
|
|
90
|
+
If the result exceeds ${(MAX_INLINE_CHARS / 1000).toFixed(0)}k characters it is saved to a file.
|
|
91
|
+
Use the grep tool to iteratively search the file for relevant information.`,
|
|
92
|
+
inputSchema: z.object({
|
|
93
|
+
sql: z.string().describe("A read-only SELECT (or WITH ... SELECT) PostgreSQL query"),
|
|
94
|
+
}),
|
|
95
|
+
execute: async ({ sql }) => {
|
|
96
|
+
assertReadOnly(sql);
|
|
97
|
+
|
|
98
|
+
let processedSql: string;
|
|
99
|
+
try {
|
|
100
|
+
processedSql = await preprocessEmbedCalls(sql, contexts, user, role);
|
|
101
|
+
} catch (err: any) {
|
|
102
|
+
return JSON.stringify({ error: `embed() preprocessing failed: ${err.message}` });
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
let rows: any[];
|
|
106
|
+
try {
|
|
107
|
+
const { db } = await postgresClient();
|
|
108
|
+
const result = await db.raw(processedSql);
|
|
109
|
+
rows = result.rows ?? [];
|
|
110
|
+
} catch (err: any) {
|
|
111
|
+
return JSON.stringify({ error: `Query failed: ${err.message}` });
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
const json = JSON.stringify(rows, null, 2);
|
|
115
|
+
|
|
116
|
+
if (json.length <= MAX_INLINE_CHARS) {
|
|
117
|
+
return json;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Results are large — store to session dir and tell the agent to grep
|
|
121
|
+
await fs.mkdir(sessionDir, { recursive: true });
|
|
122
|
+
const filename = `query_${++queryCount}.json`;
|
|
123
|
+
const filePath = path.join(sessionDir, filename);
|
|
124
|
+
await fs.writeFile(filePath, json, "utf-8");
|
|
125
|
+
|
|
126
|
+
return JSON.stringify({
|
|
127
|
+
stored: true,
|
|
128
|
+
file: filePath,
|
|
129
|
+
row_count: rows.length,
|
|
130
|
+
message: `Results too large to display (${rows.length} rows, ${(json.length / 1000).toFixed(1)}k chars). Stored at ${filePath}. Use the grep tool to search for relevant information.`,
|
|
131
|
+
grep_hint: `grep -i "keyword" ${filePath}`,
|
|
132
|
+
});
|
|
133
|
+
},
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
// ── grep ─────────────────────────────────────────────────────────────────────
|
|
137
|
+
|
|
138
|
+
const grep = tool({
|
|
139
|
+
description: `Search a stored query result file using grep.
|
|
140
|
+
|
|
141
|
+
Use this after execute_query returns a file path because results were too large.
|
|
142
|
+
Iteratively narrow down the results with multiple grep calls.`,
|
|
143
|
+
inputSchema: z.object({
|
|
144
|
+
pattern: z.string().describe("Regular expression or literal string to search for"),
|
|
145
|
+
file: z.string().describe("Absolute path to the file returned by execute_query"),
|
|
146
|
+
context_lines: z
|
|
147
|
+
.number()
|
|
148
|
+
.int()
|
|
149
|
+
.min(0)
|
|
150
|
+
.max(10)
|
|
151
|
+
.default(2)
|
|
152
|
+
.describe("Number of lines of context to show around each match (default 2)"),
|
|
153
|
+
case_insensitive: z
|
|
154
|
+
.boolean()
|
|
155
|
+
.default(true)
|
|
156
|
+
.describe("Case-insensitive matching (default true)"),
|
|
157
|
+
}),
|
|
158
|
+
execute: async ({ pattern, file, context_lines, case_insensitive }) => {
|
|
159
|
+
// Security: only allow reading from our session directory
|
|
160
|
+
const resolvedFile = path.resolve(file);
|
|
161
|
+
const resolvedSession = path.resolve(sessionDir);
|
|
162
|
+
if (!resolvedFile.startsWith(resolvedSession)) {
|
|
163
|
+
return JSON.stringify({
|
|
164
|
+
error: `Access denied. Only files within the session directory (${sessionDir}) can be searched.`,
|
|
165
|
+
});
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
// Verify file exists
|
|
169
|
+
try {
|
|
170
|
+
await fs.access(resolvedFile);
|
|
171
|
+
} catch {
|
|
172
|
+
return JSON.stringify({ error: `File not found: ${file}` });
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
const flags = [
|
|
176
|
+
"-n",
|
|
177
|
+
context_lines > 0 ? `-C${context_lines}` : "",
|
|
178
|
+
case_insensitive ? "-i" : "",
|
|
179
|
+
]
|
|
180
|
+
.filter(Boolean)
|
|
181
|
+
.join(" ");
|
|
182
|
+
|
|
183
|
+
// Escape pattern for shell to prevent injection
|
|
184
|
+
const escapedPattern = pattern.replace(/'/g, `'\\''`);
|
|
185
|
+
const cmd = `grep ${flags} '${escapedPattern}' '${resolvedFile}'`;
|
|
186
|
+
|
|
187
|
+
let output: string;
|
|
188
|
+
try {
|
|
189
|
+
const { stdout } = await execAsync(cmd, { maxBuffer: 10 * 1024 * 1024 });
|
|
190
|
+
output = stdout;
|
|
191
|
+
} catch (err: any) {
|
|
192
|
+
// grep exits with code 1 when no matches — that's not an error
|
|
193
|
+
if (err.code === 1) {
|
|
194
|
+
return JSON.stringify({ matches: 0, output: "No matches found." });
|
|
195
|
+
}
|
|
196
|
+
return JSON.stringify({ error: `grep failed: ${err.message}` });
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
if (output.length > MAX_GREP_OUTPUT_CHARS) {
|
|
200
|
+
output =
|
|
201
|
+
output.slice(0, MAX_GREP_OUTPUT_CHARS) +
|
|
202
|
+
`\n... (output truncated at ${MAX_GREP_OUTPUT_CHARS} chars — refine your pattern to narrow results)`;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
const lineCount = output.split("\n").filter(Boolean).length;
|
|
206
|
+
return JSON.stringify({ matches: lineCount, output });
|
|
207
|
+
},
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
return { execute_query, grep };
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
/**
|
|
214
|
+
* Harvests ChunkResult objects from all tool results in a step.
|
|
215
|
+
* Called after each agent step to collect any chunk-shaped rows the agent retrieved.
|
|
216
|
+
*/
|
|
217
|
+
export function harvestChunks(toolResults: any[]): ChunkResult[] {
|
|
218
|
+
const chunks: ChunkResult[] = [];
|
|
219
|
+
|
|
220
|
+
for (const result of toolResults ?? []) {
|
|
221
|
+
const rawOutput = result.output ?? result.result;
|
|
222
|
+
let parsed: any;
|
|
223
|
+
try {
|
|
224
|
+
parsed = typeof rawOutput === "string" ? JSON.parse(rawOutput) : rawOutput;
|
|
225
|
+
} catch {
|
|
226
|
+
continue;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// Array of rows (direct SELECT result)
|
|
230
|
+
if (Array.isArray(parsed)) {
|
|
231
|
+
for (const row of parsed) {
|
|
232
|
+
if (row && typeof row === "object") {
|
|
233
|
+
const chunk = rowToChunkResult(row);
|
|
234
|
+
if (chunk) chunks.push(chunk);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
return chunks;
|
|
241
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/** One retrieved chunk, normalized from a raw SQL result row. */
export interface ChunkResult {
  /** Display name of the source item; "" when the row did not provide one. */
  item_name: string;
  /** Identifier of the source item; "" when the row did not provide one. */
  item_id: string;
  /** Knowledge-base context the chunk came from; "" when unknown. */
  context: string;
  /** Chunk identifier, when the row carried one. */
  chunk_id?: string;
  /** Position of the chunk within its item, when available. */
  chunk_index?: number;
  /** The chunk's text content, when selected by the query. */
  chunk_content?: string;
  /** Arbitrary metadata attached to the chunk, when available. */
  metadata?: Record<string, any>;
}

/** Record of a single step taken by the retrieval agent. */
export interface RetrievalStep {
  /** Ordinal of this step within the agent loop. */
  stepNumber: number;
  /** Text the model produced during this step. */
  text: string;
  /** Tool invocations the model made during this step. */
  toolCalls: Array<{ name: string; id: string; input: any }>;
  /** Chunks harvested from this step's tool results. */
  chunks: ChunkResult[];
  /** Token count attributed to this step. */
  tokens: number;
}

/** Final output of an agentic retrieval run. */
export interface AgenticRetrievalOutput {
  /** Per-step records, in execution order. */
  steps: RetrievalStep[];
  /** Reasoning trace: model text paired with the tool calls it made. */
  reasoning: Array<{
    text: string;
    tools: { name: string; id: string; input: any; output: any }[];
  }>;
  // NOTE(review): presumably the union of all steps' chunks — confirm whether
  // deduplication happens before this is populated.
  chunks: ChunkResult[];
  /** Raw usage records as reported by the model provider. */
  usage: any[];
  /** Total tokens consumed across the run. */
  totalTokens: number;
  /** Path to the stored trajectory log, when one was written. */
  trajectoryFile?: string;
}
|
package/ee/chunking/markdown.ts
CHANGED
|
@@ -516,7 +516,9 @@ export class MarkdownChunker {
|
|
|
516
516
|
return newHeaders;
|
|
517
517
|
}
|
|
518
518
|
|
|
519
|
-
public async chunk(text: string, chunkSize: number, prefix?: string
|
|
519
|
+
public async chunk(text: string, chunkSize: number, prefix?: string, config?: {
|
|
520
|
+
pageBreakTags?: boolean;
|
|
521
|
+
}): Promise<{
|
|
520
522
|
text: string;
|
|
521
523
|
page: number;
|
|
522
524
|
}[]> {
|
|
@@ -684,7 +686,7 @@ export class MarkdownChunker {
|
|
|
684
686
|
finalText = headerPrefixText + '\n\n' + currentSlice;
|
|
685
687
|
}
|
|
686
688
|
|
|
687
|
-
if (currentPage) {
|
|
689
|
+
if (currentPage && config?.pageBreakTags) {
|
|
688
690
|
finalText = `<!-- Current page: ${currentPage} -->\n\n` + finalText;
|
|
689
691
|
}
|
|
690
692
|
|
package/ee/workers.ts
CHANGED
|
@@ -1311,7 +1311,7 @@ const pollJobResult = async ({
|
|
|
1311
1311
|
break;
|
|
1312
1312
|
}
|
|
1313
1313
|
// Wait for 2 seconds before polling again
|
|
1314
|
-
await new Promise((resolve) => setTimeout((
|
|
1314
|
+
await new Promise((resolve) => setTimeout(() => resolve(true), 2000));
|
|
1315
1315
|
}
|
|
1316
1316
|
return result;
|
|
1317
1317
|
};
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@exulu/backend",
|
|
3
3
|
"author": "Qventu Bv.",
|
|
4
|
-
"version": "1.
|
|
4
|
+
"version": "1.54.0",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"private": false,
|
|
7
7
|
"publishConfig": {
|
|
@@ -87,6 +87,7 @@
|
|
|
87
87
|
},
|
|
88
88
|
"dependencies": {
|
|
89
89
|
"@ai-sdk/anthropic": "^3.0.23",
|
|
90
|
+
"@ai-sdk/azure": "^3.0.53",
|
|
90
91
|
"@ai-sdk/cerebras": "^2.0.29",
|
|
91
92
|
"@ai-sdk/google-vertex": "^4.0.28",
|
|
92
93
|
"@ai-sdk/openai": "^3.0.18",
|
|
@@ -111,6 +112,7 @@
|
|
|
111
112
|
"@opentelemetry/winston-transport": "^0.14.1",
|
|
112
113
|
"@perplexity-ai/perplexity_ai": "^0.25.0",
|
|
113
114
|
"ai": "^6.0.49",
|
|
115
|
+
"bash-tool": "^1.3.16",
|
|
114
116
|
"bcryptjs": "^3.0.2",
|
|
115
117
|
"body-parser": "^2.2.0",
|
|
116
118
|
"bullmq": "^5.48.1",
|
|
@@ -132,12 +134,13 @@
|
|
|
132
134
|
"jose": "^6.0.10",
|
|
133
135
|
"json-schema-to-zod": "^2.6.1",
|
|
134
136
|
"jsonwebtoken": "^9.0.2",
|
|
137
|
+
"just-bash": "^2.14.0",
|
|
135
138
|
"knex": "^3.1.0",
|
|
136
139
|
"link": "^2.1.1",
|
|
137
140
|
"mammoth": "^1.11.0",
|
|
138
141
|
"natural": "^8.1.0",
|
|
139
142
|
"officeparser": "^5.2.2",
|
|
140
|
-
"openai": "^
|
|
143
|
+
"openai": "^6.34.0",
|
|
141
144
|
"p-limit": "^7.3.0",
|
|
142
145
|
"papaparse": "^5.5.2",
|
|
143
146
|
"pg": "^8.16.3",
|
|
@@ -150,7 +153,7 @@
|
|
|
150
153
|
"wink-nlp": "^2.4.0",
|
|
151
154
|
"winston": "^3.17.0",
|
|
152
155
|
"word-extractor": "^1.0.4",
|
|
153
|
-
"zod": "^3.
|
|
156
|
+
"zod": "^4.3.6",
|
|
154
157
|
"zod-from-json-schema": "^0.5.2",
|
|
155
158
|
"zod-to-json-schema": "^3.25.1",
|
|
156
159
|
"zodex": "^0.18.2"
|