agentlang 0.9.9 → 0.9.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/extension/main.cjs +38 -38
- package/out/extension/main.cjs.map +2 -2
- package/out/language/generated/ast.d.ts +1 -1
- package/out/language/generated/ast.js +1 -1
- package/out/language/generated/grammar.d.ts +1 -1
- package/out/language/generated/grammar.js +1 -1
- package/out/language/generated/module.d.ts +1 -1
- package/out/language/generated/module.js +1 -1
- package/out/language/main.cjs +850 -2388
- package/out/language/main.cjs.map +4 -4
- package/out/runtime/agents/common.d.ts +3 -1
- package/out/runtime/agents/common.d.ts.map +1 -1
- package/out/runtime/agents/common.js +35 -31
- package/out/runtime/agents/common.js.map +1 -1
- package/out/runtime/docs.d.ts +1 -0
- package/out/runtime/docs.d.ts.map +1 -1
- package/out/runtime/docs.js +16 -1
- package/out/runtime/docs.js.map +1 -1
- package/out/runtime/interpreter.d.ts +1 -0
- package/out/runtime/interpreter.d.ts.map +1 -1
- package/out/runtime/interpreter.js +60 -9
- package/out/runtime/interpreter.js.map +1 -1
- package/out/runtime/jsmodules.d.ts +2 -1
- package/out/runtime/jsmodules.d.ts.map +1 -1
- package/out/runtime/jsmodules.js +2 -1
- package/out/runtime/jsmodules.js.map +1 -1
- package/out/runtime/loader.d.ts.map +1 -1
- package/out/runtime/loader.js +3 -2
- package/out/runtime/loader.js.map +1 -1
- package/out/runtime/module.d.ts +1 -0
- package/out/runtime/module.d.ts.map +1 -1
- package/out/runtime/module.js +3 -0
- package/out/runtime/module.js.map +1 -1
- package/out/runtime/modules/ai.d.ts +12 -0
- package/out/runtime/modules/ai.d.ts.map +1 -1
- package/out/runtime/modules/ai.js +225 -28
- package/out/runtime/modules/ai.js.map +1 -1
- package/out/runtime/modules/core.d.ts.map +1 -1
- package/out/runtime/modules/core.js +7 -1
- package/out/runtime/modules/core.js.map +1 -1
- package/out/runtime/resolvers/sqldb/impl.d.ts.map +1 -1
- package/out/runtime/resolvers/sqldb/impl.js +37 -6
- package/out/runtime/resolvers/sqldb/impl.js.map +1 -1
- package/out/runtime/services/documentFetcher.d.ts +70 -0
- package/out/runtime/services/documentFetcher.d.ts.map +1 -0
- package/out/runtime/services/documentFetcher.js +582 -0
- package/out/runtime/services/documentFetcher.js.map +1 -0
- package/package.json +2 -1
- package/src/language/generated/ast.ts +1 -1
- package/src/language/generated/grammar.ts +1 -1
- package/src/language/generated/module.ts +1 -1
- package/src/runtime/agents/common.ts +37 -31
- package/src/runtime/docs.ts +17 -1
- package/src/runtime/interpreter.ts +64 -7
- package/src/runtime/jsmodules.ts +3 -1
- package/src/runtime/loader.ts +3 -2
- package/src/runtime/module.ts +4 -0
- package/src/runtime/modules/ai.ts +270 -33
- package/src/runtime/modules/core.ts +7 -1
- package/src/runtime/resolvers/sqldb/impl.ts +36 -6
- package/src/runtime/services/documentFetcher.ts +691 -0
|
@@ -0,0 +1,691 @@
|
|
|
1
|
+
import { GetObjectCommand, S3Client } from '@aws-sdk/client-s3';
|
|
2
|
+
import { readFile } from 'node:fs/promises';
|
|
3
|
+
import { logger } from '../logger.js';
|
|
4
|
+
import { parseAndEvaluateStatement } from '../interpreter.js';
|
|
5
|
+
import { CoreAIModuleName } from '../modules/ai.js';
|
|
6
|
+
import { TtlCache } from '../state.js';
|
|
7
|
+
import { preprocessRawConfig } from '../util.js';
|
|
8
|
+
import { marked } from 'marked';
|
|
9
|
+
import { isNodeEnv } from '../../utils/runtime.js';
|
|
10
|
+
|
|
11
|
+
// Provider-specific configurations
|
|
12
|
+
/**
 * Connection settings for reading a document from S3 (or an S3-compatible
 * store). Every field is optional.
 */
export interface S3Config {
  /** Region name handed to the S3 client. */
  region?: string;
  /** Custom endpoint handed to the S3 client. */
  endpoint?: string;
  /** Access key id; only used as a credential when `secretAccessKey` is also set. */
  accessKeyId?: string;
  /** Secret access key paired with `accessKeyId`. */
  secretAccessKey?: string;
  /** Forwarded to the S3 client's `forcePathStyle` option. */
  forcePathStyle?: boolean;
}
|
|
19
|
+
|
|
20
|
+
// Generic retrieval configuration for any storage provider
|
|
21
|
+
/**
 * Storage-provider selection plus that provider's settings.
 */
export interface RetrievalConfig {
  /**
   * Provider identifier. The literal members document the known providers;
   * because the union also contains `string`, any string is accepted.
   */
  provider: 's3' | 'box' | 'gdrive' | 'azure' | 'onedrive' | 'document-service' | string;
  /** Provider-specific settings; an `S3Config` when `provider` is `'s3'`. */
  config: S3Config | Record<string, any>;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Embedding options attached to a document. This file does not interpret the
 * fields; it copies the object onto the fetched document and serializes it
 * into the Document entity.
 */
export interface EmbeddingConfig {
  /** Embedding provider name (presumably consumed by the embedding layer — confirm). */
  provider?: string;
  /** Embedding model name. */
  model?: string;
  /** Chunk size (units not established in this file). */
  chunkSize?: number;
  /** Overlap between consecutive chunks (units not established in this file). */
  chunkOverlap?: number;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Describes one document and where its content can be fetched from.
 */
export interface DocumentConfig {
  /** Document title; also used for cache keys and title lookups. */
  title: string;
  /**
   * Source location. `document-service://`, `s3://`, `http://` and `https://`
   * are dispatched to their fetchers; any other value is read as a local path.
   */
  url?: string;
  /** Id of the document inside the document service. */
  documentServiceId?: string;
  /** Provider selection and settings used when fetching. */
  retrievalConfig?: RetrievalConfig;
  /** Embedding options carried through to the fetched document. */
  embeddingConfig?: EmbeddingConfig;
}
|
|
40
|
+
|
|
41
|
+
/**
 * A document after its content has been retrieved and converted to text.
 */
export interface FetchedDocument {
  /** Title copied from the requesting config. */
  title: string;
  /** Text content of the document. */
  content: string;
  /** Location the content was read from. */
  url: string;
  /** Format label inferred from `url`. */
  format: string;
  /** Time at which the document object was built. */
  fetchedAt: Date;
  /** Embedding options copied from the requesting config. */
  embeddingConfig?: EmbeddingConfig;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Connection settings for the document-service HTTP API.
 */
interface DocumentServiceConfig {
  /** Prefix for the `/api/documents/...` endpoints. */
  baseUrl: string;
  /** Sent as the `x-app-name` request header. */
  appName: string;
  /** Static bearer token; takes precedence over `getAuthToken` when set. */
  authToken?: string;
  /** Called to obtain a bearer token when `authToken` is not set. */
  getAuthToken?: () => Promise<string>;
}
|
|
56
|
+
|
|
57
|
+
class DocumentFetcherService {
|
|
58
|
+
private static readonly CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
59
|
+
private documentCache = new TtlCache<FetchedDocument>(DocumentFetcherService.CACHE_TTL_MS);
|
|
60
|
+
private s3Clients = new Map<string, any>();
|
|
61
|
+
private pdfParser: any = null;
|
|
62
|
+
private documentServiceConfig?: DocumentServiceConfig;
|
|
63
|
+
|
|
64
|
+
/**
 * Stores the document-service settings used by later fetches and title
 * lookups on this instance, replacing any previously stored value.
 *
 * @param config - Connection settings to install.
 */
configureDocumentService(config: DocumentServiceConfig): void {
  const { baseUrl } = config;
  this.documentServiceConfig = config;
  logger.info('Document service configured', { baseUrl });
}
|
|
68
|
+
|
|
69
|
+
/**
 * Resolves a document's text from whichever source its config points at,
 * caches the result, and records it as a Document entity.
 *
 * Sources are tried in this order: a `document-service://` URL, a
 * `document-service` retrieval provider (title lookup), an explicit
 * `documentServiceId` when a service is configured, `s3://`, `http(s)://`,
 * any other URL (read as a local path), and finally a title lookup against
 * the configured service.
 *
 * @param config - Describes the document and where to get it.
 * @returns The cached or freshly fetched document.
 * @throws Error when no usable source exists or the fetch fails; the failure
 *   is logged before being rethrown.
 */
async fetchDocument(config: DocumentConfig): Promise<FetchedDocument | null> {
  this.ensureNodeEnv();

  const { title, url, documentServiceId, retrievalConfig } = config;
  const cacheKey = `${title}:${url || documentServiceId}`;
  const hit = this.documentCache.get(cacheKey);
  if (hit) {
    logger.debug('Returning cached document', { title });
    return hit;
  }

  // Looks the title up in the document service and downloads the match.
  const fetchByTitle = async (notFoundMessage: string): Promise<[string, string]> => {
    const foundId = await this.lookupDocumentByTitle(title);
    if (!foundId) {
      throw new Error(notFoundMessage);
    }
    const text = await this.fetchFromDocumentService(foundId);
    return [text, `document-service://${foundId}`];
  };

  try {
    let content: string;
    let sourceUrl: string;

    if (url?.startsWith('document-service://')) {
      if (!retrievalConfig || retrievalConfig.provider !== 'document-service') {
        throw new Error(
          'Document service URL requires retrievalConfig with provider: "document-service"'
        );
      }
      const serviceSettings = retrievalConfig.config as DocumentServiceConfig;
      if (!serviceSettings?.baseUrl) {
        throw new Error('Document service config requires baseUrl');
      }

      // Expected layout: <user-uuid>/<app-uuid>/<doc-uuid>.ext
      const segments = url.replace('document-service://', '').split('/');
      if (segments.length !== 3) {
        throw new Error(
          `Invalid document service URL format: ${url}. Expected: document-service://<user-uuid>/<app-uuid>/<doc-uuid>.ext`
        );
      }
      const [, appUuid, docFile] = segments;

      // NOTE(review): this overwrites the instance-wide service config with
      // per-document settings; concurrent fetches share that state.
      this.documentServiceConfig = {
        baseUrl: serviceSettings.baseUrl,
        appName: appUuid,
        authToken: serviceSettings.authToken,
        getAuthToken: serviceSettings.getAuthToken,
      };

      // The document id is the file name up to its first dot.
      content = await this.fetchFromDocumentService(docFile.split('.')[0]);
      sourceUrl = url;
    } else if (retrievalConfig?.provider === 'document-service') {
      const serviceSettings = retrievalConfig.config as DocumentServiceConfig;
      if (!serviceSettings?.baseUrl || !serviceSettings?.appName) {
        throw new Error('Document service config requires baseUrl and appName');
      }

      // NOTE(review): same instance-wide overwrite as in the branch above.
      this.documentServiceConfig = {
        baseUrl: serviceSettings.baseUrl,
        appName: serviceSettings.appName,
        authToken: serviceSettings.authToken,
        getAuthToken: serviceSettings.getAuthToken,
      };

      [content, sourceUrl] = await fetchByTitle(
        `Document not found by title in document service: ${title}`
      );
    } else if (documentServiceId && this.documentServiceConfig) {
      content = await this.fetchFromDocumentService(documentServiceId);
      sourceUrl = `document-service://${documentServiceId}`;
    } else if (url?.startsWith('s3://')) {
      content = await this.fetchFromS3(config);
      sourceUrl = url;
    } else if (url?.startsWith('http://') || url?.startsWith('https://')) {
      content = await this.fetchFromUrl(url);
      sourceUrl = url;
    } else if (url) {
      content = await this.fetchFromLocal(url);
      sourceUrl = url;
    } else if (this.documentServiceConfig) {
      [content, sourceUrl] = await fetchByTitle(`Document not found by title: ${title}`);
    } else {
      throw new Error(`No URL or document service ID provided for: ${title}`);
    }

    const fetched: FetchedDocument = {
      title,
      content,
      url: sourceUrl,
      format: this.inferFormat(sourceUrl),
      fetchedAt: new Date(),
      embeddingConfig: config.embeddingConfig,
    };

    this.documentCache.set(cacheKey, fetched);
    await this.createDocumentEntity(fetched);
    return fetched;
  } catch (error) {
    const isError = error instanceof Error;
    logger.error('Failed to fetch document', {
      title,
      url,
      documentServiceId,
      error: isError ? error.message : String(error),
      stack: isError ? error.stack : undefined,
    });
    throw error;
  }
}
|
|
188
|
+
|
|
189
|
+
/**
 * Finds a document by title and fetches it.
 *
 * The title is resolved through the document service when one is configured,
 * otherwise (or when the service has no match) through the configured
 * document list. The result is cached under a title-specific key so repeated
 * lookups skip the service round-trip; before this change that key was read
 * but never written, so the cache check could never hit.
 *
 * @param title - Title to look for.
 * @returns The document, or `null` when the title cannot be resolved or the
 *   resolution step fails.
 * @throws Error when the title resolves but fetching its content fails; fetch
 *   failures are deliberately not swallowed here.
 */
async fetchDocumentByTitle(title: string): Promise<FetchedDocument | null> {
  this.ensureNodeEnv();

  const cacheKey = `${title}:lookup`;
  let resolved: DocumentConfig | null = null;

  try {
    const cached = this.documentCache.get(cacheKey);
    if (cached) {
      logger.debug('Returning cached document by title', { title });
      return cached;
    }

    // Prefer the document service when it is configured.
    if (this.documentServiceConfig) {
      const docId = await this.lookupDocumentByTitle(title);
      if (docId) {
        resolved = { title, documentServiceId: docId };
      }
    }

    // Otherwise fall back to the documents declared in configuration.
    if (!resolved) {
      resolved = this.findDocumentInConfig(title);
    }
  } catch (error) {
    logger.error('Failed to fetch document by title', {
      title,
      error: error instanceof Error ? error.message : String(error),
    });
    return null;
  }

  if (!resolved) {
    logger.warn('Document not found', { title });
    return null;
  }

  const document = await this.fetchDocument(resolved);
  if (document) {
    // Populate the key that the cache check above reads.
    this.documentCache.set(cacheKey, document);
  }
  return document;
}
|
|
225
|
+
|
|
226
|
+
// Fetch from secure document-service API
|
|
227
|
+
/**
 * Downloads a document's content from the document-service API and returns
 * it as text.
 *
 * Reads `content`, `isBase64`, `mimeType` and `format` from the JSON
 * response. Base64 PDFs are converted to text, other base64 payloads are
 * decoded as UTF-8, and markdown is flattened to plain text.
 *
 * Changes from the earlier version: the service config is snapshotted once
 * so it cannot change between awaits, the document id is URL-encoded, a
 * trailing slash on `baseUrl` no longer produces `//`, the request has the
 * same 30 s timeout as plain URL fetches, and a response without string
 * `content` is rejected instead of being returned as-is.
 *
 * @param documentId - Id of the document inside the service.
 * @returns The document text.
 * @throws Error when the service is not configured, no token source exists,
 *   the service answers with a non-2xx status, the payload has no string
 *   content, or PDF extraction fails.
 */
private async fetchFromDocumentService(documentId: string): Promise<string> {
  const serviceConfig = this.documentServiceConfig;
  if (!serviceConfig) {
    throw new Error('Document service not configured');
  }

  try {
    // A static token wins; otherwise ask the provider function.
    let token: string;
    if (serviceConfig.authToken) {
      token = serviceConfig.authToken;
    } else if (serviceConfig.getAuthToken) {
      token = await serviceConfig.getAuthToken();
    } else {
      throw new Error('Document service requires authToken or getAuthToken');
    }

    const baseUrl = serviceConfig.baseUrl.replace(/\/+$/, '');
    const url = `${baseUrl}/api/documents/${encodeURIComponent(documentId)}/content`;

    logger.debug('Fetching from document service', { documentId, url });

    const response = await fetch(url, {
      headers: {
        Authorization: `Bearer ${token}`,
        'x-app-name': serviceConfig.appName,
        Accept: 'application/json',
      },
      signal: AbortSignal.timeout(30000),
    });

    if (!response.ok) {
      if (response.status === 404) {
        throw new Error(`Document not found: ${documentId}`);
      }
      if (response.status === 403) {
        throw new Error(`Access denied to document: ${documentId}`);
      }
      throw new Error(`Document service error: ${response.status} ${response.statusText}`);
    }

    const payload = (await response.json()) as Record<string, unknown> | null;
    const content = payload?.content;
    if (typeof content !== 'string') {
      throw new Error(`Document service returned no text content for: ${documentId}`);
    }
    const mimeType = typeof payload?.mimeType === 'string' ? payload.mimeType : '';
    const format = typeof payload?.format === 'string' ? payload.format.toLowerCase() : '';

    if (payload?.isBase64) {
      if (mimeType.includes('pdf') || format === 'pdf') {
        try {
          const { parsePdfBuffer } = await import('../docs.js');
          const text = await parsePdfBuffer(new Uint8Array(Buffer.from(content, 'base64')));
          logger.debug('Extracted text from PDF', { documentId, textLength: text.length });
          return text;
        } catch (pdfError) {
          const reason = pdfError instanceof Error ? pdfError.message : String(pdfError);
          logger.error('Failed to parse PDF from document service', {
            documentId,
            error: reason,
          });
          throw new Error(`Failed to extract text from PDF: ${reason}`);
        }
      }
      return Buffer.from(content, 'base64').toString('utf-8');
    }

    if (format === 'md' || format === 'markdown') {
      try {
        const parsedText = this.parseMarkdownText(content);
        logger.debug('Parsed markdown content', { documentId, textLength: parsedText.length });
        return parsedText;
      } catch (mdError) {
        logger.warn('Markdown parsing failed, returning raw content', {
          documentId,
          error: mdError instanceof Error ? mdError.message : String(mdError),
        });
        return content;
      }
    }

    return content;
  } catch (error) {
    logger.error('Document service fetch failed', {
      documentId,
      error: error instanceof Error ? error.message : String(error),
    });
    throw error;
  }
}
|
|
309
|
+
|
|
310
|
+
/**
 * Asks the document service for the id of the document with the given title.
 *
 * Changes from the earlier version: the service config is snapshotted once
 * so it cannot change between awaits, a trailing slash on `baseUrl` is
 * tolerated, the request has a 30 s timeout, and a response without a
 * non-empty string `documentId` yields `null` rather than `undefined`.
 *
 * @param title - Title to look up.
 * @returns The document id, or `null` when the service is not configured,
 *   has no match, or the lookup fails (failures are logged, not thrown).
 */
private async lookupDocumentByTitle(title: string): Promise<string | null> {
  const serviceConfig = this.documentServiceConfig;
  if (!serviceConfig) {
    return null;
  }

  try {
    // A static token wins; otherwise ask the provider function.
    let token: string;
    if (serviceConfig.authToken) {
      token = serviceConfig.authToken;
    } else if (serviceConfig.getAuthToken) {
      token = await serviceConfig.getAuthToken();
    } else {
      throw new Error('Document service requires authToken or getAuthToken');
    }

    const baseUrl = serviceConfig.baseUrl.replace(/\/+$/, '');
    const url = `${baseUrl}/api/documents/lookup/by-title?title=${encodeURIComponent(title)}`;

    logger.debug('Looking up document by title', { title, url });

    const response = await fetch(url, {
      headers: {
        Authorization: `Bearer ${token}`,
        'x-app-name': serviceConfig.appName,
        Accept: 'application/json',
      },
      signal: AbortSignal.timeout(30000),
    });

    if (response.status === 404) {
      logger.debug('Document not found by title', { title });
      return null;
    }
    if (!response.ok) {
      throw new Error(`Document service lookup error: ${response.status}`);
    }

    const payload = (await response.json()) as Record<string, unknown> | null;
    const documentId = payload?.documentId;
    if (typeof documentId !== 'string' || documentId.length === 0) {
      logger.debug('Document not found by title', { title });
      return null;
    }

    logger.debug('Found document by title', { title, documentId });
    return documentId;
  } catch (error) {
    logger.error('Document lookup failed', {
      title,
      error: error instanceof Error ? error.message : String(error),
    });
    return null;
  }
}
|
|
357
|
+
|
|
358
|
+
/**
 * Downloads an `s3://bucket/key` object and returns it as text.
 *
 * PDFs (by content type or `.pdf` key) go through the PDF parser, markdown
 * (by content type or `.md` / `.markdown` / `.mdown` key) is flattened to
 * plain text, and anything else is decoded as UTF-8.
 *
 * @param config - Document config; `url` must be set.
 * @returns The object's text.
 * @throws Error wrapping any failure, with bucket, key and region in the message.
 */
private async fetchFromS3(config: DocumentConfig): Promise<string> {
  const location = this.parseS3Url(config.url!, config.retrievalConfig);
  const s3 = await this.getOrCreateS3Client(location);

  try {
    const result = await s3.send(
      new GetObjectCommand({ Bucket: location.bucket, Key: location.key })
    );
    if (!result.Body) {
      throw new Error('S3 object has no body');
    }

    const bytes = await this.readS3BodyToBuffer(result.Body as any);
    const mime = (result.ContentType || '').toLowerCase();
    const keyLower = location.key.toLowerCase();

    if (mime.includes('application/pdf') || keyLower.endsWith('.pdf')) {
      return await this.parsePdfBuffer(bytes);
    }

    const text = bytes.toString('utf-8');
    const looksLikeMarkdown =
      mime.includes('text/markdown') ||
      ['.md', '.markdown', '.mdown'].some(suffix => keyLower.endsWith(suffix));
    return looksLikeMarkdown ? this.parseMarkdownText(text) : text;
  } catch (error) {
    const isError = error instanceof Error;
    const errorMessage = isError ? error.message : String(error);
    logger.error('S3 fetch failed', {
      url: config.url,
      bucket: location.bucket,
      key: location.key,
      region: location.region,
      hasAccessKey: !!location.accessKeyId,
      error: errorMessage,
      stack: isError ? error.stack : undefined,
    });
    throw new Error(
      `Failed to fetch from S3 (bucket: ${location.bucket}, key: ${location.key}, region: ${location.region}): ${errorMessage}`
    );
  }
}
|
|
406
|
+
|
|
407
|
+
/**
 * Downloads an HTTP(S) resource and returns it as text, flattening markdown
 * to plain text.
 *
 * The 50 MB limit is now enforced while downloading: a declared
 * `content-length` above the limit is rejected up front, and the body is
 * read incrementally and cancelled as soon as it exceeds the limit.
 * Previously the whole body was buffered before the size was checked.
 * Markdown detection now looks at the URL's path, so a query string or
 * fragment after `.md` no longer hides the extension.
 *
 * @param url - Absolute `http://` or `https://` URL.
 * @returns The response body as text.
 * @throws Error on a non-2xx status, timeout (30 s), or oversized body; the
 *   failure is logged before being rethrown.
 */
private async fetchFromUrl(url: string): Promise<string> {
  const maxSize = 50 * 1024 * 1024;

  try {
    const response = await fetch(url, {
      signal: AbortSignal.timeout(30000),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }

    // Reject early when the server declares an oversized body.
    const declaredSize = Number(response.headers.get('content-length'));
    if (Number.isFinite(declaredSize) && declaredSize > maxSize) {
      await response.body?.cancel();
      throw new Error(`Response too large: ${declaredSize} bytes`);
    }

    // Read incrementally so an undeclared or mis-declared size cannot make
    // us buffer more than the limit.
    const chunks: Uint8Array[] = [];
    let received = 0;
    if (response.body) {
      const reader = response.body.getReader();
      for (;;) {
        const { done, value } = await reader.read();
        if (done || !value) {
          break;
        }
        received += value.byteLength;
        if (received > maxSize) {
          await reader.cancel();
          throw new Error(`Response too large: more than ${maxSize} bytes`);
        }
        chunks.push(value);
      }
    }
    const text = Buffer.concat(chunks).toString('utf-8');

    const contentType = (response.headers.get('content-type') || '').toLowerCase();
    const lowerPath = new URL(url).pathname.toLowerCase();
    const isMarkdown =
      contentType.includes('text/markdown') ||
      lowerPath.endsWith('.md') ||
      lowerPath.endsWith('.markdown');

    return isMarkdown ? this.parseMarkdownText(text) : text;
  } catch (error) {
    logger.error('URL fetch failed', {
      url,
      error: error instanceof Error ? error.message : String(error),
    });
    throw error;
  }
}
|
|
443
|
+
|
|
444
|
+
/**
 * Reads a local file as UTF-8 text, flattening markdown (`.md`,
 * `.markdown`) to plain text.
 *
 * `file://` URLs are now accepted: they reach this method because it handles
 * every URL that is not document-service, S3 or HTTP(S), and they previously
 * always failed because the URL string was used as a plain path.
 *
 * @param filePath - File system path or `file://` URL.
 * @returns The file's text.
 * @throws The underlying read error, after logging it.
 */
private async fetchFromLocal(filePath: string): Promise<string> {
  try {
    // readFile accepts a URL object for file: locations.
    const target = filePath.startsWith('file://') ? new URL(filePath) : filePath;
    const content = await readFile(target, 'utf-8');

    const lowerPath = filePath.toLowerCase();
    const isMarkdown = lowerPath.endsWith('.md') || lowerPath.endsWith('.markdown');

    return isMarkdown ? this.parseMarkdownText(content) : content;
  } catch (error) {
    logger.error('Local file read failed', {
      path: filePath,
      error: error instanceof Error ? error.message : String(error),
    });
    throw error;
  }
}
|
|
463
|
+
|
|
464
|
+
/**
 * Splits an `s3://bucket/key` URL and merges in connection settings.
 *
 * Settings come from `retrievalConfig.config` when its provider is `'s3'`;
 * region and credentials fall back to `AWS_REGION` (then `'us-east-1'`),
 * `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
 *
 * An empty bucket or key (`s3://bucket/`, `s3:///key`) is now rejected here
 * instead of being passed on to the S3 request.
 *
 * @param url - URL in `s3://bucket/key` form.
 * @param retrievalConfig - Optional provider settings for the document.
 * @returns Bucket, key and the resolved client settings.
 * @throws Error when the URL has no bucket or no key.
 */
private parseS3Url(
  url: string,
  retrievalConfig?: RetrievalConfig
): {
  bucket: string;
  key: string;
  region: string;
  endpoint?: string;
  accessKeyId?: string;
  secretAccessKey?: string;
  forcePathStyle?: boolean;
} {
  const scheme = 's3://';
  const withoutScheme = url.startsWith(scheme) ? url.slice(scheme.length) : url;
  const slashAt = withoutScheme.indexOf('/');

  // Everything before the first slash is the bucket, the rest is the key.
  const bucket = slashAt === -1 ? '' : withoutScheme.slice(0, slashAt);
  const key = slashAt === -1 ? '' : withoutScheme.slice(slashAt + 1);
  if (!bucket || !key) {
    throw new Error(`Invalid S3 URL format: ${url}`);
  }

  const normalized = this.normalizeRetrievalConfig(retrievalConfig);
  const s3Settings: S3Config =
    normalized?.provider === 's3' && normalized.config ? (normalized.config as S3Config) : {};

  return {
    bucket,
    key,
    region: s3Settings.region || process.env.AWS_REGION || 'us-east-1',
    endpoint: s3Settings.endpoint,
    accessKeyId: s3Settings.accessKeyId || process.env.AWS_ACCESS_KEY_ID,
    secretAccessKey: s3Settings.secretAccessKey || process.env.AWS_SECRET_ACCESS_KEY,
    forcePathStyle: s3Settings.forcePathStyle,
  };
}
|
|
505
|
+
|
|
506
|
+
/**
 * Runs a retrieval config through `preprocessRawConfig` (the config may
 * arrive in the nested form produced by Agentlang).
 *
 * @param config - Raw retrieval config, if any.
 * @returns The preprocessed config, or `undefined` when none was given.
 */
private normalizeRetrievalConfig(config?: RetrievalConfig): RetrievalConfig | undefined {
  return config ? (preprocessRawConfig(config) as RetrievalConfig) : undefined;
}
|
|
516
|
+
|
|
517
|
+
/**
 * Returns an S3 client for the given settings, creating and caching one on
 * first use.
 *
 * The cache key now includes `forcePathStyle`; before, two configurations
 * that differed only in that flag silently shared whichever client was
 * created first.
 *
 * NOTE(review): the key still omits `secretAccessKey`, so a changed secret
 * for the same access key id keeps using the cached client.
 *
 * @param config - Region, optional endpoint, optional credentials, and
 *   addressing style.
 * @returns The cached or newly created client.
 */
private async getOrCreateS3Client(config: {
  region: string;
  endpoint?: string;
  accessKeyId?: string;
  secretAccessKey?: string;
  forcePathStyle?: boolean;
}): Promise<any> {
  const clientKey = [
    config.region,
    config.endpoint || 'default',
    config.accessKeyId || 'default',
    config.forcePathStyle ? 'path-style' : 'default-style',
  ].join(':');

  let client = this.s3Clients.get(clientKey);
  if (!client) {
    // Explicit credentials are only used when both halves are present;
    // otherwise the SDK resolves credentials on its own.
    const credentials =
      config.accessKeyId && config.secretAccessKey
        ? { accessKeyId: config.accessKeyId, secretAccessKey: config.secretAccessKey }
        : undefined;

    client = new S3Client({
      region: config.region,
      endpoint: config.endpoint,
      forcePathStyle: config.forcePathStyle,
      credentials,
    });
    this.s3Clients.set(clientKey, client);
  }

  return client;
}
|
|
545
|
+
|
|
546
|
+
/**
 * Extracts text from a PDF using `pdf-parse`, which is imported on first use
 * and then kept on the instance.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The extracted text, or an empty string when the parser reports none.
 * @throws Error when `pdf-parse` cannot be loaded or parsing fails.
 */
private async parsePdfBuffer(buffer: Buffer): Promise<string> {
  if (!this.pdfParser) {
    try {
      const loaded = await import('pdf-parse');
      // ES module builds expose the parser as `default`; CommonJS builds
      // export it directly.
      this.pdfParser = (loaded as any).default || loaded;
    } catch (error) {
      logger.error('Failed to load PDF parser', { error });
      throw new Error(
        'PDF parsing not available. Please install pdf-parse: npm install pdf-parse'
      );
    }
  }

  try {
    const parsed = await this.pdfParser(buffer);
    return parsed.text || '';
  } catch (error) {
    logger.error('PDF parsing failed', { error });
    throw new Error(`Failed to parse PDF: ${error}`);
  }
}
|
|
570
|
+
|
|
571
|
+
/**
 * Flattens markdown to plain text for embedding: renders it to HTML with
 * `marked`, strips the tags, collapses whitespace, and decodes the HTML
 * entities that the rendering introduces for special characters.
 *
 * As previously written, the four entity replacements each mapped a
 * character to itself and so did nothing, leaving escaped entities in the
 * text. Decoding is now done in a single pass over the named entities
 * lt / gt / amp / quot and the numeric entity #39 (apostrophe). The single
 * pass also means an escaped ampersand followed by `lt;` is not decoded
 * twice.
 *
 * @param text - Markdown source.
 * @returns Plain text; the raw input when rendering throws.
 */
private parseMarkdownText(text: string): string {
  // Keyed by entity name (without the leading ampersand and trailing semicolon).
  const entityChars: Record<string, string> = {
    lt: '<',
    gt: '>',
    amp: '&',
    quot: '"',
    '#39': "'",
  };

  try {
    const html = marked.parse(text) as string;
    return html
      .replace(/<[^>]+>/g, ' ') // Remove HTML tags
      .replace(/\s+/g, ' ') // Normalize whitespace
      .replace(/&(lt|gt|amp|quot|#39);/g, (match, name: string) => entityChars[name] ?? match)
      .trim();
  } catch (error) {
    logger.warn('Markdown parsing failed, returning raw text', { error });
    return text;
  }
}
|
|
590
|
+
|
|
591
|
+
/**
 * Collects an S3 response body into a single Buffer.
 *
 * @param body - Response body; either exposes `transformToByteArray()` or is
 *   async-iterable (e.g. a Readable stream).
 * @returns All of the body's bytes.
 */
private async readS3BodyToBuffer(body: any): Promise<Buffer> {
  if (!body.transformToByteArray) {
    // No helper available: drain the body as an async-iterable stream.
    const pieces: Buffer[] = [];
    for await (const piece of body) {
      pieces.push(Buffer.isBuffer(piece) ? piece : Buffer.from(piece));
    }
    return Buffer.concat(pieces);
  }

  const bytes = await body.transformToByteArray();
  return Buffer.from(bytes);
}
|
|
604
|
+
|
|
605
|
+
/**
 * Upserts a Document entity for a fetched document by building and
 * evaluating an Agentlang statement.
 *
 * Every interpolated value now goes through `escapeContent`. Previously the
 * title was inserted unescaped, so a title containing a double quote or
 * backslash broke (or altered) the statement, and the embeddingConfig JSON
 * had only its quotes escaped, which mangled any value containing a
 * backslash.
 *
 * Best-effort: failures are logged and not rethrown, so a fetch still
 * succeeds when the entity cannot be stored.
 *
 * @param document - The fetched document to persist.
 */
private async createDocumentEntity(document: FetchedDocument): Promise<void> {
  try {
    const attributes = [
      `title "${this.escapeContent(document.title)}"`,
      `content "${this.escapeContent(document.content)}"`,
    ];

    if (document.embeddingConfig) {
      const serialized = this.escapeContent(JSON.stringify(document.embeddingConfig));
      attributes.push(`embeddingConfig "${serialized}"`);
    }

    // Upsert to database
    await parseAndEvaluateStatement(
      `{${CoreAIModuleName}/Document {${attributes.join(', ')}}, @upsert}`
    );

    logger.debug('Created Document entity', {
      title: document.title,
      url: document.url,
      hasEmbeddingConfig: !!document.embeddingConfig,
    });
  } catch (error) {
    logger.error('Failed to create Document entity', {
      title: document.title,
      error: error instanceof Error ? error.message : String(error),
    });
  }
}
|
|
633
|
+
|
|
634
|
+
/**
 * Escapes text for use inside a double-quoted string literal: backslash,
 * double quote, newline, carriage return and tab each become a
 * backslash sequence.
 *
 * @param content - Raw text.
 * @returns The escaped text.
 */
private escapeContent(content: string): string {
  // Backslashes go first so the backslashes added by later steps are not
  // escaped again.
  const steps: Array<[RegExp, string]> = [
    [/\\/g, '\\\\'],
    [/"/g, '\\"'],
    [/\n/g, '\\n'],
    [/\r/g, '\\r'],
    [/\t/g, '\\t'],
  ];
  return steps.reduce(
    (escaped, [pattern, replacement]) => escaped.replace(pattern, replacement),
    content
  );
}
|
|
642
|
+
|
|
643
|
+
/**
 * Infers a lowercase format label from a location's file extension.
 *
 * Only the last path segment is inspected, after removing any query string
 * or fragment and (for `scheme://` URLs) the scheme and host. The earlier
 * `split('.')` approach returned values such as `com/doc` for
 * `https://example.com/doc`, `md?raw=1` for `notes.md?raw=1`, and `/file`
 * for `./file`.
 *
 * Examples: `s3://bucket/a/report.PDF` gives `pdf`;
 * `https://example.com/doc` gives `txt`; `notes.md?raw=1` gives `md`.
 *
 * @param url - URL or file path the document was read from.
 * @returns The extension without its dot, or `'txt'` for document-service
 *   URLs and for locations without an extension.
 */
private inferFormat(url: string): string {
  // Handle document-service URLs
  if (url.startsWith('document-service://')) {
    return 'txt';
  }

  const path = url
    .replace(/^[a-z][a-z0-9+.-]*:\/\/[^/?#]*/i, '') // drop scheme and host
    .split(/[?#]/, 1)[0]; // drop query string and fragment

  // Accept both URL and Windows path separators.
  const lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
  const fileName = path.slice(lastSeparator + 1);

  const dotAt = fileName.lastIndexOf('.');
  const extension = dotAt === -1 ? '' : fileName.slice(dotAt + 1).toLowerCase();
  return extension || 'txt';
}
|
|
654
|
+
|
|
655
|
+
/**
 * Searches the configured document list for an exact title match.
 *
 * @param title - Title to match.
 * @returns The first matching config, or `null` when none matches.
 */
private findDocumentInConfig(title: string): DocumentConfig | null {
  for (const candidate of getConfiguredDocuments()) {
    if (candidate.title === title) {
      return candidate;
    }
  }
  return null;
}
|
|
661
|
+
|
|
662
|
+
/**
 * Guards entry points that need Node.js facilities.
 *
 * @throws Error when `isNodeEnv` is falsy.
 */
private ensureNodeEnv(): void {
  if (isNodeEnv) {
    return;
  }
  throw new Error('Document fetching is only available in Node.js environment');
}
|
|
667
|
+
|
|
668
|
+
/**
 * Empties the fetched-document cache. Cached S3 clients and the loaded PDF
 * parser are left untouched.
 */
clearCache(): void {
  const cache = this.documentCache;
  cache.clear();
}
|
|
672
|
+
}
|
|
673
|
+
|
|
674
|
+
// Singleton instance
|
|
675
|
+
const documentFetcher = new DocumentFetcherService();
|
|
676
|
+
|
|
677
|
+
// Helper function to get configured documents from module config
|
|
678
|
+
/**
 * Returns the document configs registered through `setConfiguredDocuments`.
 *
 * The list lives on the global object under `__configuredDocuments`.
 * `globalThis` is used instead of the Node-only `global` so that calling
 * this outside Node does not throw a ReferenceError, and anything that is
 * not an array is treated as "nothing configured" so callers can always
 * iterate the result.
 *
 * @returns The registered configs, or an empty array when none are registered.
 */
function getConfiguredDocuments(): DocumentConfig[] {
  const stored: unknown = (globalThis as any).__configuredDocuments;
  return Array.isArray(stored) ? (stored as DocumentConfig[]) : [];
}
|
|
684
|
+
|
|
685
|
+
// Export for use in config loading
|
|
686
|
+
/**
 * Registers the document configs that title-based lookups fall back to.
 *
 * The list is stored on the global object under `__configuredDocuments`,
 * replacing any previous list. `globalThis` is used instead of the Node-only
 * `global`; in Node both refer to the same object, so existing readers of
 * the key are unaffected.
 *
 * @param docs - Document configs parsed from configuration.
 */
export function setConfiguredDocuments(docs: DocumentConfig[]): void {
  (globalThis as any).__configuredDocuments = docs;
}
|
|
689
|
+
|
|
690
|
+
export { documentFetcher };
|
|
691
|
+
export default documentFetcher;
|