agentlang 0.9.9 → 0.9.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,468 @@
1
+ import { S3Client, GetObjectCommand } from '@aws-sdk/client-s3';
2
+ import { readFile } from 'node:fs/promises';
3
+ import path from 'node:path';
4
+ import { logger } from '../logger.js';
5
+ import { parseAndEvaluateStatement } from '../interpreter.js';
6
+ import { CoreAIModuleName } from '../modules/ai.js';
7
+ import { TtlCache } from '../state.js';
8
+ import { preprocessRawConfig } from '../util.js';
9
+ import { marked } from 'marked';
10
+ import { isNodeEnv } from '../../utils/runtime.js';
11
+
12
+ // Provider-specific configurations
13
/**
 * Connection settings for the S3 provider. Every field is optional; the
 * fetcher in this module falls back to `AWS_REGION`, `AWS_ACCESS_KEY_ID` and
 * `AWS_SECRET_ACCESS_KEY` from the environment when a value is not supplied.
 */
export interface S3Config {
  /** Region passed to the S3 client. */
  region?: string;
  /** Custom endpoint passed to the S3 client. */
  endpoint?: string;
  /** Access key id; only used when `secretAccessKey` is also available. */
  accessKeyId?: string;
  /** Secret access key; only used when `accessKeyId` is also available. */
  secretAccessKey?: string;
  /** Forwarded unchanged to the S3 client's `forcePathStyle` option. */
  forcePathStyle?: boolean;
}
20
+
21
+ // Generic retrieval configuration for any storage provider
22
/**
 * Storage-provider selection plus that provider's settings. The fetcher in
 * this module only interprets `config` when `provider` is `'s3'`.
 */
export interface RetrievalConfig {
  /** Provider identifier; well-known values are listed, any string is accepted. */
  provider: 's3' | 'box' | 'gdrive' | 'azure' | 'onedrive' | string;
  /** Provider-specific settings (an `S3Config` when `provider` is `'s3'`). */
  config: S3Config | Record<string, any>;
}
26
+
27
/**
 * Optional embedding settings attached to a document. This module does not
 * interpret the values; it serialises them onto the Document entity as-is.
 */
export interface EmbeddingConfig {
  /** Embedding provider name. */
  provider?: string;
  /** Embedding model name. */
  model?: string;
  /** Chunk size setting (units are not defined in this module). */
  chunkSize?: number;
  /** Chunk overlap setting (units are not defined in this module). */
  chunkOverlap?: number;
}
33
+
34
/** Declares one document that the fetcher can retrieve. */
export interface DocumentConfig {
  /** Title used to look the document up and as part of its cache key. */
  title: string;
  /**
   * Location of the document: `s3://bucket/key`, an `http://` / `https://`
   * URL, or anything else, which is treated as a local file path.
   */
  url: string;
  /** Provider settings consulted when fetching (currently for `s3://` URLs). */
  retrievalConfig?: RetrievalConfig;
  /** Embedding settings copied onto the fetched document. */
  embeddingConfig?: EmbeddingConfig;
}
40
+
41
/** A document after retrieval and conversion to text. */
export interface FetchedDocument {
  /** Title copied from the originating `DocumentConfig`. */
  title: string;
  /** Text content produced by the fetcher. */
  content: string;
  /** Location the document was fetched from. */
  url: string;
  /** Format label inferred from the URL by the fetcher. */
  format: string;
  /** Moment at which the fetch completed. */
  fetchedAt: Date;
  /** Embedding settings copied from the originating `DocumentConfig`. */
  embeddingConfig?: EmbeddingConfig;
}
49
+
50
/** S3 object location together with the client settings resolved for it. */
interface ResolvedS3Target extends S3Config {
  bucket: string;
  key: string;
  region: string;
}

/**
 * Retrieves documents from S3 (`s3://`), HTTP(S) URLs or the local
 * filesystem, converts them to plain text (PDF and Markdown are handled
 * specially), caches the result for a short time and upserts a matching
 * `Document` entity.
 *
 * All public fetch methods throw when not running under Node.js.
 */
class DocumentFetcherService {
  private static readonly CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
  private static readonly HTTP_TIMEOUT_MS = 30000;
  private static readonly MAX_HTTP_BODY_BYTES = 50 * 1024 * 1024;
  private static readonly MARKDOWN_EXTENSIONS: readonly string[] = ['.md', '.markdown', '.mdown'];
  private static readonly S3_URL_ERROR = 'Invalid S3 URL format. Expected: s3://bucket/key';
  /** The entities decoded after tags are stripped from rendered Markdown. */
  private static readonly HTML_ENTITIES: Readonly<Record<string, string>> = {
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&amp;': '&',
  };

  private documentCache = new TtlCache<FetchedDocument>(DocumentFetcherService.CACHE_TTL_MS);
  private s3Clients = new Map<string, S3Client>();
  // The pdf-parse module is loaded lazily, so its class type is not available statically.
  private pdfParser: any = null;

  /**
   * Fetches the document described by `config`, serving it from the cache
   * when a fresh entry exists.
   *
   * @param config - Title, location and optional provider/embedding settings.
   * @returns The fetched document (this method never resolves to `null`; the
   *   wider return type is kept for existing callers).
   * @throws Re-throws any retrieval or parsing error after logging it.
   */
  async fetchDocument(config: DocumentConfig): Promise<FetchedDocument | null> {
    this.ensureNodeEnv();
    const cacheKey = `${config.title}:${config.url}`;
    const cached = this.documentCache.get(cacheKey);

    if (cached) {
      logger.debug('Returning cached document', { title: config.title });
      return cached;
    }

    try {
      let content: string;

      if (config.url.startsWith('s3://')) {
        content = await this.fetchFromS3(config);
      } else if (config.url.startsWith('http://') || config.url.startsWith('https://')) {
        content = await this.fetchFromUrl(config.url);
      } else {
        // Anything without a recognised scheme is treated as a local file path.
        content = await this.fetchFromLocal(config.url);
      }

      const document: FetchedDocument = {
        title: config.title,
        content,
        url: config.url,
        format: this.inferFormat(config.url),
        fetchedAt: new Date(),
        embeddingConfig: config.embeddingConfig,
      };

      this.documentCache.set(cacheKey, document);

      // Entity creation is best-effort: failures are logged inside and do not fail the fetch.
      await this.createDocumentEntity(document);

      return document;
    } catch (error) {
      logger.error('Failed to fetch document', {
        title: config.title,
        url: config.url,
        error: this.describeError(error),
        stack: error instanceof Error ? error.stack : undefined,
      });
      // Re-throw so the caller knows what happened.
      throw error;
    }
  }

  /**
   * Looks up a registered document by title and fetches it.
   *
   * @param title - Title as registered via `registerConfiguredDocument`.
   * @returns The document, or `null` when no document with that title is
   *   registered or the lookup itself fails.
   * @throws Errors raised by `fetchDocument` propagate to the caller; only
   *   the configuration lookup is guarded here.
   */
  async fetchDocumentByTitle(title: string): Promise<FetchedDocument | null> {
    this.ensureNodeEnv();

    let doc: DocumentConfig | null;
    try {
      doc = this.findDocumentInConfig(title);
    } catch (error) {
      logger.error('Failed to fetch document by title', { title, error });
      return null;
    }

    if (!doc) {
      logger.warn('Document not found in config', { title });
      return null;
    }
    return this.fetchDocument(doc);
  }

  /** Returns the registered document whose title matches exactly, or `null`. */
  private findDocumentInConfig(title: string): DocumentConfig | null {
    return getConfiguredDocuments().find(d => d.title === title) ?? null;
  }

  /**
   * Downloads an `s3://bucket/key` object and converts it to text.
   *
   * @throws Error with bucket/key/region context when the download or the
   *   conversion fails; URL parsing errors are thrown unwrapped.
   */
  private async fetchFromS3(config: DocumentConfig): Promise<string> {
    const target = this.parseS3Url(config.url, config.retrievalConfig);
    const client = await this.getOrCreateS3Client(target);

    try {
      const response = await client.send(
        new GetObjectCommand({
          Bucket: target.bucket,
          Key: target.key,
        })
      );

      if (!response.Body) {
        throw new Error('S3 object has no body');
      }
      const body = await this.readS3BodyToBuffer(response.Body);
      return await this.decodeDocumentBody(body, response.ContentType || '', target.key);
    } catch (error) {
      const errorMessage = this.describeError(error);
      logger.error('S3 fetch failed', {
        url: config.url,
        bucket: target.bucket,
        key: target.key,
        region: target.region,
        hasAccessKey: !!target.accessKeyId,
        error: errorMessage,
        stack: error instanceof Error ? error.stack : undefined,
      });
      throw new Error(
        `Failed to fetch from S3 (bucket: ${target.bucket}, key: ${target.key}, region: ${target.region}): ${errorMessage}`
      );
    }
  }

  /**
   * Downloads an HTTP(S) resource (30 s timeout, 50 MiB limit) and converts
   * it to text.
   *
   * @throws Error prefixed with `Failed to fetch from URL:` on any failure.
   */
  private async fetchFromUrl(url: string): Promise<string> {
    const maxSize = DocumentFetcherService.MAX_HTTP_BODY_BYTES;
    try {
      const response = await fetch(url, {
        signal: AbortSignal.timeout(DocumentFetcherService.HTTP_TIMEOUT_MS),
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status} ${response.statusText}`);
      }

      // Reject oversized responses before buffering them when the server declares a length.
      const declaredLength = Number(response.headers.get('content-length'));
      if (Number.isFinite(declaredLength) && declaredLength > maxSize) {
        throw new Error(`Response too large: ${declaredLength} bytes`);
      }

      const body = await response.arrayBuffer();
      if (body.byteLength > maxSize) {
        throw new Error(`Response too large: ${body.byteLength} bytes`);
      }

      // Match extensions against the path only, so "?query" or "#hash" cannot hide them.
      return await this.decodeDocumentBody(
        Buffer.from(body),
        response.headers.get('content-type') || '',
        this.urlPathname(url)
      );
    } catch (error) {
      logger.error('URL fetch failed', { url, error });
      throw new Error(`Failed to fetch from URL: ${String(error)}`);
    }
  }

  /**
   * Reads a local file and converts it to text.
   *
   * @throws Error prefixed with `Failed to read local file:` on any failure.
   */
  private async fetchFromLocal(filePath: string): Promise<string> {
    try {
      const resolvedPath = path.resolve(filePath);
      // Read raw bytes so binary formats (PDF) are not corrupted by text decoding.
      const body = await readFile(resolvedPath);
      return await this.decodeDocumentBody(body, '', resolvedPath);
    } catch (error) {
      logger.error('Local file read failed', { path: filePath, error });
      throw new Error(`Failed to read local file: ${String(error)}`);
    }
  }

  /**
   * Converts raw document bytes to text: PDFs are parsed, Markdown is
   * rendered and stripped to plain text, everything else is decoded as UTF-8.
   *
   * @param body - Raw bytes of the document.
   * @param contentType - Declared media type, or `''` when unknown.
   * @param name - Object key, URL path or file path used for extension checks.
   */
  private async decodeDocumentBody(body: Buffer, contentType: string, name: string): Promise<string> {
    const lowerType = contentType.toLowerCase();
    const lowerName = name.toLowerCase();

    if (lowerType.includes('application/pdf') || lowerName.endsWith('.pdf')) {
      return this.parsePdfBuffer(body);
    }

    const text = body.toString('utf-8');
    const isMarkdown =
      lowerType.includes('text/markdown') ||
      DocumentFetcherService.MARKDOWN_EXTENSIONS.some(ext => lowerName.endsWith(ext));
    return isMarkdown ? this.parseMarkdownText(text) : text;
  }

  /**
   * Splits an `s3://bucket/key` URL and resolves the client settings for it.
   * Settings come from `retrievalConfig` when its provider is `'s3'`, then
   * from the `AWS_*` environment variables, with `us-east-1` as the final
   * region default.
   *
   * @throws Error when the URL is not of the form `s3://bucket/key` with a
   *   non-empty bucket and key.
   */
  private parseS3Url(url: string, retrievalConfig?: RetrievalConfig): ResolvedS3Target {
    if (!url.startsWith('s3://')) {
      throw new Error(DocumentFetcherService.S3_URL_ERROR);
    }

    const withoutProtocol = url.slice('s3://'.length);
    const firstSlash = withoutProtocol.indexOf('/');
    if (firstSlash === -1) {
      throw new Error(DocumentFetcherService.S3_URL_ERROR);
    }

    const bucket = withoutProtocol.slice(0, firstSlash);
    const key = withoutProtocol.slice(firstSlash + 1);
    // "s3:///key" and "s3://bucket/" would otherwise only fail later inside the SDK.
    if (bucket === '' || key === '') {
      throw new Error(DocumentFetcherService.S3_URL_ERROR);
    }

    const normalized = this.normalizeRetrievalConfig(retrievalConfig);
    let s3SpecificConfig: S3Config = {};
    if (normalized?.provider === 's3' && normalized.config) {
      s3SpecificConfig = normalized.config as S3Config;
    }

    // `||` is deliberate: an empty string in the config should fall through to the next source.
    return {
      bucket,
      key,
      region: s3SpecificConfig.region || process.env.AWS_REGION || 'us-east-1',
      endpoint: s3SpecificConfig.endpoint,
      accessKeyId: s3SpecificConfig.accessKeyId || process.env.AWS_ACCESS_KEY_ID,
      secretAccessKey: s3SpecificConfig.secretAccessKey || process.env.AWS_SECRET_ACCESS_KEY,
      forcePathStyle: s3SpecificConfig.forcePathStyle,
    };
  }

  /**
   * Returns a cached S3 client for the given settings, creating it on first
   * use. Explicit credentials are only passed when both halves are present;
   * otherwise the SDK's own credential resolution applies.
   */
  private async getOrCreateS3Client(config: {
    region: string;
    endpoint?: string;
    accessKeyId?: string;
    secretAccessKey?: string;
    forcePathStyle?: boolean;
  }): Promise<S3Client> {
    // forcePathStyle is part of the key so configs differing only in addressing style get separate clients.
    const clientKey = [
      config.region,
      config.endpoint || 'default',
      config.accessKeyId || 'default',
      config.forcePathStyle === undefined ? 'default' : String(config.forcePathStyle),
    ].join(':');

    let client = this.s3Clients.get(clientKey);
    if (!client) {
      client = new S3Client({
        region: config.region,
        endpoint: config.endpoint,
        forcePathStyle: config.forcePathStyle,
        credentials:
          config.accessKeyId && config.secretAccessKey
            ? {
                accessKeyId: config.accessKeyId,
                secretAccessKey: config.secretAccessKey,
              }
            : undefined,
      });
      this.s3Clients.set(clientKey, client);
    }
    return client;
  }

  /**
   * Upserts a `Document` entity for the fetched document. This is
   * best-effort: failures are logged and never thrown.
   */
  private async createDocumentEntity(document: FetchedDocument): Promise<void> {
    try {
      // Every interpolated value is escaped so quotes or backslashes cannot break the statement.
      const attrs = [
        `title "${this.escapeContent(document.title)}"`,
        `content "${this.escapeContent(document.content)}"`,
      ];
      if (document.embeddingConfig) {
        attrs.push(
          `embeddingConfig "${this.escapeContent(JSON.stringify(document.embeddingConfig))}"`
        );
      }

      await parseAndEvaluateStatement(
        `{${CoreAIModuleName}/Document {${attrs.join(', ')}}, @upsert}`
      );

      logger.debug('Created Document entity', {
        title: document.title,
        url: document.url,
        hasEmbeddingConfig: !!document.embeddingConfig,
      });
    } catch (error) {
      logger.error('Failed to create Document entity', {
        title: document.title,
        error,
      });
    }
  }

  /** Escapes backslashes, double quotes, newlines, carriage returns and tabs for a quoted literal. */
  private escapeContent(content: string): string {
    return content
      .replace(/\\/g, '\\\\')
      .replace(/"/g, '\\"')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r')
      .replace(/\t/g, '\\t');
  }

  /**
   * Derives a lowercase format label from the file extension of the last
   * path segment, ignoring host names, query strings and fragments.
   *
   * @returns The extension without the dot, or `'txt'` when there is none.
   */
  private inferFormat(url: string): string {
    const isHttp = url.startsWith('http://') || url.startsWith('https://');
    const location = isHttp ? this.urlPathname(url) : url;
    const lastSeparator = Math.max(location.lastIndexOf('/'), location.lastIndexOf('\\'));
    const fileName = location.slice(lastSeparator + 1);
    const dot = fileName.lastIndexOf('.');

    // A leading dot (".env") or a trailing dot ("name.") is not an extension.
    if (dot > 0 && dot < fileName.length - 1) {
      return fileName.slice(dot + 1).toLowerCase();
    }
    return 'txt';
  }

  /** Returns the path component of a URL, or the input unchanged when it cannot be parsed. */
  private urlPathname(url: string): string {
    try {
      return new URL(url).pathname;
    } catch {
      return url;
    }
  }

  /**
   * Empties the document cache.
   *
   * @param title - Accepted for API compatibility; the whole cache is cleared
   *   whether or not a title is given.
   */
  clearCache(title?: string): void {
    void title;
    this.documentCache.clear();
  }

  /** Recursively converts `Map` instances to plain objects, descending into arrays and objects. */
  private normalizeConfigValue(value: unknown): unknown {
    if (value instanceof Map) {
      const fromMap: Record<string, unknown> = {};
      value.forEach((v, k) => {
        fromMap[k] = this.normalizeConfigValue(v);
      });
      return fromMap;
    }
    if (Array.isArray(value)) {
      return value.map(v => this.normalizeConfigValue(v));
    }
    if (value && typeof value === 'object') {
      const fromObject: Record<string, unknown> = {};
      for (const [k, v] of Object.entries(value)) {
        fromObject[k] = this.normalizeConfigValue(v);
      }
      return fromObject;
    }
    return value;
  }

  /** Returns a plain-object copy of `retrievalConfig` after running `preprocessRawConfig` on it. */
  private normalizeRetrievalConfig(retrievalConfig?: RetrievalConfig): RetrievalConfig | undefined {
    if (!retrievalConfig) return undefined;
    const normalized = this.normalizeConfigValue(retrievalConfig);
    if (normalized && typeof normalized === 'object') {
      preprocessRawConfig(normalized);
    }
    return normalized as RetrievalConfig;
  }

  /** Throws unless the runtime flag says we are running under Node.js. */
  private ensureNodeEnv(): void {
    if (!isNodeEnv) {
      throw new Error('Document fetching is only available in Node.js environment');
    }
  }

  /**
   * Collects an S3 response body into a single Buffer, using the body's own
   * transform helpers when it has them and async iteration otherwise.
   */
  // The body's shape varies, so it is feature-tested rather than statically typed.
  private async readS3BodyToBuffer(body: any): Promise<Buffer> {
    if (typeof body.transformToByteArray === 'function') {
      return Buffer.from(await body.transformToByteArray());
    }
    if (typeof body.transformToString === 'function') {
      return Buffer.from(await body.transformToString('utf-8'), 'utf-8');
    }
    const chunks: Buffer[] = [];
    for await (const chunk of body) {
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    }
    return Buffer.concat(chunks);
  }

  /**
   * Lazily imports `pdf-parse` and returns its parser class.
   *
   * @throws Error when the module exposes no usable constructor.
   */
  private async getPdfParser(): Promise<any> {
    if (!this.pdfParser) {
      const pdfModule: any = await import('pdf-parse');
      const candidate = pdfModule.PDFParse || pdfModule.default;
      if (typeof candidate !== 'function') {
        throw new Error('pdf-parse does not export a PDFParse class');
      }
      this.pdfParser = candidate;
    }
    return this.pdfParser;
  }

  /**
   * Extracts the text of a PDF.
   *
   * @throws Error prefixed with `PDF parsing failed:` on any failure.
   */
  private async parsePdfBuffer(buffer: Buffer): Promise<string> {
    let parser: any;
    try {
      const PDFParseClass = await this.getPdfParser();
      parser = new PDFParseClass({
        data: buffer,
        verbosity: 0,
      });
      const data = await parser.getText();
      return data.text;
    } catch (error) {
      const message = this.describeError(error);
      logger.error(`Failed to parse PDF: ${message}`);
      throw new Error(`PDF parsing failed: ${message}`);
    } finally {
      // Release parser resources when the implementation offers a destroy hook.
      if (parser && typeof parser.destroy === 'function') {
        try {
          await parser.destroy();
        } catch {
          // Cleanup failures must not mask the parse result or the original error.
        }
      }
    }
  }

  /**
   * Renders Markdown to HTML and reduces it to plain text: block endings
   * become newlines, tags are removed and the basic HTML entities are
   * decoded. Returns the input unchanged when the renderer does not yield a
   * string synchronously.
   */
  private parseMarkdownText(markdown: string): string {
    const html = marked.parse(markdown);
    if (typeof html !== 'string') {
      return markdown;
    }
    return (
      html
        .replace(/<\s*br\s*\/?>/gi, '\n')
        .replace(/<\/(p|li|h[1-6]|blockquote|pre|tr|table)>/gi, '\n')
        .replace(/<[^>]+>/g, '')
        .replace(/\n{3,}/g, '\n\n')
        // Decode only after tags are stripped, in one pass, so "&amp;lt;" is not decoded twice.
        .replace(
          /&(?:lt|gt|quot|#39|amp);/g,
          entity => DocumentFetcherService.HTML_ENTITIES[entity] ?? entity
        )
        .trim()
    );
  }

  /** Returns the message of an `Error`, or the string form of any other thrown value. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
446
+
447
/**
 * Documents declared in config.al, in registration order. Module-private;
 * use the register/get/clear functions below to access it.
 */
let configuredDocuments: DocumentConfig[] = [];
449
+
450
/**
 * Adds a document to the configured set. A document whose title is already
 * registered is ignored, so the first registration for a title wins.
 *
 * @param doc - Document configuration to register.
 */
export function registerConfiguredDocument(doc: DocumentConfig): void {
  const alreadyRegistered = configuredDocuments.some(entry => entry.title === doc.title);
  if (alreadyRegistered) {
    return;
  }

  configuredDocuments.push(doc);
  logger.debug('Registered configured document', { title: doc.title, url: doc.url });
}
458
+
459
/**
 * Returns the registered documents in registration order.
 *
 * @returns A new array on every call (a shallow copy), so callers can modify
 *   the array itself without affecting the registry.
 */
export function getConfiguredDocuments(): DocumentConfig[] {
  return configuredDocuments.slice();
}
462
+
463
/**
 * Forgets every registered document by replacing the registry with a new
 * empty array.
 */
export function clearConfiguredDocuments(): void {
  configuredDocuments = [];
}
466
+
467
/**
 * Single `DocumentFetcherService` instance created when this module loads.
 * It is available both as the named export and as the default export.
 */
export const documentFetcher = new DocumentFetcherService();

export default documentFetcher;