agentlang 0.9.9 → 0.9.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/out/extension/main.cjs +38 -38
  2. package/out/extension/main.cjs.map +2 -2
  3. package/out/language/generated/ast.d.ts +1 -1
  4. package/out/language/generated/ast.js +1 -1
  5. package/out/language/generated/grammar.d.ts +1 -1
  6. package/out/language/generated/grammar.js +1 -1
  7. package/out/language/generated/module.d.ts +1 -1
  8. package/out/language/generated/module.js +1 -1
  9. package/out/language/main.cjs +850 -2388
  10. package/out/language/main.cjs.map +4 -4
  11. package/out/runtime/agents/common.d.ts +3 -1
  12. package/out/runtime/agents/common.d.ts.map +1 -1
  13. package/out/runtime/agents/common.js +35 -31
  14. package/out/runtime/agents/common.js.map +1 -1
  15. package/out/runtime/docs.d.ts +1 -0
  16. package/out/runtime/docs.d.ts.map +1 -1
  17. package/out/runtime/docs.js +16 -1
  18. package/out/runtime/docs.js.map +1 -1
  19. package/out/runtime/interpreter.d.ts +1 -0
  20. package/out/runtime/interpreter.d.ts.map +1 -1
  21. package/out/runtime/interpreter.js +60 -9
  22. package/out/runtime/interpreter.js.map +1 -1
  23. package/out/runtime/jsmodules.d.ts +2 -1
  24. package/out/runtime/jsmodules.d.ts.map +1 -1
  25. package/out/runtime/jsmodules.js +2 -1
  26. package/out/runtime/jsmodules.js.map +1 -1
  27. package/out/runtime/loader.d.ts.map +1 -1
  28. package/out/runtime/loader.js +3 -2
  29. package/out/runtime/loader.js.map +1 -1
  30. package/out/runtime/module.d.ts +1 -0
  31. package/out/runtime/module.d.ts.map +1 -1
  32. package/out/runtime/module.js +3 -0
  33. package/out/runtime/module.js.map +1 -1
  34. package/out/runtime/modules/ai.d.ts +12 -0
  35. package/out/runtime/modules/ai.d.ts.map +1 -1
  36. package/out/runtime/modules/ai.js +225 -28
  37. package/out/runtime/modules/ai.js.map +1 -1
  38. package/out/runtime/modules/core.d.ts.map +1 -1
  39. package/out/runtime/modules/core.js +7 -1
  40. package/out/runtime/modules/core.js.map +1 -1
  41. package/out/runtime/resolvers/sqldb/impl.d.ts.map +1 -1
  42. package/out/runtime/resolvers/sqldb/impl.js +37 -6
  43. package/out/runtime/resolvers/sqldb/impl.js.map +1 -1
  44. package/out/runtime/services/documentFetcher.d.ts +70 -0
  45. package/out/runtime/services/documentFetcher.d.ts.map +1 -0
  46. package/out/runtime/services/documentFetcher.js +582 -0
  47. package/out/runtime/services/documentFetcher.js.map +1 -0
  48. package/package.json +2 -1
  49. package/src/language/generated/ast.ts +1 -1
  50. package/src/language/generated/grammar.ts +1 -1
  51. package/src/language/generated/module.ts +1 -1
  52. package/src/runtime/agents/common.ts +37 -31
  53. package/src/runtime/docs.ts +17 -1
  54. package/src/runtime/interpreter.ts +64 -7
  55. package/src/runtime/jsmodules.ts +3 -1
  56. package/src/runtime/loader.ts +3 -2
  57. package/src/runtime/module.ts +4 -0
  58. package/src/runtime/modules/ai.ts +270 -33
  59. package/src/runtime/modules/core.ts +7 -1
  60. package/src/runtime/resolvers/sqldb/impl.ts +36 -6
  61. package/src/runtime/services/documentFetcher.ts +691 -0
@@ -0,0 +1,691 @@
1
+ import { GetObjectCommand, S3Client } from '@aws-sdk/client-s3';
2
+ import { readFile } from 'node:fs/promises';
3
+ import { logger } from '../logger.js';
4
+ import { parseAndEvaluateStatement } from '../interpreter.js';
5
+ import { CoreAIModuleName } from '../modules/ai.js';
6
+ import { TtlCache } from '../state.js';
7
+ import { preprocessRawConfig } from '../util.js';
8
+ import { marked } from 'marked';
9
+ import { isNodeEnv } from '../../utils/runtime.js';
10
+
11
+ // Provider-specific configurations
12
+ export interface S3Config { // Connection settings used when a document is fetched from an s3:// URL; every field is optional.
13
+ region?: string; // AWS region; parseS3Url falls back to the AWS_REGION env var, then 'us-east-1'.
14
+ endpoint?: string; // Custom endpoint, handed to the S3 client unchanged.
15
+ accessKeyId?: string; // Falls back to the AWS_ACCESS_KEY_ID env var when omitted.
16
+ secretAccessKey?: string; // Falls back to the AWS_SECRET_ACCESS_KEY env var when omitted.
17
+ forcePathStyle?: boolean; // Forwarded to the S3 client's forcePathStyle option.
18
+ }
19
+
20
+ // Generic retrieval configuration for any storage provider
21
+ export interface RetrievalConfig { // Names a storage provider and carries its provider-specific settings.
22
+ provider: 's3' | 'box' | 'gdrive' | 'azure' | 'onedrive' | 'document-service' | string; // Any string type-checks; the fetcher in this file only acts on 's3' and 'document-service'.
23
+ config: S3Config | Record<string, any>; // Read as S3Config for 's3'; for 'document-service' it is read as baseUrl/appName/authToken/getAuthToken.
24
+ }
25
+
26
+ export interface EmbeddingConfig { // Embedding options; this file only passes them through and serializes them onto the Document entity.
27
+ provider?: string; // Not interpreted in this file — presumably the embedding provider name; confirm against the embedding code.
28
+ model?: string; // Not interpreted in this file — presumably the embedding model id; confirm against the embedding code.
29
+ chunkSize?: number; // Not interpreted in this file — units (characters vs. tokens) are not visible here; TODO confirm.
30
+ chunkOverlap?: number; // Not interpreted in this file — same unit caveat as chunkSize; TODO confirm.
31
+ }
32
+
33
+ export interface DocumentConfig { // Describes one document to fetch and where to find it.
34
+ title: string; // Document title; part of the cache key and the value used for by-title lookups.
35
+ url?: string; // Source location; fetchDocument dispatches on the prefix: document-service://, s3://, http(s)://, anything else is read as a local file path.
36
+ documentServiceId?: string; // Document-service identifier; only used when a document service is configured on the fetcher.
37
+ retrievalConfig?: RetrievalConfig; // Provider settings; required (provider 'document-service') for document-service:// URLs, optional for s3://.
38
+ embeddingConfig?: EmbeddingConfig; // Copied unchanged onto the resulting FetchedDocument.
39
+ }
40
+
41
+ export interface FetchedDocument { // Result of a successful fetch; this is the value stored in the fetcher's cache.
42
+ title: string; // Title copied from the requesting DocumentConfig.
43
+ content: string; // Document text as returned by the source-specific fetch method.
44
+ url: string; // Source location; `document-service://<id>` when the document came from the document service by id or title.
45
+ format: string; // Inferred from the extension of url; 'txt' when there is none and for document-service:// URLs.
46
+ fetchedAt: Date; // Set to `new Date()` when the document object is built.
47
+ embeddingConfig?: EmbeddingConfig; // Copied from the requesting DocumentConfig.
48
+ }
49
+
50
+ interface DocumentServiceConfig { // Connection settings for the document-service HTTP API.
51
+ baseUrl: string; // Prefix for the `/api/documents/...` endpoints.
52
+ appName: string; // Sent with every request as the `x-app-name` header.
53
+ authToken?: string; // Static bearer token; takes precedence over getAuthToken when set.
54
+ getAuthToken?: () => Promise<string>; // Called to obtain a bearer token when authToken is not set.
55
+ }
56
+
57
/**
 * Fetches document content from the document service, S3, HTTP(S) URLs or the
 * local file system, converts it to plain text and registers the result as a
 * `Document` entity.
 *
 * Fetched documents are cached for `CACHE_TTL_MS`. The service is only usable
 * in a Node.js environment; public fetch methods throw otherwise.
 */
class DocumentFetcherService {
  private static readonly CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
  private static readonly REQUEST_TIMEOUT_MS = 30000;
  private static readonly MAX_RESPONSE_BYTES = 50 * 1024 * 1024;
  private static readonly MARKDOWN_EXTENSIONS: readonly string[] = ['md', 'markdown', 'mdown'];
  private static readonly DOCUMENT_SERVICE_SCHEME = 'document-service://';

  private documentCache = new TtlCache<FetchedDocument>(DocumentFetcherService.CACHE_TTL_MS);
  private s3Clients = new Map<string, S3Client>();
  private pdfParser: ((data: Buffer) => Promise<{ text?: string }>) | null = null;
  private documentServiceConfig?: DocumentServiceConfig;

  /**
   * Sets the document service used for id- and title-based lookups that do
   * not carry their own `retrievalConfig`.
   */
  configureDocumentService(config: DocumentServiceConfig): void {
    this.documentServiceConfig = config;
    logger.info('Document service configured', { baseUrl: config.baseUrl });
  }

  /**
   * Fetches one document, caches it and upserts the matching `Document` entity.
   *
   * @param config - What to fetch and from where; see `DocumentConfig`.
   * @returns The fetched (or cached) document.
   * @throws Error when the source cannot be determined or the fetch fails.
   */
  async fetchDocument(config: DocumentConfig): Promise<FetchedDocument | null> {
    this.ensureNodeEnv();

    const cacheKey = `${config.title}:${config.url || config.documentServiceId}`;
    const cached = this.documentCache.get(cacheKey);
    if (cached) {
      logger.debug('Returning cached document', { title: config.title });
      return cached;
    }

    try {
      const { content, sourceUrl } = await this.loadContent(config);
      const document: FetchedDocument = {
        title: config.title,
        content,
        url: sourceUrl,
        format: this.inferFormat(sourceUrl),
        fetchedAt: new Date(),
        embeddingConfig: config.embeddingConfig,
      };

      this.documentCache.set(cacheKey, document);
      // Entity creation is best-effort: failures are logged inside, not thrown.
      await this.createDocumentEntity(document);
      return document;
    } catch (error) {
      logger.error('Failed to fetch document', {
        title: config.title,
        url: config.url,
        documentServiceId: config.documentServiceId,
        error: this.errorMessage(error),
        stack: error instanceof Error ? error.stack : undefined,
      });
      throw error;
    }
  }

  /**
   * Finds a document by title — first via the configured document service,
   * then via the configured document list — and fetches it.
   *
   * @returns The document, or `null` when it is unknown or fetching failed.
   */
  async fetchDocumentByTitle(title: string): Promise<FetchedDocument | null> {
    this.ensureNodeEnv();

    try {
      const lookupKey = `${title}:lookup`;
      const cached = this.documentCache.get(lookupKey);
      if (cached) {
        logger.debug('Returning cached document by title', { title });
        return cached;
      }

      let document: FetchedDocument | null = null;

      // Try document service lookup first (if configured).
      const serviceConfig = this.documentServiceConfig;
      if (serviceConfig) {
        const docId = await this.lookupDocumentByTitle(title, serviceConfig);
        if (docId) {
          // `await` keeps a rejection inside this try block so that it is
          // logged and mapped to `null` like every other failure here.
          document = await this.fetchDocument({ title, documentServiceId: docId });
        }
      }

      // Fall back to config-based lookup.
      if (!document) {
        const configured = this.findDocumentInConfig(title);
        if (configured) {
          document = await this.fetchDocument(configured);
        }
      }

      if (!document) {
        logger.warn('Document not found', { title });
        return null;
      }

      // Populate the key read above; without this the title cache never hits.
      this.documentCache.set(lookupKey, document);
      return document;
    } catch (error) {
      logger.error('Failed to fetch document by title', {
        title,
        error: this.errorMessage(error),
      });
      return null;
    }
  }

  /** Picks the source for `config` and returns its text plus the URL it came from. */
  private async loadContent(
    config: DocumentConfig
  ): Promise<{ content: string; sourceUrl: string }> {
    const { title, url, documentServiceId, retrievalConfig } = config;
    const scheme = DocumentFetcherService.DOCUMENT_SERVICE_SCHEME;

    if (url?.startsWith(scheme)) {
      if (!retrievalConfig || retrievalConfig.provider !== 'document-service') {
        throw new Error(
          'Document service URL requires retrievalConfig with provider: "document-service"'
        );
      }
      const dsConfig = retrievalConfig.config as DocumentServiceConfig;
      if (!dsConfig?.baseUrl) {
        throw new Error('Document service config requires baseUrl');
      }

      // Expected path: <user-uuid>/<app-uuid>/<doc-uuid>.ext
      const parts = url.slice(scheme.length).split('/');
      const appUuid = parts[1];
      const docId = parts[2]?.split('.')[0]; // Remove extension
      if (parts.length !== 3 || !appUuid || !docId) {
        throw new Error(
          `Invalid document service URL format: ${url}. Expected: document-service://<user-uuid>/<app-uuid>/<doc-uuid>.ext`
        );
      }

      const serviceConfig = this.rememberDocumentService({
        baseUrl: dsConfig.baseUrl,
        appName: appUuid,
        authToken: dsConfig.authToken,
        getAuthToken: dsConfig.getAuthToken,
      });
      return {
        content: await this.fetchFromDocumentService(docId, serviceConfig),
        sourceUrl: url,
      };
    }

    if (retrievalConfig?.provider === 'document-service') {
      const dsConfig = retrievalConfig.config as DocumentServiceConfig;
      if (!dsConfig?.baseUrl || !dsConfig?.appName) {
        throw new Error('Document service config requires baseUrl and appName');
      }

      const serviceConfig = this.rememberDocumentService({
        baseUrl: dsConfig.baseUrl,
        appName: dsConfig.appName,
        authToken: dsConfig.authToken,
        getAuthToken: dsConfig.getAuthToken,
      });
      const docId = await this.lookupDocumentByTitle(title, serviceConfig);
      if (!docId) {
        throw new Error(`Document not found by title in document service: ${title}`);
      }
      return {
        content: await this.fetchFromDocumentService(docId, serviceConfig),
        sourceUrl: `${scheme}${docId}`,
      };
    }

    // Snapshot the shared config once so later awaits cannot observe a change.
    const configuredService = this.documentServiceConfig;

    if (documentServiceId && configuredService) {
      return {
        content: await this.fetchFromDocumentService(documentServiceId, configuredService),
        sourceUrl: `${scheme}${documentServiceId}`,
      };
    }
    if (url?.startsWith('s3://')) {
      return { content: await this.fetchFromS3(config), sourceUrl: url };
    }
    if (url?.startsWith('http://') || url?.startsWith('https://')) {
      return { content: await this.fetchFromUrl(url), sourceUrl: url };
    }
    if (url) {
      return { content: await this.fetchFromLocal(url), sourceUrl: url };
    }

    if (configuredService) {
      const docId = await this.lookupDocumentByTitle(title, configuredService);
      if (!docId) {
        throw new Error(`Document not found by title: ${title}`);
      }
      return {
        content: await this.fetchFromDocumentService(docId, configuredService),
        sourceUrl: `${scheme}${docId}`,
      };
    }

    throw new Error(`No URL or document service ID provided for: ${title}`);
  }

  /**
   * Stores a per-document service config as the instance default (existing
   * behaviour that later title lookups rely on) and returns it, so the caller
   * can pass this exact config to the request helpers instead of re-reading
   * shared state that a concurrent fetch may have replaced.
   */
  private rememberDocumentService(config: DocumentServiceConfig): DocumentServiceConfig {
    this.documentServiceConfig = config;
    return config;
  }

  /** Returns the static token if present, otherwise asks `getAuthToken`. */
  private async resolveAuthToken(serviceConfig: DocumentServiceConfig): Promise<string> {
    if (serviceConfig.authToken) {
      return serviceConfig.authToken;
    }
    if (serviceConfig.getAuthToken) {
      return serviceConfig.getAuthToken();
    }
    throw new Error('Document service requires authToken or getAuthToken');
  }

  /** Performs an authenticated GET against the document service with a timeout. */
  private async requestDocumentService(
    url: string,
    serviceConfig: DocumentServiceConfig
  ): Promise<Response> {
    const token = await this.resolveAuthToken(serviceConfig);
    return fetch(url, {
      headers: {
        Authorization: `Bearer ${token}`,
        'x-app-name': serviceConfig.appName,
        Accept: 'application/json',
      },
      signal: AbortSignal.timeout(DocumentFetcherService.REQUEST_TIMEOUT_MS),
    });
  }

  /** `baseUrl` without trailing slashes, so joined paths never contain `//`. */
  private serviceBaseUrl(serviceConfig: DocumentServiceConfig): string {
    return serviceConfig.baseUrl.replace(/\/+$/, '');
  }

  /**
   * Downloads a document's content from the document-service API.
   *
   * @param documentId - Identifier of the document in the service.
   * @param serviceConfig - Service to talk to; defaults to the instance config.
   * @returns Plain text (PDF text extracted, markdown converted to text).
   * @throws Error when the service is not configured, rejects the request,
   *   returns no textual content, or the PDF cannot be parsed.
   */
  private async fetchFromDocumentService(
    documentId: string,
    serviceConfig: DocumentServiceConfig | undefined = this.documentServiceConfig
  ): Promise<string> {
    if (!serviceConfig) {
      throw new Error('Document service not configured');
    }

    try {
      const url = `${this.serviceBaseUrl(serviceConfig)}/api/documents/${encodeURIComponent(documentId)}/content`;
      logger.debug('Fetching from document service', { documentId, url });

      const response = await this.requestDocumentService(url, serviceConfig);
      if (!response.ok) {
        if (response.status === 404) {
          throw new Error(`Document not found: ${documentId}`);
        }
        if (response.status === 403) {
          throw new Error(`Access denied to document: ${documentId}`);
        }
        throw new Error(`Document service error: ${response.status} ${response.statusText}`);
      }

      const data = (await response.json()) as {
        content?: unknown;
        isBase64?: unknown;
        mimeType?: unknown;
        format?: unknown;
      } | null;
      const content = data?.content;
      if (typeof content !== 'string') {
        throw new Error(`Document service returned no content for: ${documentId}`);
      }
      const format = typeof data?.format === 'string' ? data.format.toLowerCase() : '';
      const mimeType = typeof data?.mimeType === 'string' ? data.mimeType : '';

      if (data?.isBase64) {
        const bytes = Buffer.from(content, 'base64');
        if (mimeType.includes('pdf') || format === 'pdf') {
          try {
            const { parsePdfBuffer } = await import('../docs.js');
            const text = await parsePdfBuffer(new Uint8Array(bytes));
            logger.debug('Extracted text from PDF', { documentId, textLength: text.length });
            return text;
          } catch (pdfError) {
            const message = this.errorMessage(pdfError);
            logger.error('Failed to parse PDF from document service', {
              documentId,
              error: message,
            });
            throw new Error(`Failed to extract text from PDF: ${message}`);
          }
        }
        return bytes.toString('utf-8');
      }

      if (format === 'md' || format === 'markdown') {
        // parseMarkdownText falls back to the raw text itself on failure.
        const parsedText = this.parseMarkdownText(content);
        logger.debug('Parsed markdown content', { documentId, textLength: parsedText.length });
        return parsedText;
      }

      return content;
    } catch (error) {
      logger.error('Document service fetch failed', {
        documentId,
        error: this.errorMessage(error),
      });
      throw error;
    }
  }

  /**
   * Resolves a title to a document id through the document service.
   *
   * @returns The id, or `null` when the service is not configured, the title
   *   is unknown, or the lookup fails (failures are logged, not thrown).
   */
  private async lookupDocumentByTitle(
    title: string,
    serviceConfig: DocumentServiceConfig | undefined = this.documentServiceConfig
  ): Promise<string | null> {
    if (!serviceConfig) {
      return null;
    }

    try {
      const url = `${this.serviceBaseUrl(serviceConfig)}/api/documents/lookup/by-title?title=${encodeURIComponent(title)}`;
      logger.debug('Looking up document by title', { title, url });

      const response = await this.requestDocumentService(url, serviceConfig);
      if (response.status === 404) {
        logger.debug('Document not found by title', { title });
        return null;
      }
      if (!response.ok) {
        throw new Error(`Document service lookup error: ${response.status}`);
      }

      const data = (await response.json()) as { documentId?: string } | null;
      const documentId = data?.documentId ?? null;
      logger.debug('Found document by title', { title, documentId });
      return documentId;
    } catch (error) {
      logger.error('Document lookup failed', {
        title,
        error: this.errorMessage(error),
      });
      return null;
    }
  }

  /**
   * Downloads an `s3://bucket/key` object and converts it to plain text.
   *
   * @throws Error wrapping any S3 or parsing failure with bucket/key/region.
   */
  private async fetchFromS3(config: DocumentConfig): Promise<string> {
    if (!config.url) {
      throw new Error(`No S3 URL provided for: ${config.title}`);
    }
    const s3Config = this.parseS3Url(config.url, config.retrievalConfig);
    const client = await this.getOrCreateS3Client(s3Config);

    try {
      const response = await client.send(
        new GetObjectCommand({
          Bucket: s3Config.bucket,
          Key: s3Config.key,
        })
      );
      if (!response.Body) {
        throw new Error('S3 object has no body');
      }

      const bodyBuffer = await this.readS3BodyToBuffer(response.Body);
      return await this.decodeBody(bodyBuffer, response.ContentType || '', s3Config.key);
    } catch (error) {
      const errorMessage = this.errorMessage(error);
      logger.error('S3 fetch failed', {
        url: config.url,
        bucket: s3Config.bucket,
        key: s3Config.key,
        region: s3Config.region,
        hasAccessKey: !!s3Config.accessKeyId,
        error: errorMessage,
        stack: error instanceof Error ? error.stack : undefined,
      });
      throw new Error(
        `Failed to fetch from S3 (bucket: ${s3Config.bucket}, key: ${s3Config.key}, region: ${s3Config.region}): ${errorMessage}`
      );
    }
  }

  /**
   * Downloads an HTTP(S) resource (30 s timeout, 50 MiB cap) and converts it
   * to plain text. PDFs and markdown are detected by content type or by the
   * extension of the URL path.
   */
  private async fetchFromUrl(url: string): Promise<string> {
    try {
      const response = await fetch(url, {
        signal: AbortSignal.timeout(DocumentFetcherService.REQUEST_TIMEOUT_MS),
      });
      if (!response.ok) {
        throw new Error(`HTTP ${response.status} ${response.statusText}`);
      }

      const maxSize = DocumentFetcherService.MAX_RESPONSE_BYTES;

      // Reject early when the server announces an oversized body, so it is
      // never buffered. A missing/invalid header yields NaN and is skipped.
      const declaredSize = Number(response.headers.get('content-length') ?? NaN);
      if (declaredSize > maxSize) {
        void response.body?.cancel().catch(() => undefined);
        throw new Error(`Response too large: ${declaredSize} bytes`);
      }

      const body = Buffer.from(await response.arrayBuffer());
      if (body.byteLength > maxSize) {
        throw new Error(`Response too large: ${body.byteLength} bytes`);
      }

      return await this.decodeBody(body, response.headers.get('content-type') || '', url);
    } catch (error) {
      logger.error('URL fetch failed', {
        url,
        error: this.errorMessage(error),
      });
      throw error;
    }
  }

  /** Reads a local file and converts it to plain text based on its extension. */
  private async fetchFromLocal(filePath: string): Promise<string> {
    try {
      // Read raw bytes: a PDF must not be decoded as UTF-8 before parsing.
      const raw = await readFile(filePath);
      return await this.decodeBody(raw, '', filePath);
    } catch (error) {
      logger.error('Local file read failed', {
        path: filePath,
        error: this.errorMessage(error),
      });
      throw error;
    }
  }

  /**
   * Converts downloaded bytes to plain text. Shared by the S3, HTTP and local
   * sources so they all recognise the same PDF and markdown inputs.
   *
   * @param body - Raw bytes of the document.
   * @param contentType - MIME type reported by the source, or `''`.
   * @param location - URL, object key or path used for extension detection.
   */
  private async decodeBody(body: Buffer, contentType: string, location: string): Promise<string> {
    const type = contentType.toLowerCase();
    const extension = this.extensionOf(location);

    if (type.includes('application/pdf') || extension === 'pdf') {
      return this.parsePdfBuffer(body);
    }

    const text = body.toString('utf-8');
    const isMarkdown =
      type.includes('text/markdown') ||
      DocumentFetcherService.MARKDOWN_EXTENSIONS.includes(extension);
    return isMarkdown ? this.parseMarkdownText(text) : text;
  }

  /**
   * Lower-cased extension of the last path segment, or `''` when there is
   * none. For `scheme://` URLs only the path is inspected, so host names,
   * query strings and fragments never leak into the result.
   */
  private extensionOf(location: string): string {
    let path = location;
    if (/^[a-z][a-z0-9+.-]*:\/\//i.test(location)) {
      try {
        path = new URL(location).pathname;
      } catch {
        path = location.split(/[?#]/, 1)[0];
      }
    }

    const separator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
    const fileName = path.slice(separator + 1);
    const dot = fileName.lastIndexOf('.');
    return dot >= 0 ? fileName.slice(dot + 1).toLowerCase() : '';
  }

  /**
   * Splits an `s3://bucket/key` URL and merges in connection settings from
   * `retrievalConfig` (provider `'s3'`) or the standard AWS env variables.
   *
   * @throws Error when the URL lacks the `s3://` prefix, a bucket or a key.
   */
  private parseS3Url(
    url: string,
    retrievalConfig?: RetrievalConfig
  ): {
    bucket: string;
    key: string;
    region: string;
    endpoint?: string;
    accessKeyId?: string;
    secretAccessKey?: string;
    forcePathStyle?: boolean;
  } {
    const prefix = 's3://';
    const withoutProtocol = url.startsWith(prefix) ? url.slice(prefix.length) : '';
    const firstSlash = withoutProtocol.indexOf('/');
    const bucket = firstSlash === -1 ? '' : withoutProtocol.slice(0, firstSlash);
    const key = firstSlash === -1 ? '' : withoutProtocol.slice(firstSlash + 1);

    if (!bucket || !key) {
      throw new Error(`Invalid S3 URL format: ${url}`);
    }

    // Get S3-specific config from retrievalConfig if provider is s3.
    const normalized = this.normalizeRetrievalConfig(retrievalConfig);
    const s3SpecificConfig: S3Config =
      normalized?.provider === 's3' && normalized.config ? (normalized.config as S3Config) : {};

    return {
      bucket,
      key,
      region: s3SpecificConfig.region || process.env.AWS_REGION || 'us-east-1',
      endpoint: s3SpecificConfig.endpoint,
      accessKeyId: s3SpecificConfig.accessKeyId || process.env.AWS_ACCESS_KEY_ID,
      secretAccessKey: s3SpecificConfig.secretAccessKey || process.env.AWS_SECRET_ACCESS_KEY,
      forcePathStyle: s3SpecificConfig.forcePathStyle,
    };
  }

  /** Runs the retrieval config through `preprocessRawConfig` (nested Agentlang config). */
  private normalizeRetrievalConfig(config?: RetrievalConfig): RetrievalConfig | undefined {
    if (!config) {
      return undefined;
    }
    return preprocessRawConfig(config) as RetrievalConfig;
  }

  /**
   * Returns a cached S3 client for the given connection settings, creating it
   * on first use. `forcePathStyle` is part of the cache key because it changes
   * how the client addresses buckets.
   */
  private async getOrCreateS3Client(config: {
    region: string;
    endpoint?: string;
    accessKeyId?: string;
    secretAccessKey?: string;
    forcePathStyle?: boolean;
  }): Promise<S3Client> {
    const clientKey = [
      config.region,
      config.endpoint || 'default',
      config.accessKeyId || 'default',
      config.forcePathStyle ? 'path-style' : 'default',
    ].join(':');

    let client = this.s3Clients.get(clientKey);
    if (!client) {
      client = new S3Client({
        region: config.region,
        endpoint: config.endpoint,
        forcePathStyle: config.forcePathStyle,
        // Explicit credentials only when both halves are present; otherwise
        // the SDK's default credential resolution applies.
        credentials:
          config.accessKeyId && config.secretAccessKey
            ? {
                accessKeyId: config.accessKeyId,
                secretAccessKey: config.secretAccessKey,
              }
            : undefined,
      });
      this.s3Clients.set(clientKey, client);
    }
    return client;
  }

  /** Lazily imports `pdf-parse` and memoizes its entry point. */
  private async loadPdfParser(): Promise<(data: Buffer) => Promise<{ text?: string }>> {
    if (!this.pdfParser) {
      try {
        const pdfParse = await import('pdf-parse');
        // Handle both ESM and CJS module formats.
        this.pdfParser = (pdfParse as any).default || pdfParse;
      } catch (error) {
        logger.error('Failed to load PDF parser', { error: this.errorMessage(error) });
        throw new Error(
          'PDF parsing not available. Please install pdf-parse: npm install pdf-parse'
        );
      }
    }
    return this.pdfParser as (data: Buffer) => Promise<{ text?: string }>;
  }

  /**
   * Extracts the text of a PDF.
   *
   * @throws Error when `pdf-parse` cannot be loaded or parsing fails.
   */
  private async parsePdfBuffer(buffer: Buffer): Promise<string> {
    const parser = await this.loadPdfParser();
    try {
      const result = await parser(buffer);
      return result.text || '';
    } catch (error) {
      logger.error('PDF parsing failed', { error: this.errorMessage(error) });
      throw new Error(`Failed to parse PDF: ${error}`);
    }
  }

  /**
   * Converts markdown to plain text for embedding by rendering it to HTML and
   * stripping the markup. Returns the input unchanged if rendering fails.
   */
  private parseMarkdownText(text: string): string {
    try {
      return this.htmlToPlainText(marked.parse(text) as string);
    } catch (error) {
      logger.warn('Markdown parsing failed, returning raw text', {
        error: this.errorMessage(error),
      });
      return text;
    }
  }

  /**
   * Strips tags, collapses whitespace and decodes the entities produced by
   * HTML escaping (`&lt; &gt; &quot; &#39; &amp;`).
   */
  private htmlToPlainText(html: string): string {
    return (
      html
        .replace(/<[^>]+>/g, ' ') // Remove HTML tags
        .replace(/\s+/g, ' ') // Normalize whitespace
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        // `&amp;` must be decoded last, otherwise `&amp;quot;` would be
        // decoded twice and turn into a bare quote.
        .replace(/&amp;/g, '&')
        .trim()
    );
  }

  /** Collects an S3 response body (SDK stream helper or async iterable) into a Buffer. */
  private async readS3BodyToBuffer(body: any): Promise<Buffer> {
    if (body.transformToByteArray) {
      return Buffer.from(await body.transformToByteArray());
    }

    // Fallback for Readable streams
    const chunks: Buffer[] = [];
    for await (const chunk of body) {
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    }
    return Buffer.concat(chunks);
  }

  /**
   * Builds the attribute map literal of the `Document` upsert statement.
   * Every interpolated value is escaped so that quotes, backslashes or line
   * breaks in a title or config cannot terminate the string literal early.
   */
  private buildDocumentAttributes(document: FetchedDocument): string {
    const attributes = [
      `title "${this.escapeContent(document.title)}"`,
      `content "${this.escapeContent(document.content)}"`,
    ];
    if (document.embeddingConfig) {
      const configJson = JSON.stringify(document.embeddingConfig);
      attributes.push(`embeddingConfig "${this.escapeContent(configJson)}"`);
    }
    return `{${attributes.join(', ')}}`;
  }

  /**
   * Upserts the `Document` entity for a fetched document. Best-effort: a
   * failure is logged and does not fail the fetch.
   */
  private async createDocumentEntity(document: FetchedDocument): Promise<void> {
    try {
      const docAttrs = this.buildDocumentAttributes(document);
      await parseAndEvaluateStatement(`{${CoreAIModuleName}/Document ${docAttrs}, @upsert}`);

      logger.debug('Created Document entity', {
        title: document.title,
        url: document.url,
        hasEmbeddingConfig: !!document.embeddingConfig,
      });
    } catch (error) {
      logger.error('Failed to create Document entity', {
        title: document.title,
        error: this.errorMessage(error),
      });
    }
  }

  /** Escapes backslashes, double quotes, newlines, carriage returns and tabs. */
  private escapeContent(content: string): string {
    return content
      .replace(/\\/g, '\\\\')
      .replace(/"/g, '\\"')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r')
      .replace(/\t/g, '\\t');
  }

  /**
   * Infers a format name from the source URL: the lower-cased file extension,
   * or `'txt'` when there is none and for `document-service://` URLs.
   */
  private inferFormat(url: string): string {
    if (url.startsWith(DocumentFetcherService.DOCUMENT_SERVICE_SCHEME)) {
      return 'txt';
    }
    return this.extensionOf(url) || 'txt';
  }

  /** Looks a title up in the documents returned by `getConfiguredDocuments()`. */
  private findDocumentInConfig(title: string): DocumentConfig | null {
    const docs = getConfiguredDocuments();
    return docs.find(d => d.title === title) || null;
  }

  /** Message of an `Error`, or the string form of any other thrown value. */
  private errorMessage(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** @throws Error when `isNodeEnv` is falsy. */
  private ensureNodeEnv(): void {
    if (!isNodeEnv) {
      throw new Error('Document fetching is only available in Node.js environment');
    }
  }

  /** Drops every cached document. */
  clearCache(): void {
    this.documentCache.clear();
  }
}
673
+
674
+ // Singleton instance
675
+ const documentFetcher = new DocumentFetcherService(); // Single shared instance (exported below), so its document cache and S3 clients are shared by every importer.
676
+
677
+ // Helper function to get configured documents from module config
678
/**
 * Returns the documents registered through `setConfiguredDocuments`.
 *
 * The list lives on the global object under `__configuredDocuments`. It is
 * read via `globalThis` rather than the Node-only `global`, so the call does
 * not throw a ReferenceError in non-Node runtimes.
 *
 * @returns The registered list, or an empty array when nothing (or a
 *   non-array value) is stored.
 */
function getConfiguredDocuments(): DocumentConfig[] {
  const registered: unknown = (globalThis as any).__configuredDocuments;
  return Array.isArray(registered) ? (registered as DocumentConfig[]) : [];
}
684
+
685
+ // Export for use in config loading
686
/**
 * Registers the documents declared in configuration so that they can later
 * be found by title.
 *
 * The list is stored on the global object under `__configuredDocuments`,
 * replacing any previous list. `globalThis` is used instead of the Node-only
 * `global`, so config loading does not throw in non-Node runtimes.
 *
 * @param docs - Complete list of configured documents.
 */
export function setConfiguredDocuments(docs: DocumentConfig[]): void {
  (globalThis as any).__configuredDocuments = docs;
}
689
+
690
+ export { documentFetcher }; // Named export of the shared fetcher instance.
691
+ export default documentFetcher; // Default export of the same instance as the named export above.