agentlang 0.9.10 → 0.9.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/out/extension/main.cjs +38 -38
  2. package/out/extension/main.cjs.map +2 -2
  3. package/out/language/generated/ast.d.ts +1 -1
  4. package/out/language/generated/ast.js +1 -1
  5. package/out/language/generated/grammar.d.ts +1 -1
  6. package/out/language/generated/grammar.js +1 -1
  7. package/out/language/generated/module.d.ts +1 -1
  8. package/out/language/generated/module.js +1 -1
  9. package/out/language/main.cjs +850 -2388
  10. package/out/language/main.cjs.map +4 -4
  11. package/out/runtime/agents/common.d.ts +3 -1
  12. package/out/runtime/agents/common.d.ts.map +1 -1
  13. package/out/runtime/agents/common.js +35 -31
  14. package/out/runtime/agents/common.js.map +1 -1
  15. package/out/runtime/docs.d.ts +1 -0
  16. package/out/runtime/docs.d.ts.map +1 -1
  17. package/out/runtime/docs.js +16 -1
  18. package/out/runtime/docs.js.map +1 -1
  19. package/out/runtime/interpreter.d.ts +1 -0
  20. package/out/runtime/interpreter.d.ts.map +1 -1
  21. package/out/runtime/interpreter.js +41 -8
  22. package/out/runtime/interpreter.js.map +1 -1
  23. package/out/runtime/jsmodules.d.ts +2 -1
  24. package/out/runtime/jsmodules.d.ts.map +1 -1
  25. package/out/runtime/jsmodules.js +2 -1
  26. package/out/runtime/jsmodules.js.map +1 -1
  27. package/out/runtime/loader.d.ts.map +1 -1
  28. package/out/runtime/loader.js +3 -2
  29. package/out/runtime/loader.js.map +1 -1
  30. package/out/runtime/module.d.ts +1 -0
  31. package/out/runtime/module.d.ts.map +1 -1
  32. package/out/runtime/module.js +3 -0
  33. package/out/runtime/module.js.map +1 -1
  34. package/out/runtime/modules/ai.d.ts +11 -0
  35. package/out/runtime/modules/ai.d.ts.map +1 -1
  36. package/out/runtime/modules/ai.js +163 -10
  37. package/out/runtime/modules/ai.js.map +1 -1
  38. package/out/runtime/modules/core.d.ts.map +1 -1
  39. package/out/runtime/modules/core.js +7 -1
  40. package/out/runtime/modules/core.js.map +1 -1
  41. package/out/runtime/services/documentFetcher.d.ts +22 -14
  42. package/out/runtime/services/documentFetcher.d.ts.map +1 -1
  43. package/out/runtime/services/documentFetcher.js +348 -153
  44. package/out/runtime/services/documentFetcher.js.map +1 -1
  45. package/package.json +1 -1
  46. package/src/language/generated/ast.ts +1 -1
  47. package/src/language/generated/grammar.ts +1 -1
  48. package/src/language/generated/module.ts +1 -1
  49. package/src/runtime/agents/common.ts +37 -31
  50. package/src/runtime/docs.ts +17 -1
  51. package/src/runtime/interpreter.ts +44 -6
  52. package/src/runtime/jsmodules.ts +3 -1
  53. package/src/runtime/loader.ts +3 -2
  54. package/src/runtime/module.ts +4 -0
  55. package/src/runtime/modules/ai.ts +194 -9
  56. package/src/runtime/modules/core.ts +7 -1
  57. package/src/runtime/services/documentFetcher.ts +372 -149
@@ -1,4 +1,4 @@
1
- import { default as ai } from './ai.js';
1
+ import { default as ai, normalizeGeneratedCode } from './ai.js';
2
2
  import { default as auth } from './auth.js';
3
3
  import { default as files } from './files.js';
4
4
  import { default as mcp } from './mcp.js';
@@ -527,6 +527,12 @@ export function eventMonitorsData(
527
527
  export async function validateModule(moduleDef: any): Promise<Instance> {
528
528
  try {
529
529
  if (isString(moduleDef)) {
530
+ moduleDef = normalizeGeneratedCode(moduleDef);
531
+ if (!moduleDef.startsWith('module')) {
532
+ moduleDef = `module Temp
533
+ ${moduleDef}
534
+ `;
535
+ }
530
536
  await parseModule(moduleDef);
531
537
  return makeInstance(
532
538
  'agentlang',
@@ -1,6 +1,5 @@
1
- import { S3Client, GetObjectCommand } from '@aws-sdk/client-s3';
1
+ import { GetObjectCommand, S3Client } from '@aws-sdk/client-s3';
2
2
  import { readFile } from 'node:fs/promises';
3
- import path from 'node:path';
4
3
  import { logger } from '../logger.js';
5
4
  import { parseAndEvaluateStatement } from '../interpreter.js';
6
5
  import { CoreAIModuleName } from '../modules/ai.js';
@@ -20,7 +19,7 @@ export interface S3Config {
20
19
 
21
20
  // Generic retrieval configuration for any storage provider
22
21
  export interface RetrievalConfig {
23
- provider: 's3' | 'box' | 'gdrive' | 'azure' | 'onedrive' | string;
22
+ provider: 's3' | 'box' | 'gdrive' | 'azure' | 'onedrive' | 'document-service' | string;
24
23
  config: S3Config | Record<string, any>;
25
24
  }
26
25
 
@@ -33,7 +32,8 @@ export interface EmbeddingConfig {
33
32
 
34
33
  export interface DocumentConfig {
35
34
  title: string;
36
- url: string;
35
+ url?: string;
36
+ documentServiceId?: string;
37
37
  retrievalConfig?: RetrievalConfig;
38
38
  embeddingConfig?: EmbeddingConfig;
39
39
  }
@@ -47,15 +47,28 @@ export interface FetchedDocument {
47
47
  embeddingConfig?: EmbeddingConfig;
48
48
  }
49
49
 
50
+ interface DocumentServiceConfig {
51
+ baseUrl: string;
52
+ appName: string;
53
+ authToken?: string;
54
+ getAuthToken?: () => Promise<string>;
55
+ }
56
+
50
57
  class DocumentFetcherService {
51
58
  private static readonly CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
52
59
  private documentCache = new TtlCache<FetchedDocument>(DocumentFetcherService.CACHE_TTL_MS);
53
60
  private s3Clients = new Map<string, any>();
54
61
  private pdfParser: any = null;
62
+ private documentServiceConfig?: DocumentServiceConfig;
63
+
64
+ configureDocumentService(config: DocumentServiceConfig): void {
65
+ this.documentServiceConfig = config;
66
+ logger.info('Document service configured', { baseUrl: config.baseUrl });
67
+ }
55
68
 
56
69
  async fetchDocument(config: DocumentConfig): Promise<FetchedDocument | null> {
57
70
  this.ensureNodeEnv();
58
- const cacheKey = `${config.title}:${config.url}`;
71
+ const cacheKey = `${config.title}:${config.url || config.documentServiceId}`;
59
72
  const cached = this.documentCache.get(cacheKey);
60
73
 
61
74
  if (cached) {
@@ -65,28 +78,99 @@ class DocumentFetcherService {
65
78
 
66
79
  try {
67
80
  let content: string;
68
-
69
- if (config.url.startsWith('s3://')) {
81
+ let sourceUrl: string;
82
+
83
+ if (config.url?.startsWith('document-service://')) {
84
+ if (!config.retrievalConfig || config.retrievalConfig.provider !== 'document-service') {
85
+ throw new Error(
86
+ 'Document service URL requires retrievalConfig with provider: "document-service"'
87
+ );
88
+ }
89
+
90
+ const dsConfig = config.retrievalConfig.config as DocumentServiceConfig;
91
+ if (!dsConfig?.baseUrl) {
92
+ throw new Error('Document service config requires baseUrl');
93
+ }
94
+
95
+ const urlPath = config.url.replace('document-service://', '');
96
+ const parts = urlPath.split('/');
97
+
98
+ if (parts.length !== 3) {
99
+ throw new Error(
100
+ `Invalid document service URL format: ${config.url}. Expected: document-service://<user-uuid>/<app-uuid>/<doc-uuid>.ext`
101
+ );
102
+ }
103
+
104
+ const appUuid = parts[1];
105
+ const docIdWithExt = parts[2];
106
+ const docId = docIdWithExt.split('.')[0]; // Remove extension
107
+
108
+ this.documentServiceConfig = {
109
+ baseUrl: dsConfig.baseUrl,
110
+ appName: appUuid,
111
+ authToken: dsConfig.authToken,
112
+ getAuthToken: dsConfig.getAuthToken,
113
+ };
114
+
115
+ content = await this.fetchFromDocumentService(docId);
116
+ sourceUrl = config.url;
117
+ } else if (config.retrievalConfig?.provider === 'document-service') {
118
+ const dsConfig = config.retrievalConfig.config as DocumentServiceConfig;
119
+ if (!dsConfig?.baseUrl || !dsConfig?.appName) {
120
+ throw new Error('Document service config requires baseUrl and appName');
121
+ }
122
+
123
+ this.documentServiceConfig = {
124
+ baseUrl: dsConfig.baseUrl,
125
+ appName: dsConfig.appName,
126
+ authToken: dsConfig.authToken,
127
+ getAuthToken: dsConfig.getAuthToken,
128
+ };
129
+
130
+ const docId = await this.lookupDocumentByTitle(config.title);
131
+ if (docId) {
132
+ content = await this.fetchFromDocumentService(docId);
133
+ sourceUrl = `document-service://${docId}`;
134
+ } else {
135
+ throw new Error(`Document not found by title in document service: ${config.title}`);
136
+ }
137
+ } else if (config.documentServiceId && this.documentServiceConfig) {
138
+ content = await this.fetchFromDocumentService(config.documentServiceId);
139
+ sourceUrl = `document-service://${config.documentServiceId}`;
140
+ } else if (config.url?.startsWith('s3://')) {
70
141
  content = await this.fetchFromS3(config);
71
- } else if (config.url.startsWith('http://') || config.url.startsWith('https://')) {
142
+ sourceUrl = config.url;
143
+ } else if (config.url?.startsWith('http://') || config.url?.startsWith('https://')) {
72
144
  content = await this.fetchFromUrl(config.url);
73
- } else {
74
- // Local file path
145
+ sourceUrl = config.url;
146
+ } else if (config.url) {
75
147
  content = await this.fetchFromLocal(config.url);
148
+ sourceUrl = config.url;
149
+ } else {
150
+ if (this.documentServiceConfig) {
151
+ const docId = await this.lookupDocumentByTitle(config.title);
152
+ if (docId) {
153
+ content = await this.fetchFromDocumentService(docId);
154
+ sourceUrl = `document-service://${docId}`;
155
+ } else {
156
+ throw new Error(`Document not found by title: ${config.title}`);
157
+ }
158
+ } else {
159
+ throw new Error(`No URL or document service ID provided for: ${config.title}`);
160
+ }
76
161
  }
77
162
 
78
163
  const document: FetchedDocument = {
79
164
  title: config.title,
80
165
  content,
81
- url: config.url,
82
- format: this.inferFormat(config.url),
166
+ url: sourceUrl,
167
+ format: this.inferFormat(sourceUrl),
83
168
  fetchedAt: new Date(),
84
169
  embeddingConfig: config.embeddingConfig,
85
170
  };
86
171
 
87
172
  this.documentCache.set(cacheKey, document);
88
173
 
89
- // Auto-create Document entity from fetched content
90
174
  await this.createDocumentEntity(document);
91
175
 
92
176
  return document;
@@ -94,27 +178,44 @@ class DocumentFetcherService {
94
178
  logger.error('Failed to fetch document', {
95
179
  title: config.title,
96
180
  url: config.url,
181
+ documentServiceId: config.documentServiceId,
97
182
  error: error instanceof Error ? error.message : String(error),
98
183
  stack: error instanceof Error ? error.stack : undefined,
99
184
  });
100
- // Re-throw the error so the caller knows what happened
101
185
  throw error;
102
186
  }
103
187
  }
104
188
 
105
189
  async fetchDocumentByTitle(title: string): Promise<FetchedDocument | null> {
106
190
  this.ensureNodeEnv();
107
- // First check if we have it in cache
108
- // Note: TtlCache doesn't have a way to search by prefix, so we'll fetch directly
109
191
 
110
192
  try {
111
- // Try to find in loaded config
193
+ // First check if we have it in cache
194
+ const cacheKey = `${title}:lookup`;
195
+ const cached = this.documentCache.get(cacheKey);
196
+ if (cached) {
197
+ logger.debug('Returning cached document by title', { title });
198
+ return cached;
199
+ }
200
+
201
+ // Try document service lookup first (if configured)
202
+ if (this.documentServiceConfig) {
203
+ const docId = await this.lookupDocumentByTitle(title);
204
+ if (docId) {
205
+ return this.fetchDocument({
206
+ title,
207
+ documentServiceId: docId,
208
+ });
209
+ }
210
+ }
211
+
212
+ // Fall back to config-based lookup
112
213
  const doc = this.findDocumentInConfig(title);
113
214
  if (doc) {
114
215
  return this.fetchDocument(doc);
115
216
  }
116
217
 
117
- logger.warn('Document not found in config', { title });
218
+ logger.warn('Document not found', { title });
118
219
  return null;
119
220
  } catch (error) {
120
221
  logger.error('Failed to fetch document by title', { title, error });
@@ -122,15 +223,140 @@ class DocumentFetcherService {
122
223
  }
123
224
  }
124
225
 
125
- private findDocumentInConfig(title: string): DocumentConfig | null {
126
- // This method should be called during config loading
127
- // The documents are stored when the config is parsed
128
- const docs = getConfiguredDocuments();
129
- return docs.find(d => d.title === title) || null;
226
+ // Fetch from secure document-service API
227
+ private async fetchFromDocumentService(documentId: string): Promise<string> {
228
+ if (!this.documentServiceConfig) {
229
+ throw new Error('Document service not configured');
230
+ }
231
+
232
+ try {
233
+ // Get token - either static from config or dynamic from function
234
+ let token: string;
235
+ if (this.documentServiceConfig.authToken) {
236
+ token = this.documentServiceConfig.authToken;
237
+ } else if (this.documentServiceConfig.getAuthToken) {
238
+ token = await this.documentServiceConfig.getAuthToken();
239
+ } else {
240
+ throw new Error('Document service requires authToken or getAuthToken');
241
+ }
242
+
243
+ const url = `${this.documentServiceConfig.baseUrl}/api/documents/${documentId}/content`;
244
+
245
+ logger.debug('Fetching from document service', { documentId, url });
246
+
247
+ const response = await fetch(url, {
248
+ headers: {
249
+ Authorization: `Bearer ${token}`,
250
+ 'x-app-name': this.documentServiceConfig.appName,
251
+ Accept: 'application/json',
252
+ },
253
+ });
254
+
255
+ if (!response.ok) {
256
+ if (response.status === 404) {
257
+ throw new Error(`Document not found: ${documentId}`);
258
+ } else if (response.status === 403) {
259
+ throw new Error(`Access denied to document: ${documentId}`);
260
+ } else {
261
+ throw new Error(`Document service error: ${response.status} ${response.statusText}`);
262
+ }
263
+ }
264
+
265
+ const data = await response.json();
266
+
267
+ if (data.isBase64) {
268
+ if (data.mimeType?.includes('pdf') || data.format?.toLowerCase() === 'pdf') {
269
+ try {
270
+ const { parsePdfBuffer } = await import('../docs.js');
271
+ const buffer = Buffer.from(data.content, 'base64');
272
+ const text = await parsePdfBuffer(new Uint8Array(buffer));
273
+ logger.debug('Extracted text from PDF', { documentId, textLength: text.length });
274
+ return text;
275
+ } catch (pdfError: any) {
276
+ logger.error('Failed to parse PDF from document service', {
277
+ documentId,
278
+ error: pdfError.message,
279
+ });
280
+ throw new Error(`Failed to extract text from PDF: ${pdfError.message}`);
281
+ }
282
+ }
283
+ return Buffer.from(data.content, 'base64').toString('utf-8');
284
+ }
285
+
286
+ if (data.format?.toLowerCase() === 'md' || data.format?.toLowerCase() === 'markdown') {
287
+ try {
288
+ const parsedText = this.parseMarkdownText(data.content);
289
+ logger.debug('Parsed markdown content', { documentId, textLength: parsedText.length });
290
+ return parsedText;
291
+ } catch (mdError: any) {
292
+ logger.warn('Markdown parsing failed, returning raw content', {
293
+ documentId,
294
+ error: mdError.message,
295
+ });
296
+ return data.content;
297
+ }
298
+ }
299
+
300
+ return data.content;
301
+ } catch (error) {
302
+ logger.error('Document service fetch failed', {
303
+ documentId,
304
+ error: error instanceof Error ? error.message : String(error),
305
+ });
306
+ throw error;
307
+ }
308
+ }
309
+
310
+ private async lookupDocumentByTitle(title: string): Promise<string | null> {
311
+ if (!this.documentServiceConfig) {
312
+ return null;
313
+ }
314
+
315
+ try {
316
+ let token: string;
317
+ if (this.documentServiceConfig.authToken) {
318
+ token = this.documentServiceConfig.authToken;
319
+ } else if (this.documentServiceConfig.getAuthToken) {
320
+ token = await this.documentServiceConfig.getAuthToken();
321
+ } else {
322
+ throw new Error('Document service requires authToken or getAuthToken');
323
+ }
324
+
325
+ const url = `${this.documentServiceConfig.baseUrl}/api/documents/lookup/by-title?title=${encodeURIComponent(title)}`;
326
+
327
+ logger.debug('Looking up document by title', { title, url });
328
+
329
+ const response = await fetch(url, {
330
+ headers: {
331
+ Authorization: `Bearer ${token}`,
332
+ 'x-app-name': this.documentServiceConfig.appName,
333
+ Accept: 'application/json',
334
+ },
335
+ });
336
+
337
+ if (response.status === 404) {
338
+ logger.debug('Document not found by title', { title });
339
+ return null;
340
+ }
341
+
342
+ if (!response.ok) {
343
+ throw new Error(`Document service lookup error: ${response.status}`);
344
+ }
345
+
346
+ const data = await response.json();
347
+ logger.debug('Found document by title', { title, documentId: data.documentId });
348
+ return data.documentId;
349
+ } catch (error) {
350
+ logger.error('Document lookup failed', {
351
+ title,
352
+ error: error instanceof Error ? error.message : String(error),
353
+ });
354
+ return null;
355
+ }
130
356
  }
131
357
 
132
358
  private async fetchFromS3(config: DocumentConfig): Promise<string> {
133
- const s3Config = this.parseS3Url(config.url, config.retrievalConfig);
359
+ const s3Config = this.parseS3Url(config.url!, config.retrievalConfig);
134
360
  const client = await this.getOrCreateS3Client(s3Config);
135
361
 
136
362
  try {
@@ -199,29 +425,39 @@ class DocumentFetcherService {
199
425
  const isMarkdown =
200
426
  contentType.includes('text/markdown') ||
201
427
  lowerUrl.endsWith('.md') ||
202
- lowerUrl.endsWith('.markdown') ||
203
- lowerUrl.endsWith('.mdown');
204
- const text = Buffer.from(body).toString('utf-8');
205
- return isMarkdown ? this.parseMarkdownText(text) : text;
428
+ lowerUrl.endsWith('.markdown');
429
+
430
+ if (isMarkdown) {
431
+ return this.parseMarkdownText(Buffer.from(body).toString('utf-8'));
432
+ }
433
+
434
+ return Buffer.from(body).toString('utf-8');
206
435
  } catch (error) {
207
- logger.error('URL fetch failed', { url, error });
208
- throw new Error(`Failed to fetch from URL: ${error}`);
436
+ logger.error('URL fetch failed', {
437
+ url,
438
+ error: error instanceof Error ? error.message : String(error),
439
+ });
440
+ throw error;
209
441
  }
210
442
  }
211
443
 
212
444
  private async fetchFromLocal(filePath: string): Promise<string> {
213
445
  try {
214
- const resolvedPath = path.resolve(filePath);
215
- const content = await readFile(resolvedPath, 'utf-8');
216
- const lowerPath = resolvedPath.toLowerCase();
217
- const isMarkdown =
218
- lowerPath.endsWith('.md') ||
219
- lowerPath.endsWith('.markdown') ||
220
- lowerPath.endsWith('.mdown');
221
- return isMarkdown ? this.parseMarkdownText(content) : content;
446
+ const content = await readFile(filePath, 'utf-8');
447
+ const lowerPath = filePath.toLowerCase();
448
+ const isMarkdown = lowerPath.endsWith('.md') || lowerPath.endsWith('.markdown');
449
+
450
+ if (isMarkdown) {
451
+ return this.parseMarkdownText(content);
452
+ }
453
+
454
+ return content;
222
455
  } catch (error) {
223
- logger.error('Local file read failed', { path: filePath, error });
224
- throw new Error(`Failed to read local file: ${error}`);
456
+ logger.error('Local file read failed', {
457
+ path: filePath,
458
+ error: error instanceof Error ? error.message : String(error),
459
+ });
460
+ throw error;
225
461
  }
226
462
  }
227
463
 
@@ -238,15 +474,11 @@ class DocumentFetcherService {
238
474
  forcePathStyle?: boolean;
239
475
  } {
240
476
  // Parse s3://bucket/key format
241
- if (!url.startsWith('s3://')) {
242
- throw new Error('Invalid S3 URL format. Expected: s3://bucket/key');
243
- }
244
-
245
- const withoutProtocol = url.slice(5);
477
+ const withoutProtocol = url.replace('s3://', '');
246
478
  const firstSlash = withoutProtocol.indexOf('/');
247
479
 
248
480
  if (firstSlash === -1) {
249
- throw new Error('Invalid S3 URL format. Expected: s3://bucket/key');
481
+ throw new Error(`Invalid S3 URL format: ${url}`);
250
482
  }
251
483
 
252
484
  const bucket = withoutProtocol.slice(0, firstSlash);
@@ -271,6 +503,17 @@ class DocumentFetcherService {
271
503
  };
272
504
  }
273
505
 
506
+ private normalizeRetrievalConfig(config?: RetrievalConfig): RetrievalConfig | undefined {
507
+ if (!config) {
508
+ return undefined;
509
+ }
510
+
511
+ // Handle nested config structure from Agentlang
512
+ const normalizedConfig = preprocessRawConfig(config) as RetrievalConfig;
513
+
514
+ return normalizedConfig;
515
+ }
516
+
274
517
  private async getOrCreateS3Client(config: {
275
518
  region: string;
276
519
  endpoint?: string;
@@ -300,6 +543,65 @@ class DocumentFetcherService {
300
543
  return this.s3Clients.get(clientKey)!;
301
544
  }
302
545
 
546
+ private async parsePdfBuffer(buffer: Buffer): Promise<string> {
547
+ // Lazy load PDF parser
548
+ if (!this.pdfParser) {
549
+ try {
550
+ const pdfParse = await import('pdf-parse');
551
+ // Handle both ESM and CommonJS module formats
552
+ const parser = (pdfParse as any).default || pdfParse;
553
+ this.pdfParser = parser;
554
+ } catch (error) {
555
+ logger.error('Failed to load PDF parser', { error });
556
+ throw new Error(
557
+ 'PDF parsing not available. Please install pdf-parse: npm install pdf-parse'
558
+ );
559
+ }
560
+ }
561
+
562
+ try {
563
+ const result = await this.pdfParser(buffer);
564
+ return result.text || '';
565
+ } catch (error) {
566
+ logger.error('PDF parsing failed', { error });
567
+ throw new Error(`Failed to parse PDF: ${error}`);
568
+ }
569
+ }
570
+
571
+ private parseMarkdownText(text: string): string {
572
+ // Convert markdown to plain text for embedding
573
+ // This removes formatting but preserves content structure
574
+ try {
575
+ const html = marked.parse(text) as string;
576
+ // Simple HTML to text conversion
577
+ return html
578
+ .replace(/<[^>]+>/g, ' ') // Remove HTML tags
579
+ .replace(/\s+/g, ' ') // Normalize whitespace
580
+ .replace(/&lt;/g, '<')
581
+ .replace(/&gt;/g, '>')
582
+ .replace(/&amp;/g, '&')
583
+ .replace(/&quot;/g, '"')
584
+ .trim();
585
+ } catch (error) {
586
+ logger.warn('Markdown parsing failed, returning raw text', { error });
587
+ return text;
588
+ }
589
+ }
590
+
591
+ private async readS3BodyToBuffer(body: any): Promise<Buffer> {
592
+ if (body.transformToByteArray) {
593
+ const data = await body.transformToByteArray();
594
+ return Buffer.from(data);
595
+ }
596
+
597
+ // Fallback for Readable streams
598
+ const chunks: Buffer[] = [];
599
+ for await (const chunk of body) {
600
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
601
+ }
602
+ return Buffer.concat(chunks);
603
+ }
604
+
303
605
  private async createDocumentEntity(document: FetchedDocument): Promise<void> {
304
606
  try {
305
607
  // Build the Document entity attributes
@@ -339,6 +641,10 @@ class DocumentFetcherService {
339
641
  }
340
642
 
341
643
  private inferFormat(url: string): string {
644
+ // Handle document-service URLs
645
+ if (url.startsWith('document-service://')) {
646
+ return 'txt';
647
+ }
342
648
  const parts = url.split('.');
343
649
  if (parts.length > 1) {
344
650
  return parts[parts.length - 1].toLowerCase();
@@ -346,43 +652,11 @@ class DocumentFetcherService {
346
652
  return 'txt';
347
653
  }
348
654
 
349
- clearCache(title?: string): void {
350
- if (title) {
351
- // Note: TtlCache doesn't expose keys, clear all for now
352
- this.documentCache.clear();
353
- } else {
354
- this.documentCache.clear();
355
- }
356
- }
357
-
358
- private normalizeConfigValue(value: any): any {
359
- if (value instanceof Map) {
360
- const obj: Record<string, any> = {};
361
- value.forEach((v, k) => {
362
- obj[k] = this.normalizeConfigValue(v);
363
- });
364
- return obj;
365
- }
366
- if (Array.isArray(value)) {
367
- return value.map(v => this.normalizeConfigValue(v));
368
- }
369
- if (value && typeof value === 'object') {
370
- const obj: Record<string, any> = {};
371
- Object.entries(value).forEach(([k, v]) => {
372
- obj[k] = this.normalizeConfigValue(v);
373
- });
374
- return obj;
375
- }
376
- return value;
377
- }
378
-
379
- private normalizeRetrievalConfig(retrievalConfig?: RetrievalConfig): RetrievalConfig | undefined {
380
- if (!retrievalConfig) return undefined;
381
- const normalized = this.normalizeConfigValue(retrievalConfig);
382
- if (normalized && typeof normalized === 'object') {
383
- preprocessRawConfig(normalized);
384
- }
385
- return normalized as RetrievalConfig;
655
+ private findDocumentInConfig(title: string): DocumentConfig | null {
656
+ // This method should be called during config loading
657
+ // The documents are stored when the config is parsed
658
+ const docs = getConfiguredDocuments();
659
+ return docs.find(d => d.title === title) || null;
386
660
  }
387
661
 
388
662
  private ensureNodeEnv(): void {
@@ -391,78 +665,27 @@ class DocumentFetcherService {
391
665
  }
392
666
  }
393
667
 
394
- private async readS3BodyToBuffer(body: any): Promise<Buffer> {
395
- if (body.transformToByteArray) {
396
- const bytes = await body.transformToByteArray();
397
- return Buffer.from(bytes);
398
- }
399
- if (body.transformToString) {
400
- const text = await body.transformToString('utf-8');
401
- return Buffer.from(text, 'utf-8');
402
- }
403
- const chunks: Buffer[] = [];
404
- for await (const chunk of body) {
405
- chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
406
- }
407
- return Buffer.concat(chunks);
408
- }
409
-
410
- private async getPdfParser(): Promise<any> {
411
- if (!this.pdfParser) {
412
- const pdfModule: any = await import('pdf-parse');
413
- this.pdfParser = pdfModule.PDFParse || pdfModule.default;
414
- }
415
- return this.pdfParser;
416
- }
417
-
418
- private async parsePdfBuffer(buffer: Buffer): Promise<string> {
419
- try {
420
- const PDFParseClass = await this.getPdfParser();
421
- const parser = new PDFParseClass({
422
- data: buffer,
423
- verbosity: 0,
424
- });
425
- const data = await parser.getText();
426
- return data.text;
427
- } catch (error: any) {
428
- logger.error(`Failed to parse PDF: ${error.message}`);
429
- throw new Error(`PDF parsing failed: ${error.message}`);
430
- }
431
- }
432
-
433
- private parseMarkdownText(markdown: string): string {
434
- const html = marked.parse(markdown);
435
- if (typeof html !== 'string') {
436
- return markdown;
437
- }
438
- return html
439
- .replace(/<\s*br\s*\/?>/gi, '\n')
440
- .replace(/<\/(p|li|h[1-6]|blockquote|pre|tr|table)>/gi, '\n')
441
- .replace(/<[^>]+>/g, '')
442
- .replace(/\n{3,}/g, '\n\n')
443
- .trim();
668
+ clearCache(): void {
669
+ // Clear all cache
670
+ this.documentCache.clear();
444
671
  }
445
672
  }
446
673
 
447
- // Store configured documents from config.al
448
- let configuredDocuments: DocumentConfig[] = [];
449
-
450
- export function registerConfiguredDocument(doc: DocumentConfig): void {
451
- // Check if already registered
452
- const existing = configuredDocuments.find(d => d.title === doc.title);
453
- if (!existing) {
454
- configuredDocuments.push(doc);
455
- logger.debug('Registered configured document', { title: doc.title, url: doc.url });
456
- }
457
- }
674
+ // Singleton instance
675
+ const documentFetcher = new DocumentFetcherService();
458
676
 
459
- export function getConfiguredDocuments(): DocumentConfig[] {
460
- return [...configuredDocuments];
677
+ // Helper function to get configured documents from module config
678
+ function getConfiguredDocuments(): DocumentConfig[] {
679
+ // This should be populated during config parsing
680
+ // For now, return empty array - actual implementation depends on how
681
+ // the config system stores document definitions
682
+ return (global as any).__configuredDocuments || [];
461
683
  }
462
684
 
463
- export function clearConfiguredDocuments(): void {
464
- configuredDocuments = [];
685
+ // Export for use in config loading
686
+ export function setConfiguredDocuments(docs: DocumentConfig[]): void {
687
+ (global as any).__configuredDocuments = docs;
465
688
  }
466
689
 
467
- export const documentFetcher = new DocumentFetcherService();
690
+ export { documentFetcher };
468
691
  export default documentFetcher;