agentlang 0.9.10 → 0.9.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/extension/main.cjs +38 -38
- package/out/extension/main.cjs.map +2 -2
- package/out/language/generated/ast.d.ts +1 -1
- package/out/language/generated/ast.js +1 -1
- package/out/language/generated/grammar.d.ts +1 -1
- package/out/language/generated/grammar.js +1 -1
- package/out/language/generated/module.d.ts +1 -1
- package/out/language/generated/module.js +1 -1
- package/out/language/main.cjs +850 -2388
- package/out/language/main.cjs.map +4 -4
- package/out/runtime/agents/common.d.ts +3 -1
- package/out/runtime/agents/common.d.ts.map +1 -1
- package/out/runtime/agents/common.js +35 -31
- package/out/runtime/agents/common.js.map +1 -1
- package/out/runtime/docs.d.ts +1 -0
- package/out/runtime/docs.d.ts.map +1 -1
- package/out/runtime/docs.js +16 -1
- package/out/runtime/docs.js.map +1 -1
- package/out/runtime/interpreter.d.ts +1 -0
- package/out/runtime/interpreter.d.ts.map +1 -1
- package/out/runtime/interpreter.js +41 -8
- package/out/runtime/interpreter.js.map +1 -1
- package/out/runtime/jsmodules.d.ts +2 -1
- package/out/runtime/jsmodules.d.ts.map +1 -1
- package/out/runtime/jsmodules.js +2 -1
- package/out/runtime/jsmodules.js.map +1 -1
- package/out/runtime/loader.d.ts.map +1 -1
- package/out/runtime/loader.js +3 -2
- package/out/runtime/loader.js.map +1 -1
- package/out/runtime/module.d.ts +1 -0
- package/out/runtime/module.d.ts.map +1 -1
- package/out/runtime/module.js +3 -0
- package/out/runtime/module.js.map +1 -1
- package/out/runtime/modules/ai.d.ts +11 -0
- package/out/runtime/modules/ai.d.ts.map +1 -1
- package/out/runtime/modules/ai.js +163 -10
- package/out/runtime/modules/ai.js.map +1 -1
- package/out/runtime/modules/core.d.ts.map +1 -1
- package/out/runtime/modules/core.js +7 -1
- package/out/runtime/modules/core.js.map +1 -1
- package/out/runtime/services/documentFetcher.d.ts +22 -14
- package/out/runtime/services/documentFetcher.d.ts.map +1 -1
- package/out/runtime/services/documentFetcher.js +348 -153
- package/out/runtime/services/documentFetcher.js.map +1 -1
- package/package.json +1 -1
- package/src/language/generated/ast.ts +1 -1
- package/src/language/generated/grammar.ts +1 -1
- package/src/language/generated/module.ts +1 -1
- package/src/runtime/agents/common.ts +37 -31
- package/src/runtime/docs.ts +17 -1
- package/src/runtime/interpreter.ts +44 -6
- package/src/runtime/jsmodules.ts +3 -1
- package/src/runtime/loader.ts +3 -2
- package/src/runtime/module.ts +4 -0
- package/src/runtime/modules/ai.ts +194 -9
- package/src/runtime/modules/core.ts +7 -1
- package/src/runtime/services/documentFetcher.ts +372 -149
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { default as ai } from './ai.js';
|
|
1
|
+
import { default as ai, normalizeGeneratedCode } from './ai.js';
|
|
2
2
|
import { default as auth } from './auth.js';
|
|
3
3
|
import { default as files } from './files.js';
|
|
4
4
|
import { default as mcp } from './mcp.js';
|
|
@@ -527,6 +527,12 @@ export function eventMonitorsData(
|
|
|
527
527
|
export async function validateModule(moduleDef: any): Promise<Instance> {
|
|
528
528
|
try {
|
|
529
529
|
if (isString(moduleDef)) {
|
|
530
|
+
moduleDef = normalizeGeneratedCode(moduleDef);
|
|
531
|
+
if (!moduleDef.startsWith('module')) {
|
|
532
|
+
moduleDef = `module Temp
|
|
533
|
+
${moduleDef}
|
|
534
|
+
`;
|
|
535
|
+
}
|
|
530
536
|
await parseModule(moduleDef);
|
|
531
537
|
return makeInstance(
|
|
532
538
|
'agentlang',
|
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { GetObjectCommand, S3Client } from '@aws-sdk/client-s3';
|
|
2
2
|
import { readFile } from 'node:fs/promises';
|
|
3
|
-
import path from 'node:path';
|
|
4
3
|
import { logger } from '../logger.js';
|
|
5
4
|
import { parseAndEvaluateStatement } from '../interpreter.js';
|
|
6
5
|
import { CoreAIModuleName } from '../modules/ai.js';
|
|
@@ -20,7 +19,7 @@ export interface S3Config {
|
|
|
20
19
|
|
|
21
20
|
// Generic retrieval configuration for any storage provider
|
|
22
21
|
export interface RetrievalConfig {
|
|
23
|
-
provider: 's3' | 'box' | 'gdrive' | 'azure' | 'onedrive' | string;
|
|
22
|
+
provider: 's3' | 'box' | 'gdrive' | 'azure' | 'onedrive' | 'document-service' | string;
|
|
24
23
|
config: S3Config | Record<string, any>;
|
|
25
24
|
}
|
|
26
25
|
|
|
@@ -33,7 +32,8 @@ export interface EmbeddingConfig {
|
|
|
33
32
|
|
|
34
33
|
export interface DocumentConfig {
|
|
35
34
|
title: string;
|
|
36
|
-
url
|
|
35
|
+
url?: string;
|
|
36
|
+
documentServiceId?: string;
|
|
37
37
|
retrievalConfig?: RetrievalConfig;
|
|
38
38
|
embeddingConfig?: EmbeddingConfig;
|
|
39
39
|
}
|
|
@@ -47,15 +47,28 @@ export interface FetchedDocument {
|
|
|
47
47
|
embeddingConfig?: EmbeddingConfig;
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
+
interface DocumentServiceConfig {
|
|
51
|
+
baseUrl: string;
|
|
52
|
+
appName: string;
|
|
53
|
+
authToken?: string;
|
|
54
|
+
getAuthToken?: () => Promise<string>;
|
|
55
|
+
}
|
|
56
|
+
|
|
50
57
|
class DocumentFetcherService {
|
|
51
58
|
private static readonly CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
52
59
|
private documentCache = new TtlCache<FetchedDocument>(DocumentFetcherService.CACHE_TTL_MS);
|
|
53
60
|
private s3Clients = new Map<string, any>();
|
|
54
61
|
private pdfParser: any = null;
|
|
62
|
+
private documentServiceConfig?: DocumentServiceConfig;
|
|
63
|
+
|
|
64
|
+
configureDocumentService(config: DocumentServiceConfig): void {
|
|
65
|
+
this.documentServiceConfig = config;
|
|
66
|
+
logger.info('Document service configured', { baseUrl: config.baseUrl });
|
|
67
|
+
}
|
|
55
68
|
|
|
56
69
|
async fetchDocument(config: DocumentConfig): Promise<FetchedDocument | null> {
|
|
57
70
|
this.ensureNodeEnv();
|
|
58
|
-
const cacheKey = `${config.title}:${config.url}`;
|
|
71
|
+
const cacheKey = `${config.title}:${config.url || config.documentServiceId}`;
|
|
59
72
|
const cached = this.documentCache.get(cacheKey);
|
|
60
73
|
|
|
61
74
|
if (cached) {
|
|
@@ -65,28 +78,99 @@ class DocumentFetcherService {
|
|
|
65
78
|
|
|
66
79
|
try {
|
|
67
80
|
let content: string;
|
|
68
|
-
|
|
69
|
-
|
|
81
|
+
let sourceUrl: string;
|
|
82
|
+
|
|
83
|
+
if (config.url?.startsWith('document-service://')) {
|
|
84
|
+
if (!config.retrievalConfig || config.retrievalConfig.provider !== 'document-service') {
|
|
85
|
+
throw new Error(
|
|
86
|
+
'Document service URL requires retrievalConfig with provider: "document-service"'
|
|
87
|
+
);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
const dsConfig = config.retrievalConfig.config as DocumentServiceConfig;
|
|
91
|
+
if (!dsConfig?.baseUrl) {
|
|
92
|
+
throw new Error('Document service config requires baseUrl');
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
const urlPath = config.url.replace('document-service://', '');
|
|
96
|
+
const parts = urlPath.split('/');
|
|
97
|
+
|
|
98
|
+
if (parts.length !== 3) {
|
|
99
|
+
throw new Error(
|
|
100
|
+
`Invalid document service URL format: ${config.url}. Expected: document-service://<user-uuid>/<app-uuid>/<doc-uuid>.ext`
|
|
101
|
+
);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
const appUuid = parts[1];
|
|
105
|
+
const docIdWithExt = parts[2];
|
|
106
|
+
const docId = docIdWithExt.split('.')[0]; // Remove extension
|
|
107
|
+
|
|
108
|
+
this.documentServiceConfig = {
|
|
109
|
+
baseUrl: dsConfig.baseUrl,
|
|
110
|
+
appName: appUuid,
|
|
111
|
+
authToken: dsConfig.authToken,
|
|
112
|
+
getAuthToken: dsConfig.getAuthToken,
|
|
113
|
+
};
|
|
114
|
+
|
|
115
|
+
content = await this.fetchFromDocumentService(docId);
|
|
116
|
+
sourceUrl = config.url;
|
|
117
|
+
} else if (config.retrievalConfig?.provider === 'document-service') {
|
|
118
|
+
const dsConfig = config.retrievalConfig.config as DocumentServiceConfig;
|
|
119
|
+
if (!dsConfig?.baseUrl || !dsConfig?.appName) {
|
|
120
|
+
throw new Error('Document service config requires baseUrl and appName');
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
this.documentServiceConfig = {
|
|
124
|
+
baseUrl: dsConfig.baseUrl,
|
|
125
|
+
appName: dsConfig.appName,
|
|
126
|
+
authToken: dsConfig.authToken,
|
|
127
|
+
getAuthToken: dsConfig.getAuthToken,
|
|
128
|
+
};
|
|
129
|
+
|
|
130
|
+
const docId = await this.lookupDocumentByTitle(config.title);
|
|
131
|
+
if (docId) {
|
|
132
|
+
content = await this.fetchFromDocumentService(docId);
|
|
133
|
+
sourceUrl = `document-service://${docId}`;
|
|
134
|
+
} else {
|
|
135
|
+
throw new Error(`Document not found by title in document service: ${config.title}`);
|
|
136
|
+
}
|
|
137
|
+
} else if (config.documentServiceId && this.documentServiceConfig) {
|
|
138
|
+
content = await this.fetchFromDocumentService(config.documentServiceId);
|
|
139
|
+
sourceUrl = `document-service://${config.documentServiceId}`;
|
|
140
|
+
} else if (config.url?.startsWith('s3://')) {
|
|
70
141
|
content = await this.fetchFromS3(config);
|
|
71
|
-
|
|
142
|
+
sourceUrl = config.url;
|
|
143
|
+
} else if (config.url?.startsWith('http://') || config.url?.startsWith('https://')) {
|
|
72
144
|
content = await this.fetchFromUrl(config.url);
|
|
73
|
-
|
|
74
|
-
|
|
145
|
+
sourceUrl = config.url;
|
|
146
|
+
} else if (config.url) {
|
|
75
147
|
content = await this.fetchFromLocal(config.url);
|
|
148
|
+
sourceUrl = config.url;
|
|
149
|
+
} else {
|
|
150
|
+
if (this.documentServiceConfig) {
|
|
151
|
+
const docId = await this.lookupDocumentByTitle(config.title);
|
|
152
|
+
if (docId) {
|
|
153
|
+
content = await this.fetchFromDocumentService(docId);
|
|
154
|
+
sourceUrl = `document-service://${docId}`;
|
|
155
|
+
} else {
|
|
156
|
+
throw new Error(`Document not found by title: ${config.title}`);
|
|
157
|
+
}
|
|
158
|
+
} else {
|
|
159
|
+
throw new Error(`No URL or document service ID provided for: ${config.title}`);
|
|
160
|
+
}
|
|
76
161
|
}
|
|
77
162
|
|
|
78
163
|
const document: FetchedDocument = {
|
|
79
164
|
title: config.title,
|
|
80
165
|
content,
|
|
81
|
-
url:
|
|
82
|
-
format: this.inferFormat(
|
|
166
|
+
url: sourceUrl,
|
|
167
|
+
format: this.inferFormat(sourceUrl),
|
|
83
168
|
fetchedAt: new Date(),
|
|
84
169
|
embeddingConfig: config.embeddingConfig,
|
|
85
170
|
};
|
|
86
171
|
|
|
87
172
|
this.documentCache.set(cacheKey, document);
|
|
88
173
|
|
|
89
|
-
// Auto-create Document entity from fetched content
|
|
90
174
|
await this.createDocumentEntity(document);
|
|
91
175
|
|
|
92
176
|
return document;
|
|
@@ -94,27 +178,44 @@ class DocumentFetcherService {
|
|
|
94
178
|
logger.error('Failed to fetch document', {
|
|
95
179
|
title: config.title,
|
|
96
180
|
url: config.url,
|
|
181
|
+
documentServiceId: config.documentServiceId,
|
|
97
182
|
error: error instanceof Error ? error.message : String(error),
|
|
98
183
|
stack: error instanceof Error ? error.stack : undefined,
|
|
99
184
|
});
|
|
100
|
-
// Re-throw the error so the caller knows what happened
|
|
101
185
|
throw error;
|
|
102
186
|
}
|
|
103
187
|
}
|
|
104
188
|
|
|
105
189
|
async fetchDocumentByTitle(title: string): Promise<FetchedDocument | null> {
|
|
106
190
|
this.ensureNodeEnv();
|
|
107
|
-
// First check if we have it in cache
|
|
108
|
-
// Note: TtlCache doesn't have a way to search by prefix, so we'll fetch directly
|
|
109
191
|
|
|
110
192
|
try {
|
|
111
|
-
//
|
|
193
|
+
// First check if we have it in cache
|
|
194
|
+
const cacheKey = `${title}:lookup`;
|
|
195
|
+
const cached = this.documentCache.get(cacheKey);
|
|
196
|
+
if (cached) {
|
|
197
|
+
logger.debug('Returning cached document by title', { title });
|
|
198
|
+
return cached;
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// Try document service lookup first (if configured)
|
|
202
|
+
if (this.documentServiceConfig) {
|
|
203
|
+
const docId = await this.lookupDocumentByTitle(title);
|
|
204
|
+
if (docId) {
|
|
205
|
+
return this.fetchDocument({
|
|
206
|
+
title,
|
|
207
|
+
documentServiceId: docId,
|
|
208
|
+
});
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
// Fall back to config-based lookup
|
|
112
213
|
const doc = this.findDocumentInConfig(title);
|
|
113
214
|
if (doc) {
|
|
114
215
|
return this.fetchDocument(doc);
|
|
115
216
|
}
|
|
116
217
|
|
|
117
|
-
logger.warn('Document not found
|
|
218
|
+
logger.warn('Document not found', { title });
|
|
118
219
|
return null;
|
|
119
220
|
} catch (error) {
|
|
120
221
|
logger.error('Failed to fetch document by title', { title, error });
|
|
@@ -122,15 +223,140 @@ class DocumentFetcherService {
|
|
|
122
223
|
}
|
|
123
224
|
}
|
|
124
225
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
226
|
+
// Fetch from secure document-service API
|
|
227
|
+
private async fetchFromDocumentService(documentId: string): Promise<string> {
|
|
228
|
+
if (!this.documentServiceConfig) {
|
|
229
|
+
throw new Error('Document service not configured');
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
try {
|
|
233
|
+
// Get token - either static from config or dynamic from function
|
|
234
|
+
let token: string;
|
|
235
|
+
if (this.documentServiceConfig.authToken) {
|
|
236
|
+
token = this.documentServiceConfig.authToken;
|
|
237
|
+
} else if (this.documentServiceConfig.getAuthToken) {
|
|
238
|
+
token = await this.documentServiceConfig.getAuthToken();
|
|
239
|
+
} else {
|
|
240
|
+
throw new Error('Document service requires authToken or getAuthToken');
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
const url = `${this.documentServiceConfig.baseUrl}/api/documents/${documentId}/content`;
|
|
244
|
+
|
|
245
|
+
logger.debug('Fetching from document service', { documentId, url });
|
|
246
|
+
|
|
247
|
+
const response = await fetch(url, {
|
|
248
|
+
headers: {
|
|
249
|
+
Authorization: `Bearer ${token}`,
|
|
250
|
+
'x-app-name': this.documentServiceConfig.appName,
|
|
251
|
+
Accept: 'application/json',
|
|
252
|
+
},
|
|
253
|
+
});
|
|
254
|
+
|
|
255
|
+
if (!response.ok) {
|
|
256
|
+
if (response.status === 404) {
|
|
257
|
+
throw new Error(`Document not found: ${documentId}`);
|
|
258
|
+
} else if (response.status === 403) {
|
|
259
|
+
throw new Error(`Access denied to document: ${documentId}`);
|
|
260
|
+
} else {
|
|
261
|
+
throw new Error(`Document service error: ${response.status} ${response.statusText}`);
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
const data = await response.json();
|
|
266
|
+
|
|
267
|
+
if (data.isBase64) {
|
|
268
|
+
if (data.mimeType?.includes('pdf') || data.format?.toLowerCase() === 'pdf') {
|
|
269
|
+
try {
|
|
270
|
+
const { parsePdfBuffer } = await import('../docs.js');
|
|
271
|
+
const buffer = Buffer.from(data.content, 'base64');
|
|
272
|
+
const text = await parsePdfBuffer(new Uint8Array(buffer));
|
|
273
|
+
logger.debug('Extracted text from PDF', { documentId, textLength: text.length });
|
|
274
|
+
return text;
|
|
275
|
+
} catch (pdfError: any) {
|
|
276
|
+
logger.error('Failed to parse PDF from document service', {
|
|
277
|
+
documentId,
|
|
278
|
+
error: pdfError.message,
|
|
279
|
+
});
|
|
280
|
+
throw new Error(`Failed to extract text from PDF: ${pdfError.message}`);
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
return Buffer.from(data.content, 'base64').toString('utf-8');
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
if (data.format?.toLowerCase() === 'md' || data.format?.toLowerCase() === 'markdown') {
|
|
287
|
+
try {
|
|
288
|
+
const parsedText = this.parseMarkdownText(data.content);
|
|
289
|
+
logger.debug('Parsed markdown content', { documentId, textLength: parsedText.length });
|
|
290
|
+
return parsedText;
|
|
291
|
+
} catch (mdError: any) {
|
|
292
|
+
logger.warn('Markdown parsing failed, returning raw content', {
|
|
293
|
+
documentId,
|
|
294
|
+
error: mdError.message,
|
|
295
|
+
});
|
|
296
|
+
return data.content;
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
return data.content;
|
|
301
|
+
} catch (error) {
|
|
302
|
+
logger.error('Document service fetch failed', {
|
|
303
|
+
documentId,
|
|
304
|
+
error: error instanceof Error ? error.message : String(error),
|
|
305
|
+
});
|
|
306
|
+
throw error;
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
private async lookupDocumentByTitle(title: string): Promise<string | null> {
|
|
311
|
+
if (!this.documentServiceConfig) {
|
|
312
|
+
return null;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
try {
|
|
316
|
+
let token: string;
|
|
317
|
+
if (this.documentServiceConfig.authToken) {
|
|
318
|
+
token = this.documentServiceConfig.authToken;
|
|
319
|
+
} else if (this.documentServiceConfig.getAuthToken) {
|
|
320
|
+
token = await this.documentServiceConfig.getAuthToken();
|
|
321
|
+
} else {
|
|
322
|
+
throw new Error('Document service requires authToken or getAuthToken');
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
const url = `${this.documentServiceConfig.baseUrl}/api/documents/lookup/by-title?title=${encodeURIComponent(title)}`;
|
|
326
|
+
|
|
327
|
+
logger.debug('Looking up document by title', { title, url });
|
|
328
|
+
|
|
329
|
+
const response = await fetch(url, {
|
|
330
|
+
headers: {
|
|
331
|
+
Authorization: `Bearer ${token}`,
|
|
332
|
+
'x-app-name': this.documentServiceConfig.appName,
|
|
333
|
+
Accept: 'application/json',
|
|
334
|
+
},
|
|
335
|
+
});
|
|
336
|
+
|
|
337
|
+
if (response.status === 404) {
|
|
338
|
+
logger.debug('Document not found by title', { title });
|
|
339
|
+
return null;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
if (!response.ok) {
|
|
343
|
+
throw new Error(`Document service lookup error: ${response.status}`);
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
const data = await response.json();
|
|
347
|
+
logger.debug('Found document by title', { title, documentId: data.documentId });
|
|
348
|
+
return data.documentId;
|
|
349
|
+
} catch (error) {
|
|
350
|
+
logger.error('Document lookup failed', {
|
|
351
|
+
title,
|
|
352
|
+
error: error instanceof Error ? error.message : String(error),
|
|
353
|
+
});
|
|
354
|
+
return null;
|
|
355
|
+
}
|
|
130
356
|
}
|
|
131
357
|
|
|
132
358
|
private async fetchFromS3(config: DocumentConfig): Promise<string> {
|
|
133
|
-
const s3Config = this.parseS3Url(config.url
|
|
359
|
+
const s3Config = this.parseS3Url(config.url!, config.retrievalConfig);
|
|
134
360
|
const client = await this.getOrCreateS3Client(s3Config);
|
|
135
361
|
|
|
136
362
|
try {
|
|
@@ -199,29 +425,39 @@ class DocumentFetcherService {
|
|
|
199
425
|
const isMarkdown =
|
|
200
426
|
contentType.includes('text/markdown') ||
|
|
201
427
|
lowerUrl.endsWith('.md') ||
|
|
202
|
-
lowerUrl.endsWith('.markdown')
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
428
|
+
lowerUrl.endsWith('.markdown');
|
|
429
|
+
|
|
430
|
+
if (isMarkdown) {
|
|
431
|
+
return this.parseMarkdownText(Buffer.from(body).toString('utf-8'));
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
return Buffer.from(body).toString('utf-8');
|
|
206
435
|
} catch (error) {
|
|
207
|
-
logger.error('URL fetch failed', {
|
|
208
|
-
|
|
436
|
+
logger.error('URL fetch failed', {
|
|
437
|
+
url,
|
|
438
|
+
error: error instanceof Error ? error.message : String(error),
|
|
439
|
+
});
|
|
440
|
+
throw error;
|
|
209
441
|
}
|
|
210
442
|
}
|
|
211
443
|
|
|
212
444
|
private async fetchFromLocal(filePath: string): Promise<string> {
|
|
213
445
|
try {
|
|
214
|
-
const
|
|
215
|
-
const
|
|
216
|
-
const
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
446
|
+
const content = await readFile(filePath, 'utf-8');
|
|
447
|
+
const lowerPath = filePath.toLowerCase();
|
|
448
|
+
const isMarkdown = lowerPath.endsWith('.md') || lowerPath.endsWith('.markdown');
|
|
449
|
+
|
|
450
|
+
if (isMarkdown) {
|
|
451
|
+
return this.parseMarkdownText(content);
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
return content;
|
|
222
455
|
} catch (error) {
|
|
223
|
-
logger.error('Local file read failed', {
|
|
224
|
-
|
|
456
|
+
logger.error('Local file read failed', {
|
|
457
|
+
path: filePath,
|
|
458
|
+
error: error instanceof Error ? error.message : String(error),
|
|
459
|
+
});
|
|
460
|
+
throw error;
|
|
225
461
|
}
|
|
226
462
|
}
|
|
227
463
|
|
|
@@ -238,15 +474,11 @@ class DocumentFetcherService {
|
|
|
238
474
|
forcePathStyle?: boolean;
|
|
239
475
|
} {
|
|
240
476
|
// Parse s3://bucket/key format
|
|
241
|
-
|
|
242
|
-
throw new Error('Invalid S3 URL format. Expected: s3://bucket/key');
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
const withoutProtocol = url.slice(5);
|
|
477
|
+
const withoutProtocol = url.replace('s3://', '');
|
|
246
478
|
const firstSlash = withoutProtocol.indexOf('/');
|
|
247
479
|
|
|
248
480
|
if (firstSlash === -1) {
|
|
249
|
-
throw new Error(
|
|
481
|
+
throw new Error(`Invalid S3 URL format: ${url}`);
|
|
250
482
|
}
|
|
251
483
|
|
|
252
484
|
const bucket = withoutProtocol.slice(0, firstSlash);
|
|
@@ -271,6 +503,17 @@ class DocumentFetcherService {
|
|
|
271
503
|
};
|
|
272
504
|
}
|
|
273
505
|
|
|
506
|
+
private normalizeRetrievalConfig(config?: RetrievalConfig): RetrievalConfig | undefined {
|
|
507
|
+
if (!config) {
|
|
508
|
+
return undefined;
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
// Handle nested config structure from Agentlang
|
|
512
|
+
const normalizedConfig = preprocessRawConfig(config) as RetrievalConfig;
|
|
513
|
+
|
|
514
|
+
return normalizedConfig;
|
|
515
|
+
}
|
|
516
|
+
|
|
274
517
|
private async getOrCreateS3Client(config: {
|
|
275
518
|
region: string;
|
|
276
519
|
endpoint?: string;
|
|
@@ -300,6 +543,65 @@ class DocumentFetcherService {
|
|
|
300
543
|
return this.s3Clients.get(clientKey)!;
|
|
301
544
|
}
|
|
302
545
|
|
|
546
|
+
private async parsePdfBuffer(buffer: Buffer): Promise<string> {
|
|
547
|
+
// Lazy load PDF parser
|
|
548
|
+
if (!this.pdfParser) {
|
|
549
|
+
try {
|
|
550
|
+
const pdfParse = await import('pdf-parse');
|
|
551
|
+
// Handle both ESM and CSM module formats
|
|
552
|
+
const parser = (pdfParse as any).default || pdfParse;
|
|
553
|
+
this.pdfParser = parser;
|
|
554
|
+
} catch (error) {
|
|
555
|
+
logger.error('Failed to load PDF parser', { error });
|
|
556
|
+
throw new Error(
|
|
557
|
+
'PDF parsing not available. Please install pdf-parse: npm install pdf-parse'
|
|
558
|
+
);
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
try {
|
|
563
|
+
const result = await this.pdfParser(buffer);
|
|
564
|
+
return result.text || '';
|
|
565
|
+
} catch (error) {
|
|
566
|
+
logger.error('PDF parsing failed', { error });
|
|
567
|
+
throw new Error(`Failed to parse PDF: ${error}`);
|
|
568
|
+
}
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
private parseMarkdownText(text: string): string {
|
|
572
|
+
// Convert markdown to plain text for embedding
|
|
573
|
+
// This removes formatting but preserves content structure
|
|
574
|
+
try {
|
|
575
|
+
const html = marked.parse(text) as string;
|
|
576
|
+
// Simple HTML to text conversion
|
|
577
|
+
return html
|
|
578
|
+
.replace(/<[^>]+>/g, ' ') // Remove HTML tags
|
|
579
|
+
.replace(/\s+/g, ' ') // Normalize whitespace
|
|
580
|
+
.replace(/&lt;/g, '<')
|
|
581
|
+
.replace(/&gt;/g, '>')
|
|
582
|
+
.replace(/&amp;/g, '&')
|
|
583
|
+
.replace(/&quot;/g, '"')
|
|
584
|
+
.trim();
|
|
585
|
+
} catch (error) {
|
|
586
|
+
logger.warn('Markdown parsing failed, returning raw text', { error });
|
|
587
|
+
return text;
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
private async readS3BodyToBuffer(body: any): Promise<Buffer> {
|
|
592
|
+
if (body.transformToByteArray) {
|
|
593
|
+
const data = await body.transformToByteArray();
|
|
594
|
+
return Buffer.from(data);
|
|
595
|
+
}
|
|
596
|
+
|
|
597
|
+
// Fallback for Readable streams
|
|
598
|
+
const chunks: Buffer[] = [];
|
|
599
|
+
for await (const chunk of body) {
|
|
600
|
+
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
|
601
|
+
}
|
|
602
|
+
return Buffer.concat(chunks);
|
|
603
|
+
}
|
|
604
|
+
|
|
303
605
|
private async createDocumentEntity(document: FetchedDocument): Promise<void> {
|
|
304
606
|
try {
|
|
305
607
|
// Build the Document entity attributes
|
|
@@ -339,6 +641,10 @@ class DocumentFetcherService {
|
|
|
339
641
|
}
|
|
340
642
|
|
|
341
643
|
private inferFormat(url: string): string {
|
|
644
|
+
// Handle document-service URLs
|
|
645
|
+
if (url.startsWith('document-service://')) {
|
|
646
|
+
return 'txt';
|
|
647
|
+
}
|
|
342
648
|
const parts = url.split('.');
|
|
343
649
|
if (parts.length > 1) {
|
|
344
650
|
return parts[parts.length - 1].toLowerCase();
|
|
@@ -346,43 +652,11 @@ class DocumentFetcherService {
|
|
|
346
652
|
return 'txt';
|
|
347
653
|
}
|
|
348
654
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
this.documentCache.clear();
|
|
355
|
-
}
|
|
356
|
-
}
|
|
357
|
-
|
|
358
|
-
private normalizeConfigValue(value: any): any {
|
|
359
|
-
if (value instanceof Map) {
|
|
360
|
-
const obj: Record<string, any> = {};
|
|
361
|
-
value.forEach((v, k) => {
|
|
362
|
-
obj[k] = this.normalizeConfigValue(v);
|
|
363
|
-
});
|
|
364
|
-
return obj;
|
|
365
|
-
}
|
|
366
|
-
if (Array.isArray(value)) {
|
|
367
|
-
return value.map(v => this.normalizeConfigValue(v));
|
|
368
|
-
}
|
|
369
|
-
if (value && typeof value === 'object') {
|
|
370
|
-
const obj: Record<string, any> = {};
|
|
371
|
-
Object.entries(value).forEach(([k, v]) => {
|
|
372
|
-
obj[k] = this.normalizeConfigValue(v);
|
|
373
|
-
});
|
|
374
|
-
return obj;
|
|
375
|
-
}
|
|
376
|
-
return value;
|
|
377
|
-
}
|
|
378
|
-
|
|
379
|
-
private normalizeRetrievalConfig(retrievalConfig?: RetrievalConfig): RetrievalConfig | undefined {
|
|
380
|
-
if (!retrievalConfig) return undefined;
|
|
381
|
-
const normalized = this.normalizeConfigValue(retrievalConfig);
|
|
382
|
-
if (normalized && typeof normalized === 'object') {
|
|
383
|
-
preprocessRawConfig(normalized);
|
|
384
|
-
}
|
|
385
|
-
return normalized as RetrievalConfig;
|
|
655
|
+
private findDocumentInConfig(title: string): DocumentConfig | null {
|
|
656
|
+
// This method should be called during config loading
|
|
657
|
+
// The documents are stored when the config is parsed
|
|
658
|
+
const docs = getConfiguredDocuments();
|
|
659
|
+
return docs.find(d => d.title === title) || null;
|
|
386
660
|
}
|
|
387
661
|
|
|
388
662
|
private ensureNodeEnv(): void {
|
|
@@ -391,78 +665,27 @@ class DocumentFetcherService {
|
|
|
391
665
|
}
|
|
392
666
|
}
|
|
393
667
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
return Buffer.from(bytes);
|
|
398
|
-
}
|
|
399
|
-
if (body.transformToString) {
|
|
400
|
-
const text = await body.transformToString('utf-8');
|
|
401
|
-
return Buffer.from(text, 'utf-8');
|
|
402
|
-
}
|
|
403
|
-
const chunks: Buffer[] = [];
|
|
404
|
-
for await (const chunk of body) {
|
|
405
|
-
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
|
406
|
-
}
|
|
407
|
-
return Buffer.concat(chunks);
|
|
408
|
-
}
|
|
409
|
-
|
|
410
|
-
private async getPdfParser(): Promise<any> {
|
|
411
|
-
if (!this.pdfParser) {
|
|
412
|
-
const pdfModule: any = await import('pdf-parse');
|
|
413
|
-
this.pdfParser = pdfModule.PDFParse || pdfModule.default;
|
|
414
|
-
}
|
|
415
|
-
return this.pdfParser;
|
|
416
|
-
}
|
|
417
|
-
|
|
418
|
-
private async parsePdfBuffer(buffer: Buffer): Promise<string> {
|
|
419
|
-
try {
|
|
420
|
-
const PDFParseClass = await this.getPdfParser();
|
|
421
|
-
const parser = new PDFParseClass({
|
|
422
|
-
data: buffer,
|
|
423
|
-
verbosity: 0,
|
|
424
|
-
});
|
|
425
|
-
const data = await parser.getText();
|
|
426
|
-
return data.text;
|
|
427
|
-
} catch (error: any) {
|
|
428
|
-
logger.error(`Failed to parse PDF: ${error.message}`);
|
|
429
|
-
throw new Error(`PDF parsing failed: ${error.message}`);
|
|
430
|
-
}
|
|
431
|
-
}
|
|
432
|
-
|
|
433
|
-
private parseMarkdownText(markdown: string): string {
|
|
434
|
-
const html = marked.parse(markdown);
|
|
435
|
-
if (typeof html !== 'string') {
|
|
436
|
-
return markdown;
|
|
437
|
-
}
|
|
438
|
-
return html
|
|
439
|
-
.replace(/<\s*br\s*\/?>/gi, '\n')
|
|
440
|
-
.replace(/<\/(p|li|h[1-6]|blockquote|pre|tr|table)>/gi, '\n')
|
|
441
|
-
.replace(/<[^>]+>/g, '')
|
|
442
|
-
.replace(/\n{3,}/g, '\n\n')
|
|
443
|
-
.trim();
|
|
668
|
+
clearCache(): void {
|
|
669
|
+
// Clear all cache
|
|
670
|
+
this.documentCache.clear();
|
|
444
671
|
}
|
|
445
672
|
}
|
|
446
673
|
|
|
447
|
-
//
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
export function registerConfiguredDocument(doc: DocumentConfig): void {
|
|
451
|
-
// Check if already registered
|
|
452
|
-
const existing = configuredDocuments.find(d => d.title === doc.title);
|
|
453
|
-
if (!existing) {
|
|
454
|
-
configuredDocuments.push(doc);
|
|
455
|
-
logger.debug('Registered configured document', { title: doc.title, url: doc.url });
|
|
456
|
-
}
|
|
457
|
-
}
|
|
674
|
+
// Singleton instance
|
|
675
|
+
const documentFetcher = new DocumentFetcherService();
|
|
458
676
|
|
|
459
|
-
|
|
460
|
-
|
|
677
|
+
// Helper function to get configured documents from module config
|
|
678
|
+
function getConfiguredDocuments(): DocumentConfig[] {
|
|
679
|
+
// This should be populated during config parsing
|
|
680
|
+
// For now, return empty array - actual implementation depends on how
|
|
681
|
+
// the config system stores document definitions
|
|
682
|
+
return (global as any).__configuredDocuments || [];
|
|
461
683
|
}
|
|
462
684
|
|
|
463
|
-
|
|
464
|
-
|
|
685
|
+
// Export for use in config loading
|
|
686
|
+
export function setConfiguredDocuments(docs: DocumentConfig[]): void {
|
|
687
|
+
(global as any).__configuredDocuments = docs;
|
|
465
688
|
}
|
|
466
689
|
|
|
467
|
-
export
|
|
690
|
+
export { documentFetcher };
|
|
468
691
|
export default documentFetcher;
|