agentlang 0.9.10 → 0.9.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/extension/main.cjs +38 -38
- package/out/extension/main.cjs.map +2 -2
- package/out/language/generated/ast.d.ts +1 -1
- package/out/language/generated/ast.js +1 -1
- package/out/language/generated/grammar.d.ts +1 -1
- package/out/language/generated/grammar.js +1 -1
- package/out/language/generated/module.d.ts +1 -1
- package/out/language/generated/module.js +1 -1
- package/out/language/main.cjs +850 -2388
- package/out/language/main.cjs.map +4 -4
- package/out/runtime/agents/common.d.ts +3 -1
- package/out/runtime/agents/common.d.ts.map +1 -1
- package/out/runtime/agents/common.js +35 -31
- package/out/runtime/agents/common.js.map +1 -1
- package/out/runtime/docs.d.ts +1 -0
- package/out/runtime/docs.d.ts.map +1 -1
- package/out/runtime/docs.js +16 -1
- package/out/runtime/docs.js.map +1 -1
- package/out/runtime/interpreter.d.ts +1 -0
- package/out/runtime/interpreter.d.ts.map +1 -1
- package/out/runtime/interpreter.js +41 -8
- package/out/runtime/interpreter.js.map +1 -1
- package/out/runtime/jsmodules.d.ts +2 -1
- package/out/runtime/jsmodules.d.ts.map +1 -1
- package/out/runtime/jsmodules.js +2 -1
- package/out/runtime/jsmodules.js.map +1 -1
- package/out/runtime/loader.d.ts.map +1 -1
- package/out/runtime/loader.js +3 -2
- package/out/runtime/loader.js.map +1 -1
- package/out/runtime/module.d.ts +1 -0
- package/out/runtime/module.d.ts.map +1 -1
- package/out/runtime/module.js +3 -0
- package/out/runtime/module.js.map +1 -1
- package/out/runtime/modules/ai.d.ts +11 -0
- package/out/runtime/modules/ai.d.ts.map +1 -1
- package/out/runtime/modules/ai.js +163 -10
- package/out/runtime/modules/ai.js.map +1 -1
- package/out/runtime/modules/core.d.ts.map +1 -1
- package/out/runtime/modules/core.js +7 -1
- package/out/runtime/modules/core.js.map +1 -1
- package/out/runtime/services/documentFetcher.d.ts +22 -14
- package/out/runtime/services/documentFetcher.d.ts.map +1 -1
- package/out/runtime/services/documentFetcher.js +348 -153
- package/out/runtime/services/documentFetcher.js.map +1 -1
- package/package.json +1 -1
- package/src/language/generated/ast.ts +1 -1
- package/src/language/generated/grammar.ts +1 -1
- package/src/language/generated/module.ts +1 -1
- package/src/runtime/agents/common.ts +37 -31
- package/src/runtime/docs.ts +17 -1
- package/src/runtime/interpreter.ts +44 -6
- package/src/runtime/jsmodules.ts +3 -1
- package/src/runtime/loader.ts +3 -2
- package/src/runtime/module.ts +4 -0
- package/src/runtime/modules/ai.ts +194 -9
- package/src/runtime/modules/core.ts +7 -1
- package/src/runtime/services/documentFetcher.ts +372 -149
|
@@ -5,9 +5,8 @@ var __asyncValues = (this && this.__asyncValues) || function (o) {
|
|
|
5
5
|
function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
|
|
6
6
|
function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
|
|
7
7
|
};
|
|
8
|
-
import {
|
|
8
|
+
import { GetObjectCommand, S3Client } from '@aws-sdk/client-s3';
|
|
9
9
|
import { readFile } from 'node:fs/promises';
|
|
10
|
-
import path from 'node:path';
|
|
11
10
|
import { logger } from '../logger.js';
|
|
12
11
|
import { parseAndEvaluateStatement } from '../interpreter.js';
|
|
13
12
|
import { CoreAIModuleName } from '../modules/ai.js';
|
|
@@ -21,9 +20,14 @@ class DocumentFetcherService {
|
|
|
21
20
|
this.s3Clients = new Map();
|
|
22
21
|
this.pdfParser = null;
|
|
23
22
|
}
|
|
23
|
+
configureDocumentService(config) {
|
|
24
|
+
this.documentServiceConfig = config;
|
|
25
|
+
logger.info('Document service configured', { baseUrl: config.baseUrl });
|
|
26
|
+
}
|
|
24
27
|
async fetchDocument(config) {
|
|
28
|
+
var _a, _b, _c, _d, _e;
|
|
25
29
|
this.ensureNodeEnv();
|
|
26
|
-
const cacheKey = `${config.title}:${config.url}`;
|
|
30
|
+
const cacheKey = `${config.title}:${config.url || config.documentServiceId}`;
|
|
27
31
|
const cached = this.documentCache.get(cacheKey);
|
|
28
32
|
if (cached) {
|
|
29
33
|
logger.debug('Returning cached document', { title: config.title });
|
|
@@ -31,26 +35,92 @@ class DocumentFetcherService {
|
|
|
31
35
|
}
|
|
32
36
|
try {
|
|
33
37
|
let content;
|
|
34
|
-
|
|
38
|
+
let sourceUrl;
|
|
39
|
+
if ((_a = config.url) === null || _a === void 0 ? void 0 : _a.startsWith('document-service://')) {
|
|
40
|
+
if (!config.retrievalConfig || config.retrievalConfig.provider !== 'document-service') {
|
|
41
|
+
throw new Error('Document service URL requires retrievalConfig with provider: "document-service"');
|
|
42
|
+
}
|
|
43
|
+
const dsConfig = config.retrievalConfig.config;
|
|
44
|
+
if (!(dsConfig === null || dsConfig === void 0 ? void 0 : dsConfig.baseUrl)) {
|
|
45
|
+
throw new Error('Document service config requires baseUrl');
|
|
46
|
+
}
|
|
47
|
+
const urlPath = config.url.replace('document-service://', '');
|
|
48
|
+
const parts = urlPath.split('/');
|
|
49
|
+
if (parts.length !== 3) {
|
|
50
|
+
throw new Error(`Invalid document service URL format: ${config.url}. Expected: document-service://<user-uuid>/<app-uuid>/<doc-uuid>.ext`);
|
|
51
|
+
}
|
|
52
|
+
const appUuid = parts[1];
|
|
53
|
+
const docIdWithExt = parts[2];
|
|
54
|
+
const docId = docIdWithExt.split('.')[0]; // Remove extension
|
|
55
|
+
this.documentServiceConfig = {
|
|
56
|
+
baseUrl: dsConfig.baseUrl,
|
|
57
|
+
appName: appUuid,
|
|
58
|
+
authToken: dsConfig.authToken,
|
|
59
|
+
getAuthToken: dsConfig.getAuthToken,
|
|
60
|
+
};
|
|
61
|
+
content = await this.fetchFromDocumentService(docId);
|
|
62
|
+
sourceUrl = config.url;
|
|
63
|
+
}
|
|
64
|
+
else if (((_b = config.retrievalConfig) === null || _b === void 0 ? void 0 : _b.provider) === 'document-service') {
|
|
65
|
+
const dsConfig = config.retrievalConfig.config;
|
|
66
|
+
if (!(dsConfig === null || dsConfig === void 0 ? void 0 : dsConfig.baseUrl) || !(dsConfig === null || dsConfig === void 0 ? void 0 : dsConfig.appName)) {
|
|
67
|
+
throw new Error('Document service config requires baseUrl and appName');
|
|
68
|
+
}
|
|
69
|
+
this.documentServiceConfig = {
|
|
70
|
+
baseUrl: dsConfig.baseUrl,
|
|
71
|
+
appName: dsConfig.appName,
|
|
72
|
+
authToken: dsConfig.authToken,
|
|
73
|
+
getAuthToken: dsConfig.getAuthToken,
|
|
74
|
+
};
|
|
75
|
+
const docId = await this.lookupDocumentByTitle(config.title);
|
|
76
|
+
if (docId) {
|
|
77
|
+
content = await this.fetchFromDocumentService(docId);
|
|
78
|
+
sourceUrl = `document-service://${docId}`;
|
|
79
|
+
}
|
|
80
|
+
else {
|
|
81
|
+
throw new Error(`Document not found by title in document service: ${config.title}`);
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
else if (config.documentServiceId && this.documentServiceConfig) {
|
|
85
|
+
content = await this.fetchFromDocumentService(config.documentServiceId);
|
|
86
|
+
sourceUrl = `document-service://${config.documentServiceId}`;
|
|
87
|
+
}
|
|
88
|
+
else if ((_c = config.url) === null || _c === void 0 ? void 0 : _c.startsWith('s3://')) {
|
|
35
89
|
content = await this.fetchFromS3(config);
|
|
90
|
+
sourceUrl = config.url;
|
|
36
91
|
}
|
|
37
|
-
else if (config.url.startsWith('http://') || config.url.startsWith('https://')) {
|
|
92
|
+
else if (((_d = config.url) === null || _d === void 0 ? void 0 : _d.startsWith('http://')) || ((_e = config.url) === null || _e === void 0 ? void 0 : _e.startsWith('https://'))) {
|
|
38
93
|
content = await this.fetchFromUrl(config.url);
|
|
94
|
+
sourceUrl = config.url;
|
|
39
95
|
}
|
|
40
|
-
else {
|
|
41
|
-
// Local file path
|
|
96
|
+
else if (config.url) {
|
|
42
97
|
content = await this.fetchFromLocal(config.url);
|
|
98
|
+
sourceUrl = config.url;
|
|
99
|
+
}
|
|
100
|
+
else {
|
|
101
|
+
if (this.documentServiceConfig) {
|
|
102
|
+
const docId = await this.lookupDocumentByTitle(config.title);
|
|
103
|
+
if (docId) {
|
|
104
|
+
content = await this.fetchFromDocumentService(docId);
|
|
105
|
+
sourceUrl = `document-service://${docId}`;
|
|
106
|
+
}
|
|
107
|
+
else {
|
|
108
|
+
throw new Error(`Document not found by title: ${config.title}`);
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
else {
|
|
112
|
+
throw new Error(`No URL or document service ID provided for: ${config.title}`);
|
|
113
|
+
}
|
|
43
114
|
}
|
|
44
115
|
const document = {
|
|
45
116
|
title: config.title,
|
|
46
117
|
content,
|
|
47
|
-
url:
|
|
48
|
-
format: this.inferFormat(
|
|
118
|
+
url: sourceUrl,
|
|
119
|
+
format: this.inferFormat(sourceUrl),
|
|
49
120
|
fetchedAt: new Date(),
|
|
50
121
|
embeddingConfig: config.embeddingConfig,
|
|
51
122
|
};
|
|
52
123
|
this.documentCache.set(cacheKey, document);
|
|
53
|
-
// Auto-create Document entity from fetched content
|
|
54
124
|
await this.createDocumentEntity(document);
|
|
55
125
|
return document;
|
|
56
126
|
}
|
|
@@ -58,24 +128,39 @@ class DocumentFetcherService {
|
|
|
58
128
|
logger.error('Failed to fetch document', {
|
|
59
129
|
title: config.title,
|
|
60
130
|
url: config.url,
|
|
131
|
+
documentServiceId: config.documentServiceId,
|
|
61
132
|
error: error instanceof Error ? error.message : String(error),
|
|
62
133
|
stack: error instanceof Error ? error.stack : undefined,
|
|
63
134
|
});
|
|
64
|
-
// Re-throw the error so the caller knows what happened
|
|
65
135
|
throw error;
|
|
66
136
|
}
|
|
67
137
|
}
|
|
68
138
|
async fetchDocumentByTitle(title) {
|
|
69
139
|
this.ensureNodeEnv();
|
|
70
|
-
// First check if we have it in cache
|
|
71
|
-
// Note: TtlCache doesn't have a way to search by prefix, so we'll fetch directly
|
|
72
140
|
try {
|
|
73
|
-
//
|
|
141
|
+
// First check if we have it in cache
|
|
142
|
+
const cacheKey = `${title}:lookup`;
|
|
143
|
+
const cached = this.documentCache.get(cacheKey);
|
|
144
|
+
if (cached) {
|
|
145
|
+
logger.debug('Returning cached document by title', { title });
|
|
146
|
+
return cached;
|
|
147
|
+
}
|
|
148
|
+
// Try document service lookup first (if configured)
|
|
149
|
+
if (this.documentServiceConfig) {
|
|
150
|
+
const docId = await this.lookupDocumentByTitle(title);
|
|
151
|
+
if (docId) {
|
|
152
|
+
return this.fetchDocument({
|
|
153
|
+
title,
|
|
154
|
+
documentServiceId: docId,
|
|
155
|
+
});
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
// Fall back to config-based lookup
|
|
74
159
|
const doc = this.findDocumentInConfig(title);
|
|
75
160
|
if (doc) {
|
|
76
161
|
return this.fetchDocument(doc);
|
|
77
162
|
}
|
|
78
|
-
logger.warn('Document not found
|
|
163
|
+
logger.warn('Document not found', { title });
|
|
79
164
|
return null;
|
|
80
165
|
}
|
|
81
166
|
catch (error) {
|
|
@@ -83,11 +168,130 @@ class DocumentFetcherService {
|
|
|
83
168
|
return null;
|
|
84
169
|
}
|
|
85
170
|
}
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
171
|
+
// Fetch from secure document-service API
|
|
172
|
+
async fetchFromDocumentService(documentId) {
|
|
173
|
+
var _a, _b, _c, _d;
|
|
174
|
+
if (!this.documentServiceConfig) {
|
|
175
|
+
throw new Error('Document service not configured');
|
|
176
|
+
}
|
|
177
|
+
try {
|
|
178
|
+
// Get token - either static from config or dynamic from function
|
|
179
|
+
let token;
|
|
180
|
+
if (this.documentServiceConfig.authToken) {
|
|
181
|
+
token = this.documentServiceConfig.authToken;
|
|
182
|
+
}
|
|
183
|
+
else if (this.documentServiceConfig.getAuthToken) {
|
|
184
|
+
token = await this.documentServiceConfig.getAuthToken();
|
|
185
|
+
}
|
|
186
|
+
else {
|
|
187
|
+
throw new Error('Document service requires authToken or getAuthToken');
|
|
188
|
+
}
|
|
189
|
+
const url = `${this.documentServiceConfig.baseUrl}/api/documents/${documentId}/content`;
|
|
190
|
+
logger.debug('Fetching from document service', { documentId, url });
|
|
191
|
+
const response = await fetch(url, {
|
|
192
|
+
headers: {
|
|
193
|
+
Authorization: `Bearer ${token}`,
|
|
194
|
+
'x-app-name': this.documentServiceConfig.appName,
|
|
195
|
+
Accept: 'application/json',
|
|
196
|
+
},
|
|
197
|
+
});
|
|
198
|
+
if (!response.ok) {
|
|
199
|
+
if (response.status === 404) {
|
|
200
|
+
throw new Error(`Document not found: ${documentId}`);
|
|
201
|
+
}
|
|
202
|
+
else if (response.status === 403) {
|
|
203
|
+
throw new Error(`Access denied to document: ${documentId}`);
|
|
204
|
+
}
|
|
205
|
+
else {
|
|
206
|
+
throw new Error(`Document service error: ${response.status} ${response.statusText}`);
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
const data = await response.json();
|
|
210
|
+
if (data.isBase64) {
|
|
211
|
+
if (((_a = data.mimeType) === null || _a === void 0 ? void 0 : _a.includes('pdf')) || ((_b = data.format) === null || _b === void 0 ? void 0 : _b.toLowerCase()) === 'pdf') {
|
|
212
|
+
try {
|
|
213
|
+
const { parsePdfBuffer } = await import('../docs.js');
|
|
214
|
+
const buffer = Buffer.from(data.content, 'base64');
|
|
215
|
+
const text = await parsePdfBuffer(new Uint8Array(buffer));
|
|
216
|
+
logger.debug('Extracted text from PDF', { documentId, textLength: text.length });
|
|
217
|
+
return text;
|
|
218
|
+
}
|
|
219
|
+
catch (pdfError) {
|
|
220
|
+
logger.error('Failed to parse PDF from document service', {
|
|
221
|
+
documentId,
|
|
222
|
+
error: pdfError.message,
|
|
223
|
+
});
|
|
224
|
+
throw new Error(`Failed to extract text from PDF: ${pdfError.message}`);
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
return Buffer.from(data.content, 'base64').toString('utf-8');
|
|
228
|
+
}
|
|
229
|
+
if (((_c = data.format) === null || _c === void 0 ? void 0 : _c.toLowerCase()) === 'md' || ((_d = data.format) === null || _d === void 0 ? void 0 : _d.toLowerCase()) === 'markdown') {
|
|
230
|
+
try {
|
|
231
|
+
const parsedText = this.parseMarkdownText(data.content);
|
|
232
|
+
logger.debug('Parsed markdown content', { documentId, textLength: parsedText.length });
|
|
233
|
+
return parsedText;
|
|
234
|
+
}
|
|
235
|
+
catch (mdError) {
|
|
236
|
+
logger.warn('Markdown parsing failed, returning raw content', {
|
|
237
|
+
documentId,
|
|
238
|
+
error: mdError.message,
|
|
239
|
+
});
|
|
240
|
+
return data.content;
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
return data.content;
|
|
244
|
+
}
|
|
245
|
+
catch (error) {
|
|
246
|
+
logger.error('Document service fetch failed', {
|
|
247
|
+
documentId,
|
|
248
|
+
error: error instanceof Error ? error.message : String(error),
|
|
249
|
+
});
|
|
250
|
+
throw error;
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
async lookupDocumentByTitle(title) {
|
|
254
|
+
if (!this.documentServiceConfig) {
|
|
255
|
+
return null;
|
|
256
|
+
}
|
|
257
|
+
try {
|
|
258
|
+
let token;
|
|
259
|
+
if (this.documentServiceConfig.authToken) {
|
|
260
|
+
token = this.documentServiceConfig.authToken;
|
|
261
|
+
}
|
|
262
|
+
else if (this.documentServiceConfig.getAuthToken) {
|
|
263
|
+
token = await this.documentServiceConfig.getAuthToken();
|
|
264
|
+
}
|
|
265
|
+
else {
|
|
266
|
+
throw new Error('Document service requires authToken or getAuthToken');
|
|
267
|
+
}
|
|
268
|
+
const url = `${this.documentServiceConfig.baseUrl}/api/documents/lookup/by-title?title=${encodeURIComponent(title)}`;
|
|
269
|
+
logger.debug('Looking up document by title', { title, url });
|
|
270
|
+
const response = await fetch(url, {
|
|
271
|
+
headers: {
|
|
272
|
+
Authorization: `Bearer ${token}`,
|
|
273
|
+
'x-app-name': this.documentServiceConfig.appName,
|
|
274
|
+
Accept: 'application/json',
|
|
275
|
+
},
|
|
276
|
+
});
|
|
277
|
+
if (response.status === 404) {
|
|
278
|
+
logger.debug('Document not found by title', { title });
|
|
279
|
+
return null;
|
|
280
|
+
}
|
|
281
|
+
if (!response.ok) {
|
|
282
|
+
throw new Error(`Document service lookup error: ${response.status}`);
|
|
283
|
+
}
|
|
284
|
+
const data = await response.json();
|
|
285
|
+
logger.debug('Found document by title', { title, documentId: data.documentId });
|
|
286
|
+
return data.documentId;
|
|
287
|
+
}
|
|
288
|
+
catch (error) {
|
|
289
|
+
logger.error('Document lookup failed', {
|
|
290
|
+
title,
|
|
291
|
+
error: error instanceof Error ? error.message : String(error),
|
|
292
|
+
});
|
|
293
|
+
return null;
|
|
294
|
+
}
|
|
91
295
|
}
|
|
92
296
|
async fetchFromS3(config) {
|
|
93
297
|
const s3Config = this.parseS3Url(config.url, config.retrievalConfig);
|
|
@@ -148,40 +352,44 @@ class DocumentFetcherService {
|
|
|
148
352
|
const lowerUrl = url.toLowerCase();
|
|
149
353
|
const isMarkdown = contentType.includes('text/markdown') ||
|
|
150
354
|
lowerUrl.endsWith('.md') ||
|
|
151
|
-
lowerUrl.endsWith('.markdown')
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
355
|
+
lowerUrl.endsWith('.markdown');
|
|
356
|
+
if (isMarkdown) {
|
|
357
|
+
return this.parseMarkdownText(Buffer.from(body).toString('utf-8'));
|
|
358
|
+
}
|
|
359
|
+
return Buffer.from(body).toString('utf-8');
|
|
155
360
|
}
|
|
156
361
|
catch (error) {
|
|
157
|
-
logger.error('URL fetch failed', {
|
|
158
|
-
|
|
362
|
+
logger.error('URL fetch failed', {
|
|
363
|
+
url,
|
|
364
|
+
error: error instanceof Error ? error.message : String(error),
|
|
365
|
+
});
|
|
366
|
+
throw error;
|
|
159
367
|
}
|
|
160
368
|
}
|
|
161
369
|
async fetchFromLocal(filePath) {
|
|
162
370
|
try {
|
|
163
|
-
const
|
|
164
|
-
const
|
|
165
|
-
const
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
return
|
|
371
|
+
const content = await readFile(filePath, 'utf-8');
|
|
372
|
+
const lowerPath = filePath.toLowerCase();
|
|
373
|
+
const isMarkdown = lowerPath.endsWith('.md') || lowerPath.endsWith('.markdown');
|
|
374
|
+
if (isMarkdown) {
|
|
375
|
+
return this.parseMarkdownText(content);
|
|
376
|
+
}
|
|
377
|
+
return content;
|
|
170
378
|
}
|
|
171
379
|
catch (error) {
|
|
172
|
-
logger.error('Local file read failed', {
|
|
173
|
-
|
|
380
|
+
logger.error('Local file read failed', {
|
|
381
|
+
path: filePath,
|
|
382
|
+
error: error instanceof Error ? error.message : String(error),
|
|
383
|
+
});
|
|
384
|
+
throw error;
|
|
174
385
|
}
|
|
175
386
|
}
|
|
176
387
|
parseS3Url(url, retrievalConfig) {
|
|
177
388
|
// Parse s3://bucket/key format
|
|
178
|
-
|
|
179
|
-
throw new Error('Invalid S3 URL format. Expected: s3://bucket/key');
|
|
180
|
-
}
|
|
181
|
-
const withoutProtocol = url.slice(5);
|
|
389
|
+
const withoutProtocol = url.replace('s3://', '');
|
|
182
390
|
const firstSlash = withoutProtocol.indexOf('/');
|
|
183
391
|
if (firstSlash === -1) {
|
|
184
|
-
throw new Error(
|
|
392
|
+
throw new Error(`Invalid S3 URL format: ${url}`);
|
|
185
393
|
}
|
|
186
394
|
const bucket = withoutProtocol.slice(0, firstSlash);
|
|
187
395
|
const key = withoutProtocol.slice(firstSlash + 1);
|
|
@@ -201,6 +409,14 @@ class DocumentFetcherService {
|
|
|
201
409
|
forcePathStyle: s3SpecificConfig.forcePathStyle,
|
|
202
410
|
};
|
|
203
411
|
}
|
|
412
|
+
normalizeRetrievalConfig(config) {
|
|
413
|
+
if (!config) {
|
|
414
|
+
return undefined;
|
|
415
|
+
}
|
|
416
|
+
// Handle nested config structure from Agentlang
|
|
417
|
+
const normalizedConfig = preprocessRawConfig(config);
|
|
418
|
+
return normalizedConfig;
|
|
419
|
+
}
|
|
204
420
|
async getOrCreateS3Client(config) {
|
|
205
421
|
const clientKey = `${config.region}:${config.endpoint || 'default'}:${config.accessKeyId || 'default'}`;
|
|
206
422
|
if (!this.s3Clients.has(clientKey)) {
|
|
@@ -219,6 +435,74 @@ class DocumentFetcherService {
|
|
|
219
435
|
}
|
|
220
436
|
return this.s3Clients.get(clientKey);
|
|
221
437
|
}
|
|
438
|
+
async parsePdfBuffer(buffer) {
|
|
439
|
+
// Lazy load PDF parser
|
|
440
|
+
if (!this.pdfParser) {
|
|
441
|
+
try {
|
|
442
|
+
const pdfParse = await import('pdf-parse');
|
|
443
|
+
// Handle both ESM and CSM module formats
|
|
444
|
+
const parser = pdfParse.default || pdfParse;
|
|
445
|
+
this.pdfParser = parser;
|
|
446
|
+
}
|
|
447
|
+
catch (error) {
|
|
448
|
+
logger.error('Failed to load PDF parser', { error });
|
|
449
|
+
throw new Error('PDF parsing not available. Please install pdf-parse: npm install pdf-parse');
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
try {
|
|
453
|
+
const result = await this.pdfParser(buffer);
|
|
454
|
+
return result.text || '';
|
|
455
|
+
}
|
|
456
|
+
catch (error) {
|
|
457
|
+
logger.error('PDF parsing failed', { error });
|
|
458
|
+
throw new Error(`Failed to parse PDF: ${error}`);
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
parseMarkdownText(text) {
|
|
462
|
+
// Convert markdown to plain text for embedding
|
|
463
|
+
// This removes formatting but preserves content structure
|
|
464
|
+
try {
|
|
465
|
+
const html = marked.parse(text);
|
|
466
|
+
// Simple HTML to text conversion
|
|
467
|
+
return html
|
|
468
|
+
.replace(/<[^>]+>/g, ' ') // Remove HTML tags
|
|
469
|
+
.replace(/\s+/g, ' ') // Normalize whitespace
|
|
470
|
+
.replace(/</g, '<')
|
|
471
|
+
.replace(/>/g, '>')
|
|
472
|
+
.replace(/&/g, '&')
|
|
473
|
+
.replace(/"/g, '"')
|
|
474
|
+
.trim();
|
|
475
|
+
}
|
|
476
|
+
catch (error) {
|
|
477
|
+
logger.warn('Markdown parsing failed, returning raw text', { error });
|
|
478
|
+
return text;
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
async readS3BodyToBuffer(body) {
|
|
482
|
+
var _a, e_1, _b, _c;
|
|
483
|
+
if (body.transformToByteArray) {
|
|
484
|
+
const data = await body.transformToByteArray();
|
|
485
|
+
return Buffer.from(data);
|
|
486
|
+
}
|
|
487
|
+
// Fallback for Readable streams
|
|
488
|
+
const chunks = [];
|
|
489
|
+
try {
|
|
490
|
+
for (var _d = true, body_1 = __asyncValues(body), body_1_1; body_1_1 = await body_1.next(), _a = body_1_1.done, !_a; _d = true) {
|
|
491
|
+
_c = body_1_1.value;
|
|
492
|
+
_d = false;
|
|
493
|
+
const chunk = _c;
|
|
494
|
+
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
498
|
+
finally {
|
|
499
|
+
try {
|
|
500
|
+
if (!_d && !_a && (_b = body_1.return)) await _b.call(body_1);
|
|
501
|
+
}
|
|
502
|
+
finally { if (e_1) throw e_1.error; }
|
|
503
|
+
}
|
|
504
|
+
return Buffer.concat(chunks);
|
|
505
|
+
}
|
|
222
506
|
async createDocumentEntity(document) {
|
|
223
507
|
try {
|
|
224
508
|
// Build the Document entity attributes
|
|
@@ -253,135 +537,46 @@ class DocumentFetcherService {
|
|
|
253
537
|
.replace(/\t/g, '\\t');
|
|
254
538
|
}
|
|
255
539
|
inferFormat(url) {
|
|
540
|
+
// Handle document-service URLs
|
|
541
|
+
if (url.startsWith('document-service://')) {
|
|
542
|
+
return 'txt';
|
|
543
|
+
}
|
|
256
544
|
const parts = url.split('.');
|
|
257
545
|
if (parts.length > 1) {
|
|
258
546
|
return parts[parts.length - 1].toLowerCase();
|
|
259
547
|
}
|
|
260
548
|
return 'txt';
|
|
261
549
|
}
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
else {
|
|
268
|
-
this.documentCache.clear();
|
|
269
|
-
}
|
|
270
|
-
}
|
|
271
|
-
normalizeConfigValue(value) {
|
|
272
|
-
if (value instanceof Map) {
|
|
273
|
-
const obj = {};
|
|
274
|
-
value.forEach((v, k) => {
|
|
275
|
-
obj[k] = this.normalizeConfigValue(v);
|
|
276
|
-
});
|
|
277
|
-
return obj;
|
|
278
|
-
}
|
|
279
|
-
if (Array.isArray(value)) {
|
|
280
|
-
return value.map(v => this.normalizeConfigValue(v));
|
|
281
|
-
}
|
|
282
|
-
if (value && typeof value === 'object') {
|
|
283
|
-
const obj = {};
|
|
284
|
-
Object.entries(value).forEach(([k, v]) => {
|
|
285
|
-
obj[k] = this.normalizeConfigValue(v);
|
|
286
|
-
});
|
|
287
|
-
return obj;
|
|
288
|
-
}
|
|
289
|
-
return value;
|
|
290
|
-
}
|
|
291
|
-
normalizeRetrievalConfig(retrievalConfig) {
|
|
292
|
-
if (!retrievalConfig)
|
|
293
|
-
return undefined;
|
|
294
|
-
const normalized = this.normalizeConfigValue(retrievalConfig);
|
|
295
|
-
if (normalized && typeof normalized === 'object') {
|
|
296
|
-
preprocessRawConfig(normalized);
|
|
297
|
-
}
|
|
298
|
-
return normalized;
|
|
550
|
+
findDocumentInConfig(title) {
|
|
551
|
+
// This method should be called during config loading
|
|
552
|
+
// The documents are stored when the config is parsed
|
|
553
|
+
const docs = getConfiguredDocuments();
|
|
554
|
+
return docs.find(d => d.title === title) || null;
|
|
299
555
|
}
|
|
300
556
|
ensureNodeEnv() {
|
|
301
557
|
if (!isNodeEnv) {
|
|
302
558
|
throw new Error('Document fetching is only available in Node.js environment');
|
|
303
559
|
}
|
|
304
560
|
}
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
const bytes = await body.transformToByteArray();
|
|
309
|
-
return Buffer.from(bytes);
|
|
310
|
-
}
|
|
311
|
-
if (body.transformToString) {
|
|
312
|
-
const text = await body.transformToString('utf-8');
|
|
313
|
-
return Buffer.from(text, 'utf-8');
|
|
314
|
-
}
|
|
315
|
-
const chunks = [];
|
|
316
|
-
try {
|
|
317
|
-
for (var _d = true, body_1 = __asyncValues(body), body_1_1; body_1_1 = await body_1.next(), _a = body_1_1.done, !_a; _d = true) {
|
|
318
|
-
_c = body_1_1.value;
|
|
319
|
-
_d = false;
|
|
320
|
-
const chunk = _c;
|
|
321
|
-
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
|
322
|
-
}
|
|
323
|
-
}
|
|
324
|
-
catch (e_1_1) { e_1 = { error: e_1_1 }; }
|
|
325
|
-
finally {
|
|
326
|
-
try {
|
|
327
|
-
if (!_d && !_a && (_b = body_1.return)) await _b.call(body_1);
|
|
328
|
-
}
|
|
329
|
-
finally { if (e_1) throw e_1.error; }
|
|
330
|
-
}
|
|
331
|
-
return Buffer.concat(chunks);
|
|
332
|
-
}
|
|
333
|
-
async getPdfParser() {
|
|
334
|
-
if (!this.pdfParser) {
|
|
335
|
-
const pdfModule = await import('pdf-parse');
|
|
336
|
-
this.pdfParser = pdfModule.PDFParse || pdfModule.default;
|
|
337
|
-
}
|
|
338
|
-
return this.pdfParser;
|
|
339
|
-
}
|
|
340
|
-
async parsePdfBuffer(buffer) {
|
|
341
|
-
try {
|
|
342
|
-
const PDFParseClass = await this.getPdfParser();
|
|
343
|
-
const parser = new PDFParseClass({
|
|
344
|
-
data: buffer,
|
|
345
|
-
verbosity: 0,
|
|
346
|
-
});
|
|
347
|
-
const data = await parser.getText();
|
|
348
|
-
return data.text;
|
|
349
|
-
}
|
|
350
|
-
catch (error) {
|
|
351
|
-
logger.error(`Failed to parse PDF: ${error.message}`);
|
|
352
|
-
throw new Error(`PDF parsing failed: ${error.message}`);
|
|
353
|
-
}
|
|
354
|
-
}
|
|
355
|
-
parseMarkdownText(markdown) {
|
|
356
|
-
const html = marked.parse(markdown);
|
|
357
|
-
if (typeof html !== 'string') {
|
|
358
|
-
return markdown;
|
|
359
|
-
}
|
|
360
|
-
return html
|
|
361
|
-
.replace(/<\s*br\s*\/?>/gi, '\n')
|
|
362
|
-
.replace(/<\/(p|li|h[1-6]|blockquote|pre|tr|table)>/gi, '\n')
|
|
363
|
-
.replace(/<[^>]+>/g, '')
|
|
364
|
-
.replace(/\n{3,}/g, '\n\n')
|
|
365
|
-
.trim();
|
|
561
|
+
clearCache() {
|
|
562
|
+
// Clear all cache
|
|
563
|
+
this.documentCache.clear();
|
|
366
564
|
}
|
|
367
565
|
}
|
|
368
566
|
DocumentFetcherService.CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
|
|
369
|
-
//
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
}
|
|
378
|
-
}
|
|
379
|
-
export function getConfiguredDocuments() {
|
|
380
|
-
return [...configuredDocuments];
|
|
567
|
+
// Singleton instance
|
|
568
|
+
const documentFetcher = new DocumentFetcherService();
|
|
569
|
+
// Helper function to get configured documents from module config
|
|
570
|
+
/**
 * Returns the document definitions stored on the global object under
 * `__configuredDocuments`.
 *
 * Uses `globalThis` rather than the Node-only `global`, so the function does
 * not throw a ReferenceError in runtimes that lack `global`.
 *
 * @returns {Array} The stored array itself (not a copy), or an empty array
 *   when nothing usable has been stored. A non-array value is treated as
 *   "nothing stored" because the caller searches the result with `.find`.
 */
function getConfiguredDocuments() {
    const stored = globalThis.__configuredDocuments;
    return Array.isArray(stored) ? stored : [];
}
|
|
382
|
-
|
|
383
|
-
|
|
576
|
+
// Export for use in config loading
|
|
577
|
+
/**
 * Stores the document definitions on the global object under
 * `__configuredDocuments`.
 *
 * Uses `globalThis` rather than the Node-only `global`, so this exported
 * function does not throw a ReferenceError in runtimes that lack `global`.
 *
 * @param {Array} docs - Document definitions; stored by reference.
 */
export function setConfiguredDocuments(docs) {
    globalThis.__configuredDocuments = docs;
}
|
|
385
|
-
export
|
|
580
|
+
export { documentFetcher };
|
|
386
581
|
export default documentFetcher;
|
|
387
582
|
//# sourceMappingURL=documentFetcher.js.map
|