@librechat/agents 2.4.83 → 2.4.85
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/llm/google/utils/common.cjs +13 -0
- package/dist/cjs/llm/google/utils/common.cjs.map +1 -1
- package/dist/cjs/main.cjs +1 -1
- package/dist/cjs/messages/format.cjs +52 -34
- package/dist/cjs/messages/format.cjs.map +1 -1
- package/dist/cjs/tools/search/firecrawl.cjs +3 -1
- package/dist/cjs/tools/search/firecrawl.cjs.map +1 -1
- package/dist/cjs/tools/search/search.cjs +5 -5
- package/dist/cjs/tools/search/search.cjs.map +1 -1
- package/dist/cjs/tools/search/serper-scraper.cjs +132 -0
- package/dist/cjs/tools/search/serper-scraper.cjs.map +1 -0
- package/dist/cjs/tools/search/tool.cjs +45 -9
- package/dist/cjs/tools/search/tool.cjs.map +1 -1
- package/dist/esm/llm/google/utils/common.mjs +13 -0
- package/dist/esm/llm/google/utils/common.mjs.map +1 -1
- package/dist/esm/main.mjs +1 -1
- package/dist/esm/messages/format.mjs +52 -34
- package/dist/esm/messages/format.mjs.map +1 -1
- package/dist/esm/tools/search/firecrawl.mjs +3 -1
- package/dist/esm/tools/search/firecrawl.mjs.map +1 -1
- package/dist/esm/tools/search/search.mjs +5 -5
- package/dist/esm/tools/search/search.mjs.map +1 -1
- package/dist/esm/tools/search/serper-scraper.mjs +129 -0
- package/dist/esm/tools/search/serper-scraper.mjs.map +1 -0
- package/dist/esm/tools/search/tool.mjs +45 -9
- package/dist/esm/tools/search/tool.mjs.map +1 -1
- package/dist/types/messages/format.d.ts +23 -20
- package/dist/types/tools/search/firecrawl.d.ts +2 -1
- package/dist/types/tools/search/search.d.ts +1 -2
- package/dist/types/tools/search/serper-scraper.d.ts +59 -0
- package/dist/types/tools/search/tool.d.ts +21 -0
- package/dist/types/tools/search/types.d.ts +30 -1
- package/package.json +1 -1
- package/src/llm/google/utils/common.ts +14 -0
- package/src/messages/format.ts +67 -39
- package/src/messages/formatMessage.test.ts +418 -2
- package/src/scripts/search.ts +5 -1
- package/src/tools/search/firecrawl.ts +5 -2
- package/src/tools/search/search.ts +6 -8
- package/src/tools/search/serper-scraper.ts +155 -0
- package/src/tools/search/tool.ts +47 -8
- package/src/tools/search/types.ts +45 -0
|
@@ -1,8 +1,43 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
1
|
+
import {
|
|
2
|
+
HumanMessage,
|
|
3
|
+
AIMessage,
|
|
4
|
+
SystemMessage,
|
|
5
|
+
} from '@langchain/core/messages';
|
|
6
|
+
import type { MessageContentComplex } from '@/types';
|
|
7
|
+
import {
|
|
8
|
+
formatMessage,
|
|
9
|
+
formatLangChainMessages,
|
|
10
|
+
formatFromLangChain,
|
|
11
|
+
formatMediaMessage,
|
|
12
|
+
} from './format';
|
|
13
|
+
import { Providers } from '@/common';
|
|
3
14
|
|
|
4
15
|
const NO_PARENT = '00000000-0000-0000-0000-000000000000';
|
|
5
16
|
|
|
17
|
+
/**
 * Type for formatted message results with media content.
 *
 * `content` is a list of content parts rather than a plain string.
 */
interface FormattedMediaMessage {
  role: string;
  content: MessageContentComplex[];
  name?: string;
}

/**
 * Type guard to check if a result is a FormattedMediaMessage.
 *
 * Verifies that the value is a non-null object whose `role` is a string and
 * whose `content` is an array. The elements of `content` and the optional
 * `name` are not inspected.
 *
 * @param result Value to test
 * @returns `true` when `result` has the structural shape described above
 */
function isFormattedMediaMessage(
  result: unknown
): result is FormattedMediaMessage {
  if (typeof result !== 'object' || result === null) {
    return false;
  }

  // Read through a loosely typed view so each field is checked at runtime
  // instead of being assumed from the target type.
  const candidate = result as { role?: unknown; content?: unknown };
  return typeof candidate.role === 'string' && Array.isArray(candidate.content);
}
|
|
40
|
+
|
|
6
41
|
describe('formatMessage', () => {
|
|
7
42
|
it('formats user message', () => {
|
|
8
43
|
const input = {
|
|
@@ -187,6 +222,387 @@ describe('formatMessage', () => {
|
|
|
187
222
|
});
|
|
188
223
|
});
|
|
189
224
|
|
|
225
|
+
// Exercises formatMediaMessage directly: text/media ordering per provider and
// pass-through of heterogeneous media parts.
describe('formatMediaMessage', () => {
  it('formats message with images for default provider', () => {
    const userMessage = {
      role: 'user',
      content: 'Check out this image',
      name: 'John',
    };
    const attachments = [
      {
        type: 'image_url',
        image_url: { url: 'https://example.com/image1.jpg' },
      },
      {
        type: 'image_url',
        image_url: { url: 'https://example.com/image2.jpg' },
      },
    ];

    const formatted = formatMediaMessage({
      message: userMessage,
      mediaParts: attachments,
    });

    expect(formatted.role).toBe('user');
    expect(formatted.name).toBe('John');
    expect(Array.isArray(formatted.content)).toBe(true);
    expect(formatted.content).toHaveLength(3);
    // Default ordering: text part first, then media in the order given.
    expect(formatted.content).toEqual([
      { type: 'text', text: 'Check out this image' },
      attachments[0],
      attachments[1],
    ]);
  });

  it('formats message with images for Anthropic (media first)', () => {
    const userMessage = {
      role: 'user',
      content: 'Check out this image',
    };
    const attachments = [
      {
        type: 'image_url',
        image_url: { url: 'https://example.com/image.jpg' },
      },
    ];

    const formatted = formatMediaMessage({
      message: userMessage,
      mediaParts: attachments,
      endpoint: Providers.ANTHROPIC,
    });

    expect(formatted.content).toHaveLength(2);
    // Anthropic ordering: media first, text last.
    expect(formatted.content).toEqual([
      attachments[0],
      { type: 'text', text: 'Check out this image' },
    ]);
  });

  it('formats message with multiple media types', () => {
    const userMessage = {
      role: 'user',
      content: 'Check out these files',
    };
    const attachments = [
      { type: 'document', document: { url: 'https://example.com/doc.pdf' } },
      { type: 'video', video: { url: 'https://example.com/video.mp4' } },
      { type: 'audio', audio: { url: 'https://example.com/audio.mp3' } },
      {
        type: 'image_url',
        image_url: { url: 'https://example.com/image.jpg' },
      },
    ];

    const formatted = formatMediaMessage({
      message: userMessage,
      mediaParts: attachments,
    });

    expect(formatted.content).toHaveLength(5);
    expect(formatted.content).toEqual([
      { type: 'text', text: 'Check out these files' },
      attachments[0],
      attachments[1],
      attachments[2],
      attachments[3],
    ]);
  });
});
|
|
311
|
+
|
|
312
|
+
// Exercises formatMessage when the incoming message carries media arrays
// (image_urls, documents, videos, audios).
describe('formatMessage with media', () => {
  /**
   * Fails the current test unless `value` passes isFormattedMediaMessage,
   * and narrows its type for the assertions that follow.
   */
  function assertMediaMessage(
    value: unknown
  ): asserts value is FormattedMediaMessage {
    expect(isFormattedMediaMessage(value)).toBe(true);
  }

  it('formats user message with image_urls (backward compatibility)', () => {
    const params = {
      message: {
        sender: 'user',
        text: 'Check out this image',
        image_urls: [
          {
            type: 'image_url' as const,
            image_url: { url: 'https://example.com/image.jpg' },
          },
        ],
      },
      userName: 'John',
    };

    const formatted = formatMessage(params);

    assertMediaMessage(formatted);
    expect(formatted.role).toBe('user');
    expect(formatted.name).toBe('John');
    expect(Array.isArray(formatted.content)).toBe(true);
    expect(formatted.content).toHaveLength(2);
    expect(formatted.content).toEqual([
      { type: 'text', text: 'Check out this image' },
      params.message.image_urls[0],
    ]);
  });

  it('formats user message with documents', () => {
    const params = {
      message: {
        role: 'user',
        content: 'Review this document',
        documents: [
          {
            type: 'document',
            document: { url: 'https://example.com/report.pdf' },
          },
        ],
      },
    };

    const formatted = formatMessage(params);

    assertMediaMessage(formatted);
    expect(formatted.role).toBe('user');
    expect(Array.isArray(formatted.content)).toBe(true);
    expect(formatted.content).toHaveLength(2);
    expect(formatted.content).toEqual([
      { type: 'text', text: 'Review this document' },
      params.message.documents[0],
    ]);
  });

  it('formats user message with videos', () => {
    const params = {
      message: {
        role: 'user',
        content: 'Watch this video',
        videos: [
          { type: 'video', video: { url: 'https://example.com/demo.mp4' } },
        ],
      },
    };

    const formatted = formatMessage(params);

    assertMediaMessage(formatted);
    expect(formatted.role).toBe('user');
    expect(Array.isArray(formatted.content)).toBe(true);
    expect(formatted.content).toHaveLength(2);
    expect(formatted.content).toEqual([
      { type: 'text', text: 'Watch this video' },
      params.message.videos[0],
    ]);
  });

  it('formats user message with audios', () => {
    const params = {
      message: {
        role: 'user',
        content: 'Listen to this',
        audios: [
          { type: 'audio', audio: { url: 'https://example.com/podcast.mp3' } },
        ],
      },
    };

    const formatted = formatMessage(params);

    assertMediaMessage(formatted);
    expect(formatted.role).toBe('user');
    expect(Array.isArray(formatted.content)).toBe(true);
    expect(formatted.content).toHaveLength(2);
    expect(formatted.content).toEqual([
      { type: 'text', text: 'Listen to this' },
      params.message.audios[0],
    ]);
  });

  it('formats user message with all media types in correct order', () => {
    const params = {
      message: {
        role: 'user',
        content: 'Check out all these files',
        documents: [
          {
            type: 'document',
            document: { url: 'https://example.com/doc.pdf' },
          },
        ],
        videos: [
          { type: 'video', video: { url: 'https://example.com/video.mp4' } },
        ],
        audios: [
          { type: 'audio', audio: { url: 'https://example.com/audio.mp3' } },
        ],
        image_urls: [
          {
            type: 'image_url' as const,
            image_url: { url: 'https://example.com/image.jpg' },
          },
        ],
      },
    };

    const formatted = formatMessage(params);

    assertMediaMessage(formatted);
    expect(formatted.role).toBe('user');
    expect(Array.isArray(formatted.content)).toBe(true);
    expect(formatted.content).toHaveLength(5);
    // Text first, then documents, videos, audios, images.
    expect(formatted.content).toEqual([
      { type: 'text', text: 'Check out all these files' },
      params.message.documents[0],
      params.message.videos[0],
      params.message.audios[0],
      params.message.image_urls[0],
    ]);
  });

  it('formats user message with multiple files of the same type', () => {
    const params = {
      message: {
        role: 'user',
        content: 'Review these documents',
        documents: [
          {
            type: 'document',
            document: { url: 'https://example.com/doc1.pdf' },
          },
          {
            type: 'document',
            document: { url: 'https://example.com/doc2.pdf' },
          },
          {
            type: 'document',
            document: { url: 'https://example.com/doc3.pdf' },
          },
        ],
      },
    };

    const formatted = formatMessage(params);

    assertMediaMessage(formatted);
    expect(formatted.content).toHaveLength(4);
    // Only the part type is pinned for the leading text entry.
    expect(formatted.content[0].type).toBe('text');
    expect(formatted.content[1]).toEqual(params.message.documents[0]);
    expect(formatted.content[2]).toEqual(params.message.documents[1]);
    expect(formatted.content[3]).toEqual(params.message.documents[2]);
  });

  it('respects Anthropic provider ordering (media before text)', () => {
    const params = {
      message: {
        role: 'user',
        content: 'Check this out',
        documents: [
          {
            type: 'document',
            document: { url: 'https://example.com/doc.pdf' },
          },
        ],
        image_urls: [
          {
            type: 'image_url' as const,
            image_url: { url: 'https://example.com/image.jpg' },
          },
        ],
      },
      endpoint: Providers.ANTHROPIC,
    };

    const formatted = formatMessage(params);

    assertMediaMessage(formatted);
    expect(formatted.content).toHaveLength(3);
    // Media first for Anthropic, text last.
    expect(formatted.content).toEqual([
      params.message.documents[0],
      params.message.image_urls[0],
      { type: 'text', text: 'Check this out' },
    ]);
  });

  it('does not format media for assistant messages', () => {
    const params = {
      message: {
        role: 'assistant',
        content: 'Here is a response',
        documents: [
          {
            type: 'document',
            document: { url: 'https://example.com/doc.pdf' },
          },
        ],
      },
    };

    const formatted = formatMessage(params);

    // Content stays a plain string for assistant messages.
    expect(formatted).toMatchObject({
      role: 'assistant',
      content: 'Here is a response',
    });
  });

  it('handles empty media arrays gracefully', () => {
    const params = {
      message: {
        role: 'user',
        content: 'Just text',
        documents: [],
        videos: [],
        audios: [],
        image_urls: [],
      },
    };

    const formatted = formatMessage(params);

    // With nothing to attach, content stays a plain string.
    expect(formatted).toMatchObject({
      role: 'user',
      content: 'Just text',
    });
  });

  it('formats media with langChain flag', () => {
    const params = {
      message: {
        role: 'user',
        content: 'Check this image',
        image_urls: [
          {
            type: 'image_url' as const,
            image_url: { url: 'https://example.com/image.jpg' },
          },
        ],
      },
      langChain: true,
    };

    const formatted = formatMessage(params);

    expect(formatted).toBeInstanceOf(HumanMessage);
    expect(Array.isArray(formatted.lc_kwargs.content)).toBe(true);
    expect(formatted.lc_kwargs.content).toHaveLength(2);
  });
});
|
|
605
|
+
|
|
190
606
|
describe('formatLangChainMessages', () => {
|
|
191
607
|
it('formats an array of messages for LangChain', () => {
|
|
192
608
|
const messages = [
|
package/src/scripts/search.ts
CHANGED
|
@@ -83,7 +83,11 @@ async function testStandardStreaming(): Promise<void> {
|
|
|
83
83
|
graphConfig: {
|
|
84
84
|
type: 'standard',
|
|
85
85
|
llmConfig,
|
|
86
|
-
tools: [
|
|
86
|
+
tools: [
|
|
87
|
+
createSearchTool({
|
|
88
|
+
scraperProvider: 'serper',
|
|
89
|
+
}),
|
|
90
|
+
],
|
|
87
91
|
instructions:
|
|
88
92
|
'You are a friendly AI assistant. Always address the user by their name.',
|
|
89
93
|
// additional_instructions: `The user's name is ${userName} and they are located in ${location}.`,
|
|
@@ -7,9 +7,10 @@ import { createDefaultLogger } from './utils';
|
|
|
7
7
|
* Firecrawl scraper implementation
|
|
8
8
|
* Uses the Firecrawl API to scrape web pages
|
|
9
9
|
*/
|
|
10
|
-
export class FirecrawlScraper {
|
|
10
|
+
export class FirecrawlScraper implements t.BaseScraper {
|
|
11
11
|
private apiKey: string;
|
|
12
12
|
private apiUrl: string;
|
|
13
|
+
private version: string;
|
|
13
14
|
private defaultFormats: string[];
|
|
14
15
|
private timeout: number;
|
|
15
16
|
private logger: t.Logger;
|
|
@@ -32,11 +33,13 @@ export class FirecrawlScraper {
|
|
|
32
33
|
constructor(config: t.FirecrawlScraperConfig = {}) {
|
|
33
34
|
this.apiKey = config.apiKey ?? process.env.FIRECRAWL_API_KEY ?? '';
|
|
34
35
|
|
|
36
|
+
this.version = config.version ?? 'v2';
|
|
37
|
+
|
|
35
38
|
const baseUrl =
|
|
36
39
|
config.apiUrl ??
|
|
37
40
|
process.env.FIRECRAWL_BASE_URL ??
|
|
38
41
|
'https://api.firecrawl.dev';
|
|
39
|
-
this.apiUrl = `${baseUrl.replace(/\/+$/, '')}/
|
|
42
|
+
this.apiUrl = `${baseUrl.replace(/\/+$/, '')}/${this.version}/scrape`;
|
|
40
43
|
|
|
41
44
|
this.defaultFormats = config.formats ?? ['markdown', 'rawHtml'];
|
|
42
45
|
this.timeout = config.timeout ?? 7500;
|
|
@@ -2,7 +2,6 @@ import axios from 'axios';
|
|
|
2
2
|
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
|
|
3
3
|
import type * as t from './types';
|
|
4
4
|
import { getAttribution, createDefaultLogger } from './utils';
|
|
5
|
-
import { FirecrawlScraper } from './firecrawl';
|
|
6
5
|
import { BaseReranker } from './rerankers';
|
|
7
6
|
|
|
8
7
|
const chunker = {
|
|
@@ -434,7 +433,7 @@ export const createSearchAPI = (
|
|
|
434
433
|
|
|
435
434
|
export const createSourceProcessor = (
|
|
436
435
|
config: t.ProcessSourcesConfig = {},
|
|
437
|
-
scraperInstance?:
|
|
436
|
+
scraperInstance?: t.BaseScraper
|
|
438
437
|
): {
|
|
439
438
|
processSources: (
|
|
440
439
|
fields: t.ProcessSourcesFields
|
|
@@ -442,7 +441,7 @@ export const createSourceProcessor = (
|
|
|
442
441
|
topResults: number;
|
|
443
442
|
} => {
|
|
444
443
|
if (!scraperInstance) {
|
|
445
|
-
throw new Error('
|
|
444
|
+
throw new Error('Scraper instance is required');
|
|
446
445
|
}
|
|
447
446
|
const {
|
|
448
447
|
topResults = 5,
|
|
@@ -453,7 +452,7 @@ export const createSourceProcessor = (
|
|
|
453
452
|
} = config;
|
|
454
453
|
|
|
455
454
|
const logger_ = logger || createDefaultLogger();
|
|
456
|
-
const
|
|
455
|
+
const scraper = scraperInstance;
|
|
457
456
|
|
|
458
457
|
const webScraper = {
|
|
459
458
|
scrapeMany: async ({
|
|
@@ -465,12 +464,12 @@ export const createSourceProcessor = (
|
|
|
465
464
|
links: string[];
|
|
466
465
|
onGetHighlights: t.SearchToolConfig['onGetHighlights'];
|
|
467
466
|
}): Promise<Array<t.ScrapeResult>> => {
|
|
468
|
-
logger_.debug(`Scraping ${links.length} links
|
|
467
|
+
logger_.debug(`Scraping ${links.length} links`);
|
|
469
468
|
const promises: Array<Promise<t.ScrapeResult>> = [];
|
|
470
469
|
try {
|
|
471
470
|
for (let i = 0; i < links.length; i++) {
|
|
472
471
|
const currentLink = links[i];
|
|
473
|
-
const promise: Promise<t.ScrapeResult> =
|
|
472
|
+
const promise: Promise<t.ScrapeResult> = scraper
|
|
474
473
|
.scrapeUrl(currentLink, {})
|
|
475
474
|
.then(([url, response]) => {
|
|
476
475
|
const attribution = getAttribution(
|
|
@@ -479,8 +478,7 @@ export const createSourceProcessor = (
|
|
|
479
478
|
logger_
|
|
480
479
|
);
|
|
481
480
|
if (response.success && response.data) {
|
|
482
|
-
const [content, references] =
|
|
483
|
-
firecrawlScraper.extractContent(response);
|
|
481
|
+
const [content, references] = scraper.extractContent(response);
|
|
484
482
|
return {
|
|
485
483
|
url,
|
|
486
484
|
references,
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import type * as t from './types';
|
|
3
|
+
import { createDefaultLogger } from './utils';
|
|
4
|
+
|
|
5
|
+
/**
 * Serper scraper implementation
 * Uses the Serper Scrape API (https://scrape.serper.dev) to scrape web pages
 *
 * Features:
 * - Simple API with single endpoint
 * - Returns both text and markdown content
 * - Includes metadata from scraped pages
 * - Credits-based pricing model
 *
 * Failures are reported through the returned response object
 * (`success: false` plus an `error` string) rather than by throwing.
 *
 * @example
 * ```typescript
 * const scraper = createSerperScraper({
 *   apiKey: 'your-serper-api-key',
 *   includeMarkdown: true,
 *   timeout: 10000
 * });
 *
 * const [url, response] = await scraper.scrapeUrl('https://example.com');
 * if (response.success) {
 *   const [content] = scraper.extractContent(response);
 *   console.log(content);
 * }
 * ```
 */
export class SerperScraper implements t.BaseScraper {
  private readonly apiKey: string;
  private readonly apiUrl: string;
  private readonly timeout: number;
  private readonly logger: t.Logger;
  private readonly includeMarkdown: boolean;

  /**
   * @param config Scraper configuration. Explicit values win over the
   * `SERPER_API_KEY` / `SERPER_SCRAPE_URL` environment variables, which in
   * turn win over the built-in defaults (7500 ms timeout, markdown enabled).
   */
  constructor(config: t.SerperScraperConfig = {}) {
    this.apiKey = config.apiKey ?? process.env.SERPER_API_KEY ?? '';

    this.apiUrl =
      config.apiUrl ??
      process.env.SERPER_SCRAPE_URL ??
      'https://scrape.serper.dev';

    this.timeout = config.timeout ?? 7500;
    this.includeMarkdown = config.includeMarkdown ?? true;

    this.logger = config.logger || createDefaultLogger();

    // A missing key is only warned about here; scrapeUrl reports it per call.
    if (!this.apiKey) {
      this.logger.warn('SERPER_API_KEY is not set. Scraping will not work.');
    }

    this.logger.debug(
      `Serper scraper initialized with API URL: ${this.apiUrl}`
    );
  }

  /**
   * Scrape a single URL
   * @param url URL to scrape
   * @param options Per-call overrides for `includeMarkdown` and `timeout`
   * @returns Tuple of the requested URL and the scrape response. The promise
   * does not reject for a missing API key or a failed request; both produce
   * a response with `success: false`.
   */
  async scrapeUrl(
    url: string,
    options: t.SerperScrapeOptions = {}
  ): Promise<[string, t.SerperScrapeResponse]> {
    if (!this.apiKey) {
      return [
        url,
        {
          success: false,
          error: 'SERPER_API_KEY is not set',
        },
      ];
    }

    try {
      const payload = {
        url,
        includeMarkdown: options.includeMarkdown ?? this.includeMarkdown,
      };

      const response = await axios.post(this.apiUrl, payload, {
        headers: {
          'X-API-KEY': this.apiKey,
          'Content-Type': 'application/json',
        },
        timeout: options.timeout ?? this.timeout,
      });

      return [url, { success: true, data: response.data }];
    } catch (error) {
      const errorMessage =
        error instanceof Error ? error.message : String(error);
      return [
        url,
        {
          success: false,
          error: `Serper Scrape API request failed: ${errorMessage}`,
        },
      ];
    }
  }

  /**
   * Extract content from scrape response
   *
   * Prefers markdown and falls back to plain text. An empty markdown string
   * is treated as absent, so usable text is not discarded in its favour.
   *
   * @param response Scrape response
   * @returns Tuple of the extracted content (empty string if not available)
   * and references, which this scraper never produces (always `undefined`)
   */
  extractContent(
    response: t.SerperScrapeResponse
  ): [string, undefined | t.References] {
    if (!response.success || !response.data) {
      return ['', undefined];
    }

    const { markdown, text } = response.data;

    if (typeof markdown === 'string' && markdown.length > 0) {
      return [markdown, undefined];
    }

    if (typeof text === 'string' && text.length > 0) {
      return [text, undefined];
    }

    return ['', undefined];
  }

  /**
   * Extract metadata from scrape response
   * @param response Scrape response
   * @returns The response's metadata object, or an empty object when the
   * response failed or carries no metadata
   */
  extractMetadata(
    response: t.SerperScrapeResponse
  ): Record<string, string | number | boolean | null | undefined> {
    if (!response.success || !response.data || !response.data.metadata) {
      return {};
    }

    return response.data.metadata;
  }
}
|
|
145
|
+
|
|
146
|
+
/**
 * Factory wrapper around the SerperScraper constructor
 * @param config Scraper configuration, forwarded unchanged (defaults to `{}`)
 * @returns A new SerperScraper instance
 */
export const createSerperScraper = (
  config: t.SerperScraperConfig = {}
): SerperScraper => new SerperScraper(config);
|