@librechat/agents 2.4.31 → 2.4.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/events.cjs +3 -3
- package/dist/cjs/events.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +2 -1
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/main.cjs +5 -2
- package/dist/cjs/main.cjs.map +1 -1
- package/dist/cjs/messages/ids.cjs +23 -0
- package/dist/cjs/messages/ids.cjs.map +1 -0
- package/dist/cjs/splitStream.cjs +2 -1
- package/dist/cjs/splitStream.cjs.map +1 -1
- package/dist/cjs/stream.cjs +87 -154
- package/dist/cjs/stream.cjs.map +1 -1
- package/dist/cjs/tools/ToolNode.cjs +14 -3
- package/dist/cjs/tools/ToolNode.cjs.map +1 -1
- package/dist/cjs/tools/handlers.cjs +144 -0
- package/dist/cjs/tools/handlers.cjs.map +1 -0
- package/dist/cjs/tools/search/content.cjs +140 -0
- package/dist/cjs/tools/search/content.cjs.map +1 -0
- package/dist/cjs/tools/search/firecrawl.cjs +23 -41
- package/dist/cjs/tools/search/firecrawl.cjs.map +1 -1
- package/dist/cjs/tools/search/format.cjs +161 -74
- package/dist/cjs/tools/search/format.cjs.map +1 -1
- package/dist/cjs/tools/search/highlights.cjs +64 -12
- package/dist/cjs/tools/search/highlights.cjs.map +1 -1
- package/dist/cjs/tools/search/rerankers.cjs +35 -50
- package/dist/cjs/tools/search/rerankers.cjs.map +1 -1
- package/dist/cjs/tools/search/schema.cjs +70 -0
- package/dist/cjs/tools/search/schema.cjs.map +1 -0
- package/dist/cjs/tools/search/search.cjs +153 -69
- package/dist/cjs/tools/search/search.cjs.map +1 -1
- package/dist/cjs/tools/search/tool.cjs +247 -58
- package/dist/cjs/tools/search/tool.cjs.map +1 -1
- package/dist/cjs/tools/search/utils.cjs +66 -0
- package/dist/cjs/tools/search/utils.cjs.map +1 -0
- package/dist/esm/events.mjs +1 -1
- package/dist/esm/events.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +2 -1
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/main.mjs +3 -1
- package/dist/esm/main.mjs.map +1 -1
- package/dist/esm/messages/ids.mjs +21 -0
- package/dist/esm/messages/ids.mjs.map +1 -0
- package/dist/esm/splitStream.mjs +2 -1
- package/dist/esm/splitStream.mjs.map +1 -1
- package/dist/esm/stream.mjs +87 -152
- package/dist/esm/stream.mjs.map +1 -1
- package/dist/esm/tools/ToolNode.mjs +14 -3
- package/dist/esm/tools/ToolNode.mjs.map +1 -1
- package/dist/esm/tools/handlers.mjs +141 -0
- package/dist/esm/tools/handlers.mjs.map +1 -0
- package/dist/esm/tools/search/content.mjs +119 -0
- package/dist/esm/tools/search/content.mjs.map +1 -0
- package/dist/esm/tools/search/firecrawl.mjs +24 -41
- package/dist/esm/tools/search/firecrawl.mjs.map +1 -1
- package/dist/esm/tools/search/format.mjs +161 -74
- package/dist/esm/tools/search/format.mjs.map +1 -1
- package/dist/esm/tools/search/highlights.mjs +64 -12
- package/dist/esm/tools/search/highlights.mjs.map +1 -1
- package/dist/esm/tools/search/rerankers.mjs +35 -50
- package/dist/esm/tools/search/rerankers.mjs.map +1 -1
- package/dist/esm/tools/search/schema.mjs +61 -0
- package/dist/esm/tools/search/schema.mjs.map +1 -0
- package/dist/esm/tools/search/search.mjs +153 -69
- package/dist/esm/tools/search/search.mjs.map +1 -1
- package/dist/esm/tools/search/tool.mjs +246 -57
- package/dist/esm/tools/search/tool.mjs.map +1 -1
- package/dist/esm/tools/search/utils.mjs +61 -0
- package/dist/esm/tools/search/utils.mjs.map +1 -0
- package/dist/types/graphs/Graph.d.ts +1 -1
- package/dist/types/index.d.ts +1 -0
- package/dist/types/messages/ids.d.ts +3 -0
- package/dist/types/messages/index.d.ts +1 -0
- package/dist/types/stream.d.ts +0 -8
- package/dist/types/tools/ToolNode.d.ts +6 -0
- package/dist/types/tools/example.d.ts +23 -3
- package/dist/types/tools/handlers.d.ts +8 -0
- package/dist/types/tools/search/content.d.ts +4 -0
- package/dist/types/tools/search/firecrawl.d.ts +7 -86
- package/dist/types/tools/search/format.d.ts +4 -1
- package/dist/types/tools/search/highlights.d.ts +1 -1
- package/dist/types/tools/search/rerankers.d.ts +8 -5
- package/dist/types/tools/search/schema.d.ts +16 -0
- package/dist/types/tools/search/search.d.ts +2 -2
- package/dist/types/tools/search/test.d.ts +1 -0
- package/dist/types/tools/search/tool.d.ts +25 -4
- package/dist/types/tools/search/types.d.ts +443 -53
- package/dist/types/tools/search/utils.d.ts +10 -0
- package/package.json +9 -7
- package/src/events.ts +49 -15
- package/src/graphs/Graph.ts +6 -2
- package/src/index.ts +1 -0
- package/src/messages/ids.ts +26 -0
- package/src/messages/index.ts +1 -0
- package/src/scripts/search.ts +8 -3
- package/src/splitStream.test.ts +132 -71
- package/src/splitStream.ts +2 -1
- package/src/stream.ts +94 -183
- package/src/tools/ToolNode.ts +37 -14
- package/src/tools/handlers.ts +167 -0
- package/src/tools/search/content.test.ts +173 -0
- package/src/tools/search/content.ts +147 -0
- package/src/tools/search/firecrawl.ts +36 -148
- package/src/tools/search/format.ts +205 -74
- package/src/tools/search/highlights.ts +99 -16
- package/src/tools/search/output.md +2775 -0
- package/src/tools/search/rerankers.ts +50 -62
- package/src/tools/search/schema.ts +63 -0
- package/src/tools/search/search.ts +232 -116
- package/src/tools/search/test.html +884 -0
- package/src/tools/search/test.md +643 -0
- package/src/tools/search/test.ts +159 -0
- package/src/tools/search/tool.ts +363 -87
- package/src/tools/search/types.ts +503 -61
- package/src/tools/search/utils.ts +79 -0
- package/src/utils/llmConfig.ts +1 -1
package/src/tools/search/search.ts

@@ -1,8 +1,8 @@
-/* eslint-disable no-console */
 import axios from 'axios';
 import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
 import type * as t from './types';
-import { getAttribution,
+import { getAttribution, createDefaultLogger } from './utils';
+import { FirecrawlScraper } from './firecrawl';
 import { BaseReranker } from './rerankers';
 
 const chunker = {
@@ -51,12 +51,14 @@ const chunker = {
       chunkSize?: number;
       chunkOverlap?: number;
       separators?: string[];
-    }
+    },
+    logger?: t.Logger
   ): Promise<string[][]> => {
     // Split multiple texts
+    const logger_ = logger || createDefaultLogger();
     const promises = texts.map((text) =>
       chunker.splitText(text, options).catch((error) => {
-
+        logger_.error('Error splitting text:', error);
         return [text];
       })
     );
@@ -64,7 +66,7 @@ const chunker = {
   },
 };
 
-const createSourceUpdateCallback = (sourceMap: Map<string, t.ValidSource>) => {
+function createSourceUpdateCallback(sourceMap: Map<string, t.ValidSource>) {
   return (link: string, update?: Partial<t.ValidSource>): void => {
     const source = sourceMap.get(link);
     if (source) {
@@ -74,25 +76,29 @@ const createSourceUpdateCallback = (sourceMap: Map<string, t.ValidSource>) => {
       });
     }
   };
-};
+}
 
 const getHighlights = async ({
   query,
   content,
   reranker,
   topResults = 5,
+  logger,
 }: {
   content: string;
   query: string;
   reranker?: BaseReranker;
   topResults?: number;
+  logger?: t.Logger;
 }): Promise<t.Highlight[] | undefined> => {
+  const logger_ = logger || createDefaultLogger();
+
   if (!content) {
-
+    logger_.warn('No content provided for highlights');
     return;
   }
   if (!reranker) {
-
+    logger_.warn('No reranker provided for highlights');
     return;
   }
 
@@ -101,14 +107,14 @@ const getHighlights = async ({
     if (Array.isArray(documents)) {
       return await reranker.rerank(query, documents, topResults);
     } else {
-
+      logger_.error(
         'Expected documents to be an array, got:',
         typeof documents
       );
       return;
     }
   } catch (error) {
-
+    logger_.error('Error in content processing:', error);
    return;
  }
 };
@@ -116,16 +122,11 @@ const getHighlights = async ({
 const createSerperAPI = (
   apiKey?: string
 ): {
-  getSources: (
-    query: string,
-    numResults?: number,
-    storedLocation?: string
-  ) => Promise<t.SearchResult>;
+  getSources: (params: t.GetSourcesParams) => Promise<t.SearchResult>;
 } => {
   const config = {
     apiKey: apiKey ?? process.env.SERPER_API_KEY,
     apiUrl: 'https://google.serper.dev/search',
-    defaultLocation: 'us',
     timeout: 10000,
   };
 
@@ -133,43 +134,72 @@ const createSerperAPI = (
     throw new Error('SERPER_API_KEY is required for SerperAPI');
   }
 
-  const getSources = async (
-    query
-
-
-
+  const getSources = async ({
+    query,
+    date,
+    country,
+    safeSearch,
+    numResults = 8,
+    type,
+  }: t.GetSourcesParams): Promise<t.SearchResult> => {
     if (!query.trim()) {
       return { success: false, error: 'Query cannot be empty' };
     }
 
     try {
-      const
-
-      ).toLowerCase();
-
-      const payload = {
+      const safe = ['off', 'moderate', 'active'] as const;
+      const payload: t.SerperSearchPayload = {
         q: query,
+        safe: safe[safeSearch ?? 1],
         num: Math.min(Math.max(1, numResults), 10),
-        gl: searchLocation,
       };
 
-
-
-
-
-
-
-
+      // Set the search type if provided
+      if (type) {
+        payload.type = type;
+      }
+
+      if (date != null) {
+        payload.tbs = `qdr:${date}`;
+      }
+
+      if (country != null && country !== '') {
+        payload['gl'] = country.toLowerCase();
+      }
+
+      // Determine the API endpoint based on the search type
+      let apiEndpoint = config.apiUrl;
+      if (type === 'images') {
+        apiEndpoint = 'https://google.serper.dev/images';
+      } else if (type === 'videos') {
+        apiEndpoint = 'https://google.serper.dev/videos';
+      } else if (type === 'news') {
+        apiEndpoint = 'https://google.serper.dev/news';
+      }
+
+      const response = await axios.post<t.SerperResultData>(
+        apiEndpoint,
+        payload,
+        {
+          headers: {
+            'X-API-KEY': config.apiKey,
+            'Content-Type': 'application/json',
+          },
+          timeout: config.timeout,
+        }
+      );
 
       const data = response.data;
       const results: t.SearchResultData = {
         organic: data.organic,
         images: data.images ?? [],
+        answerBox: data.answerBox,
         topStories: data.topStories ?? [],
-
-
-
-
+        peopleAlsoAsk: data.peopleAlsoAsk,
+        knowledgeGraph: data.knowledgeGraph,
+        relatedSearches: data.relatedSearches,
+        videos: data.videos ?? [],
+        news: data.news ?? [],
       };
 
       return { success: true, data: results };
@@ -187,11 +217,7 @@ const createSearXNGAPI = (
   instanceUrl?: string,
   apiKey?: string
 ): {
-  getSources: (
-    query: string,
-    numResults?: number,
-    storedLocation?: string
-  ) => Promise<t.SearchResult>;
+  getSources: (params: t.GetSourcesParams) => Promise<t.SearchResult>;
 } => {
   const config = {
     instanceUrl: instanceUrl ?? process.env.SEARXNG_INSTANCE_URL,
@@ -204,11 +230,11 @@ const createSearXNGAPI = (
     throw new Error('SEARXNG_INSTANCE_URL is required for SearXNG API');
   }
 
-  const getSources = async (
-    query
-    numResults
-
-  ): Promise<t.SearchResult> => {
+  const getSources = async ({
+    query,
+    numResults = 8,
+    type,
+  }: t.GetSourcesParams): Promise<t.SearchResult> => {
     if (!query.trim()) {
       return { success: false, error: 'Query cannot be empty' };
     }
@@ -224,22 +250,27 @@ const createSearXNGAPI = (
         searchUrl = searchUrl.replace(/\/$/, '') + '/search';
       }
 
+      // Determine the search category based on the type
+      let category = 'general';
+      if (type === 'images') {
+        category = 'images';
+      } else if (type === 'videos') {
+        category = 'videos';
+      } else if (type === 'news') {
+        category = 'news';
+      }
+
       // Prepare parameters for SearXNG
-      const params:
+      const params: t.SearxNGSearchPayload = {
         q: query,
         format: 'json',
         pageno: 1,
-        categories:
+        categories: category,
         language: 'all',
         safesearch: 0,
         engines: 'google,bing,duckduckgo',
-        max_results: Math.min(Math.max(1, numResults), 20),
       };
 
-      if (storedLocation != null && storedLocation !== 'all') {
-        params.language = storedLocation;
-      }
-
       const headers: Record<string, string> = {
         'Content-Type': 'application/json',
       };
@@ -282,6 +313,8 @@ const createSearXNGAPI = (
         topStories: [],
         // Use undefined instead of null for optional properties
         relatedSearches: data.suggestions ?? [],
+        videos: [],
+        news: [],
       };
 
       return { success: true, data: results };
@@ -301,11 +334,7 @@ const createSearXNGAPI = (
 export const createSearchAPI = (
   config: t.SearchConfig
 ): {
-  getSources: (
-    query: string,
-    numResults?: number,
-    storedLocation?: string
-  ) => Promise<t.SearchResult>;
+  getSources: (params: t.GetSourcesParams) => Promise<t.SearchResult>;
 } => {
   const {
     searchProvider = 'serper',
@@ -330,10 +359,7 @@ export const createSourceProcessor = (
   scraperInstance?: FirecrawlScraper
 ): {
   processSources: (
-
-    numElements: number,
-    query: string,
-    proMode?: boolean
+    fields: t.ProcessSourcesFields
   ) => Promise<t.SearchResultData>;
   topResults: number;
 } => {
@@ -345,46 +371,61 @@ export const createSourceProcessor = (
     // strategies = ['no_extraction'],
     // filterContent = true,
     reranker,
+    logger,
   } = config;
 
+  const logger_ = logger || createDefaultLogger();
   const firecrawlScraper = scraperInstance;
 
   const webScraper = {
     scrapeMany: async ({
       query,
       links,
+      onGetHighlights,
     }: {
       query: string;
       links: string[];
+      onGetHighlights: t.SearchToolConfig['onGetHighlights'];
     }): Promise<Array<t.ScrapeResult>> => {
-
+      logger_.debug(`Scraping ${links.length} links with Firecrawl`);
       const promises: Array<Promise<t.ScrapeResult>> = [];
       try {
-        for (
+        for (let i = 0; i < links.length; i++) {
+          const currentLink = links[i];
           const promise: Promise<t.ScrapeResult> = firecrawlScraper
             .scrapeUrl(currentLink, {})
             .then(([url, response]) => {
-              const attribution = getAttribution(
+              const attribution = getAttribution(
+                url,
+                response.data?.metadata,
+                logger_
+              );
               if (response.success && response.data) {
-                const content =
+                const [content, references] =
+                  firecrawlScraper.extractContent(response);
                 return {
                   url,
+                  references,
                   attribution,
                   content: chunker.cleanText(content),
-                };
+                } as t.ScrapeResult;
+              } else {
+                logger_.error(
+                  `Error scraping ${url}: ${response.error ?? 'Unknown error'}`
+                );
              }
 
              return {
                url,
                attribution,
                error: true,
-                content:
-              };
+                content: '',
+              } as t.ScrapeResult;
            })
            .then(async (result) => {
              try {
                if (result.error != null) {
-
+                  logger_.error(
                    `Error scraping ${result.url}: ${result.content}`
                  );
                  return {
@@ -395,31 +436,35 @@ export const createSourceProcessor = (
                  query,
                  reranker,
                  content: result.content,
+                  logger: logger_,
                });
+                if (onGetHighlights) {
+                  onGetHighlights(result.url);
+                }
                return {
                  ...result,
                  highlights,
                };
              } catch (error) {
-
+                logger_.error('Error processing scraped content:', error);
                return {
                  ...result,
                };
              }
            })
            .catch((error) => {
-
+              logger_.error(`Error scraping ${currentLink}:`, error);
              return {
                url: currentLink,
                error: true,
-                content:
+                content: '',
              };
            });
          promises.push(promise);
        }
        return await Promise.all(promises);
      } catch (error) {
-
+        logger_.error('Error in scrapeMany:', error);
        return [];
      }
    },
@@ -429,35 +474,44 @@ export const createSourceProcessor = (
    links,
    query,
    target,
+    onGetHighlights,
    onContentScraped,
  }: {
    links: string[];
    query: string;
    target: number;
+    onGetHighlights: t.SearchToolConfig['onGetHighlights'];
    onContentScraped?: (link: string, update?: Partial<t.ValidSource>) => void;
  }): Promise<void> => {
    const initialLinks = links.slice(0, target);
    // const remainingLinks = links.slice(target).reverse();
-    const results = await webScraper.scrapeMany({
+    const results = await webScraper.scrapeMany({
+      query,
+      links: initialLinks,
+      onGetHighlights,
+    });
    for (const result of results) {
      if (result.error === true) {
        continue;
      }
-      const { url, content, attribution, highlights } = result;
+      const { url, content, attribution, references, highlights } = result;
      onContentScraped?.(url, {
        content,
        attribution,
+        references,
        highlights,
      });
    }
  };
 
-  const processSources = async (
-    result
-    numElements
-    query
-
-
+  const processSources = async ({
+    result,
+    numElements,
+    query,
+    news,
+    proMode = true,
+    onGetHighlights,
+  }: t.ProcessSourcesFields): Promise<t.SearchResultData> => {
    try {
      if (!result.data) {
        return {
@@ -485,6 +539,7 @@ export const createSourceProcessor = (
        await fetchContents({
          query,
          target: 1,
+          onGetHighlights,
          onContentScraped,
          links: [wikiSources[0].link],
        });
@@ -504,51 +559,69 @@ export const createSourceProcessor = (
      }
 
      const sourceMap = new Map<string, t.ValidSource>();
-      const
+      const organicLinksSet = new Set<string>();
 
-
-
-
-
-
-
+      // Collect organic links
+      const organicLinks = collectLinks(
+        result.data.organic,
+        sourceMap,
+        organicLinksSet
+      );
+
+      // Collect top story links, excluding any that are already in organic links
+      const topStories = result.data.topStories ?? [];
+      const topStoryLinks = collectLinks(
+        topStories,
+        sourceMap,
+        organicLinksSet
+      );
 
-      if (
+      if (organicLinks.length === 0 && (topStoryLinks.length === 0 || !news)) {
        return result.data;
      }
 
      const onContentScraped = createSourceUpdateCallback(sourceMap);
-
-
-
-
-
-
+      const promises: Promise<void>[] = [];
+
+      // Process organic links
+      if (organicLinks.length > 0) {
+        promises.push(
+          fetchContents({
+            query,
+            onGetHighlights,
+            onContentScraped,
+            links: organicLinks,
+            target: numElements,
+          })
+        );
+      }
 
-
-
-
-
-
-
-
-
-
+      // Process top story links
+      if (news && topStoryLinks.length > 0) {
+        promises.push(
+          fetchContents({
+            query,
+            onGetHighlights,
+            onContentScraped,
+            links: topStoryLinks,
+            target: numElements,
+          })
+        );
      }
 
-
-        .filter(
-          (source) =>
-            source.content != null && !source.content.startsWith('Failed')
-        )
-        .slice(0, numElements);
+      await Promise.all(promises);
 
-      if (
-        result.data.organic
+      if (result.data.organic.length > 0) {
+        updateSourcesWithContent(result.data.organic, sourceMap);
      }
+
+      if (news && topStories.length > 0) {
+        updateSourcesWithContent(topStories, sourceMap);
+      }
+
      return result.data;
    } catch (error) {
-
+      logger_.error('Error in processSources:', error);
      return {
        organic: [],
        topStories: [],
@@ -565,3 +638,46 @@
    topResults,
  };
};
+
+/** Helper function to collect links and update sourceMap */
+function collectLinks(
+  sources: Array<t.OrganicResult | t.TopStoryResult>,
+  sourceMap: Map<string, t.ValidSource>,
+  existingLinksSet?: Set<string>
+): string[] {
+  const links: string[] = [];
+
+  for (const source of sources) {
+    if (source.link) {
+      // For topStories, only add if not already in organic links
+      if (existingLinksSet && existingLinksSet.has(source.link)) {
+        continue;
+      }
+
+      links.push(source.link);
+      if (existingLinksSet) {
+        existingLinksSet.add(source.link);
+      }
+      sourceMap.set(source.link, source as t.ValidSource);
+    }
+  }
+
+  return links;
+}
+
+/** Helper function to update sources with scraped content */
+function updateSourcesWithContent<T extends t.ValidSource>(
+  sources: T[],
+  sourceMap: Map<string, t.ValidSource>
+): void {
+  for (let i = 0; i < sources.length; i++) {
+    const source = sources[i];
+    const updatedSource = sourceMap.get(source.link);
+    if (updatedSource) {
+      sources[i] = {
+        ...source,
+        ...updatedSource,
+      } as T;
+    }
+  }
+}