@librechat/agents 2.4.31 → 2.4.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/events.cjs +3 -3
- package/dist/cjs/events.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +2 -1
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/main.cjs +5 -2
- package/dist/cjs/main.cjs.map +1 -1
- package/dist/cjs/messages/ids.cjs +23 -0
- package/dist/cjs/messages/ids.cjs.map +1 -0
- package/dist/cjs/splitStream.cjs +2 -1
- package/dist/cjs/splitStream.cjs.map +1 -1
- package/dist/cjs/stream.cjs +87 -154
- package/dist/cjs/stream.cjs.map +1 -1
- package/dist/cjs/tools/ToolNode.cjs +14 -3
- package/dist/cjs/tools/ToolNode.cjs.map +1 -1
- package/dist/cjs/tools/handlers.cjs +144 -0
- package/dist/cjs/tools/handlers.cjs.map +1 -0
- package/dist/cjs/tools/search/content.cjs +140 -0
- package/dist/cjs/tools/search/content.cjs.map +1 -0
- package/dist/cjs/tools/search/firecrawl.cjs +23 -41
- package/dist/cjs/tools/search/firecrawl.cjs.map +1 -1
- package/dist/cjs/tools/search/format.cjs +161 -74
- package/dist/cjs/tools/search/format.cjs.map +1 -1
- package/dist/cjs/tools/search/highlights.cjs +64 -12
- package/dist/cjs/tools/search/highlights.cjs.map +1 -1
- package/dist/cjs/tools/search/rerankers.cjs +43 -36
- package/dist/cjs/tools/search/rerankers.cjs.map +1 -1
- package/dist/cjs/tools/search/schema.cjs +70 -0
- package/dist/cjs/tools/search/schema.cjs.map +1 -0
- package/dist/cjs/tools/search/search.cjs +150 -69
- package/dist/cjs/tools/search/search.cjs.map +1 -1
- package/dist/cjs/tools/search/tool.cjs +247 -58
- package/dist/cjs/tools/search/tool.cjs.map +1 -1
- package/dist/cjs/tools/search/utils.cjs +66 -0
- package/dist/cjs/tools/search/utils.cjs.map +1 -0
- package/dist/esm/events.mjs +1 -1
- package/dist/esm/events.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +2 -1
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/main.mjs +3 -1
- package/dist/esm/main.mjs.map +1 -1
- package/dist/esm/messages/ids.mjs +21 -0
- package/dist/esm/messages/ids.mjs.map +1 -0
- package/dist/esm/splitStream.mjs +2 -1
- package/dist/esm/splitStream.mjs.map +1 -1
- package/dist/esm/stream.mjs +87 -152
- package/dist/esm/stream.mjs.map +1 -1
- package/dist/esm/tools/ToolNode.mjs +14 -3
- package/dist/esm/tools/ToolNode.mjs.map +1 -1
- package/dist/esm/tools/handlers.mjs +141 -0
- package/dist/esm/tools/handlers.mjs.map +1 -0
- package/dist/esm/tools/search/content.mjs +119 -0
- package/dist/esm/tools/search/content.mjs.map +1 -0
- package/dist/esm/tools/search/firecrawl.mjs +24 -41
- package/dist/esm/tools/search/firecrawl.mjs.map +1 -1
- package/dist/esm/tools/search/format.mjs +161 -74
- package/dist/esm/tools/search/format.mjs.map +1 -1
- package/dist/esm/tools/search/highlights.mjs +64 -12
- package/dist/esm/tools/search/highlights.mjs.map +1 -1
- package/dist/esm/tools/search/rerankers.mjs +43 -36
- package/dist/esm/tools/search/rerankers.mjs.map +1 -1
- package/dist/esm/tools/search/schema.mjs +61 -0
- package/dist/esm/tools/search/schema.mjs.map +1 -0
- package/dist/esm/tools/search/search.mjs +150 -69
- package/dist/esm/tools/search/search.mjs.map +1 -1
- package/dist/esm/tools/search/tool.mjs +246 -57
- package/dist/esm/tools/search/tool.mjs.map +1 -1
- package/dist/esm/tools/search/utils.mjs +61 -0
- package/dist/esm/tools/search/utils.mjs.map +1 -0
- package/dist/types/graphs/Graph.d.ts +1 -1
- package/dist/types/index.d.ts +1 -0
- package/dist/types/messages/ids.d.ts +3 -0
- package/dist/types/messages/index.d.ts +1 -0
- package/dist/types/stream.d.ts +0 -8
- package/dist/types/tools/ToolNode.d.ts +6 -0
- package/dist/types/tools/example.d.ts +23 -3
- package/dist/types/tools/handlers.d.ts +8 -0
- package/dist/types/tools/search/content.d.ts +4 -0
- package/dist/types/tools/search/firecrawl.d.ts +7 -86
- package/dist/types/tools/search/format.d.ts +4 -1
- package/dist/types/tools/search/highlights.d.ts +1 -1
- package/dist/types/tools/search/rerankers.d.ts +8 -4
- package/dist/types/tools/search/schema.d.ts +16 -0
- package/dist/types/tools/search/search.d.ts +2 -2
- package/dist/types/tools/search/test.d.ts +1 -0
- package/dist/types/tools/search/tool.d.ts +25 -4
- package/dist/types/tools/search/types.d.ts +443 -53
- package/dist/types/tools/search/utils.d.ts +10 -0
- package/package.json +9 -7
- package/src/events.ts +49 -15
- package/src/graphs/Graph.ts +6 -2
- package/src/index.ts +1 -0
- package/src/messages/ids.ts +26 -0
- package/src/messages/index.ts +1 -0
- package/src/scripts/search.ts +8 -3
- package/src/splitStream.test.ts +132 -71
- package/src/splitStream.ts +2 -1
- package/src/stream.ts +94 -183
- package/src/tools/ToolNode.ts +37 -14
- package/src/tools/handlers.ts +167 -0
- package/src/tools/search/content.test.ts +173 -0
- package/src/tools/search/content.ts +147 -0
- package/src/tools/search/firecrawl.ts +36 -148
- package/src/tools/search/format.ts +205 -74
- package/src/tools/search/highlights.ts +99 -16
- package/src/tools/search/output.md +2775 -0
- package/src/tools/search/rerankers.ts +57 -36
- package/src/tools/search/schema.ts +63 -0
- package/src/tools/search/search.ts +230 -117
- package/src/tools/search/test.html +884 -0
- package/src/tools/search/test.md +643 -0
- package/src/tools/search/test.ts +159 -0
- package/src/tools/search/tool.ts +363 -87
- package/src/tools/search/types.ts +503 -61
- package/src/tools/search/utils.ts +79 -0
- package/src/utils/llmConfig.ts +1 -1
package/src/tools/search/search.ts

@@ -1,8 +1,8 @@
- /* eslint-disable no-console */
  import axios from 'axios';
  import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
  import type * as t from './types';
- import { getAttribution,
+ import { getAttribution, createDefaultLogger } from './utils';
+ import { FirecrawlScraper } from './firecrawl';
  import { BaseReranker } from './rerankers';

  const chunker = {
@@ -51,12 +51,14 @@ const chunker = {
  chunkSize?: number;
  chunkOverlap?: number;
  separators?: string[];
- }
+ },
+ logger?: t.Logger
  ): Promise<string[][]> => {
  // Split multiple texts
+ const logger_ = logger || createDefaultLogger();
  const promises = texts.map((text) =>
  chunker.splitText(text, options).catch((error) => {
-
+ logger_.error('Error splitting text:', error);
  return [text];
  })
  );
@@ -64,7 +66,7 @@ const chunker = {
  },
  };

-
+ function createSourceUpdateCallback(sourceMap: Map<string, t.ValidSource>) {
  return (link: string, update?: Partial<t.ValidSource>): void => {
  const source = sourceMap.get(link);
  if (source) {
@@ -74,25 +76,29 @@ const createSourceUpdateCallback = (sourceMap: Map<string, t.ValidSource>) => {
  });
  }
  };
- }
+ }

  const getHighlights = async ({
  query,
  content,
  reranker,
  topResults = 5,
+ logger,
  }: {
  content: string;
  query: string;
  reranker?: BaseReranker;
  topResults?: number;
+ logger?: t.Logger;
  }): Promise<t.Highlight[] | undefined> => {
+ const logger_ = logger || createDefaultLogger();
+
  if (!content) {
-
+ logger_.warn('No content provided for highlights');
  return;
  }
  if (!reranker) {
-
+ logger_.warn('No reranker provided for highlights');
  return;
  }

@@ -101,14 +107,14 @@ const getHighlights = async ({
  if (Array.isArray(documents)) {
  return await reranker.rerank(query, documents, topResults);
  } else {
-
+ logger_.error(
  'Expected documents to be an array, got:',
  typeof documents
  );
  return;
  }
  } catch (error) {
-
+ logger_.error('Error in content processing:', error);
  return;
  }
  };
@@ -116,16 +122,11 @@ const getHighlights = async ({
  const createSerperAPI = (
  apiKey?: string
  ): {
- getSources: (
- query: string,
- numResults?: number,
- storedLocation?: string
- ) => Promise<t.SearchResult>;
+ getSources: (params: t.GetSourcesParams) => Promise<t.SearchResult>;
  } => {
  const config = {
  apiKey: apiKey ?? process.env.SERPER_API_KEY,
  apiUrl: 'https://google.serper.dev/search',
- defaultLocation: 'us',
  timeout: 10000,
  };

@@ -133,43 +134,72 @@ const createSerperAPI = (
  throw new Error('SERPER_API_KEY is required for SerperAPI');
  }

- const getSources = async (
- query
-
-
-
+ const getSources = async ({
+ query,
+ date,
+ country,
+ safeSearch,
+ numResults = 8,
+ type,
+ }: t.GetSourcesParams): Promise<t.SearchResult> => {
  if (!query.trim()) {
  return { success: false, error: 'Query cannot be empty' };
  }

  try {
- const
-
- ).toLowerCase();
-
- const payload = {
+ const safe = ['off', 'moderate', 'active'] as const;
+ const payload: t.SerperSearchPayload = {
  q: query,
+ safe: safe[safeSearch ?? 1],
  num: Math.min(Math.max(1, numResults), 10),
- gl: searchLocation,
  };

-
-
-
-
-
-
-
+ // Set the search type if provided
+ if (type) {
+ payload.type = type;
+ }
+
+ if (date != null) {
+ payload.tbs = `qdr:${date}`;
+ }
+
+ if (country != null && country !== '') {
+ payload['gl'] = country.toLowerCase();
+ }
+
+ // Determine the API endpoint based on the search type
+ let apiEndpoint = config.apiUrl;
+ if (type === 'images') {
+ apiEndpoint = 'https://google.serper.dev/images';
+ } else if (type === 'videos') {
+ apiEndpoint = 'https://google.serper.dev/videos';
+ } else if (type === 'news') {
+ apiEndpoint = 'https://google.serper.dev/news';
+ }
+
+ const response = await axios.post<t.SerperResultData>(
+ apiEndpoint,
+ payload,
+ {
+ headers: {
+ 'X-API-KEY': config.apiKey,
+ 'Content-Type': 'application/json',
+ },
+ timeout: config.timeout,
+ }
+ );

  const data = response.data;
  const results: t.SearchResultData = {
  organic: data.organic,
  images: data.images ?? [],
+ answerBox: data.answerBox,
  topStories: data.topStories ?? [],
-
-
-
-
+ peopleAlsoAsk: data.peopleAlsoAsk,
+ knowledgeGraph: data.knowledgeGraph,
+ relatedSearches: data.relatedSearches,
+ videos: data.videos ?? [],
+ news: data.news ?? [],
  };

  return { success: true, data: results };
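A sketch of how the reworked getSources call might look from the consumer side. Both the import path and the field values are illustrative assumptions: the diff only shows the implementation inside package/src/tools/search/search.ts, and the GetSourcesParams fields (type, date, country, safeSearch) are inferred from how the payload is built in the hunk above.

// Hedged usage sketch: a single GetSourcesParams object replaces the old
// positional (query, numResults, storedLocation) signature removed above.
// Assumes createSearchAPI is reachable from the package entry point and that
// SERPER_API_KEY is set in the environment.
import { createSearchAPI } from '@librechat/agents';

async function demo(): Promise<void> {
  const { getSources } = createSearchAPI({ searchProvider: 'serper' });

  const result = await getSources({
    query: 'typescript 5 release notes',
    numResults: 5,
    type: 'news',  // routed to https://google.serper.dev/news per the hunk above
    date: 'w',     // assumed qdr code; becomes `tbs: 'qdr:w'` in the payload
    country: 'us', // lowercased into the `gl` field
    safeSearch: 1, // index into ['off', 'moderate', 'active'], i.e. 'moderate'
  });

  if (result.success) {
    console.log(result.data?.news?.length ?? 0, 'news results');
  } else {
    console.error(result.error);
  }
}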
@@ -187,11 +217,7 @@ const createSearXNGAPI = (
  instanceUrl?: string,
  apiKey?: string
  ): {
- getSources: (
- query: string,
- numResults?: number,
- storedLocation?: string
- ) => Promise<t.SearchResult>;
+ getSources: (params: t.GetSourcesParams) => Promise<t.SearchResult>;
  } => {
  const config = {
  instanceUrl: instanceUrl ?? process.env.SEARXNG_INSTANCE_URL,
@@ -204,11 +230,11 @@ const createSearXNGAPI = (
  throw new Error('SEARXNG_INSTANCE_URL is required for SearXNG API');
  }

- const getSources = async (
- query
- numResults
-
- ): Promise<t.SearchResult> => {
+ const getSources = async ({
+ query,
+ numResults = 8,
+ type,
+ }: t.GetSourcesParams): Promise<t.SearchResult> => {
  if (!query.trim()) {
  return { success: false, error: 'Query cannot be empty' };
  }
@@ -224,22 +250,27 @@ const createSearXNGAPI = (
  searchUrl = searchUrl.replace(/\/$/, '') + '/search';
  }

+ // Determine the search category based on the type
+ let category = 'general';
+ if (type === 'images') {
+ category = 'images';
+ } else if (type === 'videos') {
+ category = 'videos';
+ } else if (type === 'news') {
+ category = 'news';
+ }
+
  // Prepare parameters for SearXNG
- const params:
+ const params: t.SearxNGSearchPayload = {
  q: query,
  format: 'json',
  pageno: 1,
- categories:
+ categories: category,
  language: 'all',
  safesearch: 0,
  engines: 'google,bing,duckduckgo',
- max_results: Math.min(Math.max(1, numResults), 20),
  };

- if (storedLocation != null && storedLocation !== 'all') {
- params.language = storedLocation;
- }
-
  const headers: Record<string, string> = {
  'Content-Type': 'application/json',
  };
@@ -282,6 +313,8 @@ const createSearXNGAPI = (
  topStories: [],
  // Use undefined instead of null for optional properties
  relatedSearches: data.suggestions ?? [],
+ videos: [],
+ news: [],
  };

  return { success: true, data: results };
@@ -301,11 +334,7 @@ const createSearXNGAPI = (
  export const createSearchAPI = (
  config: t.SearchConfig
  ): {
- getSources: (
- query: string,
- numResults?: number,
- storedLocation?: string
- ) => Promise<t.SearchResult>;
+ getSources: (params: t.GetSourcesParams) => Promise<t.SearchResult>;
  } => {
  const {
  searchProvider = 'serper',
@@ -330,10 +359,7 @@ export const createSourceProcessor = (
  scraperInstance?: FirecrawlScraper
  ): {
  processSources: (
-
- numElements: number,
- query: string,
- proMode?: boolean
+ fields: t.ProcessSourcesFields
  ) => Promise<t.SearchResultData>;
  topResults: number;
  } => {
@@ -345,47 +371,59 @@ export const createSourceProcessor = (
  // strategies = ['no_extraction'],
  // filterContent = true,
  reranker,
+ logger,
  } = config;

+ const logger_ = logger || createDefaultLogger();
  const firecrawlScraper = scraperInstance;

  const webScraper = {
  scrapeMany: async ({
  query,
  links,
+ onGetHighlights,
  }: {
  query: string;
  links: string[];
+ onGetHighlights: t.SearchToolConfig['onGetHighlights'];
  }): Promise<Array<t.ScrapeResult>> => {
-
+ logger_.debug(`Scraping ${links.length} links with Firecrawl`);
  const promises: Array<Promise<t.ScrapeResult>> = [];
  try {
- for (
+ for (let i = 0; i < links.length; i++) {
+ const currentLink = links[i];
  const promise: Promise<t.ScrapeResult> = firecrawlScraper
  .scrapeUrl(currentLink, {})
  .then(([url, response]) => {
- const attribution = getAttribution(
+ const attribution = getAttribution(
+ url,
+ response.data?.metadata,
+ logger_
+ );
  if (response.success && response.data) {
- const content =
+ const [content, references] =
+ firecrawlScraper.extractContent(response);
  return {
  url,
+ references,
  attribution,
  content: chunker.cleanText(content),
- };
+ } as t.ScrapeResult;
  }

  return {
  url,
  attribution,
  error: true,
- content:
- };
+ content: '',
+ } as t.ScrapeResult;
  })
  .then(async (result) => {
  try {
  if (result.error != null) {
-
- `Error scraping ${result.url}: ${result.content}
+ logger_.error(
+ `Error scraping ${result.url}: ${result.content}`,
+ result.error
  );
  return {
  ...result,
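The logger threaded through this hunk, a logger field on the destructured config with a createDefaultLogger() fallback, replaces the direct console calls the old file relied on (note the removed /* eslint-disable no-console */ at the top). A minimal sketch of an object that could be supplied there, assuming t.Logger is structurally satisfied by the debug, warn, and error methods this file actually calls; the real interface lives in ./types and may carry more.

// Console-backed logger for illustration; debug/warn/error are the only
// methods invoked by search.ts in this diff, so only those are provided.
const searchLogger = {
  debug: (...args: unknown[]): void => console.debug('[search]', ...args),
  warn: (...args: unknown[]): void => console.warn('[search]', ...args),
  error: (...args: unknown[]): void => console.error('[search]', ...args),
};

Passing such an object as the config's logger bypasses the createDefaultLogger() fallback shown above.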
@@ -395,31 +433,35 @@ export const createSourceProcessor = (
  query,
  reranker,
  content: result.content,
+ logger: logger_,
  });
+ if (onGetHighlights) {
+ onGetHighlights(result.url);
+ }
  return {
  ...result,
  highlights,
  };
  } catch (error) {
-
+ logger_.error('Error processing scraped content:', error);
  return {
  ...result,
  };
  }
  })
  .catch((error) => {
-
+ logger_.error(`Error scraping ${currentLink}:`, error);
  return {
  url: currentLink,
  error: true,
- content:
+ content: '',
  };
  });
  promises.push(promise);
  }
  return await Promise.all(promises);
  } catch (error) {
-
+ logger_.error('Error in scrapeMany:', error);
  return [];
  }
  },
@@ -429,35 +471,44 @@ export const createSourceProcessor = (
  links,
  query,
  target,
+ onGetHighlights,
  onContentScraped,
  }: {
  links: string[];
  query: string;
  target: number;
+ onGetHighlights: t.SearchToolConfig['onGetHighlights'];
  onContentScraped?: (link: string, update?: Partial<t.ValidSource>) => void;
  }): Promise<void> => {
  const initialLinks = links.slice(0, target);
  // const remainingLinks = links.slice(target).reverse();
- const results = await webScraper.scrapeMany({
+ const results = await webScraper.scrapeMany({
+ query,
+ links: initialLinks,
+ onGetHighlights,
+ });
  for (const result of results) {
  if (result.error === true) {
  continue;
  }
- const { url, content, attribution, highlights } = result;
+ const { url, content, attribution, references, highlights } = result;
  onContentScraped?.(url, {
  content,
  attribution,
+ references,
  highlights,
  });
  }
  };

- const processSources = async (
- result
- numElements
- query
-
-
+ const processSources = async ({
+ result,
+ numElements,
+ query,
+ news,
+ proMode = true,
+ onGetHighlights,
+ }: t.ProcessSourcesFields): Promise<t.SearchResultData> => {
  try {
  if (!result.data) {
  return {
@@ -485,6 +536,7 @@ export const createSourceProcessor = (
  await fetchContents({
  query,
  target: 1,
+ onGetHighlights,
  onContentScraped,
  links: [wikiSources[0].link],
  });
@@ -504,51 +556,69 @@ export const createSourceProcessor = (
  }

  const sourceMap = new Map<string, t.ValidSource>();
- const
+ const organicLinksSet = new Set<string>();

-
-
-
-
-
-
+ // Collect organic links
+ const organicLinks = collectLinks(
+ result.data.organic,
+ sourceMap,
+ organicLinksSet
+ );
+
+ // Collect top story links, excluding any that are already in organic links
+ const topStories = result.data.topStories ?? [];
+ const topStoryLinks = collectLinks(
+ topStories,
+ sourceMap,
+ organicLinksSet
+ );

- if (
+ if (organicLinks.length === 0 && (topStoryLinks.length === 0 || !news)) {
  return result.data;
  }

  const onContentScraped = createSourceUpdateCallback(sourceMap);
-
-
-
-
-
-
+ const promises: Promise<void>[] = [];
+
+ // Process organic links
+ if (organicLinks.length > 0) {
+ promises.push(
+ fetchContents({
+ query,
+ onGetHighlights,
+ onContentScraped,
+ links: organicLinks,
+ target: numElements,
+ })
+ );
+ }

-
-
-
-
-
-
-
-
-
+ // Process top story links
+ if (news && topStoryLinks.length > 0) {
+ promises.push(
+ fetchContents({
+ query,
+ onGetHighlights,
+ onContentScraped,
+ links: topStoryLinks,
+ target: numElements,
+ })
+ );
  }

-
- .filter(
- (source) =>
- source.content != null && !source.content.startsWith('Failed')
- )
- .slice(0, numElements);
+ await Promise.all(promises);

- if (
- result.data.organic
+ if (result.data.organic.length > 0) {
+ updateSourcesWithContent(result.data.organic, sourceMap);
  }
+
+ if (news && topStories.length > 0) {
+ updateSourcesWithContent(topStories, sourceMap);
+ }
+
  return result.data;
  } catch (error) {
-
+ logger_.error('Error in processSources:', error);
  return {
  organic: [],
  topStories: [],
@@ -565,3 +635,46 @@ export const createSourceProcessor = (
  topResults,
  };
  };
+
+ /** Helper function to collect links and update sourceMap */
+ function collectLinks(
+ sources: Array<t.OrganicResult | t.TopStoryResult>,
+ sourceMap: Map<string, t.ValidSource>,
+ existingLinksSet?: Set<string>
+ ): string[] {
+ const links: string[] = [];
+
+ for (const source of sources) {
+ if (source.link) {
+ // For topStories, only add if not already in organic links
+ if (existingLinksSet && existingLinksSet.has(source.link)) {
+ continue;
+ }
+
+ links.push(source.link);
+ if (existingLinksSet) {
+ existingLinksSet.add(source.link);
+ }
+ sourceMap.set(source.link, source as t.ValidSource);
+ }
+ }
+
+ return links;
+ }
+
+ /** Helper function to update sources with scraped content */
+ function updateSourcesWithContent<T extends t.ValidSource>(
+ sources: T[],
+ sourceMap: Map<string, t.ValidSource>
+ ): void {
+ for (let i = 0; i < sources.length; i++) {
+ const source = sources[i];
+ const updatedSource = sourceMap.get(source.link);
+ if (updatedSource) {
+ sources[i] = {
+ ...source,
+ ...updatedSource,
+ } as T;
+ }
+ }
+ }