@apmantza/greedysearch-pi 1.9.0 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/README.md +11 -1
- package/bin/launch-visible.mjs +65 -0
- package/bin/launch.mjs +442 -417
- package/bin/search.mjs +757 -679
- package/extractors/bing-copilot.mjs +490 -374
- package/extractors/common.mjs +703 -596
- package/extractors/consent.mjs +421 -388
- package/extractors/selectors.mjs +55 -54
- package/index.ts +176 -177
- package/package.json +8 -3
- package/skills/greedy-search/skill.md +5 -19
- package/src/fetcher.mjs +666 -652
- package/src/formatters/synthesis.ts +1 -5
- package/src/search/output.mjs +23 -1
- package/src/search/research.mjs +1581 -0
- package/src/search/sources.mjs +488 -466
- package/src/search/synthesis-runner.mjs +52 -46
- package/src/tools/greedy-search-handler.ts +298 -124
- package/test.mjs +971 -534
package/src/search/sources.mjs
CHANGED
|
@@ -1,466 +1,488 @@
|
|
|
1
|
-
// src/search/sources.mjs — Source registry, URL normalization, domain inference, classification
|
|
2
|
-
//
|
|
3
|
-
// Responsible for: deduplicating sources across engines, normalizing URLs,
|
|
4
|
-
// classifying source types, inferring preferred domains from queries, and
|
|
5
|
-
// merging fetch data into source objects.
|
|
6
|
-
|
|
7
|
-
export const TRACKING_PARAMS = [
|
|
8
|
-
"fbclid",
|
|
9
|
-
"gclid",
|
|
10
|
-
"ref",
|
|
11
|
-
"ref_src",
|
|
12
|
-
"ref_url",
|
|
13
|
-
"source",
|
|
14
|
-
"utm_campaign",
|
|
15
|
-
"utm_content",
|
|
16
|
-
"utm_medium",
|
|
17
|
-
"utm_source",
|
|
18
|
-
"utm_term",
|
|
19
|
-
];
|
|
20
|
-
|
|
21
|
-
export const COMMUNITY_HOSTS = [
|
|
22
|
-
"dev.to",
|
|
23
|
-
"hashnode.com",
|
|
24
|
-
"medium.com",
|
|
25
|
-
"reddit.com",
|
|
26
|
-
"stackoverflow.com",
|
|
27
|
-
"stackexchange.com",
|
|
28
|
-
"substack.com",
|
|
29
|
-
];
|
|
30
|
-
|
|
31
|
-
export const NEWS_HOSTS = [
|
|
32
|
-
"arstechnica.com",
|
|
33
|
-
"techcrunch.com",
|
|
34
|
-
"theverge.com",
|
|
35
|
-
"venturebeat.com",
|
|
36
|
-
"wired.com",
|
|
37
|
-
"zdnet.com",
|
|
38
|
-
];
|
|
39
|
-
|
|
40
|
-
export
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
export function
|
|
51
|
-
const clean =
|
|
52
|
-
if (
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
const
|
|
68
|
-
const
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
if (!
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
if (
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
}
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
const
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
const
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
//
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
1
|
+
// src/search/sources.mjs — Source registry, URL normalization, domain inference, classification
|
|
2
|
+
//
|
|
3
|
+
// Responsible for: deduplicating sources across engines, normalizing URLs,
|
|
4
|
+
// classifying source types, inferring preferred domains from queries, and
|
|
5
|
+
// merging fetch data into source objects.
|
|
6
|
+
|
|
7
|
+
/**
 * Query-string keys treated as tracking noise. normalizeUrl() compares
 * lower-cased parameter names against this list and removes matches.
 */
export const TRACKING_PARAMS = [
  "fbclid", "gclid",
  "ref", "ref_src", "ref_url", "source",
  "utm_campaign", "utm_content", "utm_medium", "utm_source", "utm_term",
];

/** Hosts classified as "community" sources (blogging platforms, forums, Q&A). */
export const COMMUNITY_HOSTS = [
  "dev.to", "hashnode.com", "medium.com", "reddit.com",
  "stackoverflow.com", "stackexchange.com", "substack.com",
];

/** Hosts classified as "news" sources. */
export const NEWS_HOSTS = [
  "arstechnica.com", "techcrunch.com", "theverge.com",
  "venturebeat.com", "wired.com", "zdnet.com",
];

/** Hosts classified as "social" sources. */
export const SOCIAL_HOSTS = [
  "facebook.com", "instagram.com", "linkedin.com", "pinterest.com",
  "tiktok.com", "twitter.com", "x.com",
];
|
|
49
|
+
|
|
50
|
+
/**
 * Collapse whitespace and truncate text to a readable length.
 *
 * Every run of whitespace becomes a single space and the result is trimmed.
 * Text longer than `maxChars` is cut at the last space inside the limit
 * (or hard-cut when there is none) and suffixed with "...", so a truncated
 * result can be up to three characters longer than `maxChars`.
 *
 * @param {unknown} [text=""] - Value to clean; null/undefined count as empty.
 * @param {number} [maxChars=240] - Length above which the text is truncated.
 * @returns {string} The cleaned, possibly truncated text.
 */
export function trimText(text = "", maxChars = 240) {
  // Default parameters only cover `undefined`; without `??` a null value
  // would be stringified to the literal text "null".
  const clean = String(text ?? "")
    .replace(/\s+/g, " ")
    .trim();
  if (clean.length <= maxChars) return clean;

  const truncated = clean.slice(0, maxChars);
  const lastSpace = truncated.lastIndexOf(" ");
  // Prefer ending on a word boundary; fall back to the raw cut.
  const kept = lastSpace > 0 ? truncated.slice(0, lastSpace) : truncated;
  return `${kept}...`;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Clean a source title and reject values that are not usable as titles.
 *
 * The title is whitespace-collapsed and capped via trimText(title, 180).
 * An empty string is returned when the cleaned title is empty, looks like an
 * http(s) URL, or looks like a link-text fragment: at most four words, all
 * lower-case, with no digits.
 *
 * @param {string} [title=""] - Raw title as reported by an engine or fetcher.
 * @returns {string} The cleaned title, or "" when it should be ignored.
 */
export function normalizeSourceTitle(title = "") {
  const clean = trimText(title, 180);
  if (!clean) return "";
  if (/^https?:\/\//i.test(clean)) return "";

  const wordCount = clean.split(/\s+/).filter(Boolean).length;
  const isAllLowercase = clean === clean.toLowerCase();
  const hasDigit = /\d/.test(clean);
  // Letters from caseless scripts (CJK, Arabic, Hebrew, ...) always equal
  // their lower-cased form, so "all lower-case" says nothing about them.
  // Without this check every short title in such a script was discarded.
  const hasCaselessLetter = /\p{Lo}/u.test(clean);

  const looksLikeFragment =
    isAllLowercase && wordCount <= 4 && !hasDigit && !hasCaselessLetter;
  return looksLikeFragment ? "" : clean;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Choose the better of two candidate titles for the same source.
 *
 * Both candidates are first passed through normalizeSourceTitle(), which
 * returns "" for empty, URL-looking, or fragment-like titles. If only one
 * candidate survives it wins; otherwise the longer one wins and a tie keeps
 * the current title.
 *
 * @param {string} [currentTitle=""] - Title already stored for the source.
 * @param {string} [nextTitle=""] - Newly observed title.
 * @returns {string} The preferred normalized title ("" when neither is usable).
 */
export function pickPreferredTitle(currentTitle = "", nextTitle = "") {
  const current = normalizeSourceTitle(currentTitle);
  const next = normalizeSourceTitle(nextTitle);
  if (!next) return current;
  if (!current) return next;
  // No URL check is needed here: normalizeSourceTitle() has already turned
  // URL-looking titles into "", so both candidates are real text.
  return next.length > current.length ? next : current;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Produce a canonical form of an http(s) URL for de-duplication.
 *
 * The fragment is removed, the hostname lower-cased, default ports cleared,
 * tracking parameters (TRACKING_PARAMS and anything starting with "utm_")
 * deleted, the remaining parameters sorted, and trailing slashes stripped
 * from the path.
 *
 * @param {string} rawUrl - URL as reported by a search engine.
 * @returns {string|null} Canonical URL, or null for empty, unparsable, or
 *   non-http(s) input.
 */
export function normalizeUrl(rawUrl) {
  if (!rawUrl) return null;

  try {
    const parsed = new URL(rawUrl);
    const isHttp = parsed.protocol === "http:";
    const isHttps = parsed.protocol === "https:";
    if (!isHttp && !isHttps) return null;

    parsed.hash = "";
    parsed.hostname = parsed.hostname.toLowerCase();

    const usesDefaultPort =
      (isHttps && parsed.port === "443") || (isHttp && parsed.port === "80");
    if (usesDefaultPort) parsed.port = "";

    // Snapshot the keys first so deleting does not disturb the iteration.
    const paramNames = Array.from(parsed.searchParams.keys());
    for (const name of paramNames) {
      const lowered = name.toLowerCase();
      const isTracking =
        TRACKING_PARAMS.includes(lowered) || lowered.startsWith("utm_");
      if (isTracking) parsed.searchParams.delete(name);
    }
    parsed.searchParams.sort();

    const path = parsed.pathname.replace(/\/{1,10}$/, "") || "/";
    parsed.pathname = path;

    const href = parsed.toString();
    if (path !== "/") return href;
    // Root path: drop the lone trailing slash when it ends the string.
    return href.replace(/\/$/, "");
  } catch {
    return null;
  }
}
|
|
116
|
+
|
|
117
|
+
/**
 * Extract the lower-cased hostname of a URL without a leading "www.".
 *
 * @param {string} rawUrl - Absolute URL.
 * @returns {string} The bare hostname, or "" when the URL cannot be parsed.
 */
export function getDomain(rawUrl) {
  let hostname;
  try {
    hostname = new URL(rawUrl).hostname;
  } catch {
    return "";
  }
  return hostname.toLowerCase().replace(/^www\./, "");
}
|
|
125
|
+
|
|
126
|
+
/**
 * Report whether a domain equals, or is a subdomain of, any listed host.
 *
 * @param {string} domain - Hostname to test.
 * @param {string[]} hosts - Registrable hosts to compare against.
 * @returns {boolean} True on the first exact or dot-suffix match.
 */
export function matchesDomain(domain, hosts) {
  for (const host of hosts) {
    if (domain === host) return true;
    if (domain.endsWith(`.${host}`)) return true;
  }
  return false;
}
|
|
129
|
+
|
|
130
|
+
/**
 * Classify a source into one of: "repo", "social", "community", "news",
 * "official-docs", "maintainer-blog", or "website".
 *
 * Checks run in that order and the first match wins. Documentation is
 * detected from the host prefix (docs./developer./developers./api.), from
 * title keywords, or from a `docs`, `reference` or `api` path segment.
 *
 * @param {string} domain - Bare hostname of the source.
 * @param {string} [title=""] - Source title.
 * @param {string} [rawUrl=""] - Source URL (typically already canonicalized).
 * @returns {string} The source type label.
 */
export function classifySourceType(domain, title = "", rawUrl = "") {
  // `??` guards explicit nulls, which default parameters do not cover.
  const lowerTitle = String(title ?? "").toLowerCase();
  const lowerUrl = String(rawUrl ?? "").toLowerCase();

  if (domain === "github.com" || domain === "gitlab.com") return "repo";
  if (matchesDomain(domain, SOCIAL_HOSTS)) return "social";
  if (matchesDomain(domain, COMMUNITY_HOSTS)) return "community";
  if (matchesDomain(domain, NEWS_HOSTS)) return "news";

  const hasDocsHost = ["docs.", "developer.", "developers.", "api."].some(
    (prefix) => domain.startsWith(prefix),
  );
  const hasDocsTitle = ["documentation", "docs", "reference"].some((word) =>
    lowerTitle.includes(word),
  );
  // Match the segment whether or not anything follows it. Canonical URLs have
  // their trailing slash stripped, so ".../docs" must count just like
  // ".../docs/intro"; a plain "/docs/" substring test misses landing pages.
  const hasDocsPath = /\/(?:docs|reference|api)(?:[/?#]|$)/.test(lowerUrl);
  if (hasDocsHost || hasDocsTitle || hasDocsPath) return "official-docs";

  const hasBlogPath = /\/blog(?:[/?#]|$)/.test(lowerUrl);
  if (domain.startsWith("blog.") || hasBlogPath) return "maintainer-blog";

  return "website";
}
|
|
156
|
+
|
|
157
|
+
// Ranking weight per source type; anything not listed weighs 0.
const SOURCE_TYPE_PRIORITIES = new Map([
  ["official-docs", 5],
  ["repo", 4],
  ["maintainer-blog", 3],
  ["website", 2],
  ["community", 1],
  ["news", 0],
  ["social", -6],
]);

/**
 * Look up the ranking weight of a source type.
 *
 * @param {string} sourceType - Label such as "official-docs" or "social".
 * @returns {number} The weight, or 0 for an unrecognized type.
 */
export function sourceTypePriority(sourceType) {
  return SOURCE_TYPE_PRIORITIES.get(sourceType) ?? 0;
}
|
|
177
|
+
|
|
178
|
+
/**
 * Find the best (lowest) rank any engine gave a source.
 *
 * @param {{perEngine?: Object<string, {rank?: number}>}} source - Registry
 *   entry; each `perEngine` value may carry a `rank`.
 * @returns {number} The lowest rank, with 99 standing in for a missing or
 *   falsy rank and for a source with no per-engine data at all.
 */
export function bestRank(source) {
  const entries = Object.values(source.perEngine || {});
  if (entries.length === 0) return 99;

  let lowest = Infinity;
  for (const entry of entries) {
    lowest = Math.min(lowest, entry?.rank || 99);
  }
  return lowest;
}
|
|
182
|
+
|
|
183
|
+
// Hosts that carry discussion threads only. They receive a heavier penalty
// than other community hosts. Q&A sites (stackoverflow, stackexchange) are
// left out on purpose.
const DISCUSSION_HOSTS = [
  "reddit.com",
  "news.ycombinator.com",
  "lobste.rs",
];
|
|
186
|
+
|
|
187
|
+
/**
 * Fold every ranking signal of a source into one sortable number, so all
 * signals contribute at once instead of acting as cascading tiebreakers.
 *
 * score = smartScore * 3
 *       + engineCount * 5
 *       + sourceTypePriority(sourceType) * 2
 *       + max(0, 7 - bestRank(source))
 *
 * The weights are tuned so that a query-relevant official source ranked
 * first by a single engine outranks multi-engine consensus on generic sites,
 * while multi-engine consensus still outranks a single-engine community post.
 *
 * @param {object} source - Registry entry with `smartScore`, `engineCount`,
 *   `sourceType` and `perEngine`.
 * @returns {number} The composite score; higher sorts first.
 */
export function computeCompositeScore(source) {
  const relevancePoints = source.smartScore * 3;
  const consensusPoints = source.engineCount * 5;
  const typePoints = sourceTypePriority(source.sourceType) * 2;
  const rankPoints = Math.max(0, 7 - bestRank(source));
  return relevancePoints + consensusPoints + typePoints + rankPoints;
}
|
|
201
|
+
|
|
202
|
+
// Keyword -> official-domain rules, listed in the order domains are reported.
// A string keyword matches as a substring of the lower-cased query; a RegExp
// keyword is tested against it. Short names that occur inside common words
// ("bun" in "bundle", "ts" in "tests", "aws" in "laws") are matched as whole
// words, like the pre-existing `\bgo\b` rule; every other keyword keeps its
// substring semantics.
const PREFERRED_DOMAIN_RULES = [
  { keywords: ["openai", "gpt"], domains: ["openai.com", "platform.openai.com", "help.openai.com"] },
  { keywords: ["anthropic", "claude"], domains: ["anthropic.com", "docs.anthropic.com"] },
  { keywords: [/\bbun(?:js)?\b/], domains: ["bun.sh", "bun.com"] },
  { keywords: ["next.js", "nextjs"], domains: ["nextjs.org", "vercel.com"] },
  { keywords: ["playwright"], domains: ["playwright.dev"] },
  { keywords: ["supabase"], domains: ["supabase.com", "supabase.io"] },
  { keywords: ["prisma"], domains: ["prisma.io"] },
  { keywords: ["tailwind"], domains: ["tailwindcss.com"] },
  { keywords: ["vite"], domains: ["vitejs.dev", "vite.dev"] },
  { keywords: ["astro"], domains: ["astro.build"] },
  { keywords: ["svelte"], domains: ["svelte.dev"] },
  { keywords: ["solid"], domains: ["solidjs.com"] },
  { keywords: ["vue", "nuxt"], domains: ["vuejs.org", "nuxt.com"] },
  { keywords: ["react"], domains: ["react.dev", "reactnative.dev"] },
  { keywords: ["angular"], domains: ["angular.io", "angular.dev"] },
  { keywords: ["node.js", "nodejs"], domains: ["nodejs.org", "nodejs.dev", "npmjs.com"] },
  { keywords: [/\bgo\b/, "golang"], domains: ["go.dev", "golang.org", "pkg.go.dev"] },
  { keywords: ["deno"], domains: ["deno.land", "deno.com"] },
  { keywords: ["fresh"], domains: ["fresh.deno.dev"] },
  { keywords: ["typescript", /\b(?:ts|tsx|tsconfig)\b/], domains: ["typescriptlang.org"] },
  { keywords: ["python"], domains: ["python.org", "docs.python.org"] },
  { keywords: ["rust"], domains: ["rust-lang.org", "docs.rs", "crates.io"] },
  { keywords: ["zig"], domains: ["ziglang.org"] },
  { keywords: ["docker"], domains: ["docker.com", "docs.docker.com", "hub.docker.com"] },
  { keywords: ["kubernetes", "k8s"], domains: ["kubernetes.io", "k8s.io"] },
  { keywords: ["postgres"], domains: ["postgresql.org", "neon.tech", "supabase.com"] },
  { keywords: ["redis"], domains: ["redis.io"] },
  { keywords: ["sqlite"], domains: ["sqlite.org"] },
  { keywords: ["cloudflare"], domains: ["developers.cloudflare.com", "cloudflare.com"] },
  { keywords: ["vercel"], domains: ["vercel.com", "nextjs.org"] },
  { keywords: ["netlify"], domains: ["netlify.com", "docs.netlify.com"] },
  { keywords: ["stripe"], domains: ["stripe.com", "docs.stripe.com"] },
  { keywords: ["github"], domains: ["github.com", "docs.github.com"] },
  { keywords: ["gitlab"], domains: ["gitlab.com", "docs.gitlab.com"] },
  { keywords: [/\baws\b/], domains: ["aws.amazon.com", "docs.aws.amazon.com"] },
  { keywords: ["azure"], domains: ["azure.microsoft.com", "learn.microsoft.com"] },
  { keywords: ["gcp", "google cloud"], domains: ["cloud.google.com", "developers.google.com"] },
  { keywords: ["gemini", "google ai"], domains: ["ai.google.dev", "developers.google.com"] },
];

/**
 * Infer which domains are authoritative for a query from the technologies
 * and platforms it names.
 *
 * Besides the keyword rules above, a social host from SOCIAL_HOSTS is added
 * when the query names it. Names of three or more characters ("twitter",
 * "linkedin", ...) match as substrings. Shorter ones (currently only "x")
 * must appear as the full hostname ("x.com") standing on its own; otherwise
 * every query containing the letter "x" would mark x.com as preferred.
 *
 * @param {string} query - Free-text search query; null/undefined count as "".
 * @returns {string[]} Unique preferred domains in rule order, social hosts last.
 */
export function inferPreferredDomains(query) {
  const normalized = String(query ?? "").toLowerCase();
  const matches = new Set();

  const mentions = (keyword) =>
    typeof keyword === "string"
      ? normalized.includes(keyword)
      : keyword.test(normalized);

  for (const { keywords, domains } of PREFERRED_DOMAIN_RULES) {
    if (!keywords.some((keyword) => mentions(keyword))) continue;
    for (const domain of domains) matches.add(domain);
  }

  for (const socialHost of SOCIAL_HOSTS) {
    const bareName = socialHost.replace(/\.com$/, "");
    let named;
    if (bareName.length >= 3) {
      named = normalized.includes(bareName);
    } else {
      // Require the hostname as its own token: "x.com" or "www.x.com" match,
      // "netflix.com" does not.
      const escapedHost = socialHost.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      named = new RegExp(`(?<![\\w-])${escapedHost}(?![\\w-])`).test(normalized);
    }
    if (named) matches.add(socialHost);
  }

  return [...matches];
}
|
|
331
|
+
|
|
332
|
+
/**
 * Report whether a hostname is the candidate domain or one of its subdomains.
 *
 * @param {string} hostname - Hostname to test.
 * @param {string} candidate - Domain to compare against.
 * @returns {boolean} True for an exact match or a ".candidate" suffix.
 */
export function domainMatches(hostname, candidate) {
  if (hostname === candidate) return true;
  const subdomainSuffix = `.${candidate}`;
  return hostname.endsWith(subdomainSuffix);
}
|
|
335
|
+
|
|
336
|
+
// Relevance adjustment for one source: boosts for query-preferred domains,
// docs-type sources and docs-like URL paths; penalties for social and
// discussion/community hosts. May be negative.
function computeSmartScore(domain, sourceType, canonicalUrl, preferredDomains) {
  let score = 0;

  const onPreferredDomain = preferredDomains.some((pd) =>
    domainMatches(domain, pd),
  );
  if (onPreferredDomain) score += 10; // strong boost: query-relevant official site
  if (sourceType === "official-docs") score += 3;
  if (
    /\/docs\/|\/documentation\/|\.dev\/|\/api\/|\/reference\//.test(
      canonicalUrl.toLowerCase(),
    )
  ) {
    score += 2;
  }

  // Social hosts are penalized unless the query itself targets that host.
  if (sourceType === "social" && !onPreferredDomain) score -= 12;

  // When the query names a known technology, discussion threads and other
  // community posts are demoted. Q&A sites are exempt.
  if (preferredDomains.length > 0) {
    if (matchesDomain(domain, DISCUSSION_HOSTS)) {
      score -= 3;
    } else if (
      sourceType === "community" &&
      !matchesDomain(domain, ["stackoverflow.com", "stackexchange.com"])
    ) {
      score -= 1;
    }
  }
  return score;
}

/**
 * Merge the sources reported by each engine into one ranked, de-duplicated
 * registry.
 *
 * Engines are read in the fixed order perplexity, bing, google. Sources are
 * keyed by their canonical URL; for each one the registry records which
 * engines cited it, the rank each engine gave it, the preferred title, a
 * source type, and a smart score. Entries are sorted by
 * computeCompositeScore() (ties broken by domain), capped at 12, and given
 * ids "S1".."Sn".
 *
 * @param {Object<string, {sources?: Array<{url: string, title?: string}>}>} out
 *   Per-engine results keyed by engine name.
 * @param {string} [query=""] - Original query, used to infer preferred domains.
 * @returns {object[]} Ranked registry entries.
 */
export function buildSourceRegistry(out, query = "") {
  const seen = new Map();
  const engineOrder = ["perplexity", "bing", "google"];
  const preferredDomains = inferPreferredDomains(query);

  for (const engine of engineOrder) {
    const result = out?.[engine];
    if (!result?.sources) continue;

    for (let i = 0; i < result.sources.length; i++) {
      const source = result.sources[i];
      const canonicalUrl = normalizeUrl(source?.url);
      if (!canonicalUrl || canonicalUrl.length < 10) continue;

      const title = normalizeSourceTitle(source.title || "");
      const domain = getDomain(canonicalUrl);
      const sourceType = classifySourceType(domain, title, canonicalUrl);
      const smartScore = computeSmartScore(
        domain,
        sourceType,
        canonicalUrl,
        preferredDomains,
      );

      let entry = seen.get(canonicalUrl);
      if (!entry) {
        entry = {
          id: "",
          canonicalUrl,
          displayUrl: source.url || canonicalUrl,
          domain,
          title: "",
          engines: [],
          engineCount: 0,
          perEngine: {},
          sourceType,
          isOfficial: sourceType === "official-docs",
          // Seed with the computed score, not 0: the Math.max merge below
          // would otherwise clamp every penalty (negative score) to zero.
          smartScore,
        };
        seen.set(canonicalUrl, entry);
      }

      entry.title = pickPreferredTitle(entry.title, title);
      entry.isOfficial = entry.isOfficial || sourceType === "official-docs";
      entry.smartScore = Math.max(entry.smartScore, smartScore);

      if (!entry.engines.includes(engine)) {
        entry.engines.push(engine);
      }
      const previous = entry.perEngine[engine];
      entry.perEngine[engine] = {
        // Sources are visited in rank order, so when an engine lists the same
        // canonical URL twice the first (better) rank is the one to keep.
        rank: previous?.rank ?? i + 1,
        title: pickPreferredTitle(previous?.title || "", title),
      };
    }
  }

  return Array.from(seen.values())
    .map((entry) => ({ ...entry, engineCount: entry.engines.length }))
    .sort((a, b) => {
      // One composite score lets rank, consensus, type and relevance all
      // contribute; domain order keeps ties deterministic.
      const diff = computeCompositeScore(b) - computeCompositeScore(a);
      if (diff !== 0) return diff;
      return a.domain.localeCompare(b.domain);
    })
    .slice(0, 12)
    .map((entry, index) => ({
      ...entry,
      id: `S${index + 1}`,
      title: entry.title || entry.domain || entry.canonicalUrl,
    }));
}
|
|
457
|
+
|
|
458
|
+
/**
 * Attach fetch results to the registry sources they belong to.
 *
 * Fetched records are matched to sources by `id`. A source without a fetched
 * record is returned unchanged (same object). A matched source is copied,
 * its title upgraded through pickPreferredTitle(), and given a `fetch`
 * summary in which missing fields fall back to empty defaults.
 *
 * @param {object[]} sources - Registry entries.
 * @param {object[]} fetchedSources - Fetch results, each carrying an `id`.
 * @returns {object[]} Sources in their original order, enriched where possible.
 */
export function mergeFetchDataIntoSources(sources, fetchedSources) {
  const fetchedById = new Map();
  for (const record of fetchedSources) {
    fetchedById.set(record.id, record);
  }

  // Summarize one fetch record, substituting defaults for absent fields.
  const summarizeFetch = (record, fallbackUrl) => ({
    attempted: true,
    ok: !record.error && record.contentChars > 100,
    status: record.status || null,
    finalUrl: record.finalUrl || record.url || fallbackUrl,
    contentType: record.contentType || "",
    lastModified: record.lastModified || "",
    publishedTime: record.publishedTime || "",
    byline: record.byline || "",
    siteName: record.siteName || "",
    lang: record.lang || "",
    title: record.title || "",
    snippet: record.snippet || "",
    contentChars: record.contentChars || 0,
    source: record.source || "unknown", // "http" | "browser"
    duration: record.duration || 0,
    error: record.error || "",
  });

  return sources.map((source) => {
    const record = fetchedById.get(source.id);
    if (!record) return source;

    const betterTitle = pickPreferredTitle(source.title, record.title || "");
    return {
      ...source,
      title: betterTitle || source.title,
      fetch: summarizeFetch(record, source.canonicalUrl),
    };
  });
}
|