@ahkohd/yagami 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.beads/.beads-credential-key +1 -0
- package/.beads/README.md +81 -0
- package/.beads/config.yaml +54 -0
- package/.beads/hooks/post-checkout +24 -0
- package/.beads/hooks/post-merge +24 -0
- package/.beads/hooks/pre-commit +24 -0
- package/.beads/hooks/pre-push +24 -0
- package/.beads/hooks/prepare-commit-msg +24 -0
- package/.beads/metadata.json +7 -0
- package/.github/workflows/ci.yml +43 -0
- package/.github/workflows/release.yml +115 -0
- package/AGENTS.md +150 -0
- package/README.md +210 -0
- package/biome.json +36 -0
- package/config/mcporter.json +8 -0
- package/dist/cli/theme.js +202 -0
- package/dist/cli/theme.js.map +1 -0
- package/dist/cli.js +1883 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.js +223 -0
- package/dist/config.js.map +1 -0
- package/dist/daemon.js +745 -0
- package/dist/daemon.js.map +1 -0
- package/dist/engine/constants.js +131 -0
- package/dist/engine/constants.js.map +1 -0
- package/dist/engine/deep-research.js +167 -0
- package/dist/engine/deep-research.js.map +1 -0
- package/dist/engine/defuddle-utils.js +57 -0
- package/dist/engine/defuddle-utils.js.map +1 -0
- package/dist/engine/github-fetch.js +232 -0
- package/dist/engine/github-fetch.js.map +1 -0
- package/dist/engine/helpers.js +372 -0
- package/dist/engine/helpers.js.map +1 -0
- package/dist/engine/limiter.js +75 -0
- package/dist/engine/limiter.js.map +1 -0
- package/dist/engine/policy.js +313 -0
- package/dist/engine/policy.js.map +1 -0
- package/dist/engine/runtime-utils.js +65 -0
- package/dist/engine/runtime-utils.js.map +1 -0
- package/dist/engine/search-discovery.js +275 -0
- package/dist/engine/search-discovery.js.map +1 -0
- package/dist/engine/url-utils.js +72 -0
- package/dist/engine/url-utils.js.map +1 -0
- package/dist/engine.js +2030 -0
- package/dist/engine.js.map +1 -0
- package/dist/mcp.js +282 -0
- package/dist/mcp.js.map +1 -0
- package/dist/types/cli.js +2 -0
- package/dist/types/cli.js.map +1 -0
- package/dist/types/config.js +2 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/daemon.js +2 -0
- package/dist/types/daemon.js.map +1 -0
- package/dist/types/engine.js +2 -0
- package/dist/types/engine.js.map +1 -0
- package/package.json +66 -0
- package/packages/pi-yagami-search/README.md +39 -0
- package/packages/pi-yagami-search/extensions/yagami-search.ts +273 -0
- package/packages/pi-yagami-search/package.json +41 -0
- package/src/cli/theme.ts +260 -0
- package/src/cli.ts +2226 -0
- package/src/config.ts +250 -0
- package/src/daemon.ts +990 -0
- package/src/engine/constants.ts +147 -0
- package/src/engine/deep-research.ts +207 -0
- package/src/engine/defuddle-utils.ts +75 -0
- package/src/engine/github-fetch.ts +265 -0
- package/src/engine/helpers.ts +394 -0
- package/src/engine/limiter.ts +97 -0
- package/src/engine/policy.ts +392 -0
- package/src/engine/runtime-utils.ts +79 -0
- package/src/engine/search-discovery.ts +351 -0
- package/src/engine/url-utils.ts +86 -0
- package/src/engine.ts +2516 -0
- package/src/mcp.ts +337 -0
- package/src/shims-cli.d.ts +3 -0
- package/src/types/cli.ts +7 -0
- package/src/types/config.ts +53 -0
- package/src/types/daemon.ts +22 -0
- package/src/types/engine.ts +194 -0
- package/tsconfig.json +18 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
import {
|
|
2
|
+
categoryProfile,
|
|
3
|
+
clampInteger,
|
|
4
|
+
domainMatches,
|
|
5
|
+
getHostname,
|
|
6
|
+
isTrackingOrAdUrl,
|
|
7
|
+
isValidPublicHostname,
|
|
8
|
+
normalizeDomainFilter,
|
|
9
|
+
normalizeWhitespace,
|
|
10
|
+
parseStringList,
|
|
11
|
+
stripHtml,
|
|
12
|
+
unwrapDuckDuckGoHref,
|
|
13
|
+
} from "./helpers.js";
|
|
14
|
+
import { normalizeUrl } from "./url-utils.js";
|
|
15
|
+
import type { SearchEnginePreset } from "../types/config.js";
|
|
16
|
+
|
|
17
|
+
const SEARCH_ENGINE_PRESETS: Readonly<Record<Exclude<SearchEnginePreset, "custom">, string>> = {
|
|
18
|
+
duckduckgo: "https://duckduckgo.com/html/?q={query}",
|
|
19
|
+
bing: "https://www.bing.com/search?q={query}",
|
|
20
|
+
google: "https://www.google.com/search?q={query}",
|
|
21
|
+
brave: "https://search.brave.com/search?q={query}",
|
|
22
|
+
};
|
|
23
|
+
|
|
24
|
+
function normalizeSearchEngine(value: unknown, fallback: SearchEnginePreset = "duckduckgo"): SearchEnginePreset {
|
|
25
|
+
const normalized = String(value ?? "")
|
|
26
|
+
.trim()
|
|
27
|
+
.toLowerCase();
|
|
28
|
+
|
|
29
|
+
if (normalized === "duckduckgo") return "duckduckgo";
|
|
30
|
+
if (normalized === "bing") return "bing";
|
|
31
|
+
if (normalized === "google") return "google";
|
|
32
|
+
if (normalized === "brave") return "brave";
|
|
33
|
+
if (normalized === "custom") return "custom";
|
|
34
|
+
|
|
35
|
+
return fallback;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
export function resolveSearchEngineTemplate(
|
|
39
|
+
engine: SearchEnginePreset,
|
|
40
|
+
customTemplate: unknown,
|
|
41
|
+
): { engine: SearchEnginePreset; template: string } {
|
|
42
|
+
const normalizedTemplate = String(customTemplate ?? "").trim();
|
|
43
|
+
|
|
44
|
+
if (normalizedTemplate) {
|
|
45
|
+
return {
|
|
46
|
+
engine: "custom",
|
|
47
|
+
template: normalizedTemplate,
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
if (engine !== "custom") {
|
|
52
|
+
return {
|
|
53
|
+
engine,
|
|
54
|
+
template: SEARCH_ENGINE_PRESETS[engine],
|
|
55
|
+
};
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
return {
|
|
59
|
+
engine: "duckduckgo",
|
|
60
|
+
template: SEARCH_ENGINE_PRESETS.duckduckgo,
|
|
61
|
+
};
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
function buildSearchUrlFromTemplate(template: string, query: string): string {
|
|
65
|
+
const encodedQuery = encodeURIComponent(query);
|
|
66
|
+
|
|
67
|
+
if (template.includes("{query}")) {
|
|
68
|
+
return template.replace(/\{query\}/g, encodedQuery);
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
if (template.includes("%s")) {
|
|
72
|
+
return template.replace(/%s/g, encodedQuery);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
try {
|
|
76
|
+
const url = new URL(template);
|
|
77
|
+
url.searchParams.set("q", query);
|
|
78
|
+
return url.toString();
|
|
79
|
+
} catch {
|
|
80
|
+
return `${template}${encodedQuery}`;
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
export async function parseDuckDuckGoResults(
|
|
85
|
+
html: string,
|
|
86
|
+
options: { limit?: number } = {},
|
|
87
|
+
): Promise<Array<Record<string, unknown>>> {
|
|
88
|
+
const limit = options.limit ?? 40;
|
|
89
|
+
const { parseHTML } = (await import("linkedom")) as unknown as {
|
|
90
|
+
parseHTML: (raw: string) => { document: unknown };
|
|
91
|
+
};
|
|
92
|
+
|
|
93
|
+
const { document } = parseHTML(html || "") as { document: Record<string, unknown> };
|
|
94
|
+
|
|
95
|
+
const results: Array<Record<string, unknown>> = [];
|
|
96
|
+
const seen = new Set<string>();
|
|
97
|
+
const resultNodes = Array.from(
|
|
98
|
+
((document.querySelectorAll as (selector: string) => unknown[])?.call(document, ".result") || []) as unknown[],
|
|
99
|
+
) as Array<Record<string, unknown>>;
|
|
100
|
+
|
|
101
|
+
for (const node of resultNodes) {
|
|
102
|
+
const querySelector = node.querySelector as ((selector: string) => Record<string, unknown> | null) | undefined;
|
|
103
|
+
if (!querySelector) continue;
|
|
104
|
+
|
|
105
|
+
const link =
|
|
106
|
+
querySelector.call(node, "a.result__a") ||
|
|
107
|
+
querySelector.call(node, "h2 a") ||
|
|
108
|
+
querySelector.call(node, "a[href]");
|
|
109
|
+
|
|
110
|
+
if (!link) continue;
|
|
111
|
+
|
|
112
|
+
const getAttribute = link.getAttribute as ((name: string) => string | null) | undefined;
|
|
113
|
+
const href = unwrapDuckDuckGoHref((getAttribute?.call(link, "href") || "") as string);
|
|
114
|
+
let normalized: string;
|
|
115
|
+
try {
|
|
116
|
+
normalized = normalizeUrl(href);
|
|
117
|
+
} catch {
|
|
118
|
+
continue;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
const hostname = getHostname(normalized);
|
|
122
|
+
if (!isValidPublicHostname(hostname)) continue;
|
|
123
|
+
if (isTrackingOrAdUrl(normalized)) continue;
|
|
124
|
+
if (seen.has(normalized)) continue;
|
|
125
|
+
seen.add(normalized);
|
|
126
|
+
|
|
127
|
+
const textContent = link.textContent as string | undefined;
|
|
128
|
+
const innerHTML = link.innerHTML as string | undefined;
|
|
129
|
+
const title = normalizeWhitespace(textContent || stripHtml(innerHTML || ""));
|
|
130
|
+
|
|
131
|
+
const snippetNode =
|
|
132
|
+
querySelector.call(node, ".result__snippet") ||
|
|
133
|
+
querySelector.call(node, ".result-snippet") ||
|
|
134
|
+
querySelector.call(node, ".result__extras");
|
|
135
|
+
const snippet = normalizeWhitespace(String(snippetNode?.textContent || ""));
|
|
136
|
+
|
|
137
|
+
results.push({
|
|
138
|
+
rank: results.length + 1,
|
|
139
|
+
url: normalized,
|
|
140
|
+
title,
|
|
141
|
+
snippet,
|
|
142
|
+
domain: hostname,
|
|
143
|
+
});
|
|
144
|
+
|
|
145
|
+
if (results.length >= limit) break;
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
if (results.length > 0) {
|
|
149
|
+
return results;
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
const fallback: Array<Record<string, unknown>> = [];
|
|
153
|
+
const seenFallback = new Set<string>();
|
|
154
|
+
const regex = /<a[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi;
|
|
155
|
+
|
|
156
|
+
const htmlInput = html || "";
|
|
157
|
+
let match: RegExpExecArray | null = regex.exec(htmlInput);
|
|
158
|
+
while (match !== null) {
|
|
159
|
+
const href = unwrapDuckDuckGoHref(match[1]);
|
|
160
|
+
let normalized: string;
|
|
161
|
+
try {
|
|
162
|
+
normalized = normalizeUrl(href);
|
|
163
|
+
} catch {
|
|
164
|
+
match = regex.exec(htmlInput);
|
|
165
|
+
continue;
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
const hostname = getHostname(normalized);
|
|
169
|
+
if (!isValidPublicHostname(hostname)) {
|
|
170
|
+
match = regex.exec(htmlInput);
|
|
171
|
+
continue;
|
|
172
|
+
}
|
|
173
|
+
if (isTrackingOrAdUrl(normalized)) {
|
|
174
|
+
match = regex.exec(htmlInput);
|
|
175
|
+
continue;
|
|
176
|
+
}
|
|
177
|
+
if (seenFallback.has(normalized)) {
|
|
178
|
+
match = regex.exec(htmlInput);
|
|
179
|
+
continue;
|
|
180
|
+
}
|
|
181
|
+
seenFallback.add(normalized);
|
|
182
|
+
|
|
183
|
+
const title = stripHtml(match[2]);
|
|
184
|
+
if (!title) {
|
|
185
|
+
match = regex.exec(htmlInput);
|
|
186
|
+
continue;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
fallback.push({
|
|
190
|
+
rank: fallback.length + 1,
|
|
191
|
+
url: normalized,
|
|
192
|
+
title,
|
|
193
|
+
snippet: "",
|
|
194
|
+
domain: hostname,
|
|
195
|
+
});
|
|
196
|
+
|
|
197
|
+
if (fallback.length >= limit) break;
|
|
198
|
+
match = regex.exec(htmlInput);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
return fallback;
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
/**
 * Runs a web-search discovery pass: builds a search URL for the
 * requested engine, fetches and parses the results page, retries once
 * against DuckDuckGo when a non-DuckDuckGo attempt fails or returns too
 * few results, then applies category/domain/text filters and re-ranks.
 *
 * @param query   Raw user query.
 * @param options Loosely-typed bag: numResults, category, include/exclude
 *                domain and text lists, searchEngine, searchEngineUrlTemplate.
 * @returns Summary record with the effective query, engine/template used,
 *          filtered `results`, applied `filters`, and timing.
 * @throws  The last fetch/parse error when every attempt fails.
 */
export async function discoverSearchResults(
  query: string,
  options: Record<string, unknown> = {},
): Promise<Record<string, unknown>> {
  const startedAt = Date.now();

  // Clamp the requested result count into a sane range.
  const numResults = clampInteger(options.numResults, 12, { min: 1, max: 100 });
  const category = String(options.category || "")
    .trim()
    .toLowerCase();

  // User-supplied domain filters, normalized; empty entries dropped.
  const includeDomains = parseStringList(options.includeDomains)
    .map((value) => normalizeDomainFilter(value))
    .filter(Boolean);
  const excludeDomains = parseStringList(options.excludeDomains)
    .map((value) => normalizeDomainFilter(value))
    .filter(Boolean);

  // Text filters are matched case-insensitively against title/snippet/url.
  const includeText = parseStringList(options.includeText).map((value) => value.toLowerCase());
  const excludeText = parseStringList(options.excludeText).map((value) => value.toLowerCase());

  const profile = categoryProfile(category);

  // Merge user filters with the category profile's defaults (de-duplicated).
  const mergedIncludeDomains = Array.from(
    new Set([...includeDomains, ...profile.includeDomains.map((value) => normalizeDomainFilter(value))]),
  ).filter(Boolean);

  const mergedIncludeText = Array.from(
    new Set([...includeText, ...profile.includeText.map((value) => value.toLowerCase())]),
  ).filter(Boolean);

  // The category profile may append a hint (e.g. extra keywords) to the query.
  const queryHint = String(profile.queryHint || "").trim();
  const effectiveQuery = queryHint ? `${query} ${queryHint}` : query;

  const requestedEngine = normalizeSearchEngine(options.searchEngine, "duckduckgo");
  const requestedSearch = resolveSearchEngineTemplate(requestedEngine, options.searchEngineUrlTemplate);
  // Non-DuckDuckGo engines get a DuckDuckGo fallback attempt.
  const fallbackSearch =
    requestedSearch.engine === "duckduckgo"
      ? null
      : {
          engine: "duckduckgo" as const,
          template: SEARCH_ENGINE_PRESETS.duckduckgo,
        };

  const attempts = fallbackSearch ? [requestedSearch, fallbackSearch] : [requestedSearch];

  let activeSearch = requestedSearch;
  let searchUrl = "";
  let results: Array<Record<string, unknown>> = [];
  let lastError: unknown = null;

  for (const attempt of attempts) {
    const candidateSearchUrl = buildSearchUrlFromTemplate(attempt.template, effectiveQuery);

    try {
      // Browser-like headers reduce the chance of being served a bot page.
      const response = await fetch(candidateSearchUrl, {
        method: "GET",
        headers: {
          "user-agent":
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
          "accept-language": "en-US,en;q=0.9",
        },
      });

      if (!response.ok) {
        const text = await response.text();
        throw new Error(`Search discovery failed (HTTP ${response.status}): ${text.slice(0, 300)}`);
      }

      const html = await response.text();
      // Over-fetch (4x, min 24) so post-filtering can still fill numResults.
      const parsedResults = await parseDuckDuckGoResults(html, { limit: Math.max(numResults * 4, 24) });
      const minAcceptableResults = Math.max(1, Math.min(2, numResults));

      // Too few results from a non-DuckDuckGo engine: try the fallback.
      if (parsedResults.length < minAcceptableResults && attempt.engine !== "duckduckgo" && fallbackSearch) {
        continue;
      }

      activeSearch = attempt;
      searchUrl = candidateSearchUrl;
      results = parsedResults;
      break;
    } catch (error) {
      lastError = error;

      // A failed non-DuckDuckGo attempt falls through to the fallback;
      // a failed DuckDuckGo attempt is terminal.
      if (attempt.engine !== "duckduckgo" && fallbackSearch) {
        continue;
      }

      throw error;
    }
  }

  // No attempt succeeded — surface the most informative error available.
  if (!searchUrl) {
    if (lastError instanceof Error) throw lastError;
    if (lastError !== null && lastError !== undefined) throw new Error(String(lastError));
    throw new Error("Search discovery failed: no search URL could be resolved.");
  }

  // "pdf" category keeps only URLs that mention ".pdf" anywhere.
  if (category === "pdf") {
    results = results.filter((result) =>
      String(result.url || "")
        .toLowerCase()
        .includes(".pdf"),
    );
  }

  if (mergedIncludeDomains.length > 0) {
    results = results.filter((result) => domainMatches(String(result.domain || ""), mergedIncludeDomains));
  }

  if (excludeDomains.length > 0) {
    results = results.filter((result) => !domainMatches(String(result.domain || ""), excludeDomains));
  }

  // Include-text: every term must appear somewhere in title/snippet/url.
  if (mergedIncludeText.length > 0) {
    results = results.filter((result) => {
      const haystack = `${result.title || ""} ${result.snippet || ""} ${result.url || ""}`.toLowerCase();
      return mergedIncludeText.every((term) => haystack.includes(term));
    });
  }

  // Exclude-text: no term may appear anywhere in title/snippet/url.
  if (excludeText.length > 0) {
    results = results.filter((result) => {
      const haystack = `${result.title || ""} ${result.snippet || ""} ${result.url || ""}`.toLowerCase();
      return excludeText.every((term) => !haystack.includes(term));
    });
  }

  // Trim to the requested count and re-rank sequentially after filtering.
  results = results.slice(0, numResults).map((result, index) => ({ ...result, rank: index + 1 }));

  return {
    query,
    effectiveQuery,
    category: category || null,
    searchEngine: activeSearch.engine,
    searchEngineRequested: requestedSearch.engine,
    searchUrlTemplate: activeSearch.template,
    searchUrl,
    results,
    durationMs: Date.now() - startedAt,
    filters: {
      includeDomains: mergedIncludeDomains,
      excludeDomains,
      includeText: mergedIncludeText,
      excludeText,
    },
  };
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
// Matches http(s) URLs in free text. A URL ends at whitespace or at one
// of the closing delimiters ) ] > that commonly terminate a pasted link.
// Global flag: used with String#match to collect every occurrence.
const URL_REGEX = /https?:\/\/[^\s)\]>]+/g;
|
|
2
|
+
|
|
3
|
+
export function sanitizeUrlCandidate(input: unknown): string {
|
|
4
|
+
let value = String(input ?? "").trim();
|
|
5
|
+
if (!value) return "";
|
|
6
|
+
|
|
7
|
+
if (value.startsWith("<") && value.endsWith(">")) {
|
|
8
|
+
value = value.slice(1, -1).trim();
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
value = value.replace(/^[\s("'`[]+/, "");
|
|
12
|
+
|
|
13
|
+
for (let i = 0; i < 3; i += 1) {
|
|
14
|
+
const next = value
|
|
15
|
+
.replace(/(?:\*{1,3}|_{1,3}|`{1,3})+$/g, "")
|
|
16
|
+
.replace(/[>\]"'`]+$/g, "")
|
|
17
|
+
.replace(/[.,;:!?]+$/g, "")
|
|
18
|
+
.trim();
|
|
19
|
+
|
|
20
|
+
if (next === value) break;
|
|
21
|
+
value = next;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
return value;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export function normalizeUrl(input: unknown): string {
|
|
28
|
+
const raw = sanitizeUrlCandidate(input);
|
|
29
|
+
if (!raw) throw new Error("URL is required");
|
|
30
|
+
|
|
31
|
+
const withProtocol = /^https?:\/\//i.test(raw) ? raw : `https://${raw}`;
|
|
32
|
+
const url = new URL(withProtocol);
|
|
33
|
+
|
|
34
|
+
if (!["http:", "https:"].includes(url.protocol)) {
|
|
35
|
+
throw new Error(`Unsupported URL protocol: ${url.protocol}`);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
return url.toString();
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export function normalizeUrlForDedupe(input: unknown): string {
|
|
42
|
+
const normalized = normalizeUrl(input);
|
|
43
|
+
const url = new URL(normalized);
|
|
44
|
+
|
|
45
|
+
url.hash = "";
|
|
46
|
+
|
|
47
|
+
if ((url.protocol === "http:" && url.port === "80") || (url.protocol === "https:" && url.port === "443")) {
|
|
48
|
+
url.port = "";
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
if (url.pathname !== "/") {
|
|
52
|
+
const trimmedPath = url.pathname.replace(/\/+$/g, "");
|
|
53
|
+
url.pathname = trimmedPath || "/";
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
return url.toString();
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
export function normalizeUniqueUrls(values: Iterable<unknown>): string[] {
|
|
60
|
+
const urls: string[] = [];
|
|
61
|
+
const seen = new Set<string>();
|
|
62
|
+
|
|
63
|
+
for (const value of values) {
|
|
64
|
+
try {
|
|
65
|
+
const normalized = normalizeUrl(value);
|
|
66
|
+
const dedupeKey = normalizeUrlForDedupe(normalized);
|
|
67
|
+
if (seen.has(dedupeKey)) continue;
|
|
68
|
+
seen.add(dedupeKey);
|
|
69
|
+
urls.push(normalized);
|
|
70
|
+
} catch {
|
|
71
|
+
// ignore malformed URL-like fragments
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
return urls;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
export function extractSeedUrls(text: unknown): string[] {
|
|
79
|
+
const matches = String(text ?? "").match(URL_REGEX) ?? [];
|
|
80
|
+
return normalizeUniqueUrls(matches);
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
export function extractCitationUrls(text: unknown): string[] {
|
|
84
|
+
const matches = String(text ?? "").match(URL_REGEX) ?? [];
|
|
85
|
+
return normalizeUniqueUrls(matches);
|
|
86
|
+
}
|