@dpopsuev/web-spider 0.10.4 → 0.10.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/batch.js.map +1 -0
- package/dist/cache.js.map +1 -0
- package/dist/convert.js.map +1 -0
- package/dist/crawl.js.map +1 -0
- package/dist/disk-cache.js.map +1 -0
- package/dist/graph.js.map +1 -0
- package/dist/index.js.map +1 -0
- package/dist/parse.js.map +1 -0
- package/dist/playwright.js.map +1 -0
- package/dist/ports.js.map +1 -0
- package/dist/robots.js.map +1 -0
- package/dist/search.js.map +1 -0
- package/dist/sitemap.js.map +1 -0
- package/dist/spider.js.map +1 -0
- package/dist/throttle.js.map +1 -0
- package/dist/tree.js.map +1 -0
- package/dist/types.js.map +1 -0
- package/dist/views.js.map +1 -0
- package/dist/web-search.js.map +1 -0
- package/package.json +2 -1
- package/fixtures/article-with-images.html +0 -94
- package/fixtures/gh-shell.html +0 -32
- package/fixtures/guide-ai-agents-web-scraping.json +0 -552
- package/fixtures/images/large.jpg +0 -0
- package/fixtures/images/small.jpg +0 -0
- package/fixtures/images/tiny.png +0 -0
- package/fixtures/quotes-index.json +0 -40
- package/scripts/fetch-guide.mjs +0 -25
- package/src/cache.ts +0 -99
- package/src/convert.ts +0 -161
- package/src/crawl.ts +0 -186
- package/src/disk-cache.ts +0 -228
- package/src/graph.ts +0 -189
- package/src/index.ts +0 -74
- package/src/parse.ts +0 -154
- package/src/playwright.ts +0 -193
- package/src/ports.ts +0 -131
- package/src/robots.ts +0 -121
- package/src/search.ts +0 -173
- package/src/sitemap.ts +0 -67
- package/src/spider.ts +0 -475
- package/src/throttle.ts +0 -118
- package/src/tree.ts +0 -379
- package/src/types.ts +0 -225
- package/src/views.ts +0 -42
- package/src/web-search.ts +0 -548
- package/test/convert-images.test.ts +0 -69
- package/test/disk-cache-images.test.ts +0 -193
- package/test/engine-registry.test.ts +0 -114
- package/test/exports.test.ts +0 -124
- package/test/get-chunk.test.ts +0 -115
- package/test/images-integration.test.ts +0 -359
- package/test/improvements.test.ts +0 -279
- package/test/inbound-count.test.ts +0 -111
- package/test/lean.test.ts +0 -105
- package/test/playwright.test.ts +0 -128
- package/test/ports.test.ts +0 -161
- package/test/search.test.ts +0 -219
- package/test/spider-images.test.ts +0 -180
- package/test/spider-unit.test.ts +0 -610
- package/test/tree.test.ts +0 -272
- package/test/types.test.ts +0 -169
- package/test/web-search-integration.test.ts +0 -180
- package/test/web-search.test.ts +0 -305
- package/tsconfig.json +0 -9
- package/tsconfig.test.json +0 -7
- package/vitest.config.ts +0 -8
package/src/playwright.ts
DELETED
|
@@ -1,193 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Playwright adapter — implements IHttpClient using a headless browser.
|
|
3
|
-
*
|
|
4
|
-
* Uses playwright-extra with the stealth plugin, which patches ~15 headless
|
|
5
|
-
* fingerprint signals (navigator.webdriver, User-Agent, plugins, WebGL, etc.)
|
|
6
|
-
* so the browser is indistinguishable from a real Chrome session.
|
|
7
|
-
*
|
|
8
|
-
* Requires system-installed Chrome (channel:"chrome") — no browser binary
|
|
9
|
-
* is downloaded. Falls back gracefully to plain playwright-core if
|
|
10
|
-
* playwright-extra or the stealth plugin are not installed.
|
|
11
|
-
*
|
|
12
|
-
* Browser lifecycle:
|
|
13
|
-
* - Launched lazily on the first fetch() call.
|
|
14
|
-
* - Reused across all subsequent requests (one browser, one tab per request).
|
|
15
|
-
* - Call close() when done to release the browser process.
|
|
16
|
-
*
|
|
17
|
-
* Usage:
|
|
18
|
-
* const client = new PlaywrightHttpClient()
|
|
19
|
-
* const page = await spider(url, { httpClient: client })
|
|
20
|
-
* await client.close()
|
|
21
|
-
*/
|
|
22
|
-
|
|
23
|
-
import type { HttpRequest, HttpResponse, IHttpClient } from "./ports.js";
|
|
24
|
-
|
|
25
|
-
/** Construction options for {@link PlaywrightHttpClient}. Every field is optional. */
export interface PlaywrightClientOptions {
  /**
   * Which system-installed browser to launch.
   *   - "chrome"   — Google Chrome (default)
   *   - "msedge"   — Microsoft Edge
   *   - "chromium" — Playwright's own Chromium (must be installed separately)
   */
  channel?: "chrome" | "msedge" | "chromium";

  /**
   * Absolute path to a browser executable. When set it takes precedence over
   * `channel`; use it when Chrome lives outside the standard location.
   */
  executablePath?: string;

  /** Navigation timeout in milliseconds. Default: 30 000. */
  timeoutMs?: number;

  /**
   * The event that marks navigation as finished.
   *   - "networkidle"      — no network activity for 500ms (best for SPAs, default)
   *   - "domcontentloaded" — HTML parsed; faster but may miss lazy-loaded content
   *   - "load"             — window load event fired
   *   - "commit"           — response received and the document started loading
   */
  waitUntil?: "load" | "domcontentloaded" | "networkidle" | "commit";

  /**
   * Let image and media requests through instead of aborting them. Needed when
   * spider() runs with captureImages: true so that image fetches made through
   * this client succeed. Fonts stay blocked regardless of this flag.
   * Default: false.
   */
  captureImages?: boolean;
}
|
|
58
|
-
|
|
59
|
-
// Module-level flag: the stealth plugin is registered on the playwright-extra
// chromium singleton once and stays active for the lifetime of the process.
let stealthApplied = false;

/**
 * IHttpClient adapter that loads pages in a headless browser and returns the
 * serialised DOM after JavaScript has run.
 *
 * One browser is launched lazily and shared by all requests; every fetch()
 * opens its own tab and closes it before returning. Call close() to release
 * the browser process.
 *
 * Note: `req.headers` and `req.signal` are not applied by this adapter — only
 * `req.url` is read.
 */
export class PlaywrightHttpClient implements IHttpClient {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private browser: any | null = null;
  // In-flight launch, shared so that concurrent first calls to fetch() do not
  // each start (and then leak) their own browser process.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private launching: Promise<any> | null = null;
  private readonly channel: string;
  private readonly executablePath: string;
  private readonly timeoutMs: number;
  private readonly waitUntil: string;
  private readonly captureImages: boolean;

  constructor(opts: PlaywrightClientOptions = {}) {
    this.channel = opts.channel ?? "chrome";
    this.executablePath = opts.executablePath ?? "";
    this.timeoutMs = opts.timeoutMs ?? 30_000;
    this.waitUntil = opts.waitUntil ?? "networkidle";
    this.captureImages = opts.captureImages ?? false;
  }

  /**
   * Resolve the chromium launcher. Prefers playwright-extra with the stealth
   * plugin; if either import fails, falls back to plain playwright-core.
   */
  private async getChromium() {
    try {
      const { chromium } = await import("playwright-extra");
      if (!stealthApplied) {
        const { default: StealthPlugin } = await import("puppeteer-extra-plugin-stealth");
        chromium.use(StealthPlugin());
        stealthApplied = true;
      }
      return chromium;
    } catch {
      const { chromium } = await import("playwright-core");
      return chromium;
    }
  }

  /** Return the shared browser, launching it (at most once at a time) if needed. */
  private async getBrowser() {
    if (this.browser?.isConnected()) return this.browser;

    if (!this.launching) {
      this.launching = (async () => {
        const chromium = await this.getChromium();
        // An explicit executable path overrides the channel lookup.
        const launchOpts = this.executablePath
          ? { executablePath: this.executablePath, headless: true }
          : { channel: this.channel, headless: true };
        this.browser = await chromium.launch(launchOpts);
        return this.browser;
      })().finally(() => {
        // Cleared on success and failure so a failed launch can be retried.
        this.launching = null;
      });
    }
    return this.launching;
  }

  /**
   * Navigate to `req.url` in a fresh tab and return the rendered document.
   *
   * @throws Error when navigation yields no response or the status is >= 400.
   */
  async fetch(req: HttpRequest): Promise<HttpResponse> {
    const browser = await this.getBrowser();
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const page: any = await browser.newPage();

    // Suppress browser-side console output and JS errors — they are not
    // useful to the caller and would leak into Pi's TUI stream.
    page.on("console", () => {});
    page.on("pageerror", () => {});

    try {
      // Fonts are always blocked. Images and media are blocked unless
      // captureImages is on and the request's Accept header starts with
      // "image/".
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      await page.route("**/*", (route: any) => {
        const type: string = route.request().resourceType();
        const accept: string = route.request().headers()["accept"] ?? "";
        const isImageFetch = accept.startsWith("image/");

        const blocked =
          type === "font" ||
          (["image", "media"].includes(type) && !(this.captureImages && isImageFetch));

        // abort()/continue() reject if the page closes mid-request; swallow
        // that so it cannot surface as an unhandled promise rejection.
        const handled: Promise<void> = blocked ? route.abort() : route.continue();
        handled.catch(() => {});
      });

      const response = await page.goto(req.url, {
        timeout: this.timeoutMs,
        waitUntil: this.waitUntil,
      });

      if (!response) {
        throw new Error(`Navigation failed — no response for ${req.url}`);
      }

      const status: number = response.status();
      if (status >= 400) {
        throw new Error(`HTTP ${status} ${response.statusText()} — ${req.url}`);
      }

      // page.content() returns the full serialised DOM after JS execution.
      const html: string = await page.content();
      const headers: Record<string, string> = await response.allHeaders();
      const statusText: string = response.statusText();

      // Read the raw body now: the tab is closed in `finally` before the
      // caller can invoke arrayBuffer(), and the body is not retrievable
      // from a closed page. A read failure is deferred to arrayBuffer() so
      // text-only callers are unaffected.
      let body: Buffer | null = null;
      let bodyError: unknown;
      try {
        body = await response.body();
      } catch (err) {
        bodyError = err;
      }

      return {
        ok: true,
        status,
        statusText,
        headers: { get: (name: string) => headers[name.toLowerCase()] ?? null },
        text: async () => html,
        arrayBuffer: async () => {
          if (!body) {
            throw bodyError instanceof Error
              ? bodyError
              : new Error(`Response body unavailable — ${req.url}`);
          }
          // Copy exactly the bytes of this Buffer view out of its backing store.
          return body.buffer.slice(body.byteOffset, body.byteOffset + body.byteLength) as ArrayBuffer;
        },
      };
    } finally {
      await page.close();
    }
  }

  /** Close the shared browser process. Call when the client is no longer needed. */
  async close(): Promise<void> {
    if (this.browser) {
      await this.browser.close();
      this.browser = null;
    }
  }
}
|
|
180
|
-
|
|
181
|
-
/**
 * Factory for {@link PlaywrightHttpClient}.
 *
 * The return type allows `null` and construction errors are mapped to `null`,
 * but the constructor only stores options — the browser libraries are
 * imported lazily on the first fetch(). A missing playwright-core therefore
 * does NOT produce `null` here; it surfaces as a rejected fetch() instead.
 *
 * @param opts Optional client configuration, passed to the constructor unchanged.
 * @returns A new client, or `null` if the constructor throws.
 */
export function createPlaywrightClient(
  opts?: PlaywrightClientOptions,
): PlaywrightHttpClient | null {
  try {
    return new PlaywrightHttpClient(opts);
  } catch {
    return null;
  }
}
|
package/src/ports.ts
DELETED
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Port interfaces — the contracts the core depends on.
|
|
3
|
-
*
|
|
4
|
-
* No concrete imports. Adapters implement these; the core orchestrates them.
|
|
5
|
-
* All ports are optional in SpiderOptions — concrete defaults are wired in
|
|
6
|
-
* spider.ts and crawl.ts so callers need not supply them unless they want
|
|
7
|
-
* to substitute (e.g. inject a mock HTTP client for testing).
|
|
8
|
-
*/
|
|
9
|
-
|
|
10
|
-
// ---------------------------------------------------------------------------
|
|
11
|
-
// IHttpClient
|
|
12
|
-
// ---------------------------------------------------------------------------
|
|
13
|
-
|
|
14
|
-
/** Request handed to an {@link IHttpClient}. */
export interface HttpRequest {
  /** Absolute URL to fetch. */
  url: string;
  /** Optional request headers, keyed by header name. */
  headers?: Record<string, string>;
  /** Optional abort signal supplied by the caller. */
  signal?: AbortSignal;
}
|
|
19
|
-
|
|
20
|
-
/** Response returned by an {@link IHttpClient}. */
export interface HttpResponse {
  /** Whether the request is considered successful. */
  ok: boolean;
  /** HTTP status code. */
  status: number;
  /** HTTP status text. */
  statusText: string;
  /** Header lookup; yields `null` when the header is absent. */
  headers: { get(name: string): string | null };
  /** Response body as text. */
  text(): Promise<string>;
  /** Response body as raw bytes. */
  arrayBuffer(): Promise<ArrayBuffer>;
}
|
|
28
|
-
|
|
29
|
-
/**
 * Minimal HTTP client port.
 *
 * The default adapter wraps global fetch(). Tests can substitute an
 * implementation that returns fixed HTML without touching the network.
 */
export interface IHttpClient {
  /** Perform the request and resolve with its response. */
  fetch(req: HttpRequest): Promise<HttpResponse>;
}
|
|
37
|
-
|
|
38
|
-
// ---------------------------------------------------------------------------
|
|
39
|
-
// ICache<K, V>
|
|
40
|
-
// ---------------------------------------------------------------------------
|
|
41
|
-
|
|
42
|
-
/**
 * Generic key/value cache port.
 *
 * Default adapter: SpiderCache (LRU, TTL). Any other store — an in-memory
 * Map, Redis, SQLite, etc. — can be swapped in for tests or production.
 */
export interface ICache<K, V> {
  /** Look up a value; `undefined` when there is none. */
  get(key: K): V | undefined;
  /** Store a value under the key. */
  set(key: K, value: V): void;
  /** Whether the key currently has a value. */
  has(key: K): boolean;
  /** Remove the key. */
  delete(key: K): void;
  /** All currently valid (non-expired) values. */
  values(): V[];
}
|
|
55
|
-
|
|
56
|
-
// ---------------------------------------------------------------------------
|
|
57
|
-
// IThrottle
|
|
58
|
-
// ---------------------------------------------------------------------------
|
|
59
|
-
|
|
60
|
-
/**
 * Per-domain request throttle port.
 *
 * Default adapter: DomainThrottle (token bucket + exponential backoff).
 * Tests can use a no-op implementation that always resolves immediately.
 */
export interface IThrottle {
  /** Resolve once a request to `url` may proceed. */
  wait(url: string): Promise<void>;
  /** Report a successful request to `url`. */
  success(url: string): void;
  /**
   * Report a rate-limit response for `url`, passing the Retry-After header
   * value if there was one.
   * NOTE(review): the returned number is presumably a delay in ms — confirm
   * against the adapter.
   */
  rateLimit(url: string, retryAfterHeader: string | null): number;
  /** Set the delay, in ms, for a specific host. */
  setDomainDelay(host: string, ms: number): void;
  /** Retry limit exposed by the throttle. */
  readonly maxRetries: number;
}
|
|
72
|
-
|
|
73
|
-
// ---------------------------------------------------------------------------
|
|
74
|
-
// IRobotsChecker
|
|
75
|
-
// ---------------------------------------------------------------------------
|
|
76
|
-
|
|
77
|
-
/** Outcome of a robots.txt check for a single URL. */
export interface RobotsResult {
  /** Whether the URL may be fetched. */
  allowed: boolean;
  /** Crawl delay in ms, when one is known. */
  crawlDelayMs?: number;
}
|
|
81
|
-
|
|
82
|
-
/**
 * robots.txt compliance port.
 *
 * Default adapter: RobotsCache (fetches + parses per origin, 1h TTL).
 * Tests can use a permissive stub that always returns { allowed: true }.
 */
export interface IRobotsChecker {
  /** Decide whether `url` may be fetched. */
  check(url: string): Promise<RobotsResult>;
}
|
|
90
|
-
|
|
91
|
-
// ---------------------------------------------------------------------------
|
|
92
|
-
// ISearchEngine
|
|
93
|
-
// ---------------------------------------------------------------------------
|
|
94
|
-
|
|
95
|
-
/** Engine-independent web search request. */
export interface SearchQuery {
  /** The search text. */
  query: string;
  /** Number of results requested. */
  numResults?: number;
  /**
   * Limit results to content published within this window.
   * Supported by Tavily ("day"|"week"|"month"|"year") and Brave
   * ("pd"|"pw"|"pm"|"py"); each adapter maps the value to its own
   * engine-specific parameter name.
   */
  timeRange?: "day" | "week" | "month" | "year";
  /**
   * Search topic mode. "news" prioritises freshly indexed news articles.
   * Supported by Tavily; engines without such a mode ignore it.
   */
  topic?: "news" | "general";
}
|
|
110
|
-
|
|
111
|
-
/**
 * One result returned by a web search engine.
 * Declared alongside the ports so that port interfaces never depend on an
 * adapter module.
 */
export interface WebSearchResult {
  /** Result URL. */
  url: string;
  /** Result title. */
  title: string;
  /** Short description or snippet from the search engine. */
  snippet: string;
  /** ISO-8601 or human-readable date, if the engine returned one. */
  publishedAt?: string;
}
|
|
123
|
-
|
|
124
|
-
/**
 * Web search engine port.
 *
 * Adapters: BraveSearchEngine, TavilySearchEngine (in web-search.ts).
 * Tests can use a stub that returns fixed results.
 */
export interface ISearchEngine {
  /** Run the query and resolve with the engine's results. */
  search(req: SearchQuery): Promise<WebSearchResult[]>;
}
|
package/src/robots.ts
DELETED
|
@@ -1,121 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Minimal robots.txt fetcher and per-domain cache.
|
|
3
|
-
* Respects User-agent: * directives (Allow, Disallow, Crawl-delay).
|
|
4
|
-
* Fails open — any fetch/parse error allows all URLs.
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
/** A single Allow/Disallow rule taken from robots.txt. */
interface RobotsDirective {
  /** `true` for an Allow rule, `false` for a Disallow rule. */
  allow: boolean;
  /** The rule's path value, exactly as written in robots.txt. */
  path: string;
}
|
|
11
|
-
|
|
12
|
-
/** The parts of a robots.txt file that this module acts on. */
interface ParsedRobots {
  /** Allow/Disallow rules collected from the file. */
  directives: RobotsDirective[];
  /** Crawl-delay in ms, if the robots.txt specified one (capped at 60s). */
  crawlDelayMs?: number;
}
|
|
17
|
-
|
|
18
|
-
/**
 * Parse robots.txt text, keeping only the rules that apply to `User-agent: *`.
 *
 * Consecutive `User-agent` lines form one group: its rules apply when any
 * line in the group names `*`. A `User-agent` line that follows a rule line
 * starts a new group.
 *
 * @param text Raw robots.txt content.
 * @returns The collected Allow/Disallow rules plus the Crawl-delay converted
 *          to milliseconds and capped at 60 000. Rules with an empty value
 *          are ignored.
 */
function parse(text: string): ParsedRobots {
  const directives: RobotsDirective[] = [];
  let crawlDelayMs: number | undefined;
  let inBlock = false;
  // True while reading a run of adjacent User-agent lines (one group header).
  let prevWasUserAgent = false;

  for (const raw of text.split(/\r?\n/)) {
    // Drop trailing comments, then surrounding whitespace.
    const line = (raw.split("#")[0] ?? "").trim();
    if (!line) continue;

    const colon = line.indexOf(":");
    if (colon === -1) continue;
    const key = line.slice(0, colon).trim().toLowerCase();
    const value = line.slice(colon + 1).trim();

    if (key === "user-agent") {
      const isWildcard = value === "*";
      // Within one header run, a single "*" is enough to select the group.
      inBlock = prevWasUserAgent ? inBlock || isWildcard : isWildcard;
      prevWasUserAgent = true;
      continue;
    }
    prevWasUserAgent = false;
    if (!inBlock) continue;

    if (key === "disallow" && value) {
      directives.push({ allow: false, path: value });
    } else if (key === "allow" && value) {
      directives.push({ allow: true, path: value });
    } else if (key === "crawl-delay") {
      const seconds = parseFloat(value);
      if (!isNaN(seconds) && seconds > 0) crawlDelayMs = Math.min(seconds * 1_000, 60_000);
    }
  }

  return { directives, crawlDelayMs };
}
|
|
49
|
-
|
|
50
|
-
/**
 * Decide whether `path` may be fetched under the given rules.
 *
 * The longest matching rule wins; when an Allow and a Disallow rule of equal
 * length both match, Allow wins. With no matching rule the path is allowed.
 *
 * @param robots Parsed robots.txt rules.
 * @param path   URL path to test.
 */
function isAllowed(robots: ParsedRobots, path: string): boolean {
  let best: RobotsDirective | undefined;
  for (const d of robots.directives) {
    if (!matchesRobotsPattern(d.path, path)) continue;
    const moreSpecific = !best || d.path.length > best.path.length;
    const tieGoesToAllow = !!best && d.path.length === best.path.length && d.allow && !best.allow;
    if (moreSpecific || tieGoesToAllow) best = d;
  }
  return best?.allow ?? true; // default: allow
}

/**
 * Match a robots.txt rule path against a URL path. `*` matches any run of
 * characters and a trailing `$` anchors the rule to the end of the path; a
 * rule with neither is a plain prefix match. Uses indexOf scanning rather
 * than a generated RegExp so a hostile robots.txt cannot cause backtracking.
 */
function matchesRobotsPattern(pattern: string, path: string): boolean {
  const anchored = pattern.endsWith("$");
  const body = anchored ? pattern.slice(0, -1) : pattern;
  const segments = body.split("*");

  const first = segments[0] ?? "";
  if (!path.startsWith(first)) return false;
  let pos = first.length;
  if (segments.length === 1) return !anchored || pos === path.length;

  // Middle segments: take the earliest occurrence after the previous one.
  const lastIndex = segments.length - 1;
  for (let i = 1; i < lastIndex; i++) {
    const segment = segments[i] ?? "";
    const found = path.indexOf(segment, pos);
    if (found === -1) return false;
    pos = found + segment.length;
  }

  const last = segments[lastIndex] ?? "";
  if (anchored) {
    // The final segment must sit at the very end without overlapping earlier matches.
    return path.length - last.length >= pos && path.endsWith(last);
  }
  return path.indexOf(last, pos) !== -1;
}
|
|
60
|
-
|
|
61
|
-
import type { IRobotsChecker, RobotsResult } from "./ports.js";
|
|
62
|
-
|
|
63
|
-
/** How long a parsed robots.txt is reused before being fetched again. */
const TTL_MS = 60 * 60 * 1_000; // 1 hour

/**
 * robots.txt checker that fetches, parses and caches the rules per origin.
 * Every failure path — malformed URL, network error, non-OK response — fails
 * open and allows the URL.
 */
export class RobotsCache implements IRobotsChecker {
  private readonly cache = new Map<string, { robots: ParsedRobots; expiresAt: number }>();
  private readonly userAgent: string;

  /** @param userAgent Value sent as the User-Agent header when fetching robots.txt. */
  constructor(userAgent = "web-spider/0.1") {
    this.userAgent = userAgent;
  }

  /**
   * Returns whether the URL is allowed and the crawl-delay if specified.
   * Caches per origin for 1 hour. Fails open on any error, including a URL
   * that cannot be parsed.
   */
  async check(url: string): Promise<RobotsResult> {
    let target: URL;
    try {
      target = new URL(url);
    } catch {
      return { allowed: true }; // unparseable URL → fail open
    }

    const { origin } = target;
    // Rules may mention the query string, so match against path + query.
    const path = target.pathname + target.search;

    let entry = this.cache.get(origin);
    if (!entry || Date.now() > entry.expiresAt) {
      const robots = await this.fetchRobots(`${origin}/robots.txt`);
      entry = { robots, expiresAt: Date.now() + TTL_MS };
      this.cache.set(origin, entry);
    }

    return {
      allowed: isAllowed(entry.robots, path),
      crawlDelayMs: entry.robots.crawlDelayMs,
    };
  }

  /** Download and parse one robots.txt; an empty rule set on any failure. */
  private async fetchRobots(robotsUrl: string): Promise<ParsedRobots> {
    try {
      // Abort the request if it has not completed within 5 seconds.
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), 5_000);
      let res: Response;
      try {
        res = await globalThis.fetch(robotsUrl, {
          signal: controller.signal,
          headers: { "User-Agent": this.userAgent },
        });
      } finally {
        clearTimeout(timer);
      }
      if (!res.ok) return { directives: [] }; // 404 → allow all
      return parse(await res.text());
    } catch {
      return { directives: [] }; // network error → fail open
    }
  }
}
|
|
113
|
-
|
|
114
|
-
/**
 * Factory for {@link RobotsCache}.
 *
 * Exists to sidestep jiti/Bun CJS re-export interop, where a class constructor
 * reached through a re-export chain can appear undefined at the call site.
 * Extension code should call this instead of `new RobotsCache()`.
 *
 * @param userAgent Optional User-Agent, forwarded to the constructor unchanged.
 */
export function createRobotsCache(userAgent?: string): RobotsCache {
  const instance = new RobotsCache(userAgent);
  return instance;
}
|
package/src/search.ts
DELETED
|
@@ -1,173 +0,0 @@
|
|
|
1
|
-
import MiniSearch from "minisearch";
|
|
2
|
-
import type { SpideredPage } from "./types.js";
|
|
3
|
-
|
|
4
|
-
/** One ranked match produced by searchPages. */
export interface SearchHit {
  /** URL of the page that contains the match. */
  url: string;
  /**
   * Stable chunk ID ("url#chunk-N") for a match in body text; an empty string
   * for a match in page metadata (title, description, headings).
   */
  chunkId: string;
  /**
   * Nearest heading of the matched chunk, or — for metadata hits — the name
   * of the matched field (e.g. "title", "description").
   */
  heading: string;
  /** Normalised score 0–1. Higher is a better match. */
  score: number;
  /**
   * Short context window around the best match, about 2×snippetRadius chars,
   * with "…" added on whichever side was truncated.
   */
  snippet: string;
}
|
|
23
|
-
|
|
24
|
-
/** Tuning options for searchPages. */
export interface FuzzySearchOptions {
  /** Upper bound on returned hits (default 10). */
  topN?: number;
  /**
   * Characters of context kept on each side of the match in the snippet
   * (default 100). Lower values save tokens; raise it for more context.
   */
  snippetRadius?: number;
}
|
|
33
|
-
|
|
34
|
-
// ---------------------------------------------------------------------------
|
|
35
|
-
// Internal types
|
|
36
|
-
// ---------------------------------------------------------------------------
|
|
37
|
-
|
|
38
|
-
/** Flat document shape that is fed to MiniSearch. */
interface SearchDoc {
  /** Unique stable ID used by MiniSearch — a chunk id or a synthetic meta id. */
  id: string;
  /** URL of the owning page. */
  url: string;
  /** Nearest heading or metadata field name ("title", "description", "h2", …). */
  heading: string;
  /** The indexed, searchable text. */
  text: string;
  /** Equal to `id` for chunks; empty string for metadata docs. */
  chunkId: string;
}
|
|
49
|
-
|
|
50
|
-
// ---------------------------------------------------------------------------
|
|
51
|
-
// Snippet builder — kept from v1, MiniSearch doesn't generate snippets.
|
|
52
|
-
// ---------------------------------------------------------------------------
|
|
53
|
-
|
|
54
|
-
/**
 * Cut a short excerpt of `text` around the first occurrence of the query.
 *
 * The whole lower-cased query is tried first, then each query token in
 * order; if nothing is found the excerpt starts at the beginning of the
 * text. Whitespace runs are collapsed and "…" marks a truncated side.
 *
 * @param text        Source text.
 * @param fullQuery   Lower-cased, trimmed query string.
 * @param queryTokens Lower-cased query tokens, tried in order as a fallback.
 * @param radius      Characters of context to keep on each side.
 */
function buildSnippet(text: string, fullQuery: string, queryTokens: string[], radius: number): string {
  const haystack = text.toLowerCase();

  let anchor = haystack.indexOf(fullQuery);
  if (anchor < 0) {
    // First token that occurs anywhere decides the anchor.
    const tokenHit = queryTokens.map((token) => haystack.indexOf(token)).find((at) => at !== -1);
    anchor = tokenHit ?? 0;
  }

  const matchSpan = Math.max(fullQuery.length, queryTokens[0]?.length ?? 1);
  const from = Math.max(0, anchor - radius);
  const to = Math.min(text.length, anchor + matchSpan + radius);

  const excerpt = text.slice(from, to).replace(/\s+/g, " ").trim();
  const lead = from > 0 ? "…" : "";
  const trail = to < text.length ? "…" : "";
  return lead + excerpt + trail;
}
|
|
78
|
-
|
|
79
|
-
/**
 * Lower-case a string and split it on whitespace and punctuation, keeping
 * only tokens longer than one character. Used only for snippet generation.
 */
function tokenise(s: string): string[] {
  const pieces = s.toLowerCase().split(/[\s\-_.,;:!?()[\]{}"'`/\\]+/);
  const tokens: string[] = [];
  for (const piece of pieces) {
    if (piece.length > 1) tokens.push(piece);
  }
  return tokens;
}
|
|
86
|
-
|
|
87
|
-
// ---------------------------------------------------------------------------
|
|
88
|
-
// Public API
|
|
89
|
-
// ---------------------------------------------------------------------------
|
|
90
|
-
|
|
91
|
-
/**
 * Full-text search across a set of already-spidered pages using MiniSearch
 * (BM25F ranking, fuzzy edit-distance, prefix search, heading field boost ×2).
 *
 * Both body chunks and page metadata (title, description, headings) are
 * indexed. Hits come back ranked by score descending, normalised to 0–1.
 * Documents whose id was already indexed (e.g. the same page passed twice)
 * are skipped, because MiniSearch rejects duplicate ids.
 *
 * Designed for agent use: call after fetching pages to locate a specific
 * fact, term, or section without dumping all content into context.
 *
 * @param pages Pages to search.
 * @param query Free-text query; a blank query returns no hits.
 * @param opts  `topN` (default 10; values below 0 are treated as 0) and
 *              `snippetRadius` (default 100).
 * @returns At most `topN` hits, best first.
 *
 * @example
 *   const hits = searchPages(pages, "cost optimization selectors", { topN: 5 })
 *   // hits[0].snippet → "…LLM extraction vs Selectors…"
 */
export function searchPages(pages: SpideredPage[], query: string, opts: FuzzySearchOptions = {}): SearchHit[] {
  const { topN = 10, snippetRadius = 100 } = opts;

  if (!query.trim()) return [];

  // Flat document list — one entry per chunk, one per metadata field.
  const docs: SearchDoc[] = [];
  const seenIds = new Set<string>();
  const addDoc = (doc: SearchDoc): void => {
    // MiniSearch throws on a duplicate id, so the first occurrence wins.
    if (seenIds.has(doc.id)) return;
    seenIds.add(doc.id);
    docs.push(doc);
  };

  for (const page of pages) {
    // Metadata documents
    addDoc({ id: `${page.url}#meta-title`, url: page.url, heading: "title", text: page.title, chunkId: "" });
    if (page.description) {
      addDoc({
        id: `${page.url}#meta-description`,
        url: page.url,
        heading: "description",
        text: page.description,
        chunkId: "",
      });
    }
    page.headings.forEach((h, i) => {
      addDoc({ id: `${page.url}#meta-h${i}`, url: page.url, heading: `h${h.level}`, text: h.text, chunkId: "" });
    });

    // Chunk documents
    for (const c of page.chunks) {
      addDoc({ id: c.id, url: page.url, heading: c.heading, text: c.text, chunkId: c.id });
    }
  }

  if (docs.length === 0) return [];

  const ms = new MiniSearch<SearchDoc>({
    fields: ["text", "heading"],
    storeFields: ["url", "heading", "chunkId", "text"],
    searchOptions: {
      // BM25F: headings are 2× more important than body text.
      boost: { heading: 2 },
      // Edit-distance fuzzy — 0.2 × term length, rounded (e.g. ≤1 for 5-char terms).
      fuzzy: 0.2,
      // Prefix match: "automat" finds "automation", "automated".
      prefix: true,
    },
  });

  ms.addAll(docs);

  const results = ms.search(query);
  if (results.length === 0) return [];

  // Normalise raw BM25 scores to 0–1 by dividing by the top score.
  // This preserves relative ranking while keeping values agent-friendly.
  const maxRaw = results[0]?.score ?? 0;

  const fullQuery = query.trim().toLowerCase();
  const queryTokens = tokenise(query);

  return results.slice(0, Math.max(0, topN)).map((r) => ({
    url: String(r["url"]),
    chunkId: String(r["chunkId"]),
    heading: String(r["heading"]),
    // Guard the division so a zero top score cannot produce NaN.
    score: maxRaw > 0 ? Math.round(Math.min(r.score / maxRaw, 1) * 100) / 100 : 0,
    snippet: buildSnippet(String(r["text"]), fullQuery, queryTokens, snippetRadius),
  }));
}
|
|
171
|
-
|
|
172
|
-
/**
 * Legacy alias for {@link searchPages}.
 *
 * @deprecated Use {@link searchPages} — renamed in v0.4.0 to reflect BM25F ranking.
 */
export const fuzzySearch = searchPages;
|
package/src/sitemap.ts
DELETED
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Sitemap fetcher and parser.
|
|
3
|
-
*
|
|
4
|
-
* Attempts /sitemap.xml and /sitemap_index.xml. Extracts <loc> URLs.
|
|
5
|
-
* Fails open — any error returns an empty array so callers fall back
|
|
6
|
-
* to normal BFS without noise.
|
|
7
|
-
*/
|
|
8
|
-
|
|
9
|
-
import type { IHttpClient } from "./ports.js";
|
|
10
|
-
|
|
11
|
-
/**
 * Fetch and parse sitemap URLs for the given origin.
 *
 * Tries `/sitemap.xml`, then `/sitemap_index.xml`, and stops at the first one
 * that yields any URL. A document whose root is `<sitemapindex>` has every
 * `<loc>` fetched as a nested sitemap; in any other document a `<loc>` ending
 * in ".xml" is also treated as a nested sitemap. Nesting is followed one
 * level only.
 *
 * @param origin     Site origin, e.g. "https://example.com" (no trailing slash).
 * @param httpClient Client used for every request.
 * @returns Deduplicated absolute URLs; an empty array on any failure.
 */
export async function fetchSitemapUrls(
  origin: string,
  httpClient: IHttpClient,
): Promise<string[]> {
  const candidates = [`${origin}/sitemap.xml`, `${origin}/sitemap_index.xml`];
  const urls = new Set<string>();

  for (const sitemapUrl of candidates) {
    try {
      const res = await httpClient.fetch({
        url: sitemapUrl,
        headers: { Accept: "application/xml, text/xml, */*" },
      });
      if (!res.ok) continue;
      const xml = await res.text();

      // Index files list other sitemaps whose URLs need not end in ".xml"
      // (".xml.gz", query strings), so detect them by their root element.
      const isIndex = /<(?:[\w-]+:)?sitemapindex[\s>]/i.test(xml);

      for (const loc of extractLocs(xml)) {
        if (isIndex || loc.endsWith(".xml")) {
          const nested = await fetchSitemapXml(loc, httpClient);
          for (const u of nested) urls.add(u);
        } else {
          urls.add(loc);
        }
      }
      if (urls.size > 0) break; // found a working sitemap
    } catch {
      // Fail open: try the next candidate.
      continue;
    }
  }

  return [...urls];
}
|
|
48
|
-
|
|
49
|
-
/**
 * Fetch a single sitemap document and return the URLs in its `<loc>`
 * elements. Resolves to an empty array when the response is not OK or when
 * anything throws.
 */
async function fetchSitemapXml(url: string, httpClient: IHttpClient): Promise<string[]> {
  try {
    const res = await httpClient.fetch({ url });
    return res.ok ? extractLocs(await res.text()) : [];
  } catch {
    return [];
  }
}
|
|
58
|
-
|
|
59
|
-
/**
 * Extract every http(s) URL found in a `<loc>` element of sitemap XML.
 *
 * Plain values have their XML entities decoded (sitemaps must escape `&` as
 * `&amp;`); values wrapped in `<![CDATA[ … ]]>` are taken verbatim.
 *
 * @param xml Raw sitemap document.
 * @returns URLs in document order (not deduplicated); empty if none match.
 */
function extractLocs(xml: string): string[] {
  const urls: string[] = [];
  // Group 1: CDATA-wrapped URL. Group 2: plain (entity-escaped) URL.
  const re = /<loc>\s*(?:<!\[CDATA\[\s*(https?:\/\/\S+?)\s*\]\]>|(https?:\/\/[^<\s]+))\s*<\/loc>/gi;
  for (const match of xml.matchAll(re)) {
    const cdata = match[1];
    const plain = match[2];
    if (cdata !== undefined) {
      urls.push(cdata);
    } else if (plain !== undefined) {
      urls.push(decodeXmlEntities(plain.trim()));
    }
  }
  return urls;
}

/**
 * Decode the five predefined XML entities and numeric character references
 * in one pass, so "&amp;lt;" becomes "&lt;" rather than "<".
 */
function decodeXmlEntities(value: string): string {
  const named: Record<string, string> = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  return value.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
    (whole: string, dec?: string, hex?: string, name?: string) => {
      if (name !== undefined) return named[name] ?? whole;
      const code = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // Leave references outside the Unicode range untouched.
      return Number.isInteger(code) && code >= 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    },
  );
}
|