@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
package/dist/parse.js ADDED
@@ -0,0 +1,131 @@
1
+ /**
2
+ * DOM parsing helpers.
3
+ *
4
+ * Owns the DOM parsing dependency. spider.ts calls these after fetching HTML;
5
+ * it never touches the DOM library directly.
6
+ */
7
+ import { parseHTML } from "linkedom";
8
+ // ---------------------------------------------------------------------------
9
+ // DOM creation
10
+ // ---------------------------------------------------------------------------
11
/**
 * Build a DOM Document from a raw HTML string.
 *
 * Delegates to linkedom's parseHTML, forwarding `url` inside the options
 * object, and hands back the `document` of the parsed result. linkedom was
 * chosen as a lightweight server-side DOM (no CSS engine, no module-level
 * Maps, flat CJS dependency tree) that loads through jiti's transform
 * pipeline without nativeModules workarounds.
 *
 * @param html Markup to parse.
 * @param url  Page URL passed through to the parser options.
 * @returns The parsed Document.
 */
export function parseDom(html, url) {
    const { document } = parseHTML(html, { url });
    return document;
}
20
+ // ---------------------------------------------------------------------------
21
+ // Nav classification
22
+ // ---------------------------------------------------------------------------
23
+ const NAV_CLASS_RE = /^(nav|navbar|navigation|menu|menubar|header|footer|sidebar|breadcrumb|topbar|toolbar|site-nav|main-nav|primary-nav|global-nav)$/i;
24
+ /** True if el or any ancestor up to 5 levels looks like navigation chrome. */
25
+ export function isNavElement(el) {
26
+ if (el.closest("nav, header, footer, aside"))
27
+ return true;
28
+ if (el.closest("[role='navigation'],[role='banner'],[role='contentinfo'],[role='complementary']"))
29
+ return true;
30
+ let node = el;
31
+ for (let i = 0; i < 5; i++) {
32
+ if (!node)
33
+ break;
34
+ for (const cls of node.classList) {
35
+ if (NAV_CLASS_RE.test(cls))
36
+ return true;
37
+ }
38
+ node = node.parentElement;
39
+ }
40
+ return false;
41
+ }
42
+ // ---------------------------------------------------------------------------
43
+ // Link text extraction
44
+ // ---------------------------------------------------------------------------
45
+ /** Extract visible text from an anchor, skipping SVG subtrees. */
46
+ export function anchorText(a) {
47
+ if (!a.querySelector("svg")) {
48
+ return (a.textContent ?? "").replace(/\s+/g, " ").trim();
49
+ }
50
+ const clone = a.cloneNode(true);
51
+ for (const svg of [...clone.querySelectorAll("svg")])
52
+ svg.remove();
53
+ return (clone.textContent ?? "").replace(/\s+/g, " ").trim();
54
+ }
55
+ // ---------------------------------------------------------------------------
56
+ // Link extraction
57
+ // ---------------------------------------------------------------------------
58
/**
 * Extract outbound links from the DOM, classified as body or nav.
 *
 * For each `a[href]` in document order, up to 200 accepted links:
 *  - the label is anchorText() with Material-icon ligature names
 *    (open_in_new, chevron_right, ...) stripped out;
 *  - anchors with an empty href, an empty label, or a `javascript:` URL
 *    (any letter case, leading whitespace tolerated) are skipped;
 *  - `isExternal` compares the href's parsed origin with the origin of
 *    `baseUrl`. The href is resolved against `baseUrl` first, so relative
 *    links count as internal and a look-alike host such as
 *    `https://example.com.evil.com` is not mistaken for `https://example.com`;
 *  - `rel` is "nav" when isNavElement() matches the anchor, else "body".
 *
 * @param doc     Parsed document to scan.
 * @param baseUrl URL of the page; must be parseable by `new URL()`.
 * @returns Array of `{ href, text, isExternal, rel }`, at most 200 entries.
 * @throws TypeError if `baseUrl` is not a valid URL.
 */
export function extractLinks(doc, baseUrl) {
    const MAX_LINKS = 200;
    const ICON_LIGATURE_RE = /\b(open_in_new|navigate_next|navigate_before|arrow_drop_down|arrow_drop_up|chevron_right|chevron_left|expand_more|expand_less)\b/g;
    const origin = new URL(baseUrl).origin;
    const isExternalHref = (href) => {
        try {
            return new URL(href, baseUrl).origin !== origin;
        }
        catch {
            // Unparseable href: keep the historical prefix heuristic.
            return !href.startsWith(origin);
        }
    };
    const links = [];
    for (const a of Array.from(doc.querySelectorAll("a[href]"))) {
        // Stop as soon as the cap is reached instead of classifying every anchor.
        if (links.length >= MAX_LINKS)
            break;
        const href = a.href;
        const text = anchorText(a)
            .replace(ICON_LIGATURE_RE, "")
            .replace(/\s+/g, " ")
            .trim();
        if (!href || !text || /^\s*javascript:/i.test(href))
            continue;
        links.push({
            href,
            text,
            isExternal: isExternalHref(href),
            rel: isNavElement(a) ? "nav" : "body",
        });
    }
    return links;
}
80
+ // ---------------------------------------------------------------------------
81
+ // Heading extraction
82
+ // ---------------------------------------------------------------------------
83
/**
 * Collect the h1/h2/h3 headings found in an HTML fragment.
 *
 * The fragment is wrapped in a minimal <html><body> shell and parsed with
 * linkedom. Each heading yields `{ level, text }`, where `level` is the
 * digit taken from the tag name and `text` is the trimmed text content.
 * Headings whose text is empty after trimming are left out.
 *
 * @param html Article HTML (e.g. Readability output).
 * @returns Headings in document order.
 */
export function extractHeadings(html) {
    const { document } = parseHTML(`<html><body>${html}</body></html>`);
    return Array.from(document.querySelectorAll("h1, h2, h3"))
        .map((el) => ({
            level: parseInt(el.tagName[1], 10),
            text: (el.textContent ?? "").trim(),
        }))
        .filter((heading) => heading.text);
}
95
+ // ---------------------------------------------------------------------------
96
+ // Tag extraction
97
+ // ---------------------------------------------------------------------------
98
+ /** Extract topic tags from meta keywords and article:tag. */
99
+ export function extractTags(doc) {
100
+ const tags = new Set();
101
+ const keywords = doc.querySelector('meta[name="keywords"]')?.getAttribute("content") ?? "";
102
+ for (const k of keywords
103
+ .split(/[,;]/)
104
+ .map((k) => k.trim().toLowerCase())
105
+ .filter(Boolean)) {
106
+ tags.add(k);
107
+ }
108
+ doc.querySelectorAll('meta[property="article:tag"], meta[name="article:tag"]').forEach((el) => {
109
+ const t = el.getAttribute("content")?.trim().toLowerCase();
110
+ if (t)
111
+ tags.add(t);
112
+ });
113
+ const section = doc.querySelector('meta[property="article:section"]')?.getAttribute("content") ??
114
+ doc.querySelector('meta[property="og:article:section"]')?.getAttribute("content");
115
+ if (section)
116
+ tags.add(section.trim().toLowerCase());
117
+ return [...tags].slice(0, 20);
118
+ }
119
+ // ---------------------------------------------------------------------------
120
+ // Canonical URL extraction
121
+ // ---------------------------------------------------------------------------
122
+ /** Extract canonical URL from link[rel=canonical] or og:url. */
123
+ export function extractCanonicalUrl(doc, fetchedUrl) {
124
+ const canonical = doc.querySelector('link[rel="canonical"]')?.getAttribute("href") ??
125
+ doc.querySelector('meta[property="og:url"]')?.getAttribute("content");
126
+ if (!canonical)
127
+ return undefined;
128
+ const norm = (u) => u.replace(/\/$/, "");
129
+ return norm(canonical) !== norm(fetchedUrl) ? canonical : undefined;
130
+ }
131
+ //# sourceMappingURL=parse.js.map
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Playwright adapter — implements IHttpClient using a headless browser.
3
+ *
4
+ * Uses playwright-extra with the stealth plugin, which patches ~15 headless
5
+ * fingerprint signals (navigator.webdriver, User-Agent, plugins, WebGL, etc.)
6
+ * so the browser is indistinguishable from a real Chrome session.
7
+ *
8
+ * Requires system-installed Chrome (channel:"chrome") — no browser binary
9
+ * is downloaded. Falls back gracefully to plain playwright-core if
10
+ * playwright-extra or the stealth plugin are not installed.
11
+ *
12
+ * Browser lifecycle:
13
+ * - Launched lazily on the first fetch() call.
14
+ * - Reused across all subsequent requests (one browser, one tab per request).
15
+ * - Call close() when done to release the browser process.
16
+ *
17
+ * Usage:
18
+ * const client = new PlaywrightHttpClient()
19
+ * const page = await spider(url, { httpClient: client })
20
+ * await client.close()
21
+ */
22
+ import type { HttpRequest, HttpResponse, IHttpClient } from "./ports.js";
23
+ export interface PlaywrightClientOptions {
24
+ /**
25
+ * Browser channel — finds a system-installed browser automatically.
26
+ * "chrome" — Google Chrome (default)
27
+ * "msedge" — Microsoft Edge
28
+ * "chromium" — Playwright's own Chromium (must be installed separately)
29
+ */
30
+ channel?: "chrome" | "msedge" | "chromium";
31
+ /**
32
+ * Explicit path to a browser executable.
33
+ * Overrides `channel`. Use when Chrome is not in the standard location.
34
+ */
35
+ executablePath?: string;
36
+ /**
37
+ * Navigation timeout in ms. Default: 30 000.
38
+ */
39
+ timeoutMs?: number;
40
+ /**
41
+ * When to consider navigation complete.
42
+ * "networkidle" — no network activity for 500ms (best for SPAs, default).
43
+ * "domcontentloaded" — HTML parsed; faster but may miss lazy-loaded content.
44
+ * "load" — window load event fired.
+ * "commit" — network response received and the document started loading.
45
+ */
46
+ waitUntil?: "load" | "domcontentloaded" | "networkidle" | "commit";
47
+ /**
48
+ * When true, image and media resource types are allowed through instead of
49
+ * being aborted. Required when spider() is called with captureImages: true
50
+ * so that individual image fetches via this client succeed.
51
+ * Fonts are always blocked regardless of this flag.
52
+ * Default: false.
53
+ */
54
+ captureImages?: boolean;
55
+ }
56
+ export declare class PlaywrightHttpClient implements IHttpClient {
57
+ private browser;
58
+ private readonly channel;
59
+ private readonly executablePath;
60
+ private readonly timeoutMs;
61
+ private readonly waitUntil;
62
+ private readonly captureImages;
63
+ constructor(opts?: PlaywrightClientOptions);
64
+ private getChromium;
65
+ private getBrowser;
66
+ fetch(req: HttpRequest): Promise<HttpResponse>;
67
+ /** Close the shared browser process. Call when the client is no longer needed. */
68
+ close(): Promise<void>;
69
+ }
70
+ /**
71
+ * Create a PlaywrightHttpClient. The return type is nullable, but browser
72
+ * modules are imported lazily on the first fetch(), so a missing playwright-core surfaces there, not here.
73
+ */
74
+ export declare function createPlaywrightClient(opts?: PlaywrightClientOptions): PlaywrightHttpClient | null;
75
+ //# sourceMappingURL=playwright.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"playwright.d.ts","sourceRoot":"","sources":["../src/playwright.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAEzE,MAAM,WAAW,uBAAuB;IACvC;;;;;OAKG;IACH,OAAO,CAAC,EAAE,QAAQ,GAAG,QAAQ,GAAG,UAAU,CAAC;IAC3C;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;;;OAKG;IACH,SAAS,CAAC,EAAE,MAAM,GAAG,kBAAkB,GAAG,aAAa,GAAG,QAAQ,CAAC;IACnE;;;;;;OAMG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;CACxB;AAMD,qBAAa,oBAAqB,YAAW,WAAW;IAEvD,OAAO,CAAC,OAAO,CAAoB;IACnC,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAS;IACxC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAU;gBAE5B,IAAI,GAAE,uBAA4B;YAQhC,WAAW;YAiBX,UAAU;IAUlB,KAAK,CAAC,GAAG,EAAE,WAAW,GAAG,OAAO,CAAC,YAAY,CAAC;IAiEpD,kFAAkF;IAC5E,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAM5B;AAED;;;GAGG;AACH,wBAAgB,sBAAsB,CACrC,IAAI,CAAC,EAAE,uBAAuB,GAC5B,oBAAoB,GAAG,IAAI,CAM7B"}
@@ -0,0 +1,141 @@
1
+ /**
2
+ * Playwright adapter — implements IHttpClient using a headless browser.
3
+ *
4
+ * Uses playwright-extra with the stealth plugin, which patches ~15 headless
5
+ * fingerprint signals (navigator.webdriver, User-Agent, plugins, WebGL, etc.)
6
+ * so the browser is indistinguishable from a real Chrome session.
7
+ *
8
+ * Requires system-installed Chrome (channel:"chrome") — no browser binary
9
+ * is downloaded. Falls back gracefully to plain playwright-core if
10
+ * playwright-extra or the stealth plugin are not installed.
11
+ *
12
+ * Browser lifecycle:
13
+ * - Launched lazily on the first fetch() call.
14
+ * - Reused across all subsequent requests (one browser, one tab per request).
15
+ * - Call close() when done to release the browser process.
16
+ *
17
+ * Usage:
18
+ * const client = new PlaywrightHttpClient()
19
+ * const page = await spider(url, { httpClient: client })
20
+ * await client.close()
21
+ */
22
+ // Module-level flag: stealth is wired to the playwright-extra chromium
23
+ // singleton once and stays active for the lifetime of the process.
24
+ let stealthApplied = false;
25
+ export class PlaywrightHttpClient {
26
+ constructor(opts = {}) {
27
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
28
+ this.browser = null;
29
+ this.channel = opts.channel ?? "chrome";
30
+ this.executablePath = opts.executablePath ?? "";
31
+ this.timeoutMs = opts.timeoutMs ?? 30_000;
32
+ this.waitUntil = opts.waitUntil ?? "networkidle";
33
+ this.captureImages = opts.captureImages ?? false;
34
+ }
35
+ async getChromium() {
36
+ // Prefer playwright-extra + stealth — patches headless fingerprints.
37
+ // Falls back to plain playwright-core if playwright-extra isn't installed.
38
+ try {
39
+ const { chromium } = await import("playwright-extra");
40
+ if (!stealthApplied) {
41
+ const { default: StealthPlugin } = await import("puppeteer-extra-plugin-stealth");
42
+ chromium.use(StealthPlugin());
43
+ stealthApplied = true;
44
+ }
45
+ return chromium;
46
+ }
47
+ catch {
48
+ const { chromium } = await import("playwright-core");
49
+ return chromium;
50
+ }
51
+ }
52
+ async getBrowser() {
53
+ if (this.browser?.isConnected())
54
+ return this.browser;
55
+ const chromium = await this.getChromium();
56
+ const launchOpts = this.executablePath
57
+ ? { executablePath: this.executablePath, headless: true }
58
+ : { channel: this.channel, headless: true };
59
+ this.browser = await chromium.launch(launchOpts);
60
+ return this.browser;
61
+ }
62
+ async fetch(req) {
63
+ const browser = await this.getBrowser();
64
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
65
+ const page = await browser.newPage();
66
+ // Suppress browser-side console output and JS errors — they are not
67
+ // useful to the caller and would leak into Pi's TUI stream.
68
+ page.on("console", () => { });
69
+ page.on("pageerror", () => { });
70
+ try {
71
+ // Block fonts always (never needed for HTML extraction).
72
+ // Block images and media during page navigation for speed — unless
73
+ // this is a direct image fetch (Accept: image/*), in which case
74
+ // captureImages:true lets it through so fetchImages() can retrieve
75
+ // the binary via arrayBuffer().
76
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
77
+ await page.route("**/*", (route) => {
78
+ const type = route.request().resourceType();
79
+ const accept = route.request().headers()["accept"] ?? "";
80
+ const isImageFetch = accept.startsWith("image/");
81
+ if (type === "font") {
82
+ route.abort();
83
+ }
84
+ else if (["image", "media"].includes(type) && !(this.captureImages && isImageFetch)) {
85
+ route.abort();
86
+ }
87
+ else {
88
+ route.continue();
89
+ }
90
+ });
91
+ const response = await page.goto(req.url, {
92
+ timeout: this.timeoutMs,
93
+ waitUntil: this.waitUntil,
94
+ });
95
+ if (!response) {
96
+ throw new Error(`Navigation failed — no response for ${req.url}`);
97
+ }
98
+ const status = response.status();
99
+ if (status >= 400) {
100
+ throw new Error(`HTTP ${status} ${response.statusText()} — ${req.url}`);
101
+ }
102
+ // page.content() returns the full serialised DOM after JS execution.
103
+ const html = await page.content();
104
+ const headers = await response.allHeaders();
105
+ return {
106
+ ok: true,
107
+ status,
108
+ statusText: response.statusText(),
109
+ headers: { get: (name) => headers[name.toLowerCase()] ?? null },
110
+ text: async () => html,
111
+ arrayBuffer: async () => {
112
+ const buf = await response.body();
113
+ return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
114
+ },
115
+ };
116
+ }
117
+ finally {
118
+ await page.close();
119
+ }
120
+ }
121
+ /** Close the shared browser process. Call when the client is no longer needed. */
122
+ async close() {
123
+ if (this.browser) {
124
+ await this.browser.close();
125
+ this.browser = null;
126
+ }
127
+ }
128
+ }
129
/**
 * Factory for PlaywrightHttpClient.
 *
 * Returns null only if the constructor throws. The constructor just stores
 * options — browser modules are imported lazily on the first fetch() — so a
 * missing playwright-core is reported by fetch(), not by this function.
 *
 * @param opts Optional client settings, forwarded to the constructor.
 * @returns A new client, or null when construction throws.
 */
export function createPlaywrightClient(opts) {
    let client = null;
    try {
        client = new PlaywrightHttpClient(opts);
    }
    catch {
        client = null;
    }
    return client;
}
141
+ //# sourceMappingURL=playwright.js.map
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Port interfaces — the contracts the core depends on.
3
+ *
4
+ * No concrete imports. Adapters implement these; the core orchestrates them.
5
+ * All ports are optional in SpiderOptions — concrete defaults are wired in
6
+ * spider.ts and crawl.ts so callers need not supply them unless they want
7
+ * to substitute (e.g. inject a mock HTTP client for testing).
8
+ */
9
+ export interface HttpRequest {
10
+ url: string;
11
+ headers?: Record<string, string>;
12
+ signal?: AbortSignal;
13
+ }
14
+ export interface HttpResponse {
15
+ ok: boolean;
16
+ status: number;
17
+ statusText: string;
18
+ headers: {
19
+ get(name: string): string | null;
20
+ };
21
+ text(): Promise<string>;
22
+ arrayBuffer(): Promise<ArrayBuffer>;
23
+ }
24
+ /**
25
+ * Minimal HTTP client port.
26
+ * Default adapter wraps global fetch().
27
+ * Swap for tests: return fixed HTML without touching the network.
28
+ */
29
+ export interface IHttpClient {
30
+ fetch(req: HttpRequest): Promise<HttpResponse>;
31
+ }
32
+ /**
33
+ * Generic cache port.
34
+ * Default adapter: SpiderCache (LRU, TTL).
35
+ * Swap for tests or production: in-memory Map, Redis, SQLite, etc.
36
+ */
37
+ export interface ICache<K, V> {
38
+ get(key: K): V | undefined;
39
+ set(key: K, value: V): void;
40
+ has(key: K): boolean;
41
+ delete(key: K): void;
42
+ /** All currently valid (non-expired) values. */
43
+ values(): V[];
44
+ }
45
+ /**
46
+ * Per-domain request throttle port.
47
+ * Default adapter: DomainThrottle (token bucket + exponential backoff).
48
+ * Swap for tests: no-op implementation that always resolves immediately.
49
+ */
50
+ export interface IThrottle {
51
+ wait(url: string): Promise<void>;
52
+ success(url: string): void;
53
+ rateLimit(url: string, retryAfterHeader: string | null): number;
54
+ setDomainDelay(host: string, ms: number): void;
55
+ readonly maxRetries: number;
56
+ }
57
+ export interface RobotsResult {
58
+ allowed: boolean;
59
+ crawlDelayMs?: number;
60
+ }
61
+ /**
62
+ * robots.txt compliance port.
63
+ * Default adapter: RobotsCache (fetches + parses per origin, 1h TTL).
64
+ * Swap for tests: permissive stub that always returns { allowed: true }.
65
+ */
66
+ export interface IRobotsChecker {
67
+ check(url: string): Promise<RobotsResult>;
68
+ }
69
+ export interface SearchQuery {
70
+ query: string;
71
+ numResults?: number;
72
+ /**
73
+ * Restrict results to content published within this window.
74
+ * Supported by Tavily ("day"|"week"|"month"|"year") and Brave ("pd"|"pw"|"pm"|"py").
75
+ * Adapters map this to their engine-specific parameter name.
76
+ */
77
+ timeRange?: "day" | "week" | "month" | "year";
78
+ /**
79
+ * Search topic mode. "news" prioritises freshly indexed news articles.
80
+ * Supported by Tavily. Ignored by engines that don't support it.
81
+ */
82
+ topic?: "news" | "general";
83
+ }
84
+ /**
85
+ * A single result from a web search engine.
86
+ * Defined here so port interfaces have no dependency on adapter modules.
87
+ */
88
+ export interface WebSearchResult {
89
+ url: string;
90
+ title: string;
91
+ /** Short description or snippet from the search engine. */
92
+ snippet: string;
93
+ /** ISO-8601 or human-readable date, if the engine returned one. */
94
+ publishedAt?: string;
95
+ }
96
+ /**
97
+ * Web search engine port.
98
+ * Adapters: BraveSearchEngine, TavilySearchEngine (in web-search.ts).
99
+ * Swap for tests: stub returning fixed results.
100
+ */
101
+ export interface ISearchEngine {
102
+ search(req: SearchQuery): Promise<WebSearchResult[]>;
103
+ }
104
+ //# sourceMappingURL=ports.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ports.d.ts","sourceRoot":"","sources":["../src/ports.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAMH,MAAM,WAAW,WAAW;IAC3B,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,MAAM,CAAC,EAAE,WAAW,CAAC;CACrB;AAED,MAAM,WAAW,YAAY;IAC5B,EAAE,EAAE,OAAO,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE;QAAE,GAAG,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAAA;KAAE,CAAC;IAC9C,IAAI,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC;IACxB,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,CAAC;CACpC;AAED;;;;GAIG;AACH,MAAM,WAAW,WAAW;IAC3B,KAAK,CAAC,GAAG,EAAE,WAAW,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC;CAC/C;AAMD;;;;GAIG;AACH,MAAM,WAAW,MAAM,CAAC,CAAC,EAAE,CAAC;IAC3B,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,GAAG,SAAS,CAAC;IAC3B,GAAG,CAAC,GAAG,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,GAAG,IAAI,CAAC;IAC5B,GAAG,CAAC,GAAG,EAAE,CAAC,GAAG,OAAO,CAAC;IACrB,MAAM,CAAC,GAAG,EAAE,CAAC,GAAG,IAAI,CAAC;IACrB,gDAAgD;IAChD,MAAM,IAAI,CAAC,EAAE,CAAC;CACd;AAMD;;;;GAIG;AACH,MAAM,WAAW,SAAS;IACzB,IAAI,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IACjC,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,gBAAgB,EAAE,MAAM,GAAG,IAAI,GAAG,MAAM,CAAC;IAChE,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/C,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;CAC5B;AAMD,MAAM,WAAW,YAAY;IAC5B,OAAO,EAAE,OAAO,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;;GAIG;AACH,MAAM,WAAW,cAAc;IAC9B,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC;CAC1C;AAMD,MAAM,WAAW,WAAW;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;;;OAIG;IACH,SAAS,CAAC,EAAE,KAAK,GAAG,MAAM,GAAG,OAAO,GAAG,MAAM,CAAC;IAC9C;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;CAC3B;AAED;;;GAGG;AACH,MAAM,WAAW,eAAe;IAC/B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,2DAA2D;IAC3D,OAAO,EAAE,MAAM,CAAC;IAChB,mEAAmE;IACnE,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;;;GAIG;AACH,MAAM,WAAW,aAAa;IAC7B,MAAM,CAAC,GAAG,EAAE,WAAW,GAAG,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC;CACrD"}
package/dist/ports.js ADDED
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Port interfaces — the contracts the core depends on.
3
+ *
4
+ * No concrete imports. Adapters implement these; the core orchestrates them.
5
+ * All ports are optional in SpiderOptions — concrete defaults are wired in
6
+ * spider.ts and crawl.ts so callers need not supply them unless they want
7
+ * to substitute (e.g. inject a mock HTTP client for testing).
8
+ */
9
+ export {};
10
+ //# sourceMappingURL=ports.js.map
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Minimal robots.txt fetcher and per-domain cache.
3
+ * Respects User-agent: * directives (Allow, Disallow, Crawl-delay).
4
+ * Fails open — any fetch/parse error allows all URLs.
5
+ */
6
+ import type { IRobotsChecker, RobotsResult } from "./ports.js";
7
+ export declare class RobotsCache implements IRobotsChecker {
8
+ private readonly cache;
9
+ private readonly userAgent;
10
+ constructor(userAgent?: string);
11
+ /**
12
+ * Returns whether the URL is allowed and the crawl-delay if specified.
13
+ * Caches per origin for 1 hour. Fails open on any error.
14
+ */
15
+ check(url: string): Promise<RobotsResult>;
16
+ private fetchRobots;
17
+ }
18
+ /**
19
+ * Factory — avoids jiti/Bun CJS re-export interop where class constructors
20
+ * accessed through a re-export chain can appear undefined at call site.
21
+ * Use this in extension code instead of `new RobotsCache()`.
22
+ */
23
+ export declare function createRobotsCache(userAgent?: string): RobotsCache;
24
+ //# sourceMappingURL=robots.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"robots.d.ts","sourceRoot":"","sources":["../src/robots.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAwDH,OAAO,KAAK,EAAE,cAAc,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAI/D,qBAAa,WAAY,YAAW,cAAc;IACjD,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAkE;IACxF,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAEvB,SAAS,SAAmB;IAIxC;;;OAGG;IACG,KAAK,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;YAgBjC,WAAW;CAmBzB;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,SAAS,CAAC,EAAE,MAAM,GAAG,WAAW,CAEjE"}
package/dist/robots.js ADDED
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Minimal robots.txt fetcher and per-domain cache.
3
+ * Respects User-agent: * directives (Allow, Disallow, Crawl-delay).
4
+ * Fails open — any fetch/parse error allows all URLs.
5
+ */
6
/**
 * Parse robots.txt text, keeping only the rules that apply to `User-agent: *`.
 *
 * Consecutive `User-agent` lines form ONE group: its rules apply to `*` when
 * any line of that run names `*`. A `User-agent` line that follows a rule
 * line starts a new group. Text after `#` is a comment; lines without a
 * colon are ignored. Empty Allow/Disallow values are dropped. Crawl-delay is
 * read in seconds, converted to milliseconds and capped at 60 000; a later
 * valid value overrides an earlier one.
 *
 * @param text Raw robots.txt content.
 * @returns `{ directives, crawlDelayMs }`, where each directive is
 *          `{ allow, path }` in file order and crawlDelayMs may be undefined.
 */
function parse(text) {
    const directives = [];
    let crawlDelayMs;
    // True while the current group applies to "*".
    let inBlock = false;
    // True while we are still inside a run of User-agent lines.
    let readingAgents = false;
    for (const raw of text.split(/\r?\n/)) {
        const line = raw.split("#")[0].trim();
        if (!line)
            continue;
        const colon = line.indexOf(":");
        if (colon === -1)
            continue;
        const key = line.slice(0, colon).trim().toLowerCase();
        const value = line.slice(colon + 1).trim();
        if (key === "user-agent") {
            const isWildcard = value === "*";
            // Extend the current run, or open a fresh group.
            inBlock = readingAgents ? inBlock || isWildcard : isWildcard;
            readingAgents = true;
            continue;
        }
        const isRule = key === "disallow" || key === "allow" || key === "crawl-delay";
        if (!isRule)
            continue; // e.g. Sitemap: — does not affect grouping
        readingAgents = false;
        if (!inBlock)
            continue;
        if (key === "crawl-delay") {
            const seconds = parseFloat(value);
            if (!isNaN(seconds) && seconds > 0)
                crawlDelayMs = Math.min(seconds * 1_000, 60_000);
        }
        else if (value) {
            directives.push({ allow: key === "allow", path: value });
        }
    }
    return { directives, crawlDelayMs };
}
39
+ function isAllowed(robots, path) {
40
+ // Longest matching path prefix wins.
41
+ let best;
42
+ for (const d of robots.directives) {
43
+ if (path.startsWith(d.path)) {
44
+ if (!best || d.path.length > best.path.length)
45
+ best = d;
46
+ }
47
+ }
48
+ return best?.allow ?? true; // default: allow
49
+ }
50
const TTL_MS = 60 * 60 * 1_000; // 1 hour
/**
 * robots.txt checker with a per-origin cache.
 *
 * Parsed rules are cached per origin for one hour. Any failure to fetch or
 * parse robots.txt yields an empty rule set, i.e. everything is allowed.
 */
export class RobotsCache {
    /**
     * @param userAgent Value sent as the User-Agent header when fetching
     *                  robots.txt. Defaults to "web-spider/0.1".
     */
    constructor(userAgent = "web-spider/0.1") {
        this.cache = new Map();
        // origin → in-flight robots.txt fetch, so concurrent checks share one request.
        this.pending = new Map();
        this.userAgent = userAgent;
    }
    /**
     * Returns whether the URL is allowed and the crawl-delay if specified.
     * Rules are matched against path + query string. Caches per origin for
     * 1 hour. Fails open on any fetch/parse error.
     *
     * @param url Absolute URL to check.
     * @returns `{ allowed, crawlDelayMs }`.
     * @throws TypeError if `url` is not a valid absolute URL.
     */
    async check(url) {
        const { origin, pathname, search } = new URL(url);
        const robots = await this.robotsFor(origin);
        return {
            allowed: isAllowed(robots, pathname + search),
            crawlDelayMs: robots.crawlDelayMs,
        };
    }
    /** Cached rules for `origin`, fetching (once) when absent or expired. */
    async robotsFor(origin) {
        const entry = this.cache.get(origin);
        if (entry && Date.now() <= entry.expiresAt)
            return entry.robots;
        let inflight = this.pending.get(origin);
        if (!inflight) {
            inflight = this.fetchRobots(`${origin}/robots.txt`)
                .then((robots) => {
                this.cache.set(origin, { robots, expiresAt: Date.now() + TTL_MS });
                return robots;
            })
                .finally(() => {
                this.pending.delete(origin);
            });
            this.pending.set(origin, inflight);
        }
        return inflight;
    }
    /**
     * Fetch and parse one robots.txt with a 5 s overall timeout (headers and
     * body). Never rejects: a non-OK status or any error gives empty rules.
     */
    async fetchRobots(robotsUrl) {
        const controller = new AbortController();
        const timer = setTimeout(() => controller.abort(), 5_000);
        try {
            const res = await globalThis.fetch(robotsUrl, {
                signal: controller.signal,
                headers: { "User-Agent": this.userAgent },
            });
            if (!res.ok)
                return { directives: [] }; // 404 → allow all
            // Body is read before the timer is cleared so a stalled body also times out.
            return parse(await res.text());
        }
        catch {
            return { directives: [] }; // network error → fail open
        }
        finally {
            clearTimeout(timer);
        }
    }
}
96
/**
 * Factory wrapper around `new RobotsCache(userAgent)`.
 *
 * Exists because, under jiti/Bun CJS re-export interop, a class constructor
 * reached through a re-export chain can appear undefined at the call site.
 * Extension code should call this instead of `new RobotsCache()`.
 *
 * @param userAgent Optional User-Agent string; when omitted the
 *                  constructor's default applies.
 * @returns A new RobotsCache instance.
 */
export function createRobotsCache(userAgent) {
    const checker = new RobotsCache(userAgent);
    return checker;
}
104
+ //# sourceMappingURL=robots.js.map
@@ -0,0 +1,47 @@
1
+ import type { SpideredPage } from "./types.js";
2
+ /** A single ranked match from fuzzySearch. */
3
+ export interface SearchHit {
4
+ /** URL of the page the match came from. */
5
+ url: string;
6
+ /**
7
+ * Stable chunk ID ("url#chunk-N") when the match is in body text.
8
+ * Empty string when the match is in page metadata (title, description,
9
+ * headings).
10
+ */
11
+ chunkId: string;
12
+ /** Nearest heading for the matched chunk, or the matched field name for
13
+ * metadata hits (e.g. "title", "description"). */
14
+ heading: string;
15
+ /** Normalised score 0–1. Higher is a better match. */
16
+ score: number;
17
+ /** Short context window around the best match, ≤ 2×snippetRadius chars.
18
+ * Prefixed/suffixed with "…" when truncated. */
19
+ snippet: string;
20
+ }
21
+ export interface FuzzySearchOptions {
22
+ /** Maximum hits to return (default 10). */
23
+ topN?: number;
24
+ /**
25
+ * Characters of context on each side of the match in the snippet
26
+ * (default 100). Keep low to save tokens; raise when you need more context.
27
+ */
28
+ snippetRadius?: number;
29
+ }
30
+ /**
31
+ * Full-text search across a set of already-spidered pages using MiniSearch
32
+ * (BM25F ranking, fuzzy edit-distance, prefix search, heading field boost ×2).
33
+ *
34
+ * Searches both body chunks and page metadata (title, description, headings).
35
+ * Returns results ranked by score descending, normalised to 0–1.
36
+ *
37
+ * Designed for agent use: call after fetching pages to locate a specific
38
+ * fact, term, or section without dumping all content into context.
39
+ *
40
+ * @example
41
+ * const hits = searchPages(pages, "cost optimization selectors", { topN: 5 })
42
+ * // hits[0].snippet → "…LLM extraction vs Selectors…"
43
+ */
44
+ export declare function searchPages(pages: SpideredPage[], query: string, opts?: FuzzySearchOptions): SearchHit[];
45
+ /** @deprecated Use {@link searchPages} — renamed in v0.4.0 to reflect BM25F ranking. */
46
+ export declare const fuzzySearch: typeof searchPages;
47
+ //# sourceMappingURL=search.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"search.d.ts","sourceRoot":"","sources":["../src/search.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE/C,8CAA8C;AAC9C,MAAM,WAAW,SAAS;IACzB,2CAA2C;IAC3C,GAAG,EAAE,MAAM,CAAC;IACZ;;;;OAIG;IACH,OAAO,EAAE,MAAM,CAAC;IAChB;uDACmD;IACnD,OAAO,EAAE,MAAM,CAAC;IAChB,sDAAsD;IACtD,KAAK,EAAE,MAAM,CAAC;IACd;qDACiD;IACjD,OAAO,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,kBAAkB;IAClC,2CAA2C;IAC3C,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;CACvB;AA2DD;;;;;;;;;;;;;GAaG;AACH,wBAAgB,WAAW,CAAC,KAAK,EAAE,YAAY,EAAE,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,kBAAuB,GAAG,SAAS,EAAE,CAiE5G;AAED,wFAAwF;AACxF,eAAO,MAAM,WAAW,oBAAc,CAAA"}