@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
@@ -0,0 +1,24 @@
1
+ import type { SpiderCache } from "./cache.js";
2
+ import type { SpiderOptions } from "./spider.js";
3
+ import type { SpideredPage } from "./types.js";
4
+ export interface BatchOptions extends SpiderOptions {
5
+ /** Max concurrent fetches (default 3 — be polite) */
6
+ concurrency?: number;
7
+ /** Fixed delay in ms between each fetch start (default 300) */
8
+ delayMs?: number;
9
+ /** Optional cache — already-cached URLs are skipped */
10
+ cache?: SpiderCache;
11
+ /** Called after each URL completes (success or failure) */
12
+ onProgress?: (done: number, total: number, url: string, error?: Error) => void;
13
+ }
14
+ /**
15
+ * Spider multiple URLs concurrently with a bounded semaphore.
16
+ *
17
+ * Returns a Map keyed by URL. Value is either a SpideredPage (success)
18
+ * or an Error (failure). Errors do not poison the batch.
19
+ *
20
+ * Cache integration: if `opts.cache` is provided, cached pages are
21
+ * returned immediately and do not count toward concurrency.
22
+ */
23
+ export declare function batchSpider(urls: string[], opts?: BatchOptions): Promise<Map<string, SpideredPage | Error>>;
24
+ //# sourceMappingURL=batch.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../src/batch.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAC9C,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAEjD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE/C,MAAM,WAAW,YAAa,SAAQ,aAAa;IAClD,qDAAqD;IACrD,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,+DAA+D;IAC/D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,2DAA2D;IAC3D,UAAU,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,KAAK,KAAK,IAAI,CAAC;CAC/E;AAED;;;;;;;;GAQG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,IAAI,GAAE,YAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,YAAY,GAAG,KAAK,CAAC,CAAC,CAmErH"}
package/dist/batch.js ADDED
@@ -0,0 +1,68 @@
1
+ import { spider } from "./spider.js";
2
/**
 * Spider multiple URLs concurrently with a bounded worker pool.
 *
 * Returns a Map keyed by URL. Value is either a SpideredPage (success)
 * or an Error (failure). Errors do not poison the batch.
 *
 * Cache integration: if `opts.cache` is provided, cached pages are
 * returned immediately and do not count toward concurrency.
 *
 * @param urls Input URLs; exact-string duplicates are collapsed first.
 * @param opts Batch options. `concurrency` (default 3) is clamped to at
 *   least 1. `delayMs` (default 300) is the minimum spacing between fetch
 *   starts; values <= 0 disable the spacing. Every other option except
 *   `throttle` and `robotsCache` is forwarded to `spider()`.
 * @returns Map from each unique URL to its page or to the Error it raised.
 * @throws Whatever `onProgress` throws; the batch rejects instead of hanging.
 */
export async function batchSpider(urls, opts = {}) {
    // `throttle` and `robotsCache` are pulled out only so that they are NOT
    // part of `spiderOpts`; batchSpider itself never reads them.
    const {
        concurrency = 3,
        delayMs = 300,
        cache,
        onProgress,
        throttle: _throttle,
        robotsCache: _robotsCache,
        ...spiderOpts
    } = opts;
    const results = new Map();
    const unique = [...new Set(urls)];
    let done = 0;

    // Satisfy cache hits synchronously before touching the network.
    const toFetch = [];
    for (const url of unique) {
        const cached = cache?.get(url);
        if (cached) {
            results.set(url, cached);
            done++;
            onProgress?.(done, unique.length, url);
        } else {
            toFetch.push(url);
        }
    }
    if (toFetch.length === 0) return results;

    // A limit of 0, a negative number or NaN would start nothing and never
    // settle, so fall back to a single worker. Math.ceil keeps fractional
    // limits behaving as before (2.5 allowed three in flight).
    const limit = Math.max(1, Math.ceil(concurrency) || 1);
    const spacing = delayMs > 0 ? delayMs : 0;

    // Start slots are fixed on one timeline measured from here. Computing the
    // delay relative to "now" each time a slot is refilled would make the
    // Nth URL wait N * delayMs again after a worker had already become free.
    const startedAt = Date.now();
    let next = 0;

    const worker = async () => {
        while (next < toFetch.length) {
            const position = next++;
            const url = toFetch[position];
            const wait = startedAt + spacing * position - Date.now();
            if (wait > 0) await new Promise((r) => setTimeout(r, wait));

            let error;
            try {
                const page = await spider(url, spiderOpts);
                results.set(url, page);
                cache?.set(url, page);
            } catch (err) {
                error = err instanceof Error ? err : new Error(String(err));
                results.set(url, error);
            }
            done++;
            // Outside the try block on purpose: a throwing callback rejects
            // the batch rather than being recorded as a fetch failure.
            onProgress?.(done, unique.length, url, error);
        }
    };

    await Promise.all(Array.from({ length: Math.min(limit, toFetch.length) }, () => worker()));
    return results;
}
68
+ //# sourceMappingURL=batch.js.map
@@ -0,0 +1,40 @@
1
+ import type { ICache } from "./ports.js";
2
+ import type { SpideredPage } from "./types.js";
3
+ export interface SpiderCacheOptions {
4
+ /** Maximum number of pages to hold (default 500) */
5
+ maxSize?: number;
6
+ /** Time-to-live in milliseconds (default 30 min) */
7
+ ttlMs?: number;
8
+ }
9
+ /**
10
+ * LRU cache for spidered pages.
11
+ *
12
+ * Implements the Identity Map pattern from Local Materialized View:
13
+ * exactly one entry per normalised URL — duplicate fetches never happen.
14
+ *
15
+ * Uses a plain object (Object.create(null)) for storage rather than a Map.
16
+ * Plain objects carry no realm-specific internal slots, so they are safe
17
+ * across V8 context (realm) boundaries — e.g. when the cache is constructed
18
+ * in an ESM module realm but called from a jiti VM-sandbox realm.
19
+ *
20
+ * JavaScript objects maintain insertion order for string keys (ES2015+),
21
+ * so delete-then-reinsert gives the same LRU-tail promotion semantics as a
22
+ * Map without any cross-realm risk.
23
+ */
24
+ export declare class SpiderCache implements ICache<string, SpideredPage> {
25
+ private readonly store;
26
+ private readonly maxSize;
27
+ private readonly ttlMs;
28
+ constructor(opts?: SpiderCacheOptions);
29
+ /** Build the cache key: drops the #fragment and one trailing slash from the serialised URL; unparseable input is used verbatim. The scheme is kept, so http:// and https:// are distinct keys. */
30
+ private key;
31
+ get(url: string): SpideredPage | undefined;
32
+ set(url: string, page: SpideredPage): void;
33
+ has(url: string): boolean;
34
+ delete(url: string): void;
35
+ clear(): void;
36
+ get size(): number;
37
+ /** Pages of all entries that have not expired yet (does not update LRU order and does not remove expired entries). */
38
+ values(): SpideredPage[];
39
+ }
40
+ //# sourceMappingURL=cache.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cache.d.ts","sourceRoot":"","sources":["../src/cache.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAO/C,MAAM,WAAW,kBAAkB;IAClC,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,oDAAoD;IACpD,KAAK,CAAC,EAAE,MAAM,CAAC;CACf;AAED;;;;;;;;;;;;;;GAcG;AACH,qBAAa,WAAY,YAAW,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC;IAC/D,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA+D;IACrF,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;gBAEnB,IAAI,GAAE,kBAAuB;IAKzC,6EAA6E;IAC7E,OAAO,CAAC,GAAG;IAUX,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,YAAY,GAAG,SAAS;IAe1C,GAAG,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,GAAG,IAAI;IAS1C,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAIzB,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI;IAIzB,KAAK,IAAI,IAAI;IAIb,IAAI,IAAI,IAAI,MAAM,CAEjB;IAED,6DAA6D;IAC7D,MAAM,IAAI,YAAY,EAAE;CAMxB"}
package/dist/cache.js ADDED
@@ -0,0 +1,78 @@
1
/**
 * LRU cache for spidered pages, with a per-entry time-to-live.
 *
 * Holds at most one entry per normalised URL (see `key`).
 *
 * Uses a plain object (Object.create(null)) for storage rather than a Map.
 * Plain objects carry no realm-specific internal slots, so they are safe
 * across V8 context (realm) boundaries — e.g. when the cache is constructed
 * in an ESM module realm but called from a jiti VM-sandbox realm.
 *
 * Recency is tracked through property insertion order: the first key is the
 * least recently used, and delete-then-reinsert moves a key to the tail.
 * NOTE: JavaScript lists integer-like keys ("42") before all other string
 * keys regardless of insertion order. Parsed URLs never produce such keys,
 * but `key` falls back to the raw input when parsing fails.
 */
export class SpiderCache {
    /**
     * @param opts `maxSize` is the entry count at which inserting a new key
     *   evicts the least recently used one (default 500). `ttlMs` is the
     *   entry lifetime in milliseconds (default 30 minutes).
     */
    constructor(opts = {}) {
        this.store = Object.create(null);
        this.maxSize = opts.maxSize ?? 500;
        this.ttlMs = opts.ttlMs ?? 30 * 60 * 1000;
    }

    /**
     * Build the storage key for a URL: the #fragment is dropped and a single
     * trailing slash is removed from the serialised URL. The scheme is kept,
     * so http:// and https:// variants are different keys. Input that is not
     * a parseable absolute URL is used verbatim.
     */
    key(url) {
        try {
            const u = new URL(url);
            u.hash = "";
            return u.toString().replace(/\/$/, "");
        } catch {
            return url;
        }
    }

    /** Return the cached page, or undefined if absent or expired. A hit becomes most recently used; an expired entry is removed. */
    get(url) {
        const k = this.key(url);
        const entry = this.store[k];
        if (!entry) return undefined;
        if (Date.now() > entry.expiresAt) {
            delete this.store[k];
            return undefined;
        }
        // Promote to tail (most recently used) by delete + reinsert.
        delete this.store[k];
        this.store[k] = entry;
        return entry.page;
    }

    /** Store a page under the normalised URL with a fresh TTL, as the most recently used entry. */
    set(url, page) {
        const k = this.key(url);
        if (k in this.store) {
            // Assigning to an existing property keeps its old position, which
            // would leave a just-refreshed page first in line for eviction.
            // Remove it so the insert below lands at the tail.
            delete this.store[k];
        } else {
            const keys = Object.keys(this.store);
            if (keys.length >= this.maxSize && keys.length > 0) {
                delete this.store[keys[0]];
            }
        }
        this.store[k] = { page, expiresAt: Date.now() + this.ttlMs };
    }

    /** True when a non-expired entry exists. Delegates to `get`, so it also promotes the entry. */
    has(url) {
        return this.get(url) !== undefined;
    }

    /** Remove the entry for this URL, if any. */
    delete(url) {
        delete this.store[this.key(url)];
    }

    /** Remove every entry. */
    clear() {
        for (const k of Object.keys(this.store)) delete this.store[k];
    }

    /** Number of stored entries, including expired ones that have not been read since expiring. */
    get size() {
        return Object.keys(this.store).length;
    }

    /** All currently valid pages (does not update LRU order). */
    values() {
        const now = Date.now();
        return Object.values(this.store)
            .filter((e) => e !== undefined && e.expiresAt > now)
            .map((e) => e.page);
    }
}
78
+ //# sourceMappingURL=cache.js.map
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Markdown conversion and chunk splitting.
3
+ *
4
+ * Owns the Turndown dependency. spider.ts calls toMarkdown() and chunk();
5
+ * it never imports Turndown directly.
6
+ */
7
+ import type { Chunk, ChunkType } from "./types.js";
8
+ export interface ToMarkdownOptions {
9
+ /**
10
+ * When true, <img> tags are rendered as ![alt](src) instead of being stripped.
11
+ * Use when captureImages is enabled so image references appear in the markdown.
12
+ * Default: false.
13
+ */
14
+ keepImages?: boolean;
15
+ }
16
+ /** Convert Readability article HTML to clean markdown. */
17
+ export declare function toMarkdown(html: string, opts?: ToMarkdownOptions): string;
18
+ /** Classify a markdown buffer by its first non-blank line: "code" (fence), "table", "list", "blockquote", otherwise "text" (also returned when every line is blank). */
19
+ export declare function detectContentType(lines: string[]): ChunkType;
20
+ /**
21
+ * Split markdown into RAG-ready chunks at heading boundaries.
22
+ *
23
+ * Atomicity guarantees:
24
+ * - Fenced code blocks (``` ... ```) are never split.
25
+ * - Markdown tables (lines starting with |) are always flushed as a single
26
+ * chunk. Prose before the table is flushed first so the table is isolated.
27
+ */
28
+ export declare function chunk(markdown: string, baseUrl: string): Chunk[];
29
+ //# sourceMappingURL=convert.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"convert.d.ts","sourceRoot":"","sources":["../src/convert.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AA8BnD,MAAM,WAAW,iBAAiB;IACjC;;;;OAIG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;CACrB;AAED,0DAA0D;AAC1D,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,iBAAiB,GAAG,MAAM,CAGzE;AAQD,+DAA+D;AAC/D,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,SAAS,CAW5D;AAMD;;;;;;;GAOG;AACH,wBAAgB,KAAK,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,KAAK,EAAE,CA2EhE"}
@@ -0,0 +1,131 @@
1
+ /**
2
+ * Markdown conversion and chunk splitting.
3
+ *
4
+ * Owns the Turndown dependency. spider.ts calls toMarkdown() and chunk();
5
+ * it never imports Turndown directly.
6
+ */
7
+ import TurndownService from "turndown";
8
/**
 * Build a Turndown converter with ATX headings and fenced code blocks.
 * Its `escape` hook is replaced by the identity function so the output
 * carries no backslash escapes, which are noise for agent consumption.
 */
const makeConverter = () => {
    const service = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
    service.escape = (text) => text;
    return service;
};
// Default converter: <img> elements render as nothing, because agents
// cannot see images and alt text is noise.
const turndown = makeConverter();
turndown.addRule("strip-images", {
    filter: "img",
    replacement: () => "",
});
// Variant without the strip rule; toMarkdown() selects it for keepImages: true.
const turndownWithImages = makeConverter();
20
/**
 * Convert Readability article HTML to markdown.
 * Uses the image-preserving converter when `opts.keepImages` is truthy,
 * otherwise the default converter that strips <img> elements.
 */
export function toMarkdown(html, opts) {
    const converter = opts?.keepImages ? turndownWithImages : turndown;
    return converter.turndown(html);
}
26
+ // ---------------------------------------------------------------------------
27
+ // Content type detection
28
+ // ---------------------------------------------------------------------------
29
+ const CHUNK_TARGET_WORDS = 150;
30
/**
 * Classify a markdown buffer by its first non-blank line.
 *
 * Checks run in order: a fence opener gives "code", a leading pipe gives
 * "table", a bullet or numbered item gives "list", a leading ">" gives
 * "blockquote". Anything else, including an all-blank buffer, is "text".
 */
export function detectContentType(lines) {
    const firstContent = lines.find((candidate) => candidate.trim() !== "");
    if (firstContent === undefined) return "text";
    const head = firstContent.trim();
    const rules = [
        ["code", () => head.startsWith("```")],
        ["table", () => head.startsWith("|")],
        ["list", () => /^[-*+] /.test(head) || /^\d+\. /.test(head)],
        ["blockquote", () => head.startsWith(">")],
    ];
    for (const [label, matches] of rules) {
        if (matches()) return label;
    }
    return "text";
}
48
+ // ---------------------------------------------------------------------------
49
+ // Chunking
50
+ // ---------------------------------------------------------------------------
51
/**
 * Split markdown into RAG-ready chunks.
 *
 * Lines accumulate in a buffer that is emitted as a chunk when it reaches
 * CHUNK_TARGET_WORDS words: checked before a level 1-3 heading is added,
 * and after any other non-blank line is added. Each chunk carries the text
 * of the most recent level 1-3 heading seen when it was emitted.
 *
 * Atomicity:
 * - Lines inside a fenced code block (``` ... ```) never trigger an emit.
 * - An emit is attempted when a table (lines starting with |) begins and
 *   again when it ends.
 *
 * A buffer with fewer than 10 words is never emitted. It stays in the
 * buffer and merges into whatever comes next, and is dropped if the input
 * ends first. A short table or short preceding prose is therefore not
 * isolated.
 *
 * @param markdown Markdown text to split.
 * @param baseUrl  Prefix for chunk ids, which are `${baseUrl}#chunk-${index}`.
 * @returns Chunks in document order, with consecutive indices from 0.
 */
export function chunk(markdown, baseUrl) {
    const countWords = (s) => s.split(/\s+/).filter(Boolean).length;
    const out = [];
    let pending = [];
    let currentHeading = "";
    let nextIndex = 0;
    let insideFence = false;
    let insideTable = false;

    const bufferedWords = () => countWords(pending.join(" "));

    // Emit the buffer as one chunk. Below the 10-word minimum the buffer is
    // deliberately left untouched.
    const emit = () => {
        const text = pending.join("\n").trim();
        const wordCount = text ? countWords(text) : 0;
        if (wordCount < 10) return;
        const contentType = detectContentType(pending);
        out.push({
            id: `${baseUrl}#chunk-${nextIndex}`,
            index: nextIndex,
            heading: currentHeading,
            text,
            wordCount,
            contentType,
        });
        nextIndex++;
        pending = [];
    };

    for (const rawLine of markdown.split("\n")) {
        const stripped = rawLine.trim();

        // Fence delimiters toggle code mode; they and everything between
        // them are buffered without any emit check.
        const isFence = stripped.startsWith("```");
        if (isFence) insideFence = !insideFence;
        if (isFence || insideFence) {
            pending.push(rawLine);
            continue;
        }

        // Table rows: try to emit preceding prose once, when the table starts.
        if (stripped.startsWith("|")) {
            if (!insideTable) {
                emit();
                insideTable = true;
            }
            pending.push(rawLine);
            continue;
        }
        // First non-table line after a table: try to emit the table.
        if (insideTable) {
            emit();
            insideTable = false;
        }

        // Blank lines are kept for formatting and never trigger an emit.
        if (!stripped) {
            pending.push(rawLine);
            continue;
        }

        const headingMatch = /^#{1,3} (.+)/.exec(stripped);
        if (headingMatch) {
            // Close a full buffer before the heading so the heading opens
            // the next chunk.
            if (bufferedWords() >= CHUNK_TARGET_WORDS) emit();
            currentHeading = headingMatch[1];
            pending.push(rawLine);
            continue;
        }

        pending.push(rawLine);
        if (bufferedWords() >= CHUNK_TARGET_WORDS) emit();
    }

    emit();
    return out;
}
131
+ //# sourceMappingURL=convert.js.map
@@ -0,0 +1,56 @@
1
+ import { PageGraph } from "./graph.js";
2
+ import type { ICache } from "./ports.js";
3
+ import type { SpiderOptions } from "./spider.js";
4
+ import type { SpideredPage } from "./types.js";
5
+ export interface CrawlOptions extends SpiderOptions {
6
+ /** How many link hops from the start URL (default 2) */
7
+ maxDepth?: number;
8
+ /** Hard cap on URLs attempted; failed fetches count toward it as well as spidered pages (default 50) */
9
+ maxPages?: number;
10
+ /** Only follow links on the same domain as the start URL (default true) */
11
+ sameDomainOnly?: boolean;
12
+ /** Max concurrent fetches (default 3) */
13
+ concurrency?: number;
14
+ /**
15
+ * Minimum delay between requests to the same domain (ms).
16
+ * Becomes minDelayMs of the DomainThrottle that crawl() creates when no `throttle` option is supplied; ignored when one is.
17
+ * Default 500.
18
+ */
19
+ delayMs?: number;
20
+ /** Bring your own cache — URLs it already holds are served from it instead of being fetched (default: a new SpiderCache) */
21
+ cache?: ICache<string, SpideredPage>;
22
+ /** Bring your own graph — nodes/edges added as pages are spidered */
23
+ graph?: PageGraph;
24
+ /** Called with each successfully spidered page */
25
+ onPage?: (page: SpideredPage, depth: number) => void;
26
+ /** Return false to skip a URL before fetching it (not applied to the start URL itself) */
27
+ urlFilter?: (url: string) => boolean;
28
+ /**
29
+ * Whether to check and respect robots.txt for each domain (default true).
30
+ * Automatically creates a RobotsCache if not provided via SpiderOptions.
31
+ */
32
+ respectRobots?: boolean;
33
+ /**
34
+ * Attempt to fetch /sitemap.xml before BFS to seed the frontier with
35
+ * all known URLs. Falls back to normal BFS on any error (default true).
36
+ */
37
+ useSitemap?: boolean;
38
+ }
39
+ export interface CrawlResult {
40
+ pages: Map<string, SpideredPage>;
41
+ graph: PageGraph;
42
+ errors: Map<string, Error>;
43
+ }
44
+ /**
45
+ * Recursive BFS crawler.
46
+ *
47
+ * Starts at `startUrl`, spiders it, extracts links, filters them, then
48
+ * recurses up to `maxDepth` hops. Respects `maxPages`, `sameDomainOnly`,
49
+ * and `urlFilter`. Populates the provided (or freshly created) cache and
50
+ * graph as it goes.
51
+ *
52
+ * Concurrency is bounded per depth level — we fully finish each level
53
+ * before proceeding, giving BFS ordering and predictable memory use.
54
+ */
55
+ export declare function crawl(startUrl: string, opts?: CrawlOptions): Promise<CrawlResult>;
56
+ //# sourceMappingURL=crawl.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"crawl.d.ts","sourceRoot":"","sources":["../src/crawl.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACvC,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AAGzC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAGjD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE/C,MAAM,WAAW,YAAa,SAAQ,aAAa;IAClD,wDAAwD;IACxD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,oDAAoD;IACpD,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,2EAA2E;IAC3E,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,yCAAyC;IACzC,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;OAIG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,+DAA+D;IAC/D,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;IACrC,qEAAqE;IACrE,KAAK,CAAC,EAAE,SAAS,CAAC;IAClB,kDAAkD;IAClD,MAAM,CAAC,EAAE,CAAC,IAAI,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;IACrD,oDAAoD;IACpD,SAAS,CAAC,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,OAAO,CAAC;IACrC;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;CACrB;AAED,MAAM,WAAW,WAAW;IAC3B,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;IACjC,KAAK,EAAE,SAAS,CAAC;IACjB,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;CAC3B;AAED;;;;;;;;;;GAUG;AACH,wBAAsB,KAAK,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,GAAE,YAAiB,GAAG,OAAO,CAAC,WAAW,CAAC,CA2H3F"}
package/dist/crawl.js ADDED
@@ -0,0 +1,126 @@
1
+ import { SpiderCache } from "./cache.js";
2
+ import { PageGraph } from "./graph.js";
3
+ import { RobotsCache } from "./robots.js";
4
+ import { fetchSitemapUrls } from "./sitemap.js";
5
+ import { spider } from "./spider.js";
6
+ import { DomainThrottle } from "./throttle.js";
7
/**
 * Breadth-first crawler.
 *
 * Starts at `startUrl`, spiders it, extracts links, filters them, then
 * continues level by level up to `maxDepth` hops. Respects `maxPages`,
 * `sameDomainOnly`, and `urlFilter`. Populates the provided (or freshly
 * created) cache and graph as it goes.
 *
 * Concurrency is bounded per depth level — each level is fully finished
 * before the next starts, giving BFS ordering and predictable memory use.
 *
 * @param startUrl Absolute URL to start from. It is always attempted;
 *   `urlFilter` and `sameDomainOnly` apply only to discovered URLs.
 * @param opts Crawl options; anything not consumed here is forwarded to
 *   `spider()` together with the shared throttle and robots cache.
 * @returns `pages` (URL to page), `errors` (URL to Error) and the graph.
 *   `maxPages` caps the combined size of `pages` and `errors`.
 * @throws TypeError from `new URL` if `startUrl` is not an absolute URL.
 */
export async function crawl(startUrl, opts = {}) {
    const {
        maxDepth = 2,
        maxPages = 50,
        sameDomainOnly = true,
        concurrency = 3,
        delayMs = 500,
        cache = new SpiderCache(),
        graph = new PageGraph(),
        onPage,
        urlFilter,
        respectRobots = true,
        useSitemap = true,
        ...spiderOpts
    } = opts;
    // `delayMs` only matters when the caller did not bring a throttle.
    const throttle = spiderOpts.throttle ?? new DomainThrottle({ minDelayMs: delayMs });
    const robotsCache = spiderOpts.robotsCache ?? (respectRobots ? new RobotsCache(spiderOpts.userAgent) : undefined);
    const httpClient = spiderOpts.httpClient;
    const startDomain = new URL(startUrl).hostname;
    // A limit of 0, a negative number or NaN would start nothing and never
    // settle; Math.ceil keeps fractional limits behaving as before.
    const limit = Math.max(1, Math.ceil(concurrency) || 1);
    const pages = new Map();
    const errors = new Map();
    const seen = new Set();

    const shouldVisit = (url) => {
        if (seen.has(url)) return false;
        if (pages.size + errors.size >= maxPages) return false;
        try {
            const u = new URL(url);
            if (!["http:", "https:"].includes(u.protocol)) return false;
            if (sameDomainOnly && u.hostname !== startDomain) return false;
        } catch {
            return false;
        }
        if (urlFilter && !urlFilter(url)) return false;
        return true;
    };

    // Throttle and robots.txt are handled inside spider() via shared instances.
    const fetchBatch = async (urls, depth) => {
        let next = 0;
        const worker = async () => {
            while (next < urls.length) {
                const url = urls[next++];
                try {
                    // One lookup only: has() followed by get() could see the
                    // entry expire in between and hand back undefined.
                    const cached = cache.get(url);
                    const fromCache = cached != null;
                    const page = fromCache ? cached : await spider(url, { ...spiderOpts, throttle, robotsCache });
                    pages.set(url, page);
                    // Store only fresh fetches. Re-setting a hit would reset
                    // its TTL, so a stale page would never expire.
                    if (!fromCache) cache.set(url, page);
                    graph.addPage(page);
                    onPage?.(page, depth);
                } catch (err) {
                    errors.set(url, err instanceof Error ? err : new Error(String(err)));
                }
            }
        };
        await Promise.all(Array.from({ length: Math.min(limit, urls.length) }, () => worker()));
    };

    let frontier = [startUrl];
    seen.add(startUrl);

    if (useSitemap) {
        const origin = new URL(startUrl).origin;
        // Use a minimal default httpClient if none was injected.
        const client = httpClient ?? {
            async fetch(req) {
                return globalThis.fetch(req.url, { headers: req.headers });
            },
        };
        let sitemapUrls = [];
        try {
            sitemapUrls = await fetchSitemapUrls(origin, client);
        } catch {
            // Sitemap seeding is best-effort: on any failure continue with
            // plain BFS from the start URL.
        }
        for (const u of sitemapUrls) {
            if (shouldVisit(u)) {
                seen.add(u);
                frontier.push(u);
            }
        }
    }

    for (let depth = 0; depth <= maxDepth; depth++) {
        if (frontier.length === 0) break;
        if (pages.size + errors.size >= maxPages) break;
        const remaining = maxPages - pages.size - errors.size;
        const batch = frontier.slice(0, remaining);
        await fetchBatch(batch, depth);
        if (depth === maxDepth) break;

        // Collect the next level from the links of pages fetched at this one.
        const nextFrontier = [];
        for (const url of batch) {
            const page = pages.get(url);
            if (!page) continue;
            for (const link of page.links) {
                if (shouldVisit(link.href)) {
                    seen.add(link.href);
                    nextFrontier.push(link.href);
                }
            }
        }
        frontier = nextFrontier;
    }
    return { pages, graph, errors };
}
126
+ //# sourceMappingURL=crawl.js.map
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Disk-backed cache implementing ICache<string, SpideredPage>.
3
+ *
4
+ * Persists to a JSON file so the cache survives extension reloads and
5
+ * pi restarts. Call flush() to write — set() auto-flushes by default.
6
+ *
7
+ * The images directory is derived automatically from `dirname(path)/images`.
8
+ * Callers do not need to create it — DiskCache creates it on first large-image
9
+ * flush. Pre-creating it at startup (e.g. in the extension boot path) is
10
+ * harmless and avoids a first-write delay.
11
+ *
12
+ * Internal storage uses a plain object (Object.create(null)) rather than a
13
+ * Map. Plain objects carry no realm-specific internal slots, making them safe
14
+ * across V8 context (realm) boundaries — e.g. when DiskCache is constructed
15
+ * in an ESM module realm but called from a jiti VM-sandbox realm (Bun binary
16
+ * mode). The Map-backed version threw "Map operation called on non-Map object"
17
+ * in that scenario.
18
+ *
19
+ * A schema version field in the persisted JSON guards against stale cache
20
+ * files from previous major versions being silently loaded with wrong shapes.
21
+ */
22
+ import type { ICache } from "./ports.js";
23
+ import type { SpideredPage } from "./types.js";
24
+ export interface DiskCacheOptions {
25
+ /** Time-to-live in ms. Default 30 min. */
26
+ ttlMs?: number;
27
+ /** Max entries. Default 500. */
28
+ maxSize?: number;
29
+ /** Auto-flush to disk on every set(). Default true. */
30
+ autoFlush?: boolean;
31
+ /**
32
+ * Base64 byte threshold for inline vs. file storage of images.
33
+ * Images whose base64 string length exceeds this are written as binary
34
+ * files to <cache-dir>/images/ instead of being stored inline in the JSON.
35
+ * Default: 32 * 1024 (32 KB of base64 ≈ 24 KB binary).
36
+ */
37
+ inlineImageThreshold?: number;
38
+ }
39
+ export declare class DiskCache implements ICache<string, SpideredPage> {
40
+ private readonly store;
41
+ private readonly path;
42
+ private readonly ttlMs;
43
+ private readonly maxSize;
44
+ private readonly autoFlush;
45
+ private readonly inlineImageThreshold;
46
+ /** Directory where large image binaries are stored. */
47
+ private readonly imagesDir;
48
+ constructor(path: string, opts?: DiskCacheOptions);
49
+ private key;
50
+ set(url: string, page: SpideredPage): void;
51
+ has(url: string): boolean;
52
+ delete(url: string): void;
53
+ /** Derive a stable filename for an image binary from its src URL. */
54
+ private imageFilename;
55
+ /**
56
+ * Prepare images for serialisation:
57
+ * - Images whose base64 length ≤ threshold are kept inline.
58
+ * - Larger images are written to imagesDir as binary files; base64 is
59
+ * replaced by filePath in the serialised entry.
60
+ */
61
+ private spill;
62
+ /**
63
+ * Hydrate images on read: if an image has filePath but no base64,
64
+ * load the binary from disk and re-encode.
65
+ */
66
+ private hydrate;
67
+ /** Write current contents to disk. Large images are spilled to imagesDir. */
68
+ flush(): void;
69
+ private load;
70
+ /** All currently valid (non-expired) pages, sorted newest-first. */
71
+ values(): SpideredPage[];
72
+ /** Retrieve a page, hydrating any file-backed images from disk. */
73
+ get(url: string): SpideredPage | undefined;
74
+ }
75
+ //# sourceMappingURL=disk-cache.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"disk-cache.d.ts","sourceRoot":"","sources":["../src/disk-cache.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAKH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,KAAK,EAAY,YAAY,EAAE,MAAM,YAAY,CAAC;AAKzD,MAAM,WAAW,gBAAgB;IAChC,0CAA0C;IAC1C,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,gCAAgC;IAChC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,uDAAuD;IACvD,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAaD,qBAAa,SAAU,YAAW,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC;IAC7D,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA0D;IAChF,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAS;IAC/B,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IACjC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAU;IACpC,OAAO,CAAC,QAAQ,CAAC,oBAAoB,CAAS;IAC9C,uDAAuD;IACvD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;gBAEvB,IAAI,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB;IAUrD,OAAO,CAAC,GAAG;IAUX,GAAG,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,GAAG,IAAI;IAU1C,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAIzB,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI;IASzB,qEAAqE;IACrE,OAAO,CAAC,aAAa;IAMrB;;;;;OAKG;IACH,OAAO,CAAC,KAAK;IAgBb;;;OAGG;IACH,OAAO,CAAC,OAAO;IAiBf,6EAA6E;IAC7E,KAAK,IAAI,IAAI;IAeb,OAAO,CAAC,IAAI;IAyBZ,oEAAoE;IACpE,MAAM,IAAI,YAAY,EAAE;IAWxB,mEAAmE;IACnE,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,YAAY,GAAG,SAAS;CAY1C"}