@dpopsuev/web-spider 0.10.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/dist/batch.d.ts +24 -0
  2. package/dist/batch.d.ts.map +1 -0
  3. package/dist/batch.js +68 -0
  4. package/dist/cache.d.ts +40 -0
  5. package/dist/cache.d.ts.map +1 -0
  6. package/dist/cache.js +78 -0
  7. package/dist/convert.d.ts +29 -0
  8. package/dist/convert.d.ts.map +1 -0
  9. package/dist/convert.js +131 -0
  10. package/dist/crawl.d.ts +56 -0
  11. package/dist/crawl.d.ts.map +1 -0
  12. package/dist/crawl.js +126 -0
  13. package/dist/disk-cache.d.ts +75 -0
  14. package/dist/disk-cache.d.ts.map +1 -0
  15. package/dist/disk-cache.js +185 -0
  16. package/dist/graph.d.ts +76 -0
  17. package/dist/graph.d.ts.map +1 -0
  18. package/dist/graph.js +156 -0
  19. package/dist/index.d.ts +45 -0
  20. package/dist/index.d.ts.map +1 -0
  21. package/dist/index.js +44 -0
  22. package/dist/parse.d.ts +27 -0
  23. package/dist/parse.d.ts.map +1 -0
  24. package/dist/parse.js +131 -0
  25. package/dist/playwright.d.ts +75 -0
  26. package/dist/playwright.d.ts.map +1 -0
  27. package/dist/playwright.js +141 -0
  28. package/dist/ports.d.ts +104 -0
  29. package/dist/ports.d.ts.map +1 -0
  30. package/dist/ports.js +10 -0
  31. package/dist/robots.d.ts +24 -0
  32. package/dist/robots.d.ts.map +1 -0
  33. package/dist/robots.js +104 -0
  34. package/dist/search.d.ts +47 -0
  35. package/dist/search.d.ts.map +1 -0
  36. package/dist/search.js +112 -0
  37. package/dist/sitemap.d.ts +15 -0
  38. package/dist/sitemap.d.ts.map +1 -0
  39. package/dist/sitemap.js +65 -0
  40. package/dist/spider.d.ts +74 -0
  41. package/dist/spider.d.ts.map +1 -0
  42. package/dist/spider.js +349 -0
  43. package/dist/throttle.d.ts +49 -0
  44. package/dist/throttle.d.ts.map +1 -0
  45. package/dist/throttle.js +85 -0
  46. package/dist/tree.d.ts +34 -0
  47. package/dist/tree.d.ts.map +1 -0
  48. package/dist/tree.js +354 -0
  49. package/dist/types.d.ts +189 -0
  50. package/dist/types.d.ts.map +1 -0
  51. package/dist/types.js +2 -0
  52. package/dist/views.d.ts +17 -0
  53. package/dist/views.d.ts.map +1 -0
  54. package/dist/views.js +39 -0
  55. package/dist/web-search.d.ts +184 -0
  56. package/dist/web-search.d.ts.map +1 -0
  57. package/dist/web-search.js +399 -0
  58. package/fixtures/article-with-images.html +94 -0
  59. package/fixtures/gh-shell.html +32 -0
  60. package/fixtures/guide-ai-agents-web-scraping.json +552 -0
  61. package/fixtures/images/large.jpg +0 -0
  62. package/fixtures/images/small.jpg +0 -0
  63. package/fixtures/images/tiny.png +0 -0
  64. package/fixtures/quotes-index.json +40 -0
  65. package/package.json +47 -0
  66. package/scripts/fetch-guide.mjs +25 -0
  67. package/src/cache.ts +99 -0
  68. package/src/convert.ts +161 -0
  69. package/src/crawl.ts +186 -0
  70. package/src/disk-cache.ts +228 -0
  71. package/src/graph.ts +189 -0
  72. package/src/index.ts +74 -0
  73. package/src/parse.ts +154 -0
  74. package/src/playwright.ts +193 -0
  75. package/src/ports.ts +131 -0
  76. package/src/robots.ts +121 -0
  77. package/src/search.ts +173 -0
  78. package/src/sitemap.ts +67 -0
  79. package/src/spider.ts +475 -0
  80. package/src/throttle.ts +118 -0
  81. package/src/tree.ts +379 -0
  82. package/src/types.ts +225 -0
  83. package/src/views.ts +42 -0
  84. package/src/web-search.ts +548 -0
  85. package/test/convert-images.test.ts +69 -0
  86. package/test/disk-cache-images.test.ts +193 -0
  87. package/test/engine-registry.test.ts +114 -0
  88. package/test/exports.test.ts +124 -0
  89. package/test/get-chunk.test.ts +115 -0
  90. package/test/images-integration.test.ts +359 -0
  91. package/test/improvements.test.ts +279 -0
  92. package/test/inbound-count.test.ts +111 -0
  93. package/test/lean.test.ts +105 -0
  94. package/test/playwright.test.ts +128 -0
  95. package/test/ports.test.ts +161 -0
  96. package/test/search.test.ts +219 -0
  97. package/test/spider-images.test.ts +180 -0
  98. package/test/spider-unit.test.ts +610 -0
  99. package/test/tree.test.ts +272 -0
  100. package/test/types.test.ts +169 -0
  101. package/test/web-search-integration.test.ts +180 -0
  102. package/test/web-search.test.ts +305 -0
  103. package/tsconfig.json +9 -0
  104. package/tsconfig.test.json +7 -0
  105. package/vitest.config.ts +8 -0
@@ -0,0 +1,228 @@
1
+ /**
2
+ * Disk-backed cache implementing ICache<string, SpideredPage>.
3
+ *
4
+ * Persists to a JSON file so the cache survives extension reloads and
5
+ * pi restarts. Call flush() to write — set() auto-flushes by default.
6
+ *
7
+ * The images directory is derived automatically from `dirname(path)/images`.
8
+ * Callers do not need to create it — DiskCache creates it on first large-image
9
+ * flush. Pre-creating it at startup (e.g. in the extension boot path) is
10
+ * harmless and avoids a first-write delay.
11
+ *
12
+ * Internal storage uses a plain object (Object.create(null)) rather than a
13
+ * Map. Plain objects carry no realm-specific internal slots, making them safe
14
+ * across V8 context (realm) boundaries — e.g. when DiskCache is constructed
15
+ * in an ESM module realm but called from a jiti VM-sandbox realm (Bun binary
16
+ * mode). The Map-backed version threw "Map operation called on non-Map object"
17
+ * in that scenario.
18
+ *
19
+ * A schema version field in the persisted JSON guards against stale cache
20
+ * files from previous major versions being silently loaded with wrong shapes.
21
+ */
22
+
23
+ import { createHash } from "node:crypto";
24
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
25
+ import { dirname, extname, join } from "node:path";
26
+ import type { ICache } from "./ports.js";
27
+ import type { ImageRef, SpideredPage } from "./types.js";
28
+
29
/** Bump when the on-disk entry shape changes incompatibly. */
const SCHEMA_VERSION = 2;

/** Tuning knobs accepted by the {@link DiskCache} constructor. */
export interface DiskCacheOptions {
  /** Time-to-live in ms. Default 30 min. */
  ttlMs?: number;
  /** Max entries. Default 500. */
  maxSize?: number;
  /** Auto-flush to disk on every set() and delete(). Default true. */
  autoFlush?: boolean;
  /**
   * Base64 length threshold for inline vs. file storage of images.
   * Images whose base64 string length exceeds this are written as binary
   * files to <cache-dir>/images/ instead of being stored inline in the JSON.
   * Default: 32 * 1024 (32 KB of base64 ≈ 24 KB binary).
   */
  inlineImageThreshold?: number;
}

/** A cached page plus the absolute time (ms since epoch) at which it expires. */
interface Entry {
  page: SpideredPage;
  expiresAt: number;
}

/** Versioned wrapper written to disk. */
interface DiskPayload {
  v: number;
  entries: Record<string, Entry>;
}

/** File extensions that are safe to reuse verbatim in a spilled image filename. */
const SAFE_IMAGE_EXT_RE = /^\.[A-Za-z0-9]{1,10}$/;

/** Structural check applied to every persisted entry before it is trusted. */
function isEntry(value: unknown): value is Entry {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as { page?: unknown; expiresAt?: unknown };
  return (
    typeof candidate.expiresAt === "number" &&
    typeof candidate.page === "object" &&
    candidate.page !== null
  );
}

/**
 * TTL-bounded page cache persisted as a single JSON file.
 *
 * Keys are URLs normalised by {@link DiskCache.key} (fragment and one trailing
 * slash removed). Images whose base64 payload is longer than
 * `inlineImageThreshold` are written as binary files under
 * `dirname(path)/images` at flush time and re-read on `get()` / `values()`.
 *
 * The in-memory store is a prototype-less plain object, so arbitrary strings
 * (including `__proto__`) are safe as keys.
 */
export class DiskCache implements ICache<string, SpideredPage> {
  private readonly store: Record<string, Entry | undefined> = Object.create(null);
  private readonly path: string;
  private readonly ttlMs: number;
  private readonly maxSize: number;
  private readonly autoFlush: boolean;
  private readonly inlineImageThreshold: number;
  /** Directory where large image binaries are stored. */
  private readonly imagesDir: string;

  /**
   * @param path Location of the JSON cache file. Existing content is loaded
   *   immediately; a missing, corrupt or wrong-version file yields an empty cache.
   * @param opts Optional limits — see {@link DiskCacheOptions}.
   */
  constructor(path: string, opts: DiskCacheOptions = {}) {
    this.path = path;
    this.ttlMs = opts.ttlMs ?? 30 * 60 * 1000;
    this.maxSize = opts.maxSize ?? 500;
    this.autoFlush = opts.autoFlush ?? true;
    this.inlineImageThreshold = opts.inlineImageThreshold ?? 32 * 1024;
    this.imagesDir = join(dirname(path), "images");
    this.load();
  }

  /** Normalise a URL into a store key; unparsable input is used verbatim. */
  private key(url: string): string {
    try {
      const u = new URL(url);
      u.hash = "";
      return u.toString().replace(/\/$/, "");
    } catch {
      return url;
    }
  }

  /**
   * Fetch the live entry for `url` without touching image files.
   * An expired entry is removed from memory and reported as absent.
   */
  private lookup(url: string): Entry | undefined {
    const k = this.key(url);
    const entry = this.store[k];
    if (!entry) return undefined;
    if (Date.now() > entry.expiresAt) {
      delete this.store[k];
      return undefined;
    }
    return entry;
  }

  /**
   * Store `page` under `url`. When the cache is full and the key is new, the
   * first key in the store's enumeration order is evicted first.
   */
  set(url: string, page: SpideredPage): void {
    const k = this.key(url);
    if (!(k in this.store)) {
      const keys = Object.keys(this.store);
      if (keys.length >= this.maxSize) {
        const oldest = keys[0];
        if (oldest !== undefined) delete this.store[oldest];
      }
    }
    this.store[k] = { page, expiresAt: Date.now() + this.ttlMs };
    if (this.autoFlush) this.flush();
  }

  /** True when a non-expired entry exists. Does not read spilled images. */
  has(url: string): boolean {
    return this.lookup(url) !== undefined;
  }

  /** Remove the entry for `url` (if any) and persist when autoFlush is on. */
  delete(url: string): void {
    delete this.store[this.key(url)];
    if (this.autoFlush) this.flush();
  }

  // ---------------------------------------------------------------------------
  // Image helpers
  // ---------------------------------------------------------------------------

  /**
   * Derive a stable filename for an image binary from its src URL.
   *
   * The name is the SHA-1 of the full `src`, so distinct URLs never collide.
   * The extension is taken from the URL *path* only (query and fragment are
   * ignored) and is kept only when it is plain alphanumeric; anything else
   * (e.g. `.jpg:large`, or no extension at all) becomes `.bin`.
   */
  private imageFilename(src: string): string {
    const hash = createHash("sha1").update(src).digest("hex");
    let pathPart: string;
    try {
      pathPart = new URL(src).pathname;
    } catch {
      // Relative or otherwise unparsable src: strip query/fragment by hand.
      pathPart = src.split(/[?#]/)[0] ?? "";
    }
    const ext = extname(pathPart);
    return `${hash}${SAFE_IMAGE_EXT_RE.test(ext) ? ext : ".bin"}`;
  }

  /**
   * Prepare images for serialisation:
   *  - Images whose base64 length ≤ threshold are kept inline.
   *  - Larger images are written to imagesDir as binary files; base64 is
   *    replaced by filePath in the serialised entry.
   *
   * imagesDir is created lazily, only once an image actually needs spilling.
   */
  private spill(images: ImageRef[]): ImageRef[] {
    let dirReady = false;
    return images.map((img) => {
      if (!img.base64 || img.base64.length <= this.inlineImageThreshold) {
        return img;
      }
      if (!dirReady) {
        mkdirSync(this.imagesDir, { recursive: true });
        dirReady = true;
      }
      const filePath = join(this.imagesDir, this.imageFilename(img.src));
      writeFileSync(filePath, Buffer.from(img.base64, "base64"));
      const { base64: _omit, ...rest } = img;
      return { ...rest, filePath };
    });
  }

  /**
   * Hydrate images on read: if an image has filePath but no base64,
   * load the binary from disk and re-encode. Missing or unreadable files
   * leave the image untouched.
   */
  private hydrate(images: ImageRef[]): ImageRef[] {
    return images.map((img) => {
      if (img.base64 || !img.filePath) return img;
      if (!existsSync(img.filePath)) return img;
      try {
        const base64 = readFileSync(img.filePath).toString("base64");
        return { ...img, base64 };
      } catch {
        return img;
      }
    });
  }

  // ---------------------------------------------------------------------------
  // Persistence
  // ---------------------------------------------------------------------------

  /**
   * Write current contents to disk. Expired entries are skipped, large images
   * are spilled to imagesDir, and the cache file's parent directory is created
   * if it does not exist yet.
   */
  flush(): void {
    const now = Date.now();
    // Prototype-less so that a "__proto__" key is serialised like any other.
    const entries: Record<string, Entry> = Object.create(null);
    for (const [k, v] of Object.entries(this.store)) {
      if (!v || v.expiresAt <= now) continue;
      const page = v.page;
      const serialised: SpideredPage = page.images
        ? { ...page, images: this.spill(page.images) }
        : page;
      entries[k] = { page: serialised, expiresAt: v.expiresAt };
    }
    const dir = dirname(this.path);
    if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
    const payload: DiskPayload = { v: SCHEMA_VERSION, entries };
    writeFileSync(this.path, JSON.stringify(payload), "utf8");
  }

  /**
   * Populate the store from the cache file. Never throws: an unreadable file,
   * a schema-version mismatch, or a malformed individual entry is skipped.
   */
  private load(): void {
    if (!existsSync(this.path)) return;
    try {
      const raw = JSON.parse(readFileSync(this.path, "utf8")) as unknown;

      // Reject files from incompatible schema versions (including old
      // unversioned files that lack the "v" field entirely).
      if (
        typeof raw !== "object" ||
        raw === null ||
        (raw as { v?: unknown }).v !== SCHEMA_VERSION
      ) {
        return; // stale schema — start fresh, do not throw
      }

      const persisted = (raw as { entries?: unknown }).entries;
      if (typeof persisted !== "object" || persisted === null) return;

      const now = Date.now();
      for (const [k, v] of Object.entries(persisted as Record<string, unknown>)) {
        // Validate per entry so one bad record cannot discard the rest.
        if (isEntry(v) && v.expiresAt > now) this.store[k] = v;
      }
    } catch {
      // Corrupt or unreadable file — start fresh.
    }
  }

  /** All currently valid (non-expired) pages, sorted newest-first. */
  values(): SpideredPage[] {
    const now = Date.now();
    return Object.values(this.store)
      .filter((e): e is Entry => e !== undefined && e.expiresAt > now)
      .sort((a, b) => b.expiresAt - a.expiresAt)
      .map((e) => {
        const page = e.page;
        return page.images ? { ...page, images: this.hydrate(page.images) } : page;
      });
  }

  /** Retrieve a page, hydrating any file-backed images from disk. */
  get(url: string): SpideredPage | undefined {
    const entry = this.lookup(url);
    if (!entry) return undefined;
    const page = entry.page;
    if (page.images) return { ...page, images: this.hydrate(page.images) };
    return page;
  }
}
package/src/graph.ts ADDED
@@ -0,0 +1,189 @@
1
+ import type { SpideredPage } from "./types.js";
2
+
3
/**
 * Lightweight graph node describing one spidered page.
 *
 * Holds summary fields only — never the page body — so a whole graph stays
 * cheap to keep in memory and to serialise.
 */
export interface PageNode {
  /** Page URL; also the key the node is stored under. */
  url: string;
  /** Domain copied from the spidered page. */
  domain: string;
  /** Title copied from the spidered page. */
  title: string;
  /** Description copied from the spidered page. */
  description: string;
  /** Word count copied from the spidered page. */
  wordCount: number;
  /** Fetch timestamp string copied from the spidered page. */
  fetchedAt: string;
  /** Number of chunks the spidered page was split into. */
  chunkCount: number;
}
13
+
14
/** One directed link from a source page to a target URL. */
export interface PageEdge {
  /** URL of the page that contains the link. */
  from: string;
  /** URL the link points at; the target need not be a spidered node. */
  to: string;
  /** Anchor text of the link */
  text: string;
  /** External/internal flag carried over from the link it was built from. */
  isExternal: boolean;
}
22
+
23
/**
 * Plain-data image of a whole graph, suitable for storage or embedding.
 * Produced by `PageGraph#toJSON` and consumed by `PageGraph.fromJSON`.
 */
export interface PageGraphSnapshot {
  /** Every node in the graph. */
  nodes: PageNode[];
  /** Every outbound edge in the graph. */
  edges: PageEdge[];
}
28
+
29
/**
 * Directed graph whose nodes are spidered pages and whose edges are the
 * outbound links found on them.
 *
 * Alongside the forward adjacency lists the graph keeps a reverse index of
 * inbound sources, so "who links here?" is a single lookup.
 *
 * Storage is deliberately built from prototype-less plain objects
 * (Object.create(null)) instead of Maps: plain objects carry no
 * realm-specific internal slots and therefore keep working when the graph is
 * created in one V8 realm (ESM module) and used from another (jiti VM sandbox).
 */
export class PageGraph {
  /** url → node */
  private readonly nodes: Record<string, PageNode | undefined> = Object.create(null);
  /** url → outbound edges */
  private readonly out: Record<string, PageEdge[] | undefined> = Object.create(null);
  /** url → inbound source urls */
  private readonly in_: Record<string, string[] | undefined> = Object.create(null);

  /**
   * Insert (or overwrite) the node for `page` and register an edge for every
   * link on it that has a non-empty href.
   */
  addPage(page: SpideredPage): void {
    const summary: PageNode = {
      url: page.url,
      domain: page.domain,
      title: page.title,
      description: page.description,
      wordCount: page.wordCount,
      fetchedAt: page.fetchedAt,
      chunkCount: page.chunks.length,
    };
    this.nodes[page.url] = summary;

    for (const { href, text, isExternal } of page.links) {
      if (href) this.addEdge(page.url, href, text, isExternal);
    }
  }

  /**
   * Register a directed edge; the target does not have to be a node yet.
   * A second edge between the same pair is ignored (the first anchor text wins).
   */
  addEdge(from: string, to: string, text: string, isExternal: boolean): void {
    const outgoing = this.out[from] ?? [];
    const alreadyLinked = outgoing.some((known) => known.to === to);
    if (!alreadyLinked) {
      // Replace rather than push so arrays handed out earlier stay unchanged.
      this.out[from] = outgoing.concat([{ from, to, text, isExternal }]);
    }

    const sources = this.in_[to] ?? [];
    if (sources.indexOf(from) === -1) {
      this.in_[to] = sources.concat([from]);
    }
  }

  /** The node stored for `url`, if that page has been added. */
  node(url: string): PageNode | undefined {
    return this.nodes[url];
  }

  /** Edges leaving `url` (empty when none are known). */
  outbound(url: string): PageEdge[] {
    const edges = this.out[url];
    return edges === undefined ? [] : edges;
  }

  /** URLs of pages that link to `url` (empty when none are known). */
  inbound(url: string): string[] {
    const sources = this.in_[url];
    return sources === undefined ? [] : sources;
  }

  /** Nodes nobody links to — the entry points of the graph. */
  roots(): PageNode[] {
    const result: PageNode[] = [];
    for (const candidate of Object.values(this.nodes)) {
      if (candidate === undefined) continue;
      if ((this.in_[candidate.url] ?? []).length === 0) result.push(candidate);
    }
    return result;
  }

  /** Nodes none of whose outbound edges lead to another spidered node. */
  sinks(): PageNode[] {
    const result: PageNode[] = [];
    for (const candidate of Object.values(this.nodes)) {
      if (!candidate) continue;
      const leadsSomewhere = (this.out[candidate.url] ?? []).some((edge) => edge.to in this.nodes);
      if (!leadsSomewhere) result.push(candidate);
    }
    return result;
  }

  /**
   * Shortest path (fewest hops) from `from` to `to`, found breadth-first.
   * Intermediate hops must be spidered nodes; the destination need not be.
   * Returns null when no such path exists.
   */
  findPath(from: string, to: string): string[] | null {
    if (from === to) return [from];

    const seen = new Set<string>([from]);
    const frontier: string[][] = [[from]];
    // A moving cursor stands in for queue.shift(); visiting order is the same.
    for (let head = 0; head < frontier.length; head++) {
      const trail = frontier[head];
      const tip = trail[trail.length - 1];
      for (const { to: next } of this.out[tip] ?? []) {
        if (next === to) return [...trail, to];
        if (seen.has(next) || !(next in this.nodes)) continue;
        seen.add(next);
        frontier.push([...trail, next]);
      }
    }
    return null;
  }

  /**
   * Every spidered node reachable from `startUrl` by following edges
   * breadth-first. The start page itself is not part of the result.
   */
  reachableFrom(startUrl: string): PageNode[] {
    const seen = new Set<string>([startUrl]);
    const pending: string[] = [startUrl];
    for (let head = 0; head < pending.length; head++) {
      for (const { to: next } of this.out[pending[head]] ?? []) {
        if (seen.has(next) || !(next in this.nodes)) continue;
        seen.add(next);
        pending.push(next);
      }
    }
    seen.delete(startUrl);

    const reached: PageNode[] = [];
    for (const url of seen) {
      const found = this.nodes[url];
      if (found !== undefined) reached.push(found);
    }
    return reached;
  }

  /** All nodes paired with their inbound-link count, most-linked first. */
  byPageRank(): Array<{ node: PageNode; inboundCount: number }> {
    const ranked: Array<{ node: PageNode; inboundCount: number }> = [];
    for (const n of Object.values(this.nodes)) {
      if (n === undefined) continue;
      ranked.push({ node: n, inboundCount: (this.in_[n.url] ?? []).length });
    }
    return ranked.sort((left, right) => right.inboundCount - left.inboundCount);
  }

  /** Number of nodes in the graph. */
  get nodeCount(): number {
    return Object.keys(this.nodes).length;
  }

  /** Number of outbound edges in the graph. */
  get edgeCount(): number {
    return Object.values(this.out).reduce(
      (sum, edges) => (edges ? sum + edges.length : sum),
      0,
    );
  }

  /** Plain snapshot — safe to JSON.stringify or embed. */
  toJSON(): PageGraphSnapshot {
    const nodes: PageNode[] = [];
    for (const n of Object.values(this.nodes)) {
      if (n !== undefined) nodes.push(n);
    }

    const edges: PageEdge[] = [];
    for (const edgeList of Object.values(this.out)) {
      if (!edgeList) continue;
      for (const edge of edgeList) edges.push(edge);
    }

    return { nodes, edges };
  }

  /** Rebuild a graph (including its inbound index) from a snapshot. */
  static fromJSON(snap: PageGraphSnapshot): PageGraph {
    const graph = new PageGraph();
    snap.nodes.forEach((n) => {
      graph.nodes[n.url] = n;
    });
    snap.edges.forEach((e) => graph.addEdge(e.from, e.to, e.text, e.isExternal));
    return graph;
  }
}
package/src/index.ts ADDED
@@ -0,0 +1,74 @@
1
+ // ---------------------------------------------------------------------------
2
+ // Public API — what most consumers need
3
+ // ---------------------------------------------------------------------------
4
+
5
+ export type { SpiderCacheOptions } from "./cache.js";
6
+ export { SpiderCache } from "./cache.js";
7
+ export type { CrawlOptions, CrawlResult } from "./crawl.js";
8
+ export { crawl } from "./crawl.js";
9
+ export type { PageEdge, PageGraphSnapshot, PageNode } from "./graph.js";
10
+ export { PageGraph } from "./graph.js";
11
+ export type { FuzzySearchOptions, SearchHit } from "./search.js";
12
+ export { searchPages } from "./search.js";
13
+ /** @deprecated Use {@link searchPages} — renamed in v0.4.0 to reflect BM25F ranking (not fuzzy-only). */
14
+ export { searchPages as fuzzySearch } from "./search.js";
15
+ export type { SpiderOptions, TreePage } from "./spider.js";
16
+ export { spider } from "./spider.js";
17
+ export type { QueryTreeOptions } from "./tree.js";
18
+ export { buildTree, navigateTree, queryTree } from "./tree.js";
19
+ export type { Chunk, ChunkType, DOMNode, ImageRef, LeanLink, LeanPage, Link, PageView, SpideredPage, TreeHit } from "./types.js";
20
+ export { toLean } from "./views.js";
21
+ export type { BraveSearchOptions, DdgSearchOptions, ExaSearchOptions, FallbackSearchEngineOptions, SearchEngine, TavilySearchOptions, WebSearchResult } from "./web-search.js";
22
+ export { braveSearch, ddgSearch, exaSearch, registerSearchEngine, resolveSearchEngine, tavilySearch, webSearch } from "./web-search.js";
23
+
24
+ // ---------------------------------------------------------------------------
25
+ // Utilities
26
+ // ---------------------------------------------------------------------------
27
+
28
+ import type { ICache } from "./ports.js";
29
+ import type { Chunk, SpideredPage } from "./types.js";
30
+
31
/**
 * Look up one chunk of a cached page by URL and zero-based chunk index.
 *
 * Handy when an agent wants to re-read a single section (for instance after
 * a highlights hit) without pulling the whole page's markdown.
 *
 * Yields undefined when the index is negative, when the URL is not in the
 * cache, or when the page has no chunk at that index. The cache is not
 * consulted at all for a negative index.
 *
 * @example
 * const chunk = getChunk(cache, "https://example.com/article", 3)
 * if (chunk) console.log(chunk.text)
 */
export function getChunk(
  cache: ICache<string, SpideredPage>,
  url: string,
  index: number,
): Chunk | undefined {
  if (index < 0) {
    return undefined;
  }
  const page = cache.get(url);
  if (page == null) {
    return undefined;
  }
  return page.chunks[index];
}
52
+
53
+ // ---------------------------------------------------------------------------
54
+ // Extension / DI — port interfaces and their concrete adapters.
55
+ // Import these when you need to inject custom implementations.
56
+ // ---------------------------------------------------------------------------
57
+
58
+ export type { HttpRequest, HttpResponse, ICache, IHttpClient, IRobotsChecker, ISearchEngine, IThrottle, RobotsResult, SearchQuery } from "./ports.js";
59
+ export type { DiskCacheOptions } from "./disk-cache.js";
60
+ export { DiskCache } from "./disk-cache.js";
61
+ export type { PlaywrightClientOptions } from "./playwright.js";
62
+ export { PlaywrightHttpClient, createPlaywrightClient } from "./playwright.js";
63
+ export { RobotsCache, createRobotsCache } from "./robots.js";
64
+ export { fetchSitemapUrls } from "./sitemap.js";
65
+ export type { ThrottleOptions } from "./throttle.js";
66
+ export { DomainThrottle, createThrottle } from "./throttle.js";
67
+ export { BraveSearchEngine, DdgSearchEngine, ExaSearchEngine, FallbackSearchEngine, TavilySearchEngine, defaultSearchEngine } from "./web-search.js";
68
+
69
+ // ---------------------------------------------------------------------------
70
+ // parse.ts, convert.ts, views.ts are internal implementation modules.
71
+ // Apart from toLean (re-exported from views.ts above), they are NOT exported here — they are consumed only by spider.ts.
72
+ // If you need lower-level DOM or markdown utilities, import from the
73
+ // sub-modules directly (not covered by semver stability guarantees).
74
+ // ---------------------------------------------------------------------------
package/src/parse.ts ADDED
@@ -0,0 +1,154 @@
1
+ /**
2
+ * DOM parsing helpers.
3
+ *
4
+ * Owns the DOM parsing dependency. spider.ts calls these after fetching HTML;
5
+ * it never touches the DOM library directly.
6
+ */
7
+
8
+ import { parseHTML } from "linkedom";
9
+ import type { Link, SpideredPage } from "./types.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // DOM creation
13
+ // ---------------------------------------------------------------------------
14
+
15
/**
 * Turn raw HTML into a DOM Document.
 *
 * Backed by linkedom — a lightweight server-side DOM that has no CSS engine,
 * no module-level Maps, and a flat CJS dependency tree, so it loads through
 * jiti's transform pipeline without nativeModules workarounds.
 *
 * @param html Markup to parse.
 * @param url  URL passed through to linkedom's parser options.
 */
export function parseDom(html: string, url: string): Document {
  const parsed = parseHTML(html, { url });
  // linkedom's Document type is not the lib.dom one, hence the double cast.
  return parsed.document as unknown as Document;
}
24
+
25
+ // ---------------------------------------------------------------------------
26
+ // Nav classification
27
+ // ---------------------------------------------------------------------------
28
+
29
/** Class names that, matched whole and case-insensitively, mark navigation chrome. */
const NAV_CLASS_RE =
  /^(nav|navbar|navigation|menu|menubar|header|footer|sidebar|breadcrumb|topbar|toolbar|site-nav|main-nav|primary-nav|global-nav)$/i;

/**
 * Decide whether an element sits inside navigation chrome.
 *
 * Three signals are checked in order:
 *  1. the element or any ancestor is a nav/header/footer/aside tag,
 *  2. the element or any ancestor carries a navigation-like ARIA role,
 *  3. the element or one of its four nearest ancestors has a class name
 *     that matches NAV_CLASS_RE.
 */
export function isNavElement(el: Element): boolean {
  const SEMANTIC_TAGS = "nav, header, footer, aside";
  const LANDMARK_ROLES =
    "[role='navigation'],[role='banner'],[role='contentinfo'],[role='complementary']";

  if (el.closest(SEMANTIC_TAGS)) return true;
  if (el.closest(LANDMARK_ROLES)) return true;

  // Class-based heuristic: bounded walk of five nodes, starting with el itself.
  let current: Element | null = el;
  let remaining = 5;
  while (current && remaining > 0) {
    for (const className of current.classList) {
      if (NAV_CLASS_RE.test(className)) return true;
    }
    current = current.parentElement;
    remaining -= 1;
  }
  return false;
}
52
+
53
+ // ---------------------------------------------------------------------------
54
+ // Link text extraction
55
+ // ---------------------------------------------------------------------------
56
+
57
/**
 * Visible text of an anchor with whitespace runs collapsed to single spaces.
 *
 * Text inside SVG subtrees is excluded. The anchor itself is never mutated:
 * when it contains an SVG, a deep clone is stripped and read instead.
 */
export function anchorText(a: Element): string {
  const collapse = (raw: string | null): string => (raw ?? "").replace(/\s+/g, " ").trim();

  const hasSvg = Boolean(a.querySelector("svg"));
  if (!hasSvg) return collapse(a.textContent);

  const copy = a.cloneNode(true) as Element;
  Array.from(copy.querySelectorAll("svg")).forEach((svg) => svg.remove());
  return collapse(copy.textContent);
}
66
+
67
+ // ---------------------------------------------------------------------------
68
+ // Link extraction
69
+ // ---------------------------------------------------------------------------
70
+
71
/** Icon-font ligature names that leak into anchor text and carry no meaning. */
const ICON_LIGATURE_RE =
  /\b(open_in_new|navigate_next|navigate_before|arrow_drop_down|arrow_drop_up|chevron_right|chevron_left|expand_more|expand_less)\b/g;

/**
 * Extract outbound links from the DOM, classified as body or nav.
 *
 * Anchors are dropped when they have no href, no visible text (after icon
 * ligature names are stripped), or a `javascript:` href (matched
 * case-insensitively). At most the first 200 surviving links are returned.
 *
 * `isExternal` is decided by comparing real URL origins: each href is resolved
 * against `baseUrl` and its origin compared with the page's. This fixes the
 * previous string-prefix test, which treated look-alike hosts
 * (`https://example.com.evil.net`) and other ports as internal, and relative
 * hrefs as external. Hrefs that cannot be parsed, or that have an opaque
 * origin (`mailto:`, `data:` …), count as external.
 *
 * @param doc     Parsed document to scan.
 * @param baseUrl Absolute URL of the page; throws if it is not a valid URL.
 */
export function extractLinks(doc: Document, baseUrl: string): Link[] {
  const origin = new URL(baseUrl).origin;

  const isExternalHref = (href: string): boolean => {
    // Opaque origins serialise as "null" and are never same-origin.
    if (origin === "null") return true;
    try {
      return new URL(href, baseUrl).origin !== origin;
    } catch {
      return true;
    }
  };

  return Array.from(doc.querySelectorAll("a[href]"))
    .map((a) => {
      const href = (a as HTMLAnchorElement).href;
      const text = anchorText(a)
        .replace(ICON_LIGATURE_RE, "")
        .replace(/\s+/g, " ")
        .trim();
      if (!href || !text || /^\s*javascript:/i.test(href)) return null;

      return {
        href,
        text,
        isExternal: isExternalHref(href),
        rel: isNavElement(a) ? ("nav" as const) : ("body" as const),
      } satisfies Link;
    })
    .filter((l): l is Link => l !== null)
    .slice(0, 200);
}
96
+
97
+ // ---------------------------------------------------------------------------
98
+ // Heading extraction
99
+ // ---------------------------------------------------------------------------
100
+
101
/**
 * Collect the h1/h2/h3 headings found in a fragment of article HTML
 * (as produced by Readability), in document order.
 *
 * Headings whose text is empty after trimming are left out.
 */
export function extractHeadings(html: string): SpideredPage["headings"] {
  // Wrap the fragment so the parser always sees a complete document.
  const wrapped = `<html><body>${html}</body></html>`;
  const { document } = parseHTML(wrapped);

  const found: SpideredPage["headings"] = [];
  for (const el of Array.from(document.querySelectorAll("h1, h2, h3"))) {
    const text = (el.textContent ?? "").trim();
    if (!text) continue;
    // tagName is "H1" | "H2" | "H3"; its second character is the level.
    const level = parseInt(el.tagName[1], 10) as 1 | 2 | 3;
    found.push({ level, text });
  }
  return found;
}
112
+
113
+ // ---------------------------------------------------------------------------
114
+ // Tag extraction
115
+ // ---------------------------------------------------------------------------
116
+
117
/**
 * Extract topic tags from page metadata.
 *
 * Sources, in order:
 *  1. `<meta name="keywords">`, split on commas and semicolons,
 *  2. every `article:tag` meta (by `property` or `name`),
 *  3. the first non-blank of `article:section` / `og:article:section`.
 *
 * Every tag is trimmed and lower-cased; blank values are dropped from all
 * three sources (a whitespace-only section used to produce an empty tag).
 * Duplicates are removed and at most 20 tags are returned, in first-seen order.
 */
export function extractTags(doc: Document): string[] {
  const tags = new Set<string>();
  const add = (raw: string | null | undefined): void => {
    const tag = raw?.trim().toLowerCase();
    if (tag) tags.add(tag);
  };

  const keywords = doc.querySelector('meta[name="keywords"]')?.getAttribute("content") ?? "";
  for (const keyword of keywords.split(/[,;]/)) add(keyword);

  doc.querySelectorAll('meta[property="article:tag"], meta[name="article:tag"]').forEach((el) => {
    add(el.getAttribute("content"));
  });

  const section = ['meta[property="article:section"]', 'meta[property="og:article:section"]']
    .map((selector) => doc.querySelector(selector)?.getAttribute("content")?.trim())
    .find(Boolean);
  add(section);

  return [...tags].slice(0, 20);
}
141
+
142
+ // ---------------------------------------------------------------------------
143
+ // Canonical URL extraction
144
+ // ---------------------------------------------------------------------------
145
+
146
/**
 * Extract the page's canonical URL from `link[rel=canonical]`, falling back
 * to `og:url` when the link is missing or blank.
 *
 * A relative (or protocol-relative) canonical is resolved against
 * `fetchedUrl` before it is compared or returned, so a self-referencing
 * `href="/page"` is no longer reported as a different URL. An already
 * absolute canonical is returned exactly as written (whitespace-trimmed).
 *
 * @returns The canonical URL when it differs from `fetchedUrl` (ignoring one
 *   trailing slash on either side), otherwise undefined.
 */
export function extractCanonicalUrl(doc: Document, fetchedUrl: string): string | undefined {
  const declared =
    doc.querySelector('link[rel="canonical"]')?.getAttribute("href")?.trim() ||
    doc.querySelector('meta[property="og:url"]')?.getAttribute("content")?.trim();
  if (!declared) return undefined;

  let canonical = declared;
  const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(declared);
  if (!hasScheme) {
    try {
      canonical = new URL(declared, fetchedUrl).href;
    } catch {
      // fetchedUrl is not a usable base — fall back to the raw value.
    }
  }

  const norm = (u: string) => u.replace(/\/$/, "");
  return norm(canonical) !== norm(fetchedUrl) ? canonical : undefined;
}