@dpopsuev/web-spider 0.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/batch.d.ts +24 -0
- package/dist/batch.d.ts.map +1 -0
- package/dist/batch.js +68 -0
- package/dist/cache.d.ts +40 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +78 -0
- package/dist/convert.d.ts +29 -0
- package/dist/convert.d.ts.map +1 -0
- package/dist/convert.js +131 -0
- package/dist/crawl.d.ts +56 -0
- package/dist/crawl.d.ts.map +1 -0
- package/dist/crawl.js +126 -0
- package/dist/disk-cache.d.ts +75 -0
- package/dist/disk-cache.d.ts.map +1 -0
- package/dist/disk-cache.js +185 -0
- package/dist/graph.d.ts +76 -0
- package/dist/graph.d.ts.map +1 -0
- package/dist/graph.js +156 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +44 -0
- package/dist/parse.d.ts +27 -0
- package/dist/parse.d.ts.map +1 -0
- package/dist/parse.js +131 -0
- package/dist/playwright.d.ts +75 -0
- package/dist/playwright.d.ts.map +1 -0
- package/dist/playwright.js +141 -0
- package/dist/ports.d.ts +104 -0
- package/dist/ports.d.ts.map +1 -0
- package/dist/ports.js +10 -0
- package/dist/robots.d.ts +24 -0
- package/dist/robots.d.ts.map +1 -0
- package/dist/robots.js +104 -0
- package/dist/search.d.ts +47 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +112 -0
- package/dist/sitemap.d.ts +15 -0
- package/dist/sitemap.d.ts.map +1 -0
- package/dist/sitemap.js +65 -0
- package/dist/spider.d.ts +74 -0
- package/dist/spider.d.ts.map +1 -0
- package/dist/spider.js +349 -0
- package/dist/throttle.d.ts +49 -0
- package/dist/throttle.d.ts.map +1 -0
- package/dist/throttle.js +85 -0
- package/dist/tree.d.ts +34 -0
- package/dist/tree.d.ts.map +1 -0
- package/dist/tree.js +354 -0
- package/dist/types.d.ts +189 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/views.d.ts +17 -0
- package/dist/views.d.ts.map +1 -0
- package/dist/views.js +39 -0
- package/dist/web-search.d.ts +184 -0
- package/dist/web-search.d.ts.map +1 -0
- package/dist/web-search.js +399 -0
- package/fixtures/article-with-images.html +94 -0
- package/fixtures/gh-shell.html +32 -0
- package/fixtures/guide-ai-agents-web-scraping.json +552 -0
- package/fixtures/images/large.jpg +0 -0
- package/fixtures/images/small.jpg +0 -0
- package/fixtures/images/tiny.png +0 -0
- package/fixtures/quotes-index.json +40 -0
- package/package.json +47 -0
- package/scripts/fetch-guide.mjs +25 -0
- package/src/cache.ts +99 -0
- package/src/convert.ts +161 -0
- package/src/crawl.ts +186 -0
- package/src/disk-cache.ts +228 -0
- package/src/graph.ts +189 -0
- package/src/index.ts +74 -0
- package/src/parse.ts +154 -0
- package/src/playwright.ts +193 -0
- package/src/ports.ts +131 -0
- package/src/robots.ts +121 -0
- package/src/search.ts +173 -0
- package/src/sitemap.ts +67 -0
- package/src/spider.ts +475 -0
- package/src/throttle.ts +118 -0
- package/src/tree.ts +379 -0
- package/src/types.ts +225 -0
- package/src/views.ts +42 -0
- package/src/web-search.ts +548 -0
- package/test/convert-images.test.ts +69 -0
- package/test/disk-cache-images.test.ts +193 -0
- package/test/engine-registry.test.ts +114 -0
- package/test/exports.test.ts +124 -0
- package/test/get-chunk.test.ts +115 -0
- package/test/images-integration.test.ts +359 -0
- package/test/improvements.test.ts +279 -0
- package/test/inbound-count.test.ts +111 -0
- package/test/lean.test.ts +105 -0
- package/test/playwright.test.ts +128 -0
- package/test/ports.test.ts +161 -0
- package/test/search.test.ts +219 -0
- package/test/spider-images.test.ts +180 -0
- package/test/spider-unit.test.ts +610 -0
- package/test/tree.test.ts +272 -0
- package/test/types.test.ts +169 -0
- package/test/web-search-integration.test.ts +180 -0
- package/test/web-search.test.ts +305 -0
- package/tsconfig.json +9 -0
- package/tsconfig.test.json +7 -0
- package/vitest.config.ts +8 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Disk-backed cache implementing ICache<string, SpideredPage>.
|
|
3
|
+
*
|
|
4
|
+
* Persists to a JSON file so the cache survives extension reloads and
|
|
5
|
+
* pi restarts. Call flush() to write — set() auto-flushes by default.
|
|
6
|
+
*
|
|
7
|
+
* The images directory is derived automatically from `dirname(path)/images`.
|
|
8
|
+
* Callers do not need to create it — DiskCache creates it on first large-image
|
|
9
|
+
* flush. Pre-creating it at startup (e.g. in the extension boot path) is
|
|
10
|
+
* harmless and avoids a first-write delay.
|
|
11
|
+
*
|
|
12
|
+
* Internal storage uses a plain object (Object.create(null)) rather than a
|
|
13
|
+
* Map. Plain objects carry no realm-specific internal slots, making them safe
|
|
14
|
+
* across V8 context (realm) boundaries — e.g. when DiskCache is constructed
|
|
15
|
+
* in an ESM module realm but called from a jiti VM-sandbox realm (Bun binary
|
|
16
|
+
* mode). The Map-backed version threw "Map operation called on non-Map object"
|
|
17
|
+
* in that scenario.
|
|
18
|
+
*
|
|
19
|
+
* A schema version field in the persisted JSON guards against stale cache
|
|
20
|
+
* files from previous major versions being silently loaded with wrong shapes.
|
|
21
|
+
*/
|
|
22
|
+
import { createHash } from "node:crypto";
|
|
23
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
24
|
+
import { dirname, extname, join } from "node:path";
|
|
25
|
+
/** Bump when the on-disk entry shape changes incompatibly. */
const SCHEMA_VERSION = 2;

/**
 * Extensions that may be copied verbatim into a spilled image filename.
 * Anything else (empty, over-long, or containing punctuation) becomes ".bin".
 */
const SAFE_IMAGE_EXT = /^\.[A-Za-z0-9]{1,10}$/;

/**
 * Disk-backed page cache keyed by normalised URL.
 *
 * Entries live in a prototype-less plain object and are persisted as
 * `{ v: SCHEMA_VERSION, entries }` JSON at `path`. Images whose base64 payload
 * exceeds `inlineImageThreshold` are written as binary files under
 * `dirname(path)/images` and re-read on access.
 *
 * An entry is considered expired as soon as `expiresAt <= now`; get(), has(),
 * values(), flush() and load() all apply that same rule.
 */
export class DiskCache {
    /**
     * @param path  Location of the JSON cache file.
     * @param opts  Optional settings: `ttlMs` (default 30 minutes), `maxSize`
     *              (default 500 entries), `autoFlush` (default true) and
     *              `inlineImageThreshold` (default 32 KiB of base64 text).
     */
    constructor(path, opts = {}) {
        this.store = Object.create(null);
        this.path = path;
        this.ttlMs = opts.ttlMs ?? 30 * 60 * 1000;
        this.maxSize = opts.maxSize ?? 500;
        this.autoFlush = opts.autoFlush ?? true;
        this.inlineImageThreshold = opts.inlineImageThreshold ?? 32 * 1024;
        this.imagesDir = join(dirname(path), "images");
        this.load();
    }

    /**
     * Normalise a URL into a cache key: the fragment is dropped and one
     * trailing slash is removed. Strings that do not parse as URLs are used
     * unchanged.
     */
    key(url) {
        try {
            const u = new URL(url);
            u.hash = "";
            return u.toString().replace(/\/$/, "");
        }
        catch {
            return url;
        }
    }

    /**
     * Store a page under its normalised URL with a fresh expiry.
     * When the cache is full and the key is new, the first key in the store's
     * enumeration order is evicted. Flushes to disk when autoFlush is on.
     */
    set(url, page) {
        const k = this.key(url);
        if (!(k in this.store)) {
            const keys = Object.keys(this.store);
            if (keys.length >= this.maxSize && keys[0] !== undefined) {
                delete this.store[keys[0]];
            }
        }
        this.store[k] = { page, expiresAt: Date.now() + this.ttlMs };
        if (this.autoFlush)
            this.flush();
    }

    /**
     * True when a non-expired entry exists for the URL.
     * Checks the in-memory entry only — no image files are read.
     */
    has(url) {
        return this.liveEntry(this.key(url)) !== undefined;
    }

    /** Remove the entry for the URL (if any); flushes when autoFlush is on. */
    delete(url) {
        delete this.store[this.key(url)];
        if (this.autoFlush)
            this.flush();
    }

    // ---------------------------------------------------------------------------
    // Image helpers
    // ---------------------------------------------------------------------------

    /**
     * Derive a stable filename for an image binary from its src URL.
     *
     * The name is the SHA-1 of the full src plus the extension of the URL's
     * path. Query string and fragment never reach the extension, and an
     * extension that is not a short alphanumeric suffix is replaced by ".bin"
     * so the result is always a safe, bounded-length filename.
     */
    imageFilename(src) {
        const hash = createHash("sha1").update(src).digest("hex");
        let pathPart;
        try {
            pathPart = new URL(src).pathname;
        }
        catch {
            // Relative or otherwise unparseable src: strip query/fragment by hand.
            pathPart = src.split(/[?#]/, 1)[0];
        }
        const ext = extname(pathPart);
        return `${hash}${SAFE_IMAGE_EXT.test(ext) ? ext : ".bin"}`;
    }

    /**
     * Prepare images for serialisation:
     *  - Images without base64, or whose base64 length ≤ threshold, are returned as-is.
     *  - Larger images are written to imagesDir as binary files; base64 is
     *    replaced by filePath in the returned copy.
     *
     * The images directory is created only when at least one image is spilled.
     */
    spill(images) {
        let dirReady = false;
        return images.map((img) => {
            if (!img.base64 || img.base64.length <= this.inlineImageThreshold) {
                return img;
            }
            if (!dirReady) {
                // recursive: true is a no-op when the directory already exists.
                mkdirSync(this.imagesDir, { recursive: true });
                dirReady = true;
            }
            const filePath = join(this.imagesDir, this.imageFilename(img.src));
            writeFileSync(filePath, Buffer.from(img.base64, "base64"));
            const { base64: _omit, ...rest } = img;
            return { ...rest, filePath };
        });
    }

    /**
     * Hydrate images on read: if an image has filePath but no base64,
     * load the binary from disk and re-encode. Images whose file is missing
     * or unreadable are returned unchanged.
     */
    hydrate(images) {
        return images.map((img) => {
            if (img.base64 || !img.filePath)
                return img;
            try {
                return { ...img, base64: readFileSync(img.filePath).toString("base64") };
            }
            catch {
                return img;
            }
        });
    }

    /** Return the page with file-backed images hydrated; pages without images pass through. */
    hydratePage(page) {
        return page.images ? { ...page, images: this.hydrate(page.images) } : page;
    }

    /**
     * Look up the raw entry stored under an already-normalised key.
     * Expired entries are removed from memory and reported as undefined.
     */
    liveEntry(k) {
        const entry = this.store[k];
        if (!entry)
            return undefined;
        if (entry.expiresAt <= Date.now()) {
            delete this.store[k];
            return undefined;
        }
        return entry;
    }

    // ---------------------------------------------------------------------------
    // Persistence
    // ---------------------------------------------------------------------------

    /**
     * Write current non-expired contents to disk. Large images are spilled to
     * imagesDir. The cache file's parent directory is created if missing.
     */
    flush() {
        const now = Date.now();
        // Prototype-less so that no key (e.g. "__proto__") is silently dropped.
        const entries = Object.create(null);
        for (const [k, v] of Object.entries(this.store)) {
            if (!v || v.expiresAt <= now)
                continue;
            const page = v.page;
            const serialised = page.images
                ? { ...page, images: this.spill(page.images) }
                : page;
            entries[k] = { page: serialised, expiresAt: v.expiresAt };
        }
        mkdirSync(dirname(this.path), { recursive: true });
        writeFileSync(this.path, JSON.stringify({ v: SCHEMA_VERSION, entries }), "utf8");
    }

    /**
     * Populate the store from the cache file, if one exists.
     *
     * A missing, unreadable or corrupt file, a file with a different (or
     * absent) schema version, and individual malformed or expired entries are
     * all skipped silently — loading never throws.
     */
    load() {
        let raw;
        try {
            raw = JSON.parse(readFileSync(this.path, "utf8"));
        }
        catch {
            return; // no file yet, or corrupt/unreadable — start fresh
        }
        if (typeof raw !== "object" || raw === null || raw.v !== SCHEMA_VERSION) {
            return; // stale schema — start fresh, do not throw
        }
        const entries = raw.entries;
        if (typeof entries !== "object" || entries === null)
            return;
        const now = Date.now();
        for (const [k, v] of Object.entries(entries)) {
            const wellFormed = typeof v === "object" && v !== null &&
                typeof v.expiresAt === "number" &&
                typeof v.page === "object" && v.page !== null;
            if (wellFormed && v.expiresAt > now)
                this.store[k] = v;
        }
    }

    /** All currently valid (non-expired) pages, latest expiry first, with images hydrated. */
    values() {
        const now = Date.now();
        return Object.values(this.store)
            .filter((e) => e !== undefined && e.expiresAt > now)
            .sort((a, b) => b.expiresAt - a.expiresAt)
            .map((e) => this.hydratePage(e.page));
    }

    /** Retrieve a page, hydrating any file-backed images from disk. Undefined when absent or expired. */
    get(url) {
        const entry = this.liveEntry(this.key(url));
        return entry === undefined ? undefined : this.hydratePage(entry.page);
    }
}
|
|
185
|
+
//# sourceMappingURL=disk-cache.js.map
|
package/dist/graph.d.ts
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import type { SpideredPage } from "./types.js";
|
|
2
|
+
/**
 * Graph node: a lightweight summary of one spidered page.
 * Holds reference metadata only, not the full page record.
 */
export interface PageNode {
    /** URL of the page. */
    url: string;
    /** Domain recorded for the page. */
    domain: string;
    /** Page title. */
    title: string;
    /** Page description. */
    description: string;
    /** Word count recorded for the page. */
    wordCount: number;
    /** Fetch timestamp carried as a string (format not shown here). */
    fetchedAt: string;
    /** Number of chunks the page was split into. */
    chunkCount: number;
}
|
|
12
|
+
/** One directed link from a source page to a target URL. */
export interface PageEdge {
    /** URL of the page containing the link. */
    from: string;
    /** URL the link points at. */
    to: string;
    /** Anchor text of the link */
    text: string;
    /** Whether the link was classified as external. */
    isExternal: boolean;
}
|
|
20
|
+
/** Plain, serialisable form of a graph, suitable for storage or embedding. */
export interface PageGraphSnapshot {
    /** Every node in the graph. */
    nodes: PageNode[];
    /** Every outbound edge in the graph. */
    edges: PageEdge[];
}
|
|
25
|
+
/**
 * Directed knowledge graph of spidered pages.
 *
 * Pages are the nodes and outbound links are the edges. A reverse index of
 * inbound links is kept alongside so that "who links here" is a direct lookup.
 *
 * Queries hand back plain data, which keeps the graph trivially serialisable.
 *
 * Storage is plain objects created with Object.create(null) instead of Maps:
 * plain objects have no realm-specific internal slots, so the graph keeps
 * working across V8 context (realm) boundaries — for example when it is
 * built in an ESM module realm and then used from a jiti VM-sandbox.
 */
export declare class PageGraph {
    private readonly nodes;
    /** url → outbound edges */
    private readonly out;
    /** url → inbound source urls */
    private readonly in_;
    /** Insert a node for a spidered page, replacing any node already stored for its URL. */
    addPage(page: SpideredPage): void;
    /** Record a directed edge; the target does not need to have been spidered. */
    addEdge(from: string, to: string, text: string, isExternal: boolean): void;
    /** The node stored for `url`, or undefined when there is none. */
    node(url: string): PageNode | undefined;
    /** Edges leaving the given URL. */
    outbound(url: string): PageEdge[];
    /** URLs of pages that link to the given URL. */
    inbound(url: string): string[];
    /** Nodes that nothing links to — the graph's entry points. */
    roots(): PageNode[];
    /** Nodes with no outbound link to another spidered node. */
    sinks(): PageNode[];
    /** Shortest path between two URLs found by BFS, or null when unreachable. */
    findPath(from: string, to: string): string[] | null;
    /**
     * Every page that can be reached from `startUrl` by following spidered links.
     * The BFS only visits nodes present in the graph.
     */
    reachableFrom(startUrl: string): PageNode[];
    /** Nodes ordered by number of inbound links, highest first. */
    byPageRank(): Array<{
        node: PageNode;
        inboundCount: number;
    }>;
    /** Number of nodes in the graph. */
    get nodeCount(): number;
    /** Total number of outbound edges in the graph. */
    get edgeCount(): number;
    /** Plain snapshot of nodes and edges — safe to JSON.stringify or embed. */
    toJSON(): PageGraphSnapshot;
    /** Rebuild a graph from a snapshot. */
    static fromJSON(snap: PageGraphSnapshot): PageGraph;
}
|
|
76
|
+
//# sourceMappingURL=graph.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"graph.d.ts","sourceRoot":"","sources":["../src/graph.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE/C,gFAAgF;AAChF,MAAM,WAAW,QAAQ;IACxB,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACnB;AAED,yCAAyC;AACzC,MAAM,WAAW,QAAQ;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,EAAE,EAAE,MAAM,CAAC;IACX,8BAA8B;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,OAAO,CAAC;CACpB;AAED,sDAAsD;AACtD,MAAM,WAAW,iBAAiB;IACjC,KAAK,EAAE,QAAQ,EAAE,CAAC;IAClB,KAAK,EAAE,QAAQ,EAAE,CAAC;CAClB;AAED;;;;;;;;;;;;;GAaG;AACH,qBAAa,SAAS;IACrB,OAAO,CAAC,QAAQ,CAAC,KAAK,CAA6D;IACnF,2BAA2B;IAC3B,OAAO,CAAC,QAAQ,CAAC,GAAG,CAA+D;IACnF,gCAAgC;IAChC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAA6D;IAEjF,iDAAiD;IACjD,OAAO,CAAC,IAAI,EAAE,YAAY,GAAG,IAAI;IAiBjC,2EAA2E;IAC3E,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,UAAU,EAAE,OAAO,GAAG,IAAI;IAY1E,IAAI,CAAC,GAAG,EAAE,MAAM,GAAG,QAAQ,GAAG,SAAS;IAIvC,kCAAkC;IAClC,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,QAAQ,EAAE;IAIjC,mCAAmC;IACnC,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE;IAI9B,+DAA+D;IAC/D,KAAK,IAAI,QAAQ,EAAE;IAKnB,4DAA4D;IAC5D,KAAK,IAAI,QAAQ,EAAE;IASnB,4EAA4E;IAC5E,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,IAAI;IAmBnD;;;OAGG;IACH,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,QAAQ,EAAE;IAgB3C,0DAA0D;IAC1D,UAAU,IAAI,KAAK,CAAC;QAAE,IAAI,EAAE,QAAQ,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC;IAO7D,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED,IAAI,SAAS,IAAI,MAAM,CAMtB;IAED,wDAAwD;IACxD,MAAM,IAAI,iBAAiB;IAW3B,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,iBAAiB,GAAG,SAAS;CAMnD"}
|
package/dist/graph.js
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
/**
 * Directed knowledge graph of spidered pages.
 *
 * Nodes are pages. Edges are outbound links.
 * Maintains a reverse index (inbound links) so inbound lookups are a direct
 * property read.
 *
 * Internal storage uses plain objects (Object.create(null)) rather than
 * Maps. Plain objects carry no realm-specific internal slots, making them
 * safe across V8 context (realm) boundaries — e.g. when the graph is
 * constructed in an ESM module realm but called from a jiti VM-sandbox.
 * Short-lived Set/Map instances are only used locally inside a single call.
 */
export class PageGraph {
    constructor() {
        /** url → node */
        this.nodes = Object.create(null);
        /** url → outbound edges */
        this.out = Object.create(null);
        /** url → inbound source urls */
        this.in_ = Object.create(null);
    }

    /**
     * Add or update a node from a spidered page and register its links as edges.
     * Links with an empty href are ignored. A page without a `links` or
     * `chunks` array is accepted (no edges / chunkCount 0) instead of throwing.
     */
    addPage(page) {
        this.nodes[page.url] = {
            url: page.url,
            domain: page.domain,
            title: page.title,
            description: page.description,
            wordCount: page.wordCount,
            fetchedAt: page.fetchedAt,
            chunkCount: page.chunks?.length ?? 0,
        };
        for (const link of page.links ?? []) {
            if (!link.href)
                continue;
            this.addEdge(page.url, link.href, link.text, link.isExternal);
        }
    }

    /**
     * Add a directed edge without requiring the target to be spidered yet.
     * At most one edge is kept per (from, to) pair; the first one wins.
     * Edge and inbound lists are replaced rather than mutated, so arrays
     * previously returned by outbound()/inbound() are never changed.
     */
    addEdge(from, to, text, isExternal) {
        const edge = { from, to, text, isExternal };
        const existing = this.out[from] ?? [];
        if (!existing.some((e) => e.to === to)) {
            this.out[from] = [...existing, edge];
        }
        const inbound = this.in_[to] ?? [];
        if (!inbound.includes(from)) {
            this.in_[to] = [...inbound, from];
        }
    }

    /** The node stored for `url`, or undefined. */
    node(url) {
        return this.nodes[url];
    }

    /** Outbound edges from a node (empty array when there are none). */
    outbound(url) {
        return this.out[url] ?? [];
    }

    /** URLs that link TO this page (empty array when there are none). */
    inbound(url) {
        return this.in_[url] ?? [];
    }

    /** Pages with no inbound links — entry points to the graph. */
    roots() {
        return Object.values(this.nodes)
            .filter((n) => n !== undefined && (this.in_[n.url] ?? []).length === 0);
    }

    /** Pages with no outbound links to other spidered nodes. */
    sinks() {
        return Object.values(this.nodes)
            .filter((n) => {
            if (!n)
                return false;
            const edges = this.out[n.url] ?? [];
            return !edges.some((e) => e.to in this.nodes);
        });
    }

    /**
     * BFS shortest path between two page URLs. Returns null if unreachable.
     *
     * The search only expands through URLs that are nodes in the graph, but
     * the destination itself may be an un-spidered link target.
     * Predecessors are tracked in a map and the frontier is walked with a
     * cursor, so no per-step path copies or Array.shift() calls are needed.
     */
    findPath(from, to) {
        if (from === to)
            return [from];
        // url → url it was discovered from; the start maps to undefined.
        const cameFrom = new Map([[from, undefined]]);
        const frontier = [from];
        for (let head = 0; head < frontier.length; head++) {
            const current = frontier[head];
            for (const edge of this.out[current] ?? []) {
                if (edge.to === to) {
                    // Walk predecessors back to the start, then flip the order.
                    const path = [to];
                    for (let at = current; at !== undefined; at = cameFrom.get(at)) {
                        path.push(at);
                    }
                    return path.reverse();
                }
                if (!cameFrom.has(edge.to) && edge.to in this.nodes) {
                    cameFrom.set(edge.to, current);
                    frontier.push(edge.to);
                }
            }
        }
        return null;
    }

    /**
     * All pages reachable from `startUrl` via spidered links, in BFS
     * discovery order. The start page itself is not included.
     * BFS, bounded by the nodes present in the graph.
     */
    reachableFrom(startUrl) {
        const visited = new Set([startUrl]);
        const frontier = [startUrl];
        for (let head = 0; head < frontier.length; head++) {
            for (const edge of this.out[frontier[head]] ?? []) {
                if (!visited.has(edge.to) && edge.to in this.nodes) {
                    visited.add(edge.to);
                    frontier.push(edge.to);
                }
            }
        }
        visited.delete(startUrl);
        return [...visited].map((u) => this.nodes[u]).filter((n) => n !== undefined);
    }

    /** Nodes ranked by inbound link count (highest first). */
    byPageRank() {
        return Object.values(this.nodes)
            .filter((n) => n !== undefined)
            .map((n) => ({ node: n, inboundCount: (this.in_[n.url] ?? []).length }))
            .sort((a, b) => b.inboundCount - a.inboundCount);
    }

    /** Number of nodes in the graph. */
    get nodeCount() {
        return Object.keys(this.nodes).length;
    }

    /** Total number of outbound edges across all source URLs. */
    get edgeCount() {
        let total = 0;
        for (const edges of Object.values(this.out)) {
            if (edges)
                total += edges.length;
        }
        return total;
    }

    /** Plain snapshot — safe to JSON.stringify or embed. */
    toJSON() {
        const edges = [];
        for (const edgeList of Object.values(this.out)) {
            if (!edgeList)
                continue;
            // Element-wise push: spreading a very large list into push() can
            // exceed the engine's argument limit and throw a RangeError.
            for (const edge of edgeList)
                edges.push(edge);
        }
        return {
            nodes: Object.values(this.nodes).filter((n) => n !== undefined),
            edges,
        };
    }

    /** Rebuild a graph from a snapshot: nodes are stored as given, edges are re-added via addEdge. */
    static fromJSON(snap) {
        const g = new PageGraph();
        for (const n of snap.nodes)
            g.nodes[n.url] = n;
        for (const e of snap.edges)
            g.addEdge(e.from, e.to, e.text, e.isExternal);
        return g;
    }
}
|
|
156
|
+
//# sourceMappingURL=graph.js.map
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
export type { SpiderCacheOptions } from "./cache.js";
|
|
2
|
+
export { SpiderCache } from "./cache.js";
|
|
3
|
+
export type { CrawlOptions, CrawlResult } from "./crawl.js";
|
|
4
|
+
export { crawl } from "./crawl.js";
|
|
5
|
+
export type { PageEdge, PageGraphSnapshot, PageNode } from "./graph.js";
|
|
6
|
+
export { PageGraph } from "./graph.js";
|
|
7
|
+
export type { FuzzySearchOptions, SearchHit } from "./search.js";
|
|
8
|
+
export { searchPages } from "./search.js";
|
|
9
|
+
/** @deprecated Use {@link searchPages} — renamed in v0.4.0 to reflect BM25F ranking (not fuzzy-only). */
|
|
10
|
+
export { searchPages as fuzzySearch } from "./search.js";
|
|
11
|
+
export type { SpiderOptions, TreePage } from "./spider.js";
|
|
12
|
+
export { spider } from "./spider.js";
|
|
13
|
+
export type { QueryTreeOptions } from "./tree.js";
|
|
14
|
+
export { buildTree, navigateTree, queryTree } from "./tree.js";
|
|
15
|
+
export type { Chunk, ChunkType, DOMNode, ImageRef, LeanLink, LeanPage, Link, PageView, SpideredPage, TreeHit } from "./types.js";
|
|
16
|
+
export { toLean } from "./views.js";
|
|
17
|
+
export type { BraveSearchOptions, DdgSearchOptions, ExaSearchOptions, FallbackSearchEngineOptions, SearchEngine, TavilySearchOptions, WebSearchResult } from "./web-search.js";
|
|
18
|
+
export { braveSearch, ddgSearch, exaSearch, registerSearchEngine, resolveSearchEngine, tavilySearch, webSearch } from "./web-search.js";
|
|
19
|
+
import type { ICache } from "./ports.js";
|
|
20
|
+
import type { Chunk, SpideredPage } from "./types.js";
|
|
21
|
+
/**
 * Look up one chunk of a cached page, addressed by page URL and chunk index.
 *
 * Lets an agent fetch just the chunk it needs — for instance to re-read a
 * section after a highlights hit — without loading the whole page markdown.
 *
 * The result is undefined if the URL is not in the cache, if the index is
 * negative, or if the index is past the end of the page's chunks.
 *
 * @example
 * const chunk = getChunk(cache, "https://example.com/article", 3)
 * if (chunk) console.log(chunk.text)
 */
export declare function getChunk(cache: ICache<string, SpideredPage>, url: string, index: number): Chunk | undefined;
|
|
35
|
+
export type { HttpRequest, HttpResponse, ICache, IHttpClient, IRobotsChecker, ISearchEngine, IThrottle, RobotsResult, SearchQuery } from "./ports.js";
|
|
36
|
+
export type { DiskCacheOptions } from "./disk-cache.js";
|
|
37
|
+
export { DiskCache } from "./disk-cache.js";
|
|
38
|
+
export type { PlaywrightClientOptions } from "./playwright.js";
|
|
39
|
+
export { PlaywrightHttpClient, createPlaywrightClient } from "./playwright.js";
|
|
40
|
+
export { RobotsCache, createRobotsCache } from "./robots.js";
|
|
41
|
+
export { fetchSitemapUrls } from "./sitemap.js";
|
|
42
|
+
export type { ThrottleOptions } from "./throttle.js";
|
|
43
|
+
export { DomainThrottle, createThrottle } from "./throttle.js";
|
|
44
|
+
export { BraveSearchEngine, DdgSearchEngine, ExaSearchEngine, FallbackSearchEngine, TavilySearchEngine, defaultSearchEngine } from "./web-search.js";
|
|
45
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAIA,YAAY,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AACrD,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AACzC,YAAY,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAC5D,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,YAAY,EAAE,QAAQ,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AACxE,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACvC,YAAY,EAAE,kBAAkB,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACjE,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC1C,yGAAyG;AACzG,OAAO,EAAE,WAAW,IAAI,WAAW,EAAE,MAAM,aAAa,CAAC;AACzD,YAAY,EAAE,aAAa,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAC3D,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,YAAY,EAAE,gBAAgB,EAAE,MAAM,WAAW,CAAC;AAClD,OAAO,EAAE,SAAS,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAC/D,YAAY,EAAE,KAAK,EAAE,SAAS,EAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AACjI,OAAO,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AACpC,YAAY,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,2BAA2B,EAAE,YAAY,EAAE,mBAAmB,EAAE,eAAe,EAAE,MAAM,iBAAiB,CAAC;AAC/K,OAAO,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAMxI,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,KAAK,EAAE,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAEtD;;;;;;;;;;;;GAYG;AACH,wBAAgB,QAAQ,CACvB,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,EACnC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,GACX,KAAK,GAAG,SAAS,CAGnB;AAOD,YAAY,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,EAAE,WAAW,EAAE,cAAc,EAAE,aAAa,EAAE,SAAS,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AACtJ,YAAY,EAAE,gBAAgB,EAAE,MAAM,iBAAiB,CAAC;AACxD,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAC;AAC5C,YAAY,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAC;AAC/D,OAAO,EAAE,oBAAoB,EAAE,sBAAsB,EAAE,MAAM,iBAAiB,CAAC;AAC/E,OAAO,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAC7D,OAAO,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAChD,YAAY,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAC/D,OAAO,EAAE,iBAAiB,EAAE,eAAe,EAAE,eAAe,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Public API — what most consumers need
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
export { SpiderCache } from "./cache.js";
|
|
5
|
+
export { crawl } from "./crawl.js";
|
|
6
|
+
export { PageGraph } from "./graph.js";
|
|
7
|
+
export { searchPages } from "./search.js";
|
|
8
|
+
/** @deprecated Use {@link searchPages} — renamed in v0.4.0 to reflect BM25F ranking (not fuzzy-only). */
|
|
9
|
+
export { searchPages as fuzzySearch } from "./search.js";
|
|
10
|
+
export { spider } from "./spider.js";
|
|
11
|
+
export { buildTree, navigateTree, queryTree } from "./tree.js";
|
|
12
|
+
export { toLean } from "./views.js";
|
|
13
|
+
export { braveSearch, ddgSearch, exaSearch, registerSearchEngine, resolveSearchEngine, tavilySearch, webSearch } from "./web-search.js";
|
|
14
|
+
/**
 * Retrieve a single chunk from a cached page by URL and chunk index.
 *
 * Avoids loading the full page markdown when an agent only needs one
 * specific chunk — e.g. to re-read a section after a highlights hit.
 *
 * Returns undefined when the URL is not cached, the cached page has no
 * chunks, the index is out of range, or the index is not a non-negative
 * integer. The integer check matters for untyped callers: a value such as
 * "length" would otherwise read an array property instead of a chunk.
 *
 * @param cache  Any object exposing `get(url)` that returns a page or undefined.
 * @param url    URL of the cached page.
 * @param index  Zero-based chunk index.
 * @returns The chunk at `index`, or undefined.
 *
 * @example
 * const chunk = getChunk(cache, "https://example.com/article", 3)
 * if (chunk) console.log(chunk.text)
 */
export function getChunk(cache, url, index) {
    if (!Number.isInteger(index) || index < 0)
        return undefined;
    return cache.get(url)?.chunks?.[index];
}
|
|
32
|
+
export { DiskCache } from "./disk-cache.js";
|
|
33
|
+
export { PlaywrightHttpClient, createPlaywrightClient } from "./playwright.js";
|
|
34
|
+
export { RobotsCache, createRobotsCache } from "./robots.js";
|
|
35
|
+
export { fetchSitemapUrls } from "./sitemap.js";
|
|
36
|
+
export { DomainThrottle, createThrottle } from "./throttle.js";
|
|
37
|
+
export { BraveSearchEngine, DdgSearchEngine, ExaSearchEngine, FallbackSearchEngine, TavilySearchEngine, defaultSearchEngine } from "./web-search.js";
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
// parse.ts, convert.ts, views.ts are internal implementation modules.
|
|
40
|
+
// They are NOT exported here — they are consumed only by spider.ts.
|
|
41
|
+
// If you need lower-level DOM or markdown utilities, import from the
|
|
42
|
+
// sub-modules directly (not covered by semver stability guarantees).
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
//# sourceMappingURL=index.js.map
|
package/dist/parse.d.ts
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOM parsing helpers.
|
|
3
|
+
*
|
|
4
|
+
* Owns the DOM parsing dependency. spider.ts calls these after fetching HTML;
|
|
5
|
+
* it never touches the DOM library directly.
|
|
6
|
+
*/
|
|
7
|
+
import type { Link, SpideredPage } from "./types.js";
|
|
8
|
+
/**
 * Turn raw HTML into a DOM Document.
 *
 * Backed by linkedom, a lightweight server-side DOM with no CSS engine,
 * no module-level Maps and a flat CJS dependency tree, so it can be loaded
 * through jiti's transform pipeline without nativeModules workarounds.
 */
export declare function parseDom(html: string, url: string): Document;
|
|
15
|
+
/** Reports whether `el`, or one of its ancestors up to 5 levels up, looks like navigation chrome. */
export declare function isNavElement(el: Element): boolean;
|
|
17
|
+
/** Visible text of an anchor element, with any SVG subtrees left out. */
export declare function anchorText(a: Element): string;
|
|
19
|
+
/** Collect the document's outbound links, each classified as a body link or a nav link. */
export declare function extractLinks(doc: Document, baseUrl: string): Link[];
|
|
21
|
+
/** Pull the h1/h2/h3 headings out of Readability article HTML. */
export declare function extractHeadings(html: string): SpideredPage["headings"];
|
|
23
|
+
/** Topic tags gathered from the document's meta keywords and article:tag entries. */
export declare function extractTags(doc: Document): string[];
|
|
25
|
+
/** Canonical URL taken from link[rel=canonical] or og:url; undefined when none is returned. */
export declare function extractCanonicalUrl(doc: Document, fetchedUrl: string): string | undefined;
|
|
27
|
+
//# sourceMappingURL=parse.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parse.d.ts","sourceRoot":"","sources":["../src/parse.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,IAAI,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAMrD;;;;;GAKG;AACH,wBAAgB,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,QAAQ,CAE5D;AASD,8EAA8E;AAC9E,wBAAgB,YAAY,CAAC,EAAE,EAAE,OAAO,GAAG,OAAO,CAkBjD;AAMD,kEAAkE;AAClE,wBAAgB,UAAU,CAAC,CAAC,EAAE,OAAO,GAAG,MAAM,CAO7C;AAMD,sEAAsE;AACtE,wBAAgB,YAAY,CAAC,GAAG,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,GAAG,IAAI,EAAE,CAuBnE;AAMD,+DAA+D;AAC/D,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAAC,UAAU,CAAC,CAStE;AAMD,6DAA6D;AAC7D,wBAAgB,WAAW,CAAC,GAAG,EAAE,QAAQ,GAAG,MAAM,EAAE,CAsBnD;AAMD,gEAAgE;AAChE,wBAAgB,mBAAmB,CAAC,GAAG,EAAE,QAAQ,EAAE,UAAU,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAOzF"}
|