@dpopsuev/web-spider 0.10.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/batch.d.ts +24 -0
- package/dist/batch.d.ts.map +1 -0
- package/dist/batch.js +68 -0
- package/dist/cache.d.ts +40 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +78 -0
- package/dist/convert.d.ts +29 -0
- package/dist/convert.d.ts.map +1 -0
- package/dist/convert.js +131 -0
- package/dist/crawl.d.ts +56 -0
- package/dist/crawl.d.ts.map +1 -0
- package/dist/crawl.js +126 -0
- package/dist/disk-cache.d.ts +75 -0
- package/dist/disk-cache.d.ts.map +1 -0
- package/dist/disk-cache.js +185 -0
- package/dist/graph.d.ts +76 -0
- package/dist/graph.d.ts.map +1 -0
- package/dist/graph.js +156 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +44 -0
- package/dist/parse.d.ts +27 -0
- package/dist/parse.d.ts.map +1 -0
- package/dist/parse.js +131 -0
- package/dist/playwright.d.ts +75 -0
- package/dist/playwright.d.ts.map +1 -0
- package/dist/playwright.js +141 -0
- package/dist/ports.d.ts +104 -0
- package/dist/ports.d.ts.map +1 -0
- package/dist/ports.js +10 -0
- package/dist/robots.d.ts +24 -0
- package/dist/robots.d.ts.map +1 -0
- package/dist/robots.js +104 -0
- package/dist/search.d.ts +47 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +112 -0
- package/dist/sitemap.d.ts +15 -0
- package/dist/sitemap.d.ts.map +1 -0
- package/dist/sitemap.js +65 -0
- package/dist/spider.d.ts +74 -0
- package/dist/spider.d.ts.map +1 -0
- package/dist/spider.js +349 -0
- package/dist/throttle.d.ts +49 -0
- package/dist/throttle.d.ts.map +1 -0
- package/dist/throttle.js +85 -0
- package/dist/tree.d.ts +34 -0
- package/dist/tree.d.ts.map +1 -0
- package/dist/tree.js +354 -0
- package/dist/types.d.ts +189 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +2 -0
- package/dist/views.d.ts +17 -0
- package/dist/views.d.ts.map +1 -0
- package/dist/views.js +39 -0
- package/dist/web-search.d.ts +184 -0
- package/dist/web-search.d.ts.map +1 -0
- package/dist/web-search.js +399 -0
- package/fixtures/article-with-images.html +94 -0
- package/fixtures/gh-shell.html +32 -0
- package/fixtures/guide-ai-agents-web-scraping.json +552 -0
- package/fixtures/images/large.jpg +0 -0
- package/fixtures/images/small.jpg +0 -0
- package/fixtures/images/tiny.png +0 -0
- package/fixtures/quotes-index.json +40 -0
- package/package.json +47 -0
- package/scripts/fetch-guide.mjs +25 -0
- package/src/cache.ts +99 -0
- package/src/convert.ts +161 -0
- package/src/crawl.ts +186 -0
- package/src/disk-cache.ts +228 -0
- package/src/graph.ts +189 -0
- package/src/index.ts +74 -0
- package/src/parse.ts +154 -0
- package/src/playwright.ts +193 -0
- package/src/ports.ts +131 -0
- package/src/robots.ts +121 -0
- package/src/search.ts +173 -0
- package/src/sitemap.ts +67 -0
- package/src/spider.ts +475 -0
- package/src/throttle.ts +118 -0
- package/src/tree.ts +379 -0
- package/src/types.ts +225 -0
- package/src/views.ts +42 -0
- package/src/web-search.ts +548 -0
- package/test/convert-images.test.ts +69 -0
- package/test/disk-cache-images.test.ts +193 -0
- package/test/engine-registry.test.ts +114 -0
- package/test/exports.test.ts +124 -0
- package/test/get-chunk.test.ts +115 -0
- package/test/images-integration.test.ts +359 -0
- package/test/improvements.test.ts +279 -0
- package/test/inbound-count.test.ts +111 -0
- package/test/lean.test.ts +105 -0
- package/test/playwright.test.ts +128 -0
- package/test/ports.test.ts +161 -0
- package/test/search.test.ts +219 -0
- package/test/spider-images.test.ts +180 -0
- package/test/spider-unit.test.ts +610 -0
- package/test/tree.test.ts +272 -0
- package/test/types.test.ts +169 -0
- package/test/web-search-integration.test.ts +180 -0
- package/test/web-search.test.ts +305 -0
- package/tsconfig.json +9 -0
- package/tsconfig.test.json +7 -0
- package/vitest.config.ts +8 -0
package/dist/throttle.js
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-domain request throttle with exponential backoff and jitter.
|
|
3
|
+
*
|
|
4
|
+
* Enforces a minimum gap between requests to the same hostname.
|
|
5
|
+
* On 429/503, backs off exponentially and respects Retry-After headers.
|
|
6
|
+
* Shared instances should be passed into spider() and crawl() so that
|
|
7
|
+
* all requests to a domain coordinate through one rate limiter.
|
|
8
|
+
*/
|
|
9
|
+
function sleep(ms) {
|
|
10
|
+
return new Promise((r) => setTimeout(r, ms));
|
|
11
|
+
}
|
|
12
|
+
/**
 * Convert a Retry-After header value into a delay in milliseconds.
 *
 * Two forms are understood: a non-negative whole number of seconds, or a
 * date string that `Date` can parse. Only an all-digit value is treated as
 * seconds. A bare parseInt() would accept any string that merely starts with
 * a number, so a date like "2030-01-01T00:00:00Z" would be read as 2030
 * seconds and "-5" would yield a negative delay.
 *
 * @param header Raw header value; may be null, undefined or empty.
 * @returns Delay in ms, never negative. 0 when the header is missing,
 *          unparseable, or names a moment that has already passed.
 */
function parseRetryAfter(header) {
    if (!header)
        return 0;
    const value = String(header).trim();
    // delay-seconds form: digits only.
    if (/^\d+$/.test(value))
        return parseInt(value, 10) * 1_000;
    // Date form: wait until that instant, clamped so past dates mean "no wait".
    const date = new Date(value).getTime();
    if (!isNaN(date))
        return Math.max(0, date - Date.now());
    return 0;
}
|
|
23
|
+
export class DomainThrottle {
|
|
24
|
+
constructor(opts = {}) {
|
|
25
|
+
this.states = new Map();
|
|
26
|
+
this.minDelayMs = opts.minDelayMs ?? 500;
|
|
27
|
+
this.backoffBaseMs = opts.backoffBaseMs ?? 1_000;
|
|
28
|
+
this.backoffCapMs = opts.backoffCapMs ?? 30_000;
|
|
29
|
+
this.maxRetries = opts.maxRetries ?? 3;
|
|
30
|
+
}
|
|
31
|
+
state(host) {
|
|
32
|
+
let s = this.states.get(host);
|
|
33
|
+
if (!s) {
|
|
34
|
+
s = { lastAt: 0, backoffUntil: 0, errors: 0 };
|
|
35
|
+
this.states.set(host, s);
|
|
36
|
+
}
|
|
37
|
+
return s;
|
|
38
|
+
}
|
|
39
|
+
/** Wait until the domain's rate limit and backoff have cleared. */
|
|
40
|
+
async wait(url) {
|
|
41
|
+
const s = this.state(new URL(url).hostname);
|
|
42
|
+
const minDelay = s.minDelayMs ?? this.minDelayMs;
|
|
43
|
+
const now = Date.now();
|
|
44
|
+
const delay = Math.max(Math.max(0, s.backoffUntil - now), Math.max(0, s.lastAt + minDelay - now));
|
|
45
|
+
if (delay > 0)
|
|
46
|
+
await sleep(delay);
|
|
47
|
+
s.lastAt = Date.now();
|
|
48
|
+
}
|
|
49
|
+
/** Record a successful request — resets backoff for the domain. */
|
|
50
|
+
success(url) {
|
|
51
|
+
const s = this.state(new URL(url).hostname);
|
|
52
|
+
s.errors = 0;
|
|
53
|
+
s.backoffUntil = 0;
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* Record a rate-limit hit. Applies exponential backoff with jitter,
|
|
57
|
+
* using Retry-After header when present. Returns the wait duration in ms.
|
|
58
|
+
*/
|
|
59
|
+
rateLimit(url, retryAfterHeader) {
|
|
60
|
+
const s = this.state(new URL(url).hostname);
|
|
61
|
+
s.errors++;
|
|
62
|
+
const retryAfterMs = parseRetryAfter(retryAfterHeader);
|
|
63
|
+
const jitter = Math.random() * this.backoffBaseMs;
|
|
64
|
+
const backoffMs = Math.min(this.backoffCapMs, this.backoffBaseMs * 2 ** (s.errors - 1) + jitter);
|
|
65
|
+
const waitMs = Math.max(retryAfterMs, backoffMs);
|
|
66
|
+
s.backoffUntil = Date.now() + waitMs;
|
|
67
|
+
return waitMs;
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Override the minimum delay for a specific domain.
|
|
71
|
+
* Used to honour robots.txt Crawl-delay directives.
|
|
72
|
+
*/
|
|
73
|
+
setDomainDelay(host, ms) {
|
|
74
|
+
this.state(host).minDelayMs = ms;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
/**
 * Build a DomainThrottle without the caller touching the constructor.
 *
 * Works around jiti/Bun CJS re-export interop, where a class reached through
 * a re-export chain can show up as undefined at the call site. Extension
 * code should call this rather than `new DomainThrottle()`.
 */
export function createThrottle(opts) {
    const throttle = new DomainThrottle(opts);
    return throttle;
}
|
|
85
|
+
//# sourceMappingURL=throttle.js.map
|
package/dist/tree.d.ts
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import type { DOMNode, TreeHit } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Build a simplified semantic DOM tree from Readability article HTML.
|
|
4
|
+
*
|
|
5
|
+
* The root is always an "article" node. Presentational wrappers are collapsed,
|
|
6
|
+
* single-child chains are simplified, and only semantic tags survive.
|
|
7
|
+
*/
|
|
8
|
+
export declare function buildTree(articleHtml: string, baseUrl: string): DOMNode;
|
|
9
|
+
/**
|
|
10
|
+
* Navigate to a specific node by its dot-bracket path.
|
|
11
|
+
* Returns null if the path does not exist in the tree.
|
|
12
|
+
*
|
|
13
|
+
* @example navigateTree(tree, "article.section[1].pre")
|
|
14
|
+
*/
|
|
15
|
+
export declare function navigateTree(root: DOMNode, path: string): DOMNode | null;
|
|
16
|
+
export interface QueryTreeOptions {
|
|
17
|
+
/** Max hits to return (default 10). */
|
|
18
|
+
topN?: number;
|
|
19
|
+
/** Context chars around match in snippet (default 100). */
|
|
20
|
+
snippetRadius?: number;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Fuzzy-search a DOM tree for a query string.
|
|
24
|
+
*
|
|
25
|
+
* Returns hits ranked by score. Each hit is the nearest semantic ancestor
|
|
26
|
+
* that contains the match (a section, li, pre, p — not a raw div). This
|
|
27
|
+
* means code blocks and table rows are always returned whole.
|
|
28
|
+
*
|
|
29
|
+
* @example
|
|
30
|
+
* const hits = queryTree(tree, "declaration merge", { topN: 3 })
|
|
31
|
+
* // hits[0].node is the full <section> containing that heading
|
|
32
|
+
*/
|
|
33
|
+
export declare function queryTree(root: DOMNode, query: string, opts?: QueryTreeOptions): TreeHit[];
|
|
34
|
+
//# sourceMappingURL=tree.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tree.d.ts","sourceRoot":"","sources":["../src/tree.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AA6MnD;;;;;GAKG;AACH,wBAAgB,SAAS,CAAC,WAAW,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAmBvE;AAeD;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,GAAG,IAAI,CAGxE;AAoED,MAAM,WAAW,gBAAgB;IAChC,uCAAuC;IACvC,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,2DAA2D;IAC3D,aAAa,CAAC,EAAE,MAAM,CAAC;CACvB;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,GAAE,gBAAqB,GAAG,OAAO,EAAE,CAqC9F"}
|
package/dist/tree.js
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
import { parseHTML } from "linkedom";
|
|
2
|
+
// ---------------------------------------------------------------------------
|
|
3
|
+
// Semantic tag sets
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
/**
|
|
6
|
+
* Tags that are kept as-is in the simplified tree.
|
|
7
|
+
* Everything else is either collapsed (single-child wrappers) or stripped.
|
|
8
|
+
*/
|
|
9
|
+
const BLOCK_TAGS = new Set([
|
|
10
|
+
"article",
|
|
11
|
+
"section",
|
|
12
|
+
"main",
|
|
13
|
+
"aside",
|
|
14
|
+
"h1",
|
|
15
|
+
"h2",
|
|
16
|
+
"h3",
|
|
17
|
+
"h4",
|
|
18
|
+
"h5",
|
|
19
|
+
"h6",
|
|
20
|
+
"p",
|
|
21
|
+
"blockquote",
|
|
22
|
+
"pre",
|
|
23
|
+
"figure",
|
|
24
|
+
"figcaption",
|
|
25
|
+
"ul",
|
|
26
|
+
"ol",
|
|
27
|
+
"li",
|
|
28
|
+
"table",
|
|
29
|
+
"thead",
|
|
30
|
+
"tbody",
|
|
31
|
+
"tfoot",
|
|
32
|
+
"tr",
|
|
33
|
+
"th",
|
|
34
|
+
"td",
|
|
35
|
+
"details",
|
|
36
|
+
"summary",
|
|
37
|
+
]);
|
|
38
|
+
const INLINE_TAGS = new Set(["a", "code", "strong", "em", "abbr", "time", "mark", "s", "del", "ins"]);
|
|
39
|
+
const SEMANTIC_TAGS = new Set([...BLOCK_TAGS, ...INLINE_TAGS]);
|
|
40
|
+
/** Tags whose subtrees should be flattened to a single text node. */
|
|
41
|
+
const LEAF_CONTAINERS = new Set(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "td", "th", "figcaption", "summary"]);
|
|
42
|
+
/** Tags where we want to preserve full child structure. */
|
|
43
|
+
const BRANCH_CONTAINERS = new Set([
|
|
44
|
+
"pre",
|
|
45
|
+
"ul",
|
|
46
|
+
"ol",
|
|
47
|
+
"table",
|
|
48
|
+
"thead",
|
|
49
|
+
"tbody",
|
|
50
|
+
"tfoot",
|
|
51
|
+
"tr",
|
|
52
|
+
"section",
|
|
53
|
+
"article",
|
|
54
|
+
"aside",
|
|
55
|
+
"blockquote",
|
|
56
|
+
"details",
|
|
57
|
+
"figure",
|
|
58
|
+
]);
|
|
59
|
+
// ---------------------------------------------------------------------------
|
|
60
|
+
// Tree building
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
/**
|
|
63
|
+
* Extract the language from a <code> or <pre> element's class attribute.
|
|
64
|
+
* Turndown and most syntax highlighters use class="language-typescript" etc.
|
|
65
|
+
*/
|
|
66
|
+
function extractLang(el) {
|
|
67
|
+
const cls = el.getAttribute("class") ?? "";
|
|
68
|
+
const m = /language-([a-zA-Z0-9_+-]+)/.exec(cls);
|
|
69
|
+
return m ? m[1] : undefined;
|
|
70
|
+
}
|
|
71
|
+
/** Return the element's textContent with every whitespace run squeezed to one space and the ends trimmed. */
function flattenText(el) {
    const raw = el.textContent ?? "";
    const singleSpaced = raw.replace(/\s+/g, " ");
    return singleSpaced.trim();
}
|
|
75
|
+
/**
 * Recursively build a DOMNode from an Element.
 *
 * @param el Element to convert.
 * @param pathPrefix Path of the parent node; empty/falsy means `el` is the path root.
 * @param siblingIndex Map of tag name → number of same-tag siblings already
 *        seen under this parent. It is updated here, even when the element
 *        ends up producing no node.
 * @returns The node, or null if the element has no meaningful content.
 */
function buildNode(el, pathPrefix, siblingIndex) {
    const tag = el.tagName.toLowerCase();
    // Path segment: the first sibling of a tag is written bare ("p"), later
    // ones carry their zero-based position ("p[1]", "p[2]", ...).
    const count = siblingIndex.get(tag) ?? 0;
    siblingIndex.set(tag, count + 1);
    const segment = count === 0 ? tag : `${tag}[${count}]`;
    const path = pathPrefix ? `${pathPrefix}.${segment}` : segment;
    // Attrs — only a handful of semantically useful attributes are kept.
    const attrs = {};
    if (tag === "a") {
        const href = el.getAttribute("href");
        // Drop javascript: URLs. Whitespace/control characters are stripped
        // and the comparison is case-insensitive so that variants such as
        // " JavaScript:..." or "java\tscript:..." are rejected as well.
        if (href && !/^javascript:/i.test(href.replace(/[\x00-\x20]/g, "")))
            attrs.href = href;
    }
    if (tag === "code" || tag === "pre") {
        // Prefer the element's own class; otherwise look at a nested <code>.
        const lang = extractLang(el) ?? extractLang(el.querySelector("code") ?? el);
        if (lang)
            attrs.lang = lang;
    }
    if (tag === "abbr") {
        const title = el.getAttribute("title");
        if (title)
            attrs.title = title;
    }
    if (tag === "time") {
        const dt = el.getAttribute("datetime");
        if (dt)
            attrs.datetime = dt;
    }
    // Only attach `attrs` when something was collected.
    const attrsPart = Object.keys(attrs).length > 0 ? { attrs } : {};
    // Leaf containers — flatten to text
    if (LEAF_CONTAINERS.has(tag)) {
        const text = flattenText(el);
        if (!text)
            return null;
        return { tag, path, text, ...attrsPart };
    }
    // pre — treat the entire block (including nested <code>) as one leaf.
    // Internal whitespace is preserved; only trailing whitespace is removed.
    if (tag === "pre") {
        const text = (el.textContent ?? "").trimEnd();
        if (!text.trim())
            return null;
        return { tag, path, text, ...attrsPart };
    }
    // Branch containers — recurse into element children
    const children = [];
    const childIndex = new Map();
    for (const child of Array.from(el.children)) {
        const childTag = child.tagName.toLowerCase();
        if (SEMANTIC_TAGS.has(childTag)) {
            const node = buildNode(child, path, childIndex);
            if (node)
                children.push(node);
        }
        else {
            // Non-semantic wrapper: collapse by recursing with the same path/index
            const collapsed = collapseWrapper(child, path, childIndex);
            children.push(...collapsed);
        }
    }
    if (children.length === 0) {
        // Branch with no semantic children — try as text leaf
        const text = flattenText(el);
        if (!text)
            return null;
        return { tag, path, text, ...attrsPart };
    }
    // A non-branch element with exactly one child is replaced by that child.
    // The promoted child keeps its own path, which still contains this
    // element's segment.
    if (children.length === 1 && !BRANCH_CONTAINERS.has(tag)) {
        return children[0];
    }
    return { tag, path, children, ...attrsPart };
}
|
|
167
|
+
/**
 * Flatten a non-semantic wrapper element into the semantic nodes found
 * beneath it. Nested non-semantic wrappers are flattened recursively, and
 * every produced node is numbered against the caller's `siblingIndex` under
 * `pathPrefix`, as if the wrapper were not there.
 *
 * When nothing semantic is found, the wrapper's own flattened text becomes a
 * synthetic "p" node, provided it is longer than 20 characters.
 */
function collapseWrapper(el, pathPrefix, siblingIndex) {
    const found = [];
    for (const child of Array.from(el.children)) {
        if (!SEMANTIC_TAGS.has(child.tagName.toLowerCase())) {
            found.push(...collapseWrapper(child, pathPrefix, siblingIndex));
            continue;
        }
        const built = buildNode(child, pathPrefix, siblingIndex);
        if (built)
            found.push(built);
    }
    if (found.length > 0)
        return found;
    // Nothing semantic inside: fall back to the wrapper's text as a paragraph.
    const text = flattenText(el);
    if (text.length <= 20)
        return found;
    const seen = siblingIndex.get("p") ?? 0;
    siblingIndex.set("p", seen + 1);
    const segment = seen === 0 ? "p" : `p[${seen}]`;
    found.push({ tag: "p", path: `${pathPrefix}.${segment}`, text });
    return found;
}
|
|
195
|
+
/**
 * Turn article HTML into the simplified semantic tree.
 *
 * The HTML is wrapped in <html><body>…</body></html> and parsed; each
 * top-level element of the body is then either converted directly (semantic
 * tags) or flattened (anything else). The result is always rooted at a
 * synthetic node with tag and path "article".
 *
 * @param articleHtml Article markup to parse.
 * @param baseUrl Passed to the parser as its `url` option.
 */
export function buildTree(articleHtml, baseUrl) {
    const parsed = parseHTML(`<html><body>${articleHtml}</body></html>`, { url: baseUrl });
    const topLevel = Array.from(parsed.document.body.children);
    const rootIndex = new Map();
    const children = [];
    for (const el of topLevel) {
        if (!SEMANTIC_TAGS.has(el.tagName.toLowerCase())) {
            children.push(...collapseWrapper(el, "article", rootIndex));
            continue;
        }
        const node = buildNode(el, "article", rootIndex);
        if (node)
            children.push(node);
    }
    return { tag: "article", path: "article", children };
}
|
|
220
|
+
// ---------------------------------------------------------------------------
|
|
221
|
+
// Tree navigation
|
|
222
|
+
// ---------------------------------------------------------------------------
|
|
223
|
+
/** Collect all nodes in the tree as a flat list (depth-first). */
|
|
224
|
+
function allNodes(node) {
|
|
225
|
+
const result = [node];
|
|
226
|
+
if (node.children) {
|
|
227
|
+
for (const child of node.children)
|
|
228
|
+
result.push(...allNodes(child));
|
|
229
|
+
}
|
|
230
|
+
return result;
|
|
231
|
+
}
|
|
232
|
+
/**
|
|
233
|
+
* Navigate to a specific node by its dot-bracket path.
|
|
234
|
+
* Returns null if the path does not exist in the tree.
|
|
235
|
+
*
|
|
236
|
+
* @example navigateTree(tree, "article.section[1].pre[0]")
|
|
237
|
+
*/
|
|
238
|
+
export function navigateTree(root, path) {
|
|
239
|
+
const nodes = allNodes(root);
|
|
240
|
+
return nodes.find((n) => n.path === path) ?? null;
|
|
241
|
+
}
|
|
242
|
+
// ---------------------------------------------------------------------------
|
|
243
|
+
// Tree fuzzy search
|
|
244
|
+
// ---------------------------------------------------------------------------
|
|
245
|
+
/**
 * Text of a node: its own `text` when that is non-empty, otherwise the texts
 * of its children (computed the same way) joined with single spaces.
 * A node with neither yields "".
 */
function nodeText(node) {
    if (node.text)
        return node.text;
    const kids = node.children;
    if (!kids)
        return "";
    const parts = [];
    for (const kid of kids)
        parts.push(nodeText(kid));
    return parts.join(" ");
}
|
|
253
|
+
/** Semantic "block" tags that make good hit containers. */
|
|
254
|
+
const HIT_CONTAINERS = new Set([
|
|
255
|
+
"section",
|
|
256
|
+
"article",
|
|
257
|
+
"aside",
|
|
258
|
+
"blockquote",
|
|
259
|
+
"details",
|
|
260
|
+
"li",
|
|
261
|
+
"pre",
|
|
262
|
+
"p",
|
|
263
|
+
"figure",
|
|
264
|
+
"h1",
|
|
265
|
+
"h2",
|
|
266
|
+
"h3",
|
|
267
|
+
"h4",
|
|
268
|
+
"h5",
|
|
269
|
+
"h6",
|
|
270
|
+
"tr",
|
|
271
|
+
]);
|
|
272
|
+
/**
 * Rate how well `text` matches a query, in the range 0–1.
 *
 * 0.6 is awarded when the lower-cased text contains the whole query, and the
 * remaining 0.4 is split evenly across the query tokens, each earned when
 * that token occurs in the text. Empty text scores 0.
 *
 * @param text Text to score.
 * @param queryTokens Query tokens, compared as-is against the lower-cased text.
 * @param fullQuery Whole query, compared as-is against the lower-cased text.
 */
function scoreText(text, queryTokens, fullQuery) {
    if (!text)
        return 0;
    const haystack = text.toLowerCase();
    const tokenWeight = 0.4 / Math.max(queryTokens.length, 1);
    let total = haystack.includes(fullQuery) ? 0.6 : 0;
    queryTokens.forEach((token) => {
        if (haystack.includes(token))
            total += tokenWeight;
    });
    return Math.min(total, 1);
}
|
|
288
|
+
/**
 * Cut a short excerpt of `text` around a match.
 *
 * The anchor is the first occurrence of the whole query; failing that, the
 * first occurrence of the first query token that appears at all; failing
 * that, the start of the text. The excerpt spans `radius` characters before
 * the anchor and query-length + `radius` characters after it, has its
 * whitespace collapsed, and gets "…" on whichever side was cut.
 */
function buildSnippet(text, fullQuery, queryTokens, radius = 100) {
    const haystack = text.toLowerCase();
    let anchor = haystack.indexOf(fullQuery);
    if (anchor < 0) {
        const firstTokenHit = queryTokens
            .map((token) => haystack.indexOf(token))
            .find((index) => index !== -1);
        anchor = firstTokenHit ?? 0;
    }
    const from = Math.max(0, anchor - radius);
    const to = Math.min(text.length, anchor + Math.max(fullQuery.length, 1) + radius);
    const excerpt = text.slice(from, to).replace(/\s+/g, " ").trim();
    const lead = from > 0 ? "…" : "";
    const tail = to < text.length ? "…" : "";
    return lead + excerpt + tail;
}
|
|
308
|
+
/**
 * Search a DOM tree for a query string and return ranked hits.
 *
 * Only nodes whose tag is a hit container (section, li, pre, p, tr, … —
 * never the "article" root) are considered, so code blocks and table rows
 * come back whole. A node's text is scored against the lower-cased query;
 * zero-score nodes are skipped. A blank query returns [].
 *
 * @param root Tree to search.
 * @param query Free-text query; trimmed and lower-cased before matching.
 * @param opts topN (default 10) limits the result count; snippetRadius
 *        (default 100) is the context size on each side of the match.
 *
 * @example
 * const hits = queryTree(tree, "declaration merge", { topN: 3 })
 * // hits[0].node is the full <section> containing that heading
 */
export function queryTree(root, query, opts = {}) {
    const { topN = 10, snippetRadius = 100 } = opts;
    const fullQuery = query.trim().toLowerCase();
    if (fullQuery === "")
        return [];
    // One-character tokens are ignored for per-token scoring.
    const queryTokens = fullQuery.split(/\s+/).filter((token) => token.length > 1);
    const hits = [];
    for (const node of allNodes(root)) {
        const eligible = HIT_CONTAINERS.has(node.tag) && node.path !== "article";
        if (!eligible)
            continue;
        const text = nodeText(node);
        const score = scoreText(text, queryTokens, fullQuery);
        if (score !== 0) {
            hits.push({
                path: node.path,
                score,
                node,
                snippet: buildSnippet(text, fullQuery, queryTokens, snippetRadius),
            });
        }
    }
    // Best score first.
    hits.sort((a, b) => b.score - a.score);
    // Deduplicate: drop a hit when an entry ranked before it is one of its
    // ancestors (by path prefix) and scores at least as high.
    const deduped = hits.filter((hit, i) => {
        for (let j = 0; j < i; j++) {
            const earlier = hits[j];
            if (earlier.score >= hit.score && hit.path.startsWith(`${earlier.path}.`))
                return false;
        }
        return true;
    });
    return deduped.slice(0, topN);
}
|
|
354
|
+
//# sourceMappingURL=tree.js.map
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
/** Selects how much content spider() returns. */
|
|
2
|
+
export type PageView = "lean" | "full" | "tree";
|
|
3
|
+
/**
|
|
4
|
+
* A single node in the simplified DOM tree.
|
|
5
|
+
*
|
|
6
|
+
* The tree is built from the Readability article HTML with all presentational
|
|
7
|
+
* wrapper elements collapsed. Only semantically meaningful tags survive.
|
|
8
|
+
* Single-child chains (div > div > p) are reduced to the leaf (p).
|
|
9
|
+
*
|
|
10
|
+
* Paths use bracket notation for the second and later siblings of the same tag (the first has no index):
|
|
11
|
+
* "article.section[1].pre"
|
|
12
|
+
*
|
|
13
|
+
* Agents can:
|
|
14
|
+
* - Read the tree to understand page structure without fetching full markdown.
|
|
15
|
+
* - Call navigateTree(tree, path) to extract one exact node.
|
|
16
|
+
* - Call queryTree(tree, query) to fuzzy-search and get matching subtrees.
|
|
17
|
+
*/
|
|
18
|
+
export interface DOMNode {
|
|
19
|
+
/** HTML tag name, lower-cased. */
|
|
20
|
+
tag: string;
|
|
21
|
+
/** Stable dot-bracket path from the tree root, e.g. "article.section[1].pre". */
|
|
22
|
+
path: string;
|
|
23
|
+
/**
|
|
24
|
+
* Text content of this node.
|
|
25
|
+
* Set on leaf nodes only: the element's text content (whitespace-collapsed, except inside <pre>).
|
|
26
|
+
* Omitted when the node has children to avoid duplication.
|
|
27
|
+
*/
|
|
28
|
+
text?: string;
|
|
29
|
+
/**
|
|
30
|
+
* Semantically useful attributes only.
|
|
31
|
+
* a → href, code/pre → lang (from class="language-*"), abbr → title, time → datetime.
|
|
32
|
+
*/
|
|
33
|
+
attrs?: Record<string, string>;
|
|
34
|
+
/** Child nodes. Present on branch nodes, absent on leaves. */
|
|
35
|
+
children?: DOMNode[];
|
|
36
|
+
}
|
|
37
|
+
/** A hit returned by queryTree — a matching subtree with score and context. */
|
|
38
|
+
export interface TreeHit {
|
|
39
|
+
/** Dot-bracket path of the matching node. */
|
|
40
|
+
path: string;
|
|
41
|
+
/** Score 0–1. Higher is a better match. */
|
|
42
|
+
score: number;
|
|
43
|
+
/** The matching node (may be a branch — e.g. a whole section). */
|
|
44
|
+
node: DOMNode;
|
|
45
|
+
/** Short context around the first match: up to snippetRadius chars on each side (about 200 chars plus the query length at the default radius). */
|
|
46
|
+
snippet: string;
|
|
47
|
+
}
|
|
48
|
+
/** Dominant content type of a chunk — detected from the markdown buffer. */
|
|
49
|
+
export type ChunkType = "text" | "code" | "table" | "list" | "blockquote";
|
|
50
|
+
/** One embeddable, self-contained segment of a page. The unit of RAG. */
|
|
51
|
+
export interface Chunk {
|
|
52
|
+
/** Stable reference: "<url>#chunk-<index>" */
|
|
53
|
+
id: string;
|
|
54
|
+
index: number;
|
|
55
|
+
/** Nearest ancestor heading, empty string if none */
|
|
56
|
+
heading: string;
|
|
57
|
+
/** Clean Markdown text */
|
|
58
|
+
text: string;
|
|
59
|
+
wordCount: number;
|
|
60
|
+
/** Dominant content type — lets agents skip code/table chunks when summarising. */
|
|
61
|
+
contentType: ChunkType;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* A single image scraped from a page.
|
|
65
|
+
*
|
|
66
|
+
* Storage contract:
|
|
67
|
+
* - base64 is populated when the image is small enough to store inline.
|
|
68
|
+
* - filePath is populated when the image has been spilled to disk.
|
|
69
|
+
* - At least one of base64 or filePath is present on a hydrated ImageRef.
|
|
70
|
+
*
|
|
71
|
+
* LLM wire format (works with OpenAI, Anthropic, Together, Gemini):
|
|
72
|
+
* `data:${mimeType};base64,${base64}`
|
|
73
|
+
*/
|
|
74
|
+
export interface ImageRef {
|
|
75
|
+
/** Original absolute src URL of the image. */
|
|
76
|
+
src: string;
|
|
77
|
+
/** Base64-encoded image bytes. Omitted when the image is stored on disk. */
|
|
78
|
+
base64?: string;
|
|
79
|
+
/** MIME type detected from Content-Type or src extension, e.g. "image/jpeg". */
|
|
80
|
+
mimeType: string;
|
|
81
|
+
/** Alt text from the <img> tag, empty string when absent. */
|
|
82
|
+
alt: string;
|
|
83
|
+
/** Path to the binary file when the image has been persisted to disk. */
|
|
84
|
+
filePath?: string;
|
|
85
|
+
}
|
|
86
|
+
/** An outbound link — one edge in the knowledge graph. */
|
|
87
|
+
export interface Link {
|
|
88
|
+
href: string;
|
|
89
|
+
text: string;
|
|
90
|
+
isExternal: boolean;
|
|
91
|
+
/**
|
|
92
|
+
* Where in the page the link was found.
|
|
93
|
+
* "body" — inside the article content (strongest signal).
|
|
94
|
+
* "nav" — inside nav, header, footer, or aside (navigation chrome).
|
|
95
|
+
*/
|
|
96
|
+
rel: "body" | "nav";
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* Minimal link for lean views — isExternal omitted (inferable from the URL).
|
|
100
|
+
* Saves tokens when pages carry hundreds of links.
|
|
101
|
+
*/
|
|
102
|
+
export interface LeanLink {
|
|
103
|
+
href: string;
|
|
104
|
+
text: string;
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Compact page view — identity, metadata, and structural outline only.
|
|
108
|
+
* No chunk text, no markdown body. Use when deciding whether/where to dig
|
|
109
|
+
* deeper. Roughly 5–20× fewer tokens than a full SpideredPage.
|
|
110
|
+
*
|
|
111
|
+
* Headings are flat markdown strings ("## Section") rather than objects —
|
|
112
|
+
* same information, ~half the tokens.
|
|
113
|
+
*/
|
|
114
|
+
export interface LeanPage {
|
|
115
|
+
readonly view: "lean";
|
|
116
|
+
url: string;
|
|
117
|
+
domain: string;
|
|
118
|
+
/** Canonical URL when it differs from the fetched URL (og:url / link[rel=canonical]). */
|
|
119
|
+
canonicalUrl?: string;
|
|
120
|
+
title: string;
|
|
121
|
+
description?: string;
|
|
122
|
+
author?: string;
|
|
123
|
+
publishedAt?: string;
|
|
124
|
+
lang: string;
|
|
125
|
+
/** Extracted topic tags — from meta keywords and article:tag. Compact vocabulary for grouping. */
|
|
126
|
+
tags: string[];
|
|
127
|
+
wordCount: number;
|
|
128
|
+
readingTimeMinutes: number;
|
|
129
|
+
/** How many RAG chunks a full view would produce. */
|
|
130
|
+
chunkCount: number;
|
|
131
|
+
/** Heading outline as flat markdown strings, e.g. "## Section Name". */
|
|
132
|
+
headings: string[];
|
|
133
|
+
/** Outbound links — href + anchor text only. */
|
|
134
|
+
links: LeanLink[];
|
|
135
|
+
/** True when the page appears JS-rendered — metadata may be partial. */
|
|
136
|
+
jsRendered?: boolean;
|
|
137
|
+
/**
|
|
138
|
+
* Number of other spidered pages that link to this page.
|
|
139
|
+
* Populated when a PageGraph is passed to toLean(). Omitted otherwise.
|
|
140
|
+
* Higher = more authoritative within the crawled corpus.
|
|
141
|
+
*/
|
|
142
|
+
inboundCount?: number;
|
|
143
|
+
}
|
|
144
|
+
/**
|
|
145
|
+
* A fully spidered page.
|
|
146
|
+
*
|
|
147
|
+
* Follows the Local Materialized View rule: every field is a named,
|
|
148
|
+
* independently readable value — never a serialized blob. Agents read
|
|
149
|
+
* individual fields; RAG embeds individual chunks; graph walkers follow
|
|
150
|
+
* individual links.
|
|
151
|
+
*/
|
|
152
|
+
export interface SpideredPage {
|
|
153
|
+
url: string;
|
|
154
|
+
domain: string;
|
|
155
|
+
fetchedAt: string;
|
|
156
|
+
/** Canonical URL when it differs from the fetched URL (og:url / link[rel=canonical]). */
|
|
157
|
+
canonicalUrl?: string;
|
|
158
|
+
title: string;
|
|
159
|
+
description: string;
|
|
160
|
+
author: string;
|
|
161
|
+
publishedAt: string;
|
|
162
|
+
lang: string;
|
|
163
|
+
/** Extracted topic tags — from meta keywords and article:tag. */
|
|
164
|
+
tags: string[];
|
|
165
|
+
wordCount: number;
|
|
166
|
+
readingTimeMinutes: number;
|
|
167
|
+
/** Heading outline — h1/h2/h3 only */
|
|
168
|
+
headings: Array<{
|
|
169
|
+
level: 1 | 2 | 3;
|
|
170
|
+
text: string;
|
|
171
|
+
}>;
|
|
172
|
+
/** RAG-ready chunks */
|
|
173
|
+
chunks: Chunk[];
|
|
174
|
+
/** Outbound links from this page */
|
|
175
|
+
links: Link[];
|
|
176
|
+
/**
|
|
177
|
+
* Images scraped from the article content.
|
|
178
|
+
* Only populated when spider() is called with captureImages: true.
|
|
179
|
+
*/
|
|
180
|
+
images?: ImageRef[];
|
|
181
|
+
markdown: string;
|
|
182
|
+
/**
|
|
183
|
+
* True when the page appears to be JavaScript-rendered (Readability
|
|
184
|
+
* found no content). metadata and links are still populated where
|
|
185
|
+
* possible; chunks and markdown are empty.
|
|
186
|
+
*/
|
|
187
|
+
jsRendered?: boolean;
|
|
188
|
+
}
|
|
189
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,iDAAiD;AACjD,MAAM,MAAM,QAAQ,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,CAAC;AAMhD;;;;;;;;;;;;;;GAcG;AACH,MAAM,WAAW,OAAO;IACvB,kCAAkC;IAClC,GAAG,EAAE,MAAM,CAAC;IACZ,yFAAyF;IACzF,IAAI,EAAE,MAAM,CAAC;IACb;;;;OAIG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC/B,8DAA8D;IAC9D,QAAQ,CAAC,EAAE,OAAO,EAAE,CAAC;CACrB;AAED,+EAA+E;AAC/E,MAAM,WAAW,OAAO;IACvB,6CAA6C;IAC7C,IAAI,EAAE,MAAM,CAAC;IACb,2CAA2C;IAC3C,KAAK,EAAE,MAAM,CAAC;IACd,kEAAkE;IAClE,IAAI,EAAE,OAAO,CAAC;IACd,wDAAwD;IACxD,OAAO,EAAE,MAAM,CAAC;CAChB;AAED,4EAA4E;AAC5E,MAAM,MAAM,SAAS,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,MAAM,GAAG,YAAY,CAAC;AAE1E,yEAAyE;AACzE,MAAM,WAAW,KAAK;IACrB,8CAA8C;IAC9C,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,qDAAqD;IACrD,OAAO,EAAE,MAAM,CAAC;IAChB,0BAA0B;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,EAAE,MAAM,CAAC;IAClB,mFAAmF;IACnF,WAAW,EAAE,SAAS,CAAC;CACvB;AAED;;;;;;;;;;GAUG;AACH,MAAM,WAAW,QAAQ;IACxB,8CAA8C;IAC9C,GAAG,EAAE,MAAM,CAAC;IACZ,4EAA4E;IAC5E,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,gFAAgF;IAChF,QAAQ,EAAE,MAAM,CAAC;IACjB,6DAA6D;IAC7D,GAAG,EAAE,MAAM,CAAC;IACZ,yEAAyE;IACzE,QAAQ,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,0DAA0D;AAC1D,MAAM,WAAW,IAAI;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,OAAO,CAAC;IACpB;;;;OAIG;IACH,GAAG,EAAE,MAAM,GAAG,KAAK,CAAC;CACpB;AAED;;;GAGG;AACH,MAAM,WAAW,QAAQ;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACb;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,QAAQ;IACxB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAGtB,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,yFAAyF;IACzF,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,EAAE,MAAM,CAAC;IACb,kGAAkG;IAClG,IAAI,EAAE,MAAM,EAAE,CAAC;IAGf,SAAS,EAAE,MAAM,CAAC;IAClB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,qDAAqD;IACrD,UAAU,EAAE,MAAM,CAAC;IAGnB,wEAAwE;IACxE,QAAQ,EAAE,MAAM,EAAE,CAAC;IAGnB,gDAAgD;IAChD,KAAK,EAAE,Q
AAQ,EAAE,CAAC;IAElB,wEAAwE;IACxE,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;CACtB;AAID;;;;;;;GAOG;AACH,MAAM,WAAW,YAAY;IAE5B,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,yFAAyF;IACzF,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,iEAAiE;IACjE,IAAI,EAAE,MAAM,EAAE,CAAC;IAGf,SAAS,EAAE,MAAM,CAAC;IAClB,kBAAkB,EAAE,MAAM,CAAC;IAG3B,sCAAsC;IACtC,QAAQ,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACpD,uBAAuB;IACvB,MAAM,EAAE,KAAK,EAAE,CAAC;IAGhB,oCAAoC;IACpC,KAAK,EAAE,IAAI,EAAE,CAAC;IAGd;;;OAGG;IACH,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC;IAGpB,QAAQ,EAAE,MAAM,CAAC;IAEjB;;;;OAIG;IACH,UAAU,CAAC,EAAE,OAAO,CAAC;CACrB"}
|
package/dist/types.js
ADDED
package/dist/views.d.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* View transformations — business logic that converts a SpideredPage into
|
|
3
|
+
* one of the available view shapes. Separated from types.ts, which is pure
|
|
4
|
+
* data-shape definitions.
|
|
5
|
+
*/
|
|
6
|
+
import type { PageGraph } from "./graph.js";
|
|
7
|
+
import type { LeanPage, SpideredPage } from "./types.js";
|
|
8
|
+
/**
|
|
9
|
+
* Downgrade a full SpideredPage to a LeanPage.
|
|
10
|
+
*
|
|
11
|
+
* Pass a PageGraph as the second argument to populate `inboundCount` —
|
|
12
|
+
* the number of other spidered pages that link to this one. Agents can
|
|
13
|
+
* use this as a lightweight authority signal when ranking results from
|
|
14
|
+
* a crawl without running a full PageRank pass.
*
* @param page - The full SpideredPage to downgrade.
* @param graph - Optional PageGraph; when omitted, `inboundCount` is omitted from the result.
* @returns The LeanPage view of `page`.
|
|
15
|
+
*/
|
|
16
|
+
export declare function toLean(page: SpideredPage, graph?: PageGraph): LeanPage;
|
|
17
|
+
//# sourceMappingURL=views.d.ts.map
|