docshark 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +4 -0
- package/dist/cli.js +77 -18
- package/dist/scraper/discoverer.d.ts +6 -1
- package/dist/scraper/discoverer.js +358 -9
- package/dist/server.js +35 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
package/dist/cli.js
CHANGED
|
@@ -11,10 +11,11 @@ const program = new Command()
|
|
|
11
11
|
.version(VERSION, "-v, --version", "output the current version");
|
|
12
12
|
program
|
|
13
13
|
.command("start", { isDefault: true })
|
|
14
|
-
.
|
|
14
|
+
.alias("s")
|
|
15
|
+
.description("Start the MCP server (aliases: s, -s)")
|
|
15
16
|
.option("-p, --port <port>", "HTTP server port", "6380")
|
|
16
|
-
.option("--stdio", "Run in STDIO mode (for Claude Desktop, Cursor, etc.)")
|
|
17
|
-
.option("--data-dir <path>", "Data directory", "")
|
|
17
|
+
.option("-S, --stdio", "Run in STDIO mode (for Claude Desktop, Cursor, etc.)")
|
|
18
|
+
.option("-D, --data-dir <path>", "Data directory", "")
|
|
18
19
|
.action(async (opts) => {
|
|
19
20
|
if (opts.dataDir) {
|
|
20
21
|
process.env.DOCSHARK_DATA_DIR = opts.dataDir;
|
|
@@ -31,10 +32,11 @@ program
|
|
|
31
32
|
});
|
|
32
33
|
program
|
|
33
34
|
.command("add <url>")
|
|
34
|
-
.
|
|
35
|
+
.alias("a")
|
|
36
|
+
.description("Add a documentation library and start crawling (aliases: a, -a)")
|
|
35
37
|
.option("-n, --name <name>", "Library name (auto-generated from URL if omitted)")
|
|
36
38
|
.option("-d, --depth <n>", "Max crawl depth", "3")
|
|
37
|
-
.option("--lib-version <version>", "Library version")
|
|
39
|
+
.option("-V, --lib-version <version>", "Library version")
|
|
38
40
|
.action(async (url, opts) => {
|
|
39
41
|
db.init();
|
|
40
42
|
try {
|
|
@@ -57,9 +59,10 @@ program
|
|
|
57
59
|
});
|
|
58
60
|
program
|
|
59
61
|
.command("search <query>")
|
|
60
|
-
.
|
|
62
|
+
.alias("f")
|
|
63
|
+
.description("Search indexed documentation (aliases: f, -f)")
|
|
61
64
|
.option("-l, --library <name>", "Filter by library")
|
|
62
|
-
.option("--limit <n>", "Max results", "5")
|
|
65
|
+
.option("-m, --limit <n>", "Max results", "5")
|
|
63
66
|
.action(async (query, opts) => {
|
|
64
67
|
db.init();
|
|
65
68
|
const results = searchEngine.search(query, {
|
|
@@ -79,10 +82,12 @@ program
|
|
|
79
82
|
});
|
|
80
83
|
program
|
|
81
84
|
.command("list")
|
|
82
|
-
.
|
|
83
|
-
.
|
|
85
|
+
.alias("l")
|
|
86
|
+
.description("List indexed libraries (aliases: l, -l)")
|
|
87
|
+
.option("-s, --status <status>", "Filter by status (indexed, crawling, error, all)", "all")
|
|
88
|
+
.action((opts) => {
|
|
84
89
|
db.init();
|
|
85
|
-
const libs = db.listLibraries();
|
|
90
|
+
const libs = db.listLibraries(opts.status);
|
|
86
91
|
if (libs.length === 0) {
|
|
87
92
|
console.log('\nNo libraries indexed. Use "docshark add <url>" to add one.\n');
|
|
88
93
|
return;
|
|
@@ -98,7 +103,8 @@ program
|
|
|
98
103
|
});
|
|
99
104
|
program
|
|
100
105
|
.command("refresh <name>")
|
|
101
|
-
.
|
|
106
|
+
.alias("r")
|
|
107
|
+
.description("Refresh an existing documentation library (aliases: r, -r)")
|
|
102
108
|
.action(async (name) => {
|
|
103
109
|
db.init();
|
|
104
110
|
try {
|
|
@@ -118,7 +124,8 @@ program
|
|
|
118
124
|
});
|
|
119
125
|
program
|
|
120
126
|
.command("remove <name>")
|
|
121
|
-
.
|
|
127
|
+
.alias("rm")
|
|
128
|
+
.description("Remove a documentation library and its index (aliases: rm, -rm)")
|
|
122
129
|
.action((name) => {
|
|
123
130
|
db.init();
|
|
124
131
|
try {
|
|
@@ -134,13 +141,20 @@ program
|
|
|
134
141
|
}
|
|
135
142
|
});
|
|
136
143
|
program
|
|
137
|
-
.command("get
|
|
138
|
-
.
|
|
139
|
-
.
|
|
144
|
+
.command("get [url]")
|
|
145
|
+
.alias("g")
|
|
146
|
+
.description("Get the full markdown content of a specific indexed page (aliases: g, -g)")
|
|
147
|
+
.option("-l, --library <name>", "Library name to search within")
|
|
148
|
+
.option("-p, --path <path>", "Relative path within the library")
|
|
149
|
+
.action((url, opts) => {
|
|
150
|
+
if (!url && (!opts.library || !opts.path)) {
|
|
151
|
+
console.error(`\n❌ Please provide either a URL, or both --library and --path\n`);
|
|
152
|
+
process.exit(1);
|
|
153
|
+
}
|
|
140
154
|
db.init();
|
|
141
|
-
const page = db.getPage({ url });
|
|
155
|
+
const page = db.getPage({ url, library: opts.library, path: opts.path });
|
|
142
156
|
if (!page) {
|
|
143
|
-
console.error(`\n❌ Page not found in index
|
|
157
|
+
console.error(`\n❌ Page not found in index.\n`);
|
|
144
158
|
process.exit(1);
|
|
145
159
|
}
|
|
146
160
|
console.log(`\n--- ${page.title} ---`);
|
|
@@ -148,7 +162,52 @@ program
|
|
|
148
162
|
console.log(page.content_markdown);
|
|
149
163
|
console.log("\n");
|
|
150
164
|
});
|
|
151
|
-
|
|
165
|
+
// Intercept manual short flags (e.g., -l instead of l) so they act as command aliases.
// The bare aliases ("l", "rm", ...) are registered on each command via .alias(); the
// dash-prefixed spellings are rewritten here, before the arguments are parsed.
const cmdAliases = Object.freeze({
    "-s": "start",
    "-a": "add",
    "-f": "search",
    "-l": "list",
    "-r": "refresh",
    "-rm": "remove",
    "-g": "get",
    "-i": "info",
});
/**
 * Return a copy of `argv` in which the first user-supplied argument (index 2)
 * is replaced by the command it aliases, when it is one of the dash-prefixed
 * keys of `cmdAliases`. Later arguments are never rewritten, so option flags
 * such as `list -s indexed` keep their meaning.
 *
 * @param {string[]} argv - Argument vector in `process.argv` layout.
 * @returns {string[]} A new array; `argv` itself is not modified.
 */
function resolveCommandAlias(argv) {
    const resolved = [...argv];
    const first = resolved[2];
    // Own-property check: a plain lookup would also match inherited keys such as
    // "toString" or "constructor" and substitute a function for the argument.
    if (typeof first === "string" && Object.hasOwn(cmdAliases, first)) {
        resolved[2] = cmdAliases[first];
    }
    return resolved;
}
// Work on a copy so process.argv keeps what the user actually typed.
const args = resolveCommandAlias(process.argv);
|
|
180
|
+
// `info <name>` (alias `i`): print a library's stored metadata, then a table of
// the pages recorded for it. Exits with status 1 when the library is unknown.
program
    .command("info <name>")
    .alias("i")
    .description("Get information about a library and list its pages (aliases: i, -i)")
    .action((libraryName) => {
        db.init();
        const library = db.getLibraryByName(libraryName);
        if (!library) {
            console.error(`\n❌ Library not found: ${libraryName}\n`);
            process.exit(1);
        }
        const summaryLines = [
            `\n--- Library: ${library.display_name} (${library.name}) ---`,
            `URL: ${library.url}`,
            `Status: ${library.status}`,
            `Pages: ${library.page_count}`,
            `Chunks: ${library.chunk_count}`,
            `Last Crawled: ${library.last_crawled_at || "never"}`,
        ];
        for (const line of summaryLines) {
            console.log(line);
        }
        const pages = db.getPagesByLibrary(library.id);
        if (pages.length === 0) {
            console.log(`\nNo pages found for this library.\n`);
            return;
        }
        console.log(`\n--- Pages (${pages.length}) ---`);
        const rows = pages.map((page) => ({
            Title: page.title || "Untitled",
            Path: page.path,
            URL: page.url,
        }));
        console.table(rows);
    });
|
|
210
|
+
program.parse(args);
|
|
152
211
|
/** Helper to wait for a crawl job to finish (CLI blocking mode) */
|
|
153
212
|
async function waitForCrawl(jobId) {
|
|
154
213
|
const { jobManager } = await import("./server.js");
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
import type { CrawlConfig } from '../types.js';
|
|
2
2
|
/**
|
|
3
3
|
* Discover all documentation page URLs from a base URL.
|
|
4
|
-
*
|
|
4
|
+
*
|
|
5
|
+
* Strategy cascade (stops at first strategy that yields >=1 URLs):
|
|
6
|
+
* A. sitemap.xml
|
|
7
|
+
* B. llms.txt (AI-friendly link manifest)
|
|
8
|
+
* C. Navigation-aware HTML link extraction (nav/sidebar elements)
|
|
9
|
+
* D. BFS link crawl (follows all same-origin links)
|
|
5
10
|
*/
|
|
6
11
|
export declare function discoverPages(baseUrl: string, config?: CrawlConfig): Promise<string[]>;
|
|
@@ -1,26 +1,76 @@
|
|
|
1
|
-
// src/scraper/discoverer.ts — Page URL discovery via sitemap +
|
|
1
|
+
// src/scraper/discoverer.ts — Page URL discovery via sitemap + llms.txt + nav-aware crawl + BFS fallback
|
|
2
2
|
import * as cheerio from 'cheerio';
|
|
3
3
|
import { getRobotsParser, isAllowed } from './robots.js';
|
|
4
4
|
import { RateLimiter } from './rate-limiter.js';
|
|
5
5
|
const USER_AGENT = 'DocShark/1.0';
|
|
6
|
+
/**
 * Conventional documentation entry paths, listed in probe order.
 * They are tried when the root page exposes too few links (typically a
 * JS-rendered SPA landing page), in the hope that one of them is a
 * server-rendered doc page that carries the site navigation.
 */
const COMMON_DOC_ENTRY_PATHS = [
    // Top-level sections
    ...['/docs', '/docs/', '/documentation', '/guide', '/guides', '/reference', '/api', '/getting-started'],
    // Typical first pages underneath /docs
    ...['getting-started', 'introduction', 'installation', 'overview'].map((page) => `/docs/${page}`),
];
|
|
6
25
|
/**
 * Discover all documentation page URLs from a base URL.
 *
 * Strategy cascade (stops at the first strategy that yields >=1 URLs
 * once the results have been filtered):
 *   A. sitemap.xml
 *   B. llms.txt (AI-friendly link manifest)
 *   C. Navigation-aware HTML link extraction (nav/sidebar elements),
 *      enriched by a BFS crawl seeded with the navigation links
 *   D. BFS link crawl (follows all same-origin links)
 *
 * @param {string} baseUrl - Root URL of the documentation site.
 * @param {object} [config] - Crawl configuration. `maxDepth` (default 3) is
 *   read here; the whole object is forwarded to the individual strategies.
 * @returns {Promise<string[]>} The discovered page URLs.
 */
export async function discoverPages(baseUrl, config = {}) {
    const maxDepth = config.maxDepth ?? 3;
    const robots = await getRobotsParser(baseUrl);
    // ────────────────────────────────────────────
    // Strategy A: Try sitemap first
    // ────────────────────────────────────────────
    const sitemapUrls = await discoverFromSitemap(baseUrl, robots);
    if (sitemapUrls.length > 0) {
        // Decide on the filtered list: a sitemap whose entries are all rejected
        // by filterUrls must not end discovery with an empty result.
        const usableSitemapUrls = filterUrls(sitemapUrls, baseUrl, config, robots);
        if (usableSitemapUrls.length > 0) {
            console.log(`[DocShark] ✅ Found ${sitemapUrls.length} URLs from sitemap`);
            return usableSitemapUrls;
        }
        console.log(`[DocShark] Sitemap listed ${sitemapUrls.length} URLs but none passed filtering. Trying next strategy...`);
    }
    // ────────────────────────────────────────────
    // Strategy B: Try llms.txt / llms-full.txt
    // ────────────────────────────────────────────
    const llmsUrls = await discoverFromLlmsTxt(baseUrl);
    if (llmsUrls.length > 0) {
        const usableLlmsUrls = filterUrls(llmsUrls, baseUrl, config, robots);
        if (usableLlmsUrls.length > 0) {
            console.log(`[DocShark] ✅ Found ${llmsUrls.length} URLs from llms.txt`);
            return usableLlmsUrls;
        }
        console.log(`[DocShark] llms.txt listed ${llmsUrls.length} URLs but none passed filtering. Trying next strategy...`);
    }
    // ────────────────────────────────────────────
    // Strategy C: Navigation-aware link extraction
    // ────────────────────────────────────────────
    console.log(`[DocShark] No sitemap or llms.txt. Trying navigation-aware discovery...`);
    const navUrls = await discoverFromNavigation(baseUrl, config, robots);
    if (navUrls.length > 0) {
        console.log(`[DocShark] ✅ Found ${navUrls.length} URLs from page navigation`);
        // Enrich: BFS crawl from discovered nav URLs to find nested pages
        return enrichWithBfsCrawl(baseUrl, navUrls, maxDepth, config, robots);
    }
    // ────────────────────────────────────────────
    // Strategy D: Full BFS link crawl (legacy fallback)
    // ────────────────────────────────────────────
    console.log(`[DocShark] No navigation links found, full BFS crawl (depth=${maxDepth})`);
    return discoverByLinkCrawl(baseUrl, maxDepth, config, robots);
}
|
|
71
|
+
// ═══════════════════════════════════════════════
|
|
72
|
+
// Strategy A: Sitemap
|
|
73
|
+
// ═══════════════════════════════════════════════
|
|
24
74
|
/** Parse sitemap.xml for page URLs */
|
|
25
75
|
async function discoverFromSitemap(baseUrl, robots) {
|
|
26
76
|
// Check for sitemap in robots.txt
|
|
@@ -82,13 +132,308 @@ async function fetchSitemapUrls(sitemapUrl) {
|
|
|
82
132
|
return [];
|
|
83
133
|
}
|
|
84
134
|
}
|
|
135
|
+
// ═══════════════════════════════════════════════
|
|
136
|
+
// Strategy B: llms.txt
|
|
137
|
+
// ═══════════════════════════════════════════════
|
|
138
|
+
/**
 * Parse llms.txt / llms-full.txt for documentation URLs.
 * The llms.txt standard uses markdown-style `[title](url)` links.
 * `/llms-full.txt` is tried first, then `/llms.txt`; the first manifest that
 * yields at least one same-origin link wins.
 * @see https://llmstxt.org
 *
 * @param {string} baseUrl - Site URL; both manifests are resolved against its origin root.
 * @returns {Promise<string[]>} De-duplicated same-origin page URLs (hash, query
 *   string and a trailing `.md` removed), or `[]` when no manifest is usable.
 */
async function discoverFromLlmsTxt(baseUrl) {
    const baseOrigin = new URL(baseUrl).origin;
    const candidates = ['/llms-full.txt', '/llms.txt'].map((path) => new URL(path, baseUrl).href);
    for (const llmsUrl of candidates) {
        let text;
        try {
            const response = await fetch(llmsUrl, {
                headers: { 'User-Agent': USER_AGENT },
                signal: AbortSignal.timeout(15_000),
            });
            if (!response.ok)
                continue;
            // Sites with a catch-all route answer unknown paths with their HTML
            // shell and status 200; that is not a link manifest.
            const contentType = response.headers.get('content-type') || '';
            if (contentType.includes('text/html'))
                continue;
            text = await response.text();
        }
        catch {
            // Best-effort probe: fetch failed or timed out, try next candidate
            continue;
        }
        const urls = parseLlmsTxtLinks(text, baseUrl, baseOrigin);
        if (urls.length > 0) {
            return urls;
        }
    }
    return [];
}
/**
 * Collect the same-origin targets of every markdown `[text](url)` link in `text`.
 *
 * @param {string} text - Manifest body.
 * @param {string} baseUrl - Base used to resolve relative link targets.
 * @param {string} baseOrigin - Origin a link must have to be kept.
 * @returns {string[]} Normalized URLs in first-seen order, without duplicates.
 */
function parseLlmsTxtLinks(text, baseUrl, baseOrigin) {
    const found = new Set();
    // Extract markdown-style links: [text](url)
    for (const match of text.matchAll(/\[([^\]]*)\]\(([^)]+)\)/g)) {
        // A link may carry a title — [text](url "title") — so keep only the
        // destination, i.e. everything up to the first whitespace.
        const [href] = match[2].trim().split(/\s+/);
        if (!href)
            continue;
        let resolved;
        try {
            resolved = new URL(href, baseUrl);
        }
        catch {
            // Invalid URL, skip
            continue;
        }
        if (resolved.origin !== baseOrigin)
            continue;
        // Strip .md extension — llms.txt often uses .md paths
        // but the actual page URL doesn't have .md
        if (resolved.pathname.endsWith('.md')) {
            resolved.pathname = resolved.pathname.slice(0, -3);
        }
        resolved.hash = '';
        resolved.search = '';
        found.add(resolved.href);
    }
    return [...found];
}
|
|
194
|
+
// ═══════════════════════════════════════════════
|
|
195
|
+
// Strategy C: Navigation-aware link extraction
|
|
196
|
+
// ═══════════════════════════════════════════════
|
|
197
|
+
/**
 * CSS selectors matching anchors inside navigation-like containers
 * (nav landmarks, sidebars, menus, tables of contents) as found in common
 * doc site frameworks. Each container selector is paired with `a[href]`.
 */
const NAV_SELECTORS = [
    'nav',
    '[role="navigation"]',
    'aside',
    '.sidebar',
    '[class*="sidebar"]',
    '[class*="nav"]',
    '[class*="menu"]',
    '[class*="toc"]',
    '[data-sidebar]',
    '[id*="sidebar"]',
    '[id*="nav"]',
].map((container) => `${container} a[href]`);
|
|
214
|
+
/**
 * Collect page links from the navigation elements (sidebar, nav, etc.) of a
 * doc site.
 *
 * The base URL is inspected first. When it exposes fewer than 3 nav links,
 * the common doc entry paths are probed and the richest result is kept;
 * probing stops once 10 or more links have been found. If the count is still
 * below 3, the base URL is rendered in a headless browser as a last attempt.
 * The winning list is passed through `filterUrls` before being returned.
 */
async function discoverFromNavigation(baseUrl, config, robots) {
    const SPARSE_BELOW = 3; // fewer links than this: keep looking
    const RICH_FROM = 10; // this many links: stop probing entry paths
    const { origin } = new URL(baseUrl);
    // Step 1: the base URL itself
    let best = await extractNavLinks(baseUrl, origin);
    // Step 2: well-known doc entry paths (root is probably a JS-rendered landing page)
    if (best.length < SPARSE_BELOW) {
        console.log(`[DocShark] Root page has only ${best.length} nav links. Probing doc entry points...`);
        for (const candidatePath of COMMON_DOC_ENTRY_PATHS) {
            const candidateUrl = new URL(candidatePath, baseUrl).href;
            // Paths disallowed by robots are never fetched
            if (isAllowed(robots, candidateUrl)) {
                const candidateLinks = await extractNavLinks(candidateUrl, origin);
                if (candidateLinks.length > best.length) {
                    console.log(`[DocShark] Found ${candidateLinks.length} nav links at ${candidatePath}`);
                    best = candidateLinks;
                }
                if (best.length >= RICH_FROM) {
                    break;
                }
            }
        }
    }
    // Step 3: headless-browser rendering of the root when static HTML was not enough
    if (best.length < SPARSE_BELOW) {
        console.log(`[DocShark] Static fetch yielded few links. Trying headless browser...`);
        const renderedLinks = await extractNavLinksWithPuppeteer(baseUrl, origin);
        if (renderedLinks.length > best.length) {
            best = renderedLinks;
        }
    }
    return filterUrls(best, baseUrl, config, robots);
}
|
|
252
|
+
/**
 * Download `url` and return the links found in its navigation elements.
 * Yields an empty list for non-OK responses, non-HTML content types and any
 * error raised while fetching or parsing.
 */
async function extractNavLinks(url, baseOrigin) {
    try {
        const res = await fetch(url, {
            headers: { 'User-Agent': USER_AGENT },
            signal: AbortSignal.timeout(15_000),
            redirect: 'follow',
        });
        // The content type is only inspected for successful responses
        const isHtmlPage = res.ok && (res.headers.get('content-type') || '').includes('text/html');
        if (!isHtmlPage) {
            return [];
        }
        const markup = await res.text();
        return extractLinksFromHtml(markup, url, baseOrigin, true);
    }
    catch {
        return [];
    }
}
|
|
275
|
+
/**
 * Pull same-origin links out of an HTML document.
 *
 * Each `href` is resolved against `pageUrl`, its hash and query string are
 * dropped, and it is kept only when its origin equals `baseOrigin` and
 * `isNonDocUrl` does not reject it. Duplicates are removed.
 *
 * @param navOnly - If true, only anchors inside nav-like elements
 *                  (`NAV_SELECTORS`) are considered; otherwise every `a[href]`.
 */
function extractLinksFromHtml(html, pageUrl, baseOrigin, navOnly) {
    const $ = cheerio.load(html);
    const anchorSelector = navOnly ? NAV_SELECTORS.join(', ') : 'a[href]';
    const unique = new Set();
    for (const anchor of $(anchorSelector).toArray()) {
        try {
            const href = $(anchor).attr('href');
            if (href) {
                const target = new URL(href, pageUrl);
                target.hash = '';
                target.search = '';
                const sameOrigin = target.origin === baseOrigin;
                if (sameOrigin && !isNonDocUrl(target.href)) {
                    unique.add(target.href);
                }
            }
        }
        catch {
            // Unparseable href: ignore this anchor and keep going
        }
    }
    return Array.from(unique);
}
|
|
304
|
+
/**
 * Render `url` in headless Chrome via puppeteer-core and extract the
 * navigation links from the resulting DOM.
 * Returns an empty list (after logging a warning) when no Chrome executable
 * is found, when puppeteer-core cannot be imported, or when rendering fails.
 */
async function extractNavLinksWithPuppeteer(url, baseOrigin) {
    // Resource types that are not needed to build the DOM
    const skippedResourceTypes = new Set(['image', 'stylesheet', 'font', 'media']);
    try {
        // @ts-ignore — puppeteer-core is an optional dependency
        const puppeteer = await import('puppeteer-core');
        const { existsSync } = await import('fs');
        const executablePath = findChrome(existsSync);
        if (!executablePath) {
            console.warn(`[DocShark] Chrome not found for headless navigation discovery. Install Chrome or set CHROME_PATH env var.`);
            return [];
        }
        const launchOptions = {
            headless: true,
            executablePath,
            args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
        };
        const browser = await puppeteer.default.launch(launchOptions);
        try {
            const page = await browser.newPage();
            // Abort heavy resources so the page settles faster
            await page.setRequestInterception(true);
            page.on('request', (request) => {
                if (skippedResourceTypes.has(request.resourceType())) {
                    request.abort();
                    return;
                }
                request.continue();
            });
            await page.goto(url, { waitUntil: 'networkidle2', timeout: 30_000 });
            const renderedHtml = await page.content();
            await page.close();
            return extractLinksFromHtml(renderedHtml, url, baseOrigin, true);
        }
        finally {
            // The browser is shut down whether or not rendering succeeded
            await browser.close();
        }
    }
    catch (err) {
        console.warn(`[DocShark] Puppeteer navigation discovery failed: ${err.message}`);
        return [];
    }
}
|
|
351
|
+
/**
 * Locate a Chrome/Chromium executable for headless rendering.
 *
 * Candidates are checked in order and the first existing one is returned:
 * the `CHROME_PATH` and `PUPPETEER_EXECUTABLE_PATH` environment variables,
 * common Linux locations, the macOS application bundles, and the standard
 * Windows install directories (built from `PROGRAMFILES`,
 * `PROGRAMFILES(X86)` and `LOCALAPPDATA` when those variables are set).
 *
 * @param {(path: string) => boolean} existsSync - Predicate telling whether a path exists.
 * @returns {string | undefined} The first existing candidate, or `undefined` if none exists.
 */
function findChrome(existsSync) {
    // Only roots that are actually defined contribute a Windows candidate.
    const windowsRoots = [
        process.env.PROGRAMFILES,
        process.env['PROGRAMFILES(X86)'],
        process.env.LOCALAPPDATA,
    ].filter(Boolean);
    const candidates = [
        process.env.CHROME_PATH,
        process.env.PUPPETEER_EXECUTABLE_PATH,
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        '/usr/bin/chromium-browser',
        '/usr/bin/chromium',
        '/snap/bin/chromium',
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        ...windowsRoots.map((root) => `${root}\\Google\\Chrome\\Application\\chrome.exe`),
    ];
    // Unset environment variables show up as undefined entries and are skipped.
    return candidates.find((candidate) => Boolean(candidate) && existsSync(candidate));
}
|
|
368
|
+
// ═══════════════════════════════════════════════
|
|
369
|
+
// Strategy D: BFS Link Crawl
|
|
370
|
+
// ═══════════════════════════════════════════════
|
|
371
|
+
/**
 * Grow a seed list of URLs by crawling breadth-first from each seed and
 * collecting further same-origin links, so that nested pages missing from
 * the top-level navigation are picked up as well.
 *
 * Seeds count as depth 1. A page is fetched only while its depth does not
 * exceed `maxDepth` and robots rules allow it; every link found on a fetched
 * page is recorded. Requests are spaced by a `RateLimiter` built from
 * `config.rateLimit` (default 500). The full set of known URLs is passed
 * through `filterUrls` before being returned.
 */
async function enrichWithBfsCrawl(baseUrl, seedUrls, maxDepth, config, robots) {
    const known = new Set(seedUrls);
    const frontier = seedUrls.map((url) => ({ url, depth: 1 }));
    const limiter = new RateLimiter(config.rateLimit ?? 500);
    const { origin } = new URL(baseUrl);
    // The frontier grows while it is being walked; a cursor replaces repeated shift()
    for (let cursor = 0; cursor < frontier.length; cursor += 1) {
        const { url, depth } = frontier[cursor];
        if (depth > maxDepth || !isAllowed(robots, url)) {
            continue;
        }
        // The URL itself is already recorded; the fetch is only for finding *new* links,
        // e.g. /docs/data-table might link to /docs/data-table/sorting
        try {
            await limiter.wait();
            const res = await fetch(url, {
                headers: { 'User-Agent': USER_AGENT },
                signal: AbortSignal.timeout(15_000),
            });
            const isHtmlPage = res.ok && (res.headers.get('content-type') || '').includes('text/html');
            if (!isHtmlPage) {
                continue;
            }
            const markup = await res.text();
            // Enrichment looks at every anchor on the page, not just navigation elements
            for (const link of extractLinksFromHtml(markup, url, origin, false)) {
                if (known.has(link) || isNonDocUrl(link)) {
                    continue;
                }
                known.add(link);
                frontier.push({ url: link, depth: depth + 1 });
            }
        }
        catch {
            // A failed fetch only means this page contributes no new links
        }
    }
    return filterUrls([...known], baseUrl, config, robots);
}
|
|
85
420
|
/** BFS link crawl from the base URL */
|
|
86
421
|
async function discoverByLinkCrawl(baseUrl, maxDepth, config, robots) {
|
|
87
422
|
const visited = new Set();
|
|
88
|
-
const queue = [{ url: baseUrl, depth: 0 }];
|
|
89
|
-
const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
|
|
90
423
|
const baseOrigin = new URL(baseUrl).origin;
|
|
91
424
|
const basePath = new URL(baseUrl).pathname;
|
|
425
|
+
// Seed queue: start with base URL + common doc entry points
|
|
426
|
+
const queue = [{ url: baseUrl, depth: 0 }];
|
|
427
|
+
// Also seed common doc entry points if base is root
|
|
428
|
+
if (basePath === '/' || basePath === '') {
|
|
429
|
+
for (const entryPath of COMMON_DOC_ENTRY_PATHS) {
|
|
430
|
+
const entryUrl = new URL(entryPath, baseUrl).href;
|
|
431
|
+
if (isAllowed(robots, entryUrl)) {
|
|
432
|
+
queue.push({ url: entryUrl, depth: 0 });
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
|
|
92
437
|
while (queue.length > 0) {
|
|
93
438
|
const item = queue.shift();
|
|
94
439
|
if (visited.has(item.url) || item.depth > maxDepth)
|
|
@@ -138,6 +483,9 @@ async function discoverByLinkCrawl(baseUrl, maxDepth, config, robots) {
|
|
|
138
483
|
}
|
|
139
484
|
return filterUrls([...visited], baseUrl, config, robots);
|
|
140
485
|
}
|
|
486
|
+
// ═══════════════════════════════════════════════
|
|
487
|
+
// Shared Utilities
|
|
488
|
+
// ═══════════════════════════════════════════════
|
|
141
489
|
/** Filter URLs based on config patterns */
|
|
142
490
|
function filterUrls(urls, baseUrl, config, robots) {
|
|
143
491
|
const baseOrigin = new URL(baseUrl).origin;
|
|
@@ -145,10 +493,11 @@ function filterUrls(urls, baseUrl, config, robots) {
|
|
|
145
493
|
return urls.filter((url) => {
|
|
146
494
|
try {
|
|
147
495
|
const parsed = new URL(url);
|
|
148
|
-
// Must be same origin
|
|
496
|
+
// Must be same origin
|
|
149
497
|
if (parsed.origin !== baseOrigin)
|
|
150
498
|
return false;
|
|
151
|
-
|
|
499
|
+
// Must be under base path (unless base is root)
|
|
500
|
+
if (basePath !== '/' && !parsed.pathname.startsWith(basePath))
|
|
152
501
|
return false;
|
|
153
502
|
// Check robots.txt
|
|
154
503
|
if (!isAllowed(robots, url))
|
package/dist/server.js
CHANGED
|
@@ -152,3 +152,38 @@ server.tool({
|
|
|
152
152
|
db.removeLibrary(lib.id);
|
|
153
153
|
return tool.text(`🗑️ Library "${lib.display_name}" removed.\nDeleted ${lib.page_count} pages and ${lib.chunk_count} chunks.`);
|
|
154
154
|
});
|
|
155
|
+
// ──────────────────────────────────────
|
|
156
|
+
// Tool 7: library_info — detailed stats and pages
|
|
157
|
+
// ──────────────────────────────────────
|
|
158
|
+
server.tool({
    name: 'library_info',
    description: 'Get detailed information about a specific documentation library, including a list of all its indexed pages and their paths. Use this to see what pages are available in a library before retrieving them.',
    schema: v.object({
        library: v.pipe(v.string(), v.description('The library name to get information for.')),
    }),
}, async ({ library }) => {
    const lib = db.getLibraryByName(library);
    if (!lib) {
        return tool.text(`Library "${library}" not found. Use list_libraries to see available libraries.`);
    }
    const pages = db.getPagesByLibrary(lib.id);
    // Markdown report: a metadata bullet list, a blank line, then the page table
    const reportLines = [
        `## Library: ${lib.display_name} (${lib.name})`,
        `- **URL:** ${lib.url}`,
        `- **Status:** ${lib.status}`,
        `- **Pages:** ${lib.page_count}`,
        `- **Chunks:** ${lib.chunk_count}`,
        `- **Last Crawled:** ${lib.last_crawled_at || 'never'}`,
        '',
    ];
    if (pages.length > 0) {
        reportLines.push(`### Pages (${pages.length})`, '', '| Title | Path | URL |', '| ----- | ---- | --- |');
        for (const page of pages) {
            // A pipe inside the title would split the table cell
            const safeTitle = page.title?.replace(/\|/g, '-') || 'Untitled';
            reportLines.push(`| ${safeTitle} | \`${page.path}\` | ${page.url} |`);
        }
    }
    else {
        reportLines.push(`*No pages indexed yet for this library.*`);
    }
    // Every line, including the last, is newline-terminated
    return tool.text(reportLines.map((line) => `${line}\n`).join(''));
});
|
package/dist/version.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare const VERSION = "0.1.7";
|
|
1
|
+
export declare const VERSION = "0.1.8";
|
package/dist/version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
// This file is automatically updated by release-please
|
|
2
|
-
export const VERSION = '0.1.7'; // x-release-please-version
|
|
2
|
+
export const VERSION = '0.1.8'; // x-release-please-version
|