docshark 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +83 -30
- package/dist/api/router.js +77 -0
- package/dist/cli.d.ts +1 -1
- package/dist/cli.js +160 -164
- package/dist/http.js +84 -0
- package/dist/index.js +0 -1
- package/dist/jobs/events.js +15 -0
- package/dist/jobs/manager.js +49 -0
- package/dist/jobs/worker.js +120 -0
- package/dist/processor/chunker.js +79 -0
- package/dist/processor/extractor.js +81 -0
- package/dist/scraper/discoverer.js +206 -0
- package/dist/scraper/fetcher.js +129 -0
- package/dist/scraper/rate-limiter.js +18 -0
- package/dist/scraper/robots.js +26 -0
- package/dist/server.js +154 -0
- package/dist/services/library.js +66 -0
- package/dist/storage/db.js +228 -0
- package/dist/storage/search.js +49 -0
- package/dist/tools/add-library.js +35 -0
- package/dist/tools/get-doc-page.js +25 -0
- package/dist/tools/list-libraries.js +29 -0
- package/dist/tools/refresh-library.js +25 -0
- package/dist/tools/remove-library.js +25 -0
- package/dist/tools/search-docs.js +35 -0
- package/dist/types.js +2 -0
- package/dist/version.d.ts +1 -1
- package/dist/version.js +2 -0
- package/package.json +6 -2
package/dist/http.js
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
// src/http.ts — HTTP server (MCP + REST API + SSE + static dashboard)
|
|
2
|
+
import { serve } from 'srvx';
|
|
3
|
+
import { HttpTransport } from '@tmcp/transport-http';
|
|
4
|
+
import { SseTransport } from '@tmcp/transport-sse';
|
|
5
|
+
import { server, eventBus, db, searchEngine, jobManager, libraryService } from './server.js';
|
|
6
|
+
import { createApiRouter } from './api/router.js';
|
|
7
|
+
import { VERSION } from './version.js';
|
|
8
|
+
/**
 * Start the DocShark HTTP server on `port` and return whatever `serve()` returns.
 *
 * Requests are routed in this order:
 *   1. paths starting with `/mcp`  → MCP streamable-HTTP transport
 *   2. paths starting with `/sse`  → MCP SSE transport
 *   3. `/api/crawl-events`         → SSE stream of crawl events
 *   4. paths starting with `/api/` → REST API router
 *   5. anything else               → JSON description of the server
 * A transport whose `respond()` yields a falsy value lets the request fall
 * through to the rules below it.
 *
 * @param {number} port - TCP port handed to `serve()` and printed in the banner.
 */
export async function startHttpServer(port) {
    const mcpHttp = new HttpTransport(server, { path: '/mcp' });
    const mcpSse = new SseTransport(server, { path: '/sse' });
    const restApi = createApiRouter({ db, searchEngine, jobManager, libraryService, eventBus });
    const transports = [
        ['/mcp', mcpHttp],
        ['/sse', mcpSse],
    ];

    // Fallback response: a small JSON descriptor listing the available endpoints.
    const describeServer = () => {
        const info = {
            name: 'DocShark',
            version: VERSION,
            description: '🦈 Documentation MCP Server',
            endpoints: {
                mcp: '/mcp',
                sse: '/sse',
                api: '/api',
                crawlEvents: '/api/crawl-events',
            },
        };
        return new Response(JSON.stringify(info), { headers: { 'Content-Type': 'application/json' } });
    };

    const route = async (request) => {
        const { pathname } = new URL(request.url);
        for (const [prefix, transport] of transports) {
            if (!pathname.startsWith(prefix)) {
                continue;
            }
            const handled = await transport.respond(request);
            if (handled) {
                return handled;
            }
        }
        // The exact crawl-events path must be tested before the generic /api/ prefix.
        if (pathname === '/api/crawl-events') {
            return handleCrawlSSE(request);
        }
        if (pathname.startsWith('/api/')) {
            return restApi.handle(request);
        }
        return describeServer();
    };

    const httpServer = serve({ port, fetch: route });

    const base = `http://localhost:${port}`;
    const banner = [
        `\n🦈 DocShark running on ${base}`,
        ` MCP (HTTP): ${base}/mcp`,
        ` MCP (SSE): ${base}/sse`,
        ` REST API: ${base}/api`,
        ` Health: ${base}/api/health\n`,
    ];
    for (const line of banner) {
        console.log(line);
    }
    return httpServer;
}
|
|
57
|
+
/**
 * SSE handler for real-time crawl progress.
 *
 * Forwards every `crawl:progress`, `crawl:complete` and `crawl:error` payload
 * published on the shared event bus to the client as a `data: <json>` frame.
 *
 * The bus subscriptions are released exactly once, whichever happens first:
 * the request signal aborts, the consumer cancels the stream, or a write to
 * the stream fails. A failed write is never allowed to throw back into
 * `eventBus.emit()`, because the emitter is the crawl worker and would record
 * the exception as a crawl failure.
 *
 * @param {{ signal: AbortSignal }} request - Incoming request; only `signal` is read.
 * @returns {Response} A `text/event-stream` response backed by the live stream.
 */
function handleCrawlSSE(request) {
    const crawlEvents = ['crawl:progress', 'crawl:complete', 'crawl:error'];
    const encoder = new TextEncoder();
    // Replaced inside start() once there is something to unsubscribe.
    let unsubscribe = () => { };
    const stream = new ReadableStream({
        start(controller) {
            // An 'abort' listener added now would never fire, so never subscribe.
            if (request.signal.aborted) {
                controller.close();
                return;
            }
            let subscribed = true;
            const forward = (data) => {
                const frame = encoder.encode(`data: ${JSON.stringify(data)}\n\n`);
                try {
                    controller.enqueue(frame);
                }
                catch {
                    // The stream is no longer writable (client went away); stop listening
                    // instead of surfacing the error to whoever emitted the event.
                    unsubscribe();
                }
            };
            unsubscribe = () => {
                if (!subscribed) {
                    return;
                }
                subscribed = false;
                for (const name of crawlEvents) {
                    eventBus.off(name, forward);
                }
            };
            for (const name of crawlEvents) {
                eventBus.on(name, forward);
            }
            request.signal.addEventListener('abort', () => {
                unsubscribe();
                try {
                    controller.close();
                }
                catch {
                    // Already closed or cancelled by the consumer; nothing left to do.
                }
            }, { once: true });
        },
        cancel() {
            unsubscribe();
        },
    });
    return new Response(stream, {
        headers: {
            'Content-Type': 'text/event-stream',
            'Cache-Control': 'no-cache',
            Connection: 'keep-alive',
            'Access-Control-Allow-Origin': '*',
        },
    });
}
|
package/dist/index.js
CHANGED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
// src/jobs/events.ts — EventBus for real-time crawl progress (SSE)
|
|
2
|
+
/**
 * Minimal synchronous publish/subscribe hub.
 *
 * Listeners are kept per event name in a Set, so registering the same function
 * twice for one event has no additional effect. `emit` invokes listeners
 * synchronously in registration order; an exception thrown by a listener
 * propagates to the caller of `emit`.
 */
export class EventBus {
    /** Map of event name → Set of listener functions. */
    listeners = new Map();

    /** Register `listener` for `event`. */
    on(event, listener) {
        let bucket = this.listeners.get(event);
        if (bucket === undefined) {
            bucket = new Set();
            this.listeners.set(event, bucket);
        }
        bucket.add(listener);
    }

    /** Remove `listener` from `event`; unknown events or listeners are ignored. */
    off(event, listener) {
        const bucket = this.listeners.get(event);
        if (bucket) {
            bucket.delete(listener);
        }
    }

    /** Call every listener registered for `event` with `data`. */
    emit(event, data) {
        const bucket = this.listeners.get(event);
        if (!bucket) {
            return;
        }
        for (const notify of bucket) {
            notify(data);
        }
    }
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
// src/jobs/manager.ts — Crawl job queue manager
|
|
2
|
+
import { nanoid } from 'nanoid';
|
|
3
|
+
import { CrawlWorker } from './worker.js';
|
|
4
|
+
/**
 * Starts crawl jobs in the background and answers questions about them.
 *
 * Job records live in `db`; this class additionally tracks which jobs are
 * executing in the current process so `isRunning` does not depend on job rows
 * that may be stale.
 */
export class JobManager {
    db;
    eventBus;
    /** jobId → CrawlWorker for every crawl currently executing in this process. */
    activeJobs = new Map();
    /** jobId → libraryId for the same set of jobs; lets isRunning answer per library. */
    #libraryByJob = new Map();

    constructor(db, eventBus) {
        this.db = db;
        this.eventBus = eventBus;
    }

    /**
     * Start a crawl job for a library.
     *
     * The job row is created synchronously and returned; the crawl itself runs
     * in the background and is removed from the active set when it settles.
     *
     * @param {string} libraryId - Library to crawl.
     * @param {object} [opts] - Accepted for interface compatibility; not read here
     *   (the worker takes its options from the library's stored `crawl_config`).
     * @returns {*} Whatever `db.createJob` returns for the new job.
     */
    startCrawl(libraryId, opts) {
        const jobId = nanoid();
        const job = this.db.createJob({ id: jobId, libraryId });
        const worker = new CrawlWorker(this.db, this.eventBus);
        this.activeJobs.set(jobId, worker);
        this.#libraryByJob.set(jobId, libraryId);
        // Fire and forget — runs in the background.
        worker
            .crawl(libraryId, jobId)
            .catch((err) => {
                console.error(`[DocShark] Crawl job ${jobId} failed:`, err);
            })
            .finally(() => {
                this.activeJobs.delete(jobId);
                this.#libraryByJob.delete(jobId);
            });
        return job;
    }

    /** Get status of a specific job */
    getJob(jobId) {
        return this.db.getJob(jobId);
    }

    /** List all jobs, optionally filtered by library */
    listJobs(libraryId) {
        return this.db.listJobs(libraryId);
    }

    /**
     * Check if a crawl is currently running for a library.
     *
     * Answered from the in-process active set only: a job counts as running
     * from `startCrawl` until its worker promise settles. Job rows left in a
     * running/queued state by an earlier process are deliberately ignored.
     *
     * @param {string} libraryId
     * @returns {boolean}
     */
    isRunning(libraryId) {
        for (const [jobId, activeLibraryId] of this.#libraryByJob) {
            // activeJobs stays the source of truth in case an entry was removed externally.
            if (activeLibraryId === libraryId && this.activeJobs.has(jobId)) {
                return true;
            }
        }
        return false;
    }
}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
// src/jobs/worker.ts — Crawl execution pipeline
|
|
2
|
+
import { nanoid } from 'nanoid';
|
|
3
|
+
import { createHash } from 'crypto';
|
|
4
|
+
import { discoverPages } from '../scraper/discoverer.js';
|
|
5
|
+
import { fetchPage } from '../scraper/fetcher.js';
|
|
6
|
+
import { extractAndConvert } from '../processor/extractor.js';
|
|
7
|
+
import { chunkMarkdown } from '../processor/chunker.js';
|
|
8
|
+
import { RateLimiter } from '../scraper/rate-limiter.js';
|
|
9
|
+
/**
 * Executes one crawl: discover URLs, then fetch → extract → chunk → index each
 * page, reporting progress through the event bus and the job row.
 */
export class CrawlWorker {
    db;
    eventBus;

    constructor(db, eventBus) {
        this.db = db;
        this.eventBus = eventBus;
    }

    /**
     * Crawl every discoverable page of a library.
     *
     * Once the library has been found, every failure — including a malformed
     * stored `crawl_config` — is recorded on the job and library and announced
     * as `crawl:error`; the returned promise then resolves normally.
     *
     * @param {string} libraryId
     * @param {string} jobId
     * @returns {Promise<void>}
     * @throws {Error} When the library does not exist (nothing is recorded in that case).
     */
    async crawl(libraryId, jobId) {
        const lib = this.db.getLibraryById(libraryId);
        if (!lib) {
            throw new Error(`Library ${libraryId} not found`);
        }
        this.db.updateLibraryStatus(libraryId, 'crawling');
        this.db.updateJob(jobId, { status: 'running', started_at: new Date().toISOString() });
        try {
            // Parsed inside the try so a corrupt config marks the job failed instead of
            // leaving it stuck in its initial state.
            const config = (lib.crawl_config ? JSON.parse(lib.crawl_config) : {}) ?? {};
            const rateLimiter = new RateLimiter(config.rateLimit ?? 500);

            // Phase 1: discover pages
            const urls = await discoverPages(lib.url, config);
            this.db.updateJob(jobId, { pages_discovered: urls.length });
            this.eventBus.emit('crawl:progress', {
                jobId,
                libraryId,
                phase: 'discovering',
                pagesDiscovered: urls.length,
            });
            console.log(`[DocShark] Discovered ${urls.length} pages for "${lib.display_name}"`);

            let crawled = 0;
            let failed = 0;
            let totalChunks = 0;
            // Phase 2-6: fetch → extract → convert → chunk → index
            for (const url of urls) {
                try {
                    await rateLimiter.wait();
                    const chunkCount = await this.#indexPage(libraryId, url, config.renderer);
                    crawled++;
                    if (chunkCount === null) {
                        continue; // Essentially empty page: counted, but not stored or reported
                    }
                    totalChunks += chunkCount;
                    this.eventBus.emit('crawl:progress', {
                        jobId,
                        libraryId,
                        phase: 'crawling',
                        pagesCrawled: crawled,
                        pagesDiscovered: urls.length,
                        currentUrl: url,
                    });
                    // Log progress every 10 pages
                    if (crawled % 10 === 0) {
                        console.log(`[DocShark] Progress: ${crawled}/${urls.length} pages (${totalChunks} chunks)`);
                    }
                }
                catch (err) {
                    // A single bad page must not abort the whole crawl.
                    failed++;
                    console.error(`[DocShark] Failed to crawl ${url}:`, CrawlWorker.#messageOf(err));
                }
            }

            // Update final stats
            this.db.updateLibraryStats(libraryId, crawled, totalChunks);
            this.db.updateLibraryStatus(libraryId, 'indexed');
            this.db.updateJob(jobId, {
                status: 'completed',
                pages_crawled: crawled,
                pages_failed: failed,
                chunks_created: totalChunks,
                completed_at: new Date().toISOString(),
            });
            this.eventBus.emit('crawl:complete', { jobId, libraryId, crawled, failed, totalChunks });
            console.log(`[DocShark] ✅ Crawl complete: ${crawled} pages, ${totalChunks} chunks, ${failed} failed`);
        }
        catch (err) {
            const message = CrawlWorker.#messageOf(err);
            this.db.updateLibraryStatus(libraryId, 'error');
            this.db.updateJob(jobId, { status: 'failed', error_message: message });
            this.eventBus.emit('crawl:error', { jobId, libraryId, error: message });
            console.error(`[DocShark] ❌ Crawl failed for "${lib.display_name}":`, message);
        }
    }

    /**
     * Fetch one page, store it and replace its chunks.
     *
     * @returns {Promise<number|null>} Number of chunks written, or `null` when the
     *   page yields less than 50 characters of markdown and is skipped.
     */
    async #indexPage(libraryId, url, renderer) {
        const result = await fetchPage(url, renderer);
        const { markdown, title, headings } = extractAndConvert(result.html, url);
        if (!markdown || markdown.length < 50) {
            return null;
        }
        const contentHash = createHash('sha256').update(markdown).digest('hex');
        const path = new URL(url).pathname;
        const pageId = this.db.upsertPage({
            id: nanoid(),
            libraryId,
            url,
            path,
            title,
            contentMarkdown: markdown,
            contentHash,
            headings,
        });
        // Delete old chunks for this page (for re-crawls)
        this.db.deleteChunksByPage(pageId);
        const chunks = chunkMarkdown(markdown, headings);
        if (chunks.length === 0) {
            return 0;
        }
        const chunkRecords = chunks.map((chunk, index) => ({
            id: nanoid(),
            pageId,
            libraryId,
            content: chunk.content,
            headingContext: chunk.headingContext,
            chunkIndex: index,
            tokenCount: chunk.tokenCount,
            hasCodeBlock: chunk.hasCodeBlock,
        }));
        this.db.insertChunks(chunkRecords);
        return chunkRecords.length;
    }

    /** Best-effort text for anything that was thrown, including non-Error values. */
    static #messageOf(err) {
        return err?.message ?? String(err);
    }
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
// src/processor/chunker.ts — Heading-based semantic chunking
|
|
2
|
+
// Largest estimated token count a section may have and still be emitted as one chunk;
// larger sections are split at paragraph boundaries by chunkMarkdown.
const MAX_TOKENS = 1200;
|
|
3
|
+
// Smallest estimated token count worth indexing; chunkMarkdown skips sections
// (and trailing paragraph buffers) below this size.
const MIN_TOKENS = 50;
|
|
4
|
+
/**
 * Split markdown into search chunks along heading boundaries.
 *
 * Each heading section becomes one chunk when its estimated size lies within
 * [MIN_TOKENS, MAX_TOKENS]. Smaller sections are dropped. Larger sections are
 * packed paragraph by paragraph into chunks that stay under MAX_TOKENS where
 * possible; a single paragraph that is itself larger is emitted as one
 * oversized chunk.
 *
 * `tokenCount` always describes the stored (trimmed) `content`, so both code
 * paths report sizes on the same basis.
 *
 * @param {string} markdown - Markdown to chunk.
 * @param {Array} _headings - Unused; headings are re-derived from `markdown`.
 * @returns {Array<{content: string, headingContext: string, tokenCount: number, hasCodeBlock: boolean}>}
 */
export function chunkMarkdown(markdown, _headings) {
    const chunks = [];
    const emit = (text, headingPath) => {
        const content = text.trim();
        chunks.push({
            content,
            headingContext: headingPath,
            tokenCount: estimateTokens(content),
            hasCodeBlock: content.includes('```'),
        });
    };
    for (const { content, headingPath } of splitByHeadings(markdown)) {
        const sectionTokens = estimateTokens(content);
        if (sectionTokens < MIN_TOKENS) {
            continue;
        }
        if (sectionTokens <= MAX_TOKENS) {
            emit(content, headingPath);
            continue;
        }
        // Split large sections by paragraphs
        let buffer = '';
        for (const para of splitByParagraphs(content)) {
            // Flush before the next paragraph would push the buffer over the limit.
            if (buffer && estimateTokens(buffer + para) > MAX_TOKENS) {
                emit(buffer, headingPath);
                buffer = '';
            }
            buffer += `${para}\n\n`;
        }
        const tail = buffer.trim();
        if (tail && estimateTokens(tail) >= MIN_TOKENS) {
            emit(tail, headingPath);
        }
    }
    return chunks;
}
|
|
47
|
+
/**
 * Split markdown into sections, one per ATX heading, each tagged with its
 * heading breadcrumb ("Parent > Child").
 *
 * Two details keep the breadcrumbs correct:
 *  - Lines inside fenced code blocks are never treated as headings, so a
 *    `# comment` in a shell or Python snippet stays inside its section.
 *  - The breadcrumb stack remembers each heading's level, so a document that
 *    skips levels (h1 → h3 → h3) keeps the two h3 sections as siblings.
 *
 * Text before the first heading becomes a section with an empty `headingPath`.
 * Sections whose content is blank are omitted.
 *
 * @param {string} md
 * @returns {Array<{content: string, headingPath: string}>}
 */
function splitByHeadings(md) {
    const sections = [];
    const trail = []; // ancestors of the current section: { level, text }
    let current = '';
    let openFence = null; // marker that opened the code fence we are inside, if any

    const flush = () => {
        const content = current.trim();
        if (content) {
            sections.push({ content, headingPath: trail.map((h) => h.text).join(' > ') });
        }
    };

    for (const line of md.split('\n')) {
        const marker = line.match(/^ {0,3}(`{3,}|~{3,})/)?.[1];
        if (marker) {
            if (openFence === null) {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] && marker.length >= openFence.length && line.trim() === marker) {
                // A closing fence uses the same character, is at least as long, and carries no info string.
                openFence = null;
            }
        }
        const heading = openFence === null && !marker ? line.match(/^(#{1,6})\s+(.+)$/) : null;
        if (heading) {
            flush();
            const level = heading[1].length;
            // Drop every heading at the same or a deeper level than the new one.
            while (trail.length > 0 && trail[trail.length - 1].level >= level) {
                trail.pop();
            }
            trail.push({ level, text: heading[2] });
            current = `${line}\n`;
        }
        else {
            current += `${line}\n`;
        }
    }
    flush();
    return sections;
}
|
|
74
|
+
/**
 * Split text into paragraphs at runs of two or more newlines, keeping fenced
 * code blocks intact.
 *
 * Blank lines inside an open code fence do not end the paragraph, so a code
 * sample is never spread across several chunks with unbalanced fences. Outside
 * code fences the result matches a plain `split(/\n{2,}/)` with empty pieces
 * removed. An unterminated fence keeps everything after it in one paragraph.
 *
 * @param {string} text
 * @returns {string[]}
 */
function splitByParagraphs(text) {
    // Track fence state across a piece; returns the marker still open afterwards, or null.
    const advanceFence = (piece, open) => {
        let state = open;
        for (const line of piece.split('\n')) {
            const marker = line.match(/^ {0,3}(`{3,}|~{3,})/)?.[1];
            if (!marker) {
                continue;
            }
            if (state === null) {
                state = marker;
            }
            else if (marker[0] === state[0] && marker.length >= state.length && line.trim() === marker) {
                state = null;
            }
        }
        return state;
    };

    // The capturing group keeps separators: even indexes are text, odd indexes are newline runs.
    const parts = text.split(/(\n{2,})/);
    const paragraphs = [];
    let pending = '';
    let openFence = null;
    for (let i = 0; i < parts.length; i += 2) {
        pending += parts[i];
        openFence = advanceFence(parts[i], openFence);
        if (openFence !== null && i + 1 < parts.length) {
            // Still inside a code block: the blank lines belong to the code.
            pending += parts[i + 1];
            continue;
        }
        if (pending) {
            paragraphs.push(pending);
        }
        pending = '';
    }
    return paragraphs;
}
|
|
77
|
+
/**
 * Rough token estimate for a piece of text: one token per four characters,
 * rounded up.
 *
 * @param {string} text
 * @returns {number}
 */
function estimateTokens(text) {
    const CHARS_PER_TOKEN = 4;
    const { length } = text;
    return Math.ceil(length / CHARS_PER_TOKEN);
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
// src/processor/extractor.ts — HTML content extraction + Markdown conversion
|
|
2
|
+
import { Readability } from '@mozilla/readability';
|
|
3
|
+
import { parseHTML } from 'linkedom';
|
|
4
|
+
import TurndownService from 'turndown';
|
|
5
|
+
import { gfm } from 'turndown-plugin-gfm';
|
|
6
|
+
// Shared HTML → Markdown converter: ATX headings, fenced code blocks, "-" bullets,
// GitHub-flavoured extensions, and a few inline HTML elements passed through untouched.
const turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
});
turndown.use(gfm);
turndown.keep(['details', 'summary', 'kbd']);
// Preserve language attribute on code blocks
turndown.addRule('fencedCodeBlock', {
    filter: (node) => node.nodeName === 'PRE' && node.querySelector('code'),
    replacement: (_content, node) => {
        const code = node.querySelector('code');
        // Take the whole class token after "language-" so names such as "c++" or
        // "shell-session" are not cut at the first non-word character.
        const lang = code?.className?.match(/language-([^\s]+)/)?.[1] || '';
        // Drop leading blank lines and trailing whitespace only; the indentation of the
        // first code line is part of the sample.
        const text = (code?.textContent || '').replace(/^(?:[ \t]*\n)+/, '').trimEnd();
        // The fence must be longer than any backtick run inside the code, otherwise the
        // sample would terminate its own block.
        const longestRun = (text.match(/`+/g) ?? []).reduce((max, run) => Math.max(max, run.length), 0);
        const fence = '`'.repeat(Math.max(3, longestRun + 1));
        return `\n${fence}${lang}\n${text}\n${fence}\n`;
    },
});
// Strip images (noisy for search context)
turndown.addRule('removeImages', {
    filter: 'img',
    replacement: () => '',
});
|
|
28
|
+
/**
 * Extract the main article from an HTML page and convert it to Markdown.
 *
 * Steps: parse the HTML, add a <base> element carrying `url`, replace every
 * <pre> (together with any wrapper that contains nothing but that code) by a
 * clean <pre><code class="language-…"> pair, run Readability, convert the
 * result with the shared turndown instance, then list the markdown headings.
 * When Readability yields no article, the document body is converted instead.
 *
 * @param {string} html - Raw page HTML.
 * @param {string} [url] - Page URL, used for the <base> element.
 * @returns {{ markdown: string, title: string, headings: Array<{level: number, text: string}> }}
 */
export function extractAndConvert(html, url) {
    const { document } = parseHTML(html);
    // Set the document URL for Readability to resolve relative links
    if (url) {
        try {
            const baseEl = document.createElement('base');
            baseEl.setAttribute('href', url);
            document.head.appendChild(baseEl);
        }
        catch {
            // Ignore if head doesn't exist
        }
    }
    // PRE-PROCESS: Rescue code blocks from aggressive Readability stripping.
    // Many doc sites wrap <pre> blocks in complex UI (like <figure>) that Readability deletes.
    for (const pre of Array.from(document.querySelectorAll('pre'))) {
        const codeText = pre.textContent || '';
        if (!codeText) {
            continue;
        }
        const lang = detectCodeLanguage(pre);
        // Walk up to find the container that exclusively holds this code block
        let wrapper = pre;
        while (wrapper.parentElement &&
            !['BODY', 'MAIN', 'ARTICLE', 'SECTION'].includes(wrapper.parentElement.tagName.toUpperCase()) &&
            wrapper.parentElement.textContent?.trim() === pre.textContent?.trim()) {
            wrapper = wrapper.parentElement;
        }
        const cleanPre = document.createElement('pre');
        const cleanCode = document.createElement('code');
        if (lang) {
            cleanCode.className = `language-${lang}`;
        }
        cleanCode.textContent = codeText;
        cleanPre.appendChild(cleanCode);
        wrapper.replaceWith(cleanPre);
    }
    const article = new Readability(document, { charThreshold: 100 }).parse();
    const title = article?.title || document.querySelector('title')?.textContent || '';
    const contentHtml = article?.content || document.body?.innerHTML || '';
    const markdown = turndown.turndown(contentHtml).trim();
    return { markdown, title, headings: collectHeadings(markdown) };
}

/**
 * Language hint for a <pre>: its own `data-language` or `language-*` class
 * first, then the same two hints on a nested <code>. The full class token is
 * taken so names such as "c++" are not truncated.
 */
function detectCodeLanguage(pre) {
    const languageClass = /language-([^\s]+)/;
    const code = pre.querySelector('code');
    return pre.getAttribute('data-language') ||
        pre.className?.match(languageClass)?.[1] ||
        code?.getAttribute('data-language') ||
        code?.className?.match(languageClass)?.[1] ||
        '';
}

/**
 * List the ATX headings of a markdown document in order, ignoring lines inside
 * fenced code blocks (a `# comment` in a snippet is not a heading).
 */
function collectHeadings(markdown) {
    const headings = [];
    let openFence = null;
    for (const line of markdown.split('\n')) {
        const marker = line.match(/^ {0,3}(`{3,}|~{3,})/)?.[1];
        if (marker) {
            if (openFence === null) {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] && marker.length >= openFence.length && line.trim() === marker) {
                openFence = null;
            }
            continue;
        }
        if (openFence !== null) {
            continue;
        }
        const match = line.match(/^(#{1,6})\s+(.+)$/);
        if (match) {
            headings.push({ level: match[1].length, text: match[2].trim() });
        }
    }
    return headings;
}
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
// src/scraper/discoverer.ts — Page URL discovery via sitemap + link crawl
|
|
2
|
+
import * as cheerio from 'cheerio';
|
|
3
|
+
import { getRobotsParser, isAllowed } from './robots.js';
|
|
4
|
+
import { RateLimiter } from './rate-limiter.js';
|
|
5
|
+
// Value of the User-Agent header sent with the sitemap and link-crawl requests in this module.
const USER_AGENT = 'DocShark/1.0';
|
|
6
|
+
/**
 * Discover all documentation page URLs from a base URL.
 *
 * Strategy: sitemap first, link crawl as fallback. The link crawl is used both
 * when no sitemap URLs are found and when a sitemap exists but none of its
 * URLs survive filtering (for example a site-wide sitemap that does not cover
 * the requested base path), so such a site no longer yields an empty crawl.
 *
 * @param {string} baseUrl - Root URL of the documentation.
 * @param {object} [config] - Crawl options; `maxDepth` (default 3) bounds the
 *   link crawl, and the whole object is passed on to filtering and crawling.
 * @returns {Promise<string[]>} URLs to crawl.
 */
export async function discoverPages(baseUrl, config = {}) {
    const maxDepth = config.maxDepth ?? 3;
    const robots = await getRobotsParser(baseUrl);

    // Strategy A: Try sitemap first
    const sitemapUrls = await discoverFromSitemap(baseUrl, robots);
    if (sitemapUrls.length > 0) {
        console.log(`[DocShark] Found ${sitemapUrls.length} URLs from sitemap`);
        const eligible = filterUrls(sitemapUrls, baseUrl, config, robots);
        if (eligible.length > 0) {
            return eligible;
        }
        console.log(`[DocShark] No sitemap URLs matched the base path and filters, crawling links (depth=${maxDepth})`);
    }
    else {
        console.log(`[DocShark] No sitemap found, crawling links (depth=${maxDepth})`);
    }

    // Strategy B: BFS link crawl
    return discoverByLinkCrawl(baseUrl, maxDepth, config, robots);
}
|
|
24
|
+
/**
 * Collect page URLs from the site's sitemap(s).
 *
 * Every sitemap declared in robots.txt is read, not just the first one. The
 * conventional `/sitemap.xml` is tried when robots.txt declares none or the
 * declared ones yield nothing. A sitemap index is expanded one level through
 * `fetchSitemapUrls`. Duplicate URLs are returned once, in first-seen order.
 *
 * Discovery is best effort: a sitemap that cannot be fetched or parsed is
 * logged as a warning and contributes no URLs.
 *
 * @param {string} baseUrl - Site URL used to locate the default sitemap.
 * @param {object|null} robots - Parsed robots.txt exposing `getSitemaps()`, or null.
 * @returns {Promise<string[]>}
 */
async function discoverFromSitemap(baseUrl, robots) {
    const readSitemap = async (sitemapUrl) => {
        try {
            const response = await fetch(sitemapUrl, {
                headers: { 'User-Agent': USER_AGENT },
                signal: AbortSignal.timeout(15_000),
            });
            if (!response.ok) {
                return [];
            }
            const $ = cheerio.load(await response.text(), { xmlMode: true });
            // Handle sitemap index (sitemapindex > sitemap > loc)
            const childSitemaps = $('sitemapindex > sitemap > loc').toArray();
            if (childSitemaps.length === 0) {
                // Regular sitemap (urlset > url > loc)
                return $('urlset > url > loc')
                    .toArray()
                    .map((el) => $(el).text().trim())
                    .filter(Boolean);
            }
            const urls = [];
            for (const el of childSitemaps) {
                const childSitemapUrl = $(el).text().trim();
                if (!childSitemapUrl) {
                    continue;
                }
                for (const pageUrl of await fetchSitemapUrls(childSitemapUrl)) {
                    urls.push(pageUrl);
                }
            }
            return urls;
        }
        catch (err) {
            console.warn(`[DocShark] Could not read sitemap ${sitemapUrl}:`, err?.message ?? err);
            return [];
        }
    };

    const declared = (robots?.getSitemaps() ?? []).filter(Boolean);
    const found = new Set();
    for (const sitemapUrl of declared) {
        for (const pageUrl of await readSitemap(sitemapUrl)) {
            found.add(pageUrl);
        }
    }
    if (found.size === 0) {
        const conventional = new URL('/sitemap.xml', baseUrl).href;
        // Skip it when robots.txt already pointed there and it was just read.
        if (!declared.includes(conventional)) {
            for (const pageUrl of await readSitemap(conventional)) {
                found.add(pageUrl);
            }
        }
    }
    return [...found];
}
|
|
63
|
+
/**
 * Download one sitemap and return the non-empty, trimmed <loc> values found
 * under `urlset > url`.
 *
 * Best effort: a non-OK response, a network error, a timeout (15 s) or a parse
 * failure all produce an empty list.
 *
 * @param {string} sitemapUrl
 * @returns {Promise<string[]>}
 */
async function fetchSitemapUrls(sitemapUrl) {
    const requestOptions = {
        headers: { 'User-Agent': USER_AGENT },
        signal: AbortSignal.timeout(15_000),
    };
    try {
        const response = await fetch(sitemapUrl, requestOptions);
        if (!response.ok) {
            return [];
        }
        const body = await response.text();
        const $ = cheerio.load(body, { xmlMode: true });
        return $('urlset > url > loc')
            .toArray()
            .map((node) => $(node).text().trim())
            .filter((loc) => Boolean(loc));
    }
    catch {
        return [];
    }
}
|
|
85
|
+
/**
 * BFS link crawl from the base URL.
 *
 * Follows same-origin links whose path starts with the base path, up to
 * `maxDepth` hops from `baseUrl`, honouring robots.txt and the rate limit.
 * Hash and query are stripped from every link before it is considered.
 *
 * What is returned (after `filterUrls`):
 *  - every visited URL, including ones whose fetch failed or returned a non-OK
 *    status, so the crawl worker gets its own attempt at them;
 *  - except URLs that answered successfully with a content type that is not
 *    HTML, since those can never be indexed as documentation pages.
 *
 * Each URL is queued at most once.
 *
 * @param {string} baseUrl
 * @param {number} maxDepth
 * @param {object} config - Crawl options; `rateLimit` (default 500) is passed to the RateLimiter.
 * @param {object|null} robots - Parsed robots.txt handed to `isAllowed`.
 * @returns {Promise<string[]>}
 */
async function discoverByLinkCrawl(baseUrl, maxDepth, config, robots) {
    const { origin: baseOrigin, pathname: basePath } = new URL(baseUrl);
    const rateLimiter = new RateLimiter(config.rateLimit ?? 500);
    const visited = new Set();
    const nonHtml = new Set();
    const enqueued = new Set([baseUrl]);
    const queue = [{ url: baseUrl, depth: 0 }];

    // Index-based traversal avoids the O(n) cost of Array.prototype.shift on long queues.
    for (let head = 0; head < queue.length; head++) {
        const item = queue[head];
        if (visited.has(item.url) || item.depth > maxDepth) {
            continue;
        }
        if (!isAllowed(robots, item.url)) {
            continue;
        }
        visited.add(item.url);
        try {
            await rateLimiter.wait();
            const response = await fetch(item.url, {
                headers: { 'User-Agent': USER_AGENT },
                signal: AbortSignal.timeout(15_000),
            });
            if (!response.ok) {
                continue;
            }
            const contentType = (response.headers.get('content-type') || '').toLowerCase();
            const isHtml = contentType.includes('text/html') || contentType.includes('application/xhtml+xml');
            if (!isHtml) {
                // A declared non-HTML type is definitive; a missing header is not.
                if (contentType) {
                    nonHtml.add(item.url);
                }
                continue;
            }
            const html = await response.text();
            const $ = cheerio.load(html);
            $('a[href]').each((_, el) => {
                try {
                    const href = $(el).attr('href');
                    if (!href) {
                        return;
                    }
                    const resolved = new URL(href, item.url);
                    // Strip hash and query
                    resolved.hash = '';
                    resolved.search = '';
                    const resolvedUrl = resolved.href;
                    // Only follow same-origin links under the base path
                    if (resolved.origin === baseOrigin &&
                        resolved.pathname.startsWith(basePath) &&
                        !enqueued.has(resolvedUrl) &&
                        !isNonDocUrl(resolvedUrl)) {
                        enqueued.add(resolvedUrl);
                        queue.push({ url: resolvedUrl, depth: item.depth + 1 });
                    }
                }
                catch {
                    // Invalid URL, skip
                }
            });
        }
        catch {
            // Fetch failed, skip link extraction; the URL stays in the result for the worker to retry
        }
    }
    const pages = [...visited].filter((url) => !nonHtml.has(url));
    return filterUrls(pages, baseUrl, config, robots);
}
|
|
141
|
+
/**
 * Filter URLs based on config patterns.
 *
 * A URL is kept when it parses, shares the base URL's origin, has a path that
 * starts with the base path, is allowed by robots.txt, is not rejected by
 * `isNonDocUrl`, contains at least one `includePatterns` entry (when any are
 * configured) and contains no `excludePatterns` entry. Patterns are plain
 * substrings of the URL.
 *
 * Duplicates are removed (first occurrence wins), so overlapping sitemaps do
 * not make the crawler fetch a page twice. Input order is preserved.
 *
 * @param {string[]} urls
 * @param {string} baseUrl
 * @param {{ includePatterns?: string[], excludePatterns?: string[] }} config
 * @param {object|null} robots - Parsed robots.txt handed to `isAllowed`.
 * @returns {string[]}
 */
function filterUrls(urls, baseUrl, config, robots) {
    const { origin: baseOrigin, pathname: basePath } = new URL(baseUrl);
    const seen = new Set();
    return urls.filter((url) => {
        if (seen.has(url)) {
            return false;
        }
        seen.add(url);
        try {
            const parsed = new URL(url);
            // Must be same origin and under base path
            if (parsed.origin !== baseOrigin || !parsed.pathname.startsWith(basePath)) {
                return false;
            }
            // Check robots.txt
            if (!isAllowed(robots, url)) {
                return false;
            }
            // Skip non-doc URLs
            if (isNonDocUrl(url)) {
                return false;
            }
            // Include/exclude patterns
            if (config.includePatterns?.length && !config.includePatterns.some((p) => url.includes(p))) {
                return false;
            }
            if (config.excludePatterns?.length && config.excludePatterns.some((p) => url.includes(p))) {
                return false;
            }
            return true;
        }
        catch {
            // Unparseable URL (or a failing helper): treat as not eligible
            return false;
        }
    });
}
|
|
177
|
+
/**
 * Heuristic: skip non-documentation URLs.
 *
 * Only the URL's path is inspected (case-insensitively), never the host or the
 * query string, so a host such as `atom.io` or `docs.jsonata.org` does not
 * disqualify a whole site:
 *  - path markers (`/api/`, `/login`, `/signup`, `/auth/`, `/feed`, `/rss`,
 *    `/atom`) match anywhere in the path;
 *  - asset and archive extensions match only at the end of the path.
 *
 * A string that does not parse as an absolute URL is treated as a path after
 * cutting off any query or fragment.
 *
 * @param {string} url
 * @returns {boolean} true when the URL should not be crawled as documentation.
 */
function isNonDocUrl(url) {
    const pathMarkers = ['/api/', '/login', '/signup', '/auth/', '/feed', '/rss', '/atom'];
    const assetExtensions = [
        '.pdf',
        '.zip',
        '.tar',
        '.gz',
        // Previously only caught as substrings of ".tar…"; listed so such archives stay excluded.
        '.bz2',
        '.xz',
        '.png',
        '.jpg',
        '.jpeg',
        '.gif',
        '.svg',
        '.ico',
        '.css',
        '.js',
        // Previously only caught because ".json" contains ".js"; listed so JSON files stay excluded.
        '.json',
        '.woff',
        '.woff2',
        '.ttf',
        '.eot',
    ];
    let path;
    try {
        path = new URL(url).pathname;
    }
    catch {
        path = String(url).split(/[?#]/, 1)[0];
    }
    const lower = path.toLowerCase();
    return pathMarkers.some((marker) => lower.includes(marker)) ||
        assetExtensions.some((ext) => lower.endsWith(ext));
}
|