@gulibs/safe-coder 0.0.24 → 0.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +447 -15
- package/dist/documentation/browser-manager.d.ts +51 -0
- package/dist/documentation/browser-manager.d.ts.map +1 -0
- package/dist/documentation/browser-manager.js +260 -0
- package/dist/documentation/browser-manager.js.map +1 -0
- package/dist/documentation/checkpoint-manager.d.ts +38 -0
- package/dist/documentation/checkpoint-manager.d.ts.map +1 -0
- package/dist/documentation/checkpoint-manager.js +101 -0
- package/dist/documentation/checkpoint-manager.js.map +1 -0
- package/dist/documentation/doc-crawler.d.ts +103 -1
- package/dist/documentation/doc-crawler.d.ts.map +1 -1
- package/dist/documentation/doc-crawler.js +931 -151
- package/dist/documentation/doc-crawler.js.map +1 -1
- package/dist/documentation/llms-txt/detector.d.ts +31 -0
- package/dist/documentation/llms-txt/detector.d.ts.map +1 -0
- package/dist/documentation/llms-txt/detector.js +77 -0
- package/dist/documentation/llms-txt/detector.js.map +1 -0
- package/dist/documentation/llms-txt/downloader.d.ts +30 -0
- package/dist/documentation/llms-txt/downloader.d.ts.map +1 -0
- package/dist/documentation/llms-txt/downloader.js +84 -0
- package/dist/documentation/llms-txt/downloader.js.map +1 -0
- package/dist/documentation/llms-txt/index.d.ts +4 -0
- package/dist/documentation/llms-txt/index.d.ts.map +1 -0
- package/dist/documentation/llms-txt/index.js +4 -0
- package/dist/documentation/llms-txt/index.js.map +1 -0
- package/dist/documentation/llms-txt/parser.d.ts +43 -0
- package/dist/documentation/llms-txt/parser.d.ts.map +1 -0
- package/dist/documentation/llms-txt/parser.js +177 -0
- package/dist/documentation/llms-txt/parser.js.map +1 -0
- package/dist/index.js +0 -0
- package/dist/server/mcp-server.d.ts.map +1 -1
- package/dist/server/mcp-server.js +48 -3
- package/dist/server/mcp-server.js.map +1 -1
- package/package.json +16 -11
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import { launch } from 'puppeteer-core';
|
|
2
|
+
import * as chromeLauncher from 'chrome-launcher';
|
|
3
|
+
import { logger } from '../utils/logger.js';
|
|
4
|
+
import { platform } from 'os';
|
|
5
|
+
import { existsSync } from 'fs';
|
|
6
|
+
import { access } from 'fs/promises';
|
|
7
|
+
/**
 * Browser manager for SPA rendering using puppeteer-core.
 *
 * puppeteer-core ships without a browser binary, so this class locates a
 * system Chrome/Chromium/Edge installation, launches it once, and renders
 * pages on demand through short-lived tabs.
 */
export class BrowserManager {
    // Handle returned by puppeteer-core's launch(); undefined while no browser is running.
    browser;
    // Effective launch/render settings (defaults merged with launch() overrides).
    config;
    // Browser paths by platform
    BROWSER_PATHS = {
        darwin: [
            '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
            '/Applications/Chromium.app/Contents/MacOS/Chromium',
            '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge',
        ],
        win32: [
            'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
            'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
            'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
            'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
        ],
        linux: [
            '/usr/bin/google-chrome',
            '/usr/bin/google-chrome-stable',
            '/usr/bin/chromium-browser',
            '/usr/bin/chromium',
            '/snap/bin/chromium',
            '/usr/bin/microsoft-edge',
        ],
    };
    constructor() {
        this.config = {
            executablePath: undefined,
            headless: true,
            timeout: 30000,
            waitForTimeout: 3000,
            networkIdleTimeout: 500,
        };
    }
    /**
     * Detect a usable browser executable.
     *
     * Lookup order: CHROME_PATH / PUPPETEER_EXECUTABLE_PATH / CHROMIUM_PATH
     * environment variables, chrome-launcher, well-known install paths for the
     * current platform, and finally the Chromium bundled with an optionally
     * installed `puppeteer` package.
     *
     * @returns {Promise<string | null>} Path to Chrome/Chromium/Edge, or null when none is found.
     */
    async detectBrowser() {
        logger.info('Detecting system browser');
        // 1. Check environment variables
        const envPaths = [
            process.env.CHROME_PATH,
            process.env.PUPPETEER_EXECUTABLE_PATH,
            process.env.CHROMIUM_PATH,
        ];
        for (const envPath of envPaths) {
            if (envPath && existsSync(envPath)) {
                logger.info('Found browser via environment variable', { path: envPath });
                return envPath;
            }
        }
        // 2. Use chrome-launcher to find Chrome
        try {
            const installations = await chromeLauncher.Launcher.getInstallations();
            if (installations.length > 0) {
                logger.info('Found browser via chrome-launcher', { path: installations[0] });
                return installations[0];
            }
        }
        catch (error) {
            logger.debug('chrome-launcher detection failed', {
                error: error instanceof Error ? error.message : String(error),
            });
        }
        // 3. Check common paths for current platform
        const platformKey = platform();
        const paths = this.BROWSER_PATHS[platformKey] || [];
        for (const path of paths) {
            try {
                await access(path);
                logger.info('Found browser at common path', { path, platform: platformKey });
                return path;
            }
            catch {
                // Path doesn't exist, continue
            }
        }
        // 4. Try to use bundled puppeteer if installed
        try {
            const puppeteer = await import('puppeteer');
            if (puppeteer.executablePath) {
                const bundledPath = puppeteer.executablePath();
                // executablePath() only computes where the binary should live; the
                // download may have been skipped, so confirm the file really exists.
                if (bundledPath && existsSync(bundledPath)) {
                    logger.info('Found bundled Chromium from puppeteer', { path: bundledPath });
                    return bundledPath;
                }
                logger.debug('Bundled puppeteer browser binary is missing', { path: bundledPath });
            }
        }
        catch {
            // puppeteer not installed
            logger.debug('Bundled puppeteer not available');
        }
        logger.warn('No browser found on system');
        return null;
    }
    /**
     * Launch the browser instance (no-op when one is already running).
     *
     * @param {object} [config] Overrides for the default settings. Keys whose
     *   value is `undefined` are ignored so they cannot erase a default.
     * @returns {Promise<object>} The puppeteer browser handle.
     * @throws {Error} When no browser executable can be found or the launch fails
     *   (the underlying error is attached as `cause`).
     */
    async launch(config) {
        if (this.browser) {
            logger.debug('Browser already launched');
            return this.browser;
        }
        // Merge config, skipping undefined values: callers that forward optional
        // settings verbatim would otherwise wipe out timeouts and headless mode.
        const overrides = Object.fromEntries(Object.entries(config ?? {}).filter(([, value]) => value !== undefined));
        this.config = {
            ...this.config,
            ...overrides,
        };
        // Detect browser if not specified
        if (!this.config.executablePath) {
            const detected = await this.detectBrowser();
            if (!detected) {
                throw new Error('No browser found. Install Chrome or set CHROME_PATH environment variable. ' +
                    'See docs/SPA_BROWSER_SETUP.md for details.');
            }
            this.config.executablePath = detected;
        }
        logger.info('Launching browser', {
            executablePath: this.config.executablePath,
            headless: this.config.headless,
        });
        try {
            this.browser = await launch({
                executablePath: this.config.executablePath,
                headless: this.config.headless,
                // NOTE(security): the sandbox is disabled here although arbitrary remote
                // pages are rendered. Review before running outside an isolated environment.
                args: [
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-accelerated-2d-canvas',
                    '--disable-gpu',
                    '--no-first-run',
                    '--no-zygote',
                    '--disable-background-networking',
                    '--disable-background-timer-throttling',
                    '--disable-backgrounding-occluded-windows',
                    '--disable-renderer-backgrounding',
                ],
            });
            logger.info('Browser launched successfully');
            return this.browser;
        }
        catch (error) {
            const errorMsg = error instanceof Error ? error.message : String(error);
            logger.error('Failed to launch browser', { error: errorMsg });
            throw new Error(`Failed to launch browser: ${errorMsg}`, { cause: error });
        }
    }
    /**
     * Render a page in a fresh tab and extract its content.
     *
     * Images, stylesheets, fonts and media are blocked to speed up rendering.
     * The method waits for DOMContentLoaded, then a fixed settle delay
     * (`config.waitForTimeout`), then best-effort network idle.
     *
     * @param {string} url Page to render.
     * @returns {Promise<{html: string, title: string, url: string, links: Array<{text: string, url: string}>}>}
     * @throws {Error} When the browser is not launched or navigation/extraction fails.
     */
    async renderPage(url) {
        if (!this.browser) {
            throw new Error('Browser not launched. Call launch() first.');
        }
        logger.info('Rendering page with browser', { url });
        const blockedResourceTypes = ['image', 'stylesheet', 'font', 'media'];
        let page;
        try {
            page = await this.browser.newPage();
            // Set viewport
            await page.setViewport({ width: 1280, height: 800 });
            // Optimize: disable unnecessary resources (balanced mode)
            await page.setRequestInterception(true);
            page.on('request', (req) => {
                // Another handler may already have resolved this request.
                if (req.isInterceptResolutionHandled?.()) {
                    return;
                }
                const decision = blockedResourceTypes.includes(req.resourceType())
                    ? req.abort()
                    : req.continue();
                // abort()/continue() reject when the tab closes mid-request; without a
                // handler that becomes an unhandled rejection that can kill the process.
                Promise.resolve(decision).catch((error) => {
                    logger.debug('Request interception failed', {
                        url,
                        error: error instanceof Error ? error.message : String(error),
                    });
                });
            });
            // Navigate to page
            await page.goto(url, {
                waitUntil: 'domcontentloaded', // Wait for DOM
                timeout: this.config.timeout,
            });
            // Wait for main content (balanced mode). A plain timer is used because
            // page.waitForTimeout() no longer exists in current Puppeteer releases.
            await new Promise((resolve) => setTimeout(resolve, this.config.waitForTimeout));
            // Try to wait for network idle (with fallback)
            try {
                await page.waitForNetworkIdle({
                    timeout: this.config.networkIdleTimeout,
                });
            }
            catch {
                // Network idle timeout is not critical, continue
                logger.debug('Network idle timeout, continuing', { url });
            }
            // Extract content using page.evaluate
            // Using string evaluation to avoid DOM type errors in Node.js context
            const result = await page.evaluate(`
                (() => {
                    return {
                        html: document.documentElement.outerHTML,
                        title: document.title,
                        url: window.location.href,
                        links: Array.from(document.querySelectorAll('a')).map(a => ({
                            text: a.textContent?.trim() || '',
                            url: a.href,
                        })),
                    };
                })()
            `);
            logger.info('Page rendered successfully', {
                url,
                title: result.title.substring(0, 50),
                linksFound: result.links.length,
                htmlSize: result.html.length,
            });
            return result;
        }
        catch (error) {
            const errorMsg = error instanceof Error ? error.message : String(error);
            logger.error('Failed to render page', { url, error: errorMsg });
            throw error;
        }
        finally {
            // Always close the page to free resources. A failure to close must not
            // replace the render result or the original error.
            if (page) {
                try {
                    await page.close();
                }
                catch (closeError) {
                    logger.debug('Failed to close page', {
                        url,
                        error: closeError instanceof Error ? closeError.message : String(closeError),
                    });
                }
            }
        }
    }
    /**
     * Close the browser instance and release its resources.
     * Never throws; close failures are logged as warnings.
     *
     * @returns {Promise<void>}
     */
    async close() {
        if (!this.browser) {
            return;
        }
        logger.info('Closing browser');
        // Drop the reference first: even if close() fails the handle is unusable,
        // and keeping it would make launch() hand out a dead browser.
        const browser = this.browser;
        this.browser = undefined;
        try {
            await browser.close();
            logger.info('Browser closed successfully');
        }
        catch (error) {
            logger.warn('Error closing browser', {
                error: error instanceof Error ? error.message : String(error),
            });
        }
    }
    /**
     * Check if browser is currently launched.
     *
     * @returns {boolean} True while a browser handle is held.
     */
    isLaunched() {
        return this.browser !== undefined;
    }
}
|
|
260
|
+
//# sourceMappingURL=browser-manager.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser-manager.js","sourceRoot":"","sources":["../../src/documentation/browser-manager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAA2B,MAAM,gBAAgB,CAAC;AACjE,OAAO,KAAK,cAAc,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAC5C,OAAO,EAAE,QAAQ,EAAE,MAAM,IAAI,CAAC;AAC9B,OAAO,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAChC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AA0BrC;;;GAGG;AACH,MAAM,OAAO,cAAc;IACjB,OAAO,CAAW;IAClB,MAAM,CAAwB;IAEtC,4BAA4B;IACX,aAAa,GAAG;QAC/B,MAAM,EAAE;YACN,8DAA8D;YAC9D,oDAAoD;YACpD,gEAAgE;SACjE;QACD,KAAK,EAAE;YACL,4DAA4D;YAC5D,kEAAkE;YAClE,6DAA6D;YAC7D,mEAAmE;SACpE;QACD,KAAK,EAAE;YACL,wBAAwB;YACxB,+BAA+B;YAC/B,2BAA2B;YAC3B,mBAAmB;YACnB,oBAAoB;YACpB,yBAAyB;SAC1B;KACF,CAAC;IAEF;QACE,IAAI,CAAC,MAAM,GAAG;YACZ,cAAc,EAAE,SAAS;YACzB,QAAQ,EAAE,IAAI;YACd,OAAO,EAAE,KAAK;YACd,cAAc,EAAE,IAAI;YACpB,kBAAkB,EAAE,GAAG;SACxB,CAAC;IACJ,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,aAAa;QACjB,MAAM,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QAExC,iCAAiC;QACjC,MAAM,QAAQ,GAAG;YACf,OAAO,CAAC,GAAG,CAAC,WAAW;YACvB,OAAO,CAAC,GAAG,CAAC,yBAAyB;YACrC,OAAO,CAAC,GAAG,CAAC,aAAa;SAC1B,CAAC;QAEF,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAC/B,IAAI,OAAO,IAAI,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;gBACnC,MAAM,CAAC,IAAI,CAAC,wCAAwC,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC,CAAC;gBACzE,OAAO,OAAO,CAAC;YACjB,CAAC;QACH,CAAC;QAED,wCAAwC;QACxC,IAAI,CAAC;YACH,MAAM,aAAa,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,gBAAgB,EAAE,CAAC;YACvE,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC7B,MAAM,CAAC,IAAI,CAAC,mCAAmC,EAAE,EAAE,IAAI,EAAE,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBAC7E,OAAO,aAAa,CAAC,CAAC,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,KAAK,CAAC,kCAAkC,EAAE;gBAC/C,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;aAC9D,CAAC,CAAC;QACL,CAAC;QAED,6CAA6C;QAC7C,MAAM,WAAW,GAAG,QAAQ,EAAqC,CAAC;QAClE,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC;QAEpD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,CAAC;gBACH,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;gBACnB,MAAM,CAAC,IAAI,CAAC,8
BAA8B,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,CAAC,CAAC;gBAC7E,OAAO,IAAI,CAAC;YACd,CAAC;YAAC,MAAM,CAAC;gBACP,+BAA+B;YACjC,CAAC;QACH,CAAC;QAED,+CAA+C;QAC/C,IAAI,CAAC;YACH,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;YAC5C,IAAI,SAAS,CAAC,cAAc,EAAE,CAAC;gBAC7B,MAAM,WAAW,GAAG,SAAS,CAAC,cAAc,EAAE,CAAC;gBAC/C,MAAM,CAAC,IAAI,CAAC,uCAAuC,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;gBAC5E,OAAO,WAAW,CAAC;YACrB,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,0BAA0B;YAC1B,MAAM,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QAClD,CAAC;QAED,MAAM,CAAC,IAAI,CAAC,4BAA4B,CAAC,CAAC;QAC1C,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM,CAAC,MAA+B;QAC1C,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;YACzC,OAAO,IAAI,CAAC,OAAO,CAAC;QACtB,CAAC;QAED,eAAe;QACf,IAAI,CAAC,MAAM,GAAG;YACZ,GAAG,IAAI,CAAC,MAAM;YACd,GAAG,MAAM;SACV,CAAC;QAEF,kCAAkC;QAClC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,cAAc,EAAE,CAAC;YAChC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,aAAa,EAAE,CAAC;YAC5C,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACd,MAAM,IAAI,KAAK,CACb,4EAA4E;oBAC5E,4CAA4C,CAC7C,CAAC;YACJ,CAAC;YACD,IAAI,CAAC,MAAM,CAAC,cAAc,GAAG,QAAQ,CAAC;QACxC,CAAC;QAED,MAAM,CAAC,IAAI,CAAC,mBAAmB,EAAE;YAC/B,cAAc,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc;YAC1C,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;SAC/B,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,GAAG,MAAM,MAAM,CAAC;gBAC1B,cAAc,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc;gBAC1C,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;gBAC9B,IAAI,EAAE;oBACJ,cAAc;oBACd,0BAA0B;oBAC1B,yBAAyB;oBACzB,iCAAiC;oBACjC,eAAe;oBACf,gBAAgB;oBAChB,aAAa;oBACb,iCAAiC;oBACjC,uCAAuC;oBACvC,0CAA0C;oBAC1C,kCAAkC;iBACnC;aACF,CAAC,CAAC;YAEH,MAAM,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;YAC7C,OAAO,IAAI,CAAC,OAAO,CAAC;QACtB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,QAAQ,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACxE,MAAM,CAAC,KAAK,CAAC,0BAA0B,EAAE,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YAC9D,MAAM,IAAI,KAAK,CAAC,6BAA6B,QAAQ,EAAE,CAAC,CAAC;QAC3D,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,UAAU,CAAC,GAAW;QAC1B,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IA
AI,KAAK,CAAC,4CAA4C,CAAC,CAAC;QAChE,CAAC;QAED,MAAM,CAAC,IAAI,CAAC,6BAA6B,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;QAEpD,IAAI,IAAsB,CAAC;QAC3B,IAAI,CAAC;YACH,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAEpC,eAAe;YACf,MAAM,IAAI,CAAC,WAAW,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;YAErD,0DAA0D;YAC1D,MAAM,IAAI,CAAC,sBAAsB,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,CAAC,EAAE,CAAC,SAAS,EAAE,CAAC,GAAG,EAAE,EAAE;gBACzB,MAAM,YAAY,GAAG,GAAG,CAAC,YAAY,EAAE,CAAC;gBACxC,IAAI,CAAC,OAAO,EAAE,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE,CAAC;oBACpE,GAAG,CAAC,KAAK,EAAE,CAAC;gBACd,CAAC;qBAAM,CAAC;oBACN,GAAG,CAAC,QAAQ,EAAE,CAAC;gBACjB,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,mBAAmB;YACnB,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACnB,SAAS,EAAE,kBAAkB,EAAE,eAAe;gBAC9C,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO;aAC7B,CAAC,CAAC;YAEH,wCAAwC;YACxC,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC;YAEtD,+CAA+C;YAC/C,IAAI,CAAC;gBACH,MAAM,IAAI,CAAC,kBAAkB,CAAC;oBAC5B,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,kBAAkB;iBACxC,CAAC,CAAC;YACL,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,iDAAiD;gBACjD,MAAM,CAAC,KAAK,CAAC,kCAAkC,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YAC5D,CAAC;YAED,sCAAsC;YACtC,sEAAsE;YACtE,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC;;;;;;;;;;;;OAYlC,CAAwB,CAAC;YAE1B,MAAM,CAAC,IAAI,CAAC,4BAA4B,EAAE;gBACxC,GAAG;gBACH,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC;gBACpC,UAAU,EAAE,MAAM,CAAC,KAAK,CAAC,MAAM;gBAC/B,QAAQ,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM;aAC7B,CAAC,CAAC;YAEH,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,QAAQ,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACxE,MAAM,CAAC,KAAK,CAAC,uBAAuB,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YAChE,MAAM,KAAK,CAAC;QACd,CAAC;gBAAS,CAAC;YACT,0CAA0C;YAC1C,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACrB,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;YAC/B,IAAI,CAAC;gBACH,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK
,EAAE,CAAC;gBAC3B,IAAI,CAAC,OAAO,GAAG,SAAS,CAAC;gBACzB,MAAM,CAAC,IAAI,CAAC,6BAA6B,CAAC,CAAC;YAC7C,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,uBAAuB,EAAE;oBACnC,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;iBAC9D,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,UAAU;QACR,OAAO,IAAI,CAAC,OAAO,KAAK,SAAS,CAAC;IACpC,CAAC;CACF"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import type { CrawlOptions } from './doc-crawler.js';
|
|
2
|
+
/**
 * Shape of the crawl state that CheckpointManager writes with
 * saveCheckpoint() and returns from loadCheckpoint().
 */
export interface CheckpointData {
    /** Crawl options stored with the checkpoint. */
    config: CrawlOptions;
    /** URLs recorded as visited. */
    visitedUrls: string[];
    /** Queue entries still to be processed, each with its depth. */
    pendingUrls: {
        url: string;
        depth: number;
    }[];
    /** Page counter stored with the checkpoint. */
    pagesCrawled: number;
    /** Time of the last update, as a string (presumably an ISO timestamp; confirm against the writer). */
    lastUpdated: string;
    /** Base URL stored with the checkpoint. */
    baseUrl: string;
}
|
|
13
|
+
/**
 * Persists and restores crawl progress so a crawl can be resumed
 * (checkpoint/resume). Based on the Skill_Seekers implementation.
 */
export declare class CheckpointManager {
    private checkpointFile;

    /**
     * @param checkpointFile Path of the file used to store the checkpoint.
     */
    constructor(checkpointFile: string);

    /**
     * Write the given checkpoint data to the checkpoint file.
     */
    saveCheckpoint(data: CheckpointData): Promise<void>;

    /**
     * Read the checkpoint file.
     * Resolves to null when the file is missing or invalid.
     */
    loadCheckpoint(): Promise<CheckpointData | null>;

    /**
     * Remove the checkpoint file.
     */
    clearCheckpoint(): Promise<void>;

    /**
     * Report whether a checkpoint file exists.
     */
    exists(): Promise<boolean>;
}
|
|
38
|
+
//# sourceMappingURL=checkpoint-manager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"checkpoint-manager.d.ts","sourceRoot":"","sources":["../../src/documentation/checkpoint-manager.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAErD,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,YAAY,CAAC;IACrB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,WAAW,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACnD,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED;;;GAGG;AACH,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,cAAc,CAAS;gBAEnB,cAAc,EAAE,MAAM;IAIlC;;OAEG;IACG,cAAc,CAAC,IAAI,EAAE,cAAc,GAAG,OAAO,CAAC,IAAI,CAAC;IAqBzD;;;OAGG;IACG,cAAc,IAAI,OAAO,CAAC,cAAc,GAAG,IAAI,CAAC;IAiCtD;;OAEG;IACG,eAAe,IAAI,OAAO,CAAC,IAAI,CAAC;IAkBtC;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC;CAQjC"}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
import { writeFile, readFile, unlink, access, rename } from 'fs/promises';
|
|
2
|
+
import { logger } from '../utils/logger.js';
|
|
3
|
+
/**
 * Manager for crawl checkpoint/resume functionality.
 * Based on Skill_Seekers implementation.
 *
 * The checkpoint is a single pretty-printed JSON file at `checkpointFile`.
 */
export class CheckpointManager {
    // Path of the JSON file holding the checkpoint.
    checkpointFile;
    /**
     * @param {string} checkpointFile Path of the checkpoint file.
     */
    constructor(checkpointFile) {
        this.checkpointFile = checkpointFile;
    }
    /**
     * Save checkpoint data to file.
     *
     * The JSON is written to a temporary sibling file and then renamed over the
     * checkpoint, so an interrupted write cannot leave a truncated checkpoint.
     *
     * @param {object} data Checkpoint data; `pendingUrls` and `visitedUrls` must be arrays.
     * @returns {Promise<void>}
     * @throws {Error} When serialization or writing fails (original error attached as `cause`).
     */
    async saveCheckpoint(data) {
        // Unique per call so overlapping saves never share a temp file.
        const uniqueSuffix = `${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}`;
        const tempFile = `${this.checkpointFile}.${uniqueSuffix}.tmp`;
        try {
            const json = JSON.stringify(data, null, 2);
            await writeFile(tempFile, json, 'utf-8');
            await rename(tempFile, this.checkpointFile);
            logger.info('Checkpoint saved', {
                file: this.checkpointFile,
                pagesCrawled: data.pagesCrawled,
                pendingUrls: data.pendingUrls.length,
                visitedUrls: data.visitedUrls.length,
            });
        }
        catch (error) {
            // Best-effort cleanup; the temp file may not exist (or was already renamed).
            await unlink(tempFile).catch(() => { });
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error('Failed to save checkpoint', {
                file: this.checkpointFile,
                error: errorMessage,
            });
            throw new Error(`Failed to save checkpoint: ${errorMessage}`, { cause: error });
        }
    }
    /**
     * Load checkpoint data from file.
     *
     * @returns {Promise<object | null>} The parsed checkpoint, or null if the file
     *   doesn't exist, is not valid JSON, or lacks the `visitedUrls` /
     *   `pendingUrls` arrays.
     */
    async loadCheckpoint() {
        try {
            // Read directly: readFile reports ENOENT itself, so a separate
            // existence check would only add a race window.
            const content = await readFile(this.checkpointFile, 'utf-8');
            const data = JSON.parse(content);
            const hasExpectedShape = data !== null &&
                typeof data === 'object' &&
                Array.isArray(data.visitedUrls) &&
                Array.isArray(data.pendingUrls);
            if (!hasExpectedShape) {
                throw new Error('Checkpoint file has an unexpected structure');
            }
            logger.info('Checkpoint loaded', {
                file: this.checkpointFile,
                pagesCrawled: data.pagesCrawled,
                pendingUrls: data.pendingUrls.length,
                visitedUrls: data.visitedUrls.length,
                lastUpdated: data.lastUpdated,
            });
            return data;
        }
        catch (error) {
            if (error?.code === 'ENOENT') {
                logger.debug('No checkpoint file found', { file: this.checkpointFile });
                return null;
            }
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.warn('Failed to load checkpoint', {
                file: this.checkpointFile,
                error: errorMessage,
            });
            return null;
        }
    }
    /**
     * Clear checkpoint file.
     * A missing file is not an error; other failures are logged and swallowed.
     *
     * @returns {Promise<void>}
     */
    async clearCheckpoint() {
        try {
            await unlink(this.checkpointFile);
            logger.info('Checkpoint cleared', { file: this.checkpointFile });
        }
        catch (error) {
            if (error?.code === 'ENOENT') {
                // File doesn't exist, that's fine
                return;
            }
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.warn('Failed to clear checkpoint', {
                file: this.checkpointFile,
                error: errorMessage,
            });
        }
    }
    /**
     * Check if checkpoint exists.
     *
     * @returns {Promise<boolean>} True when the checkpoint file is accessible.
     */
    async exists() {
        try {
            await access(this.checkpointFile);
            return true;
        }
        catch {
            return false;
        }
    }
}
|
|
101
|
+
//# sourceMappingURL=checkpoint-manager.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"checkpoint-manager.js","sourceRoot":"","sources":["../../src/documentation/checkpoint-manager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAClE,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAY5C;;;GAGG;AACH,MAAM,OAAO,iBAAiB;IACpB,cAAc,CAAS;IAE/B,YAAY,cAAsB;QAChC,IAAI,CAAC,cAAc,GAAG,cAAc,CAAC;IACvC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,cAAc,CAAC,IAAoB;QACvC,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;YAC3C,MAAM,SAAS,CAAC,IAAI,CAAC,cAAc,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;YAEpD,MAAM,CAAC,IAAI,CAAC,kBAAkB,EAAE;gBAC9B,IAAI,EAAE,IAAI,CAAC,cAAc;gBACzB,YAAY,EAAE,IAAI,CAAC,YAAY;gBAC/B,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM;gBACpC,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM;aACrC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,MAAM,CAAC,KAAK,CAAC,2BAA2B,EAAE;gBACxC,IAAI,EAAE,IAAI,CAAC,cAAc;gBACzB,KAAK,EAAE,YAAY;aACpB,CAAC,CAAC;YACH,MAAM,IAAI,KAAK,CAAC,8BAA8B,YAAY,EAAE,CAAC,CAAC;QAChE,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,cAAc;QAClB,IAAI,CAAC;YACH,uBAAuB;YACvB,MAAM,MAAM,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAElC,4BAA4B;YAC5B,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC;YAC7D,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAmB,CAAC;YAEnD,MAAM,CAAC,IAAI,CAAC,mBAAmB,EAAE;gBAC/B,IAAI,EAAE,IAAI,CAAC,cAAc;gBACzB,YAAY,EAAE,IAAI,CAAC,YAAY;gBAC/B,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM;gBACpC,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,MAAM;gBACpC,WAAW,EAAE,IAAI,CAAC,WAAW;aAC9B,CAAC,CAAC;YAEH,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAK,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;gBACvD,MAAM,CAAC,KAAK,CAAC,0BAA0B,EAAE,EAAE,IAAI,EAAE,IAAI,CAAC,cAAc,EAAE,CAAC,CAAC;gBACxE,OAAO,IAAI,CAAC;YACd,CAAC;YAED,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,MAAM,CAAC,IAAI,CAAC,2BAA2B,EAAE;gBACvC,IAAI,EAAE,IAAI,CAAC,cAAc;gBACzB,KAAK,EAAE,YAAY;aACpB,CAAC,C
AAC;YACH,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,eAAe;QACnB,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAClC,MAAM,CAAC,IAAI,CAAC,oBAAoB,EAAE,EAAE,IAAI,EAAE,IAAI,CAAC,cAAc,EAAE,CAAC,CAAC;QACnE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAK,KAA+B,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;gBACvD,kCAAkC;gBAClC,OAAO;YACT,CAAC;YAED,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,MAAM,CAAC,IAAI,CAAC,4BAA4B,EAAE;gBACxC,IAAI,EAAE,IAAI,CAAC,cAAc;gBACzB,KAAK,EAAE,YAAY;aACpB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM;QACV,IAAI,CAAC;YACH,MAAM,MAAM,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAClC,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;CACF"}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { HttpClient } from '../utils/http-client.js';
|
|
2
2
|
export interface CrawlOptions {
|
|
3
|
+
crawlStrategy?: 'bfs' | 'dfs';
|
|
3
4
|
maxDepth?: number;
|
|
4
5
|
maxPages?: number;
|
|
5
6
|
includePaths?: string[];
|
|
@@ -8,6 +9,21 @@ export interface CrawlOptions {
|
|
|
8
9
|
maxRetries?: number;
|
|
9
10
|
retryDelay?: number;
|
|
10
11
|
useBrowserAutomation?: boolean;
|
|
12
|
+
skipLlmsTxt?: boolean;
|
|
13
|
+
workers?: number;
|
|
14
|
+
checkpoint?: {
|
|
15
|
+
enabled: boolean;
|
|
16
|
+
interval: number;
|
|
17
|
+
file?: string;
|
|
18
|
+
};
|
|
19
|
+
resume?: boolean;
|
|
20
|
+
spaStrategy?: 'smart' | 'auto' | 'manual';
|
|
21
|
+
spaFallback?: 'warn' | 'skip' | 'error';
|
|
22
|
+
browserConfig?: {
|
|
23
|
+
executablePath?: string;
|
|
24
|
+
waitForTimeout?: number;
|
|
25
|
+
networkIdleTimeout?: number;
|
|
26
|
+
};
|
|
11
27
|
}
|
|
12
28
|
export interface CrawledPage {
|
|
13
29
|
url: string;
|
|
@@ -62,6 +78,7 @@ export interface CrawlResult {
|
|
|
62
78
|
}
|
|
63
79
|
export declare class DocumentationCrawler {
|
|
64
80
|
private browser;
|
|
81
|
+
private browserManager?;
|
|
65
82
|
private visitedUrls;
|
|
66
83
|
private urlQueue;
|
|
67
84
|
private crawledPages;
|
|
@@ -69,6 +86,8 @@ export declare class DocumentationCrawler {
|
|
|
69
86
|
private options;
|
|
70
87
|
private baseUrl;
|
|
71
88
|
private linkDiscoveryStats;
|
|
89
|
+
private checkpointManager?;
|
|
90
|
+
private pagesSinceLastCheckpoint;
|
|
72
91
|
private readonly DOCUMENTATION_PATTERNS;
|
|
73
92
|
private readonly EXCLUDED_PATTERNS;
|
|
74
93
|
constructor(httpClient?: HttpClient);
|
|
@@ -76,8 +95,21 @@ export declare class DocumentationCrawler {
|
|
|
76
95
|
* Crawl documentation starting from a root URL
|
|
77
96
|
* Uses HTTP client (axios) exclusively - no browser automation
|
|
78
97
|
* For SPA sites that require JavaScript rendering, use Cursor/Claude's built-in browser tools
|
|
98
|
+
* Supports both BFS (breadth-first) and DFS (depth-first) crawl strategies
|
|
79
99
|
*/
|
|
80
100
|
crawl(rootUrl: string, options?: CrawlOptions): Promise<CrawlResult>;
|
|
101
|
+
/**
|
|
102
|
+
* Sequential crawling (single-threaded)
|
|
103
|
+
*/
|
|
104
|
+
private crawlSequential;
|
|
105
|
+
/**
|
|
106
|
+
* Parallel crawling with multiple workers
|
|
107
|
+
*/
|
|
108
|
+
private crawlWithWorkers;
|
|
109
|
+
/**
|
|
110
|
+
* Process a single page (shared by both sequential and parallel crawling)
|
|
111
|
+
*/
|
|
112
|
+
private processPage;
|
|
81
113
|
/**
|
|
82
114
|
* Discover documentation links from a crawled page
|
|
83
115
|
*/
|
|
@@ -92,13 +124,31 @@ export declare class DocumentationCrawler {
|
|
|
92
124
|
private shouldExclude;
|
|
93
125
|
/**
|
|
94
126
|
* Check if crawled content is sufficient for skill generation
|
|
95
|
-
*
|
|
127
|
+
* Enhanced with multi-dimensional quality metrics
|
|
96
128
|
*/
|
|
97
129
|
private canGenerateSkill;
|
|
130
|
+
/**
|
|
131
|
+
* Evaluate content quality with multi-dimensional metrics
|
|
132
|
+
*/
|
|
133
|
+
private evaluateContentQuality;
|
|
134
|
+
/**
|
|
135
|
+
* Check if should continue crawling based on content quality
|
|
136
|
+
*/
|
|
137
|
+
private shouldContinueCrawling;
|
|
98
138
|
/**
|
|
99
139
|
* Fetch a page with retry logic
|
|
140
|
+
* Supports HTML pages, Markdown files, and SPA rendering
|
|
100
141
|
*/
|
|
101
142
|
private fetchPageWithRetry;
|
|
143
|
+
/**
|
|
144
|
+
* Extract content from Markdown file
|
|
145
|
+
* Converts Markdown structure to WebDocumentationPage format
|
|
146
|
+
*/
|
|
147
|
+
private extractMarkdownContent;
|
|
148
|
+
/**
|
|
149
|
+
* Parse Markdown content into structured data
|
|
150
|
+
*/
|
|
151
|
+
private parseMarkdown;
|
|
102
152
|
/**
|
|
103
153
|
* Classify error type for better error messages
|
|
104
154
|
*/
|
|
@@ -111,6 +161,58 @@ export declare class DocumentationCrawler {
|
|
|
111
161
|
* Get error breakdown by type
|
|
112
162
|
*/
|
|
113
163
|
private getErrorBreakdown;
|
|
164
|
+
/**
|
|
165
|
+
* Try to detect and use llms.txt for optimized crawling
|
|
166
|
+
*/
|
|
167
|
+
private tryLlmsTxt;
|
|
168
|
+
/**
|
|
169
|
+
* Check if a URL is valid for crawling
|
|
170
|
+
*/
|
|
171
|
+
private isValidUrl;
|
|
172
|
+
/**
|
|
173
|
+
* Save checkpoint
|
|
174
|
+
*/
|
|
175
|
+
private saveCheckpoint;
|
|
176
|
+
/**
|
|
177
|
+
* Load checkpoint and restore state
|
|
178
|
+
*/
|
|
179
|
+
private loadCheckpoint;
|
|
180
|
+
/**
|
|
181
|
+
* Clear checkpoint after successful crawl
|
|
182
|
+
*/
|
|
183
|
+
private clearCheckpoint;
|
|
184
|
+
/**
|
|
185
|
+
* Sanitize filename for checkpoint
|
|
186
|
+
*/
|
|
187
|
+
private sanitizeFilename;
|
|
188
|
+
/**
|
|
189
|
+
* Check if browser rendering is needed
|
|
190
|
+
*/
|
|
191
|
+
private shouldUseBrowser;
|
|
192
|
+
/**
|
|
193
|
+
* Fetch page using browser rendering
|
|
194
|
+
*/
|
|
195
|
+
private fetchWithBrowser;
|
|
196
|
+
/**
|
|
197
|
+
* Parse browser-rendered HTML into WebDocumentationPage
|
|
198
|
+
*/
|
|
199
|
+
private parseRenderedPage;
|
|
200
|
+
/**
|
|
201
|
+
* Handle browser rendering failure based on fallback strategy
|
|
202
|
+
*/
|
|
203
|
+
private handleBrowserFailure;
|
|
204
|
+
/**
|
|
205
|
+
* Create empty page placeholder
|
|
206
|
+
*/
|
|
207
|
+
private createEmptyPage;
|
|
208
|
+
/**
|
|
209
|
+
* Create page with browser installation guide
|
|
210
|
+
*/
|
|
211
|
+
private createPageWithGuide;
|
|
212
|
+
/**
|
|
213
|
+
* Cleanup resources (browser, checkpoint, etc.)
|
|
214
|
+
*/
|
|
215
|
+
cleanup(): Promise<void>;
|
|
114
216
|
/**
|
|
115
217
|
* Delay helper for rate limiting
|
|
116
218
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"doc-crawler.d.ts","sourceRoot":"","sources":["../../src/documentation/doc-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;
|
|
1
|
+
{"version":3,"file":"doc-crawler.d.ts","sourceRoot":"","sources":["../../src/documentation/doc-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AAUrD,MAAM,WAAW,YAAY;IAC3B,aAAa,CAAC,EAAE,KAAK,GAAG,KAAK,CAAC;IAC9B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;IACxB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,oBAAoB,CAAC,EAAE,OAAO,CAAC;IAC/B,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE;QACX,OAAO,EAAE,OAAO,CAAC;QACjB,QAAQ,EAAE,MAAM,CAAC;QACjB,IAAI,CAAC,EAAE,MAAM,CAAC;KACf,CAAC;IACF,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,WAAW,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,QAAQ,CAAC;IAC1C,WAAW,CAAC,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;IACxC,aAAa,CAAC,EAAE;QACd,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;KAC7B,CAAC;CACH;AAED,MAAM,WAAW,WAAW;IAC1B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,KAAK,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,OAAO,EAAE,MAAM,CAAC;QAChB,MAAM,CAAC,EAAE,MAAM,CAAC;KACjB,CAAC,CAAC;IACH,eAAe,EAAE,KAAK,CAAC;QACrB,IAAI,EAAE,MAAM,CAAC;QACb,GAAG,EAAE,MAAM,CAAC;QACZ,UAAU,EAAE,OAAO,CAAC;KACrB,CAAC,CAAC;IACH,QAAQ,EAAE,KAAK,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,IAAI,EAAE,MAAM,CAAC;QACb,EAAE,CAAC,EAAE,MAAM,CAAC;KACb,CAAC,CAAC;IACH,WAAW,EAAE,KAAK,CAAC;QACjB,IAAI,EAAE,MAAM,CAAC;QACb,QAAQ,EAAE,MAAM,CAAC;KAClB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,kBAAkB;IACjC,eAAe,EAAE,MAAM,CAAC;IACxB,aAAa,EAAE;QACb,UAAU,EAAE,MAAM,CAAC;QACnB,cAAc,EAAE,MAAM,CAAC;QACvB,cAAc,EAAE,MAAM,CAAC;QACvB,eAAe,EAAE,MAAM,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,CAAC;IACxB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,MAAM,aAAa,GACrB,sBAAsB,GACtB,YAAY,GACZ,aAAa,GACb,uBAAuB,CAAC;AAE5B,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,WAAW,EAAE,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,CAAC;IACxB,MAAM,EAAE,KAAK,CAAC;Q
ACZ,GAAG,EAAE,MAAM,CAAC;QACZ,KAAK,EAAE,MAAM,CAAC;KACf,CAAC,CAAC;IACH,kBAAkB,EAAE,kBAAkB,CAAC;IACvC,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,aAAa,CAAC,EAAE,aAAa,CAAC;CAC/B;AAOD,qBAAa,oBAAoB;IAC/B,OAAO,CAAC,OAAO,CAA0B;IACzC,OAAO,CAAC,cAAc,CAAC,CAAiB;IACxC,OAAO,CAAC,WAAW,CAAc;IACjC,OAAO,CAAC,QAAQ,CAAc;IAC9B,OAAO,CAAC,YAAY,CAAgB;IACpC,OAAO,CAAC,MAAM,CAAwC;IACtD,OAAO,CAAC,OAAO,CAcb;IACF,OAAO,CAAC,OAAO,CAAM;IACrB,OAAO,CAAC,kBAAkB,CAAqB;IAC/C,OAAO,CAAC,iBAAiB,CAAC,CAAoB;IAC9C,OAAO,CAAC,wBAAwB,CAAS;IACzC,OAAO,CAAC,QAAQ,CAAC,sBAAsB,CAWrC;IACF,OAAO,CAAC,QAAQ,CAAC,iBAAiB,CAUhC;gBAEU,UAAU,CAAC,EAAE,UAAU;IAuCnC;;;;;OAKG;IACG,KAAK,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,WAAW,CAAC;IA4K9E;;OAEG;YACW,eAAe;IAyD7B;;OAEG;YACW,gBAAgB;IAoE9B;;OAEG;YACW,WAAW;IAgHzB;;OAEG;IACH,OAAO,CAAC,0BAA0B;IAiJlC;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAmC3B;;OAEG;IACH,OAAO,CAAC,aAAa;IAIrB;;;OAGG;IACH,OAAO,CAAC,gBAAgB;IAyBxB;;OAEG;IACH,OAAO,CAAC,sBAAsB;IAoF9B;;OAEG;IACH,OAAO,CAAC,sBAAsB;IAsC9B;;;OAGG;YACW,kBAAkB;IA+DhC;;;OAGG;YACW,sBAAsB;IA4BpC;;OAEG;IACH,OAAO,CAAC,aAAa;IAyJrB;;OAEG;IACH,OAAO,CAAC,aAAa;IA0CrB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAmBxB;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAWzB;;OAEG;YACW,UAAU;IAmExB;;OAEG;IACH,OAAO,CAAC,UAAU;IAiBlB;;OAEG;YACW,cAAc;IAuB5B;;OAEG;YACW,cAAc;IAgC5B;;OAEG;YACW,eAAe;IAY7B;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAOxB;;OAEG;YACW,gBAAgB;IAiC9B;;OAEG;YACW,gBAAgB;IAuB9B;;OAEG;IACH,OAAO,CAAC,iBAAiB;IA6EzB;;OAEG;YACW,oBAAoB;IAkBlC;;OAEG;IACH,OAAO,CAAC,eAAe;IAcvB;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAsD3B;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAO9B;;OAEG;IACH,OAAO,CAAC,KAAK;CAGd"}
|