crawlforge-mcp-server 3.4.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -2
- package/package.json +6 -4
- package/server.js +166 -32
- package/src/cli/commands/actions.js +36 -0
- package/src/cli/commands/analyze.js +19 -0
- package/src/cli/commands/batch.js +45 -0
- package/src/cli/commands/crawl.js +30 -0
- package/src/cli/commands/extract.js +45 -0
- package/src/cli/commands/install-skills.js +46 -0
- package/src/cli/commands/llmstxt.js +24 -0
- package/src/cli/commands/localize.js +29 -0
- package/src/cli/commands/map.js +26 -0
- package/src/cli/commands/monitor.js +29 -0
- package/src/cli/commands/research.js +26 -0
- package/src/cli/commands/scrape.js +37 -0
- package/src/cli/commands/search.js +28 -0
- package/src/cli/commands/stealth.js +29 -0
- package/src/cli/commands/template.js +26 -0
- package/src/cli/commands/track.js +24 -0
- package/src/cli/commands/uninstall-skills.js +35 -0
- package/src/cli/formatter.js +57 -0
- package/src/cli/index.js +94 -0
- package/src/cli/lib/runTool.js +40 -0
- package/src/core/ActionExecutor.js +8 -6
- package/src/core/AuthManager.js +103 -3
- package/src/core/ChangeTracker.js +34 -0
- package/src/core/ElicitationHelper.js +112 -0
- package/src/core/JobManager.js +36 -2
- package/src/core/LocalizationManager.js +19 -5
- package/src/core/PerformanceManager.js +53 -17
- package/src/core/ResearchOrchestrator.js +40 -5
- package/src/core/SamplingClient.js +191 -0
- package/src/core/StealthBrowserManager.js +248 -2
- package/src/core/WebhookDispatcher.js +18 -10
- package/src/prompts/PromptRegistry.js +199 -0
- package/src/resources/ResourceRegistry.js +273 -0
- package/src/server/transports/streamableHttp.js +6 -6
- package/src/server/withAuth.js +25 -0
- package/src/skills/crawlforge-cli.md +157 -0
- package/src/skills/crawlforge-mcp.md +80 -0
- package/src/skills/crawlforge-research.md +104 -0
- package/src/skills/crawlforge-stealth.md +98 -0
- package/src/skills/installer.js +141 -0
- package/src/tools/advanced/batchScrape/index.js +30 -0
- package/src/tools/advanced/batchScrape/schema.js +1 -1
- package/src/tools/basic/extractText.js +19 -8
- package/src/tools/crawl/crawlDeep.js +27 -0
- package/src/tools/extract/extractContent.js +5 -17
- package/src/tools/extract/extractStructured.js +8 -0
- package/src/tools/extract/extractWithLlm.js +35 -25
- package/src/tools/extract/listOllamaModels.js +66 -0
- package/src/tools/extract/processDocument.js +7 -1
- package/src/tools/extract/summarizeContent.js +17 -0
- package/src/tools/research/deepResearch.js +34 -0
- package/src/tools/templates/ScrapeTemplateTool.js +68 -0
- package/src/tools/templates/TemplateRegistry.js +311 -0
- package/src/utils/Logger.js +15 -0
- package/src/utils/htmlToMarkdown.js +54 -0
- package/src/utils/secretMask.js +86 -0
package/src/utils/Logger.js
CHANGED
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
6
|
import winston from 'winston';
|
|
7
|
+
import { maskSecrets } from './secretMask.js';
|
|
7
8
|
import { fileURLToPath } from 'url';
|
|
8
9
|
import { dirname, join } from 'path';
|
|
9
10
|
import { existsSync, mkdirSync } from 'fs';
|
|
@@ -70,7 +71,21 @@ export class Logger {
|
|
|
70
71
|
* @returns {winston.Format} Winston format
|
|
71
72
|
*/
|
|
72
73
|
createFormat(enableJson) {
|
|
74
|
+
// D2.9: global secret masking format applied first
|
|
75
|
+
const secretMaskFormat = winston.format((info) => {
|
|
76
|
+
if (info.metadata) info.metadata = maskSecrets(info.metadata);
|
|
77
|
+
if (typeof info.message === 'string') {
|
|
78
|
+
// lightweight heuristic mask on the message string itself
|
|
79
|
+
info.message = info.message
|
|
80
|
+
.replace(/(Bearer\s+)\S+/gi, '$1[REDACTED]')
|
|
81
|
+
.replace(/(api[_-]?key[:=]\s*)\S+/gi, '$1[REDACTED]')
|
|
82
|
+
.replace(/(x-api-key[:=]\s*)\S+/gi, '$1[REDACTED]');
|
|
83
|
+
}
|
|
84
|
+
return info;
|
|
85
|
+
})();
|
|
86
|
+
|
|
73
87
|
const formats = [
|
|
88
|
+
secretMaskFormat,
|
|
74
89
|
winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss.SSS' }),
|
|
75
90
|
winston.format.errors({ stack: true }),
|
|
76
91
|
winston.format.metadata({ fillExcept: ['message', 'level', 'timestamp', 'service'] })
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* htmlToMarkdown -- thin wrapper around the Turndown HTML-to-Markdown library.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
|
|
6
|
+
* const md = htmlToMarkdown(rawHtml);
|
|
7
|
+
*
|
|
8
|
+
* Design notes:
|
|
9
|
+
* - Turndown is the most widely-used, battle-tested HTML->Markdown converter.
|
|
10
|
+
* - We configure it with sensible defaults for RAG workflows:
|
|
11
|
+
* headingStyle: 'atx' -> # H1 / ## H2 instead of underline style
|
|
12
|
+
* codeBlockStyle: 'fenced' -> triple-backtick fences
|
|
13
|
+
* bulletListMarker: '-'
|
|
14
|
+
* - Tables fall back to prose (no GFM plugin loaded by default).
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import TurndownService from 'turndown';
|
|
18
|
+
|
|
19
|
+
let _td = null;
|
|
20
|
+
|
|
21
|
+
/**
 * Return the module-wide Turndown converter, creating it on the first call.
 *
 * The instance is cached in `_td`, so the configuration below is applied only
 * once and every later call hands back the same object.
 *
 * @returns {TurndownService}
 */
function getTurndown() {
  // Fast path: converter already built by an earlier call.
  if (_td !== null) {
    return _td;
  }

  const converterOptions = {
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
    emDelimiter: '_',
    strongDelimiter: '**',
    hr: '---',
    linkStyle: 'inlined'
  };
  _td = new TurndownService(converterOptions);

  // Boilerplate elements are registered for removal so they do not appear in the output.
  const boilerplateTags = ['script', 'style', 'nav', 'footer', 'aside', 'noscript'];
  _td.remove(boilerplateTags);

  return _td;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Convert an HTML string to Markdown.
 *
 * Returns an empty string if html is falsy. If the Markdown conversion throws,
 * the function degrades to a plain-text fallback (tags replaced by spaces,
 * whitespace collapsed, result trimmed) instead of propagating the error.
 *
 * @param {string} html
 * @returns {string}
 */
export function htmlToMarkdown(html) {
  if (!html) return '';
  try {
    return getTurndown().turndown(html).trim();
  } catch {
    // Deliberate best-effort fallback: strip tags, return plain text.
    // The input is coerced first because a non-string argument is a likely
    // reason for ending up here, and calling .replace on it directly would
    // raise a second, uncaught TypeError from inside the handler.
    const text = typeof html === 'string' ? html : String(html);
    return text.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
  }
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* secretMask -- redact sensitive values from objects/strings before they reach logs.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { maskSecrets, maskString } from './secretMask.js';
|
|
6
|
+
* logger.error('fetch failed', maskSecrets({ apiKey, url, error }));
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
// Case-insensitive, unanchored pattern tested against property NAMES; any key
// containing one of these fragments is treated as secret.
// NOTE(review): because the match is a substring match, `auth` also hits keys
// such as `author`, and `token` hits `maxTokens` -- confirm that over-redaction
// is acceptable for the log consumers.
const SECRET_KEYS_RE = /api[_-]?key|apikey|x-api-key|password|passwd|secret|token|authorization|auth|credential|private[_-]?key|access[_-]?key|proxy_url|proxyurl/i;

const MASK = '[REDACTED]';
const PARTIAL_MASK_LEN = 4; // show last N chars of long secrets
// A suffix is revealed only when it is at most a quarter of the secret;
// otherwise a short password would be mostly exposed by its last N chars.
const PARTIAL_MASK_MIN_LEN = PARTIAL_MASK_LEN * 4;

/**
 * Mask a single string value.
 *
 * Long secrets (at least PARTIAL_MASK_MIN_LEN characters) keep their last
 * PARTIAL_MASK_LEN characters so log readers can tell keys apart. Anything
 * shorter, empty, or not a string is replaced by the full mask.
 *
 * @param {string} value
 * @returns {string}
 */
export function maskString(value) {
  if (typeof value !== 'string' || value.length < PARTIAL_MASK_MIN_LEN) {
    return MASK;
  }
  return `${MASK}...${value.slice(-PARTIAL_MASK_LEN)}`;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Deep-clone obj and redact any key whose name matches SECRET_KEYS_RE.
 * Handles plain objects, arrays, Error and Date instances, and primitive values.
 * Does NOT mutate the original.
 *
 * Safety properties:
 * - A subtree nested deeper than 10 levels is replaced by '[MaxDepth]' rather
 *   than returned raw, so deeply nested secrets cannot slip through unmasked.
 * - A reference back to an object currently being walked becomes '[Circular]'.
 * - Error instances keep their name/message/stack/code (via maskError); those
 *   fields are not enumerable and would otherwise be dropped by Object.entries.
 *
 * @param {*} obj
 * @param {number} depth - internal recursion guard
 * @param {WeakSet<object>} ancestors - internal: objects on the current walk path
 * @returns {*}
 */
export function maskSecrets(obj, depth = 0, ancestors = new WeakSet()) {
  // Primitives (and functions) carry no keys to inspect; pass them through.
  if (obj === null || typeof obj !== 'object') return obj;

  if (depth > 10) return '[MaxDepth]';
  if (ancestors.has(obj)) return '[Circular]';

  // Object.entries on a Date is empty; copy it so the timestamp survives.
  if (obj instanceof Date) return new Date(obj.getTime());

  ancestors.add(obj);
  try {
    if (Array.isArray(obj)) {
      return obj.map(item => maskSecrets(item, depth + 1, ancestors));
    }

    // Seed Errors with their redacted standard fields, then overlay any
    // enumerable extras (which go through the same key-based masking).
    const result = obj instanceof Error ? maskError(obj) : {};
    for (const [key, value] of Object.entries(obj)) {
      if (SECRET_KEYS_RE.test(key)) {
        result[key] = typeof value === 'string' ? maskString(value) : MASK;
      } else {
        result[key] = maskSecrets(value, depth + 1, ancestors);
      }
    }
    return result;
  } finally {
    // Only ancestors count as cycles; a sibling may legitimately reuse obj.
    ancestors.delete(obj);
  }
}
|
|
55
|
+
|
|
56
|
+
/**
 * Build a log-safe snapshot of an Error.
 *
 * The message and (when present) the stack are passed through
 * redactSecretsFromString; name and code are copied unchanged. Anything that
 * is not an Error instance is handed back untouched.
 *
 * @param {Error} error
 * @returns {{ name: string, message: string, stack: string|undefined, code: string|undefined }}
 */
export function maskError(error) {
  const isErrorInstance = error instanceof Error;
  if (!isErrorInstance) {
    return error;
  }

  const name = error.name;
  const message = redactSecretsFromString(error.message);
  // A missing/empty stack is reported as undefined rather than redacted.
  let stack;
  if (error.stack) {
    stack = redactSecretsFromString(error.stack);
  }
  const code = error.code;

  return { name, message, stack, code };
}
|
|
71
|
+
|
|
72
|
+
/**
 * Heuristic scrubber for free-form text.
 *
 * Replaces the value following `Bearer ` and the value following a
 * `name:` / `name=` marker for api key variants, x-api-key, password, secret
 * and token with the mask. The marker itself is kept. Non-string input is
 * returned unchanged.
 *
 * @param {string} str
 * @returns {string}
 */
function redactSecretsFromString(str) {
  if (typeof str !== 'string') {
    return str;
  }

  // Each pattern captures the marker in group 1; the value after it is masked.
  const secretPatterns = [
    /(Bearer\s+)\S+/gi,
    /(api[_-]?key\s*[:=]\s*)\S+/gi,
    /(x-api-key\s*[:=]\s*)\S+/gi,
    /(password\s*[:=]\s*)\S+/gi,
    /(secret\s*[:=]\s*)\S+/gi,
    /(token\s*[:=]\s*)\S+/gi
  ];
  const replacement = `$1${MASK}`;

  let scrubbed = str;
  for (const pattern of secretPatterns) {
    scrubbed = scrubbed.replace(pattern, replacement);
  }
  return scrubbed;
}
|