crawlforge-mcp-server 3.5.1 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/package.json +6 -4
  2. package/server.js +138 -26
  3. package/src/cli/commands/actions.js +36 -0
  4. package/src/cli/commands/analyze.js +19 -0
  5. package/src/cli/commands/batch.js +45 -0
  6. package/src/cli/commands/crawl.js +30 -0
  7. package/src/cli/commands/extract.js +45 -0
  8. package/src/cli/commands/install-skills.js +46 -0
  9. package/src/cli/commands/llmstxt.js +24 -0
  10. package/src/cli/commands/localize.js +29 -0
  11. package/src/cli/commands/map.js +26 -0
  12. package/src/cli/commands/monitor.js +29 -0
  13. package/src/cli/commands/research.js +26 -0
  14. package/src/cli/commands/scrape.js +37 -0
  15. package/src/cli/commands/search.js +28 -0
  16. package/src/cli/commands/stealth.js +29 -0
  17. package/src/cli/commands/template.js +26 -0
  18. package/src/cli/commands/track.js +24 -0
  19. package/src/cli/commands/uninstall-skills.js +35 -0
  20. package/src/cli/formatter.js +57 -0
  21. package/src/cli/index.js +94 -0
  22. package/src/cli/lib/runTool.js +40 -0
  23. package/src/core/ActionExecutor.js +8 -6
  24. package/src/core/AuthManager.js +103 -3
  25. package/src/core/ChangeTracker.js +34 -0
  26. package/src/core/ElicitationHelper.js +112 -0
  27. package/src/core/JobManager.js +36 -2
  28. package/src/core/LocalizationManager.js +19 -5
  29. package/src/core/PerformanceManager.js +53 -17
  30. package/src/core/ResearchOrchestrator.js +40 -5
  31. package/src/core/SamplingClient.js +191 -0
  32. package/src/core/StealthBrowserManager.js +248 -2
  33. package/src/core/WebhookDispatcher.js +18 -10
  34. package/src/prompts/PromptRegistry.js +199 -0
  35. package/src/resources/ResourceRegistry.js +273 -0
  36. package/src/server/withAuth.js +25 -0
  37. package/src/skills/crawlforge-cli.md +157 -0
  38. package/src/skills/crawlforge-mcp.md +80 -0
  39. package/src/skills/crawlforge-research.md +104 -0
  40. package/src/skills/crawlforge-stealth.md +98 -0
  41. package/src/skills/installer.js +141 -0
  42. package/src/tools/advanced/batchScrape/index.js +30 -0
  43. package/src/tools/advanced/batchScrape/schema.js +1 -1
  44. package/src/tools/basic/extractText.js +19 -8
  45. package/src/tools/crawl/crawlDeep.js +27 -0
  46. package/src/tools/extract/extractContent.js +5 -17
  47. package/src/tools/extract/extractStructured.js +8 -0
  48. package/src/tools/extract/extractWithLlm.js +25 -5
  49. package/src/tools/extract/processDocument.js +7 -1
  50. package/src/tools/extract/summarizeContent.js +17 -0
  51. package/src/tools/research/deepResearch.js +34 -0
  52. package/src/tools/templates/ScrapeTemplateTool.js +68 -0
  53. package/src/tools/templates/TemplateRegistry.js +311 -0
  54. package/src/utils/Logger.js +15 -0
  55. package/src/utils/htmlToMarkdown.js +54 -0
  56. package/src/utils/secretMask.js +86 -0
@@ -8,6 +8,15 @@
8
8
  */
9
9
 
10
10
  import { fetchAndParse } from './_fetchAndParse.js';
11
+ // D1.3: SamplingClient for MCP sampling fallback (lazy — only imported if needed)
12
+ let _SamplingClient = null;
13
/**
 * Resolve the SamplingClient export, importing its module on first use.
 *
 * The export is cached in the module-level `_SamplingClient` variable so the
 * dynamic import only runs while the cache is still empty.
 *
 * @returns {Promise<Function>} the `SamplingClient` export of ../../core/SamplingClient.js
 */
async function getSamplingClient() {
  if (_SamplingClient) {
    return _SamplingClient;
  }
  const { SamplingClient } = await import('../../core/SamplingClient.js');
  _SamplingClient = SamplingClient;
  return _SamplingClient;
}
11
20
 
12
21
  // ── Constants ─────────────────────────────────────────────────────────────────
13
22
 
@@ -297,14 +306,25 @@ export class ExtractWithLlm {
297
306
 
298
307
  const userMessage = buildUserMessage(prompt, text, schema);
299
308
 
300
- // Step 2: First LLM call
301
- let rawText, usage;
309
+ // Step 2: First LLM call — with sampling fallback for 'auto' provider
310
+ // Fallback chain: Ollama → API key (handled by resolveProvider) → sampling → error
311
+ let rawText, usage, resolvedModel = model;
302
312
  try {
303
313
  ({ rawText, usage } = await callLLM({
304
314
  provider, apiKey, model, systemMessage, userMessage, maxTokens, schema
305
315
  }));
306
316
  } catch (llmErr) {
307
- return { success: false, error: `LLM call failed: ${llmErr.message}` };
317
+ // D1.3: If provider is 'auto'/'ollama' and it failed, try sampling as final fallback
318
+ if (providerParam === 'auto' || providerParam === 'ollama') {
319
+ try {
320
+ ({ rawText, usage } = await callViaSampling({ systemMessage, userMessage, maxTokens }));
321
+ resolvedModel = 'sampling';
322
+ } catch (samplingErr) {
323
+ return { success: false, error: `LLM call failed: ${llmErr.message}. Sampling fallback also failed: ${samplingErr.message}` };
324
+ }
325
+ } else {
326
+ return { success: false, error: `LLM call failed: ${llmErr.message}` };
327
+ }
308
328
  }
309
329
 
310
330
  // Step 3: Parse JSON; retry once with stricter prompt if it fails
@@ -345,8 +365,8 @@ export class ExtractWithLlm {
345
365
  return {
346
366
  success: true,
347
367
  data: parsed,
348
- provider,
349
- model,
368
+ provider: resolvedModel === 'sampling' ? 'sampling' : provider,
369
+ model: resolvedModel || model,
350
370
  usage
351
371
  };
352
372
  }
@@ -8,6 +8,7 @@ import { PDFProcessor } from '../../core/processing/PDFProcessor.js';
8
8
  import { ContentProcessor } from '../../core/processing/ContentProcessor.js';
9
9
  import { BrowserProcessor } from '../../core/processing/BrowserProcessor.js';
10
10
  import { HTMLCleaner, ContentQualityAssessor } from '../../utils/contentUtils.js';
11
+ import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js'; // D3.1
11
12
 
12
13
  const ProcessDocumentSchema = z.object({
13
14
  source: z.string().min(1),
@@ -28,7 +29,7 @@ const ProcessDocumentSchema = z.object({
28
29
  // Processing options
29
30
  assessContentQuality: z.boolean().default(true),
30
31
  includeStatistics: z.boolean().default(true),
31
- outputFormat: z.enum(['text', 'structured', 'full']).default('structured'),
32
+ outputFormat: z.enum(['text', 'structured', 'full', 'markdown']).default('structured'),
32
33
 
33
34
  // Content filtering
34
35
  minContentLength: z.number().min(0).default(50),
@@ -328,6 +329,11 @@ export class ProcessDocumentTool {
328
329
  result.content.html = html;
329
330
  }
330
331
 
332
+ // D3.1: Markdown output mode — convert extracted HTML to markdown via Turndown
333
+ if (options.outputFormat === 'markdown') {
334
+ result.content.markdown = htmlToMarkdown(extractedContent || html);
335
+ }
336
+
331
337
  // Step 4: Set metadata
332
338
  if (processingResult.metadata) {
333
339
  result.metadata = {
@@ -4,6 +4,15 @@
4
4
  */
5
5
 
6
6
  import { z } from 'zod';
7
+ // D1.3: lazy SamplingClient for abstractive mode when no LLM keys are set
8
+ let _SamplingClient = null;
9
/**
 * Return the SamplingClient export, loading ../../core/SamplingClient.js lazily.
 *
 * The first successful import is stored in the module-level `_SamplingClient`
 * variable; later calls return that stored value without importing again.
 *
 * @returns {Promise<Function>} the `SamplingClient` export
 */
async function getSamplingClient() {
  if (_SamplingClient) {
    return _SamplingClient;
  }
  const samplingModule = await import('../../core/SamplingClient.js');
  _SamplingClient = samplingModule.SamplingClient;
  return _SamplingClient;
}
7
16
  import { ContentAnalyzer } from '../../core/analysis/ContentAnalyzer.js';
8
17
  import { splitSentences } from '../../core/analysis/sentenceUtils.js';
9
18
 
@@ -122,6 +131,14 @@ export class SummarizeContentTool {
122
131
  // Step 2: Set summary result
123
132
  result.summary = analysisResult.summary;
124
133
 
134
+ // D1.3: If abstractive mode requested, attempt sampling-based enhancement
135
+ if (options.summaryType === 'abstractive') {
136
+ const abstractive = await this._abstractiveSummaryViaSampling(text, analysisResult.summary, options.summaryLength);
137
+ if (abstractive) {
138
+ result.summary = abstractive;
139
+ }
140
+ }
141
+
125
142
  // Step 3: Extract key points if requested
126
143
  if (options.includeKeypoints) {
127
144
  result.keypoints = await this.extractKeyPoints(text, analysisResult.summary);
@@ -1,4 +1,6 @@
1
1
  import { z } from 'zod';
2
+ // D1.4: Elicitation helper (injected from server.js or can be used standalone)
3
+ import { ElicitationHelper } from '../../core/ElicitationHelper.js';
2
4
  import { ResearchOrchestrator } from '../../core/ResearchOrchestrator.js';
3
5
  import { Logger } from '../../utils/Logger.js';
4
6
 
@@ -93,6 +95,17 @@ export class DeepResearchTool {
93
95
  cacheTTL,
94
96
  ...orchestratorOptions
95
97
  };
98
+ // D1.4: Elicitation helper (set mcpServer via setMcpServer() after instantiation)
99
+ this._elicitation = new ElicitationHelper({});
100
+ }
101
+
102
+ /**
103
+ * D1.4: Set the MCP server instance for elicitation support.
104
+ * Call this from server.js after instantiating DeepResearchTool.
105
+ * @param {object} mcpServer
106
+ */
107
+ setMcpServer(mcpServer) {
108
+ this._elicitation = new ElicitationHelper({ mcpServer });
96
109
  }
97
110
 
98
111
  async execute(params) {
@@ -116,6 +129,27 @@ export class DeepResearchTool {
116
129
  };
117
130
  }
118
131
 
132
+ // D1.4: Elicitation — warn user if projected cost exceeds 50 credits
133
+ // deep_research costs approximately 1 credit per URL; maxUrls > 50 → confirm
134
+ if (validated.maxUrls > 50) {
135
+ const projectedCredits = validated.maxUrls;
136
+ const proceed = await this._elicitation.confirm(
137
+ `deep_research will scan up to ${validated.maxUrls} URLs, projecting ~${projectedCredits} credits.`,
138
+ {
139
+ topic: validated.topic,
140
+ projected_credits: projectedCredits,
141
+ max_urls: validated.maxUrls,
142
+ }
143
+ );
144
+ if (!proceed) {
145
+ return {
146
+ success: false,
147
+ error: 'Research cancelled by user before starting (elicitation declined).',
148
+ sessionId,
149
+ };
150
+ }
151
+ }
152
+
119
153
  // Configure research orchestrator based on research approach
120
154
  const orchestratorConfig = this.buildOrchestratorConfig(validated);
121
155
  const orchestrator = new ResearchOrchestrator(orchestratorConfig);
@@ -0,0 +1,68 @@
1
+ /**
2
+ * ScrapeTemplateTool — wraps TemplateRegistry to expose the `scrape_template` MCP tool.
3
+ *
4
+ * Usage pattern (D3.3):
5
+ * const tool = new ScrapeTemplateTool();
6
+ * const result = await tool.execute({ template: "github-repo", url: "https://github.com/user/repo" });
7
+ */
8
+
9
+ import { TemplateRegistry } from './TemplateRegistry.js';
10
+
11
export class ScrapeTemplateTool {
  /** Create the tool with its own TemplateRegistry instance. */
  constructor() {
    this.registry = new TemplateRegistry();
  }

  /**
   * Execute the scrape_template tool.
   *
   * When `template` is "list", or no `url` is given, no network request is
   * made and the available templates are returned instead.
   *
   * @param {{ template: string, url: string, timeout?: number }} params
   *   `timeout` is in milliseconds and bounds the whole download
   *   (response headers AND body).
   * @returns {Promise<object>} `{ templates, count }` in list mode, otherwise
   *   the result of `registry.run()` for the fetched page.
   * @throws {Error} if the template is unknown, the response status is not OK,
   *   or the request exceeds `timeout`.
   */
  async execute({ template, url, timeout = 15000 }) {
    // list mode — return available templates without scraping
    if (template === 'list' || !url) {
      const templates = this.registry.list();
      return { templates, count: templates.length };
    }

    // Validate template exists before making network call
    if (!this.registry.get(template)) {
      const available = this.registry.list().map(t => t.id).join(', ');
      throw new Error(`Unknown template "${template}". Available templates: ${available}`);
    }

    // Fetch the page
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    let html;
    try {
      const response = await fetch(url, {
        signal: controller.signal,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; CrawlForge-TemplateScraper/4.0)'
        }
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      // The abort timer is still armed here on purpose: a server that sends
      // headers and then stalls the body must not hang the tool forever.
      html = await response.text();
    } catch (error) {
      if (error.name === 'AbortError') {
        throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
      }
      throw error;
    } finally {
      // Single cleanup point for success, HTTP error, and abort paths.
      clearTimeout(timeoutId);
    }

    // Run the template extractor
    return this.registry.run(template, html, url);
  }
}
67
+
68
+ export default ScrapeTemplateTool;
@@ -0,0 +1,311 @@
1
+ /**
2
+ * TemplateRegistry — pre-built scraping templates for popular sites (D3.3).
3
+ *
4
+ * Each template is a self-contained object with:
5
+ * id — unique slug used as the `template` parameter
6
+ * name — human-readable name
7
+ * description — when to use this template
8
+ * targetPattern — regex matching URLs this template handles
9
+ * extract — function($) → Object; receives the Cheerio-loaded page and returns the scraped fields
10
+ * (CSS selectors and any cleanup live inside each template's extract function)
11
+ *
12
+ * Templates do NOT make network calls. The ScrapeTemplateTool fetches the
13
+ * page and passes the parsed HTML to the template's extract() method.
14
+ */
15
+
16
+ import { load } from 'cheerio';
17
+
18
+ // ── Helpers ──────────────────────────────────────────────────────────────────
19
+
20
/**
 * Trimmed text of the first element matching `sel`.
 * @param {Function} $ - Cheerio-style selector function
 * @param {string} sel - CSS selector
 * @returns {string|null} the trimmed text, or null when it is empty
 */
function text($, sel) {
  const firstMatch = $(sel).first();
  const content = firstMatch.text().trim();
  return content || null;
}
23
+
24
/**
 * Value of `attribute` on the first element matching `sel`.
 * @param {Function} $ - Cheerio-style selector function
 * @param {string} sel - CSS selector
 * @param {string} attribute - attribute name to read
 * @returns {string|null} the attribute value, or null when it is missing or empty
 */
function attr($, sel, attribute) {
  const firstMatch = $(sel).first();
  const value = firstMatch.attr(attribute);
  return value || null;
}
27
+
28
/**
 * Trimmed text of every element matching `sel`, with empty entries dropped.
 * @param {Function} $ - Cheerio-style selector function
 * @param {string} sel - CSS selector
 * @returns {string[]} non-empty trimmed texts in match order
 */
function list($, sel) {
  const texts = $(sel)
    .map((_, el) => $(el).text().trim())
    .get();
  return texts.filter((entry) => Boolean(entry));
}
31
+
32
/**
 * Value of `attribute` for every element matching `sel`, with falsy values dropped.
 * @param {Function} $ - Cheerio-style selector function
 * @param {string} sel - CSS selector
 * @param {string} attribute - attribute name to read
 * @returns {string[]} attribute values in match order
 */
function listAttr($, sel, attribute) {
  const values = $(sel)
    .map((_, el) => $(el).attr(attribute))
    .get();
  return values.filter((entry) => Boolean(entry));
}
35
+
36
+ // ── Template definitions ─────────────────────────────────────────────────────
37
+
38
/**
 * Built-in scraping templates.
 *
 * Every entry exposes `id`, `name`, `description`, `targetPattern` (a RegExp
 * describing the URLs the template is meant for) and `extract($)`, which takes
 * a Cheerio-style selector function for the already-loaded page and returns a
 * plain object of scraped fields. Scalar fields that cannot be found are
 * generally null; list fields are empty arrays.
 */
const TEMPLATES = [
  {
    id: 'amazon-product',
    name: 'Amazon Product',
    description: 'Scrape an Amazon product page for title, price, rating, reviews, ASIN, and description.',
    targetPattern: /amazon\.(com|co\.uk|de|fr|jp|ca|com\.au)/i,
    extract($) {
      return {
        title: text($, '#productTitle'),
        price: text($, '.a-price .a-offscreen') || text($, '#priceblock_ourprice') || text($, '#priceblock_dealprice'),
        currency: attr($, 'meta[itemprop="priceCurrency"]', 'content'),
        rating: text($, '#acrPopover .a-size-base'),
        review_count: text($, '#acrCustomerReviewText'),
        // The ASIN is carried in an <input>'s value attribute; an input has no
        // text content, so reading it with text() could never succeed.
        asin: attr($, 'input#ASIN', 'value') || attr($, 'input[name="ASIN"]', 'value'),
        brand: text($, '#bylineInfo'),
        description: text($, '#productDescription p') || text($, '#feature-bullets'),
        images: listAttr($, '#altImages img.a-thumbnail-image', 'src').slice(0, 8),
        availability: text($, '#availability span'),
        category_breadcrumb: list($, '#wayfinding-breadcrumbs_feature_div a')
      };
    }
  },

  {
    id: 'linkedin-profile',
    name: 'LinkedIn Profile',
    description: 'Scrape a LinkedIn public profile for name, headline, location, and about section.',
    targetPattern: /linkedin\.com\/in\//i,
    extract($) {
      return {
        name: text($, 'h1') || text($, '.top-card-layout__title'),
        headline: text($, '.top-card-layout__headline') || text($, 'h2'),
        location: text($, '.top-card-layout__first-subline') || text($, '.profile-info-subheader'),
        about: text($, '.core-section-container__content p') || text($, '.summary'),
        connections: text($, '.top-card__connections'),
        current_company: text($, '.top-card-layout__card-inner-full-width .top-card-link'),
        note: 'LinkedIn requires authentication for full profiles. This template works on public profile pages only.'
      };
    }
  },

  {
    id: 'github-repo',
    name: 'GitHub Repository',
    description: 'Scrape a GitHub repository page for stars, forks, description, language, topics, and README summary.',
    targetPattern: /github\.com\/[^/]+\/[^/]+\/?$/i,
    extract($) {
      return {
        name: text($, 'strong[itemprop="name"] a') || text($, '.repository-content h1'),
        description: attr($, 'meta[property="og:description"]', 'content') || text($, 'p.f4.my-3'),
        stars: text($, '#repo-stars-counter-star') || text($, '[aria-label*="stargazers"]'),
        forks: text($, '#repo-network-counter') || text($, '[aria-label*="forks"]'),
        watchers: text($, '[aria-label*="watchers"]'),
        language: text($, 'span[itemprop="programmingLanguage"]') || text($, '.d-inline-flex[class*="language"]'),
        topics: list($, 'a.topic-tag'),
        license: text($, 'a[href*="blob/"][href*="LICENSE"]') || text($, '.octicon-law ~ span'),
        last_updated: attr($, 'relative-time', 'datetime'),
        homepage: attr($, 'a[href][rel="noopener noreferrer"]', 'href'),
        open_issues: text($, '.Counter[aria-label*="issue"]')
      };
    }
  },

  {
    id: 'youtube-video',
    name: 'YouTube Video',
    description: 'Scrape a YouTube video page for title, channel, views, likes, publish date, and description.',
    targetPattern: /youtube\.com\/watch/i,
    extract($) {
      // The canonical href may be missing, relative, or malformed. Resolve it
      // against a base and guard the parse so one bad link cannot abort the
      // whole extraction.
      let videoId = null;
      const canonicalHref = $('link[rel="canonical"]').attr('href');
      if (canonicalHref) {
        try {
          videoId = new URL(canonicalHref, 'https://www.youtube.com').searchParams.get('v');
        } catch {
          videoId = null;
        }
      }

      return {
        title: attr($, 'meta[name="title"]', 'content') || attr($, 'meta[property="og:title"]', 'content'),
        channel: attr($, 'link[itemprop="name"]', 'content') || text($, '#channel-name'),
        channel_url: attr($, 'span[itemprop="author"] link[itemprop="url"]', 'href'),
        views: attr($, 'meta[itemprop="interactionCount"]', 'content'),
        published: attr($, 'meta[itemprop="uploadDate"]', 'content') || attr($, 'meta[itemprop="datePublished"]', 'content'),
        description: attr($, 'meta[property="og:description"]', 'content'),
        thumbnail: attr($, 'meta[property="og:image"]', 'content'),
        duration: attr($, 'meta[itemprop="duration"]', 'content'),
        video_id: videoId
      };
    }
  },

  {
    id: 'tweet',
    name: 'Tweet / X Post',
    description: 'Scrape a tweet/X post for text, author, timestamp, likes, and retweets from the Open Graph / structured data.',
    targetPattern: /(twitter|x)\.com\/[^/]+\/status\//i,
    extract($) {
      return {
        text: attr($, 'meta[property="og:description"]', 'content'),
        author: attr($, 'meta[property="og:title"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content') || attr($, 'link[rel="canonical"]', 'href'),
        image: attr($, 'meta[property="og:image"]', 'content'),
        note: 'X.com requires JavaScript rendering for full tweet data. Structured metadata is returned from static HTML.'
      };
    }
  },

  {
    id: 'reddit-thread',
    name: 'Reddit Thread',
    description: 'Scrape a Reddit thread for title, subreddit, score, comment count, author, and top-level comments.',
    targetPattern: /reddit\.com\/r\/[^/]+\/comments\//i,
    extract($) {
      return {
        title: attr($, 'meta[property="og:title"]', 'content') || text($, 'h1'),
        // Falls back to the "r/<name>" fragment of <title>; null (not '') when
        // neither source yields a name, matching every other field.
        subreddit: text($, 'a[href*="/r/"][class*="subreddit"]') || (($('title').text().match(/r\/([^•]+)/) || [])[1] || '').trim() || null,
        score: text($, '[data-score]') || attr($, '[itemprop="upvoteCount"]', 'content'),
        author: text($, 'a[href*="/user/"]'),
        posted: attr($, 'time[datetime]', 'datetime'),
        body: text($, 'div[data-click-id="text"] p') || attr($, 'meta[property="og:description"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content'),
        flair: text($, '[class*="flair"]')
      };
    }
  },

  {
    id: 'hacker-news-front-page',
    name: 'Hacker News Front Page',
    description: 'Scrape the Hacker News front page for a list of stories with title, URL, score, and comment count.',
    targetPattern: /news\.ycombinator\.com(\/news)?$/i,
    extract($) {
      const stories = [];
      $('tr.athing').each((_, el) => {
        const $row = $(el);
        // The score/author/comments live in the row immediately after the
        // story row. `.next('.spacer')` only matches when that immediate
        // sibling itself has class "spacer", which left these fields null.
        // NOTE(review): assumes the subtext row directly follows tr.athing — confirm against live markup.
        const $meta = $row.next();
        const $subtext = $meta.find('.subtext');
        const $age = $subtext.find('.age');
        const $titleLink = $row.find('.titleline > a');
        stories.push({
          id: $row.attr('id'),
          title: $titleLink.text().trim(),
          url: $titleLink.attr('href'),
          site: $row.find('.sitebit a').text().trim() || null,
          // Strip both "N points" and the singular "1 point".
          score: $meta.find('.score').text().replace(/\s*points?$/i, '').trim() || null,
          author: $subtext.find('.hnuser').text().trim() || null,
          // Prefer the timestamp in the age element's title attribute; fall
          // back to the item link the original implementation returned.
          posted: $age.attr('title') || $age.find('a').attr('href') || null,
          comments: $subtext.find('a[href*="item"]').last().text().trim() || null
        });
      });
      return { stories: stories.slice(0, 30), scraped_at: new Date().toISOString() };
    }
  },

  {
    id: 'producthunt-launch',
    name: 'Product Hunt Launch',
    description: 'Scrape a Product Hunt product page for name, tagline, vote count, topics, and maker details.',
    targetPattern: /producthunt\.com\/posts\//i,
    extract($) {
      return {
        name: attr($, 'meta[property="og:title"]', 'content'),
        tagline: attr($, 'meta[property="og:description"]', 'content'),
        image: attr($, 'meta[property="og:image"]', 'content'),
        url: attr($, 'meta[property="og:url"]', 'content'),
        votes: text($, '[data-test="vote-button"] span') || text($, 'button[data-vote-button]'),
        topics: list($, 'a[href*="/topics/"]'),
        website: attr($, 'a[data-test="product-link"]', 'href') || attr($, 'a[href][rel="noopener"][target="_blank"]', 'href')
      };
    }
  },

  {
    id: 'stackoverflow-question',
    name: 'Stack Overflow Question',
    description: 'Scrape a Stack Overflow question for title, body, votes, tags, answers, and accepted answer.',
    targetPattern: /stackoverflow\.com\/questions\//i,
    extract($) {
      const answers = [];
      $('.answer').each((_, el) => {
        const $a = $(el);
        answers.push({
          votes: $a.find('[itemprop="upvoteCount"]').attr('content') || $a.find('.js-vote-count').text().trim(),
          accepted: $a.hasClass('accepted-answer'),
          // Answer bodies are truncated to keep the payload small.
          body: $a.find('.s-prose').first().text().trim().slice(0, 500)
        });
      });

      return {
        title: text($, '#question-header h1'),
        body: text($, '.question .s-prose'),
        votes: text($, '.question .js-vote-count') || attr($, '.question [itemprop="upvoteCount"]', 'content'),
        views: text($, '.js-view-count') || attr($, 'meta[name="twitter:data1"]', 'content'),
        tags: list($, '.post-tag'),
        author: text($, '.question .user-details a'),
        asked: attr($, '.question time', 'datetime'),
        answers: answers.slice(0, 5),
        answered: $('div.accepted-answer').length > 0
      };
    }
  },

  {
    id: 'npm-package',
    name: 'npm Package',
    description: 'Scrape an npm package page for name, version, description, weekly downloads, license, and dependencies.',
    targetPattern: /npmjs\.com\/package\//i,
    extract($) {
      const scripts = [];
      $('script[type="application/ld+json"]').each((_, el) => {
        // Best effort: a malformed JSON-LD block is skipped, not fatal.
        try { scripts.push(JSON.parse($(el).html())); } catch {}
      });
      const ld = scripts[0] || {};
      const packageName = ld.name || text($, 'h1');

      return {
        name: text($, 'h1') || ld.name,
        version: text($, 'h3[data-testid="package-version-number"]') || text($, '[class*="version"]'),
        description: attr($, 'meta[name="description"]', 'content') || text($, 'p[class*="description"]'),
        license: text($, 'span[class*="license"]') || text($, '[data-cy="license"]') || ld.license,
        weekly_downloads: text($, 'span[class*="weekly-downloads"]') || text($, '[data-cy="downloads"]'),
        // A bare "npm install" is not a usable command, so report null when
        // the package name could not be determined.
        install_command: packageName ? `npm install ${packageName}`.trim() : null,
        homepage: attr($, 'a[href][class*="homepage"]', 'href'),
        repository: attr($, 'a[href*="github.com"]', 'href'),
        maintainers: list($, 'a[href*="/~"]')
      };
    }
  }
];
257
+
258
+ // ── Registry ─────────────────────────────────────────────────────────────────
259
+
260
export class TemplateRegistry {
  /** Index the built-in TEMPLATES by id for lookup. */
  constructor() {
    this._templates = new Map();
    for (const entry of TEMPLATES) {
      this._templates.set(entry.id, entry);
    }
  }

  /**
   * Describe every built-in template.
   * @returns {{ id: string, name: string, description: string, targetPattern: string }[]}
   *   one summary per template; `targetPattern` is the regex rendered as a string
   */
  list() {
    return TEMPLATES.map((entry) => {
      const { id, name, description } = entry;
      const targetPattern = entry.targetPattern.toString();
      return { id, name, description, targetPattern };
    });
  }

  /**
   * Fetch a template definition by its ID.
   * @param {string} id
   * @returns {object|undefined} the template, or undefined when the ID is not registered
   */
  get(id) {
    return this._templates.get(id);
  }

  /**
   * Load raw HTML with cheerio and run the named template's extractor on it.
   * @param {string} id — template ID
   * @param {string} html — raw HTML of the target page
   * @param {string} url — original URL (echoed back in the result)
   * @returns {Promise<{ template: string, template_name: string, url: string, data: object, extractedAt: string }>}
   * @throws {Error} when no template is registered under `id`
   */
  async run(id, html, url) {
    const template = this.get(id);
    if (!template) {
      const knownIds = TEMPLATES.map(t => t.id).join(', ');
      throw new Error(`Unknown template: "${id}". Available: ${knownIds}`);
    }

    const data = template.extract(load(html));
    const extractedAt = new Date().toISOString();

    return {
      template: id,
      template_name: template.name,
      url,
      data,
      extractedAt
    };
  }
}
310
+
311
+ export default TemplateRegistry;
@@ -4,6 +4,7 @@
4
4
  */
5
5
 
6
6
  import winston from 'winston';
7
+ import { maskSecrets } from './secretMask.js';
7
8
  import { fileURLToPath } from 'url';
8
9
  import { dirname, join } from 'path';
9
10
  import { existsSync, mkdirSync } from 'fs';
@@ -70,7 +71,21 @@ export class Logger {
70
71
  * @returns {winston.Format} Winston format
71
72
  */
72
73
  createFormat(enableJson) {
74
+ // D2.9: global secret masking format applied first
75
+ const secretMaskFormat = winston.format((info) => {
76
+ if (info.metadata) info.metadata = maskSecrets(info.metadata);
77
+ if (typeof info.message === 'string') {
78
+ // lightweight heuristic mask on the message string itself
79
+ info.message = info.message
80
+ .replace(/(Bearer\s+)\S+/gi, '$1[REDACTED]')
81
+ .replace(/(api[_-]?key[:=]\s*)\S+/gi, '$1[REDACTED]')
82
+ .replace(/(x-api-key[:=]\s*)\S+/gi, '$1[REDACTED]');
83
+ }
84
+ return info;
85
+ })();
86
+
73
87
  const formats = [
88
+ secretMaskFormat,
74
89
  winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss.SSS' }),
75
90
  winston.format.errors({ stack: true }),
76
91
  winston.format.metadata({ fillExcept: ['message', 'level', 'timestamp', 'service'] })
@@ -0,0 +1,54 @@
1
+ /**
2
+ * htmlToMarkdown -- thin wrapper around the Turndown HTML-to-Markdown library.
3
+ *
4
+ * Usage:
5
+ * import { htmlToMarkdown } from '../../utils/htmlToMarkdown.js';
6
+ * const md = htmlToMarkdown(rawHtml);
7
+ *
8
+ * Design notes:
9
+ * - Turndown is the most widely-used, battle-tested HTML->Markdown converter.
10
+ * - We configure it with sensible defaults for RAG workflows:
11
+ * headingStyle: 'atx' -> # H1 / ## H2 instead of underline style
12
+ * codeBlockStyle: 'fenced' -> triple-backtick fences
13
+ * bulletListMarker: '-'
14
+ * - Tables fall back to prose (no GFM plugin loaded by default).
15
+ */
16
+
17
+ import TurndownService from 'turndown';
18
+
19
+ let _td = null;
20
+
21
/**
 * Return the shared TurndownService instance, creating it on first call.
 *
 * The instance is stored in the module-level `_td` variable and configured to
 * drop script/style/nav/footer/aside/noscript elements during conversion.
 *
 * @returns {TurndownService}
 */
function getTurndown() {
  if (_td !== null) {
    return _td;
  }

  const options = {
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    bulletListMarker: '-',
    emDelimiter: '_',
    strongDelimiter: '**',
    hr: '---',
    linkStyle: 'inlined'
  };
  _td = new TurndownService(options);

  // Remove boilerplate elements before converting
  const boilerplateTags = ['script', 'style', 'nav', 'footer', 'aside', 'noscript'];
  _td.remove(boilerplateTags);

  return _td;
}
38
+
39
/**
 * Convert an HTML string to Markdown.
 *
 * Returns an empty string if `html` is falsy. If the Turndown conversion
 * throws, falls back to a plain-text rendering: tags are replaced by spaces
 * and whitespace is collapsed.
 *
 * @param {string} html
 * @returns {string} trimmed Markdown, or tag-stripped plain text on failure
 */
export function htmlToMarkdown(html) {
  if (!html) return '';
  try {
    return getTurndown().turndown(html).trim();
  } catch {
    // Fallback: strip tags, return plain text. Coerce first, because a
    // non-string input is exactly what can make the conversion throw, and
    // calling .replace on it here would raise a second, uncaught TypeError.
    return String(html).replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
  }
}