brave-real-browser-mcp-server 2.15.5 → 2.15.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -67,73 +67,6 @@ export async function handleBatchElementScraper(args) {
67
67
  };
68
68
  }, 'Failed to batch scrape elements');
69
69
  }
70
/**
 * Extracts data from the current page while keeping parent-child relationships.
 *
 * Each element matching `parentSelector` is paired with its descendants that
 * match `childSelector`. Parents with no matching children are skipped and do
 * not count towards `maxParents`.
 *
 * @param {object} args - Tool arguments.
 * @param {string} args.parentSelector - CSS selector for the parent elements.
 * @param {string} args.childSelector - CSS selector evaluated inside each parent.
 * @param {number} [args.maxParents=50] - Maximum number of parent/children groups to return.
 * @returns {Promise<object>} Whatever `withErrorHandling` resolves to; the wrapped
 *   operation yields `{ content: [{ type: 'text', text }] }`.
 */
export async function handleNestedDataExtraction(args) {
    return await withErrorHandling(async () => {
        validateWorkflow('nested_data_extraction', {
            requireBrowser: true,
            requirePage: true,
        });
        const { parentSelector, childSelector } = args;
        // Without this check an omitted selector is stringified by the DOM API
        // ("undefined") and the tool quietly reports zero results.
        for (const [name, value] of [['parentSelector', parentSelector], ['childSelector', childSelector]]) {
            if (typeof value !== 'string' || value.trim() === '') {
                throw new Error(`${name} is required and must be a non-empty string`);
            }
        }
        const maxParents = args.maxParents || 50;
        const page = getCurrentPage();
        const nestedData = await page.evaluate(({ parentSelector, childSelector, maxParents }) => {
            // Helpers live inside the callback because it runs in the page context.
            const collectAttributes = (element) => {
                const attributes = {};
                for (const attr of Array.from(element.attributes)) {
                    attributes[attr.name] = attr.value;
                }
                return attributes;
            };
            // Only the parent's own text nodes, so child text is not duplicated.
            const ownText = (element) => Array.from(element.childNodes)
                .filter((node) => node.nodeType === Node.TEXT_NODE)
                .map((node) => node.textContent?.trim())
                .filter((text) => text)
                .join(' ');
            const results = [];
            for (const parent of Array.from(document.querySelectorAll(parentSelector))) {
                // Stop scanning as soon as the limit is reached.
                if (results.length >= maxParents) {
                    break;
                }
                const children = Array.from(parent.querySelectorAll(childSelector), (child) => ({
                    selector: childSelector,
                    text: child.textContent?.trim() || '',
                    attributes: collectAttributes(child),
                }));
                if (children.length === 0) {
                    continue;
                }
                results.push({
                    parent: {
                        selector: parentSelector,
                        text: ownText(parent),
                        attributes: collectAttributes(parent),
                    },
                    children,
                });
            }
            return results;
        }, { parentSelector, childSelector, maxParents });
        return {
            content: [
                {
                    type: 'text',
                    text: `✅ Extracted ${nestedData.length} parent-child relationships\n\n${JSON.stringify(nestedData, null, 2)}`,
                },
            ],
        };
    }, 'Failed to extract nested data');
}
137
70
  /**
138
71
  * सभी elements के attributes (href, src, data-*) collect करता है
139
72
  */
@@ -143,3 +143,62 @@ async function withWorkflowValidation(toolName, args, operation) {
143
143
  throw error;
144
144
  }
145
145
  }
146
/**
 * Extracts breadcrumb navigation trails from the page currently loaded in the browser.
 *
 * Every element matching the breadcrumb selector is scanned for anchor tags.
 * Each container with at least one anchor becomes one trail, made of a readable
 * `path` (link texts joined with " > ") and the ordered list of links.
 *
 * @param {object} args - Tool arguments.
 * @param {string} [args.breadcrumbSelector] - CSS selector for breadcrumb containers;
 *   defaults to `.breadcrumb, nav[aria-label="breadcrumb"], .breadcrumbs`.
 * @param {boolean} [args.followLinks=false] - When truthy, the URLs of the first trail
 *   are appended to the response so the caller can visit them next.
 * @returns {Promise<object>} Whatever `withWorkflowValidation` resolves to; the wrapped
 *   operation yields `{ content: [{ type: 'text', text }] }`.
 */
export async function handleBreadcrumbNavigator(args) {
    return await withWorkflowValidation('breadcrumb_navigator', args, async () => {
        return await withErrorHandling(async () => {
            const page = getPageInstance();
            if (!page) {
                throw new Error('Browser not initialized. Call browser_init first.');
            }
            const breadcrumbSelector = args.breadcrumbSelector || '.breadcrumb, nav[aria-label="breadcrumb"], .breadcrumbs';
            const followLinks = args.followLinks || false;
            const breadcrumbData = await page.evaluate((selector) => {
                const trails = [];
                for (const container of Array.from(document.querySelectorAll(selector))) {
                    const links = Array.from(container.querySelectorAll('a'), (anchor, level) => ({
                        text: anchor.textContent?.trim() || '',
                        href: anchor.href,
                        level,
                    }));
                    // Containers without links carry no navigable trail.
                    if (links.length > 0) {
                        trails.push({
                            path: links.map((link) => link.text).join(' > '),
                            links,
                        });
                    }
                }
                return trails;
            }, breadcrumbSelector);
            if (breadcrumbData.length === 0) {
                return {
                    content: [
                        {
                            type: 'text',
                            text: '❌ No breadcrumbs found on page',
                        },
                    ],
                };
            }
            let additionalData = '';
            if (followLinks && breadcrumbData[0]?.links) {
                // The hint must not name multi_page_scraper: that tool is removed in this
                // release, so the caller is told to visit the URLs individually.
                additionalData = `\n\n📌 To scrape breadcrumb pages, navigate to each of these URLs: ${JSON.stringify(breadcrumbData[0].links.map((l) => l.href))}`;
            }
            return {
                content: [
                    {
                        type: 'text',
                        text: `✅ Found ${breadcrumbData.length} breadcrumb trail(s)\n\n${JSON.stringify(breadcrumbData, null, 2)}${additionalData}`,
                    },
                ],
            };
        }, 'Failed to navigate breadcrumbs');
    });
}
@@ -262,124 +262,3 @@ export async function handleAdvancedCSSSelectors(args) {
262
262
  return { content: [{ type: 'text', text: `❌ CSS selector query failed: ${error.message}` }], isError: true };
263
263
  }
264
264
  }
265
/**
 * Visual Element Finder - scores every element on the page against visual criteria.
 *
 * Each satisfied criterion adds to an element's score (visibility 10; color,
 * position, hasText and inViewport 5 each; every size bound 3). Elements with a
 * score above zero are sorted by descending score and the top 20 are reported.
 *
 * @param {object} args - Tool arguments.
 * @param {string} [args.url] - When set and different from the current page URL, the page navigates there first.
 * @param {object} args.criteria - Visual criteria. Recognised keys: `visible`, `color`,
 *   `minWidth`, `maxWidth`, `minHeight`, `maxHeight`, `position`, `hasText`, `inViewport`.
 * @returns {Promise<object>} `{ content: [{ type: 'text', text }] }` on success, or the
 *   same shape with `isError: true` and the failure message when anything throws.
 */
export async function handleVisualElementFinder(args) {
    const { url, criteria } = args;
    try {
        const page = getPageInstance();
        if (!page) {
            throw new Error('Browser not initialized. Call browser_init first.');
        }
        // Fail with a clear message instead of a TypeError raised inside the page.
        if (criteria === null || typeof criteria !== 'object') {
            throw new Error('criteria must be an object');
        }
        if (url && page.url() !== url) {
            await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000 });
        }
        const results = await page.evaluate((crit) => {
            // Size bounds use an explicit null check so that 0 is a valid limit
            // (e.g. maxWidth: 0 to find collapsed elements).
            const sizeChecks = [
                ['minWidth', (rect, limit) => rect.width >= limit],
                ['maxWidth', (rect, limit) => rect.width <= limit],
                ['minHeight', (rect, limit) => rect.height >= limit],
                ['maxHeight', (rect, limit) => rect.height <= limit],
            ];
            const matches = [];
            for (const element of Array.from(document.querySelectorAll('*'))) {
                const computed = window.getComputedStyle(element);
                const rect = element.getBoundingClientRect();
                let matchScore = 0;
                const reasons = [];
                const award = (points, reason) => {
                    matchScore += points;
                    reasons.push(reason);
                };
                // Check visibility
                if (crit.visible !== undefined) {
                    const isVisible = computed.display !== 'none' &&
                        computed.visibility !== 'hidden' &&
                        rect.width > 0 &&
                        rect.height > 0;
                    if (isVisible === crit.visible) {
                        award(10, 'visibility');
                    }
                }
                // Check color (substring match against the computed color strings)
                if (crit.color &&
                    (computed.color.includes(crit.color) || computed.backgroundColor.includes(crit.color))) {
                    award(5, 'color');
                }
                // Check size
                for (const [key, satisfies] of sizeChecks) {
                    if (crit[key] != null && satisfies(rect, crit[key])) {
                        award(3, key);
                    }
                }
                // Check position
                if (crit.position && computed.position === crit.position) {
                    award(5, 'position');
                }
                // Check text content
                if (crit.hasText !== undefined) {
                    const hasText = (element.textContent?.trim().length || 0) > 0;
                    if (hasText === crit.hasText) {
                        award(5, 'hasText');
                    }
                }
                // Check if element is fully inside the viewport
                if (crit.inViewport !== undefined) {
                    const inViewport = rect.top >= 0 &&
                        rect.left >= 0 &&
                        rect.bottom <= window.innerHeight &&
                        rect.right <= window.innerWidth;
                    if (inViewport === crit.inViewport) {
                        award(5, 'inViewport');
                    }
                }
                if (matchScore === 0) {
                    continue;
                }
                matches.push({
                    element: {
                        tagName: element.tagName.toLowerCase(),
                        id: element.id,
                        // On SVG elements className is an SVGAnimatedString, not a string.
                        className: typeof element.className === 'string'
                            ? element.className
                            : (element.getAttribute('class') ?? ''),
                        text: element.textContent?.substring(0, 100)
                    },
                    score: matchScore,
                    matchedCriteria: reasons,
                    visualProperties: {
                        display: computed.display,
                        visibility: computed.visibility,
                        position: computed.position,
                        color: computed.color,
                        backgroundColor: computed.backgroundColor,
                        width: rect.width,
                        height: rect.height,
                        top: rect.top,
                        left: rect.left
                    }
                });
            }
            matches.sort((a, b) => b.score - a.score);
            return {
                totalMatches: matches.length,
                topMatches: matches.slice(0, 20)
            };
        }, criteria);
        const resultText = `✅ Visual Element Finder Results\n\nCriteria: ${JSON.stringify(criteria, null, 2)}\nTotal Matches: ${results.totalMatches}\n\nTop Matches:\n${JSON.stringify(results.topMatches, null, 2)}`;
        return {
            content: [{ type: 'text', text: resultText }],
        };
    }
    catch (error) {
        return { content: [{ type: 'text', text: `❌ Visual element finder failed: ${error.message}` }], isError: true };
    }
}
package/dist/index.js CHANGED
@@ -28,15 +28,13 @@ import { handleSaveContentAsMarkdown } from "./handlers/file-handlers.js";
28
28
  // Import new data extraction handlers
29
29
  import { handleExtractJSON, handleScrapeMetaTags, handleExtractSchema, } from "./handlers/data-extraction-handlers.js";
30
30
  // Import multi-element handlers
31
- import { handleBatchElementScraper, handleNestedDataExtraction, handleAttributeHarvester, handleLinkHarvester, handleMediaExtractor, } from "./handlers/multi-element-handlers.js";
31
+ import { handleBatchElementScraper, handleAttributeHarvester, handleLinkHarvester, handleMediaExtractor, } from "./handlers/multi-element-handlers.js";
32
32
  // Import pagination handlers
33
- import { handleMultiPageScraper, handleBreadcrumbNavigator, } from "./handlers/pagination-handlers.js";
34
- // Import data processing handlers
35
- import { handleHTMLToText, } from "./handlers/data-processing-handlers.js";
33
+ import { handleBreadcrumbNavigator, } from "./handlers/navigation-handlers.js";
36
34
  // Import AI-powered handlers
37
35
  import { handleSmartSelectorGenerator, handleContentClassification, } from "./handlers/ai-powered-handlers.js";
38
36
  // Import search & filter handlers
39
- import { handleKeywordSearch, handleRegexPatternMatcher, handleXPathSupport, handleAdvancedCSSSelectors, handleVisualElementFinder, } from "./handlers/search-filter-handlers.js";
37
+ import { handleKeywordSearch, handleRegexPatternMatcher, handleXPathSupport, handleAdvancedCSSSelectors, } from "./handlers/search-filter-handlers.js";
40
38
  // Import data quality handlers
41
39
  import { handleDataTypeValidator, } from "./handlers/data-quality-handlers.js";
42
40
  // Import captcha handlers
@@ -152,9 +150,6 @@ export async function executeToolByName(name, args) {
152
150
  case TOOL_NAMES.BATCH_ELEMENT_SCRAPER:
153
151
  result = await handleBatchElementScraper(args);
154
152
  break;
155
- case TOOL_NAMES.NESTED_DATA_EXTRACTION:
156
- result = await handleNestedDataExtraction(args);
157
- break;
158
153
  case TOOL_NAMES.ATTRIBUTE_HARVESTER:
159
154
  result = await handleAttributeHarvester(args);
160
155
  break;
@@ -167,16 +162,12 @@ export async function executeToolByName(name, args) {
167
162
  break;
168
163
  // Pagination Tools
169
164
  // Pagination Tools
170
- case TOOL_NAMES.MULTI_PAGE_SCRAPER:
171
- result = await handleMultiPageScraper(args);
172
- break;
165
+ // Pagination Tools
173
166
  case TOOL_NAMES.BREADCRUMB_NAVIGATOR:
174
167
  result = await handleBreadcrumbNavigator(args || {});
175
168
  break;
176
169
  // Data Processing Tools
177
- case TOOL_NAMES.HTML_TO_TEXT:
178
- result = await handleHTMLToText(args);
179
- break;
170
+ // Data Processing Tools
180
171
  // AI-Powered Features
181
172
  case TOOL_NAMES.SMART_SELECTOR_GENERATOR:
182
173
  result = await handleSmartSelectorGenerator(args);
@@ -197,9 +188,6 @@ export async function executeToolByName(name, args) {
197
188
  case TOOL_NAMES.ADVANCED_CSS_SELECTORS:
198
189
  result = await handleAdvancedCSSSelectors(args);
199
190
  break;
200
- case TOOL_NAMES.VISUAL_ELEMENT_FINDER:
201
- result = await handleVisualElementFinder(args);
202
- break;
203
191
  // Data Quality & Validation
204
192
  case TOOL_NAMES.DATA_TYPE_VALIDATOR:
205
193
  result = await handleDataTypeValidator(args);
@@ -381,19 +381,6 @@ export const TOOLS = [
381
381
  required: ['selector'],
382
382
  },
383
383
  },
384
- {
385
- name: 'nested_data_extraction',
386
- description: 'Extract data maintaining parent-child relationships',
387
- inputSchema: {
388
- type: 'object',
389
- properties: {
390
- parentSelector: { type: 'string' },
391
- childSelector: { type: 'string' },
392
- maxParents: { type: 'number', default: 50 },
393
- },
394
- required: ['parentSelector', 'childSelector'],
395
- },
396
- },
397
384
  {
398
385
  name: 'attribute_harvester',
399
386
  description: 'Collect attributes (href, src, data-*) from elements',
@@ -432,19 +419,6 @@ export const TOOLS = [
432
419
  },
433
420
  },
434
421
  // Pagination Tools
435
- {
436
- name: 'multi_page_scraper',
437
- description: 'Collect and merge data from multiple pages',
438
- inputSchema: {
439
- type: 'object',
440
- properties: {
441
- urls: { type: 'array', items: { type: 'string' } },
442
- dataSelector: { type: 'string' },
443
- waitBetweenPages: { type: 'number', default: 1000 },
444
- },
445
- required: ['urls', 'dataSelector'],
446
- },
447
- },
448
422
  {
449
423
  name: 'breadcrumb_navigator',
450
424
  description: 'Extract navigation path by following site structure',
@@ -457,19 +431,7 @@ export const TOOLS = [
457
431
  },
458
432
  },
459
433
  // Data Processing Tools
460
- {
461
- name: 'html_to_text',
462
- description: 'Convert HTML content to clean text',
463
- inputSchema: {
464
- type: 'object',
465
- properties: {
466
- html: { type: 'string' },
467
- preserveLinks: { type: 'boolean', default: false },
468
- preserveFormatting: { type: 'boolean', default: false },
469
- },
470
- required: ['html'],
471
- },
472
- },
434
+ // Data Validation Tools
473
435
  // Data Validation Tools
474
436
  // AI-Powered Features (5 tools)
475
437
  {
@@ -553,21 +515,6 @@ export const TOOLS = [
553
515
  required: ['selector'],
554
516
  },
555
517
  },
556
- {
557
- name: 'visual_element_finder',
558
- description: 'Find elements by visual properties',
559
- inputSchema: {
560
- type: 'object',
561
- properties: {
562
- url: { type: 'string' },
563
- criteria: {
564
- type: 'object',
565
- description: 'Visual criteria (color, size, position, etc.)'
566
- },
567
- },
568
- required: ['criteria'],
569
- },
570
- },
571
518
  // Data Quality & Validation (5 tools)
572
519
  {
573
520
  name: 'data_type_validator',
@@ -1001,7 +948,6 @@ export const TOOL_NAMES = {
1001
948
  EXTRACT_SCHEMA: 'extract_schema',
1002
949
  // Multi-Element Extractors
1003
950
  BATCH_ELEMENT_SCRAPER: 'batch_element_scraper',
1004
- NESTED_DATA_EXTRACTION: 'nested_data_extraction',
1005
951
  ATTRIBUTE_HARVESTER: 'attribute_harvester',
1006
952
  // Content Type Specific
1007
953
  LINK_HARVESTER: 'link_harvester',
@@ -1020,10 +966,8 @@ export const TOOL_NAMES = {
1020
966
  NETWORK_RECORDER: 'network_recorder',
1021
967
  API_FINDER: 'api_finder',
1022
968
  // Pagination Tools
1023
- MULTI_PAGE_SCRAPER: 'multi_page_scraper',
1024
969
  BREADCRUMB_NAVIGATOR: 'breadcrumb_navigator',
1025
970
  // Data Processing
1026
- HTML_TO_TEXT: 'html_to_text',
1027
971
  // AI-Powered Features
1028
972
  SMART_SELECTOR_GENERATOR: 'smart_selector_generator',
1029
973
  CONTENT_CLASSIFICATION: 'content_classification',
@@ -1036,7 +980,6 @@ export const TOOL_NAMES = {
1036
980
  REGEX_PATTERN_MATCHER: 'regex_pattern_matcher',
1037
981
  XPATH_SUPPORT: 'xpath_support',
1038
982
  ADVANCED_CSS_SELECTORS: 'advanced_css_selectors',
1039
- VISUAL_ELEMENT_FINDER: 'visual_element_finder',
1040
983
  // Data Quality & Validation
1041
984
  DATA_TYPE_VALIDATOR: 'data_type_validator',
1042
985
  // Advanced Captcha Handling
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "brave-real-browser-mcp-server",
3
- "version": "2.15.5",
3
+ "version": "2.15.6",
4
4
  "description": "Universal AI IDE MCP Server - Auto-detects and supports all AI IDEs (Claude Desktop, Cursor, Windsurf, Cline, Zed, VSCode, Qoder AI, etc.) with Brave browser automation",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -34,7 +34,7 @@
34
34
  "test:brave:cleanup": "taskkill /F /IM brave.exe || pkill -f brave || true"
35
35
  },
36
36
  "dependencies": {
37
- "@modelcontextprotocol/sdk": "^1.25.0",
37
+ "@modelcontextprotocol/sdk": "^1.25.1",
38
38
  "@types/turndown": "^5.0.6",
39
39
  "ajv": "^8.12.0",
40
40
  "axios": "^1.6.5",
@@ -0,0 +1,98 @@
1
+
2
+ import { handleBrowserInit, handleBrowserClose } from '../src/handlers/browser-handlers.js';
3
+ import { handleNavigate, handleWait } from '../src/handlers/navigation-handlers.js';
4
+ import { handleGetContent, handleFindSelector } from '../src/handlers/content-handlers.js';
5
+ import { handleBreadcrumbNavigator } from '../src/handlers/navigation-handlers.js';
6
+ import {
7
+ handleBatchElementScraper,
8
+ handleAttributeHarvester,
9
+ handleLinkHarvester,
10
+ handleMediaExtractor
11
+ } from '../src/handlers/multi-element-handlers.js';
12
+ import {
13
+ handleKeywordSearch,
14
+ handleRegexPatternMatcher,
15
+ handleXPathSupport,
16
+ handleAdvancedCSSSelectors
17
+ } from '../src/handlers/search-filter-handlers.js';
18
+ import { handleRandomScroll } from '../src/handlers/interaction-handlers.js';
19
+ import { handleScrapeMetaTags, handleExtractSchema } from '../src/handlers/data-extraction-handlers.js';
20
+
21
/**
 * Smoke-tests the main handler categories against a live site.
 *
 * Runs twelve steps in order (navigation, scrolling, content lookup,
 * multi-element, search/filter, schema extraction, breadcrumbs) and logs a short
 * preview of each result. Any thrown error, or any handler result flagged with
 * `isError`, fails the run: the error is logged and `process.exitCode` is set
 * to 1 so callers such as CI can detect it. The browser is always closed.
 *
 * @returns {Promise<void>}
 */
async function runFullVerification() {
    console.log('🚀 Starting Comprehensive Tool Verification...');

    // Handlers may report failure through an `isError` payload instead of throwing.
    const expectOk = (label, res) => {
        if (res?.isError) {
            throw new Error(`${label} returned an error: ${res.content?.[0]?.text ?? 'no details'}`);
        }
        return res;
    };
    const logPreview = (label, res) => {
        console.log(` Result: ${expectOk(label, res).content[0].text.substring(0, 100)}...`);
    };

    try {
        await handleBrowserInit({ headless: true });

        // Testing on one site primarily to save time
        const url = 'https://moviesdrive.forum/';
        console.log(`\n--------------------------------------------------`);
        console.log(`🔍 Targeting: ${url}`);
        console.log(`--------------------------------------------------`);

        // --- Navigation & Basic ---
        console.log(`\n[1/12] 🟢 Testing Navigation & Wait...`);
        expectOk('Navigation', await handleNavigate({ url }));
        expectOk('Wait', await handleWait({ type: 'timeout', value: '2000' }));
        console.log(' ✅ Navigation complete.');

        // --- Interaction ---
        console.log(`\n[2/12] 🟢 Testing Random Scroll...`);
        expectOk('Random Scroll', await handleRandomScroll({}));
        console.log(' ✅ Scroll complete.');

        // --- Content Handlers ---
        console.log(`\n[3/12] 🟢 Testing Find Selector (Text search)...`);
        // Assumes the word "Movie" appears somewhere on the target page.
        const findRes = expectOk('Find Selector', await handleFindSelector({ text: 'Movie' }));
        const findLength = findRes.content[0].text.length;
        console.log(` Result: Found ${findLength > 50 ? 'matches' : 'no matches'} (Length: ${findLength})`);

        // --- Multi-Element Handlers ---
        console.log(`\n[4/12] 🟢 Testing Batch Element Scraper...`);
        logPreview('Batch Element Scraper', await handleBatchElementScraper({ selector: 'a', maxElements: 3 }));

        console.log(`\n[5/12] 🟢 Testing Attribute Harvester...`);
        logPreview('Attribute Harvester', await handleAttributeHarvester({ selector: 'img', attributes: ['src'], maxElements: 3 }));

        // May find nothing on a home page, but still exercises the handler.
        console.log(`\n[6/12] 🟢 Testing Media Extractor...`);
        logPreview('Media Extractor', await handleMediaExtractor({ types: ['video', 'iframe'] }));

        // --- Search & Filter Handlers ---
        console.log(`\n[7/12] 🟢 Testing Keyword Search...`);
        logPreview('Keyword Search', await handleKeywordSearch({ keywords: ['Bollywood', 'Hollywood'] }));

        console.log(`\n[8/12] 🟢 Testing Regex Pattern Matcher...`);
        logPreview('Regex Pattern Matcher', await handleRegexPatternMatcher({ pattern: 'https?://[^\\s"\']+' }));

        console.log(`\n[9/12] 🟢 Testing XPath Support...`);
        logPreview('XPath Support', await handleXPathSupport({ xpath: '//body//div' }));

        console.log(`\n[10/12] 🟢 Testing Advanced CSS Selectors...`);
        logPreview('Advanced CSS Selectors', await handleAdvancedCSSSelectors({ selector: 'div > a', operation: 'query' }));

        // --- Data Extraction ---
        console.log(`\n[11/12] 🟢 Testing Schema Extraction...`);
        logPreview('Schema Extraction', await handleExtractSchema({}));

        // --- Pagination (Refactored) ---
        console.log(`\n[12/12] 🟢 Testing Breadcrumb Navigator...`);
        logPreview('Breadcrumb Navigator', await handleBreadcrumbNavigator({}));

        console.log('\n✅ All primary handler categories verified successfully.');
    } catch (error) {
        console.error('\n❌ Verification Failed:', error);
        // Make the failure visible to whoever launched the script.
        process.exitCode = 1;
    } finally {
        try {
            await handleBrowserClose({});
        } catch (closeError) {
            // The caller does not await this function, so a throw here would
            // surface as an unhandled rejection.
            console.error('\n⚠️ Failed to close browser:', closeError);
            process.exitCode = 1;
        }
    }
}
97
+
98
+ runFullVerification();
@@ -0,0 +1,61 @@
1
+
2
+ import { handleBrowserInit, handleBrowserClose } from '../src/handlers/browser-handlers.js';
3
+ import { handleNavigate } from '../src/handlers/navigation-handlers.js';
4
+ import { handleGetContent } from '../src/handlers/content-handlers.js';
5
+ import { handleBreadcrumbNavigator } from '../src/handlers/navigation-handlers.js';
6
+ import { handleLinkHarvester } from '../src/handlers/multi-element-handlers.js';
7
+ import { handleScrapeMetaTags } from '../src/handlers/data-extraction-handlers.js';
8
+
9
+ async function runVerification() {
10
+ console.log('🚀 Starting Verification on Live Sites...');
11
+
12
+ try {
13
+ // 1. Initialize Browser
14
+ console.log('\n🔵 Initializing Browser...');
15
+ await handleBrowserInit({ headless: true });
16
+
17
+ const sites = [
18
+ 'https://moviesdrive.forum/',
19
+ 'https://multimovies.golf/'
20
+ ];
21
+
22
+ for (const url of sites) {
23
+ console.log(`\n--------------------------------------------------`);
24
+ console.log(`🔍 Testing Site: ${url}`);
25
+ console.log(`--------------------------------------------------`);
26
+
27
+ // 2. Navigate
28
+ console.log(`\n➡️ Navigating to ${url}...`);
29
+ await handleNavigate({ url });
30
+
31
+ // 3. Get Content (HTML preview)
32
+ console.log(`\n📄 Fetching Content (Preview)...`);
33
+ const contentRes = await handleGetContent({ type: 'text' });
34
+ console.log(` Result: ${contentRes.content[0].text.substring(0, 100)}...`);
35
+
36
+ // 4. Test Breadcrumb Navigator (Newly moved)
37
+ console.log(`\nnav Testing Breadcrumb Navigator...`);
38
+ const breadcrumbRes = await handleBreadcrumbNavigator({});
39
+ console.log(` Result: ${breadcrumbRes.content[0].text.substring(0, 200)}...`);
40
+
41
+ // 5. Test Link Harvester (Existing tool)
42
+ console.log(`\n🔗 Testing Link Harvester (First 5 links)...`);
43
+ const linksRes = await handleLinkHarvester({ maxElements: 5 });
44
+ console.log(` Result: ${linksRes.content[0].text.substring(0, 200)}...`);
45
+
46
+ // 6. Test Meta Tags (Data extraction)
47
+ console.log(`\n🏷️ Testing Meta Tag Scraper...`);
48
+ const metaRes = await handleScrapeMetaTags({});
49
+ console.log(` Result: ${metaRes.content[0].text.substring(0, 200)}...`);
50
+ }
51
+
52
+ } catch (error) {
53
+ console.error('\n❌ Verification Failed:', error);
54
+ } finally {
55
+ // 7. Cleanup
56
+ console.log('\n🔴 Closing Browser...');
57
+ await handleBrowserClose({});
58
+ }
59
+ }
60
+
61
+ runVerification();
@@ -1,49 +0,0 @@
1
- // Data Processing & Transformation Handlers
2
- // Text cleaning, validation, formatting utilities
3
- // @ts-nocheck
4
- import { withErrorHandling } from '../system-utils.js';
5
/**
 * Converts an HTML string to plain text by stripping tags and decoding a small
 * set of entities.
 *
 * This is a regex-based conversion, not a full HTML parser.
 *
 * @param {object} args - Tool arguments.
 * @param {string} args.html - HTML markup to convert.
 * @param {boolean} [args.preserveLinks=false] - Rewrite `<a href="url">text</a>` as `text (url)`.
 * @param {boolean} [args.preserveFormatting=false] - Keep line breaks for `<br>` and `</p>`
 *   and render list items as bullet lines.
 * @returns {Promise<object>} Whatever `withErrorHandling` resolves to; the wrapped
 *   operation yields `{ content: [{ type: 'text', text }] }`.
 */
export async function handleHTMLToText(args) {
    return await withErrorHandling(async () => {
        const { html } = args;
        const preserveLinks = args.preserveLinks || false;
        const preserveFormatting = args.preserveFormatting || false;
        if (typeof html !== 'string') {
            throw new TypeError('html must be a string');
        }
        let text = html;
        // Preserve links if requested
        if (preserveLinks) {
            text = text.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '$2 ($1)');
        }
        // Preserve basic formatting
        if (preserveFormatting) {
            text = text.replace(/<br\s*\/?>/gi, '\n');
            text = text.replace(/<\/p>/gi, '\n\n');
            // `\b` keeps <link> out while still matching <li class="...">.
            text = text.replace(/<li\b[^>]*>/gi, '• ');
            text = text.replace(/<\/li>/gi, '\n');
        }
        // Remove all other HTML tags
        text = text.replace(/<[^>]*>/g, '');
        // Decode entities in a single pass. Sequential replacements would decode
        // "&amp;lt;" twice and turn escaped markup into a literal "<".
        const entities = { nbsp: ' ', amp: '&', lt: '<', gt: '>', quot: '"', '#39': "'" };
        text = text.replace(/&(nbsp|amp|lt|gt|quot|#39);/g, (match, name) => entities[name]);
        // Collapse runs of blank lines and trim the ends
        text = text.replace(/\n\s*\n/g, '\n\n').trim();
        return {
            content: [
                {
                    type: 'text',
                    text: `✅ HTML converted to text\n\n${text}`,
                },
            ],
        };
    }, 'Failed to convert HTML to text');
}
49
- // Duplicate Remover Arguments
@@ -1,115 +0,0 @@
1
- // Pagination & Navigation Tools
2
- // Auto pagination, infinite scroll, multi-page scraping, sitemap parser
3
- // @ts-nocheck
4
- import { getCurrentPage } from '../browser-manager.js';
5
- import { validateWorkflow } from '../workflow-validation.js';
6
- import { withErrorHandling, sleep } from '../system-utils.js';
7
/**
 * Collects data from multiple pages and merges it into one result.
 *
 * URLs are visited one after another on the current page. A failure on one URL
 * is recorded in that URL's entry and does not stop the run.
 *
 * @param {object} args - Tool arguments.
 * @param {string[]} args.urls - Pages to visit, in order.
 * @param {string} args.dataSelector - CSS selector for the elements to collect on each page.
 * @param {number} [args.waitBetweenPages=1000] - Delay in milliseconds after each navigation;
 *   0 disables the delay.
 * @returns {Promise<object>} Whatever `withErrorHandling` resolves to; the wrapped
 *   operation yields `{ content: [{ type: 'text', text }] }`.
 */
export async function handleMultiPageScraper(args) {
    return await withErrorHandling(async () => {
        validateWorkflow('multi_page_scraper', {
            requireBrowser: true,
            requirePage: true,
        });
        const { urls, dataSelector } = args;
        if (!Array.isArray(urls)) {
            throw new TypeError('urls must be an array of URL strings');
        }
        // `??` rather than `||` so an explicit 0 is honoured.
        const waitBetweenPages = args.waitBetweenPages ?? 1000;
        const page = getCurrentPage();
        const allData = [];
        let failedCount = 0;
        for (const [pageIndex, url] of urls.entries()) {
            try {
                await page.goto(url, { waitUntil: 'domcontentloaded' });
                if (waitBetweenPages > 0) {
                    await sleep(waitBetweenPages);
                }
                const pageData = await page.evaluate((selector) => {
                    const elements = document.querySelectorAll(selector);
                    return Array.from(elements).map((el) => ({
                        text: el.textContent?.trim() || '',
                        html: el.innerHTML,
                    }));
                }, dataSelector);
                allData.push({
                    url,
                    pageIndex,
                    itemCount: pageData.length,
                    data: pageData,
                });
            }
            catch (error) {
                failedCount += 1;
                allData.push({
                    url,
                    pageIndex,
                    error: error instanceof Error ? error.message : String(error),
                });
            }
        }
        // Mention failures in the headline so a partial run is not reported as a full success.
        const failureNote = failedCount > 0 ? ` (${failedCount} failed)` : '';
        return {
            content: [
                {
                    type: 'text',
                    text: `✅ Scraped ${urls.length} pages${failureNote}\n\n${JSON.stringify(allData, null, 2)}`,
                },
            ],
        };
    }, 'Failed to scrape multiple pages');
}
58
/**
 * Reads the breadcrumb trails of the current page by following the site structure markup.
 *
 * Each element matching the breadcrumb selector contributes one trail when it
 * contains at least one anchor. A trail has a `path` (link texts joined with
 * " > ") and the ordered `links`.
 *
 * @param {object} args - Tool arguments.
 * @param {string} [args.breadcrumbSelector] - CSS selector for breadcrumb containers.
 * @param {boolean} [args.followLinks=false] - When truthy, append a hint listing the first trail's URLs.
 * @returns {Promise<object>} Whatever `withErrorHandling` resolves to; the wrapped
 *   operation yields `{ content: [{ type: 'text', text }] }`.
 */
export async function handleBreadcrumbNavigator(args) {
    // Wraps a message in the text-content response shape used by this handler.
    const respond = (text) => ({
        content: [
            {
                type: 'text',
                text,
            },
        ],
    });
    const collectBreadcrumbs = async () => {
        validateWorkflow('breadcrumb_navigator', {
            requireBrowser: true,
            requirePage: true,
        });
        const page = getCurrentPage();
        const breadcrumbSelector = args.breadcrumbSelector || '.breadcrumb, nav[aria-label="breadcrumb"], .breadcrumbs';
        const followLinks = args.followLinks || false;
        const breadcrumbData = await page.evaluate((selector) => {
            const trails = [];
            for (const container of document.querySelectorAll(selector)) {
                const anchors = Array.from(container.querySelectorAll('a'), (anchor, level) => ({
                    text: anchor.textContent?.trim() || '',
                    href: anchor.href,
                    level,
                }));
                if (anchors.length === 0) {
                    continue;
                }
                trails.push({
                    path: anchors.map((entry) => entry.text).join(' > '),
                    links: anchors,
                });
            }
            return trails;
        }, breadcrumbSelector);
        if (breadcrumbData.length === 0) {
            return respond('❌ No breadcrumbs found on page');
        }
        const additionalData = followLinks && breadcrumbData[0]?.links
            ? `\n\n📌 To scrape breadcrumb pages, use multi_page_scraper with URLs: ${JSON.stringify(breadcrumbData[0].links.map((l) => l.href))}`
            : '';
        return respond(`✅ Found ${breadcrumbData.length} breadcrumb trail(s)\n\n${JSON.stringify(breadcrumbData, null, 2)}${additionalData}`);
    };
    return await withErrorHandling(collectBreadcrumbs, 'Failed to navigate breadcrumbs');
}