glippy-mcp 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +90 -13
- package/package.json +2 -1
- package/src/chrome-fetcher.js +213 -0
- package/src/geo-checker.js +480 -30
- package/src/index.js +168 -24
package/README.md
CHANGED
|
@@ -9,14 +9,15 @@ This MCP server enables AI models (Claude, GPT, etc.) to directly analyse any do
|
|
|
9
9
|
It wraps the Glippy desktop app's server-side analysis engine (`geo-checker.js`) and exposes it over the standard MCP protocol via stdio transport.
|
|
10
10
|
|
|
11
11
|
**Key features:**
|
|
12
|
-
- Full
|
|
12
|
+
- Full 16-category GEO analysis with weighted scoring
|
|
13
13
|
- robots.txt AI crawler access detection
|
|
14
14
|
- llms.txt file discovery and parsing
|
|
15
15
|
- Sitemap crawling and multi-page analysis
|
|
16
16
|
- Domain comparison and competitive analysis
|
|
17
17
|
- Export to styled Markdown or HTML reports
|
|
18
|
-
- **Smart caching**
|
|
19
|
-
- **JSON output mode**
|
|
18
|
+
- **Smart caching** - automatic deduplication of repeated analyses
|
|
19
|
+
- **JSON output mode** - pass analysis results between tools to avoid re-crawling
|
|
20
|
+
- **Headless Chrome fallback** - automatically retries via a real browser when a site blocks bot-shaped fetches (Cloudflare, Akamai, DataDome, etc.)
|
|
20
21
|
|
|
21
22
|
---
|
|
22
23
|
|
|
@@ -41,6 +42,7 @@ It wraps the Glippy desktop app's server-side analysis engine (`geo-checker.js`)
|
|
|
41
42
|
- [GEO Scoring Categories](#geo-scoring-categories)
|
|
42
43
|
- [Rate Limiting](#rate-limiting)
|
|
43
44
|
- [Output Formats](#output-formats)
|
|
45
|
+
- [Chrome Rendering Fallback](#chrome-rendering-fallback)
|
|
44
46
|
- [Architecture](#architecture)
|
|
45
47
|
- [Manual Testing](#manual-testing)
|
|
46
48
|
- [Troubleshooting](#troubleshooting)
|
|
@@ -68,6 +70,7 @@ npx -y glippy-mcp
|
|
|
68
70
|
|
|
69
71
|
- Node.js 18.0.0 or higher
|
|
70
72
|
- Valid Glippy MCP license key
|
|
73
|
+
- **Optional:** Google Chrome or Chromium installed locally. Only needed if you want the Chrome-rendered fallback to kick in when a target site blocks static fetches. Without Chrome the server still works; it just cannot recover from WAF-blocked pages.
|
|
71
74
|
|
|
72
75
|
---
|
|
73
76
|
|
|
@@ -124,8 +127,13 @@ Add to your `.mcp.json` in your project root or `~/.claude/.mcp.json` for global
|
|
|
124
127
|
|
|
125
128
|
| Variable | Required | Default | Description |
|
|
126
129
|
|----------|----------|---------|-------------|
|
|
127
|
-
| `GLIPPY_LICENSE_KEY` | Yes |
|
|
130
|
+
| `GLIPPY_LICENSE_KEY` | Yes | - | Your MCP license key (`GLMCP-XXXX-XXXX-XXXX`) |
|
|
128
131
|
| `GLIPPY_RATE_LIMIT` | No | `5` | Default max requests/second per domain for batch tools |
|
|
132
|
+
| `CHROME_PATH` | No | auto-detect | Absolute path to your Chrome/Chromium binary. Overrides the built-in detection list. |
|
|
133
|
+
| `PUPPETEER_EXECUTABLE_PATH` | No | auto-detect | Alternative name for `CHROME_PATH`, honored for puppeteer-core compatibility. |
|
|
134
|
+
| `CHROME_REMOTE_URL` | No | - | Attach to an already-running Chrome instead of launching a new one. Accepts either `http://host:9222` (browserURL) or `ws://...` (browserWSEndpoint). Start Chrome with `--remote-debugging-port=9222`. |
|
|
135
|
+
| `CHROME_HEADLESS` | No | `new` | Set to `0` or `false` to run Chrome visible. Useful for sites that aggressively detect headless. |
|
|
136
|
+
| `CHROME_USER_DATA_DIR` | No | - | Path to a Chrome user-data directory. Lets the fallback reuse cookies, extensions, and auth state from a dedicated profile. |
|
|
129
137
|
|
|
130
138
|
---
|
|
131
139
|
|
|
@@ -160,7 +168,7 @@ The integration guide includes:
|
|
|
160
168
|
|
|
161
169
|
Run a comprehensive GEO readiness analysis on a domain.
|
|
162
170
|
|
|
163
|
-
**Description:** Checks robots.txt, llms.txt, homepage HTML (
|
|
171
|
+
**Description:** Checks robots.txt, llms.txt, homepage HTML (16 scoring categories), sitemap.xml, and security headers. Returns an overall weighted score (0-100) with per-category breakdowns and actionable recommendations. Use `render_mode="auto"` to transparently fall back to headless Chrome when a site blocks static fetches (Cloudflare, Akamai, etc.). Use `output_format="json"` to get raw results that can be passed to `export_report`.
|
|
164
172
|
|
|
165
173
|
**Parameters:**
|
|
166
174
|
|
|
@@ -168,6 +176,7 @@ Run a comprehensive GEO readiness analysis on a domain.
|
|
|
168
176
|
|-----------|------|----------|-------------|
|
|
169
177
|
| `domain` | string | Yes | The domain to analyse, e.g. `"example.com"`. Do not include `https://` prefix. |
|
|
170
178
|
| `max_pages` | integer | No | Maximum pages to crawl (1-10). Default: `10`. |
|
|
179
|
+
| `render_mode` | enum | No | `"static"` (default) = plain Node fetch, fastest. `"auto"` = static first, falls back to a local headless Chrome on bot-blocked responses (401/403/407/429/503 or empty 2xx). `"chrome"` = always render via Chrome. Chrome modes need a local Chrome binary (see [Chrome Rendering Fallback](#chrome-rendering-fallback)). |
|
|
171
180
|
| `output_format` | enum | No | `"text"` (default) for human-readable report, `"json"` for raw results to pass to `export_report`. |
|
|
172
181
|
|
|
173
182
|
**Example:**
|
|
@@ -184,11 +193,12 @@ analyze_domain domain="example.com" max_pages=5 output_format="json"
|
|
|
184
193
|
**Returns:**
|
|
185
194
|
- Overall GEO score (0-100) with letter grade
|
|
186
195
|
- Page type detection (article, product, homepage, etc.)
|
|
187
|
-
-
|
|
196
|
+
- 16 category scores with pass/fail/warn checks
|
|
188
197
|
- robots.txt analysis with AI crawler access
|
|
189
198
|
- llms.txt presence and content preview
|
|
190
199
|
- Sitemap discovery status
|
|
191
200
|
- Multi-page aggregated scores (if `max_pages > 1`)
|
|
201
|
+
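- `renderMode` flag on the result: `static`, `chrome`, `chrome-fallback`, or a blocked marker (`chrome-blocked-<code>` / `static-blocked`) when the page could not be fetched - see [Chrome Rendering Fallback](#chrome-rendering-fallback)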
|
|
192
202
|
|
|
193
203
|
---
|
|
194
204
|
|
|
@@ -264,6 +274,7 @@ Get a concise GEO readiness summary for quick assessment.
|
|
|
264
274
|
| Parameter | Type | Required | Description |
|
|
265
275
|
|-----------|------|----------|-------------|
|
|
266
276
|
| `domain` | string | Yes | The domain to check, e.g. `"example.com"`. Do not include `https://` prefix. |
|
|
277
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (static with Chrome fallback on bot-block), or `"chrome"` (always Chrome). See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
267
278
|
|
|
268
279
|
**Example:**
|
|
269
280
|
```
|
|
@@ -291,6 +302,7 @@ Analyse multiple domains in parallel and compare scores.
|
|
|
291
302
|
|-----------|------|----------|-------------|
|
|
292
303
|
| `domains` | array[string] | Yes | List of 2-10 domains to compare, e.g. `["example.com", "competitor.com"]`. Do not include `https://` prefix. |
|
|
293
304
|
| `max_pages` | integer | No | Maximum pages to crawl per domain (1-10). Default: `10`. |
|
|
305
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (static with Chrome fallback on bot-block), or `"chrome"` (always Chrome). See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
294
306
|
| `output_format` | enum | No | `"text"` (default) for comparison table, `"json"` for raw results to pass to `export_bulk_report`. |
|
|
295
307
|
|
|
296
308
|
**Example:**
|
|
@@ -300,7 +312,7 @@ Compare GEO scores of example.com, competitor1.com, and competitor2.com
|
|
|
300
312
|
|
|
301
313
|
**Returns:**
|
|
302
314
|
- Ranked list of domains by score
|
|
303
|
-
- Category comparison table (all
|
|
315
|
+
- Category comparison table (all 16 categories)
|
|
304
316
|
- Quick facts comparison (robots.txt, llms.txt, sitemap, blocked crawlers)
|
|
305
317
|
- Error details for any failed analyses
|
|
306
318
|
|
|
@@ -319,6 +331,7 @@ Fetch a sitemap and analyse all discovered pages.
|
|
|
319
331
|
| `sitemap_url` | string | Yes | Full URL to sitemap, e.g. `"https://example.com/sitemap.xml"` |
|
|
320
332
|
| `max_urls` | integer | No | Maximum URLs to analyse (1-50,000). Default: all URLs found. |
|
|
321
333
|
| `rate_limit` | number | No | Max requests/second per domain (0.1-100). Default: `5`. |
|
|
334
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (static with Chrome fallback on bot-block), or `"chrome"` (always Chrome). Applied per URL. See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
322
335
|
| `output_format` | enum | No | `"text"` (default) for report, `"json"` for raw results to pass to `export_bulk_report`. |
|
|
323
336
|
|
|
324
337
|
**Example:**
|
|
@@ -350,6 +363,7 @@ Run GEO analysis on a list of specific URLs.
|
|
|
350
363
|
|-----------|------|----------|-------------|
|
|
351
364
|
| `urls` | array[string] | Yes | List of 1-50,000 full URLs, e.g. `["https://example.com/about", "https://example.com/pricing"]`. Include `https://` prefix. |
|
|
352
365
|
| `rate_limit` | number | No | Max requests/second per domain (0.1-100). Default: `5`. |
|
|
366
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (static with Chrome fallback on bot-block), or `"chrome"` (always Chrome). Applied per URL. See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
353
367
|
| `output_format` | enum | No | `"text"` (default) for report, `"json"` for raw results to pass to `export_bulk_report`. |
|
|
354
368
|
|
|
355
369
|
**Example:**
|
|
@@ -377,6 +391,7 @@ Generate a styled, shareable report file.
|
|
|
377
391
|
| `domain` | string | No* | The domain to analyse, e.g. `"example.com"`. Do not include `https://` prefix. |
|
|
378
392
|
| `format` | enum | Yes | Report format: `"markdown"` (recommendations only), `"markdown_full"` (all categories and checks), or `"html"` (standalone styled page). |
|
|
379
393
|
| `max_pages` | integer | No | Maximum pages to crawl (1-10). Default: `10`. Ignored if `analysis_result` is provided. |
|
|
394
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (Chrome fallback on bot-block), or `"chrome"` (always Chrome). Ignored if `analysis_result` is provided. See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
380
395
|
| `analysis_result` | object | No* | Pre-computed analysis result from `analyze_domain` (with `output_format="json"`). Skips re-crawling. |
|
|
381
396
|
|
|
382
397
|
*Either `domain` or `analysis_result` must be provided.
|
|
@@ -420,6 +435,7 @@ Generate a styled report for bulk analysis.
|
|
|
420
435
|
| `max_pages` | integer | No | For domain mode: pages per domain (1-10). Default: `10`. Ignored if `analysis_results` provided. |
|
|
421
436
|
| `max_urls` | integer | No | For sitemap mode: max URLs to analyse. Default: all. Ignored if `analysis_results` provided. |
|
|
422
437
|
| `rate_limit` | number | No | Max requests/second per domain. Default: `5`. Ignored if `analysis_results` provided. |
|
|
438
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (Chrome fallback on bot-block), or `"chrome"` (always Chrome). Ignored if `analysis_results` provided. See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
423
439
|
|
|
424
440
|
*Provide exactly one of: `domains`, `urls`, `sitemap_url`, or `analysis_results`.
|
|
425
441
|
|
|
@@ -445,7 +461,7 @@ export_bulk_report format="html" analysis_results=<result from above>
|
|
|
445
461
|
|
|
446
462
|
## GEO Scoring Categories
|
|
447
463
|
|
|
448
|
-
The analysis evaluates
|
|
464
|
+
The analysis evaluates 16 categories, each with a weight reflecting its importance for AI/LLM readiness:
|
|
449
465
|
|
|
450
466
|
| # | Category | Weight | What It Measures |
|
|
451
467
|
|---|----------|--------|------------------|
|
|
@@ -455,10 +471,16 @@ The analysis evaluates 10 categories, each with a weight reflecting its importan
|
|
|
455
471
|
| 4 | **Internal Linking** | 1.0x | Link density, navigation structure, breadcrumb markup |
|
|
456
472
|
| 5 | **Meta & Discoverability** | 1.0x | Title, meta description, canonical URL, Open Graph tags, hreflang |
|
|
457
473
|
| 6 | **Machine Readability** | 1.5x | SSR detection, bot blocking checks, robots.txt rules, llms.txt presence* |
|
|
458
|
-
| 7 | **Entity & Authority** | 1.0x | Author
|
|
474
|
+
| 7 | **Entity & Authority** | 1.0x | Author info, publication dates, organization schema, E-E-A-T signals, credentials, editorial policy, contact completeness |
|
|
459
475
|
| 8 | **Citability & Answer-Readiness** | 1.3x | FAQ content, data tables, lists, lead paragraph quality |
|
|
460
476
|
| 9 | **Performance & Crawlability** | 0.3x | Image dimensions, lazy loading, resource hints |
|
|
461
477
|
| 10 | **Agent Interactivity** | 0.2x | WebMCP tools, form annotations, agent-callable actions |
|
|
478
|
+
| 11 | **Content Positioning** | 1.2x | Brand differentiation, proof points, social proof |
|
|
479
|
+
| 12 | **Content Freshness** | 0.8x | Date signals, content age, temporal language |
|
|
480
|
+
| 13 | **Information Density** | 1.0x | Substantive-to-filler ratio, section depth, claim-evidence pairing |
|
|
481
|
+
| 14 | **Factual Verifiability** | 0.8x | Citations, source attribution, methodology disclosure |
|
|
482
|
+
| 15 | **Content Comprehensiveness** | 0.8x | Word count, heading coverage, definitions, comparisons |
|
|
483
|
+
| 16 | **Multimodal Content** | 0.5x | Image alt text, figures, video/audio, SVG, multimedia schema |
|
|
462
484
|
|
|
463
485
|
*\*llms.txt is checked for presence but is not currently supported or consumed by any major AI model or crawler. It has minimal practical impact on GEO readiness today — see the [`check_llms_txt`](#check_llms_txt) section for details.*
|
|
464
486
|
|
|
@@ -593,13 +615,68 @@ export_bulk_report format="html" analysis_results=<JSON from step 1>
|
|
|
593
615
|
|
|
594
616
|
---
|
|
595
617
|
|
|
618
|
+
## Chrome Rendering Fallback
|
|
619
|
+
|
|
620
|
+
Some sites (Cloudflare, Akamai, PerimeterX, DataDome, Incapsula) refuse static Node fetches with 401/403/407/429/503 responses or an empty 2xx body. The server can drive a real Chrome instance to fetch those pages instead, so they still get scored.
|
|
621
|
+
|
|
622
|
+
### Choosing a render mode
|
|
623
|
+
|
|
624
|
+
Every analysis tool (`analyze_domain`, `get_geo_summary`, `compare_domains`, `analyze_urls`, `analyze_sitemap`, `export_report`, `export_bulk_report`) accepts a `render_mode` parameter:
|
|
625
|
+
|
|
626
|
+
| Mode | Behavior | Use when |
|
|
627
|
+
|------|----------|----------|
|
|
628
|
+
| `static` *(default)* | Plain Node fetch. Fast. No Chrome required. | You're scoring sites that don't block bots, or you explicitly want to see how a static crawler experiences the page. |
|
|
629
|
+
| `auto` | Static fetch first. If it looks bot-blocked (status 401/403/407/429/503, or 2xx with an empty body), retry that URL via Chrome. | Mixed workloads - most sites fast-path through static; only blocked ones pay the Chrome cost. Recommended for competitive audits across a list of domains. |
|
|
630
|
+
| `chrome` | Every URL fetched via Chrome. Slowest, most resilient. | You know the targets aggressively detect headless and want to front-load the Chrome cost, or you're debugging rendering differences. |
|
|
631
|
+
|
|
632
|
+
The result object includes a `renderMode` field so you can tell which path ran: `static`, `chrome`, `chrome-fallback`, `chrome-blocked-<code>` (Chrome tried but also got blocked), or `static-blocked` (both paths failed).
|
|
633
|
+
|
|
634
|
+
### Setup
|
|
635
|
+
|
|
636
|
+
Chrome modes need a Chrome or Chromium binary. The server looks in these locations, in order:
|
|
637
|
+
|
|
638
|
+
1. `CHROME_PATH` env var
|
|
639
|
+
2. `PUPPETEER_EXECUTABLE_PATH` env var
|
|
640
|
+
3. `C:/Program Files/Google/Chrome/Application/chrome.exe`
|
|
641
|
+
4. `C:/Program Files (x86)/Google/Chrome/Application/chrome.exe`
|
|
642
|
+
5. `/Applications/Google Chrome.app/Contents/MacOS/Google Chrome`
|
|
643
|
+
6. `/usr/bin/google-chrome`, `/usr/bin/chromium`, `/usr/bin/chromium-browser`
|
|
644
|
+
|
|
645
|
+
If none exist, `render_mode: "static"` still works; only the Chrome-backed modes become unavailable.
|
|
646
|
+
|
|
647
|
+
### Attaching to your own Chrome
|
|
648
|
+
|
|
649
|
+
For sites that fingerprint headless Chrome, start a Chrome instance with remote debugging and point the server at it. The server will attach to that instance instead of launching its own:
|
|
650
|
+
|
|
651
|
+
```bash
|
|
652
|
+
# macOS
|
|
653
|
+
/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome \
|
|
654
|
+
--remote-debugging-port=9222 --user-data-dir=/tmp/glippy-chrome
|
|
655
|
+
|
|
656
|
+
# Windows (PowerShell)
|
|
657
|
+
& "C:\Program Files\Google\Chrome\Application\chrome.exe" `
|
|
658
|
+
--remote-debugging-port=9222 --user-data-dir=C:\Temp\glippy-chrome
|
|
659
|
+
|
|
660
|
+
# Then in your MCP config env:
|
|
661
|
+
# CHROME_REMOTE_URL=http://127.0.0.1:9222
|
|
662
|
+
```
|
|
663
|
+
|
|
664
|
+
Using a dedicated `--user-data-dir` keeps this session isolated from your normal browsing. When attached, the fetcher leaves UA/headers/stealth untouched so requests look identical to a human using that browser.
|
|
665
|
+
|
|
666
|
+
### Visible mode
|
|
667
|
+
|
|
668
|
+
For debugging, set `CHROME_HEADLESS=0` to watch Chrome drive itself. Purely for development - leave it off in production.
|
|
669
|
+
|
|
670
|
+
---
|
|
671
|
+
|
|
596
672
|
## Architecture
|
|
597
673
|
|
|
598
674
|
```
|
|
599
675
|
research-mcp/
|
|
600
676
|
├── src/
|
|
601
|
-
│ ├── index.js
|
|
602
|
-
│
|
|
677
|
+
│ ├── index.js # MCP server - tool registration, JSON-RPC handling, license validation
|
|
678
|
+
│ ├── geo-checker.js # GEO analysis engine - fetches & scores domains
|
|
679
|
+
│ └── chrome-fetcher.js # Headless Chrome adapter (puppeteer-core) for WAF-blocked sites
|
|
603
680
|
├── package.json
|
|
604
681
|
└── README.md
|
|
605
682
|
```
|
|
@@ -609,13 +686,13 @@ research-mcp/
|
|
|
609
686
|
1. **Fetch resources in parallel:**
|
|
610
687
|
- robots.txt
|
|
611
688
|
- llms.txt
|
|
612
|
-
- Homepage HTML
|
|
689
|
+
- Homepage HTML (static fetch first, Chrome fallback if bot-blocked)
|
|
613
690
|
- sitemap.xml
|
|
614
691
|
- UCP profile (/.well-known/ucp)
|
|
615
692
|
|
|
616
693
|
2. **Parse HTML with cheerio** (server-side DOM)
|
|
617
694
|
|
|
618
|
-
3. **Run
|
|
695
|
+
3. **Run 16 weighted scoring categories**
|
|
619
696
|
|
|
620
697
|
4. **Return comprehensive analysis** with actionable recommendations
|
|
621
698
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "glippy-mcp",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "MCP server for GEO (Generative Engine Optimization) analysis — check any domain's AI-readiness",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"type": "module",
|
|
@@ -38,6 +38,7 @@
|
|
|
38
38
|
"dependencies": {
|
|
39
39
|
"@modelcontextprotocol/sdk": "^1.12.1",
|
|
40
40
|
"cheerio": "^1.0.0",
|
|
41
|
+
"puppeteer-core": "^24.40.0",
|
|
41
42
|
"zod": "^3.24.0"
|
|
42
43
|
}
|
|
43
44
|
}
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
// Chrome-backed fetch adapter for geo-checker.
|
|
2
|
+
//
|
|
3
|
+
// Exposes the same shape as the internal throttledFetchUrl:
|
|
4
|
+
// { body, statusCode, headers, finalUrl }
|
|
5
|
+
// but drives a headless Chrome via puppeteer-core so that bot-mitigation
|
|
6
|
+
// layers (Cloudflare, Akamai, PerimeterX, DataDome, Incapsula) that block
|
|
7
|
+
// raw Node fetches don't keep us out.
|
|
8
|
+
//
|
|
9
|
+
// The module holds a single long-lived browser (each fetch opens its own page). Callers fetch
|
|
10
|
+
// URLs sequentially; this is fine for the audit path (one domain at a time
|
|
11
|
+
// per checkGEO call) and avoids spinning up a new chromium process per page.
|
|
12
|
+
|
|
13
|
+
import puppeteer from 'puppeteer-core';
|
|
14
|
+
|
|
15
|
+
// Upper bound, in milliseconds, used as the default navigation timeout.
const DEFAULT_TIMEOUT_MS = 30_000;

// Lifecycle event handed to `page.goto` as its `waitUntil` option.
const WAIT_UNTIL = 'networkidle2';

// Explicit overrides from the environment. They are probed first; unset
// variables are dropped when the final list is built.
const ENV_CHROME_PATHS = [
  process.env.CHROME_PATH,
  process.env.PUPPETEER_EXECUTABLE_PATH,
];

// Conventional install locations, probed in this order after the overrides.
const WELL_KNOWN_CHROME_PATHS = [
  'C:/Program Files/Google/Chrome/Application/chrome.exe',
  'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe',
  '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
  '/usr/bin/google-chrome',
  '/usr/bin/chromium',
  '/usr/bin/chromium-browser',
];

// Ordered candidate list consumed by resolveChromePath().
const DEFAULT_CHROME_PATHS = [
  ...ENV_CHROME_PATHS,
  ...WELL_KNOWN_CHROME_PATHS,
].filter((candidate) => Boolean(candidate));

// Cached promise for the shared browser instance; null until first use.
let browserPromise = null;
// True when the shared browser was attached to via CHROME_REMOTE_URL rather
// than launched by this module.
let connectedToExisting = false;
|
|
31
|
+
|
|
32
|
+
/**
 * Pick the first entry of DEFAULT_CHROME_PATHS that is accessible on disk.
 *
 * Candidates are probed one at a time, in list order, so environment
 * overrides win over the well-known install locations.
 *
 * @returns {Promise<string|null>} The first accessible path, or null when
 *   none of the candidates can be accessed.
 */
async function resolveChromePath() {
  const { access } = await import('node:fs/promises');
  for (const candidate of DEFAULT_CHROME_PATHS) {
    // A rejected access() just means "not here" - move on to the next one.
    const reachable = await access(candidate).then(
      () => true,
      () => false,
    );
    if (reachable) {
      return candidate;
    }
  }
  return null;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Return the shared browser instance, creating it on first use.
 *
 * Two modes:
 *  1. Attach: when CHROME_REMOTE_URL is set, connect to an already-running
 *     Chrome over CDP. A value starting with "ws" is treated as a
 *     browserWSEndpoint, anything else as a browserURL (http://host:port).
 *  2. Launch: otherwise start our own Chrome from the first path found by
 *     resolveChromePath(). Headless unless CHROME_HEADLESS is "0" or "false";
 *     CHROME_USER_DATA_DIR, when set, is passed through as the profile dir.
 *
 * The in-flight promise is cached so concurrent callers share one browser.
 * The cache is dropped again when the launch/connect fails or when the
 * browser later disconnects, so the next call retries instead of replaying
 * a stale failure or handing out a dead browser.
 *
 * @returns {Promise<object>} The puppeteer Browser.
 * @throws {Error} When no Chrome executable can be found, or when puppeteer
 *   fails to connect/launch.
 */
async function getBrowser() {
  if (browserPromise) return browserPromise;

  const pending = (async () => {
    const remoteUrl = process.env.CHROME_REMOTE_URL;
    if (remoteUrl) {
      const endpoint = remoteUrl.startsWith('ws')
        ? { browserWSEndpoint: remoteUrl }
        : { browserURL: remoteUrl };
      const browser = await puppeteer.connect({
        ...endpoint,
        // Keep whatever viewport the user's own browser already has.
        defaultViewport: null,
      });
      connectedToExisting = true;
      return browser;
    }

    const executablePath = await resolveChromePath();
    if (!executablePath) {
      throw new Error(
        'Chrome executable not found. Set CHROME_PATH or install Chrome/Chromium.',
      );
    }
    const headlessEnv = process.env.CHROME_HEADLESS;
    // NOTE(review): newer puppeteer-core releases document `headless` as
    // boolean | 'shell'; 'new' is kept unchanged here - confirm it is still
    // accepted by the pinned version.
    const headless = headlessEnv === '0' || headlessEnv === 'false' ? false : 'new';
    const userDataDir = process.env.CHROME_USER_DATA_DIR || undefined;
    return puppeteer.launch({
      executablePath,
      headless,
      userDataDir,
      args: [
        // NOTE(review): the sandbox is disabled unconditionally while
        // rendering third-party pages - confirm this is intended outside
        // containers.
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--disable-features=IsolateOrigins,site-per-process',
      ],
    });
  })();

  browserPromise = pending;

  // Invalidate the cache only if it still points at this attempt, so a newer
  // browser created in the meantime is never clobbered.
  const forget = () => {
    if (browserPromise === pending) {
      browserPromise = null;
      connectedToExisting = false;
    }
  };
  pending.then(
    (browser) => {
      // Fires on crash, remote close, disconnect() and close().
      browser.once('disconnected', forget);
    },
    // Launch/connect failed: callers still see the rejection via `pending`,
    // but the next call gets a fresh attempt.
    forget,
  );

  return pending;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Install a minimal set of anti-detection patches on a page.
 *
 * The patches are registered with `evaluateOnNewDocument`, so the callback
 * runs inside the page (not in Node) before the site's own scripts. It masks
 * `navigator.webdriver`, fills in `languages` / `plugins`, stubs
 * `window.chrome`, and makes `permissions.query({ name: 'notifications' })`
 * agree with `Notification.permission`. This only clears trivial checks; it
 * is not expected to defeat enterprise bot mitigation.
 *
 * @param {object} page - Puppeteer Page to patch.
 * @returns {Promise<void>}
 */
async function applyStealth(page) {
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    // languages / plugins
    Object.defineProperty(navigator, 'languages', { get: () => ['nl-NL', 'nl', 'en-US', 'en'] });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    // chrome.runtime stub
    window.chrome = window.chrome || { runtime: {} };
    // permissions query patch (Notification)
    const permissions = window.navigator.permissions;
    if (permissions && typeof permissions.query === 'function') {
      // Bind to the Permissions object: calling the native method detached
      // from its receiver throws "Illegal invocation", which would break
      // every non-notification query made by the page.
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters) =>
        parameters.name === 'notifications'
          ? Promise.resolve({ state: Notification.permission })
          : originalQuery(parameters);
    }
  });
}
|
|
115
|
+
|
|
116
|
+
/**
 * Fetch a URL through the shared Chrome instance.
 *
 * Opens a fresh page, navigates to `url`, and returns the rendered document
 * together with the status and headers of the main-frame response. The page
 * is always closed afterwards. Failures never throw: they are reported as an
 * "empty" result with an added `error` message.
 *
 * @param {string} url - Absolute URL to load.
 * @param {number} [timeoutMs=DEFAULT_TIMEOUT_MS] - Navigation timeout in ms.
 * @returns {Promise<{body: (string|null), statusCode: (number|null),
 *   headers: object, finalUrl: (string|null), error?: string}>}
 */
export async function chromeFetch(url, timeoutMs = DEFAULT_TIMEOUT_MS) {
  const empty = { body: null, statusCode: null, headers: {}, finalUrl: null };
  let page;
  try {
    const browser = await getBrowser();
    page = await browser.newPage();

    // When attached to a user's Chrome, leave UA/headers/stealth alone -
    // their real profile already looks like a human. Only shape the
    // request when we launched Chrome ourselves.
    if (!connectedToExisting) {
      await page.setViewport({ width: 1366, height: 768 });
      await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      );
      await page.setExtraHTTPHeaders({
        'Accept-Language': 'nl-NL,nl;q=0.9,en-US;q=0.8,en;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
      });
      await applyStealth(page);
    }

    // Remember the most recent main-frame document response. A challenge
    // page that solves itself reloads or navigates after goto() has already
    // returned; this listener is how we learn the status of the page that
    // finally rendered, without mistaking a sub-resource for it.
    let lastNavigationResponse = null;
    page.on('response', (candidate) => {
      const request = candidate.request();
      if (request.isNavigationRequest() && request.frame() === page.mainFrame()) {
        lastNavigationResponse = candidate;
      }
    });

    const response = await page.goto(url, {
      waitUntil: WAIT_UNTIL,
      timeout: timeoutMs,
    });

    if (!response) return empty;

    let statusCode = response.status();
    let headers = response.headers() || {};

    // Some WAFs (Cloudflare) serve a 403 interstitial, then JS solves
    // the challenge and navigates to real content. Give it a brief window
    // to settle, then adopt the status of the newest main-frame response.
    if (statusCode === 403 || statusCode === 503) {
      try {
        await page.waitForFunction(
          () => {
            const html = document.documentElement ? document.documentElement.outerHTML : '';
            // Cloudflare challenge markers
            return !/cf-challenge|cf-browser-verification|Just a moment/i.test(html);
          },
          { timeout: 8000 },
        );
        if (lastNavigationResponse && lastNavigationResponse !== response) {
          statusCode = lastNavigationResponse.status();
          headers = lastNavigationResponse.headers() || headers;
        }
      } catch {
        // challenge didn't clear - keep the 403/503 so caller can decide.
      }
    }

    const finalUrl = page.url();
    let body = null;
    try {
      body = await page.content();
    } catch {
      // The document may be gone (e.g. mid-navigation); report no body.
      body = null;
    }

    return { body, statusCode, headers, finalUrl };
  } catch (err) {
    return { ...empty, error: err.message };
  } finally {
    if (page) {
      try { await page.close(); } catch { /* ignore */ }
    }
  }
}
|
|
194
|
+
|
|
195
|
+
/**
 * Release the shared browser, if one was ever requested.
 *
 * A browser we attached to is only disconnected from; a browser we launched
 * is closed. Errors (including a launch that never succeeded) are ignored,
 * and the module state is reset either way so a later fetch starts fresh.
 *
 * @returns {Promise<void>}
 */
export async function closeBrowser() {
  if (browserPromise) {
    try {
      const instance = await browserPromise;
      await (connectedToExisting ? instance.disconnect() : instance.close());
    } catch {
      /* ignore */
    }
    browserPromise = null;
    connectedToExisting = false;
  }
}
|
|
208
|
+
|
|
209
|
+
// Close the browser when the process ends so Chrome doesn't linger.
//
// 'exit' listeners may only do synchronous work, so the call below is
// best-effort. The signal handlers therefore wait for closeBrowser() to
// settle before exiting, instead of exiting in the same tick and abandoning
// the close half-way.
const SHUTDOWN_GRACE_MS = 5_000;

const shutdown = () => { closeBrowser().catch(() => {}); };

/**
 * Build a signal handler that closes the browser and then exits.
 *
 * @param {number} exitCode - Conventional 128 + signal number exit status.
 * @returns {() => void} Listener suitable for process.once(signal, ...).
 */
const exitAfterClose = (exitCode) => () => {
  // Backstop: never let a hung browser keep the process alive after a signal.
  const forceExit = setTimeout(() => process.exit(exitCode), SHUTDOWN_GRACE_MS);
  closeBrowser()
    .catch(() => {})
    .finally(() => {
      clearTimeout(forceExit);
      process.exit(exitCode);
    });
};

process.once('exit', shutdown);
process.once('SIGINT', exitAfterClose(130));
process.once('SIGTERM', exitAfterClose(143));
|