glippy-mcp 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +90 -13
- package/package.json +2 -1
- package/src/chrome-fetcher.js +213 -0
- package/src/geo-checker.js +257 -21
- package/src/index.js +168 -24
package/README.md
CHANGED
|
@@ -9,14 +9,15 @@ This MCP server enables AI models (Claude, GPT, etc.) to directly analyse any do
|
|
|
9
9
|
It wraps the Glippy desktop app's server-side analysis engine (`geo-checker.js`) and exposes it over the standard MCP protocol via stdio transport.
|
|
10
10
|
|
|
11
11
|
**Key features:**
|
|
12
|
-
- Full
|
|
12
|
+
- Full 16-category GEO analysis with weighted scoring
|
|
13
13
|
- robots.txt AI crawler access detection
|
|
14
14
|
- llms.txt file discovery and parsing
|
|
15
15
|
- Sitemap crawling and multi-page analysis
|
|
16
16
|
- Domain comparison and competitive analysis
|
|
17
17
|
- Export to styled Markdown or HTML reports
|
|
18
|
-
- **Smart caching**
|
|
19
|
-
- **JSON output mode**
|
|
18
|
+
- **Smart caching** - automatic deduplication of repeated analyses
|
|
19
|
+
- **JSON output mode** - pass analysis results between tools to avoid re-crawling
|
|
20
|
+
- **Headless Chrome fallback** - automatically retries via a real browser when a site blocks bot-shaped fetches (Cloudflare, Akamai, DataDome, etc.)
|
|
20
21
|
|
|
21
22
|
---
|
|
22
23
|
|
|
@@ -41,6 +42,7 @@ It wraps the Glippy desktop app's server-side analysis engine (`geo-checker.js`)
|
|
|
41
42
|
- [GEO Scoring Categories](#geo-scoring-categories)
|
|
42
43
|
- [Rate Limiting](#rate-limiting)
|
|
43
44
|
- [Output Formats](#output-formats)
|
|
45
|
+
- [Chrome Rendering Fallback](#chrome-rendering-fallback)
|
|
44
46
|
- [Architecture](#architecture)
|
|
45
47
|
- [Manual Testing](#manual-testing)
|
|
46
48
|
- [Troubleshooting](#troubleshooting)
|
|
@@ -68,6 +70,7 @@ npx -y glippy-mcp
|
|
|
68
70
|
|
|
69
71
|
- Node.js 18.0.0 or higher
|
|
70
72
|
- Valid Glippy MCP license key
|
|
73
|
+
- **Optional:** Google Chrome or Chromium installed locally. Only needed if you want the Chrome-rendered fallback to kick in when a target site blocks static fetches. Without Chrome the server still works; it just cannot recover from WAF-blocked pages.
|
|
71
74
|
|
|
72
75
|
---
|
|
73
76
|
|
|
@@ -124,8 +127,13 @@ Add to your `.mcp.json` in your project root or `~/.claude/.mcp.json` for global
|
|
|
124
127
|
|
|
125
128
|
| Variable | Required | Default | Description |
|
|
126
129
|
|----------|----------|---------|-------------|
|
|
127
|
-
| `GLIPPY_LICENSE_KEY` | Yes |
|
|
130
|
+
| `GLIPPY_LICENSE_KEY` | Yes | - | Your MCP license key (`GLMCP-XXXX-XXXX-XXXX`) |
|
|
128
131
|
| `GLIPPY_RATE_LIMIT` | No | `5` | Default max requests/second per domain for batch tools |
|
|
132
|
+
| `CHROME_PATH` | No | auto-detect | Absolute path to your Chrome/Chromium binary. Overrides the built-in detection list. |
|
|
133
|
+
| `PUPPETEER_EXECUTABLE_PATH` | No | auto-detect | Alternative name for `CHROME_PATH`, honored for puppeteer-core compatibility. |
|
|
134
|
+
| `CHROME_REMOTE_URL` | No | - | Attach to an already-running Chrome instead of launching a new one. Accepts either `http://host:9222` (browserURL) or `ws://...` (browserWSEndpoint). Start Chrome with `--remote-debugging-port=9222`. |
|
|
135
|
+
| `CHROME_HEADLESS` | No | `new` | Set to `0` or `false` to run Chrome visible. Useful for sites that aggressively detect headless. |
|
|
136
|
+
| `CHROME_USER_DATA_DIR` | No | - | Path to a Chrome user-data directory. Lets the fallback reuse cookies, extensions, and auth state from a dedicated profile. |
|
|
129
137
|
|
|
130
138
|
---
|
|
131
139
|
|
|
@@ -160,7 +168,7 @@ The integration guide includes:
|
|
|
160
168
|
|
|
161
169
|
Run a comprehensive GEO readiness analysis on a domain.
|
|
162
170
|
|
|
163
|
-
**Description:** Checks robots.txt, llms.txt, homepage HTML (
|
|
171
|
+
**Description:** Checks robots.txt, llms.txt, homepage HTML (16 scoring categories), sitemap.xml, and security headers. Returns an overall weighted score (0-100) with per-category breakdowns and actionable recommendations. Use `render_mode="auto"` to transparently fall back to headless Chrome when a site blocks static fetches (Cloudflare, Akamai, etc.). Use `output_format="json"` to get raw results that can be passed to `export_report`.
|
|
164
172
|
|
|
165
173
|
**Parameters:**
|
|
166
174
|
|
|
@@ -168,6 +176,7 @@ Run a comprehensive GEO readiness analysis on a domain.
|
|
|
168
176
|
|-----------|------|----------|-------------|
|
|
169
177
|
| `domain` | string | Yes | The domain to analyse, e.g. `"example.com"`. Do not include `https://` prefix. |
|
|
170
178
|
| `max_pages` | integer | No | Maximum pages to crawl (1-10). Default: `10`. |
|
|
179
|
+
| `render_mode` | enum | No | `"static"` (default) = plain Node fetch, fastest. `"auto"` = static first, falls back to a local headless Chrome on bot-blocked responses (202/400/401/403/407/429/503 or empty 2xx). `"chrome"` = always render via Chrome. Chrome modes need a local Chrome binary (see [Chrome Rendering Fallback](#chrome-rendering-fallback)). |
|
|
171
180
|
| `output_format` | enum | No | `"text"` (default) for human-readable report, `"json"` for raw results to pass to `export_report`. |
|
|
172
181
|
|
|
173
182
|
**Example:**
|
|
@@ -184,11 +193,12 @@ analyze_domain domain="example.com" max_pages=5 output_format="json"
|
|
|
184
193
|
**Returns:**
|
|
185
194
|
- Overall GEO score (0-100) with letter grade
|
|
186
195
|
- Page type detection (article, product, homepage, etc.)
|
|
187
|
-
-
|
|
196
|
+
- 16 category scores with pass/fail/warn checks
|
|
188
197
|
- robots.txt analysis with AI crawler access
|
|
189
198
|
- llms.txt presence and content preview
|
|
190
199
|
- Sitemap discovery status
|
|
191
200
|
- Multi-page aggregated scores (if `max_pages > 1`)
|
|
201
|
+
- `renderMode` flag on the result: `static`, `chrome-fallback`, or an error code if both paths failed
|
|
192
202
|
|
|
193
203
|
---
|
|
194
204
|
|
|
@@ -264,6 +274,7 @@ Get a concise GEO readiness summary for quick assessment.
|
|
|
264
274
|
| Parameter | Type | Required | Description |
|
|
265
275
|
|-----------|------|----------|-------------|
|
|
266
276
|
| `domain` | string | Yes | The domain to check, e.g. `"example.com"`. Do not include `https://` prefix. |
|
|
277
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (static with Chrome fallback on bot-block), or `"chrome"` (always Chrome). See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
267
278
|
|
|
268
279
|
**Example:**
|
|
269
280
|
```
|
|
@@ -291,6 +302,7 @@ Analyse multiple domains in parallel and compare scores.
|
|
|
291
302
|
|-----------|------|----------|-------------|
|
|
292
303
|
| `domains` | array[string] | Yes | List of 2-10 domains to compare, e.g. `["example.com", "competitor.com"]`. Do not include `https://` prefix. |
|
|
293
304
|
| `max_pages` | integer | No | Maximum pages to crawl per domain (1-10). Default: `10`. |
|
|
305
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (static with Chrome fallback on bot-block), or `"chrome"` (always Chrome). See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
294
306
|
| `output_format` | enum | No | `"text"` (default) for comparison table, `"json"` for raw results to pass to `export_bulk_report`. |
|
|
295
307
|
|
|
296
308
|
**Example:**
|
|
@@ -300,7 +312,7 @@ Compare GEO scores of example.com, competitor1.com, and competitor2.com
|
|
|
300
312
|
|
|
301
313
|
**Returns:**
|
|
302
314
|
- Ranked list of domains by score
|
|
303
|
-
- Category comparison table (all
|
|
315
|
+
- Category comparison table (all 16 categories)
|
|
304
316
|
- Quick facts comparison (robots.txt, llms.txt, sitemap, blocked crawlers)
|
|
305
317
|
- Error details for any failed analyses
|
|
306
318
|
|
|
@@ -319,6 +331,7 @@ Fetch a sitemap and analyse all discovered pages.
|
|
|
319
331
|
| `sitemap_url` | string | Yes | Full URL to sitemap, e.g. `"https://example.com/sitemap.xml"` |
|
|
320
332
|
| `max_urls` | integer | No | Maximum URLs to analyse (1-50,000). Default: all URLs found. |
|
|
321
333
|
| `rate_limit` | number | No | Max requests/second per domain (0.1-100). Default: `5`. |
|
|
334
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (static with Chrome fallback on bot-block), or `"chrome"` (always Chrome). Applied per URL. See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
322
335
|
| `output_format` | enum | No | `"text"` (default) for report, `"json"` for raw results to pass to `export_bulk_report`. |
|
|
323
336
|
|
|
324
337
|
**Example:**
|
|
@@ -350,6 +363,7 @@ Run GEO analysis on a list of specific URLs.
|
|
|
350
363
|
|-----------|------|----------|-------------|
|
|
351
364
|
| `urls` | array[string] | Yes | List of 1-50,000 full URLs, e.g. `["https://example.com/about", "https://example.com/pricing"]`. Include `https://` prefix. |
|
|
352
365
|
| `rate_limit` | number | No | Max requests/second per domain (0.1-100). Default: `5`. |
|
|
366
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (static with Chrome fallback on bot-block), or `"chrome"` (always Chrome). Applied per URL. See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
353
367
|
| `output_format` | enum | No | `"text"` (default) for report, `"json"` for raw results to pass to `export_bulk_report`. |
|
|
354
368
|
|
|
355
369
|
**Example:**
|
|
@@ -377,6 +391,7 @@ Generate a styled, shareable report file.
|
|
|
377
391
|
| `domain` | string | No* | The domain to analyse, e.g. `"example.com"`. Do not include `https://` prefix. |
|
|
378
392
|
| `format` | enum | Yes | Report format: `"markdown"` (recommendations only), `"markdown_full"` (all categories and checks), or `"html"` (standalone styled page). |
|
|
379
393
|
| `max_pages` | integer | No | Maximum pages to crawl (1-10). Default: `10`. Ignored if `analysis_result` is provided. |
|
|
394
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (Chrome fallback on bot-block), or `"chrome"` (always Chrome). Ignored if `analysis_result` is provided. See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
380
395
|
| `analysis_result` | object | No* | Pre-computed analysis result from `analyze_domain` (with `output_format="json"`). Skips re-crawling. |
|
|
381
396
|
|
|
382
397
|
*Either `domain` or `analysis_result` must be provided.
|
|
@@ -420,6 +435,7 @@ Generate a styled report for bulk analysis.
|
|
|
420
435
|
| `max_pages` | integer | No | For domain mode: pages per domain (1-10). Default: `10`. Ignored if `analysis_results` provided. |
|
|
421
436
|
| `max_urls` | integer | No | For sitemap mode: max URLs to analyse. Default: all. Ignored if `analysis_results` provided. |
|
|
422
437
|
| `rate_limit` | number | No | Max requests/second per domain. Default: `5`. Ignored if `analysis_results` provided. |
|
|
438
|
+
| `render_mode` | enum | No | `"static"` (default), `"auto"` (Chrome fallback on bot-block), or `"chrome"` (always Chrome). Ignored if `analysis_results` provided. See [Chrome Rendering Fallback](#chrome-rendering-fallback). |
|
|
423
439
|
|
|
424
440
|
*Provide exactly one of: `domains`, `urls`, `sitemap_url`, or `analysis_results`.
|
|
425
441
|
|
|
@@ -445,7 +461,7 @@ export_bulk_report format="html" analysis_results=<result from above>
|
|
|
445
461
|
|
|
446
462
|
## GEO Scoring Categories
|
|
447
463
|
|
|
448
|
-
The analysis evaluates 10 categories, each with a weight reflecting its importance for AI/LLM readiness:
|
|
464
|
+
The analysis evaluates 16 categories, each with a weight reflecting its importance for AI/LLM readiness:
|
|
449
465
|
|
|
450
466
|
| # | Category | Weight | What It Measures |
|
|
451
467
|
|---|----------|--------|------------------|
|
|
@@ -455,10 +471,16 @@ The analysis evaluates 10 categories, each with a weight reflecting its importan
|
|
|
455
471
|
| 4 | **Internal Linking** | 1.0x | Link density, navigation structure, breadcrumb markup |
|
|
456
472
|
| 5 | **Meta & Discoverability** | 1.0x | Title, meta description, canonical URL, Open Graph tags, hreflang |
|
|
457
473
|
| 6 | **Machine Readability** | 1.5x | SSR detection, bot blocking checks, robots.txt rules, llms.txt presence* |
|
|
458
|
-
| 7 | **Entity & Authority** | 1.0x | Author
|
|
474
|
+
| 7 | **Entity & Authority** | 1.0x | Author info, publication dates, organization schema, E-E-A-T signals, credentials, editorial policy, contact completeness |
|
|
459
475
|
| 8 | **Citability & Answer-Readiness** | 1.3x | FAQ content, data tables, lists, lead paragraph quality |
|
|
460
476
|
| 9 | **Performance & Crawlability** | 0.3x | Image dimensions, lazy loading, resource hints |
|
|
461
477
|
| 10 | **Agent Interactivity** | 0.2x | WebMCP tools, form annotations, agent-callable actions |
|
|
478
|
+
| 11 | **Content Positioning** | 1.2x | Brand differentiation, proof points, social proof |
|
|
479
|
+
| 12 | **Content Freshness** | 0.8x | Date signals, content age, temporal language |
|
|
480
|
+
| 13 | **Information Density** | 1.0x | Substantive-to-filler ratio, section depth, claim-evidence pairing |
|
|
481
|
+
| 14 | **Factual Verifiability** | 0.8x | Citations, source attribution, methodology disclosure |
|
|
482
|
+
| 15 | **Content Comprehensiveness** | 0.8x | Word count, heading coverage, definitions, comparisons |
|
|
483
|
+
| 16 | **Multimodal Content** | 0.5x | Image alt text, figures, video/audio, SVG, multimedia schema |
|
|
462
484
|
|
|
463
485
|
*\*llms.txt is checked for presence but is not currently supported or consumed by any major AI model or crawler. It has minimal practical impact on GEO readiness today — see the [`check_llms_txt`](#check_llms_txt) section for details.*
|
|
464
486
|
|
|
@@ -593,13 +615,68 @@ export_bulk_report format="html" analysis_results=<JSON from step 1>
|
|
|
593
615
|
|
|
594
616
|
---
|
|
595
617
|
|
|
618
|
+
## Chrome Rendering Fallback
|
|
619
|
+
|
|
620
|
+
Some sites (Cloudflare, Akamai, PerimeterX, DataDome, Incapsula) refuse static Node fetches with 401/403/429/503 responses. The server can drive a real Chrome instance to fetch those pages instead, so they still get scored.
|
|
621
|
+
|
|
622
|
+
### Choosing a render mode
|
|
623
|
+
|
|
624
|
+
Every analysis tool (`analyze_domain`, `get_geo_summary`, `compare_domains`, `analyze_urls`, `analyze_sitemap`, `export_report`, `export_bulk_report`) accepts a `render_mode` parameter:
|
|
625
|
+
|
|
626
|
+
| Mode | Behavior | Use when |
|
|
627
|
+
|------|----------|----------|
|
|
628
|
+
| `static` *(default)* | Plain Node fetch. Fast. No Chrome required. | You're scoring sites that don't block bots, or you explicitly want to see how a static crawler experiences the page. |
|
|
629
|
+
| `auto` | Static fetch first. If it looks bot-blocked (status 202/400/401/403/407/429/503, or 2xx with an empty body), retry that URL via Chrome. | Mixed workloads - most sites fast-path through static; only blocked ones pay the Chrome cost. Recommended for competitive audits across a list of domains. |
|
|
630
|
+
| `chrome` | Every URL fetched via Chrome. Slowest, most resilient. | You know the targets aggressively detect headless and want to front-load the Chrome cost, or you're debugging rendering differences. |
|
|
631
|
+
|
|
632
|
+
The result object includes a `renderMode` field so you can tell which path ran: `static`, `chrome`, `chrome-fallback`, `chrome-blocked-<code>` (Chrome tried but also got blocked), or `static-blocked` (both paths failed).
|
|
633
|
+
|
|
634
|
+
### Setup
|
|
635
|
+
|
|
636
|
+
Chrome modes need a Chrome or Chromium binary. The server looks in these locations, in order:
|
|
637
|
+
|
|
638
|
+
1. `CHROME_PATH` env var
|
|
639
|
+
2. `PUPPETEER_EXECUTABLE_PATH` env var
|
|
640
|
+
3. `C:/Program Files/Google/Chrome/Application/chrome.exe`
|
|
641
|
+
4. `C:/Program Files (x86)/Google/Chrome/Application/chrome.exe`
|
|
642
|
+
5. `/Applications/Google Chrome.app/Contents/MacOS/Google Chrome`
|
|
643
|
+
6. `/usr/bin/google-chrome`, `/usr/bin/chromium`, `/usr/bin/chromium-browser`
|
|
644
|
+
|
|
645
|
+
If none exist, `render_mode: "static"` still works; only the Chrome-backed modes become unavailable.
|
|
646
|
+
|
|
647
|
+
### Attaching to your own Chrome
|
|
648
|
+
|
|
649
|
+
For sites that fingerprint headless Chrome, start a Chrome instance with remote debugging and point the server at it. The server will attach to that instance instead of launching its own:
|
|
650
|
+
|
|
651
|
+
```bash
|
|
652
|
+
# macOS
|
|
653
|
+
/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome \
|
|
654
|
+
--remote-debugging-port=9222 --user-data-dir=/tmp/glippy-chrome
|
|
655
|
+
|
|
656
|
+
# Windows (PowerShell)
|
|
657
|
+
& "C:\Program Files\Google\Chrome\Application\chrome.exe" `
|
|
658
|
+
--remote-debugging-port=9222 --user-data-dir=C:\Temp\glippy-chrome
|
|
659
|
+
|
|
660
|
+
# Then in your MCP config env:
|
|
661
|
+
# CHROME_REMOTE_URL=http://127.0.0.1:9222
|
|
662
|
+
```
|
|
663
|
+
|
|
664
|
+
Using a dedicated `--user-data-dir` keeps this session isolated from your normal browsing. When attached, the fetcher leaves UA/headers/stealth untouched so requests look identical to a human using that browser.
|
|
665
|
+
|
|
666
|
+
### Visible mode
|
|
667
|
+
|
|
668
|
+
For debugging, set `CHROME_HEADLESS=0` to watch Chrome drive itself. Purely for development - leave it off in production.
|
|
669
|
+
|
|
670
|
+
---
|
|
671
|
+
|
|
596
672
|
## Architecture
|
|
597
673
|
|
|
598
674
|
```
|
|
599
675
|
research-mcp/
|
|
600
676
|
├── src/
|
|
601
|
-
│ ├── index.js
|
|
602
|
-
│
|
|
677
|
+
│ ├── index.js # MCP server - tool registration, JSON-RPC handling, license validation
|
|
678
|
+
│ ├── geo-checker.js # GEO analysis engine - fetches & scores domains
|
|
679
|
+
│ └── chrome-fetcher.js # Headless Chrome adapter (puppeteer-core) for WAF-blocked sites
|
|
603
680
|
├── package.json
|
|
604
681
|
└── README.md
|
|
605
682
|
```
|
|
@@ -609,13 +686,13 @@ research-mcp/
|
|
|
609
686
|
1. **Fetch resources in parallel:**
|
|
610
687
|
- robots.txt
|
|
611
688
|
- llms.txt
|
|
612
|
-
- Homepage HTML
|
|
689
|
+
- Homepage HTML (static fetch first, Chrome fallback if bot-blocked)
|
|
613
690
|
- sitemap.xml
|
|
614
691
|
- UCP profile (/.well-known/ucp)
|
|
615
692
|
|
|
616
693
|
2. **Parse HTML with cheerio** (server-side DOM)
|
|
617
694
|
|
|
618
|
-
3. **Run
|
|
695
|
+
3. **Run 16 weighted scoring categories**
|
|
619
696
|
|
|
620
697
|
4. **Return comprehensive analysis** with actionable recommendations
|
|
621
698
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "glippy-mcp",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "MCP server for GEO (Generative Engine Optimization) analysis — check any domain's AI-readiness",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"type": "module",
|
|
@@ -38,6 +38,7 @@
|
|
|
38
38
|
"dependencies": {
|
|
39
39
|
"@modelcontextprotocol/sdk": "^1.12.1",
|
|
40
40
|
"cheerio": "^1.0.0",
|
|
41
|
+
"puppeteer-core": "^24.40.0",
|
|
41
42
|
"zod": "^3.24.0"
|
|
42
43
|
}
|
|
43
44
|
}
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
// Chrome-backed fetch adapter for geo-checker.
|
|
2
|
+
//
|
|
3
|
+
// Exposes the same shape as the internal throttledFetchUrl:
|
|
4
|
+
// { body, statusCode, headers, finalUrl }
|
|
5
|
+
// but drives a headless Chrome via puppeteer-core so that bot-mitigation
|
|
6
|
+
// layers (Cloudflare, Akamai, PerimeterX, DataDome, Incapsula) that block
|
|
7
|
+
// raw Node fetches don't keep us out.
|
|
8
|
+
//
|
|
9
|
+
// The module holds a single long-lived browser + page pair. Callers fetch
|
|
10
|
+
// URLs sequentially; this is fine for the audit path (one domain at a time
|
|
11
|
+
// per checkGEO call) and avoids spinning up a new chromium process per page.
|
|
12
|
+
|
|
13
|
+
import puppeteer from 'puppeteer-core';
|
|
14
|
+
|
|
15
|
+
// Navigation timeout (ms) used when the caller does not pass one.
const DEFAULT_TIMEOUT_MS = 30_000;
// puppeteer `waitUntil` condition applied to page navigations.
const WAIT_UNTIL = 'networkidle2';

// Candidate Chrome/Chromium executables, highest priority first: explicit
// env overrides, then well-known Windows, macOS and Linux install paths.
// Env values are trimmed so a blank or whitespace-padded override is dropped
// instead of being probed as a bogus path. The list is frozen because it is
// module-level shared state that nothing should mutate.
const DEFAULT_CHROME_PATHS = Object.freeze(
  [
    process.env.CHROME_PATH?.trim(),
    process.env.PUPPETEER_EXECUTABLE_PATH?.trim(),
    'C:/Program Files/Google/Chrome/Application/chrome.exe',
    'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe',
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/usr/bin/google-chrome',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
  ].filter(Boolean),
);

// Memoized promise for the shared browser; null until first use and after close.
let browserPromise = null;
// True when the shared browser was attached to (CHROME_REMOTE_URL) rather than launched.
let connectedToExisting = false;
|
|
31
|
+
|
|
32
|
+
/**
 * Return the first candidate path that is a regular, executable file.
 *
 * Candidates are probed strictly in order so earlier entries (the env
 * overrides) win over later well-known install locations. A bare existence
 * check is not enough: a directory or a non-executable file would be handed
 * to the launcher and only fail there, so each candidate must be a regular
 * file that passes an X_OK access check.
 *
 * @param {readonly string[]} [candidates=DEFAULT_CHROME_PATHS] Paths to probe, highest priority first.
 * @returns {Promise<string|null>} The first usable path, or null when none qualifies.
 */
async function resolveChromePath(candidates = DEFAULT_CHROME_PATHS) {
  const [{ stat, access }, { constants }] = await Promise.all([
    import('node:fs/promises'),
    import('node:fs'),
  ]);
  for (const candidate of candidates) {
    if (!candidate) continue;
    try {
      // stat() follows symlinks, so a symlinked launcher still counts as a file.
      const stats = await stat(candidate);
      if (!stats.isFile()) continue;
      await access(candidate, constants.X_OK);
      return candidate;
    } catch {
      // Missing, unreadable or not executable — try the next candidate.
    }
  }
  return null;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Return the shared browser, connecting or launching it on first use.
 *
 * The in-flight promise is memoized so concurrent callers share one browser.
 * The memo is dropped again when the connect/launch fails or when the browser
 * later disconnects; otherwise a single transient failure (or a crashed
 * Chrome) would poison every subsequent call for the life of the process.
 *
 * @returns {Promise<object>} Resolves with the puppeteer browser instance.
 * @throws {Error} When no Chrome executable is found, or connect/launch fails.
 */
async function getBrowser() {
  if (browserPromise) return browserPromise;

  const pending = (async () => {
    // Mode 1: attach to a user's already-running Chrome via CDP.
    // Start Chrome with `--remote-debugging-port=9222` and (if they want to
    // reuse their normal profile) pass `--user-data-dir=...` to a dedicated
    // clone. CHROME_REMOTE_URL can be either browserURL (http://host:port)
    // or a browserWSEndpoint (ws://...).
    const remoteUrl = process.env.CHROME_REMOTE_URL;
    if (remoteUrl) {
      const opts = remoteUrl.startsWith('ws')
        ? { browserWSEndpoint: remoteUrl }
        : { browserURL: remoteUrl };
      const browser = await puppeteer.connect({
        ...opts,
        defaultViewport: null,
      });
      connectedToExisting = true;
      return browser;
    }

    // Mode 2: launch our own Chrome. Headless by default; set
    // CHROME_HEADLESS=0 to run visible (useful for sites that aggressively
    // detect headless).
    const executablePath = await resolveChromePath();
    if (!executablePath) {
      throw new Error(
        'Chrome executable not found. Set CHROME_PATH or install Chrome/Chromium.',
      );
    }
    const headlessEnv = process.env.CHROME_HEADLESS;
    const headless = headlessEnv === '0' || headlessEnv === 'false' ? false : 'new';
    const userDataDir = process.env.CHROME_USER_DATA_DIR || undefined;
    return puppeteer.launch({
      executablePath,
      headless,
      userDataDir,
      args: [
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        '--disable-features=IsolateOrigins,site-per-process',
      ],
    });
  })();
  browserPromise = pending;

  // Invalidate the memo only if it still points at this attempt; a newer
  // attempt (or an explicit close) may already have replaced it.
  const forget = () => {
    if (browserPromise === pending) {
      browserPromise = null;
      connectedToExisting = false;
    }
  };
  // Side chain only: callers still observe the original resolution/rejection
  // through `pending`; the trailing catch keeps this chain from ever
  // surfacing as an unhandled rejection.
  pending
    .then(
      (browser) => {
        browser.once('disconnected', forget);
      },
      forget,
    )
    .catch(() => {});

  return pending;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Install a minimal set of anti-detection patches on every new document of
 * the given page.
 *
 * This won't defeat enterprise bot mitigation, but clears the trivial checks
 * many WAFs rely on. The callback is handed to `evaluateOnNewDocument`, so it
 * must stay self-contained and must not close over anything from this module.
 *
 * @param {object} page puppeteer page to patch.
 * @returns {Promise<void>}
 */
async function applyStealth(page) {
  await page.evaluateOnNewDocument(() => {
    // Mask the automation flag.
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    // languages / plugins
    Object.defineProperty(navigator, 'languages', { get: () => ['nl-NL', 'nl', 'en-US', 'en'] });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    // chrome.runtime stub
    window.chrome = window.chrome || { runtime: {} };
    // permissions query patch (Notification)
    const permissions = window.navigator.permissions;
    const originalQuery = permissions && permissions.query;
    if (originalQuery) {
      permissions.query = (parameters) =>
        parameters && parameters.name === 'notifications'
          ? Promise.resolve({ state: Notification.permission })
          : // The native method must run with its original receiver; calling
            // it unbound throws "Illegal invocation" for every other query.
            originalQuery.call(permissions, parameters);
    }
  });
}
|
|
115
|
+
|
|
116
|
+
/**
 * Fetch a URL through the shared Chrome instance.
 *
 * Never throws: any failure is reported as the empty result shape plus an
 * `error` message, and the page opened for the request is always closed.
 *
 * @param {string} url Absolute URL to navigate to.
 * @param {number} [timeoutMs=DEFAULT_TIMEOUT_MS] Navigation timeout in milliseconds.
 * @returns {Promise<{body: string|null, statusCode: number|null, headers: object, finalUrl: string|null, error?: string}>}
 */
export async function chromeFetch(url, timeoutMs = DEFAULT_TIMEOUT_MS) {
  const empty = { body: null, statusCode: null, headers: {}, finalUrl: null };
  let page;
  try {
    const browser = await getBrowser();
    page = await browser.newPage();
    // When attached to a user's Chrome, leave UA/headers/stealth alone —
    // their real profile already looks like a human. Only shape the
    // request when we launched Chrome ourselves.
    if (!connectedToExisting) {
      await page.setViewport({ width: 1366, height: 768 });
      await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      );
      await page.setExtraHTTPHeaders({
        'Accept-Language': 'nl-NL,nl;q=0.9,en-US;q=0.8,en;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
      });
      await applyStealth(page);
    }

    // Remember the most recent main-frame document response. After a WAF
    // challenge clears, this is the response for the real page. Waiting for
    // "any next response" instead would pick up an arbitrary subresource
    // (image, script, XHR), or time out and leave the stale 403/503 in place.
    let latestDocumentResponse = null;
    page.on('response', (res) => {
      try {
        if (res.request().isNavigationRequest() && res.frame() === page.mainFrame()) {
          latestDocumentResponse = res;
        }
      } catch {
        // Response from a frame that is already gone — not the main document.
      }
    });

    const response = await page.goto(url, {
      waitUntil: WAIT_UNTIL,
      timeout: timeoutMs,
    });

    if (!response) return empty;

    let statusCode = response.status();
    let headers = response.headers() || {};
    // Some WAFs (Cloudflare) serve a 403 interstitial, then JS solves
    // the challenge and navigates to real content. Give it a brief window
    // to settle and re-read the final status from the follow-up navigation.
    if (statusCode === 403 || statusCode === 503) {
      try {
        await page.waitForFunction(
          () => {
            const html = document.documentElement ? document.documentElement.outerHTML : '';
            // Cloudflare challenge markers
            return !/cf-challenge|cf-browser-verification|Just a moment/i.test(html);
          },
          { timeout: 8000 },
        );
        const followUp = latestDocumentResponse;
        if (followUp && followUp !== response) {
          // A new main-frame navigation replaced the interstitial (this also
          // covers a reload of the same URL, which a URL comparison misses).
          statusCode = followUp.status();
          headers = followUp.headers() || headers;
          // Best effort: let the real page finish loading before snapshotting.
          await page.waitForNetworkIdle({ idleTime: 500, timeout: 2000 }).catch(() => {});
        }
      } catch {
        // challenge didn't clear — keep the 403/503 so caller can decide.
      }
    }

    const finalUrl = page.url();
    let body = null;
    try {
      body = await page.content();
    } catch {
      // The document may be mid-navigation; report a missing body rather than fail.
      body = null;
    }

    return { body, statusCode, headers, finalUrl };
  } catch (err) {
    return { ...empty, error: err?.message ?? String(err) };
  } finally {
    if (page) {
      try { await page.close(); } catch { /* page already gone */ }
    }
  }
}
|
|
194
|
+
|
|
195
|
+
/**
 * Release the shared browser, if any. Best effort: never throws.
 *
 * A browser we attached to is only disconnected (it belongs to the user);
 * a browser we launched is closed. The memoized promise is cleared before
 * the first await so a concurrent caller cannot be handed a browser that is
 * in the middle of shutting down.
 *
 * @returns {Promise<void>}
 */
export async function closeBrowser() {
  const pending = browserPromise;
  if (!pending) return;
  browserPromise = null;
  try {
    const browser = await pending;
    // Snapshot and reset the flag together, before the next await, so a new
    // connection established while we are closing keeps its own value.
    const attached = connectedToExisting;
    connectedToExisting = false;
    if (attached) {
      await browser.disconnect();
    } else {
      await browser.close();
    }
  } catch {
    // Launch never succeeded or the browser is already gone — nothing to release.
  }
}
|
|
208
|
+
|
|
209
|
+
// Close on process exit so the Chrome process doesn't linger.
//
// closeBrowser() is asynchronous, so on SIGINT/SIGTERM the process must not
// exit until it has settled; exiting right after starting it abandons the
// close. A short unref'd timer bounds the wait so a hung browser cannot block
// shutdown. 'exit' listeners can only do synchronous work, so that hook is a
// last-ditch attempt only.
const SHUTDOWN_GRACE_MS = 3000;
const shutdown = () => closeBrowser().catch(() => {});
const exitAfterShutdown = (exitCode) => {
  const forceExit = setTimeout(() => process.exit(exitCode), SHUTDOWN_GRACE_MS);
  forceExit.unref();
  shutdown().finally(() => {
    clearTimeout(forceExit);
    process.exit(exitCode);
  });
};
process.once('exit', () => { shutdown(); });
process.once('SIGINT', () => { exitAfterShutdown(130); });
process.once('SIGTERM', () => { exitAfterShutdown(143); });
|
package/src/geo-checker.js
CHANGED
|
@@ -9,6 +9,19 @@ import http from 'node:http';
|
|
|
9
9
|
import https from 'node:https';
|
|
10
10
|
import { URL } from 'node:url';
|
|
11
11
|
import * as cheerio from 'cheerio';
|
|
12
|
+
import { chromeFetch } from './chrome-fetcher.js';
|
|
13
|
+
|
|
14
|
+
// Status codes that indicate the server is refusing or stalling a bot-shaped
// request rather than serving real content. 202 (Amazon) and 400 (Douglas)
// sit here because in practice those are only returned to non-browser UAs.
const BOT_BLOCK_STATUS = new Set([202, 400, 401, 403, 407, 429, 503]);

/**
 * Decide whether a fetch result looks like a bot-mitigation refusal.
 *
 * A result counts as blocked when it is missing, has no status code, carries
 * one of the BOT_BLOCK_STATUS codes, or is a 2xx without usable content. A
 * whitespace-only string body is treated as empty, since there is nothing in
 * it to analyse.
 *
 * @param {{statusCode?: number|null, body?: unknown}|null|undefined} res Fetch result to inspect.
 * @returns {boolean} True when the result looks bot-blocked.
 */
function looksBotBlocked(res) {
  if (!res || res.statusCode == null) return true;
  const { statusCode, body } = res;
  if (BOT_BLOCK_STATUS.has(statusCode)) return true;
  const isSuccess = statusCode >= 200 && statusCode < 300;
  if (!isSuccess) return false;
  if (!body) return true;
  return typeof body === 'string' && body.trim() === '';
}
|
|
12
25
|
|
|
13
26
|
// ---------------------------------------------------------------------------
|
|
14
27
|
// Constants
|
|
@@ -750,8 +763,11 @@ function detectPageType($, schemaTypes, pathname) {
|
|
|
750
763
|
if (['Article', 'NewsArticle', 'BlogPosting', 'TechArticle'].some((t) => schemaTypes.has(t))) return 'article';
|
|
751
764
|
if (['LocalBusiness', 'Restaurant', 'Store'].some((t) => schemaTypes.has(t))) return 'local-business';
|
|
752
765
|
|
|
753
|
-
// Heuristic: homepage detection
|
|
754
|
-
|
|
766
|
+
// Heuristic: homepage detection (including language/locale-prefixed homepages like /en/, /de-DE/, /nl/)
|
|
767
|
+
// Strip a leading language or locale segment before checking so multilingual
|
|
768
|
+
// sites hosting their homepage at /en/ or /nl-NL/ are not treated as generic.
|
|
769
|
+
const normalizedPath = pathname.replace(/^\/[a-z]{2}(?:[-_][a-z]{2,3})?\/?$/i, '/');
|
|
770
|
+
if (normalizedPath === '/' || normalizedPath === '/index.html' || normalizedPath === '/index.php' || normalizedPath === '') return 'homepage';
|
|
755
771
|
|
|
756
772
|
// Heuristic: FAQ page via DOM
|
|
757
773
|
const faqIndicators = $('[class*="faq"], [id*="faq"], details, [class*="accordion"]');
|
|
@@ -1439,7 +1455,7 @@ function checkAccessibility($) {
|
|
|
1439
1455
|
const unlabeledInputList = [];
|
|
1440
1456
|
inputs.each((_, el) => {
|
|
1441
1457
|
const id = $(el).attr('id');
|
|
1442
|
-
const hasLabel = id && $(`label[for="${id}"]`).length > 0;
|
|
1458
|
+
const hasLabel = id && $(`label[for="${id.replace(/(["\\])/g, '\\$1')}"]`).length > 0;
|
|
1443
1459
|
const hasAriaLabel = $(el).attr('aria-label') || $(el).attr('aria-labelledby');
|
|
1444
1460
|
const wrappedInLabel = $(el).closest('label').length > 0;
|
|
1445
1461
|
const hasPlaceholder = $(el).attr('placeholder');
|
|
@@ -1885,6 +1901,63 @@ function checkMachineReadability($, robotsTxtData, llmsTxtData, responseHeaders)
|
|
|
1885
1901
|
return { checks, score: maxScore > 0 ? Math.round((score / maxScore) * 100) : 0, category: 'Machine Readability' };
|
|
1886
1902
|
}
|
|
1887
1903
|
|
|
1904
|
+
// ---------------------------------------------------------------------------
|
|
1905
|
+
// Trust signal evidence extractor
|
|
1906
|
+
// ---------------------------------------------------------------------------
|
|
1907
|
+
|
|
1908
|
+
/**
|
|
1909
|
+
* Extract raw nav/header/footer links plus language signals. Hardcoded pattern
|
|
1910
|
+
* lists cannot keep up with ~100 languages and typos; instead we surface the
|
|
1911
|
+
* raw anchor text + href so the calling LLM (or downstream consumer) can
|
|
1912
|
+
* classify trust signals (about / contact / legal / imprint / cookies)
|
|
1913
|
+
* semantically in whatever language the site uses.
|
|
1914
|
+
*
|
|
1915
|
+
* @param {cheerio.CheerioAPI} $
|
|
1916
|
+
* @returns {{
|
|
1917
|
+
* htmlLang: string|null,
|
|
1918
|
+
* hreflangs: string[],
|
|
1919
|
+
* navLinks: Array<{href: string, text: string, rel: string|null}>,
|
|
1920
|
+
* footerLinks: Array<{href: string, text: string, rel: string|null}>,
|
|
1921
|
+
* }}
|
|
1922
|
+
*/
|
|
1923
|
+
function extractTrustSignals($) {
|
|
1924
|
+
const PER_LOCATION_LIMIT = 80;
|
|
1925
|
+
const MAX_TEXT_LEN = 120;
|
|
1926
|
+
|
|
1927
|
+
function collect(selector) {
|
|
1928
|
+
const out = [];
|
|
1929
|
+
const seen = new Set();
|
|
1930
|
+
$(selector).find('a[href]').each((_, el) => {
|
|
1931
|
+
if (out.length >= PER_LOCATION_LIMIT) return false;
|
|
1932
|
+
const $el = $(el);
|
|
1933
|
+
const href = ($el.attr('href') || '').trim();
|
|
1934
|
+
if (!href || href.startsWith('#') || href.toLowerCase().startsWith('javascript:')) return;
|
|
1935
|
+
const text = $el.text().trim().replace(/\s+/g, ' ').slice(0, MAX_TEXT_LEN);
|
|
1936
|
+
const key = `${href}|${text}`;
|
|
1937
|
+
if (seen.has(key)) return;
|
|
1938
|
+
seen.add(key);
|
|
1939
|
+
out.push({ href, text, rel: $el.attr('rel') || null });
|
|
1940
|
+
});
|
|
1941
|
+
return out;
|
|
1942
|
+
}
|
|
1943
|
+
|
|
1944
|
+
const navLinks = collect('header, nav, [role="navigation"], [class*="menu" i], [class*="navigation" i], [id*="menu" i], [id*="nav" i]');
|
|
1945
|
+
const footerLinks = collect('footer, [role="contentinfo"], [class*="footer" i], [id*="footer" i]');
|
|
1946
|
+
|
|
1947
|
+
const hreflangs = [];
|
|
1948
|
+
$('link[rel="alternate"][hreflang]').each((_, el) => {
|
|
1949
|
+
const hl = $(el).attr('hreflang');
|
|
1950
|
+
if (hl) hreflangs.push(hl);
|
|
1951
|
+
});
|
|
1952
|
+
|
|
1953
|
+
return {
|
|
1954
|
+
htmlLang: $('html').attr('lang') || null,
|
|
1955
|
+
hreflangs,
|
|
1956
|
+
navLinks,
|
|
1957
|
+
footerLinks,
|
|
1958
|
+
};
|
|
1959
|
+
}
|
|
1960
|
+
|
|
1888
1961
|
// ---------------------------------------------------------------------------
|
|
1889
1962
|
// CHECK CATEGORY 7: Entity & Authority
|
|
1890
1963
|
// ---------------------------------------------------------------------------
|
|
@@ -2464,20 +2537,133 @@ function checkEntity($, jsonLdData) {
|
|
|
2464
2537
|
checks.push({ status: 'info', label: 'No About/Contact page links detected', detail: 'Link to organizational info for E-E-A-T' });
|
|
2465
2538
|
}
|
|
2466
2539
|
|
|
2467
|
-
// Privacy / Terms links (trust signals)
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
2471
|
-
const
|
|
2472
|
-
|
|
2473
|
-
|
|
2540
|
+
// Privacy / Terms / Imprint / Cookies links (trust signals, multi-language)
|
|
2541
|
+
// Hardcoded patterns are a fallback heuristic; the extractTrustSignals
|
|
2542
|
+
// evidence payload on the analysis result lets LLM callers reclassify
|
|
2543
|
+
// semantically in any language.
|
|
2544
|
+
const privacyPatterns = [
|
|
2545
|
+
// English
|
|
2546
|
+
'privacy', 'privacy-policy',
|
|
2547
|
+
// Latin-alphabet European languages
|
|
2548
|
+
'datenschutz', 'privatsphaere', 'privatsphare',
|
|
2549
|
+
'privacidad', 'politica-de-privacidad',
|
|
2550
|
+
'privacidade', 'politica-de-privacidade',
|
|
2551
|
+
'confidentialite', 'politique-de-confidentialite', 'vie-privee',
|
|
2552
|
+
'riservatezza', 'privacy-italia',
|
|
2553
|
+
'privacybeleid', 'privacyverklaring',
|
|
2554
|
+
'integritet', 'integritetspolicy',
|
|
2555
|
+
'personvern',
|
|
2556
|
+
'tietosuoja', 'yksityisyys',
|
|
2557
|
+
'persondata', 'fortrolighed',
|
|
2558
|
+
'adatvedelem',
|
|
2559
|
+
'prywatnosc', 'polityka-prywatnosci',
|
|
2560
|
+
'soukromi', 'ochrana-osobnich-udaju',
|
|
2561
|
+
'ochrana-osobnych-udajov',
|
|
2562
|
+
'confidentialitate',
|
|
2563
|
+
'poverljivost', 'privatnost',
|
|
2564
|
+
'zasebnost',
|
|
2565
|
+
'privatesia', 'privatnost-hr',
|
|
2566
|
+
'konfidentsialnost', 'privatnost-ba',
|
|
2567
|
+
'gizlilik',
|
|
2568
|
+
'privatumas', 'privatuma',
|
|
2569
|
+
'yasslilik',
|
|
2570
|
+
// Romanized non-Latin
|
|
2571
|
+
'konfidentsialnost', 'konfidentsialnost-ua', 'konfidentsialnist',
|
|
2572
|
+
'idiotikotita', 'aporrito', 'prostasia-dedomenon',
|
|
2573
|
+
'puraibashi', 'puraibasi-porisi',
|
|
2574
|
+
'geinsajeongbobo', 'gaeinjeongbo',
|
|
2575
|
+
'yinsi', 'yinsi-zhengce',
|
|
2576
|
+
'khasusiyat', 'khososi',
|
|
2577
|
+
'harimiyat',
|
|
2578
|
+
'niji-gopaniyata', 'gopaniyata',
|
|
2579
|
+
'gopniyata',
|
|
2580
|
+
'kerahasiaan', 'privasi',
|
|
2581
|
+
'quyen-rieng-tu', 'bao-mat',
|
|
2582
|
+
'khwam-pen-suanto', 'nayobai-khwampensuntu',
|
|
2583
|
+
];
|
|
2584
|
+
const termsPatterns = [
|
|
2585
|
+
// English
|
|
2586
|
+
'terms', 'terms-of-service', 'terms-of-use', 'terms-conditions', 'tos',
|
|
2587
|
+
// Latin-alphabet European languages
|
|
2588
|
+
'agb', 'nutzungsbedingungen', 'geschaeftsbedingungen',
|
|
2589
|
+
'condiciones', 'terminos', 'terminos-y-condiciones', 'condiciones-de-uso',
|
|
2590
|
+
'termos', 'termos-de-uso', 'termos-de-servico',
|
|
2591
|
+
'conditions-generales', 'cgu', 'cgv', 'mentions-contrat',
|
|
2592
|
+
'condizioni', 'termini', 'termini-e-condizioni',
|
|
2593
|
+
'voorwaarden', 'algemene-voorwaarden', 'gebruiksvoorwaarden',
|
|
2594
|
+
'villkor', 'anvandarvillkor', 'allmanna-villkor',
|
|
2595
|
+
'brukervilkar', 'vilkar',
|
|
2596
|
+
'kayttoehdot', 'ehdot',
|
|
2597
|
+
'betingelser', 'vilkaar', 'handelsbetingelser',
|
|
2598
|
+
'szerzodesi-feltetelek', 'felhasznalasi-feltetelek',
|
|
2599
|
+
'regulamin', 'warunki',
|
|
2600
|
+
'podminky', 'vseobecne-obchodni-podminky', 'obchodni-podminky',
|
|
2601
|
+
'obchodne-podmienky',
|
|
2602
|
+
'termeni-si-conditii', 'termeni',
|
|
2603
|
+
'uslovi', 'uvjeti', 'pogoji',
|
|
2604
|
+
'kosullar', 'kullanim-kosullari',
|
|
2605
|
+
'salygos', 'naudojimo-salygos',
|
|
2606
|
+
'noteikumi',
|
|
2607
|
+
'kasutustingimused',
|
|
2608
|
+
// Romanized non-Latin
|
|
2609
|
+
'usloviya', 'usloviya-ispolzovaniya', 'pravila',
|
|
2610
|
+
'umovy', 'pravyla',
|
|
2611
|
+
'oroi', 'oroi-xrisis',
|
|
2612
|
+
'riyoukiyaku', 'riyou-kiyaku', 'kiyaku',
|
|
2613
|
+
'iyong-yakgwan', 'yakgwan',
|
|
2614
|
+
'tiaokuan', 'fuwu-tiaokuan', 'shiyong-tiaokuan',
|
|
2615
|
+
'shuruth', 'shuroot-alistikhdam',
|
|
2616
|
+
'sharayit-estefadeh', 'sharayet',
|
|
2617
|
+
'niyam-shartein', 'shartein',
|
|
2618
|
+
'sharth-o',
|
|
2619
|
+
'ketentuan', 'syarat-ketentuan',
|
|
2620
|
+
'dieu-khoan', 'dieu-khoan-su-dung',
|
|
2621
|
+
'khoapkamnot', 'ngeuankhai-kan-chai',
|
|
2622
|
+
];
|
|
2623
|
+
const imprintPatterns = [
|
|
2624
|
+
// Legally required in DE/AT/CH, common across DACH + EU
|
|
2625
|
+
'impressum', 'imprint', 'mentions-legales', 'aviso-legal',
|
|
2626
|
+
'note-legali', 'colofon', 'colophon', 'wettelijke-vermelding',
|
|
2627
|
+
'juridisk-information', 'oikeudellinen-huomautus',
|
|
2628
|
+
'aviso-legal-pt', 'noticia-legal',
|
|
2629
|
+
'pravni-udaje', 'pravne-informacie',
|
|
2630
|
+
'yasal-bildirim', 'yasal-uyari',
|
|
2631
|
+
'informacje-prawne',
|
|
2632
|
+
'hukuki-bilgiler',
|
|
2633
|
+
'impresum',
|
|
2634
|
+
];
|
|
2635
|
+
const cookiePatterns = [
|
|
2636
|
+
'cookie', 'cookies', 'cookiebeleid', 'cookie-policy',
|
|
2637
|
+
'politique-cookies', 'politica-cookies', 'politica-de-cookies',
|
|
2638
|
+
'cookierichtlinie', 'cookie-einstellungen',
|
|
2639
|
+
'kekse', 'cookie-instellingen',
|
|
2640
|
+
'soubory-cookie', 'sukromie-cookie',
|
|
2641
|
+
'cerezler', 'gizlilik-cerezler',
|
|
2642
|
+
'pliki-cookie',
|
|
2643
|
+
'fichiers-cookie',
|
|
2644
|
+
'kukit',
|
|
2645
|
+
];
|
|
2646
|
+
const buildSelector = (patterns) => patterns.map((p) => `a[href*="${p}" i]`).join(', ');
|
|
2647
|
+
const privacyLink = $(buildSelector(privacyPatterns));
|
|
2648
|
+
const termsLink = $(buildSelector(termsPatterns));
|
|
2649
|
+
const imprintLink = $(buildSelector(imprintPatterns));
|
|
2650
|
+
const cookieLink = $(buildSelector(cookiePatterns));
|
|
2474
2651
|
|
|
2475
2652
|
maxScore += 5;
|
|
2476
|
-
|
|
2653
|
+
const legalSignals = [];
|
|
2654
|
+
if (privacyLink.length > 0) legalSignals.push('privacy');
|
|
2655
|
+
if (termsLink.length > 0) legalSignals.push('terms');
|
|
2656
|
+
if (imprintLink.length > 0) legalSignals.push('imprint');
|
|
2657
|
+
if (cookieLink.length > 0) legalSignals.push('cookies');
|
|
2658
|
+
|
|
2659
|
+
if (legalSignals.length >= 2) {
|
|
2477
2660
|
score += 5;
|
|
2478
|
-
checks.push({ status: 'pass', label:
|
|
2661
|
+
checks.push({ status: 'pass', label: `Legal pages linked (${legalSignals.length})`, detail: `Detected: ${legalSignals.join(', ')}` });
|
|
2662
|
+
} else if (legalSignals.length === 1) {
|
|
2663
|
+
score += 3;
|
|
2664
|
+
checks.push({ status: 'warn', label: `Only one legal page linked (${legalSignals[0]})`, detail: 'Add the others (privacy, terms, imprint, cookies) for full trust signals. Heuristic may miss non-Latin scripts — check evidence payload.' });
|
|
2479
2665
|
} else {
|
|
2480
|
-
checks.push({ status: 'info', label: 'No
|
|
2666
|
+
checks.push({ status: 'info', label: 'No legal page links detected by heuristic', detail: 'If the site is non-English, verify via the footerLinks evidence payload before treating as missing.' });
|
|
2481
2667
|
}
|
|
2482
2668
|
|
|
2483
2669
|
// E-E-A-T Experience Signals (10 pts)
|
|
@@ -2539,7 +2725,7 @@ function checkEntity($, jsonLdData) {
|
|
|
2539
2725
|
const hasPhone = /(\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{3,4}/.test(bodyText);
|
|
2540
2726
|
const hasEmail = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z]{2,}\b/i.test(bodyText);
|
|
2541
2727
|
const hasAddress = $('[itemprop="address"], [class*="address"], address').length > 0;
|
|
2542
|
-
const hasContactPage =
|
|
2728
|
+
const hasContactPage = contactLink.length > 0;
|
|
2543
2729
|
const contactSignals = (hasPhone ? 1 : 0) + (hasEmail ? 1 : 0) + (hasAddress ? 1 : 0) + (hasContactPage ? 1 : 0);
|
|
2544
2730
|
maxScore += 5;
|
|
2545
2731
|
if (contactSignals >= 3) {
|
|
@@ -2985,7 +3171,7 @@ function checkWebMCP($, pageType, ucpData) {
|
|
|
2985
3171
|
const name = input.attr('name');
|
|
2986
3172
|
const type = input.attr('type');
|
|
2987
3173
|
const id = input.attr('id');
|
|
2988
|
-
const label = id ? $(`label[for="${id}"]`).length > 0 : false;
|
|
3174
|
+
const label = id ? $(`label[for="${id.replace(/(["\\])/g, '\\$1')}"]`).length > 0 : false;
|
|
2989
3175
|
const ariaLabel = input.attr('aria-label');
|
|
2990
3176
|
const placeholder = input.attr('placeholder');
|
|
2991
3177
|
|
|
@@ -4243,6 +4429,16 @@ function analyseHTML(html, domain, robotsTxtData, llmsTxtData, responseHeaders,
|
|
|
4243
4429
|
headings: { h1: [], h2: [] },
|
|
4244
4430
|
lang: null,
|
|
4245
4431
|
hasStructuredData: false,
|
|
4432
|
+
// Raw evidence for language-agnostic trust signal classification.
|
|
4433
|
+
// Populated by extractTrustSignals; consumers running inside an LLM can
|
|
4434
|
+
// reclassify legal / about / contact / imprint / cookies semantically
|
|
4435
|
+
// instead of relying on the heuristic pattern lists.
|
|
4436
|
+
evidence: {
|
|
4437
|
+
htmlLang: null,
|
|
4438
|
+
hreflangs: [],
|
|
4439
|
+
navLinks: [],
|
|
4440
|
+
footerLinks: [],
|
|
4441
|
+
},
|
|
4246
4442
|
};
|
|
4247
4443
|
|
|
4248
4444
|
if (!html) return result;
|
|
@@ -4265,6 +4461,9 @@ function analyseHTML(html, domain, robotsTxtData, llmsTxtData, responseHeaders,
|
|
|
4265
4461
|
const pageType = detectPageType($, schemaTypes, pathname);
|
|
4266
4462
|
result.pageType = pageType;
|
|
4267
4463
|
|
|
4464
|
+
// Extract language-agnostic trust signal evidence
|
|
4465
|
+
result.evidence = extractTrustSignals($);
|
|
4466
|
+
|
|
4268
4467
|
// Populate basic metadata fields (backward-compatible with old analyseHTML)
|
|
4269
4468
|
result.title = $('title').first().text().trim() || null;
|
|
4270
4469
|
result.lang = $('html').attr('lang') || null;
|
|
@@ -4391,6 +4590,7 @@ function analyseHTML(html, domain, robotsTxtData, llmsTxtData, responseHeaders,
|
|
|
4391
4590
|
async function checkGEO(domain, options = {}) {
|
|
4392
4591
|
const maxPages = options.maxPages ?? MAX_PAGES_PER_DOMAIN;
|
|
4393
4592
|
const skipCache = options.skipCache ?? false;
|
|
4593
|
+
const renderMode = options.renderMode ?? 'auto'; // 'static' | 'chrome' | 'auto'
|
|
4394
4594
|
|
|
4395
4595
|
// Check cache first (unless explicitly skipped)
|
|
4396
4596
|
if (!skipCache) {
|
|
@@ -4500,7 +4700,9 @@ async function checkGEO(domain, options = {}) {
|
|
|
4500
4700
|
[robotsRes, llmsRes, homepageRes, sitemapRes, ucpRes] = await Promise.all([
|
|
4501
4701
|
throttledFetchUrl(robotsUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
|
|
4502
4702
|
throttledFetchUrl(llmsUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
|
|
4503
|
-
|
|
4703
|
+
renderMode === 'chrome'
|
|
4704
|
+
? chromeFetch(homepageUrl).catch(() => ({ body: null, statusCode: null, headers: {} }))
|
|
4705
|
+
: throttledFetchUrl(homepageUrl).catch(() => ({ body: null, statusCode: null, headers: {} })),
|
|
4504
4706
|
throttledFetchUrl(sitemapUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
|
|
4505
4707
|
throttledFetchUrl(ucpUrl, FETCH_TIMEOUT_MS, MAX_TEXT_BODY_SIZE).catch(() => ({ body: null, statusCode: null, headers: {} })),
|
|
4506
4708
|
]);
|
|
@@ -4509,6 +4711,31 @@ async function checkGEO(domain, options = {}) {
|
|
|
4509
4711
|
return output;
|
|
4510
4712
|
}
|
|
4511
4713
|
|
|
4714
|
+
// Auto fallback: if static fetch couldn't get the homepage (bot block,
|
|
4715
|
+
// WAF, or network error), retry via headless Chrome. Record that we
|
|
4716
|
+
// rendered via Chrome so downstream multi-page crawl uses it too.
|
|
4717
|
+
let useChromeForCrawl = renderMode === 'chrome';
|
|
4718
|
+
if (renderMode === 'auto' && looksBotBlocked(homepageRes)) {
|
|
4719
|
+
const chromeRes = await chromeFetch(homepageUrl).catch(() => null);
|
|
4720
|
+
const chromeOk =
|
|
4721
|
+
chromeRes &&
|
|
4722
|
+
typeof chromeRes.statusCode === 'number' &&
|
|
4723
|
+
chromeRes.statusCode >= 200 &&
|
|
4724
|
+
chromeRes.statusCode < 300 &&
|
|
4725
|
+
chromeRes.body;
|
|
4726
|
+
if (chromeOk) {
|
|
4727
|
+
homepageRes = chromeRes;
|
|
4728
|
+
useChromeForCrawl = true;
|
|
4729
|
+
output.renderMode = 'chrome-fallback';
|
|
4730
|
+
} else {
|
|
4731
|
+
output.renderMode = chromeRes && chromeRes.statusCode
|
|
4732
|
+
? `chrome-blocked-${chromeRes.statusCode}`
|
|
4733
|
+
: 'static-blocked';
|
|
4734
|
+
}
|
|
4735
|
+
} else {
|
|
4736
|
+
output.renderMode = renderMode === 'chrome' ? 'chrome' : 'static';
|
|
4737
|
+
}
|
|
4738
|
+
|
|
4512
4739
|
// --- robots.txt ---
|
|
4513
4740
|
try {
|
|
4514
4741
|
if (robotsRes.statusCode === 200 && robotsRes.body) {
|
|
@@ -4547,7 +4774,12 @@ async function checkGEO(domain, options = {}) {
|
|
|
4547
4774
|
// --- Homepage (full 16-category analysis) ---
|
|
4548
4775
|
try {
|
|
4549
4776
|
output.homepage.statusCode = homepageRes.statusCode;
|
|
4550
|
-
|
|
4777
|
+
// Accept any 2xx that came back with a body. In practice Chrome often
|
|
4778
|
+
// surfaces 202 (Amazon) or 206 responses that still carry the rendered
|
|
4779
|
+
// document; analysing those is strictly better than dropping the score.
|
|
4780
|
+
const homepageUsable = homepageRes.statusCode >= 200 &&
|
|
4781
|
+
homepageRes.statusCode < 300 && !!homepageRes.body;
|
|
4782
|
+
if (homepageUsable) {
|
|
4551
4783
|
output.homepage.analysis = analyseHTML(
|
|
4552
4784
|
homepageRes.body,
|
|
4553
4785
|
cleanDomain,
|
|
@@ -4633,14 +4865,18 @@ async function checkGEO(domain, options = {}) {
|
|
|
4633
4865
|
error: output.homepage.error,
|
|
4634
4866
|
});
|
|
4635
4867
|
|
|
4868
|
+
// Chrome fetches are serial (one tab at a time), static fetches run in batches.
|
|
4869
|
+
const concurrency = useChromeForCrawl ? 1 : MAX_CONCURRENT_PAGE_FETCHES;
|
|
4636
4870
|
// Fetch remaining pages in controlled batches
|
|
4637
|
-
for (let i = 0; i < pagesToCrawl.length; i +=
|
|
4638
|
-
const batch = pagesToCrawl.slice(i, i +
|
|
4871
|
+
for (let i = 0; i < pagesToCrawl.length; i += concurrency) {
|
|
4872
|
+
const batch = pagesToCrawl.slice(i, i + concurrency);
|
|
4639
4873
|
const batchResults = await Promise.all(
|
|
4640
4874
|
batch.map(async (pageUrl) => {
|
|
4641
4875
|
try {
|
|
4642
|
-
const res =
|
|
4643
|
-
|
|
4876
|
+
const res = useChromeForCrawl
|
|
4877
|
+
? await chromeFetch(pageUrl, PAGE_CRAWL_TIMEOUT_MS)
|
|
4878
|
+
: await throttledFetchUrl(pageUrl, PAGE_CRAWL_TIMEOUT_MS);
|
|
4879
|
+
if (res.statusCode >= 200 && res.statusCode < 300 && res.body) {
|
|
4644
4880
|
// Determine pathname for page type detection
|
|
4645
4881
|
let pathname = '/';
|
|
4646
4882
|
try { pathname = new URL(pageUrl).pathname; } catch {}
|
package/src/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
1
|
+
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Glippy MCP Server
|
|
@@ -36,6 +36,27 @@ import {
|
|
|
36
36
|
parseSitemapUrls,
|
|
37
37
|
aggregatePageScores,
|
|
38
38
|
} from "./geo-checker.js";
|
|
39
|
+
import { chromeFetch } from "./chrome-fetcher.js";
|
|
40
|
+
|
|
41
|
+
// Render-mode: how to fetch HTML for scoring.
|
|
42
|
+
// 'static' (default for tools that don't specify) - raw Node fetch, fastest
|
|
43
|
+
// 'chrome' - always render via headless Chrome
|
|
44
|
+
// 'auto' - static first, Chrome fallback on bot-block
|
|
45
|
+
//
|
|
46
|
+
// Chrome modes require a local Chrome/Chromium binary. Auto-resolves from
|
|
47
|
+
// CHROME_PATH / PUPPETEER_EXECUTABLE_PATH / common install locations, or
|
|
48
|
+
// attaches to an already-running Chrome when CHROME_REMOTE_URL is set
|
|
49
|
+
// (e.g. "http://localhost:9222" after launching Chrome with
|
|
50
|
+
// --remote-debugging-port=9222).
|
|
51
|
+
const RENDER_MODES = ["static", "chrome", "auto"];
|
|
52
|
+
|
|
53
|
+
function looksBotBlockedResponse(res) {
|
|
54
|
+
if (!res) return true;
|
|
55
|
+
if (res.statusCode == null) return true;
|
|
56
|
+
if ([401, 403, 407, 429, 503].includes(res.statusCode)) return true;
|
|
57
|
+
if (res.statusCode >= 200 && res.statusCode < 300 && !res.body) return true;
|
|
58
|
+
return false;
|
|
59
|
+
}
|
|
39
60
|
|
|
40
61
|
// ---------------------------------------------------------------------------
|
|
41
62
|
// License validation
|
|
@@ -1597,10 +1618,15 @@ const server = new McpServer({
|
|
|
1597
1618
|
server.tool(
|
|
1598
1619
|
"analyze_domain",
|
|
1599
1620
|
"Run a comprehensive GEO (Generative Engine Optimization) readiness analysis on a domain. " +
|
|
1600
|
-
"Checks robots.txt, llms.txt (note: llms.txt is not currently supported by major AI models
|
|
1601
|
-
"homepage HTML (
|
|
1621
|
+
"Checks robots.txt, llms.txt (note: llms.txt is not currently supported by major AI models - having one cannot hurt but is not a meaningful optimization), " +
|
|
1622
|
+
"homepage HTML (16 scoring categories), sitemap.xml, and security headers. " +
|
|
1602
1623
|
"Returns an overall weighted score (0-100) with per-category breakdowns and actionable recommendations. " +
|
|
1603
|
-
"
|
|
1624
|
+
"The response includes a 'Trust Signal Evidence' section (htmlLang, hreflangs, nav/footer link anchor text + href). " +
|
|
1625
|
+
"IMPORTANT: the Entity & Authority heuristic for about/contact/legal/imprint/cookies uses a pattern fallback. " +
|
|
1626
|
+
"When the site is not in English (htmlLang != 'en' or hreflangs indicate otherwise), you (the calling LLM) should " +
|
|
1627
|
+
"inspect navLinks/footerLinks semantically and override the heuristic classification in your final explanation - " +
|
|
1628
|
+
"the anchor text tells you what each link is regardless of URL patterns. " +
|
|
1629
|
+
"Use output_format='json' to get the full raw result (includes homepage.analysis.evidence) for programmatic use.",
|
|
1604
1630
|
{
|
|
1605
1631
|
domain: z
|
|
1606
1632
|
.string()
|
|
@@ -1616,6 +1642,14 @@ server.tool(
|
|
|
1616
1642
|
.describe(
|
|
1617
1643
|
"Maximum pages to crawl (1 = homepage only, up to 10 for multi-page analysis). Defaults to 10."
|
|
1618
1644
|
),
|
|
1645
|
+
render_mode: z
|
|
1646
|
+
.enum(RENDER_MODES)
|
|
1647
|
+
.optional()
|
|
1648
|
+
.describe(
|
|
1649
|
+
'How to fetch HTML. "static" (fast, plain Node fetch) is default for this tool. ' +
|
|
1650
|
+
'"auto" tries static first and falls back to a local headless Chrome for bot-blocked responses (401/403/407/429/503 or empty 2xx). ' +
|
|
1651
|
+
'"chrome" always renders via Chrome. Chrome modes need a local Chrome binary (CHROME_PATH) or an attached instance (CHROME_REMOTE_URL).'
|
|
1652
|
+
),
|
|
1619
1653
|
output_format: z
|
|
1620
1654
|
.enum(["text", "json"])
|
|
1621
1655
|
.optional()
|
|
@@ -1624,10 +1658,11 @@ server.tool(
|
|
|
1624
1658
|
'"json" returns the raw analysis result object that can be passed to export_report\'s analysis_result parameter.'
|
|
1625
1659
|
),
|
|
1626
1660
|
},
|
|
1627
|
-
withLicense(async ({ domain, max_pages, output_format }) => {
|
|
1661
|
+
withLicense(async ({ domain, max_pages, render_mode, output_format }) => {
|
|
1628
1662
|
try {
|
|
1629
1663
|
const result = await checkGEO(domain, {
|
|
1630
1664
|
maxPages: max_pages ?? 10,
|
|
1665
|
+
renderMode: render_mode ?? "static",
|
|
1631
1666
|
});
|
|
1632
1667
|
|
|
1633
1668
|
if (result.error) {
|
|
@@ -1678,6 +1713,36 @@ server.tool(
|
|
|
1678
1713
|
}
|
|
1679
1714
|
}
|
|
1680
1715
|
lines.push("");
|
|
1716
|
+
|
|
1717
|
+
// Trust Signal Evidence - language-agnostic raw data for LLM reclassification
|
|
1718
|
+
const ev = analysis.evidence;
|
|
1719
|
+
if (ev) {
|
|
1720
|
+
lines.push("## Trust Signal Evidence");
|
|
1721
|
+
lines.push(`htmlLang: ${ev.htmlLang || "(none)"}`);
|
|
1722
|
+
if (ev.hreflangs && ev.hreflangs.length > 0) {
|
|
1723
|
+
lines.push(`hreflangs: ${ev.hreflangs.slice(0, 20).join(", ")}${ev.hreflangs.length > 20 ? ` (+${ev.hreflangs.length - 20} more)` : ""}`);
|
|
1724
|
+
}
|
|
1725
|
+
const isNonEnglish = ev.htmlLang && !/^en(-|$)/i.test(ev.htmlLang);
|
|
1726
|
+
if (isNonEnglish) {
|
|
1727
|
+
lines.push(
|
|
1728
|
+
`NOTE: Site is in '${ev.htmlLang}'. If the Entity & Authority category flagged missing about/contact/legal/imprint/cookies links, ` +
|
|
1729
|
+
`verify against the footerLinks below - anchor text like 'Über uns', '会社概要', 'Impressum', 'Politique de confidentialité' count as trust signals ` +
|
|
1730
|
+
`regardless of URL patterns. Override the heuristic if links are clearly present.`
|
|
1731
|
+
);
|
|
1732
|
+
}
|
|
1733
|
+
const formatLink = (l) => ` - ${l.text || "(no text)"} -> ${l.href}`;
|
|
1734
|
+
if (ev.footerLinks && ev.footerLinks.length > 0) {
|
|
1735
|
+
lines.push(`footerLinks (${ev.footerLinks.length}):`);
|
|
1736
|
+
ev.footerLinks.slice(0, 40).forEach((l) => lines.push(formatLink(l)));
|
|
1737
|
+
if (ev.footerLinks.length > 40) lines.push(` ... (+${ev.footerLinks.length - 40} more)`);
|
|
1738
|
+
}
|
|
1739
|
+
if (ev.navLinks && ev.navLinks.length > 0) {
|
|
1740
|
+
lines.push(`navLinks (${ev.navLinks.length}):`);
|
|
1741
|
+
ev.navLinks.slice(0, 30).forEach((l) => lines.push(formatLink(l)));
|
|
1742
|
+
if (ev.navLinks.length > 30) lines.push(` ... (+${ev.navLinks.length - 30} more)`);
|
|
1743
|
+
}
|
|
1744
|
+
lines.push("");
|
|
1745
|
+
}
|
|
1681
1746
|
}
|
|
1682
1747
|
|
|
1683
1748
|
// robots.txt
|
|
@@ -1941,17 +2006,28 @@ server.tool(
|
|
|
1941
2006
|
server.tool(
|
|
1942
2007
|
"get_geo_summary",
|
|
1943
2008
|
"Get a concise GEO readiness summary for a domain: overall score, grade, top 3 strengths, and top 3 issues to fix. " +
|
|
1944
|
-
"Use this for a quick overview; use analyze_domain for full details
|
|
2009
|
+
"Use this for a quick overview; use analyze_domain for full details including the Trust Signal Evidence payload " +
|
|
2010
|
+
"(raw nav/footer links for LLM-driven semantic classification on non-English sites).",
|
|
1945
2011
|
{
|
|
1946
2012
|
domain: z
|
|
1947
2013
|
.string()
|
|
1948
2014
|
.describe(
|
|
1949
2015
|
'The domain to check, e.g. "example.com". Do not include https:// prefix.'
|
|
1950
2016
|
),
|
|
2017
|
+
render_mode: z
|
|
2018
|
+
.enum(RENDER_MODES)
|
|
2019
|
+
.optional()
|
|
2020
|
+
.describe(
|
|
2021
|
+
'How to fetch the homepage. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
|
|
2022
|
+
'or "chrome" (always render via local headless Chrome).'
|
|
2023
|
+
),
|
|
1951
2024
|
},
|
|
1952
|
-
withLicense(async ({ domain }) => {
|
|
2025
|
+
withLicense(async ({ domain, render_mode }) => {
|
|
1953
2026
|
try {
|
|
1954
|
-
const result = await checkGEO(domain, {
|
|
2027
|
+
const result = await checkGEO(domain, {
|
|
2028
|
+
maxPages: 1,
|
|
2029
|
+
renderMode: render_mode ?? "static",
|
|
2030
|
+
});
|
|
1955
2031
|
|
|
1956
2032
|
if (result.error) {
|
|
1957
2033
|
return {
|
|
@@ -1987,6 +2063,15 @@ server.tool(
|
|
|
1987
2063
|
lines.push(`# GEO Summary: ${result.domain}`);
|
|
1988
2064
|
lines.push(`Overall Score: ${analysis.overallScore}% (${grade})`);
|
|
1989
2065
|
lines.push(`Page Type: ${analysis.pageType}`);
|
|
2066
|
+
const evLang = analysis.evidence?.htmlLang;
|
|
2067
|
+
if (evLang) {
|
|
2068
|
+
lines.push(`Site Language: ${evLang}`);
|
|
2069
|
+
if (!/^en(-|$)/i.test(evLang)) {
|
|
2070
|
+
lines.push(
|
|
2071
|
+
`(Non-English site - use analyze_domain for the footerLinks evidence payload to reclassify trust signals semantically.)`
|
|
2072
|
+
);
|
|
2073
|
+
}
|
|
2074
|
+
}
|
|
1990
2075
|
lines.push("");
|
|
1991
2076
|
|
|
1992
2077
|
// Sort categories by score
|
|
@@ -2071,6 +2156,13 @@ server.tool(
|
|
|
2071
2156
|
.describe(
|
|
2072
2157
|
"Maximum pages to crawl per domain (1 = homepage only). Defaults to 10."
|
|
2073
2158
|
),
|
|
2159
|
+
render_mode: z
|
|
2160
|
+
.enum(RENDER_MODES)
|
|
2161
|
+
.optional()
|
|
2162
|
+
.describe(
|
|
2163
|
+
'How to fetch HTML for each domain. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
|
|
2164
|
+
'or "chrome" (always render via local headless Chrome).'
|
|
2165
|
+
),
|
|
2074
2166
|
output_format: z
|
|
2075
2167
|
.enum(["text", "json"])
|
|
2076
2168
|
.optional()
|
|
@@ -2082,13 +2174,14 @@ server.tool(
|
|
|
2082
2174
|
withTierFeature(
|
|
2083
2175
|
"compareDomains",
|
|
2084
2176
|
"Domain comparison requires a Pro or Agency license.",
|
|
2085
|
-
async ({ domains, max_pages, output_format }) => {
|
|
2177
|
+
async ({ domains, max_pages, render_mode, output_format }) => {
|
|
2086
2178
|
const maxPages = max_pages ?? 10;
|
|
2179
|
+
const renderMode = render_mode ?? "static";
|
|
2087
2180
|
|
|
2088
2181
|
// Run all analyses in parallel
|
|
2089
2182
|
const results = await Promise.allSettled(
|
|
2090
2183
|
domains.map((domain) =>
|
|
2091
|
-
checkGEO(domain, { maxPages }).then((result) => ({
|
|
2184
|
+
checkGEO(domain, { maxPages, renderMode }).then((result) => ({
|
|
2092
2185
|
domain,
|
|
2093
2186
|
result,
|
|
2094
2187
|
}))
|
|
@@ -2240,7 +2333,7 @@ const DEFAULT_RATE_LIMIT = parseInt(process.env.GLIPPY_RATE_LIMIT, 10) || 5;
|
|
|
2240
2333
|
* @param {number} domainRateLimit - Max requests/second per domain (0 = unlimited)
|
|
2241
2334
|
* @returns {Promise<{pageResults: object[], domainMeta: Map}>}
|
|
2242
2335
|
*/
|
|
2243
|
-
async function analyseUrls(urls, concurrency = 3, domainRateLimit = DEFAULT_RATE_LIMIT) {
|
|
2336
|
+
async function analyseUrls(urls, concurrency = 3, domainRateLimit = DEFAULT_RATE_LIMIT, renderMode = "static") {
|
|
2244
2337
|
// Group URLs by domain
|
|
2245
2338
|
const domainMap = new Map(); // domain → [urls]
|
|
2246
2339
|
for (const url of urls) {
|
|
@@ -2318,13 +2411,34 @@ async function analyseUrls(urls, concurrency = 3, domainRateLimit = DEFAULT_RATE
|
|
|
2318
2411
|
try {
|
|
2319
2412
|
const pathname = new URL(url).pathname;
|
|
2320
2413
|
const meta = domainMeta.get(domain);
|
|
2321
|
-
|
|
2414
|
+
let res;
|
|
2415
|
+
let rendered = "static";
|
|
2416
|
+
if (renderMode === "chrome") {
|
|
2417
|
+
res = await chromeFetch(url, 30000);
|
|
2418
|
+
rendered = "chrome";
|
|
2419
|
+
} else {
|
|
2420
|
+
res = await throttledFetchUrl(url, 15000);
|
|
2421
|
+
if (renderMode === "auto" && looksBotBlockedResponse(res)) {
|
|
2422
|
+
const chromeRes = await chromeFetch(url, 30000).catch(() => null);
|
|
2423
|
+
if (
|
|
2424
|
+
chromeRes &&
|
|
2425
|
+
typeof chromeRes.statusCode === "number" &&
|
|
2426
|
+
chromeRes.statusCode >= 200 &&
|
|
2427
|
+
chromeRes.statusCode < 300 &&
|
|
2428
|
+
chromeRes.body
|
|
2429
|
+
) {
|
|
2430
|
+
res = chromeRes;
|
|
2431
|
+
rendered = "chrome-fallback";
|
|
2432
|
+
}
|
|
2433
|
+
}
|
|
2434
|
+
}
|
|
2322
2435
|
|
|
2323
|
-
if (res.statusCode
|
|
2436
|
+
if (!res || res.statusCode == null || res.statusCode < 200 || res.statusCode >= 300 || !res.body) {
|
|
2324
2437
|
return {
|
|
2325
2438
|
url,
|
|
2326
2439
|
analysis: null,
|
|
2327
|
-
error: res.statusCode ? `HTTP ${res.statusCode}` : "Failed to fetch",
|
|
2440
|
+
error: res && res.statusCode ? `HTTP ${res.statusCode}` : "Failed to fetch",
|
|
2441
|
+
renderMode: rendered,
|
|
2328
2442
|
};
|
|
2329
2443
|
}
|
|
2330
2444
|
|
|
@@ -2337,7 +2451,7 @@ async function analyseUrls(urls, concurrency = 3, domainRateLimit = DEFAULT_RATE
|
|
|
2337
2451
|
pathname
|
|
2338
2452
|
);
|
|
2339
2453
|
|
|
2340
|
-
return { url, analysis, error: null };
|
|
2454
|
+
return { url, analysis, error: null, renderMode: rendered };
|
|
2341
2455
|
} catch (err) {
|
|
2342
2456
|
return { url, analysis: null, error: err.message };
|
|
2343
2457
|
}
|
|
@@ -2453,6 +2567,13 @@ server.tool(
|
|
|
2453
2567
|
"Defaults to 5 req/s (or GLIPPY_RATE_LIMIT env var). Set lower for polite crawling, higher if you control the target server. " +
|
|
2454
2568
|
"Use 0.5 for 1 request every 2 seconds, 10 for aggressive crawling."
|
|
2455
2569
|
),
|
|
2570
|
+
render_mode: z
|
|
2571
|
+
.enum(RENDER_MODES)
|
|
2572
|
+
.optional()
|
|
2573
|
+
.describe(
|
|
2574
|
+
'How to fetch each URL. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
|
|
2575
|
+
'or "chrome" (always render via local headless Chrome).'
|
|
2576
|
+
),
|
|
2456
2577
|
output_format: z
|
|
2457
2578
|
.enum(["text", "json", "summary"])
|
|
2458
2579
|
.optional()
|
|
@@ -2480,7 +2601,7 @@ server.tool(
|
|
|
2480
2601
|
"Recommended: 10-20 for detailed results to stay within output limits."
|
|
2481
2602
|
),
|
|
2482
2603
|
},
|
|
2483
|
-
withLicense(async ({ sitemap_url, max_urls, rate_limit, output_format, offset, limit }) => {
|
|
2604
|
+
withLicense(async ({ sitemap_url, max_urls, rate_limit, render_mode, output_format, offset, limit }) => {
|
|
2484
2605
|
const features = getFeatures();
|
|
2485
2606
|
|
|
2486
2607
|
// Check if sitemap analysis is available for this tier
|
|
@@ -2555,7 +2676,7 @@ server.tool(
|
|
|
2555
2676
|
|
|
2556
2677
|
// Analyse all URLs with rate limiting
|
|
2557
2678
|
const rateLimit = rate_limit ?? DEFAULT_RATE_LIMIT;
|
|
2558
|
-
const { pageResults } = await analyseUrls(urlsToAnalyse, 3, rateLimit);
|
|
2679
|
+
const { pageResults } = await analyseUrls(urlsToAnalyse, 3, rateLimit, render_mode ?? "static");
|
|
2559
2680
|
const aggregated = aggregatePageScores(pageResults);
|
|
2560
2681
|
|
|
2561
2682
|
// Summary output mode - compact JSON with minimal page info (ideal for large sitemaps)
|
|
@@ -2665,6 +2786,13 @@ server.tool(
|
|
|
2665
2786
|
"Defaults to 5 req/s (or GLIPPY_RATE_LIMIT env var). Set lower for polite crawling, higher if you control the target server. " +
|
|
2666
2787
|
"Use 0.5 for 1 request every 2 seconds, 10 for aggressive crawling."
|
|
2667
2788
|
),
|
|
2789
|
+
render_mode: z
|
|
2790
|
+
.enum(RENDER_MODES)
|
|
2791
|
+
.optional()
|
|
2792
|
+
.describe(
|
|
2793
|
+
'How to fetch each URL. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
|
|
2794
|
+
'or "chrome" (always render via local headless Chrome).'
|
|
2795
|
+
),
|
|
2668
2796
|
output_format: z
|
|
2669
2797
|
.enum(["text", "json", "summary"])
|
|
2670
2798
|
.optional()
|
|
@@ -2692,7 +2820,7 @@ server.tool(
|
|
|
2692
2820
|
"Recommended: 10-20 for detailed results to stay within output limits."
|
|
2693
2821
|
),
|
|
2694
2822
|
},
|
|
2695
|
-
withLicense(async ({ urls, rate_limit, output_format, offset, limit }) => {
|
|
2823
|
+
withLicense(async ({ urls, rate_limit, render_mode, output_format, offset, limit }) => {
|
|
2696
2824
|
const features = getFeatures();
|
|
2697
2825
|
|
|
2698
2826
|
// Check if batch analysis is available for this tier
|
|
@@ -2721,7 +2849,7 @@ server.tool(
|
|
|
2721
2849
|
|
|
2722
2850
|
try {
|
|
2723
2851
|
const rateLimit = rate_limit ?? DEFAULT_RATE_LIMIT;
|
|
2724
|
-
const { pageResults } = await analyseUrls(urls, 3, rateLimit);
|
|
2852
|
+
const { pageResults } = await analyseUrls(urls, 3, rateLimit, render_mode ?? "static");
|
|
2725
2853
|
const aggregated = aggregatePageScores(pageResults);
|
|
2726
2854
|
|
|
2727
2855
|
// Summary output mode - compact JSON with minimal page info (ideal for large batches)
|
|
@@ -2834,6 +2962,13 @@ server.tool(
|
|
|
2834
2962
|
"Maximum pages to crawl (1 = homepage only, up to 10 for multi-page analysis). Defaults to 10. " +
|
|
2835
2963
|
"Ignored if analysis_result is provided."
|
|
2836
2964
|
),
|
|
2965
|
+
render_mode: z
|
|
2966
|
+
.enum(RENDER_MODES)
|
|
2967
|
+
.optional()
|
|
2968
|
+
.describe(
|
|
2969
|
+
'How to fetch HTML. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
|
|
2970
|
+
'or "chrome" (always render via local headless Chrome). Ignored if analysis_result is provided.'
|
|
2971
|
+
),
|
|
2837
2972
|
analysis_result: z
|
|
2838
2973
|
.object({})
|
|
2839
2974
|
.passthrough()
|
|
@@ -2844,7 +2979,7 @@ server.tool(
|
|
|
2844
2979
|
"and export in multiple formats without redundant crawling."
|
|
2845
2980
|
),
|
|
2846
2981
|
},
|
|
2847
|
-
withLicense(async ({ domain, format, max_pages, analysis_result }) => {
|
|
2982
|
+
withLicense(async ({ domain, format, max_pages, render_mode, analysis_result }) => {
|
|
2848
2983
|
try {
|
|
2849
2984
|
let result;
|
|
2850
2985
|
|
|
@@ -2866,6 +3001,7 @@ server.tool(
|
|
|
2866
3001
|
// Run fresh analysis (may use cache automatically)
|
|
2867
3002
|
result = await checkGEO(domain, {
|
|
2868
3003
|
maxPages: max_pages ?? 10,
|
|
3004
|
+
renderMode: render_mode ?? "static",
|
|
2869
3005
|
});
|
|
2870
3006
|
} else {
|
|
2871
3007
|
return {
|
|
@@ -3003,11 +3139,19 @@ server.tool(
|
|
|
3003
3139
|
.describe(
|
|
3004
3140
|
"Max requests/second per domain for URL/sitemap modes. Defaults to 5. Ignored if analysis_results provided."
|
|
3005
3141
|
),
|
|
3142
|
+
render_mode: z
|
|
3143
|
+
.enum(RENDER_MODES)
|
|
3144
|
+
.optional()
|
|
3145
|
+
.describe(
|
|
3146
|
+
'How to fetch HTML. "static" (default), "auto" (static with Chrome fallback on bot-block), ' +
|
|
3147
|
+
'or "chrome" (always render via local headless Chrome). Ignored if analysis_results provided.'
|
|
3148
|
+
),
|
|
3006
3149
|
},
|
|
3007
3150
|
withTierFeature(
|
|
3008
3151
|
"bulkExport",
|
|
3009
3152
|
"Bulk report exports require a Pro or Agency license.",
|
|
3010
|
-
async ({ format, domains, urls, sitemap_url, analysis_results, max_pages, max_urls, rate_limit }) => {
|
|
3153
|
+
async ({ format, domains, urls, sitemap_url, analysis_results, max_pages, max_urls, rate_limit, render_mode }) => {
|
|
3154
|
+
const renderMode = render_mode ?? "static";
|
|
3011
3155
|
// Validate: exactly one input mode
|
|
3012
3156
|
const modes = [domains, urls, sitemap_url, analysis_results].filter(Boolean).length;
|
|
3013
3157
|
if (modes !== 1) {
|
|
@@ -3116,7 +3260,7 @@ server.tool(
|
|
|
3116
3260
|
const maxPages = max_pages ?? 10;
|
|
3117
3261
|
const results = await Promise.allSettled(
|
|
3118
3262
|
domains.map((domain) =>
|
|
3119
|
-
checkGEO(domain, { maxPages }).then((result) => ({
|
|
3263
|
+
checkGEO(domain, { maxPages, renderMode }).then((result) => ({
|
|
3120
3264
|
domain,
|
|
3121
3265
|
result,
|
|
3122
3266
|
}))
|
|
@@ -3173,7 +3317,7 @@ server.tool(
|
|
|
3173
3317
|
// ------------------------------------------------------------------
|
|
3174
3318
|
if (urls) {
|
|
3175
3319
|
const rateLimit = rate_limit ?? DEFAULT_RATE_LIMIT;
|
|
3176
|
-
const { pageResults } = await analyseUrls(urls, 3, rateLimit);
|
|
3320
|
+
const { pageResults } = await analyseUrls(urls, 3, rateLimit, renderMode);
|
|
3177
3321
|
const aggregated = aggregatePageScores(pageResults);
|
|
3178
3322
|
const title = `${urls.length} URLs`;
|
|
3179
3323
|
|
|
@@ -3239,7 +3383,7 @@ server.tool(
|
|
|
3239
3383
|
|
|
3240
3384
|
const urlsToAnalyse = allUrls.slice(0, max_urls ?? 50000);
|
|
3241
3385
|
const rateLimit = rate_limit ?? DEFAULT_RATE_LIMIT;
|
|
3242
|
-
const { pageResults } = await analyseUrls(urlsToAnalyse, 3, rateLimit);
|
|
3386
|
+
const { pageResults } = await analyseUrls(urlsToAnalyse, 3, rateLimit, renderMode);
|
|
3243
3387
|
const aggregated = aggregatePageScores(pageResults);
|
|
3244
3388
|
const title = `Sitemap: ${sitemap_url} (${urlsToAnalyse.length} of ${allUrls.length} URLs)`;
|
|
3245
3389
|
|