seo-intel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +41 -0
- package/LICENSE +75 -0
- package/README.md +243 -0
- package/Start SEO Intel.bat +9 -0
- package/Start SEO Intel.command +8 -0
- package/cli.js +3727 -0
- package/config/example.json +29 -0
- package/config/setup-wizard.js +522 -0
- package/crawler/index.js +566 -0
- package/crawler/robots.js +103 -0
- package/crawler/sanitize.js +124 -0
- package/crawler/schema-parser.js +168 -0
- package/crawler/sitemap.js +103 -0
- package/crawler/stealth.js +393 -0
- package/crawler/subdomain-discovery.js +341 -0
- package/db/db.js +213 -0
- package/db/schema.sql +120 -0
- package/exports/competitive.js +186 -0
- package/exports/heuristics.js +67 -0
- package/exports/queries.js +197 -0
- package/exports/suggestive.js +230 -0
- package/exports/technical.js +180 -0
- package/exports/templates.js +77 -0
- package/lib/gate.js +204 -0
- package/lib/license.js +369 -0
- package/lib/oauth.js +432 -0
- package/lib/updater.js +324 -0
- package/package.json +68 -0
- package/reports/generate-html.js +6194 -0
- package/reports/generate-site-graph.js +949 -0
- package/reports/gsc-loader.js +190 -0
- package/scheduler.js +142 -0
- package/seo-audit.js +619 -0
- package/seo-intel.png +0 -0
- package/server.js +602 -0
- package/setup/ROADMAP.md +109 -0
- package/setup/checks.js +483 -0
- package/setup/config-builder.js +227 -0
- package/setup/engine.js +65 -0
- package/setup/installers.js +197 -0
- package/setup/models.js +328 -0
- package/setup/openclaw-bridge.js +329 -0
- package/setup/validator.js +395 -0
- package/setup/web-routes.js +688 -0
- package/setup/wizard.html +2920 -0
- package/start-seo-intel.sh +8 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sanitize scraped text before sending to any AI model.
|
|
3
|
+
* Defense against prompt injection from malicious web content.
|
|
4
|
+
*/
|
|
5
|
+
import TurndownService from 'turndown';
|
|
6
|
+
|
|
7
|
+
// Patterns that look like prompt injection attempts
const INJECTION_PATTERNS = [
  // Classic "override the instructions" phrasings.
  /ignore\s+(previous|above|all|prior)\s+instructions?/gi,
  /forget\s+(everything|all|prior|previous)/gi,
  /you\s+are\s+now\s+a/gi,
  /new\s+instructions?:/gi,
  // Chat-transcript role markers embedded in page text.
  /system\s*:/gi,
  /assistant\s*:/gi,
  // Llama-style instruction delimiters.
  /\[INST\]/gi,
  /\[\/INST\]/gi,
  // ChatML message delimiters.
  /<\|im_start\|>/gi,
  /<\|im_end\|>/gi,
  // Markdown-style role headers used by some prompt formats.
  /###\s*(instruction|system|human|assistant)/gi,
];
|
|
21
|
+
|
|
22
|
+
/**
 * Strip HTML tags and extract clean visible text.
 *
 * Order matters: script/style/noscript blocks and comments are removed
 * before generic tag stripping, and entities are decoded only after all
 * tags are gone so a decoded '<' or '>' can never be mistaken for markup.
 * '&amp;' is decoded last to avoid double-decoding '&amp;lt;' etc.
 *
 * @param {string} html - Raw HTML string.
 * @returns {string} Plain text with collapsed whitespace.
 */
export function stripHtml(html) {
  return html
    // Remove script + style blocks entirely
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
    // Remove HTML comments
    .replace(/<!--[\s\S]*?-->/g, ' ')
    // Remove remaining tags
    .replace(/<[^>]+>/g, ' ')
    // Decode common entities ('&amp;' must come last)
    .replace(/&nbsp;/gi, ' ')
    .replace(/&lt;/gi, '<')
    .replace(/&gt;/gi, '>')
    .replace(/&quot;/gi, '"')
    .replace(/&#0?39;/g, "'")
    .replace(/&apos;/gi, "'")
    .replace(/&amp;/gi, '&')
    // Collapse whitespace
    .replace(/\s{3,}/g, '\n\n')
    .trim();
}
|
|
46
|
+
|
|
47
|
+
/**
 * Scrub known prompt-injection phrasings from text.
 * Each match is replaced with the literal marker "[REMOVED]".
 *
 * @param {string} text - Plain text to scrub.
 * @returns {string} Text with suspicious phrases masked.
 */
export function removeInjections(text) {
  return INJECTION_PATTERNS.reduce(
    (acc, pattern) => acc.replace(pattern, '[REMOVED]'),
    text,
  );
}
|
|
58
|
+
|
|
59
|
+
/**
 * Cap text at a rough token budget (estimate: 4 characters per token).
 * Appends a "[TRUNCATED]" marker when the text was cut.
 *
 * @param {string} text - Input text.
 * @param {number} [maxTokens=2000] - Approximate token budget.
 * @returns {string} The original text, or a truncated copy.
 */
export function truncate(text, maxTokens = 2000) {
  const limit = maxTokens * 4;
  return text.length > limit
    ? `${text.slice(0, limit)}\n[TRUNCATED]`
    : text;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Full sanitization pipeline for scraped content:
 * strip markup, scrub injection phrases, then cap the length.
 *
 * @param {string} rawHtml - Raw HTML from the crawler.
 * @param {number} [maxTokens=2000] - Approximate token budget.
 * @returns {string} Model-safe plain text.
 */
export function sanitize(rawHtml, maxTokens = 2000) {
  return truncate(removeInjections(stripHtml(rawHtml)), maxTokens);
}
|
|
76
|
+
|
|
77
|
+
/**
 * Extract only text from specific CSS selectors (safer than full page).
 * Pass the Playwright page object and a list of selectors.
 *
 * @param {Object} page - Playwright page object.
 * @param {string[]} [selectors] - CSS selectors to harvest text from.
 * @returns {Promise<string>} Injection-scrubbed, whitespace-collapsed text.
 */
export async function extractSelective(page, selectors = ['h1', 'h2', 'h3', 'p', 'li', 'title']) {
  const collected = [];
  for (const selector of selectors) {
    let texts = [];
    try {
      texts = await page.$$eval(
        selector,
        (nodes) => nodes.map((node) => node.innerText?.trim()).filter(Boolean),
      );
    } catch {
      // Selector failed (e.g. page navigated away) — skip it.
    }
    collected.push(...texts);
  }
  const joined = collected.join('\n').replace(/\s{3,}/g, '\n\n').trim();
  return removeInjections(joined);
}
|
|
91
|
+
|
|
92
|
+
/**
 * Extract page content as clean Markdown via Turndown.
 * Tries <main> or <article> first for focused content, falls back to <body>.
 * Strips nav/footer/header/aside/script/style before conversion.
 */
// Shared converter instance — configured once at module load.
const turndown = new TurndownService({
  headingStyle: 'atx',          // '#' headings rather than setext underlines
  codeBlockStyle: 'fenced',     // ``` fences rather than 4-space indent
  bulletListMarker: '-',
});
// Skip images in markdown output (no value for SEO text extraction)
turndown.addRule('removeImages', { filter: 'img', replacement: () => '' });
|
|
104
|
+
|
|
105
|
+
/**
 * Convert the page's main content region to Markdown.
 * Prefers <main>/<article>, falls back to <body>; strips non-content
 * elements from a clone, converts via Turndown, scrubs injection
 * phrases, and caps the result at ~2000 tokens.
 *
 * @param {Object} page - Playwright page object.
 * @returns {Promise<string>} Sanitized Markdown, or '' when nothing usable.
 */
export async function extractAsMarkdown(page) {
  // Get focused content HTML — prefer <main> or <article>, fall back to <body>
  const html = await page
    .evaluate(() => {
      const root =
        document.querySelector('main') ||
        document.querySelector('article') ||
        document.body;
      if (!root) return '';
      // Work on a clone so the live DOM stays untouched.
      const copy = root.cloneNode(true);
      const junkTags = ['nav', 'footer', 'header', 'aside', 'script', 'style', 'noscript', 'iframe'];
      for (const tag of junkTags) {
        copy.querySelectorAll(tag).forEach((node) => node.remove());
      }
      return copy.innerHTML;
    })
    .catch(() => '');

  if (!html) return '';

  return truncate(removeInjections(turndown.turndown(html)), 2000);
}
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JSON-LD Schema Parser
|
|
3
|
+
*
|
|
4
|
+
* Extracts structured data from <script type="application/ld+json"> blocks.
|
|
5
|
+
* Parses @type, name, description, aggregateRating, offers, author, dates,
|
|
6
|
+
* images — everything Google actually uses for rich results.
|
|
7
|
+
*
|
|
8
|
+
* Returns normalized objects ready for page_schemas table insertion.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/**
 * Parse all JSON-LD blocks from raw HTML string.
 * Works on raw HTML (no DOM needed) — runs during crawl before Qwen extraction.
 *
 * @param {string} html - Raw HTML string
 * @returns {Array<Object>} Parsed schema objects
 */
export function parseJsonLd(html) {
  if (!html) return [];

  const schemas = [];
  // Normalize one candidate node and keep it only if usable.
  const collect = (candidate) => {
    const normalized = normalizeSchema(candidate);
    if (normalized) schemas.push(normalized);
  };

  for (const block of extractJsonLdBlocks(html)) {
    try {
      const parsed = JSON.parse(block);
      if (parsed && Array.isArray(parsed['@graph'])) {
        // @graph arrays are common in Yoast / Schema.org generators.
        parsed['@graph'].forEach(collect);
      } else if (Array.isArray(parsed)) {
        // Some sites output a bare array of schemas.
        parsed.forEach(collect);
      } else {
        collect(parsed);
      }
    } catch {
      // Malformed JSON-LD — skip silently
    }
  }

  return schemas;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Extract raw JSON strings from <script type="application/ld+json"> tags.
 * Uses regex — no DOM parser needed.
 *
 * @param {string} html - Raw HTML string.
 * @returns {string[]} Trimmed, non-empty JSON-LD payloads.
 */
function extractJsonLdBlocks(html) {
  const scriptRe = /<script[^>]*type\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const payloads = [];
  for (let hit = scriptRe.exec(html); hit !== null; hit = scriptRe.exec(html)) {
    const body = hit[1].trim();
    if (body !== '') payloads.push(body);
  }
  return payloads;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Normalize a single JSON-LD object into a flat structure for DB storage.
 *
 * @param {Object} obj - One parsed JSON-LD node.
 * @returns {Object|null} Flat record, or null when there is no usable @type.
 */
function normalizeSchema(obj) {
  if (!obj || typeof obj !== 'object') return null;

  const type = resolveType(obj['@type']);
  if (!type) return null;

  const { value: rating, count: ratingCount } = extractRating(obj);
  const { price, currency } = extractOffers(obj);

  return {
    type,
    name: str(obj.name) || str(obj.headline) || null,
    description: str(obj.description) || null,
    rating,
    ratingCount,
    price,
    currency,
    author: extractAuthor(obj),
    datePublished: str(obj.datePublished) || null,
    dateModified: str(obj.dateModified) || null,
    imageUrl: extractImage(obj),
    raw: obj,
  };
}
|
|
95
|
+
|
|
96
|
+
// ── Extractors ───────────────────────────────────────────────────────────────
|
|
97
|
+
|
|
98
|
+
/**
 * Resolve a JSON-LD @type value to a single string.
 * Handles plain strings, arrays (takes the first entry), and other
 * scalar forms; an absent or empty value yields null.
 *
 * @param {string|Array|*} t - Raw @type value.
 * @returns {string|null} First type as a string, or null when unusable.
 */
function resolveType(t) {
  if (!t) return null;
  if (Array.isArray(t)) {
    // Take first type; an empty array (or empty first entry) has none.
    return t.length > 0 && t[0] ? String(t[0]) : null;
  }
  return String(t);
}
|
|
103
|
+
|
|
104
|
+
/**
 * Pull aggregateRating value/count from a schema node.
 *
 * @param {Object} obj - Parsed JSON-LD node.
 * @returns {{value: number|null, count: number|null}}
 */
function extractRating(obj) {
  const ar = obj.aggregateRating;
  if (!ar) return { value: null, count: null };
  return {
    // parseFloat tolerates numeric strings like "4.5".
    value: Number.parseFloat(ar.ratingValue) || null,
    // reviewCount preferred; ratingCount is the schema.org fallback.
    // Radix 10 avoids legacy octal/hex interpretation of string input.
    count: Number.parseInt(ar.reviewCount || ar.ratingCount, 10) || null,
  };
}
|
|
112
|
+
|
|
113
|
+
/**
 * Pull a price/currency pair from a schema node's `offers` property.
 * Handles a single Offer, an Offer with priceRange, an AggregateOffer
 * (low/high span), and an array of offers (first entry wins).
 *
 * @param {Object} obj - Parsed JSON-LD node.
 * @returns {{price: string|null, currency: string|null}}
 */
function extractOffers(obj) {
  const offers = obj.offers;
  if (!offers) return { price: null, currency: null };

  // Array of offers — take the first entry (checked up front so array
  // inputs never fall through the property checks below).
  if (Array.isArray(offers)) {
    return offers.length > 0
      ? extractOffers({ offers: offers[0] })
      : { price: null, currency: null };
  }

  const currency = str(offers.priceCurrency) || null;

  // Single offer with an explicit price. `!= null` (not `!== undefined`)
  // so an explicit null price doesn't become the string "null", while a
  // legitimate price of 0 is still kept.
  if (offers.price != null) {
    return { price: String(offers.price), currency };
  }

  // Offer with priceRange
  if (offers.priceRange) {
    return { price: str(offers.priceRange), currency };
  }

  // AggregateOffer — format as "low-high" with '?' for a missing bound.
  if (offers.lowPrice != null || offers.highPrice != null) {
    const low = offers.lowPrice ?? '?';
    const high = offers.highPrice ?? '?';
    return { price: `${low}-${high}`, currency };
  }

  return { price: null, currency: null };
}
|
|
147
|
+
|
|
148
|
+
/**
 * Pull an author display string from a schema node's `author` property,
 * which may be a string, a Person object, or an array of either.
 *
 * @param {Object} obj - Parsed JSON-LD node.
 * @returns {string|null} Comma-joined author names, or null when absent.
 */
function extractAuthor(obj) {
  const author = obj.author;
  if (!author) return null;
  if (typeof author === 'string') return author;
  if (Array.isArray(author)) {
    // Entries may be strings or Person objects; null/invalid entries are
    // skipped instead of throwing, and an empty result yields null.
    const names = author
      .map((a) => (a && typeof a === 'object' ? str(a.name) : str(a)))
      .filter(Boolean);
    return names.length > 0 ? names.join(', ') : null;
  }
  return str(author.name) || null;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Pull a single image URL from a schema node's `image` property, which
 * may be a string, an array, or an ImageObject with a `url` field.
 *
 * @param {Object} obj - Parsed JSON-LD node.
 * @returns {string|null}
 */
function extractImage(obj) {
  const img = obj.image;
  if (!img) return null;
  if (typeof img === 'string') return img;
  if (Array.isArray(img)) {
    const first = img[0];
    return typeof first === 'string' ? first : first?.url || null;
  }
  return str(img.url) || null;
}
|
|
163
|
+
|
|
164
|
+
/**
 * Coerce a value to a trimmed string, or null when absent/empty.
 *
 * @param {*} v - Any value.
 * @returns {string|null}
 */
function str(v) {
  if (v == null) return null; // matches both null and undefined
  if (typeof v !== 'string') return String(v);
  const trimmed = v.trim();
  return trimmed === '' ? null : trimmed;
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Sitemap.xml fetcher + parser
|
|
3
|
+
* Discovers URLs from sitemap before link-following begins.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import fetch from 'node-fetch';
|
|
7
|
+
|
|
8
|
+
// Per-request timeout in ms, passed as the `timeout` fetch option below
// (NOTE(review): that option looks like node-fetch v2 — confirm; v3 dropped it).
const SITEMAP_TIMEOUT = 10000;
|
|
9
|
+
|
|
10
|
+
/**
 * Fetch and parse sitemap.xml for a domain.
 * Handles sitemap index files (multiple sitemaps) and regular sitemaps.
 * Tries the common sitemap locations in order and stops at the first one
 * that yields any URLs.
 *
 * @param {string} startUrl - Any URL on the target site; only its origin is used.
 * @returns {Promise<Array<{url: string, lastmod: string|null}>>}
 */
export async function fetchSitemap(startUrl) {
  const { origin, hostname } = new URL(startUrl);
  const candidates = [
    `${origin}/sitemap.xml`,
    `${origin}/sitemap_index.xml`,
    `${origin}/sitemap-index.xml`,
  ];

  const discovered = [];
  const seen = new Set();

  for (const candidate of candidates) {
    try {
      const found = await parseSitemapUrl(candidate, hostname, seen);
      discovered.push(...found);
      if (found.length > 0) break; // found a working sitemap
    } catch {
      // Try the next well-known location.
    }
  }

  return discovered;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Fetch one sitemap URL and return its page entries.
 * Recurses into sitemap index files (max depth 2, max 20 children) and
 * dedupes visited sitemap URLs via the shared `seen` set.
 *
 * @param {string} url - Sitemap URL to fetch.
 * @param {string} hostname - Only page URLs on this host are kept.
 * @param {Set<string>} seen - Sitemap URLs already visited.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {Promise<Array<{url: string, lastmod: string|null}>>}
 */
async function parseSitemapUrl(url, hostname, seen, depth = 0) {
  if (depth > 2 || seen.has(url)) return []; // prevent infinite recursion
  seen.add(url);

  let text;
  try {
    const res = await fetch(url, {
      timeout: SITEMAP_TIMEOUT,
      headers: { 'User-Agent': 'SEOIntelBot/1.0' },
    });
    if (!res.ok) return [];
    text = await res.text();
  } catch {
    return [];
  }

  // Sitemap index (contains <sitemap> tags) — recurse into each child.
  if (text.includes('<sitemap>') || text.includes('<sitemapindex')) {
    const results = [];
    const childUrls = extractTagContent(text, 'loc');
    for (const childUrl of childUrls.slice(0, 20)) { // max 20 child sitemaps
      const childResults = await parseSitemapUrl(childUrl, hostname, seen, depth + 1);
      results.push(...childResults);
    }
    return results;
  }

  // Regular sitemap. Parse per-<url> entry so <loc> and <lastmod> stay
  // paired: positional pairing over the whole document desyncs as soon as
  // one entry omits <lastmod>.
  const entries = [];
  const entryRegex = /<url>([\s\S]*?)<\/url>/gi;
  let m;
  while ((m = entryRegex.exec(text)) !== null) entries.push(m[1]);

  // Fallback: treat the whole document as one chunk when there are no
  // <url> wrappers (bare <loc> lists).
  const chunks = entries.length > 0 ? entries : [text];

  const urls = [];
  for (const chunk of chunks) {
    const locs = extractTagContent(chunk, 'loc');
    const lastmods = extractTagContent(chunk, 'lastmod');
    for (let i = 0; i < locs.length; i++) {
      try {
        const parsed = new URL(locs[i]);
        // Only include URLs from the same hostname
        if (parsed.hostname !== hostname) continue;
        // Skip non-page resources
        if (/\.(pdf|png|jpg|jpeg|gif|svg|css|js|woff|ico|xml)$/i.test(parsed.pathname)) continue;

        urls.push({
          url: parsed.href,
          lastmod: lastmods[i] || null,
        });
      } catch {
        continue;
      }
    }
  }

  return urls;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Simple XML tag content extractor (no full XML parser needed).
 * Unwraps CDATA sections, which the sitemap protocol allows inside <loc>
 * (the old `[^<]+` body pattern could never match them since CDATA
 * starts with '<').
 *
 * @param {string} xml - Raw XML text.
 * @param {string} tagName - Tag whose text content to collect.
 * @returns {string[]} Trimmed, non-empty tag contents in document order.
 */
function extractTagContent(xml, tagName) {
  // Non-greedy body match so values containing '<' (CDATA) are captured.
  const regex = new RegExp(`<${tagName}[^>]*>([\\s\\S]*?)</${tagName}>`, 'gi');
  const results = [];
  let match;
  while ((match = regex.exec(xml)) !== null) {
    let content = match[1].trim();
    const cdata = content.match(/^<!\[CDATA\[([\s\S]*?)\]\]>$/);
    if (cdata) content = cdata[1].trim();
    if (content) results.push(content);
  }
  return results;
}
|