seo-intel 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/.env.example +41 -0
  2. package/LICENSE +75 -0
  3. package/README.md +243 -0
  4. package/Start SEO Intel.bat +9 -0
  5. package/Start SEO Intel.command +8 -0
  6. package/cli.js +3727 -0
  7. package/config/example.json +29 -0
  8. package/config/setup-wizard.js +522 -0
  9. package/crawler/index.js +566 -0
  10. package/crawler/robots.js +103 -0
  11. package/crawler/sanitize.js +124 -0
  12. package/crawler/schema-parser.js +168 -0
  13. package/crawler/sitemap.js +103 -0
  14. package/crawler/stealth.js +393 -0
  15. package/crawler/subdomain-discovery.js +341 -0
  16. package/db/db.js +213 -0
  17. package/db/schema.sql +120 -0
  18. package/exports/competitive.js +186 -0
  19. package/exports/heuristics.js +67 -0
  20. package/exports/queries.js +197 -0
  21. package/exports/suggestive.js +230 -0
  22. package/exports/technical.js +180 -0
  23. package/exports/templates.js +77 -0
  24. package/lib/gate.js +204 -0
  25. package/lib/license.js +369 -0
  26. package/lib/oauth.js +432 -0
  27. package/lib/updater.js +324 -0
  28. package/package.json +68 -0
  29. package/reports/generate-html.js +6194 -0
  30. package/reports/generate-site-graph.js +949 -0
  31. package/reports/gsc-loader.js +190 -0
  32. package/scheduler.js +142 -0
  33. package/seo-audit.js +619 -0
  34. package/seo-intel.png +0 -0
  35. package/server.js +602 -0
  36. package/setup/ROADMAP.md +109 -0
  37. package/setup/checks.js +483 -0
  38. package/setup/config-builder.js +227 -0
  39. package/setup/engine.js +65 -0
  40. package/setup/installers.js +197 -0
  41. package/setup/models.js +328 -0
  42. package/setup/openclaw-bridge.js +329 -0
  43. package/setup/validator.js +395 -0
  44. package/setup/web-routes.js +688 -0
  45. package/setup/wizard.html +2920 -0
  46. package/start-seo-intel.sh +8 -0
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Sanitize scraped text before sending to any AI model.
3
+ * Defense against prompt injection from malicious web content.
4
+ */
5
+ import TurndownService from 'turndown';
6
+
7
/**
 * Heuristic regexes matching common prompt-injection phrasings.
 * Matched spans are replaced with "[REMOVED]" by removeInjections().
 */
const INJECTION_PATTERNS = [
  // Natural-language override attempts
  /ignore\s+(previous|above|all|prior)\s+instructions?/gi,
  /forget\s+(everything|all|prior|previous)/gi,
  /you\s+are\s+now\s+a/gi,
  /new\s+instructions?:/gi,
  // Role-prefix spoofing
  /system\s*:/gi,
  /assistant\s*:/gi,
  // Model-specific chat-template control tokens
  /\[INST\]/gi,
  /\[\/INST\]/gi,
  /<\|im_start\|>/gi,
  /<\|im_end\|>/gi,
  /###\s*(instruction|system|human|assistant)/gi,
];
21
+
22
/**
 * Strip HTML tags and extract clean visible text.
 *
 * Removes <script>/<style>/<noscript> blocks and HTML comments before
 * stripping the remaining tags, then decodes the most common entities
 * and collapses long whitespace runs into paragraph breaks.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Plain text suitable for downstream sanitization.
 */
export function stripHtml(html) {
  return html
    // Remove script + style blocks entirely (their text is never visible)
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<noscript[\s\S]*?<\/noscript>/gi, ' ')
    // Remove HTML comments
    .replace(/<!--[\s\S]*?-->/g, ' ')
    // Remove remaining tags
    .replace(/<[^>]+>/g, ' ')
    // Decode common entities. &amp; MUST be decoded LAST: decoding it
    // first turned "&amp;lt;" into "&lt;", which the following replace
    // then double-decoded into a literal "<".
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&')
    // Collapse whitespace runs into paragraph breaks
    .replace(/\s{3,}/g, '\n\n')
    .trim();
}
46
+
47
/**
 * Scrub known prompt-injection phrases from text.
 * Every match of INJECTION_PATTERNS is substituted with "[REMOVED]".
 *
 * @param {string} text - Plain text to clean.
 * @returns {string} Text with suspicious phrases masked.
 */
export function removeInjections(text) {
  return INJECTION_PATTERNS.reduce(
    (acc, pattern) => acc.replace(pattern, '[REMOVED]'),
    text,
  );
}
58
+
59
/**
 * Truncate text to a rough token budget (estimate: 4 characters/token).
 *
 * @param {string} text - Input text.
 * @param {number} [maxTokens=2000] - Approximate token budget.
 * @returns {string} The original text, or a clipped copy ending in "[TRUNCATED]".
 */
export function truncate(text, maxTokens = 2000) {
  const limit = maxTokens * 4;
  return text.length > limit
    ? `${text.slice(0, limit)}\n[TRUNCATED]`
    : text;
}
67
+
68
/**
 * Full sanitization pipeline for scraped content:
 * strip markup, mask injection phrases, then clamp to a token budget.
 *
 * @param {string} rawHtml - Raw HTML from the crawler.
 * @param {number} [maxTokens=2000] - Approximate token budget.
 * @returns {string} Safe, length-bounded plain text.
 */
export function sanitize(rawHtml, maxTokens = 2000) {
  return truncate(removeInjections(stripHtml(rawHtml)), maxTokens);
}
76
+
77
/**
 * Extract only text from specific CSS selectors (safer than full page).
 *
 * @param {import('playwright').Page} page - Playwright page object.
 * @param {string[]} [selectors] - CSS selectors to harvest text from.
 * @returns {Promise<string>} Injection-scrubbed, whitespace-normalized text.
 */
export async function extractSelective(page, selectors = ['h1', 'h2', 'h3', 'p', 'li', 'title']) {
  const collected = [];
  for (const selector of selectors) {
    try {
      const found = await page.$$eval(
        selector,
        (nodes) => nodes.map((node) => node.innerText?.trim()).filter(Boolean),
      );
      collected.push(...found);
    } catch {
      // Selector failed to resolve (e.g. page navigated away) — skip it.
    }
  }
  const joined = collected.join('\n').replace(/\s{3,}/g, '\n\n').trim();
  return removeInjections(joined);
}
91
+
92
+ /**
93
+ * Extract page content as clean Markdown via Turndown.
94
+ * Tries <main> or <article> first for focused content, falls back to <body>.
95
+ * Strips nav/footer/header/aside/script/style before conversion.
96
+ */
97
// Shared Turndown instance used by extractAsMarkdown().
const turndown = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
  bulletListMarker: '-',
});

// Images carry no value for SEO text extraction — drop them from the output.
turndown.addRule('removeImages', {
  filter: 'img',
  replacement: () => '',
});
104
+
105
/**
 * Extract page content as clean Markdown via Turndown.
 * Prefers <main> or <article> for focused content, falling back to
 * <body>, and strips nav/footer/header/aside/script/style/noscript/iframe
 * before conversion. The result is injection-scrubbed and length-clamped.
 *
 * @param {import('playwright').Page} page - Playwright page object.
 * @returns {Promise<string>} Markdown text, or '' when nothing could be read.
 */
export async function extractAsMarkdown(page) {
  let html = '';
  try {
    html = await page.evaluate(() => {
      const root =
        document.querySelector('main') ||
        document.querySelector('article') ||
        document.body;
      if (!root) return '';
      // Work on a clone so the live DOM is left untouched.
      const copy = root.cloneNode(true);
      const noise = ['nav', 'footer', 'header', 'aside', 'script', 'style', 'noscript', 'iframe'];
      for (const tag of noise) {
        copy.querySelectorAll(tag).forEach((node) => node.remove());
      }
      return copy.innerHTML;
    });
  } catch {
    // Evaluation failed (navigation, closed page, …) — treat as empty.
  }

  if (!html) return '';
  return truncate(removeInjections(turndown.turndown(html)), 2000);
}
@@ -0,0 +1,168 @@
1
+ /**
2
+ * JSON-LD Schema Parser
3
+ *
4
+ * Extracts structured data from <script type="application/ld+json"> blocks.
5
+ * Parses @type, name, description, aggregateRating, offers, author, dates,
6
+ * images — everything Google actually uses for rich results.
7
+ *
8
+ * Returns normalized objects ready for page_schemas table insertion.
9
+ */
10
+
11
/**
 * Parse all JSON-LD blocks from a raw HTML string.
 * Works on raw HTML (no DOM needed) — runs during crawl before Qwen
 * extraction. Handles @graph containers (common in Yoast and other
 * generators), top-level arrays of schemas, and single objects.
 *
 * @param {string} html - Raw HTML string.
 * @returns {Array<Object>} Normalized schema objects.
 */
export function parseJsonLd(html) {
  if (!html) return [];

  const schemas = [];
  for (const block of extractJsonLdBlocks(html)) {
    try {
      const parsed = JSON.parse(block);
      // @graph containers and bare arrays both hold a list of schema
      // nodes; anything else is treated as a single node.
      const items = Array.isArray(parsed?.['@graph'])
        ? parsed['@graph']
        : Array.isArray(parsed)
          ? parsed
          : [parsed];
      for (const item of items) {
        const normalized = normalizeSchema(item);
        if (normalized) schemas.push(normalized);
      }
    } catch {
      // Malformed JSON-LD — skip this block silently.
    }
  }

  return schemas;
}
50
+
51
/**
 * Pull the raw JSON payload out of every
 * <script type="application/ld+json"> tag. Regex-based — no DOM parser.
 *
 * @param {string} html - Raw HTML string.
 * @returns {string[]} Trimmed, non-empty JSON strings.
 */
function extractJsonLdBlocks(html) {
  const scriptRe = /<script[^>]*type\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const payloads = [];
  for (let m = scriptRe.exec(html); m !== null; m = scriptRe.exec(html)) {
    const body = m[1].trim();
    if (body) payloads.push(body);
  }
  return payloads;
}
65
+
66
/**
 * Flatten one JSON-LD node into the shape stored in page_schemas.
 * Returns null when the input is not an object or has no usable @type.
 * The unmodified node is preserved under `raw`.
 *
 * @param {Object} obj - A single parsed JSON-LD node.
 * @returns {Object|null} Normalized record, or null when unusable.
 */
function normalizeSchema(obj) {
  if (!obj || typeof obj !== 'object') return null;

  const type = resolveType(obj['@type']);
  if (!type) return null;

  const { value: rating, count: ratingCount } = extractRating(obj);
  const { price, currency } = extractOffers(obj);

  return {
    type,
    name: str(obj.name) || str(obj.headline) || null,
    description: str(obj.description) || null,
    rating,
    ratingCount,
    price,
    currency,
    author: extractAuthor(obj),
    datePublished: str(obj.datePublished) || null,
    dateModified: str(obj.dateModified) || null,
    imageUrl: extractImage(obj),
    raw: obj,
  };
}
95
+
96
+ // ── Extractors ───────────────────────────────────────────────────────────────
97
+
98
/**
 * Normalize a JSON-LD @type value to a single string.
 * @type may be a string or an array of type names; the first entry wins.
 *
 * @param {string|string[]|undefined} t - Raw @type value.
 * @returns {string|null} Type name, or null when absent/empty.
 */
function resolveType(t) {
  if (!t) return null;
  if (Array.isArray(t)) {
    // Coerce like the scalar path, and guard the empty array,
    // which previously leaked `undefined` to the caller.
    return t.length > 0 ? String(t[0]) : null;
  }
  return String(t);
}
103
+
104
/**
 * Pull aggregateRating value/count from a schema node.
 *
 * @param {Object} obj - Parsed JSON-LD node.
 * @returns {{value: number|null, count: number|null}}
 */
function extractRating(obj) {
  const ar = obj.aggregateRating;
  if (!ar) return { value: null, count: null };

  // Number.isFinite instead of `|| null` so a legitimate 0 rating or
  // 0-review count is preserved rather than collapsed to null; `??`
  // instead of `||` so reviewCount 0 is not skipped; explicit radix.
  const value = Number.parseFloat(ar.ratingValue);
  const count = Number.parseInt(ar.reviewCount ?? ar.ratingCount, 10);
  return {
    value: Number.isFinite(value) ? value : null,
    count: Number.isFinite(count) ? count : null,
  };
}
112
+
113
/**
 * Extract price/currency from a schema node's `offers` field.
 * Supports a single Offer with an explicit price, a priceRange string,
 * an AggregateOffer (lowPrice/highPrice), and arrays of offers
 * (first entry wins).
 *
 * @param {Object} obj - Parsed JSON-LD node.
 * @returns {{price: string|null, currency: string|null}}
 */
function extractOffers(obj) {
  const offers = obj.offers;
  const empty = { price: null, currency: null };
  if (!offers) return empty;

  const currency = () => str(offers.priceCurrency) || null;

  // Single offer with an explicit price
  if (offers.price !== undefined) {
    return { price: String(offers.price), currency: currency() };
  }

  // Offer expressed as a price range string
  if (offers.priceRange) {
    return { price: str(offers.priceRange), currency: currency() };
  }

  // AggregateOffer with low/high bounds
  if (offers.lowPrice !== undefined || offers.highPrice !== undefined) {
    return {
      price: `${offers.lowPrice ?? '?'}-${offers.highPrice ?? '?'}`,
      currency: currency(),
    };
  }

  // Array of offers — recurse on the first entry
  if (Array.isArray(offers) && offers.length > 0) {
    return extractOffers({ offers: offers[0] });
  }

  return empty;
}
147
+
148
/**
 * Extract an author name (or comma-joined names) from a schema node.
 * Handles string authors, single Person/Organization objects, and arrays.
 *
 * @param {Object} obj - Parsed JSON-LD node.
 * @returns {string|null}
 */
function extractAuthor(obj) {
  const author = obj.author;
  if (!author) return null;
  if (typeof author === 'string') return author;
  if (Array.isArray(author)) {
    const names = author
      // Guard each entry: strings pass through, objects contribute their
      // name, anything else (null, numbers) is dropped — previously an
      // object without .name stringified to "[object Object]" and a
      // null entry threw.
      .map((a) => {
        if (typeof a === 'string') return str(a);
        if (a && typeof a === 'object') return str(a.name);
        return null;
      })
      .filter(Boolean);
    return names.length > 0 ? names.join(', ') : null;
  }
  return str(author.name) || null;
}
155
+
156
/**
 * Extract a representative image URL from a schema node.
 * `image` may be a URL string, an ImageObject ({url}), or an array
 * of either (first entry wins).
 *
 * @param {Object} obj - Parsed JSON-LD node.
 * @returns {string|null}
 */
function extractImage(obj) {
  const img = obj.image;
  if (!img) return null;
  if (typeof img === 'string') return img;
  if (Array.isArray(img)) {
    const first = img[0];
    return typeof first === 'string' ? first : first?.url || null;
  }
  return str(img.url) || null;
}
163
+
164
/**
 * Coerce any value to a trimmed, non-empty string; otherwise null.
 *
 * @param {*} v - Arbitrary value.
 * @returns {string|null}
 */
function str(v) {
  if (v == null) return null; // matches both null and undefined
  return typeof v === 'string' ? (v.trim() || null) : String(v);
}
@@ -0,0 +1,103 @@
1
+ /**
2
+ * Sitemap.xml fetcher + parser
3
+ * Discovers URLs from sitemap before link-following begins.
4
+ */
5
+
6
+ import fetch from 'node-fetch';
7
+
8
// Per-request timeout for sitemap fetches (ms).
const SITEMAP_TIMEOUT = 10000;

/**
 * Fetch and parse sitemap.xml for a domain.
 * Probes the common sitemap locations in order and stops at the first
 * one that yields URLs. Handles sitemap index files and regular
 * sitemaps (delegated to parseSitemapUrl).
 *
 * @param {string} startUrl - Any URL on the target site.
 * @returns {Promise<Array<{url: string, lastmod: string|null}>>}
 */
export async function fetchSitemap(startUrl) {
  const { origin, hostname } = new URL(startUrl);
  const candidates = ['/sitemap.xml', '/sitemap_index.xml', '/sitemap-index.xml'];

  const seen = new Set();
  const discovered = [];

  for (const path of candidates) {
    try {
      const urls = await parseSitemapUrl(`${origin}${path}`, hostname, seen);
      discovered.push(...urls);
      if (urls.length > 0) break; // first working sitemap wins
    } catch {
      // Unreachable/unparseable candidate — try the next location.
    }
  }

  return discovered;
}
38
+
39
/**
 * Fetch one sitemap URL and return its page entries.
 * Recurses (depth-limited) into sitemap index files. Network or parse
 * failures return [] so a single bad sitemap never aborts discovery.
 *
 * @param {string} url - Sitemap URL to fetch.
 * @param {string} hostname - Only URLs on this host are kept.
 * @param {Set<string>} seen - Sitemap URLs already visited (cycle guard).
 * @param {number} [depth=0] - Current recursion depth (max 2).
 * @returns {Promise<Array<{url: string, lastmod: string|null}>>}
 */
async function parseSitemapUrl(url, hostname, seen, depth = 0) {
  if (depth > 2 || seen.has(url)) return []; // prevent infinite recursion
  seen.add(url);

  let text;
  try {
    // NOTE(review): the `timeout` option is node-fetch v2 only; v3
    // requires an AbortController — confirm the pinned version.
    const res = await fetch(url, {
      timeout: SITEMAP_TIMEOUT,
      headers: { 'User-Agent': 'SEOIntelBot/1.0' },
    });
    if (!res.ok) return [];
    text = await res.text();
  } catch {
    return [];
  }

  // Sitemap index — recurse into each child sitemap (max 20).
  if (text.includes('<sitemap>') || text.includes('<sitemapindex')) {
    const results = [];
    for (const childUrl of extractTagContent(text, 'loc').slice(0, 20)) {
      results.push(...await parseSitemapUrl(childUrl, hostname, seen, depth + 1));
    }
    return results;
  }

  const urls = [];
  const pushEntry = (loc, lastmod) => {
    try {
      const parsed = new URL(loc);
      // Only include URLs from the same hostname
      if (parsed.hostname !== hostname) return;
      // Skip non-page resources
      if (/\.(pdf|png|jpg|jpeg|gif|svg|css|js|woff|ico|xml)$/i.test(parsed.pathname)) return;
      urls.push({ url: parsed.href, lastmod: lastmod || null });
    } catch {
      // Malformed <loc> — skip this entry.
    }
  };

  // Regular sitemap. Pair each <loc> with the <lastmod> from its OWN
  // <url> entry: indexing two flat lists (all locs vs all lastmods)
  // misattributes dates as soon as one entry omits <lastmod>.
  const entries = text.match(/<url>[\s\S]*?<\/url>/gi);
  if (entries) {
    for (const entry of entries) {
      const loc = extractTagContent(entry, 'loc')[0];
      if (loc) pushEntry(loc, extractTagContent(entry, 'lastmod')[0]);
    }
  } else {
    // Loose sitemap without <url> wrappers — take bare <loc> tags;
    // there is no reliable way to attach dates.
    for (const loc of extractTagContent(text, 'loc')) pushEntry(loc, null);
  }

  return urls;
}
91
+
92
/**
 * Simple XML tag content extractor (no full XML parser needed).
 * Unwraps CDATA sections and skips entries that are empty after trim.
 *
 * @param {string} xml - Raw XML text.
 * @param {string} tagName - Tag to extract (internal, trusted values only).
 * @returns {string[]} Trimmed, non-empty tag contents in document order.
 */
function extractTagContent(xml, tagName) {
  const tagRe = new RegExp(`<${tagName}[^>]*>([\\s\\S]*?)</${tagName}>`, 'gi');
  const results = [];
  let match;
  while ((match = tagRe.exec(xml)) !== null) {
    let content = match[1].trim();
    // Sitemaps frequently wrap <loc> values in CDATA; unwrap so the URL
    // survives (the old ([^<]+) capture dropped CDATA entries entirely).
    const cdata = /^<!\[CDATA\[([\s\S]*)\]\]>$/.exec(content);
    if (cdata) content = cdata[1].trim();
    if (content) results.push(content);
  }
  return results;
}