seo-intel 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +41 -0
- package/LICENSE +75 -0
- package/README.md +243 -0
- package/Start SEO Intel.bat +9 -0
- package/Start SEO Intel.command +8 -0
- package/cli.js +3727 -0
- package/config/example.json +29 -0
- package/config/setup-wizard.js +522 -0
- package/crawler/index.js +566 -0
- package/crawler/robots.js +103 -0
- package/crawler/sanitize.js +124 -0
- package/crawler/schema-parser.js +168 -0
- package/crawler/sitemap.js +103 -0
- package/crawler/stealth.js +393 -0
- package/crawler/subdomain-discovery.js +341 -0
- package/db/db.js +213 -0
- package/db/schema.sql +120 -0
- package/exports/competitive.js +186 -0
- package/exports/heuristics.js +67 -0
- package/exports/queries.js +197 -0
- package/exports/suggestive.js +230 -0
- package/exports/technical.js +180 -0
- package/exports/templates.js +77 -0
- package/lib/gate.js +204 -0
- package/lib/license.js +369 -0
- package/lib/oauth.js +432 -0
- package/lib/updater.js +324 -0
- package/package.json +68 -0
- package/reports/generate-html.js +6194 -0
- package/reports/generate-site-graph.js +949 -0
- package/reports/gsc-loader.js +190 -0
- package/scheduler.js +142 -0
- package/seo-audit.js +619 -0
- package/seo-intel.png +0 -0
- package/server.js +602 -0
- package/setup/ROADMAP.md +109 -0
- package/setup/checks.js +483 -0
- package/setup/config-builder.js +227 -0
- package/setup/engine.js +65 -0
- package/setup/installers.js +197 -0
- package/setup/models.js +328 -0
- package/setup/openclaw-bridge.js +329 -0
- package/setup/validator.js +395 -0
- package/setup/web-routes.js +688 -0
- package/setup/wizard.html +2920 -0
- package/start-seo-intel.sh +8 -0
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
import { createHash } from 'crypto';
|
|
2
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync, statSync, unlinkSync } from 'fs';
|
|
3
|
+
import { join, dirname } from 'path';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
5
|
+
import { chromium } from 'playwright';
|
|
6
|
+
import { sanitize, extractSelective, extractAsMarkdown } from './sanitize.js';
|
|
7
|
+
|
|
8
|
+
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));

// Saved browser sessions live in a hidden folder one level above this module.
const SESSIONS_DIR = join(__dirname, '..', '.sessions');

// Saved sessions older than this are discarded: 7 days x 86,400,000 ms per day.
const SESSION_MAX_AGE_MS = 7 * 86400000;
|
|
11
|
+
|
|
12
|
+
// ── Stealth fingerprint pools ───────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
// Desktop Chrome user-agent strings (Chrome 131 and 130, on macOS and Windows).
const USER_AGENTS = [
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
];

// Viewport sizes, listed as [width, height] pairs and expanded into objects.
const VIEWPORTS = [
  [1920, 1080],
  [1536, 864],
  [1440, 900],
  [1366, 768],
  [1280, 720],
].map(([width, height]) => ({ width, height }));

// Referer values used when navigating to a page.
const REFERRERS = ['https://www.google.com/', 'https://www.google.com/search?q=site', 'https://www.google.com/search?q='];

// IANA timezone identifiers a browser context may be given.
const TIMEZONES = ['America/New_York', 'America/Chicago', 'America/Los_Angeles', 'Europe/London'];

// Browser locales a browser context may be given.
const LOCALES = [
  'en-US',
  'en-GB',
  'en',
];
|
|
43
|
+
|
|
44
|
+
// ── Advanced rendering script — injected before any page JS runs ───────────
|
|
45
|
+
|
|
46
|
+
/**
 * Browser-side script (as source text) that patches properties commonly used
 * to detect headless/automated Chromium. It is meant to be registered as an
 * init script so it runs before any page script.
 *
 * The whole script is wrapped in an IIFE so that its helper bindings (for
 * example the saved original `permissions.query`) never become visible to
 * page scripts, regardless of how the host evaluates the source.
 */
export const STEALTH_INIT_SCRIPT = `
(() => {
  // 1. navigator.webdriver = false (headless sets this to true)
  Object.defineProperty(navigator, 'webdriver', { get: () => false });

  // 2. Fake plugins array (headless Chrome has 0 plugins)
  Object.defineProperty(navigator, 'plugins', {
    get: () => {
      const plugins = [
        { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format', length: 1 },
        { name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '', length: 1 },
        { name: 'Native Client', filename: 'internal-nacl-plugin', description: '', length: 1 },
      ];
      plugins.length = 3;
      return plugins;
    }
  });

  // 3. Fake languages (headless often shows empty or minimal)
  Object.defineProperty(navigator, 'languages', {
    get: () => ['en-US', 'en']
  });

  // 4. chrome.runtime should exist but be empty (headless has undefined)
  if (!window.chrome) window.chrome = {};
  if (!window.chrome.runtime) window.chrome.runtime = {};

  // 5. Permissions API — answer "notifications" queries with
  //    Notification.permission so the two APIs never disagree
  const permissions = window.navigator.permissions;
  const originalQuery = permissions?.query;
  if (originalQuery) {
    permissions.query = (params) => {
      if (params.name === 'notifications') {
        return Promise.resolve({ state: Notification.permission });
      }
      return originalQuery.call(window.navigator.permissions, params);
    };
  }

  // 6. WebGL vendor/renderer (headless returns "Google Inc." / "ANGLE...").
  //    37445 / 37446 are UNMASKED_VENDOR_WEBGL / UNMASKED_RENDERER_WEBGL.
  //    Patch WebGL1 and WebGL2 so both context types report the same values.
  try {
    for (const name of ['WebGLRenderingContext', 'WebGL2RenderingContext']) {
      const proto = window[name]?.prototype;
      if (!proto) continue;
      const getParameter = proto.getParameter;
      proto.getParameter = function(parameter) {
        if (parameter === 37445) return 'Intel Inc.';
        if (parameter === 37446) return 'Intel Iris OpenGL Engine';
        return getParameter.call(this, parameter);
      };
    }
  } catch {}

  // 7. Fake connection info (headless may report differently)
  if (navigator.connection) {
    Object.defineProperty(navigator.connection, 'rtt', { get: () => 50 });
  }

  // 8. Outer window size — report an outer size slightly larger than the
  //    inner size, as a window with browser chrome would
  Object.defineProperty(window, 'outerHeight', { get: () => window.innerHeight + 85 });
  Object.defineProperty(window, 'outerWidth', { get: () => window.innerWidth + 15 });
})();
`;
|
|
102
|
+
|
|
103
|
+
// ── Utility ─────────────────────────────────────────────────────────────
|
|
104
|
+
|
|
105
|
+
/**
 * Return one element of `arr`, chosen with Math.random().
 * An empty array yields `undefined`.
 *
 * @param {Array} arr - Candidates to choose from.
 * @returns {*} The chosen element.
 */
function pick(arr) {
  const index = Math.floor(Math.random() * arr.length);
  return arr[index];
}
|
|
108
|
+
|
|
109
|
+
// ── Content quality gate ────────────────────────────────────────────────
|
|
110
|
+
// Markers of an unrendered JS app shell: a framework mount-point id, or a
// <noscript> message asking the visitor to enable JavaScript.
const SHELL_PATTERNS = /id=["'](root|app|__next|__nuxt)["']|<noscript[^>]*>.*enable javascript/i;
// Phrases and identifiers that appear on bot-challenge / verification pages.
const CAPTCHA_PATTERNS = /cf-browser-verification|checking your browser|just a moment|verify you are human|challenge-platform/i;

/**
 * Decide whether extracted page content is usable.
 *
 * Checks run in priority order; the first that matches sets the reason:
 *   'blocked'  — body text matches a challenge-page pattern
 *   'js-shell' — fewer than 30 words, a non-empty title, and app-shell markers
 *   'empty'    — fewer than 10 words
 *
 * @param {{wordCount: number, bodyText: string, title: string}} page
 * @returns {{ok: boolean, reason: (string|null)}} `reason` is null when ok.
 */
function assessQuality({ wordCount, bodyText, title }) {
  let reason = null;

  if (CAPTCHA_PATTERNS.test(bodyText)) {
    reason = 'blocked';
  } else if (wordCount < 30 && title && SHELL_PATTERNS.test(bodyText)) {
    reason = 'js-shell';
  } else if (wordCount < 10) {
    reason = 'empty';
  }

  return { ok: reason === null, reason };
}
|
|
119
|
+
|
|
120
|
+
// ── Session persistence — save/load cookies across stealth runs ──────────
|
|
121
|
+
|
|
122
|
+
/**
 * Look up the saved session file for `domain`.
 *
 * A file older than SESSION_MAX_AGE_MS (by modification time) is deleted and
 * treated as missing. Any filesystem error also results in `null`.
 *
 * @param {string} domain - Used as the session file's base name.
 * @returns {string|null} Path to the session file, or null if there is no
 *   usable session.
 */
export function loadSessionState(domain) {
  const file = join(SESSIONS_DIR, `${domain}.json`);

  try {
    if (!existsSync(file)) {
      return null;
    }

    const ageMs = Date.now() - statSync(file).mtimeMs;
    const expired = ageMs > SESSION_MAX_AGE_MS;

    if (!expired) {
      console.log(`[stealth] Reusing session for ${domain} (${Math.round(ageMs / 3600000)}h old)`);
      return file;
    }

    // Too old: remove the file so the next save starts from a clean slate.
    unlinkSync(file);
    console.log(`[stealth] Session expired for ${domain} (${Math.round(ageMs / 86400000)}d old) — starting fresh`);
    return null;
  } catch {
    return null;
  }
}
|
|
136
|
+
|
|
137
|
+
/**
 * Write the context's storage state to the session file for `domain`.
 *
 * Failures are logged and swallowed; this function never rejects because of
 * a filesystem or storage-state error.
 *
 * @param {object} context - Object exposing an async `storageState()` method.
 * @param {string} domain - Used as the session file's base name.
 * @returns {Promise<void>}
 */
export async function saveSessionState(context, domain) {
  try {
    const dirMissing = !existsSync(SESSIONS_DIR);
    if (dirMissing) {
      mkdirSync(SESSIONS_DIR, { recursive: true });
    }

    const file = join(SESSIONS_DIR, `${domain}.json`);
    const snapshot = await context.storageState();
    writeFileSync(file, JSON.stringify(snapshot));

    console.log(`[stealth] Session saved for ${domain}`);
  } catch (err) {
    console.log(`[stealth] Failed to save session for ${domain}: ${err.message}`);
  }
}
|
|
148
|
+
|
|
149
|
+
/**
 * Delete the saved session file for `domain`, if any.
 * Best-effort: a missing file or any other unlink error is ignored.
 *
 * @param {string} domain - Used as the session file's base name.
 * @returns {void}
 */
export function discardSession(domain) {
  const file = join(SESSIONS_DIR, `${domain}.json`);
  try {
    unlinkSync(file);
  } catch {
    // Nothing to discard, or the file could not be removed — ignore either way.
  }
}
|
|
153
|
+
|
|
154
|
+
/**
 * Short fingerprint of a text: the first 16 hex characters of its SHA-256.
 * Falsy input (null, undefined, '') is hashed as the empty string.
 *
 * @param {string} text
 * @returns {string} 16-character lowercase hex string.
 */
function contentHash(text) {
  const hasher = createHash('sha256');
  hasher.update(text || '');
  const hex = hasher.digest('hex');
  return hex.slice(0, 16);
}
|
|
157
|
+
|
|
158
|
+
// ── Human-like scrolling ────────────────────────────────────────────────
|
|
159
|
+
|
|
160
|
+
/**
 * Scroll the page down in irregular mouse-wheel steps, then jump back to top.
 *
 * The scroll distance is capped at three viewport heights (a 900px viewport
 * is assumed when the page reports none) or the document height, whichever is
 * smaller. Each step is 200–500px followed by a 150–500ms pause.
 * Best-effort: any error ends the scroll silently.
 *
 * @param {object} page - Page object providing evaluate, viewportSize,
 *   mouse.wheel and waitForTimeout.
 * @returns {Promise<void>}
 */
async function humanScroll(page) {
  try {
    const pageHeight = await page.evaluate(() => document.body.scrollHeight);
    const screenHeight = page.viewportSize()?.height || 900;
    const limit = Math.min(pageHeight, screenHeight * 3);

    for (let travelled = 0; travelled < limit; ) {
      const delta = 200 + Math.random() * 300;
      await page.mouse.wheel(0, delta);
      travelled += delta;
      await page.waitForTimeout(150 + Math.random() * 350);
    }

    // Return to the top of the page, then pause briefly.
    await page.evaluate(() => window.scrollTo(0, 0));
    await page.waitForTimeout(300);
  } catch {
    // Scrolling is optional; a page that cannot be scrolled is not an error.
  }
}
|
|
181
|
+
|
|
182
|
+
// ── Shared stealth config for crawlDomain() ─────────────────────────────
|
|
183
|
+
|
|
184
|
+
/**
 * Build a randomized browser configuration: launch arguments plus context
 * options (user agent, viewport, locale, timezone, extra HTTP headers).
 *
 * The user agent, viewport, locale and timezone are each drawn independently
 * from their pools. The `Sec-Ch-Ua` version and `Sec-Ch-Ua-Platform` headers
 * are derived from the chosen user agent so the client hints never contradict
 * the User-Agent string (e.g. a Windows UA paired with a "macOS" platform hint).
 *
 * @returns {{launchArgs: {args: string[]}, contextOpts: object}}
 *   `launchArgs` is spread into the browser launch options; `contextOpts` is
 *   passed when creating a browser context.
 */
export function getStealthConfig() {
  const userAgent = pick(USER_AGENTS);
  const viewport = pick(VIEWPORTS);

  // Keep client hints in step with the picked UA. Fall back to the previous
  // fixed values if the UA string has an unexpected shape.
  const chromeMajor = /Chrome\/(\d+)/.exec(userAgent)?.[1] ?? '131';
  const platform = userAgent.includes('Windows') ? 'Windows' : 'macOS';

  return {
    launchArgs: {
      args: [
        '--disable-blink-features=AutomationControlled',
        '--disable-features=IsolateOrigins,site-per-process',
        '--disable-infobars',
        '--no-first-run',
      ],
    },
    contextOpts: {
      userAgent,
      viewport,
      locale: pick(LOCALES),
      timezoneId: pick(TIMEZONES),
      ignoreHTTPSErrors: true,
      extraHTTPHeaders: {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Sec-Ch-Ua': `"Chromium";v="${chromeMajor}", "Not_A Brand";v="24"`,
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': `"${platform}"`,
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'cross-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
      },
    },
  };
}
|
|
218
|
+
|
|
219
|
+
// ── Stealth route handler (blocks images/fonts/tracking) ────────────────
|
|
220
|
+
|
|
221
|
+
/**
 * Install a catch-all route on the context that aborts requests for images,
 * media and fonts, and requests whose URL matches a known analytics/tracking
 * host pattern. Every other request continues unchanged.
 *
 * @param {object} context - Object exposing an async `route(pattern, handler)`.
 * @returns {Promise<void>}
 */
export async function applyStealthRoutes(context) {
  const blockedTypes = ['image', 'media', 'font'];
  const trackerPattern = /google-analytics|googletagmanager|facebook\.net|doubleclick|hotjar|segment\.io|intercom|sentry\.io/;

  // The URL is only inspected when the resource type alone did not block it.
  const shouldBlock = (request) =>
    blockedTypes.includes(request.resourceType()) || trackerPattern.test(request.url());

  await context.route('**/*', (route) =>
    shouldBlock(route.request()) ? route.abort() : route.continue(),
  );
}
|
|
234
|
+
|
|
235
|
+
// ── Session-based stealth fetcher (for extract command) ─────────────────
|
|
236
|
+
|
|
237
|
+
/**
 * Launch a headless Chromium instance with a randomized stealth fingerprint
 * and expose a small API for fetching pages through it.
 *
 * The browser context receives the stealth init script and the
 * resource-blocking routes before any page is opened. If that setup fails,
 * the browser is closed before the error is rethrown so no Chromium process
 * is left running.
 *
 * Reads CRAWL_TIMEOUT_MS from the environment as the navigation timeout in
 * milliseconds (default 15000; non-numeric or negative values fall back to
 * the default).
 *
 * @param {object} [opts] - Accepted for API compatibility; not read here.
 * @returns {Promise<{fetchPage: function(string): Promise<object>,
 *   close: function(): Promise<void>, getPageCount: function(): number}>}
 *   `getPageCount` reports how many pages were fully extracted (HTTP error
 *   responses and failed navigations are not counted).
 */
export async function createStealthSession(opts = {}) {
  const stealthCfg = getStealthConfig();

  const browser = await chromium.launch({
    headless: true,
    ...stealthCfg.launchArgs,
  });

  let context;
  try {
    context = await browser.newContext(stealthCfg.contextOpts);

    // Inject stealth patches before any page loads
    await context.addInitScript(STEALTH_INIT_SCRIPT);

    // Block unnecessary resources
    await applyStealthRoutes(context);
  } catch (err) {
    // Setup failed after launch: release the browser, then surface the error.
    await browser.close().catch(() => {});
    throw err;
  }

  let fetchCount = 0;
  const parsedTimeout = Number.parseInt(process.env.CRAWL_TIMEOUT_MS ?? '', 10);
  const TIMEOUT = Number.isFinite(parsedTimeout) && parsedTimeout >= 0 ? parsedTimeout : 15000;

  /**
   * Open `url` in a new page, extract its SEO-relevant data, and close the page.
   *
   * @param {string} url - Absolute URL to load.
   * @returns {Promise<object>} Page record with url, depth (always 0), status,
   *   loadMs, wordCount, isIndexable, title, metaDesc, headings, links,
   *   bodyText, schemaTypes, vitals, publishedDate, modifiedDate and
   *   contentHash. Successful extractions also carry `quality` and
   *   `qualityReason`; responses with status >= 400 return an empty record
   *   without them.
   * @throws Rethrows the navigation error when both navigation attempts fail.
   */
  async function fetchPage(url) {
    const page = await context.newPage();

    try {
      const referrer = pick(REFERRERS);
      const t0 = Date.now();

      // Navigate with referrer. Try the cheaper 'domcontentloaded' wait first;
      // if that attempt throws, retry once waiting for the full 'load' event.
      let res;
      for (const waitUntil of ['domcontentloaded', 'load']) {
        try {
          res = await page.goto(url, { waitUntil, timeout: TIMEOUT, referer: referrer });
          break;
        } catch (err) {
          if (waitUntil === 'load') throw err;
        }
      }

      const status = res?.status() || 0;
      const loadMs = Date.now() - t0;

      if (status >= 400) {
        return {
          url, depth: 0, status, loadMs, wordCount: 0, isIndexable: false,
          title: '', metaDesc: '', headings: [], links: [], bodyText: '',
          schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null,
          contentHash: null,
        };
      }

      // Scroll like a human to trigger lazy content
      await humanScroll(page);

      // ── Extract all page data (mirrors processPage from crawler/index.js) ──

      const title = await page.title().catch(() => '');
      const metaDesc = await page.$eval('meta[name="description"]', el => el.content).catch(() => '');

      const headings = await page.$$eval('h1,h2,h3,h4,h5,h6', els =>
        els.map(el => ({ level: parseInt(el.tagName[1], 10), text: el.innerText?.trim().slice(0, 200) })).filter(h => h.text)
      ).catch(() => []);

      const base = new URL(url);
      const links = await page.$$eval('a[href]', (els, baseHref) =>
        els.map(el => {
          try { return { url: new URL(el.href, baseHref).href, anchor: el.innerText?.trim().slice(0, 100) || '' }; }
          catch { return null; }
        }).filter(Boolean), base.href
      ).catch(() => []);

      // Classify every link exactly once. A link is internal when it targets
      // the same hostname or shares the last two hostname labels with the
      // page; everything else is external. (Previously a sibling-subdomain
      // link landed in both lists with conflicting isInternal flags.)
      const getRootDomain = h => h.split('.').slice(-2).join('.');
      const baseRoot = getRootDomain(base.hostname);
      const internalLinks = [];
      const externalLinks = [];
      for (const link of links) {
        let host;
        try {
          host = new URL(link.url).hostname;
        } catch {
          continue; // unparseable URL: drop it
        }
        const isInternal = host === base.hostname || getRootDomain(host) === baseRoot;
        (isInternal ? internalLinks : externalLinks).push({ ...link, isInternal });
      }

      // Prefer the markdown extraction; fall back to a selector-based one
      // when it fails or comes back empty.
      const bodyText = await extractAsMarkdown(page).catch(() => '')
        || await extractSelective(page, ['h1', 'h2', 'h3', 'p', 'li', 'span.hero', 'div.tagline']).catch(() => '');

      const schemaTypes = await page.$$eval('script[type="application/ld+json"]', els => {
        const types = [];
        for (const el of els) { try { const d = JSON.parse(el.textContent); types.push(d['@type']); } catch {} }
        return types.filter(Boolean);
      }).catch(() => []);

      // Largest Contentful Paint, sampled in-page for 1s. The outer 1.5s
      // timer guarantees this step cannot stall the fetch.
      const vitals = await Promise.race([
        page.evaluate(() => new Promise(resolve => {
          let lcp = null;
          try {
            new PerformanceObserver(list => { lcp = list.getEntries().at(-1)?.startTime || null; })
              .observe({ type: 'largest-contentful-paint', buffered: true });
          } catch {}
          setTimeout(() => resolve({ lcp }), 1000);
        })),
        new Promise(resolve => setTimeout(() => resolve({}), 1500)),
      ]).catch(() => ({}));

      const wordCount = await page.$eval('body', el => el.innerText.split(/\s+/).filter(Boolean).length).catch(() => 0);

      const robotsMeta = await page.$eval('meta[name="robots"]', el => el.content).catch(() => '');
      const isIndexable = !robotsMeta.toLowerCase().includes('noindex');

      // Publication date: meta tags first, then JSON-LD `datePublished`.
      const publishedDate = await page.evaluate(() => {
        for (const sel of ['meta[property="article:published_time"]', 'meta[name="date"]', 'meta[itemprop="datePublished"]']) {
          const el = document.querySelector(sel);
          if (el?.content) return el.content;
        }
        for (const el of document.querySelectorAll('script[type="application/ld+json"]')) {
          try { const d = JSON.parse(el.textContent); if (d.datePublished) return d.datePublished; } catch {}
        }
        return null;
      }).catch(() => null);

      // Modification date: meta tags first, then JSON-LD `dateModified`.
      const modifiedDate = await page.evaluate(() => {
        for (const sel of ['meta[property="article:modified_time"]', 'meta[name="last-modified"]', 'meta[itemprop="dateModified"]']) {
          const el = document.querySelector(sel);
          if (el?.content) return el.content;
        }
        for (const el of document.querySelectorAll('script[type="application/ld+json"]')) {
          try { const d = JSON.parse(el.textContent); if (d.dateModified) return d.dateModified; } catch {}
        }
        return null;
      }).catch(() => null);

      // Hash the raw extracted text, before it is sanitized for the result.
      const hash = contentHash(bodyText);
      fetchCount++;

      // ── Quality gate ──
      const quality = assessQuality({ wordCount, bodyText, title });

      return {
        url, depth: 0, status, loadMs, wordCount, isIndexable,
        title, metaDesc, headings,
        links: [...internalLinks, ...externalLinks],
        bodyText: sanitize(bodyText, 2000),
        schemaTypes, vitals, publishedDate, modifiedDate,
        contentHash: hash,
        quality: quality.ok, qualityReason: quality.reason,
      };

    } finally {
      await page.close().catch(() => {});
    }
  }

  /** Close the browser; errors while closing are ignored. */
  async function close() {
    await browser.close().catch(() => {});
  }

  return { fetchPage, close, getPageCount: () => fetchCount };
}
|