create-backlist 10.0.3 → 10.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,305 @@
1
+ // Smart crawler with HTTP fallback when browser unavailable
2
+ import { URL } from 'node:url';
3
+ import { shortId } from '../qa-engine.js';
4
+ import { getBrowserLaunchOptions } from './installer.js';
5
+
6
+ const FETCH_TIMEOUT = 12_000; // milliseconds before a single HTTP probe is aborted via AbortController
7
+
8
+ // ── HTTP-only crawler (no browser needed) ─────────────────────────────────
9
/**
 * Crawler that discovers routes with plain HTTP requests (global `fetch`),
 * used when no browser is available. Links and forms are extracted from the
 * raw HTML with regular expressions, so content rendered by client-side
 * JavaScript is not seen.
 */
export class HTTPCrawler {
  #visited = new Set();

  /**
   * Breadth-first crawl of same-origin pages starting at `baseUrl`, followed
   * by a probe of a fixed list of well-known API/resource paths.
   *
   * @param {string} baseUrl - Absolute URL to start from; also defines the origin.
   * @param {object} [options]
   * @param {number} [options.maxPages=40] - Stop following links once this many routes are recorded.
   * @param {number} [options.maxDepth=3] - Deepest link distance from `baseUrl` that is still probed.
   * @param {(route: object) => void} [options.onRoute] - Called with every recorded route.
   * @returns {Promise<object[]>} Route records in discovery order.
   */
  async crawl(baseUrl, { maxPages = 40, maxDepth = 3, onRoute } = {}) {
    this.#visited.clear();
    const queue = [{ url: baseUrl, depth: 0 }];
    const routes = [];

    while (queue.length > 0 && routes.length < maxPages) {
      const { url, depth } = queue.shift();
      const norm = this.#norm(url);
      if (!norm || this.#visited.has(norm)) continue;
      if (!this.#sameOrigin(norm, baseUrl)) continue;
      if (depth > maxDepth) continue;

      this.#visited.add(norm);

      const route = await this.#probeURL(norm, depth);
      routes.push(route);
      if (onRoute) onRoute(route);

      // Queue every not-yet-visited same-origin link one level deeper.
      for (const link of route.links ?? []) {
        const ln = this.#norm(link);
        if (ln && !this.#visited.has(ln) && this.#sameOrigin(ln, baseUrl)) {
          queue.push({ url: ln, depth: depth + 1 });
        }
      }
    }

    // Also probe common API paths
    const apiPaths = [
      '/api/health', '/api/status', '/api/v1/health',
      '/api/v1/users', '/api/v1/products', '/health',
      '/api/docs', '/sitemap.xml', '/robots.txt',
    ];

    for (const p of apiPaths) {
      const url = new URL(p, baseUrl).toString();
      const norm = this.#norm(url);
      if (this.#visited.has(norm)) continue;
      this.#visited.add(norm);
      const route = await this.#probeURL(url, 0);
      // Only keep guesses that produced a response below 500; network
      // failures (status 0) and server errors are dropped.
      if (route.status > 0 && route.status < 500) {
        routes.push(route);
        if (onRoute) onRoute(route);
      }
    }

    return routes;
  }

  /**
   * Fetch one URL and describe it as a route record. Never rejects: failures
   * are reported as a record with `type: 'error'` and `status: 0`.
   *
   * @param {string} url - Absolute URL to request.
   * @param {number} depth - Link distance from the crawl's start URL.
   * @returns {Promise<object>} Route record.
   */
  async #probeURL(url, depth) {
    const t0 = Date.now();
    const controller = new AbortController();
    // The timer stays armed until the body has been read as well, and is
    // always cleared in `finally` so a failed request cannot leave a pending
    // timer that keeps the process alive.
    const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT);

    try {
      const res = await fetch(url, {
        signal  : controller.signal,
        headers : { 'User-Agent': 'Backlist-QA/12.0', Accept: 'text/html,application/json,*/*' },
        redirect: 'follow',
      });

      const ct       = res.headers.get('content-type') || '';
      const duration = Date.now() - t0;
      const headers  = {};
      res.headers.forEach((v, k) => { headers[k] = v; });

      let links = [];
      let forms = [];

      if (ct.includes('text/html')) {
        let text = '';
        // Best effort: an unreadable body simply yields no links/forms.
        try { text = await res.text(); } catch { /* keep empty text */ }
        links = this.#extractLinks(text, url);
        forms = this.#extractForms(text);
      }

      const type = this.#detectType(url, ct, res.status);

      return {
        id: shortId(), url, type, status: res.status,
        duration, depth, links, forms, headers,
        contentType: ct,
        error: null,
      };
    } catch (err) {
      // Same keys as the success record so consumers can read them blindly.
      return {
        id: shortId(), url, type: 'error', status: 0,
        duration: Date.now() - t0, depth,
        links: [], forms: [], headers: {},
        contentType: '',
        error: err.message,
      };
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Collect up to 30 distinct absolute URLs from quoted `href` attributes.
   * Values that start with `#` or `?` are not matched by the pattern.
   *
   * @param {string} html - Raw HTML text.
   * @param {string} base - URL used to resolve relative hrefs.
   * @returns {string[]} Absolute URLs, de-duplicated, in document order.
   */
  #extractLinks(html, base) {
    const links = [];
    const re = /href=["']([^"'#?][^"']*?)["']/gi;
    let m;
    while ((m = re.exec(html)) !== null) {
      // hrefs that cannot be resolved to a URL are skipped.
      try { links.push(new URL(m[1], base).toString()); } catch { /* skip */ }
    }
    return [...new Set(links)].slice(0, 30);
  }

  /**
   * Describe every `<form>…</form>` in the HTML with its action, method and
   * named `<input>` fields.
   *
   * @param {string} html - Raw HTML text.
   * @returns {{action: string, method: string, fields: {name: string, type: string, required: boolean}[]}[]}
   */
  #extractForms(html) {
    const forms = [];
    const re = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
    // `required` must be an attribute of its own; a bare substring test would
    // also fire on values such as name="required_note".
    const requiredAttr = /(?:^|\s)required(?=[\s=/]|$)/i;
    let m;
    while ((m = re.exec(html)) !== null) {
      const attrs  = m[1];
      const body   = m[2];
      const action = (attrs.match(/action=["']([^"']+)["']/) || [])[1] || '';
      const method = (attrs.match(/method=["']([^"']+)["']/) || [])[1] || 'GET';
      const fields = [];
      const inpRe  = /<input([^>]*)>/gi;
      let inp;
      while ((inp = inpRe.exec(body)) !== null) {
        const name     = (inp[1].match(/name=["']([^"']+)["']/) || [])[1];
        const type     = (inp[1].match(/type=["']([^"']+)["']/) || [])[1] || 'text';
        const required = requiredAttr.test(inp[1]);
        if (name) fields.push({ name, type, required });
      }
      forms.push({ action, method, fields });
    }
    return forms;
  }

  /** Return the URL without its fragment, or `null` when it cannot be parsed. */
  #norm(url) {
    try { const u = new URL(url); u.hash = ''; return u.toString(); }
    catch { return null; }
  }

  /** True when both URLs parse and share the same origin. */
  #sameOrigin(url, base) {
    try { return new URL(url).origin === new URL(base).origin; }
    catch { return false; }
  }

  /**
   * Classify a route from its URL, content type and status. Checks run in
   * order, so an HTTP error status wins over every URL-based guess.
   */
  #detectType(url, ct, status) {
    if (status >= 400) return 'error-page';
    if (ct.includes('json') || url.includes('/api/')) return 'api';
    if (url.endsWith('.xml') || url.endsWith('.txt')) return 'resource';
    if (/\/(login|signin|auth)/i.test(url)) return 'auth';
    if (/\/(admin)/i.test(url)) return 'admin';
    if (/\/(dashboard)/i.test(url)) return 'dashboard';
    return 'page';
  }
}
163
+
164
+ // ── Browser-powered crawler (Playwright) ──────────────────────────────────
165
/**
 * Crawler that renders pages in Chromium through Playwright and falls back
 * to `HTTPCrawler` whenever a browser cannot be used (no executable found,
 * `playwright` not importable, or the launch fails).
 */
export class SmartCrawler {
  #visited    = new Set();
  #launchOpts = null;

  constructor(_playwright) {} // playwright passed but we resolve it ourselves

  /**
   * Breadth-first crawl of same-origin pages starting at `baseUrl`.
   *
   * @param {string} baseUrl - Absolute URL to start from; also defines the origin.
   * @param {object} [options]
   * @param {number} [options.maxPages=60] - Stop once this many routes are recorded.
   * @param {number} [options.maxDepth=4] - Deepest link distance that is still visited (browser mode).
   * @param {(route: object) => void} [options.onRoute] - Called with every recorded route.
   * @returns {Promise<object[]>} Route records in discovery order.
   */
  async crawl(baseUrl, { maxPages = 60, maxDepth = 4, onRoute } = {}) {
    // Resolve launch options including auto-install (cached per instance)
    if (!this.#launchOpts) {
      this.#launchOpts = await getBrowserLaunchOptions();
    }

    // Single place for the HTTP-only fallback used by all three failure paths.
    const crawlOverHttp = () => new HTTPCrawler().crawl(baseUrl, { maxPages, onRoute });

    // If browser unavailable — delegate to HTTP crawler
    if (!this.#launchOpts.available) return crawlOverHttp();

    this.#visited.clear();

    let playwright;
    try { playwright = await import('playwright'); }
    catch { return crawlOverHttp(); }

    const { executablePath, headless, args } = this.#launchOpts;

    let browser;
    try {
      browser = await playwright.chromium.launch({ executablePath, headless, args });
    } catch (err) {
      await this.#reportLaunchFailure(err);
      return crawlOverHttp();
    }

    const routes = [];
    const queue  = [{ url: baseUrl, depth: 0 }];
    let context;

    try {
      // Created inside the try so the browser is still closed if this throws.
      context = await browser.newContext({
        userAgent        : 'Backlist-QA-Crawler/12.0',
        ignoreHTTPSErrors: true,
      });

      while (queue.length > 0 && routes.length < maxPages) {
        const { url, depth } = queue.shift();
        const norm = this.#norm(url);
        if (!norm || this.#visited.has(norm)) continue;
        if (!this.#sameOrigin(norm, baseUrl)) continue;
        if (depth > maxDepth) continue;
        this.#visited.add(norm);

        const route = await this.#probePage(context, norm, depth, baseUrl);
        if (!route) continue;

        routes.push(route);
        if (onRoute) onRoute(route);

        // Origin is checked when the entry is dequeued, not here.
        for (const link of (route.links || [])) {
          const ln = this.#norm(link);
          if (ln && !this.#visited.has(ln)) {
            queue.push({ url: ln, depth: depth + 1 });
          }
        }
      }
    } finally {
      await context?.close().catch(() => {});
      await browser.close().catch(() => {});
    }

    return routes;
  }

  /**
   * Print the "browser launch failed" notice. `chalk` is loaded on demand
   * because this module has no static import for it; if it cannot be loaded
   * the same text is printed without colours so the fallback still proceeds.
   *
   * @param {Error} err - The launch error.
   */
  async #reportLaunchFailure(err) {
    let yellow = (s) => s;
    let gray   = (s) => s;
    try {
      const { default: chalk } = await import('chalk');
      yellow = (s) => chalk.yellow(s);
      gray   = (s) => chalk.gray(s);
    } catch {
      // Colours are cosmetic; keep the plain-text formatters.
    }
    console.log(yellow(` ⚠ Browser launch failed: ${err.message}`));
    console.log(gray(' Falling back to HTTP-only crawler...'));
  }

  /**
   * Open `url` in a new page and describe it as a route record. Same-origin
   * requests observed while loading that look like API calls (URL contains
   * `/api/` or resource type `fetch`) are added to the route's links.
   * Navigation failures yield a record with `type: 'error'`; the page is
   * always closed.
   *
   * @param {object} context - Playwright browser context.
   * @param {string} url - Normalised absolute URL to open.
   * @param {number} depth - Link distance from the crawl's start URL.
   * @param {string} baseUrl - Start URL, used for same-origin checks.
   * @returns {Promise<object>} Route record.
   */
  async #probePage(context, url, depth, baseUrl) {
    const page = await context.newPage();
    const networkRequests = [];
    const links = new Set();

    page.on('request', req => {
      const u = req.url();
      if ((u.includes('/api/') || req.resourceType() === 'fetch') && this.#sameOrigin(u, baseUrl)) {
        networkRequests.push({ url: u, method: req.method(), type: 'api' });
      }
    });

    try {
      const response = await page.goto(url, { waitUntil: 'networkidle', timeout: 15_000 });
      const status   = response?.status() || 0;
      const ct       = response?.headers()['content-type'] || '';

      // Non-HTML responses have no DOM worth querying.
      if (!ct.includes('text/html') && !ct.includes('application/xhtml')) {
        return { id: shortId(), url, type: this.#detectType(url, ct, status), status, depth, links: [], forms: [] };
      }

      const hrefs = await page.$$eval('a[href]', els => els.map(e => e.href).filter(Boolean)).catch(() => []);
      hrefs.forEach(h => links.add(h));

      const forms = await page.$$eval('form', els => els.map(f => ({
        action: f.action, method: f.method || 'GET',
        fields: Array.from(f.elements).map(el => ({
          name: el.name, type: el.type, required: el.required,
        })).filter(f => f.name),
      }))).catch(() => []);

      networkRequests.forEach(r => { if (this.#sameOrigin(r.url, baseUrl)) links.add(r.url); });

      return {
        id: shortId(), url, type: this.#detectType(url, ct, status),
        status, depth, links: [...links], forms, contentType: ct,
      };
    } catch (err) {
      return { id: shortId(), url, type: 'error', status: 0, depth, links: [], forms: [], error: err.message };
    } finally {
      await page.close().catch(() => {});
    }
  }

  /** Return the URL without its fragment, or `null` when it cannot be parsed. */
  #norm(url) {
    try { const u = new URL(url); u.hash = ''; return u.toString(); }
    catch { return null; }
  }

  /** True when both URLs parse and share the same origin. */
  #sameOrigin(url, base) {
    try { return new URL(url).origin === new URL(base).origin; }
    catch { return false; }
  }

  /**
   * Classify a route from its URL, content type and status. Checks run in
   * order, so an HTTP error status wins over every URL-based guess.
   */
  #detectType(url, ct, status) {
    if (status >= 400) return 'error-page';
    if (ct.includes('json') || url.includes('/api/')) return 'api';
    if (url.endsWith('.xml') || url.endsWith('.txt')) return 'resource';
    if (/\/(login|signin|auth)/i.test(url)) return 'auth';
    if (/\/(admin)/i.test(url)) return 'admin';
    if (/\/(dashboard)/i.test(url)) return 'dashboard';
    return 'page';
  }
}
@@ -0,0 +1,209 @@
1
+ // ═══════════════════════════════════════════════════════════════════════════
2
+ // Playwright Auto-Installer & Browser Manager
3
+ // Handles missing browsers gracefully with auto-install
4
+ // ═══════════════════════════════════════════════════════════════════════════
5
+
6
+ import { execSync, spawn } from 'node:child_process';
7
+ import path from 'node:path';
8
+ import fs from 'fs-extra';
9
+ import chalk from 'chalk';
10
+ import ora from 'ora';
11
+
12
+ // ── Browser executable paths per platform ────────────────────────────────
13
/**
 * Directory in which Playwright-managed browsers are expected.
 *
 * An explicit `PLAYWRIGHT_BROWSERS_PATH` takes precedence over the platform
 * default. The special value `0` is not a directory and is ignored here, so
 * the platform default is used for it.
 * NOTE(review): `0` asks Playwright to keep browsers inside node_modules;
 * that location is not searched here — confirm whether it should be.
 *
 * @returns {string} Cache directory path (not guaranteed to exist).
 */
function getPlaywrightCacheDir() {
  const override = process.env.PLAYWRIGHT_BROWSERS_PATH;
  if (override && override !== '0') {
    return path.resolve(override);
  }

  const home = process.env.HOME || process.env.USERPROFILE || '';
  switch (process.platform) {
    case 'win32':  return path.join(process.env.LOCALAPPDATA || path.join(home, 'AppData', 'Local'), 'ms-playwright');
    case 'darwin': return path.join(home, 'Library', 'Caches', 'ms-playwright');
    default:       return path.join(home, '.cache', 'ms-playwright');
  }
}
21
+
22
/**
 * Look for a Chromium executable inside the Playwright cache directory.
 *
 * Every entry whose name starts with `chromium` or `chrome` is searched,
 * highest numeric `-<revision>` suffix first, for a fixed list of known
 * executable locations (all platforms are tried regardless of the current
 * one). The revision is compared as a number so that e.g. `chromium-1000`
 * is preferred over `chromium-999`; entries without a numeric suffix go last.
 *
 * @returns {string|null} Path of the first executable found, or `null` when
 *   the cache is missing, unreadable, or contains none of the known paths.
 */
function findChromiumExecutable() {
  const cacheDir = getPlaywrightCacheDir();

  // Numeric revision from a trailing "-<digits>"; -1 when there is none.
  const revisionOf = (name) => {
    const match = /-(\d+)$/.exec(name);
    return match ? Number(match[1]) : -1;
  };

  try {
    if (!fs.existsSync(cacheDir)) return null;

    const entries = fs.readdirSync(cacheDir);

    // Look for any chromium variant folder, newest revision first. Ties keep
    // the previous reverse-lexicographic order.
    const chromiumDirs = entries
      .filter(e => e.startsWith('chromium') || e.startsWith('chrome'))
      .sort((a, b) => (revisionOf(b) - revisionOf(a)) || (a < b ? 1 : a > b ? -1 : 0));

    for (const dir of chromiumDirs) {
      const candidates = [
        // Headless shell (newer Playwright)
        path.join(cacheDir, dir, 'chrome-headless-shell-win64', 'chrome-headless-shell.exe'),
        path.join(cacheDir, dir, 'chrome-headless-shell-linux', 'chrome-headless-shell'),
        path.join(cacheDir, dir, 'chrome-headless-shell-mac_arm', 'chrome-headless-shell'),
        path.join(cacheDir, dir, 'chrome-headless-shell-mac_x64', 'chrome-headless-shell'),
        // Regular chromium
        path.join(cacheDir, dir, 'chrome-win64', 'chrome.exe'),
        path.join(cacheDir, dir, 'chrome-linux', 'chrome'),
        path.join(cacheDir, dir, 'chrome-mac', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),
        path.join(cacheDir, dir, 'chrome-mac_arm', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),
        path.join(cacheDir, dir, 'chromium-linux', 'chrome-linux', 'chrome'),
        path.join(cacheDir, dir, 'chromium-win64', 'chrome.exe'),
        path.join(cacheDir, dir, 'chromium-mac', 'chrome-mac', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),
      ];

      for (const candidate of candidates) {
        if (fs.existsSync(candidate)) return candidate;
      }
    }
  } catch {
    // Best effort: an unreadable cache is treated the same as "not installed".
  }

  return null;
}
61
+
62
+ // ── Check system Chrome/Chromium ──────────────────────────────────────────
63
/**
 * Locate a system-wide Chrome, Chromium or Edge installation by checking a
 * fixed list of well-known paths for the current platform. Platforms other
 * than Windows and macOS use the Linux list.
 *
 * @returns {string|null} First existing path in list order, or `null`.
 */
function findSystemChrome() {
  let knownLocations;

  switch (process.platform) {
    case 'win32':
      knownLocations = [
        'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
        'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
        path.join(process.env.LOCALAPPDATA || '', 'Google\\Chrome\\Application\\chrome.exe'),
        'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
        path.join(process.env.PROGRAMFILES || '', 'Microsoft\\Edge\\Application\\msedge.exe'),
      ];
      break;
    case 'darwin':
      knownLocations = [
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge',
      ];
      break;
    default:
      knownLocations = [
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        '/usr/bin/chromium-browser',
        '/usr/bin/chromium',
        '/snap/bin/chromium',
      ];
  }

  // A path whose existence check throws counts as "not there".
  const isPresent = (location) => {
    try {
      return fs.existsSync(location);
    } catch {
      return false;
    }
  };

  return knownLocations.find(isPresent) ?? null;
}
97
+
98
+ // ── Install Playwright browsers ───────────────────────────────────────────
99
/**
 * Run `playwright install chromium --with-deps` in a child process while
 * showing a spinner, giving up after three minutes.
 *
 * The promise never rejects and settles exactly once: whichever of
 * "process closed", "process error" or "timeout" happens first decides the
 * result and the spinner message, and the timeout timer is cancelled so it
 * can neither keep the process alive nor report a late, bogus timeout.
 *
 * NOTE(review): the CLI is addressed as `node_modules/.bin/playwright`
 * relative to the current working directory and is executed with the node
 * binary — confirm this works for global/npx installs and on Windows, where
 * `.bin` entries are presumably shell shims rather than JavaScript files.
 *
 * @returns {Promise<{success: boolean, output?: string, error?: string}>}
 *   `{ success: true }` on exit code 0; otherwise `success: false` with the
 *   captured `output` (non-zero exit) or an `error` message (spawn error or
 *   `'timeout'`).
 */
export async function installPlaywrightBrowsers() {
  const spinner = ora({
    text   : chalk.cyan('Installing Playwright Chromium browser...'),
    spinner: 'dots12',
    color  : 'cyan',
  }).start();

  return new Promise((resolve) => {
    const proc = spawn(
      process.execPath,   // use same node binary
      ['node_modules/.bin/playwright', 'install', 'chromium', '--with-deps'],
      {
        cwd   : process.cwd(),
        stdio : ['ignore', 'pipe', 'pipe'],
        shell : process.platform === 'win32',
        env   : { ...process.env },
      }
    );

    let output  = '';
    let settled = false;
    let timer;

    // First outcome wins; later events (e.g. 'close' after 'error' or after
    // the timeout kill) are ignored.
    const settle = (result, report) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      report();
      resolve(result);
    };

    proc.stdout?.on('data', d => { output += d.toString(); });
    proc.stderr?.on('data', d => { output += d.toString(); });

    proc.on('close', (code) => {
      if (code === 0) {
        settle({ success: true }, () => spinner.succeed(chalk.green('Chromium installed successfully ✓')));
      } else {
        settle({ success: false, output }, () => spinner.fail(chalk.red(`Chromium install failed (code ${code})`)));
      }
    });

    proc.on('error', (err) => {
      settle(
        { success: false, error: err.message },
        () => spinner.fail(chalk.red(`Install process error: ${err.message}`)),
      );
    });

    // Timeout after 3 minutes
    timer = setTimeout(() => {
      proc.kill();
      settle(
        { success: false, error: 'timeout' },
        () => spinner.fail(chalk.red('Browser install timed out (3 min)')),
      );
    }, 180_000);
  });
}
145
+
146
+ // ── Main browser check & setup ────────────────────────────────────────────
147
/**
 * Find a usable Chromium-family executable, installing one if necessary.
 *
 * Sources are tried in order: the Playwright cache, a system browser, and
 * finally an automatic Playwright install followed by a second cache lookup.
 *
 * @returns {Promise<{executablePath: string|null, source: string, error?: string}>}
 *   `source` is `'playwright-managed'`, `'system'`, `'auto-installed'` or
 *   `'unavailable'`; in the last case `executablePath` is `null` and `error`
 *   carries the installer's error, if it reported one.
 */
export async function ensureBrowser() {
  // 1. Playwright-managed Chromium
  const managedPath = findChromiumExecutable();
  if (managedPath) {
    return { executablePath: managedPath, source: 'playwright-managed' };
  }

  // 2. System Chrome/Edge/Chromium
  const systemPath = findSystemChrome();
  if (systemPath) {
    console.log(chalk.gray(` Using system browser: ${systemPath}`));
    return { executablePath: systemPath, source: 'system' };
  }

  // 3. Nothing found: announce and attempt an automatic install
  console.log('');
  console.log(chalk.yellow(' ⚠ Playwright browser not found.'));
  console.log(chalk.gray(' Attempting automatic installation...\n'));

  const outcome = await installPlaywrightBrowsers();
  const installedPath = outcome.success ? findChromiumExecutable() : null;
  if (installedPath) {
    return { executablePath: installedPath, source: 'auto-installed' };
  }

  // 4. Give up — the caller is expected to use the HTTP-only fallback
  return { executablePath: null, source: 'unavailable', error: outcome?.error };
}
178
+
179
+ // ── Check if browser is available (fast check) ───────────────────────────
180
/**
 * Quick availability check that never triggers an install: true when either
 * the Playwright cache or a system location yields an executable. The system
 * lookup only runs if the cache lookup finds nothing.
 *
 * @returns {boolean}
 */
export function isBrowserAvailable() {
  if (findChromiumExecutable()) {
    return true;
  }
  return Boolean(findSystemChrome());
}
183
+
184
+ // ── Get launch options with correct executable ────────────────────────────
185
/**
 * Resolve a browser via `ensureBrowser()` and translate the result into
 * launch options.
 *
 * @returns {Promise<object>} `{ available: false, source: 'unavailable' }`
 *   when no executable was found; otherwise `available: true` together with
 *   `source`, `executablePath`, `headless: true` and the Chromium `args`.
 */
export async function getBrowserLaunchOptions() {
  const { executablePath, source } = await ensureBrowser();

  if (!executablePath) {
    return { available: false, source: 'unavailable' };
  }

  // Command-line switches passed to every launched browser.
  const args = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--no-first-run',
    '--no-default-browser-check',
    '--disable-extensions',
  ];

  return {
    available: true,
    source,
    executablePath,
    headless: true,
    args,
  };
}