playwright-archaeologist 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +392 -0
- package/bin/cli.js +2 -0
- package/dist/chunk-7ZQGW5OV.js +255 -0
- package/dist/chunk-7ZQGW5OV.js.map +1 -0
- package/dist/chunk-F5WCXM7I.js +4469 -0
- package/dist/chunk-F5WCXM7I.js.map +1 -0
- package/dist/chunk-RWPEKZOW.js +118 -0
- package/dist/chunk-RWPEKZOW.js.map +1 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +310 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.d.ts +1948 -0
- package/dist/index.js +789 -0
- package/dist/index.js.map +1 -0
- package/dist/page-scanner-Q76HROEW.js +8 -0
- package/dist/page-scanner-Q76HROEW.js.map +1 -0
- package/package.json +83 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 playwright-archaeologist contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
# playwright-archaeologist
|
|
2
|
+
|
|
3
|
+
**Generate a complete behavioral specification of any running web app — no source code required.**
|
|
4
|
+
|
|
5
|
+
[npm package](https://www.npmjs.com/package/playwright-archaeologist)
|
|
6
|
+
[License: MIT](./LICENSE)
|
|
7
|
+
[Node.js >= 20](https://nodejs.org/)
|
|
8
|
+
|
|
9
|
+
Point `playwright-archaeologist` at a URL and get back a full behavioral spec: sitemap, form catalog, API map with OpenAPI 3.0 schema, screenshots, navigation flow graph, and a regression baseline you can diff later.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Install globally
|
|
17
|
+
npm install -g playwright-archaeologist
|
|
18
|
+
|
|
19
|
+
# Download Chromium (one-time)
|
|
20
|
+
pa install
|
|
21
|
+
|
|
22
|
+
# Crawl a site
|
|
23
|
+
pa dig https://example.com
|
|
24
|
+
|
|
25
|
+
# View the report
|
|
26
|
+
open .archaeologist/report.html
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Or use `npx` without installing:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
npx playwright-archaeologist install
|
|
33
|
+
npx playwright-archaeologist dig https://example.com
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
- **Zero source code access** — works on any running web app, staging or production
|
|
41
|
+
- **SPA-aware crawling** — Navigation API + History API patching + MutationObserver for client-side route detection
|
|
42
|
+
- **Authenticated crawling** — run auth scripts or inject cookies before crawling protected sites
|
|
43
|
+
- **Screenshot atlas** — full-page and viewport screenshots with a browsable gallery
|
|
44
|
+
- **API discovery** — auto-generates OpenAPI 3.0 specs from observed network traffic
|
|
45
|
+
- **Form catalog** — extracts every form with field metadata, validation rules, and structure
|
|
46
|
+
- **Flow graph** — Mermaid navigation diagrams showing how pages connect
|
|
47
|
+
- **Regression diff** — compare two crawl snapshots, detect structural and visual changes
|
|
48
|
+
- **Security-first** — SSRF protection, credential scrubbing, browser CSP hardening
|
|
49
|
+
- **Resume support** — checkpoint and resume interrupted crawls
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
### Requirements
|
|
56
|
+
|
|
57
|
+
- Node.js >= 20.0.0
|
|
58
|
+
- Chromium is downloaded automatically via `pa install`
|
|
59
|
+
|
|
60
|
+
### npm
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
npm install -g playwright-archaeologist
|
|
64
|
+
pa install
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### As a dev dependency
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
npm install --save-dev playwright-archaeologist
|
|
71
|
+
npx pa install
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Usage
|
|
77
|
+
|
|
78
|
+
### Crawl a website
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
# Basic crawl
|
|
82
|
+
pa dig https://myapp.com
|
|
83
|
+
|
|
84
|
+
# Limit depth and pages
|
|
85
|
+
pa dig https://myapp.com --depth 3 --max-pages 100
|
|
86
|
+
|
|
87
|
+
# Custom viewport
|
|
88
|
+
pa dig https://myapp.com --viewport 1440x900
|
|
89
|
+
|
|
90
|
+
# Skip screenshots for a faster crawl
|
|
91
|
+
pa dig https://myapp.com --no-screenshots
|
|
92
|
+
|
|
93
|
+
# Enable deep click exploration for SPAs
|
|
94
|
+
pa dig https://myapp.com --deep-click
|
|
95
|
+
|
|
96
|
+
# Custom output directory
|
|
97
|
+
pa dig https://myapp.com -o ./crawl-output
|
|
98
|
+
|
|
99
|
+
# Resume an interrupted crawl
|
|
100
|
+
pa dig https://myapp.com --resume
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Compare two snapshots
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
# Compare crawl bundles (exit code 0 = identical, 1 = changes)
|
|
107
|
+
pa diff .archaeologist/bundle-old.zip .archaeologist/bundle-new.zip
|
|
108
|
+
|
|
109
|
+
# Generate an HTML diff report
|
|
110
|
+
pa diff old.zip new.zip --format-html diff-report.html
|
|
111
|
+
|
|
112
|
+
# Generate a JSON diff report
|
|
113
|
+
pa diff old.zip new.zip --format-json diff-report.json
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Authenticated crawling
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Using an auth script
|
|
120
|
+
pa dig https://myapp.com --auth ./login.js
|
|
121
|
+
|
|
122
|
+
# Using cookies
|
|
123
|
+
pa dig https://myapp.com --cookies ./cookies.json
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Configuration Reference
|
|
129
|
+
|
|
130
|
+
### `pa dig` options
|
|
131
|
+
|
|
132
|
+
| Option | Default | Description |
|
|
133
|
+
|---|---|---|
|
|
134
|
+
| `-d, --depth <n>` | `5` | Maximum crawl depth from the entry URL |
|
|
135
|
+
| `--max-pages <n>` | `1000` | Maximum number of pages to visit |
|
|
136
|
+
| `-c, --concurrency <n>` | `3` | Number of parallel browser contexts |
|
|
137
|
+
| `--auth <script>` | — | Path to an auth script (runs before crawling) |
|
|
138
|
+
| `--cookies <file>` | — | Path to a cookies JSON file |
|
|
139
|
+
| `-o, --output <dir>` | `.archaeologist` | Output directory for all artifacts |
|
|
140
|
+
| `--no-screenshots` | `false` | Skip screenshot capture |
|
|
141
|
+
| `--viewport <WxH>` | `1280x720` | Viewport dimensions |
|
|
142
|
+
| `--viewports <list>` | — | Comma-separated viewport list for multi-viewport screenshots |
|
|
143
|
+
| `--deep-click` | `false` | Click interactive elements to discover SPA routes |
|
|
144
|
+
| `--resume` | `false` | Resume from the last checkpoint |
|
|
145
|
+
| `--include <pattern>` | — | URL patterns to include (repeatable) |
|
|
146
|
+
| `--exclude <pattern>` | — | URL patterns to exclude (repeatable) |
|
|
147
|
+
|
|
148
|
+
### `pa diff` options
|
|
149
|
+
|
|
150
|
+
| Option | Description |
|
|
151
|
+
|---|---|
|
|
152
|
+
| `--format-html <path>` | Write an HTML diff report |
|
|
153
|
+
| `--format-json <path>` | Write a JSON diff report |
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## Output Structure
|
|
158
|
+
|
|
159
|
+
After a crawl, the `.archaeologist/` directory contains:
|
|
160
|
+
|
|
161
|
+
```
|
|
162
|
+
.archaeologist/
|
|
163
|
+
report.html # Browsable HTML report with all findings
|
|
164
|
+
sitemap.json # Discovered pages with metadata
|
|
165
|
+
forms.json # Form catalog with field details
|
|
166
|
+
api.json # Observed API endpoints
|
|
167
|
+
openapi.yaml # Generated OpenAPI 3.0 specification
|
|
168
|
+
flow-graph.svg # Navigation flow diagram (Mermaid)
|
|
169
|
+
screenshots/ # Full-page and viewport screenshots
|
|
170
|
+
index.png
|
|
171
|
+
about.png
|
|
172
|
+
...
|
|
173
|
+
bundle.zip # Snapshot bundle for regression diffing
|
|
174
|
+
checkpoint.json # Resume checkpoint (deleted on completion)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Programmatic API
|
|
180
|
+
|
|
181
|
+
Use `playwright-archaeologist` as a library in your own tools:
|
|
182
|
+
|
|
183
|
+
```typescript
|
|
184
|
+
import { dig } from 'playwright-archaeologist';
|
|
185
|
+
|
|
186
|
+
const result = await dig({
|
|
187
|
+
entryUrl: 'https://myapp.com',
|
|
188
|
+
depth: 3,
|
|
189
|
+
maxPages: 50,
|
|
190
|
+
concurrency: 2,
|
|
191
|
+
output: './my-output',
|
|
192
|
+
screenshots: true,
|
|
193
|
+
viewport: { width: 1280, height: 720 },
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
console.log(`Crawled ${result.pages.length} pages`);
|
|
197
|
+
console.log(`Found ${result.forms.length} forms`);
|
|
198
|
+
console.log(`Discovered ${result.apis.length} API endpoints`);
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Comparing snapshots programmatically
|
|
202
|
+
|
|
203
|
+
```typescript
|
|
204
|
+
import fs from 'node:fs/promises';
import { diffBundles, generateDiffReportHtml } from 'playwright-archaeologist';
|
|
205
|
+
|
|
206
|
+
const diff = await diffBundles('./old-bundle.zip', './new-bundle.zip');
|
|
207
|
+
|
|
208
|
+
if (diff.hasChanges) {
|
|
209
|
+
console.log('Changes detected:');
|
|
210
|
+
console.log(` Pages added: ${diff.pages.added.length}`);
|
|
211
|
+
console.log(` Pages removed: ${diff.pages.removed.length}`);
|
|
212
|
+
console.log(` APIs changed: ${diff.apis.modified.length}`);
|
|
213
|
+
|
|
214
|
+
// Generate HTML report
|
|
215
|
+
const html = generateDiffReportHtml(diff);
|
|
216
|
+
await fs.writeFile('diff-report.html', html);
|
|
217
|
+
}
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### Using individual collectors
|
|
221
|
+
|
|
222
|
+
```typescript
|
|
223
|
+
import { scanPage, probeForms, captureScreenshots } from 'playwright-archaeologist';
|
|
224
|
+
import { chromium } from 'playwright';
|
|
225
|
+
|
|
226
|
+
const browser = await chromium.launch();
|
|
227
|
+
const context = await browser.newContext();
|
|
228
|
+
const page = await context.newPage();
|
|
229
|
+
|
|
230
|
+
await page.goto('https://myapp.com/login');
|
|
231
|
+
|
|
232
|
+
// Scan page structure
|
|
233
|
+
const scan = await scanPage(page, 'https://myapp.com');
|
|
234
|
+
|
|
235
|
+
// Probe forms
|
|
236
|
+
const forms = await probeForms(page);
|
|
237
|
+
|
|
238
|
+
// Capture screenshots
|
|
239
|
+
const screenshots = await captureScreenshots(page, {
|
|
240
|
+
viewport: { width: 1280, height: 720 },
|
|
241
|
+
});
|
|
242
|
+
|
|
243
|
+
await browser.close();
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Auth Script Example
|
|
249
|
+
|
|
250
|
+
Auth scripts run in a real browser context before crawling begins. They receive a Playwright `page` object:
|
|
251
|
+
|
|
252
|
+
```javascript
|
|
253
|
+
// login.js
|
|
254
|
+
export default async function authenticate(page) {
|
|
255
|
+
await page.goto('https://myapp.com/login');
|
|
256
|
+
await page.fill('#email', 'test@example.com');
|
|
257
|
+
await page.fill('#password', process.env.TEST_PASSWORD);
|
|
258
|
+
await page.click('button[type="submit"]');
|
|
259
|
+
await page.waitForURL('**/dashboard');
|
|
260
|
+
}
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
TEST_PASSWORD=secret pa dig https://myapp.com --auth ./login.js
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
Auth scripts are statically analyzed before execution and require confirmation for scripts that access the filesystem, network, or run shell commands.
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## Cookies File Format
|
|
272
|
+
|
|
273
|
+
The cookies file follows the Playwright cookie format:
|
|
274
|
+
|
|
275
|
+
```json
|
|
276
|
+
[
|
|
277
|
+
{
|
|
278
|
+
"name": "session",
|
|
279
|
+
"value": "abc123",
|
|
280
|
+
"domain": "myapp.com",
|
|
281
|
+
"path": "/",
|
|
282
|
+
"httpOnly": true,
|
|
283
|
+
"secure": true
|
|
284
|
+
}
|
|
285
|
+
]
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
---
|
|
289
|
+
|
|
290
|
+
## Security Considerations
|
|
291
|
+
|
|
292
|
+
`playwright-archaeologist` is designed to crawl potentially untrusted web applications. Several protections are built in:
|
|
293
|
+
|
|
294
|
+
- **SSRF protection** — Private/internal IP ranges (10.x, 172.16-31.x, 169.254.x, 127.x, ::1) are blocked by default. Only same-origin navigation is permitted unless explicitly expanded.
|
|
295
|
+
- **Credential scrubbing** — Authorization headers, cookies, and bearer tokens are redacted from all output artifacts by default.
|
|
296
|
+
- **Browser hardening** — `bypassCSP: true` for instrumentation, `serviceWorkers: 'block'`, `acceptDownloads: false`, and automatic dialog dismissal to prevent crawler hangs.
|
|
297
|
+
- **Auth script sandboxing** — Auth scripts undergo static analysis before execution. Scripts accessing `fs`, `child_process`, or making network requests outside the target domain trigger a confirmation prompt.
|
|
298
|
+
- **Output sanitization** — All target-sourced data is entity-encoded in HTML reports. Reports include a restrictive CSP meta tag.
|
|
299
|
+
|
|
300
|
+
---
|
|
301
|
+
|
|
302
|
+
## CI / Regression Testing
|
|
303
|
+
|
|
304
|
+
Use `playwright-archaeologist` in CI to catch behavioral regressions:
|
|
305
|
+
|
|
306
|
+
```yaml
|
|
307
|
+
# .github/workflows/behavioral-regression.yml
|
|
308
|
+
name: Behavioral Regression
|
|
309
|
+
on: [pull_request]
|
|
310
|
+
|
|
311
|
+
jobs:
|
|
312
|
+
regression:
|
|
313
|
+
runs-on: ubuntu-latest
|
|
314
|
+
steps:
|
|
315
|
+
- uses: actions/checkout@v4
|
|
316
|
+
|
|
317
|
+
- name: Start app
|
|
318
|
+
run: npm start &
|
|
319
|
+
|
|
320
|
+
- name: Install pa
|
|
321
|
+
run: npx playwright-archaeologist install
|
|
322
|
+
|
|
323
|
+
- name: Crawl
|
|
324
|
+
run: npx playwright-archaeologist dig http://localhost:3000 -o ./current
|
|
325
|
+
|
|
326
|
+
- name: Download baseline
|
|
327
|
+
uses: actions/download-artifact@v4
|
|
328
|
+
with:
|
|
329
|
+
name: behavioral-baseline
|
|
330
|
+
path: ./baseline
|
|
331
|
+
|
|
332
|
+
- name: Diff
|
|
333
|
+
run: |
|
|
334
|
+
npx playwright-archaeologist diff ./baseline/bundle.zip ./current/bundle.zip \
|
|
335
|
+
--format-html regression-report.html
|
|
336
|
+
|
|
337
|
+
- name: Upload report
|
|
338
|
+
if: failure()
|
|
339
|
+
uses: actions/upload-artifact@v4
|
|
340
|
+
with:
|
|
341
|
+
name: regression-report
|
|
342
|
+
path: regression-report.html
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
---
|
|
346
|
+
|
|
347
|
+
## Contributing
|
|
348
|
+
|
|
349
|
+
Contributions are welcome. Please open an issue first to discuss what you would like to change.
|
|
350
|
+
|
|
351
|
+
```bash
|
|
352
|
+
# Clone and install
|
|
353
|
+
git clone https://github.com/AshGw/playwright-archaeologist.git
|
|
354
|
+
cd playwright-archaeologist
|
|
355
|
+
npm install
|
|
356
|
+
|
|
357
|
+
# Build
|
|
358
|
+
npm run build
|
|
359
|
+
|
|
360
|
+
# Run tests
|
|
361
|
+
npm test
|
|
362
|
+
|
|
363
|
+
# Run tests in watch mode
|
|
364
|
+
npm run test:watch
|
|
365
|
+
|
|
366
|
+
# Run benchmarks
|
|
367
|
+
npm run bench
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
### Project structure
|
|
371
|
+
|
|
372
|
+
```
|
|
373
|
+
src/
|
|
374
|
+
cli.ts # CLI entry point (Commander.js)
|
|
375
|
+
index.ts # Programmatic API exports
|
|
376
|
+
crawl/ # BFS crawler, frontier, context pool, checkpoints
|
|
377
|
+
collectors/ # Page scanner, form prober, network logger, screenshots
|
|
378
|
+
assembler/ # API grouper, flow graph builder
|
|
379
|
+
auth/ # Auth script handler
|
|
380
|
+
report/ # HTML report generator
|
|
381
|
+
diff/ # Snapshot diff engine and reports
|
|
382
|
+
bundle/ # ZIP bundle creator
|
|
383
|
+
security/ # SSRF guard, credential scrubber, output sanitizer
|
|
384
|
+
types/ # TypeScript interfaces, Zod schemas, error hierarchy
|
|
385
|
+
utils/ # Logger, URL utilities, progress tracker
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
---
|
|
389
|
+
|
|
390
|
+
## License
|
|
391
|
+
|
|
392
|
+
[MIT](./LICENSE)
|
package/bin/cli.js
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import {
|
|
2
|
+
isSameOrigin,
|
|
3
|
+
resolveUrl
|
|
4
|
+
} from "./chunk-RWPEKZOW.js";
|
|
5
|
+
|
|
6
|
+
// src/collectors/page-scanner.ts
|
|
7
|
+
/**
 * Build a summary record for the page currently loaded in `page`.
 *
 * DOM data and navigation timing are gathered concurrently, then the raw
 * links and interactive elements are post-processed into their output shape.
 *
 * @param {object} page - Page handle exposing `url()` and `evaluate()`.
 * @param {string} baseUrl - Origin reference used to flag links as external.
 * @param {object} [response] - Navigation response; when absent (or when its
 *   `status()` yields null/undefined) the status code defaults to 200.
 * @returns {Promise<object>} Page summary: url, canonicalUrl, statusCode,
 *   title, metaTags, headings, landmarks, links, interactiveElements, timing,
 *   contentHash and hashRoutingDetected.
 */
async function scanPage(page, baseUrl, response) {
  const pageUrl = page.url();

  let statusCode = 200;
  if (response !== null && response !== undefined) {
    statusCode = response.status() ?? 200;
  }

  const [dom, timing] = await Promise.all([
    extractDomData(page),
    extractTiming(page)
  ]);

  const processedLinks = processLinks(dom.links, pageUrl, baseUrl);
  const processedElements = processInteractiveElements(dom.interactiveElements);

  return {
    url: pageUrl,
    canonicalUrl: dom.canonicalHref ?? undefined,
    statusCode,
    title: dom.title,
    metaTags: dom.metaTags,
    headings: dom.headings,
    landmarks: dom.landmarks,
    links: processedLinks,
    interactiveElements: processedElements,
    timing,
    contentHash: dom.contentHash,
    // Three or more hash-style link indicators are treated as hash routing.
    hashRoutingDetected: dom.hashIndicators >= 3
  };
}
|
|
32
|
+
/**
 * Collect structural data from the document loaded in `page` with a single
 * `page.evaluate` call.
 *
 * Gathered data: canonical link href, title, meta tags, headings, landmark
 * regions, de-duplicated links, visible interactive elements (each with a CSS
 * selector), a hash of the body text, and a count of hash-route style links.
 *
 * This is best-effort: if the evaluation throws for any reason, an empty
 * result with `contentHash: "00000000"` is returned instead of propagating.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`.
 * @returns {Promise<{
 *   canonicalHref: string|null, title: string, metaTags: object[],
 *   headings: object[], landmarks: object[], links: object[],
 *   interactiveElements: object[], contentHash: string, hashIndicators: number
 * }>} Raw DOM data.
 */
async function extractDomData(page) {
  try {
    return await page.evaluate(() => {
      // This callback is handed to page.evaluate, so it is kept fully
      // self-contained: every helper it needs is declared inside it.

      // Implicit ARIA roles for the landmark tags selected below.
      const implicitRoles = {
        nav: "navigation",
        main: "main",
        aside: "complementary",
        footer: "contentinfo",
        header: "banner"
      };

      // Build a CSS selector for an interactive element, preferring (in
      // order) its id, its name, then its position among same-tag siblings.
      const buildSelector = (el, tagName, index) => {
        const id = el.getAttribute("id");
        if (id) return `${tagName}#${CSS.escape(id)}`;

        const name = el.getAttribute("name");
        if (name) return `${tagName}[name="${CSS.escape(name)}"]`;

        const parent = el.parentElement;
        if (!parent) return `${tagName}[data-arch-index="${index}"]`;

        const siblings = Array.from(parent.querySelectorAll(`:scope > ${tagName}`));
        const nth = siblings.indexOf(el) + 1;
        const parentId = parent.getAttribute("id");
        return parentId
          ? `#${CSS.escape(parentId)} > ${tagName}:nth-of-type(${nth})`
          : `${tagName}:nth-of-type(${nth})`;
      };

      const canonicalEl = document.querySelector('link[rel="canonical"]');
      const canonicalHref = canonicalEl?.href ?? null;
      const title = document.title ?? "";

      // Meta tags: keep only those with content and a name or property.
      const metaTags = [];
      document
        .querySelectorAll("meta[name], meta[property], meta[content]")
        .forEach((el) => {
          const content = el.getAttribute("content");
          if (!content) return;
          const entry = { content };
          const name = el.getAttribute("name");
          const property = el.getAttribute("property");
          if (name) entry.name = name;
          if (property) entry.property = property;
          if (name || property) {
            metaTags.push(entry);
          }
        });

      // Headings: level is taken from the digit in the tag name (H1..H6).
      const headings = [];
      document.querySelectorAll("h1, h2, h3, h4, h5, h6").forEach((el) => {
        const text = (el.textContent ?? "").trim();
        if (!text) return;
        const level = parseInt(el.tagName[1], 10);
        headings.push({ level, text });
      });

      // Landmarks: de-duplicated on role + tag + label.
      const landmarks = [];
      const seenLandmarks = new Set();
      document
        .querySelectorAll("nav, main, aside, footer, header, [role]")
        .forEach((el) => {
          const tagName = el.tagName.toLowerCase();
          const explicitRole = el.getAttribute("role");
          const role = explicitRole ? explicitRole : implicitRoles[tagName] ?? tagName;
          const label =
            el.getAttribute("aria-label") ?? el.getAttribute("aria-labelledby") ?? void 0;
          const key = `${role}|${tagName}|${label ?? ""}`;
          if (seenLandmarks.has(key)) return;
          seenLandmarks.add(key);
          const entry = { role, tagName };
          if (label) entry.label = label;
          landmarks.push(entry);
        });

      // Links: skip empty/"#"/javascript: hrefs, de-duplicate on resolved href.
      const linkEls = document.querySelectorAll("a[href]");
      const links = [];
      const seenHrefs = new Set();
      linkEls.forEach((el) => {
        const href = el.getAttribute("href");
        if (!href || href === "#" || href.startsWith("javascript:")) return;
        const resolvedHref = el.href;
        if (seenHrefs.has(resolvedHref)) return;
        seenHrefs.add(resolvedHref);
        const text = (el.textContent ?? "").trim();
        const rel = el.getAttribute("rel");
        links.push({ href: resolvedHref, text, rel });
      });

      // Interactive elements (hidden inputs are ignored).
      const interactiveSelectors = 'button, input, select, textarea, [role="button"], [role="tab"], [role="combobox"], [role="listbox"], [role="slider"], [role="spinbutton"], [role="switch"]';
      const interactiveElements = [];
      document.querySelectorAll(interactiveSelectors).forEach((el, index) => {
        const tagName = el.tagName.toLowerCase();
        const type = el.getAttribute("type");
        const role = el.getAttribute("role");
        const ariaLabel = el.getAttribute("aria-label");
        if (tagName === "input" && type === "hidden") return;

        let text = "";
        if (tagName === "input" || tagName === "textarea") {
          // `placeholder` is "" (not null/undefined) when the attribute is
          // absent, so `??` could never fall through to `value`; use `||`.
          // Password values are never read so secrets cannot reach the output.
          const isPassword =
            tagName === "input" && (type ?? "").toLowerCase() === "password";
          text = el.placeholder || (isPassword ? "" : el.value) || "";
        } else {
          text = (el.textContent ?? "").trim();
        }

        interactiveElements.push({
          tagName,
          type,
          text: text.slice(0, 200), // cap text length
          role,
          ariaLabel,
          selector: buildSelector(el, tagName, index)
        });
      });

      // 32-bit multiply-by-33 rolling hash of the visible body text, rendered
      // as 8 hex digits.
      const bodyText = (document.body?.innerText ?? "").trim();
      let hash = 5381;
      for (let i = 0; i < bodyText.length; i++) {
        hash = ((hash << 5) + hash + bodyText.charCodeAt(i)) | 0;
      }
      const contentHash = (hash >>> 0).toString(16).padStart(8, "0");

      // Count links whose raw href looks like a hash route ("#/" or "#!/").
      let hashIndicators = 0;
      linkEls.forEach((el) => {
        const h = el.getAttribute("href");
        if (h && (h.startsWith("#/") || h.startsWith("#!/"))) {
          hashIndicators++;
        }
      });

      return {
        canonicalHref,
        title,
        metaTags,
        headings,
        landmarks,
        links,
        interactiveElements,
        contentHash,
        hashIndicators
      };
    });
  } catch {
    // Deliberate best-effort: a failed evaluation yields an empty scan.
    return {
      canonicalHref: null,
      title: "",
      metaTags: [],
      headings: [],
      landmarks: [],
      links: [],
      interactiveElements: [],
      contentHash: "00000000",
      hashIndicators: 0
    };
  }
}
|
|
189
|
+
/**
 * Read navigation timing for the document loaded in `page`.
 *
 * Values are rounded offsets from the navigation entry's `startTime`. An
 * event that has not fired yet (end time of 0) is reported as 0.
 * `firstContentfulPaint` is included only when a matching paint entry exists.
 * Any failure yields `{ loadTime: 0, domContentLoaded: 0 }`.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`.
 * @returns {Promise<{loadTime: number, domContentLoaded: number, firstContentfulPaint?: number}>}
 */
async function extractTiming(page) {
  try {
    const measured = await page.evaluate(() => {
      const navigationEntries = performance.getEntriesByType("navigation");
      if (navigationEntries.length === 0) {
        return { loadTime: 0, domContentLoaded: 0, firstContentfulPaint: null };
      }

      const navigation = navigationEntries[0];
      const sinceStart = (mark) =>
        mark > 0 ? Math.round(mark - navigation.startTime) : 0;

      const fcpEntry = performance
        .getEntriesByType("paint")
        .find((paint) => paint.name === "first-contentful-paint");

      return {
        loadTime: sinceStart(navigation.loadEventEnd),
        domContentLoaded: sinceStart(navigation.domContentLoadedEventEnd),
        firstContentfulPaint: fcpEntry ? Math.round(fcpEntry.startTime) : null
      };
    });

    const result = {
      loadTime: measured.loadTime,
      domContentLoaded: measured.domContentLoaded
    };
    const paint = measured.firstContentfulPaint;
    if (paint !== null && paint !== undefined) {
      result.firstContentfulPaint = paint;
    }
    return result;
  } catch {
    // Timing is optional data; fall back to zeros rather than fail the scan.
    return { loadTime: 0, domContentLoaded: 0 };
  }
}
|
|
221
|
+
/**
 * Convert raw links collected from the DOM into output link records.
 *
 * Each href is resolved against `pageUrl` via `resolveUrl`, and flagged as
 * external when `isSameOrigin` reports it does not share an origin with
 * `baseUrl`. `rel` is carried over only when it is truthy.
 *
 * @param {Iterable<{href: string, text: string, rel?: string|null}>} rawLinks
 * @param {string} pageUrl - URL of the page the links were found on.
 * @param {string} baseUrl - Reference URL for the same-origin check.
 * @returns {Array<{href: *, text: string, isExternal: boolean, rel?: string}>}
 */
function processLinks(rawLinks, pageUrl, baseUrl) {
  return [...rawLinks].map((candidate) => {
    const absoluteHref = resolveUrl(candidate.href, pageUrl);
    const sameOrigin = isSameOrigin(absoluteHref, baseUrl);
    const record = {
      href: absoluteHref,
      text: candidate.text,
      isExternal: !sameOrigin
    };
    if (candidate.rel) {
      record.rel = candidate.rel;
    }
    return record;
  });
}
|
|
238
|
+
/**
 * Normalize raw interactive-element records for output.
 *
 * `tagName`, `text` and `selector` are always copied. `type`, `role` and
 * `ariaLabel` are copied only when truthy, so null or empty values are
 * omitted from the result instead of being emitted.
 *
 * @param {Array<object>} rawElements - Records produced by the DOM scan.
 * @returns {Array<object>} One normalized record per input record.
 */
function processInteractiveElements(rawElements) {
  const optionalKeys = ["type", "role", "ariaLabel"];
  return rawElements.map((source) => {
    const normalized = {
      tagName: source.tagName,
      text: source.text,
      selector: source.selector
    };
    for (const key of optionalKeys) {
      const value = source[key];
      if (value) {
        normalized[key] = value;
      }
    }
    return normalized;
  });
}
|
|
251
|
+
|
|
252
|
+
export {
|
|
253
|
+
scanPage
|
|
254
|
+
};
|
|
255
|
+
//# sourceMappingURL=chunk-7ZQGW5OV.js.map
|