@vinikjkkj/wa-fetcher 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -0
- package/action.yml +55 -0
- package/package.json +57 -0
- package/src/cli.cjs +82 -0
- package/src/index.cjs +249 -0
package/README.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# @vinikjkkj/wa-fetcher
|
|
2
|
+
|
|
3
|
+
Headless scraper for web.whatsapp.com bundles. Downloads every loaded JS
|
|
4
|
+
chunk to disk and writes a manifest — that's all. Per-domain extractors
|
|
5
|
+
(proto, mex, diff, …) consume the raw dump independently.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```sh
|
|
10
|
+
npm i @vinikjkkj/wa-fetcher
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## CLI
|
|
14
|
+
|
|
15
|
+
```sh
|
|
16
|
+
# Default: download every bundle + write manifest
|
|
17
|
+
npx wa-fetcher --out dump/
|
|
18
|
+
|
|
19
|
+
# Discovery only: just the URL list (no download). Useful when piping into
|
|
20
|
+
# tools that have their own downloader (e.g. wa-modules-loader).
|
|
21
|
+
npx wa-fetcher --urls-only --out urls.json
|
|
22
|
+
npx wa-fetcher --urls-only > urls.json
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
| Flag | Default | Notes |
|
|
26
|
+
|---|---|---|
|
|
27
|
+
| `--out <path>` | `dump` (dir) / stdout (urls-only) | Output destination |
|
|
28
|
+
| `--urls-only` | off | Skip download; emit only the discovered URL array (JSON) |
|
|
29
|
+
| `--auth <file>` | none | Saved cookie state JSON (an object with a `cookies` array) for authenticated fetch (captures more lazy chunks) |
|
|
30
|
+
| `--extra-wait <ms>` | `5000` | Wait this long after network-idle for lazy chunks |
|
|
31
|
+
|
|
32
|
+
**Output layout:**
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
dump/
|
|
36
|
+
├── manifest.json { waVersion, fetchedAt, bundleCount, totalBytes, rawDir, bundles[] }
|
|
37
|
+
└── raw/
|
|
38
|
+
└── <wa-version>/
|
|
39
|
+
├── chunk-AAAA.js
|
|
40
|
+
├── chunk-BBBB.js
|
|
41
|
+
└── …
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Library
|
|
45
|
+
|
|
46
|
+
```js
|
|
47
|
+
const { discoverBundleUrls, fetchBundles } = require('@vinikjkkj/wa-fetcher')
|
|
48
|
+
|
|
49
|
+
// Discovery only — no download. Returns the same URL list the full fetcher
|
|
50
|
+
// would have downloaded (sorted, deduped, host-filtered to static.whatsapp.net).
|
|
51
|
+
const { waVersion, urls } = await discoverBundleUrls()
|
|
52
|
+
// waVersion "2.3000.xxxxxxx" | null
|
|
53
|
+
// urls string[]
|
|
54
|
+
|
|
55
|
+
// Discovery + download.
|
|
56
|
+
const dump = await fetchBundles({ out: 'dump' })
|
|
57
|
+
// dump.waVersion "2.3000.xxxxxxx" | null
|
|
58
|
+
// dump.bundles[] [{ url, file, bytes }, ...]
|
|
59
|
+
// dump.paths.raw absolute path to dump/raw/<version>/
|
|
60
|
+
// dump.paths.manifest absolute path to dump/manifest.json
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## GitHub Action
|
|
64
|
+
|
|
65
|
+
```yaml
|
|
66
|
+
- uses: vinikjkkj/wa-spec/packages/fetcher@v1
|
|
67
|
+
id: fetch
|
|
68
|
+
with:
|
|
69
|
+
out: dump
|
|
70
|
+
- run: npx wa-mex apply --bundles ${{ steps.fetch.outputs.raw-dir }}
|
|
71
|
+
- run: npx wa-proto apply --bundles ${{ steps.fetch.outputs.raw-dir }}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Caveats
|
|
75
|
+
|
|
76
|
+
- **Lazy chunks** that the SPA only loads via interaction (Settings, Profile,
|
|
77
|
+
Premium) won't be in the dump. Use `--auth` with a pre-saved cookie state
|
|
78
|
+
to maximize what loads naturally.
|
|
79
|
+
- **Anti-bot** — `puppeteer-real-browser` works today but Meta can tighten
|
|
80
|
+
detection. If the fetcher returns blank pages, re-evaluate the strategy.
|
|
81
|
+
- **No extraction** — this package is intentionally dumb. The extractors live
|
|
82
|
+
in [`@vinikjkkj/wa-mex`](https://github.com/vinikjkkj/wa-spec/tree/main/packages/mex)
|
|
83
|
+
and [`@vinikjkkj/wa-proto`](https://github.com/vinikjkkj/wa-spec/tree/main/packages/proto)
|
|
84
|
+
so adding a new artifact never requires touching the fetcher.
|
package/action.yml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Composite action: installs @vinikjkkj/wa-fetcher, runs it, and exposes the
# manifest path, raw bundle directory and detected version as step outputs.
name: 'wa-fetcher'
description: 'Download every loaded web.whatsapp.com JS bundle + emit a manifest.json (no extraction)'
author: 'vinikjkkj'
branding:
  icon: 'download-cloud'
  color: 'green'

inputs:
  out:
    description: 'Output directory for raw bundles + manifest.json'
    required: false
    default: 'dump'
  auth-state:
    description: 'Optional path to a saved cookie state JSON for authenticated fetch'
    required: false
  extra-wait:
    description: 'Extra wait (ms) after network-idle, gives lazy chunks time to load'
    required: false
    default: '5000'

outputs:
  manifest-path:
    description: 'Path to the generated manifest.json'
    value: ${{ steps.run.outputs.manifest-path }}
  raw-dir:
    description: 'Path to the raw bundles directory (raw/<wa-version>/)'
    value: ${{ steps.run.outputs.raw-dir }}
  wa-version:
    description: 'Detected WhatsApp Web version (or "unknown")'
    value: ${{ steps.run.outputs.wa-version }}

runs:
  using: 'composite'
  steps:
    - name: Install fetcher
      shell: bash
      run: npm install --no-save @vinikjkkj/wa-fetcher@latest

    - name: Run fetcher
      id: run
      shell: bash
      # Inputs are passed through env vars rather than interpolated into the
      # script text, so their contents are never parsed as shell syntax.
      env:
        FETCHER_OUT: ${{ inputs.out }}
        FETCHER_AUTH: ${{ inputs.auth-state }}
        FETCHER_EXTRA_WAIT: ${{ inputs.extra-wait }}
      run: |
        # Build the argument list as an array: each value stays a single
        # argument even when a path contains spaces or glob characters.
        args=(--out "$FETCHER_OUT" --extra-wait "$FETCHER_EXTRA_WAIT")
        if [ -n "$FETCHER_AUTH" ]; then args+=(--auth "$FETCHER_AUTH"); fi
        npx wa-fetcher "${args[@]}"
        MANIFEST="$FETCHER_OUT/manifest.json"
        echo "manifest-path=$MANIFEST" >> "$GITHUB_OUTPUT"
        # rawDir in the manifest is relative to the output directory.
        RAW_DIR=$(node -e "console.log(JSON.parse(require('node:fs').readFileSync(process.argv[1])).rawDir)" "$MANIFEST")
        echo "raw-dir=$FETCHER_OUT/$RAW_DIR" >> "$GITHUB_OUTPUT"
        VERSION=$(node -e "console.log(JSON.parse(require('node:fs').readFileSync(process.argv[1])).waVersion ?? 'unknown')" "$MANIFEST")
        echo "wa-version=$VERSION" >> "$GITHUB_OUTPUT"
|
package/package.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@vinikjkkj/wa-fetcher",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Headless scraper for web.whatsapp.com bundles — downloads every loaded JS chunk + emits a manifest. Pairs with @vinikjkkj/wa-mex and @vinikjkkj/wa-proto.",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"whatsapp",
|
|
7
|
+
"whatsapp-web",
|
|
8
|
+
"scraper",
|
|
9
|
+
"fetcher",
|
|
10
|
+
"puppeteer",
|
|
11
|
+
"puppeteer-real-browser",
|
|
12
|
+
"reverse-engineering",
|
|
13
|
+
"wa-spec"
|
|
14
|
+
],
|
|
15
|
+
"license": "MIT",
|
|
16
|
+
"author": {
|
|
17
|
+
"name": "vinikjkkj",
|
|
18
|
+
"url": "https://github.com/vinikjkkj"
|
|
19
|
+
},
|
|
20
|
+
"homepage": "https://github.com/vinikjkkj/wa-spec/tree/main/packages/fetcher#readme",
|
|
21
|
+
"bugs": {
|
|
22
|
+
"url": "https://github.com/vinikjkkj/wa-spec/issues"
|
|
23
|
+
},
|
|
24
|
+
"repository": {
|
|
25
|
+
"type": "git",
|
|
26
|
+
"url": "git+https://github.com/vinikjkkj/wa-spec.git",
|
|
27
|
+
"directory": "packages/fetcher"
|
|
28
|
+
},
|
|
29
|
+
"funding": [
|
|
30
|
+
{
|
|
31
|
+
"type": "github",
|
|
32
|
+
"url": "https://github.com/sponsors/vinikjkkj"
|
|
33
|
+
}
|
|
34
|
+
],
|
|
35
|
+
"main": "src/index.cjs",
|
|
36
|
+
"bin": {
|
|
37
|
+
"wa-fetcher": "src/cli.cjs"
|
|
38
|
+
},
|
|
39
|
+
"files": [
|
|
40
|
+
"src/**/*.cjs",
|
|
41
|
+
"src/**/*.js",
|
|
42
|
+
"action.yml",
|
|
43
|
+
"README.md"
|
|
44
|
+
],
|
|
45
|
+
"scripts": {
|
|
46
|
+
"fetch": "node src/cli.cjs --out dump/"
|
|
47
|
+
},
|
|
48
|
+
"dependencies": {
|
|
49
|
+
"puppeteer-real-browser": "^1.4.4"
|
|
50
|
+
},
|
|
51
|
+
"engines": {
|
|
52
|
+
"node": ">=20.9.0"
|
|
53
|
+
},
|
|
54
|
+
"publishConfig": {
|
|
55
|
+
"access": "public"
|
|
56
|
+
}
|
|
57
|
+
}
|
package/src/cli.cjs
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict'
|
|
3
|
+
|
|
4
|
+
const fs = require('node:fs')
|
|
5
|
+
const path = require('node:path')
|
|
6
|
+
const { discoverBundleUrls, fetchBundles } = require('./index.cjs')
|
|
7
|
+
|
|
8
|
+
/**
 * Parse the wa-fetcher command line.
 *
 * Recognised flags: `--out <path>`, `--auth <file>`, `--extra-wait <ms>`,
 * `--urls-only`, `-h` / `--help`.
 *
 * Exits the process with code 0 after printing help, and with code 2 on an
 * unknown flag, a flag that is missing its value, or an `--extra-wait` value
 * that is not a non-negative finite number.
 *
 * @param {string[]} argv full `process.argv`-style array; parsing starts at index 2
 * @returns {{ out: string | null, urlsOnly: boolean, authState?: string, extraWaitMs?: number }}
 */
function parseArgs(argv) {
    const opts = { out: null, urlsOnly: false }

    // Fetch the value for a value-taking flag. A missing value, or a token
    // that is itself a long flag, is a usage error rather than a value.
    const takeValue = (flag, index) => {
        const value = argv[index]
        if (typeof value !== 'string' || value.startsWith('--')) {
            console.error(`missing value for ${flag}`)
            process.exit(2)
        }
        return value
    }

    for (let i = 2; i < argv.length; i++) {
        const a = argv[i]
        if (a === '--out') opts.out = takeValue(a, ++i)
        else if (a === '--auth') opts.authState = takeValue(a, ++i)
        else if (a === '--extra-wait') {
            const raw = takeValue(a, ++i)
            const ms = Number(raw)
            // Number('') is 0 and Number('abc') is NaN; neither is a usable wait.
            if (raw.trim() === '' || !Number.isFinite(ms) || ms < 0) {
                console.error(`invalid value for --extra-wait: ${raw}`)
                process.exit(2)
            }
            opts.extraWaitMs = ms
        } else if (a === '--urls-only') opts.urlsOnly = true
        else if (a === '--help' || a === '-h') {
            printHelp()
            process.exit(0)
        } else {
            console.error('unknown flag:', a)
            printHelp()
            process.exit(2)
        }
    }
    return opts
}
|
|
27
|
+
|
|
28
|
+
/**
 * Write the CLI usage text to stderr as a single `console.error` call.
 */
function printHelp() {
    const usage = 'usage: wa-fetcher [options]'
    const optionLines = [
        ' --out <path> output path',
        ' - default mode: directory for raw bundles + manifest.json (default: dump)',
        ' - --urls-only mode: file to write URL array JSON (default: stdout)',
        ' --auth <file> optional saved cookie state JSON',
        ' --extra-wait <ms> extra wait after networkidle for lazy chunks (default: 5000)',
        ' --urls-only skip download; output just the discovered URL array as JSON',
        ' -h, --help show this help'
    ]
    const text = [usage, '', 'options:', ...optionLines].join('\n')
    console.error(text)
}
|
|
44
|
+
|
|
45
|
+
/**
 * CLI driver. Parses `process.argv`, then either
 *  - downloads every bundle via `fetchBundles` (default mode), or
 *  - with `--urls-only`, prints / writes just the discovered URL array.
 *
 * All progress messages go to stderr so stdout stays clean for the JSON
 * emitted in `--urls-only` mode without `--out`.
 */
async function main() {
    const opts = parseArgs(process.argv)
    const start = Date.now()
    const secondsSinceStart = () => ((Date.now() - start) / 1000).toFixed(1)

    if (!opts.urlsOnly) {
        const outDir = opts.out ?? 'dump'
        console.error('[wa-fetcher] launching headless browser…')
        const result = await fetchBundles({ ...opts, out: outDir })
        const elapsed = secondsSinceStart()

        let byteSum = 0
        for (const bundle of result.bundles) byteSum += bundle.bytes
        const mb = (byteSum / 1024 / 1024).toFixed(1)

        const version = result.waVersion ?? 'unknown'
        console.error(
            `[wa-fetcher] done in ${elapsed}s — version=${version} bundles=${result.bundles.length} (${mb}MB)`
        )
        console.error(`[wa-fetcher] raw → ${result.paths.raw}`)
        console.error(`[wa-fetcher] manifest → ${result.paths.manifest}`)
        return
    }

    console.error('[wa-fetcher] discovering URLs only (no download)…')
    const { waVersion, urls } = await discoverBundleUrls(opts)
    const json = JSON.stringify(urls, null, 4) + '\n'

    if (!opts.out) {
        // No destination given: the JSON array itself is the program output.
        process.stdout.write(json)
        return
    }

    const parentDir = path.dirname(path.resolve(opts.out))
    fs.mkdirSync(parentDir, { recursive: true })
    fs.writeFileSync(opts.out, json)
    console.error(
        `[wa-fetcher] done in ${secondsSinceStart()}s — version=${waVersion ?? 'unknown'} urls=${urls.length} → ${opts.out}`
    )
}
|
|
78
|
+
|
|
79
|
+
// Script entry: run the CLI and turn any rejection into a logged failure with
// exit code 1. Prefers the stack, then the message, then the raw value.
function reportFailure(err) {
    const detail = err.stack || err.message || err
    console.error('[wa-fetcher] failed:', detail)
    process.exit(1)
}

main().catch(reportFailure)
|
package/src/index.cjs
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
'use strict'
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @vinikjkkj/wa-fetcher
|
|
5
|
+
*
|
|
6
|
+
* const { discoverBundleUrls } = require('@vinikjkkj/wa-fetcher')
|
|
7
|
+
* const { waVersion, urls } = await discoverBundleUrls()
|
|
8
|
+
* // urls[] ['https://static.whatsapp.net/.../chunk.js', ...]
|
|
9
|
+
* // waVersion '2.3000.xxx' | null
|
|
10
|
+
*
|
|
11
|
+
* const { fetchBundles } = require('@vinikjkkj/wa-fetcher')
|
|
12
|
+
* const dump = await fetchBundles({ out: 'dump' })
|
|
13
|
+
* // dump.bundles[] [{ url, file, bytes }, ...]
|
|
14
|
+
* // dump.paths.raw absolute path to <out>/raw/<version>/
|
|
15
|
+
* // dump.paths.manifest
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
const fs = require('node:fs')
|
|
19
|
+
const path = require('node:path')
|
|
20
|
+
const { connect } = require('puppeteer-real-browser')
|
|
21
|
+
|
|
22
|
+
const WHATSAPP_URL = 'https://web.whatsapp.com/'
|
|
23
|
+
const NAV_TIMEOUT_MS = 60_000
|
|
24
|
+
const LAZY_WAIT_MS = 5_000
|
|
25
|
+
|
|
26
|
+
// In-page discovery routine, handed to `page.evaluate`. It must therefore stay
// self-contained: it may only touch page globals (`document`, `location`,
// `performance`) and names declared inside its own body.
//
// Sources scanned, in this order:
//   1. `<script data-sjs>` — raw text plus, when the text parses as JSON, a
//      deep walk that also understands `rsrcMap` resource manifests
//   2. `<script src>`
//   3. preload / modulepreload / prefetch `<link>` elements
//   4. inline `<script>` text
//   5. `performance` resource entries
//
// Returns the unique absolute URLs of JS files hosted on static.whatsapp.net.
// (Technique cribbed from vinikjkkj/wa-diff/fetch.js.)
const DISCOVERY_FN = function () {
    const allowedHost = 'static.whatsapp.net'
    const collected = new Set()

    const jsFileRe = /\.m?js(?:[?#]|$)/i
    const jsInTextRe =
        /(?:https?:)?\/\/[^\s"'`<>]+?\.m?js(?:[?#][^\s"'`<>]*)?|(?:\/|\.\/|\.\.\/)[^\s"'`<>]+?\.m?js(?:[?#][^\s"'`<>]*)?/gi
    const bareHostRe = /^[a-z0-9.-]+\.[a-z]{2,}(?:[/?#]|$)/i
    const schemeRe = /^[a-z][a-z0-9+.-]*:/i

    // Normalise one candidate and keep it only if it is a JS file on the allowed host.
    const addUrl = (raw) => {
        if (typeof raw !== 'string') return
        // JSON-escaped slashes (`\/`) appear in inline script payloads.
        const cleaned = raw.trim().replace(/\\\//g, '/')
        if (cleaned === '' || !jsFileRe.test(cleaned)) return

        // "host.tld/path" without a scheme would otherwise resolve as a relative path.
        const isBareHost = bareHostRe.test(cleaned) && !schemeRe.test(cleaned)
        const candidate = isBareHost ? `https://${cleaned}` : cleaned

        let parsed
        try {
            parsed = new URL(candidate, location.href)
        } catch {
            return
        }
        if (parsed.hostname.toLowerCase() === allowedHost) collected.add(parsed.href)
    }

    // Pull every JS-looking URL out of free text.
    const extractFromText = (text) => {
        if (typeof text !== 'string' || !text) return
        for (const hit of text.match(jsInTextRe) ?? []) addUrl(hit)
    }

    // Depth-first walk over parsed JSON; `visited` guards against cycles.
    const walk = (node, visited = new WeakSet()) => {
        if (!node || typeof node !== 'object' || visited.has(node)) return
        visited.add(node)

        const { rsrcMap } = node
        if (rsrcMap && typeof rsrcMap === 'object') {
            Object.values(rsrcMap).forEach((entry) => {
                if (entry && typeof entry === 'object' && entry.type === 'js') {
                    addUrl(entry.src || entry.url || entry.href || entry.uri)
                }
            })
        }

        Object.values(node).forEach((child) => {
            if (typeof child === 'string') addUrl(child)
            else walk(child, visited)
        })
    }

    document.querySelectorAll('script[data-sjs]').forEach((el) => {
        const body = el.textContent || ''
        extractFromText(body)
        try {
            walk(JSON.parse(body))
        } catch {}
    })

    document.querySelectorAll('script[src]').forEach((el) => {
        addUrl(el.src || el.getAttribute('src'))
    })

    document
        .querySelectorAll(
            'link[rel="preload"][as="script"][href], link[rel="modulepreload"][href], link[rel="prefetch"][as="script"][href], link[rel="prefetch"][href]'
        )
        .forEach((el) => {
            addUrl(el.href || el.getAttribute('href'))
        })

    document.querySelectorAll('script:not([src])').forEach((el) => {
        extractFromText(el.textContent || '')
    })

    if (typeof performance?.getEntriesByType === 'function') {
        performance.getEntriesByType('resource').forEach((entry) => {
            if (entry && typeof entry.name === 'string') addUrl(entry.name)
        })
    }

    return Array.from(collected)
}
|
|
98
|
+
|
|
99
|
+
// In-page probe, handed to `page.evaluate`: returns `window.Debug.VERSION` as
// a string when it is present and truthy, otherwise null. Any error raised
// while reading the value is treated the same as "not available".
const VERSION_FN = function () {
    let version = null
    try {
        const debug = typeof window === 'undefined' ? undefined : window.Debug
        if (debug && debug.VERSION) {
            version = String(debug.VERSION)
        }
    } catch {
        version = null
    }
    return version
}
|
|
107
|
+
|
|
108
|
+
/**
 * Launch a headless browser, optionally restore cookies, navigate to
 * WhatsApp Web and wait for lazy chunks to settle.
 *
 * The caller owns the returned browser and must close it. If anything fails
 * after the browser was launched, it is closed here before the error is
 * rethrown.
 *
 * @param {object} opts
 * @param {string} [opts.authState] path to a JSON file shaped like
 *   `{ cookies: [...] }`; a path that does not exist is ignored
 * @param {number} [opts.extraWaitMs] milliseconds to wait after navigation;
 *   values that are not finite and non-negative fall back to LAZY_WAIT_MS
 * @returns {Promise<{ browser: object, page: object }>}
 * @throws if the auth file exists but is not valid JSON, or if launch /
 *   cookie restore / navigation fails
 */
async function openPage(opts) {
    const authState = opts.authState ?? null

    // setTimeout treats NaN or a negative delay as ~1ms, which would silently
    // skip the lazy-chunk wait; fall back to the default instead.
    const requestedWait = Number(opts.extraWaitMs ?? LAZY_WAIT_MS)
    const extraWaitMs =
        Number.isFinite(requestedWait) && requestedWait >= 0 ? requestedWait : LAZY_WAIT_MS

    // Read the cookie file before launching anything, so a malformed file
    // fails fast instead of after a browser has been started.
    let cookies = []
    if (authState && fs.existsSync(authState)) {
        const state = JSON.parse(fs.readFileSync(authState, 'utf8'))
        if (Array.isArray(state?.cookies)) cookies = state.cookies
    }

    const { browser, page } = await connect({
        headless: true,
        turnstile: true,
        connectOption: { defaultViewport: null },
        customConfig: {},
        plugins: []
    })

    try {
        if (cookies.length > 0) {
            await page.setCookie(...cookies)
        }
        await page.goto(WHATSAPP_URL, { waitUntil: 'networkidle2', timeout: NAV_TIMEOUT_MS })
        await new Promise((r) => setTimeout(r, extraWaitMs))
        return { browser, page }
    } catch (err) {
        // Best-effort cleanup; the original error is what the caller needs.
        await browser.close().catch(() => {})
        throw err
    }
}
|
|
135
|
+
|
|
136
|
+
/**
 * Discover every JS bundle URL referenced by web.whatsapp.com without
 * downloading anything.
 *
 * Opens a page via `openPage(opts)`, runs the in-page discovery and version
 * probes, and always closes the browser afterwards.
 *
 * @param {object} [opts] forwarded unchanged to `openPage`
 * @returns {Promise<{ waVersion: string | null, urls: string[] }>} `urls` is
 *   deduplicated and sorted; host filtering to `static.whatsapp.net` happens
 *   inside the in-page discovery function
 */
async function discoverBundleUrls(opts = {}) {
    const session = await openPage(opts)
    try {
        // Start both probes before awaiting so they run concurrently.
        const pendingUrls = session.page.evaluate(DISCOVERY_FN)
        const pendingVersion = session.page.evaluate(VERSION_FN)
        const [urls, waVersion] = await Promise.all([pendingUrls, pendingVersion])

        const uniqueSorted = Array.from(new Set(urls)).sort()
        return { waVersion, urls: uniqueSorted }
    } finally {
        await session.browser.close().catch(() => {})
    }
}
|
|
153
|
+
|
|
154
|
+
/**
 * Discover + download every JS bundle. Writes raw bundles to
 * `<out>/raw/<wa-version>/*.js` and a `<out>/manifest.json` index.
 *
 * When no version is detected the raw directory is named
 * `unknown-<ISO timestamp>` (with `:` and `.` replaced by `-`).
 *
 * A bundle whose download fails, or whose HTTP response is not OK, is still
 * listed in the manifest and written as an empty file with `bytes: 0`.
 *
 * @param {object} [opts] also forwarded to `openPage`
 * @param {string} [opts.out='dump'] output directory (created if missing)
 * @returns {Promise<{
 *   waVersion: string | null,
 *   bundles: Array<{ url: string, file: string, bytes: number }>,
 *   paths: { out: string, raw: string, manifest: string }
 * }>} `bundles` follows the sorted URL order; `bytes` is the UTF-8 size
 *   written to disk
 */
async function fetchBundles(opts = {}) {
    const out = path.resolve(opts.out ?? 'dump')
    fs.mkdirSync(out, { recursive: true })

    const { browser, page } = await openPage(opts)
    try {
        const [urls, waVersion] = await Promise.all([
            page.evaluate(DISCOVERY_FN),
            page.evaluate(VERSION_FN)
        ])
        const sortedUrls = [...new Set(urls)].sort()

        // Fetch from inside the page (original note: CORS-safe, same origin as
        // the SPA). Mapping to return values, instead of pushing as each
        // request finishes, keeps results in sortedUrls order so the manifest
        // is stable between runs.
        const downloaded = await page.evaluate(
            (uList) =>
                Promise.all(
                    uList.map(async (u) => {
                        try {
                            const r = await fetch(u)
                            // An HTTP error body is not a bundle; treat it as a failed download.
                            if (!r.ok) return { url: u, text: '' }
                            return { url: u, text: await r.text() }
                        } catch {
                            return { url: u, text: '' }
                        }
                    })
                ),
            sortedUrls
        )

        const versionDir = path.join(
            out,
            'raw',
            waVersion ?? `unknown-${new Date().toISOString().replace(/[:.]/g, '-')}`
        )
        fs.mkdirSync(versionDir, { recursive: true })

        const bundles = []
        let totalBytes = 0
        // Names already written, lower-cased so names differing only by case
        // do not overwrite each other on case-insensitive file systems.
        const usedNames = new Set()
        for (const b of downloaded) {
            let file = sanitizeFilename(b.url)
            if (usedNames.has(file.toLowerCase())) {
                // Distinct URLs can share a basename (e.g. differ only by
                // query string); add a counter instead of overwriting.
                const ext = path.extname(file)
                const stem = ext ? file.slice(0, -ext.length) : file
                let n = 2
                while (usedNames.has(`${stem}-${n}${ext}`.toLowerCase())) n++
                file = `${stem}-${n}${ext}`
            }
            usedNames.add(file.toLowerCase())

            fs.writeFileSync(path.join(versionDir, file), b.text)
            // String length counts UTF-16 code units; report real bytes on disk.
            const bytes = Buffer.byteLength(b.text, 'utf8')
            bundles.push({ url: b.url, file, bytes })
            totalBytes += bytes
        }

        const manifest = {
            waVersion,
            fetchedAt: new Date().toISOString(),
            bundleCount: bundles.length,
            totalBytes,
            rawDir: path.relative(out, versionDir),
            bundles
        }
        const manifestPath = path.join(out, 'manifest.json')
        fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2) + '\n')

        return {
            waVersion,
            bundles,
            paths: {
                out,
                raw: versionDir,
                manifest: manifestPath
            }
        }
    } finally {
        await browser.close().catch(() => {})
    }
}
|
|
227
|
+
|
|
228
|
+
/**
 * Derive a filesystem-safe file name from a bundle URL.
 *
 * Takes the last path segment of the URL (or `bundle` when it is empty, or the
 * whole input when it does not parse as a URL) and replaces every character
 * outside `[A-Za-z0-9._-]` with `_`. Names longer than 64 characters are
 * truncated and get an 8-hex-digit SHA-1 prefix of the full URL appended, so
 * they stay unique and stay clear of the 260-char Windows path limit.
 *
 * @param {string} url
 * @returns {string} a name of at most 64 characters
 */
function sanitizeFilename(url) {
    const crypto = require('node:crypto')
    const MAX_BASE_LEN = 64
    const HASH_LEN = 8

    let base
    try {
        base = new URL(url).pathname.split('/').pop() || 'bundle'
    } catch {
        base = url
    }

    const safe = base.replace(/[^A-Za-z0-9._-]/g, '_')
    if (safe.length <= MAX_BASE_LEN) {
        return safe
    }

    const extMatch = safe.match(/\.[A-Za-z0-9]{1,8}$/)
    const ext = extMatch ? extMatch[0] : ''
    const digest = crypto.createHash('sha1').update(url).digest('hex').slice(0, HASH_LEN)
    // Budget: stem + '-' + hash + ext === MAX_BASE_LEN.
    const stem = safe.slice(0, MAX_BASE_LEN - ext.length - (HASH_LEN + 1))
    return `${stem}-${digest}${ext}`
}
|
|
248
|
+
|
|
249
|
+
module.exports = { discoverBundleUrls, fetchBundles }
|