@vinikjkkj/wa-fetcher 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,84 @@
1
+ # @vinikjkkj/wa-fetcher
2
+
3
+ Headless scraper for web.whatsapp.com bundles. Downloads every loaded JS
4
+ chunk to disk and writes a manifest — that's all. Per-domain extractors
5
+ (proto, mex, diff, …) consume the raw dump independently.
6
+
7
+ ## Install
8
+
9
+ ```sh
10
+ npm i @vinikjkkj/wa-fetcher
11
+ ```
12
+
13
+ ## CLI
14
+
15
+ ```sh
16
+ # Default: download every bundle + write manifest
17
+ npx wa-fetcher --out dump/
18
+
19
+ # Discovery only: just the URL list (no download). Useful when piping into
20
+ # tools that have their own downloader (e.g. wa-modules-loader).
21
+ npx wa-fetcher --urls-only --out urls.json
22
+ npx wa-fetcher --urls-only > urls.json
23
+ ```
24
+
25
+ | Flag | Default | Notes |
26
+ |---|---|---|
27
+ | `--out <path>` | `dump` (dir) / stdout (urls-only) | Output destination |
28
+ | `--urls-only` | off | Skip download; emit only the discovered URL array (JSON) |
29
+ | `--auth <file>` | none | Saved cookie state JSON for authenticated fetch (captures more lazy chunks) |
30
+ | `--extra-wait <ms>` | `5000` | Wait this long after network-idle for lazy chunks |
31
+
32
+ **Output layout:**
33
+
34
+ ```
35
+ dump/
36
+ ├── manifest.json { waVersion, fetchedAt, bundleCount, totalBytes, rawDir, bundles[] }
37
+ └── raw/
38
+ └── <wa-version>/
39
+ ├── chunk-AAAA.js
40
+ ├── chunk-BBBB.js
41
+ └── …
42
+ ```
43
+
44
+ ## Library
45
+
46
+ ```js
47
+ const { discoverBundleUrls, fetchBundles } = require('@vinikjkkj/wa-fetcher')
48
+
49
+ // Discovery only — no download. Returns the same URL list the full fetcher
50
+ // would have downloaded (sorted, deduped, host-filtered to static.whatsapp.net).
51
+ const { waVersion, urls } = await discoverBundleUrls()
52
+ // waVersion "2.3000.xxxxxxx" | null
53
+ // urls string[]
54
+
55
+ // Discovery + download.
56
+ const dump = await fetchBundles({ out: 'dump' })
57
+ // dump.waVersion "2.3000.xxxxxxx" | null
58
+ // dump.bundles[] [{ url, file, bytes }, ...]
59
+ // dump.paths.raw absolute path to dump/raw/<version>/
60
+ // dump.paths.manifest absolute path to dump/manifest.json
61
+ ```
62
+
63
+ ## GitHub Action
64
+
65
+ ```yaml
66
+ - uses: vinikjkkj/wa-spec/packages/fetcher@v1
67
+ id: fetch
68
+ with:
69
+ out: dump
70
+ - run: npx wa-mex apply --bundles ${{ steps.fetch.outputs.raw-dir }}
71
+ - run: npx wa-proto apply --bundles ${{ steps.fetch.outputs.raw-dir }}
72
+ ```
73
+
74
+ ## Caveats
75
+
76
+ - **Lazy chunks** that the SPA only loads via interaction (Settings, Profile,
77
+ Premium) won't be in the dump. Use `--auth` with a pre-saved cookie state
78
+ to maximize what loads naturally.
79
+ - **Anti-bot** — `puppeteer-real-browser` works today but Meta can tighten
80
+ detection. If the fetcher returns blank pages, re-evaluate the strategy.
81
+ - **No extraction** — this package is intentionally dumb. The extractors live
82
+ in [`@vinikjkkj/wa-mex`](https://github.com/vinikjkkj/wa-spec/tree/main/packages/mex)
83
+ and [`@vinikjkkj/wa-proto`](https://github.com/vinikjkkj/wa-spec/tree/main/packages/proto)
84
+ so adding a new artifact never requires touching the fetcher.
package/action.yml ADDED
@@ -0,0 +1,55 @@
1
+ name: 'wa-fetcher'
2
+ description: 'Download every loaded web.whatsapp.com JS bundle + emit a manifest.json (no extraction)'
3
+ author: 'vinikjkkj'
4
+ branding:
5
+ icon: 'download-cloud'
6
+ color: 'green'
7
+
8
+ inputs:
9
+ out:
10
+ description: 'Output directory for raw bundles + manifest.json'
11
+ required: false
12
+ default: 'dump'
13
+ auth-state:
14
+ description: 'Optional path to a saved cookie state JSON for authenticated fetch'
15
+ required: false
16
+ extra-wait:
17
+ description: 'Extra wait (ms) after network-idle, gives lazy chunks time to load'
18
+ required: false
19
+ default: '5000'
20
+
21
+ outputs:
22
+ manifest-path:
23
+ description: 'Path to the generated manifest.json'
24
+ value: ${{ steps.run.outputs.manifest-path }}
25
+ raw-dir:
26
+ description: 'Path to the raw bundles directory (raw/<wa-version>/)'
27
+ value: ${{ steps.run.outputs.raw-dir }}
28
+ wa-version:
29
+ description: 'Detected WhatsApp Web version (or "unknown")'
30
+ value: ${{ steps.run.outputs.wa-version }}
31
+
32
+ runs:
33
+ using: 'composite'
34
+ steps:
35
+ - name: Install fetcher
36
+ shell: bash
37
+ run: npm install --no-save @vinikjkkj/wa-fetcher@latest
38
+
39
+ - name: Run fetcher
40
+ id: run
41
+ shell: bash
42
+ env:
43
+ FETCHER_OUT: ${{ inputs.out }}
44
+ FETCHER_AUTH: ${{ inputs.auth-state }}
45
+ FETCHER_EXTRA_WAIT: ${{ inputs.extra-wait }}
46
+ run: |
47
+ ARGS="--out $FETCHER_OUT --extra-wait $FETCHER_EXTRA_WAIT"
48
+ if [ -n "$FETCHER_AUTH" ]; then ARGS="$ARGS --auth $FETCHER_AUTH"; fi
49
+ npx wa-fetcher $ARGS
50
+ MANIFEST="$FETCHER_OUT/manifest.json"
51
+ echo "manifest-path=$MANIFEST" >> "$GITHUB_OUTPUT"
52
+ RAW_DIR=$(node -e "console.log(JSON.parse(require('node:fs').readFileSync(process.argv[1])).rawDir)" "$MANIFEST")
53
+ echo "raw-dir=$FETCHER_OUT/$RAW_DIR" >> "$GITHUB_OUTPUT"
54
+ VERSION=$(node -e "console.log(JSON.parse(require('node:fs').readFileSync(process.argv[1])).waVersion ?? 'unknown')" "$MANIFEST")
55
+ echo "wa-version=$VERSION" >> "$GITHUB_OUTPUT"
package/package.json ADDED
@@ -0,0 +1,57 @@
1
+ {
2
+ "name": "@vinikjkkj/wa-fetcher",
3
+ "version": "0.1.0",
4
+ "description": "Headless scraper for web.whatsapp.com bundles — downloads every loaded JS chunk + emits a manifest. Pairs with @vinikjkkj/wa-mex and @vinikjkkj/wa-proto.",
5
+ "keywords": [
6
+ "whatsapp",
7
+ "whatsapp-web",
8
+ "scraper",
9
+ "fetcher",
10
+ "puppeteer",
11
+ "puppeteer-real-browser",
12
+ "reverse-engineering",
13
+ "wa-spec"
14
+ ],
15
+ "license": "MIT",
16
+ "author": {
17
+ "name": "vinikjkkj",
18
+ "url": "https://github.com/vinikjkkj"
19
+ },
20
+ "homepage": "https://github.com/vinikjkkj/wa-spec/tree/main/packages/fetcher#readme",
21
+ "bugs": {
22
+ "url": "https://github.com/vinikjkkj/wa-spec/issues"
23
+ },
24
+ "repository": {
25
+ "type": "git",
26
+ "url": "git+https://github.com/vinikjkkj/wa-spec.git",
27
+ "directory": "packages/fetcher"
28
+ },
29
+ "funding": [
30
+ {
31
+ "type": "github",
32
+ "url": "https://github.com/sponsors/vinikjkkj"
33
+ }
34
+ ],
35
+ "main": "src/index.cjs",
36
+ "bin": {
37
+ "wa-fetcher": "src/cli.cjs"
38
+ },
39
+ "files": [
40
+ "src/**/*.cjs",
41
+ "src/**/*.js",
42
+ "action.yml",
43
+ "README.md"
44
+ ],
45
+ "scripts": {
46
+ "fetch": "node src/cli.cjs --out dump/"
47
+ },
48
+ "dependencies": {
49
+ "puppeteer-real-browser": "^1.4.4"
50
+ },
51
+ "engines": {
52
+ "node": ">=20.9.0"
53
+ },
54
+ "publishConfig": {
55
+ "access": "public"
56
+ }
57
+ }
package/src/cli.cjs ADDED
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env node
2
+ 'use strict'
3
+
4
+ const fs = require('node:fs')
5
+ const path = require('node:path')
6
+ const { discoverBundleUrls, fetchBundles } = require('./index.cjs')
7
+
8
/**
 * Parse command-line arguments into an options object.
 *
 * Parsing starts at index 2 (the `process.argv` layout). `-h`/`--help` prints
 * the help text and exits with code 0. An unknown flag, a flag that is missing
 * its value, or a non-numeric/negative `--extra-wait` prints an error plus the
 * help text and exits with code 2.
 *
 * @param {string[]} argv - Full argument vector, e.g. `process.argv`.
 * @returns {{ out: string|null, urlsOnly: boolean, authState?: string, extraWaitMs?: number }}
 */
function parseArgs(argv) {
    const opts = { out: null, urlsOnly: false }

    // Single exit path for every usage error so they all look the same.
    const usageError = (...message) => {
        console.error(...message)
        printHelp()
        process.exit(2)
    }

    // Return the value that follows a flag. A missing value, or one that is
    // itself a long flag, is a usage error rather than a silently stored
    // `undefined` / swallowed flag.
    const valueFor = (flag, index) => {
        const value = argv[index]
        if (value === undefined || value.startsWith('--')) {
            usageError(`missing value for ${flag}`)
        }
        return value
    }

    for (let i = 2; i < argv.length; i++) {
        const a = argv[i]
        if (a === '--out') opts.out = valueFor(a, ++i)
        else if (a === '--auth') opts.authState = valueFor(a, ++i)
        else if (a === '--extra-wait') {
            const raw = valueFor(a, ++i)
            const ms = Number(raw)
            // Number('abc') is NaN, which would otherwise flow into setTimeout.
            if (!Number.isFinite(ms) || ms < 0) {
                usageError(`invalid value for --extra-wait: ${raw}`)
            }
            opts.extraWaitMs = ms
        } else if (a === '--urls-only') opts.urlsOnly = true
        else if (a === '--help' || a === '-h') {
            printHelp()
            process.exit(0)
        } else {
            usageError('unknown flag:', a)
        }
    }
    return opts
}
27
+
28
/**
 * Write the CLI usage text to stderr as a single message.
 */
function printHelp() {
    const optionLines = [
        ' --out <path> output path',
        ' - default mode: directory for raw bundles + manifest.json (default: dump)',
        ' - --urls-only mode: file to write URL array JSON (default: stdout)',
        ' --auth <file> optional saved cookie state JSON',
        ' --extra-wait <ms> extra wait after networkidle for lazy chunks (default: 5000)',
        ' --urls-only skip download; output just the discovered URL array as JSON',
        ' -h, --help show this help'
    ]
    const helpText = ['usage: wa-fetcher [options]', '', 'options:', ...optionLines].join('\n')
    console.error(helpText)
}
44
+
45
/**
 * CLI driver. Parses `process.argv`, then either:
 *  - `--urls-only`: discovers bundle URLs and emits them as a JSON array, to
 *    the `--out` file when given (with a summary on stderr) or to stdout;
 *  - default: downloads every bundle into the output directory (default
 *    `dump`) and reports a summary on stderr.
 *
 * All progress/summary messages go to stderr so stdout stays clean for the
 * URL JSON.
 */
async function main() {
    const options = parseArgs(process.argv)
    const startedAt = Date.now()
    const secondsElapsed = () => ((Date.now() - startedAt) / 1000).toFixed(1)

    if (!options.urlsOnly) {
        const outDir = options.out ?? 'dump'
        console.error('[wa-fetcher] launching headless browser…')
        const result = await fetchBundles({ ...options, out: outDir })
        const elapsed = secondsElapsed()
        let byteSum = 0
        for (const bundle of result.bundles) byteSum += bundle.bytes
        const mb = (byteSum / 1024 / 1024).toFixed(1)
        console.error(
            `[wa-fetcher] done in ${elapsed}s — version=${result.waVersion ?? 'unknown'} ` +
                `bundles=${result.bundles.length} (${mb}MB)`
        )
        console.error(`[wa-fetcher] raw → ${result.paths.raw}`)
        console.error(`[wa-fetcher] manifest → ${result.paths.manifest}`)
        return
    }

    console.error('[wa-fetcher] discovering URLs only (no download)…')
    const { waVersion, urls } = await discoverBundleUrls(options)
    const payload = JSON.stringify(urls, null, 4) + '\n'

    if (!options.out) {
        process.stdout.write(payload)
        return
    }

    // Make sure the parent directory of the target file exists before writing.
    const parentDir = path.dirname(path.resolve(options.out))
    fs.mkdirSync(parentDir, { recursive: true })
    fs.writeFileSync(options.out, payload)
    console.error(
        `[wa-fetcher] done in ${secondsElapsed()}s — ` +
            `version=${waVersion ?? 'unknown'} urls=${urls.length} → ${options.out}`
    )
}
78
+
79
// Entry point: run the CLI and convert any rejection into a logged failure
// with exit code 1.
main().catch((err) => {
    // A rejection value is not guaranteed to be an Error (it can even be
    // null/undefined), so guard the property reads; otherwise the handler
    // itself throws and the exit code below is never set.
    console.error('[wa-fetcher] failed:', err?.stack || err?.message || err)
    process.exit(1)
})
package/src/index.cjs ADDED
@@ -0,0 +1,249 @@
1
+ 'use strict'
2
+
3
+ /**
4
+ * @vinikjkkj/wa-fetcher
5
+ *
6
+ * const { discoverBundleUrls } = require('@vinikjkkj/wa-fetcher')
7
+ * const { waVersion, urls } = await discoverBundleUrls()
8
+ * // urls[] ['https://static.whatsapp.net/.../chunk.js', ...]
9
+ * // waVersion '2.3000.xxx' | null
10
+ *
11
+ * const { fetchBundles } = require('@vinikjkkj/wa-fetcher')
12
+ * const dump = await fetchBundles({ out: 'dump' })
13
+ * // dump.bundles[] [{ url, file, bytes }, ...]
14
+ * // dump.paths.raw absolute path to <out>/raw/<version>/
15
+ * // dump.paths.manifest
16
+ */
17
+
18
+ const fs = require('node:fs')
19
+ const path = require('node:path')
20
+ const { connect } = require('puppeteer-real-browser')
21
+
22
// Page that is loaded in the browser for discovery and download.
const WHATSAPP_URL = 'https://web.whatsapp.com/'
// Navigation timeout passed to `page.goto`, in milliseconds (60 seconds).
const NAV_TIMEOUT_MS = 60 * 1000
// Default pause after navigation settles, in milliseconds (5 seconds), used
// when the caller does not supply `extraWaitMs`.
const LAZY_WAIT_MS = 5 * 1000
25
+
26
// In-page URL discovery. This function is handed to `page.evaluate`, so it
// must be self-contained: it may only use browser globals (`document`,
// `location`, `performance`) and its own locals.
//
// Sources scanned, in order:
//   1. `<script data-sjs>` blocks — raw text scan plus a walk of the parsed
//      JSON (including `rsrcMap` entries of type "js")
//   2. `<script src>` elements
//   3. preload / modulepreload / prefetch `<link>` elements
//   4. inline `<script>` text
//   5. resource performance entries
//
// Returns the collected URLs (absolute, host-restricted to
// static.whatsapp.net, de-duplicated) in discovery order.
// (Technique cribbed from vinikjkkj/wa-diff/fetch.js.)
const DISCOVERY_FN = function () {
    const ALLOWED_HOST = 'static.whatsapp.net'
    const JS_FILE_RE = /\.m?js(?:[?#]|$)/i
    const JS_IN_TEXT_RE =
        /(?:https?:)?\/\/[^\s"'`<>]+?\.m?js(?:[?#][^\s"'`<>]*)?|(?:\/|\.\/|\.\.\/)[^\s"'`<>]+?\.m?js(?:[?#][^\s"'`<>]*)?/gi
    const BARE_HOST_RE = /^[a-z0-9.-]+\.[a-z]{2,}(?:[/?#]|$)/i
    const SCHEME_RE = /^[a-z][a-z0-9+.-]*:/i
    const LINK_SELECTOR =
        'link[rel="preload"][as="script"][href], link[rel="modulepreload"][href], link[rel="prefetch"][as="script"][href], link[rel="prefetch"][href]'

    const collected = new Set()

    // Normalise one candidate and keep it if it is a JS file on the allowed host.
    const addUrl = (candidate) => {
        if (typeof candidate !== 'string') return
        // JSON-embedded URLs carry escaped slashes (`\/`); unescape them.
        const cleaned = candidate.trim().replace(/\\\//g, '/')
        if (!cleaned || !JS_FILE_RE.test(cleaned)) return
        // "host.tld/path" without a scheme would otherwise resolve as a
        // relative path, so give it an explicit https:// prefix.
        const isBareHost = BARE_HOST_RE.test(cleaned) && !SCHEME_RE.test(cleaned)
        const target = isBareHost ? `https://${cleaned}` : cleaned
        let parsed
        try {
            parsed = new URL(target, location.href)
        } catch {
            // Unparseable candidate: discovery is best-effort, skip it.
            return
        }
        if (parsed.hostname.toLowerCase() === ALLOWED_HOST) collected.add(parsed.href)
    }

    // Pull every JS-looking URL literal out of a blob of text.
    const extractFromText = (text) => {
        if (typeof text !== 'string' || !text) return
        for (const hit of text.match(JS_IN_TEXT_RE) ?? []) addUrl(hit)
    }

    // Depth-first walk over parsed JSON; `visited` guards against cycles.
    const walk = (node, visited = new WeakSet()) => {
        if (!node || typeof node !== 'object' || visited.has(node)) return
        visited.add(node)
        const { rsrcMap } = node
        if (rsrcMap && typeof rsrcMap === 'object') {
            for (const entry of Object.values(rsrcMap)) {
                if (entry && typeof entry === 'object' && entry.type === 'js') {
                    addUrl(entry.src || entry.url || entry.href || entry.uri)
                }
            }
        }
        for (const child of Object.values(node)) {
            if (typeof child === 'string') {
                addUrl(child)
            } else {
                walk(child, visited)
            }
        }
    }

    for (const script of document.querySelectorAll('script[data-sjs]')) {
        const body = script.textContent || ''
        extractFromText(body)
        try {
            walk(JSON.parse(body))
        } catch {
            // Not valid JSON: the text scan above already covered it.
        }
    }

    for (const script of document.querySelectorAll('script[src]')) {
        addUrl(script.src || script.getAttribute('src'))
    }

    for (const link of document.querySelectorAll(LINK_SELECTOR)) {
        addUrl(link.href || link.getAttribute('href'))
    }

    for (const script of document.querySelectorAll('script:not([src])')) {
        extractFromText(script.textContent || '')
    }

    if (typeof performance?.getEntriesByType === 'function') {
        for (const entry of performance.getEntriesByType('resource')) {
            if (entry && typeof entry.name === 'string') addUrl(entry.name)
        }
    }

    return [...collected]
}
98
+
99
// In-page version probe (passed to `page.evaluate`). Returns
// `window.Debug.VERSION` as a string when it is present and truthy, otherwise
// null. Any error while reading it is treated as "unknown".
const VERSION_FN = function () {
    try {
        const version = typeof window === 'undefined' ? undefined : window.Debug?.VERSION
        if (version) return String(version)
    } catch {
        // Fall through to the null result below.
    }
    return null
}
107
+
108
/**
 * Launch a headless browser, optionally seed it with saved cookies, navigate
 * to WhatsApp Web and wait for the page to settle.
 *
 * The caller owns the returned browser and must close it. If anything fails
 * after the browser has been launched, the browser is closed here before the
 * error is rethrown.
 *
 * @param {object} [opts]
 * @param {string|null} [opts.authState] - Path to a JSON file shaped like
 *   `{ cookies: [...] }`. A missing file triggers a process warning and the
 *   fetch continues unauthenticated.
 * @param {number} [opts.extraWaitMs] - Extra pause after navigation, in ms.
 *   Values that are not finite, non-negative numbers fall back to LAZY_WAIT_MS.
 * @returns {Promise<{ browser: object, page: object }>}
 * @throws If the auth file is not valid JSON, or launch/navigation fails.
 */
async function openPage(opts = {}) {
    const authState = opts.authState ?? null

    // Guard against NaN / negative waits (e.g. from a mistyped CLI value)
    // reaching setTimeout, where they would collapse to an immediate timeout.
    const requestedWait = Number(opts.extraWaitMs ?? LAZY_WAIT_MS)
    const extraWaitMs =
        Number.isFinite(requestedWait) && requestedWait >= 0 ? requestedWait : LAZY_WAIT_MS

    // Load the cookie state BEFORE launching the browser so a malformed file
    // fails fast without paying for (and having to tear down) a browser.
    let cookies = []
    if (authState) {
        if (fs.existsSync(authState)) {
            const state = JSON.parse(fs.readFileSync(authState, 'utf8'))
            if (Array.isArray(state?.cookies)) cookies = state.cookies
        } else {
            // Previously ignored silently, which made an unauthenticated run
            // look like an authenticated one.
            process.emitWarning(
                `[wa-fetcher] auth state file not found, continuing unauthenticated: ${authState}`
            )
        }
    }

    const { browser, page } = await connect({
        headless: true,
        turnstile: true,
        connectOption: { defaultViewport: null },
        customConfig: {},
        plugins: []
    })

    try {
        if (cookies.length > 0) {
            await page.setCookie(...cookies)
        }
        await page.goto(WHATSAPP_URL, { waitUntil: 'networkidle2', timeout: NAV_TIMEOUT_MS })
        // Give late-loading chunks a chance to be requested after network idle.
        await new Promise((r) => setTimeout(r, extraWaitMs))
        return { browser, page }
    } catch (err) {
        // Best-effort cleanup; the original error is what the caller needs.
        await browser.close().catch(() => {})
        throw err
    }
}
135
+
136
/**
 * List every JS bundle URL that web.whatsapp.com references, without
 * downloading anything.
 *
 * @param {object} [opts] - Forwarded to `openPage` (`authState`, `extraWaitMs`).
 * @returns {Promise<{ waVersion: string|null, urls: string[] }>} The detected
 *   version (or null) and the URL list, de-duplicated and sorted. Host
 *   filtering to `static.whatsapp.net` happens inside the in-page discovery
 *   function.
 */
async function discoverBundleUrls(opts = {}) {
    const session = await openPage(opts)
    try {
        const pending = [session.page.evaluate(DISCOVERY_FN), session.page.evaluate(VERSION_FN)]
        const [found, waVersion] = await Promise.all(pending)
        const urls = Array.from(new Set(found)).sort()
        return { waVersion, urls }
    } finally {
        // Always release the browser; a failure to close is not actionable.
        await session.browser.close().catch(() => {})
    }
}
153
+
154
/**
 * Discover + download every JS bundle. Writes raw bundles to
 * `<out>/raw/<wa-version>/*.js` and a `<out>/manifest.json` index.
 *
 * @param {object} [opts]
 * @param {string} [opts.out='dump'] - Output directory (created if missing).
 *   Remaining options (`authState`, `extraWaitMs`) are forwarded to `openPage`.
 * @returns {Promise<{
 *   waVersion: string|null,
 *   bundles: Array<{ url: string, file: string, bytes: number }>,
 *   failed: Array<{ url: string, error: string }>,
 *   paths: { out: string, raw: string, manifest: string }
 * }>} `bundles` lists only successfully downloaded files, in sorted-URL order;
 *   `bytes` is the UTF-8 size written to disk. `failed` lists URLs that could
 *   not be fetched or returned a non-2xx status (no file is written for them).
 */
async function fetchBundles(opts = {}) {
    const { createHash } = require('node:crypto')
    const out = path.resolve(opts.out ?? 'dump')
    fs.mkdirSync(out, { recursive: true })

    const { browser, page } = await openPage(opts)
    try {
        const [urls, waVersion] = await Promise.all([
            page.evaluate(DISCOVERY_FN),
            page.evaluate(VERSION_FN)
        ])
        const sortedUrls = [...new Set(urls)].sort()

        // The download runs inside the page via `fetch`. Results come back in
        // the same order as `sortedUrls` (Promise.all preserves input order),
        // which keeps the manifest deterministic. A non-2xx response is
        // reported as a failure instead of saving its error body as a bundle.
        const downloaded = await page.evaluate(async (uList) => {
            return Promise.all(
                uList.map(async (u) => {
                    try {
                        const r = await fetch(u)
                        if (!r.ok) return { url: u, text: null, error: `HTTP ${r.status}` }
                        return { url: u, text: await r.text(), error: null }
                    } catch (err) {
                        return { url: u, text: null, error: String(err?.message ?? err) }
                    }
                })
            )
        }, sortedUrls)

        // The version string comes from the page, so reduce it to filename-safe
        // characters and never let it be empty or a pure-dots segment ("."/"..")
        // that would escape or collapse the raw/ directory.
        const versionLabel =
            waVersion == null
                ? `unknown-${new Date().toISOString().replace(/[:.]/g, '-')}`
                : String(waVersion)
                      .replace(/[^A-Za-z0-9._-]/g, '_')
                      .replace(/^\.*$/, '_')
        const versionDir = path.join(out, 'raw', versionLabel)
        fs.mkdirSync(versionDir, { recursive: true })

        const bundles = []
        const failed = []
        // Lower-cased names already written, so URLs that share a basename do
        // not overwrite each other (also on case-insensitive filesystems).
        const usedNames = new Set()
        let totalBytes = 0
        for (const b of downloaded) {
            if (typeof b.text !== 'string') {
                failed.push({ url: b.url, error: b.error ?? 'unknown error' })
                continue
            }
            let file = sanitizeFilename(b.url)
            if (usedNames.has(file.toLowerCase())) {
                // Disambiguate with a short hash of the full URL before the extension.
                const hash = createHash('sha1').update(b.url).digest('hex').slice(0, 8)
                const dot = file.lastIndexOf('.')
                const stem = dot > 0 ? file.slice(0, dot) : file
                const ext = dot > 0 ? file.slice(dot) : ''
                file = `${stem}-${hash}${ext}`
            }
            usedNames.add(file.toLowerCase())

            // String length counts UTF-16 code units; report real bytes on disk.
            const bytes = Buffer.byteLength(b.text)
            fs.writeFileSync(path.join(versionDir, file), b.text)
            bundles.push({ url: b.url, file, bytes })
            totalBytes += bytes
        }

        const manifest = {
            waVersion,
            fetchedAt: new Date().toISOString(),
            bundleCount: bundles.length,
            totalBytes,
            rawDir: path.relative(out, versionDir),
            bundles,
            failed
        }
        const manifestPath = path.join(out, 'manifest.json')
        fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2) + '\n')

        return {
            waVersion,
            bundles,
            failed,
            paths: {
                out,
                raw: versionDir,
                manifest: manifestPath
            }
        }
    } finally {
        // Always release the browser; a failure to close is not actionable.
        await browser.close().catch(() => {})
    }
}
227
+
228
/**
 * Derive a filesystem-safe file name from a bundle URL.
 *
 * The name is the last path segment of the URL (or `bundle` when that segment
 * is empty; the raw input when it is not a parseable URL), with every
 * character outside `[A-Za-z0-9._-]` replaced by `_`. Names longer than 64
 * characters are shortened to exactly 64: a truncated stem, `-`, the first 8
 * hex digits of the SHA-1 of the full URL, then the original extension. This
 * keeps paths under the 260-character Windows limit while staying unique.
 *
 * @param {string} url
 * @returns {string}
 */
function sanitizeFilename(url) {
    const { createHash } = require('node:crypto')
    const MAX_BASE_LEN = 64

    const lastSegment = () => {
        try {
            return new URL(url).pathname.split('/').pop() || 'bundle'
        } catch {
            return url
        }
    }

    const safeName = lastSegment().replace(/[^A-Za-z0-9._-]/g, '_')
    if (safeName.length > MAX_BASE_LEN) {
        const extMatch = safeName.match(/\.[A-Za-z0-9]{1,8}$/)
        const ext = extMatch ? extMatch[0] : ''
        const digest = createHash('sha1').update(url).digest('hex').slice(0, 8)
        // 9 = one separator dash + eight hash characters.
        const keep = MAX_BASE_LEN - ext.length - 9
        return `${safeName.slice(0, keep)}-${digest}${ext}`
    }
    return safeName
}
248
+
249
+ module.exports = { discoverBundleUrls, fetchBundles }