website-api 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/cli.js +5 -0
- package/dist/src/capabilities/browser.d.ts +8 -0
- package/dist/src/capabilities/browser.js +59 -7
- package/dist/src/core/context.js +1 -0
- package/dist/src/sites/bloomberg.com/openapi.yaml +1 -1
- package/dist/src/sites/chase.com/openapi.yaml +1 -1
- package/dist/src/sites/chatgpt.com/openapi.yaml +1 -1
- package/dist/src/sites/claude.ai/openapi.yaml +1 -1
- package/dist/src/sites/cursor.com/openapi.yaml +1 -1
- package/dist/src/sites/e-zpassny.com/openapi.yaml +1 -1
- package/dist/src/sites/gemini.google.com/openapi.yaml +1 -1
- package/dist/src/sites/google.com/openapi.yaml +1 -1
- package/dist/src/sites/microcenter.com/openapi.yaml +44 -0
- package/dist/src/sites/ollama.com/openapi.yaml +1 -1
- package/dist/src/sites/perplexity.ai/openapi.yaml +1 -1
- package/dist/src/sites/pseg.com/openapi.yaml +1 -1
- package/dist/src/sites/voice.google.com/openapi.yaml +1 -1
- package/dist/src/sites/zillow.com/openapi.yaml +1 -1
- package/dist/src/util/args-parser.js +5 -0
- package/package.json +4 -3
package/dist/bin/cli.js
CHANGED
|
@@ -62,6 +62,11 @@ function printWebsiteHelp(adapter) {
|
|
|
62
62
|
type: "boolean",
|
|
63
63
|
description: "Show the managed Chrome window (default headless; reuses an already-open session)",
|
|
64
64
|
},
|
|
65
|
+
{
|
|
66
|
+
name: "proxy",
|
|
67
|
+
type: "string",
|
|
68
|
+
description: 'Route the managed browser through a proxy: "default", a port, host:port, or scheme://host:port',
|
|
69
|
+
},
|
|
65
70
|
{ name: "help", type: "boolean", description: "Show help for this website site", short: "h" },
|
|
66
71
|
];
|
|
67
72
|
for (const param of allParams) {
|
|
@@ -8,6 +8,14 @@ export interface BrowserOptions {
|
|
|
8
8
|
cdpEndpoint?: string;
|
|
9
9
|
/** Launch the managed browser headless. Ignored when `cdpEndpoint` is set. */
|
|
10
10
|
headless?: boolean;
|
|
11
|
+
/**
|
|
12
|
+
* Route the managed browser through a proxy. `true` / "default" → the default
|
|
13
|
+
* SOCKS5 proxy (socks5://127.0.0.1:1080); a port ("1080"), "host:port", or
|
|
14
|
+
* full "scheme://host:port" is accepted. Forwarded to chrome-cdp-manager's
|
|
15
|
+
* `launch({ proxy })`, so it applies only on a fresh launch (an already-running
|
|
16
|
+
* browser or an explicit `cdpEndpoint` is used as-is). Ignored when falsy.
|
|
17
|
+
*/
|
|
18
|
+
proxy?: string | boolean;
|
|
11
19
|
/** Close a tab opened by this session on dispose. Defaults to true. */
|
|
12
20
|
close?: boolean;
|
|
13
21
|
debug?: boolean;
|
|
@@ -31,20 +31,34 @@ async function loadCdpManager() {
|
|
|
31
31
|
* `CDP_ENDPOINT`) wins and is used as-is, so users can still point at a Chrome
|
|
32
32
|
* they manage themselves. Otherwise chrome-cdp-manager ensures a managed
|
|
33
33
|
* browser is running (launching it if needed) and returns its endpoint.
|
|
34
|
+
*
|
|
35
|
+
* When a proxy is requested, chrome-cdp-manager probes it: an unreachable proxy
|
|
36
|
+
* is dropped and the browser launches direct (we surface a "connecting directly"
|
|
37
|
+
* notice). A reachable proxy forces a fresh launch so it actually takes effect.
|
|
34
38
|
*/
|
|
35
39
|
async function resolveEndpoint(options) {
|
|
36
40
|
const explicit = options.cdpEndpoint || process.env.CDP_ENDPOINT;
|
|
37
41
|
if (explicit)
|
|
38
|
-
return explicit;
|
|
42
|
+
return { endpoint: explicit, managed: false, proxyApplied: false, cdpPort: 0 };
|
|
39
43
|
const { launch } = await loadCdpManager();
|
|
40
|
-
const { endpoint, launched } = await launch({
|
|
44
|
+
const { endpoint, launched, config, proxyRequested, proxyReachable } = (await launch({
|
|
45
|
+
headless: !!options.headless,
|
|
46
|
+
proxy: options.proxy || undefined,
|
|
47
|
+
}));
|
|
48
|
+
const proxyApplied = !!config?.proxy;
|
|
49
|
+
// Situation 1: a proxy was asked for but nothing was listening — we fell back
|
|
50
|
+
// to a direct connection. Surface it (stderr, so JSON stdout stays clean).
|
|
51
|
+
if (proxyRequested && !proxyReachable) {
|
|
52
|
+
console.error(`Proxy ${options.proxy} not reachable — connecting directly (no proxy).`);
|
|
53
|
+
}
|
|
41
54
|
if (options.debug) {
|
|
42
55
|
const mode = options.headless ? "headless" : "headed";
|
|
56
|
+
const via = proxyApplied ? ` via proxy ${config.proxy}` : "";
|
|
43
57
|
console.log(launched
|
|
44
|
-
? `Launched managed Chrome (${mode}) at ${endpoint}`
|
|
45
|
-
: `Attached to managed Chrome at ${endpoint}`);
|
|
58
|
+
? `Launched managed Chrome (${mode})${via} at ${endpoint}`
|
|
59
|
+
: `Attached to managed Chrome at ${endpoint} (already running)`);
|
|
46
60
|
}
|
|
47
|
-
return endpoint;
|
|
61
|
+
return { endpoint, managed: true, proxyApplied, cdpPort: config?.cdpPort ?? 9222 };
|
|
48
62
|
}
|
|
49
63
|
/**
|
|
50
64
|
* Connects to an existing Chrome over CDP and reuses (or opens) a tab for the
|
|
@@ -53,7 +67,7 @@ async function resolveEndpoint(options) {
|
|
|
53
67
|
*/
|
|
54
68
|
export const connectChrome = async (targetUrl, options = {}) => {
|
|
55
69
|
const debug = !!options.debug;
|
|
56
|
-
const endpoint = await resolveEndpoint(options);
|
|
70
|
+
const { endpoint, managed, proxyApplied, cdpPort } = await resolveEndpoint(options);
|
|
57
71
|
const chromium = await loadChromium();
|
|
58
72
|
const browser = await chromium.connectOverCDP(endpoint);
|
|
59
73
|
const context = browser.contexts()[0];
|
|
@@ -79,9 +93,33 @@ export const connectChrome = async (targetUrl, options = {}) => {
|
|
|
79
93
|
if (debug)
|
|
80
94
|
console.log(`Opening a new tab for ${targetUrl}`);
|
|
81
95
|
page = await context.newPage();
|
|
82
|
-
await page.goto(targetUrl, { waitUntil: "domcontentloaded" });
|
|
83
96
|
opened = true;
|
|
84
97
|
}
|
|
98
|
+
// Headless Chrome leaks "HeadlessChrome" in its network User-Agent header (the
|
|
99
|
+
// JS-level fingerprint init script only changes navigator.userAgent, not the
|
|
100
|
+
// request header). Some sites — e.g. Micro Center — serve a blank/blocked page
|
|
101
|
+
// to it, so the product grid never appears. Mirror what
|
|
102
|
+
// chrome-cdp-manager/playwright's connect() does and override the network UA to
|
|
103
|
+
// a de-headlessed value, before any navigation.
|
|
104
|
+
if (options.headless) {
|
|
105
|
+
try {
|
|
106
|
+
const liveUa = await page.evaluate(() => navigator.userAgent);
|
|
107
|
+
const ua = liveUa.replace(/HeadlessChrome/g, "Chrome");
|
|
108
|
+
if (ua && ua !== liveUa) {
|
|
109
|
+
await context.setExtraHTTPHeaders({ "user-agent": ua });
|
|
110
|
+
const cdp = await context.newCDPSession(page);
|
|
111
|
+
await cdp.send("Network.setUserAgentOverride", { userAgent: ua });
|
|
112
|
+
if (debug)
|
|
113
|
+
console.log(`De-headlessed network User-Agent → ${ua}`);
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
catch {
|
|
117
|
+
// best-effort; ignore
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
if (opened) {
|
|
121
|
+
await page.goto(targetUrl, { waitUntil: "domcontentloaded" });
|
|
122
|
+
}
|
|
85
123
|
return {
|
|
86
124
|
page,
|
|
87
125
|
browser,
|
|
@@ -101,6 +139,20 @@ export const connectChrome = async (targetUrl, options = {}) => {
|
|
|
101
139
|
catch {
|
|
102
140
|
// ignore
|
|
103
141
|
}
|
|
142
|
+
// Situation 3: a proxy only takes effect at launch, so a proxied run owns
|
|
143
|
+
// an ephemeral browser — fully stop it once the request completes, leaving
|
|
144
|
+
// a clean slate for the next (possibly differently-proxied) run.
|
|
145
|
+
if (managed && proxyApplied) {
|
|
146
|
+
try {
|
|
147
|
+
const { closeBrowser } = await loadCdpManager();
|
|
148
|
+
await closeBrowser(cdpPort);
|
|
149
|
+
if (debug)
|
|
150
|
+
console.log(`Stopped managed Chrome on :${cdpPort} (proxied run complete)`);
|
|
151
|
+
}
|
|
152
|
+
catch {
|
|
153
|
+
// ignore
|
|
154
|
+
}
|
|
155
|
+
}
|
|
104
156
|
},
|
|
105
157
|
};
|
|
106
158
|
};
|
package/dist/src/core/context.js
CHANGED
|
@@ -4,7 +4,7 @@ info:
|
|
|
4
4
|
title: Bloomberg Billionaires Index
|
|
5
5
|
description: Extracts the full Bloomberg Billionaires Index (window.top500) via the browser,
|
|
6
6
|
auto-solving the PerimeterX challenge.
|
|
7
|
-
version: 1.1.
|
|
7
|
+
version: 1.1.4
|
|
8
8
|
servers:
|
|
9
9
|
- url: https://bloomberg.com
|
|
10
10
|
paths: {}
|
|
@@ -4,7 +4,7 @@ info:
|
|
|
4
4
|
title: E-ZPass New York
|
|
5
5
|
description: Fetches E-ZPass NY toll/payment history, lists account statements, and downloads
|
|
6
6
|
statement PDFs (browser transport, logs in fresh each run via your saved Chrome password).
|
|
7
|
-
version: 1.1.
|
|
7
|
+
version: 1.1.4
|
|
8
8
|
servers:
|
|
9
9
|
- url: https://e-zpassny.com
|
|
10
10
|
paths: {}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Generated by `pnpm generate:openapi` — do not edit by hand.
|
|
2
|
+
openapi: 3.1.0
|
|
3
|
+
info:
|
|
4
|
+
title: Micro Center
|
|
5
|
+
description: Scrape Micro Center Apple search results — Mac desktops and/or MacBooks — into
|
|
6
|
+
structured JSON (browser transport, parses with cheerio). Defaults to both categories; pick one
|
|
7
|
+
with --mac / --macbook.
|
|
8
|
+
version: 1.1.4
|
|
9
|
+
servers:
|
|
10
|
+
- url: https://microcenter.com
|
|
11
|
+
paths: {}
|
|
12
|
+
components:
|
|
13
|
+
securitySchemes:
|
|
14
|
+
chromeSession:
|
|
15
|
+
type: apiKey
|
|
16
|
+
in: cookie
|
|
17
|
+
name: session
|
|
18
|
+
description: "Authenticated via the user's real Chrome session: website-api injects decrypted Chrome
|
|
19
|
+
cookies for microcenter.com into every request."
|
|
20
|
+
x-website-api:
|
|
21
|
+
id: microcenter
|
|
22
|
+
domain: microcenter.com
|
|
23
|
+
cookieDomain: microcenter.com
|
|
24
|
+
transport: browser
|
|
25
|
+
cookies: optional
|
|
26
|
+
requiresLogin: false
|
|
27
|
+
imperative: true
|
|
28
|
+
cli:
|
|
29
|
+
command: website-api microcenter
|
|
30
|
+
positionals: []
|
|
31
|
+
parameters:
|
|
32
|
+
- flag: --mac
|
|
33
|
+
type: boolean
|
|
34
|
+
description: Scrape Mac desktops (mini, Studio, Pro, iMac)
|
|
35
|
+
required: false
|
|
36
|
+
- flag: --macbook
|
|
37
|
+
type: boolean
|
|
38
|
+
description: Scrape MacBooks (Air, Pro)
|
|
39
|
+
required: false
|
|
40
|
+
- flag: --store
|
|
41
|
+
type: string
|
|
42
|
+
description: Micro Center store id for price/stock (default 075)
|
|
43
|
+
default: "075"
|
|
44
|
+
required: false
|
|
@@ -21,6 +21,11 @@ const GLOBAL_PARAMETERS = [
|
|
|
21
21
|
type: "boolean",
|
|
22
22
|
description: "Show the managed Chrome window (default headless; reuses an already-open session)",
|
|
23
23
|
},
|
|
24
|
+
{
|
|
25
|
+
name: "proxy",
|
|
26
|
+
type: "string",
|
|
27
|
+
description: 'Route the managed browser through a proxy: "default" (socks5://127.0.0.1:1080), a port, host:port, or scheme://host:port (applies on a fresh launch)',
|
|
28
|
+
},
|
|
24
29
|
{
|
|
25
30
|
name: "out",
|
|
26
31
|
type: "string",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "website-api",
|
|
3
|
-
"version": "1.1.4",
|
|
3
|
+
"version": "1.1.6",
|
|
4
4
|
"description": "CLI and library to query website private APIs with your real logged-in Chrome session",
|
|
5
5
|
"main": "./dist/src/website-api.js",
|
|
6
6
|
"types": "./dist/src/website-api.d.ts",
|
|
@@ -48,8 +48,9 @@
|
|
|
48
48
|
},
|
|
49
49
|
"dependencies": {
|
|
50
50
|
"chalk": "^5.6.2",
|
|
51
|
-
"
|
|
52
|
-
"chrome-
|
|
51
|
+
"cheerio": "^1.2.0",
|
|
52
|
+
"chrome-cdp-manager": "^1.2.9",
|
|
53
|
+
"chrome-tools": "^1.1.5",
|
|
53
54
|
"cli-table3": "^0.6.5",
|
|
54
55
|
"commander": "^14.0.3"
|
|
55
56
|
},
|