gologin-web-access 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1 -1
- package/README.md +23 -10
- package/dist/cli.js +7 -1
- package/dist/commands/batchScrape.js +46 -4
- package/dist/commands/close.js +4 -0
- package/dist/commands/configInit.js +10 -6
- package/dist/commands/scrapeJson.js +9 -0
- package/dist/commands/sessions.js +11 -2
- package/dist/config.js +2 -2
- package/dist/doctor.js +3 -3
- package/dist/internal-agent/commands/close.js +10 -0
- package/dist/internal-agent/commands/sessions.js +19 -1
- package/dist/internal-agent/daemon/server.js +9 -0
- package/dist/internal-agent/daemon/sessionManager.js +93 -24
- package/dist/internal-agent/lib/utils.js +1 -1
- package/dist/lib/browserRead.js +1 -1
- package/dist/lib/browserStructured.js +1 -1
- package/dist/lib/errors.js +30 -2
- package/dist/lib/output.js +3 -0
- package/dist/lib/readSource.js +2 -2
- package/dist/lib/search.js +1 -1
- package/dist/lib/structuredScrape.js +83 -0
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -14,6 +14,6 @@ Highlights:
|
|
|
14
14
|
- Unified CLI entry point for Gologin Web Unlocker and Gologin Cloud Browser workflows
|
|
15
15
|
- Scraping commands: `scrape`, `scrape-markdown`, `scrape-text`, `scrape-json`, `batch-scrape`
|
|
16
16
|
- Browser commands: `open`, `snapshot`, `click`, `type`, `screenshot`, `close`, `sessions`, `current`
|
|
17
|
-
- Clear two-key configuration model with `GOLOGIN_WEB_UNLOCKER_API_KEY` and `
|
|
17
|
+
- Clear two-key configuration model with `GOLOGIN_WEB_UNLOCKER_API_KEY` and `GOLOGIN_TOKEN`
|
|
18
18
|
- `doctor`, `config show`, and `config init` to reduce setup friction
|
|
19
19
|
- Compatibility support for legacy env names used by existing Gologin tools
|
package/README.md
CHANGED
|
@@ -26,6 +26,13 @@ The point of the unified CLI is that both modes live in one product with one com
|
|
|
26
26
|
|
|
27
27
|
## Command Groups
|
|
28
28
|
|
|
29
|
+
### Quick Picks
|
|
30
|
+
|
|
31
|
+
- `read` for "read this docs page/article" or "tell me what is on this page"
|
|
32
|
+
- `scrape-text` for plain text from one known page when you do not need headings/links metadata
|
|
33
|
+
- `scrape-json` for structured title, description, headings, and links from one known page
|
|
34
|
+
- `batch-scrape` for many known URLs at once; add `--output <path>` when the JSON may be large and add `--strict` only if partial success should fail the command
|
|
35
|
+
|
|
29
36
|
### Scraping / Read
|
|
30
37
|
|
|
31
38
|
These commands use Gologin Web Unlocker:
|
|
@@ -35,7 +42,7 @@ These commands use Gologin Web Unlocker:
|
|
|
35
42
|
- `gologin-web-access scrape-markdown <url> [--source auto|unlocker|browser]`
|
|
36
43
|
- `gologin-web-access scrape-text <url> [--source auto|unlocker|browser]`
|
|
37
44
|
- `gologin-web-access scrape-json <url> [--fallback none|browser]`
|
|
38
|
-
- `gologin-web-access batch-scrape <url...> [--format html|markdown|text|json] [--fallback none|browser] [--source auto|unlocker|browser] [--only-main-content] [--retry <n>] [--backoff-ms <ms>] [--summary]`
|
|
45
|
+
- `gologin-web-access batch-scrape <url...> [--format html|markdown|text|json] [--fallback none|browser] [--source auto|unlocker|browser] [--only-main-content] [--retry <n>] [--backoff-ms <ms>] [--summary] [--output <path>] [--strict]`
|
|
39
46
|
- `gologin-web-access batch-extract <url...> --schema <schema.json> [--source auto|unlocker|browser] [--retry <n>] [--backoff-ms <ms>] [--summary] [--output <path>]`
|
|
40
47
|
- `gologin-web-access search <query> [--limit <n>] [--country <cc>] [--language <lang>] [--source auto|unlocker|browser]`
|
|
41
48
|
- `gologin-web-access map <url> [--limit <n>] [--max-depth <n>] [--concurrency <n>] [--strict]`
|
|
@@ -104,6 +111,9 @@ Use these when you need state, interaction, or multi-step browser flows.
|
|
|
104
111
|
## When To Use `scrape` vs `browser`
|
|
105
112
|
|
|
106
113
|
- Use `scrape` commands when you need page content, extracted text, markdown, or simple structured output.
|
|
114
|
+
- Use `read` as the default for docs and article reading when you want one high-level main-content command rather than choosing HTML/text/markdown yourself.
|
|
115
|
+
- Use `scrape-text` when you already know you want plain text.
|
|
116
|
+
- Use `scrape-json` when you want structured metadata and headings instead of full prose.
|
|
107
117
|
- Use `search` when you need web discovery or SERP results before deciding what to scrape. It now tries multiple search paths automatically, validates that the response is a real SERP, and reuses a short local cache for repeated queries.
|
|
108
118
|
- Use `map` when you need internal link discovery or a site inventory.
|
|
109
119
|
- Use `crawl` when you need multi-page read-only extraction across a site.
|
|
@@ -153,18 +163,18 @@ If the browser surface grows substantially later, a nested namespace may become
|
|
|
153
163
|
|
|
154
164
|
## Credentials And Config
|
|
155
165
|
|
|
156
|
-
This CLI uses two different
|
|
166
|
+
This CLI uses two different GoLogin credentials on purpose, because the underlying products are different.
|
|
157
167
|
|
|
158
168
|
- `GOLOGIN_WEB_UNLOCKER_API_KEY`
|
|
159
169
|
Required for Scraping / Read commands.
|
|
160
|
-
- `
|
|
170
|
+
- `GOLOGIN_TOKEN`
|
|
161
171
|
Required for `gologin-web-access open` and for profile validation in `gologin-web-access doctor`.
|
|
162
172
|
- `GOLOGIN_DEFAULT_PROFILE_ID`
|
|
163
173
|
Optional default profile for browser flows.
|
|
164
174
|
- `GOLOGIN_DAEMON_PORT`
|
|
165
175
|
Optional local daemon port for browser workflows.
|
|
166
176
|
|
|
167
|
-
Recommended full setup for agents is to configure both `GOLOGIN_WEB_UNLOCKER_API_KEY` and `
|
|
177
|
+
Recommended full setup for agents is to configure both `GOLOGIN_WEB_UNLOCKER_API_KEY` and `GOLOGIN_TOKEN` before starting work, even if the current task looks read-only or browser-only.
|
|
168
178
|
|
|
169
179
|
Missing-key errors are command-group specific. Example:
|
|
170
180
|
|
|
@@ -174,7 +184,7 @@ Environment variables are the primary configuration mechanism:
|
|
|
174
184
|
|
|
175
185
|
```bash
|
|
176
186
|
export GOLOGIN_WEB_UNLOCKER_API_KEY="wu_..."
|
|
177
|
-
export
|
|
187
|
+
export GOLOGIN_TOKEN="gl_..."
|
|
178
188
|
export GOLOGIN_DEFAULT_PROFILE_ID="profile_123"
|
|
179
189
|
export GOLOGIN_DAEMON_PORT="4590"
|
|
180
190
|
```
|
|
@@ -188,8 +198,8 @@ gologin-web-access config init
|
|
|
188
198
|
Useful variants:
|
|
189
199
|
|
|
190
200
|
```bash
|
|
191
|
-
gologin-web-access config init --web-unlocker-api-key wu_... --
|
|
192
|
-
gologin-web-access config init --web-unlocker-key wu_... --
|
|
201
|
+
gologin-web-access config init --web-unlocker-api-key wu_... --token gl_...
|
|
202
|
+
gologin-web-access config init --web-unlocker-key wu_... --token gl_...
|
|
193
203
|
```
|
|
194
204
|
|
|
195
205
|
That writes `~/.gologin-web-access/config.json` once and the CLI will keep reading it on later runs.
|
|
@@ -211,7 +221,7 @@ Gologin Web Access will also read the older path `~/.gologin-web/config.json` if
|
|
|
211
221
|
Backward-compatible aliases are also accepted for existing setups:
|
|
212
222
|
|
|
213
223
|
- `GOLOGIN_WEBUNLOCKER_API_KEY`
|
|
214
|
-
- `
|
|
224
|
+
- `GOLOGIN_CLOUD_TOKEN`
|
|
215
225
|
- `GOLOGIN_PROFILE_ID`
|
|
216
226
|
|
|
217
227
|
Useful config commands:
|
|
@@ -259,7 +269,7 @@ gologin-web-access parse-document ./example.pdf
|
|
|
259
269
|
### Interact With A Site
|
|
260
270
|
|
|
261
271
|
```bash
|
|
262
|
-
export
|
|
272
|
+
export GOLOGIN_TOKEN="gl_..."
|
|
263
273
|
export GOLOGIN_DEFAULT_PROFILE_ID="profile_123"
|
|
264
274
|
|
|
265
275
|
gologin-web-access open https://example.com
|
|
@@ -280,7 +290,7 @@ gologin-web-access close
|
|
|
280
290
|
### Search In A Real Browser
|
|
281
291
|
|
|
282
292
|
```bash
|
|
283
|
-
export
|
|
293
|
+
export GOLOGIN_TOKEN="gl_..."
|
|
284
294
|
|
|
285
295
|
gologin-web-access search-browser "gologin antidetect browser"
|
|
286
296
|
gologin-web-access snapshot -i
|
|
@@ -299,7 +309,10 @@ gologin-web-access snapshot -i
|
|
|
299
309
|
- `batch-scrape --only-main-content` lets markdown, text, and html batch runs use the same readable-content isolation path as `read`.
|
|
300
310
|
- `crawl --only-main-content` uses the same readable-fragment extraction strategy for html, markdown, and text crawl output, but stays on the stateless unlocker path.
|
|
301
311
|
- `batch-scrape --summary` prints a one-line success/failure summary to `stderr` after the JSON payload.
|
|
312
|
+
- `batch-scrape` now returns exit code `0` on partial success by default and only fails the command when every URL failed. Add `--strict` if any single failed URL should make the whole batch exit non-zero.
|
|
313
|
+
- `batch-scrape --output <path>` writes the full JSON to disk so shells and agent consoles cannot truncate a large payload silently.
|
|
302
314
|
- `batch-scrape --format json` now returns the same structured scrape envelope as `scrape-json`, including `renderSource`, `fallbackAttempted`, `fallbackUsed`, and `request.attemptCount/retryCount/attempts`.
|
|
315
|
+
- `scrape-json` now surfaces explicit `BLOCKED_PAGE` failures when structured output clearly matches a challenge or block page, instead of silently looking like a valid empty result.
|
|
303
316
|
- `search` now returns `requestedLimit`, `returnedCount`, `warnings`, `cacheTtlMs`, and per-result `position`.
|
|
304
317
|
- `search` may return fewer results than the requested `--limit` when the upstream SERP contains fewer valid results; inspect `returnedCount`, `warnings`, and `attempts`.
|
|
305
318
|
- `change-track` now accepts `--retry`, `--backoff-ms`, and `--timeout-ms`, and JSON output includes request metadata.
|
package/dist/cli.js
CHANGED
|
@@ -154,6 +154,12 @@ async function main() {
|
|
|
154
154
|
configGroup.addCommand((0, configShow_1.buildConfigShowCommand)());
|
|
155
155
|
configGroup.addCommand((0, configInit_1.buildConfigInitCommand)());
|
|
156
156
|
program.addHelpText("after", `
|
|
157
|
+
Quick picks:
|
|
158
|
+
read Best default for "read this docs page/article" and main-content extraction
|
|
159
|
+
scrape-text Plain text from one known page when you do not need headings/links metadata
|
|
160
|
+
scrape-json Structured title, description, headings, and links from one known page
|
|
161
|
+
batch-scrape Fetch many known URLs at once; add --output <path> for large results and --strict only when partial success should fail the command
|
|
162
|
+
|
|
157
163
|
Command groups:
|
|
158
164
|
Scraping: gologin-web-access scrape|read|scrape-markdown|scrape-text|scrape-json|batch-scrape|batch-extract|search|map|crawl|crawl-start|crawl-status|crawl-result|crawl-errors|extract|change-track|batch-change-track|parse-document
|
|
159
165
|
Browser: gologin-web-access open|search-browser|scrape-screenshot|tabs|tabopen|tabfocus|tabclose|snapshot|click|dblclick|focus|type|fill|hover|select|check|uncheck|press|scroll|scrollintoview|wait|get|back|forward|reload|find|cookies|cookies-import|cookies-clear|storage-export|storage-import|storage-clear|eval|upload|pdf|screenshot|close|sessions|current
|
|
@@ -161,7 +167,7 @@ Command groups:
|
|
|
161
167
|
|
|
162
168
|
Key model:
|
|
163
169
|
${"GOLOGIN_WEB_UNLOCKER_API_KEY"} powers scraping commands.
|
|
164
|
-
${"
|
|
170
|
+
${"GOLOGIN_TOKEN"} powers browser commands.
|
|
165
171
|
Recommended setup: configure both keys up front, even if the current task only needs one path.
|
|
166
172
|
`);
|
|
167
173
|
await program.parseAsync(process.argv);
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
2
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
6
|
exports.buildBatchScrapeCommand = buildBatchScrapeCommand;
|
|
7
|
+
exports.resolveBatchScrapeExitCode = resolveBatchScrapeExitCode;
|
|
8
|
+
exports.shouldWarnAboutLargeBatchOutput = shouldWarnAboutLargeBatchOutput;
|
|
9
|
+
const fs_1 = require("fs");
|
|
10
|
+
const path_1 = __importDefault(require("path"));
|
|
4
11
|
const commander_1 = require("commander");
|
|
5
12
|
const config_1 = require("../config");
|
|
6
13
|
const output_1 = require("../lib/output");
|
|
@@ -17,7 +24,9 @@ function buildBatchScrapeCommand() {
|
|
|
17
24
|
.option("--fallback <mode>", "Structured scrape fallback: none or browser", "none")
|
|
18
25
|
.option("--source <source>", "Read source for --only-main-content mode: auto, unlocker, or browser", "auto")
|
|
19
26
|
.option("--only-main-content", "For html, markdown, or text formats, isolate the most readable content block per page")
|
|
27
|
+
.option("--output <path>", "Write the full batch result JSON to a file")
|
|
20
28
|
.option("--summary", "Print one-line summary stats to stderr after the JSON output")
|
|
29
|
+
.option("--strict", "Exit non-zero if any URL in the batch fails")
|
|
21
30
|
.action(async (urls, options) => {
|
|
22
31
|
const config = await (0, config_1.loadConfig)();
|
|
23
32
|
const format = normalizeFormat(options.format);
|
|
@@ -48,18 +57,29 @@ function buildBatchScrapeCommand() {
|
|
|
48
57
|
ok: false,
|
|
49
58
|
format,
|
|
50
59
|
error: error instanceof Error ? error.message : "Unknown error",
|
|
60
|
+
code: extractErrorCode(error),
|
|
51
61
|
status: extractStatusCode(error),
|
|
52
62
|
request,
|
|
53
63
|
};
|
|
54
64
|
}
|
|
55
65
|
});
|
|
56
|
-
|
|
66
|
+
const payload = `${JSON.stringify(results, null, 2)}\n`;
|
|
67
|
+
if (options.output) {
|
|
68
|
+
const outputPath = (0, shared_1.resolveOutputPath)(options.output);
|
|
69
|
+
await fs_1.promises.mkdir(path_1.default.dirname(outputPath), { recursive: true });
|
|
70
|
+
await fs_1.promises.writeFile(outputPath, payload, "utf8");
|
|
71
|
+
(0, output_1.printText)(outputPath);
|
|
72
|
+
}
|
|
73
|
+
else {
|
|
74
|
+
(0, output_1.printText)(payload);
|
|
75
|
+
if (shouldWarnAboutLargeBatchOutput(payload)) {
|
|
76
|
+
process.stderr.write("Batch output is large. If your shell or agent truncates stdout, rerun with --output <path> to keep the full JSON.\n");
|
|
77
|
+
}
|
|
78
|
+
}
|
|
57
79
|
if (options.summary) {
|
|
58
80
|
process.stderr.write(formatBatchSummary(results) + "\n");
|
|
59
81
|
}
|
|
60
|
-
|
|
61
|
-
process.exitCode = 1;
|
|
62
|
-
}
|
|
82
|
+
process.exitCode = resolveBatchScrapeExitCode(results, Boolean(options.strict));
|
|
63
83
|
})));
|
|
64
84
|
}
|
|
65
85
|
function normalizeFormat(value) {
|
|
@@ -120,6 +140,19 @@ function formatBatchSummary(results) {
|
|
|
120
140
|
const failed = requested - ok;
|
|
121
141
|
return `Summary: ${requested} requested, ${ok} ok, ${failed} failed.`;
|
|
122
142
|
}
|
|
143
|
+
/**
 * Choose the process exit code for a completed batch-scrape run.
 *
 * @param {Array<{ ok: boolean }>} results - Per-URL outcomes; only `ok` is inspected.
 * @param {boolean} strict - When truthy, any failed entry makes the batch fail.
 * @returns {0 | 1} 1 when nothing succeeded (including an empty batch) or when
 *   `strict` is set and at least one entry failed; otherwise 0.
 */
function resolveBatchScrapeExitCode(results, strict) {
    let succeeded = 0;
    for (const entry of results) {
        if (entry.ok) {
            succeeded += 1;
        }
    }
    const nothingSucceeded = succeeded === 0;
    const partialFailure = succeeded !== results.length;
    // Partial success only counts as a failure in strict mode.
    return nothingSucceeded || (strict && partialFailure) ? 1 : 0;
}
|
|
153
|
+
/**
 * Report whether a serialized batch payload is large enough to warrant a
 * "rerun with --output" hint.
 *
 * @param {string} payload - Serialized batch result.
 * @returns {boolean} True when `payload.length` is at least 100000.
 */
function shouldWarnAboutLargeBatchOutput(payload) {
    const largeOutputThreshold = 100000;
    const size = payload.length;
    return size >= largeOutputThreshold;
}
|
|
123
156
|
function extractStatusCode(error) {
|
|
124
157
|
if (typeof error === "object" &&
|
|
125
158
|
error !== null &&
|
|
@@ -138,3 +171,12 @@ function extractRequestMeta(error) {
|
|
|
138
171
|
}
|
|
139
172
|
return undefined;
|
|
140
173
|
}
|
|
174
|
+
/**
 * Pull a string `code` property off an arbitrary thrown value.
 *
 * @param {unknown} error - Whatever was caught.
 * @returns {string | undefined} The `code` when `error` is a non-null object
 *   carrying a string `code`; otherwise undefined.
 */
function extractErrorCode(error) {
    // Guard clauses: anything that is not a real object cannot carry a code.
    if (typeof error !== "object") {
        return undefined;
    }
    if (error === null) {
        return undefined;
    }
    if (!("code" in error)) {
        return undefined;
    }
    return typeof error.code === "string" ? error.code : undefined;
}
|
package/dist/commands/close.js
CHANGED
|
@@ -8,9 +8,13 @@ function buildCloseCommand() {
|
|
|
8
8
|
return new commander_1.Command("close")
|
|
9
9
|
.description("Close the current browser session or a specific session.")
|
|
10
10
|
.option("--session <id>", "Session ID. Defaults to the current session.")
|
|
11
|
+
.option("--all", "Close every tracked browser session in the current daemon.")
|
|
11
12
|
.action(async (options) => {
|
|
12
13
|
const config = await (0, config_1.loadConfig)();
|
|
13
14
|
const args = ["close"];
|
|
15
|
+
if (options.all) {
|
|
16
|
+
args.push("--all");
|
|
17
|
+
}
|
|
14
18
|
if (options.session) {
|
|
15
19
|
args.push("--session", options.session);
|
|
16
20
|
}
|
|
@@ -8,10 +8,11 @@ const output_1 = require("../lib/output");
|
|
|
8
8
|
const unlocker_1 = require("../lib/unlocker");
|
|
9
9
|
function buildConfigInitCommand() {
|
|
10
10
|
return new commander_1.Command("init")
|
|
11
|
-
.description("Write ~/.gologin-web-access/config.json with current values or placeholders. Recommended: persist both Web Unlocker and
|
|
11
|
+
.description("Write ~/.gologin-web-access/config.json with current values or placeholders. Recommended: persist both the Web Unlocker key and the GoLogin token.")
|
|
12
12
|
.option("--web-unlocker-api-key <key>", "Persist a Web Unlocker API key")
|
|
13
13
|
.option("--web-unlocker-key <key>", "Alias for --web-unlocker-api-key")
|
|
14
|
-
.option("--
|
|
14
|
+
.option("--token <token>", "Persist a GoLogin token")
|
|
15
|
+
.option("--cloud-token <token>", "Backward-compatible alias for --token")
|
|
15
16
|
.option("--default-profile-id <id>", "Persist a default Gologin profile ID")
|
|
16
17
|
.option("--daemon-port <port>", "Persist a daemon port", String(config_1.DEFAULT_DAEMON_PORT))
|
|
17
18
|
.option("--no-validate", "Skip live key validation after writing config")
|
|
@@ -20,7 +21,10 @@ function buildConfigInitCommand() {
|
|
|
20
21
|
const webUnlockerApiKey = options.webUnlockerApiKey ?? options.webUnlockerKey ?? process.env[config_1.ENV_NAMES.webUnlockerApiKey];
|
|
21
22
|
const result = await (0, config_1.initConfigFile)({
|
|
22
23
|
webUnlockerApiKey,
|
|
23
|
-
cloudToken: options.
|
|
24
|
+
cloudToken: options.token ??
|
|
25
|
+
options.cloudToken ??
|
|
26
|
+
process.env[config_1.ENV_NAMES.cloudToken] ??
|
|
27
|
+
process.env.GOLOGIN_CLOUD_TOKEN,
|
|
24
28
|
defaultProfileId: options.defaultProfileId ?? process.env[config_1.ENV_NAMES.defaultProfileId],
|
|
25
29
|
daemonPort: Number(options.daemonPort ?? process.env[config_1.ENV_NAMES.daemonPort] ?? config_1.DEFAULT_DAEMON_PORT),
|
|
26
30
|
}, {
|
|
@@ -37,7 +41,7 @@ function buildConfigInitCommand() {
|
|
|
37
41
|
value: result.config.webUnlockerApiKey ? "written" : "left empty",
|
|
38
42
|
},
|
|
39
43
|
{
|
|
40
|
-
label: "
|
|
44
|
+
label: "GoLogin token",
|
|
41
45
|
value: result.config.cloudToken ? "written" : "left empty",
|
|
42
46
|
},
|
|
43
47
|
{
|
|
@@ -50,7 +54,7 @@ function buildConfigInitCommand() {
|
|
|
50
54
|
},
|
|
51
55
|
]);
|
|
52
56
|
if (!result.config.webUnlockerApiKey || !result.config.cloudToken) {
|
|
53
|
-
(0, output_1.printText)("Recommended next step: configure both GOLOGIN_WEB_UNLOCKER_API_KEY and
|
|
57
|
+
(0, output_1.printText)("Recommended next step: configure both GOLOGIN_WEB_UNLOCKER_API_KEY and GOLOGIN_TOKEN so agents can use scraping and browser flows without asking again.");
|
|
54
58
|
}
|
|
55
59
|
if (options.validate === false) {
|
|
56
60
|
return;
|
|
@@ -66,7 +70,7 @@ function buildConfigInitCommand() {
|
|
|
66
70
|
if (result.config.cloudToken) {
|
|
67
71
|
const validation = await (0, cloudApi_1.validateCloudToken)(result.config.cloudToken);
|
|
68
72
|
validationRows.push({
|
|
69
|
-
label: "
|
|
73
|
+
label: "GoLogin token validation",
|
|
70
74
|
value: validation.ok ? "ok" : `failed${validation.status ? ` (${validation.status})` : ""}: ${validation.detail}`,
|
|
71
75
|
});
|
|
72
76
|
}
|
|
@@ -19,6 +19,15 @@ function buildScrapeJsonCommand() {
|
|
|
19
19
|
profile: options.profile,
|
|
20
20
|
request: (0, shared_1.normalizeUnlockerRequestOptions)(options),
|
|
21
21
|
});
|
|
22
|
+
if (envelope.fallbackAttempted) {
|
|
23
|
+
const fallbackStatus = envelope.fallbackUsed
|
|
24
|
+
? "Browser fallback succeeded and replaced the unlocker result."
|
|
25
|
+
: `Browser fallback was attempted but not used. ${envelope.fallbackReason ?? "It did not improve the structured output."}`;
|
|
26
|
+
process.stderr.write(`${fallbackStatus}\n`);
|
|
27
|
+
}
|
|
28
|
+
if (envelope.warning) {
|
|
29
|
+
process.stderr.write(`${envelope.warning}\n`);
|
|
30
|
+
}
|
|
22
31
|
(0, output_1.printJson)(envelope);
|
|
23
32
|
})));
|
|
24
33
|
}
|
|
@@ -7,8 +7,17 @@ const agentCli_1 = require("../lib/agentCli");
|
|
|
7
7
|
function buildSessionsCommand() {
|
|
8
8
|
return new commander_1.Command("sessions")
|
|
9
9
|
.description("List active daemon-backed browser sessions.")
|
|
10
|
-
.
|
|
10
|
+
.option("--prune", "Close tracked sessions idle for too long before listing.")
|
|
11
|
+
.option("--older-than-ms <ms>", "Idle threshold used with --prune.")
|
|
12
|
+
.action(async (options) => {
|
|
11
13
|
const config = await (0, config_1.loadConfig)();
|
|
12
|
-
|
|
14
|
+
const args = ["sessions"];
|
|
15
|
+
if (options.prune) {
|
|
16
|
+
args.push("--prune");
|
|
17
|
+
}
|
|
18
|
+
if (options.olderThanMs) {
|
|
19
|
+
args.push("--older-than-ms", options.olderThanMs);
|
|
20
|
+
}
|
|
21
|
+
await (0, agentCli_1.runAgentCommand)(args, config);
|
|
13
22
|
});
|
|
14
23
|
}
|
package/dist/config.js
CHANGED
|
@@ -23,13 +23,13 @@ const CONFIG_FILENAME = "config.json";
|
|
|
23
23
|
exports.DEFAULT_DAEMON_PORT = 4590;
|
|
24
24
|
exports.ENV_NAMES = {
|
|
25
25
|
webUnlockerApiKey: "GOLOGIN_WEB_UNLOCKER_API_KEY",
|
|
26
|
-
cloudToken: "
|
|
26
|
+
cloudToken: "GOLOGIN_TOKEN",
|
|
27
27
|
defaultProfileId: "GOLOGIN_DEFAULT_PROFILE_ID",
|
|
28
28
|
daemonPort: "GOLOGIN_DAEMON_PORT",
|
|
29
29
|
};
|
|
30
30
|
const LEGACY_ENV_NAMES = {
|
|
31
31
|
webUnlockerApiKey: ["GOLOGIN_WEBUNLOCKER_API_KEY"],
|
|
32
|
-
cloudToken: ["
|
|
32
|
+
cloudToken: ["GOLOGIN_CLOUD_TOKEN"],
|
|
33
33
|
defaultProfileId: ["GOLOGIN_PROFILE_ID"],
|
|
34
34
|
daemonPort: [],
|
|
35
35
|
};
|
package/dist/doctor.js
CHANGED
|
@@ -17,7 +17,7 @@ async function runDoctor(options = {}) {
|
|
|
17
17
|
detail: config.webUnlockerApiKey ? `configured via ${config.sources.webUnlockerApiKey}` : "missing",
|
|
18
18
|
});
|
|
19
19
|
checks.push({
|
|
20
|
-
name: "
|
|
20
|
+
name: "GoLogin token",
|
|
21
21
|
status: config.cloudToken ? "ok" : "warn",
|
|
22
22
|
detail: config.cloudToken ? `configured via ${config.sources.cloudToken}` : "missing",
|
|
23
23
|
});
|
|
@@ -25,7 +25,7 @@ async function runDoctor(options = {}) {
|
|
|
25
25
|
name: "Recommended full setup",
|
|
26
26
|
status: recommended.ready ? "ok" : "warn",
|
|
27
27
|
detail: recommended.ready
|
|
28
|
-
? "both GOLOGIN_WEB_UNLOCKER_API_KEY and
|
|
28
|
+
? "both GOLOGIN_WEB_UNLOCKER_API_KEY and GOLOGIN_TOKEN are configured"
|
|
29
29
|
: `missing ${recommended.missing.join(" and ")}`,
|
|
30
30
|
});
|
|
31
31
|
checks.push({
|
|
@@ -46,7 +46,7 @@ async function runDoctor(options = {}) {
|
|
|
46
46
|
checks.push({
|
|
47
47
|
name: "Default profile",
|
|
48
48
|
status: "warn",
|
|
49
|
-
detail: `${config.defaultProfileId} configured, but
|
|
49
|
+
detail: `${config.defaultProfileId} configured, but GOLOGIN_TOKEN is missing so existence could not be verified`,
|
|
50
50
|
});
|
|
51
51
|
}
|
|
52
52
|
else {
|
|
@@ -1,10 +1,20 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.runCloseCommand = runCloseCommand;
|
|
4
|
+
const errors_1 = require("../lib/errors");
|
|
4
5
|
const utils_1 = require("../lib/utils");
|
|
5
6
|
async function runCloseCommand(context, argv) {
|
|
6
7
|
const parsed = (0, utils_1.parseArgs)(argv);
|
|
8
|
+
const closeAll = (0, utils_1.getFlagBoolean)(parsed, "all");
|
|
7
9
|
const sessionId = (0, utils_1.getFlagString)(parsed, "session");
|
|
10
|
+
if (closeAll) {
|
|
11
|
+
if (sessionId) {
|
|
12
|
+
throw new errors_1.AppError("BAD_REQUEST", "--all cannot be combined with --session", 400);
|
|
13
|
+
}
|
|
14
|
+
const response = await context.client.request("POST", "/sessions/close-all");
|
|
15
|
+
context.stdout.write(`closed ${response.closed} session(s)\n`);
|
|
16
|
+
return;
|
|
17
|
+
}
|
|
8
18
|
const resolvedSessionId = sessionId ??
|
|
9
19
|
(await context.client.request("GET", "/sessions/current")).sessionId;
|
|
10
20
|
const response = await context.client.request("POST", `/sessions/${resolvedSessionId}/close`);
|
|
@@ -1,9 +1,27 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.runSessionsCommand = runSessionsCommand;
|
|
4
|
+
const errors_1 = require("../lib/errors");
|
|
4
5
|
const utils_1 = require("../lib/utils");
|
|
6
|
+
/**
 * Parse the optional `--older-than-ms` flag value.
 *
 * @param {string | undefined} value - Raw flag value. A falsy value (flag
 *   absent or empty string) means "not supplied".
 * @returns {number | undefined} The threshold as a non-negative integer, or
 *   undefined when the flag was not supplied.
 * @throws {AppError} BAD_REQUEST (400) when the value is blank or is not a
 *   non-negative integer.
 */
function parseOlderThanMs(value) {
    if (!value) {
        return undefined;
    }
    // Number("   ") evaluates to 0, so a whitespace-only value would otherwise
    // pass the integer check and be accepted as a 0 ms idle threshold.
    const isBlank = typeof value === "string" && value.trim() === "";
    const parsed = Number(value);
    if (isBlank || !Number.isInteger(parsed) || parsed < 0) {
        throw new errors_1.AppError("BAD_REQUEST", "--older-than-ms must be a non-negative integer", 400);
    }
    return parsed;
}
|
|
5
16
|
async function runSessionsCommand(context, argv) {
|
|
6
|
-
(0, utils_1.parseArgs)(argv);
|
|
17
|
+
const parsed = (0, utils_1.parseArgs)(argv);
|
|
18
|
+
if ((0, utils_1.getFlagBoolean)(parsed, "prune")) {
|
|
19
|
+
const olderThanMs = parseOlderThanMs((0, utils_1.getFlagString)(parsed, "older-than-ms"));
|
|
20
|
+
const prune = await context.client.request("POST", "/sessions/prune", {
|
|
21
|
+
maxIdleMs: olderThanMs,
|
|
22
|
+
});
|
|
23
|
+
context.stderr.write(`pruned ${prune.closed} session(s) idle for at least ${prune.maxIdleMs}ms\n`);
|
|
24
|
+
}
|
|
7
25
|
const response = await context.client.request("GET", "/sessions");
|
|
8
26
|
if (response.sessions.length === 0) {
|
|
9
27
|
context.stdout.write("no sessions\n");
|
|
@@ -53,6 +53,15 @@ async function handleRequest(request, response) {
|
|
|
53
53
|
(0, utils_1.writeJsonResponse)(response, 200, await sessionManager.currentSession());
|
|
54
54
|
return;
|
|
55
55
|
}
|
|
56
|
+
if (method === "POST" && pathname === "/sessions/close-all") {
|
|
57
|
+
(0, utils_1.writeJsonResponse)(response, 200, await sessionManager.closeAll());
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
if (method === "POST" && pathname === "/sessions/prune") {
|
|
61
|
+
const body = (await (0, utils_1.readJsonBody)(request));
|
|
62
|
+
(0, utils_1.writeJsonResponse)(response, 200, await sessionManager.pruneSessions(body?.maxIdleMs));
|
|
63
|
+
return;
|
|
64
|
+
}
|
|
56
65
|
if (method === "POST" && pathname === "/sessions/open") {
|
|
57
66
|
const body = (await (0, utils_1.readJsonBody)(request));
|
|
58
67
|
(0, utils_1.writeJsonResponse)(response, 200, await sessionManager.open(body));
|
|
@@ -13,6 +13,8 @@ const refStore_1 = require("./refStore");
|
|
|
13
13
|
const snapshot_1 = require("./snapshot");
|
|
14
14
|
class SessionManager {
|
|
15
15
|
config;
|
|
16
|
+
static DEFAULT_PRUNE_IDLE_MS = 10 * 60 * 1000;
|
|
17
|
+
static CLOUD_SLOT_RELEASE_WAIT_MS = 3_000;
|
|
16
18
|
sessions = new Map();
|
|
17
19
|
activeSessionId;
|
|
18
20
|
refStore = new refStore_1.RefStore();
|
|
@@ -38,6 +40,32 @@ class SessionManager {
|
|
|
38
40
|
}
|
|
39
41
|
return Date.now() - lastActivityAt > session.idleTimeoutMs;
|
|
40
42
|
}
|
|
43
|
+
sessionIdleMs(session) {
|
|
44
|
+
const lastActivityAt = Date.parse(session.lastActivityAt);
|
|
45
|
+
if (Number.isNaN(lastActivityAt)) {
|
|
46
|
+
return 0;
|
|
47
|
+
}
|
|
48
|
+
return Math.max(0, Date.now() - lastActivityAt);
|
|
49
|
+
}
|
|
50
|
+
/**
 * Recognize the "max parallel cloud launches limit" connection failure.
 *
 * @param {unknown} error - Value caught while opening a session.
 * @returns {boolean} True only for an `AppError` whose code is
 *   `BROWSER_CONNECTION_FAILED` and whose message matches the limit text
 *   (case-insensitive).
 */
isCloudSlotLimitError(error) {
    if (!(error instanceof errors_1.AppError)) {
        return false;
    }
    if (error.code !== "BROWSER_CONNECTION_FAILED") {
        return false;
    }
    return /max parallel cloud launches limit/i.test(error.message);
}
|
|
55
|
+
async pruneInactiveSessions(maxIdleMs = SessionManager.DEFAULT_PRUNE_IDLE_MS) {
|
|
56
|
+
const closedSessionIds = [];
|
|
57
|
+
for (const session of Array.from(this.sessions.values())) {
|
|
58
|
+
if (this.sessionIdleMs(session) < maxIdleMs) {
|
|
59
|
+
continue;
|
|
60
|
+
}
|
|
61
|
+
closedSessionIds.push(session.sessionId);
|
|
62
|
+
await this.destroySession(session);
|
|
63
|
+
}
|
|
64
|
+
return closedSessionIds;
|
|
65
|
+
}
|
|
66
|
+
async waitForCloudSlotRelease() {
|
|
67
|
+
await new Promise((resolve) => setTimeout(resolve, SessionManager.CLOUD_SLOT_RELEASE_WAIT_MS));
|
|
68
|
+
}
|
|
41
69
|
async destroySession(session) {
|
|
42
70
|
await (0, browser_1.closeSessionHandles)(session).catch(() => undefined);
|
|
43
71
|
this.sessions.delete(session.sessionId);
|
|
@@ -116,6 +144,30 @@ class SessionManager {
|
|
|
116
144
|
throw new errors_1.AppError("BAD_REQUEST", "--idle-timeout-ms must be a positive integer", 400);
|
|
117
145
|
}
|
|
118
146
|
}
|
|
147
|
+
async createSessionRecord(token, sessionId, profileId, request, createdAt, resolvedProxy, autoCreatedProfile) {
|
|
148
|
+
const connection = await (0, browser_1.connectToBrowser)(this.config, token, profileId);
|
|
149
|
+
const currentUrl = await (0, browser_1.navigatePage)(connection.page, request.url, this.config.navigationTimeoutMs);
|
|
150
|
+
const lastActivityAt = this.nowIso();
|
|
151
|
+
if (!resolvedProxy && profileId) {
|
|
152
|
+
resolvedProxy = await (0, browser_1.getCloudProfileProxy)(token, profileId).catch(() => undefined);
|
|
153
|
+
}
|
|
154
|
+
return {
|
|
155
|
+
sessionId,
|
|
156
|
+
profileId,
|
|
157
|
+
autoCreatedProfile,
|
|
158
|
+
connectUrl: connection.connectUrl,
|
|
159
|
+
browser: connection.browser,
|
|
160
|
+
context: connection.context,
|
|
161
|
+
page: connection.page,
|
|
162
|
+
currentUrl,
|
|
163
|
+
hasSnapshot: false,
|
|
164
|
+
staleSnapshot: false,
|
|
165
|
+
proxy: resolvedProxy,
|
|
166
|
+
createdAt,
|
|
167
|
+
lastActivityAt,
|
|
168
|
+
idleTimeoutMs: request.idleTimeoutMs
|
|
169
|
+
};
|
|
170
|
+
}
|
|
119
171
|
async resolveTargetLocator(session, target) {
|
|
120
172
|
if ((0, utils_1.isRefTarget)(target)) {
|
|
121
173
|
const descriptor = this.refStore.get(session.sessionId, target);
|
|
@@ -142,6 +194,7 @@ class SessionManager {
|
|
|
142
194
|
async open(request) {
|
|
143
195
|
const token = this.requireToken();
|
|
144
196
|
this.validateIdleTimeout(request.idleTimeoutMs);
|
|
197
|
+
await this.pruneInactiveSessions();
|
|
145
198
|
if (request.profileId && request.proxy) {
|
|
146
199
|
throw new errors_1.AppError("BAD_REQUEST", "proxy flags cannot be combined with --profile", 400);
|
|
147
200
|
}
|
|
@@ -180,35 +233,36 @@ class SessionManager {
|
|
|
180
233
|
autoCreatedProfile = true;
|
|
181
234
|
}
|
|
182
235
|
try {
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
236
|
+
let session;
|
|
237
|
+
try {
|
|
238
|
+
session = await this.createSessionRecord(token, sessionId, profileId, request, createdAt, resolvedProxy, autoCreatedProfile);
|
|
239
|
+
}
|
|
240
|
+
catch (error) {
|
|
241
|
+
if (!this.isCloudSlotLimitError(error)) {
|
|
242
|
+
throw error;
|
|
243
|
+
}
|
|
244
|
+
if (this.sessions.size === 0) {
|
|
245
|
+
throw new errors_1.AppError("BROWSER_CONNECTION_FAILED", `${error.message}. No tracked local sessions were available to close. Wait for cloud slots to free up or close stale sessions from another daemon, then retry.`, error.status, error.details);
|
|
246
|
+
}
|
|
247
|
+
const closedSessionIds = (await this.closeAll()).closedSessionIds;
|
|
248
|
+
await this.waitForCloudSlotRelease();
|
|
249
|
+
try {
|
|
250
|
+
session = await this.createSessionRecord(token, sessionId, profileId, request, createdAt, resolvedProxy, autoCreatedProfile);
|
|
251
|
+
}
|
|
252
|
+
catch (retryError) {
|
|
253
|
+
if (retryError instanceof errors_1.AppError && retryError.code === "BROWSER_CONNECTION_FAILED") {
|
|
254
|
+
throw new errors_1.AppError(retryError.code, `${retryError.message}. Closed tracked sessions (${closedSessionIds.join(", ")}) and retried once, but the cloud slot was still unavailable.`, retryError.status, retryError.details);
|
|
255
|
+
}
|
|
256
|
+
throw retryError;
|
|
257
|
+
}
|
|
188
258
|
}
|
|
189
|
-
const session = {
|
|
190
|
-
sessionId,
|
|
191
|
-
profileId,
|
|
192
|
-
autoCreatedProfile,
|
|
193
|
-
connectUrl: connection.connectUrl,
|
|
194
|
-
browser: connection.browser,
|
|
195
|
-
context: connection.context,
|
|
196
|
-
page: connection.page,
|
|
197
|
-
currentUrl,
|
|
198
|
-
hasSnapshot: false,
|
|
199
|
-
staleSnapshot: false,
|
|
200
|
-
proxy: resolvedProxy,
|
|
201
|
-
createdAt,
|
|
202
|
-
lastActivityAt,
|
|
203
|
-
idleTimeoutMs: request.idleTimeoutMs
|
|
204
|
-
};
|
|
205
259
|
this.sessions.set(sessionId, session);
|
|
206
260
|
this.activeSessionId = sessionId;
|
|
207
261
|
this.refStore.clear(sessionId);
|
|
208
262
|
return {
|
|
209
263
|
sessionId,
|
|
210
264
|
profileId,
|
|
211
|
-
url: currentUrl,
|
|
265
|
+
url: session.currentUrl,
|
|
212
266
|
proxy: session.proxy,
|
|
213
267
|
idleTimeoutMs: session.idleTimeoutMs
|
|
214
268
|
};
|
|
@@ -436,10 +490,11 @@ class SessionManager {
|
|
|
436
490
|
value
|
|
437
491
|
};
|
|
438
492
|
}
|
|
439
|
-
|
|
493
|
+
const resolvedTarget = target ?? (kind === "text" || kind === "html" ? "body" : undefined);
|
|
494
|
+
if (!resolvedTarget) {
|
|
440
495
|
throw new errors_1.AppError("BAD_REQUEST", `get ${kind} requires a target`, 400);
|
|
441
496
|
}
|
|
442
|
-
const locator = await this.resolveTargetLocator(session,
|
|
497
|
+
const locator = await this.resolveTargetLocator(session, resolvedTarget);
|
|
443
498
|
const value = await (0, browser_1.readLocatorValue)(locator, kind, this.config.actionTimeoutMs);
|
|
444
499
|
this.markSessionState(session);
|
|
445
500
|
return {
|
|
@@ -673,12 +728,26 @@ class SessionManager {
|
|
|
673
728
|
async currentSession() {
|
|
674
729
|
return this.toSummary(await this.getSessionOrThrow());
|
|
675
730
|
}
|
|
731
|
+
async pruneSessions(maxIdleMs = SessionManager.DEFAULT_PRUNE_IDLE_MS) {
|
|
732
|
+
const closedSessionIds = await this.pruneInactiveSessions(maxIdleMs);
|
|
733
|
+
return {
|
|
734
|
+
closed: closedSessionIds.length,
|
|
735
|
+
closedSessionIds,
|
|
736
|
+
maxIdleMs,
|
|
737
|
+
};
|
|
738
|
+
}
|
|
676
739
|
async closeAll() {
|
|
740
|
+
const closedSessionIds = [];
|
|
677
741
|
for (const session of Array.from(this.sessions.values())) {
|
|
742
|
+
closedSessionIds.push(session.sessionId);
|
|
678
743
|
await this.destroySession(session);
|
|
679
744
|
}
|
|
680
745
|
this.sessions.clear();
|
|
681
746
|
this.activeSessionId = undefined;
|
|
747
|
+
return {
|
|
748
|
+
closed: closedSessionIds.length,
|
|
749
|
+
closedSessionIds,
|
|
750
|
+
};
|
|
682
751
|
}
|
|
683
752
|
}
|
|
684
753
|
exports.SessionManager = SessionManager;
|
|
@@ -26,7 +26,7 @@ const errors_1 = require("./errors");
|
|
|
26
26
|
function parseArgs(argv) {
|
|
27
27
|
const positional = [];
|
|
28
28
|
const flags = {};
|
|
29
|
-
const booleanFlags = new Set(["interactive", "exact", "annotate", "press-escape", "json", "clear"]);
|
|
29
|
+
const booleanFlags = new Set(["interactive", "exact", "annotate", "press-escape", "json", "clear", "all", "prune"]);
|
|
30
30
|
for (let index = 0; index < argv.length; index += 1) {
|
|
31
31
|
const token = argv[index];
|
|
32
32
|
if (token === "-i") {
|
package/dist/lib/browserRead.js
CHANGED
|
@@ -43,7 +43,7 @@ function ensureBrowserCommandOk(step, response, url) {
|
|
|
43
43
|
return;
|
|
44
44
|
}
|
|
45
45
|
const message = response.stderr.trim() || response.stdout.trim() || `Browser command failed for ${url}`;
|
|
46
|
-
throw
|
|
46
|
+
throw (0, errors_1.createBrowserCommandError)(step, url, message);
|
|
47
47
|
}
|
|
48
48
|
function buildReadableExtractionExpression() {
|
|
49
49
|
return `(() => {
|
|
@@ -30,7 +30,7 @@ function ensureBrowserCommandOk(step, response, url) {
|
|
|
30
30
|
return;
|
|
31
31
|
}
|
|
32
32
|
const message = response.stderr.trim() || response.stdout.trim() || `Browser command failed for ${url}`;
|
|
33
|
-
throw
|
|
33
|
+
throw (0, errors_1.createBrowserCommandError)(step, url, message);
|
|
34
34
|
}
|
|
35
35
|
function buildStructuredExtractionExpression() {
|
|
36
36
|
return `(() => {
|
package/dist/lib/errors.js
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.DaemonError = exports.HttpError = exports.MissingCredentialError = exports.SilentExitError = exports.CliError = void 0;
|
|
4
|
+
exports.createBrowserCommandError = createBrowserCommandError;
|
|
4
5
|
exports.toCliError = toCliError;
|
|
5
6
|
class CliError extends Error {
|
|
6
7
|
exitCode;
|
|
7
8
|
hint;
|
|
8
|
-
|
|
9
|
+
code;
|
|
10
|
+
constructor(message, exitCode = 1, hint, code) {
|
|
9
11
|
super(message);
|
|
10
12
|
this.name = new.target.name;
|
|
11
13
|
this.exitCode = exitCode;
|
|
12
14
|
this.hint = hint;
|
|
15
|
+
this.code = code;
|
|
13
16
|
}
|
|
14
17
|
}
|
|
15
18
|
exports.CliError = CliError;
|
|
@@ -23,7 +26,7 @@ class MissingCredentialError extends CliError {
|
|
|
23
26
|
constructor(envName, commandGroup) {
|
|
24
27
|
super(`Missing ${envName}. This is required for ${commandGroup}.`, 1, [
|
|
25
28
|
"This CLI only reads credentials from environment variables or ~/.gologin-web-access/config.json.",
|
|
26
|
-
"Recommended setup: configure both GOLOGIN_WEB_UNLOCKER_API_KEY and
|
|
29
|
+
"Recommended setup: configure both GOLOGIN_WEB_UNLOCKER_API_KEY and GOLOGIN_TOKEN up front so agents do not stop to ask again.",
|
|
27
30
|
`Set ${envName} in your environment or add it to ~/.gologin-web-access/config.json.`,
|
|
28
31
|
"Helpful commands: gologin-web-access config init, gologin-web-access config show, gologin-web-access doctor.",
|
|
29
32
|
].join("\n"));
|
|
@@ -44,6 +47,31 @@ class DaemonError extends CliError {
|
|
|
44
47
|
}
|
|
45
48
|
}
|
|
46
49
|
exports.DaemonError = DaemonError;
|
|
50
|
+
function createBrowserCommandError(step, url, rawMessage) {
|
|
51
|
+
const normalized = rawMessage.trim() || `Browser command failed for ${url}`;
|
|
52
|
+
if (/max parallel cloud launches limit/i.test(normalized)) {
|
|
53
|
+
return new CliError(`Cloud Browser ${step} failed: max parallel cloud launches limit reached.`, 1, [
|
|
54
|
+
"Close stale cloud sessions, run sessions --prune, or switch to gologin-local-agent-browser if the task can run locally.",
|
|
55
|
+
normalized,
|
|
56
|
+
].join("\n"), "CLOUD_SLOT_EXHAUSTED");
|
|
57
|
+
}
|
|
58
|
+
if (/BROWSER_CONNECTION_FAILED|connect ECONNREFUSED|connection failed/i.test(normalized)) {
|
|
59
|
+
return new CliError(`Cloud Browser ${step} failed: browser connection could not be established.`, 1, normalized, "BROWSER_CONNECTION_FAILED");
|
|
60
|
+
}
|
|
61
|
+
if (/(^|\D)403(\D|$)/.test(normalized)) {
|
|
62
|
+
return new CliError(`Cloud Browser ${step} failed with 403.`, 1, [
|
|
63
|
+
"Cloud Browser rejected the session. This can mean missing access, plan restrictions, or stale backend state.",
|
|
64
|
+
normalized,
|
|
65
|
+
].join("\n"), "CLOUD_BROWSER_403");
|
|
66
|
+
}
|
|
67
|
+
if (/(^|\D)503(\D|$)/.test(normalized)) {
|
|
68
|
+
return new CliError(`Cloud Browser ${step} failed with 503.`, 1, [
|
|
69
|
+
"Cloud Browser is temporarily unavailable or overloaded. Retry shortly or switch to a local GoLogin profile if the task can run locally.",
|
|
70
|
+
normalized,
|
|
71
|
+
].join("\n"), "CLOUD_BROWSER_503");
|
|
72
|
+
}
|
|
73
|
+
return new CliError(`Cloud Browser ${step} failed for ${url}.`, 1, normalized, "CLOUD_BROWSER_FAILED");
|
|
74
|
+
}
|
|
47
75
|
function toCliError(error) {
|
|
48
76
|
if (error instanceof CliError) {
|
|
49
77
|
return error;
|
package/dist/lib/output.js
CHANGED
package/dist/lib/readSource.js
CHANGED
|
@@ -79,7 +79,7 @@ async function readRenderedHtmlContent(url, config, apiKey, options = {}) {
|
|
|
79
79
|
renderSource: "unlocker",
|
|
80
80
|
fallbackAttempted: true,
|
|
81
81
|
fallbackUsed: false,
|
|
82
|
-
fallbackReason: `${assessment.reason};
|
|
82
|
+
fallbackReason: `${assessment.reason}; GOLOGIN_TOKEN is not configured`,
|
|
83
83
|
request: unlocker.request,
|
|
84
84
|
};
|
|
85
85
|
}
|
|
@@ -180,7 +180,7 @@ async function readReadableContent(url, config, apiKey, options) {
|
|
|
180
180
|
renderSource: "unlocker",
|
|
181
181
|
fallbackAttempted: true,
|
|
182
182
|
fallbackUsed: false,
|
|
183
|
-
fallbackReason: `${assessment.reason};
|
|
183
|
+
fallbackReason: `${assessment.reason}; GOLOGIN_TOKEN is not configured`,
|
|
184
184
|
request: unlocker.request,
|
|
185
185
|
};
|
|
186
186
|
}
|
package/dist/lib/search.js
CHANGED
|
@@ -317,7 +317,7 @@ async function searchViaUnlocker(query, config, options, engine) {
|
|
|
317
317
|
}
|
|
318
318
|
async function searchViaBrowser(query, config, options, engine) {
|
|
319
319
|
if (!config.cloudToken) {
|
|
320
|
-
throw new errors_1.CliError("Missing
|
|
320
|
+
throw new errors_1.CliError("Missing GOLOGIN_TOKEN for browser search fallback.");
|
|
321
321
|
}
|
|
322
322
|
const sessionId = `search-${(0, crypto_1.randomUUID)()}`;
|
|
323
323
|
const searchUrl = buildSearchUrl(engine, query, options);
|
|
@@ -4,9 +4,28 @@ exports.scrapeStructuredJson = scrapeStructuredJson;
|
|
|
4
4
|
exports.makeStructuredScrapeEnvelope = makeStructuredScrapeEnvelope;
|
|
5
5
|
exports.normalizeStructuredFallbackMode = normalizeStructuredFallbackMode;
|
|
6
6
|
exports.shouldUseBrowserFallback = shouldUseBrowserFallback;
|
|
7
|
+
exports.buildStructuredFallbackAdvisory = buildStructuredFallbackAdvisory;
|
|
8
|
+
exports.detectStructuredBlockReason = detectStructuredBlockReason;
|
|
7
9
|
const config_1 = require("../config");
|
|
10
|
+
const errors_1 = require("./errors");
|
|
8
11
|
const browserStructured_1 = require("./browserStructured");
|
|
9
12
|
const unlocker_1 = require("./unlocker");
|
|
13
|
+
class StructuredBlockedPageError extends errors_1.CliError {
|
|
14
|
+
status;
|
|
15
|
+
request;
|
|
16
|
+
constructor(url, status, request, reason, options) {
|
|
17
|
+
super(`Structured scrape returned a likely blocked or challenge page for ${url}.`, 1, [
|
|
18
|
+
`Reason: ${reason}.`,
|
|
19
|
+
options.fallbackAttempted
|
|
20
|
+
? options.fallbackUsed
|
|
21
|
+
? "Browser fallback was used but the page still looked blocked."
|
|
22
|
+
: `Browser fallback was attempted but not used. ${options.fallbackReason ?? "It did not improve the result."}`
|
|
23
|
+
: "Retry with --fallback browser, use read --source browser, or switch to gologin-local-agent-browser for full rendered DOM.",
|
|
24
|
+
].join("\n"), "BLOCKED_PAGE");
|
|
25
|
+
this.status = status;
|
|
26
|
+
this.request = request;
|
|
27
|
+
}
|
|
28
|
+
}
|
|
10
29
|
async function scrapeStructuredJson(url, config, apiKey, options = {}) {
|
|
11
30
|
const result = await (0, unlocker_1.scrapeJson)(url, apiKey, options.request);
|
|
12
31
|
const fallbackMode = options.fallback ?? "none";
|
|
@@ -15,6 +34,7 @@ async function scrapeStructuredJson(url, config, apiKey, options = {}) {
|
|
|
15
34
|
let fallbackAttempted = false;
|
|
16
35
|
let fallbackUsed = false;
|
|
17
36
|
let fallbackReason;
|
|
37
|
+
let { browserRecommended, warning } = buildStructuredFallbackAdvisory(data);
|
|
18
38
|
if (fallbackMode === "browser" && shouldUseBrowserFallback(data)) {
|
|
19
39
|
fallbackAttempted = true;
|
|
20
40
|
(0, config_1.requireCloudToken)(config);
|
|
@@ -26,16 +46,28 @@ async function scrapeStructuredJson(url, config, apiKey, options = {}) {
|
|
|
26
46
|
renderSource = "browser";
|
|
27
47
|
fallbackUsed = true;
|
|
28
48
|
fallbackReason = "unlocker structured data looked incomplete";
|
|
49
|
+
browserRecommended = false;
|
|
50
|
+
warning = undefined;
|
|
29
51
|
}
|
|
30
52
|
else {
|
|
31
53
|
fallbackReason = "browser fallback did not improve structured output";
|
|
32
54
|
}
|
|
33
55
|
}
|
|
56
|
+
const blockedReason = detectStructuredBlockReason(data);
|
|
57
|
+
if (blockedReason) {
|
|
58
|
+
throw new StructuredBlockedPageError(url, result.status, result.request, blockedReason, {
|
|
59
|
+
fallbackAttempted,
|
|
60
|
+
fallbackUsed,
|
|
61
|
+
fallbackReason,
|
|
62
|
+
});
|
|
63
|
+
}
|
|
34
64
|
return makeStructuredScrapeEnvelope(url, result, data, {
|
|
35
65
|
renderSource,
|
|
36
66
|
fallbackAttempted,
|
|
37
67
|
fallbackUsed,
|
|
38
68
|
fallbackReason,
|
|
69
|
+
browserRecommended,
|
|
70
|
+
warning,
|
|
39
71
|
});
|
|
40
72
|
}
|
|
41
73
|
function makeStructuredScrapeEnvelope(url, result, data, options = {}) {
|
|
@@ -46,6 +78,8 @@ function makeStructuredScrapeEnvelope(url, result, data, options = {}) {
|
|
|
46
78
|
fallbackAttempted: options.fallbackAttempted ?? false,
|
|
47
79
|
fallbackUsed: options.fallbackUsed ?? false,
|
|
48
80
|
fallbackReason: options.fallbackReason,
|
|
81
|
+
browserRecommended: options.browserRecommended,
|
|
82
|
+
warning: options.warning,
|
|
49
83
|
request: result.request,
|
|
50
84
|
data,
|
|
51
85
|
};
|
|
@@ -60,16 +94,40 @@ function normalizeStructuredFallbackMode(value) {
|
|
|
60
94
|
throw new Error(`Unsupported scrape-json fallback mode: ${value}`);
|
|
61
95
|
}
|
|
62
96
|
function shouldUseBrowserFallback(data) {
|
|
97
|
+
if (detectStructuredBlockReason(data)) {
|
|
98
|
+
return true;
|
|
99
|
+
}
|
|
63
100
|
const firstH1 = data.headingsByLevel.h1[0];
|
|
64
101
|
if (!firstH1) {
|
|
65
102
|
return true;
|
|
66
103
|
}
|
|
67
104
|
return looksSuspiciousHeadingText(firstH1);
|
|
68
105
|
}
|
|
106
|
+
function buildStructuredFallbackAdvisory(data) {
|
|
107
|
+
const blockedReason = detectStructuredBlockReason(data);
|
|
108
|
+
if (blockedReason) {
|
|
109
|
+
return {
|
|
110
|
+
browserRecommended: true,
|
|
111
|
+
warning: `Structured output looks blocked or challenge-gated (${blockedReason}). Retry with --fallback browser or use a rendered browser path.`,
|
|
112
|
+
};
|
|
113
|
+
}
|
|
114
|
+
if (!shouldUseBrowserFallback(data)) {
|
|
115
|
+
return { browserRecommended: false };
|
|
116
|
+
}
|
|
117
|
+
return {
|
|
118
|
+
browserRecommended: true,
|
|
119
|
+
warning: "Structured output looks incomplete or client-rendered. Retry with --fallback browser or use read/open for rendered DOM.",
|
|
120
|
+
};
|
|
121
|
+
}
|
|
69
122
|
function looksSuspiciousHeadingText(value) {
|
|
70
123
|
return /function\s*\(|window\.|document\.|const\s+|let\s+|var\s+|=>|import\s+/i.test(value) || value.length > 240;
|
|
71
124
|
}
|
|
72
125
|
function isBrowserDataBetter(current, candidate) {
|
|
126
|
+
const currentBlocked = Boolean(detectStructuredBlockReason(current));
|
|
127
|
+
const candidateBlocked = Boolean(detectStructuredBlockReason(candidate));
|
|
128
|
+
if (currentBlocked !== candidateBlocked) {
|
|
129
|
+
return currentBlocked && !candidateBlocked;
|
|
130
|
+
}
|
|
73
131
|
if (candidate.headingsByLevel.h1.length > current.headingsByLevel.h1.length) {
|
|
74
132
|
return true;
|
|
75
133
|
}
|
|
@@ -81,3 +139,28 @@ function isBrowserDataBetter(current, candidate) {
|
|
|
81
139
|
}
|
|
82
140
|
return false;
|
|
83
141
|
}
|
|
142
|
+
function detectStructuredBlockReason(data) {
|
|
143
|
+
const candidates = [
|
|
144
|
+
data.title,
|
|
145
|
+
data.description,
|
|
146
|
+
...data.headingsByLevel.h1.slice(0, 2),
|
|
147
|
+
...data.headingsByLevel.h2.slice(0, 2),
|
|
148
|
+
].filter((value) => Boolean(value && value.trim()));
|
|
149
|
+
for (const candidate of candidates) {
|
|
150
|
+
const reason = classifyBlockedText(candidate);
|
|
151
|
+
if (reason) {
|
|
152
|
+
return reason;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
return undefined;
|
|
156
|
+
}
|
|
157
|
+
function classifyBlockedText(value) {
|
|
158
|
+
const text = value.trim();
|
|
159
|
+
if (/(verify you are human|verify you are a human|are you human|captcha|security check|attention required|just a moment|checking your browser|enable javascript and cookies to continue|one more step)/i.test(text)) {
|
|
160
|
+
return "challenge markers matched the page title or heading";
|
|
161
|
+
}
|
|
162
|
+
if (/(access denied|forbidden|blocked request|request blocked|request unsuccessful|temporarily blocked|temporarily unavailable|you have been blocked|access to this page has been denied)/i.test(text)) {
|
|
163
|
+
return "blocked-page markers matched the page title or heading";
|
|
164
|
+
}
|
|
165
|
+
return undefined;
|
|
166
|
+
}
|
package/package.json
CHANGED