gologin-web-access 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,12 @@
5
5
  - browser automation is now embedded directly in `gologin-web-access`, so one repo and one install contains both Web Unlocker and Cloud Browser flows
6
6
  - doctor now reports the embedded browser runtime source and version
7
7
 
8
+ ## 0.3.2 - 2026-04-03
9
+
10
+ - added unified page outcome classification across `read`, `scrape-json`, and `batch-scrape`
11
+ - structured and readable paths now distinguish `ok`, `empty`, `incomplete`, `authwall`, `challenge`, `blocked`, and `cookie_wall`
12
+ - batch and extract-oriented flows now propagate next-step hints and fallback metadata more consistently for agents
13
+
8
14
  ## 0.1.0 - 2026-03-10
9
15
 
10
16
  Initial public release of Gologin Web Access.
package/README.md CHANGED
@@ -305,6 +305,7 @@ gologin-web-access snapshot -i
305
305
  - `batch-extract` reuses the same extraction path across many URLs and returns one structured result per URL, including request and fallback metadata. Add `--output <path>` to save the full array directly.
306
306
  - `scrape-json` now returns both a flat `headings` array and `headingsByLevel` buckets for `h1` through `h6`.
307
307
  - `scrape-json --fallback browser` is available for JS-heavy pages where stateless extraction returns weak heading data.
308
+ - `scrape-json` now also classifies the page outcome as `ok`, `empty`, `incomplete`, `authwall`, `challenge`, `blocked`, or `cookie_wall`, and includes `nextActionHint` when the result is weak or gated.
308
309
  - `scrape`, `scrape-markdown`, `scrape-text`, `scrape-json`, and `batch-scrape` accept `--retry`, `--backoff-ms`, and `--timeout-ms`.
309
310
  - `batch-scrape --only-main-content` lets markdown, text, and html batch runs use the same readable-content isolation path as `read`.
310
311
  - `crawl --only-main-content` uses the same readable-fragment extraction strategy for html, markdown, and text crawl output, but stays on the stateless unlocker path.
@@ -312,6 +313,7 @@ gologin-web-access snapshot -i
312
313
  - `batch-scrape` now returns exit code `0` on partial success by default and only fails the command when every URL failed. Add `--strict` if any single failed URL should make the whole batch exit non-zero.
313
314
  - `batch-scrape --output <path>` writes the full JSON to disk so shells and agent consoles cannot truncate a large payload silently.
314
315
  - `batch-scrape --format json` now returns the same structured scrape envelope as `scrape-json`, including `renderSource`, `fallbackAttempted`, `fallbackUsed`, and `request.attemptCount/retryCount/attempts`.
316
+ - `batch-scrape --only-main-content` now propagates `outcome`, `outcomeReason`, `nextActionHint`, and fallback metadata per URL so agents can tell "weak page" from "gated page" without scraping log text.
315
317
  - `scrape-json` now surfaces explicit `BLOCKED_PAGE` failures when structured output clearly matches a challenge or block page, instead of silently looking like a valid empty result.
316
318
  - `search` now returns `requestedLimit`, `returnedCount`, `warnings`, `cacheTtlMs`, and per-result `position`.
317
319
  - `search` may return fewer results than the requested `--limit` when the upstream SERP contains fewer valid results; inspect `returnedCount`, `warnings`, and `attempts`.
package/dist/cli.js CHANGED
@@ -67,7 +67,7 @@ const wait_1 = require("./commands/wait");
67
67
  const doctor_1 = require("./doctor");
68
68
  const errors_1 = require("./lib/errors");
69
69
  const output_1 = require("./lib/output");
70
- const CLI_VERSION = "0.3.0";
70
+ const CLI_VERSION = "0.3.2";
71
71
  async function main() {
72
72
  const program = new commander_1.Command();
73
73
  program
@@ -47,7 +47,16 @@ function buildBatchScrapeCommand() {
47
47
  url,
48
48
  ok: true,
49
49
  format,
50
- output,
50
+ output: output.output,
51
+ outcome: output.outcome,
52
+ outcomeReason: output.outcomeReason,
53
+ nextActionHint: output.nextActionHint,
54
+ renderSource: output.renderSource,
55
+ fallbackAttempted: output.fallbackAttempted,
56
+ fallbackUsed: output.fallbackUsed,
57
+ fallbackReason: output.fallbackReason,
58
+ warning: output.warning,
59
+ request: output.request,
51
60
  };
52
61
  }
53
62
  catch (error) {
@@ -59,6 +68,8 @@ function buildBatchScrapeCommand() {
59
68
  error: error instanceof Error ? error.message : "Unknown error",
60
69
  code: extractErrorCode(error),
61
70
  status: extractStatusCode(error),
71
+ outcome: extractOutcome(error),
72
+ nextActionHint: extractNextActionHint(error),
62
73
  request,
63
74
  };
64
75
  }
@@ -97,31 +108,67 @@ async function formatOutput(url, config, apiKey, format, requestOptions, fallbac
97
108
  };
98
109
  switch (format) {
99
110
  case "html":
100
- return (await (0, readSource_1.readHtmlContent)(url, config, apiKey, readOptions)).content;
111
+ return mapReadableBatchResult(await (0, readSource_1.readHtmlContent)(url, config, apiKey, readOptions));
101
112
  case "markdown":
102
- return (await (0, readSource_1.readMarkdownContent)(url, config, apiKey, readOptions)).content;
113
+ return mapReadableBatchResult(await (0, readSource_1.readMarkdownContent)(url, config, apiKey, readOptions));
103
114
  case "text":
104
- return (await (0, readSource_1.readTextContent)(url, config, apiKey, readOptions)).content;
115
+ return mapReadableBatchResult(await (0, readSource_1.readTextContent)(url, config, apiKey, readOptions));
105
116
  default:
106
117
  break;
107
118
  }
108
119
  }
109
120
  switch (format) {
110
121
  case "html":
111
- return (await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey, requestOptions)).content;
122
+ return {
123
+ output: (await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey, requestOptions)).content,
124
+ };
112
125
  case "markdown":
113
- return (await (0, unlocker_1.scrapeMarkdown)(url, apiKey, requestOptions)).markdown;
126
+ return {
127
+ output: (await (0, unlocker_1.scrapeMarkdown)(url, apiKey, requestOptions)).markdown,
128
+ };
114
129
  case "text":
115
- return (await (0, unlocker_1.scrapeText)(url, apiKey, requestOptions)).text;
130
+ return {
131
+ output: (await (0, unlocker_1.scrapeText)(url, apiKey, requestOptions)).text,
132
+ };
116
133
  case "json":
117
- return await (0, structuredScrape_1.scrapeStructuredJson)(url, config, apiKey, {
134
+ return mapStructuredBatchResult(await (0, structuredScrape_1.scrapeStructuredJson)(url, config, apiKey, {
118
135
  fallback,
119
136
  request: requestOptions,
120
- });
137
+ }));
121
138
  default:
122
- return (await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey, requestOptions)).content;
139
+ return {
140
+ output: (await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey, requestOptions)).content,
141
+ };
123
142
  }
124
143
  }
144
/**
 * Reshapes a readable-content result into the per-URL batch entry fields.
 *
 * The `content` property is exposed as `output`, and `outcomeReason` and the
 * other metadata fields are copied through unchanged (missing fields stay
 * `undefined`).
 *
 * @param {object} result - Result returned by one of the readable readers.
 * @returns {object} Batch entry fragment with `output` plus outcome/fallback metadata.
 */
function mapReadableBatchResult(result) {
    const {
        content,
        outcome,
        outcomeReason,
        nextActionHint,
        renderSource,
        fallbackAttempted,
        fallbackUsed,
        fallbackReason,
        warning,
        request,
    } = result;
    return {
        output: content,
        outcome,
        outcomeReason,
        nextActionHint,
        renderSource,
        fallbackAttempted,
        fallbackUsed,
        fallbackReason,
        warning,
        request,
    };
}
158
/**
 * Reshapes a structured scrape envelope into the per-URL batch entry fields.
 *
 * The whole envelope is kept as `output`; its outcome and fallback metadata
 * are additionally copied to the top level of the returned object.
 *
 * @param {object} result - Envelope returned by the structured scrape.
 * @returns {object} Batch entry fragment with `output` plus outcome/fallback metadata.
 */
function mapStructuredBatchResult(result) {
    const {
        outcome,
        outcomeReason,
        nextActionHint,
        renderSource,
        fallbackAttempted,
        fallbackUsed,
        fallbackReason,
        warning,
        request,
    } = result;
    return {
        output: result,
        outcome,
        outcomeReason,
        nextActionHint,
        renderSource,
        fallbackAttempted,
        fallbackUsed,
        fallbackReason,
        warning,
        request,
    };
}
125
172
  async function mapWithConcurrency(items, concurrency, mapper) {
126
173
  const results = new Array(items.length);
127
174
  let nextIndex = 0;
@@ -180,3 +227,21 @@ function extractErrorCode(error) {
180
227
  }
181
228
  return undefined;
182
229
  }
230
/**
 * Reads a string `outcome` property off a thrown value, if it has one.
 *
 * @param {unknown} error - Whatever was caught.
 * @returns {string | undefined} The `outcome` string, or `undefined` when the
 *   value is not a non-null object carrying a string `outcome`.
 */
function extractOutcome(error) {
    if (error === null || typeof error !== "object") {
        return undefined;
    }
    if (!("outcome" in error)) {
        return undefined;
    }
    const candidate = error.outcome;
    return typeof candidate === "string" ? candidate : undefined;
}
239
/**
 * Reads a string `nextActionHint` property off a thrown value, if it has one.
 *
 * @param {unknown} error - Whatever was caught.
 * @returns {string | undefined} The hint string, or `undefined` when the value
 *   is not a non-null object carrying a string `nextActionHint`.
 */
function extractNextActionHint(error) {
    if (error === null || typeof error !== "object") {
        return undefined;
    }
    if (!("nextActionHint" in error)) {
        return undefined;
    }
    const candidate = error.nextActionHint;
    return typeof candidate === "string" ? candidate : undefined;
}
@@ -3,6 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.buildReadCommand = buildReadCommand;
4
4
  const commander_1 = require("commander");
5
5
  const config_1 = require("../config");
6
+ const pageOutcome_1 = require("../lib/pageOutcome");
6
7
  const readSource_1 = require("../lib/readSource");
7
8
  const output_1 = require("../lib/output");
8
9
  const shared_1 = require("./shared");
@@ -27,7 +28,7 @@ function buildReadCommand() {
27
28
  : format === "markdown"
28
29
  ? await (0, readSource_1.readMarkdownContent)(url, config, apiKey, readOptions)
29
30
  : await (0, readSource_1.readTextContent)(url, config, apiKey, readOptions);
30
- emitReadNotice(result.fallbackAttempted, result.fallbackUsed, result.fallbackReason);
31
+ emitReadNotice(result);
31
32
  (0, output_1.printText)(result.content);
32
33
  })));
33
34
  }
@@ -37,15 +38,24 @@ function normalizeReadFormat(value) {
37
38
  }
38
39
  throw new Error(`Unsupported read format: ${value}`);
39
40
  }
40
- function emitReadNotice(fallbackAttempted, fallbackUsed, fallbackReason) {
41
- if (!fallbackAttempted) {
42
- return;
41
+ function emitReadNotice(result) {
42
+ if (result.fallbackAttempted) {
43
+ if (result.fallbackUsed) {
44
+ process.stderr.write(`JS-rendered page detected, retrying with browser. ${result.fallbackReason ?? ""}\n`);
45
+ }
46
+ else if (result.fallbackReason) {
47
+ process.stderr.write(`${result.fallbackReason}\n`);
48
+ }
49
+ }
50
+ if (result.outcome !== "ok") {
51
+ process.stderr.write(`Outcome: ${result.outcome}\n`);
43
52
  }
44
- if (fallbackUsed) {
45
- process.stderr.write(`JS-rendered page detected, retrying with browser. ${fallbackReason ?? ""}\n`);
53
+ if (result.warning) {
54
+ process.stderr.write(`${result.warning}\n`);
46
55
  return;
47
56
  }
48
- if (fallbackReason) {
49
- process.stderr.write(`${fallbackReason}\n`);
57
+ const hint = (0, pageOutcome_1.describeNextActionHint)(result.nextActionHint);
58
+ if (hint && result.outcome !== "ok") {
59
+ process.stderr.write(`${hint}\n`);
50
60
  }
51
61
  }
@@ -3,6 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.buildScrapeMarkdownCommand = buildScrapeMarkdownCommand;
4
4
  const commander_1 = require("commander");
5
5
  const config_1 = require("../config");
6
+ const pageOutcome_1 = require("../lib/pageOutcome");
6
7
  const readSource_1 = require("../lib/readSource");
7
8
  const shared_1 = require("./shared");
8
9
  const output_1 = require("../lib/output");
@@ -19,19 +20,28 @@ function buildScrapeMarkdownCommand() {
19
20
  source,
20
21
  request: (0, shared_1.normalizeUnlockerRequestOptions)(options),
21
22
  });
22
- emitReadNotice(result.fallbackAttempted, result.fallbackUsed, result.fallbackReason);
23
+ emitReadNotice(result);
23
24
  (0, output_1.printText)(result.content);
24
25
  }));
25
26
  }
26
- function emitReadNotice(fallbackAttempted, fallbackUsed, fallbackReason) {
27
- if (!fallbackAttempted) {
28
- return;
27
+ function emitReadNotice(result) {
28
+ if (result.fallbackAttempted) {
29
+ if (result.fallbackUsed) {
30
+ process.stderr.write(`JS-rendered page detected, retrying with browser. ${result.fallbackReason ?? ""}\n`);
31
+ }
32
+ else if (result.fallbackReason) {
33
+ process.stderr.write(`${result.fallbackReason}\n`);
34
+ }
35
+ }
36
+ if (result.outcome !== "ok") {
37
+ process.stderr.write(`Outcome: ${result.outcome}\n`);
29
38
  }
30
- if (fallbackUsed) {
31
- process.stderr.write(`JS-rendered page detected, retrying with browser. ${fallbackReason ?? ""}\n`);
39
+ if (result.warning) {
40
+ process.stderr.write(`${result.warning}\n`);
32
41
  return;
33
42
  }
34
- if (fallbackReason) {
35
- process.stderr.write(`${fallbackReason}\n`);
43
+ const hint = (0, pageOutcome_1.describeNextActionHint)(result.nextActionHint);
44
+ if (hint && result.outcome !== "ok") {
45
+ process.stderr.write(`${hint}\n`);
36
46
  }
37
47
  }
@@ -3,6 +3,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.buildScrapeTextCommand = buildScrapeTextCommand;
4
4
  const commander_1 = require("commander");
5
5
  const config_1 = require("../config");
6
+ const pageOutcome_1 = require("../lib/pageOutcome");
6
7
  const readSource_1 = require("../lib/readSource");
7
8
  const shared_1 = require("./shared");
8
9
  const output_1 = require("../lib/output");
@@ -19,19 +20,28 @@ function buildScrapeTextCommand() {
19
20
  source,
20
21
  request: (0, shared_1.normalizeUnlockerRequestOptions)(options),
21
22
  });
22
- emitReadNotice(result.fallbackAttempted, result.fallbackUsed, result.fallbackReason);
23
+ emitReadNotice(result);
23
24
  (0, output_1.printText)(result.content);
24
25
  }));
25
26
  }
26
- function emitReadNotice(fallbackAttempted, fallbackUsed, fallbackReason) {
27
- if (!fallbackAttempted) {
28
- return;
27
+ function emitReadNotice(result) {
28
+ if (result.fallbackAttempted) {
29
+ if (result.fallbackUsed) {
30
+ process.stderr.write(`JS-rendered page detected, retrying with browser. ${result.fallbackReason ?? ""}\n`);
31
+ }
32
+ else if (result.fallbackReason) {
33
+ process.stderr.write(`${result.fallbackReason}\n`);
34
+ }
35
+ }
36
+ if (result.outcome !== "ok") {
37
+ process.stderr.write(`Outcome: ${result.outcome}\n`);
29
38
  }
30
- if (fallbackUsed) {
31
- process.stderr.write(`JS-rendered page detected, retrying with browser. ${fallbackReason ?? ""}\n`);
39
+ if (result.warning) {
40
+ process.stderr.write(`${result.warning}\n`);
32
41
  return;
33
42
  }
34
- if (fallbackReason) {
35
- process.stderr.write(`${fallbackReason}\n`);
43
+ const hint = (0, pageOutcome_1.describeNextActionHint)(result.nextActionHint);
44
+ if (hint && result.outcome !== "ok") {
45
+ process.stderr.write(`${hint}\n`);
36
46
  }
37
47
  }
@@ -16,6 +16,10 @@ async function extractUrlWithSchema(url, config, apiKey, schema, options = {}) {
16
16
  fallbackAttempted: rendered.fallbackAttempted,
17
17
  fallbackUsed: rendered.fallbackUsed,
18
18
  fallbackReason: rendered.fallbackReason,
19
+ outcome: rendered.outcome,
20
+ outcomeReason: rendered.outcomeReason,
21
+ nextActionHint: rendered.nextActionHint,
22
+ warning: rendered.warning,
19
23
  request: rendered.request,
20
24
  extracted: (0, extract_1.extractWithSchema)(rendered.html, schema),
21
25
  };
@@ -0,0 +1,192 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.assessStructuredPageOutcome = assessStructuredPageOutcome;
4
+ exports.assessReadablePageOutcome = assessReadablePageOutcome;
5
+ exports.detectStructuredChallengeReason = detectStructuredChallengeReason;
6
+ exports.detectStructuredBlockedReason = detectStructuredBlockedReason;
7
+ exports.detectStructuredAuthwallReason = detectStructuredAuthwallReason;
8
+ exports.detectStructuredCookieWallReason = detectStructuredCookieWallReason;
9
+ exports.describeNextActionHint = describeNextActionHint;
10
+ const cheerio_1 = require("cheerio");
11
/**
 * Classifies structured scrape data into a page outcome.
 *
 * Checks run in a fixed order and the first hit wins: authwall, challenge,
 * blocked, cookie wall, empty, incomplete. When none apply the page is `ok`.
 *
 * @param {object} data - Structured page data (title, description, canonical,
 *   headings, headingsByLevel, links are read by the helpers used here).
 * @returns {object} Either a full assessment from `buildAssessment` or
 *   `{ outcome: "ok", browserRecommended: false }`.
 */
function assessStructuredPageOutcome(data) {
    // Ordered [outcome, detector, next-action hint]; later detectors are only
    // invoked when every earlier one found nothing.
    const gateChecks = [
        ["authwall", detectStructuredAuthwallReason, "use_logged_in_session"],
        ["challenge", detectStructuredChallengeReason, "use_local_profile"],
        ["blocked", detectStructuredBlockedReason, "use_local_profile"],
        ["cookie_wall", detectStructuredCookieWallReason, "retry_with_browser"],
    ];
    for (const [outcome, detect, hint] of gateChecks) {
        const matchedReason = detect(data);
        if (matchedReason) {
            return buildAssessment(outcome, matchedReason, hint, true);
        }
    }
    if (isStructuredDataEmpty(data)) {
        return buildAssessment("empty", "Structured output contained almost no readable fields", "retry_with_browser", true);
    }
    if (isStructuredDataLikelyIncomplete(data)) {
        return buildAssessment("incomplete", "Structured output looks incomplete or client-rendered", "retry_with_browser", true);
    }
    return { outcome: "ok", browserRecommended: false };
}
39
/**
 * Classifies a readable (html/markdown/text) page result into a page outcome.
 *
 * The title, first h1, canonical link and the first 1,500 characters of the
 * whitespace-normalised content are scanned for gate markers. Order of
 * precedence: canonical authwall path, authwall text, challenge, blocked,
 * cookie wall, empty content, caller-flagged incompleteness, otherwise `ok`.
 *
 * @param {string} html - Raw page HTML, parsed with cheerio.
 * @param {string} content - Extracted readable content for the page.
 * @param {{ looksIncomplete?: boolean, incompleteReason?: string }} [options]
 *   Caller's own incompleteness verdict and its reason.
 * @returns {object} Assessment from `buildAssessment`, or
 *   `{ outcome: "ok", browserRecommended: false }`.
 */
function assessReadablePageOutcome(html, content, options = {}) {
    const $ = (0, cheerio_1.load)(html);
    const pageTitle = $("title").first().text().trim();
    const leadHeading = $("h1").first().text().trim();
    const canonicalHref = ($("link[rel='canonical']").attr("href") ?? "").trim();
    const bodyText = normalizeText(content);
    const fieldCount = $("input, textarea, select").length;
    const formTotal = $("form").length;
    const samples = [pageTitle, leadHeading, canonicalHref, bodyText.slice(0, 1_500)].filter(Boolean);
    // Returns the first reason any sample yields for the given classifier.
    const firstMatch = (classify) => {
        for (const sample of samples) {
            const matchedReason = classify(sample);
            if (matchedReason) {
                return matchedReason;
            }
        }
        return undefined;
    };
    if (canonicalHref.includes("/authwall")) {
        return buildAssessment("authwall", "Canonical URL points to an authwall path", "use_logged_in_session", true);
    }
    // Authwall wording only counts when the page has form controls or is short.
    const authwallPlausible = formTotal > 0 || fieldCount > 0 || bodyText.length < 3_000;
    const authwallReason = authwallPlausible ? firstMatch(classifyAuthwallText) : undefined;
    if (authwallReason) {
        return buildAssessment("authwall", authwallReason, "use_logged_in_session", true);
    }
    const challengeReason = firstMatch(classifyChallengeText);
    if (challengeReason) {
        return buildAssessment("challenge", challengeReason, "use_local_profile", true);
    }
    const blockedReason = firstMatch(classifyBlockedText);
    if (blockedReason) {
        return buildAssessment("blocked", blockedReason, "use_local_profile", true);
    }
    // Cookie wording only counts when the page has a form or is short.
    const cookieWallPlausible = formTotal > 0 || bodyText.length < 2_500;
    const cookieWallReason = cookieWallPlausible ? firstMatch(classifyCookieWallText) : undefined;
    if (cookieWallReason) {
        return buildAssessment("cookie_wall", cookieWallReason, "retry_with_browser", true);
    }
    if (bodyText.length === 0) {
        return buildAssessment("empty", "Readable output was empty", "retry_with_browser", true);
    }
    if (options.looksIncomplete) {
        return buildAssessment("incomplete", options.incompleteReason ?? "Readable output looks incomplete", "retry_with_browser", true);
    }
    return { outcome: "ok", browserRecommended: false };
}
91
/**
 * Looks for challenge/verification markers in structured page data.
 *
 * @param {object} data - Structured page data.
 * @returns {string | undefined} The matched reason, or `undefined` if none.
 */
function detectStructuredChallengeReason(data) {
    const challengeReason = findStructuredReason(data, classifyChallengeText);
    return challengeReason;
}
94
/**
 * Looks for blocked-page markers in structured page data.
 *
 * @param {object} data - Structured page data.
 * @returns {string | undefined} The matched reason, or `undefined` if none.
 */
function detectStructuredBlockedReason(data) {
    const blockedReason = findStructuredReason(data, classifyBlockedText);
    return blockedReason;
}
97
/**
 * Looks for login/signup wall markers in structured page data.
 *
 * A canonical URL containing `/authwall` is decisive on its own; otherwise the
 * text fields are scanned with the authwall classifier.
 *
 * @param {object} data - Structured page data.
 * @returns {string | undefined} The matched reason, or `undefined` if none.
 */
function detectStructuredAuthwallReason(data) {
    const canonicalUrl = data.canonical ?? "";
    return canonicalUrl.includes("/authwall")
        ? "Canonical URL points to an authwall path"
        : findStructuredReason(data, classifyAuthwallText);
}
103
/**
 * Looks for cookie/consent wall markers in structured page data.
 *
 * @param {object} data - Structured page data.
 * @returns {string | undefined} The matched reason, or `undefined` if none.
 */
function detectStructuredCookieWallReason(data) {
    const cookieWallReason = findStructuredReason(data, classifyCookieWallText);
    return cookieWallReason;
}
106
/**
 * Translates a machine-readable next-action hint into a human-readable sentence.
 *
 * @param {string | undefined} hint - One of `retry_with_browser`,
 *   `use_logged_in_session`, `use_local_profile`.
 * @returns {string | undefined} The sentence for a known hint, otherwise `undefined`.
 */
function describeNextActionHint(hint) {
    // A Map (not a plain object) so inherited keys like "constructor" never match.
    const sentences = new Map([
        ["retry_with_browser", "Retry with --source browser or --fallback browser."],
        ["use_logged_in_session", "Use a logged-in browser session or a GoLogin profile with saved cookies."],
        ["use_local_profile", "Switch to gologin-local-agent-browser or another profile-backed browser path."],
    ]);
    return sentences.get(hint);
}
118
/**
 * Assembles an assessment object, including a ready-to-print warning.
 *
 * The warning reads "<Outcome> detected: <reason>." and, when the hint has a
 * known description, is followed by that description.
 *
 * @param {string} outcome - Outcome label, e.g. `authwall` or `cookie_wall`.
 * @param {string} reason - Why the outcome was chosen.
 * @param {string} nextActionHint - Machine-readable follow-up hint.
 * @param {boolean} browserRecommended - Whether a browser path is advised.
 * @returns {{ outcome: string, reason: string, nextActionHint: string,
 *   browserRecommended: boolean, warning: string }}
 */
function buildAssessment(outcome, reason, nextActionHint, browserRecommended) {
    const headline = `${capitalizeOutcome(outcome)} detected: ${reason}.`;
    const actionText = describeNextActionHint(nextActionHint);
    const warning = actionText ? `${headline} ${actionText}` : headline;
    return { outcome, reason, nextActionHint, browserRecommended, warning };
}
128
/**
 * Turns an outcome label into display text: underscores become spaces and a
 * leading word character is upper-cased (`cookie_wall` -> `Cookie wall`).
 *
 * @param {string} value - Outcome label.
 * @returns {string} Display form of the label.
 */
function capitalizeOutcome(value) {
    const spaced = value.split("_").join(" ");
    return spaced.replace(/^\w/, (first) => first.toUpperCase());
}
131
/**
 * Reports whether structured data carries essentially nothing: no title, no
 * description, no headings and no links.
 *
 * @param {object} data - Structured page data.
 * @returns {boolean} `true` when all four fields are empty.
 */
function isStructuredDataEmpty(data) {
    // Any title or description means the page is not empty; the arrays are
    // only inspected when both are missing.
    if (data.title || data.description) {
        return false;
    }
    return data.headings.length === 0 && data.links.length === 0;
}
134
/**
 * Heuristic for structured data that probably came from a page that was not
 * fully rendered.
 *
 * Incomplete when there is no first h1, when that h1 looks like leaked script
 * text, or when there is no title and fewer than two headings.
 *
 * @param {object} data - Structured page data.
 * @returns {boolean} `true` when the data looks incomplete.
 */
function isStructuredDataLikelyIncomplete(data) {
    const leadH1 = data.headingsByLevel.h1[0];
    return (!leadH1 ||
        looksSuspiciousHeadingText(leadH1) ||
        (!data.title && data.headings.length < 2));
}
147
/**
 * Heuristic for heading text that is really leaked script source rather than
 * a human-written heading.
 *
 * A heading is suspicious when it is longer than 240 characters or contains a
 * code-shaped fragment. Keywords are anchored on word boundaries and must be
 * followed by code-like context, so ordinary headings such as
 * "Outlet store hours", "Let it go" or "Import and export rules" are not
 * flagged merely for containing "let " or "import ".
 *
 * @param {string} value - Heading text.
 * @returns {boolean} `true` when the text looks like code or is overly long.
 */
function looksSuspiciousHeadingText(value) {
    if (value.length > 240) {
        return true;
    }
    // Input is at most 240 characters here, which bounds regex backtracking.
    const codeShapes = [
        // `function(` / `function (`
        /\bfunction\s*\(/i,
        // `window.foo` / `document.bar` (a member access, not a sentence end)
        /\b(?:window|document)\.[A-Za-z_$]/i,
        // `const x =`, `let {a, b} =`, `var y=` (a declaration that is assigned)
        /\b(?:const|let|var)\s+[^=]{1,80}=/i,
        // arrow function
        /=>/,
        // `import "m"` / `import x from 'm'` (ends in a quoted module specifier)
        /\bimport\s+(?:[^"']{1,120}\bfrom\s+)?["']/i,
    ];
    return codeShapes.some((shape) => shape.test(value));
}
150
/**
 * Runs a text classifier over the key text fields of structured page data and
 * returns the first reason it reports.
 *
 * Fields are tried in order: title, description, canonical, the first two h1
 * headings, then the first two h2 headings. Missing or blank values are skipped.
 *
 * @param {object} data - Structured page data.
 * @param {(text: string) => (string | undefined)} matcher - Classifier that
 *   returns a reason for matching text.
 * @returns {string | undefined} First truthy reason, or `undefined` if none.
 */
function findStructuredReason(data, matcher) {
    const hasText = (value) => Boolean(value && value.trim());
    const probes = [data.title, data.description, data.canonical]
        .concat(data.headingsByLevel.h1.slice(0, 2), data.headingsByLevel.h2.slice(0, 2))
        .filter(hasText);
    for (let index = 0; index < probes.length; index += 1) {
        const matchedReason = matcher(probes[index]);
        if (matchedReason) {
            return matchedReason;
        }
    }
    return undefined;
}
166
/**
 * Collapses every run of whitespace to a single space and strips leading and
 * trailing whitespace.
 *
 * @param {string} value - Text to normalise.
 * @returns {string} Single-spaced, trimmed text.
 */
function normalizeText(value) {
    // Splitting on whitespace runs and dropping empty edge pieces is the same
    // as collapsing runs to one space and trimming.
    const words = value.split(/\s+/).filter(Boolean);
    return words.join(" ");
}
169
/**
 * Detects human-verification / challenge wording in a piece of text.
 *
 * @param {string} value - Text to inspect (matched case-insensitively).
 * @returns {string | undefined} A fixed reason string on match, else `undefined`.
 */
function classifyChallengeText(value) {
    const challengeMarkers = /(verify you are human|verify you are a human|are you human|captcha|security check|attention required|just a moment|checking your browser|enable javascript and cookies to continue|one more step|security verification)/i;
    return challengeMarkers.test(value)
        ? "Challenge or verification markers matched the page"
        : undefined;
}
175
/**
 * Detects blocked / access-denied wording in a piece of text.
 *
 * @param {string} value - Text to inspect (matched case-insensitively).
 * @returns {string | undefined} A fixed reason string on match, else `undefined`.
 */
function classifyBlockedText(value) {
    const blockedMarkers = /(access denied|forbidden|blocked request|request blocked|request unsuccessful|temporarily blocked|temporarily unavailable|you have been blocked|access to this page has been denied)/i;
    return blockedMarkers.test(value)
        ? "Blocked-page markers matched the page"
        : undefined;
}
181
/**
 * Detects login / signup wall wording in a piece of text.
 *
 * @param {string} value - Text to inspect (matched case-insensitively).
 * @returns {string | undefined} A fixed reason string on match, else `undefined`.
 */
function classifyAuthwallText(value) {
    const authwallMarkers = /(sign in to view|sign in to continue|log in to continue|join to view|join now|join linkedin|join to continue|sign up to view|create a free account|view full profile|view profile|join to see more)/i;
    return authwallMarkers.test(value)
        ? "Login or signup wall markers matched the page"
        : undefined;
}
187
/**
 * Detects cookie / consent wall wording in a piece of text.
 *
 * @param {string} value - Text to inspect (matched case-insensitively).
 * @returns {string | undefined} A fixed reason string on match, else `undefined`.
 */
function classifyCookieWallText(value) {
    const cookieWallMarkers = /(cookie preferences|accept cookies|manage cookies|consent preferences|we use cookies|your privacy choices|respects your privacy)/i;
    return cookieWallMarkers.test(value)
        ? "Cookie or consent wall markers matched the page"
        : undefined;
}
@@ -10,6 +10,7 @@ exports.extractReadableSegmentFromHtml = extractReadableSegmentFromHtml;
10
10
  const cheerio_1 = require("cheerio");
11
11
  const config_1 = require("../config");
12
12
  const browserRead_1 = require("./browserRead");
13
+ const pageOutcome_1 = require("./pageOutcome");
13
14
  const unlocker_1 = require("./unlocker");
14
15
  function normalizeReadSourceMode(value, defaultMode = "auto") {
15
16
  if (!value) {
@@ -50,6 +51,7 @@ async function readRenderedHtmlContent(url, config, apiKey, options = {}) {
50
51
  renderSource: "browser",
51
52
  fallbackAttempted: false,
52
53
  fallbackUsed: false,
54
+ outcome: "ok",
53
55
  };
54
56
  }
55
57
  const unlocker = await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey, options.request);
@@ -59,17 +61,26 @@ async function readRenderedHtmlContent(url, config, apiKey, options = {}) {
59
61
  renderSource: "unlocker",
60
62
  fallbackAttempted: false,
61
63
  fallbackUsed: false,
64
+ outcome: "ok",
62
65
  request: unlocker.request,
63
66
  };
64
67
  }
65
68
  const unlockerText = (0, unlocker_1.htmlToText)(unlocker.content);
66
69
  const assessment = assessReadableContent(unlocker.content, unlockerText);
70
+ const outcomeAssessment = (0, pageOutcome_1.assessReadablePageOutcome)(unlocker.content, unlockerText, {
71
+ looksIncomplete: assessment.shouldFallback,
72
+ incompleteReason: assessment.reason,
73
+ });
67
74
  if (!assessment.shouldFallback) {
68
75
  return {
69
76
  html: unlocker.content,
70
77
  renderSource: "unlocker",
71
78
  fallbackAttempted: false,
72
79
  fallbackUsed: false,
80
+ outcome: outcomeAssessment.outcome,
81
+ outcomeReason: outcomeAssessment.reason,
82
+ nextActionHint: outcomeAssessment.nextActionHint,
83
+ warning: outcomeAssessment.warning,
73
84
  request: unlocker.request,
74
85
  };
75
86
  }
@@ -80,6 +91,10 @@ async function readRenderedHtmlContent(url, config, apiKey, options = {}) {
80
91
  fallbackAttempted: true,
81
92
  fallbackUsed: false,
82
93
  fallbackReason: `${assessment.reason}; GOLOGIN_TOKEN is not configured`,
94
+ outcome: outcomeAssessment.outcome,
95
+ outcomeReason: outcomeAssessment.reason,
96
+ nextActionHint: outcomeAssessment.nextActionHint,
97
+ warning: outcomeAssessment.warning,
83
98
  request: unlocker.request,
84
99
  };
85
100
  }
@@ -92,6 +107,7 @@ async function readRenderedHtmlContent(url, config, apiKey, options = {}) {
92
107
  fallbackAttempted: true,
93
108
  fallbackUsed: true,
94
109
  fallbackReason: assessment.reason,
110
+ outcome: "ok",
95
111
  request: unlocker.request,
96
112
  };
97
113
  }
@@ -150,27 +166,40 @@ async function readReadableContent(url, config, apiKey, options) {
150
166
  renderSource: "browser",
151
167
  fallbackAttempted: false,
152
168
  fallbackUsed: false,
169
+ outcome: "ok",
153
170
  };
154
171
  }
155
172
  const unlocker = await (0, unlocker_1.scrapeRenderedHtml)(url, apiKey, options.request);
156
173
  const readable = extractReadableSegmentFromHtml(unlocker.content);
157
174
  const unlockerContent = formatReadableContent(options.format, readable.html, readable.text);
175
+ const assessment = assessReadableContent(unlocker.content, unlockerContent);
176
+ const outcomeAssessment = (0, pageOutcome_1.assessReadablePageOutcome)(unlocker.content, unlockerContent, {
177
+ looksIncomplete: assessment.shouldFallback,
178
+ incompleteReason: assessment.reason,
179
+ });
158
180
  if (source === "unlocker") {
159
181
  return {
160
182
  content: unlockerContent,
161
183
  renderSource: "unlocker",
162
184
  fallbackAttempted: false,
163
185
  fallbackUsed: false,
186
+ outcome: outcomeAssessment.outcome,
187
+ outcomeReason: outcomeAssessment.reason,
188
+ nextActionHint: outcomeAssessment.nextActionHint,
189
+ warning: outcomeAssessment.warning,
164
190
  request: unlocker.request,
165
191
  };
166
192
  }
167
- const assessment = assessReadableContent(unlocker.content, unlockerContent);
168
193
  if (!assessment.shouldFallback) {
169
194
  return {
170
195
  content: unlockerContent,
171
196
  renderSource: "unlocker",
172
197
  fallbackAttempted: false,
173
198
  fallbackUsed: false,
199
+ outcome: outcomeAssessment.outcome,
200
+ outcomeReason: outcomeAssessment.reason,
201
+ nextActionHint: outcomeAssessment.nextActionHint,
202
+ warning: outcomeAssessment.warning,
174
203
  request: unlocker.request,
175
204
  };
176
205
  }
@@ -181,6 +210,10 @@ async function readReadableContent(url, config, apiKey, options) {
181
210
  fallbackAttempted: true,
182
211
  fallbackUsed: false,
183
212
  fallbackReason: `${assessment.reason}; GOLOGIN_TOKEN is not configured`,
213
+ outcome: outcomeAssessment.outcome,
214
+ outcomeReason: outcomeAssessment.reason,
215
+ nextActionHint: outcomeAssessment.nextActionHint,
216
+ warning: outcomeAssessment.warning,
184
217
  request: unlocker.request,
185
218
  };
186
219
  }
@@ -195,6 +228,10 @@ async function readReadableContent(url, config, apiKey, options) {
195
228
  fallbackAttempted: true,
196
229
  fallbackUsed: false,
197
230
  fallbackReason: "Browser fallback did not improve readable output",
231
+ outcome: outcomeAssessment.outcome,
232
+ outcomeReason: outcomeAssessment.reason,
233
+ nextActionHint: outcomeAssessment.nextActionHint,
234
+ warning: outcomeAssessment.warning,
198
235
  request: unlocker.request,
199
236
  };
200
237
  }
@@ -204,6 +241,7 @@ async function readReadableContent(url, config, apiKey, options) {
204
241
  fallbackAttempted: true,
205
242
  fallbackUsed: true,
206
243
  fallbackReason: assessment.reason,
244
+ outcome: "ok",
207
245
  request: unlocker.request,
208
246
  };
209
247
  }
@@ -9,21 +9,27 @@ exports.detectStructuredBlockReason = detectStructuredBlockReason;
9
9
  const config_1 = require("../config");
10
10
  const errors_1 = require("./errors");
11
11
  const browserStructured_1 = require("./browserStructured");
12
+ const pageOutcome_1 = require("./pageOutcome");
12
13
  const unlocker_1 = require("./unlocker");
13
14
  class StructuredBlockedPageError extends errors_1.CliError {
14
15
  status;
15
16
  request;
16
- constructor(url, status, request, reason, options) {
17
- super(`Structured scrape returned a likely blocked or challenge page for ${url}.`, 1, [
17
+ outcome;
18
+ nextActionHint;
19
+ constructor(url, status, request, outcome, reason, nextActionHint, options) {
20
+ super(`Structured scrape returned ${outcome.replace(/_/g, " ")} content for ${url}.`, 1, [
18
21
  `Reason: ${reason}.`,
19
22
  options.fallbackAttempted
20
23
  ? options.fallbackUsed
21
24
  ? "Browser fallback was used but the page still looked blocked."
22
25
  : `Browser fallback was attempted but not used. ${options.fallbackReason ?? "It did not improve the result."}`
23
- : "Retry with --fallback browser, use read --source browser, or switch to gologin-local-agent-browser for full rendered DOM.",
24
- ].join("\n"), "BLOCKED_PAGE");
26
+ : (0, pageOutcome_1.describeNextActionHint)(nextActionHint) ??
27
+ "Retry with --fallback browser, use read --source browser, or switch to gologin-local-agent-browser for full rendered DOM.",
28
+ ].join("\n"), outcomeToErrorCode(outcome));
25
29
  this.status = status;
26
30
  this.request = request;
31
+ this.outcome = outcome;
32
+ this.nextActionHint = nextActionHint;
27
33
  }
28
34
  }
29
35
  async function scrapeStructuredJson(url, config, apiKey, options = {}) {
@@ -34,7 +40,7 @@ async function scrapeStructuredJson(url, config, apiKey, options = {}) {
34
40
  let fallbackAttempted = false;
35
41
  let fallbackUsed = false;
36
42
  let fallbackReason;
37
- let { browserRecommended, warning } = buildStructuredFallbackAdvisory(data);
43
+ let { outcome, reason, nextActionHint, browserRecommended, warning } = (0, pageOutcome_1.assessStructuredPageOutcome)(data);
38
44
  if (fallbackMode === "browser" && shouldUseBrowserFallback(data)) {
39
45
  fallbackAttempted = true;
40
46
  (0, config_1.requireCloudToken)(config);
@@ -46,16 +52,14 @@ async function scrapeStructuredJson(url, config, apiKey, options = {}) {
46
52
  renderSource = "browser";
47
53
  fallbackUsed = true;
48
54
  fallbackReason = "unlocker structured data looked incomplete";
49
- browserRecommended = false;
50
- warning = undefined;
55
+ ({ outcome, reason, nextActionHint, browserRecommended, warning } = (0, pageOutcome_1.assessStructuredPageOutcome)(data));
51
56
  }
52
57
  else {
53
58
  fallbackReason = "browser fallback did not improve structured output";
54
59
  }
55
60
  }
56
- const blockedReason = detectStructuredBlockReason(data);
57
- if (blockedReason) {
58
- throw new StructuredBlockedPageError(url, result.status, result.request, blockedReason, {
61
+ if (outcome === "authwall" || outcome === "challenge" || outcome === "blocked" || outcome === "cookie_wall") {
62
+ throw new StructuredBlockedPageError(url, result.status, result.request, outcome, reason ?? "Outcome matched page markers", nextActionHint, {
59
63
  fallbackAttempted,
60
64
  fallbackUsed,
61
65
  fallbackReason,
@@ -63,6 +67,9 @@ async function scrapeStructuredJson(url, config, apiKey, options = {}) {
63
67
  }
64
68
  return makeStructuredScrapeEnvelope(url, result, data, {
65
69
  renderSource,
70
+ outcome,
71
+ outcomeReason: reason,
72
+ nextActionHint,
66
73
  fallbackAttempted,
67
74
  fallbackUsed,
68
75
  fallbackReason,
@@ -74,6 +81,9 @@ function makeStructuredScrapeEnvelope(url, result, data, options = {}) {
74
81
  return {
75
82
  url,
76
83
  status: result.status,
84
+ outcome: options.outcome ?? "ok",
85
+ outcomeReason: options.outcomeReason,
86
+ nextActionHint: options.nextActionHint,
77
87
  renderSource: options.renderSource ?? "unlocker",
78
88
  fallbackAttempted: options.fallbackAttempted ?? false,
79
89
  fallbackUsed: options.fallbackUsed ?? false,
@@ -94,37 +104,18 @@ function normalizeStructuredFallbackMode(value) {
94
104
  throw new Error(`Unsupported scrape-json fallback mode: ${value}`);
95
105
  }
96
106
  function shouldUseBrowserFallback(data) {
97
- if (detectStructuredBlockReason(data)) {
98
- return true;
99
- }
100
- const firstH1 = data.headingsByLevel.h1[0];
101
- if (!firstH1) {
102
- return true;
103
- }
104
- return looksSuspiciousHeadingText(firstH1);
107
+ return (0, pageOutcome_1.assessStructuredPageOutcome)(data).outcome !== "ok";
105
108
  }
106
109
  function buildStructuredFallbackAdvisory(data) {
107
- const blockedReason = detectStructuredBlockReason(data);
108
- if (blockedReason) {
109
- return {
110
- browserRecommended: true,
111
- warning: `Structured output looks blocked or challenge-gated (${blockedReason}). Retry with --fallback browser or use a rendered browser path.`,
112
- };
113
- }
114
- if (!shouldUseBrowserFallback(data)) {
115
- return { browserRecommended: false };
116
- }
110
+ const assessment = (0, pageOutcome_1.assessStructuredPageOutcome)(data);
117
111
  return {
118
- browserRecommended: true,
119
- warning: "Structured output looks incomplete or client-rendered. Retry with --fallback browser or use read/open for rendered DOM.",
112
+ browserRecommended: assessment.browserRecommended,
113
+ warning: assessment.warning,
120
114
  };
121
115
  }
122
- function looksSuspiciousHeadingText(value) {
123
- return /function\s*\(|window\.|document\.|const\s+|let\s+|var\s+|=>|import\s+/i.test(value) || value.length > 240;
124
- }
125
116
  function isBrowserDataBetter(current, candidate) {
126
- const currentBlocked = Boolean(detectStructuredBlockReason(current));
127
- const candidateBlocked = Boolean(detectStructuredBlockReason(candidate));
117
+ const currentBlocked = Boolean((0, pageOutcome_1.detectStructuredBlockedReason)(current));
118
+ const candidateBlocked = Boolean((0, pageOutcome_1.detectStructuredBlockedReason)(candidate));
128
119
  if (currentBlocked !== candidateBlocked) {
129
120
  return currentBlocked && !candidateBlocked;
130
121
  }
@@ -140,27 +131,25 @@ function isBrowserDataBetter(current, candidate) {
140
131
  return false;
141
132
  }
142
133
  function detectStructuredBlockReason(data) {
143
- const candidates = [
144
- data.title,
145
- data.description,
146
- ...data.headingsByLevel.h1.slice(0, 2),
147
- ...data.headingsByLevel.h2.slice(0, 2),
148
- ].filter((value) => Boolean(value && value.trim()));
149
- for (const candidate of candidates) {
150
- const reason = classifyBlockedText(candidate);
151
- if (reason) {
152
- return reason;
153
- }
154
- }
155
- return undefined;
134
+ return ((0, pageOutcome_1.assessStructuredPageOutcome)(data).reason ??
135
+ (0, pageOutcome_1.detectStructuredBlockedReason)(data));
156
136
  }
157
- function classifyBlockedText(value) {
158
- const text = value.trim();
159
- if (/(verify you are human|verify you are a human|are you human|captcha|security check|attention required|just a moment|checking your browser|enable javascript and cookies to continue|one more step)/i.test(text)) {
160
- return "challenge markers matched the page title or heading";
161
- }
162
- if (/(access denied|forbidden|blocked request|request blocked|request unsuccessful|temporarily blocked|temporarily unavailable|you have been blocked|access to this page has been denied)/i.test(text)) {
163
- return "blocked-page markers matched the page title or heading";
137
+ function outcomeToErrorCode(outcome) {
138
+ switch (outcome) {
139
+ case "authwall":
140
+ return "AUTHWALL_PAGE";
141
+ case "challenge":
142
+ return "CHALLENGE_PAGE";
143
+ case "cookie_wall":
144
+ return "COOKIE_WALL_PAGE";
145
+ case "blocked":
146
+ return "BLOCKED_PAGE";
147
+ case "empty":
148
+ return "EMPTY_PAGE";
149
+ case "incomplete":
150
+ return "INCOMPLETE_PAGE";
151
+ case "ok":
152
+ default:
153
+ return "PAGE_OUTCOME";
164
154
  }
165
- return undefined;
166
155
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gologin-web-access",
3
- "version": "0.3.1",
3
+ "version": "0.3.2",
4
4
  "description": "Unified web access CLI for developers and AI agents to read and interact with the web using Gologin Web Unlocker and Cloud Browser.",
5
5
  "main": "dist/cli.js",
6
6
  "bin": {