gologin-web-access 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/CHANGELOG.md +19 -0
  2. package/LICENSE +21 -0
  3. package/README.md +344 -0
  4. package/dist/cli.js +173 -0
  5. package/dist/commands/back.js +13 -0
  6. package/dist/commands/batch.js +81 -0
  7. package/dist/commands/batchChangeTrack.js +99 -0
  8. package/dist/commands/batchExtract.js +97 -0
  9. package/dist/commands/batchScrape.js +140 -0
  10. package/dist/commands/changeTrack.js +65 -0
  11. package/dist/commands/check.js +14 -0
  12. package/dist/commands/click.js +14 -0
  13. package/dist/commands/close.js +19 -0
  14. package/dist/commands/configInit.js +77 -0
  15. package/dist/commands/configShow.js +23 -0
  16. package/dist/commands/cookies.js +22 -0
  17. package/dist/commands/cookiesClear.js +13 -0
  18. package/dist/commands/cookiesImport.js +14 -0
  19. package/dist/commands/crawl.js +71 -0
  20. package/dist/commands/crawlErrors.js +20 -0
  21. package/dist/commands/crawlResult.js +27 -0
  22. package/dist/commands/crawlStart.js +56 -0
  23. package/dist/commands/crawlStatus.js +25 -0
  24. package/dist/commands/current.js +14 -0
  25. package/dist/commands/dblclick.js +14 -0
  26. package/dist/commands/eval.js +20 -0
  27. package/dist/commands/extract.js +44 -0
  28. package/dist/commands/fill.js +15 -0
  29. package/dist/commands/find.js +16 -0
  30. package/dist/commands/focus.js +14 -0
  31. package/dist/commands/forward.js +13 -0
  32. package/dist/commands/get.js +15 -0
  33. package/dist/commands/hover.js +14 -0
  34. package/dist/commands/jobs.js +47 -0
  35. package/dist/commands/map.js +61 -0
  36. package/dist/commands/open.js +22 -0
  37. package/dist/commands/parseDocument.js +34 -0
  38. package/dist/commands/pdf.js +14 -0
  39. package/dist/commands/press.js +15 -0
  40. package/dist/commands/read.js +51 -0
  41. package/dist/commands/reload.js +13 -0
  42. package/dist/commands/run.js +76 -0
  43. package/dist/commands/scrape.js +19 -0
  44. package/dist/commands/scrapeJson.js +24 -0
  45. package/dist/commands/scrapeMarkdown.js +37 -0
  46. package/dist/commands/scrapeScreenshot.js +65 -0
  47. package/dist/commands/scrapeText.js +37 -0
  48. package/dist/commands/screenshot.js +23 -0
  49. package/dist/commands/scroll.js +23 -0
  50. package/dist/commands/scrollIntoView.js +14 -0
  51. package/dist/commands/search.js +39 -0
  52. package/dist/commands/searchBrowser.js +28 -0
  53. package/dist/commands/select.js +15 -0
  54. package/dist/commands/sessions.js +14 -0
  55. package/dist/commands/shared.js +102 -0
  56. package/dist/commands/snapshot.js +18 -0
  57. package/dist/commands/storageClear.js +18 -0
  58. package/dist/commands/storageExport.js +26 -0
  59. package/dist/commands/storageImport.js +23 -0
  60. package/dist/commands/tabClose.js +18 -0
  61. package/dist/commands/tabFocus.js +15 -0
  62. package/dist/commands/tabOpen.js +19 -0
  63. package/dist/commands/tabs.js +13 -0
  64. package/dist/commands/type.js +15 -0
  65. package/dist/commands/uncheck.js +14 -0
  66. package/dist/commands/upload.js +15 -0
  67. package/dist/commands/wait.js +27 -0
  68. package/dist/config.js +260 -0
  69. package/dist/doctor.js +86 -0
  70. package/dist/internal-agent/cli.js +336 -0
  71. package/dist/internal-agent/commands/back.js +12 -0
  72. package/dist/internal-agent/commands/check.js +17 -0
  73. package/dist/internal-agent/commands/click.js +17 -0
  74. package/dist/internal-agent/commands/close.js +12 -0
  75. package/dist/internal-agent/commands/cookies.js +23 -0
  76. package/dist/internal-agent/commands/cookiesClear.js +12 -0
  77. package/dist/internal-agent/commands/cookiesImport.js +18 -0
  78. package/dist/internal-agent/commands/current.js +9 -0
  79. package/dist/internal-agent/commands/dblclick.js +17 -0
  80. package/dist/internal-agent/commands/doctor.js +53 -0
  81. package/dist/internal-agent/commands/eval.js +30 -0
  82. package/dist/internal-agent/commands/fill.js +18 -0
  83. package/dist/internal-agent/commands/find.js +86 -0
  84. package/dist/internal-agent/commands/focus.js +17 -0
  85. package/dist/internal-agent/commands/forward.js +12 -0
  86. package/dist/internal-agent/commands/get.js +19 -0
  87. package/dist/internal-agent/commands/hover.js +17 -0
  88. package/dist/internal-agent/commands/open.js +67 -0
  89. package/dist/internal-agent/commands/pdf.js +18 -0
  90. package/dist/internal-agent/commands/press.js +19 -0
  91. package/dist/internal-agent/commands/reload.js +12 -0
  92. package/dist/internal-agent/commands/screenshot.js +22 -0
  93. package/dist/internal-agent/commands/scroll.js +25 -0
  94. package/dist/internal-agent/commands/scrollIntoView.js +17 -0
  95. package/dist/internal-agent/commands/select.js +18 -0
  96. package/dist/internal-agent/commands/sessions.js +15 -0
  97. package/dist/internal-agent/commands/shared.js +51 -0
  98. package/dist/internal-agent/commands/snapshot.js +16 -0
  99. package/dist/internal-agent/commands/storageClear.js +13 -0
  100. package/dist/internal-agent/commands/storageExport.js +24 -0
  101. package/dist/internal-agent/commands/storageImport.js +20 -0
  102. package/dist/internal-agent/commands/tabClose.js +21 -0
  103. package/dist/internal-agent/commands/tabFocus.js +21 -0
  104. package/dist/internal-agent/commands/tabOpen.js +13 -0
  105. package/dist/internal-agent/commands/tabs.js +17 -0
  106. package/dist/internal-agent/commands/type.js +18 -0
  107. package/dist/internal-agent/commands/uncheck.js +17 -0
  108. package/dist/internal-agent/commands/upload.js +18 -0
  109. package/dist/internal-agent/commands/wait.js +41 -0
  110. package/dist/internal-agent/daemon/browser.js +818 -0
  111. package/dist/internal-agent/daemon/refStore.js +26 -0
  112. package/dist/internal-agent/daemon/server.js +330 -0
  113. package/dist/internal-agent/daemon/sessionManager.js +684 -0
  114. package/dist/internal-agent/daemon/snapshot.js +285 -0
  115. package/dist/internal-agent/lib/config.js +59 -0
  116. package/dist/internal-agent/lib/daemon.js +300 -0
  117. package/dist/internal-agent/lib/errors.js +63 -0
  118. package/dist/internal-agent/lib/types.js +2 -0
  119. package/dist/internal-agent/lib/utils.js +165 -0
  120. package/dist/jobRunner.js +56 -0
  121. package/dist/lib/agentCli.js +158 -0
  122. package/dist/lib/browserRead.js +125 -0
  123. package/dist/lib/browserStructured.js +77 -0
  124. package/dist/lib/changeTracking.js +117 -0
  125. package/dist/lib/cloudApi.js +41 -0
  126. package/dist/lib/concurrency.js +15 -0
  127. package/dist/lib/crawl.js +313 -0
  128. package/dist/lib/document.js +170 -0
  129. package/dist/lib/errors.js +55 -0
  130. package/dist/lib/extract.js +65 -0
  131. package/dist/lib/extractRunner.js +22 -0
  132. package/dist/lib/jobRegistry.js +164 -0
  133. package/dist/lib/output.js +122 -0
  134. package/dist/lib/readSource.js +297 -0
  135. package/dist/lib/runbooks.js +193 -0
  136. package/dist/lib/search.js +727 -0
  137. package/dist/lib/selfCli.js +136 -0
  138. package/dist/lib/structuredScrape.js +83 -0
  139. package/dist/lib/types.js +2 -0
  140. package/dist/lib/unlocker.js +383 -0
  141. package/package.json +67 -0
@@ -0,0 +1,136 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.resolveProjectRoot = resolveProjectRoot;
7
+ exports.resolveSelfCliInvocation = resolveSelfCliInvocation;
8
+ exports.runSelfCommandCapture = runSelfCommandCapture;
9
+ exports.spawnDetachedSelfCommand = spawnDetachedSelfCommand;
10
+ exports.resolveNodeScriptInvocation = resolveNodeScriptInvocation;
11
+ exports.spawnDetachedNodeInvocation = spawnDetachedNodeInvocation;
12
+ const child_process_1 = require("child_process");
13
+ const fs_1 = require("fs");
14
+ const path_1 = __importDefault(require("path"));
15
/**
 * Resolve the project root, i.e. the directory two levels above the
 * directory that contains this module.
 *
 * @returns {string} Absolute path of the project root.
 */
function resolveProjectRoot() {
    const twoLevelsUp = path_1.default.join("..", "..");
    return path_1.default.resolve(__dirname, twoLevelsUp);
}
18
/**
 * Report whether a filesystem path is accessible to the current process.
 *
 * @param {string} targetPath - Path to probe.
 * @returns {Promise<boolean>} `true` when `fs.promises.access` succeeds, `false` on any failure.
 */
async function exists(targetPath) {
    return fs_1.promises.access(targetPath).then(() => true, () => false);
}
27
/**
 * Work out how to launch this package's own CLI as a child process.
 *
 * Resolution order:
 *   1. the TypeScript source run through tsx, but only when
 *      `GOLOGIN_WEB_ACCESS_USE_SOURCE_CLI` is "1" and both files exist;
 *   2. the compiled `dist/cli.js`;
 *   3. the TypeScript source run through tsx as a last resort.
 *
 * @param {object} [env=process.env] - Environment consulted for the source-preference flag.
 * @returns {Promise<{command: string, args: string[], cwd: string}>} Spawn description.
 * @throws {Error} When no runnable CLI entry point is found.
 */
async function resolveSelfCliInvocation(env = process.env) {
    const projectRoot = resolveProjectRoot();
    const wantsSource = env.GOLOGIN_WEB_ACCESS_USE_SOURCE_CLI === "1";
    const compiledEntry = path_1.default.join(projectRoot, "dist", "cli.js");
    const tsxRunner = path_1.default.join(projectRoot, "node_modules", "tsx", "dist", "cli.mjs");
    const sourceEntry = path_1.default.join(projectRoot, "src", "cli.ts");

    // Every candidate is run with the current Node binary from the project root.
    const toInvocation = (...scriptArgs) => ({
        command: process.execPath,
        args: scriptArgs,
        cwd: projectRoot
    });
    const canRunFromSource = async () => (await exists(tsxRunner)) && (await exists(sourceEntry));

    if (wantsSource && (await canRunFromSource())) {
        return toInvocation(tsxRunner, sourceEntry);
    }
    if (await exists(compiledEntry)) {
        return toInvocation(compiledEntry);
    }
    if (await canRunFromSource()) {
        return toInvocation(tsxRunner, sourceEntry);
    }
    throw new Error(`Unable to resolve gologin-web-access CLI from ${projectRoot}`);
}
56
/**
 * Run this package's CLI as a child process and capture its output.
 *
 * @param {string[]} args - Arguments appended after the resolved CLI entry point.
 * @param {{cwd?: string, env?: object}} [options] - Optional working directory and
 *   extra environment variables (merged over `process.env`).
 * @returns {Promise<{exitCode: number, stdout: string, stderr: string}>} Captured result;
 *   `exitCode` falls back to 1 when the child reports no code.
 * @throws {Error} Rejects when the child cannot be spawned or is terminated by a signal.
 */
async function runSelfCommandCapture(args, options = {}) {
    const childEnv = {
        ...process.env,
        ...options.env
    };
    const invocation = await resolveSelfCliInvocation(childEnv);
    return new Promise((resolve, reject) => {
        const child = (0, child_process_1.spawn)(invocation.command, [...invocation.args, ...args], {
            cwd: options.cwd ?? invocation.cwd,
            env: childEnv,
            stdio: ["ignore", "pipe", "pipe"]
        });
        // Keep raw buffers and decode once at the end: decoding chunk by chunk
        // corrupts multi-byte UTF-8 sequences that straddle a chunk boundary.
        const stdoutChunks = [];
        const stderrChunks = [];
        child.stdout.on("data", (chunk) => {
            stdoutChunks.push(chunk);
        });
        child.stderr.on("data", (chunk) => {
            stderrChunks.push(chunk);
        });
        child.on("error", reject);
        // "close" fires only after the stdio streams have ended; "exit" may fire
        // while output is still buffered, which would truncate the capture.
        child.on("close", (code, signal) => {
            if (signal) {
                reject(new Error(`gologin-web-access exited via signal ${signal}`));
                return;
            }
            resolve({
                exitCode: code ?? 1,
                stdout: Buffer.concat(stdoutChunks).toString("utf8"),
                stderr: Buffer.concat(stderrChunks).toString("utf8")
            });
        });
    });
}
90
/**
 * Launch this package's CLI as a detached background process and return
 * without waiting for it.
 *
 * @param {string[]} args - Arguments appended after the resolved CLI entry point.
 * @param {{cwd?: string, env?: object}} [options] - Optional working directory and
 *   extra environment variables (merged over `process.env`).
 * @returns {Promise<void>}
 * @throws {Error} When no runnable CLI entry point can be resolved.
 */
async function spawnDetachedSelfCommand(args, options = {}) {
    // Resolve the entry point with the same merged environment the child will
    // receive, so GOLOGIN_WEB_ACCESS_USE_SOURCE_CLI passed via options.env is
    // honoured exactly as in runSelfCommandCapture.
    const childEnv = {
        ...process.env,
        ...options.env
    };
    const invocation = await resolveSelfCliInvocation(childEnv);
    const child = (0, child_process_1.spawn)(invocation.command, [...invocation.args, ...args], {
        cwd: options.cwd ?? invocation.cwd,
        env: childEnv,
        detached: true,
        stdio: "ignore"
    });
    // Let the parent exit without waiting for the detached child.
    child.unref();
}
103
/**
 * Work out how to launch one of this package's scripts by base name.
 *
 * Prefers the compiled `dist/<name>.js`; otherwise falls back to running
 * `src/<name>.ts` through the locally installed tsx CLI.
 *
 * @param {string} scriptBasename - Script file name without extension.
 * @returns {Promise<{command: string, args: string[], cwd: string}>} Spawn description.
 * @throws {Error} When neither the compiled nor the source script is runnable.
 */
async function resolveNodeScriptInvocation(scriptBasename) {
    const projectRoot = resolveProjectRoot();
    const describe = (...scriptArgs) => ({
        command: process.execPath,
        args: scriptArgs,
        cwd: projectRoot
    });

    const compiledScript = path_1.default.join(projectRoot, "dist", `${scriptBasename}.js`);
    if (await exists(compiledScript)) {
        return describe(compiledScript);
    }

    const tsxRunner = path_1.default.join(projectRoot, "node_modules", "tsx", "dist", "cli.mjs");
    const sourceScript = path_1.default.join(projectRoot, "src", `${scriptBasename}.ts`);
    const sourceRunnable = (await exists(tsxRunner)) && (await exists(sourceScript));
    if (!sourceRunnable) {
        throw new Error(`Unable to resolve script ${scriptBasename} from ${projectRoot}`);
    }
    return describe(tsxRunner, sourceScript);
}
124
/**
 * Launch one of this package's scripts as a detached background process
 * and return without waiting for it.
 *
 * @param {string} scriptBasename - Script file name without extension.
 * @param {string[]} args - Arguments appended after the resolved script.
 * @param {{cwd?: string, env?: object}} [options] - Optional working directory and
 *   extra environment variables (merged over `process.env`).
 * @returns {Promise<void>}
 */
async function spawnDetachedNodeInvocation(scriptBasename, args, options = {}) {
    const { command, args: scriptArgs, cwd: defaultCwd } = await resolveNodeScriptInvocation(scriptBasename);
    const spawnOptions = {
        cwd: options.cwd ?? defaultCwd,
        env: {
            ...process.env,
            ...options.env
        },
        detached: true,
        stdio: "ignore"
    };
    const background = (0, child_process_1.spawn)(command, [...scriptArgs, ...args], spawnOptions);
    // Let the parent exit without waiting for the detached child.
    background.unref();
}
@@ -0,0 +1,83 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.scrapeStructuredJson = scrapeStructuredJson;
4
+ exports.makeStructuredScrapeEnvelope = makeStructuredScrapeEnvelope;
5
+ exports.normalizeStructuredFallbackMode = normalizeStructuredFallbackMode;
6
+ exports.shouldUseBrowserFallback = shouldUseBrowserFallback;
7
+ const config_1 = require("../config");
8
+ const browserStructured_1 = require("./browserStructured");
9
+ const unlocker_1 = require("./unlocker");
10
/**
 * Scrape a page into structured data through the unlocker and, when asked
 * to, retry through the browser if the unlocker result looks incomplete.
 *
 * The browser result replaces the unlocker data only when
 * `isBrowserDataBetter` says so; either way the envelope records that a
 * fallback was attempted and why it was or was not used.
 *
 * @param {string} url - Page to scrape.
 * @param {object} config - Configuration handed to the cloud-token check and browser scraper.
 * @param {string} apiKey - Key passed to the unlocker scrape.
 * @param {{request?: object, fallback?: string, profile?: *}} [options] - Request options,
 *   fallback mode ("none" by default, "browser" to enable) and browser profile.
 * @returns {Promise<object>} Envelope built by `makeStructuredScrapeEnvelope`.
 */
async function scrapeStructuredJson(url, config, apiKey, options = {}) {
    const unlockerResult = await (0, unlocker_1.scrapeJson)(url, apiKey, options.request);
    const browserFallbackEnabled = (options.fallback ?? "none") === "browser";
    const outcome = {
        renderSource: "unlocker",
        fallbackAttempted: false,
        fallbackUsed: false,
        fallbackReason: undefined,
    };
    let data = unlockerResult.data;

    if (browserFallbackEnabled && shouldUseBrowserFallback(data)) {
        outcome.fallbackAttempted = true;
        (0, config_1.requireCloudToken)(config);
        const browserData = await (0, browserStructured_1.scrapeJsonViaBrowser)(url, config, {
            profile: options.profile,
        });
        if (isBrowserDataBetter(data, browserData)) {
            data = browserData;
            outcome.renderSource = "browser";
            outcome.fallbackUsed = true;
            outcome.fallbackReason = "unlocker structured data looked incomplete";
        }
        else {
            outcome.fallbackReason = "browser fallback did not improve structured output";
        }
    }

    return makeStructuredScrapeEnvelope(url, unlockerResult, data, outcome);
}
41
/**
 * Assemble the response envelope for a structured scrape.
 *
 * @param {string} url - Scraped URL.
 * @param {{status: *, request: *}} result - Unlocker result supplying `status` and `request`.
 * @param {*} data - Structured data to expose.
 * @param {{renderSource?: string, fallbackAttempted?: boolean, fallbackUsed?: boolean, fallbackReason?: string}} [options]
 *   Fallback bookkeeping; missing fields default to "unlocker" / false / undefined.
 * @returns {object} The envelope object.
 */
function makeStructuredScrapeEnvelope(url, result, data, options = {}) {
    const renderSource = options.renderSource ?? "unlocker";
    const fallbackAttempted = options.fallbackAttempted ?? false;
    const fallbackUsed = options.fallbackUsed ?? false;
    const envelope = {
        url,
        status: result.status,
        renderSource,
        fallbackAttempted,
        fallbackUsed,
        fallbackReason: options.fallbackReason,
        request: result.request,
        data,
    };
    return envelope;
}
53
/**
 * Validate a user-supplied fallback mode.
 *
 * @param {string|undefined|null} value - Raw mode; any falsy value counts as "none".
 * @returns {"none"|"browser"} The normalized mode.
 * @throws {Error} For any other value.
 */
function normalizeStructuredFallbackMode(value) {
    if (value === "browser") {
        return "browser";
    }
    const isNone = value === "none" || !value;
    if (isNone) {
        return "none";
    }
    throw new Error(`Unsupported scrape-json fallback mode: ${value}`);
}
62
/**
 * Decide whether unlocker-derived structured data looks incomplete enough
 * to justify a browser fallback.
 *
 * The data is considered incomplete when there is no first H1 heading, or
 * when that heading looks like leaked script text.
 *
 * @param {{headingsByLevel?: {h1?: string[]}}} data - Structured data to inspect.
 * @returns {boolean} `true` when a browser fallback should be attempted.
 */
function shouldUseBrowserFallback(data) {
    // This function is exported, so tolerate partial or missing structures:
    // absent heading buckets are simply "no H1", not a crash.
    const firstH1 = data?.headingsByLevel?.h1?.[0];
    if (!firstH1) {
        return true;
    }
    return looksSuspiciousHeadingText(firstH1);
}
69
/**
 * Heuristically detect heading text that is really leaked JavaScript
 * (or is implausibly long for a heading).
 *
 * The patterns require code-like structure, not just a keyword followed by
 * a space, so ordinary headings such as "Let us help", "Import your data"
 * or "Close the window. Relax." are not flagged.
 *
 * @param {string} value - Heading text.
 * @returns {boolean} `true` when the text is longer than 240 characters or looks like script.
 */
function looksSuspiciousHeadingText(value) {
    // Check the length first so the regex only ever scans short strings.
    if (value.length > 240) {
        return true;
    }
    const scriptLike = new RegExp([
        "\\bfunction\\s*\\(", // function expression
        "\\b(?:window|document)\\.[\\w$]", // member access on browser globals
        "\\b(?:const|let|var)\\s+(?:[\\w$]+|[\\[{][^=]*[\\]}])\\s*=", // declaration with initializer
        "=>", // arrow function
        "\\bimport\\s*(?:\\(|[\"']|[\\w$*{][^\"']*\\bfrom\\s*[\"'])", // static or dynamic import
    ].join("|"), "i");
    return scriptLike.test(value);
}
72
/**
 * Compare two structured-data results and decide whether the candidate
 * (browser) one is an improvement over the current (unlocker) one.
 *
 * The candidate wins when it has more H1 headings, supplies a title the
 * current data lacks, or has more headings overall.
 *
 * @param {{headingsByLevel: {h1: string[]}, title: *, headings: string[]}} current
 * @param {{headingsByLevel: {h1: string[]}, title: *, headings: string[]}} candidate
 * @returns {boolean}
 */
function isBrowserDataBetter(current, candidate) {
    return (candidate.headingsByLevel.h1.length > current.headingsByLevel.h1.length
        || (!current.title && Boolean(candidate.title))
        || candidate.headings.length > current.headings.length);
}
@@ -0,0 +1,2 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,383 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.scrapeRenderedHtml = scrapeRenderedHtml;
4
+ exports.scrapeText = scrapeText;
5
+ exports.scrapeMarkdown = scrapeMarkdown;
6
+ exports.scrapeJson = scrapeJson;
7
+ exports.validateWebUnlockerKey = validateWebUnlockerKey;
8
+ exports.htmlToText = htmlToText;
9
+ exports.htmlToMarkdown = htmlToMarkdown;
10
+ exports.htmlToStructuredData = htmlToStructuredData;
11
+ const errors_1 = require("./errors");
12
+ const DEFAULT_BASE_URL = "https://parsing.webunlocker.gologin.com";
13
+ const DEFAULT_TIMEOUT_MS = 15_000;
14
+ const DEFAULT_MAX_RETRIES = 2;
15
+ const MAX_EXTRACTED_LINKS = 100;
16
+ const MAX_EXTRACTED_HEADINGS = 50;
17
/**
 * Minimal HTTP client for the Web Unlocker scrape endpoint.
 *
 * Holds the API key plus default timeout and retry settings, and exposes a
 * single `scrape` method that requests `/v1/scrape?url=<target>` on the
 * configured base URL with the key in the `apikey` header.
 */
class WebUnlockerClient {
    apiKey;
    baseUrl;
    timeoutMs;
    maxRetries;
    backoffMs;
    /**
     * @param {{apiKey: string, baseUrl?: string, timeoutMs?: number, maxRetries?: number, backoffMs?: number}} options
     *   Only `apiKey` is required; the rest fall back to module defaults
     *   (backoff defaults to 250 ms).
     * @throws {Error} "apiKey is required" when the key is missing, not a string, or blank.
     */
    constructor(options) {
        // Validate the type before calling string methods so a missing key
        // produces the intended message rather than a TypeError.
        if (typeof options.apiKey !== "string" || !options.apiKey.trim()) {
            throw new Error("apiKey is required");
        }
        this.apiKey = options.apiKey;
        this.baseUrl = normalizeBaseUrl(options.baseUrl ?? DEFAULT_BASE_URL);
        this.timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
        this.maxRetries = options.maxRetries ?? DEFAULT_MAX_RETRIES;
        this.backoffMs = options.backoffMs ?? 250;
    }
    /**
     * Fetch a target URL through the unlocker.
     *
     * @param {string} url - Absolute URL of the page to scrape.
     * @param {{timeoutMs?: number, maxRetries?: number, backoffMs?: number}} [options]
     *   Per-call overrides of the client defaults.
     * @returns {Promise<object>} `{ success, url, content, contentType, status, headers, request }`.
     * @throws {Error} When `url` is blank or not an absolute URL.
     * @throws {HttpError} When the request fails or the body cannot be read in time.
     */
    async scrape(url, options = {}) {
        assertValidTargetUrl(url);
        // One effective timeout for the whole call: a per-call override must
        // bound the body read as well as the initial request.
        const timeoutMs = options.timeoutMs ?? this.timeoutMs;
        const requestUrl = new URL("/v1/scrape", this.baseUrl);
        requestUrl.searchParams.set("url", url);
        const { response, request } = await fetchWithRetry(requestUrl.toString(), {
            headers: {
                apikey: this.apiKey,
            },
            timeoutMs,
            maxRetries: options.maxRetries ?? this.maxRetries,
            backoffMs: options.backoffMs ?? this.backoffMs,
        });
        if (!response.ok) {
            const body = await safeReadText(response, timeoutMs);
            throw new errors_1.HttpError(`Web Unlocker request failed with status ${response.status}.`, response.status, body ? truncate(body, 300) : undefined);
        }
        const content = await readResponseTextWithTimeout(response, timeoutMs);
        return {
            success: true,
            url,
            content,
            contentType: response.headers.get("content-type"),
            status: response.status,
            headers: headersToRecord(response.headers),
            request,
        };
    }
}
61
/**
 * Scrape a URL through the unlocker and return the raw scrape result.
 *
 * @param {string} url - Page to scrape.
 * @param {string} apiKey - Web Unlocker API key.
 * @param {object} [options] - Per-call overrides forwarded to the client's `scrape`.
 * @returns {Promise<object>} The client's scrape result.
 */
async function scrapeRenderedHtml(url, apiKey, options = {}) {
    const client = createWebUnlockerClient(apiKey);
    return client.scrape(url, options);
}
64
/**
 * Scrape a URL and add a plain-text rendering of its content.
 *
 * @param {string} url - Page to scrape.
 * @param {string} apiKey - Web Unlocker API key.
 * @param {object} [options] - Per-call overrides forwarded to the client's `scrape`.
 * @returns {Promise<object>} The scrape result plus a `text` field.
 */
async function scrapeText(url, apiKey, options = {}) {
    const client = createWebUnlockerClient(apiKey);
    const page = await client.scrape(url, options);
    const text = htmlToText(page.content);
    return { ...page, text };
}
71
/**
 * Scrape a URL and add a Markdown rendering of its content.
 *
 * @param {string} url - Page to scrape.
 * @param {string} apiKey - Web Unlocker API key.
 * @param {object} [options] - Per-call overrides forwarded to the client's `scrape`.
 * @returns {Promise<object>} The scrape result plus a `markdown` field.
 */
async function scrapeMarkdown(url, apiKey, options = {}) {
    const client = createWebUnlockerClient(apiKey);
    const page = await client.scrape(url, options);
    const markdown = htmlToMarkdown(page.content);
    return { ...page, markdown };
}
78
/**
 * Scrape a URL and add structured data extracted from its content.
 *
 * @param {string} url - Page to scrape.
 * @param {string} apiKey - Web Unlocker API key.
 * @param {object} [options] - Per-call overrides forwarded to the client's `scrape`.
 * @returns {Promise<object>} The scrape result plus a `data` field.
 */
async function scrapeJson(url, apiKey, options = {}) {
    const client = createWebUnlockerClient(apiKey);
    const page = await client.scrape(url, options);
    const data = htmlToStructuredData(page.content);
    return { ...page, data };
}
85
/**
 * Check an API key by scraping https://example.com once (8 s timeout, no retries).
 *
 * Never throws: failures are reported in the returned object.
 *
 * @param {string} apiKey - Key to validate.
 * @returns {Promise<{ok: boolean, status?: number, detail?: string}>}
 *   `{ ok: true }` on success; otherwise `ok: false` with a detail message and,
 *   for HTTP errors, the status code.
 */
async function validateWebUnlockerKey(apiKey) {
    const probeOptions = {
        timeoutMs: 8_000,
        maxRetries: 0,
    };
    try {
        await scrapeRenderedHtml("https://example.com", apiKey, probeOptions);
    }
    catch (failure) {
        if (failure instanceof errors_1.HttpError) {
            return {
                ok: false,
                status: failure.status,
                detail: failure.hint ?? failure.message,
            };
        }
        const detail = failure instanceof Error ? failure.message : String(failure);
        return { ok: false, detail };
    }
    return { ok: true };
}
107
/**
 * Build a client with default settings for the given API key.
 *
 * @param {string} apiKey - Web Unlocker API key.
 * @returns {WebUnlockerClient}
 */
function createWebUnlockerClient(apiKey) {
    const client = new WebUnlockerClient({ apiKey });
    return client;
}
110
// fetchWithRetry(url, options): GET `url` with options.headers, retrying failed attempts.
// Makes up to options.maxRetries + 1 attempts. Each attempt is aborted after options.timeoutMs
// and is recorded in `attempts` (1-based attempt number, status, error message, retriable flag).
// Returns { response, request } for the first ok response, where `request` is the attempt summary.
// Throws an HttpError with the same summary attached as `.request` once retries are exhausted or
// a non-retriable status is received.
+ async function fetchWithRetry(url, options) {
111
+ const attempts = [];
112
+ let lastError;
113
+ let lastStatusError;
114
+ for (let attempt = 0; attempt <= options.maxRetries; attempt += 1) {
115
// A fresh controller and timer per attempt; the timer aborts the in-flight fetch.
+ const controller = new AbortController();
116
+ const timeout = setTimeout(() => controller.abort(), options.timeoutMs);
117
+ try {
118
+ const response = await fetch(url, {
119
+ headers: options.headers,
120
+ signal: controller.signal,
121
+ });
122
// The abort timer only covers the wait for fetch() to resolve; body reads below use their own timeout.
+ clearTimeout(timeout);
123
+ if (response.ok) {
124
+ attempts.push({
125
+ attempt: attempt + 1,
126
+ status: response.status,
127
+ retriable: false,
128
+ });
129
+ return {
130
+ response,
131
+ request: buildScrapeRequestMeta(attempts),
132
+ };
133
+ }
134
// Non-ok response: read the body best-effort and pass at most 300 characters of it as the
// HttpError's third argument (presumably the `hint` read by validateWebUnlockerKey; confirm in ./errors).
+ const body = await safeReadText(response, options.timeoutMs);
135
+ const error = new errors_1.HttpError(`Web Unlocker request failed with status ${response.status}.`, response.status, body ? truncate(body, 300) : undefined);
136
// Retry only while attempts remain AND the status is one isRetriableStatus accepts.
+ const retriable = attempt < options.maxRetries && isRetriableStatus(response.status);
137
+ attempts.push({
138
+ attempt: attempt + 1,
139
+ status: response.status,
140
+ error: error.message,
141
+ retriable,
142
+ });
143
+ lastStatusError = error;
144
+ if (!retriable) {
145
// This HttpError passes through the catch block below, which rethrows HttpError instances untouched.
+ throw attachRequestMeta(error, attempts);
146
+ }
147
+ await sleep(computeBackoffDelay(options.backoffMs, attempt));
148
+ continue;
149
+ }
150
+ catch (error) {
151
+ clearTimeout(timeout);
152
+ lastError = error;
153
// HttpErrors thrown from the try block already carry request metadata; do not record or retry them again.
+ if (error instanceof errors_1.HttpError) {
154
+ throw error;
155
+ }
156
// Anything else is normalized: an AbortError becomes a 408 timeout, every other failure a 500.
+ const normalizedError = error instanceof Error && error.name === "AbortError"
157
+ ? new errors_1.HttpError("Web Unlocker request timed out.", 408)
158
+ : error instanceof Error
159
+ ? new errors_1.HttpError(error.message, 500)
160
+ : new errors_1.HttpError("Web Unlocker request failed.", 500);
161
// Unlike status failures, these errors are retried whenever attempts remain, whatever the cause.
+ const retriable = attempt < options.maxRetries;
162
+ attempts.push({
163
+ attempt: attempt + 1,
164
+ status: normalizedError.status,
165
+ error: normalizedError.message,
166
+ retriable,
167
+ });
168
+ if (attempt === options.maxRetries) {
169
+ throw attachRequestMeta(normalizedError, attempts);
170
+ }
171
+ await sleep(computeBackoffDelay(options.backoffMs, attempt));
172
+ }
173
+ }
// Fallback: reached only when the loop ends without returning or throwing, e.g. when it never
// runs because options.maxRetries is negative. Reports the most specific failure seen, if any.
174
+ if (lastStatusError) {
175
+ throw attachRequestMeta(lastStatusError, attempts);
176
+ }
177
+ if (lastError instanceof Error && lastError.name === "AbortError") {
178
+ throw attachRequestMeta(new errors_1.HttpError("Web Unlocker request timed out.", 408), attempts);
179
+ }
180
+ throw attachRequestMeta(lastError instanceof Error
181
+ ? new errors_1.HttpError(lastError.message, 500)
182
+ : new errors_1.HttpError("Web Unlocker request failed.", 500), attempts);
183
+ }
184
/**
 * Summarize the recorded attempts of one request.
 *
 * @param {object[]} attempts - Attempt records, in order.
 * @returns {{attemptCount: number, retryCount: number, attempts: object[]}}
 *   Counts plus shallow copies of the records, so later changes to the
 *   input do not alter the summary.
 */
function buildScrapeRequestMeta(attempts) {
    const attemptCount = attempts.length;
    const retryCount = attemptCount > 0 ? attemptCount - 1 : 0;
    const copies = attempts.map((record) => Object.assign({}, record));
    return { attemptCount, retryCount, attempts: copies };
}
191
/**
 * Attach an attempt summary to an error as its `request` property.
 *
 * @param {Error} error - Error to annotate (mutated in place).
 * @param {object[]} attempts - Attempt records to summarize.
 * @returns {Error} The same error instance.
 */
function attachRequestMeta(error, attempts) {
    error.request = buildScrapeRequestMeta(attempts);
    return error;
}
196
/**
 * Remove every trailing slash from a base URL.
 *
 * @param {string} baseUrl - URL to normalize.
 * @returns {string} The URL without trailing "/" characters.
 */
function normalizeBaseUrl(baseUrl) {
    let end = baseUrl.length;
    while (end > 0 && baseUrl[end - 1] === "/") {
        end -= 1;
    }
    return baseUrl.slice(0, end);
}
199
/**
 * Copy response headers into a plain object keyed by header name.
 *
 * @param {Headers} headers - Headers to copy.
 * @returns {Object<string, string>} Header values by name.
 */
function headersToRecord(headers) {
    const record = {};
    for (const [name, value] of headers.entries()) {
        record[name] = value;
    }
    return record;
}
206
/**
 * Ensure a scrape target is a non-blank, absolute URL.
 *
 * @param {string} url - Candidate URL.
 * @throws {Error} "url is required" when blank; "url must be a valid absolute URL"
 *   when the URL constructor rejects it.
 */
function assertValidTargetUrl(url) {
    if (url.trim().length === 0) {
        throw new Error("url is required");
    }
    let parses = true;
    try {
        new URL(url);
    }
    catch {
        parses = false;
    }
    if (!parses) {
        throw new Error("url must be a valid absolute URL");
    }
}
217
/**
 * Read a response body as text, returning an empty string instead of
 * failing. Used where the body is only supplementary (error hints).
 *
 * @param {Response} response - Response whose body is read.
 * @param {number} [timeoutMs=DEFAULT_TIMEOUT_MS] - Maximum time to wait for the body.
 * @returns {Promise<string>} The body text, or "" on any error or timeout.
 */
async function safeReadText(response, timeoutMs = DEFAULT_TIMEOUT_MS) {
    return readResponseTextWithTimeout(response, timeoutMs).catch(() => "");
}
225
/**
 * Read a response body as text, giving up after a deadline.
 *
 * On timeout the body stream is cancelled (best effort) and the call
 * rejects with a 408 HttpError. The timer is always cleared afterwards.
 *
 * @param {Response} response - Response whose body is read.
 * @param {number} timeoutMs - Deadline in milliseconds.
 * @returns {Promise<string>} The body text.
 * @throws {HttpError} When the deadline elapses first.
 */
async function readResponseTextWithTimeout(response, timeoutMs) {
    let timer;
    try {
        const bodyText = response.text();
        const deadline = new Promise((_resolve, reject) => {
            timer = setTimeout(() => {
                // Cancellation failures are irrelevant once we have given up on the body.
                void response.body?.cancel().catch(() => undefined);
                reject(new errors_1.HttpError("Web Unlocker response body timed out.", 408));
            }, timeoutMs);
        });
        return await Promise.race([bodyText, deadline]);
    }
    finally {
        if (timer) {
            clearTimeout(timer);
        }
    }
}
244
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the delay has elapsed.
 */
function sleep(ms) {
    const waitForTimer = (wake) => {
        setTimeout(wake, ms);
    };
    return new Promise(waitForTimer);
}
249
/**
 * Shorten a string to a maximum length, marking cuts with "...".
 *
 * @param {string} value - Text to shorten.
 * @param {number} [maxLength=300] - Maximum number of characters kept.
 * @returns {string} `value` itself when short enough, else its first
 *   `maxLength` characters followed by "...".
 */
function truncate(value, maxLength = 300) {
    const fits = value.length <= maxLength;
    return fits ? value : `${value.slice(0, maxLength)}...`;
}
255
/**
 * Convert HTML to readable plain text.
 *
 * Script and style blocks are dropped, `<br>` and the closing tags of
 * common block elements become newlines, remaining tags are removed,
 * entities are decoded and whitespace is tidied.
 *
 * @param {string} html - HTML source.
 * @returns {string} Plain text.
 */
function htmlToText(html) {
    const visibleMarkup = stripScriptAndStyleBlocks(html);
    const lineBroken = visibleMarkup
        .replace(/<br\s*\/?>/gi, "\n")
        .replace(/<\/(p|div|section|article|li|tr|h1|h2|h3|h4|h5|h6)>/gi, "\n");
    const tagless = lineBroken.replace(/<[^>]+>/g, " ");
    const text = decodeHtmlEntities(tagless);
    // Whitespace tidy-up: drop CRs, trim line ends, cap blank runs at one
    // empty line, and collapse runs of spaces/tabs.
    const tidied = text
        .replace(/\r/g, "")
        .replace(/[ \t]+\n/g, "\n")
        .replace(/\n{3,}/g, "\n\n")
        .replace(/[ \t]{2,}/g, " ");
    return tidied.trim();
}
271
/**
 * Convert HTML to a simple Markdown approximation.
 *
 * Handles headings, links, bold/italic, inline code, list items and line
 * or paragraph breaks; everything else is reduced to its text content.
 *
 * @param {string} html - HTML source.
 * @returns {string} Markdown text.
 */
function htmlToMarkdown(html) {
    // 1. Remove content that must never reach the output.
    const withoutNoise = html
        .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gis, "")
        .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gis, "")
        .replace(/<!--[\s\S]*?-->/g, "");

    // 2. Headings and links first, while their inner markup is still intact.
    const renderHeading = (_match, level, inner) => `${"#".repeat(Number(level))} ${cleanInlineHtml(inner)}\n\n`;
    const renderLink = (_match, href, inner) => `[${cleanInlineHtml(inner)}](${href})`;
    const withHeadings = withoutNoise.replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, renderHeading);
    const withLinks = withHeadings.replace(/<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi, renderLink);

    // 3. Inline emphasis, code and list items.
    const withInline = withLinks
        .replace(/<(strong|b)\b[^>]*>([\s\S]*?)<\/\1>/gi, (_match, _tag, inner) => `**${cleanInlineHtml(inner)}**`)
        .replace(/<(em|i)\b[^>]*>([\s\S]*?)<\/\1>/gi, (_match, _tag, inner) => `*${cleanInlineHtml(inner)}*`)
        .replace(/<code\b[^>]*>([\s\S]*?)<\/code>/gi, (_match, inner) => `\`${cleanInlineHtml(inner)}\``)
        .replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, (_match, inner) => `- ${cleanInlineHtml(inner)}\n`);

    // 4. Structural breaks, then strip whatever tags remain.
    const tagless = withInline
        .replace(/<br\s*\/?>/gi, "\n")
        .replace(/<\/(p|div|section|article|ul|ol|table|tr)>/gi, "\n\n")
        .replace(/<[^>]+>/g, " ");

    // 5. Decode entities and tidy whitespace.
    const tidied = decodeHtmlEntities(tagless)
        .replace(/\r/g, "")
        .replace(/[ \t]+\n/g, "\n")
        .replace(/\n{3,}/g, "\n\n")
        .replace(/[ \t]{2,}/g, " ");
    return tidied.trim();
}
293
/**
 * Extract page metadata, headings and links from HTML with regular expressions.
 *
 * At most MAX_EXTRACTED_HEADINGS heading matches and MAX_EXTRACTED_LINKS
 * link matches are considered; headings with no text and links with an
 * empty href are dropped afterwards.
 *
 * @param {string} html - HTML source.
 * @returns {{title: (string|null), description: (string|null), canonical: (string|null),
 *   meta: Object<string, string>, headings: string[],
 *   headingsByLevel: Object<string, string[]>, links: Array<{href: string, text: string}>}}
 */
function htmlToStructuredData(html) {
    const decodeTrimmed = (raw) => decodeHtmlEntities(raw).trim();

    // <title> and <link rel="canonical">
    const titleMatch = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
    const canonicalMatch = html.match(/<link\b[^>]*rel=["']canonical["'][^>]*>/i);

    // <meta name|property=... content=...>; later tags overwrite earlier ones.
    const meta = {};
    for (const tag of html.match(/<meta\b[^>]*>/gi) ?? []) {
        const key = getTagAttr(tag, "name") || getTagAttr(tag, "property");
        const content = getTagAttr(tag, "content");
        if (key && content) {
            meta[key] = decodeTrimmed(content);
        }
    }

    // Headings, both flat (document order) and bucketed by level.
    const headingsByLevel = createEmptyHeadingBuckets();
    const headings = [];
    const headingMatches = Array.from(html.matchAll(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi))
        .slice(0, MAX_EXTRACTED_HEADINGS);
    headingMatches.forEach(([, levelDigit, inner]) => {
        const text = cleanInlineHtml(inner);
        if (text) {
            headingsByLevel[`h${Number(levelDigit)}`].push(text);
            headings.push(text);
        }
    });

    // Anchors with a quoted href.
    const toLink = (match) => ({
        href: decodeTrimmed(match[1]),
        text: decodeTrimmed(match[2].replace(/<[^>]+>/g, " ").replace(/\s+/g, " ")),
    });
    const links = Array.from(html.matchAll(/<a\b[^>]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi))
        .slice(0, MAX_EXTRACTED_LINKS)
        .map(toLink)
        .filter((link) => link.href.length > 0);

    const canonicalHref = canonicalMatch ? getTagAttr(canonicalMatch[0], "href") ?? null : null;
    return {
        title: titleMatch ? decodeTrimmed(titleMatch[1]) : null,
        description: meta.description ?? meta["og:description"] ?? null,
        canonical: canonicalHref ? decodeTrimmed(canonicalHref) : null,
        meta,
        headings,
        headingsByLevel,
        links,
    };
}
338
/**
 * Reduce an HTML fragment to single-line text: script/style blocks and
 * tags are removed, whitespace runs collapse to one space, entities are
 * decoded and the result is trimmed.
 *
 * @param {string} value - HTML fragment.
 * @returns {string} Cleaned text.
 */
function cleanInlineHtml(value) {
    const withoutCode = stripScriptAndStyleBlocks(value);
    const tagless = withoutCode.replace(/<[^>]+>/g, " ");
    const singleSpaced = tagless.replace(/\s+/g, " ");
    return decodeHtmlEntities(singleSpaced).trim();
}
341
/**
 * Read the value of a quoted attribute from a single HTML tag string.
 *
 * The attribute name is matched case-insensitively and must start at an
 * attribute boundary, so asking for `name` does not pick up `data-name`.
 * The value runs to the matching closing quote, so a double-quoted value
 * may contain apostrophes and vice versa.
 *
 * @param {string} tag - Source text of one tag, e.g. `<meta name="x" content="y">`.
 * @param {string} attrName - Attribute to look up.
 * @returns {string|null} The raw (still entity-encoded) value, or `null`
 *   when the attribute is absent or unquoted.
 */
function getTagAttr(tag, attrName) {
    // Escape the name so regex metacharacters in it are matched literally.
    const escapedName = attrName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const regex = new RegExp(`(?:^|[\\s"'/])${escapedName}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, "i");
    const match = tag.match(regex);
    if (!match) {
        return null;
    }
    // Exactly one of the two capture groups participates in a match.
    return match[1] ?? match[2];
}
346
/**
 * Decode the HTML entities this module understands: `&amp;`, `&lt;`,
 * `&gt;`, `&quot;`, `&nbsp;` (to a plain space) and decimal / hexadecimal
 * numeric references.
 *
 * Decoding is a single pass, so text produced by one entity is never
 * decoded again (`&amp;lt;` yields the literal text `&lt;`). Numeric
 * references use full code points, so characters outside the Basic
 * Multilingual Plane decode correctly. Unknown names and out-of-range
 * numbers are left unchanged.
 *
 * @param {string} value - Text containing entities.
 * @returns {string} Decoded text.
 */
function decodeHtmlEntities(value) {
    const namedEntities = new Map([
        ["amp", "&"],
        ["lt", "<"],
        ["gt", ">"],
        ["quot", "\""],
        ["nbsp", " "],
    ]);
    const MAX_CODE_POINT = 0x10ffff;
    return value.replace(/&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi, (entity, decimal, hex, name) => {
        if (name !== undefined) {
            // Names are case-sensitive: only the exact lowercase forms are known.
            return namedEntities.get(name) ?? entity;
        }
        const codePoint = decimal !== undefined ? Number(decimal) : Number.parseInt(hex, 16);
        // String.fromCodePoint throws on out-of-range input; keep such references verbatim.
        return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : entity;
    });
}
363
/**
 * Replace every `<script>…</script>` and `<style>…</style>` block with a
 * single space.
 *
 * @param {string} value - HTML source.
 * @returns {string} The HTML without script or style blocks.
 */
function stripScriptAndStyleBlocks(value) {
    const scriptBlock = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gis;
    const styleBlock = /<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gis;
    const withoutScripts = value.replace(scriptBlock, " ");
    return withoutScripts.replace(styleBlock, " ");
}
368
/**
 * Create a fresh set of empty heading lists, one per level h1 to h6.
 *
 * @returns {{h1: string[], h2: string[], h3: string[], h4: string[], h5: string[], h6: string[]}}
 */
function createEmptyHeadingBuckets() {
    const buckets = {};
    for (let level = 1; level <= 6; level += 1) {
        buckets[`h${level}`] = [];
    }
    return buckets;
}
378
/**
 * Tell whether an HTTP status is worth retrying.
 *
 * @param {number} status - HTTP status code.
 * @returns {boolean} `true` for 408, 429, 500, 502, 503 and 504.
 */
function isRetriableStatus(status) {
    const retriableStatuses = [408, 429, 500, 502, 503, 504];
    return retriableStatuses.includes(status);
}
381
/**
 * Compute the exponential backoff delay before the next attempt.
 *
 * @param {number} baseDelayMs - Base delay; negative values are treated as 0.
 * @param {number} attempt - Zero-based index of the attempt that just failed.
 * @returns {number} `max(0, baseDelayMs) * 2^attempt` milliseconds.
 */
function computeBackoffDelay(baseDelayMs, attempt) {
    const nonNegativeBase = Math.max(0, baseDelayMs);
    return nonNegativeBase * 2 ** attempt;
}
package/package.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "gologin-web-access",
3
+ "version": "0.3.0",
4
+ "description": "Unified web access CLI for developers and AI agents to read and interact with the web using Gologin Web Unlocker and Cloud Browser.",
5
+ "main": "dist/cli.js",
6
+ "bin": {
7
+ "gologin-web-access": "dist/cli.js"
8
+ },
9
+ "files": [
10
+ "dist",
11
+ "README.md",
12
+ "LICENSE",
13
+ "CHANGELOG.md"
14
+ ],
15
+ "preferGlobal": true,
16
+ "engines": {
17
+ "node": ">=18.17"
18
+ },
19
+ "scripts": {
20
+ "build": "tsc -p tsconfig.json",
21
+ "dev": "tsx src/cli.ts",
22
+ "start": "node dist/cli.js",
23
+ "prepare": "npm run build",
24
+ "test": "tsx --test tests/**/*.test.ts",
25
+ "typecheck": "tsc --noEmit -p tsconfig.json",
26
+ "release:check": "npm run typecheck && npm test && npm run build && npm pack --dry-run",
27
+ "prepublishOnly": "npm run release:check"
28
+ },
29
+ "dependencies": {
30
+ "cheerio": "^1.1.2",
31
+ "commander": "^14.0.1",
32
+ "diff": "^8.0.2",
33
+ "mammoth": "^1.11.0",
34
+ "pdf-parse": "^2.4.5",
35
+ "playwright": "^1.58.2",
36
+ "xlsx": "^0.18.5"
37
+ },
38
+ "keywords": [
39
+ "gologin",
40
+ "web-access",
41
+ "cli",
42
+ "webunlocker",
43
+ "scraping",
44
+ "cloud-browser",
45
+ "browser-automation",
46
+ "agent"
47
+ ],
48
+ "author": "Gologin",
49
+ "license": "MIT",
50
+ "homepage": "https://gologin.com",
51
+ "repository": {
52
+ "type": "git",
53
+ "url": "git+https://github.com/GologinLabs/gologin-web-access.git"
54
+ },
55
+ "bugs": {
56
+ "url": "https://github.com/GologinLabs/gologin-web-access/issues"
57
+ },
58
+ "packageManager": "npm@10",
59
+ "publishConfig": {
60
+ "access": "public"
61
+ },
62
+ "devDependencies": {
63
+ "@types/node": "^24.0.10",
64
+ "tsx": "^4.20.5",
65
+ "typescript": "^5.8.2"
66
+ }
67
+ }