gologin-web-access 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -3
- package/README.md +24 -24
- package/dist/cli.js +3 -3
- package/dist/commands/batchChangeTrack.js +3 -3
- package/dist/commands/batchExtract.js +4 -4
- package/dist/commands/batchScrape.js +10 -10
- package/dist/commands/changeTrack.js +3 -3
- package/dist/commands/configInit.js +19 -13
- package/dist/commands/crawl.js +2 -2
- package/dist/commands/crawlStart.js +1 -1
- package/dist/commands/extract.js +4 -4
- package/dist/commands/map.js +2 -2
- package/dist/commands/read.js +4 -4
- package/dist/commands/scrape.js +5 -5
- package/dist/commands/scrapeJson.js +5 -5
- package/dist/commands/scrapeMarkdown.js +5 -5
- package/dist/commands/scrapeText.js +5 -5
- package/dist/commands/search.js +5 -2
- package/dist/commands/shared.js +4 -4
- package/dist/config.js +18 -18
- package/dist/doctor.js +4 -4
- package/dist/lib/changeTracking.js +5 -5
- package/dist/lib/crawl.js +7 -7
- package/dist/lib/document.js +2 -2
- package/dist/lib/errors.js +1 -1
- package/dist/lib/readSource.js +46 -43
- package/dist/lib/{unlocker.js → scrapingApi.js} +17 -17
- package/dist/lib/search.js +15 -15
- package/dist/lib/structuredScrape.js +5 -5
- package/package.json +4 -3
package/CHANGELOG.md
CHANGED
@@ -2,9 +2,15 @@
 
 ## Unreleased
 
-- browser automation is now embedded directly in `gologin-web-access`, so one repo and one install contains both
+- browser automation is now embedded directly in `gologin-web-access`, so one repo and one install contains both Scraping API and Cloud Browser flows
 - doctor now reports the embedded browser runtime source and version
 
+## 0.3.5 - 2026-05-14
+
+- renamed the internal stateless scraping layer from Web Unlocker to Scraping API
+- config files now write `scrapingApiKey`, while legacy `webUnlockerApiKey` configs and old env aliases still load
+- `renderSource` and search transport values now use `scraping`; `unlocker` remains accepted as a legacy `--source` alias
+
 ## 0.3.2 - 2026-04-03
 
 - added unified page outcome classification across `read`, `scrape-json`, and `batch-scrape`
@@ -17,9 +23,9 @@ Initial public release of Gologin Web Access.
 
 Highlights:
 
-- Unified CLI entry point for
+- Unified CLI entry point for GoLogin Scraping API and Gologin Cloud Browser workflows
 - Scraping commands: `scrape`, `scrape-markdown`, `scrape-text`, `scrape-json`, `batch-scrape`
 - Browser commands: `open`, `snapshot`, `click`, `type`, `screenshot`, `close`, `sessions`, `current`
-- Clear two-key configuration model with `
+- Clear two-key configuration model with `GOLOGIN_SCRAPING_API_KEY` and `GOLOGIN_TOKEN`
 - `doctor`, `config show`, and `config init` to reduce setup friction
 - Compatibility support for legacy env names used by existing Gologin tools
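The 0.3.5 key rename is backward compatible on load. Below is a minimal sketch of the alias-aware resolution the changelog describes, using a hypothetical `resolveScrapingApiKey` helper; the legacy env names match the ones used later in `dist/commands/configInit.js`, but the exact precedence in `dist/config.js` is an assumption.

```js
// Sketch only: prefer the new scrapingApiKey name, fall back to the legacy
// Web Unlocker names the changelog says still load.
function resolveScrapingApiKey(fileConfig = {}, env = process.env) {
    return (
        fileConfig.scrapingApiKey ??        // written by `config init` since 0.3.5
        fileConfig.webUnlockerApiKey ??     // legacy config field, still accepted
        env.GOLOGIN_SCRAPING_API_KEY ??     // primary env variable
        env.GOLOGIN_WEB_UNLOCKER_API_KEY ?? // legacy env alias
        env.GOLOGIN_WEBUNLOCKER_API_KEY     // legacy env alias
    );
}
```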
package/README.md
CHANGED
@@ -1,6 +1,6 @@
 # Gologin Web Access
 
-Gologin Web Access lets developers and AI agents read and interact with the web using
+Gologin Web Access lets developers and AI agents read and interact with the web using GoLogin Scraping API and Gologin Cloud Browser.
 
 This is a unified web access layer, not just a scraping tool and not just a browser automation tool.
 
@@ -18,7 +18,7 @@ Package name and binary are the same:
 
 Gologin Web Access combines two existing product surfaces behind one CLI:
 
--
+- Scraping API
 Stateless read and extraction. Best when you want page content quickly without maintaining a browser session.
 - Cloud Browser
 Stateful interaction. Best when you need navigation, clicks, typing, screenshots, or multi-step flows that persist across commands.
@@ -36,23 +36,23 @@ The point of the unified CLI is that both modes live in one product with one com
 
 ### Scraping / Read
 
-These commands use
+These commands use GoLogin Scraping API:
 
 - `gologin-web-access scrape <url>`
-- `gologin-web-access read <url> [--format text|markdown|html] [--source auto|
-- `gologin-web-access scrape-markdown <url> [--source auto|
-- `gologin-web-access scrape-text <url> [--source auto|
+- `gologin-web-access read <url> [--format text|markdown|html] [--source auto|scraping|browser]`
+- `gologin-web-access scrape-markdown <url> [--source auto|scraping|browser]`
+- `gologin-web-access scrape-text <url> [--source auto|scraping|browser]`
 - `gologin-web-access scrape-json <url> [--fallback none|browser]`
-- `gologin-web-access batch-scrape <url...> [--format html|markdown|text|json] [--fallback none|browser] [--source auto|
-- `gologin-web-access batch-extract <url...> --schema <schema.json> [--source auto|
-- `gologin-web-access search <query> [--limit <n>] [--country <cc>] [--language <lang>] [--source auto|
+- `gologin-web-access batch-scrape <url...> [--format html|markdown|text|json] [--fallback none|browser] [--source auto|scraping|browser] [--only-main-content] [--retry <n>] [--backoff-ms <ms>] [--summary] [--output <path>] [--strict]`
+- `gologin-web-access batch-extract <url...> --schema <schema.json> [--source auto|scraping|browser] [--retry <n>] [--backoff-ms <ms>] [--summary] [--output <path>]`
+- `gologin-web-access search <query> [--limit <n>] [--country <cc>] [--language <lang>] [--source auto|scraping|browser]`
 - `gologin-web-access map <url> [--limit <n>] [--max-depth <n>] [--concurrency <n>] [--strict]`
 - `gologin-web-access crawl <url> [--format html|markdown|text|json] [--limit <n>] [--max-depth <n>] [--only-main-content] [--strict]`
 - `gologin-web-access crawl-start <url> ...`
 - `gologin-web-access crawl-status <jobId>`
 - `gologin-web-access crawl-result <jobId>`
 - `gologin-web-access crawl-errors <jobId>`
-- `gologin-web-access extract <url> --schema <schema.json> [--source auto|
+- `gologin-web-access extract <url> --schema <schema.json> [--source auto|scraping|browser]`
 - `gologin-web-access change-track <url> [--format html|markdown|text|json]`
 - `gologin-web-access batch-change-track <url...> [--format html|markdown|text|json] [--retry <n>] [--backoff-ms <ms>] [--summary] [--output <path>]`
 - `gologin-web-access parse-document <url-or-path>`
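For agents that drive this command surface from code rather than a shell, a minimal Node sketch of calling the documented `read` command and capturing its output; the flag values come straight from the command list above, while the wrapper itself (name, error handling, buffer size) is illustrative.

```js
// Minimal sketch: call the CLI's `read` command from Node and capture stdout.
// Assumes `gologin-web-access` is on PATH and GOLOGIN_SCRAPING_API_KEY is set.
const { execFile } = require("node:child_process");

function readPage(url, format = "text") {
    return new Promise((resolve, reject) => {
        execFile(
            "gologin-web-access",
            ["read", url, "--format", format, "--source", "auto"],
            { maxBuffer: 16 * 1024 * 1024 },
            (error, stdout, stderr) => {
                if (error) {
                    // stderr carries the command-group specific missing-key errors
                    reject(new Error(stderr || error.message));
                    return;
                }
                resolve(stdout);
            });
    });
}

// readPage("https://example.com", "markdown").then(console.log);
```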
@@ -111,7 +111,7 @@ Use these when you need state, interaction, or multi-step browser flows.
 
 ### GoLogin API Helpers
 
-These commands use the GoLogin REST API directly through `GOLOGIN_TOKEN`. They do not require
+These commands use the GoLogin REST API directly through `GOLOGIN_TOKEN`. They do not require Scraping API and do not start the browser daemon:
 
 - `gologin-web-access cloud-usage --profile <profileId> | --workspace <workspaceId> [--days <1-30>] [--json]`
 - `gologin-web-access profile-cloud start <profileId> [--json]`
@@ -185,7 +185,7 @@ If the browser surface grows substantially later, a nested namespace may become
 
 This CLI uses two different GoLogin credentials on purpose, because the underlying products are different.
 
-- `
+- `GOLOGIN_SCRAPING_API_KEY`
 Required for Scraping / Read commands.
 - `GOLOGIN_TOKEN`
 Required for `gologin-web-access open`, GoLogin API helper commands, and profile validation in `gologin-web-access doctor`.
@@ -194,16 +194,16 @@ This CLI uses two different GoLogin credentials on purpose, because the underlyi
 - `GOLOGIN_DAEMON_PORT`
 Optional local daemon port for browser workflows.
 
-Recommended full setup for agents is to configure both `
+Recommended full setup for agents is to configure both `GOLOGIN_SCRAPING_API_KEY` and `GOLOGIN_TOKEN` before starting work, even if the current task looks read-only or browser-only.
 
 Missing-key errors are command-group specific. Example:
 
-`Missing
+`Missing GOLOGIN_SCRAPING_API_KEY. This is required for scraping commands like \`gologin-web-access scrape\`.`
 
 Environment variables are the primary configuration mechanism:
 
 ```bash
-export
+export GOLOGIN_SCRAPING_API_KEY="wu_..."
 export GOLOGIN_TOKEN="gl_..."
 export GOLOGIN_DEFAULT_PROFILE_ID="profile_123"
 export GOLOGIN_DAEMON_PORT="4590"
@@ -218,8 +218,8 @@ gologin-web-access config init
 Useful variants:
 
 ```bash
-gologin-web-access config init --
-gologin-web-access config init --web-unlocker-key wu_... --token gl_...
+gologin-web-access config init --scraping-api-key wu_... --token gl_...
+gologin-web-access config init --web-unlocker-key wu_... --token gl_... # legacy alias
 ```
 
 That writes `~/.gologin-web-access/config.json` once and the CLI will keep reading it on later runs.
@@ -229,7 +229,7 @@ You can also write a minimal config file at `~/.gologin-web-access/config.json`:
 
 ```json
 {
-"
+"scrapingApiKey": "wu_...",
 "cloudToken": "gl_...",
 "defaultProfileId": "profile_123",
 "daemonPort": 4590
@@ -266,7 +266,7 @@ npm install -g gologin-web-access
 ### Read A Page
 
 ```bash
-export
+export GOLOGIN_SCRAPING_API_KEY="wu_..."
 
 gologin-web-access scrape https://example.com
 gologin-web-access read https://docs.browserbase.com/features/stealth-mode
@@ -332,17 +332,17 @@ gologin-web-access snapshot -i
 
 ## Structured Output And Retry Controls
 
-- `scrape-markdown` and `scrape-text` now default to `--source auto`: they start with
+- `scrape-markdown` and `scrape-text` now default to `--source auto`: they start with Scraping API, isolate the most readable content block, and can auto-retry with Cloud Browser when the output still looks like JS-rendered docs chrome.
 - `read` is the shortest path for "look at this docs page" work: it targets the most readable content block and defaults to `--format text --source auto`.
-- `scrape-markdown` and `scrape-text` also accept `--source
-- `extract` now accepts `--source auto|
+- `scrape-markdown` and `scrape-text` also accept `--source scraping` and `--source browser` when you want to force one path. `--source unlocker` remains as a legacy alias.
+- `extract` now accepts `--source auto|scraping|browser` and returns `renderSource`, fallback flags, and request metadata with the extracted JSON.
 - `batch-extract` reuses the same extraction path across many URLs and returns one structured result per URL, including request and fallback metadata. Add `--output <path>` to save the full array directly.
 - `scrape-json` now returns both a flat `headings` array and `headingsByLevel` buckets for `h1` through `h6`.
 - `scrape-json --fallback browser` is available for JS-heavy pages where stateless extraction returns weak heading data.
 - `scrape-json` now also classifies the page outcome as `ok`, `empty`, `incomplete`, `authwall`, `challenge`, `blocked`, or `cookie_wall`, and includes `nextActionHint` when the result is weak or gated.
 - `scrape`, `scrape-markdown`, `scrape-text`, `scrape-json`, and `batch-scrape` accept `--retry`, `--backoff-ms`, and `--timeout-ms`.
 - `batch-scrape --only-main-content` lets markdown, text, and html batch runs use the same readable-content isolation path as `read`.
-- `crawl --only-main-content` uses the same readable-fragment extraction strategy for html, markdown, and text crawl output, but stays on the stateless
+- `crawl --only-main-content` uses the same readable-fragment extraction strategy for html, markdown, and text crawl output, but stays on the stateless Scraping API path.
 - `batch-scrape --summary` prints a one-line success/failure summary to `stderr` after the JSON payload.
 - `batch-scrape` now returns exit code `0` on partial success by default and only fails the command when every URL failed. Add `--strict` if any single failed URL should make the whole batch exit non-zero.
 - `batch-scrape --output <path>` writes the full JSON to disk so shells and agent consoles cannot truncate a large payload silently.
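Those batch semantics (partial success by default, the full JSON array written via `--output`) make the command easy to wrap from Node. A sketch using only the flags documented above; the shape of each per-URL entry in the result array is not specified here and is left untouched.

```js
// Sketch: run batch-scrape with the documented flags, let it write the full
// result array to disk, then load it back. Per-entry fields are not assumed.
const { execFileSync } = require("node:child_process");
const { readFileSync } = require("node:fs");

function batchScrapeToFile(urls, outPath) {
    // Without --strict the command exits 0 unless every URL failed.
    execFileSync("gologin-web-access", [
        "batch-scrape", ...urls,
        "--format", "markdown",
        "--only-main-content",
        "--retry", "2",
        "--backoff-ms", "500",
        "--output", outPath,
        "--summary",
    ], { stdio: "inherit" });
    return JSON.parse(readFileSync(outPath, "utf8")); // one entry per URL
}
```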
@@ -370,7 +370,7 @@ gologin-web-access jobs
 
 Gologin Web Access still has two runtime layers:
 
--
+- Scraping API for stateless read and extraction
 - Cloud Browser for stateful interaction
 
 But both are now shipped inside the same package and the same repository. One install gives you the full read layer and the full browser/session layer.
package/dist/cli.js
CHANGED
@@ -68,12 +68,12 @@ const wait_1 = require("./commands/wait");
 const doctor_1 = require("./doctor");
 const errors_1 = require("./lib/errors");
 const output_1 = require("./lib/output");
-const CLI_VERSION = "0.3.
+const CLI_VERSION = "0.3.5";
 async function main() {
 const program = new commander_1.Command();
 program
 .name("gologin-web-access")
-.description("Read and interact with the web using
+.description("Read and interact with the web using the GoLogin Scraping API and Cloud Browser.")
 .version(CLI_VERSION)
 .showHelpAfterError()
 .showSuggestionAfterError();
@@ -174,7 +174,7 @@ Command groups:
 Agent: gologin-web-access run|batch|jobs|job
 
 Key model:
-${"
+${"GOLOGIN_SCRAPING_API_KEY"} powers scraping commands.
 ${"GOLOGIN_TOKEN"} powers browser commands.
 Recommended setup: configure both keys up front, even if the current task only needs one path.
 `);
package/dist/commands/batchChangeTrack.js
CHANGED

@@ -13,7 +13,7 @@ const concurrency_1 = require("../lib/concurrency");
 const output_1 = require("../lib/output");
 const shared_1 = require("./shared");
 function buildBatchChangeTrackCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("batch-change-track")
 .description("Track multiple pages over time and report which ones are new, same, or changed.")
 .argument("<urls...>", "One or more URLs")
 .option("--format <format>", "html, markdown, text, or json", "markdown")
@@ -22,10 +22,10 @@ function buildBatchChangeTrackCommand() {
 .option("--summary", "Print one-line status counts to stderr after the JSON output")
 .action(async (urls, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const format = (0, changeTracking_1.normalizeTrackingFormat)(options.format);
 const concurrency = Math.max(1, Number(options.concurrency) || 4);
-const requestOptions = (0, shared_1.
+const requestOptions = (0, shared_1.normalizeScrapingApiRequestOptions)(options);
 const results = await (0, concurrency_1.mapWithConcurrency)(urls, concurrency, async (url) => {
 try {
 const key = (0, changeTracking_1.buildTrackingKey)(url);
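Several of these command builders now share the `addScrapingApiRequestOptions` / `normalizeScrapingApiRequestOptions` pair. The README lists `--retry`, `--backoff-ms`, and `--timeout-ms` as the shared request flags, so a plausible shape of that helper pair looks like the sketch below; the actual implementation in `dist/commands/shared.js` is not shown in this excerpt and may differ.

```js
// Sketch of the shared option helpers, assuming they cover the --retry,
// --backoff-ms, and --timeout-ms flags documented in the README.
function addScrapingApiRequestOptions(command) {
    return command
        .option("--retry <count>", "Retry attempts for Scraping API requests", "0")
        .option("--backoff-ms <ms>", "Delay between retries", "0")
        .option("--timeout-ms <ms>", "Per-request timeout");
}

function normalizeScrapingApiRequestOptions(options) {
    return {
        retry: Math.max(0, Number(options.retry) || 0),
        backoffMs: Math.max(0, Number(options.backoffMs) || 0),
        timeoutMs: options.timeoutMs ? Number(options.timeoutMs) : undefined,
    };
}
```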
package/dist/commands/batchExtract.js
CHANGED

@@ -14,21 +14,21 @@ const output_1 = require("../lib/output");
 const readSource_1 = require("../lib/readSource");
 const shared_1 = require("./shared");
 function buildBatchExtractCommand() {
-return (0, shared_1.addProfileOption)((0, shared_1.
+return (0, shared_1.addProfileOption)((0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("batch-extract")
 .description("Extract structured data from multiple pages using one selector schema.")
 .argument("<urls...>", "One or more URLs")
 .requiredOption("--schema <path>", "Path to a JSON extraction schema")
-.option("--source <source>", "Read source: auto,
+.option("--source <source>", "Read source: auto, scraping, or browser. Legacy alias: unlocker", "auto")
 .option("--concurrency <count>", "Number of concurrent requests", "4")
 .option("--output <path>", "Write the full batch result JSON to a file")
 .option("--summary", "Print one-line summary stats to stderr after the JSON output")
 .action(async (urls, options) => {
 const config = await (0, config_1.loadConfig)();
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const schema = await readSchema(path_1.default.resolve(options.schema));
 const concurrency = Math.max(1, Number(options.concurrency) || 4);
-const request = (0, shared_1.
+const request = (0, shared_1.normalizeScrapingApiRequestOptions)(options);
 const results = await (0, concurrency_1.mapWithConcurrency)(urls, concurrency, async (url) => {
 try {
 return {
package/dist/commands/batchScrape.js
CHANGED

@@ -13,16 +13,16 @@ const config_1 = require("../config");
 const output_1 = require("../lib/output");
 const readSource_1 = require("../lib/readSource");
 const structuredScrape_1 = require("../lib/structuredScrape");
-const
+const scrapingApi_1 = require("../lib/scrapingApi");
 const shared_1 = require("./shared");
 function buildBatchScrapeCommand() {
-return (0, shared_1.addProfileOption)((0, shared_1.
-.description("Fetch multiple pages through
+return (0, shared_1.addProfileOption)((0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("batch-scrape")
+.description("Fetch multiple pages through Scraping API and print a JSON array of results.")
 .argument("<urls...>", "One or more URLs")
 .option("--format <format>", "html, markdown, text, or json", "html")
 .option("--concurrency <count>", "Number of concurrent requests", "4")
 .option("--fallback <mode>", "Structured scrape fallback: none or browser", "none")
-.option("--source <source>", "Read source for --only-main-content mode: auto,
+.option("--source <source>", "Read source for --only-main-content mode: auto, scraping, or browser. Legacy alias: unlocker", "auto")
 .option("--only-main-content", "For html, markdown, or text formats, isolate the most readable content block per page")
 .option("--output <path>", "Write the full batch result JSON to a file")
 .option("--summary", "Print one-line summary stats to stderr after the JSON output")
@@ -32,9 +32,9 @@ function buildBatchScrapeCommand() {
 const format = normalizeFormat(options.format);
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
 const usingBrowserOnlyMainContent = Boolean(options.onlyMainContent) && format !== "json" && source === "browser";
-const apiKey = usingBrowserOnlyMainContent ? "" : (0, config_1.
+const apiKey = usingBrowserOnlyMainContent ? "" : (0, config_1.requireScrapingApiKey)(config);
 const concurrency = Math.max(1, Number(options.concurrency) || 4);
-const requestOptions = (0, shared_1.
+const requestOptions = (0, shared_1.normalizeScrapingApiRequestOptions)(options);
 const fallback = (0, structuredScrape_1.normalizeStructuredFallbackMode)(options.fallback);
 const results = await mapWithConcurrency(urls, concurrency, async (url) => {
 try {
@@ -120,15 +120,15 @@ async function formatOutput(url, config, apiKey, format, requestOptions, fallbac
 switch (format) {
 case "html":
 return {
-output: (await (0,
+output: (await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, requestOptions)).content,
 };
 case "markdown":
 return {
-output: (await (0,
+output: (await (0, scrapingApi_1.scrapeMarkdown)(url, apiKey, requestOptions)).markdown,
 };
 case "text":
 return {
-output: (await (0,
+output: (await (0, scrapingApi_1.scrapeText)(url, apiKey, requestOptions)).text,
 };
 case "json":
 return mapStructuredBatchResult(await (0, structuredScrape_1.scrapeStructuredJson)(url, config, apiKey, {
@@ -137,7 +137,7 @@ async function formatOutput(url, config, apiKey, format, requestOptions, fallbac
 }));
 default:
 return {
-output: (await (0,
+output: (await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, requestOptions)).content,
 };
 }
 }
package/dist/commands/changeTrack.js
CHANGED

@@ -12,7 +12,7 @@ const changeTracking_1 = require("../lib/changeTracking");
 const output_1 = require("../lib/output");
 const shared_1 = require("./shared");
 function buildChangeTrackCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("change-track")
 .description("Track a page over time and report whether it changed since the last snapshot.")
 .argument("<url>", "Target URL")
 .option("--format <format>", "html, markdown, text, or json", "markdown")
@@ -21,10 +21,10 @@ function buildChangeTrackCommand() {
 .option("--output <path>", "Write the current tracking result to a file")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const format = (0, changeTracking_1.normalizeTrackingFormat)(options.format);
 const key = (0, changeTracking_1.buildTrackingKey)(url, options.key);
-const snapshot = await (0, changeTracking_1.scrapeForTracking)(url, apiKey, format, (0, shared_1.
+const snapshot = await (0, changeTracking_1.scrapeForTracking)(url, apiKey, format, (0, shared_1.normalizeScrapingApiRequestOptions)(options));
 const result = await (0, changeTracking_1.compareAndPersistSnapshot)(config, {
 key,
 url,
package/dist/commands/configInit.js
CHANGED

@@ -5,12 +5,13 @@ const commander_1 = require("commander");
 const config_1 = require("../config");
 const cloudApi_1 = require("../lib/cloudApi");
 const output_1 = require("../lib/output");
-const
+const scrapingApi_1 = require("../lib/scrapingApi");
 function buildConfigInitCommand() {
 return new commander_1.Command("init")
-.description("Write ~/.gologin-web-access/config.json with current values or placeholders. Recommended: persist both the
-.option("--
-.option("--web-unlocker-key <key>", "
+.description("Write ~/.gologin-web-access/config.json with current values or placeholders. Recommended: persist both the Scraping API key and the GoLogin token.")
+.option("--scraping-api-key <key>", "Persist a Scraping API key")
+.option("--web-unlocker-api-key <key>", "Legacy alias for --scraping-api-key")
+.option("--web-unlocker-key <key>", "Legacy alias for --scraping-api-key")
 .option("--token <token>", "Persist a GoLogin token")
 .option("--cloud-token <token>", "Backward-compatible alias for --token")
 .option("--default-profile-id <id>", "Persist a default Gologin profile ID")
@@ -18,9 +19,14 @@ function buildConfigInitCommand() {
 .option("--no-validate", "Skip live key validation after writing config")
 .option("--force", "Overwrite an existing config file")
 .action(async (options) => {
-const
+const scrapingApiKey = options.scrapingApiKey ??
+options.webUnlockerApiKey ??
+options.webUnlockerKey ??
+process.env[config_1.ENV_NAMES.scrapingApiKey] ??
+process.env.GOLOGIN_WEB_UNLOCKER_API_KEY ??
+process.env.GOLOGIN_WEBUNLOCKER_API_KEY;
 const result = await (0, config_1.initConfigFile)({
-
+scrapingApiKey,
 cloudToken: options.token ??
 options.cloudToken ??
 process.env[config_1.ENV_NAMES.cloudToken] ??
@@ -37,8 +43,8 @@ function buildConfigInitCommand() {
 (0, output_1.printKeyValueRows)([
 { label: "Config file", value: result.path },
 {
-label: "
-value: result.config.
+label: "Scraping API key",
+value: result.config.scrapingApiKey ? "written" : "left empty",
 },
 {
 label: "GoLogin token",
@@ -53,17 +59,17 @@ function buildConfigInitCommand() {
 value: String(result.config.daemonPort ?? config_1.DEFAULT_DAEMON_PORT),
 },
 ]);
-if (!result.config.
-(0, output_1.printText)("Recommended next step: configure both
+if (!result.config.scrapingApiKey || !result.config.cloudToken) {
+(0, output_1.printText)("Recommended next step: configure both GOLOGIN_SCRAPING_API_KEY and GOLOGIN_TOKEN so agents can use scraping and browser flows without asking again.");
 }
 if (options.validate === false) {
 return;
 }
 const validationRows = [];
-if (result.config.
-const validation = await (0,
+if (result.config.scrapingApiKey) {
+const validation = await (0, scrapingApi_1.validateScrapingApiKey)(result.config.scrapingApiKey);
 validationRows.push({
-label: "
+label: "Scraping API validation",
 value: validation.ok ? "ok" : `failed${validation.status ? ` (${validation.status})` : ""}: ${validation.detail}`,
 });
 }
package/dist/commands/crawl.js
CHANGED
@@ -7,7 +7,7 @@ const crawl_1 = require("../lib/crawl");
 const output_1 = require("../lib/output");
 function buildCrawlCommand() {
 return new commander_1.Command("crawl")
-.description("Crawl a website through
+.description("Crawl a website through GoLogin Scraping API and return per-page extracted content.")
 .argument("<url>", "Root website URL to crawl")
 .option("--format <format>", "html, markdown, text, or json", "markdown")
 .option("--limit <count>", "Maximum number of pages to visit", "25")
@@ -22,7 +22,7 @@ function buildCrawlCommand() {
 .option("--strict", "Exit non-zero when any page fails during crawling")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const format = normalizeFormat(options.format);
 const result = await (0, crawl_1.crawlSite)(url, apiKey, format, {
 limit: normalizePositiveInt(options.limit, 25),
package/dist/commands/crawlStart.js
CHANGED

@@ -21,7 +21,7 @@ function buildCrawlStartCommand() {
 .option("--only-main-content", "For html, markdown, or text output, isolate the most readable content block on each page")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-(0, config_1.
+(0, config_1.requireScrapingApiKey)(config);
 const args = buildCrawlArgs(url, options);
 const job = await (0, jobRegistry_1.createJob)(config, {
 kind: "crawl",
package/dist/commands/extract.js
CHANGED
@@ -13,20 +13,20 @@ const output_1 = require("../lib/output");
 const readSource_1 = require("../lib/readSource");
 const shared_1 = require("./shared");
 function buildExtractCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("extract")
 .description("Extract structured data from a page using a selector schema.")
 .argument("<url>", "Target URL")
 .requiredOption("--schema <path>", "Path to a JSON extraction schema")
 .option("--output <path>", "Write extracted JSON to a file")
-.option("--source <source>", "Read source: auto,
+.option("--source <source>", "Read source: auto, scraping, or browser. Legacy alias: unlocker", "auto")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const schema = await readSchema(path_1.default.resolve(options.schema));
 const result = await (0, extractRunner_1.extractUrlWithSchema)(url, config, apiKey, schema, {
 source,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 });
 if (options.output) {
 const outputPath = path_1.default.resolve(options.output);
package/dist/commands/map.js
CHANGED
@@ -7,7 +7,7 @@ const output_1 = require("../lib/output");
 const crawl_1 = require("../lib/crawl");
 function buildMapCommand() {
 return new commander_1.Command("map")
-.description("Discover internal website links through
+.description("Discover internal website links through GoLogin Scraping API.")
 .argument("<url>", "Root website URL to map")
 .option("--limit <count>", "Maximum number of pages to visit", "100")
 .option("--max-depth <depth>", "Maximum link depth from the root URL", "2")
@@ -20,7 +20,7 @@ function buildMapCommand() {
 .option("--strict", "Exit non-zero when any page fails during mapping")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const result = await (0, crawl_1.mapSite)(url, apiKey, {
 limit: normalizePositiveInt(options.limit, 100),
 maxDepth: normalizeNonNegativeInt(options.maxDepth, 2),
package/dist/commands/read.js
CHANGED
@@ -8,20 +8,20 @@ const readSource_1 = require("../lib/readSource");
 const output_1 = require("../lib/output");
 const shared_1 = require("./shared");
 function buildReadCommand() {
-return (0, shared_1.addProfileOption)((0, shared_1.
+return (0, shared_1.addProfileOption)((0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("read")
 .description("Read the main content of a docs page or article with automatic fallback to Cloud Browser when needed.")
 .argument("<url>", "URL to read")
 .option("--format <format>", "Output format: html, markdown, or text", "text")
-.option("--source <source>", "Read source: auto,
+.option("--source <source>", "Read source: auto, scraping, or browser. Legacy alias: unlocker", "auto")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
 const format = normalizeReadFormat(options.format);
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const readOptions = {
 source,
 profile: options.profile,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 };
 const result = format === "html"
 ? await (0, readSource_1.readHtmlContent)(url, config, apiKey, readOptions)
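Every `--source` flag in these commands goes through `readSource_1.normalizeReadSourceMode`, and the changelog states that `unlocker` stays valid as a legacy alias for `scraping`. A plausible sketch of that normalization follows, assuming the function simply maps the alias and falls back to the provided default; the real implementation in `dist/lib/readSource.js` is not part of this excerpt.

```js
// Sketch of legacy-alias handling for --source, per the 0.3.5 changelog:
// "unlocker" is still accepted and treated as the new "scraping" value.
function normalizeReadSourceMode(value, fallback = "auto") {
    const mode = String(value ?? "").trim().toLowerCase();
    if (mode === "unlocker") {
        return "scraping"; // legacy alias from the Web Unlocker naming
    }
    if (mode === "auto" || mode === "scraping" || mode === "browser") {
        return mode;
    }
    return fallback; // unknown values fall back to the command default
}
```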
package/dist/commands/scrape.js
CHANGED
@@ -5,15 +5,15 @@ const commander_1 = require("commander");
 const config_1 = require("../config");
 const shared_1 = require("./shared");
 const output_1 = require("../lib/output");
-const
+const scrapingApi_1 = require("../lib/scrapingApi");
 function buildScrapeCommand() {
-return (0, shared_1.
-.description("Fetch rendered HTML through
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("scrape")
+.description("Fetch rendered HTML through GoLogin Scraping API.")
 .argument("<url>", "URL to scrape")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
-const result = await (0,
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
+const result = await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, (0, shared_1.normalizeScrapingApiRequestOptions)(options));
 (0, output_1.printText)(result.content);
 }));
 }
package/dist/commands/scrapeJson.js
CHANGED

@@ -7,21 +7,21 @@ const output_1 = require("../lib/output");
 const structuredScrape_1 = require("../lib/structuredScrape");
 const shared_1 = require("./shared");
 function buildScrapeJsonCommand() {
-return (0, shared_1.addProfileOption)((0, shared_1.
-.description("Fetch a page through
+return (0, shared_1.addProfileOption)((0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("scrape-json")
+.description("Fetch a page through Scraping API and print a structured JSON envelope.")
 .argument("<url>", "URL to scrape")
 .option("--fallback <mode>", "none or browser structured fallback for JS-heavy pages", "none")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const envelope = await (0, structuredScrape_1.scrapeStructuredJson)(url, config, apiKey, {
 fallback: (0, structuredScrape_1.normalizeStructuredFallbackMode)(options.fallback),
 profile: options.profile,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 });
 if (envelope.fallbackAttempted) {
 const fallbackStatus = envelope.fallbackUsed
-? "Browser fallback succeeded and replaced the
+? "Browser fallback succeeded and replaced the Scraping API result."
 : `Browser fallback was attempted but not used. ${envelope.fallbackReason ?? "It did not improve the structured output."}`;
 process.stderr.write(`${fallbackStatus}\n`);
 }
package/dist/commands/scrapeMarkdown.js
CHANGED

@@ -8,17 +8,17 @@ const readSource_1 = require("../lib/readSource");
 const shared_1 = require("./shared");
 const output_1 = require("../lib/output");
 function buildScrapeMarkdownCommand() {
-return (0, shared_1.
-.description("Fetch a page through
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("scrape-markdown")
+.description("Fetch a page through Scraping API and print Markdown.")
 .argument("<url>", "URL to scrape")
-.option("--source <source>", "Read source: auto,
+.option("--source <source>", "Read source: auto, scraping, or browser. Legacy alias: unlocker", "auto")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const result = await (0, readSource_1.readMarkdownContent)(url, config, apiKey, {
 source,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 });
 emitReadNotice(result);
 (0, output_1.printText)(result.content);
package/dist/commands/scrapeText.js
CHANGED

@@ -8,17 +8,17 @@ const readSource_1 = require("../lib/readSource");
 const shared_1 = require("./shared");
 const output_1 = require("../lib/output");
 function buildScrapeTextCommand() {
-return (0, shared_1.
-.description("Fetch a page through
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("scrape-text")
+.description("Fetch a page through Scraping API and print plain text.")
 .argument("<url>", "URL to scrape")
-.option("--source <source>", "Read source: auto,
+.option("--source <source>", "Read source: auto, scraping, or browser. Legacy alias: unlocker", "auto")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const result = await (0, readSource_1.readTextContent)(url, config, apiKey, {
 source,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 });
 emitReadNotice(result);
 (0, output_1.printText)(result.content);