gologin-web-access 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +1 -1
- package/dist/cli.js +1 -1
- package/dist/commands/batchChangeTrack.js +3 -3
- package/dist/commands/batchExtract.js +3 -3
- package/dist/commands/batchScrape.js +8 -8
- package/dist/commands/changeTrack.js +3 -3
- package/dist/commands/configInit.js +8 -8
- package/dist/commands/crawl.js +1 -1
- package/dist/commands/crawlStart.js +1 -1
- package/dist/commands/extract.js +3 -3
- package/dist/commands/map.js +1 -1
- package/dist/commands/read.js +3 -3
- package/dist/commands/scrape.js +4 -4
- package/dist/commands/scrapeJson.js +4 -4
- package/dist/commands/scrapeMarkdown.js +3 -3
- package/dist/commands/scrapeText.js +3 -3
- package/dist/commands/search.js +3 -3
- package/dist/commands/shared.js +4 -4
- package/dist/config.js +17 -17
- package/dist/doctor.js +2 -2
- package/dist/lib/changeTracking.js +5 -5
- package/dist/lib/crawl.js +7 -7
- package/dist/lib/document.js +2 -2
- package/dist/lib/readSource.js +45 -45
- package/dist/lib/{unlocker.js → scrapingApi.js} +10 -10
- package/dist/lib/search.js +14 -14
- package/dist/lib/structuredScrape.js +5 -5
- package/package.json +2 -2
package/CHANGELOG.md
CHANGED
```diff
@@ -5,6 +5,12 @@
 - browser automation is now embedded directly in `gologin-web-access`, so one repo and one install contains both Scraping API and Cloud Browser flows
 - doctor now reports the embedded browser runtime source and version
 
+## 0.3.5 - 2026-05-14
+
+- renamed the internal stateless scraping layer from Web Unlocker to Scraping API
+- config files now write `scrapingApiKey`, while legacy `webUnlockerApiKey` configs and old env aliases still load
+- `renderSource` and search transport values now use `scraping`; `unlocker` remains accepted as a legacy `--source` alias
+
 ## 0.3.2 - 2026-04-03
 
 - added unified page outcome classification across `read`, `scrape-json`, and `batch-scrape`
```
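The two backward-compatibility bullets above are implemented by the key-resolution order visible in the `dist/config.js` hunks later in this diff: environment variables are checked first (the new name, then the legacy aliases), and the config file's `scrapingApiKey` falls back to a legacy `webUnlockerApiKey` entry. A minimal standalone sketch of that order; `resolveScrapingApiKey` and `readScrapingKeyFromFile` are hypothetical helper names used for illustration, not the package's actual exports:

```js
// Sketch of the fallback order implied by dist/config.js in this diff; not the package's API.
const fs = require("fs");

function readScrapingKeyFromFile(configPath) {
  try {
    const parsed = JSON.parse(fs.readFileSync(configPath, "utf8"));
    // New config key wins; a legacy `webUnlockerApiKey` entry still loads.
    return parsed.scrapingApiKey ?? parsed.webUnlockerApiKey;
  } catch {
    return undefined;
  }
}

function resolveScrapingApiKey(configPath) {
  // Environment variables take precedence: the new name first, then the legacy aliases.
  return (
    process.env.GOLOGIN_SCRAPING_API_KEY ??
    process.env.GOLOGIN_WEB_UNLOCKER_API_KEY ??
    process.env.GOLOGIN_WEBUNLOCKER_API_KEY ??
    readScrapingKeyFromFile(configPath)
  );
}
```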
package/README.md
CHANGED
````diff
@@ -229,7 +229,7 @@ You can also write a minimal config file at `~/.gologin-web-access/config.json`:
 
 ```json
 {
-"
+"scrapingApiKey": "wu_...",
 "cloudToken": "gl_...",
 "defaultProfileId": "profile_123",
 "daemonPort": 4590
````
package/dist/cli.js
CHANGED
```diff
@@ -68,7 +68,7 @@ const wait_1 = require("./commands/wait");
 const doctor_1 = require("./doctor");
 const errors_1 = require("./lib/errors");
 const output_1 = require("./lib/output");
-const CLI_VERSION = "0.3.
+const CLI_VERSION = "0.3.5";
 async function main() {
 const program = new commander_1.Command();
 program
```
package/dist/commands/batchChangeTrack.js
CHANGED
```diff
@@ -13,7 +13,7 @@ const concurrency_1 = require("../lib/concurrency");
 const output_1 = require("../lib/output");
 const shared_1 = require("./shared");
 function buildBatchChangeTrackCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("batch-change-track")
 .description("Track multiple pages over time and report which ones are new, same, or changed.")
 .argument("<urls...>", "One or more URLs")
 .option("--format <format>", "html, markdown, text, or json", "markdown")
@@ -22,10 +22,10 @@ function buildBatchChangeTrackCommand() {
 .option("--summary", "Print one-line status counts to stderr after the JSON output")
 .action(async (urls, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const format = (0, changeTracking_1.normalizeTrackingFormat)(options.format);
 const concurrency = Math.max(1, Number(options.concurrency) || 4);
-const requestOptions = (0, shared_1.
+const requestOptions = (0, shared_1.normalizeScrapingApiRequestOptions)(options);
 const results = await (0, concurrency_1.mapWithConcurrency)(urls, concurrency, async (url) => {
 try {
 const key = (0, changeTracking_1.buildTrackingKey)(url);
```
package/dist/commands/batchExtract.js
CHANGED
```diff
@@ -14,7 +14,7 @@ const output_1 = require("../lib/output");
 const readSource_1 = require("../lib/readSource");
 const shared_1 = require("./shared");
 function buildBatchExtractCommand() {
-return (0, shared_1.addProfileOption)((0, shared_1.
+return (0, shared_1.addProfileOption)((0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("batch-extract")
 .description("Extract structured data from multiple pages using one selector schema.")
 .argument("<urls...>", "One or more URLs")
 .requiredOption("--schema <path>", "Path to a JSON extraction schema")
@@ -25,10 +25,10 @@ function buildBatchExtractCommand() {
 .action(async (urls, options) => {
 const config = await (0, config_1.loadConfig)();
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const schema = await readSchema(path_1.default.resolve(options.schema));
 const concurrency = Math.max(1, Number(options.concurrency) || 4);
-const request = (0, shared_1.
+const request = (0, shared_1.normalizeScrapingApiRequestOptions)(options);
 const results = await (0, concurrency_1.mapWithConcurrency)(urls, concurrency, async (url) => {
 try {
 return {
```
package/dist/commands/batchScrape.js
CHANGED
```diff
@@ -13,10 +13,10 @@ const config_1 = require("../config");
 const output_1 = require("../lib/output");
 const readSource_1 = require("../lib/readSource");
 const structuredScrape_1 = require("../lib/structuredScrape");
-const 
+const scrapingApi_1 = require("../lib/scrapingApi");
 const shared_1 = require("./shared");
 function buildBatchScrapeCommand() {
-return (0, shared_1.addProfileOption)((0, shared_1.
+return (0, shared_1.addProfileOption)((0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("batch-scrape")
 .description("Fetch multiple pages through Scraping API and print a JSON array of results.")
 .argument("<urls...>", "One or more URLs")
 .option("--format <format>", "html, markdown, text, or json", "html")
@@ -32,9 +32,9 @@ function buildBatchScrapeCommand() {
 const format = normalizeFormat(options.format);
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
 const usingBrowserOnlyMainContent = Boolean(options.onlyMainContent) && format !== "json" && source === "browser";
-const apiKey = usingBrowserOnlyMainContent ? "" : (0, config_1.
+const apiKey = usingBrowserOnlyMainContent ? "" : (0, config_1.requireScrapingApiKey)(config);
 const concurrency = Math.max(1, Number(options.concurrency) || 4);
-const requestOptions = (0, shared_1.
+const requestOptions = (0, shared_1.normalizeScrapingApiRequestOptions)(options);
 const fallback = (0, structuredScrape_1.normalizeStructuredFallbackMode)(options.fallback);
 const results = await mapWithConcurrency(urls, concurrency, async (url) => {
 try {
@@ -120,15 +120,15 @@ async function formatOutput(url, config, apiKey, format, requestOptions, fallbac
 switch (format) {
 case "html":
 return {
-output: (await (0, 
+output: (await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, requestOptions)).content,
 };
 case "markdown":
 return {
-output: (await (0, 
+output: (await (0, scrapingApi_1.scrapeMarkdown)(url, apiKey, requestOptions)).markdown,
 };
 case "text":
 return {
-output: (await (0, 
+output: (await (0, scrapingApi_1.scrapeText)(url, apiKey, requestOptions)).text,
 };
 case "json":
 return mapStructuredBatchResult(await (0, structuredScrape_1.scrapeStructuredJson)(url, config, apiKey, {
@@ -137,7 +137,7 @@ async function formatOutput(url, config, apiKey, format, requestOptions, fallbac
 }));
 default:
 return {
-output: (await (0, 
+output: (await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, requestOptions)).content,
 };
 }
 }
```
package/dist/commands/changeTrack.js
CHANGED
```diff
@@ -12,7 +12,7 @@ const changeTracking_1 = require("../lib/changeTracking");
 const output_1 = require("../lib/output");
 const shared_1 = require("./shared");
 function buildChangeTrackCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("change-track")
 .description("Track a page over time and report whether it changed since the last snapshot.")
 .argument("<url>", "Target URL")
 .option("--format <format>", "html, markdown, text, or json", "markdown")
@@ -21,10 +21,10 @@ function buildChangeTrackCommand() {
 .option("--output <path>", "Write the current tracking result to a file")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const format = (0, changeTracking_1.normalizeTrackingFormat)(options.format);
 const key = (0, changeTracking_1.buildTrackingKey)(url, options.key);
-const snapshot = await (0, changeTracking_1.scrapeForTracking)(url, apiKey, format, (0, shared_1.
+const snapshot = await (0, changeTracking_1.scrapeForTracking)(url, apiKey, format, (0, shared_1.normalizeScrapingApiRequestOptions)(options));
 const result = await (0, changeTracking_1.compareAndPersistSnapshot)(config, {
 key,
 url,
```
package/dist/commands/configInit.js
CHANGED
```diff
@@ -5,7 +5,7 @@ const commander_1 = require("commander");
 const config_1 = require("../config");
 const cloudApi_1 = require("../lib/cloudApi");
 const output_1 = require("../lib/output");
-const 
+const scrapingApi_1 = require("../lib/scrapingApi");
 function buildConfigInitCommand() {
 return new commander_1.Command("init")
 .description("Write ~/.gologin-web-access/config.json with current values or placeholders. Recommended: persist both the Scraping API key and the GoLogin token.")
@@ -19,14 +19,14 @@ function buildConfigInitCommand() {
 .option("--no-validate", "Skip live key validation after writing config")
 .option("--force", "Overwrite an existing config file")
 .action(async (options) => {
-const 
+const scrapingApiKey = options.scrapingApiKey ??
 options.webUnlockerApiKey ??
 options.webUnlockerKey ??
-process.env[config_1.ENV_NAMES.
+process.env[config_1.ENV_NAMES.scrapingApiKey] ??
 process.env.GOLOGIN_WEB_UNLOCKER_API_KEY ??
 process.env.GOLOGIN_WEBUNLOCKER_API_KEY;
 const result = await (0, config_1.initConfigFile)({
-
+scrapingApiKey,
 cloudToken: options.token ??
 options.cloudToken ??
 process.env[config_1.ENV_NAMES.cloudToken] ??
@@ -44,7 +44,7 @@ function buildConfigInitCommand() {
 { label: "Config file", value: result.path },
 {
 label: "Scraping API key",
-value: result.config.
+value: result.config.scrapingApiKey ? "written" : "left empty",
 },
 {
 label: "GoLogin token",
@@ -59,15 +59,15 @@ function buildConfigInitCommand() {
 value: String(result.config.daemonPort ?? config_1.DEFAULT_DAEMON_PORT),
 },
 ]);
-if (!result.config.
+if (!result.config.scrapingApiKey || !result.config.cloudToken) {
 (0, output_1.printText)("Recommended next step: configure both GOLOGIN_SCRAPING_API_KEY and GOLOGIN_TOKEN so agents can use scraping and browser flows without asking again.");
 }
 if (options.validate === false) {
 return;
 }
 const validationRows = [];
-if (result.config.
-const validation = await (0, 
+if (result.config.scrapingApiKey) {
+const validation = await (0, scrapingApi_1.validateScrapingApiKey)(result.config.scrapingApiKey);
 validationRows.push({
 label: "Scraping API validation",
 value: validation.ok ? "ok" : `failed${validation.status ? ` (${validation.status})` : ""}: ${validation.detail}`,
```
package/dist/commands/crawl.js
CHANGED
```diff
@@ -22,7 +22,7 @@ function buildCrawlCommand() {
 .option("--strict", "Exit non-zero when any page fails during crawling")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const format = normalizeFormat(options.format);
 const result = await (0, crawl_1.crawlSite)(url, apiKey, format, {
 limit: normalizePositiveInt(options.limit, 25),
```
package/dist/commands/crawlStart.js
CHANGED
```diff
@@ -21,7 +21,7 @@ function buildCrawlStartCommand() {
 .option("--only-main-content", "For html, markdown, or text output, isolate the most readable content block on each page")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-(0, config_1.
+(0, config_1.requireScrapingApiKey)(config);
 const args = buildCrawlArgs(url, options);
 const job = await (0, jobRegistry_1.createJob)(config, {
 kind: "crawl",
```
package/dist/commands/extract.js
CHANGED
```diff
@@ -13,7 +13,7 @@ const output_1 = require("../lib/output");
 const readSource_1 = require("../lib/readSource");
 const shared_1 = require("./shared");
 function buildExtractCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("extract")
 .description("Extract structured data from a page using a selector schema.")
 .argument("<url>", "Target URL")
 .requiredOption("--schema <path>", "Path to a JSON extraction schema")
@@ -22,11 +22,11 @@ function buildExtractCommand() {
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const schema = await readSchema(path_1.default.resolve(options.schema));
 const result = await (0, extractRunner_1.extractUrlWithSchema)(url, config, apiKey, schema, {
 source,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 });
 if (options.output) {
 const outputPath = path_1.default.resolve(options.output);
```
package/dist/commands/map.js
CHANGED
```diff
@@ -20,7 +20,7 @@ function buildMapCommand() {
 .option("--strict", "Exit non-zero when any page fails during mapping")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const result = await (0, crawl_1.mapSite)(url, apiKey, {
 limit: normalizePositiveInt(options.limit, 100),
 maxDepth: normalizeNonNegativeInt(options.maxDepth, 2),
```
package/dist/commands/read.js
CHANGED
```diff
@@ -8,7 +8,7 @@ const readSource_1 = require("../lib/readSource");
 const output_1 = require("../lib/output");
 const shared_1 = require("./shared");
 function buildReadCommand() {
-return (0, shared_1.addProfileOption)((0, shared_1.
+return (0, shared_1.addProfileOption)((0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("read")
 .description("Read the main content of a docs page or article with automatic fallback to Cloud Browser when needed.")
 .argument("<url>", "URL to read")
 .option("--format <format>", "Output format: html, markdown, or text", "text")
@@ -17,11 +17,11 @@ function buildReadCommand() {
 const config = await (0, config_1.loadConfig)();
 const format = normalizeReadFormat(options.format);
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const readOptions = {
 source,
 profile: options.profile,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 };
 const result = format === "html"
 ? await (0, readSource_1.readHtmlContent)(url, config, apiKey, readOptions)
```
package/dist/commands/scrape.js
CHANGED
```diff
@@ -5,15 +5,15 @@ const commander_1 = require("commander");
 const config_1 = require("../config");
 const shared_1 = require("./shared");
 const output_1 = require("../lib/output");
-const 
+const scrapingApi_1 = require("../lib/scrapingApi");
 function buildScrapeCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("scrape")
 .description("Fetch rendered HTML through GoLogin Scraping API.")
 .argument("<url>", "URL to scrape")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
-const result = await (0, 
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
+const result = await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, (0, shared_1.normalizeScrapingApiRequestOptions)(options));
 (0, output_1.printText)(result.content);
 }));
 }
```
package/dist/commands/scrapeJson.js
CHANGED
```diff
@@ -7,21 +7,21 @@ const output_1 = require("../lib/output");
 const structuredScrape_1 = require("../lib/structuredScrape");
 const shared_1 = require("./shared");
 function buildScrapeJsonCommand() {
-return (0, shared_1.addProfileOption)((0, shared_1.
+return (0, shared_1.addProfileOption)((0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("scrape-json")
 .description("Fetch a page through Scraping API and print a structured JSON envelope.")
 .argument("<url>", "URL to scrape")
 .option("--fallback <mode>", "none or browser structured fallback for JS-heavy pages", "none")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
-const apiKey = (0, config_1.
+const apiKey = (0, config_1.requireScrapingApiKey)(config);
 const envelope = await (0, structuredScrape_1.scrapeStructuredJson)(url, config, apiKey, {
 fallback: (0, structuredScrape_1.normalizeStructuredFallbackMode)(options.fallback),
 profile: options.profile,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 });
 if (envelope.fallbackAttempted) {
 const fallbackStatus = envelope.fallbackUsed
-? "Browser fallback succeeded and replaced the 
+? "Browser fallback succeeded and replaced the Scraping API result."
 : `Browser fallback was attempted but not used. ${envelope.fallbackReason ?? "It did not improve the structured output."}`;
 process.stderr.write(`${fallbackStatus}\n`);
 }
```
package/dist/commands/scrapeMarkdown.js
CHANGED
```diff
@@ -8,17 +8,17 @@ const readSource_1 = require("../lib/readSource");
 const shared_1 = require("./shared");
 const output_1 = require("../lib/output");
 function buildScrapeMarkdownCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("scrape-markdown")
 .description("Fetch a page through Scraping API and print Markdown.")
 .argument("<url>", "URL to scrape")
 .option("--source <source>", "Read source: auto, scraping, or browser. Legacy alias: unlocker", "auto")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const result = await (0, readSource_1.readMarkdownContent)(url, config, apiKey, {
 source,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 });
 emitReadNotice(result);
 (0, output_1.printText)(result.content);
```
package/dist/commands/scrapeText.js
CHANGED
```diff
@@ -8,17 +8,17 @@ const readSource_1 = require("../lib/readSource");
 const shared_1 = require("./shared");
 const output_1 = require("../lib/output");
 function buildScrapeTextCommand() {
-return (0, shared_1.
+return (0, shared_1.addScrapingApiRequestOptions)(new commander_1.Command("scrape-text")
 .description("Fetch a page through Scraping API and print plain text.")
 .argument("<url>", "URL to scrape")
 .option("--source <source>", "Read source: auto, scraping, or browser. Legacy alias: unlocker", "auto")
 .action(async (url, options) => {
 const config = await (0, config_1.loadConfig)();
 const source = (0, readSource_1.normalizeReadSourceMode)(options.source, "auto");
-const apiKey = source === "browser" ? "" : (0, config_1.
+const apiKey = source === "browser" ? "" : (0, config_1.requireScrapingApiKey)(config);
 const result = await (0, readSource_1.readTextContent)(url, config, apiKey, {
 source,
-request: (0, shared_1.
+request: (0, shared_1.normalizeScrapingApiRequestOptions)(options),
 });
 emitReadNotice(result);
 (0, output_1.printText)(result.content);
```
package/dist/commands/search.js
CHANGED
```diff
@@ -32,10 +32,10 @@ function normalizeLimit(value) {
 return Math.min(Math.floor(parsed), 100);
 }
 function normalizeSource(value) {
-if (value === "scraping" || value === "scraping-api") {
-return "
+if (value === "scraping" || value === "scraping-api" || value === "unlocker") {
+return "scraping";
 }
-if (value === "auto" || value === "
+if (value === "auto" || value === "browser") {
 return value;
 }
 throw new Error(`Unsupported search source: ${value}`);
```
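The `normalizeSource` change above is the command-side half of the legacy-alias handling called out in the changelog; `normalizeReadSourceMode` in `dist/lib/readSource.js` applies the same mapping for the `--source` flag. A standalone sketch of that mapping, with `normalizeSourceValue` as an illustrative name rather than the package's export:

```js
// Sketch of the source-value mapping shown in the normalizeSource / normalizeReadSourceMode hunks.
function normalizeSourceValue(value) {
  if (value === "scraping" || value === "scraping-api" || value === "unlocker") {
    return "scraping"; // legacy `unlocker` alias maps to the new transport name
  }
  if (value === "auto" || value === "browser") {
    return value;
  }
  throw new Error(`Unsupported source: ${value}`);
}

console.log(normalizeSourceValue("unlocker")); // "scraping"
```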
package/dist/commands/shared.js
CHANGED
```diff
@@ -8,8 +8,8 @@ exports.addProfileOption = addProfileOption;
 exports.runBrowserCommand = runBrowserCommand;
 exports.runOpenLikeCommand = runOpenLikeCommand;
 exports.resolveOutputPath = resolveOutputPath;
-exports.
-exports.
+exports.addScrapingApiRequestOptions = addScrapingApiRequestOptions;
+exports.normalizeScrapingApiRequestOptions = normalizeScrapingApiRequestOptions;
 const path_1 = __importDefault(require("path"));
 const config_1 = require("../config");
 const agentCli_1 = require("../lib/agentCli");
@@ -67,13 +67,13 @@ async function runOpenLikeCommand(url, options) {
 function resolveOutputPath(targetPath) {
 return path_1.default.resolve(targetPath);
 }
-function 
+function addScrapingApiRequestOptions(command) {
 return command
 .option("--retry <count>", "Retry attempts for timeout, 429, and 5xx responses")
 .option("--backoff-ms <ms>", "Base exponential backoff in milliseconds for retried requests")
 .option("--timeout-ms <ms>", "Per-request timeout in milliseconds");
 }
-function 
+function normalizeScrapingApiRequestOptions(options) {
 return {
 maxRetries: normalizeOptionalNonNegativeInt(options.retry),
 backoffMs: normalizeOptionalNonNegativeInt(options.backoffMs),
```
package/dist/config.js
CHANGED
```diff
@@ -7,7 +7,7 @@ exports.ENV_NAMES = exports.DEFAULT_DAEMON_PORT = void 0;
 exports.getDefaultConfigPath = getDefaultConfigPath;
 exports.loadConfig = loadConfig;
 exports.initConfigFile = initConfigFile;
-exports.
+exports.requireScrapingApiKey = requireScrapingApiKey;
 exports.requireCloudToken = requireCloudToken;
 exports.resolveProfileId = resolveProfileId;
 exports.getRecommendedCredentialStatus = getRecommendedCredentialStatus;
@@ -22,13 +22,13 @@ const LEGACY_CONFIG_DIR = ".gologin-web";
 const CONFIG_FILENAME = "config.json";
 exports.DEFAULT_DAEMON_PORT = 4590;
 exports.ENV_NAMES = {
-
+scrapingApiKey: "GOLOGIN_SCRAPING_API_KEY",
 cloudToken: "GOLOGIN_TOKEN",
 defaultProfileId: "GOLOGIN_DEFAULT_PROFILE_ID",
 daemonPort: "GOLOGIN_DAEMON_PORT",
 };
 const LEGACY_ENV_NAMES = {
-
+scrapingApiKey: ["GOLOGIN_WEB_UNLOCKER_API_KEY", "GOLOGIN_WEBUNLOCKER_API_KEY"],
 cloudToken: ["GOLOGIN_CLOUD_TOKEN"],
 defaultProfileId: ["GOLOGIN_PROFILE_ID"],
 daemonPort: [],
@@ -40,11 +40,11 @@ async function loadConfig() {
 const configPath = await resolveConfigPath();
 const stateDir = path_1.default.dirname(configPath);
 const fileConfig = await readConfigFile(configPath);
-const 
+const scrapingApiEnv = firstEnvValue(exports.ENV_NAMES.scrapingApiKey, LEGACY_ENV_NAMES.scrapingApiKey);
 const cloudTokenEnv = firstEnvValue(exports.ENV_NAMES.cloudToken, LEGACY_ENV_NAMES.cloudToken);
 const profileEnv = firstEnvValue(exports.ENV_NAMES.defaultProfileId, LEGACY_ENV_NAMES.defaultProfileId);
 const daemonPortEnv = firstEnvValue(exports.ENV_NAMES.daemonPort, LEGACY_ENV_NAMES.daemonPort);
-const 
+const scrapingApiKey = pickString(scrapingApiEnv, fileConfig.scrapingApiKey);
 const cloudToken = pickString(cloudTokenEnv, fileConfig.cloudToken);
 const defaultProfileId = pickString(profileEnv, fileConfig.defaultProfileId);
 const daemonPort = pickNumber(daemonPortEnv, fileConfig.daemonPort, exports.DEFAULT_DAEMON_PORT);
@@ -54,12 +54,12 @@ async function loadConfig() {
 jobsDir: path_1.default.join(stateDir, "jobs"),
 trackingDir: path_1.default.join(stateDir, "tracking"),
 artifactsDir: path_1.default.join(stateDir, "artifacts"),
-
+scrapingApiKey,
 cloudToken,
 defaultProfileId,
 daemonPort,
 sources: {
-
+scrapingApiKey: resolveSource(scrapingApiEnv, fileConfig.scrapingApiKey),
 cloudToken: resolveSource(cloudTokenEnv, fileConfig.cloudToken),
 defaultProfileId: resolveSource(profileEnv, fileConfig.defaultProfileId),
 daemonPort: resolveNumberSource(daemonPortEnv, fileConfig.daemonPort),
@@ -80,7 +80,7 @@ async function initConfigFile(overrides, options = {}) {
 };
 }
 const nextConfig = {
-
+scrapingApiKey: overrides.scrapingApiKey,
 cloudToken: overrides.cloudToken,
 defaultProfileId: overrides.defaultProfileId,
 daemonPort: overrides.daemonPort ?? exports.DEFAULT_DAEMON_PORT,
@@ -92,11 +92,11 @@ async function initConfigFile(overrides, options = {}) {
 created: true,
 };
 }
-function 
-if (!config.
-throw new errors_1.MissingCredentialError(exports.ENV_NAMES.
+function requireScrapingApiKey(config) {
+if (!config.scrapingApiKey) {
+throw new errors_1.MissingCredentialError(exports.ENV_NAMES.scrapingApiKey, "scraping commands like `gologin-web-access scrape`");
 }
-return config.
+return config.scrapingApiKey;
 }
 function requireCloudToken(config) {
 if (!config.cloudToken) {
@@ -109,8 +109,8 @@ function resolveProfileId(config, explicitProfileId) {
 }
 function getRecommendedCredentialStatus(config) {
 const missing = [];
-if (!config.
-missing.push(exports.ENV_NAMES.
+if (!config.scrapingApiKey) {
+missing.push(exports.ENV_NAMES.scrapingApiKey);
 }
 if (!config.cloudToken) {
 missing.push(exports.ENV_NAMES.cloudToken);
@@ -136,8 +136,8 @@ function getMaskedConfigRows(config) {
 value: config.configPath,
 },
 {
-label: exports.ENV_NAMES.
-value: describeValue(config.
+label: exports.ENV_NAMES.scrapingApiKey,
+value: describeValue(config.scrapingApiKey, config.sources.scrapingApiKey),
 },
 {
 label: exports.ENV_NAMES.cloudToken,
@@ -162,7 +162,7 @@ async function readConfigFile(configPath) {
 const raw = await fs_1.promises.readFile(configPath, "utf8");
 const parsed = JSON.parse(raw);
 return {
-
+scrapingApiKey: normalizeString(parsed.scrapingApiKey ?? parsed.webUnlockerApiKey),
 cloudToken: normalizeString(parsed.cloudToken),
 defaultProfileId: normalizeString(parsed.defaultProfileId),
 daemonPort: parsed.daemonPort,
```
package/dist/doctor.js
CHANGED
```diff
@@ -13,8 +13,8 @@ async function runDoctor(options = {}) {
 const recommended = (0, config_1.getRecommendedCredentialStatus)(config);
 checks.push({
 name: "Scraping API key",
-status: config.
-detail: config.
+status: config.scrapingApiKey ? "ok" : "warn",
+detail: config.scrapingApiKey ? `configured via ${config.sources.scrapingApiKey}` : "missing",
 });
 checks.push({
 name: "GoLogin token",
```
package/dist/lib/changeTracking.js
CHANGED
```diff
@@ -11,7 +11,7 @@ const crypto_1 = require("crypto");
 const fs_1 = require("fs");
 const path_1 = __importDefault(require("path"));
 const diff_1 = require("diff");
-const 
+const scrapingApi_1 = require("./scrapingApi");
 function buildTrackingKey(url, explicitKey) {
 return (explicitKey ?? url)
 .toLowerCase()
@@ -63,21 +63,21 @@ async function compareAndPersistSnapshot(config, snapshot) {
 async function scrapeForTracking(url, apiKey, format, options = {}) {
 switch (format) {
 case "html": {
-const result = await (0, 
+const result = await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, options);
 return {
 content: result.content,
 request: result.request,
 };
 }
 case "text": {
-const result = await (0, 
+const result = await (0, scrapingApi_1.scrapeText)(url, apiKey, options);
 return {
 content: result.text,
 request: result.request,
 };
 }
 case "json": {
-const result = await (0, 
+const result = await (0, scrapingApi_1.scrapeJson)(url, apiKey, options);
 return {
 title: result.data.title ?? undefined,
 content: JSON.stringify(result.data, null, 2),
@@ -86,7 +86,7 @@ async function scrapeForTracking(url, apiKey, format, options = {}) {
 }
 case "markdown":
 default: {
-const result = await (0, 
+const result = await (0, scrapingApi_1.scrapeMarkdown)(url, apiKey, options);
 return {
 content: result.markdown,
 request: result.request,
```
package/dist/lib/crawl.js
CHANGED
```diff
@@ -4,7 +4,7 @@ exports.mapSite = mapSite;
 exports.crawlSite = crawlSite;
 exports.resolveTraversalStatus = resolveTraversalStatus;
 const readSource_1 = require("./readSource");
-const 
+const scrapingApi_1 = require("./scrapingApi");
 async function mapSite(rootUrl, apiKey, options) {
 const pages = await traverseSite(rootUrl, apiKey, options);
 const status = resolveTraversalStatus(pages.pages.length, pages.pages.filter((page) => !page.ok).length);
@@ -137,8 +137,8 @@ async function traverseSite(rootUrl, apiKey, options) {
 };
 }
 async function scrapePage(url, apiKey, scope, options) {
-const scraped = await (0, 
-const data = (0, 
+const scraped = await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey);
+const data = (0, scrapingApi_1.htmlToStructuredData)(scraped.content);
 const links = extractScopedLinks(url, data.links, scope, options);
 const readable = options.onlyMainContent ? (0, readSource_1.extractReadableSegmentFromHtml)(scraped.content) : null;
 const htmlOutput = readable ? readable.html : scraped.content;
@@ -148,8 +148,8 @@ async function scrapePage(url, apiKey, scope, options) {
 links,
 outputByFormat: {
 html: htmlOutput,
-markdown: (0, 
-text: readable ? readable.text : (0, 
+markdown: (0, scrapingApi_1.htmlToMarkdown)(htmlOutput),
+text: readable ? readable.text : (0, scrapingApi_1.htmlToText)(scraped.content),
 json: data,
 },
 };
@@ -245,7 +245,7 @@ function isInScope(url, scope) {
 async function fetchSitemapUrls(rootUrl, apiKey, scope, options) {
 const root = new URL(rootUrl);
 const sitemapUrl = new URL("/sitemap.xml", root.origin).toString();
-const scraped = await (0, 
+const scraped = await (0, scrapingApi_1.scrapeRenderedHtml)(sitemapUrl, apiKey);
 const directEntries = extractXmlLocValues(scraped.content)
 .map((url) => normalizeVisitUrl(url, options.ignoreQueryParameters))
 .filter((url) => isInScope(url, scope) && matchesScopeRules(url, options));
@@ -256,7 +256,7 @@ async function fetchSitemapUrls(rootUrl, apiKey, scope, options) {
 const pageUrls = directEntries.filter((url) => !url.endsWith(".xml"));
 const nestedPages = await mapWithConcurrency(nested.slice(0, 10), 2, async (url) => {
 try {
-const nestedScrape = await (0, 
+const nestedScrape = await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey);
 return extractXmlLocValues(nestedScrape.content)
 .map((item) => normalizeVisitUrl(item, options.ignoreQueryParameters))
 .filter((item) => isInScope(item, scope) && matchesScopeRules(item, options));
```
package/dist/lib/document.js
CHANGED
```diff
@@ -42,7 +42,7 @@ const path_1 = __importDefault(require("path"));
 const mammoth_1 = __importDefault(require("mammoth"));
 const pdf_parse_1 = require("pdf-parse");
 const XLSX = __importStar(require("xlsx"));
-const 
+const scrapingApi_1 = require("./scrapingApi");
 async function parseDocumentSource(source) {
 const loaded = await loadSource(source);
 const kind = detectDocumentKind(source, loaded.contentType);
@@ -102,7 +102,7 @@ async function parseDocumentSource(source) {
 return {
 source,
 kind,
-text: (0, 
+text: (0, scrapingApi_1.htmlToText)(raw),
 metadata: {}
 };
 }
```
package/dist/lib/readSource.js
CHANGED
```diff
@@ -11,15 +11,15 @@ const cheerio_1 = require("cheerio");
 const config_1 = require("../config");
 const browserRead_1 = require("./browserRead");
 const pageOutcome_1 = require("./pageOutcome");
-const 
+const scrapingApi_1 = require("./scrapingApi");
 function normalizeReadSourceMode(value, defaultMode = "auto") {
 if (!value) {
 return defaultMode;
 }
-if (value === "scraping" || value === "scraping-api") {
-return "
+if (value === "scraping" || value === "scraping-api" || value === "unlocker") {
+return "scraping";
 }
-if (value === "auto" || value === "
+if (value === "auto" || value === "browser") {
 return value;
 }
 throw new Error(`Unsupported source mode: ${value}`);
@@ -57,40 +57,40 @@ async function readRenderedHtmlContent(url, config, apiKey, options = {}) {
 outcome: "ok",
 };
 }
-const 
-if (source === "
+const scraping = await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, options.request);
+if (source === "scraping") {
 return {
-html: 
-renderSource: "
+html: scraping.content,
+renderSource: "scraping",
 fallbackAttempted: false,
 fallbackUsed: false,
 outcome: "ok",
-request: 
+request: scraping.request,
 };
 }
-const 
-const assessment = assessReadableContent(
-const outcomeAssessment = (0, pageOutcome_1.assessReadablePageOutcome)(
+const scrapingText = (0, scrapingApi_1.htmlToText)(scraping.content);
+const assessment = assessReadableContent(scraping.content, scrapingText);
+const outcomeAssessment = (0, pageOutcome_1.assessReadablePageOutcome)(scraping.content, scrapingText, {
 looksIncomplete: assessment.shouldFallback,
 incompleteReason: assessment.reason,
 });
 if (!assessment.shouldFallback) {
 return {
-html: 
-renderSource: "
+html: scraping.content,
+renderSource: "scraping",
 fallbackAttempted: false,
 fallbackUsed: false,
 outcome: outcomeAssessment.outcome,
 outcomeReason: outcomeAssessment.reason,
 nextActionHint: outcomeAssessment.nextActionHint,
 warning: outcomeAssessment.warning,
-request: 
+request: scraping.request,
 };
 }
 if (!config.cloudToken) {
 return {
-html: 
-renderSource: "
+html: scraping.content,
+renderSource: "scraping",
 fallbackAttempted: true,
 fallbackUsed: false,
 fallbackReason: `${assessment.reason}; GOLOGIN_TOKEN is not configured`,
@@ -98,7 +98,7 @@ async function readRenderedHtmlContent(url, config, apiKey, options = {}) {
 outcomeReason: outcomeAssessment.reason,
 nextActionHint: outcomeAssessment.nextActionHint,
 warning: outcomeAssessment.warning,
-request: 
+request: scraping.request,
 };
 }
 const browser = await (0, browserRead_1.scrapeRenderedHtmlViaBrowser)(url, config, {
@@ -111,7 +111,7 @@ async function readRenderedHtmlContent(url, config, apiKey, options = {}) {
 fallbackUsed: true,
 fallbackReason: assessment.reason,
 outcome: "ok",
-request: 
+request: scraping.request,
 };
 }
 function assessReadableContent(html, content) {
@@ -128,31 +128,31 @@ function assessReadableContent(html, content) {
 if (mainLength < 200 && articleLength < 200 && linkCount > 40 && normalizedContentLength < 5000) {
 return {
 shouldFallback: true,
-reason: "
+reason: "Scraping API returned navigation-heavy shell with weak main/article content",
 };
 }
 if (normalizedContentLength < 600 && (shellMarkers || linkCount > 30 || scriptCount > 10)) {
 return {
 shouldFallback: true,
-reason: "
+reason: "Scraping API returned very little readable text from a likely JS-rendered page",
 };
 }
 if (shellMarkers && mainLength < 300 && paragraphCount < 3) {
 return {
 shouldFallback: true,
-reason: "
+reason: "Scraping API output looks like a JS docs shell without rendered article content",
 };
 }
 if (paragraphCount < 3 && headingCount <= 1 && linkCount > 60 && scriptCount > 15) {
 return {
 shouldFallback: true,
-reason: "
+reason: "Scraping API output is link-heavy and content-light",
 };
 }
 if (docsUiChromeMarkers) {
 return {
 shouldFallback: true,
-reason: "
+reason: "Scraping API output still contains docs UI chrome and action controls",
 };
 }
 return { shouldFallback: false };
@@ -172,44 +172,44 @@ async function readReadableContent(url, config, apiKey, options) {
 outcome: "ok",
 };
 }
-const 
-const readable = extractReadableSegmentFromHtml(
-const 
-const assessment = assessReadableContent(
-const outcomeAssessment = (0, pageOutcome_1.assessReadablePageOutcome)(
+const scraping = await (0, scrapingApi_1.scrapeRenderedHtml)(url, apiKey, options.request);
+const readable = extractReadableSegmentFromHtml(scraping.content);
+const scrapingContent = formatReadableContent(options.format, readable.html, readable.text);
+const assessment = assessReadableContent(scraping.content, scrapingContent);
+const outcomeAssessment = (0, pageOutcome_1.assessReadablePageOutcome)(scraping.content, scrapingContent, {
 looksIncomplete: assessment.shouldFallback,
 incompleteReason: assessment.reason,
 });
-if (source === "
+if (source === "scraping") {
 return {
-content: 
-renderSource: "
+content: scrapingContent,
+renderSource: "scraping",
 fallbackAttempted: false,
 fallbackUsed: false,
 outcome: outcomeAssessment.outcome,
 outcomeReason: outcomeAssessment.reason,
 nextActionHint: outcomeAssessment.nextActionHint,
 warning: outcomeAssessment.warning,
-request: 
+request: scraping.request,
 };
 }
 if (!assessment.shouldFallback) {
 return {
-content: 
-renderSource: "
+content: scrapingContent,
+renderSource: "scraping",
 fallbackAttempted: false,
 fallbackUsed: false,
 outcome: outcomeAssessment.outcome,
 outcomeReason: outcomeAssessment.reason,
 nextActionHint: outcomeAssessment.nextActionHint,
 warning: outcomeAssessment.warning,
-request: 
+request: scraping.request,
 };
 }
 if (!config.cloudToken) {
 return {
-content: 
-renderSource: "
+content: scrapingContent,
+renderSource: "scraping",
 fallbackAttempted: true,
 fallbackUsed: false,
 fallbackReason: `${assessment.reason}; GOLOGIN_TOKEN is not configured`,
@@ -217,17 +217,17 @@ async function readReadableContent(url, config, apiKey, options) {
 outcomeReason: outcomeAssessment.reason,
 nextActionHint: outcomeAssessment.nextActionHint,
 warning: outcomeAssessment.warning,
-request: 
+request: scraping.request,
 };
 }
 const browser = await (0, browserRead_1.scrapeReadableContentViaBrowser)(url, config, {
 profile: options.profile,
 });
 const browserContent = formatReadableContent(options.format, browser.html, browser.text);
-if (meaningfulTextLength(browserContent) < Math.max(300, meaningfulTextLength(
+if (meaningfulTextLength(browserContent) < Math.max(300, meaningfulTextLength(scrapingContent))) {
 return {
-content: 
-renderSource: "
+content: scrapingContent,
+renderSource: "scraping",
 fallbackAttempted: true,
 fallbackUsed: false,
 fallbackReason: "Browser fallback did not improve readable output",
@@ -235,7 +235,7 @@ async function readReadableContent(url, config, apiKey, options) {
 outcomeReason: outcomeAssessment.reason,
 nextActionHint: outcomeAssessment.nextActionHint,
 warning: outcomeAssessment.warning,
-request: 
+request: scraping.request,
 };
 }
 return {
@@ -245,14 +245,14 @@ async function readReadableContent(url, config, apiKey, options) {
 fallbackUsed: true,
 fallbackReason: assessment.reason,
 outcome: "ok",
-request: 
+request: scraping.request,
 };
 }
 function formatReadableContent(format, html, text) {
 if (format === "html") {
 return html;
 }
-return format === "markdown" ? (0, 
+return format === "markdown" ? (0, scrapingApi_1.htmlToMarkdown)(html) : text.trim();
 }
 function meaningfulTextLength(value) {
 return value.replace(/\s+/g, " ").trim().length;
```
package/dist/lib/{unlocker.js → scrapingApi.js}
CHANGED
```diff
@@ -4,17 +4,17 @@ exports.scrapeRenderedHtml = scrapeRenderedHtml;
 exports.scrapeText = scrapeText;
 exports.scrapeMarkdown = scrapeMarkdown;
 exports.scrapeJson = scrapeJson;
-exports.
+exports.validateScrapingApiKey = validateScrapingApiKey;
 exports.htmlToText = htmlToText;
 exports.htmlToMarkdown = htmlToMarkdown;
 exports.htmlToStructuredData = htmlToStructuredData;
 const errors_1 = require("./errors");
-const DEFAULT_BASE_URL = "https://parsing.
+const DEFAULT_BASE_URL = "https://parsing.webscraping.gologin.com";
 const DEFAULT_TIMEOUT_MS = 15_000;
 const DEFAULT_MAX_RETRIES = 2;
 const MAX_EXTRACTED_LINKS = 100;
 const MAX_EXTRACTED_HEADINGS = 50;
-class 
+class ScrapingApiClient {
 apiKey;
 baseUrl;
 timeoutMs;
@@ -59,30 +59,30 @@ class WebUnlockerClient {
 }
 }
 async function scrapeRenderedHtml(url, apiKey, options = {}) {
-return 
+return createScrapingApiClient(apiKey).scrape(url, options);
 }
 async function scrapeText(url, apiKey, options = {}) {
-const scraped = await 
+const scraped = await createScrapingApiClient(apiKey).scrape(url, options);
 return {
 ...scraped,
 text: htmlToText(scraped.content),
 };
 }
 async function scrapeMarkdown(url, apiKey, options = {}) {
-const scraped = await 
+const scraped = await createScrapingApiClient(apiKey).scrape(url, options);
 return {
 ...scraped,
 markdown: htmlToMarkdown(scraped.content),
 };
 }
 async function scrapeJson(url, apiKey, options = {}) {
-const scraped = await 
+const scraped = await createScrapingApiClient(apiKey).scrape(url, options);
 return {
 ...scraped,
 data: htmlToStructuredData(scraped.content),
 };
 }
-async function 
+async function validateScrapingApiKey(apiKey) {
 try {
 await scrapeRenderedHtml("https://example.com", apiKey, {
 timeoutMs: 8_000,
@@ -104,8 +104,8 @@ async function validateWebUnlockerKey(apiKey) {
 };
 }
 }
-function 
-return new 
+function createScrapingApiClient(apiKey) {
+return new ScrapingApiClient({ apiKey });
 }
 async function fetchWithRetry(url, options) {
 const attempts = [];
```
package/dist/lib/search.js
CHANGED
```diff
@@ -50,7 +50,7 @@ const cheerio = __importStar(require("cheerio"));
 const config_1 = require("../config");
 const errors_1 = require("./errors");
 const agentCli_1 = require("./agentCli");
-const 
+const scrapingApi_1 = require("./scrapingApi");
 const SEARCH_CACHE_VERSION = 1;
 const SEARCH_CACHE_TTL_MS = 10 * 60 * 1000;
 async function searchWeb(query, config, options) {
@@ -64,7 +64,7 @@ async function searchWeb(query, config, options) {
 for (const planItem of buildSearchAttemptPlan(options.source, Boolean(config.cloudToken))) {
 const searchUrl = buildSearchUrl(planItem.engine, query, options);
 try {
-const executor = planItem.source === "
+const executor = planItem.source === "scraping" ? searchViaScrapingApi : searchViaBrowser;
 const result = await executor(query, config, options, planItem.engine);
 const attempt = {
 engine: planItem.engine,
@@ -148,17 +148,17 @@ function buildSearchAttemptPlan(source, hasCloudToken) {
 if (source === "browser") {
 return hasCloudToken ? [{ engine: "bing", source: "browser" }] : [];
 }
-if (source === "
+if (source === "scraping") {
 return [
-{ engine: "google", source: "
-{ engine: "duckduckgo", source: "
-{ engine: "bing", source: "
+{ engine: "google", source: "scraping" },
+{ engine: "duckduckgo", source: "scraping" },
+{ engine: "bing", source: "scraping" },
 ];
 }
 const plan = [
-{ engine: "google", source: "
-{ engine: "duckduckgo", source: "
-{ engine: "bing", source: "
+{ engine: "google", source: "scraping" },
+{ engine: "duckduckgo", source: "scraping" },
+{ engine: "bing", source: "scraping" },
 ];
 if (hasCloudToken) {
 plan.push({ engine: "bing", source: "browser" });
@@ -292,12 +292,12 @@ function classifySearchPage(engine, html, results) {
 }
 return "invalid";
 }
-async function 
-if (!config.
+async function searchViaScrapingApi(query, config, options, engine) {
+if (!config.scrapingApiKey) {
 throw new errors_1.CliError("Missing GOLOGIN_SCRAPING_API_KEY for Scraping API search.");
 }
 const searchUrl = buildSearchUrl(engine, query, options);
-const scraped = await (0, 
+const scraped = await (0, scrapingApi_1.scrapeRenderedHtml)(searchUrl, config.scrapingApiKey);
 const results = engine === "google"
 ? parseGoogleSearchResults(scraped.content, options.limit)
 : engine === "bing"
@@ -305,10 +305,10 @@ async function searchViaUnlocker(query, config, options, engine) {
 : parseDuckDuckGoSearchResults(scraped.content, options.limit);
 const pageState = classifySearchPage(engine, scraped.content, results);
 if (pageState === "blocked") {
-throw new errors_1.CliError(`
+throw new errors_1.CliError(`Scraping API search was blocked on ${engine}.`, 1);
 }
 if (pageState === "invalid") {
-throw new errors_1.CliError(`
+throw new errors_1.CliError(`Scraping API search did not return a valid ${engine} search results page.`, 1);
 }
 return {
 url: searchUrl,
```
package/dist/lib/structuredScrape.js
CHANGED
```diff
@@ -10,7 +10,7 @@ const config_1 = require("../config");
 const errors_1 = require("./errors");
 const browserStructured_1 = require("./browserStructured");
 const pageOutcome_1 = require("./pageOutcome");
-const 
+const scrapingApi_1 = require("./scrapingApi");
 class StructuredBlockedPageError extends errors_1.CliError {
 status;
 request;
@@ -33,10 +33,10 @@ class StructuredBlockedPageError extends errors_1.CliError {
 }
 }
 async function scrapeStructuredJson(url, config, apiKey, options = {}) {
-const result = await (0, 
+const result = await (0, scrapingApi_1.scrapeJson)(url, apiKey, options.request);
 const fallbackMode = options.fallback ?? "none";
 let data = result.data;
-let renderSource = "
+let renderSource = "scraping";
 let fallbackAttempted = false;
 let fallbackUsed = false;
 let fallbackReason;
@@ -51,7 +51,7 @@ async function scrapeStructuredJson(url, config, apiKey, options = {}) {
 data = browserData;
 renderSource = "browser";
 fallbackUsed = true;
-fallbackReason = "
+fallbackReason = "Scraping API structured data looked incomplete";
 ({ outcome, reason, nextActionHint, browserRecommended, warning } = (0, pageOutcome_1.assessStructuredPageOutcome)(data));
 }
 else {
@@ -84,7 +84,7 @@ function makeStructuredScrapeEnvelope(url, result, data, options = {}) {
 outcome: options.outcome ?? "ok",
 outcomeReason: options.outcomeReason,
 nextActionHint: options.nextActionHint,
-renderSource: options.renderSource ?? "
+renderSource: options.renderSource ?? "scraping",
 fallbackAttempted: options.fallbackAttempted ?? false,
 fallbackUsed: options.fallbackUsed ?? false,
 fallbackReason: options.fallbackReason,
```
package/package.json
CHANGED
```diff
@@ -1,6 +1,6 @@
 {
 "name": "gologin-web-access",
-"version": "0.3.
+"version": "0.3.5",
 "description": "Unified web access CLI for developers and AI agents to read and interact with the web using the GoLogin Scraping API and Cloud Browser.",
 "main": "dist/cli.js",
 "bin": {
@@ -40,7 +40,7 @@
 "web-access",
 "cli",
 "scraping-api",
-"
+"scraping-api",
 "scraping",
 "cloud-browser",
 "browser-automation",
```