scrapingbee-cli 1.2.3__tar.gz → 1.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapingbee_cli-1.2.3/src/scrapingbee_cli.egg-info → scrapingbee_cli-1.3.1}/PKG-INFO +13 -2
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/README.md +12 -1
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/pyproject.toml +1 -1
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/__init__.py +1 -1
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/audit.py +60 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/batch.py +7 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/cli_utils.py +28 -3
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/client.py +8 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/__init__.py +3 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/amazon.py +2 -1
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/commands/auth.py +264 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/crawl.py +11 -1
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/google.py +2 -1
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/schedule.py +7 -1
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/scrape.py +8 -0
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/commands/unsafe.py +84 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/walmart.py +19 -3
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/youtube.py +9 -4
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/exec_gate.py +192 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1/src/scrapingbee_cli.egg-info}/PKG-INFO +13 -2
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli.egg-info/SOURCES.txt +3 -0
- scrapingbee_cli-1.2.3/src/scrapingbee_cli/commands/auth.py +0 -141
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/LICENSE +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/setup.cfg +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/cli.py +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/chatgpt.py +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/export.py +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/fast_search.py +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/commands/usage.py +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/config.py +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/crawl.py +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli/credits.py +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli.egg-info/dependency_links.txt +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli.egg-info/entry_points.txt +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli.egg-info/requires.txt +0 -0
- {scrapingbee_cli-1.2.3 → scrapingbee_cli-1.3.1}/src/scrapingbee_cli.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scrapingbee-cli
|
|
3
|
-
Version: 1.2.3
|
|
3
|
+
Version: 1.3.1
|
|
4
4
|
Summary: Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal.
|
|
5
5
|
Author: ScrapingBee
|
|
6
6
|
License-Expression: MIT
|
|
@@ -81,7 +81,9 @@ scrapingbee [command] [arguments] [options]
|
|
|
81
81
|
- **`scrapingbee --help`** – List all commands.
|
|
82
82
|
- **`scrapingbee [command] --help`** – Options and parameters for that command.
|
|
83
83
|
|
|
84
|
-
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
84
|
+
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
85
|
+
|
|
86
|
+
**Parameter values:** Choice parameters accept both hyphens and underscores interchangeably (e.g. `--sort-by price-low` and `--sort-by price_low` both work).
|
|
85
87
|
|
|
86
88
|
### Commands
|
|
87
89
|
|
|
@@ -117,6 +119,7 @@ scrapingbee [command] [arguments] [options]
|
|
|
117
119
|
- **Scheduling:** `scrapingbee schedule --every 1d --name prices scrape --input-file products.csv --update-csv` registers a cron job. Use `--list`, `--stop NAME`, or `--stop all`.
|
|
118
120
|
- **Deduplication & sampling:** `--deduplicate` removes duplicate URLs; `--sample 100` processes only 100 random items.
|
|
119
121
|
- **RAG chunking:** `scrape --chunk-size 500 --chunk-overlap 50 --return-page-markdown true` outputs NDJSON chunks ready for vector DB ingestion.
|
|
122
|
+
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/).
|
|
120
123
|
|
|
121
124
|
### Examples
|
|
122
125
|
|
|
@@ -133,6 +136,14 @@ scrapingbee schedule --every 1d --name price-tracker scrape --input-file product
|
|
|
133
136
|
scrapingbee schedule --list
|
|
134
137
|
```
|
|
135
138
|
|
|
139
|
+
## Security
|
|
140
|
+
|
|
141
|
+
The `--post-process` and `--on-complete` options and the `schedule` command execute arbitrary shell commands on your machine. These features are **disabled by default** and must be explicitly enabled by a human before they can be used.
|
|
142
|
+
|
|
143
|
+
For advanced features setup, see the Security section in our [CLI documentation](https://www.scrapingbee.com/documentation/cli/).
|
|
144
|
+
|
|
145
|
+
**Do not enable these features in AI agent environments** where commands may be constructed from scraped web content. ScrapingBee is not responsible for any damages caused by shell execution features. Use at your own discretion.
|
|
146
|
+
|
|
136
147
|
## More information
|
|
137
148
|
|
|
138
149
|
- **[CLI Documentation](https://www.scrapingbee.com/documentation/cli/)** – Full CLI reference with pipelines, parameters, and examples.
|
|
@@ -44,7 +44,9 @@ scrapingbee [command] [arguments] [options]
|
|
|
44
44
|
- **`scrapingbee --help`** – List all commands.
|
|
45
45
|
- **`scrapingbee [command] --help`** – Options and parameters for that command.
|
|
46
46
|
|
|
47
|
-
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
47
|
+
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
48
|
+
|
|
49
|
+
**Parameter values:** Choice parameters accept both hyphens and underscores interchangeably (e.g. `--sort-by price-low` and `--sort-by price_low` both work).
|
|
48
50
|
|
|
49
51
|
### Commands
|
|
50
52
|
|
|
@@ -80,6 +82,7 @@ scrapingbee [command] [arguments] [options]
|
|
|
80
82
|
- **Scheduling:** `scrapingbee schedule --every 1d --name prices scrape --input-file products.csv --update-csv` registers a cron job. Use `--list`, `--stop NAME`, or `--stop all`.
|
|
81
83
|
- **Deduplication & sampling:** `--deduplicate` removes duplicate URLs; `--sample 100` processes only 100 random items.
|
|
82
84
|
- **RAG chunking:** `scrape --chunk-size 500 --chunk-overlap 50 --return-page-markdown true` outputs NDJSON chunks ready for vector DB ingestion.
|
|
85
|
+
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/).
|
|
83
86
|
|
|
84
87
|
### Examples
|
|
85
88
|
|
|
@@ -96,6 +99,14 @@ scrapingbee schedule --every 1d --name price-tracker scrape --input-file product
|
|
|
96
99
|
scrapingbee schedule --list
|
|
97
100
|
```
|
|
98
101
|
|
|
102
|
+
## Security
|
|
103
|
+
|
|
104
|
+
The `--post-process` and `--on-complete` options and the `schedule` command execute arbitrary shell commands on your machine. These features are **disabled by default** and must be explicitly enabled by a human before they can be used.
|
|
105
|
+
|
|
106
|
+
For advanced features setup, see the Security section in our [CLI documentation](https://www.scrapingbee.com/documentation/cli/).
|
|
107
|
+
|
|
108
|
+
**Do not enable these features in AI agent environments** where commands may be constructed from scraped web content. ScrapingBee is not responsible for any damages caused by shell execution features. Use at your own discretion.
|
|
109
|
+
|
|
99
110
|
## More information
|
|
100
111
|
|
|
101
112
|
- **[CLI Documentation](https://www.scrapingbee.com/documentation/cli/)** – Full CLI reference with pipelines, parameters, and examples.
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "scrapingbee-cli"
|
|
7
|
-
version = "1.2.3"
|
|
7
|
+
version = "1.3.1"
|
|
8
8
|
description = "Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Audit logging for exec features (--post-process, --on-complete, schedule).
|
|
2
|
+
|
|
3
|
+
Logs every shell command execution to a fixed location for forensics
|
|
4
|
+
and guard skill monitoring.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
AUDIT_LOG_PATH = Path.home() / ".config" / "scrapingbee-cli" / "audit.log"
|
|
13
|
+
MAX_LINES = 10_000
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def log_exec(
    feature: str,
    command: str,
    *,
    input_source: str = "",
    output_dir: str = "",
) -> None:
    """Append one entry to the audit log (best effort).

    Format: ISO_TIMESTAMP | FEATURE | COMMAND | INPUT | OUTPUT_DIR

    Carriage returns and line feeds inside a field are written as the
    two-character escapes backslash-r / backslash-n, so every execution
    occupies exactly one line. This keeps the line-based rotation and tail
    reading accurate and prevents a crafted command from forging extra
    entries.

    Args:
        feature: Name of the exec feature being used (e.g. "post-process").
        command: The shell command about to be executed.
        input_source: Optional description of the input for this run.
        output_dir: Optional output directory for this run.

    Any ``OSError`` while creating the directory, writing, or rotating is
    swallowed: audit logging must never break the command being audited.
    """

    def _one_line(value: str) -> str:
        # Escape line breaks so a single entry can never span several lines.
        return str(value).replace("\r", "\\r").replace("\n", "\\n")

    timestamp = datetime.now(timezone.utc).isoformat()
    fields = (feature, command, input_source, output_dir)
    entry = " | ".join([timestamp, *(_one_line(field) for field in fields)]) + "\n"
    try:
        # Directory creation is part of the best-effort section too.
        AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        # backslashreplace: unencodable characters (e.g. lone surrogates from
        # argv) are logged as escapes instead of raising UnicodeEncodeError.
        with open(
            AUDIT_LOG_PATH, "a", encoding="utf-8", errors="backslashreplace"
        ) as f:
            f.write(entry)
        # Rotate only after the file is closed so the new entry is on disk.
        _rotate_if_needed()
    except OSError:
        pass
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def read_audit_log(n: int = 50) -> str:
    """Return the last ``n`` lines of the audit log as a single string.

    Args:
        n: Maximum number of trailing lines to return. Values of zero or
            less yield an empty string.

    Returns:
        The joined trailing lines, "No audit log found." when the log file
        does not exist, or "Could not read audit log." when it cannot be
        read.
    """
    from collections import deque

    if not AUDIT_LOG_PATH.is_file():
        return "No audit log found."
    if n <= 0:
        # A plain lines[-n:] slice would return the *whole* log for n == 0.
        return ""
    try:
        # errors="replace": a stray undecodable byte must not hide the log.
        with open(AUDIT_LOG_PATH, encoding="utf-8", errors="replace") as f:
            # A bounded deque keeps only the tail while streaming the file.
            return "".join(deque(f, maxlen=n))
    except OSError:
        return "Could not read audit log."
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _rotate_if_needed() -> None:
    """Trim the audit log to its newest MAX_LINES lines.

    The file is rewritten only when it holds more than MAX_LINES lines.
    Any ``OSError`` while reading or rewriting is ignored.
    """
    try:
        with open(AUDIT_LOG_PATH, encoding="utf-8") as reader:
            entries = reader.readlines()
        if len(entries) <= MAX_LINES:
            return
        with open(AUDIT_LOG_PATH, "w", encoding="utf-8") as writer:
            writer.writelines(entries[-MAX_LINES:])
    except OSError:
        pass
|
|
@@ -705,6 +705,13 @@ def apply_post_process(body: bytes, cmd: str) -> bytes:
|
|
|
705
705
|
"""Run shell command with body as stdin, return stdout. On failure, return original body."""
|
|
706
706
|
import subprocess
|
|
707
707
|
|
|
708
|
+
from .audit import log_exec
|
|
709
|
+
from .exec_gate import require_exec
|
|
710
|
+
|
|
711
|
+
require_exec("--post-process", cmd)
|
|
712
|
+
log_exec("post-process", cmd)
|
|
713
|
+
click.echo(f"⚠ Executing: {cmd.split()[0] if cmd.split() else cmd} (whitelisted)", err=True)
|
|
714
|
+
|
|
708
715
|
try:
|
|
709
716
|
result = subprocess.run(
|
|
710
717
|
cmd,
|
|
@@ -9,6 +9,23 @@ from typing import Any
|
|
|
9
9
|
import click
|
|
10
10
|
|
|
11
11
|
|
|
12
|
+
class NormalizedChoice(click.Choice):
    """Choice type that treats hyphens and underscores as interchangeable.

    The supplied value is first converted to its hyphenated form and
    validated as usual. If that fails, the value is compared against the
    declared choices with underscores and hyphens treated as the same
    character, so choices that are themselves declared with underscores
    remain selectable.
    Example: both --sort-by price-low and --sort-by price_low work.
    """

    def convert(self, value: str, param: Any, ctx: Any) -> str:
        """Validate ``value`` against the choices, ignoring ``-``/``_`` differences.

        Returns whatever ``click.Choice.convert`` returns for the matched
        choice; raises the usual ``click.BadParameter`` when nothing matches.
        """
        if not isinstance(value, str):
            # None (or an already-converted value) has nothing to normalize.
            return super().convert(value, param, ctx)

        hyphenated = value.replace("_", "-")
        try:
            # Primary path: identical to validating the hyphenated value.
            return super().convert(hyphenated, param, ctx)
        except click.BadParameter:
            # Fallback: a declared choice may itself contain underscores, in
            # which case the hyphenated input can never match it directly.
            wanted = hyphenated if self.case_sensitive else hyphenated.casefold()
            for choice in self.choices:
                if not isinstance(choice, str):
                    continue
                candidate = choice.replace("_", "-")
                if not self.case_sensitive:
                    candidate = candidate.casefold()
                if candidate == wanted:
                    return super().convert(choice, param, ctx)
            # Still no match: surface the original validation error.
            raise
|
|
27
|
+
|
|
28
|
+
|
|
12
29
|
def _output_options(f: Any) -> Any:
|
|
13
30
|
"""Output + Retry options (for commands without batch support)."""
|
|
14
31
|
f = click.option(
|
|
@@ -105,7 +122,7 @@ def _batch_options(f: Any) -> Any:
|
|
|
105
122
|
"post_process",
|
|
106
123
|
type=str,
|
|
107
124
|
default=None,
|
|
108
|
-
help="Batch: pipe each result through a shell command (e.g. 'jq .title').",
|
|
125
|
+
help="[Advanced] Batch: pipe each result through a shell command (e.g. 'jq .title'). Requires unsafe mode.",
|
|
109
126
|
)(f)
|
|
110
127
|
f = click.option(
|
|
111
128
|
"--update-csv",
|
|
@@ -132,7 +149,7 @@ def _batch_options(f: Any) -> Any:
|
|
|
132
149
|
"on_complete",
|
|
133
150
|
type=str,
|
|
134
151
|
default=None,
|
|
135
|
-
help="Batch: shell command to run after completion.",
|
|
152
|
+
help="[Advanced] Batch: shell command to run after completion. Requires unsafe mode.",
|
|
136
153
|
)(f)
|
|
137
154
|
f = click.option("--retries", type=int, default=3, help="Retry on errors (default: 3).")(f)
|
|
138
155
|
f = click.option(
|
|
@@ -385,6 +402,7 @@ def build_scrape_kwargs(
|
|
|
385
402
|
custom_google: str | None = None,
|
|
386
403
|
transparent_status_code: str | None = None,
|
|
387
404
|
body: str | None = None,
|
|
405
|
+
scraping_config: str | None = None,
|
|
388
406
|
) -> dict[str, Any]:
|
|
389
407
|
"""Build kwargs for Client.scrape() from scrape command options.
|
|
390
408
|
Single source of parse_bool for bool-like opts."""
|
|
@@ -424,6 +442,7 @@ def build_scrape_kwargs(
|
|
|
424
442
|
"custom_google": parse_bool(custom_google),
|
|
425
443
|
"transparent_status_code": parse_bool(transparent_status_code),
|
|
426
444
|
"body": body,
|
|
445
|
+
"scraping_config": scraping_config,
|
|
427
446
|
}
|
|
428
447
|
|
|
429
448
|
|
|
@@ -595,11 +614,17 @@ def run_on_complete(
|
|
|
595
614
|
import os
|
|
596
615
|
import subprocess
|
|
597
616
|
|
|
617
|
+
from .audit import log_exec
|
|
618
|
+
from .exec_gate import require_exec
|
|
619
|
+
|
|
620
|
+
require_exec("--on-complete", cmd)
|
|
621
|
+
log_exec("on-complete", cmd, output_dir=output_dir)
|
|
622
|
+
click.echo(f"⚠ Executing: {cmd.split()[0] if cmd.split() else cmd} (whitelisted)", err=True)
|
|
623
|
+
|
|
598
624
|
env = os.environ.copy()
|
|
599
625
|
env["SCRAPINGBEE_OUTPUT_DIR"] = output_dir
|
|
600
626
|
env["SCRAPINGBEE_SUCCEEDED"] = str(succeeded)
|
|
601
627
|
env["SCRAPINGBEE_FAILED"] = str(failed)
|
|
602
|
-
click.echo(f"[on-complete] Running: {cmd}", err=True)
|
|
603
628
|
result = subprocess.run(cmd, shell=True, env=env) # noqa: S602
|
|
604
629
|
if result.returncode != 0:
|
|
605
630
|
click.echo(f"[on-complete] Exit code: {result.returncode}", err=True)
|
|
@@ -177,6 +177,7 @@ class Client:
|
|
|
177
177
|
custom_google: bool | None = None,
|
|
178
178
|
transparent_status_code: bool | None = None,
|
|
179
179
|
body: str | None = None,
|
|
180
|
+
scraping_config: str | None = None,
|
|
180
181
|
retries: int = 3,
|
|
181
182
|
backoff: float = 2.0,
|
|
182
183
|
**kwargs: Any,
|
|
@@ -217,6 +218,7 @@ class Client:
|
|
|
217
218
|
("device", device),
|
|
218
219
|
("custom_google", self._bool(custom_google)),
|
|
219
220
|
("transparent_status_code", self._bool(transparent_status_code)),
|
|
221
|
+
("scraping_config", scraping_config),
|
|
220
222
|
]:
|
|
221
223
|
if v is not None:
|
|
222
224
|
params[k] = str(v) if not isinstance(v, str) else v
|
|
@@ -415,6 +417,7 @@ class Client:
|
|
|
415
417
|
async def walmart_search(
|
|
416
418
|
self,
|
|
417
419
|
query: str,
|
|
420
|
+
start_page: int | None = None,
|
|
418
421
|
min_price: int | None = None,
|
|
419
422
|
max_price: int | None = None,
|
|
420
423
|
sort_by: str | None = None,
|
|
@@ -432,6 +435,7 @@ class Client:
|
|
|
432
435
|
) -> tuple[bytes, dict, int]:
|
|
433
436
|
params = {
|
|
434
437
|
"query": query,
|
|
438
|
+
"start_page": start_page if start_page is not None else None,
|
|
435
439
|
"min_price": min_price if min_price is not None else None,
|
|
436
440
|
"max_price": max_price if max_price is not None else None,
|
|
437
441
|
"sort_by": sort_by,
|
|
@@ -455,6 +459,7 @@ class Client:
|
|
|
455
459
|
async def walmart_product(
|
|
456
460
|
self,
|
|
457
461
|
product_id: str,
|
|
462
|
+
device: str | None = None,
|
|
458
463
|
domain: str | None = None,
|
|
459
464
|
delivery_zip: str | None = None,
|
|
460
465
|
store_id: str | None = None,
|
|
@@ -466,6 +471,7 @@ class Client:
|
|
|
466
471
|
) -> tuple[bytes, dict, int]:
|
|
467
472
|
params = {
|
|
468
473
|
"product_id": product_id,
|
|
474
|
+
"device": device,
|
|
469
475
|
"domain": domain,
|
|
470
476
|
"delivery_zip": delivery_zip,
|
|
471
477
|
"store_id": store_id,
|
|
@@ -497,6 +503,7 @@ class Client:
|
|
|
497
503
|
hdr: bool | None = None,
|
|
498
504
|
location: bool | None = None,
|
|
499
505
|
vr180: bool | None = None,
|
|
506
|
+
purchased: bool | None = None,
|
|
500
507
|
retries: int = 3,
|
|
501
508
|
backoff: float = 2.0,
|
|
502
509
|
) -> tuple[bytes, dict, int]:
|
|
@@ -516,6 +523,7 @@ class Client:
|
|
|
516
523
|
"hdr": self._bool(hdr),
|
|
517
524
|
"location": self._bool(location),
|
|
518
525
|
"vr180": self._bool(vr180),
|
|
526
|
+
"purchased": self._bool(purchased),
|
|
519
527
|
}
|
|
520
528
|
return await self._get_with_retry(
|
|
521
529
|
"/youtube/search",
|
|
@@ -17,6 +17,7 @@ from ..batch import (
|
|
|
17
17
|
)
|
|
18
18
|
from ..cli_utils import (
|
|
19
19
|
DEVICE_DESKTOP_MOBILE_TABLET,
|
|
20
|
+
NormalizedChoice,
|
|
20
21
|
_batch_options,
|
|
21
22
|
_validate_page,
|
|
22
23
|
check_api_response,
|
|
@@ -191,7 +192,7 @@ def amazon_product_cmd(
|
|
|
191
192
|
@optgroup.option("--pages", type=int, default=None, help="Number of pages to fetch.")
|
|
192
193
|
@optgroup.option(
|
|
193
194
|
"--sort-by",
|
|
194
|
-
type=click.Choice(AMAZON_SORT_BY, case_sensitive=False),
|
|
195
|
+
type=NormalizedChoice(AMAZON_SORT_BY, case_sensitive=False),
|
|
195
196
|
default=None,
|
|
196
197
|
help="Sort order.",
|
|
197
198
|
)
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""Auth, docs, and logout commands."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import getpass
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
|
|
10
|
+
from ..client import Client
|
|
11
|
+
from ..config import (
|
|
12
|
+
BASE_URL,
|
|
13
|
+
auth_config_path,
|
|
14
|
+
get_api_key_if_set,
|
|
15
|
+
remove_api_key_from_dotenv,
|
|
16
|
+
save_api_key_to_dotenv,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
DOCS_URL = "https://www.scrapingbee.com/documentation/"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _validate_api_key(key: str) -> bool:
    """Report whether ``key`` is accepted by the usage endpoint.

    Returns True only when the usage call answers with status 200. Any
    exception raised during the check is treated as "not valid".
    """

    async def _usage_status() -> int:
        # Single attempt: retries=1 with a 1.0 backoff.
        async with Client(key, BASE_URL) as api:
            _, _, code = await api.usage(retries=1, backoff=1.0)
            return code

    try:
        return asyncio.run(_usage_status()) == 200
    except Exception:
        return False
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
_UNSAFE_DISCLAIMER = """
|
|
38
|
+
════════════════════════════════════════════════════════════════
|
|
39
|
+
⚠ WARNING: UNSAFE MODE
|
|
40
|
+
════════════════════════════════════════════════════════════════
|
|
41
|
+
|
|
42
|
+
You are enabling shell execution features (--post-process,
|
|
43
|
+
--on-complete, and the schedule command). These execute ARBITRARY SHELL COMMANDS
|
|
44
|
+
on your machine.
|
|
45
|
+
|
|
46
|
+
RISKS:
|
|
47
|
+
• Data exfiltration (SSH keys, credentials, files)
|
|
48
|
+
• Arbitrary code execution
|
|
49
|
+
• Persistent backdoors via cron scheduling
|
|
50
|
+
|
|
51
|
+
DO NOT enable this in AI agent environments where commands
|
|
52
|
+
may be constructed from scraped web content.
|
|
53
|
+
|
|
54
|
+
ScrapingBee is NOT responsible for any damages caused by
|
|
55
|
+
these features. Use at your own discretion.
|
|
56
|
+
|
|
57
|
+
════════════════════════════════════════════════════════════════
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _wipe_api_key_everywhere() -> None:
    """Remove the API key from config .env, cwd .env, and os.environ.

    In the current directory's ``.env`` only lines that assign the API-key
    variable are dropped (optionally prefixed with ``export`` and/or
    commented out with ``#``). Lines that merely mention the variable name,
    such as other variables whose name or value contains it, are kept. The
    file is rewritten only when at least one line was removed. Read, write
    and decode failures on that file are ignored (best effort).
    """
    import os
    import re
    from pathlib import Path

    from ..config import ENV_API_KEY

    # Remove from config .env
    remove_api_key_from_dotenv()

    # Remove from cwd .env if present
    cwd_env = Path.cwd() / ".env"
    if cwd_env.is_file():
        # Matches "KEY=", "export KEY =", "# KEY=" ... but not "OTHER_KEY=".
        assignment = re.compile(
            rf"^\s*#?\s*(?:export\s+)?{re.escape(ENV_API_KEY)}\s*="
        )
        try:
            with open(cwd_env, encoding="utf-8") as f:
                original_lines = f.readlines()
            kept = [line for line in original_lines if not assignment.match(line)]
            if len(kept) != len(original_lines):
                with open(cwd_env, "w", encoding="utf-8") as f:
                    f.writelines(kept)
        except (OSError, UnicodeError):
            pass

    # Remove from current process env
    os.environ.pop(ENV_API_KEY, None)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@click.command()
@click.option(
    "--api-key",
    "auth_api_key",
    default=None,
    help="API key to save (non-interactive); otherwise uses env/.env or prompt.",
)
@click.option(
    "--show",
    "show_path_only",
    is_flag=True,
    default=False,
    help="Only show the path where the API key is or would be stored; do not save.",
)
@click.option(
    "--unsafe",
    "unsafe_mode",
    is_flag=True,
    default=False,
    hidden=True,
    help="Enable advanced shell execution features.",
)
@click.pass_obj
def auth_cmd(obj: dict, auth_api_key: str | None, show_path_only: bool, unsafe_mode: bool) -> None:
    """Save API key to ~/.config/scrapingbee-cli/.env (from --api-key, env/.env, or prompt)."""
    # NOTE: the docstring above is the Click help text for this command.
    #
    # Three modes, checked in this order:
    #   1. --show   : print the storage path and return.
    #   2. --unsafe : interactive-only flow that wipes the stored key, shows the
    #                 disclaimer, asks for "yes", prompts for and validates a
    #                 key, then saves it and marks unsafe mode as verified.
    #   3. default  : take the key from --api-key, the environment/.env, or a
    #                 hidden prompt; validate it and save it.
    # Every failure path exits with status 1 via SystemExit.
    from ..exec_gate import is_exec_enabled, require_auth_unsafe, set_unsafe_verified

    path = auth_config_path()

    if show_path_only:
        click.echo(str(path))
        return

    if unsafe_mode:
        # Gate: check env vars are set (vague error if not)
        if not require_auth_unsafe():
            raise SystemExit(1)

        # Gate: reject --api-key (must be interactive only)
        if auth_api_key:
            # The message is deliberately generic.
            click.echo("Something went wrong. Please try again later.", err=True)
            raise SystemExit(1)

        # Wipe API key from everywhere
        # This happens BEFORE the disclaimer is accepted, so declining or
        # aborting below still leaves the previously stored key removed.
        _wipe_api_key_everywhere()
        click.echo("API key removed for security re-authentication.", err=True)

        # Show disclaimer
        click.echo(_UNSAFE_DISCLAIMER, err=True)

        # Require acceptance
        try:
            answer = input("Do you accept the risks? (yes/no): ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            click.echo("\nAborted.", err=True)
            raise SystemExit(1)
        # Only the exact word "yes" (after strip/lower) is accepted.
        if answer != "yes":
            click.echo("Aborted. Unsafe mode not enabled.", err=True)
            raise SystemExit(1)

        # Prompt for API key (interactive only)
        try:
            raw = getpass.getpass("ScrapingBee API key: ")
        except (EOFError, KeyboardInterrupt):
            click.echo("\nAborted.", err=True)
            raise SystemExit(1)
        key = raw.strip()
        if not key:
            click.echo("No API key entered.", err=True)
            raise SystemExit(1)

        click.echo("Validating API key...", err=True)
        if not _validate_api_key(key):
            click.echo("Invalid API key.", err=True)
            raise SystemExit(1)

        # Save key and set unsafe verified
        save_api_key_to_dotenv(key)
        set_unsafe_verified()
        click.echo("API key saved. Unsafe mode enabled.", err=True)
        return

    # Normal auth flow (show warning if unsafe is enabled)
    if is_exec_enabled():
        click.echo("⚠ Unsafe mode is active. Shell execution features are enabled.", err=True)

    # An explicit --api-key wins; otherwise fall back to get_api_key_if_set.
    key = auth_api_key or get_api_key_if_set(None)
    if not key:
        try:
            raw = getpass.getpass("ScrapingBee API key: ")
        except (EOFError, KeyboardInterrupt):
            click.echo(
                "Cannot read API key (non-interactive). Use --api-key KEY or set SCRAPINGBEE_API_KEY.",
                err=True,
            )
            raise SystemExit(1)
        key = raw.strip()
    if not key:
        click.echo("No API key entered.", err=True)
        raise SystemExit(1)
    # NOTE(review): indentation was not recoverable from the flattened view;
    # validation is shown as applying to keys from every source — confirm.
    click.echo("Validating API key...", err=True)
    if not _validate_api_key(key):
        click.echo("Invalid API key. Please check your key and try again.", err=True)
        raise SystemExit(1)
    path = save_api_key_to_dotenv(key)
    click.echo(f"API key saved to {path}. You can now run scrapingbee commands.")
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
@click.command()
@click.option(
    "--open/--no-open",
    "open_browser",
    default=False,
    help="Open the documentation URL in the default browser.",
)
def docs_cmd(open_browser: bool) -> None:
    """Print or open the ScrapingBee API documentation URL."""
    # The URL is always printed; the browser is launched only on request.
    click.echo(DOCS_URL)
    if not open_browser:
        return

    from webbrowser import open as launch_browser

    launch_browser(DOCS_URL)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
@click.command()
@click.pass_obj
def logout_cmd(obj: dict) -> None:
    """Remove stored API key from ~/.config/scrapingbee-cli/.env."""
    # NOTE: the docstring above is the Click help text for this command.
    #
    # Flow: warn about registered schedules (and stop them after
    # confirmation), remove the stored API key, clear the unsafe-verified
    # flag, then report what happened.
    import json
    from pathlib import Path

    # Check for active schedules
    registry_path = Path.home() / ".config" / "scrapingbee-cli" / "schedules.json"
    try:
        registry = json.loads(registry_path.read_text(encoding="utf-8"))
    except Exception:
        # A missing, unreadable or malformed registry counts as "no schedules".
        registry = {}

    if registry:
        names = ", ".join(registry)
        click.echo(
            f"Warning: you have {len(registry)} active schedule(s): {names}.\n"
            "These will fail without an API key.",
            err=True,
        )
        # Declining keeps both the schedules and the stored key untouched.
        if not click.confirm("Stop all schedules and logout?"):
            click.echo("Logout cancelled.", err=True)
            return
        # Stop all schedules
        from .schedule import _remove_cron_entry, _save_registry

        for name in list(registry):
            _remove_cron_entry(name)
            click.echo(f"  Stopped schedule '{name}'.", err=True)
        # Persist an empty registry once every entry has been removed.
        _save_registry({})

    removed = remove_api_key_from_dotenv()

    # Also remove unsafe verified flag
    from ..exec_gate import remove_unsafe_verified

    remove_unsafe_verified()

    if removed:
        click.echo(f"API key removed from {auth_config_path()}.")
    else:
        click.echo(f"No stored API key found in {auth_config_path()}.")
    # NOTE(review): indentation was not recoverable from the flattened view;
    # this hint is shown as unconditional — confirm it is not meant to sit
    # inside the else branch.
    click.echo(
        "If you set SCRAPINGBEE_API_KEY in your shell, unset it with: unset SCRAPINGBEE_API_KEY"
    )
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def register(cli: click.Group) -> None:
    """Attach the auth, docs, and logout commands to ``cli`` under those names."""
    commands = (
        (auth_cmd, "auth"),
        (docs_cmd, "docs"),
        (logout_cmd, "logout"),
    )
    for command, name in commands:
        cli.add_command(command, name)
|
|
@@ -59,6 +59,7 @@ def _crawl_build_params(
|
|
|
59
59
|
device: str | None,
|
|
60
60
|
custom_google: str | None,
|
|
61
61
|
transparent_status_code: str | None,
|
|
62
|
+
scraping_config: str | None = None,
|
|
62
63
|
) -> dict[str, str]:
|
|
63
64
|
"""Build ScrapingBee API params dict from crawl options (quick-crawl URL mode)."""
|
|
64
65
|
kwargs = build_scrape_kwargs(
|
|
@@ -97,6 +98,7 @@ def _crawl_build_params(
|
|
|
97
98
|
custom_google=custom_google,
|
|
98
99
|
transparent_status_code=transparent_status_code,
|
|
99
100
|
body=None,
|
|
101
|
+
scraping_config=scraping_config,
|
|
100
102
|
)
|
|
101
103
|
return scrape_kwargs_to_api_params(kwargs)
|
|
102
104
|
|
|
@@ -117,6 +119,12 @@ def _crawl_build_params(
|
|
|
117
119
|
default=None,
|
|
118
120
|
help="Path to Scrapy project. Spider mode only.",
|
|
119
121
|
)
|
|
122
|
+
@click.option(
|
|
123
|
+
"--scraping-config",
|
|
124
|
+
type=str,
|
|
125
|
+
default=None,
|
|
126
|
+
help="Apply a pre-saved scraping configuration by name. Create configs in the ScrapingBee dashboard. Inline options override config settings.",
|
|
127
|
+
)
|
|
120
128
|
@optgroup.group("Rendering", help="JavaScript rendering and viewport options")
|
|
121
129
|
@optgroup.option(
|
|
122
130
|
"--render-js",
|
|
@@ -314,7 +322,7 @@ def _crawl_build_params(
|
|
|
314
322
|
"on_complete",
|
|
315
323
|
type=str,
|
|
316
324
|
default=None,
|
|
317
|
-
help="Shell command to run after crawl completes.",
|
|
325
|
+
help="[Advanced] Shell command to run after crawl completes. Requires unsafe mode.",
|
|
318
326
|
)
|
|
319
327
|
@_output_options
|
|
320
328
|
@click.pass_obj
|
|
@@ -323,6 +331,7 @@ def crawl_cmd(
|
|
|
323
331
|
target: tuple[str, ...],
|
|
324
332
|
from_sitemap: str | None,
|
|
325
333
|
project: str | None,
|
|
334
|
+
scraping_config: str | None,
|
|
326
335
|
render_js: str | None,
|
|
327
336
|
js_scenario: str | None,
|
|
328
337
|
wait: int | None,
|
|
@@ -467,6 +476,7 @@ def crawl_cmd(
|
|
|
467
476
|
device=device,
|
|
468
477
|
custom_google=custom_google,
|
|
469
478
|
transparent_status_code=transparent_status_code,
|
|
479
|
+
scraping_config=scraping_config,
|
|
470
480
|
)
|
|
471
481
|
except ValueError as e:
|
|
472
482
|
click.echo(str(e), err=True)
|
|
@@ -17,6 +17,7 @@ from ..batch import (
|
|
|
17
17
|
)
|
|
18
18
|
from ..cli_utils import (
|
|
19
19
|
DEVICE_DESKTOP_MOBILE,
|
|
20
|
+
NormalizedChoice,
|
|
20
21
|
_batch_options,
|
|
21
22
|
_validate_page,
|
|
22
23
|
check_api_response,
|
|
@@ -56,7 +57,7 @@ def _warn_empty_organic(data: bytes, search_type: str | None) -> None:
|
|
|
56
57
|
@optgroup.group("Search", help="Search type, locale, and pagination")
|
|
57
58
|
@optgroup.option(
|
|
58
59
|
"--search-type",
|
|
59
|
-
type=click.Choice(
|
|
60
|
+
type=NormalizedChoice(
|
|
60
61
|
["classic", "news", "maps", "lens", "shopping", "images", "ai-mode"],
|
|
61
62
|
case_sensitive=False,
|
|
62
63
|
),
|