scrapingbee-cli 1.2.2__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapingbee_cli-1.2.2/src/scrapingbee_cli.egg-info → scrapingbee_cli-1.3.0}/PKG-INFO +21 -3
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/README.md +20 -2
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/pyproject.toml +1 -1
- scrapingbee_cli-1.3.0/src/scrapingbee_cli/__init__.py +16 -0
- scrapingbee_cli-1.3.0/src/scrapingbee_cli/audit.py +60 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/batch.py +26 -2
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/cli_utils.py +55 -8
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/client.py +13 -2
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/__init__.py +3 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/amazon.py +6 -0
- scrapingbee_cli-1.3.0/src/scrapingbee_cli/commands/auth.py +264 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/chatgpt.py +35 -2
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/crawl.py +8 -3
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/fast_search.py +3 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/google.py +3 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/schedule.py +7 -1
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/scrape.py +8 -10
- scrapingbee_cli-1.3.0/src/scrapingbee_cli/commands/unsafe.py +84 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/walmart.py +6 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/youtube.py +2 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/crawl.py +14 -4
- scrapingbee_cli-1.3.0/src/scrapingbee_cli/credits.py +57 -0
- scrapingbee_cli-1.3.0/src/scrapingbee_cli/exec_gate.py +147 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0/src/scrapingbee_cli.egg-info}/PKG-INFO +21 -3
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli.egg-info/SOURCES.txt +3 -0
- scrapingbee_cli-1.2.2/src/scrapingbee_cli/__init__.py +0 -3
- scrapingbee_cli-1.2.2/src/scrapingbee_cli/commands/auth.py +0 -141
- scrapingbee_cli-1.2.2/src/scrapingbee_cli/credits.py +0 -22
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/LICENSE +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/setup.cfg +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/cli.py +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/export.py +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/commands/usage.py +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli/config.py +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli.egg-info/dependency_links.txt +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli.egg-info/entry_points.txt +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli.egg-info/requires.txt +0 -0
- {scrapingbee_cli-1.2.2 → scrapingbee_cli-1.3.0}/src/scrapingbee_cli.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scrapingbee-cli
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal.
|
|
5
5
|
Author: ScrapingBee
|
|
6
6
|
License-Expression: MIT
|
|
@@ -47,9 +47,17 @@ Command-line client for the [ScrapingBee](https://www.scrapingbee.com/) API: scr
|
|
|
47
47
|
|
|
48
48
|
## Installation
|
|
49
49
|
|
|
50
|
+
**Recommended** — install with [uv](https://docs.astral.sh/uv/) (no virtual environment needed):
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
54
|
+
uv tool install scrapingbee-cli
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
**Alternative** — install with pip in a virtual environment:
|
|
58
|
+
|
|
50
59
|
```bash
|
|
51
60
|
pip install scrapingbee-cli
|
|
52
|
-
# or (isolated): pipx install scrapingbee-cli
|
|
53
61
|
```
|
|
54
62
|
|
|
55
63
|
From source: clone the repo and run `pip install -e .` in the project root.
|
|
@@ -88,7 +96,7 @@ scrapingbee [command] [arguments] [options]
|
|
|
88
96
|
| `amazon-product` / `amazon-search` | Amazon product and search |
|
|
89
97
|
| `walmart-search` / `walmart-product` | Walmart search and product |
|
|
90
98
|
| `youtube-search` / `youtube-metadata` | YouTube search and video metadata |
|
|
91
|
-
| `chatgpt` | ChatGPT API |
|
|
99
|
+
| `chatgpt` | ChatGPT API (`--search true` for web-enhanced responses) |
|
|
92
100
|
| `export` | Merge batch/crawl output to ndjson, txt, or csv (with --flatten, --columns) |
|
|
93
101
|
| `schedule` | Schedule commands via cron (--name, --list, --stop) |
|
|
94
102
|
|
|
@@ -125,8 +133,18 @@ scrapingbee schedule --every 1d --name price-tracker scrape --input-file product
|
|
|
125
133
|
scrapingbee schedule --list
|
|
126
134
|
```
|
|
127
135
|
|
|
136
|
+
## Security
|
|
137
|
+
|
|
138
|
+
The `--post-process` and `--on-complete` options and the `schedule` command execute arbitrary shell commands on your machine. These features are **disabled by default** and must be enabled explicitly by a human before they can be used.
|
|
139
|
+
|
|
140
|
+
To set up these advanced features, see the Security section of our [CLI documentation](https://www.scrapingbee.com/documentation/cli/).
|
|
141
|
+
|
|
142
|
+
**Do not enable these features in AI agent environments** where commands may be constructed from scraped web content. ScrapingBee is not responsible for any damages caused by shell execution features. Use at your own discretion.
|
|
143
|
+
|
|
128
144
|
## More information
|
|
129
145
|
|
|
146
|
+
- **[CLI Documentation](https://www.scrapingbee.com/documentation/cli/)** – Full CLI reference with pipelines, parameters, and examples.
|
|
147
|
+
- **[Advanced usage examples](docs/advanced-usage.md)** – Shell piping, command chaining, batch workflows, monitoring scripts, NDJSON streaming, screenshots, Google search patterns, LLM chunking, and more.
|
|
130
148
|
- **[ScrapingBee API documentation](https://www.scrapingbee.com/documentation/)** – Parameters, response formats, credit costs, and best practices.
|
|
131
149
|
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/skills/scrapingbee-cli) and [Claude Plugin](.claude-plugin/) for agent use with file-based output and security rules.
|
|
132
150
|
|
|
@@ -10,9 +10,17 @@ Command-line client for the [ScrapingBee](https://www.scrapingbee.com/) API: scr
|
|
|
10
10
|
|
|
11
11
|
## Installation
|
|
12
12
|
|
|
13
|
+
**Recommended** — install with [uv](https://docs.astral.sh/uv/) (no virtual environment needed):
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
17
|
+
uv tool install scrapingbee-cli
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
**Alternative** — install with pip in a virtual environment:
|
|
21
|
+
|
|
13
22
|
```bash
|
|
14
23
|
pip install scrapingbee-cli
|
|
15
|
-
# or (isolated): pipx install scrapingbee-cli
|
|
16
24
|
```
|
|
17
25
|
|
|
18
26
|
From source: clone the repo and run `pip install -e .` in the project root.
|
|
@@ -51,7 +59,7 @@ scrapingbee [command] [arguments] [options]
|
|
|
51
59
|
| `amazon-product` / `amazon-search` | Amazon product and search |
|
|
52
60
|
| `walmart-search` / `walmart-product` | Walmart search and product |
|
|
53
61
|
| `youtube-search` / `youtube-metadata` | YouTube search and video metadata |
|
|
54
|
-
| `chatgpt` | ChatGPT API |
|
|
62
|
+
| `chatgpt` | ChatGPT API (`--search true` for web-enhanced responses) |
|
|
55
63
|
| `export` | Merge batch/crawl output to ndjson, txt, or csv (with --flatten, --columns) |
|
|
56
64
|
| `schedule` | Schedule commands via cron (--name, --list, --stop) |
|
|
57
65
|
|
|
@@ -88,8 +96,18 @@ scrapingbee schedule --every 1d --name price-tracker scrape --input-file product
|
|
|
88
96
|
scrapingbee schedule --list
|
|
89
97
|
```
|
|
90
98
|
|
|
99
|
+
## Security
|
|
100
|
+
|
|
101
|
+
The `--post-process` and `--on-complete` options and the `schedule` command execute arbitrary shell commands on your machine. These features are **disabled by default** and must be enabled explicitly by a human before they can be used.
|
|
102
|
+
|
|
103
|
+
To set up these advanced features, see the Security section of our [CLI documentation](https://www.scrapingbee.com/documentation/cli/).
|
|
104
|
+
|
|
105
|
+
**Do not enable these features in AI agent environments** where commands may be constructed from scraped web content. ScrapingBee is not responsible for any damages caused by shell execution features. Use at your own discretion.
|
|
106
|
+
|
|
91
107
|
## More information
|
|
92
108
|
|
|
109
|
+
- **[CLI Documentation](https://www.scrapingbee.com/documentation/cli/)** – Full CLI reference with pipelines, parameters, and examples.
|
|
110
|
+
- **[Advanced usage examples](docs/advanced-usage.md)** – Shell piping, command chaining, batch workflows, monitoring scripts, NDJSON streaming, screenshots, Google search patterns, LLM chunking, and more.
|
|
93
111
|
- **[ScrapingBee API documentation](https://www.scrapingbee.com/documentation/)** – Parameters, response formats, credit costs, and best practices.
|
|
94
112
|
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/skills/scrapingbee-cli) and [Claude Plugin](.claude-plugin/) for agent use with file-based output and security rules.
|
|
95
113
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "scrapingbee-cli"
|
|
7
|
-
version = "1.2.2"
|
|
7
|
+
version = "1.3.0"
|
|
8
8
|
description = "Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""ScrapingBee CLI - Command-line client for the ScrapingBee API."""
|
|
2
|
+
|
|
3
|
+
import platform
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
__version__ = "1.3.0"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def user_agent() -> str:
    """Build a descriptive User-Agent string for API requests.

    Format: scrapingbee-cli/1.2.3 Python/3.12.0 (Darwin arm64)

    ``platform.system()`` and ``platform.machine()`` return an empty string
    when the value cannot be determined. Empty components are dropped (and
    ``unknown`` is used when both are missing) so the header never contains
    stray whitespace or empty parentheses.
    """
    py = ".".join(str(part) for part in sys.version_info[:3])
    os_info = " ".join(filter(None, (platform.system(), platform.machine()))) or "unknown"
    return f"scrapingbee-cli/{__version__} Python/{py} ({os_info})"
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Audit logging for exec features (--post-process, --on-complete, schedule).
|
|
2
|
+
|
|
3
|
+
Logs every shell command execution to a fixed location for forensics
|
|
4
|
+
and guard skill monitoring.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
AUDIT_LOG_PATH = Path.home() / ".config" / "scrapingbee-cli" / "audit.log"
|
|
13
|
+
MAX_LINES = 10_000
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def log_exec(
    feature: str,
    command: str,
    *,
    input_source: str = "",
    output_dir: str = "",
) -> None:
    """Append an entry to the audit log.

    Format: ISO_TIMESTAMP | FEATURE | COMMAND | INPUT | OUTPUT_DIR

    Each entry occupies exactly one line: CR/LF characters inside a field are
    written as the two-character sequences ``\\r`` / ``\\n`` so a multi-line
    command cannot forge additional entries.

    Logging is best-effort: any ``OSError`` (including a failure to create the
    log directory) is swallowed so that auditing never aborts the caller.
    """

    def _one_line(value: str) -> str:
        # Escape rather than drop line breaks so the content stays visible in the log.
        return str(value).replace("\r", "\\r").replace("\n", "\\n")

    timestamp = datetime.now(timezone.utc).isoformat()
    fields = (feature, command, input_source, output_dir)
    entry = " | ".join((timestamp, *map(_one_line, fields))) + "\n"
    try:
        # mkdir lives inside the try: an unwritable config directory must be
        # handled like any other logging failure instead of raising.
        AUDIT_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
        with open(AUDIT_LOG_PATH, "a", encoding="utf-8") as f:
            f.write(entry)
        _rotate_if_needed()
    except OSError:
        pass
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def read_audit_log(n: int = 50) -> str:
    """Read the last N lines of the audit log.

    Returns the selected lines joined into one string, ``""`` when *n* is
    zero or negative, ``"No audit log found."`` when the log file does not
    exist, or ``"Could not read audit log."`` when reading fails.
    """
    if not AUDIT_LOG_PATH.is_file():
        return "No audit log found."
    if n <= 0:
        # A slice like lines[-0:] would return the whole file; no lines requested means none.
        return ""
    from collections import deque

    try:
        # errors="replace": a corrupted byte must not surface as an uncaught UnicodeDecodeError.
        with open(AUDIT_LOG_PATH, encoding="utf-8", errors="replace") as f:
            # A bounded deque keeps only the tail while streaming through the file.
            return "".join(deque(f, maxlen=n))
    except OSError:
        return "Could not read audit log."
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _rotate_if_needed() -> None:
    """Keep only the last MAX_LINES entries.

    The trimmed log is written to a sibling ``.tmp`` file and then renamed
    over the original, so an interruption mid-rotation cannot leave the audit
    log truncated. Best-effort: I/O errors are ignored.
    """
    from collections import deque

    tmp_path = AUDIT_LOG_PATH.with_name(AUDIT_LOG_PATH.name + ".tmp")
    try:
        total = 0
        tail: deque[str] = deque(maxlen=MAX_LINES)
        # surrogateescape round-trips undecodable bytes instead of raising
        # UnicodeDecodeError, which the OSError handler below would not catch.
        with open(AUDIT_LOG_PATH, encoding="utf-8", errors="surrogateescape") as src:
            for line in src:
                total += 1
                tail.append(line)
        if total <= MAX_LINES:
            return
        with open(tmp_path, "w", encoding="utf-8", errors="surrogateescape") as dst:
            dst.writelines(tail)
        # Rename over the original only once the new content is fully written.
        tmp_path.replace(AUDIT_LOG_PATH)
    except OSError:
        pass
|
|
@@ -297,10 +297,27 @@ async def _fetch_usage_async(api_key: str) -> dict:
|
|
|
297
297
|
return parse_usage(body)
|
|
298
298
|
|
|
299
299
|
|
|
300
|
+
# Cache usage API responses to avoid hitting the 6 calls/min rate limit.
_usage_cache: dict | None = None
_usage_cache_time: float = 0
_usage_cache_key: str | None = None  # API key the cached response was fetched with
_USAGE_CACHE_TTL = 30  # seconds


def get_batch_usage(api_key_flag: str | None) -> dict:
    """Return usage info (max_concurrency, credits) from usage API.

    Caches the result for 30 seconds to avoid hitting the usage API
    rate limit (6 calls/min). The cached response is tied to the resolved
    API key, so a call made with a different key triggers a fresh request
    instead of receiving another key's usage.
    """
    global _usage_cache, _usage_cache_time, _usage_cache_key  # noqa: PLW0603
    # Resolve the key before consulting the cache so get_api_key still runs on
    # every call, as it did before caching was introduced.
    key = get_api_key(api_key_flag)
    now = time.monotonic()
    is_fresh = (now - _usage_cache_time) < _USAGE_CACHE_TTL
    if _usage_cache is not None and _usage_cache_key == key and is_fresh:
        return _usage_cache
    result = asyncio.run(_fetch_usage_async(key))
    _usage_cache, _usage_cache_time, _usage_cache_key = result, now, key
    return result
|
|
304
321
|
|
|
305
322
|
|
|
306
323
|
MIN_CREDITS_TO_RUN_BATCH = 100
|
|
@@ -688,6 +705,13 @@ def apply_post_process(body: bytes, cmd: str) -> bytes:
|
|
|
688
705
|
"""Run shell command with body as stdin, return stdout. On failure, return original body."""
|
|
689
706
|
import subprocess
|
|
690
707
|
|
|
708
|
+
from .audit import log_exec
|
|
709
|
+
from .exec_gate import require_exec
|
|
710
|
+
|
|
711
|
+
require_exec("--post-process", cmd)
|
|
712
|
+
log_exec("post-process", cmd)
|
|
713
|
+
click.echo(f"⚠ Executing: {cmd.split()[0] if cmd.split() else cmd} (whitelisted)", err=True)
|
|
714
|
+
|
|
691
715
|
try:
|
|
692
716
|
result = subprocess.run(
|
|
693
717
|
cmd,
|
|
@@ -105,7 +105,7 @@ def _batch_options(f: Any) -> Any:
|
|
|
105
105
|
"post_process",
|
|
106
106
|
type=str,
|
|
107
107
|
default=None,
|
|
108
|
-
help="Batch: pipe each result through a shell command (e.g. 'jq .title').",
|
|
108
|
+
help="[Advanced] Batch: pipe each result through a shell command (e.g. 'jq .title'). Requires unsafe mode.",
|
|
109
109
|
)(f)
|
|
110
110
|
f = click.option(
|
|
111
111
|
"--update-csv",
|
|
@@ -132,7 +132,7 @@ def _batch_options(f: Any) -> Any:
|
|
|
132
132
|
"on_complete",
|
|
133
133
|
type=str,
|
|
134
134
|
default=None,
|
|
135
|
-
help="Batch: shell command to run after completion.",
|
|
135
|
+
help="[Advanced] Batch: shell command to run after completion. Requires unsafe mode.",
|
|
136
136
|
)(f)
|
|
137
137
|
f = click.option("--retries", type=int, default=3, help="Retry on errors (default: 3).")(f)
|
|
138
138
|
f = click.option(
|
|
@@ -195,6 +195,28 @@ def _resolve_dotpath(obj: Any, keys: list[str]) -> Any:
|
|
|
195
195
|
return cur
|
|
196
196
|
|
|
197
197
|
|
|
198
|
+
def _collect_dotpaths(obj: Any, prefix: str = "", max_depth: int = 4) -> list[str]:
    """Recursively collect all valid dot-paths from a JSON object.

    For arrays, peeks into the first element (only when it is a dict); the
    element's keys are reported under the array's own path, without an index.
    Caps at *max_depth* to avoid huge output on deeply nested structures;
    every dict or array level consumes one unit of depth.

    Returns the paths in traversal order: each key followed by its sub-paths.
    """
    if max_depth <= 0:
        return []
    paths: list[str] = []
    if isinstance(obj, dict):
        for key, value in obj.items():
            full = f"{prefix}.{key}" if prefix else key
            paths.append(full)
            paths.extend(_collect_dotpaths(value, full, max_depth - 1))
    elif isinstance(obj, list) and obj and isinstance(obj[0], dict):
        # Peek into first element to show available sub-paths
        paths.extend(_collect_dotpaths(obj[0], prefix, max_depth - 1))
    return paths
|
|
218
|
+
|
|
219
|
+
|
|
198
220
|
def _extract_field_values(data: bytes, path: str) -> bytes:
|
|
199
221
|
"""Extract values from JSON data using a dot-path expression.
|
|
200
222
|
|
|
@@ -215,6 +237,14 @@ def _extract_field_values(data: bytes, path: str) -> bytes:
|
|
|
215
237
|
result = _resolve_dotpath(obj, keys)
|
|
216
238
|
|
|
217
239
|
if result is None:
|
|
240
|
+
paths = _collect_dotpaths(obj)
|
|
241
|
+
hint = ""
|
|
242
|
+
if paths:
|
|
243
|
+
hint = "\n Available paths:\n " + "\n ".join(paths)
|
|
244
|
+
click.echo(
|
|
245
|
+
f"Warning: --extract-field '{path}' did not match any data.{hint}",
|
|
246
|
+
err=True,
|
|
247
|
+
)
|
|
218
248
|
return b""
|
|
219
249
|
if isinstance(result, list):
|
|
220
250
|
values = [str(v) for v in result if v is not None]
|
|
@@ -524,6 +554,13 @@ async def scrape_with_escalation(
|
|
|
524
554
|
return data, headers, status_code
|
|
525
555
|
|
|
526
556
|
|
|
557
|
+
def ensure_url_scheme(url: str) -> str:
    """Prepend https:// if the URL has no scheme (like curl/httpie do).

    A URL counts as having a scheme when it starts with ``<scheme>://`` where
    the scheme follows RFC 3986 syntax (a letter, then letters, digits, ``+``,
    ``.`` or ``-``). Matching is case-insensitive, so ``HTTP://example.com``
    is returned unchanged rather than becoming ``https://HTTP://example.com``.
    Empty input is returned as-is.
    """
    import re

    # "host:8080/path" has no "://" and therefore still gets the https:// prefix.
    if url and not re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", url):
        return "https://" + url
    return url
|
|
562
|
+
|
|
563
|
+
|
|
527
564
|
def prepare_batch_inputs(inputs: list[str], obj: dict) -> list[str]:
|
|
528
565
|
"""Apply --deduplicate and --sample to batch inputs."""
|
|
529
566
|
from .batch import deduplicate_inputs, sample_inputs
|
|
@@ -558,11 +595,17 @@ def run_on_complete(
|
|
|
558
595
|
import os
|
|
559
596
|
import subprocess
|
|
560
597
|
|
|
598
|
+
from .audit import log_exec
|
|
599
|
+
from .exec_gate import require_exec
|
|
600
|
+
|
|
601
|
+
require_exec("--on-complete", cmd)
|
|
602
|
+
log_exec("on-complete", cmd, output_dir=output_dir)
|
|
603
|
+
click.echo(f"⚠ Executing: {cmd.split()[0] if cmd.split() else cmd} (whitelisted)", err=True)
|
|
604
|
+
|
|
561
605
|
env = os.environ.copy()
|
|
562
606
|
env["SCRAPINGBEE_OUTPUT_DIR"] = output_dir
|
|
563
607
|
env["SCRAPINGBEE_SUCCEEDED"] = str(succeeded)
|
|
564
608
|
env["SCRAPINGBEE_FAILED"] = str(failed)
|
|
565
|
-
click.echo(f"[on-complete] Running: {cmd}", err=True)
|
|
566
609
|
result = subprocess.run(cmd, shell=True, env=env) # noqa: S602
|
|
567
610
|
if result.returncode != 0:
|
|
568
611
|
click.echo(f"[on-complete] Exit code: {result.returncode}", err=True)
|
|
@@ -578,6 +621,7 @@ def write_output(
|
|
|
578
621
|
extract_field: str | None = None,
|
|
579
622
|
fields: str | None = None,
|
|
580
623
|
command: str | None = None,
|
|
624
|
+
credit_cost: int | None = None,
|
|
581
625
|
) -> None:
|
|
582
626
|
"""Write response data to file or stdout; optionally print verbose headers.
|
|
583
627
|
|
|
@@ -604,11 +648,14 @@ def write_output(
|
|
|
604
648
|
click.echo(f"{label}: {val}", err=True)
|
|
605
649
|
if key == "spb-cost":
|
|
606
650
|
spb_cost_present = True
|
|
607
|
-
if not spb_cost_present
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
651
|
+
if not spb_cost_present:
|
|
652
|
+
if credit_cost is not None:
|
|
653
|
+
click.echo(f"Credit Cost: {credit_cost}", err=True)
|
|
654
|
+
elif command:
|
|
655
|
+
from scrapingbee_cli.credits import ESTIMATED_CREDITS
|
|
656
|
+
|
|
657
|
+
if command in ESTIMATED_CREDITS:
|
|
658
|
+
click.echo(f"Credit Cost (estimated): {ESTIMATED_CREDITS[command]}", err=True)
|
|
612
659
|
click.echo("---", err=True)
|
|
613
660
|
if extract_field:
|
|
614
661
|
data = _extract_field_values(data, extract_field)
|
|
@@ -10,6 +10,7 @@ from typing import Any
|
|
|
10
10
|
import aiohttp
|
|
11
11
|
import certifi
|
|
12
12
|
|
|
13
|
+
from . import user_agent
|
|
13
14
|
from .config import BASE_URL
|
|
14
15
|
|
|
15
16
|
|
|
@@ -45,7 +46,7 @@ class Client:
|
|
|
45
46
|
self._session = aiohttp.ClientSession(
|
|
46
47
|
connector=connector,
|
|
47
48
|
timeout=timeout,
|
|
48
|
-
headers={"User-Agent":
|
|
49
|
+
headers={"User-Agent": user_agent()},
|
|
49
50
|
)
|
|
50
51
|
return self
|
|
51
52
|
|
|
@@ -539,12 +540,22 @@ class Client:
|
|
|
539
540
|
async def chatgpt(
    self,
    prompt: str,
    search: bool | None = None,
    add_html: bool | None = None,
    country_code: str | None = None,
    retries: int = 3,
    backoff: float = 2.0,
) -> tuple[bytes, dict, int]:
    """Send *prompt* to the ``/chatgpt`` endpoint via ``_get_with_retry``.

    ``search`` is forwarded (as ``"true"``) only when truthy; ``add_html`` is
    forwarded as ``"true"``/``"false"`` whenever it is not ``None``;
    ``country_code`` is forwarded whenever it is not ``None``. ``retries`` and
    ``backoff`` are passed through to ``_get_with_retry`` unchanged.
    """
    optional: dict[str, object] = {
        "search": "true" if search else None,
        "add_html": None if add_html is None else str(add_html).lower(),
        "country_code": country_code,
    }
    query: dict[str, object] = {"prompt": prompt}
    # Only parameters that were actually set are sent, in a stable order.
    query.update((name, value) for name, value in optional.items() if value is not None)
    return await self._get_with_retry(
        "/chatgpt",
        query,
        retries=retries,
        backoff=backoff,
    )
|
|
@@ -167,6 +167,8 @@ def amazon_product_cmd(
|
|
|
167
167
|
backoff=obj.get("backoff", 2.0) or 2.0,
|
|
168
168
|
)
|
|
169
169
|
check_api_response(data, status_code)
|
|
170
|
+
from ..credits import amazon_credits
|
|
171
|
+
|
|
170
172
|
write_output(
|
|
171
173
|
data,
|
|
172
174
|
headers,
|
|
@@ -176,6 +178,7 @@ def amazon_product_cmd(
|
|
|
176
178
|
extract_field=obj.get("extract_field"),
|
|
177
179
|
fields=obj.get("fields"),
|
|
178
180
|
command="amazon-product",
|
|
181
|
+
credit_cost=amazon_credits(parse_bool(light_request)),
|
|
179
182
|
)
|
|
180
183
|
|
|
181
184
|
asyncio.run(_single())
|
|
@@ -338,6 +341,8 @@ def amazon_search_cmd(
|
|
|
338
341
|
backoff=obj.get("backoff", 2.0) or 2.0,
|
|
339
342
|
)
|
|
340
343
|
check_api_response(data, status_code)
|
|
344
|
+
from ..credits import amazon_credits
|
|
345
|
+
|
|
341
346
|
write_output(
|
|
342
347
|
data,
|
|
343
348
|
headers,
|
|
@@ -347,6 +352,7 @@ def amazon_search_cmd(
|
|
|
347
352
|
extract_field=obj.get("extract_field"),
|
|
348
353
|
fields=obj.get("fields"),
|
|
349
354
|
command="amazon-search",
|
|
355
|
+
credit_cost=amazon_credits(parse_bool(light_request)),
|
|
350
356
|
)
|
|
351
357
|
|
|
352
358
|
asyncio.run(_single())
|