scrapingbee-cli 1.3.1__tar.gz → 1.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapingbee_cli-1.3.1/src/scrapingbee_cli.egg-info → scrapingbee_cli-1.4.1}/PKG-INFO +16 -7
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/README.md +13 -6
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/pyproject.toml +7 -1
- scrapingbee_cli-1.4.1/src/scrapingbee_cli/__init__.py +29 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/audit.py +41 -4
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/batch.py +389 -62
- scrapingbee_cli-1.4.1/src/scrapingbee_cli/cli.py +199 -0
- scrapingbee_cli-1.4.1/src/scrapingbee_cli/cli_utils.py +1606 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/client.py +2 -2
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/__init__.py +2 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/amazon.py +22 -14
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/auth.py +88 -15
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/chatgpt.py +9 -5
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/crawl.py +93 -4
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/export.py +112 -20
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/fast_search.py +11 -7
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/google.py +11 -7
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/schedule.py +36 -4
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/scrape.py +133 -40
- scrapingbee_cli-1.4.1/src/scrapingbee_cli/commands/tutorial.py +135 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/unsafe.py +52 -2
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/usage.py +6 -3
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/walmart.py +22 -14
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/commands/youtube.py +22 -14
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/crawl.py +64 -7
- scrapingbee_cli-1.4.1/src/scrapingbee_cli/extract.py +482 -0
- scrapingbee_cli-1.4.1/src/scrapingbee_cli/tutorial/__init__.py +1 -0
- scrapingbee_cli-1.4.1/src/scrapingbee_cli/tutorial/runner.py +925 -0
- scrapingbee_cli-1.4.1/src/scrapingbee_cli/tutorial/steps.py +712 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1/src/scrapingbee_cli.egg-info}/PKG-INFO +16 -7
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli.egg-info/SOURCES.txt +6 -1
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/__init__.py +0 -16
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/cli.py +0 -101
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/cli_utils.py +0 -693
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/LICENSE +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/setup.cfg +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/config.py +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/credits.py +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli/exec_gate.py +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli.egg-info/dependency_links.txt +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli.egg-info/entry_points.txt +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli.egg-info/requires.txt +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.1}/src/scrapingbee_cli.egg-info/top_level.txt +0 -0
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scrapingbee-cli
|
|
3
|
-
Version: 1.3.1
|
|
3
|
+
Version: 1.4.1
|
|
4
4
|
Summary: Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal.
|
|
5
5
|
Author: ScrapingBee
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://www.scrapingbee.com/
|
|
8
8
|
Project-URL: Documentation, https://www.scrapingbee.com/documentation/
|
|
9
9
|
Project-URL: Repository, https://github.com/ScrapingBee/scrapingbee-cli
|
|
10
|
+
Project-URL: Changelog, https://github.com/ScrapingBee/scrapingbee-cli/blob/main/CHANGELOG.md
|
|
11
|
+
Project-URL: Issues, https://github.com/ScrapingBee/scrapingbee-cli/issues
|
|
10
12
|
Keywords: scrapingbee,scraping,crawl,scrapy,batch,google-search,amazon,walmart,youtube,chatgpt,cli,api
|
|
11
13
|
Classifier: Development Status :: 4 - Beta
|
|
12
14
|
Classifier: Environment :: Console
|
|
@@ -81,7 +83,7 @@ scrapingbee [command] [arguments] [options]
|
|
|
81
83
|
- **`scrapingbee --help`** – List all commands.
|
|
82
84
|
- **`scrapingbee [command] --help`** – Options and parameters for that command.
|
|
83
85
|
|
|
84
|
-
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
86
|
+
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--overwrite`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--smart-extract`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
85
87
|
|
|
86
88
|
**Parameter values:** Choice parameters accept both hyphens and underscores interchangeably (e.g. `--sort-by price-low` and `--sort-by price_low` both work).
|
|
87
89
|
|
|
@@ -101,8 +103,9 @@ scrapingbee [command] [arguments] [options]
|
|
|
101
103
|
| `chatgpt` | ChatGPT API (`--search true` for web-enhanced responses) |
|
|
102
104
|
| `export` | Merge batch/crawl output to ndjson, txt, or csv (with --flatten, --columns) |
|
|
103
105
|
| `schedule` | Schedule commands via cron (--name, --list, --stop) |
|
|
106
|
+
| `tutorial` | Interactive step-by-step guide to CLI features (`--chapter N`, `--reset`, `--list`, `--output-dir`) |
|
|
104
107
|
|
|
105
|
-
**Batch mode:** Commands that take a single input support `--input-file` (one line per input, or `.csv` with `--input-column`) and `--output-dir`. Use `--output-format` to
|
|
108
|
+
**Batch mode:** Commands that take a single input support `--input-file` (one line per input, or `.csv` with `--input-column`) and `--output-dir`. Use `--output-format csv` or `--output-format ndjson` to stream all results to a single file (or stdout) instead of individual files. Add `--deduplicate` to remove duplicate URLs, `--sample N` to test on a subset, or `--post-process 'jq .title'` to transform each result. Use `--resume` to skip already-completed items after interruption. Run bare `scrapingbee --resume` to discover incomplete batches in the current directory.
|
|
106
109
|
|
|
107
110
|
**Parameters and options:** Use space-separated values (e.g. `--render-js false`), not `--option=value`. For full parameter lists, response formats, and credit costs, see **`scrapingbee [command] --help`** and the [ScrapingBee API documentation](https://www.scrapingbee.com/documentation/).
|
|
108
111
|
|
|
@@ -110,16 +113,17 @@ scrapingbee [command] [arguments] [options]
|
|
|
110
113
|
|
|
111
114
|
- **AI extraction:** `--ai-extract-rules '{"price": "product price", "title": "product name"}'` pulls structured data from any page using natural language — no CSS selectors needed. Works with `scrape`, `crawl`, and batch mode.
|
|
112
115
|
- **CSS/XPath extraction:** `--extract-rules '{"title": "h1", "price": ".price"}'` for consistent, cheaper production scraping. Find selectors in browser DevTools.
|
|
113
|
-
- **Pipelines:** Chain commands with `--extract-field` — e.g. `google QUERY --extract-field organic_results.url > urls.txt` then `scrape --input-file urls.txt`.
|
|
116
|
+
- **Pipelines:** Chain commands with `--extract-field` — e.g. `google QUERY --extract-field organic_results.url > urls.txt` then `scrape --input-file urls.txt`. Use `--fields` to filter JSON output keys; supports dot notation (e.g. `--fields product.title,product.price`).
|
|
117
|
+
- **Smart Extract:** `--smart-extract` extracts data from any format (JSON, HTML, XML, CSV, Markdown) using a path expression. Auto-detects format. Supports slicing, regex filtering, and JSON schema output.
|
|
114
118
|
- **Update CSV:** `--update-csv` fetches fresh data and updates the input CSV in-place. Ideal for daily price tracking, inventory monitoring, or any dataset that needs periodic refresh.
|
|
115
119
|
- **Crawl with filtering:** `--include-pattern`, `--exclude-pattern` control which links to follow. `--save-pattern` only saves pages matching a regex (others are visited for link discovery but not saved).
|
|
116
|
-
- **Output formats:** `--output-format ndjson` streams results as JSON lines
|
|
120
|
+
- **Output formats:** `--output-format` accepts `ndjson` (streams results as JSON lines) or `csv` (writes a single CSV) — these are the only valid values. Default (no flag) writes individual files per item into `--output-dir`.
|
|
117
121
|
- **CSV input:** `--input-file products.csv --input-column url` reads URLs from a CSV column.
|
|
118
122
|
- **Export:** `scrapingbee export --input-dir batch/ --format csv --flatten --columns "title,price"` merges batch output with nested JSON flattening and column selection.
|
|
119
123
|
- **Scheduling:** `scrapingbee schedule --every 1d --name prices scrape --input-file products.csv --update-csv` registers a cron job. Use `--list`, `--stop NAME`, or `--stop all`.
|
|
120
124
|
- **Deduplication & sampling:** `--deduplicate` removes duplicate URLs; `--sample 100` processes only 100 random items.
|
|
121
125
|
- **RAG chunking:** `scrape --chunk-size 500 --chunk-overlap 50 --return-page-markdown true` outputs NDJSON chunks ready for vector DB ingestion.
|
|
122
|
-
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/).
|
|
126
|
+
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/). Running `scrapingbee --scraping-config NAME` (without a subcommand) auto-routes to `scrape`.
|
|
123
127
|
|
|
124
128
|
### Examples
|
|
125
129
|
|
|
@@ -134,6 +138,11 @@ scrapingbee export --input-dir products --format csv --flatten --columns "name,p
|
|
|
134
138
|
scrapingbee scrape --input-file products.csv --input-column url --update-csv --ai-extract-rules '{"price": "current price"}'
|
|
135
139
|
scrapingbee schedule --every 1d --name price-tracker scrape --input-file products.csv --input-column url --update-csv --ai-extract-rules '{"price": "price"}'
|
|
136
140
|
scrapingbee schedule --list
|
|
141
|
+
|
|
142
|
+
# Smart Extract — pull fields from any format with a path expression
|
|
143
|
+
scrapingbee google "pizza new york" --smart-extract 'organic_results[0:3].title'
|
|
144
|
+
scrapingbee scrape "https://example.com" --smart-extract '...a[href=/mailto/].text'
|
|
145
|
+
scrapingbee scrape "https://example.com" --smart-extract '{"titles": "...h1", "links": "...href[0:5]"}'
|
|
137
146
|
```
|
|
138
147
|
|
|
139
148
|
## Security
|
|
@@ -149,7 +158,7 @@ For advanced features setup, see the Security section in our [CLI documentation]
|
|
|
149
158
|
- **[CLI Documentation](https://www.scrapingbee.com/documentation/cli/)** – Full CLI reference with pipelines, parameters, and examples.
|
|
150
159
|
- **[Advanced usage examples](docs/advanced-usage.md)** – Shell piping, command chaining, batch workflows, monitoring scripts, NDJSON streaming, screenshots, Google search patterns, LLM chunking, and more.
|
|
151
160
|
- **[ScrapingBee API documentation](https://www.scrapingbee.com/documentation/)** – Parameters, response formats, credit costs, and best practices.
|
|
152
|
-
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/skills/scrapingbee-cli) and [Claude Plugin](
|
|
161
|
+
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/plugins/scrapingbee-cli/skills/scrapingbee-cli) and [Claude Plugin](plugins/scrapingbee-cli/.claude-plugin/) for agent use with file-based output and security rules.
|
|
153
162
|
|
|
154
163
|
## Testing
|
|
155
164
|
|
|
@@ -44,7 +44,7 @@ scrapingbee [command] [arguments] [options]
|
|
|
44
44
|
- **`scrapingbee --help`** – List all commands.
|
|
45
45
|
- **`scrapingbee [command] --help`** – Options and parameters for that command.
|
|
46
46
|
|
|
47
|
-
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
47
|
+
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--overwrite`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--smart-extract`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
48
48
|
|
|
49
49
|
**Parameter values:** Choice parameters accept both hyphens and underscores interchangeably (e.g. `--sort-by price-low` and `--sort-by price_low` both work).
|
|
50
50
|
|
|
@@ -64,8 +64,9 @@ scrapingbee [command] [arguments] [options]
|
|
|
64
64
|
| `chatgpt` | ChatGPT API (`--search true` for web-enhanced responses) |
|
|
65
65
|
| `export` | Merge batch/crawl output to ndjson, txt, or csv (with --flatten, --columns) |
|
|
66
66
|
| `schedule` | Schedule commands via cron (--name, --list, --stop) |
|
|
67
|
+
| `tutorial` | Interactive step-by-step guide to CLI features (`--chapter N`, `--reset`, `--list`, `--output-dir`) |
|
|
67
68
|
|
|
68
|
-
**Batch mode:** Commands that take a single input support `--input-file` (one line per input, or `.csv` with `--input-column`) and `--output-dir`. Use `--output-format` to
|
|
69
|
+
**Batch mode:** Commands that take a single input support `--input-file` (one line per input, or `.csv` with `--input-column`) and `--output-dir`. Use `--output-format csv` or `--output-format ndjson` to stream all results to a single file (or stdout) instead of individual files. Add `--deduplicate` to remove duplicate URLs, `--sample N` to test on a subset, or `--post-process 'jq .title'` to transform each result. Use `--resume` to skip already-completed items after interruption. Run bare `scrapingbee --resume` to discover incomplete batches in the current directory.
|
|
69
70
|
|
|
70
71
|
**Parameters and options:** Use space-separated values (e.g. `--render-js false`), not `--option=value`. For full parameter lists, response formats, and credit costs, see **`scrapingbee [command] --help`** and the [ScrapingBee API documentation](https://www.scrapingbee.com/documentation/).
|
|
71
72
|
|
|
@@ -73,16 +74,17 @@ scrapingbee [command] [arguments] [options]
|
|
|
73
74
|
|
|
74
75
|
- **AI extraction:** `--ai-extract-rules '{"price": "product price", "title": "product name"}'` pulls structured data from any page using natural language — no CSS selectors needed. Works with `scrape`, `crawl`, and batch mode.
|
|
75
76
|
- **CSS/XPath extraction:** `--extract-rules '{"title": "h1", "price": ".price"}'` for consistent, cheaper production scraping. Find selectors in browser DevTools.
|
|
76
|
-
- **Pipelines:** Chain commands with `--extract-field` — e.g. `google QUERY --extract-field organic_results.url > urls.txt` then `scrape --input-file urls.txt`.
|
|
77
|
+
- **Pipelines:** Chain commands with `--extract-field` — e.g. `google QUERY --extract-field organic_results.url > urls.txt` then `scrape --input-file urls.txt`. Use `--fields` to filter JSON output keys; supports dot notation (e.g. `--fields product.title,product.price`).
|
|
78
|
+
- **Smart Extract:** `--smart-extract` extracts data from any format (JSON, HTML, XML, CSV, Markdown) using a path expression. Auto-detects format. Supports slicing, regex filtering, and JSON schema output.
|
|
77
79
|
- **Update CSV:** `--update-csv` fetches fresh data and updates the input CSV in-place. Ideal for daily price tracking, inventory monitoring, or any dataset that needs periodic refresh.
|
|
78
80
|
- **Crawl with filtering:** `--include-pattern`, `--exclude-pattern` control which links to follow. `--save-pattern` only saves pages matching a regex (others are visited for link discovery but not saved).
|
|
79
|
-
- **Output formats:** `--output-format ndjson` streams results as JSON lines
|
|
81
|
+
- **Output formats:** `--output-format` accepts `ndjson` (streams results as JSON lines) or `csv` (writes a single CSV) — these are the only valid values. Default (no flag) writes individual files per item into `--output-dir`.
|
|
80
82
|
- **CSV input:** `--input-file products.csv --input-column url` reads URLs from a CSV column.
|
|
81
83
|
- **Export:** `scrapingbee export --input-dir batch/ --format csv --flatten --columns "title,price"` merges batch output with nested JSON flattening and column selection.
|
|
82
84
|
- **Scheduling:** `scrapingbee schedule --every 1d --name prices scrape --input-file products.csv --update-csv` registers a cron job. Use `--list`, `--stop NAME`, or `--stop all`.
|
|
83
85
|
- **Deduplication & sampling:** `--deduplicate` removes duplicate URLs; `--sample 100` processes only 100 random items.
|
|
84
86
|
- **RAG chunking:** `scrape --chunk-size 500 --chunk-overlap 50 --return-page-markdown true` outputs NDJSON chunks ready for vector DB ingestion.
|
|
85
|
-
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/).
|
|
87
|
+
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/). Running `scrapingbee --scraping-config NAME` (without a subcommand) auto-routes to `scrape`.
|
|
86
88
|
|
|
87
89
|
### Examples
|
|
88
90
|
|
|
@@ -97,6 +99,11 @@ scrapingbee export --input-dir products --format csv --flatten --columns "name,p
|
|
|
97
99
|
scrapingbee scrape --input-file products.csv --input-column url --update-csv --ai-extract-rules '{"price": "current price"}'
|
|
98
100
|
scrapingbee schedule --every 1d --name price-tracker scrape --input-file products.csv --input-column url --update-csv --ai-extract-rules '{"price": "price"}'
|
|
99
101
|
scrapingbee schedule --list
|
|
102
|
+
|
|
103
|
+
# Smart Extract — pull fields from any format with a path expression
|
|
104
|
+
scrapingbee google "pizza new york" --smart-extract 'organic_results[0:3].title'
|
|
105
|
+
scrapingbee scrape "https://example.com" --smart-extract '...a[href=/mailto/].text'
|
|
106
|
+
scrapingbee scrape "https://example.com" --smart-extract '{"titles": "...h1", "links": "...href[0:5]"}'
|
|
100
107
|
```
|
|
101
108
|
|
|
102
109
|
## Security
|
|
@@ -112,7 +119,7 @@ For advanced features setup, see the Security section in our [CLI documentation]
|
|
|
112
119
|
- **[CLI Documentation](https://www.scrapingbee.com/documentation/cli/)** – Full CLI reference with pipelines, parameters, and examples.
|
|
113
120
|
- **[Advanced usage examples](docs/advanced-usage.md)** – Shell piping, command chaining, batch workflows, monitoring scripts, NDJSON streaming, screenshots, Google search patterns, LLM chunking, and more.
|
|
114
121
|
- **[ScrapingBee API documentation](https://www.scrapingbee.com/documentation/)** – Parameters, response formats, credit costs, and best practices.
|
|
115
|
-
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/skills/scrapingbee-cli) and [Claude Plugin](
|
|
122
|
+
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/plugins/scrapingbee-cli/skills/scrapingbee-cli) and [Claude Plugin](plugins/scrapingbee-cli/.claude-plugin/) for agent use with file-based output and security rules.
|
|
116
123
|
|
|
117
124
|
## Testing
|
|
118
125
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "scrapingbee-cli"
|
|
7
|
-
version = "1.3.1"
|
|
7
|
+
version = "1.4.1"
|
|
8
8
|
description = "Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -48,6 +48,8 @@ dependencies = [
|
|
|
48
48
|
Homepage = "https://www.scrapingbee.com/"
|
|
49
49
|
Documentation = "https://www.scrapingbee.com/documentation/"
|
|
50
50
|
Repository = "https://github.com/ScrapingBee/scrapingbee-cli"
|
|
51
|
+
Changelog = "https://github.com/ScrapingBee/scrapingbee-cli/blob/main/CHANGELOG.md"
|
|
52
|
+
Issues = "https://github.com/ScrapingBee/scrapingbee-cli/issues"
|
|
51
53
|
|
|
52
54
|
[project.optional-dependencies]
|
|
53
55
|
dev = [
|
|
@@ -90,3 +92,7 @@ markers = [
|
|
|
90
92
|
"integration: marks tests that call the live API (deselect with '-m \"not integration\"')",
|
|
91
93
|
]
|
|
92
94
|
addopts = "-v --tb=short"
|
|
95
|
+
filterwarnings = [
|
|
96
|
+
"ignore::RuntimeWarning:cssselect",
|
|
97
|
+
"ignore:coroutine.*was never awaited:RuntimeWarning",
|
|
98
|
+
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""ScrapingBee CLI - Command-line client for the ScrapingBee API."""
|
|
2
|
+
|
|
3
|
+
import platform
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
__version__ = "1.4.1"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def user_agent_headers() -> dict[str, str]:
|
|
10
|
+
"""Build structured User-Agent headers for API requests.
|
|
11
|
+
|
|
12
|
+
Returns a dict of headers:
|
|
13
|
+
User-Agent: ScrapingBee/CLI
|
|
14
|
+
User-Agent-Client: scrapingbee-cli
|
|
15
|
+
User-Agent-Client-Version: 1.4.1
|
|
16
|
+
User-Agent-Environment: python
|
|
17
|
+
User-Agent-Environment-Version: 3.14.2
|
|
18
|
+
User-Agent-OS: Darwin arm64
|
|
19
|
+
"""
|
|
20
|
+
py = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
|
|
21
|
+
os_info = f"{platform.system()} {platform.machine()}"
|
|
22
|
+
return {
|
|
23
|
+
"User-Agent": "ScrapingBee/CLI",
|
|
24
|
+
"User-Agent-Client": "scrapingbee-cli",
|
|
25
|
+
"User-Agent-Client-Version": __version__,
|
|
26
|
+
"User-Agent-Environment": "python",
|
|
27
|
+
"User-Agent-Environment-Version": py,
|
|
28
|
+
"User-Agent-OS": os_info,
|
|
29
|
+
}
|
|
@@ -35,18 +35,55 @@ def log_exec(
|
|
|
35
35
|
pass
|
|
36
36
|
|
|
37
37
|
|
|
38
|
-
def
|
|
39
|
-
"""
|
|
38
|
+
def _parse_timestamp(line: str) -> datetime | None:
|
|
39
|
+
"""Extract the ISO timestamp from the start of an audit log line."""
|
|
40
|
+
parts = line.split(" | ", 1)
|
|
41
|
+
if not parts:
|
|
42
|
+
return None
|
|
43
|
+
try:
|
|
44
|
+
return datetime.fromisoformat(parts[0].strip())
|
|
45
|
+
except (ValueError, IndexError):
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def read_audit_log(
|
|
50
|
+
n: int = 50,
|
|
51
|
+
since: datetime | None = None,
|
|
52
|
+
until: datetime | None = None,
|
|
53
|
+
) -> str:
|
|
54
|
+
"""Read audit log entries.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
n: Maximum number of lines to return (from the end). Ignored if since/until is set.
|
|
58
|
+
since: Only return entries at or after this time.
|
|
59
|
+
until: Only return entries at or before this time.
|
|
60
|
+
"""
|
|
40
61
|
if not AUDIT_LOG_PATH.is_file():
|
|
41
62
|
return "No audit log found."
|
|
42
63
|
try:
|
|
43
64
|
with open(AUDIT_LOG_PATH, encoding="utf-8") as f:
|
|
44
65
|
lines = f.readlines()
|
|
45
|
-
recent = lines[-n:] if len(lines) > n else lines
|
|
46
|
-
return "".join(recent)
|
|
47
66
|
except OSError:
|
|
48
67
|
return "Could not read audit log."
|
|
49
68
|
|
|
69
|
+
if since or until:
|
|
70
|
+
filtered = []
|
|
71
|
+
for line in lines:
|
|
72
|
+
ts = _parse_timestamp(line)
|
|
73
|
+
if ts is None:
|
|
74
|
+
continue
|
|
75
|
+
if since and ts < since:
|
|
76
|
+
continue
|
|
77
|
+
if until and ts > until:
|
|
78
|
+
continue
|
|
79
|
+
filtered.append(line)
|
|
80
|
+
if not filtered:
|
|
81
|
+
return "No entries found in the specified time range."
|
|
82
|
+
return "".join(filtered)
|
|
83
|
+
|
|
84
|
+
recent = lines[-n:] if len(lines) > n else lines
|
|
85
|
+
return "".join(recent)
|
|
86
|
+
|
|
50
87
|
|
|
51
88
|
def _rotate_if_needed() -> None:
|
|
52
89
|
"""Keep only the last MAX_LINES entries."""
|