scrapingbee-cli 1.3.1__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrapingbee_cli-1.3.1/src/scrapingbee_cli.egg-info → scrapingbee_cli-1.4.0}/PKG-INFO +14 -7
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/README.md +13 -6
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/pyproject.toml +5 -1
- scrapingbee_cli-1.4.0/src/scrapingbee_cli/__init__.py +29 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/audit.py +41 -4
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/batch.py +389 -62
- scrapingbee_cli-1.4.0/src/scrapingbee_cli/cli.py +199 -0
- scrapingbee_cli-1.4.0/src/scrapingbee_cli/cli_utils.py +1606 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/client.py +2 -2
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/__init__.py +2 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/amazon.py +22 -14
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/auth.py +88 -15
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/chatgpt.py +9 -5
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/crawl.py +93 -4
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/export.py +112 -20
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/fast_search.py +11 -7
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/google.py +11 -7
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/schedule.py +36 -4
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/scrape.py +133 -40
- scrapingbee_cli-1.4.0/src/scrapingbee_cli/commands/tutorial.py +135 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/unsafe.py +52 -2
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/usage.py +6 -3
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/walmart.py +22 -14
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/commands/youtube.py +22 -14
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/crawl.py +57 -6
- scrapingbee_cli-1.4.0/src/scrapingbee_cli/extract.py +482 -0
- scrapingbee_cli-1.4.0/src/scrapingbee_cli/tutorial/__init__.py +1 -0
- scrapingbee_cli-1.4.0/src/scrapingbee_cli/tutorial/runner.py +925 -0
- scrapingbee_cli-1.4.0/src/scrapingbee_cli/tutorial/steps.py +712 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0/src/scrapingbee_cli.egg-info}/PKG-INFO +14 -7
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli.egg-info/SOURCES.txt +6 -1
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/__init__.py +0 -16
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/cli.py +0 -101
- scrapingbee_cli-1.3.1/src/scrapingbee_cli/cli_utils.py +0 -693
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/LICENSE +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/setup.cfg +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/config.py +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/credits.py +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli/exec_gate.py +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli.egg-info/dependency_links.txt +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli.egg-info/entry_points.txt +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli.egg-info/requires.txt +0 -0
- {scrapingbee_cli-1.3.1 → scrapingbee_cli-1.4.0}/src/scrapingbee_cli.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scrapingbee-cli
|
|
3
|
-
Version: 1.3.1
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal.
|
|
5
5
|
Author: ScrapingBee
|
|
6
6
|
License-Expression: MIT
|
|
@@ -81,7 +81,7 @@ scrapingbee [command] [arguments] [options]
|
|
|
81
81
|
- **`scrapingbee --help`** – List all commands.
|
|
82
82
|
- **`scrapingbee [command] --help`** – Options and parameters for that command.
|
|
83
83
|
|
|
84
|
-
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
84
|
+
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--overwrite`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--smart-extract`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
85
85
|
|
|
86
86
|
**Parameter values:** Choice parameters accept both hyphens and underscores interchangeably (e.g. `--sort-by price-low` and `--sort-by price_low` both work).
|
|
87
87
|
|
|
@@ -101,8 +101,9 @@ scrapingbee [command] [arguments] [options]
|
|
|
101
101
|
| `chatgpt` | ChatGPT API (`--search true` for web-enhanced responses) |
|
|
102
102
|
| `export` | Merge batch/crawl output to ndjson, txt, or csv (with --flatten, --columns) |
|
|
103
103
|
| `schedule` | Schedule commands via cron (--name, --list, --stop) |
|
|
104
|
+
| `tutorial` | Interactive step-by-step guide to CLI features (`--chapter N`, `--reset`, `--list`, `--output-dir`) |
|
|
104
105
|
|
|
105
|
-
**Batch mode:** Commands that take a single input support `--input-file` (one line per input, or `.csv` with `--input-column`) and `--output-dir`. Use `--output-format` to
|
|
106
|
+
**Batch mode:** Commands that take a single input support `--input-file` (one line per input, or `.csv` with `--input-column`) and `--output-dir`. Use `--output-format csv` or `--output-format ndjson` to stream all results to a single file (or stdout) instead of individual files. Add `--deduplicate` to remove duplicate URLs, `--sample N` to test on a subset, or `--post-process 'jq .title'` to transform each result. Use `--resume` to skip already-completed items after interruption. Run bare `scrapingbee --resume` to discover incomplete batches in the current directory.
|
|
106
107
|
|
|
107
108
|
**Parameters and options:** Use space-separated values (e.g. `--render-js false`), not `--option=value`. For full parameter lists, response formats, and credit costs, see **`scrapingbee [command] --help`** and the [ScrapingBee API documentation](https://www.scrapingbee.com/documentation/).
|
|
108
109
|
|
|
@@ -110,16 +111,17 @@ scrapingbee [command] [arguments] [options]
|
|
|
110
111
|
|
|
111
112
|
- **AI extraction:** `--ai-extract-rules '{"price": "product price", "title": "product name"}'` pulls structured data from any page using natural language — no CSS selectors needed. Works with `scrape`, `crawl`, and batch mode.
|
|
112
113
|
- **CSS/XPath extraction:** `--extract-rules '{"title": "h1", "price": ".price"}'` for consistent, cheaper production scraping. Find selectors in browser DevTools.
|
|
113
|
-
- **Pipelines:** Chain commands with `--extract-field` — e.g. `google QUERY --extract-field organic_results.url > urls.txt` then `scrape --input-file urls.txt`.
|
|
114
|
+
- **Pipelines:** Chain commands with `--extract-field` — e.g. `google QUERY --extract-field organic_results.url > urls.txt` then `scrape --input-file urls.txt`. Use `--fields` to filter JSON output keys; supports dot notation (e.g. `--fields product.title,product.price`).
|
|
115
|
+
- **Smart Extract:** `--smart-extract` extracts data from any format (JSON, HTML, XML, CSV, Markdown) using a path expression. Auto-detects format. Supports slicing, regex filtering, and JSON schema output.
|
|
114
116
|
- **Update CSV:** `--update-csv` fetches fresh data and updates the input CSV in-place. Ideal for daily price tracking, inventory monitoring, or any dataset that needs periodic refresh.
|
|
115
117
|
- **Crawl with filtering:** `--include-pattern`, `--exclude-pattern` control which links to follow. `--save-pattern` only saves pages matching a regex (others are visited for link discovery but not saved).
|
|
116
|
-
- **Output formats:** `--output-format ndjson` streams results as JSON lines
|
|
118
|
+
- **Output formats:** `--output-format` accepts `ndjson` (streams results as JSON lines) or `csv` (writes a single CSV) — these are the only valid values. Default (no flag) writes individual files per item into `--output-dir`.
|
|
117
119
|
- **CSV input:** `--input-file products.csv --input-column url` reads URLs from a CSV column.
|
|
118
120
|
- **Export:** `scrapingbee export --input-dir batch/ --format csv --flatten --columns "title,price"` merges batch output with nested JSON flattening and column selection.
|
|
119
121
|
- **Scheduling:** `scrapingbee schedule --every 1d --name prices scrape --input-file products.csv --update-csv` registers a cron job. Use `--list`, `--stop NAME`, or `--stop all`.
|
|
120
122
|
- **Deduplication & sampling:** `--deduplicate` removes duplicate URLs; `--sample 100` processes only 100 random items.
|
|
121
123
|
- **RAG chunking:** `scrape --chunk-size 500 --chunk-overlap 50 --return-page-markdown true` outputs NDJSON chunks ready for vector DB ingestion.
|
|
122
|
-
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/).
|
|
124
|
+
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/). Running `scrapingbee --scraping-config NAME` (without a subcommand) auto-routes to `scrape`.
|
|
123
125
|
|
|
124
126
|
### Examples
|
|
125
127
|
|
|
@@ -134,6 +136,11 @@ scrapingbee export --input-dir products --format csv --flatten --columns "name,p
|
|
|
134
136
|
scrapingbee scrape --input-file products.csv --input-column url --update-csv --ai-extract-rules '{"price": "current price"}'
|
|
135
137
|
scrapingbee schedule --every 1d --name price-tracker scrape --input-file products.csv --input-column url --update-csv --ai-extract-rules '{"price": "price"}'
|
|
136
138
|
scrapingbee schedule --list
|
|
139
|
+
|
|
140
|
+
# Smart Extract — pull fields from any format with a path expression
|
|
141
|
+
scrapingbee google "pizza new york" --smart-extract 'organic_results[0:3].title'
|
|
142
|
+
scrapingbee scrape "https://example.com" --smart-extract '...a[href=/mailto/].text'
|
|
143
|
+
scrapingbee scrape "https://example.com" --smart-extract '{"titles": "...h1", "links": "...href[0:5]"}'
|
|
137
144
|
```
|
|
138
145
|
|
|
139
146
|
## Security
|
|
@@ -149,7 +156,7 @@ For advanced features setup, see the Security section in our [CLI documentation]
|
|
|
149
156
|
- **[CLI Documentation](https://www.scrapingbee.com/documentation/cli/)** – Full CLI reference with pipelines, parameters, and examples.
|
|
150
157
|
- **[Advanced usage examples](docs/advanced-usage.md)** – Shell piping, command chaining, batch workflows, monitoring scripts, NDJSON streaming, screenshots, Google search patterns, LLM chunking, and more.
|
|
151
158
|
- **[ScrapingBee API documentation](https://www.scrapingbee.com/documentation/)** – Parameters, response formats, credit costs, and best practices.
|
|
152
|
-
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/skills/scrapingbee-cli) and [Claude Plugin](
|
|
159
|
+
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/plugins/scrapingbee-cli/skills/scrapingbee-cli) and [Claude Plugin](plugins/scrapingbee-cli/.claude-plugin/) for agent use with file-based output and security rules.
|
|
153
160
|
|
|
154
161
|
## Testing
|
|
155
162
|
|
|
@@ -44,7 +44,7 @@ scrapingbee [command] [arguments] [options]
|
|
|
44
44
|
- **`scrapingbee --help`** – List all commands.
|
|
45
45
|
- **`scrapingbee [command] --help`** – Options and parameters for that command.
|
|
46
46
|
|
|
47
|
-
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
47
|
+
**Options are per-command.** Each command has its own set of options — run `scrapingbee [command] --help` to see them. Common options across batch-capable commands include `--output-file`, `--output-dir`, `--input-file`, `--input-column`, `--concurrency`, `--output-format`, `--overwrite`, `--retries`, `--backoff`, `--resume`, `--update-csv`, `--no-progress`, `--extract-field`, `--fields`, `--smart-extract`, `--deduplicate`, `--sample`, `--post-process`, `--on-complete`, `--scraping-config`, and `--verbose`. For details, see the [documentation](https://www.scrapingbee.com/documentation/).
|
|
48
48
|
|
|
49
49
|
**Parameter values:** Choice parameters accept both hyphens and underscores interchangeably (e.g. `--sort-by price-low` and `--sort-by price_low` both work).
|
|
50
50
|
|
|
@@ -64,8 +64,9 @@ scrapingbee [command] [arguments] [options]
|
|
|
64
64
|
| `chatgpt` | ChatGPT API (`--search true` for web-enhanced responses) |
|
|
65
65
|
| `export` | Merge batch/crawl output to ndjson, txt, or csv (with --flatten, --columns) |
|
|
66
66
|
| `schedule` | Schedule commands via cron (--name, --list, --stop) |
|
|
67
|
+
| `tutorial` | Interactive step-by-step guide to CLI features (`--chapter N`, `--reset`, `--list`, `--output-dir`) |
|
|
67
68
|
|
|
68
|
-
**Batch mode:** Commands that take a single input support `--input-file` (one line per input, or `.csv` with `--input-column`) and `--output-dir`. Use `--output-format` to
|
|
69
|
+
**Batch mode:** Commands that take a single input support `--input-file` (one line per input, or `.csv` with `--input-column`) and `--output-dir`. Use `--output-format csv` or `--output-format ndjson` to stream all results to a single file (or stdout) instead of individual files. Add `--deduplicate` to remove duplicate URLs, `--sample N` to test on a subset, or `--post-process 'jq .title'` to transform each result. Use `--resume` to skip already-completed items after interruption. Run bare `scrapingbee --resume` to discover incomplete batches in the current directory.
|
|
69
70
|
|
|
70
71
|
**Parameters and options:** Use space-separated values (e.g. `--render-js false`), not `--option=value`. For full parameter lists, response formats, and credit costs, see **`scrapingbee [command] --help`** and the [ScrapingBee API documentation](https://www.scrapingbee.com/documentation/).
|
|
71
72
|
|
|
@@ -73,16 +74,17 @@ scrapingbee [command] [arguments] [options]
|
|
|
73
74
|
|
|
74
75
|
- **AI extraction:** `--ai-extract-rules '{"price": "product price", "title": "product name"}'` pulls structured data from any page using natural language — no CSS selectors needed. Works with `scrape`, `crawl`, and batch mode.
|
|
75
76
|
- **CSS/XPath extraction:** `--extract-rules '{"title": "h1", "price": ".price"}'` for consistent, cheaper production scraping. Find selectors in browser DevTools.
|
|
76
|
-
- **Pipelines:** Chain commands with `--extract-field` — e.g. `google QUERY --extract-field organic_results.url > urls.txt` then `scrape --input-file urls.txt`.
|
|
77
|
+
- **Pipelines:** Chain commands with `--extract-field` — e.g. `google QUERY --extract-field organic_results.url > urls.txt` then `scrape --input-file urls.txt`. Use `--fields` to filter JSON output keys; supports dot notation (e.g. `--fields product.title,product.price`).
|
|
78
|
+
- **Smart Extract:** `--smart-extract` extracts data from any format (JSON, HTML, XML, CSV, Markdown) using a path expression. Auto-detects format. Supports slicing, regex filtering, and JSON schema output.
|
|
77
79
|
- **Update CSV:** `--update-csv` fetches fresh data and updates the input CSV in-place. Ideal for daily price tracking, inventory monitoring, or any dataset that needs periodic refresh.
|
|
78
80
|
- **Crawl with filtering:** `--include-pattern`, `--exclude-pattern` control which links to follow. `--save-pattern` only saves pages matching a regex (others are visited for link discovery but not saved).
|
|
79
|
-
- **Output formats:** `--output-format ndjson` streams results as JSON lines
|
|
81
|
+
- **Output formats:** `--output-format` accepts `ndjson` (streams results as JSON lines) or `csv` (writes a single CSV) — these are the only valid values. Default (no flag) writes individual files per item into `--output-dir`.
|
|
80
82
|
- **CSV input:** `--input-file products.csv --input-column url` reads URLs from a CSV column.
|
|
81
83
|
- **Export:** `scrapingbee export --input-dir batch/ --format csv --flatten --columns "title,price"` merges batch output with nested JSON flattening and column selection.
|
|
82
84
|
- **Scheduling:** `scrapingbee schedule --every 1d --name prices scrape --input-file products.csv --update-csv` registers a cron job. Use `--list`, `--stop NAME`, or `--stop all`.
|
|
83
85
|
- **Deduplication & sampling:** `--deduplicate` removes duplicate URLs; `--sample 100` processes only 100 random items.
|
|
84
86
|
- **RAG chunking:** `scrape --chunk-size 500 --chunk-overlap 50 --return-page-markdown true` outputs NDJSON chunks ready for vector DB ingestion.
|
|
85
|
-
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/).
|
|
87
|
+
- **Scraping configurations:** `--scraping-config "My-Config"` applies a pre-saved configuration from your ScrapingBee dashboard. Inline options override config settings. Create configurations in the [request builder](https://app.scrapingbee.com/). Running `scrapingbee --scraping-config NAME` (without a subcommand) auto-routes to `scrape`.
|
|
86
88
|
|
|
87
89
|
### Examples
|
|
88
90
|
|
|
@@ -97,6 +99,11 @@ scrapingbee export --input-dir products --format csv --flatten --columns "name,p
|
|
|
97
99
|
scrapingbee scrape --input-file products.csv --input-column url --update-csv --ai-extract-rules '{"price": "current price"}'
|
|
98
100
|
scrapingbee schedule --every 1d --name price-tracker scrape --input-file products.csv --input-column url --update-csv --ai-extract-rules '{"price": "price"}'
|
|
99
101
|
scrapingbee schedule --list
|
|
102
|
+
|
|
103
|
+
# Smart Extract — pull fields from any format with a path expression
|
|
104
|
+
scrapingbee google "pizza new york" --smart-extract 'organic_results[0:3].title'
|
|
105
|
+
scrapingbee scrape "https://example.com" --smart-extract '...a[href=/mailto/].text'
|
|
106
|
+
scrapingbee scrape "https://example.com" --smart-extract '{"titles": "...h1", "links": "...href[0:5]"}'
|
|
100
107
|
```
|
|
101
108
|
|
|
102
109
|
## Security
|
|
@@ -112,7 +119,7 @@ For advanced features setup, see the Security section in our [CLI documentation]
|
|
|
112
119
|
- **[CLI Documentation](https://www.scrapingbee.com/documentation/cli/)** – Full CLI reference with pipelines, parameters, and examples.
|
|
113
120
|
- **[Advanced usage examples](docs/advanced-usage.md)** – Shell piping, command chaining, batch workflows, monitoring scripts, NDJSON streaming, screenshots, Google search patterns, LLM chunking, and more.
|
|
114
121
|
- **[ScrapingBee API documentation](https://www.scrapingbee.com/documentation/)** – Parameters, response formats, credit costs, and best practices.
|
|
115
|
-
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/skills/scrapingbee-cli) and [Claude Plugin](
|
|
122
|
+
- **Claude / AI agents:** This repo includes a [Claude Skill](https://github.com/ScrapingBee/scrapingbee-cli/tree/main/plugins/scrapingbee-cli/skills/scrapingbee-cli) and [Claude Plugin](plugins/scrapingbee-cli/.claude-plugin/) for agent use with file-based output and security rules.
|
|
116
123
|
|
|
117
124
|
## Testing
|
|
118
125
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "scrapingbee-cli"
|
|
7
|
-
version = "1.3.1"
|
|
7
|
+
version = "1.4.0"
|
|
8
8
|
description = "Command-line client for the ScrapingBee API: scrape pages (single or batch), crawl sites, check usage/credits, and use Google Search, Fast Search, Amazon, Walmart, YouTube, and ChatGPT from the terminal."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -90,3 +90,7 @@ markers = [
|
|
|
90
90
|
"integration: marks tests that call the live API (deselect with '-m \"not integration\"')",
|
|
91
91
|
]
|
|
92
92
|
addopts = "-v --tb=short"
|
|
93
|
+
filterwarnings = [
|
|
94
|
+
"ignore::RuntimeWarning:cssselect",
|
|
95
|
+
"ignore:coroutine.*was never awaited:RuntimeWarning",
|
|
96
|
+
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""ScrapingBee CLI - Command-line client for the ScrapingBee API."""
|
|
2
|
+
|
|
3
|
+
import platform
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
__version__ = "1.4.0"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def user_agent_headers() -> dict[str, str]:
    """Return the structured User-Agent headers used for API requests.

    The mapping carries a fixed ``User-Agent`` value plus companion fields
    naming the client, its version, the Python runtime version, and the
    host OS/architecture. Example result (the version and OS values are
    illustrative; they are computed at call time)::

        User-Agent: ScrapingBee/CLI
        User-Agent-Client: scrapingbee-cli
        User-Agent-Client-Version: 1.4.0
        User-Agent-Environment: python
        User-Agent-Environment-Version: 3.14.2
        User-Agent-OS: Darwin arm64
    """
    major, minor, micro = sys.version_info[:3]
    # Static identification fields first; runtime-derived ones are appended
    # afterwards so the key order matches the documented example.
    headers = {
        "User-Agent": "ScrapingBee/CLI",
        "User-Agent-Client": "scrapingbee-cli",
        "User-Agent-Client-Version": __version__,
        "User-Agent-Environment": "python",
    }
    headers["User-Agent-Environment-Version"] = f"{major}.{minor}.{micro}"
    headers["User-Agent-OS"] = " ".join((platform.system(), platform.machine()))
    return headers
|
|
@@ -35,18 +35,55 @@ def log_exec(
|
|
|
35
35
|
pass
|
|
36
36
|
|
|
37
37
|
|
|
38
|
-
def
|
|
39
|
-
"""
|
|
38
|
+
def _parse_timestamp(line: str) -> datetime | None:
|
|
39
|
+
"""Extract the ISO timestamp from the start of an audit log line."""
|
|
40
|
+
parts = line.split(" | ", 1)
|
|
41
|
+
if not parts:
|
|
42
|
+
return None
|
|
43
|
+
try:
|
|
44
|
+
return datetime.fromisoformat(parts[0].strip())
|
|
45
|
+
except (ValueError, IndexError):
|
|
46
|
+
return None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def read_audit_log(
    n: int = 50,
    since: datetime | None = None,
    until: datetime | None = None,
) -> str:
    """Read audit log entries.

    Args:
        n: Maximum number of lines to return (from the end). Ignored if
            since/until is set. Zero or a negative value returns no lines.
        since: Only return entries at or after this time.
        until: Only return entries at or before this time.

    Returns:
        The selected log lines joined into one string, or a short
        human-readable message when the log file is missing, cannot be
        read, or has no entries inside the requested time range.
    """
    if not AUDIT_LOG_PATH.is_file():
        return "No audit log found."
    try:
        with open(AUDIT_LOG_PATH, encoding="utf-8") as f:
            lines = f.readlines()
    except OSError:
        return "Could not read audit log."

    if since is not None or until is not None:
        # NOTE(review): `since`/`until` must agree with the logged timestamps
        # on timezone-awareness; Python raises TypeError when comparing a
        # naive datetime with an aware one. Confirm against the log writer.
        filtered = []
        for line in lines:
            ts = _parse_timestamp(line)
            # Lines without a parseable leading timestamp are left out of
            # time-filtered output.
            if ts is None:
                continue
            if since is not None and ts < since:
                continue
            if until is not None and ts > until:
                continue
            filtered.append(line)
        if not filtered:
            return "No entries found in the specified time range."
        return "".join(filtered)

    # A bare lines[-n:] would return the whole log for n == 0 (because
    # -0 == 0) and drop the first |n| lines for negative n, so handle
    # non-positive limits explicitly.
    if n <= 0:
        return ""
    # Slicing already copes with n larger than the number of lines.
    return "".join(lines[-n:])
|
|
86
|
+
|
|
50
87
|
|
|
51
88
|
def _rotate_if_needed() -> None:
|
|
52
89
|
"""Keep only the last MAX_LINES entries."""
|