sonde 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.0] - 2026-07-02
11
+
12
+ Initial release.
13
+
14
+ ### Added
15
+
16
+ - Five-phase probe pipeline — sanity, sequential, burst, sweep, and estimate —
17
+ that measures an HTTP API's rate limit, burst ceiling, recovery window, and
18
+ fastest sustainable request interval, then combines them into a recommended
19
+ interval and a full-scrape wall-clock estimate.
20
+ - Pluggable endpoint framework: subclass `Endpoint`, decorate with `@register`,
21
+ and the endpoint becomes a CLI subcommand. Paginated endpoints share the
22
+ `--page-size` / `--total-items` flags via `add_pagination_args`.
23
+ - Two built-in endpoints: `asset-owners` (Roblox collectible owners) and
24
+ `github-stargazers` (GitHub repository stargazers).
25
+ - Provider abstraction for parsing rate-limit response headers, with a generic
26
+ 200/429 + IETF-header provider and GitHub/Roblox specializations.
27
+ - CLI with endpoint-agnostic probe options (`--max-requests`, `--seq-cap`,
28
+ burst/recovery/sweep tuning, `--margin`) and per-endpoint options.
29
+ - Concurrent burst phase driven by `httpx` on a single asyncio event loop, with
30
+ adaptive geometric-backoff measurement of the throttle recovery window.
31
+ - JSON report output to a file or stdout (`--output -`).
32
+ - Structured logging subsystem with `plain` and `json` formats and `-v`/`-q`
33
+ verbosity control; logs go to stderr, the report to `--output`.
34
+ - Type annotations across the public API, with a PEP 561 `py.typed` marker so
35
+ downstream type checkers see them.
36
+ - Public extension API re-exported from the top-level `sonde` package
37
+ (`Endpoint`, `RequestSpec`, `PageResult`, `register`, `Provider`,
38
+ `add_pagination_args`, `pagination_from_args`).
39
+ - Defined process exit codes: `0` success, `2` precondition failure (bad
40
+ arguments, unwritable `--output`, or an unusable endpoint response), `1`
41
+ unexpected crash, `130` interrupted.
42
+ - Redaction of configured credentials from log output.
43
+ - Docker image and PyPI packaging.
44
+
45
+ [Unreleased]: https://github.com/Jartan-LLC/sonde/compare/v0.1.0...HEAD
46
+ [0.1.0]: https://github.com/Jartan-LLC/sonde/releases/tag/v0.1.0
@@ -0,0 +1,32 @@
1
+ # Contributing
2
+
3
+ ## Setup
4
+
5
+ ```bash
6
+ pip install -e '.[dev]'
7
+ ```
8
+
9
+ Requires Python 3.12+.
10
+
11
+ ## Verify before opening a PR
12
+
13
+ ```bash
14
+ ruff check .
15
+ ruff format --check .
16
+ pytest
17
+ ```
18
+
19
+ CI runs the same checks plus a Docker build and a `python -m build` / `twine check` of the
20
+ distribution. All must pass before merge.
21
+
22
+ ## Adding an endpoint
23
+
24
+ See [Adding an Endpoint](README.md#adding-an-endpoint) in the README. In short: subclass
25
+ `Endpoint`, decorate with `@register`, and import the module in `src/sonde/endpoints/__init__.py`.
26
+
27
+ ## Conventions
28
+
29
+ - Commits follow [Conventional Commits](https://www.conventionalcommits.org/) (`feat:`, `fix:`,
30
+ `docs:`, `refactor:`, `chore:`).
31
+ - Public API changes and behavior changes go in `CHANGELOG.md` under `## [Unreleased]`.
32
+ - Report security issues privately via [SECURITY.md](.github/SECURITY.md), not a public issue.
sonde-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jartan LLC
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ include CHANGELOG.md
2
+ include CONTRIBUTING.md
sonde-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,248 @@
1
+ Metadata-Version: 2.4
2
+ Name: sonde
3
+ Version: 0.1.0
4
+ Summary: Probe any HTTP API for its rate limits, burst ceiling, and full-scrape time.
5
+ Author: Jartan LLC
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Jartan-LLC/sonde
8
+ Project-URL: Repository, https://github.com/Jartan-LLC/sonde
9
+ Project-URL: Issues, https://github.com/Jartan-LLC/sonde/issues
10
+ Project-URL: Changelog, https://github.com/Jartan-LLC/sonde/blob/main/CHANGELOG.md
11
+ Keywords: rate-limit,http,api,scraping,benchmark,cli
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Topic :: Internet :: WWW/HTTP
21
+ Classifier: Topic :: Software Development :: Testing
22
+ Requires-Python: >=3.12
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: requests>=2.28
26
+ Requires-Dist: httpx>=0.24
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7.0; extra == "dev"
29
+ Requires-Dist: ruff>=0.4; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # Sonde
33
+
34
+ [![PyPI](https://img.shields.io/pypi/v/sonde)](https://pypi.org/project/sonde/)
35
+ [![CI](https://github.com/Jartan-LLC/sonde/actions/workflows/ci.yml/badge.svg)](https://github.com/Jartan-LLC/sonde/actions/workflows/ci.yml)
36
+ [![License](https://img.shields.io/badge/license-MIT-blue)](LICENSE)
37
+
38
+ Probe any HTTP API for its rate limits, burst ceiling, and full-scrape time. Provider-pluggable, safe by default.
39
+
40
+ ## Install
41
+
42
+ Requires Python 3.12+.
43
+
44
+ ```bash
45
+ pip install sonde
46
+ ```
47
+
48
+ From source:
49
+
50
+ ```bash
51
+ git clone https://github.com/Jartan-LLC/sonde.git
52
+ cd sonde
53
+ pip install -e .
54
+ ```
55
+
56
+ For Docker, see [Docker](#docker) below.
57
+
58
+ ## Quick Start
59
+
60
+ Probe the Roblox asset-owners endpoint:
61
+
62
+ ```bash
63
+ export ROBLOX_COOKIE="your_roblosecurity_cookie"
64
+ sonde asset-owners --asset-id 20573078 --total-items 1470000
65
+ ```
66
+
67
+ Probe GitHub stargazers:
68
+
69
+ ```bash
70
+ export GITHUB_TOKEN="ghp_..."
71
+ sonde github-stargazers --owner anthropics --repo anthropic-sdk-python --total-items 5000
72
+ ```
73
+
74
+ Anonymous probing (no auth) works too -- you'll just hit lower rate limits:
75
+
76
+ ```bash
77
+ sonde github-stargazers --owner torvalds --repo linux --total-items 190000
78
+ ```
79
+
80
+ Results are written to `sonde_report.json` by default; pass `--output` to choose a different path:
81
+
82
+ ```bash
83
+ sonde asset-owners --asset-id 20573078 --output my_report.json
84
+ ```
85
+
86
+ ## How It Works
87
+
88
+ Sonde runs five phases against the target endpoint, then combines the measurements into a safe rate estimate.
89
+
90
+ | Phase | What it does |
91
+ |---|---|
92
+ | **Sanity** | One request. Validates auth, reads rate-limit response headers (e.g. `x-ratelimit-limit`, `x-ratelimit-remaining`), and records items-per-page for the scrape-time estimate. |
93
+ | **Sequential** | Fires back-to-back requests (up to `--seq-cap`, default 150) until the first 429 or the cap. Measures baseline throughput and how many requests the API allows before throttling. |
94
+ | **Burst** | Fires N truly-concurrent requests (default sizes: 10, 20, 40, 80) via httpx on a single asyncio event loop. After the first throttled burst, measures the **recovery window** -- how long until requests succeed again -- via adaptive geometric backoff. |
95
+ | **Sweep** | Drains the rate-limit bucket, then paces requests at progressively faster intervals (default: 8s down to 0.15s) to find the fastest sustainable interval from empty. Skipped by default when authoritative rate-limit headers are present (override with `--force-sweep`). |
96
+ | **Estimate** | Combines all measurements into a recommended request interval and, if a total item count is known, a wall-clock full-scrape estimate. |
97
+
98
+ ### How the estimate is produced
99
+
100
+ The estimate phase uses a priority ladder to determine the safe rate:
101
+
102
+ 1. **Authoritative headers** -- If the API returned `x-ratelimit-limit` and a window, use those directly (e.g. 100 requests per 60s).
103
+ 2. **Swept floor** -- If the sweep found a fastest sustainable interval, use that.
104
+ 3. **Token-bucket inference** -- If burst results show a clean burst size and a measured recovery window, infer the bucket rate.
105
+ 4. **Sequential fallback** -- Use the observed sequential throughput before the first 429.
106
+ 5. **No-throttle fallback** -- If nothing ever throttled, no ceiling was found, so fall back to a conservative fraction of the measured sequential throughput.
107
+
108
+ Every rung applies the safety margin (default 80%, configurable with `--margin`) -- the recommended rate is 80% of the measured ceiling, which means waiting 25% longer between requests than the ceiling would allow. Rung 5 has no measured ceiling, so it applies an extra 0.5 factor on top (~40% of observed throughput at the default margin).
109
+
110
+ ## Endpoints
111
+
112
+ ### asset-owners
113
+
114
+ Roblox `inventory.roblox.com/v2/assets/{id}/owners` -- paginated list of owners of a collectible asset.
115
+
116
+ | Option | Required | Default | Description |
117
+ |---|---|---|---|
118
+ | `--asset-id` | Yes | -- | Asset ID to probe (e.g. `20573078`) |
119
+ | `--sort-order` | No | Asc | `Asc` or `Desc` |
120
+ | `--page-size` | No | 100 | Items per page (capped at 100) |
121
+ | `--total-items` | No | None | Known total owners, for wall-clock estimate |
122
+
123
+ **Auth:** Set `ROBLOX_COOKIE` (legacy web-session) and/or `ROBLOX_BEARER` (Open Cloud) environment variables.
124
+
125
+ ### github-stargazers
126
+
127
+ GitHub `api.github.com/repos/{owner}/{repo}/stargazers` -- users who starred a repository.
128
+
129
+ | Option | Required | Default | Description |
130
+ |---|---|---|---|
131
+ | `--owner` | Yes | -- | Repository owner/org (e.g. `anthropics`) |
132
+ | `--repo` | Yes | -- | Repository name (e.g. `anthropic-sdk-python`) |
133
+ | `--page-size` | No | 100 | Items per page (capped at 100) |
134
+ | `--total-items` | No | None | Known stargazer count, for wall-clock estimate |
135
+
136
+ **Auth:** Set `GITHUB_TOKEN` environment variable. Without it, you get the anonymous rate limit (60 requests/hour).
137
+
138
+ ## Adding an Endpoint
139
+
140
+ 1. Create a new module in `src/sonde/endpoints/`.
141
+ 2. Subclass `Endpoint` and implement `build_request(cursor)` and `parse_page(response)`.
142
+ 3. Decorate with `@register` and set a unique `name` (becomes the CLI subcommand).
143
+ 4. Override `_make_provider()` to return the appropriate `Provider` (or use the generic one for standard 200/429 + IETF headers).
144
+ 5. Optionally implement `total_items()` for scrape-time estimates, `add_arguments()` / `from_args()` for CLI options, and `extra_headers()` for endpoint-specific headers.
145
+ 6. If the endpoint is paginated, call `add_pagination_args(parser, page_max=cls.MAX_PAGE)` in `add_arguments()` and `pagination_from_args(args, page_max=cls.MAX_PAGE)` in `from_args()` so it gets the shared `--page-size` / `--total-items` flags (clamped to your endpoint's cap).
146
+ 7. Import the new module in `src/sonde/endpoints/__init__.py` so it registers on package load.
147
+
148
+ Minimal example:
149
+
150
+ ```python
151
+ from sonde import Endpoint, RequestSpec, PageResult, register
152
+
153
+ @register
154
+ class MyEndpoint(Endpoint):
155
+ name = "my-endpoint"
156
+ help = "one-line description for --help"
157
+
158
+ def build_request(self, cursor):
159
+ return RequestSpec(url="https://api.example.com/items", params={"page": cursor or 1})
160
+
161
+ def parse_page(self, response):
162
+ data = response.json()
163
+ return PageResult(count=len(data["items"]), next_cursor=data.get("next_page"))
164
+ ```
165
+
166
+ ## CLI Reference
167
+
168
+ Common options shared by all endpoints:
169
+
170
+ | Option | Default | Description |
171
+ |---|---|---|
172
+ | `--max-requests` | 1200 | Hard global cap across all phases (safety budget) |
173
+ | `--seq-cap` | 150 | Max sequential requests before stopping |
174
+ | `--skip-burst` | off | Skip the concurrent burst phase |
175
+ | `--burst-sizes` | `10,20,40,80` | Comma-separated list of concurrent burst sizes |
176
+ | `--burst-cooldown` | 60.0 | Fallback seconds between bursts if the recovery window can't be measured |
177
+ | `--recovery-step` | 0.25 | Initial poll delay when measuring the throttle window (grows geometrically) |
178
+ | `--recovery-max` | 90.0 | Give up measuring the window after this many seconds |
179
+ | `--recovery-polls` | 15 | Max polls during recovery measurement |
180
+ | `--skip-sweep` | off | Skip the sustained-interval sweep phase |
181
+ | `--force-sweep` | off | Run the sweep even when authoritative rate-limit headers are present |
182
+ | `--sweep-intervals` | `8,5,3,2,1.2,0.6,0.3,0.15` | Inter-request intervals (seconds) to test, slow to fast |
183
+ | `--sweep-count` | 20 | Paced requests per interval after draining |
184
+ | `--sweep-drain` | 500 | Cap on rapid requests used to empty the bucket before each interval |
185
+ | `--sweep-tolerance` | 0.1 | Max fraction of 429s for an interval to count as sustainable |
186
+ | `--margin` | 0.8 | Safety margin: fraction of the measured max rate to pace at (0.8 = 80% of the ceiling rate, i.e. 25% longer between requests) |
187
+ | `--output` | `sonde_report.json` | Path for the JSON report (use `-` for stdout) |
188
+ | `-v` / `--verbose` | off | Show per-request detail (sets log level to DEBUG) |
189
+ | `-q` / `--quiet` | off | Only show warnings and errors (sets log level to WARNING) |
190
+ | `--log-format` | `plain` | Log line format: `plain` (message-only) or `json` (structured) |
191
+
192
+ `-v` and `-q` are mutually exclusive. Logs always go to stderr; the report goes to `--output`.
193
+
194
+ **Exit codes:** `0` success, `2` precondition failure (bad arguments, unwritable `--output`, or the endpoint returned no usable response), `1` unexpected crash, `130` interrupted.
195
+
196
+ ### Piping and machine-readable output
197
+
198
+ Use `--output -` to write the JSON report to stdout instead of a file. Combine with `-q` to suppress INFO-level log noise on stderr:
199
+
200
+ ```bash
201
+ sonde asset-owners --asset-id 20573078 --output - -q | jq .estimate
202
+ ```
203
+
204
+ Use `--log-format json` for structured log lines on stderr (keys: `timestamp`, `level`, `logger`, `message`, plus `exc` on error lines), useful for log aggregators or CI pipelines:
205
+
206
+ ```bash
207
+ sonde asset-owners --asset-id 20573078 --log-format json 2>sonde.log
208
+ ```
209
+
210
+ ## Docker
211
+
212
+ Build:
213
+
214
+ ```bash
215
+ docker build -t sonde .
216
+ ```
217
+
218
+ Run (mount current directory so the report lands on the host):
219
+
220
+ ```bash
221
+ docker run --rm -v "$(pwd):/data" -e ROBLOX_COOKIE sonde \
222
+ asset-owners --asset-id 20573078 --total-items 1470000
223
+ ```
224
+
225
+ ```bash
226
+ docker run --rm -v "$(pwd):/data" -e GITHUB_TOKEN sonde \
227
+ github-stargazers --owner anthropics --repo anthropic-sdk-python --total-items 5000
228
+ ```
229
+
230
+ The container writes `sonde_report.json` to `/data` (the mounted volume).
231
+
232
+ ## Development
233
+
234
+ ```bash
235
+ pip install -e '.[dev]'
236
+ ```
237
+
238
+ Run tests and linting:
239
+
240
+ ```bash
241
+ pytest
242
+ ruff check .
243
+ ruff format --check .
244
+ ```
245
+
246
+ ## License
247
+
248
+ [MIT](LICENSE)
sonde-0.1.0/README.md ADDED
@@ -0,0 +1,217 @@
1
+ # Sonde
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/sonde)](https://pypi.org/project/sonde/)
4
+ [![CI](https://github.com/Jartan-LLC/sonde/actions/workflows/ci.yml/badge.svg)](https://github.com/Jartan-LLC/sonde/actions/workflows/ci.yml)
5
+ [![License](https://img.shields.io/badge/license-MIT-blue)](https://github.com/Jartan-LLC/sonde/blob/main/LICENSE)
6
+
7
+ Probe any HTTP API for its rate limits, burst ceiling, and full-scrape time. Provider-pluggable, safe by default.
8
+
9
+ ## Install
10
+
11
+ Requires Python 3.12+.
12
+
13
+ ```bash
14
+ pip install sonde
15
+ ```
16
+
17
+ From source:
18
+
19
+ ```bash
20
+ git clone https://github.com/Jartan-LLC/sonde.git
21
+ cd sonde
22
+ pip install -e .
23
+ ```
24
+
25
+ For Docker, see [Docker](#docker) below.
26
+
27
+ ## Quick Start
28
+
29
+ Probe the Roblox asset-owners endpoint:
30
+
31
+ ```bash
32
+ export ROBLOX_COOKIE="your_roblosecurity_cookie"
33
+ sonde asset-owners --asset-id 20573078 --total-items 1470000
34
+ ```
35
+
36
+ Probe GitHub stargazers:
37
+
38
+ ```bash
39
+ export GITHUB_TOKEN="ghp_..."
40
+ sonde github-stargazers --owner anthropics --repo anthropic-sdk-python --total-items 5000
41
+ ```
42
+
43
+ Anonymous probing (no auth) works too -- you'll just hit lower rate limits:
44
+
45
+ ```bash
46
+ sonde github-stargazers --owner torvalds --repo linux --total-items 190000
47
+ ```
48
+
49
+ Results are written to `sonde_report.json` by default; pass `--output` to choose a different path:
50
+
51
+ ```bash
52
+ sonde asset-owners --asset-id 20573078 --output my_report.json
53
+ ```
54
+
55
+ ## How It Works
56
+
57
+ Sonde runs five phases against the target endpoint, then combines the measurements into a safe rate estimate.
58
+
59
+ | Phase | What it does |
60
+ |---|---|
61
+ | **Sanity** | One request. Validates auth, reads rate-limit response headers (e.g. `x-ratelimit-limit`, `x-ratelimit-remaining`), and records items-per-page for the scrape-time estimate. |
62
+ | **Sequential** | Fires back-to-back requests (up to `--seq-cap`, default 150) until the first 429 or the cap. Measures baseline throughput and how many requests the API allows before throttling. |
63
+ | **Burst** | Fires N truly-concurrent requests (default sizes: 10, 20, 40, 80) via httpx on a single asyncio event loop. After the first throttled burst, measures the **recovery window** -- how long until requests succeed again -- via adaptive geometric backoff. |
64
+ | **Sweep** | Drains the rate-limit bucket, then paces requests at progressively faster intervals (default: 8s down to 0.15s) to find the fastest sustainable interval from empty. Skipped by default when authoritative rate-limit headers are present (override with `--force-sweep`). |
65
+ | **Estimate** | Combines all measurements into a recommended request interval and, if a total item count is known, a wall-clock full-scrape estimate. |
66
+
67
+ ### How the estimate is produced
68
+
69
+ The estimate phase uses a priority ladder to determine the safe rate:
70
+
71
+ 1. **Authoritative headers** -- If the API returned `x-ratelimit-limit` and a window, use those directly (e.g. 100 requests per 60s).
72
+ 2. **Swept floor** -- If the sweep found a fastest sustainable interval, use that.
73
+ 3. **Token-bucket inference** -- If burst results show a clean burst size and a measured recovery window, infer the bucket rate.
74
+ 4. **Sequential fallback** -- Use the observed sequential throughput before the first 429.
75
+ 5. **No-throttle fallback** -- If nothing ever throttled, no ceiling was found, so fall back to a conservative fraction of the measured sequential throughput.
76
+
77
+ Every rung applies the safety margin (default 80%, configurable with `--margin`) -- the recommended rate is 80% of the measured ceiling, which means waiting 25% longer between requests than the ceiling would allow. Rung 5 has no measured ceiling, so it applies an extra 0.5 factor on top (~40% of observed throughput at the default margin).
78
+
79
+ ## Endpoints
80
+
81
+ ### asset-owners
82
+
83
+ Roblox `inventory.roblox.com/v2/assets/{id}/owners` -- paginated list of owners of a collectible asset.
84
+
85
+ | Option | Required | Default | Description |
86
+ |---|---|---|---|
87
+ | `--asset-id` | Yes | -- | Asset ID to probe (e.g. `20573078`) |
88
+ | `--sort-order` | No | Asc | `Asc` or `Desc` |
89
+ | `--page-size` | No | 100 | Items per page (capped at 100) |
90
+ | `--total-items` | No | None | Known total owners, for wall-clock estimate |
91
+
92
+ **Auth:** Set `ROBLOX_COOKIE` (legacy web-session) and/or `ROBLOX_BEARER` (Open Cloud) environment variables.
93
+
94
+ ### github-stargazers
95
+
96
+ GitHub `api.github.com/repos/{owner}/{repo}/stargazers` -- users who starred a repository.
97
+
98
+ | Option | Required | Default | Description |
99
+ |---|---|---|---|
100
+ | `--owner` | Yes | -- | Repository owner/org (e.g. `anthropics`) |
101
+ | `--repo` | Yes | -- | Repository name (e.g. `anthropic-sdk-python`) |
102
+ | `--page-size` | No | 100 | Items per page (capped at 100) |
103
+ | `--total-items` | No | None | Known stargazer count, for wall-clock estimate |
104
+
105
+ **Auth:** Set `GITHUB_TOKEN` environment variable. Without it, you get the anonymous rate limit (60 requests/hour).
106
+
107
+ ## Adding an Endpoint
108
+
109
+ 1. Create a new module in `src/sonde/endpoints/`.
110
+ 2. Subclass `Endpoint` and implement `build_request(cursor)` and `parse_page(response)`.
111
+ 3. Decorate with `@register` and set a unique `name` (becomes the CLI subcommand).
112
+ 4. Override `_make_provider()` to return the appropriate `Provider` (or use the generic one for standard 200/429 + IETF headers).
113
+ 5. Optionally implement `total_items()` for scrape-time estimates, `add_arguments()` / `from_args()` for CLI options, and `extra_headers()` for endpoint-specific headers.
114
+ 6. If the endpoint is paginated, call `add_pagination_args(parser, page_max=cls.MAX_PAGE)` in `add_arguments()` and `pagination_from_args(args, page_max=cls.MAX_PAGE)` in `from_args()` so it gets the shared `--page-size` / `--total-items` flags (clamped to your endpoint's cap).
115
+ 7. Import the new module in `src/sonde/endpoints/__init__.py` so it registers on package load.
116
+
117
+ Minimal example:
118
+
119
+ ```python
120
+ from sonde import Endpoint, RequestSpec, PageResult, register
121
+
122
+ @register
123
+ class MyEndpoint(Endpoint):
124
+ name = "my-endpoint"
125
+ help = "one-line description for --help"
126
+
127
+ def build_request(self, cursor):
128
+ return RequestSpec(url="https://api.example.com/items", params={"page": cursor or 1})
129
+
130
+ def parse_page(self, response):
131
+ data = response.json()
132
+ return PageResult(count=len(data["items"]), next_cursor=data.get("next_page"))
133
+ ```
134
+
135
+ ## CLI Reference
136
+
137
+ Common options shared by all endpoints:
138
+
139
+ | Option | Default | Description |
140
+ |---|---|---|
141
+ | `--max-requests` | 1200 | Hard global cap across all phases (safety budget) |
142
+ | `--seq-cap` | 150 | Max sequential requests before stopping |
143
+ | `--skip-burst` | off | Skip the concurrent burst phase |
144
+ | `--burst-sizes` | `10,20,40,80` | Comma-separated list of concurrent burst sizes |
145
+ | `--burst-cooldown` | 60.0 | Fallback seconds between bursts if the recovery window can't be measured |
146
+ | `--recovery-step` | 0.25 | Initial poll delay when measuring the throttle window (grows geometrically) |
147
+ | `--recovery-max` | 90.0 | Give up measuring the window after this many seconds |
148
+ | `--recovery-polls` | 15 | Max polls during recovery measurement |
149
+ | `--skip-sweep` | off | Skip the sustained-interval sweep phase |
150
+ | `--force-sweep` | off | Run the sweep even when authoritative rate-limit headers are present |
151
+ | `--sweep-intervals` | `8,5,3,2,1.2,0.6,0.3,0.15` | Inter-request intervals (seconds) to test, slow to fast |
152
+ | `--sweep-count` | 20 | Paced requests per interval after draining |
153
+ | `--sweep-drain` | 500 | Cap on rapid requests used to empty the bucket before each interval |
154
+ | `--sweep-tolerance` | 0.1 | Max fraction of 429s for an interval to count as sustainable |
155
+ | `--margin` | 0.8 | Safety margin: fraction of the measured max rate to pace at (0.8 = 80% of the ceiling rate, i.e. 25% longer between requests) |
156
+ | `--output` | `sonde_report.json` | Path for the JSON report (use `-` for stdout) |
157
+ | `-v` / `--verbose` | off | Show per-request detail (sets log level to DEBUG) |
158
+ | `-q` / `--quiet` | off | Only show warnings and errors (sets log level to WARNING) |
159
+ | `--log-format` | `plain` | Log line format: `plain` (message-only) or `json` (structured) |
160
+
161
+ `-v` and `-q` are mutually exclusive. Logs always go to stderr; the report goes to `--output`.
162
+
163
+ **Exit codes:** `0` success, `2` precondition failure (bad arguments, unwritable `--output`, or the endpoint returned no usable response), `1` unexpected crash, `130` interrupted.
164
+
165
+ ### Piping and machine-readable output
166
+
167
+ Use `--output -` to write the JSON report to stdout instead of a file. Combine with `-q` to suppress INFO-level log noise on stderr:
168
+
169
+ ```bash
170
+ sonde asset-owners --asset-id 20573078 --output - -q | jq .estimate
171
+ ```
172
+
173
+ Use `--log-format json` for structured log lines on stderr (keys: `timestamp`, `level`, `logger`, `message`, plus `exc` on error lines), useful for log aggregators or CI pipelines:
174
+
175
+ ```bash
176
+ sonde asset-owners --asset-id 20573078 --log-format json 2>sonde.log
177
+ ```
178
+
179
+ ## Docker
180
+
181
+ Build:
182
+
183
+ ```bash
184
+ docker build -t sonde .
185
+ ```
186
+
187
+ Run (mount current directory so the report lands on the host):
188
+
189
+ ```bash
190
+ docker run --rm -v "$(pwd):/data" -e ROBLOX_COOKIE sonde \
191
+ asset-owners --asset-id 20573078 --total-items 1470000
192
+ ```
193
+
194
+ ```bash
195
+ docker run --rm -v "$(pwd):/data" -e GITHUB_TOKEN sonde \
196
+ github-stargazers --owner anthropics --repo anthropic-sdk-python --total-items 5000
197
+ ```
198
+
199
+ The container writes `sonde_report.json` to `/data` (the mounted volume).
200
+
201
+ ## Development
202
+
203
+ ```bash
204
+ pip install -e '.[dev]'
205
+ ```
206
+
207
+ Run tests and linting:
208
+
209
+ ```bash
210
+ pytest
211
+ ruff check .
212
+ ruff format --check .
213
+ ```
214
+
215
+ ## License
216
+
217
+ [MIT](https://github.com/Jartan-LLC/sonde/blob/main/LICENSE)
@@ -0,0 +1,65 @@
1
+ [build-system]  # backend used to build the sdist and wheel
2
+ requires = ["setuptools>=77.0"]  # 77.0 is the first setuptools release that accepts the SPDX `license = "MIT"` string (PEP 639)
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sonde"
7
+ dynamic = ["version"]  # resolved by [tool.setuptools.dynamic] below
8
+ description = "Probe any HTTP API for its rate limits, burst ceiling, and full-scrape time."
9
+ readme = "README.md"  # becomes the long description shown on PyPI
10
+ license = "MIT"
11
+ requires-python = ">=3.12"
12
+ authors = [{ name = "Jartan LLC" }]
13
+ keywords = ["rate-limit", "http", "api", "scraping", "benchmark", "cli"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Environment :: Console",
17
+ "Intended Audience :: Developers",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3 :: Only",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Operating System :: OS Independent",
23
+ "Topic :: Internet :: WWW/HTTP",
24
+ "Topic :: Software Development :: Testing",
25
+ ]
26
+ dependencies = [
27
+ "requests>=2.28",
28
+ "httpx>=0.24",  # drives the concurrent burst phase
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [  # installed with `pip install -e '.[dev]'`
33
+ "pytest>=7.0",
34
+ "ruff>=0.4",
35
+ ]
36
+
37
+ [project.scripts]
38
+ sonde = "sonde.cli:main"  # the `sonde` command calls main() in the sonde.cli module
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/Jartan-LLC/sonde"
42
+ Repository = "https://github.com/Jartan-LLC/sonde"
43
+ Issues = "https://github.com/Jartan-LLC/sonde/issues"
44
+ Changelog = "https://github.com/Jartan-LLC/sonde/blob/main/CHANGELOG.md"
45
+
46
+ [tool.setuptools.dynamic]
47
+ version = {attr = "sonde.__version__"}  # single source of truth: __version__ in the sonde package's __init__.py
48
+
49
+ [tool.setuptools.packages.find]
50
+ where = ["src"]  # src layout: importable packages live under src/
51
+
52
+ [tool.setuptools.package-data]
53
+ sonde = ["py.typed"]  # ship the PEP 561 marker so type checkers use the inline annotations
54
+
55
+ [tool.pytest.ini_options]
56
+ testpaths = ["tests"]
57
+ addopts = "-q"  # quiet output by default
58
+ pythonpath = ["."]  # adds the project root to sys.path for test runs
59
+
60
+ [tool.ruff]
61
+ target-version = "py312"
62
+ line-length = 100
63
+
64
+ [tool.ruff.lint]
65
+ select = ["E", "F", "W", "I"]  # pycodestyle errors/warnings, Pyflakes, isort
sonde-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,25 @@
1
+ """sonde — probe any HTTP API for its rate limits, burst ceiling, and full-scrape time."""
2
+
3
+ __version__ = "0.1.0"  # also read by the build via `attr = "sonde.__version__"` in pyproject.toml
4
+
5
+ # Keep these imports below __version__: core.py does `from . import __version__`, which only resolves once the name is bound here (hence the E402 suppressions).
6
+ from .endpoint import ( # noqa: E402
7
+ Endpoint,
8
+ PageResult,
9
+ RequestSpec,
10
+ add_pagination_args,
11
+ pagination_from_args,
12
+ register,
13
+ )
14
+ from .provider import Provider # noqa: E402
15
+
16
+ __all__ = [  # public extension API re-exported for writing custom endpoints and providers
17
+ "__version__",
18
+ "Endpoint",
19
+ "RequestSpec",
20
+ "PageResult",
21
+ "register",
22
+ "Provider",
23
+ "add_pagination_args",
24
+ "pagination_from_args",
25
+ ]
@@ -0,0 +1,5 @@
1
+ """Entry point: `python -m sonde <endpoint> [options]`."""
2
+
3
+ from .cli import main
4
+
5
+ main()