sonde 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sonde/__init__.py +25 -0
- sonde/__main__.py +5 -0
- sonde/cli.py +343 -0
- sonde/core.py +185 -0
- sonde/endpoint.py +154 -0
- sonde/endpoints/__init__.py +13 -0
- sonde/endpoints/asset_owners.py +83 -0
- sonde/endpoints/github_stargazers.py +94 -0
- sonde/logconfig.py +127 -0
- sonde/phases.py +600 -0
- sonde/provider.py +171 -0
- sonde/py.typed +0 -0
- sonde-0.1.0.dist-info/METADATA +248 -0
- sonde-0.1.0.dist-info/RECORD +18 -0
- sonde-0.1.0.dist-info/WHEEL +5 -0
- sonde-0.1.0.dist-info/entry_points.txt +2 -0
- sonde-0.1.0.dist-info/licenses/LICENSE +21 -0
- sonde-0.1.0.dist-info/top_level.txt +1 -0
sonde/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""sonde — probe any HTTP API for its rate limits, burst ceiling, and full-scrape time."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
# Re-exported after __version__ so core.py's `from . import __version__` resolves.
|
|
6
|
+
from .endpoint import ( # noqa: E402
|
|
7
|
+
Endpoint,
|
|
8
|
+
PageResult,
|
|
9
|
+
RequestSpec,
|
|
10
|
+
add_pagination_args,
|
|
11
|
+
pagination_from_args,
|
|
12
|
+
register,
|
|
13
|
+
)
|
|
14
|
+
from .provider import Provider # noqa: E402
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"__version__",
|
|
18
|
+
"Endpoint",
|
|
19
|
+
"RequestSpec",
|
|
20
|
+
"PageResult",
|
|
21
|
+
"register",
|
|
22
|
+
"Provider",
|
|
23
|
+
"add_pagination_args",
|
|
24
|
+
"pagination_from_args",
|
|
25
|
+
]
|
sonde/__main__.py
ADDED
sonde/cli.py
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py — argument parsing, endpoint selection, and run orchestration.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python -m sonde <endpoint> [common options] [endpoint options]
|
|
6
|
+
|
|
7
|
+
The common rate-limit options are shared across every endpoint; each registered
|
|
8
|
+
endpoint contributes its own options as a subcommand.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
import sys
|
|
17
|
+
from collections.abc import Iterable
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from . import (
|
|
21
|
+
core,
|
|
22
|
+
endpoint,
|
|
23
|
+
endpoints, # noqa: F401 (import registers all endpoints)
|
|
24
|
+
phases,
|
|
25
|
+
)
|
|
26
|
+
from .logconfig import register_log_secrets, setup_logging
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# Header names whose values are credentials and must be kept out of logs.
|
|
31
|
+
_SECRET_HEADER_KEYS = frozenset({"authorization", "cookie", "proxy-authorization", "x-api-key"})
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _secret_variants(value: str) -> Iterable[str]:
    """Yield `value` itself, then any bare credential embedded in it.

    A target may echo only the token (without `Bearer ` or `.ROBLOSECURITY=`),
    so the text after the first space and the text after the first `=` are
    yielded as well, letting those echoes be redacted too.
    """
    yield value
    # A tail shorter than 8 characters is not treated as a credential: redacting
    # such a short fragment could blank out unrelated log text.
    # " " covers "Bearer <tok>"; "=" covers ".ROBLOSECURITY=<cookie>".
    for separator in (" ", "="):
        _, found, tail = value.partition(separator)
        if found and len(tail) >= 8:
            yield tail
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _int_list(raw: str) -> list[int]:
    """argparse type for a comma-separated list of ints (clean exit-2 on bad input).

    Empty segments (e.g. from a trailing comma) are ignored.

    Raises:
        argparse.ArgumentTypeError: if a segment is not an integer, or if no
            values remain after dropping empty segments. argparse turns this
            into a usage error.
    """
    try:
        vals = [int(x) for x in raw.split(",") if x.strip()]
    except ValueError as e:
        # Chain explicitly so the original ValueError is kept as __cause__.
        raise argparse.ArgumentTypeError(f"comma-separated integers required: {e}") from e
    if not vals:
        raise argparse.ArgumentTypeError("at least one value required")
    return vals
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _float_list(raw: str) -> list[float]:
    """argparse type for a comma-separated list of floats (clean exit-2 on bad input).

    Empty segments (e.g. from a trailing comma) are ignored. Non-finite values
    (`nan`, `inf`, `-inf`) are rejected: `float()` accepts them, but they are
    not meaningful as numeric option values.

    Raises:
        argparse.ArgumentTypeError: if a segment is not a finite number, or if
            no values remain after dropping empty segments. argparse turns this
            into a usage error.
    """
    import math  # local import: this is the only use in the module

    try:
        vals = [float(x) for x in raw.split(",") if x.strip()]
    except ValueError as e:
        # Chain explicitly so the original ValueError is kept as __cause__.
        raise argparse.ArgumentTypeError(f"comma-separated numbers required: {e}") from e
    if not vals:
        raise argparse.ArgumentTypeError("at least one value required")
    for v in vals:
        if not math.isfinite(v):
            raise argparse.ArgumentTypeError(f"finite numbers required, got {v!r}")
    return vals
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def build_common_parser() -> argparse.ArgumentParser:
    """Build the parser holding every endpoint-agnostic probe option.

    The parser is created with ``add_help=False`` so it can serve as a
    ``parents=`` entry for other parsers.
    """
    parser = argparse.ArgumentParser(add_help=False)
    probe = parser.add_argument_group("rate-limit probe options")

    # --- global caps -------------------------------------------------------
    probe.add_argument(
        "--max-requests",
        type=int,
        default=1200,
        help="hard global cap across all phases (safety)",
    )
    probe.add_argument(
        "--seq-cap",
        type=int,
        default=150,
        help="max sequential requests before giving up on a 429",
    )

    # --- burst phase and throttle-window recovery --------------------------
    probe.add_argument("--skip-burst", action="store_true")
    probe.add_argument(
        "--burst-sizes",
        type=_int_list,
        default=[10, 20, 40, 80],
        help="comma list of concurrent burst sizes (default: 10,20,40,80)",
    )
    probe.add_argument(
        "--burst-cooldown",
        type=float,
        default=60.0,
        help="fallback seconds between bursts if the window can't be measured",
    )
    probe.add_argument(
        "--recovery-step",
        type=float,
        default=0.25,
        help="first poll delay when measuring the throttle window (grows geometrically)",
    )
    probe.add_argument(
        "--recovery-max",
        type=float,
        default=90.0,
        help="give up measuring the window after this many seconds",
    )
    probe.add_argument(
        "--recovery-polls",
        type=int,
        default=15,
        help="max polls during recovery measurement (bounds request count)",
    )

    # --- sustained-interval sweep ------------------------------------------
    probe.add_argument(
        "--skip-sweep",
        action="store_true",
        help="skip the sustained-interval sweep",
    )
    probe.add_argument(
        "--force-sweep",
        action="store_true",
        help="run the sweep even when authoritative headers are present "
        "(skipped by default in that case; it's redundant and slow)",
    )
    probe.add_argument(
        "--sweep-intervals",
        type=_float_list,
        default=[8, 5, 3, 2, 1.2, 0.6, 0.3, 0.15],
        help="inter-request intervals (s) to test, SLOW->FAST (default: "
        "8,5,3,2,1.2,0.6,0.3,0.15). Wide so it can bracket slow limits; only "
        "used as a fallback when headers are missing.",
    )
    probe.add_argument(
        "--sweep-count",
        type=int,
        default=20,
        help="paced requests per interval after draining",
    )
    probe.add_argument(
        "--sweep-drain",
        type=int,
        default=500,
        help="cap on rapid requests used to empty the bucket before each interval; "
        "the drain runs until empty or this cap",
    )
    probe.add_argument(
        "--sweep-tolerance",
        type=float,
        default=0.1,
        help="max fraction of 429s from empty for an interval to count as sustainable",
    )

    # --- estimate and report -----------------------------------------------
    probe.add_argument(
        "--margin",
        type=float,
        default=0.8,
        help="safety margin: recommended interval = floor / margin (0.8 => 25%% slower)",
    )
    probe.add_argument(
        "--output",
        default="sonde_report.json",
        help="report output file (use '-' for stdout)",
    )

    # --- logging: -v and -q cannot be combined -----------------------------
    verbosity = parser.add_mutually_exclusive_group()
    verbosity.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="show per-request detail (sets log level to DEBUG)",
    )
    verbosity.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="only show warnings and errors (sets log level to WARNING)",
    )
    parser.add_argument(
        "--log-format",
        choices=["plain", "json"],
        default="plain",
        help="log line format: plain (message-only, default) or json (structured)",
    )
    return parser
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level `sonde` parser with one subcommand per registered endpoint.

    Each subcommand inherits the common probe options and then adds the
    endpoint's own arguments. Subcommands are added in sorted name order.
    """
    shared_options = build_common_parser()
    parser = argparse.ArgumentParser(
        prog="sonde",
        description="Probe any HTTP API for its rate limits. Pick an endpoint subcommand.",
    )
    subcommands = parser.add_subparsers(dest="endpoint", required=True, metavar="ENDPOINT")
    registry = endpoint.all_endpoints()
    for name, cls in sorted(registry.items()):
        subparser = subcommands.add_parser(
            name,
            parents=[shared_options],
            help=cls.help,
            description=cls.help,
        )
        # Let the endpoint class contribute its own options.
        cls.add_arguments(subparser)
    return parser
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def run(args: argparse.Namespace) -> dict[str, Any]:
    """Run the probe for the selected endpoint and return the report dict.

    Phases run in order: sanity, sequential, burst (unless --skip-burst),
    sustained-interval sweep (unless skipped or made redundant by
    authoritative rate-limit headers), then the estimate. The report is
    written to `args.output` via `_dump` before returning.

    Raises:
        SystemExit: if the endpoint name is not registered, or (via
            `_preflight_output`) if the output path is not writable.
    """
    ep_cls = endpoint.get(args.endpoint)
    if ep_cls is None:
        raise SystemExit(f"unknown endpoint: {args.endpoint}")
    # Check the output path before spending any requests.
    _preflight_output(args.output)
    ep = ep_cls.from_args(args)
    provider = ep.provider()
    burst_sizes = args.burst_sizes
    # Sort descending so the sweep goes from the slowest interval to the fastest.
    sweep_intervals = sorted(args.sweep_intervals, reverse=True)
    budget = core.Budget(max_requests=args.max_requests)
    # base headers < provider auth < endpoint extras
    headers = {**core.BASE_HEADERS, **provider.auth_headers(), **ep.extra_headers()}
    # Keep our own credentials out of logs if the target echoes them back.
    register_log_secrets(
        variant
        for k, v in headers.items()
        if k.lower() in _SECRET_HEADER_KEYS
        for variant in _secret_variants(v)
    )
    session = core.build_session(headers=headers)

    logger.info("Endpoint : %s", ep.name)
    logger.info("Provider : %s", provider.name)
    logger.info(
        "Auth : %s",
        "credentials set" if provider.auth_headers() else "none (anonymous)",
    )
    logger.info("Budget : %s requests total", args.max_requests)

    report = {"endpoint": ep.name, "provider": provider.name}

    # Sanity phase: also yields the parsed rate-limit header info (`rl`).
    sanity, rl = phases.phase_sanity(session, ep, budget)
    report["sanity"] = {
        "status": sanity.status,
        "rclass": sanity.rclass.value,
        "items": sanity.count,
        "headers": sanity.headers,
    }
    report["ratelimit_headers"] = rl
    if sanity.rclass != core.RClass.OK:
        # Without a usable response there is nothing to probe; write the partial
        # report and stop.
        logger.warning(
            "\nAborting: no usable success response from the endpoint. "
            "Fix auth / arguments and re-run."
        )
        _dump(args.output, report)
        return report
    page_count = sanity.count  # items per successful page, for the estimate

    seq_summary, cursor_pool = phases.phase_seq(session, ep, budget, args.seq_cap)
    report["sequential"] = seq_summary

    # Defaults stand in for the burst results when --skip-burst is given.
    burst_results, measured_window = [], None
    if not args.skip_burst:
        # The burst phase is handed the header dict rather than the session.
        burst_results, measured_window = phases.phase_burst(
            headers,
            ep,
            budget,
            burst_sizes,
            args.burst_cooldown,
            cursor_pool,
            args.recovery_step,
            args.recovery_max,
            args.recovery_polls,
        )
    report["burst"] = burst_results
    report["measured_window_seconds"] = measured_window

    swept_interval, sweep_rows = None, []
    # Headers count as authoritative only when both a limit and a window are present.
    headers_authoritative = bool(rl.get("limit") and rl.get("window_s"))
    run_sweep = (not args.skip_sweep) and (args.force_sweep or not headers_authoritative)
    if run_sweep:
        swept_interval, sweep_rows = phases.phase_sweep(
            session,
            ep,
            budget,
            cursor_pool,
            sweep_intervals,
            args.sweep_count,
            args.sweep_drain,
            args.sweep_tolerance,
        )
    elif headers_authoritative and not args.skip_sweep:
        # Explain the skip only when it was automatic, not requested by --skip-sweep.
        logger.info("\n== PHASE: sustained-interval sweep ==")
        logger.info(
            " skipped: authoritative rate-limit headers already give the limit. "
            "Use --force-sweep to run it anyway as an independent check."
        )
    report["sweep"] = sweep_rows
    report["swept_floor_interval_s"] = swept_interval

    report["estimate"] = phases.phase_estimate(
        ep, page_count, seq_summary, burst_results, measured_window, swept_interval, args.margin, rl
    )
    report["requests_used"] = budget.used

    _dump(args.output, report)
    logger.info("\nRequests used: %s/%s", budget.used, args.max_requests)
    if args.output != "-":
        logger.info("Full report written to: %s", args.output)
    return report
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _preflight_output(path: str) -> None:
    """Check that the --output path is writable before any probing starts.

    "-" (stdout) needs no check. For a file path, an unwritable target is
    logged and the process exits with status 2.
    """
    if path != "-":
        try:
            # Opening in append mode proves writability without truncating an
            # existing report. A path that did not exist is created as a
            # zero-byte file, and that file stays behind if the run is
            # interrupted before _dump (accepted cost of failing fast).
            with open(path, "a"):
                pass
        except OSError as exc:
            logger.error("cannot write --output %r: %s", path, exc)
            raise SystemExit(2)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _dump(path: str, report: dict[str, Any]) -> None:
    """Write `report` as indented JSON to `path`, or to stdout when `path` is "-".

    The report is serialised to a string before the destination is opened, so
    a value that json cannot encode raises without truncating an existing
    report file or emitting partial JSON on stdout.

    Raises:
        TypeError / ValueError: if `report` is not JSON-serialisable.
        OSError: if the output file cannot be opened or written.
    """
    text = json.dumps(report, indent=2)
    if path == "-":
        sys.stdout.write(text)
        sys.stdout.write("\n")
    else:
        with open(path, "w", encoding="utf-8") as f:
            f.write(text)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _aborted(report: dict[str, Any]) -> bool:
    """Tell whether the probe stopped early on a non-OK sanity response.

    main() turns a True result into a non-zero exit so CI can detect it.
    A report with no (or an empty) "sanity" entry is not treated as aborted.
    """
    sanity = report.get("sanity")
    if not sanity:
        return False
    return sanity.get("rclass") != core.RClass.OK.value
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: parse arguments, configure logging, run the probe.

    Exit codes: 0 success, 2 precondition failure (bad args / unwritable output /
    endpoint returned no usable response), 1 unexpected crash, 130 interrupted.
    """
    args = build_parser().parse_args(argv)

    # --verbose takes precedence over --quiet; INFO is the default.
    if args.verbose:
        level = logging.DEBUG
    elif args.quiet:
        level = logging.WARNING
    else:
        level = logging.INFO
    setup_logging(level=level, fmt=args.log_format)

    try:
        report = run(args)
    except KeyboardInterrupt:
        logger.warning("interrupted.")
        sys.exit(130)
    except Exception:
        # Crashes go through the logger, not a raw traceback dump, so that
        # stderr stays valid JSON under --log-format json and the traceback
        # is escaped by the plain formatter.
        logger.error("unexpected error", exc_info=True)
        sys.exit(1)

    if _aborted(report):
        sys.exit(2)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
if __name__ == "__main__":
|
|
343
|
+
main()
|
sonde/core.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
core.py — endpoint- and provider-agnostic HTTP plumbing.
|
|
3
|
+
|
|
4
|
+
Response classification, rate-limit-header parsing, and auth are NOT here — those
|
|
5
|
+
vary per API and live behind the Provider interface (provider.py). core only knows
|
|
6
|
+
how to issue a request, time it, and hand the response to the endpoint's provider
|
|
7
|
+
for classification and to the endpoint for item extraction.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import threading
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from http.cookiejar import DefaultCookiePolicy
|
|
17
|
+
from typing import TYPE_CHECKING, Any
|
|
18
|
+
|
|
19
|
+
import requests
|
|
20
|
+
from requests.adapters import HTTPAdapter
|
|
21
|
+
|
|
22
|
+
from . import __version__
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from .endpoint import Endpoint
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"RClass",
|
|
29
|
+
"Budget",
|
|
30
|
+
"Result",
|
|
31
|
+
"build_session",
|
|
32
|
+
"fetch",
|
|
33
|
+
"default_rclass",
|
|
34
|
+
"interesting_headers",
|
|
35
|
+
"BASE_HEADERS",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
BASE_HEADERS = {
|
|
39
|
+
"Accept": "application/json",
|
|
40
|
+
"User-Agent": f"sonde/{__version__} (one-time diagnostic)",
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
# Response headers worth surfacing (case-insensitive substring match).
|
|
44
|
+
HEADER_SUBSTRINGS = ("ratelimit", "retry-after", "x-request", "server", "cf-ray")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# --------------------------------------------------------------------------- #
|
|
48
|
+
# Normalised response class — phases branch on this, never on raw status.
|
|
49
|
+
# --------------------------------------------------------------------------- #
|
|
50
|
+
class RClass(str, Enum):
    """Normalised response class; each member's value is its lowercase name."""

    # A usable success response.
    OK = "ok"
    # Rate-limited (429, or provider-specific).
    THROTTLED = "throttled"
    # Any other non-success (4xx/5xx/network).
    ERROR = "error"
    # Local request budget exhausted (not a server response).
    BUDGET = "budget"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def default_rclass(status: int) -> RClass:
    """Map a raw status code to an RClass using the fallback rules.

    200 is OK, 429 is THROTTLED, the local sentinel -1 is BUDGET, and every
    other value is ERROR. The generic Provider uses the same classification.
    """
    known = {
        200: RClass.OK,
        429: RClass.THROTTLED,
        -1: RClass.BUDGET,
    }
    return known.get(status, RClass.ERROR)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# --------------------------------------------------------------------------- #
|
|
69
|
+
# Request budget: thread-safe hard ceiling.
|
|
70
|
+
# --------------------------------------------------------------------------- #
|
|
71
|
+
@dataclass
class Budget:
    """Hard ceiling on the number of requests, with counting guarded by a lock."""

    # Maximum number of requests that may be taken.
    max_requests: int
    # Number of requests taken so far.
    used: int = 0
    # Guards reads and updates of `used` inside take() and remaining().
    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)

    def take(self) -> bool:
        """Claim one request; return False (claiming nothing) once the cap is reached."""
        with self._lock:
            granted = self.used < self.max_requests
            if granted:
                self.used += 1
            return granted

    def remaining(self) -> int:
        """Return how many requests are still available, never below zero."""
        with self._lock:
            left = self.max_requests - self.used
            return left if left > 0 else 0
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# --------------------------------------------------------------------------- #
|
|
90
|
+
# Session
|
|
91
|
+
# --------------------------------------------------------------------------- #
|
|
92
|
+
def build_session(headers: dict[str, str] | None = None) -> requests.Session:
    """Create the `requests` session used by the serial phases.

    `headers` (or a copy of BASE_HEADERS when none are given) become the
    session defaults. Auth travels in those headers, and the cookie policy
    allows no domains, so the cookie jar is never written to. The burst phase
    does not use this session; it builds its own httpx client.
    """
    session = requests.Session()
    session.headers.update(headers if headers else dict(BASE_HEADERS))
    session.cookies.set_policy(DefaultCookiePolicy(allowed_domains=[]))

    # One adapter serves both schemes; retries are disabled (max_retries=0).
    shared_adapter = HTTPAdapter(pool_connections=4, pool_maxsize=10, max_retries=0)
    for scheme in ("https://", "http://"):
        session.mount(scheme, shared_adapter)
    return session
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# --------------------------------------------------------------------------- #
|
|
105
|
+
# Result + response handling
|
|
106
|
+
# --------------------------------------------------------------------------- #
|
|
107
|
+
@dataclass
class Result:
    """Outcome of one probe request: status, timing, classification and page info."""

    # Raw status code of the response.
    status: int
    # Measured duration of the request.
    elapsed: float
    # May be None only as an input; __post_init__ then derives it from `status`,
    # so it is never None once construction has finished.
    rclass: RClass | None = None
    # Item count and next pagination cursor taken from a parsed page.
    count: int = 0
    next_cursor: Any = None
    # Parsed Retry-After value, when one was available.
    retry_after: float | None = None
    # Subset of response headers kept for the report.
    headers: dict[str, str] = field(default_factory=dict)
    # Description of what went wrong, if anything.
    error: str | None = None

    def __post_init__(self) -> None:
        """Fill in `rclass` from `status` when the caller did not supply one."""
        if self.rclass is not None:
            return
        self.rclass = default_rclass(self.status)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def interesting_headers(resp: Any) -> dict[str, str]:
    """Return the response headers whose name contains any HEADER_SUBSTRINGS entry.

    Matching is done on the lowercased header name; the returned dict keeps
    the original name casing and the response's header order.
    """
    picked: dict[str, str] = {}
    for name, value in resp.headers.items():
        lowered = name.lower()
        if any(fragment in lowered for fragment in HEADER_SUBSTRINGS):
            picked[name] = value
    return picked
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
_PARSE_ERRORS = (ValueError, KeyError, TypeError, AttributeError, IndexError)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _retry_after_seconds(raw: Any) -> float | None:
    """Convert a Retry-After header value to seconds, or None if it is unusable.

    RFC 9110 allows two forms: a number of seconds ("120") or an HTTP-date
    ("Wed, 21 Oct 2015 07:28:00 GMT"). A date is converted to the number of
    seconds from now, clamped at 0 for dates already in the past.
    """
    if raw is None:
        return None
    try:
        return float(raw)
    except (ValueError, TypeError):
        pass
    if not isinstance(raw, str):
        return None

    # Local imports: only needed for the less common HTTP-date form.
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    try:
        when = parsedate_to_datetime(raw)
    except (ValueError, TypeError):
        # Invalid dates raise ValueError (TypeError on older Pythons).
        return None
    if when.tzinfo is None:
        # A "-0000" zone yields a naive datetime that represents UTC.
        when = when.replace(tzinfo=timezone.utc)
    return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())


def _parse_response(resp: Any, elapsed: float, endpoint: Endpoint) -> Result:
    """Turn a raw HTTP response into a Result.

    The endpoint's provider classifies the response. On an OK classification
    the endpoint extracts item count and next cursor from the FULL response
    (so header-based pagination and non-JSON bodies are possible); a parse
    failure is recorded in `Result.error` rather than raised. For an ERROR
    classification with status >= 400, the first 200 characters of the body
    are kept as the error text.

    Args:
        resp: the response object (must expose `headers`, `status_code`, `text`).
        elapsed: measured request duration, stored on the Result unchanged.
        endpoint: supplies `provider()` for classification and `parse_page()`.
    """
    provider = endpoint.provider()
    rclass = provider.classify(resp)

    # Accept both the delay-seconds and HTTP-date forms of Retry-After.
    retry_after = _retry_after_seconds(resp.headers.get("Retry-After"))

    res = Result(
        status=resp.status_code,
        elapsed=elapsed,
        rclass=rclass,
        retry_after=retry_after,
        headers=interesting_headers(resp),
    )
    if rclass == RClass.OK:
        try:
            page = endpoint.parse_page(resp)
            res.count = page.count
            res.next_cursor = page.next_cursor
        except _PARSE_ERRORS as e:
            res.error = f"OK response but parse_page failed: {e}"
    elif rclass == RClass.ERROR and resp.status_code >= 400:
        res.error = resp.text[:200]
    return res
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def fetch(session: requests.Session, endpoint: Endpoint, cursor: Any, budget: Budget) -> Result:
    """Issue a single probe request for `endpoint` at pagination position `cursor`.

    Returns a BUDGET result (status -1) without touching the network when the
    budget is exhausted, and an ERROR result (status 0) when the request
    itself fails; otherwise the response is handed to _parse_response.
    """
    if not budget.take():
        return Result(
            status=-1,
            elapsed=0.0,
            rclass=RClass.BUDGET,
            error="request budget exhausted",
        )

    provider = endpoint.provider()
    spec = endpoint.build_request(cursor)
    # Provider auth params first, so the request's own params win on a clash.
    query = dict(provider.auth_params())
    query.update(spec.params or {})

    started = time.perf_counter()
    try:
        resp = session.request(
            spec.method,
            spec.url,
            params=query,
            json=spec.json_body,
            timeout=30,
        )
    except requests.RequestException as exc:
        took = time.perf_counter() - started
        return Result(status=0, elapsed=took, rclass=RClass.ERROR, error=str(exc))
    return _parse_response(resp, time.perf_counter() - started, endpoint)
|