bulkurlchecker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bulkurlchecker/__init__.py +45 -0
- bulkurlchecker/_version.py +1 -0
- bulkurlchecker/client.py +303 -0
- bulkurlchecker/exceptions.py +77 -0
- bulkurlchecker/py.typed +0 -0
- bulkurlchecker/types.py +118 -0
- bulkurlchecker-0.1.0.dist-info/METADATA +185 -0
- bulkurlchecker-0.1.0.dist-info/RECORD +11 -0
- bulkurlchecker-0.1.0.dist-info/WHEEL +5 -0
- bulkurlchecker-0.1.0.dist-info/licenses/LICENSE +21 -0
- bulkurlchecker-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""bulkurlchecker — Python client for the Bulk URL Checker API.
|
|
2
|
+
|
|
3
|
+
Quickstart:
|
|
4
|
+
|
|
5
|
+
from bulkurlchecker import Client
|
|
6
|
+
client = Client(api_key="uck_live_...")
|
|
7
|
+
results = client.check_urls([
|
|
8
|
+
"https://example.com",
|
|
9
|
+
"https://example.org",
|
|
10
|
+
])
|
|
11
|
+
for r in results.results:
|
|
12
|
+
print(r.url, r.status_code)
|
|
13
|
+
|
|
14
|
+
Get an API key at https://app.bulkurlchecker.com/dashboard/api-keys.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from ._version import __version__
|
|
18
|
+
from .client import Client
|
|
19
|
+
from .exceptions import (
|
|
20
|
+
AuthenticationError,
|
|
21
|
+
BulkURLCheckerError,
|
|
22
|
+
NotFoundError,
|
|
23
|
+
QuotaError,
|
|
24
|
+
RateLimitError,
|
|
25
|
+
ServerError,
|
|
26
|
+
TimeoutError,
|
|
27
|
+
ValidationError,
|
|
28
|
+
)
|
|
29
|
+
from .types import CheckResults, JobSummary, URLResult
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"__version__",
|
|
33
|
+
"Client",
|
|
34
|
+
"CheckResults",
|
|
35
|
+
"JobSummary",
|
|
36
|
+
"URLResult",
|
|
37
|
+
"BulkURLCheckerError",
|
|
38
|
+
"AuthenticationError",
|
|
39
|
+
"RateLimitError",
|
|
40
|
+
"QuotaError",
|
|
41
|
+
"ValidationError",
|
|
42
|
+
"NotFoundError",
|
|
43
|
+
"ServerError",
|
|
44
|
+
"TimeoutError",
|
|
45
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
bulkurlchecker/client.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
"""Synchronous Python client for the Bulk URL Checker REST API.
|
|
2
|
+
|
|
3
|
+
Designed to be the shortest path from "I need to check 50K URLs" to
|
|
4
|
+
"results are in my hands." If you find yourself writing httpx + asyncio
|
|
5
|
+
+ proxy rotation + per-domain rate limiting + retry classification +
|
|
6
|
+
soft-404 detection, stop and use this instead.
|
|
7
|
+
|
|
8
|
+
Quick example:
|
|
9
|
+
|
|
10
|
+
from bulkurlchecker import Client
|
|
11
|
+
client = Client(api_key="uck_live_...")
|
|
12
|
+
out = client.check_urls(["https://example.com", "https://example.org"])
|
|
13
|
+
for r in out.results:
|
|
14
|
+
print(r.url, r.status_code, "BROKEN" if r.is_broken else "ok")
|
|
15
|
+
|
|
16
|
+
For larger jobs that exceed the synchronous wait budget, use the
|
|
17
|
+
two-step pattern:
|
|
18
|
+
|
|
19
|
+
job = client.submit(my_urls)
|
|
20
|
+
# ... do other things ...
|
|
21
|
+
for batch in client.iter_results(job.job_id, page_size=1000):
|
|
22
|
+
process(batch)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import platform
|
|
28
|
+
import sys
|
|
29
|
+
import time
|
|
30
|
+
from typing import Iterable, Iterator, List, Optional
|
|
31
|
+
|
|
32
|
+
import requests
|
|
33
|
+
|
|
34
|
+
from ._version import __version__
|
|
35
|
+
from .exceptions import (
|
|
36
|
+
AuthenticationError,
|
|
37
|
+
BulkURLCheckerError,
|
|
38
|
+
NotFoundError,
|
|
39
|
+
QuotaError,
|
|
40
|
+
RateLimitError,
|
|
41
|
+
ServerError,
|
|
42
|
+
TimeoutError,
|
|
43
|
+
ValidationError,
|
|
44
|
+
)
|
|
45
|
+
from .types import CheckResults, JobSummary, URLResult
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
DEFAULT_BASE_URL = "https://api.bulkurlchecker.com"
|
|
49
|
+
DEFAULT_TIMEOUT = 30.0 # seconds, per HTTP call (not the wait endpoint)
|
|
50
|
+
USER_AGENT_PREFIX = "bulkurlchecker-python"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _build_user_agent() -> str:
    """Return the User-Agent string sent with every API request.

    The value has the form
    ``<USER_AGENT_PREFIX>/<sdk version> (python/<major>.<minor>; <OS>/<release>)``.
    The prefix lets the server tag requests as coming from this SDK, and
    the interpreter and OS details help prioritize platform support when
    a specific version misbehaves.
    """
    major, minor = sys.version_info[:2]
    runtime = "python/{}.{}".format(major, minor)
    host_os = "/".join((platform.system(), platform.release()))
    return "{}/{} ({}; {})".format(USER_AGENT_PREFIX, __version__, runtime, host_os)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class Client:
    """High-level client for the Bulk URL Checker REST API.

    Args:
        api_key: Your secret API key (looks like ``uck_live_...``).
            Get one from https://app.bulkurlchecker.com/dashboard/api-keys
        base_url: Override the API host. Useful for testing against a
            staging deploy. Defaults to https://api.bulkurlchecker.com.
        timeout: Per-call HTTP timeout in seconds. Does NOT bound the
            server-side wait inside ``check_urls()``; for that use
            the ``wait_seconds`` parameter.
        session: Pre-configured ``requests.Session`` if you want to
            share connection pooling with the rest of your app. The
            client never modifies the session: the API key and the other
            SDK headers are attached to each request instead, so they
            are not sent to unrelated hosts called through the same
            session.

    Raises:
        AuthenticationError: api_key empty or rejected.
        RateLimitError: 429 with optional ``retry_after`` seconds.
        QuotaError: out of credits.
        ValidationError: malformed request (bad URLs, too many URLs).
        NotFoundError: 404 (job_id not found / not owned).
        ServerError: 5xx (transient, safe to retry with backoff).
        TimeoutError: local timeout elapsed.
        BulkURLCheckerError: catch-all parent.
    """

    def __init__(
        self,
        api_key: str,
        *,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        session: Optional[requests.Session] = None,
    ) -> None:
        if not api_key:
            raise AuthenticationError("api_key must be a non-empty string")
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._session = session if session is not None else requests.Session()
        # Kept on the client and passed per request rather than written
        # into ``session.headers``: a caller-supplied session may be shared
        # with other code, and default headers on it would ship the bearer
        # token to every host that code talks to.
        self._headers = {
            "Authorization": f"Bearer {api_key}",
            "User-Agent": _build_user_agent(),
            "Accept": "application/json",
        }

    # ---- Public API ----

    def check_urls(
        self,
        urls: Iterable[str],
        *,
        wait_seconds: int = 60,
        poll_interval: float = 2.0,
    ) -> CheckResults:
        """Submit URLs and block until results are ready (or timeout).

        This is the 5-line-Python case. The server polls the job
        on your behalf for up to ``wait_seconds`` and returns the
        full result set in one response.

        For lists > ~2,000 URLs, the wait will likely time out — use
        ``submit()`` + ``iter_results()`` instead so you're not
        holding an HTTP connection open for minutes.

        Args:
            urls: URLs to check; each must start with ``http://`` or
                ``https://``. A single string is treated as one URL.
            wait_seconds: How long the server may hold the request.
            poll_interval: Forwarded to the server as ``poll_interval``.

        Returns:
            The parsed response as a ``CheckResults``.
        """
        urls_list = self._validate_urls(urls)
        wait = int(wait_seconds)
        params = {"wait_seconds": wait, "poll_interval": float(poll_interval)}
        body = self._request(
            "POST",
            "/api/v2/jobs/wait",
            json={"urls": urls_list},
            params=params,
            # The server deliberately stays silent for up to ``wait``
            # seconds, so the read timeout must outlast that window.
            timeout=self._wait_timeout(wait),
        )
        return CheckResults.from_dict(body)

    def submit(self, urls: Iterable[str]) -> JobSummary:
        """Submit a job and return immediately with the job id.

        Use this when your URL list is big enough that
        ``check_urls()`` would time out, or when you want to do
        something else while the engine works.
        """
        urls_list = self._validate_urls(urls)
        body = self._request("POST", "/api/v2/jobs", json={"urls": urls_list})
        return JobSummary.from_dict(body)

    def get_job_status(self, job_id: str) -> JobSummary:
        """Look up the current state of a previously-submitted job."""
        body = self._request("GET", f"/api/v2/jobs/{job_id}")
        return JobSummary.from_dict(body)

    def get_results(
        self,
        job_id: str,
        *,
        limit: int = 1000,
        offset: int = 0,
    ) -> List[URLResult]:
        """Fetch one page of results. See ``iter_results()`` for streaming."""
        params = {"limit": int(limit), "offset": int(offset)}
        body = self._request("GET", f"/api/v2/jobs/{job_id}/results", params=params)
        # The page is read from ``items`` first, then ``results``.
        items = body.get("items") or body.get("results") or []
        return [URLResult.from_dict(r) for r in items]

    def iter_results(
        self,
        job_id: str,
        *,
        page_size: int = 1000,
    ) -> Iterator[List[URLResult]]:
        """Stream all results for a job in pages.

        Yields lists of ``URLResult`` of at most ``page_size`` per
        iteration. Iteration ends when the server returns an empty or
        short page.

        Raises:
            ValidationError: ``page_size`` is smaller than 1. Because this
                is a generator, the error surfaces on the first iteration.
        """
        size = int(page_size)
        if size < 1:
            # A non-positive page size could never advance the offset and
            # would loop forever if the server kept returning rows.
            raise ValidationError("page_size must be a positive integer")
        offset = 0
        while True:
            batch = self.get_results(job_id, limit=size, offset=offset)
            if not batch:
                return
            yield batch
            if len(batch) < size:
                return
            offset += len(batch)

    def wait_until_done(
        self,
        job_id: str,
        *,
        timeout: float = 900.0,
        poll_interval: float = 2.0,
    ) -> JobSummary:
        """Client-side poll loop. Returns when the job hits a terminal state.

        Convenience for the "I already submitted, just block until
        ready" case. Raises ``TimeoutError`` if the deadline passes.
        Terminal states are: completed, failed, cancelled, paused.
        """
        deadline = time.monotonic() + float(timeout)
        terminal = {"completed", "failed", "cancelled", "paused"}
        while True:
            job = self.get_job_status(job_id)
            if job.status in terminal:
                return job
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(
                    f"Job {job_id} did not finish within {timeout:.0f}s "
                    f"(last status: {job.status})"
                )
            # Never sleep past the deadline; the final poll then happens
            # right at the deadline instead of up to one interval late.
            time.sleep(min(float(poll_interval), remaining))

    # ---- Internals ----

    def _validate_urls(self, urls: Iterable[str]) -> List[str]:
        """Normalize ``urls`` into a list of stripped http(s) URLs.

        Falsy and whitespace-only entries are skipped. A bare string is
        treated as a single URL rather than iterated character by character.

        Raises:
            ValidationError: an entry is not a string, lacks an
                ``http://`` / ``https://`` scheme, or nothing is left
                after skipping blanks.
        """
        if isinstance(urls, str):
            urls = [urls]
        out: List[str] = []
        for u in urls:
            if not u:
                continue
            if not isinstance(u, str):
                raise ValidationError(f"URLs must be strings. Got: {u!r}")
            s = u.strip()
            if not s:
                continue
            if not s.lower().startswith(("http://", "https://")):
                raise ValidationError(
                    f"URLs must include a scheme (http:// or https://). Got: {u!r}"
                )
            out.append(s)
        if not out:
            raise ValidationError("No valid URLs provided.")
        return out

    def _wait_timeout(self, wait_seconds: int):
        """Return the ``requests`` timeout to use for the long-poll endpoint.

        For a numeric ``self.timeout`` this is a ``(connect, read)`` pair
        whose read part is extended by ``wait_seconds``. Any other value
        (``None`` or a caller-supplied tuple) is returned unchanged.
        """
        base = self.timeout
        if isinstance(base, (int, float)):
            return (base, base + max(wait_seconds, 0))
        return base

    def _request(self, method: str, path: str, *, timeout=None, **kwargs):
        """Send one API request and return the decoded JSON body.

        Args:
            method: HTTP verb.
            path: Path appended to ``base_url``.
            timeout: Overrides ``self.timeout`` for this call when not
                ``None``.
            **kwargs: Forwarded to ``Session.request``.

        Raises:
            TimeoutError: the HTTP call timed out locally.
            BulkURLCheckerError: any other transport failure, or the
                error mapped from a non-2xx response.
        """
        url = f"{self.base_url}{path}"
        effective = self.timeout if timeout is None else timeout
        # SDK headers win over any per-call headers with the same name.
        headers = dict(kwargs.pop("headers", None) or {})
        headers.update(self._headers)
        try:
            resp = self._session.request(
                method, url, timeout=effective, headers=headers, **kwargs
            )
        except requests.Timeout as e:
            limit = effective[-1] if isinstance(effective, tuple) else effective
            raise TimeoutError(f"HTTP {method} {path} timed out after {limit}s") from e
        except requests.RequestException as e:
            raise BulkURLCheckerError(
                f"Network error calling {method} {path}: {e}"
            ) from e

        return self._handle_response(resp)

    @staticmethod
    def _parse_retry_after(value: Optional[str]) -> Optional[int]:
        """Convert a ``Retry-After`` header value to whole seconds.

        Accepts both forms a server may send: a number of seconds or an
        HTTP-date. Returns ``None`` when the header is absent or
        unparseable; negative delays are clamped to 0.
        """
        if not value:
            return None
        text = value.strip()
        try:
            # OverflowError covers "inf" and absurdly large values.
            return max(int(float(text)), 0)
        except (TypeError, ValueError, OverflowError):
            pass

        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        try:
            when = parsedate_to_datetime(text)
        except (TypeError, ValueError, IndexError):
            return None
        if when is None:
            return None
        if when.tzinfo is None:
            # A date parsed without an offset is taken to be UTC.
            when = when.replace(tzinfo=timezone.utc)
        delta = (when - datetime.now(timezone.utc)).total_seconds()
        return max(int(delta), 0)

    def _handle_response(self, resp: requests.Response):
        """Return the JSON body of a 2xx response or raise a mapped error.

        A 2xx response whose body is not JSON yields ``{}``. Every raised
        error carries the status code, the server's error code, the
        ``X-Request-ID`` header and any error details.
        """
        request_id = resp.headers.get("X-Request-ID")
        if 200 <= resp.status_code < 300:
            try:
                return resp.json()
            except ValueError:
                return {}

        # Error path — try to extract the canonical {error: {code, message}}
        # envelope. Fall back to a generic message if the body isn't JSON.
        message = f"HTTP {resp.status_code} on {resp.request.method} {resp.url}"
        code: Optional[str] = None
        details = None
        try:
            body = resp.json()
            err = body.get("error") if isinstance(body, dict) else None
            if isinstance(err, dict):
                code = err.get("code") or code
                message = err.get("message") or message
                details = err.get("details")
            else:
                # Some legacy endpoints still use `detail`
                d = body.get("detail") if isinstance(body, dict) else None
                if isinstance(d, str):
                    message = d
                elif isinstance(d, dict):
                    code = d.get("error") or code
                    message = d.get("message") or message
                    details = {k: v for k, v in d.items() if k not in ("error", "message")}
        except ValueError:
            pass

        status = resp.status_code
        common = {
            "status_code": status,
            "code": code,
            "request_id": request_id,
            "details": details,
        }
        # The no_credits check must come before the generic 401/403 branch.
        if status in (401, 403) and code == "no_credits":
            raise QuotaError(message, **common)
        if status in (401, 403):
            raise AuthenticationError(message, **common)
        if status == 404:
            raise NotFoundError(message, **common)
        if status == 429:
            retry_after = self._parse_retry_after(resp.headers.get("Retry-After"))
            raise RateLimitError(message, retry_after=retry_after, **common)
        if status == 402:
            raise QuotaError(message, **common)
        if status in (400, 422):
            raise ValidationError(message, **common)
        if 500 <= status < 600:
            raise ServerError(message, **common)
        raise BulkURLCheckerError(message, **common)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Exception hierarchy for the Bulk URL Checker SDK.
|
|
2
|
+
|
|
3
|
+
All errors derive from BulkURLCheckerError so callers can catch a
|
|
4
|
+
single exception type and branch on it. Specific subclasses exist
|
|
5
|
+
for the error categories devs actually want to handle differently:
|
|
6
|
+
authentication, rate limiting, quota, and validation.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BulkURLCheckerError(Exception):
    """Root of the SDK's exception hierarchy.

    Besides the message, every instance stores the HTTP status code, the
    server's machine-readable ``error.code``, the request id (handy when
    contacting support) and any extra error details. Each of these is
    ``None`` when it was not supplied.
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: Optional[int] = None,
        code: Optional[str] = None,
        request_id: Optional[str] = None,
        details: Optional[Any] = None,
    ) -> None:
        # Only the message goes into ``args``; the rest are attributes.
        super().__init__(message)
        self.status_code = status_code
        self.code = code
        self.request_id = request_id
        self.details = details

    def __repr__(self) -> str:  # pragma: no cover - debugging only
        text = "{} {!r}".format(type(self).__name__, str(self))
        # Optional fields appear only when they carry a value.
        if self.status_code is not None:
            text += " status_code={}".format(self.status_code)
        if self.code:
            text += " code={!r}".format(self.code)
        if self.request_id:
            text += " request_id={!r}".format(self.request_id)
        return "<" + text + ">"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class AuthenticationError(BulkURLCheckerError):
    """The API key is missing, invalid, or revoked (HTTP 401 / 403)."""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class RateLimitError(BulkURLCheckerError):
    """Too many requests (HTTP 429).

    ``retry_after`` holds the number of seconds to wait before retrying,
    or ``None`` when it was not provided.
    """

    def __init__(self, *args: Any, retry_after: Optional[int] = None, **kwargs: Any) -> None:
        # All other arguments are handled by the base class unchanged.
        self.retry_after = retry_after
        super().__init__(*args, **kwargs)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class QuotaError(BulkURLCheckerError):
    """The account is out of credits or hit a plan limit (HTTP 402 / 403)."""
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class ValidationError(BulkURLCheckerError):
    """The request was malformed: bad URLs, too many URLs, etc. (HTTP 400 / 422)."""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class NotFoundError(BulkURLCheckerError):
    """The job ID doesn't exist or isn't owned by this API key (HTTP 404)."""
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ServerError(BulkURLCheckerError):
    """Transient server-side failure (HTTP 5xx); safe to retry with backoff."""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class TimeoutError(BulkURLCheckerError):  # noqa: A001 - intentional shadow
    """A local timeout elapsed before the server responded.

    This class shadows the built-in ``TimeoutError`` name but derives from
    ``BulkURLCheckerError``, not from the built-in.
    """
|
bulkurlchecker/py.typed
ADDED
|
File without changes
|
bulkurlchecker/types.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Public response types for the Bulk URL Checker SDK.
|
|
2
|
+
|
|
3
|
+
These are intentionally simple dataclasses (not pydantic models) so the
|
|
4
|
+
SDK has zero runtime dependencies beyond `requests`. If you want full
|
|
5
|
+
type validation, the OpenAPI spec lives at
|
|
6
|
+
https://api.bulkurlchecker.com/openapi.json.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class JobSummary:
    """High-level state of a single URL-checking job."""

    job_id: str
    status: str  # 'pending' | 'parsing' | 'processing' | 'paused' | 'completed' | 'failed' | 'cancelled'
    total_urls: int
    completed_urls: int = 0
    credits_allocated: int = 0
    duplicates_removed: int = 0
    invalid_urls_rejected: int = 0
    created_at: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = None

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "JobSummary":
        """Build a ``JobSummary`` from a decoded API response.

        The job id is read from ``job_id``, falling back to ``id``.
        Missing or null counters become 0, a missing status becomes
        ``'pending'``, and a missing id becomes ``""``.
        """
        raw_id = d.get("job_id") or d.get("id")
        return cls(
            # Guard the None case explicitly: str(None) would otherwise
            # yield the literal text "None" as a job id.
            job_id=str(raw_id) if raw_id is not None else "",
            status=str(d.get("status") or "pending"),
            total_urls=int(d.get("total_urls") or 0),
            completed_urls=int(d.get("completed_urls") or 0),
            credits_allocated=int(d.get("credits_allocated") or 0),
            duplicates_removed=int(d.get("duplicates_removed") or 0),
            invalid_urls_rejected=int(d.get("invalid_urls_rejected") or 0),
            created_at=d.get("created_at"),
            started_at=d.get("started_at"),
            completed_at=d.get("completed_at"),
        )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class URLResult:
    """One URL check result. Shape mirrors the API response."""

    url: str
    final_url: Optional[str] = None
    status_code: Optional[int] = None
    response_time_ms: Optional[int] = None
    redirect_chain: List[str] = field(default_factory=list)
    is_broken: bool = False
    is_soft_404: bool = False
    error_code: Optional[str] = None
    content_type: Optional[str] = None

    @staticmethod
    def _first_present(d: Dict[str, Any], *keys: str) -> Any:
        """Return the first value among ``keys`` that is not ``None``."""
        for key in keys:
            value = d.get(key)
            if value is not None:
                return value
        return None

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "URLResult":
        """Build a ``URLResult`` from one decoded result entry.

        The API returns slightly different field names depending on the
        endpoint version; this normalizer keeps the SDK shape stable
        across server-side changes. Each field is read from its primary
        key and then from its alternate key (``final``, ``status``,
        ``duration_ms``, ``error``).
        """
        return cls(
            url=str(d.get("url") or ""),
            final_url=d.get("final_url") or d.get("final"),
            # Numeric fields fall back only when the primary key is
            # absent or null, so a genuine 0 is preserved instead of
            # being treated as "missing".
            status_code=cls._first_present(d, "status_code", "status"),
            response_time_ms=cls._first_present(d, "response_time_ms", "duration_ms"),
            redirect_chain=list(d.get("redirect_chain") or []),
            is_broken=bool(d.get("is_broken") or False),
            is_soft_404=bool(d.get("is_soft_404") or False),
            error_code=d.get("error_code") or d.get("error"),
            content_type=d.get("content_type"),
        )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class CheckResults:
    """Complete result set returned by `Client.check_urls()`."""

    job_id: str
    status: str
    timed_out: bool
    total_urls: int
    completed_urls: int
    duplicates_removed: int
    invalid_urls_rejected: int
    completed_at: Optional[str]
    results: "List[URLResult]"

    @property
    def is_complete(self) -> bool:
        """True when the engine finished the job within the wait window."""
        return self.status == "completed" and not self.timed_out

    @property
    def broken(self) -> "List[URLResult]":
        """All results where the engine marked the URL broken."""
        return [r for r in self.results if r.is_broken]

    @property
    def soft_404s(self) -> "List[URLResult]":
        """All results flagged as a soft 404 (``is_soft_404`` is true)."""
        return [r for r in self.results if r.is_soft_404]

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "CheckResults":
        """Build a ``CheckResults`` from a decoded API response.

        The job id is read from ``job_id``, falling back to ``id``.
        Missing or null counters become 0, missing strings become ``""``,
        and a missing ``results`` list becomes empty.
        """
        raw_id = d.get("job_id") or d.get("id")
        return cls(
            job_id=str(raw_id) if raw_id is not None else "",
            status=str(d.get("status") or ""),
            timed_out=bool(d.get("timed_out") or False),
            total_urls=int(d.get("total_urls") or 0),
            completed_urls=int(d.get("completed_urls") or 0),
            duplicates_removed=int(d.get("duplicates_removed") or 0),
            invalid_urls_rejected=int(d.get("invalid_urls_rejected") or 0),
            completed_at=d.get("completed_at"),
            results=[URLResult.from_dict(r) for r in (d.get("results") or [])],
        )
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bulkurlchecker
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python client for the Bulk URL Checker API. Skip the proxy-rotation + rate-limiter + soft-404-detector you would otherwise have to build.
|
|
5
|
+
Author-email: Bulk URL Checker <carlos@bulkurlchecker.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://bulkurlchecker.com
|
|
8
|
+
Project-URL: Documentation, https://bulkurlchecker.com/developers
|
|
9
|
+
Project-URL: Source Code, https://github.com/carlosofscience/bulkurlchecker-python
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/carlosofscience/bulkurlchecker-python/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/carlosofscience/bulkurlchecker-python/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: url-checker,bulk-url-checker,broken-link-checker,http-status-checker,seo-tools,link-validator,url-validation,redirect-checker
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
24
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.8
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: requests>=2.25
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-cov>=4; extra == "dev"
|
|
33
|
+
Requires-Dist: responses>=0.23; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# bulkurlchecker
|
|
39
|
+
|
|
40
|
+
[![PyPI version](https://img.shields.io/pypi/v/bulkurlchecker.svg)](https://pypi.org/project/bulkurlchecker/)
|
|
41
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/bulkurlchecker.svg)](https://pypi.org/project/bulkurlchecker/)
|
|
42
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
|
|
43
|
+
|
|
44
|
+
Python client for the [Bulk URL Checker](https://bulkurlchecker.com) API.
|
|
45
|
+
|
|
46
|
+
**Skip the proxy-rotation, rate-limiter, soft-404 detector, and retry classifier you would otherwise spend two weeks building.** Submit thousands of URLs, get status codes, redirect chains, and broken-link detection back as plain Python objects. Backed by a managed cloud service with residential proxies and per-domain throttling.
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install bulkurlchecker
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## 5-line example
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from bulkurlchecker import Client
|
|
58
|
+
|
|
59
|
+
client = Client(api_key="uck_live_...")
|
|
60
|
+
results = client.check_urls(["https://example.com", "https://example.org"])
|
|
61
|
+
for r in results.results:
|
|
62
|
+
print(r.url, r.status_code, "BROKEN" if r.is_broken else "ok")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Get an API key at https://app.bulkurlchecker.com/dashboard/api-keys. First 300 URLs are free, no card required.
|
|
66
|
+
|
|
67
|
+
## What you get back
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
results = client.check_urls(urls)
|
|
71
|
+
|
|
72
|
+
results.status # 'completed' | 'paused' | 'failed' | 'cancelled'
|
|
73
|
+
results.timed_out # True if the wait deadline passed (job still running)
|
|
74
|
+
results.total_urls # how many URLs the engine accepted
|
|
75
|
+
results.completed_urls # how many it finished checking
|
|
76
|
+
results.duplicates_removed
|
|
77
|
+
results.invalid_urls_rejected
|
|
78
|
+
|
|
79
|
+
for r in results.results:
|
|
80
|
+
r.url # the original URL you submitted
|
|
81
|
+
r.final_url # after redirects
|
|
82
|
+
r.status_code # 200, 301, 404, 429, 500, ...
|
|
83
|
+
r.redirect_chain # list of intermediate URLs
|
|
84
|
+
r.is_broken # True if the engine flagged this as broken
|
|
85
|
+
r.is_soft_404 # True if 200 OK but page content says "not found"
|
|
86
|
+
r.response_time_ms
|
|
87
|
+
|
|
88
|
+
# Convenience properties:
|
|
89
|
+
results.broken # list of URLResult where is_broken == True
|
|
90
|
+
results.soft_404s # list where is_soft_404 == True
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Larger jobs: submit and poll
|
|
94
|
+
|
|
95
|
+
`check_urls()` blocks server-side until the job finishes or `wait_seconds` elapses (60 seconds by default, up to 15 minutes). For lists where the wait would time out, use the two-step pattern:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
job = client.submit(my_500k_urls)
|
|
99
|
+
print(f"Submitted {job.job_id}, {job.total_urls} URLs queued")
|
|
100
|
+
|
|
101
|
+
# Poll explicitly, or use the convenience method
|
|
102
|
+
done = client.wait_until_done(job.job_id, timeout=3600)
|
|
103
|
+
|
|
104
|
+
# Stream results in pages
|
|
105
|
+
for batch in client.iter_results(job.job_id, page_size=1000):
|
|
106
|
+
for r in batch:
|
|
107
|
+
if r.is_broken:
|
|
108
|
+
print(r.url, r.status_code)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Error handling
|
|
112
|
+
|
|
113
|
+
All errors derive from `BulkURLCheckerError`. Catch specific subclasses when you want to branch on the failure mode:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from bulkurlchecker import (
|
|
117
|
+
Client,
|
|
118
|
+
BulkURLCheckerError,
|
|
119
|
+
AuthenticationError,
|
|
120
|
+
RateLimitError,
|
|
121
|
+
QuotaError,
|
|
122
|
+
ValidationError,
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
try:
|
|
126
|
+
results = client.check_urls(urls)
|
|
127
|
+
except QuotaError as e:
|
|
128
|
+
print(f"Out of credits. Top up at https://app.bulkurlchecker.com/billing")
|
|
129
|
+
except RateLimitError as e:
|
|
130
|
+
print(f"Rate limited. Retry after {e.retry_after}s.")
|
|
131
|
+
except AuthenticationError:
|
|
132
|
+
print("API key rejected — check it's not revoked.")
|
|
133
|
+
except ValidationError as e:
|
|
134
|
+
print(f"Bad request: {e}") # bad URLs, too many URLs, etc.
|
|
135
|
+
except BulkURLCheckerError as e:
|
|
136
|
+
print(f"Other error: {e} (request_id={e.request_id})")
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Every error carries `status_code`, `code` (server's machine-readable string), `request_id` (for support), and `details` (when the server provides them).
|
|
140
|
+
|
|
141
|
+
## Why use this instead of writing your own checker with httpx + asyncio?
|
|
142
|
+
|
|
143
|
+
Honest answer: for ≤500 URLs you don't need this. The standard `requests`/`httpx` toolchain handles it fine.
|
|
144
|
+
|
|
145
|
+
The wall hits at scale:
|
|
146
|
+
|
|
147
|
+
| Problem | Rolling your own | This SDK |
|
|
148
|
+
|---|---|---|
|
|
149
|
+
| Concurrency | `asyncio` + careful semaphores | done |
|
|
150
|
+
| Proxy rotation across residential IPs | $90+/mo Webshare / Bright Data subscription + custom code | done |
|
|
151
|
+
| Per-domain rate limiting (so you don't hammer one host) | wire it yourself | done |
|
|
152
|
+
| Distinguishing real 403 from "you got blocked" 403 | guess and check | done |
|
|
153
|
+
| Detecting soft 404s (200 OK + "not found" body) | regex / heuristic per template | done |
|
|
154
|
+
| Retry classification (transient vs permanent) | tune for weeks | done |
|
|
155
|
+
| Long-running job state (resume after crash) | Redis + queue + worker infra | done |
|
|
156
|
+
| Engineer time, weeks 1-4 | $$$ | nothing, ship today |
|
|
157
|
+
|
|
158
|
+
If you've already lost a weekend to httpx + proxy rotation, you know what we're talking about.
|
|
159
|
+
|
|
160
|
+
## Pricing
|
|
161
|
+
|
|
162
|
+
- **Free tier:** 300 URL checks. No signup required.
|
|
163
|
+
- **Starter:** $9/month or $90/year (~17% off) — 15,000 URLs/month
|
|
164
|
+
- **Pro:** $29/month or $290/year — 50,000 URLs/month, 5 scheduled checks, daily monitoring
|
|
165
|
+
- **Agency:** $99/month or $990/year — 200,000 URLs/month, 50 schedules, Slack + webhook alerts
|
|
166
|
+
|
|
167
|
+
Top-up credit packs available beyond the monthly pool. Credits never expire.
|
|
168
|
+
|
|
169
|
+
Full pricing: https://bulkurlchecker.com/#pricing
|
|
170
|
+
|
|
171
|
+
## Links
|
|
172
|
+
|
|
173
|
+
- [Web app](https://app.bulkurlchecker.com)
|
|
174
|
+
- [REST API reference](https://bulkurlchecker.com/developers)
|
|
175
|
+
- [OpenAPI spec](https://api.bulkurlchecker.com/openapi.json)
|
|
176
|
+
- [GitHub](https://github.com/carlosofscience/bulkurlchecker-python)
|
|
177
|
+
- [Changelog](CHANGELOG.md)
|
|
178
|
+
|
|
179
|
+
## Stability
|
|
180
|
+
|
|
181
|
+
The SDK follows semver. While we're at 0.x, breaking changes can land in minor releases (we'll always note them in `CHANGELOG.md`). Once we hit 1.0 you can pin major versions safely.
|
|
182
|
+
|
|
183
|
+
## License
|
|
184
|
+
|
|
185
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
bulkurlchecker/__init__.py,sha256=Ms9yWEEc2yb88KrdJZrhQ7CAHL9BZRqXUrqg6NdtWsk,991
|
|
2
|
+
bulkurlchecker/_version.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
3
|
+
bulkurlchecker/client.py,sha256=GzOb_lARVwlFgI4SDkcHZPEWrgRGGIX3Bxd0myJKBn0,11150
|
|
4
|
+
bulkurlchecker/exceptions.py,sha256=EWLXZWQmBg-pQohGRsEX45LHt9AOrmgsjzLFBQnNwfo,2528
|
|
5
|
+
bulkurlchecker/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
bulkurlchecker/types.py,sha256=9210KxWKV9SnghOAX6O-UUq39dq2SE7GSa09OMwjZzM,4278
|
|
7
|
+
bulkurlchecker-0.1.0.dist-info/licenses/LICENSE,sha256=R9-95i5U2iwohiXs4napv1aiu1nAd3tRvW3geDL8Ky4,1073
|
|
8
|
+
bulkurlchecker-0.1.0.dist-info/METADATA,sha256=30xYsMRMOs_LXkM2WM3XeGOQqCl610HYZI45P9HNv-A,7438
|
|
9
|
+
bulkurlchecker-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
bulkurlchecker-0.1.0.dist-info/top_level.txt,sha256=ApFhZQ33R6RdOCXCplNtbor2M2VffWUdDxv5okxSM3Y,15
|
|
11
|
+
bulkurlchecker-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Bulk URL Checker
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
bulkurlchecker
|