bulkurlchecker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ """bulkurlchecker — Python client for the Bulk URL Checker API.
2
+
3
+ Quickstart:
4
+
5
+ from bulkurlchecker import Client
6
+ client = Client(api_key="uck_live_...")
7
+ results = client.check_urls([
8
+ "https://example.com",
9
+ "https://example.org",
10
+ ])
11
+ for r in results.results:
12
+ print(r.url, r.status_code)
13
+
14
+ Get an API key at https://app.bulkurlchecker.com/dashboard/api-keys.
15
+ """
16
+
17
+ from ._version import __version__
18
+ from .client import Client
19
+ from .exceptions import (
20
+ AuthenticationError,
21
+ BulkURLCheckerError,
22
+ NotFoundError,
23
+ QuotaError,
24
+ RateLimitError,
25
+ ServerError,
26
+ TimeoutError,
27
+ ValidationError,
28
+ )
29
+ from .types import CheckResults, JobSummary, URLResult
30
+
31
+ __all__ = [
32
+ "__version__",
33
+ "Client",
34
+ "CheckResults",
35
+ "JobSummary",
36
+ "URLResult",
37
+ "BulkURLCheckerError",
38
+ "AuthenticationError",
39
+ "RateLimitError",
40
+ "QuotaError",
41
+ "ValidationError",
42
+ "NotFoundError",
43
+ "ServerError",
44
+ "TimeoutError",
45
+ ]
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,303 @@
1
+ """Synchronous Python client for the Bulk URL Checker REST API.
2
+
3
+ Designed to be the shortest path from "I need to check 50K URLs" to
4
+ "results are in my hands." If you find yourself writing httpx + asyncio
5
+ + proxy rotation + per-domain rate limiting + retry classification +
6
+ soft-404 detection, stop and use this instead.
7
+
8
+ Quick example:
9
+
10
+ from bulkurlchecker import Client
11
+ client = Client(api_key="uck_live_...")
12
+ out = client.check_urls(["https://example.com", "https://example.org"])
13
+ for r in out.results:
14
+ print(r.url, r.status_code, "BROKEN" if r.is_broken else "ok")
15
+
16
+ For larger jobs that exceed the synchronous wait budget, use the
17
+ two-step pattern:
18
+
19
+ job = client.submit(my_urls)
20
+ # ... do other things ...
21
+ for batch in client.iter_results(job.job_id, page_size=1000):
22
+ process(batch)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import platform
28
+ import sys
29
+ import time
30
+ from typing import Iterable, Iterator, List, Optional
31
+
32
+ import requests
33
+
34
+ from ._version import __version__
35
+ from .exceptions import (
36
+ AuthenticationError,
37
+ BulkURLCheckerError,
38
+ NotFoundError,
39
+ QuotaError,
40
+ RateLimitError,
41
+ ServerError,
42
+ TimeoutError,
43
+ ValidationError,
44
+ )
45
+ from .types import CheckResults, JobSummary, URLResult
46
+
47
+
48
+ DEFAULT_BASE_URL = "https://api.bulkurlchecker.com"
49
+ DEFAULT_TIMEOUT = 30.0 # seconds, per HTTP call (not the wait endpoint)
50
+ USER_AGENT_PREFIX = "bulkurlchecker-python"
51
+
52
+
53
def _build_user_agent() -> str:
    """Return the User-Agent string sent with every SDK request.

    The value has the shape
    ``bulkurlchecker-python/<sdk version> (python/<major>.<minor>; <OS>/<release>)``.
    The ``bulkurlchecker-`` prefix is what lets the service tag traffic as
    SDK-originated; the interpreter and OS details are included to help
    prioritize platform support when one platform misbehaves.
    """
    version = sys.version_info
    python_tag = "python/{}.{}".format(version.major, version.minor)
    os_tag = "/".join((platform.system(), platform.release()))
    return "{}/{} ({}; {})".format(USER_AGENT_PREFIX, __version__, python_tag, os_tag)
64
+
65
+
66
class Client:
    """High-level synchronous client for the Bulk URL Checker REST API.

    Args:
        api_key: Your secret API key (looks like ``uck_live_...``).
            Get one from https://app.bulkurlchecker.com/dashboard/api-keys
        base_url: Override the API host. Useful for testing against a
            staging deploy. Defaults to https://api.bulkurlchecker.com.
        timeout: Per-call HTTP timeout in seconds. Does NOT bound the
            server-side wait inside ``check_urls()``; for that use
            the ``wait_seconds`` parameter.
        session: Pre-configured ``requests.Session`` if you want to
            share connection pooling with the rest of your app. The
            session's default headers are left untouched: credentials
            are attached per request, so the API key is never sent on
            unrelated requests made through the same session.

    Raises:
        AuthenticationError: api_key empty or rejected.
        RateLimitError: 429 with optional ``retry_after`` seconds.
        QuotaError: out of credits.
        ValidationError: malformed request (bad URLs, too many URLs).
        NotFoundError: 404 (job_id not found / not owned).
        ServerError: 5xx (transient, safe to retry with backoff).
        TimeoutError: local timeout elapsed.
        BulkURLCheckerError: catch-all parent.
    """

    def __init__(
        self,
        api_key: str,
        *,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = DEFAULT_TIMEOUT,
        session: Optional[requests.Session] = None,
    ) -> None:
        if not api_key:
            raise AuthenticationError("api_key must be a non-empty string")
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._session = session if session is not None else requests.Session()
        # Kept on the client and merged into each request instead of being
        # written into session.headers: a caller-supplied session may be
        # shared with other code, and mutating its defaults would leak the
        # bearer token to every other host that session talks to.
        self._headers = {
            "Authorization": f"Bearer {api_key}",
            "User-Agent": _build_user_agent(),
            "Accept": "application/json",
        }

    # ---- Public API ----

    def check_urls(
        self,
        urls: Iterable[str],
        *,
        wait_seconds: int = 60,
        poll_interval: float = 2.0,
    ) -> CheckResults:
        """Submit URLs and block until results are ready (or timeout).

        The server polls the job on your behalf for up to ``wait_seconds``
        and returns the full result set in one response.

        For lists > ~2,000 URLs, the wait will likely time out — use
        ``submit()`` + ``iter_results()`` instead so you're not
        holding an HTTP connection open for minutes.

        Args:
            urls: Absolute http(s) URLs; a single string is treated as
                one URL.
            wait_seconds: How long the server should wait for the job.
            poll_interval: Server-side polling interval in seconds.

        Returns:
            A ``CheckResults`` built from the response body.

        Raises:
            ValidationError: no usable URL was supplied.
            TimeoutError: no response within ``wait_seconds`` plus the
                per-call timeout.
        """
        urls_list = self._validate_urls(urls)
        wait = int(wait_seconds)
        params = {"wait_seconds": wait, "poll_interval": float(poll_interval)}
        # The server deliberately holds this request open for up to
        # `wait` seconds, so the local read timeout must cover that wait
        # plus the normal per-call budget; otherwise the default 30s
        # timeout would abort a default 60s wait.
        http_timeout = self.timeout
        if isinstance(http_timeout, (int, float)):
            http_timeout = http_timeout + max(wait, 0)
        body = self._request(
            "POST",
            "/api/v2/jobs/wait",
            json={"urls": urls_list},
            params=params,
            timeout=http_timeout,
        )
        return CheckResults.from_dict(body)

    def submit(self, urls: Iterable[str]) -> JobSummary:
        """Submit a job and return immediately with the job id.

        Use this when your URL list is big enough that
        ``check_urls()`` would time out, or when you want to do
        something else while the engine works.
        """
        urls_list = self._validate_urls(urls)
        body = self._request("POST", "/api/v2/jobs", json={"urls": urls_list})
        return JobSummary.from_dict(body)

    def get_job_status(self, job_id: str) -> JobSummary:
        """Look up the current state of a previously-submitted job."""
        body = self._request("GET", self._job_path(job_id))
        return JobSummary.from_dict(body)

    def get_results(
        self,
        job_id: str,
        *,
        limit: int = 1000,
        offset: int = 0,
    ) -> List[URLResult]:
        """Fetch one page of results. See ``iter_results()`` for streaming."""
        params = {"limit": int(limit), "offset": int(offset)}
        body = self._request("GET", self._job_path(job_id, "/results"), params=params)
        items = body.get("items") or body.get("results") or []
        return [URLResult.from_dict(r) for r in items]

    def iter_results(
        self,
        job_id: str,
        *,
        page_size: int = 1000,
    ) -> Iterator[List[URLResult]]:
        """Stream all results for a job in pages.

        Yields lists of ``URLResult`` of at most ``page_size`` per
        iteration. Iteration ends when the server returns an empty or
        short page.

        Raises:
            ValidationError: ``page_size`` is smaller than 1 (raised on
                first iteration, since this is a generator).
        """
        page_size = int(page_size)
        if page_size < 1:
            raise ValidationError("page_size must be a positive integer")
        offset = 0
        while True:
            batch = self.get_results(job_id, limit=page_size, offset=offset)
            if not batch:
                return
            yield batch
            if len(batch) < page_size:
                return
            # Advance by what was actually received so no row is skipped
            # if a page ever comes back larger or smaller than requested.
            offset += len(batch)

    def wait_until_done(
        self,
        job_id: str,
        *,
        timeout: float = 900.0,
        poll_interval: float = 2.0,
    ) -> JobSummary:
        """Client-side poll loop. Returns when the job hits a terminal state.

        Convenience for the "I already submitted, just block until
        ready" case. Raises ``TimeoutError`` if the deadline passes.
        Terminal states are: completed, failed, cancelled, paused.
        """
        deadline = time.monotonic() + float(timeout)
        terminal = {"completed", "failed", "cancelled", "paused"}
        while True:
            job = self.get_job_status(job_id)
            if job.status in terminal:
                return job
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(
                    f"Job {job_id} did not finish within {timeout:.0f}s "
                    f"(last status: {job.status})"
                )
            # Never sleep past the deadline: the last nap is shortened so
            # one final status check happens right at the timeout.
            time.sleep(min(poll_interval, remaining))

    # ---- Internals ----

    @staticmethod
    def _job_path(job_id: str, suffix: str = "") -> str:
        """Build ``/api/v2/jobs/<job_id><suffix>`` with the id percent-encoded.

        Encoding keeps characters such as ``/``, ``?`` or ``#`` inside a
        job id from altering the request path or query.
        """
        from urllib.parse import quote

        raw = "" if job_id is None else str(job_id)
        if not raw:
            raise ValidationError("job_id must be a non-empty string")
        encoded = quote(raw, safe="")
        return f"/api/v2/jobs/{encoded}{suffix}"

    def _validate_urls(self, urls: Iterable[str]) -> List[str]:
        """Normalize ``urls`` to a list of stripped http(s) URL strings.

        Blank and ``None`` entries are skipped. A bare string is treated
        as a single URL rather than iterated character by character.

        Raises:
            ValidationError: an entry is not a string, lacks an http(s)
                scheme, or nothing usable remains.
        """
        if isinstance(urls, str):
            urls = [urls]
        out: List[str] = []
        for u in urls:
            if u is not None and not isinstance(u, str):
                raise ValidationError(f"URLs must be strings. Got: {u!r}")
            s = (u or "").strip()
            if not s:
                continue
            if not s.lower().startswith(("http://", "https://")):
                raise ValidationError(
                    f"URLs must include a scheme (http:// or https://). Got: {u!r}"
                )
            out.append(s)
        if not out:
            raise ValidationError("No valid URLs provided.")
        return out

    def _request(self, method: str, path: str, **kwargs):
        """Send one HTTP request and return the decoded JSON body.

        ``kwargs`` are forwarded to ``Session.request``. A ``timeout``
        entry overrides the client-wide timeout for this call, and a
        ``headers`` entry is merged over the client's auth headers.

        Raises:
            TimeoutError: the request timed out locally.
            BulkURLCheckerError: any other transport failure, or the
                mapped error for a non-2xx response.
        """
        url = f"{self.base_url}{path}"
        timeout = kwargs.pop("timeout", self.timeout)
        headers = dict(self._headers)
        headers.update(kwargs.pop("headers", None) or {})
        try:
            resp = self._session.request(
                method, url, timeout=timeout, headers=headers, **kwargs
            )
        except requests.Timeout as e:
            raise TimeoutError(f"HTTP {method} {path} timed out after {timeout}s") from e
        except requests.RequestException as e:
            raise BulkURLCheckerError(
                f"Network error calling {method} {path}: {e}"
            ) from e

        return self._handle_response(resp)

    def _handle_response(self, resp: requests.Response):
        """Return the JSON body of a 2xx response or raise the mapped error.

        A 2xx response whose body is not JSON yields ``{}``. For other
        statuses the error envelope is parsed on a best-effort basis and
        the most specific SDK exception for the status code is raised.
        """
        request_id = resp.headers.get("X-Request-ID")
        if 200 <= resp.status_code < 300:
            try:
                return resp.json()
            except ValueError:
                return {}

        # Error path — try to extract the canonical {error: {code, message}}
        # envelope. Fall back to a generic message if the body isn't JSON.
        message = f"HTTP {resp.status_code} on {resp.request.method} {resp.url}"
        code: Optional[str] = None
        details = None
        try:
            body = resp.json()
            err = body.get("error") if isinstance(body, dict) else None
            if isinstance(err, dict):
                code = err.get("code") or code
                message = err.get("message") or message
                details = err.get("details")
            else:
                # Some legacy endpoints still use `detail`
                d = body.get("detail") if isinstance(body, dict) else None
                if isinstance(d, str):
                    message = d
                elif isinstance(d, dict):
                    code = d.get("error") or code
                    message = d.get("message") or message
                    details = {k: v for k, v in d.items() if k not in ("error", "message")}
        except ValueError:
            # Non-JSON error body: keep the generic message built above.
            pass

        status = resp.status_code
        common = {
            "status_code": status,
            "code": code,
            "request_id": request_id,
            "details": details,
        }
        # Checked before the generic 401/403 branch so that an exhausted
        # balance is reported as a quota problem, not a bad key.
        if status in (401, 403) and code == "no_credits":
            raise QuotaError(message, **common)
        if status in (401, 403):
            raise AuthenticationError(message, **common)
        if status == 404:
            raise NotFoundError(message, **common)
        if status == 429:
            retry_after = None
            ra = resp.headers.get("Retry-After")
            if ra:
                try:
                    retry_after = int(float(ra))
                except (TypeError, ValueError, OverflowError):
                    # HTTP-date form, garbage, or "inf": no usable delay.
                    retry_after = None
            raise RateLimitError(message, retry_after=retry_after, **common)
        if status == 402:
            raise QuotaError(message, **common)
        if status in (400, 422):
            raise ValidationError(message, **common)
        if 500 <= status < 600:
            raise ServerError(message, **common)
        raise BulkURLCheckerError(message, **common)
@@ -0,0 +1,77 @@
1
+ """Exception hierarchy for the Bulk URL Checker SDK.
2
+
3
+ All errors derive from BulkURLCheckerError so callers can catch a
4
+ single exception type and branch on it. Specific subclasses exist
5
+ for the error categories devs actually want to handle differently:
6
+ authentication, rate limiting, quota, validation, not-found, server errors, and timeouts.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Optional
12
+
13
+
14
class BulkURLCheckerError(Exception):
    """Root of the SDK exception hierarchy.

    Besides the human-readable message, each instance exposes the HTTP
    ``status_code``, the server's machine-readable ``code``, the
    ``request_id`` (handy for support tickets) and any extra ``details``.
    Every attribute defaults to ``None``. Raise the most specific
    subclass available.
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: Optional[int] = None,
        code: Optional[str] = None,
        request_id: Optional[str] = None,
        details: Optional[Any] = None,
    ) -> None:
        super().__init__(message)
        self.status_code, self.code = status_code, code
        self.request_id, self.details = request_id, details

    def __repr__(self) -> str:  # pragma: no cover - debugging only
        pieces = [type(self).__name__, repr(str(self))]
        if self.status_code is not None:
            pieces.append(f"status_code={self.status_code}")
        # code and request_id are shown only when truthy.
        for label in ("code", "request_id"):
            value = getattr(self, label)
            if value:
                pieces.append(f"{label}={value!r}")
        return "<" + " ".join(pieces) + ">"
46
+
47
+
48
class AuthenticationError(BulkURLCheckerError):
    """Raised for 401 / 403: the API key is missing, invalid, or revoked."""
50
+
51
+
52
class RateLimitError(BulkURLCheckerError):
    """Raised for 429 responses.

    ``retry_after`` holds the suggested delay in seconds, or ``None``
    when no value was supplied.
    """

    def __init__(self, *args: Any, retry_after: Optional[int] = None, **kwargs: Any) -> None:
        self.retry_after = retry_after
        super().__init__(*args, **kwargs)
58
+
59
+
60
class QuotaError(BulkURLCheckerError):
    """Raised for 402 / 403 when credits are exhausted or a plan limit is hit."""
62
+
63
+
64
class ValidationError(BulkURLCheckerError):
    """Raised for 400 / 422: malformed request (bad URLs, too many URLs, etc)."""
66
+
67
+
68
class NotFoundError(BulkURLCheckerError):
    """Raised for 404: the job ID doesn't exist or isn't owned by this API key."""
70
+
71
+
72
class ServerError(BulkURLCheckerError):
    """Raised for 5xx: transient server-side issue; safe to retry with backoff."""
74
+
75
+
76
class TimeoutError(BulkURLCheckerError):  # noqa: A001 - intentional shadow
    """Raised when the local request timeout elapses before the server responds.

    Deliberately shares its name with the builtin ``TimeoutError`` but does
    not inherit from it; catch this class or ``BulkURLCheckerError``.
    """
File without changes
@@ -0,0 +1,118 @@
1
+ """Public response types for the Bulk URL Checker SDK.
2
+
3
+ These are intentionally simple dataclasses (not pydantic models) so the
4
+ SDK has zero runtime dependencies beyond `requests`. If you want full
5
+ type validation, the OpenAPI spec lives at
6
+ https://api.bulkurlchecker.com/openapi.json.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass, field
12
+ from typing import Any, Dict, List, Optional
13
+
14
+
15
@dataclass
class JobSummary:
    """High-level state of a single URL-checking job."""

    job_id: str
    status: str  # 'pending' | 'parsing' | 'processing' | 'paused' | 'completed' | 'failed' | 'cancelled'
    total_urls: int
    completed_urls: int = 0
    credits_allocated: int = 0
    duplicates_removed: int = 0
    invalid_urls_rejected: int = 0
    created_at: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = None

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "JobSummary":
        """Build a ``JobSummary`` from a decoded API response.

        The id is read from ``job_id`` and falls back to ``id``. Missing
        counters become 0 and a missing status becomes ``"pending"``.
        A payload without any id yields ``job_id == ""`` rather than the
        misleading literal string ``"None"``.
        """
        raw_id = d.get("job_id") or d.get("id")
        return cls(
            job_id="" if raw_id is None else str(raw_id),
            status=str(d.get("status") or "pending"),
            total_urls=int(d.get("total_urls") or 0),
            completed_urls=int(d.get("completed_urls") or 0),
            credits_allocated=int(d.get("credits_allocated") or 0),
            duplicates_removed=int(d.get("duplicates_removed") or 0),
            invalid_urls_rejected=int(d.get("invalid_urls_rejected") or 0),
            created_at=d.get("created_at"),
            started_at=d.get("started_at"),
            completed_at=d.get("completed_at"),
        )
44
+
45
+
46
@dataclass
class URLResult:
    """One URL check result. Shape mirrors the API response."""

    url: str
    final_url: Optional[str] = None
    status_code: Optional[int] = None
    response_time_ms: Optional[int] = None
    redirect_chain: List[str] = field(default_factory=list)
    is_broken: bool = False
    is_soft_404: bool = False
    error_code: Optional[str] = None
    content_type: Optional[str] = None

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "URLResult":
        """Build a ``URLResult`` from one item of an API response.

        The API returns slightly different field names depending on the
        endpoint version; this normalizer keeps the SDK shape stable by
        accepting the alternate keys ``final``, ``status``,
        ``duration_ms`` and ``error``.
        """

        def first_present(*keys: str) -> Any:
            # Numeric fields use an explicit None test: 0 is a real value
            # (e.g. a 0 ms response time) and must not fall through to the
            # alternate key or be turned into None.
            for key in keys:
                value = d.get(key)
                if value is not None:
                    return value
            return None

        return cls(
            url=str(d.get("url") or ""),
            final_url=d.get("final_url") or d.get("final"),
            status_code=first_present("status_code", "status"),
            response_time_ms=first_present("response_time_ms", "duration_ms"),
            redirect_chain=list(d.get("redirect_chain") or []),
            is_broken=bool(d.get("is_broken") or False),
            is_soft_404=bool(d.get("is_soft_404") or False),
            error_code=d.get("error_code") or d.get("error"),
            content_type=d.get("content_type"),
        )
76
+
77
+
78
@dataclass
class CheckResults:
    """Complete result set returned by `Client.check_urls()`.

    Combines the job-level counters with the per-URL ``results`` list.
    """

    job_id: str
    status: str
    timed_out: bool
    total_urls: int
    completed_urls: int
    duplicates_removed: int
    invalid_urls_rejected: int
    completed_at: Optional[str]
    results: List[URLResult]

    @property
    def is_complete(self) -> bool:
        """Whether the job reached ``completed`` without the wait timing out."""
        if self.status != "completed":
            return False
        return not self.timed_out

    @property
    def broken(self) -> List[URLResult]:
        """The subset of ``results`` flagged ``is_broken``."""
        return [item for item in self.results if item.is_broken]

    @property
    def soft_404s(self) -> List[URLResult]:
        """The subset of ``results`` flagged ``is_soft_404``."""
        return [item for item in self.results if item.is_soft_404]

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "CheckResults":
        """Build a ``CheckResults`` from a decoded API response.

        Missing strings become ``""``, missing counters 0, and a missing
        ``results`` key an empty list.
        """

        def count(key: str) -> int:
            return int(d.get(key) or 0)

        parsed: List[URLResult] = []
        for raw in d.get("results") or []:
            parsed.append(URLResult.from_dict(raw))

        return cls(
            job_id=str(d.get("job_id") or ""),
            status=str(d.get("status") or ""),
            timed_out=bool(d.get("timed_out") or False),
            total_urls=count("total_urls"),
            completed_urls=count("completed_urls"),
            duplicates_removed=count("duplicates_removed"),
            invalid_urls_rejected=count("invalid_urls_rejected"),
            completed_at=d.get("completed_at"),
            results=parsed,
        )
@@ -0,0 +1,185 @@
1
+ Metadata-Version: 2.4
2
+ Name: bulkurlchecker
3
+ Version: 0.1.0
4
+ Summary: Python client for the Bulk URL Checker API. Skip the proxy-rotation + rate-limiter + soft-404-detector you would otherwise have to build.
5
+ Author-email: Bulk URL Checker <carlos@bulkurlchecker.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://bulkurlchecker.com
8
+ Project-URL: Documentation, https://bulkurlchecker.com/developers
9
+ Project-URL: Source Code, https://github.com/carlosofscience/bulkurlchecker-python
10
+ Project-URL: Bug Tracker, https://github.com/carlosofscience/bulkurlchecker-python/issues
11
+ Project-URL: Changelog, https://github.com/carlosofscience/bulkurlchecker-python/blob/main/CHANGELOG.md
12
+ Keywords: url-checker,bulk-url-checker,broken-link-checker,http-status-checker,seo-tools,link-validator,url-validation,redirect-checker
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Internet :: WWW/HTTP
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.8
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: requests>=2.25
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=7; extra == "dev"
32
+ Requires-Dist: pytest-cov>=4; extra == "dev"
33
+ Requires-Dist: responses>=0.23; extra == "dev"
34
+ Requires-Dist: ruff>=0.1; extra == "dev"
35
+ Requires-Dist: mypy>=1.0; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # bulkurlchecker
39
+
40
+ [![PyPI version](https://img.shields.io/pypi/v/bulkurlchecker.svg)](https://pypi.org/project/bulkurlchecker/)
41
+ [![Python versions](https://img.shields.io/pypi/pyversions/bulkurlchecker.svg)](https://pypi.org/project/bulkurlchecker/)
42
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
43
+
44
+ Python client for the [Bulk URL Checker](https://bulkurlchecker.com) API.
45
+
46
+ **Skip the proxy-rotation, rate-limiter, soft-404 detector, and retry classifier you would otherwise spend two weeks building.** Submit thousands of URLs, get status codes, redirect chains, and broken-link detection back as plain Python objects. Backed by a managed cloud service with residential proxies and per-domain throttling.
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ pip install bulkurlchecker
52
+ ```
53
+
54
+ ## 5-line example
55
+
56
+ ```python
57
+ from bulkurlchecker import Client
58
+
59
+ client = Client(api_key="uck_live_...")
60
+ results = client.check_urls(["https://example.com", "https://example.org"])
61
+ for r in results.results:
62
+ print(r.url, r.status_code, "BROKEN" if r.is_broken else "ok")
63
+ ```
64
+
65
+ Get an API key at https://app.bulkurlchecker.com/dashboard/api-keys. First 300 URLs are free, no card required.
66
+
67
+ ## What you get back
68
+
69
+ ```python
70
+ results = client.check_urls(urls)
71
+
72
+ results.status # 'completed' | 'paused' | 'failed' | 'cancelled'
73
+ results.timed_out # True if the wait deadline passed (job still running)
74
+ results.total_urls # how many URLs the engine accepted
75
+ results.completed_urls # how many it finished checking
76
+ results.duplicates_removed
77
+ results.invalid_urls_rejected
78
+
79
+ for r in results.results:
80
+ r.url # the original URL you submitted
81
+ r.final_url # after redirects
82
+ r.status_code # 200, 301, 404, 429, 500, ...
83
+ r.redirect_chain # list of intermediate URLs
84
+ r.is_broken # True if the engine flagged this as broken
85
+ r.is_soft_404 # True if 200 OK but page content says "not found"
86
+ r.response_time_ms
87
+
88
+ # Convenience properties:
89
+ results.broken # list of URLResult where is_broken == True
90
+ results.soft_404s # list where is_soft_404 == True
91
+ ```
92
+
93
+ ## Larger jobs: submit and poll
94
+
95
+ `check_urls()` holds the HTTP connection open while the server waits for the job: `wait_seconds` (60 by default, up to 15 minutes server-side). For lists where the wait would time out, use the two-step pattern:
96
+
97
+ ```python
98
+ job = client.submit(my_500k_urls)
99
+ print(f"Submitted {job.job_id}, {job.total_urls} URLs queued")
100
+
101
+ # Poll explicitly, or use the convenience method
102
+ done = client.wait_until_done(job.job_id, timeout=3600)
103
+
104
+ # Stream results in pages
105
+ for batch in client.iter_results(job.job_id, page_size=1000):
106
+ for r in batch:
107
+ if r.is_broken:
108
+ print(r.url, r.status_code)
109
+ ```
110
+
111
+ ## Error handling
112
+
113
+ All errors derive from `BulkURLCheckerError`. Catch specific subclasses when you want to branch on the failure mode:
114
+
115
+ ```python
116
+ from bulkurlchecker import (
117
+ Client,
118
+ BulkURLCheckerError,
119
+ AuthenticationError,
120
+ RateLimitError,
121
+ QuotaError,
122
+ ValidationError,
123
+ )
124
+
125
+ try:
126
+ results = client.check_urls(urls)
127
+ except QuotaError as e:
128
+ print(f"Out of credits. Top up at https://app.bulkurlchecker.com/billing")
129
+ except RateLimitError as e:
130
+ print(f"Rate limited. Retry after {e.retry_after}s.")
131
+ except AuthenticationError:
132
+ print("API key rejected — check it's not revoked.")
133
+ except ValidationError as e:
134
+ print(f"Bad request: {e}") # bad URLs, too many URLs, etc.
135
+ except BulkURLCheckerError as e:
136
+ print(f"Other error: {e} (request_id={e.request_id})")
137
+ ```
138
+
139
+ Every error carries `status_code`, `code` (server's machine-readable string), `request_id` (for support), and `details` (when the server provides them).
140
+
141
+ ## Why use this instead of writing your own checker with httpx + asyncio?
142
+
143
+ Honest answer: for ≤500 URLs you don't need this. The standard `requests`/`httpx` toolchain handles it fine.
144
+
145
+ The wall hits at scale:
146
+
147
+ | Problem | Rolling your own | This SDK |
148
+ |---|---|---|
149
+ | Concurrency | `asyncio` + careful semaphores | done |
150
+ | Proxy rotation across residential IPs | $90+/mo Webshare / Bright Data subscription + custom code | done |
151
+ | Per-domain rate limiting (so you don't hammer one host) | wire it yourself | done |
152
+ | Distinguishing real 403 from "you got blocked" 403 | guess and check | done |
153
+ | Detecting soft 404s (200 OK + "not found" body) | regex / heuristic per template | done |
154
+ | Retry classification (transient vs permanent) | tune for weeks | done |
155
+ | Long-running job state (resume after crash) | Redis + queue + worker infra | done |
156
+ | Engineer time, weeks 1-4 | $$$ | nothing, ship today |
157
+
158
+ If you've already lost a weekend to httpx + proxy rotation, you know what we're talking about.
159
+
160
+ ## Pricing
161
+
162
+ - **Free tier:** 300 URL checks. No signup required.
163
+ - **Starter:** $9/month or $90/year (~17% off) — 15,000 URLs/month
164
+ - **Pro:** $29/month or $290/year — 50,000 URLs/month, 5 scheduled checks, daily monitoring
165
+ - **Agency:** $99/month or $990/year — 200,000 URLs/month, 50 schedules, Slack + webhook alerts
166
+
167
+ Top-up credit packs available beyond the monthly pool. Credits never expire.
168
+
169
+ Full pricing: https://bulkurlchecker.com/#pricing
170
+
171
+ ## Links
172
+
173
+ - [Web app](https://app.bulkurlchecker.com)
174
+ - [REST API reference](https://bulkurlchecker.com/developers)
175
+ - [OpenAPI spec](https://api.bulkurlchecker.com/openapi.json)
176
+ - [GitHub](https://github.com/carlosofscience/bulkurlchecker-python)
177
+ - [Changelog](CHANGELOG.md)
178
+
179
+ ## Stability
180
+
181
+ The SDK follows semver. While we're at 0.x, breaking changes can land in minor releases (we'll always note them in `CHANGELOG.md`). Once we hit 1.0 you can pin major versions safely.
182
+
183
+ ## License
184
+
185
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,11 @@
1
+ bulkurlchecker/__init__.py,sha256=Ms9yWEEc2yb88KrdJZrhQ7CAHL9BZRqXUrqg6NdtWsk,991
2
+ bulkurlchecker/_version.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
3
+ bulkurlchecker/client.py,sha256=GzOb_lARVwlFgI4SDkcHZPEWrgRGGIX3Bxd0myJKBn0,11150
4
+ bulkurlchecker/exceptions.py,sha256=EWLXZWQmBg-pQohGRsEX45LHt9AOrmgsjzLFBQnNwfo,2528
5
+ bulkurlchecker/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ bulkurlchecker/types.py,sha256=9210KxWKV9SnghOAX6O-UUq39dq2SE7GSa09OMwjZzM,4278
7
+ bulkurlchecker-0.1.0.dist-info/licenses/LICENSE,sha256=R9-95i5U2iwohiXs4napv1aiu1nAd3tRvW3geDL8Ky4,1073
8
+ bulkurlchecker-0.1.0.dist-info/METADATA,sha256=30xYsMRMOs_LXkM2WM3XeGOQqCl610HYZI45P9HNv-A,7438
9
+ bulkurlchecker-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
10
+ bulkurlchecker-0.1.0.dist-info/top_level.txt,sha256=ApFhZQ33R6RdOCXCplNtbor2M2VffWUdDxv5okxSM3Y,15
11
+ bulkurlchecker-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Bulk URL Checker
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ bulkurlchecker