bulkurlchecker 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Bulk URL Checker
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,185 @@
1
+ Metadata-Version: 2.4
2
+ Name: bulkurlchecker
3
+ Version: 0.1.0
4
+ Summary: Python client for the Bulk URL Checker API. Skip the proxy-rotation + rate-limiter + soft-404-detector you would otherwise have to build.
5
+ Author-email: Bulk URL Checker <carlos@bulkurlchecker.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://bulkurlchecker.com
8
+ Project-URL: Documentation, https://bulkurlchecker.com/developers
9
+ Project-URL: Source Code, https://github.com/carlosofscience/bulkurlchecker-python
10
+ Project-URL: Bug Tracker, https://github.com/carlosofscience/bulkurlchecker-python/issues
11
+ Project-URL: Changelog, https://github.com/carlosofscience/bulkurlchecker-python/blob/main/CHANGELOG.md
12
+ Keywords: url-checker,bulk-url-checker,broken-link-checker,http-status-checker,seo-tools,link-validator,url-validation,redirect-checker
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Internet :: WWW/HTTP
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.8
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: requests>=2.25
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=7; extra == "dev"
32
+ Requires-Dist: pytest-cov>=4; extra == "dev"
33
+ Requires-Dist: responses>=0.23; extra == "dev"
34
+ Requires-Dist: ruff>=0.1; extra == "dev"
35
+ Requires-Dist: mypy>=1.0; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # bulkurlchecker
39
+
40
+ [![PyPI version](https://img.shields.io/pypi/v/bulkurlchecker.svg)](https://pypi.org/project/bulkurlchecker/)
41
+ [![Python versions](https://img.shields.io/pypi/pyversions/bulkurlchecker.svg)](https://pypi.org/project/bulkurlchecker/)
42
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
43
+
44
+ Python client for the [Bulk URL Checker](https://bulkurlchecker.com) API.
45
+
46
+ **Skip the proxy-rotation, rate-limiter, soft-404 detector, and retry classifier you would otherwise spend two weeks building.** Submit thousands of URLs, get status codes, redirect chains, and broken-link detection back as plain Python objects. Backed by a managed cloud service with residential proxies and per-domain throttling.
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ pip install bulkurlchecker
52
+ ```
53
+
54
+ ## 5-line example
55
+
56
+ ```python
57
+ from bulkurlchecker import Client
58
+
59
+ client = Client(api_key="uck_live_...")
60
+ results = client.check_urls(["https://example.com", "https://example.org"])
61
+ for r in results.results:
62
+ print(r.url, r.status_code, "BROKEN" if r.is_broken else "ok")
63
+ ```
64
+
65
+ Get an API key at https://app.bulkurlchecker.com/dashboard/api-keys. First 300 URLs are free, no card required.
66
+
67
+ ## What you get back
68
+
69
+ ```python
70
+ results = client.check_urls(urls)
71
+
72
+ results.status # 'completed' | 'paused' | 'failed' | 'cancelled'
73
+ results.timed_out # True if the wait deadline passed (job still running)
74
+ results.total_urls # how many URLs the engine accepted
75
+ results.completed_urls # how many it finished checking
76
+ results.duplicates_removed
77
+ results.invalid_urls_rejected
78
+
79
+ for r in results.results:
80
+ r.url # the original URL you submitted
81
+ r.final_url # after redirects
82
+ r.status_code # 200, 301, 404, 429, 500, ...
83
+ r.redirect_chain # list of intermediate URLs
84
+ r.is_broken # True if the engine flagged this as broken
85
+ r.is_soft_404 # True if 200 OK but page content says "not found"
86
+ r.response_time_ms
87
+
88
+ # Convenience properties:
89
+ results.broken # list of URLResult where is_broken == True
90
+ results.soft_404s # list where is_soft_404 == True
91
+ ```
92
+
93
+ ## Larger jobs: submit and poll
94
+
95
+ `check_urls()` blocks for up to 15 minutes server-side. For lists too large to finish within that window, submit the job first and then poll for its results:
96
+
97
+ ```python
98
+ job = client.submit(my_500k_urls)
99
+ print(f"Submitted {job.job_id}, {job.total_urls} URLs queued")
100
+
101
+ # Poll explicitly, or use the convenience method
102
+ done = client.wait_until_done(job.job_id, timeout=3600)
103
+
104
+ # Stream results in pages
105
+ for batch in client.iter_results(job.job_id, page_size=1000):
106
+ for r in batch:
107
+ if r.is_broken:
108
+ print(r.url, r.status_code)
109
+ ```
110
+
111
+ ## Error handling
112
+
113
+ All errors derive from `BulkURLCheckerError`. Catch specific subclasses when you want to branch on the failure mode:
114
+
115
+ ```python
116
+ from bulkurlchecker import (
117
+ Client,
118
+ BulkURLCheckerError,
119
+ AuthenticationError,
120
+ RateLimitError,
121
+ QuotaError,
122
+ ValidationError,
123
+ )
124
+
125
+ try:
126
+ results = client.check_urls(urls)
127
+ except QuotaError:
128
+ print("Out of credits. Top up at https://app.bulkurlchecker.com/billing")
129
+ except RateLimitError as e:
130
+ print(f"Rate limited. Retry after {e.retry_after}s.")
131
+ except AuthenticationError:
132
+ print("API key rejected — check it's not revoked.")
133
+ except ValidationError as e:
134
+ print(f"Bad request: {e}") # bad URLs, too many URLs, etc.
135
+ except BulkURLCheckerError as e:
136
+ print(f"Other error: {e} (request_id={e.request_id})")
137
+ ```
138
+
139
+ Every error carries `status_code`, `code` (server's machine-readable string), `request_id` (for support), and `details` (when the server provides them).
140
+
141
+ ## Why use this instead of writing your own checker with httpx + asyncio?
142
+
143
+ Honest answer: for ≤500 URLs you don't need this. The standard `requests`/`httpx` toolchain handles it fine.
144
+
145
+ You hit the wall at scale:
146
+
147
+ | Problem | Rolling your own | This SDK |
148
+ |---|---|---|
149
+ | Concurrency | `asyncio` + careful semaphores | done |
150
+ | Proxy rotation across residential IPs | $90+/mo Webshare / Bright Data subscription + custom code | done |
151
+ | Per-domain rate limiting (so you don't hammer one host) | wire it yourself | done |
152
+ | Distinguishing real 403 from "you got blocked" 403 | guess and check | done |
153
+ | Detecting soft 404s (200 OK + "not found" body) | regex / heuristic per template | done |
154
+ | Retry classification (transient vs permanent) | tune for weeks | done |
155
+ | Long-running job state (resume after crash) | Redis + queue + worker infra | done |
156
+ | Engineer time, weeks 1-4 | $$$ | nothing, ship today |
157
+
158
+ If you've already lost a weekend to httpx + proxy rotation, you know what we're talking about.
159
+
160
+ ## Pricing
161
+
162
+ - **Free tier:** 300 URL checks. No signup required.
163
+ - **Starter:** $9/month or $90/year (~17% off) — 15,000 URLs/month
164
+ - **Pro:** $29/month or $290/year — 50,000 URLs/month, 5 scheduled checks, daily monitoring
165
+ - **Agency:** $99/month or $990/year — 200,000 URLs/month, 50 schedules, Slack + webhook alerts
166
+
167
+ Top-up credit packs available beyond the monthly pool. Credits never expire.
168
+
169
+ Full pricing: https://bulkurlchecker.com/#pricing
170
+
171
+ ## Links
172
+
173
+ - [Web app](https://app.bulkurlchecker.com)
174
+ - [REST API reference](https://bulkurlchecker.com/developers)
175
+ - [OpenAPI spec](https://api.bulkurlchecker.com/openapi.json)
176
+ - [GitHub](https://github.com/carlosofscience/bulkurlchecker-python)
177
+ - [Changelog](CHANGELOG.md)
178
+
179
+ ## Stability
180
+
181
+ The SDK follows semver. While we're at 0.x, breaking changes can land in minor releases (we'll always note them in `CHANGELOG.md`). Once we hit 1.0 you can pin major versions safely.
182
+
183
+ ## License
184
+
185
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,148 @@
1
+ # bulkurlchecker
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/bulkurlchecker.svg)](https://pypi.org/project/bulkurlchecker/)
4
+ [![Python versions](https://img.shields.io/pypi/pyversions/bulkurlchecker.svg)](https://pypi.org/project/bulkurlchecker/)
5
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
6
+
7
+ Python client for the [Bulk URL Checker](https://bulkurlchecker.com) API.
8
+
9
+ **Skip the proxy-rotation, rate-limiter, soft-404 detector, and retry classifier you would otherwise spend two weeks building.** Submit thousands of URLs, get status codes, redirect chains, and broken-link detection back as plain Python objects. Backed by a managed cloud service with residential proxies and per-domain throttling.
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install bulkurlchecker
15
+ ```
16
+
17
+ ## 5-line example
18
+
19
+ ```python
20
+ from bulkurlchecker import Client
21
+
22
+ client = Client(api_key="uck_live_...")
23
+ results = client.check_urls(["https://example.com", "https://example.org"])
24
+ for r in results.results:
25
+ print(r.url, r.status_code, "BROKEN" if r.is_broken else "ok")
26
+ ```
27
+
28
+ Get an API key at https://app.bulkurlchecker.com/dashboard/api-keys. First 300 URLs are free, no card required.
29
+
30
+ ## What you get back
31
+
32
+ ```python
33
+ results = client.check_urls(urls)
34
+
35
+ results.status # 'completed' | 'paused' | 'failed' | 'cancelled'
36
+ results.timed_out # True if the wait deadline passed (job still running)
37
+ results.total_urls # how many URLs the engine accepted
38
+ results.completed_urls # how many it finished checking
39
+ results.duplicates_removed
40
+ results.invalid_urls_rejected
41
+
42
+ for r in results.results:
43
+ r.url # the original URL you submitted
44
+ r.final_url # after redirects
45
+ r.status_code # 200, 301, 404, 429, 500, ...
46
+ r.redirect_chain # list of intermediate URLs
47
+ r.is_broken # True if the engine flagged this as broken
48
+ r.is_soft_404 # True if 200 OK but page content says "not found"
49
+ r.response_time_ms
50
+
51
+ # Convenience properties:
52
+ results.broken # list of URLResult where is_broken == True
53
+ results.soft_404s # list where is_soft_404 == True
54
+ ```
55
+
56
+ ## Larger jobs: submit and poll
57
+
58
+ `check_urls()` blocks for up to 15 minutes server-side. For lists too large to finish within that window, submit the job first and then poll for its results:
59
+
60
+ ```python
61
+ job = client.submit(my_500k_urls)
62
+ print(f"Submitted {job.job_id}, {job.total_urls} URLs queued")
63
+
64
+ # Poll explicitly, or use the convenience method
65
+ done = client.wait_until_done(job.job_id, timeout=3600)
66
+
67
+ # Stream results in pages
68
+ for batch in client.iter_results(job.job_id, page_size=1000):
69
+ for r in batch:
70
+ if r.is_broken:
71
+ print(r.url, r.status_code)
72
+ ```
73
+
74
+ ## Error handling
75
+
76
+ All errors derive from `BulkURLCheckerError`. Catch specific subclasses when you want to branch on the failure mode:
77
+
78
+ ```python
79
+ from bulkurlchecker import (
80
+ Client,
81
+ BulkURLCheckerError,
82
+ AuthenticationError,
83
+ RateLimitError,
84
+ QuotaError,
85
+ ValidationError,
86
+ )
87
+
88
+ try:
89
+ results = client.check_urls(urls)
90
+ except QuotaError:
91
+ print("Out of credits. Top up at https://app.bulkurlchecker.com/billing")
92
+ except RateLimitError as e:
93
+ print(f"Rate limited. Retry after {e.retry_after}s.")
94
+ except AuthenticationError:
95
+ print("API key rejected — check it's not revoked.")
96
+ except ValidationError as e:
97
+ print(f"Bad request: {e}") # bad URLs, too many URLs, etc.
98
+ except BulkURLCheckerError as e:
99
+ print(f"Other error: {e} (request_id={e.request_id})")
100
+ ```
101
+
102
+ Every error carries `status_code`, `code` (server's machine-readable string), `request_id` (for support), and `details` (when the server provides them).
103
+
104
+ ## Why use this instead of writing your own checker with httpx + asyncio?
105
+
106
+ Honest answer: for ≤500 URLs you don't need this. The standard `requests`/`httpx` toolchain handles it fine.
107
+
108
+ You hit the wall at scale:
109
+
110
+ | Problem | Rolling your own | This SDK |
111
+ |---|---|---|
112
+ | Concurrency | `asyncio` + careful semaphores | done |
113
+ | Proxy rotation across residential IPs | $90+/mo Webshare / Bright Data subscription + custom code | done |
114
+ | Per-domain rate limiting (so you don't hammer one host) | wire it yourself | done |
115
+ | Distinguishing real 403 from "you got blocked" 403 | guess and check | done |
116
+ | Detecting soft 404s (200 OK + "not found" body) | regex / heuristic per template | done |
117
+ | Retry classification (transient vs permanent) | tune for weeks | done |
118
+ | Long-running job state (resume after crash) | Redis + queue + worker infra | done |
119
+ | Engineer time, weeks 1-4 | $$$ | nothing, ship today |
120
+
121
+ If you've already lost a weekend to httpx + proxy rotation, you know what we're talking about.
122
+
123
+ ## Pricing
124
+
125
+ - **Free tier:** 300 URL checks. No signup required.
126
+ - **Starter:** $9/month or $90/year (~17% off) — 15,000 URLs/month
127
+ - **Pro:** $29/month or $290/year — 50,000 URLs/month, 5 scheduled checks, daily monitoring
128
+ - **Agency:** $99/month or $990/year — 200,000 URLs/month, 50 schedules, Slack + webhook alerts
129
+
130
+ Top-up credit packs available beyond the monthly pool. Credits never expire.
131
+
132
+ Full pricing: https://bulkurlchecker.com/#pricing
133
+
134
+ ## Links
135
+
136
+ - [Web app](https://app.bulkurlchecker.com)
137
+ - [REST API reference](https://bulkurlchecker.com/developers)
138
+ - [OpenAPI spec](https://api.bulkurlchecker.com/openapi.json)
139
+ - [GitHub](https://github.com/carlosofscience/bulkurlchecker-python)
140
+ - [Changelog](CHANGELOG.md)
141
+
142
+ ## Stability
143
+
144
+ The SDK follows semver. While we're at 0.x, breaking changes can land in minor releases (we'll always note them in `CHANGELOG.md`). Once we hit 1.0 you can pin major versions safely.
145
+
146
+ ## License
147
+
148
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,45 @@
1
+ """bulkurlchecker — Python client for the Bulk URL Checker API.
2
+
3
+ Quickstart:
4
+
5
+ from bulkurlchecker import Client
6
+ client = Client(api_key="uck_live_...")
7
+ results = client.check_urls([
8
+ "https://example.com",
9
+ "https://example.org",
10
+ ])
11
+ for r in results.results:
12
+ print(r.url, r.status_code)
13
+
14
+ Get an API key at https://app.bulkurlchecker.com/dashboard/api-keys.
15
+ """
16
+
17
+ from ._version import __version__
18
+ from .client import Client
19
+ from .exceptions import (
20
+ AuthenticationError,
21
+ BulkURLCheckerError,
22
+ NotFoundError,
23
+ QuotaError,
24
+ RateLimitError,
25
+ ServerError,
26
+ TimeoutError,
27
+ ValidationError,
28
+ )
29
+ from .types import CheckResults, JobSummary, URLResult
30
+
31
+ __all__ = [
32
+ "__version__",
33
+ "Client",
34
+ "CheckResults",
35
+ "JobSummary",
36
+ "URLResult",
37
+ "BulkURLCheckerError",
38
+ "AuthenticationError",
39
+ "RateLimitError",
40
+ "QuotaError",
41
+ "ValidationError",
42
+ "NotFoundError",
43
+ "ServerError",
44
+ "TimeoutError",
45
+ ]
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"