birddog 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birddog-0.1.0/.gitignore +11 -0
- birddog-0.1.0/LICENSE +21 -0
- birddog-0.1.0/PKG-INFO +186 -0
- birddog-0.1.0/README.md +149 -0
- birddog-0.1.0/examples/scrape_demo.py +128 -0
- birddog-0.1.0/examples/watchdog_agent.py +217 -0
- birddog-0.1.0/pyproject.toml +53 -0
- birddog-0.1.0/src/birddog/__init__.py +63 -0
- birddog-0.1.0/src/birddog/birddog.py +312 -0
- birddog-0.1.0/src/birddog/dashboard.py +73 -0
- birddog-0.1.0/tests/test_birddog.py +193 -0
birddog-0.1.0/.gitignore
ADDED
birddog-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mukunda Rao Katta
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
birddog-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: birddog
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Domain allowlist, per-domain rate caps, response audit, and a Streamlit dashboard for Bright Data scraping agents. The picks-and-shovels layer for any agent that consumes the live web.
|
|
5
|
+
Project-URL: Homepage, https://github.com/MukundaKatta/birddog
|
|
6
|
+
Project-URL: Source, https://github.com/MukundaKatta/birddog
|
|
7
|
+
Project-URL: Issues, https://github.com/MukundaKatta/birddog/issues
|
|
8
|
+
Author-email: Mukunda Rao Katta <mukunda.vjcs6@gmail.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: agents,ai,audit,bright-data,egress,llm,rate-limit,scraping,web-data
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: httpx>=0.27
|
|
26
|
+
Provides-Extra: brightdata
|
|
27
|
+
Provides-Extra: dashboard
|
|
28
|
+
Requires-Dist: pandas>=2.2; extra == 'dashboard'
|
|
29
|
+
Requires-Dist: streamlit>=1.40; extra == 'dashboard'
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pandas>=2.2; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: respx>=0.21; extra == 'dev'
|
|
34
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
35
|
+
Requires-Dist: streamlit>=1.40; extra == 'dev'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# birddog
|
|
39
|
+
|
|
40
|
+
Audited Bright Data egress for AI agents. Drop one context manager
|
|
41
|
+
around an agent that scrapes the web and you get:
|
|
42
|
+
|
|
43
|
+
1. **Domain allowlist** — deny everything outside it, log the attempt
|
|
44
|
+
2. **Per-domain rate caps** — simple token bucket per host
|
|
45
|
+
3. **Response audit log** — one JSONL line per fetch (url, status, bytes, ms)
|
|
46
|
+
4. **Bright Data Web Unlocker proxy** — opt-in: route via Bright Data
|
|
47
|
+
5. **Streamlit dashboard** — point it at the JSONL, get per-host bytes,
|
|
48
|
+
denial counts, latency p50
|
|
49
|
+
|
|
50
|
+
Built for the kind of agent that hits live sites: research bots, price
|
|
51
|
+
trackers, RAG ingest jobs. If you've ever watched an agent rip
|
|
52
|
+
through a sponsor's free tier in 30 seconds, this is for you.
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install birddog # core
|
|
58
|
+
pip install "birddog[dashboard]" # + Streamlit dashboard
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
Python 3.10+.
|
|
62
|
+
|
|
63
|
+
## Why
|
|
64
|
+
|
|
65
|
+
LLM agents don't know what a sane scraping cadence looks like. They'll
|
|
66
|
+
hammer a site, ignore robots.txt, follow links into spammy subdomains,
|
|
67
|
+
and burn through a Bright Data quota in a single run.
|
|
68
|
+
|
|
69
|
+
`birddog` puts a leash on the egress side:
|
|
70
|
+
|
|
71
|
+
| Concern | What birddog does |
|
|
72
|
+
|-----------------------|----------------------------------------------------|
|
|
73
|
+
| Wandering off-domain | Allowlist with `example.com` + `*.example.com` |
|
|
74
|
+
| Burst scraping | Token bucket per host (qps + burst) |
|
|
75
|
+
| "What did it fetch?" | JSONL audit log, one event per fetch |
|
|
76
|
+
| Anti-bot blocks | Optional Bright Data Web Unlocker proxy |
|
|
77
|
+
| Post-run review | Bundled Streamlit dashboard |
|
|
78
|
+
|
|
79
|
+
It does **not** parse HTML, manage cookies, render JS, or rotate user
|
|
80
|
+
agents. That's what Bright Data + your scraping code are for.
|
|
81
|
+
|
|
82
|
+
## Usage
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from birddog import Birddog
|
|
86
|
+
|
|
87
|
+
bd = Birddog(
|
|
88
|
+
allowed_domains={"docs.brightdata.com", "*.example.com"},
|
|
89
|
+
per_domain_qps=1.0,
|
|
90
|
+
per_domain_burst=2.0,
|
|
91
|
+
audit_path="runs/scrape.jsonl",
|
|
92
|
+
# Optional — route through Bright Data Web Unlocker:
|
|
93
|
+
bright_data={
|
|
94
|
+
"host": "brd.superproxy.io:33335",
|
|
95
|
+
"username": "brd-customer-...-zone-web_unlocker",
|
|
96
|
+
"password": "...",
|
|
97
|
+
},
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
with bd.session("research-bot") as s:
|
|
101
|
+
r = s.fetch("https://docs.brightdata.com/api")
|
|
102
|
+
print(r.status, r.bytes_len, "bytes")
|
|
103
|
+
|
|
104
|
+
# exceeding the per-domain rate cap (1 qps, burst 2) -> RateLimitedError
|
|
105
|
+
s.fetch("https://docs.brightdata.com/pricing")
|
|
106
|
+
|
|
107
|
+
# off-allowlist -> DomainDeniedError, also logged
|
|
108
|
+
s.fetch("https://evil.example/exfil")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
`FetchResult` carries `url`, `status`, `text`, `headers`, `bytes_len`, `elapsed_ms`,
|
|
112
|
+
and a `via_brightdata` flag so downstream code can tell whether the
|
|
113
|
+
response came through the proxy.
|
|
114
|
+
|
|
115
|
+
## Audit log
|
|
116
|
+
|
|
117
|
+
One JSON object per line, e.g.:
|
|
118
|
+
|
|
119
|
+
```json
|
|
120
|
+
{"ts":1747779600.12,"session_id":"research-bot","kind":"fetch_ok",
|
|
121
|
+
"url":"https://docs.brightdata.com/api","host":"docs.brightdata.com",
|
|
122
|
+
"status":200,"bytes":4221,"elapsed_ms":312.4}
|
|
123
|
+
{"ts":1747779600.45,"session_id":"research-bot","kind":"domain_denied",
|
|
124
|
+
"url":"https://evil.example/exfil","host":"evil.example",
|
|
125
|
+
"error":"host 'evil.example' not in allowlist"}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Kinds: `session_open`, `fetch_ok`, `fetch_failed`, `domain_denied`,
|
|
129
|
+
`rate_limited`, `session_close`.
|
|
130
|
+
|
|
131
|
+
## Dashboard
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
pip install "birddog[dashboard]"
|
|
135
|
+
streamlit run -m birddog.dashboard -- --audit runs/scrape.jsonl
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Shows total fetches, denials, bytes, and a per-host breakdown of
|
|
139
|
+
fetches + bytes + p50 latency.
|
|
140
|
+
|
|
141
|
+
## Demos
|
|
142
|
+
|
|
143
|
+
Two runnable examples in `examples/`:
|
|
144
|
+
|
|
145
|
+
**1. Smoke test — `scrape_demo.py`**
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
python examples/scrape_demo.py
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Hits each feature once: happy path, domain denial, rate-limit burst,
|
|
152
|
+
summary. Offline via `httpx.MockTransport`.
|
|
153
|
+
|
|
154
|
+
**2. Realistic agent — `watchdog_agent.py`**
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
python examples/watchdog_agent.py
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
A small price-tracker agent. Polls a watchlist of product pages,
|
|
161
|
+
extracts prices, alerts when something moves more than a per-product
|
|
162
|
+
threshold. Three passes show:
|
|
163
|
+
|
|
164
|
+
- allowlist denials (off-domain mirror URL is dropped)
|
|
165
|
+
- per-domain rate cap kicking in on pass 3
|
|
166
|
+
- threshold alerts (`Δ -6.4% > 3.0%`)
|
|
167
|
+
- a `runs/watchdog.jsonl` audit log you can dashboard
|
|
168
|
+
|
|
169
|
+
Set `BIRDDOG_USE_BRIGHTDATA=1` + your Bright Data Web Unlocker env
|
|
170
|
+
vars to flip the demo to a real proxy.
|
|
171
|
+
|
|
172
|
+
## Companion libraries
|
|
173
|
+
|
|
174
|
+
`birddog` is the egress half of a small agent-stack:
|
|
175
|
+
|
|
176
|
+
- [agentleash](https://github.com/MukundaKatta/agentleash) — USD/call budget cap + tool-arg schema gate
|
|
177
|
+
- [agentvet](https://github.com/MukundaKatta/agentvet) — tool-arg validation with LLM-friendly retry hints
|
|
178
|
+
- [agentsnap](https://github.com/MukundaKatta/agentsnap) — snapshot tests for agent traces
|
|
179
|
+
- [agenttrace](https://github.com/MukundaKatta/agenttrace) — cost + latency aggregation per run
|
|
180
|
+
|
|
181
|
+
Pair `birddog` with `agentleash` and you have egress allowlist + budget
|
|
182
|
+
cap on the same agent.
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
MIT
|
birddog-0.1.0/README.md
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# birddog
|
|
2
|
+
|
|
3
|
+
Audited Bright Data egress for AI agents. Drop one context manager
|
|
4
|
+
around an agent that scrapes the web and you get:
|
|
5
|
+
|
|
6
|
+
1. **Domain allowlist** — deny everything outside it, log the attempt
|
|
7
|
+
2. **Per-domain rate caps** — simple token bucket per host
|
|
8
|
+
3. **Response audit log** — one JSONL line per fetch (url, status, bytes, ms)
|
|
9
|
+
4. **Bright Data Web Unlocker proxy** — opt-in: route via Bright Data
|
|
10
|
+
5. **Streamlit dashboard** — point it at the JSONL, get per-host bytes,
|
|
11
|
+
denial counts, latency p50
|
|
12
|
+
|
|
13
|
+
Built for the kind of agent that hits live sites: research bots, price
|
|
14
|
+
trackers, RAG ingest jobs. If you've ever watched an agent rip
|
|
15
|
+
through a sponsor's free tier in 30 seconds, this is for you.
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install birddog # core
|
|
21
|
+
pip install "birddog[dashboard]" # + Streamlit dashboard
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Python 3.10+.
|
|
25
|
+
|
|
26
|
+
## Why
|
|
27
|
+
|
|
28
|
+
LLM agents don't know what a sane scraping cadence looks like. They'll
|
|
29
|
+
hammer a site, ignore robots.txt, follow links into spammy subdomains,
|
|
30
|
+
and burn through a Bright Data quota in a single run.
|
|
31
|
+
|
|
32
|
+
`birddog` puts a leash on the egress side:
|
|
33
|
+
|
|
34
|
+
| Concern | What birddog does |
|
|
35
|
+
|-----------------------|----------------------------------------------------|
|
|
36
|
+
| Wandering off-domain | Allowlist with `example.com` + `*.example.com` |
|
|
37
|
+
| Burst scraping | Token bucket per host (qps + burst) |
|
|
38
|
+
| "What did it fetch?" | JSONL audit log, one event per fetch |
|
|
39
|
+
| Anti-bot blocks | Optional Bright Data Web Unlocker proxy |
|
|
40
|
+
| Post-run review | Bundled Streamlit dashboard |
|
|
41
|
+
|
|
42
|
+
It does **not** parse HTML, manage cookies, render JS, or rotate user
|
|
43
|
+
agents. That's what Bright Data + your scraping code are for.
|
|
44
|
+
|
|
45
|
+
## Usage
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from birddog import Birddog
|
|
49
|
+
|
|
50
|
+
bd = Birddog(
|
|
51
|
+
allowed_domains={"docs.brightdata.com", "*.example.com"},
|
|
52
|
+
per_domain_qps=1.0,
|
|
53
|
+
per_domain_burst=2.0,
|
|
54
|
+
audit_path="runs/scrape.jsonl",
|
|
55
|
+
# Optional — route through Bright Data Web Unlocker:
|
|
56
|
+
bright_data={
|
|
57
|
+
"host": "brd.superproxy.io:33335",
|
|
58
|
+
"username": "brd-customer-...-zone-web_unlocker",
|
|
59
|
+
"password": "...",
|
|
60
|
+
},
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
with bd.session("research-bot") as s:
|
|
64
|
+
r = s.fetch("https://docs.brightdata.com/api")
|
|
65
|
+
print(r.status, r.bytes_len, "bytes")
|
|
66
|
+
|
|
67
|
+
# exceeding the per-domain rate cap (1 qps, burst 2) -> RateLimitedError
|
|
68
|
+
s.fetch("https://docs.brightdata.com/pricing")
|
|
69
|
+
|
|
70
|
+
# off-allowlist -> DomainDeniedError, also logged
|
|
71
|
+
s.fetch("https://evil.example/exfil")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
`FetchResult` carries `url`, `status`, `text`, `headers`, `bytes_len`, `elapsed_ms`,
|
|
75
|
+
and a `via_brightdata` flag so downstream code can tell whether the
|
|
76
|
+
response came through the proxy.
|
|
77
|
+
|
|
78
|
+
## Audit log
|
|
79
|
+
|
|
80
|
+
One JSON object per line, e.g.:
|
|
81
|
+
|
|
82
|
+
```json
|
|
83
|
+
{"ts":1747779600.12,"session_id":"research-bot","kind":"fetch_ok",
|
|
84
|
+
"url":"https://docs.brightdata.com/api","host":"docs.brightdata.com",
|
|
85
|
+
"status":200,"bytes":4221,"elapsed_ms":312.4}
|
|
86
|
+
{"ts":1747779600.45,"session_id":"research-bot","kind":"domain_denied",
|
|
87
|
+
"url":"https://evil.example/exfil","host":"evil.example",
|
|
88
|
+
"error":"host 'evil.example' not in allowlist"}
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Kinds: `session_open`, `fetch_ok`, `fetch_failed`, `domain_denied`,
|
|
92
|
+
`rate_limited`, `session_close`.
|
|
93
|
+
|
|
94
|
+
## Dashboard
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install "birddog[dashboard]"
|
|
98
|
+
streamlit run -m birddog.dashboard -- --audit runs/scrape.jsonl
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Shows total fetches, denials, bytes, and a per-host breakdown of
|
|
102
|
+
fetches + bytes + p50 latency.
|
|
103
|
+
|
|
104
|
+
## Demos
|
|
105
|
+
|
|
106
|
+
Two runnable examples in `examples/`:
|
|
107
|
+
|
|
108
|
+
**1. Smoke test — `scrape_demo.py`**
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
python examples/scrape_demo.py
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Hits each feature once: happy path, domain denial, rate-limit burst,
|
|
115
|
+
summary. Offline via `httpx.MockTransport`.
|
|
116
|
+
|
|
117
|
+
**2. Realistic agent — `watchdog_agent.py`**
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
python examples/watchdog_agent.py
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
A small price-tracker agent. Polls a watchlist of product pages,
|
|
124
|
+
extracts prices, alerts when something moves more than a per-product
|
|
125
|
+
threshold. Three passes show:
|
|
126
|
+
|
|
127
|
+
- allowlist denials (off-domain mirror URL is dropped)
|
|
128
|
+
- per-domain rate cap kicking in on pass 3
|
|
129
|
+
- threshold alerts (`Δ -6.4% > 3.0%`)
|
|
130
|
+
- a `runs/watchdog.jsonl` audit log you can dashboard
|
|
131
|
+
|
|
132
|
+
Set `BIRDDOG_USE_BRIGHTDATA=1` + your Bright Data Web Unlocker env
|
|
133
|
+
vars to flip the demo to a real proxy.
|
|
134
|
+
|
|
135
|
+
## Companion libraries
|
|
136
|
+
|
|
137
|
+
`birddog` is the egress half of a small agent-stack:
|
|
138
|
+
|
|
139
|
+
- [agentleash](https://github.com/MukundaKatta/agentleash) — USD/call budget cap + tool-arg schema gate
|
|
140
|
+
- [agentvet](https://github.com/MukundaKatta/agentvet) — tool-arg validation with LLM-friendly retry hints
|
|
141
|
+
- [agentsnap](https://github.com/MukundaKatta/agentsnap) — snapshot tests for agent traces
|
|
142
|
+
- [agenttrace](https://github.com/MukundaKatta/agenttrace) — cost + latency aggregation per run
|
|
143
|
+
|
|
144
|
+
Pair `birddog` with `agentleash` and you have egress allowlist + budget
|
|
145
|
+
cap on the same agent.
|
|
146
|
+
|
|
147
|
+
## License
|
|
148
|
+
|
|
149
|
+
MIT
|
birddog-0.1.0/examples/scrape_demo.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Birddog demo without a real Bright Data account.
|
|
2
|
+
|
|
3
|
+
Uses httpx's MockTransport to simulate Bright Data responses so the demo
|
|
4
|
+
runs offline and reviewers can exercise the leash deterministically.
|
|
5
|
+
|
|
6
|
+
Run:
|
|
7
|
+
|
|
8
|
+
python examples/scrape_demo.py
|
|
9
|
+
|
|
10
|
+
Shows:
|
|
11
|
+
1. happy path — fetch from an allowed domain
|
|
12
|
+
2. domain denial — fetch a host outside the allowlist
|
|
13
|
+
3. rate-limit denial — burst past the per-domain QPS cap
|
|
14
|
+
4. audit log dump
|
|
15
|
+
|
|
16
|
+
Bytes + per-host metrics show up in the bundled Streamlit dashboard:
|
|
17
|
+
|
|
18
|
+
streamlit run -m birddog.dashboard -- --audit runs/scrape_demo.jsonl
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import os
|
|
24
|
+
import shutil
|
|
25
|
+
from contextlib import contextmanager
|
|
26
|
+
|
|
27
|
+
import httpx
|
|
28
|
+
|
|
29
|
+
from birddog import Birddog, DomainDeniedError, RateLimitedError
|
|
30
|
+
from birddog import birddog as _birddog_mod
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---- fake Bright Data backend (httpx MockTransport) ------------------------
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _fake_handler(request: httpx.Request) -> httpx.Response:
    """Return a canned response for the hosts the demo talks to.

    ``docs.brightdata.com`` gets a small HTML page that echoes the request
    path, ``shop.example.com`` gets a JSON product record that echoes the
    full request URL, and every other host gets a plain-text 404.
    """
    target = request.url
    hostname = target.host

    if hostname == "docs.brightdata.com":
        reply = httpx.Response(
            200,
            html=f"<html><h1>Bright Data docs ({target.path})</h1><p>OK.</p></html>",
        )
    elif hostname == "shop.example.com":
        # mimic a product page response
        product = {
            "url": str(target),
            "title": "Demo product",
            "price": 9.99,
            "currency": "USD",
        }
        reply = httpx.Response(200, json=product)
    else:
        reply = httpx.Response(404, text="not found")
    return reply
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@contextmanager
def _mock_http(bd: Birddog):
    """Temporarily make Birddog sessions use a MockTransport-backed client.

    While the ``with`` block is active, ``_SessionCM.__enter__`` is replaced
    by a wrapper that lets the real ``__enter__`` build the session, closes
    the session's existing ``_http`` client, and swaps in an ``httpx.Client``
    whose transport is ``httpx.MockTransport(_fake_handler)``. The original
    ``__enter__`` is put back on exit, even if the block raises.

    The patch is applied to the class itself, so it covers every session
    opened inside the block; ``bd`` is accepted but not read.
    """
    session_cm_cls = _birddog_mod._SessionCM
    original_enter = session_cm_cls.__enter__

    def _enter_with_mock(cm):
        opened = original_enter(cm)
        # Release the real client before replacing it with the offline one.
        opened._http.close()
        offline_transport = httpx.MockTransport(_fake_handler)
        opened._http = httpx.Client(
            transport=offline_transport,
            follow_redirects=True,
        )
        return opened

    session_cm_cls.__enter__ = _enter_with_mock
    try:
        yield
    finally:
        # Always undo the class-level patch so later sessions are unaffected.
        session_cm_cls.__enter__ = original_enter
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---- demo ------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def main() -> None:
    """Run the offline demo: happy path, domain denial, rate-limit burst, summary.

    Removes any existing ``runs`` directory first, then performs every fetch
    through the mocked HTTP client and prints the outcome of each step.
    """
    audit_dir = "runs"
    # Start from a clean slate: drop the whole audit directory if present.
    if os.path.exists(audit_dir):
        shutil.rmtree(audit_dir)

    bd = Birddog(
        allowed_domains={"docs.brightdata.com", "*.example.com"},
        per_domain_qps=1.0,
        per_domain_burst=2.0,
        audit_path=f"{audit_dir}/scrape_demo.jsonl",
    )

    with _mock_http(bd), bd.session("scrape-demo") as sess:
        # 1. Happy path
        print("\n[1] Happy path — fetch docs.brightdata.com")
        docs = sess.fetch("https://docs.brightdata.com/api")
        print(f" status={docs.status} bytes={docs.bytes_len} ms={docs.elapsed_ms}")

        # 2. Domain denial
        print("\n[2] Domain denial — fetch evil.attacker.example")
        try:
            sess.fetch("https://evil.attacker.example/exfil")
        except DomainDeniedError as denied:
            print(f" denied: {denied}")

        # 3. Rate-limit denial: burst four fast hits on shop.example.com
        print("\n[3] Rate-limit denial — burst on shop.example.com (cap 1 qps, burst 2)")
        for attempt in range(4):
            try:
                hit = sess.fetch(f"https://shop.example.com/products/{attempt}")
            except RateLimitedError as limited:
                print(f" [{attempt}] denied: {limited}")
            else:
                print(f" [{attempt}] ok status={hit.status}")

        # 4. Summary
        print(
            f"\n[summary] fetches_ok={sess.fetches_ok} fetches_denied={sess.fetches_denied} bytes={sess.bytes_total}"
        )

    print(f"\nAudit log written to: {audit_dir}/scrape_demo.jsonl")
    print("Open dashboard with: streamlit run -m birddog.dashboard -- --audit runs/scrape_demo.jsonl")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
|