birddog 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ .venv/
7
+ dist/
8
+ build/
9
+ *.egg-info/
10
+ .DS_Store
11
+ runs/
birddog-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Mukunda Rao Katta
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
birddog-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.4
2
+ Name: birddog
3
+ Version: 0.1.0
4
+ Summary: Domain allowlist, per-domain rate caps, response audit, and a Streamlit dashboard for Bright Data scraping agents. The picks-and-shovels layer for any agent that consumes the live web.
5
+ Project-URL: Homepage, https://github.com/MukundaKatta/birddog
6
+ Project-URL: Source, https://github.com/MukundaKatta/birddog
7
+ Project-URL: Issues, https://github.com/MukundaKatta/birddog/issues
8
+ Author-email: Mukunda Rao Katta <mukunda.vjcs6@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: agents,ai,audit,bright-data,egress,llm,rate-limit,scraping,web-data
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: httpx>=0.27
26
+ Provides-Extra: brightdata
27
+ Provides-Extra: dashboard
28
+ Requires-Dist: pandas>=2.2; extra == 'dashboard'
29
+ Requires-Dist: streamlit>=1.40; extra == 'dashboard'
30
+ Provides-Extra: dev
31
+ Requires-Dist: pandas>=2.2; extra == 'dev'
32
+ Requires-Dist: pytest>=8.0; extra == 'dev'
33
+ Requires-Dist: respx>=0.21; extra == 'dev'
34
+ Requires-Dist: ruff>=0.4; extra == 'dev'
35
+ Requires-Dist: streamlit>=1.40; extra == 'dev'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # birddog
39
+
40
+ Audited Bright Data egress for AI agents. Drop one context manager
41
+ around an agent that scrapes the web and you get:
42
+
43
+ 1. **Domain allowlist** — deny everything outside it, log the attempt
44
+ 2. **Per-domain rate caps** — simple token bucket per host
45
+ 3. **Response audit log** — one JSONL line per fetch (url, status, bytes, ms)
46
+ 4. **Bright Data Web Unlocker proxy** — opt-in: route via Bright Data
47
+ 5. **Streamlit dashboard** — point it at the JSONL, get per-host bytes,
48
+ denial counts, latency p50
49
+
50
+ Built for the kind of agent that hits live sites: research bots, price
51
+ trackers, RAG ingest jobs. If you've ever watched an agent rip
52
+ through a sponsor's free tier in 30 seconds, this is for you.
53
+
54
+ ## Install
55
+
56
+ ```bash
57
+ pip install birddog # core
58
+ pip install "birddog[dashboard]" # + Streamlit dashboard
59
+ ```
60
+
61
+ Python 3.10+.
62
+
63
+ ## Why
64
+
65
+ LLM agents don't know what a sane scraping cadence looks like. They'll
66
+ hammer a site, ignore robots.txt, follow links into spammy subdomains,
67
+ and burn through a Bright Data quota in a single run.
68
+
69
+ `birddog` puts a leash on the egress side:
70
+
71
+ | Concern | What birddog does |
72
+ |-----------------------|----------------------------------------------------|
73
+ | Wandering off-domain | Allowlist with `example.com` + `*.example.com` |
74
+ | Burst scraping | Token bucket per host (qps + burst) |
75
+ | "What did it fetch?" | JSONL audit log, one event per fetch |
76
+ | Anti-bot blocks | Optional Bright Data Web Unlocker proxy |
77
+ | Post-run review | Bundled Streamlit dashboard |
78
+
79
+ It does **not** parse HTML, manage cookies, render JS, or rotate user
80
+ agents. That's what Bright Data + your scraping code are for.
81
+
82
+ ## Usage
83
+
84
+ ```python
85
+ from birddog import Birddog
86
+
87
+ bd = Birddog(
88
+ allowed_domains={"docs.brightdata.com", "*.example.com"},
89
+ per_domain_qps=1.0,
90
+ per_domain_burst=2.0,
91
+ audit_path="runs/scrape.jsonl",
92
+ # Optional — route through Bright Data Web Unlocker:
93
+ bright_data={
94
+ "host": "brd.superproxy.io:33335",
95
+ "username": "brd-customer-...-zone-web_unlocker",
96
+ "password": "...",
97
+ },
98
+ )
99
+
100
+ with bd.session("research-bot") as s:
101
+ r = s.fetch("https://docs.brightdata.com/api")
102
+ print(r.status, r.bytes_len, "bytes")
103
+
104
+ # second hit within 1s -> RateLimitedError (qps cap = 1)
105
+ s.fetch("https://docs.brightdata.com/pricing")
106
+
107
+ # off-allowlist -> DomainDeniedError, also logged
108
+ s.fetch("https://evil.example/exfil")
109
+ ```
110
+
111
+ `FetchResult` carries `url`, `status`, `text`, `headers`, `bytes_len`, `elapsed_ms`,
112
+ and a `via_brightdata` flag so downstream code can tell whether the
113
+ response came through the proxy.
114
+
115
+ ## Audit log
116
+
117
+ One JSON object per line, e.g.:
118
+
119
+ ```json
120
+ {"ts":1747779600.12,"session_id":"research-bot","kind":"fetch_ok",
121
+ "url":"https://docs.brightdata.com/api","host":"docs.brightdata.com",
122
+ "status":200,"bytes":4221,"elapsed_ms":312.4}
123
+ {"ts":1747779600.45,"session_id":"research-bot","kind":"domain_denied",
124
+ "url":"https://evil.example/exfil","host":"evil.example",
125
+ "error":"host 'evil.example' not in allowlist"}
126
+ ```
127
+
128
+ Kinds: `session_open`, `fetch_ok`, `fetch_failed`, `domain_denied`,
129
+ `rate_limited`, `session_close`.
130
+
131
+ ## Dashboard
132
+
133
+ ```bash
134
+ pip install "birddog[dashboard]"
135
+ streamlit run -m birddog.dashboard -- --audit runs/scrape.jsonl
136
+ ```
137
+
138
+ Shows total fetches, denials, bytes, and a per-host breakdown of
139
+ fetches + bytes + p50 latency.
140
+
141
+ ## Demos
142
+
143
+ Two runnable examples in `examples/`:
144
+
145
+ **1. Smoke test — `scrape_demo.py`**
146
+
147
+ ```bash
148
+ python examples/scrape_demo.py
149
+ ```
150
+
151
+ Hits each feature once: happy path, domain denial, rate-limit burst,
152
+ summary. Offline via `httpx.MockTransport`.
153
+
154
+ **2. Realistic agent — `watchdog_agent.py`**
155
+
156
+ ```bash
157
+ python examples/watchdog_agent.py
158
+ ```
159
+
160
+ A small price-tracker agent. Polls a watchlist of product pages,
161
+ extracts prices, alerts when something moves more than a per-product
162
+ threshold. Three passes show:
163
+
164
+ - allowlist denials (off-domain mirror URL is dropped)
165
+ - per-domain rate cap kicking in on pass 3
166
+ - threshold alerts (`Δ -6.4% > 3.0%`)
167
+ - a `runs/watchdog.jsonl` audit log you can dashboard
168
+
169
+ Set `BIRDDOG_USE_BRIGHTDATA=1` + your Bright Data Web Unlocker env
170
+ vars to flip the demo to a real proxy.
171
+
172
+ ## Companion libraries
173
+
174
+ `birddog` is the egress half of a small agent-stack:
175
+
176
+ - [agentleash](https://github.com/MukundaKatta/agentleash) — USD/call budget cap + tool-arg schema gate
177
+ - [agentvet](https://github.com/MukundaKatta/agentvet) — tool-arg validation with LLM-friendly retry hints
178
+ - [agentsnap](https://github.com/MukundaKatta/agentsnap) — snapshot tests for agent traces
179
+ - [agenttrace](https://github.com/MukundaKatta/agenttrace) — cost + latency aggregation per run
180
+
181
+ Pair `birddog` with `agentleash` and you have egress allowlist + budget
182
+ cap on the same agent.
183
+
184
+ ## License
185
+
186
+ MIT
@@ -0,0 +1,149 @@
1
+ # birddog
2
+
3
+ Audited Bright Data egress for AI agents. Drop one context manager
4
+ around an agent that scrapes the web and you get:
5
+
6
+ 1. **Domain allowlist** — deny everything outside it, log the attempt
7
+ 2. **Per-domain rate caps** — simple token bucket per host
8
+ 3. **Response audit log** — one JSONL line per fetch (url, status, bytes, ms)
9
+ 4. **Bright Data Web Unlocker proxy** — opt-in: route via Bright Data
10
+ 5. **Streamlit dashboard** — point it at the JSONL, get per-host bytes,
11
+ denial counts, latency p50
12
+
13
+ Built for the kind of agent that hits live sites: research bots, price
14
+ trackers, RAG ingest jobs. If you've ever watched an agent rip
15
+ through a sponsor's free tier in 30 seconds, this is for you.
16
+
17
+ ## Install
18
+
19
+ ```bash
20
+ pip install birddog # core
21
+ pip install "birddog[dashboard]" # + Streamlit dashboard
22
+ ```
23
+
24
+ Python 3.10+.
25
+
26
+ ## Why
27
+
28
+ LLM agents don't know what a sane scraping cadence looks like. They'll
29
+ hammer a site, ignore robots.txt, follow links into spammy subdomains,
30
+ and burn through a Bright Data quota in a single run.
31
+
32
+ `birddog` puts a leash on the egress side:
33
+
34
+ | Concern | What birddog does |
35
+ |-----------------------|----------------------------------------------------|
36
+ | Wandering off-domain | Allowlist with `example.com` + `*.example.com` |
37
+ | Burst scraping | Token bucket per host (qps + burst) |
38
+ | "What did it fetch?" | JSONL audit log, one event per fetch |
39
+ | Anti-bot blocks | Optional Bright Data Web Unlocker proxy |
40
+ | Post-run review | Bundled Streamlit dashboard |
41
+
42
+ It does **not** parse HTML, manage cookies, render JS, or rotate user
43
+ agents. That's what Bright Data + your scraping code are for.
44
+
45
+ ## Usage
46
+
47
+ ```python
48
+ from birddog import Birddog
49
+
50
+ bd = Birddog(
51
+ allowed_domains={"docs.brightdata.com", "*.example.com"},
52
+ per_domain_qps=1.0,
53
+ per_domain_burst=2.0,
54
+ audit_path="runs/scrape.jsonl",
55
+ # Optional — route through Bright Data Web Unlocker:
56
+ bright_data={
57
+ "host": "brd.superproxy.io:33335",
58
+ "username": "brd-customer-...-zone-web_unlocker",
59
+ "password": "...",
60
+ },
61
+ )
62
+
63
+ with bd.session("research-bot") as s:
64
+ r = s.fetch("https://docs.brightdata.com/api")
65
+ print(r.status, r.bytes_len, "bytes")
66
+
67
+ # second hit within 1s -> RateLimitedError (qps cap = 1)
68
+ s.fetch("https://docs.brightdata.com/pricing")
69
+
70
+ # off-allowlist -> DomainDeniedError, also logged
71
+ s.fetch("https://evil.example/exfil")
72
+ ```
73
+
74
+ `FetchResult` carries `url`, `status`, `text`, `headers`, `bytes_len`, `elapsed_ms`,
75
+ and a `via_brightdata` flag so downstream code can tell whether the
76
+ response came through the proxy.
77
+
78
+ ## Audit log
79
+
80
+ One JSON object per line, e.g.:
81
+
82
+ ```json
83
+ {"ts":1747779600.12,"session_id":"research-bot","kind":"fetch_ok",
84
+ "url":"https://docs.brightdata.com/api","host":"docs.brightdata.com",
85
+ "status":200,"bytes":4221,"elapsed_ms":312.4}
86
+ {"ts":1747779600.45,"session_id":"research-bot","kind":"domain_denied",
87
+ "url":"https://evil.example/exfil","host":"evil.example",
88
+ "error":"host 'evil.example' not in allowlist"}
89
+ ```
90
+
91
+ Kinds: `session_open`, `fetch_ok`, `fetch_failed`, `domain_denied`,
92
+ `rate_limited`, `session_close`.
93
+
94
+ ## Dashboard
95
+
96
+ ```bash
97
+ pip install "birddog[dashboard]"
98
+ streamlit run -m birddog.dashboard -- --audit runs/scrape.jsonl
99
+ ```
100
+
101
+ Shows total fetches, denials, bytes, and a per-host breakdown of
102
+ fetches + bytes + p50 latency.
103
+
104
+ ## Demos
105
+
106
+ Two runnable examples in `examples/`:
107
+
108
+ **1. Smoke test — `scrape_demo.py`**
109
+
110
+ ```bash
111
+ python examples/scrape_demo.py
112
+ ```
113
+
114
+ Hits each feature once: happy path, domain denial, rate-limit burst,
115
+ summary. Offline via `httpx.MockTransport`.
116
+
117
+ **2. Realistic agent — `watchdog_agent.py`**
118
+
119
+ ```bash
120
+ python examples/watchdog_agent.py
121
+ ```
122
+
123
+ A small price-tracker agent. Polls a watchlist of product pages,
124
+ extracts prices, alerts when something moves more than a per-product
125
+ threshold. Three passes show:
126
+
127
+ - allowlist denials (off-domain mirror URL is dropped)
128
+ - per-domain rate cap kicking in on pass 3
129
+ - threshold alerts (`Δ -6.4% > 3.0%`)
130
+ - a `runs/watchdog.jsonl` audit log you can dashboard
131
+
132
+ Set `BIRDDOG_USE_BRIGHTDATA=1` + your Bright Data Web Unlocker env
133
+ vars to flip the demo to a real proxy.
134
+
135
+ ## Companion libraries
136
+
137
+ `birddog` is the egress half of a small agent-stack:
138
+
139
+ - [agentleash](https://github.com/MukundaKatta/agentleash) — USD/call budget cap + tool-arg schema gate
140
+ - [agentvet](https://github.com/MukundaKatta/agentvet) — tool-arg validation with LLM-friendly retry hints
141
+ - [agentsnap](https://github.com/MukundaKatta/agentsnap) — snapshot tests for agent traces
142
+ - [agenttrace](https://github.com/MukundaKatta/agenttrace) — cost + latency aggregation per run
143
+
144
+ Pair `birddog` with `agentleash` and you have egress allowlist + budget
145
+ cap on the same agent.
146
+
147
+ ## License
148
+
149
+ MIT
@@ -0,0 +1,128 @@
1
+ """Birddog demo without a real Bright Data account.
2
+
3
+ Uses httpx's MockTransport to simulate Bright Data responses so the demo
4
+ runs offline and reviewers can exercise the leash deterministically.
5
+
6
+ Run:
7
+
8
+ python examples/scrape_demo.py
9
+
10
+ Shows:
11
+ 1. happy path — fetch from an allowed domain
12
+ 2. domain denial — fetch a host outside the allowlist
13
+ 3. rate-limit denial — burst past the per-domain QPS cap
14
+ 4. summary — session counters, then the path of the audit log
15
+
16
+ Bytes + per-host metrics show up in the bundled Streamlit dashboard:
17
+
18
+ streamlit run -m birddog.dashboard -- --audit runs/scrape_demo.jsonl
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import os
24
+ import shutil
25
+ from contextlib import contextmanager
26
+
27
+ import httpx
28
+
29
+ from birddog import Birddog, DomainDeniedError, RateLimitedError
30
+ from birddog import birddog as _birddog_mod
31
+
32
+
33
+ # ---- fake Bright Data backend (httpx MockTransport) ------------------------
34
+
35
+
36
def _fake_handler(request: httpx.Request) -> httpx.Response:
    """Serve canned responses in place of the real network.

    ``docs.brightdata.com`` gets a small HTML page that echoes the request
    path, ``shop.example.com`` gets a JSON product payload, and every other
    host gets a plain-text 404.
    """
    url = request.url
    host = url.host
    path = url.path

    if host == "docs.brightdata.com":
        page = f"<html><h1>Bright Data docs ({path})</h1><p>OK.</p></html>"
        return httpx.Response(200, html=page)

    if host != "shop.example.com":
        return httpx.Response(404, text="not found")

    # Mimic a product-page response.
    product = {
        "url": str(url),
        "title": "Demo product",
        "price": 9.99,
        "currency": "USD",
    }
    return httpx.Response(200, json=product)
54
+
55
+
56
@contextmanager
def _mock_http(bd: Birddog):
    """Make every Birddog session opened inside the block use MockTransport.

    ``__enter__`` on the session context-manager class is swapped for a
    wrapper that replaces the freshly opened session's ``httpx.Client``
    with one driven by ``_fake_handler``, so the demo needs no real
    network. The original ``__enter__`` is put back when the block exits,
    including when it exits with an exception.

    The patch is applied to the class, so ``bd`` itself is not consulted.
    """
    session_cm_cls = _birddog_mod._SessionCM
    original_enter = session_cm_cls.__enter__

    def _enter_with_mock(self):
        opened = original_enter(self)
        # Dispose of the real httpx.Client before swapping in the mock one.
        opened._http.close()
        mock_transport = httpx.MockTransport(_fake_handler)
        opened._http = httpx.Client(
            transport=mock_transport,
            follow_redirects=True,
        )
        return opened

    session_cm_cls.__enter__ = _enter_with_mock
    try:
        yield
    finally:
        session_cm_cls.__enter__ = original_enter
78
+
79
+
80
+ # ---- demo ------------------------------------------------------------------
81
+
82
+
83
def main() -> None:
    """Run the offline demo end to end and print each outcome.

    Steps, all served by the mocked transport:

    1. fetch from an allowed domain,
    2. fetch from a host outside the allowlist (expects ``DomainDeniedError``),
    3. fire four back-to-back fetches at one host to trip the per-domain
       rate cap (expects ``RateLimitedError`` on some of them),
    4. print the session counters.

    Finally prints where the audit log was written and how to open it in
    the dashboard.
    """
    audit_dir = "runs"
    audit_path = f"{audit_dir}/scrape_demo.jsonl"

    # Start from a fresh log, but delete only this demo's own file: wiping
    # the whole directory would also destroy any other logs kept in it.
    if os.path.exists(audit_path):
        os.remove(audit_path)

    bd = Birddog(
        allowed_domains={"docs.brightdata.com", "*.example.com"},
        per_domain_qps=1.0,
        per_domain_burst=2.0,
        audit_path=audit_path,
    )

    with _mock_http(bd):
        with bd.session("scrape-demo") as s:
            # 1. Happy path
            print("\n[1] Happy path — fetch docs.brightdata.com")
            r = s.fetch("https://docs.brightdata.com/api")
            print(f"    status={r.status} bytes={r.bytes_len} ms={r.elapsed_ms}")

            # 2. Domain denial
            print("\n[2] Domain denial — fetch evil.attacker.example")
            try:
                s.fetch("https://evil.attacker.example/exfil")
            except DomainDeniedError as e:
                print(f"    denied: {e}")

            # 3. Rate-limit denial: four fast hits on shop.example.com
            print("\n[3] Rate-limit denial — burst on shop.example.com (cap 1 qps, burst 2)")
            for i in range(4):
                try:
                    r = s.fetch(f"https://shop.example.com/products/{i}")
                    print(f"    [{i}] ok status={r.status}")
                except RateLimitedError as e:
                    print(f"    [{i}] denied: {e}")

            # 4. Summary
            print(
                f"\n[summary] fetches_ok={s.fetches_ok} "
                f"fetches_denied={s.fetches_denied} bytes={s.bytes_total}"
            )

    print(f"\nAudit log written to: {audit_path}")
    print(f"Open dashboard with: streamlit run -m birddog.dashboard -- --audit {audit_path}")
125
+
126
+
127
+ if __name__ == "__main__":
128
+ main()