site-mapper-agents 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- site_mapper_agents-0.1.0/.gitignore +59 -0
- site_mapper_agents-0.1.0/LICENSE +21 -0
- site_mapper_agents-0.1.0/PKG-INFO +403 -0
- site_mapper_agents-0.1.0/README.md +372 -0
- site_mapper_agents-0.1.0/SKILL.md +114 -0
- site_mapper_agents-0.1.0/pyproject.toml +91 -0
- site_mapper_agents-0.1.0/src/site_mapper_agents/__init__.py +117 -0
- site_mapper_agents-0.1.0/src/site_mapper_agents/architect.py +448 -0
- site_mapper_agents-0.1.0/src/site_mapper_agents/eavesdropper.py +259 -0
- site_mapper_agents-0.1.0/src/site_mapper_agents/healer.py +470 -0
- site_mapper_agents-0.1.0/src/site_mapper_agents/models.py +652 -0
- site_mapper_agents-0.1.0/src/site_mapper_agents/prompts.py +116 -0
- site_mapper_agents-0.1.0/src/site_mapper_agents/vocabulary.py +152 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
*.egg
|
|
8
|
+
*.egg-info/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
develop-eggs/
|
|
12
|
+
downloads/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib/
|
|
16
|
+
lib64/
|
|
17
|
+
parts/
|
|
18
|
+
sdist/
|
|
19
|
+
var/
|
|
20
|
+
wheels/
|
|
21
|
+
share/python-wheels/
|
|
22
|
+
*.egg-info/
|
|
23
|
+
.installed.cfg
|
|
24
|
+
*.manifest
|
|
25
|
+
*.spec
|
|
26
|
+
|
|
27
|
+
# Virtual environments
|
|
28
|
+
.venv/
|
|
29
|
+
venv/
|
|
30
|
+
ENV/
|
|
31
|
+
env/
|
|
32
|
+
|
|
33
|
+
# Test & cache
|
|
34
|
+
.pytest_cache/
|
|
35
|
+
.ruff_cache/
|
|
36
|
+
.mypy_cache/
|
|
37
|
+
.tox/
|
|
38
|
+
.coverage
|
|
39
|
+
.coverage.*
|
|
40
|
+
htmlcov/
|
|
41
|
+
.cache/
|
|
42
|
+
nosetests.xml
|
|
43
|
+
coverage.xml
|
|
44
|
+
*.cover
|
|
45
|
+
.hypothesis/
|
|
46
|
+
|
|
47
|
+
# IDE / editor
|
|
48
|
+
.idea/
|
|
49
|
+
.vscode/
|
|
50
|
+
*.swp
|
|
51
|
+
*~
|
|
52
|
+
|
|
53
|
+
# OS junk
|
|
54
|
+
.DS_Store
|
|
55
|
+
Thumbs.db
|
|
56
|
+
desktop.ini
|
|
57
|
+
|
|
58
|
+
# Logs
|
|
59
|
+
*.log
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 axumquant
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: site-mapper-agents
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM-driven self-healing API discovery for undocumented SaaS portals via CDP
|
|
5
|
+
Project-URL: Homepage, https://github.com/axumquant/site-mapper-agents
|
|
6
|
+
Project-URL: Repository, https://github.com/axumquant/site-mapper-agents
|
|
7
|
+
Project-URL: Issues, https://github.com/axumquant/site-mapper-agents/issues
|
|
8
|
+
Author: axumquant
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: api-discovery,cdp,llm,pydantic-ai,scraping,self-healing,site-mapping
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.11
|
|
23
|
+
Requires-Dist: httpx>=0.27
|
|
24
|
+
Requires-Dist: pydantic-ai>=0.0.10
|
|
25
|
+
Requires-Dist: pydantic>=2.5
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
29
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# site-mapper-agents
|
|
33
|
+
|
|
34
|
+
**LLM-once API discovery + self-healing extraction for any browser-accessible portal.**
|
|
35
|
+
|
|
36
|
+
Burst-record CDP network traffic from a portal you have a browser session on,
|
|
37
|
+
hand it to a three-agent team, get back a typed schema + signatures you can
|
|
38
|
+
extract from forever — with auto-repair when the portal's API shape drifts.
|
|
39
|
+
|
|
40
|
+
[PyPI](https://pypi.org/project/site-mapper-agents/)
|
|
41
|
+

|
|
42
|
+

|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## The problem
|
|
47
|
+
|
|
48
|
+
Every SaaS portal has a different API. Writing extractors for each is a
|
|
49
|
+
treadmill — and the schemas change without warning, so your extractors
|
|
50
|
+
silently break.
|
|
51
|
+
|
|
52
|
+
Pre-built connectors only cover the top 20 platforms. For everything else
|
|
53
|
+
(internal CRMs, niche-vertical tools, undocumented partner portals) you
|
|
54
|
+
either pay someone to reverse-engineer the API, or you give up and scrape
|
|
55
|
+
the DOM.
|
|
56
|
+
|
|
57
|
+
This library is the third option.
|
|
58
|
+
|
|
59
|
+
## What this solves
|
|
60
|
+
|
|
61
|
+
1. **Onboarding**: you point the system at a portal you have a real browser
|
|
62
|
+
session on. It records a burst of CDP network traffic while you click
|
|
63
|
+
around, then asks an LLM **once** to classify which endpoints carry the
|
|
64
|
+
data you want and to map response JSON keys to your fields. Output is a
|
|
65
|
+
typed `SiteSchema` and a list of `NetworkSignature` patterns.
|
|
66
|
+
2. **Extraction**: from that point forward, every CDP event is matched
|
|
67
|
+
against the saved signatures with pure Pydantic validation —
|
|
68
|
+
sub-millisecond, no LLM calls, no cost.
|
|
69
|
+
3. **Self-healing**: when the portal changes its response shape, an
|
|
70
|
+
`ExtractionFailed` event fires. The Healer compares the old key map
|
|
71
|
+
against the new response, fixes what it can deterministically, and
|
|
72
|
+
asks the LLM to semantically match the rest. Confident patches
|
|
73
|
+
auto-apply. Borderline patches surface for human review.
|
|
74
|
+
|
|
75
|
+
## The three agents
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
┌──────────────────────────────────────────────┐
|
|
79
|
+
│ Browser session → CDP forwarder → events │
|
|
80
|
+
└──────────────────────┬───────────────────────┘
|
|
81
|
+
│
|
|
82
|
+
┌──────────────────────┐ │ ┌───────────────────────┐
|
|
83
|
+
│ Architect │ ◀── once ──── │ ──── live ──▶│ Eavesdropper │
|
|
84
|
+
│ (LLM classifies │ │ │ (Pydantic only, │
|
|
85
|
+
│ endpoints, builds │ │ │ sub-ms hot path) │
|
|
86
|
+
│ SiteSchema + │ │ │ │
|
|
87
|
+
│ signatures) │ │ │ emits ExtractionResult│
|
|
88
|
+
└──────────────────────┘ │ │ or ExtractionFailed │
|
|
89
|
+
│ │ └───────────┬───────────┘
|
|
90
|
+
▼ │ │
|
|
91
|
+
╔══════════════════════╗ │ ┌───────────▼───────────┐
|
|
92
|
+
║ MappedSite + ║◀──── heals ───┼──────────────│ Healer │
|
|
93
|
+
║ NetworkSignatures ║ │ │ (LLM re-maps stale │
|
|
94
|
+
╚══════════════════════╝ │ │ keys, auto-applies │
|
|
95
|
+
│ │ confident patches) │
|
|
96
|
+
│ └───────────────────────┘
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
- **Architect** — runs once. Expensive. Produces the schema.
|
|
100
|
+
- **Eavesdropper** — runs on every event. Free. Pure validation.
|
|
101
|
+
- **Healer** — runs only on failures. Costs nothing when nothing breaks.
|
|
102
|
+
|
|
103
|
+
## Install
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pip install site-mapper-agents
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
For the runnable examples you'll also want a pydantic-ai provider:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
pip install 'pydantic-ai[anthropic]' # or [openai], [groq], ...
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Quickstart
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
import asyncio
|
|
119
|
+
from pydantic_ai.models.test import TestModel
|
|
120
|
+
|
|
121
|
+
from site_mapper_agents import (
|
|
122
|
+
Architect,
|
|
123
|
+
CDPNetworkEvent,
|
|
124
|
+
Eavesdropper,
|
|
125
|
+
TargetField,
|
|
126
|
+
UserIntent,
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
# 1. Tell the system what you want to extract.
|
|
130
|
+
intent = UserIntent(
|
|
131
|
+
description="Customer account details",
|
|
132
|
+
target_fields=[
|
|
133
|
+
TargetField(name="account_id", description="Account UUID"),
|
|
134
|
+
TargetField(name="email", description="Primary contact email"),
|
|
135
|
+
],
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
# 2. Construct the Architect. Replace TestModel with a real provider.
|
|
139
|
+
architect = Architect(model=TestModel()) # or AnthropicModel("claude-sonnet-4-5")
|
|
140
|
+
|
|
141
|
+
# 3. Feed it a burst of CDP traffic (your forwarder produced these).
|
|
142
|
+
architect.record_traffic(CDPNetworkEvent(
|
|
143
|
+
request_id="r1",
|
|
144
|
+
url="https://crm.example.com/api/v2/accounts/42",
|
|
145
|
+
method="GET",
|
|
146
|
+
body={"data": {"client": {"id": "acct_42", "email": "ada@example.com"}}},
|
|
147
|
+
))
|
|
148
|
+
|
|
149
|
+
# 4. Ask the Architect to propose a schema.
|
|
150
|
+
async def onboard():
|
|
151
|
+
proposal = await architect.propose(
|
|
152
|
+
target_url="https://crm.example.com/accounts",
|
|
153
|
+
user_intent=intent,
|
|
154
|
+
)
|
|
155
|
+
site = architect.build_mapped_site(
|
|
156
|
+
proposal=proposal,
|
|
157
|
+
target_url="https://crm.example.com/accounts",
|
|
158
|
+
user_intent=intent,
|
|
159
|
+
)
|
|
160
|
+
return site
|
|
161
|
+
|
|
162
|
+
site = asyncio.run(onboard())
|
|
163
|
+
|
|
164
|
+
# 5. From now on, every live CDP event runs through the Eavesdropper.
|
|
165
|
+
eaves = Eavesdropper()
|
|
166
|
+
result, event = eaves.ingest(
|
|
167
|
+
CDPNetworkEvent(
|
|
168
|
+
request_id="r2",
|
|
169
|
+
url="https://crm.example.com/api/v2/accounts/99",
|
|
170
|
+
method="GET",
|
|
171
|
+
body={"data": {"client": {"id": "acct_99", "email": "g@example.com"}}},
|
|
172
|
+
),
|
|
173
|
+
sites=[site],
|
|
174
|
+
)
|
|
175
|
+
print(result.data_payload if result else "no match")
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## API reference
|
|
179
|
+
|
|
180
|
+
### `Architect(model=None, vocabulary=None, policy=DEFAULT_ONBOARDING_POLICY, model_settings=None)`
|
|
181
|
+
|
|
182
|
+
The onboarding agent. LLM-once.
|
|
183
|
+
|
|
184
|
+
| Parameter | Type | Notes |
|
|
185
|
+
| ---------------- | --------------------------- | --------------------------------------------- |
|
|
186
|
+
| `model` | `pydantic_ai.Model \| None` | Any pydantic-ai model. `None` → heuristic. |
|
|
187
|
+
| `vocabulary` | `list[EndpointType] \| None`| Caller-supplied classifications. See below. |
|
|
188
|
+
| `policy` | `OnboardingPolicy` | Sample-count thresholds. |
|
|
189
|
+
| `model_settings` | `ModelSettings \| None` | max_tokens, temperature, etc. |
|
|
190
|
+
|
|
191
|
+
**Methods:**
|
|
192
|
+
|
|
193
|
+
- `record_traffic(event)` — buffer a CDP event during onboarding.
|
|
194
|
+
- `record_click()` — mark that the user clicked something.
|
|
195
|
+
- `has_enough_samples()` → `bool` — policy check.
|
|
196
|
+
- `detect_endpoints()` → `list[DetectedEndpoint]` — deterministic
|
|
197
|
+
pre-processing.
|
|
198
|
+
- `await propose(*, target_url, user_intent, llm_classify=None)` →
|
|
199
|
+
`ArchitectProposal` — the main entry point.
|
|
200
|
+
- `build_mapped_site(*, proposal, target_url, user_intent)` →
|
|
201
|
+
`MappedSite` — promote an approved proposal to an active site.
|
|
202
|
+
- `emit_event(site, *, success=True, reason="")` → `SiteMapped |
|
|
203
|
+
OnboardingFailed`.
|
|
204
|
+
- `reset()` — clear buffers for the next onboarding session.
|
|
205
|
+
|
|
206
|
+
### `Eavesdropper(policy=DEFAULT_EXTRACTION_POLICY)`
|
|
207
|
+
|
|
208
|
+
The runtime agent. No LLM. Pure Pydantic validation.
|
|
209
|
+
|
|
210
|
+
**Methods:**
|
|
211
|
+
|
|
212
|
+
- `ingest(event, sites)` → `(ExtractionResult | None, ExtractionSucceeded | ExtractionFailed | None)`.
|
|
213
|
+
|
|
214
|
+
### `Healer(model=None, policy=DEFAULT_HEALING_POLICY, model_settings=None)`
|
|
215
|
+
|
|
216
|
+
The self-healing agent.
|
|
217
|
+
|
|
218
|
+
**Methods:**
|
|
219
|
+
|
|
220
|
+
- `await diagnose(*, site, failed_event, new_response_body=None, llm_semantic_match=None)` →
|
|
221
|
+
`HealerPatch`.
|
|
222
|
+
- `apply_patch(site, patch)` → `(bool, SchemaHealed | HealingFailed | SiteDegraded)`.
|
|
223
|
+
|
|
224
|
+
### Models
|
|
225
|
+
|
|
226
|
+
| Class | Purpose |
|
|
227
|
+
| ------------------- | ----------------------------------------------------------------------- |
|
|
228
|
+
| `CDPNetworkEvent` | One captured network response. Library input. |
|
|
229
|
+
| `TargetField` | One data point the caller wants extracted. |
|
|
230
|
+
| `UserIntent` | A bundle of target fields with a human description. |
|
|
231
|
+
| `EndpointType` | One entry in the Architect's classification vocabulary. |
|
|
232
|
+
| `DetectedEndpoint` | Pre-LLM view of a unique endpoint. |
|
|
233
|
+
| `NetworkSignature` | URL pattern + JSON-key map. Saved per site. |
|
|
234
|
+
| `SiteSchema` | The extraction contract for one intent. |
|
|
235
|
+
| `ArchitectProposal` | Architect's structured output before user confirms. |
|
|
236
|
+
| `HealerPatch` | Healer's structured output for one repair attempt. |
|
|
237
|
+
| `MappedSite` | Aggregate root — schemas + signatures + status. |
|
|
238
|
+
| `ExtractionResult` | Eavesdropper's output for one matched event. |
|
|
239
|
+
|
|
240
|
+
### Domain events
|
|
241
|
+
|
|
242
|
+
`SiteMapped`, `OnboardingFailed`, `ExtractionSucceeded`,
|
|
243
|
+
`ExtractionFailed`, `SchemaHealed`, `HealingFailed`, `SiteDegraded`.
|
|
244
|
+
|
|
245
|
+
All extend `AutomationEvent` (frozen Pydantic model).
|
|
246
|
+
|
|
247
|
+
## Endpoint vocabularies
|
|
248
|
+
|
|
249
|
+
The Architect's LLM prompt embeds a list of `EndpointType` definitions
|
|
250
|
+
that tell the model "you may only classify endpoints into one of these
|
|
251
|
+
categories". The default vocabulary covers generic CRUD shapes:
|
|
252
|
+
|
|
253
|
+
| name | what it means |
|
|
254
|
+
| ----------------- | -------------------------------------------------------------- |
|
|
255
|
+
| `list_records` | Paginated list of records (grid/table views). |
|
|
256
|
+
| `detail_view` | One record's full detail (after click-through). |
|
|
257
|
+
| `search` | Filtered records based on user query. |
|
|
258
|
+
| `create_record` | POST/PUT that creates a new record. |
|
|
259
|
+
| `update_record` | PATCH/PUT that mutates an existing record. |
|
|
260
|
+
| `delete_record` | DELETE. |
|
|
261
|
+
| `reference_data` | Lookup / enum / config data. |
|
|
262
|
+
| `metrics` | Dashboard counts/aggregates. |
|
|
263
|
+
| `unknown` | Fallback when nothing fits. |
|
|
264
|
+
|
|
265
|
+
You'll usually want to extend this with site-specific categories:
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
from site_mapper_agents import (
|
|
269
|
+
Architect,
|
|
270
|
+
default_vocabulary,
|
|
271
|
+
define_endpoint_type,
|
|
272
|
+
merge_vocabularies,
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
vocab = merge_vocabularies(
|
|
276
|
+
default_vocabulary(),
|
|
277
|
+
[
|
|
278
|
+
define_endpoint_type(
|
|
279
|
+
name="invoice_pdf_download",
|
|
280
|
+
description="Streaming download of a generated invoice PDF",
|
|
281
|
+
expected_fields=["invoice_id", "pdf_url"],
|
|
282
|
+
),
|
|
283
|
+
define_endpoint_type(
|
|
284
|
+
name="webhook_subscription",
|
|
285
|
+
description="Webhook registration endpoint that returns the subscription id",
|
|
286
|
+
expected_fields=["subscription_id", "target_url", "events"],
|
|
287
|
+
),
|
|
288
|
+
],
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
architect = Architect(model=my_model, vocabulary=vocab)
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
## LLM providers
|
|
295
|
+
|
|
296
|
+
The library binds to any provider pydantic-ai supports — just pass a
|
|
297
|
+
`Model` instance (or its name) to the agent constructor:
|
|
298
|
+
|
|
299
|
+
```python
|
|
300
|
+
# Anthropic
|
|
301
|
+
from pydantic_ai.models.anthropic import AnthropicModel
|
|
302
|
+
architect = Architect(model=AnthropicModel("claude-sonnet-4-5"))
|
|
303
|
+
|
|
304
|
+
# OpenAI
|
|
305
|
+
from pydantic_ai.models.openai import OpenAIModel
|
|
306
|
+
architect = Architect(model=OpenAIModel("gpt-4o"))
|
|
307
|
+
|
|
308
|
+
# Ollama (or any OpenAI-compatible local server)
|
|
309
|
+
from pydantic_ai.models.openai import OpenAIModel
|
|
310
|
+
from pydantic_ai.providers.openai import OpenAIProvider
|
|
311
|
+
architect = Architect(model=OpenAIModel(
|
|
312
|
+
"llama3.1:8b",
|
|
313
|
+
provider=OpenAIProvider(base_url="http://localhost:11434/v1"),
|
|
314
|
+
))
|
|
315
|
+
|
|
316
|
+
# Deterministic stub for tests
|
|
317
|
+
from pydantic_ai.models.test import TestModel
|
|
318
|
+
architect = Architect(model=TestModel())
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
## CDP burst format
|
|
322
|
+
|
|
323
|
+
`CDPNetworkEvent` is the only input shape the library cares about:
|
|
324
|
+
|
|
325
|
+
```python
|
|
326
|
+
CDPNetworkEvent(
|
|
327
|
+
request_id="<unique-id>",
|
|
328
|
+
url="https://...",
|
|
329
|
+
method="GET",
|
|
330
|
+
status_code=200,
|
|
331
|
+
headers={"content-type": "application/json"},
|
|
332
|
+
body={"data": {"...": "..."}}, # parsed JSON
|
|
333
|
+
frame_origin=None, # set for iframe traffic
|
|
334
|
+
target_id=None, # CDP target id, for multi-frame disambiguation
|
|
335
|
+
timestamp=1715760000.0,
|
|
336
|
+
)
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
The library does not capture CDP traffic itself. Use a sibling tool —
|
|
340
|
+
e.g. **[axumquant/cdp-network-interceptor](https://github.com/axumquant/cdp-network-interceptor)**
|
|
341
|
+
— or your own Chrome extension / Puppeteer / Playwright session that
|
|
342
|
+
emits this shape.
|
|
343
|
+
|
|
344
|
+
## Self-healing flow
|
|
345
|
+
|
|
346
|
+
When does the Healer fire?
|
|
347
|
+
|
|
348
|
+
1. The Eavesdropper validates an incoming event and detects missing
|
|
349
|
+
fields against a registered signature.
|
|
350
|
+
2. It emits `ExtractionFailed` and returns it from `ingest()`.
|
|
351
|
+
3. Your orchestrator passes the failed event (plus the raw response
|
|
352
|
+
body) to `Healer.diagnose()`.
|
|
353
|
+
4. The Healer runs **structural** matching first (same key still exists?
|
|
354
|
+
then we just need a path tweak). If everything resolves
|
|
355
|
+
structurally, no LLM call happens.
|
|
356
|
+
5. Otherwise the Healer calls its pydantic-ai Agent with the old key
|
|
357
|
+
map + new available keys + unresolved field names.
|
|
358
|
+
6. The returned `HealerPatch` has an aggregate confidence:
|
|
359
|
+
- `≥ auto_approve_above` (default 0.90) → `apply_patch()` succeeds,
|
|
360
|
+
emits `SchemaHealed`, signature is replaced in-place.
|
|
361
|
+
- `[min_semantic_confidence, require_human_review_below)` (default
|
|
362
|
+
0.70–0.75) → `apply_patch()` returns `HealingFailed` with reason
|
|
363
|
+
`requires human review`. Surface this to the user.
|
|
364
|
+
- `< min_semantic_confidence` → site is marked DEGRADED, retried
|
|
365
|
+
up to `max_attempts` times, then marked BROKEN.
|
|
366
|
+
7. Persistence is the caller's job — the library mutates the
|
|
367
|
+
`MappedSite` aggregate in memory but doesn't write it anywhere.
|
|
368
|
+
|
|
369
|
+
## Use cases
|
|
370
|
+
|
|
371
|
+
- **Salesforce custom-object extraction** — Salesforce's API surface is
|
|
372
|
+
huge and per-tenant. Onboard once against the tenant you have a
|
|
373
|
+
session on, extract from then on.
|
|
374
|
+
- **HubSpot scraping** — undocumented internal endpoints powering the UI.
|
|
375
|
+
- **Internal CRM discovery** — your customer is on some no-name CRM you've
|
|
376
|
+
never seen. Onboarding takes minutes.
|
|
377
|
+
- **Pre-acquisition portal audits** — point it at a target's admin
|
|
378
|
+
portal, get back a structured map of their data surface.
|
|
379
|
+
- **Partner integrations** with companies who refuse to ship an API.
|
|
380
|
+
|
|
381
|
+
## Pitfalls
|
|
382
|
+
|
|
383
|
+
- **The Architect costs money** — it's an LLM call with a non-trivial
|
|
384
|
+
prompt + context. Budget for one call per site you map. The
|
|
385
|
+
Eavesdropper is free; the Healer only fires when something breaks.
|
|
386
|
+
- **Schema drift is real** — sites change shapes monthly. Wire the
|
|
387
|
+
Healer or you'll be debugging in production.
|
|
388
|
+
- **Auth-protected endpoints** — the library never authenticates for
|
|
389
|
+
you. You drive a real browser session; the CDP forwarder captures
|
|
390
|
+
authenticated traffic. The library only sees the resulting bodies.
|
|
391
|
+
- **Rate limits** — your scraping cadence is your problem. Polite
|
|
392
|
+
pacing is on you.
|
|
393
|
+
- **Iframe traffic** — the library handles `frame_origin` matching
|
|
394
|
+
correctly, but your CDP forwarder MUST populate it. Without
|
|
395
|
+
`frame_origin`, iframe responses match parent-frame signatures, which
|
|
396
|
+
produces garbage extractions.
|
|
397
|
+
- **The vocabulary matters** — generic CRUD works for most sites, but
|
|
398
|
+
niche portals benefit a lot from a custom vocabulary that names the
|
|
399
|
+
domain entities (e.g. `invoice_line_items` vs generic `list_records`).
|
|
400
|
+
|
|
401
|
+
## License
|
|
402
|
+
|
|
403
|
+
MIT — see [LICENSE](LICENSE).
|