agentatlas 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentatlas-0.1.0/PKG-INFO +92 -0
- agentatlas-0.1.0/README.md +73 -0
- agentatlas-0.1.0/agentatlas/__init__.py +4 -0
- agentatlas-0.1.0/agentatlas/atlas.py +403 -0
- agentatlas-0.1.0/agentatlas/crawler.py +164 -0
- agentatlas-0.1.0/agentatlas/executor.py +53 -0
- agentatlas-0.1.0/agentatlas/schemas.py +149 -0
- agentatlas-0.1.0/agentatlas/seed_registry.py +100 -0
- agentatlas-0.1.0/agentatlas/supabase_client.py +9 -0
- agentatlas-0.1.0/agentatlas.egg-info/PKG-INFO +92 -0
- agentatlas-0.1.0/agentatlas.egg-info/SOURCES.txt +14 -0
- agentatlas-0.1.0/agentatlas.egg-info/dependency_links.txt +1 -0
- agentatlas-0.1.0/agentatlas.egg-info/requires.txt +5 -0
- agentatlas-0.1.0/agentatlas.egg-info/top_level.txt +1 -0
- agentatlas-0.1.0/pyproject.toml +32 -0
- agentatlas-0.1.0/setup.cfg +4 -0
agentatlas-0.1.0/PKG-INFO
@@ -0,0 +1,92 @@
Metadata-Version: 2.4
Name: agentatlas
Version: 0.1.0
Summary: Shared browser interaction schema registry for AI agents. Reduces LLM token usage by 80-100% on known sites.
License: MIT
Project-URL: Homepage, https://github.com/yourusername/agentatlas
Project-URL: Issues, https://github.com/yourusername/agentatlas/issues
Keywords: ai,agents,browser-automation,llm,playwright
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.10
Description-Content-Type: text/markdown
Requires-Dist: supabase>=2.6.0
Requires-Dist: python-dotenv>=1.0.1
Requires-Dist: playwright>=1.44.0
Requires-Dist: openai>=1.109.1
Requires-Dist: playwright-stealth>=1.0.6

(Lines 20-92 are the package long description, identical to agentatlas-0.1.0/README.md below.)
agentatlas-0.1.0/README.md
@@ -0,0 +1,73 @@
# AgentAtlas

**Shared browser interaction schema registry for AI agents.**

Reduces LLM token usage by 80-100% on known sites by storing and sharing site interaction schemas across all users.

## How it works
```
First user → LLM learns the site → saved to shared registry
Every user after → 0 tokens, instant response
```

## Benchmark results (real data)

| Metric | Without AgentAtlas | With AgentAtlas |
|---|---|---|
| Tokens | 2,597 | 0-445 |
| Cost | $0.018 | $0.000-$0.002 |
| Time | 19s | 0.2-12s |
| Real URLs | ❌ | ✅ |

**82.9% token reduction** when the LLM is still needed. **100% reduction** for repeat workflows.

## Install
```bash
pip install agentatlas
playwright install chromium
```

## Usage
```python
from agentatlas.atlas import Atlas

atlas = Atlas()

# Get schema for any site
# Found in registry → 0 tokens
# New site → learns once, saves for everyone
schema = await atlas.get_schema(
    site="greenhouse.io",
    url="https://boards.greenhouse.io/anthropic"
)

# Pass compact schema to YOUR LLM
# 150-500 tokens instead of 50,000
print(schema.elements)
print(schema.tokens_used)  # 0 if registry hit
print(schema.source)       # "registry" or "llm_learned"
```

## Environment variables
```bash
SUPABASE_URL=your_supabase_url
SUPABASE_SERVICE_ROLE_KEY=your_key
OPENAI_API_KEY=your_key
```

## The flywheel
```
More developers use AgentAtlas
↓
More new sites get learned automatically
↓
Registry grows → higher hit rate
↓
Fewer tokens burned across the whole network
↓
Cheaper + faster → more developers adopt
```

## License

MIT
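To make the Usage section above concrete: `get_schema` is a coroutine, so it needs an async entry point, and the intended pattern is to hand the compact `schema.elements` to your own model in place of a raw DOM dump. A minimal end-to-end sketch (illustrative, not part of the package; it assumes the environment variables above are set, and the task prompt is a made-up example):

```python
import asyncio
import json

from openai import OpenAI
from agentatlas.atlas import Atlas

async def main():
    atlas = Atlas()
    schema = await atlas.get_schema(
        site="greenhouse.io",
        url="https://boards.greenhouse.io/anthropic",
    )

    # The compact schema (a few hundred tokens at most) stands in for the
    # raw DOM (tens of thousands of tokens) in the planning prompt.
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": "Using these labeled selectors, plan the Playwright calls "
                       "needed to extract every job title and URL:\n"
                       + json.dumps(schema.elements, indent=1),
        }],
    )
    print(f"schema source: {schema.source}, tokens: {schema.tokens_used}")
    print(response.choices[0].message.content)

asyncio.run(main())
```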
agentatlas-0.1.0/agentatlas/atlas.py
@@ -0,0 +1,403 @@
"""
atlas.py — The complete AgentAtlas SDK

Flow:
1. Developer calls get_schema(site, url)
2. Check DB → found? return immediately (0 tokens)
3. Not found? → crawl page → LLM labels elements → save to DB → return
4. Every developer after gets it free

This is the flywheel.
"""

import re
import json
import os
import asyncio
from dataclasses import dataclass
from urllib.parse import urlparse
from dotenv import load_dotenv
from playwright.async_api import async_playwright
try:
    from playwright_stealth import stealth_async
except ImportError:
    stealth_async = None
from openai import OpenAI
from agentatlas.supabase_client import get_supabase

load_dotenv()


# ─────────────────────────────────────────────────────────────
# What gets returned to the developer
# ─────────────────────────────────────────────────────────────
@dataclass
class SiteSchema:
    site: str
    url: str
    route_key: str
    status: str        # "found" | "learned" | "not_found"
    confidence: float
    elements: dict     # what the developer's LLM reads
    source: str        # "registry" | "llm_learned" | "not_found"
    tokens_used: int   # 0 if registry hit, >0 if the LLM had to learn it
    message: str       # human-readable explanation


# ─────────────────────────────────────────────────────────────
# Main SDK class
# ─────────────────────────────────────────────────────────────
class Atlas:
    def __init__(self):
        self.sb = get_supabase()
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # ─────────────────────────────────────────────
    # PUBLIC: main entry point
    # ─────────────────────────────────────────────
    async def get_schema(self, site: str, url: str) -> SiteSchema:
        """
        Get the interaction schema for a site/page.

        If found in registry → returns immediately, 0 tokens.
        If not found → crawls the page, learns the schema,
        saves it to the DB for future users, and returns it.
        """
        print(f"\n[AgentAtlas] Looking up: {site}")

        # ── STEP 1: check registry ──────────────────────────
        schema = await self._fetch_from_registry(site, url)
        if schema:
            print("[AgentAtlas] ✅ Registry hit — 0 tokens used")
            return SiteSchema(
                site=site, url=url,
                route_key=schema["route_key"],
                status="found",
                confidence=schema["confidence"],
                elements=schema["elements"],
                source="registry",
                tokens_used=0,
                message="Schema found in registry. No LLM used."
            )

        # ── STEP 2: not found → crawl + learn ───────────────
        print("[AgentAtlas] ⚠ Not in registry. Crawling and learning...")
        learned = await self._learn_site(site, url)

        if not learned:
            return SiteSchema(
                site=site, url=url,
                route_key="unknown",
                status="not_found",
                confidence=0.0,
                elements={},
                source="not_found",
                tokens_used=0,
                message="Could not learn site. Page may be blocked or empty."
            )

        # ── STEP 3: save to DB for future users ─────────────
        await self._save_to_registry(site, url, learned)
        print("[AgentAtlas] 💾 Saved to registry — next user gets this free")

        return SiteSchema(
            site=site, url=url,
            route_key=learned["route_key"],
            status="learned",
            confidence=0.6,
            elements=learned["elements"],
            source="llm_learned",
            tokens_used=learned["tokens_used"],
            message=f"Schema learned and saved. Tokens used: {learned['tokens_used']}. Free for all future users."
        )

    # ─────────────────────────────────────────────
    # PRIVATE: fetch from registry
    # ─────────────────────────────────────────────
    async def _fetch_from_registry(self, site: str, url: str) -> dict | None:
        # look up the site
        site_row = self.sb.table("sites")\
            .select("id")\
            .eq("domain", site)\
            .limit(1).execute().data
        if not site_row:
            return None
        site_id = site_row[0]["id"]

        # match the url to a route
        routes = self.sb.table("page_routes")\
            .select("id, route_key, path_pattern")\
            .eq("site_id", site_id)\
            .execute().data
        matched = self._match_route(url, routes)
        if not matched:
            return None

        # fetch the best active playbook
        playbooks = self.sb.table("playbooks")\
            .select("payload, confidence")\
            .eq("site_id", site_id)\
            .eq("route_id", matched["id"])\
            .eq("status", "active")\
            .order("confidence", desc=True)\
            .limit(1).execute().data
        if not playbooks:
            return None

        payload = playbooks[0]["payload"]
        elements = self._build_elements(payload)
        if not elements:
            return None

        return {
            "route_key":  matched["route_key"],
            "confidence": playbooks[0]["confidence"],
            "elements":   elements,
        }

    # ─────────────────────────────────────────────
    # PRIVATE: crawl page + ask LLM to label
    # ─────────────────────────────────────────────
    async def _learn_site(self, site: str, url: str) -> dict | None:
        # crawl the page
        elements_raw = await self._crawl_page(url)
        if not elements_raw or len(elements_raw) < 10:
            print(f"[AgentAtlas] ❌ Too few elements ({len(elements_raw) if elements_raw else 0}) — page may be blocked")
            return None

        print(f"[AgentAtlas] 🔍 Crawled {len(elements_raw)} elements — asking LLM to label...")

        # ask the LLM to label elements
        prompt = f"""
You are building a browser automation schema for {site}.
Below are raw DOM elements from the page at: {url}

Label the KEY interactive elements found on this page.
Be specific about what each element is for.

Return ONLY valid JSON — no markdown, no explanation:
{{
  "route_key": "job_list | job_detail | search | product | home | other",
  "elements": {{
    "element_purpose": {{
      "type": "css | role | text | aria_label | data_testid",
      "selector": "the actual selector value",
      "confidence": 0.0
    }}
  }}
}}

Rules:
- Only include elements you are confident about (confidence >= 0.5)
- Use stable selectors (aria-label, data-testid, name attr, semantic text)
- Avoid fragile selectors (hashed CSS classes like _abc123)
- Include 3-8 elements maximum — only the important ones

DOM elements:
{json.dumps(elements_raw[:80], indent=1)}
"""

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            response_format={"type": "json_object"},
        )

        tokens_used = response.usage.total_tokens
        raw = response.choices[0].message.content.strip()
        labeled = json.loads(raw)

        print(f"[AgentAtlas] 🤖 LLM labeled {len(labeled.get('elements', {}))} elements ({tokens_used} tokens)")

        return {
            "route_key":   labeled.get("route_key", "unknown"),
            "elements":    labeled.get("elements", {}),
            "tokens_used": tokens_used,
            "raw_payload": labeled,
        }

    # ─────────────────────────────────────────────
    # PRIVATE: save learned schema to DB
    # ─────────────────────────────────────────────
    async def _save_to_registry(self, site: str, url: str, learned: dict):
        try:
            # upsert site
            self.sb.table("sites").upsert(
                {"domain": site, "display_name": site},
                on_conflict="domain"
            ).execute()

            site_row = self.sb.table("sites")\
                .select("id").eq("domain", site)\
                .limit(1).execute().data
            site_id = site_row[0]["id"]

            # detect route from url path
            path = urlparse(url).path or "/"
            route_key = learned.get("route_key", "unknown")

            # upsert route
            self.sb.table("page_routes").upsert(
                {
                    "site_id":      site_id,
                    "route_key":    route_key,
                    "path_pattern": f"^{re.escape(path)}",
                    "example_url":  url,
                },
                on_conflict="site_id,route_key"
            ).execute()

            route_row = self.sb.table("page_routes")\
                .select("id")\
                .eq("site_id", site_id)\
                .eq("route_key", route_key)\
                .limit(1).execute().data
            route_id = route_row[0]["id"]

            # upsert task (generic)
            self.sb.table("tasks").upsert(
                {"task_key": "generic_extract", "description": "Generic extraction task"},
                on_conflict="task_key"
            ).execute()

            task_row = self.sb.table("tasks")\
                .select("id").eq("task_key", "generic_extract")\
                .limit(1).execute().data
            task_id = task_row[0]["id"]

            # build locators payload from labeled elements
            locators = {}
            for purpose, info in learned.get("elements", {}).items():
                if info.get("confidence", 0) >= 0.5:
                    locators[purpose] = [{
                        "type":       info.get("type"),
                        "value":      info.get("selector"),
                        "priority":   1,
                        "confidence": info.get("confidence"),
                    }]

            # upsert playbook
            self.sb.table("playbooks").upsert(
                {
                    "site_id":     site_id,
                    "route_id":    route_id,
                    "task_id":     task_id,
                    "variant_key": "desktop_enUS_loggedout",
                    "version":     1,
                    "status":      "active",
                    "confidence":  0.6,
                    "ttl_days":    14,
                    "payload": {
                        "locators":           locators,
                        "fingerprint_source": "llm_learned",
                        "source_url":         url,
                    },
                },
                on_conflict="site_id,route_id,task_id,variant_key,version"
            ).execute()

        except Exception as e:
            print(f"[AgentAtlas] ⚠ Save failed: {e}")

    # ─────────────────────────────────────────────
    # PRIVATE: crawl a page with Playwright
    # ─────────────────────────────────────────────
    async def _crawl_page(self, url: str) -> list[dict]:
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=False,
                    args=["--disable-blink-features=AutomationControlled", "--no-sandbox"]
                )
                context = await browser.new_context(
                    user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
                    viewport={"width": 1280, "height": 800},
                    locale="en-US",
                    timezone_id="America/New_York",
                )
                page = await context.new_page()

                if stealth_async:
                    await stealth_async(page)
                    print("[AgentAtlas] 🥷 Stealth mode active")

                await page.route("**/*.{png,jpg,jpeg,gif,svg,woff,woff2}", lambda r: r.abort())

                try:
                    await page.goto(url, wait_until="networkidle", timeout=30000)
                except Exception:
                    pass

                for wait_ms in [2000, 2000, 2000]:
                    await page.wait_for_timeout(wait_ms)
                    count = await page.evaluate("() => document.querySelectorAll('a, button').length")
                    print(f"[AgentAtlas] ⏳ Elements so far: {count}")
                    if count > 10:
                        break

                elements = await page.evaluate("""
                    () => {
                        const sel = 'a, button, input, select, textarea, h1, h2, h3, [role=button], [role=link], [role=listitem]'
                        return Array.from(document.querySelectorAll(sel))
                            .filter(el => el.offsetParent !== null)
                            .slice(0, 120)
                            .map(el => ({
                                tag: el.tagName.toLowerCase(),
                                role: el.getAttribute('role'),
                                aria_label: el.getAttribute('aria-label'),
                                placeholder: el.getAttribute('placeholder'),
                                data_testid: el.getAttribute('data-testid'),
                                name: el.getAttribute('name'),
                                text: el.innerText?.trim().slice(0, 100),
                                href: el.href || null,
                                type: el.type || null,
                            }))
                    }
                """)
                await browser.close()
                return elements
        except Exception as e:
            print(f"[AgentAtlas] Crawl error: {e}")
            return []

    # ─────────────────────────────────────────────
    # PRIVATE: match url to stored route pattern
    # ─────────────────────────────────────────────
    def _match_route(self, url: str, routes: list) -> dict | None:
        try:
            path = urlparse(url).path
        except Exception:
            path = url
        for route in routes:
            try:
                if re.search(route["path_pattern"], path):
                    return route
            except Exception:
                continue
        return routes[0] if routes else None

    # ─────────────────────────────────────────────
    # PRIVATE: build clean elements dict from payload
    # ─────────────────────────────────────────────
    def _build_elements(self, payload: dict) -> dict:
        locators = payload.get("locators", {})
        elements = {}
        for purpose, locs in locators.items():
            if not locs:
                continue
            best = sorted(locs, key=lambda x: x.get("priority", 99))[0]
            elements[purpose] = {
                "type":       best.get("type"),
                "selector":   best.get("value"),
                "confidence": best.get("confidence", 0.5),
            }
        return elements
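Route matching in `Atlas._match_route` is a plain `re.search` of each stored `path_pattern` against the URL path, falling back to the first stored route when nothing matches. A standalone sketch of the same logic (the two routes below are illustrative, in the shape `_fetch_from_registry` reads from `page_routes`):

```python
import re
from urllib.parse import urlparse

# Illustrative routes; patterns follow the style used in schemas.py.
routes = [
    {"id": 1, "route_key": "job_detail", "path_pattern": r"^/[^/]+/jobs/\d+.*$"},
    {"id": 2, "route_key": "job_list",   "path_pattern": r"^/[^/]+/?$"},
]

def match_route(url: str, routes: list) -> dict | None:
    path = urlparse(url).path
    for route in routes:
        if re.search(route["path_pattern"], path):
            return route
    return routes[0] if routes else None  # same fallback as Atlas._match_route

print(match_route("https://boards.greenhouse.io/anthropic/jobs/123", routes)["route_key"])  # job_detail
print(match_route("https://boards.greenhouse.io/anthropic", routes)["route_key"])           # job_list
```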
agentatlas-0.1.0/agentatlas/crawler.py
@@ -0,0 +1,164 @@
"""
crawler.py - v2
Same as before, but with a better wait strategy for JS-heavy / bot-protected sites.
"""

import asyncio
import json
import os
from dotenv import load_dotenv
from playwright.async_api import async_playwright
from openai import OpenAI
from agentatlas.supabase_client import get_supabase

load_dotenv()

SEED_TARGETS = [
    {"site": "greenhouse.io", "url": "https://boards.greenhouse.io/anthropic", "route_key": "job_list"},
    {"site": "lever.co", "url": "https://jobs.lever.co/netflix", "route_key": "job_list"},
    {"site": "ashbyhq.com", "url": "https://jobs.ashbyhq.com/openai", "route_key": "job_list"},
    {"site": "smartrecruiters.com", "url": "https://www.smartrecruiters.com/Spotify/jobs", "route_key": "job_list"},
]

async def extract_elements(url: str) -> list[dict]:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800},
            locale="en-US",
            timezone_id="America/New_York",
        )
        page = await context.new_page()

        # Block images/fonts to speed up load
        await page.route("**/*.{png,jpg,jpeg,gif,svg,woff,woff2}", lambda r: r.abort())

        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=30000)
        except Exception:
            pass  # continue even on timeout; grab whatever loaded

        # Wait progressively — up to 8 seconds for JS content to render
        for wait_ms in [2000, 3000, 3000]:
            await page.wait_for_timeout(wait_ms)
            count = await page.evaluate("() => document.querySelectorAll('a, button').length")
            if count > 10:
                break

        elements = await page.evaluate("""
            () => {
                const sel = 'a, button, input, select, textarea, h1, h2, h3, [role=button], [role=link], [role=listitem]'
                return Array.from(document.querySelectorAll(sel))
                    .filter(el => el.offsetParent !== null)
                    .slice(0, 120)
                    .map(el => ({
                        tag: el.tagName.toLowerCase(),
                        role: el.getAttribute('role'),
                        aria_label: el.getAttribute('aria-label'),
                        placeholder: el.getAttribute('placeholder'),
                        data_testid: el.getAttribute('data-testid'),
                        name: el.getAttribute('name'),
                        text: el.innerText?.trim().slice(0, 100),
                        href: el.href || null,
                        type: el.type || null,
                        class_hint: el.className?.toString().slice(0, 60),
                    }))
            }
        """)

        await browser.close()
        print(f"  → {len(elements)} elements found on {url}")

        if len(elements) < 10:
            print("  ⚠ Low element count — likely bot-blocked or JS-heavy. Skipping LLM labeling.")
            return []

        return elements

def label_elements(elements: list[dict], site: str, route_key: str) -> dict:
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    prompt = f"""
You are building a browser automation schema for {site} ({route_key} page).
Below are raw DOM elements extracted from the page.

Your job: identify the KEY elements needed for these tasks:
- extract_job_list: find job title links, location text, pagination
- extract_job_detail: find title h1, description block, apply button
- start_application: find the apply/submit button or link

Return ONLY valid JSON in this exact shape, no explanation, no markdown:
{{
  "job_title_links": {{ "type": "...", "value": "...", "confidence": 0.0 }},
  "location_text": {{ "type": "...", "value": "...", "confidence": 0.0 }},
  "apply_button": {{ "type": "...", "value": "...", "confidence": 0.0 }},
  "job_heading": {{ "type": "...", "value": "...", "confidence": 0.0 }},
  "description_block": {{ "type": "...", "value": "...", "confidence": 0.0 }}
}}

For "type" use one of: css, role, text, aria_label, data_testid, placeholder
For "value" use the actual selector value (e.g. "a[data-mapped='true']" or "Apply now")

DOM elements:
{json.dumps(elements[:80], indent=1)}
"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        response_format={"type": "json_object"},
    )

    return json.loads(response.choices[0].message.content.strip())

def update_playbook_payload(site_domain: str, route_key: str, labeled: dict):
    sb = get_supabase()

    site = sb.table("sites").select("id").eq("domain", site_domain).limit(1).execute().data
    if not site:
        print(f"  ⚠ Site not found: {site_domain}")
        return
    site_id = site[0]["id"]

    route = sb.table("page_routes").select("id").eq("site_id", site_id).eq("route_key", route_key).limit(1).execute().data
    if not route:
        print(f"  ⚠ Route not found: {route_key}")
        return
    route_id = route[0]["id"]

    locators = {}
    for purpose, locator in labeled.items():
        if locator.get("confidence", 0) >= 0.4:
            locators[purpose] = [
                {"type": locator["type"], "value": locator["value"], "priority": 1, "confidence": locator["confidence"]}
            ]

    playbooks = sb.table("playbooks").select("id, payload").eq("site_id", site_id).eq("route_id", route_id).execute().data
    for pb in playbooks:
        updated_payload = {**pb["payload"], "locators": locators, "fingerprint_source": "crawled"}
        sb.table("playbooks").update({
            "payload": updated_payload,
            "confidence": 0.6,
            "status": "active",
        }).eq("id", pb["id"]).execute()

    print(f"  ✅ Updated {len(playbooks)} playbook(s) for {site_domain}/{route_key}")

async def main():
    for target in SEED_TARGETS:
        print(f"\n🔍 Crawling {target['site']} — {target['url']}")
        try:
            elements = await extract_elements(target["url"])
            if not elements:
                print("  ⏭ Skipped — not enough elements to label reliably")
                continue
            labeled = label_elements(elements, target["site"], target["route_key"])
            print(f"  GPT-4o labeled {len(labeled)} purposes")
            update_playbook_payload(target["site"], target["route_key"], labeled)
        except Exception as e:
            print(f"  ❌ Failed: {e}")

if __name__ == "__main__":
    asyncio.run(main())
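`update_playbook_payload` keeps only labels the LLM was reasonably confident about (confidence >= 0.4) and reshapes each into the single-entry locator list the playbook payload expects. A small sketch of that filter, using made-up labeled output in the shape `label_elements` returns:

```python
# Made-up LLM labels; the low-confidence entry is dropped by the 0.4 cutoff.
labeled = {
    "job_title_links": {"type": "css", "value": "a[data-mapped='true']", "confidence": 0.9},
    "apply_button":    {"type": "text", "value": "Apply now", "confidence": 0.8},
    "location_text":   {"type": "css", "value": ".location", "confidence": 0.2},
}

locators = {}
for purpose, locator in labeled.items():
    if locator.get("confidence", 0) >= 0.4:
        locators[purpose] = [
            {"type": locator["type"], "value": locator["value"], "priority": 1, "confidence": locator["confidence"]}
        ]

print(list(locators))  # ['job_title_links', 'apply_button']
```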
agentatlas-0.1.0/agentatlas/executor.py
@@ -0,0 +1,53 @@
"""
executor.py
Takes a playbook from Supabase and executes it deterministically.
No LLM calls. Handles scrolling for lazy-loaded content.
"""

import asyncio
from urllib.parse import urljoin

from playwright.async_api import async_playwright

async def scroll_to_bottom(page, max_scrolls: int = 10):
    """Scroll down to trigger lazy-loaded content."""
    for _ in range(max_scrolls):
        prev_height = await page.evaluate("document.body.scrollHeight")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(800)
        new_height = await page.evaluate("document.body.scrollHeight")
        if new_height == prev_height:
            break  # nothing more to load

async def execute_job_list(url: str, selector: str) -> list[dict]:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/124 Safari/537.36",
            viewport={"width": 1280, "height": 800},
        )
        page = await context.new_page()
        await page.goto(url, wait_until="networkidle", timeout=30000)
        await page.wait_for_timeout(2000)

        # Scroll to load all jobs
        await scroll_to_bottom(page)

        elements = await page.query_selector_all(selector)
        jobs = []
        for el in elements:
            raw_text = (await el.inner_text()).strip()
            href = await el.get_attribute("href")

            # Split "Job Title\n\nLocation" into separate fields
            parts = [p.strip() for p in raw_text.split("\n") if p.strip()]
            title = parts[0] if parts else raw_text
            location = parts[1] if len(parts) > 1 else ""

            if title and href:
                jobs.append({
                    "title":    title,
                    "location": location,
                    "url":      urljoin(url, href),  # resolve relative hrefs against the page URL
                })

        await browser.close()
        return jobs
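`execute_job_list` is the deterministic half of the system: given a URL and one selector (in the intended flow, the selector comes out of a stored playbook's `locators`), it returns structured jobs with no LLM in the loop. A minimal driving sketch; the selector here is a hypothetical example, not one fetched from the registry:

```python
import asyncio

from agentatlas.executor import execute_job_list

async def main():
    # Hypothetical selector; in practice it comes from a playbook payload
    # (see atlas.py / schemas.py).
    jobs = await execute_job_list(
        url="https://boards.greenhouse.io/anthropic",
        selector="a[href*='/jobs/']",
    )
    for job in jobs[:5]:
        print(job["title"], "|", job["location"], "|", job["url"])

asyncio.run(main())
```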
agentatlas-0.1.0/agentatlas/schemas.py
@@ -0,0 +1,149 @@
ATS_SITES = [
    {"domain": "greenhouse.io", "display_name": "Greenhouse", "requires_login": False, "anti_bot_hint": "low/medium"},
    {"domain": "lever.co", "display_name": "Lever", "requires_login": False, "anti_bot_hint": "low/medium"},
    {"domain": "myworkdayjobs.com", "display_name": "Workday Jobs", "requires_login": False, "anti_bot_hint": "medium/high"},
    {"domain": "icims.com", "display_name": "iCIMS", "requires_login": False, "anti_bot_hint": "medium"},
    {"domain": "taleo.net", "display_name": "Taleo", "requires_login": False, "anti_bot_hint": "medium/high"},
    {"domain": "smartrecruiters.com", "display_name": "SmartRecruiters", "requires_login": False, "anti_bot_hint": "low/medium"},
    {"domain": "ashbyhq.com", "display_name": "Ashby", "requires_login": False, "anti_bot_hint": "low/medium"},
]

TASKS = [
    {
        "task_key": "extract_job_list",
        "description": "Extract a list of jobs from a job listing page.",
        "input_schema": {"type": "object", "properties": {"url": {"type": "string"}}, "required": ["url"]},
        "output_schema": {
            "type": "object",
            "properties": {
                "jobs": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {"title": {"type": "string"}, "location": {"type": "string"}, "url": {"type": "string"}},
                        "required": ["title", "url"],
                    },
                }
            },
            "required": ["jobs"],
        },
    },
    {
        "task_key": "extract_job_detail",
        "description": "Extract structured job detail from a job detail page.",
        "input_schema": {"type": "object", "properties": {"url": {"type": "string"}}, "required": ["url"]},
        "output_schema": {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "company": {"type": "string"},
                "location": {"type": "string"},
                "description_text": {"type": "string"},
                "apply_url": {"type": "string"},
            },
            "required": ["title", "description_text"],
        },
    },
    {
        "task_key": "start_application",
        "description": "Navigate from job detail page to application form page (or external apply).",
        "input_schema": {"type": "object", "properties": {"url": {"type": "string"}}, "required": ["url"]},
        "output_schema": {"type": "object", "properties": {"application_url": {"type": "string"}}, "required": ["application_url"]},
    },
]

ROUTES_BY_DOMAIN = {
    "greenhouse.io": [
        {"route_key": "job_list", "path_pattern": r"^/[^/]+/?$"},
        {"route_key": "job_detail", "path_pattern": r"^/[^/]+/jobs/\d+.*$"},
    ],
    "lever.co": [
        {"route_key": "job_list", "path_pattern": r"^/[^/]+/?$"},
        {"route_key": "job_detail", "path_pattern": r"^/[^/]+/[^/]+/?$"},
    ],
    "myworkdayjobs.com": [
        {"route_key": "job_list", "path_pattern": r"^/[^/]+/[^/]+/.*$"},
        {"route_key": "job_detail", "path_pattern": r"^/[^/]+/[^/]+/job/.*$"},
    ],
    "icims.com": [
        {"route_key": "job_list", "path_pattern": r"^/jobs/.*$"},
        {"route_key": "job_detail", "path_pattern": r"^/jobs/\d+/.*$"},
    ],
    "taleo.net": [
        {"route_key": "job_list", "path_pattern": r"^/careersection/.*$"},
        {"route_key": "job_detail", "path_pattern": r"^/careersection/.*?/jobdetail\.ftl.*$"},
    ],
    "smartrecruiters.com": [
        {"route_key": "job_list", "path_pattern": r"^/[^/]+/jobs.*$"},
        {"route_key": "job_detail", "path_pattern": r"^/[^/]+/job/.*$"},
    ],
    "ashbyhq.com": [
        {"route_key": "job_list", "path_pattern": r"^/[^/]+/jobs.*$"},
        {"route_key": "job_detail", "path_pattern": r"^/[^/]+/jobs/.*$"},
    ],
}

def default_playbook_payload(task_key: str) -> dict:
    """
    Starter payload template (generic). You’ll refine per ATS platform later.
    """
    if task_key == "extract_job_list":
        return {
            "context": {"viewport": "desktop", "locale": "en-US", "logged_in": False},
            "fingerprint": {"dom_hash": None, "anchors": ["Jobs", "Apply", "Location"]},
            "preconditions": [{"type": "cookie_banner", "action": "accept_if_present"}],
            "locators": {
                "job_links": [
                    {"type": "role", "value": "link", "priority": 1},
                    {"type": "heuristic", "value": "links_that_look_like_job_details()", "priority": 2},
                ]
            },
            "extraction": [{"field": "jobs", "from": "job_links", "transform": "extract_title_location_url"}],
            "validation": [{"field": "jobs", "op": "min_items", "value": 1}],
            "network": None,
        }

    if task_key == "extract_job_detail":
        return {
            "context": {"viewport": "desktop", "locale": "en-US", "logged_in": False},
            "fingerprint": {"dom_hash": None, "anchors": ["Job Description", "Responsibilities", "Apply"]},
            "preconditions": [{"type": "cookie_banner", "action": "accept_if_present"}],
            "locators": {
                "title": [
                    {"type": "role", "value": "heading", "priority": 1},
                    {"type": "css", "value": "h1", "priority": 2},
                ],
                "description": [{"type": "heuristic", "value": "main_content_text_block()", "priority": 1}],
                "apply_link": [
                    {"type": "text", "value": "Apply", "priority": 1},
                    {"type": "role", "value": "link[name~='Apply']", "priority": 2},
                    {"type": "role", "value": "button[name~='Apply']", "priority": 3},
                ],
            },
            "extraction": [
                {"field": "title", "from": "title.text"},
                {"field": "description_text", "from": "description.text"},
                {"field": "apply_url", "from": "apply_link.href", "optional": True},
            ],
            "validation": [{"field": "title", "op": "non_empty"}, {"field": "description_text", "op": "min_len", "value": 200}],
            "network": None,
        }

    if task_key == "start_application":
        return {
            "context": {"viewport": "desktop", "locale": "en-US", "logged_in": False},
            "fingerprint": {"dom_hash": None, "anchors": ["Apply", "Submit", "Continue"]},
            "preconditions": [{"type": "cookie_banner", "action": "accept_if_present"}],
            "locators": {
                "apply_link": [
                    {"type": "text", "value": "Apply", "priority": 1},
                    {"type": "role", "value": "link[name~='Apply']", "priority": 2},
                    {"type": "role", "value": "button[name~='Apply']", "priority": 3},
                ]
            },
            "extraction": [{"field": "application_url", "from": "apply_link.href_or_navigation_url"}],
            "validation": [{"field": "application_url", "op": "regex", "value": r"^https?://"}],
            "network": None,
        }

    return {}
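Each locator list in these default payloads is ordered by `priority`, and `Atlas._build_elements` takes the lowest number as the primary selector. A small sketch of that selection applied to the default `extract_job_detail` payload:

```python
from agentatlas.schemas import default_playbook_payload

payload = default_playbook_payload("extract_job_detail")

# Mirror Atlas._build_elements: keep the best (lowest priority number)
# locator for each purpose.
elements = {}
for purpose, locs in payload["locators"].items():
    best = sorted(locs, key=lambda x: x.get("priority", 99))[0]
    elements[purpose] = {"type": best["type"], "selector": best["value"]}

print(elements["title"])  # {'type': 'role', 'selector': 'heading'}
```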
agentatlas-0.1.0/agentatlas/seed_registry.py
@@ -0,0 +1,100 @@
from dotenv import load_dotenv
from agentatlas.supabase_client import get_supabase
from agentatlas.schemas import ATS_SITES, TASKS, ROUTES_BY_DOMAIN, default_playbook_payload

load_dotenv()

def get_id_by_unique(sb, table: str, col: str, value: str):
    res = sb.table(table).select("id").eq(col, value).limit(1).execute()
    data = res.data or []
    return data[0]["id"] if data else None

def main():
    sb = get_supabase()

    # 1) Upsert tasks
    print("Seeding tasks...")
    for t in TASKS:
        sb.table("tasks").upsert(
            {
                "task_key": t["task_key"],
                "description": t.get("description"),
                "input_schema": t.get("input_schema", {}),
                "output_schema": t.get("output_schema", {}),
            },
            on_conflict="task_key",
        ).execute()

    # 2) Upsert sites, routes, playbooks
    print("Seeding sites, routes, playbooks...")
    for s in ATS_SITES:
        sb.table("sites").upsert(
            {
                "domain": s["domain"],
                "display_name": s.get("display_name"),
                "base_url": s.get("base_url"),
                "requires_login": s.get("requires_login", False),
                "anti_bot_hint": s.get("anti_bot_hint"),
                "notes": s.get("notes"),
            },
            on_conflict="domain",
        ).execute()

        site_id = get_id_by_unique(sb, "sites", "domain", s["domain"])
        if not site_id:
            raise RuntimeError(f"Failed to fetch site_id for domain={s['domain']}")

        routes = ROUTES_BY_DOMAIN.get(s["domain"], [])
        for r in routes:
            sb.table("page_routes").upsert(
                {
                    "site_id": site_id,
                    "route_key": r["route_key"],
                    "path_pattern": r["path_pattern"],
                    "example_url": r.get("example_url"),
                },
                on_conflict="site_id,route_key",
            ).execute()

            route_id = (
                sb.table("page_routes")
                .select("id")
                .eq("site_id", site_id)
                .eq("route_key", r["route_key"])
                .limit(1)
                .execute()
                .data[0]["id"]
            )

            for task in TASKS:
                task_key = task["task_key"]

                # Map which tasks apply to which routes
                if r["route_key"] == "job_list" and task_key not in ("extract_job_list",):
                    continue
                if r["route_key"] == "job_detail" and task_key not in ("extract_job_detail", "start_application"):
                    continue

                task_id = get_id_by_unique(sb, "tasks", "task_key", task_key)
                if not task_id:
                    raise RuntimeError(f"Failed to fetch task_id for task_key={task_key}")

                sb.table("playbooks").upsert(
                    {
                        "site_id": site_id,
                        "route_id": route_id,
                        "task_id": task_id,
                        "variant_key": "desktop_enUS_loggedout",
                        "version": 1,
                        "status": "experimental",
                        "confidence": 0.2,
                        "ttl_days": 14,
                        "payload": default_playbook_payload(task_key),
                    },
                    on_conflict="site_id,route_id,task_id,variant_key,version",
                ).execute()

    print("✅ Seed complete.")

if __name__ == "__main__":
    main()
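After a successful seed run, every (site, route, task) combination allowed by the mapping in `main()` should have an experimental playbook row. A quick sanity-check sketch reusing the package's own client helper (row counts depend on your seed data):

```python
from agentatlas.supabase_client import get_supabase

sb = get_supabase()
for table in ("sites", "page_routes", "tasks", "playbooks"):
    rows = sb.table(table).select("id").execute().data or []
    print(f"{table}: {len(rows)} rows")
```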
agentatlas-0.1.0/agentatlas/supabase_client.py
@@ -0,0 +1,9 @@
import os
from supabase import create_client, Client

def get_supabase() -> Client:
    url = os.getenv("SUPABASE_URL", "").strip()
    key = os.getenv("SUPABASE_SERVICE_ROLE_KEY", "").strip()
    if not url or not key:
        raise RuntimeError("Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY in environment.")
    return create_client(url, key)
agentatlas-0.1.0/agentatlas.egg-info/PKG-INFO
@@ -0,0 +1,92 @@
(Byte-for-byte identical to agentatlas-0.1.0/PKG-INFO above.)
agentatlas-0.1.0/agentatlas.egg-info/SOURCES.txt
@@ -0,0 +1,14 @@
README.md
pyproject.toml
agentatlas/__init__.py
agentatlas/atlas.py
agentatlas/crawler.py
agentatlas/executor.py
agentatlas/schemas.py
agentatlas/seed_registry.py
agentatlas/supabase_client.py
agentatlas.egg-info/PKG-INFO
agentatlas.egg-info/SOURCES.txt
agentatlas.egg-info/dependency_links.txt
agentatlas.egg-info/requires.txt
agentatlas.egg-info/top_level.txt
agentatlas-0.1.0/agentatlas.egg-info/dependency_links.txt
@@ -0,0 +1 @@
(one blank line)

agentatlas-0.1.0/agentatlas.egg-info/requires.txt
@@ -0,0 +1,5 @@
supabase>=2.6.0
python-dotenv>=1.0.1
playwright>=1.44.0
openai>=1.109.1
playwright-stealth>=1.0.6
agentatlas-0.1.0/agentatlas.egg-info/top_level.txt
@@ -0,0 +1 @@
agentatlas
agentatlas-0.1.0/pyproject.toml
@@ -0,0 +1,32 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "agentatlas"
version = "0.1.0"
description = "Shared browser interaction schema registry for AI agents. Reduces LLM token usage by 80-100% on known sites."
readme = "README.md"
license = {text = "MIT"}
requires-python = ">=3.10"
keywords = ["ai", "agents", "browser-automation", "llm", "playwright"]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
dependencies = [
    "supabase>=2.6.0",
    "python-dotenv>=1.0.1",
    "playwright>=1.44.0",
    "openai>=1.109.1",
    "playwright-stealth>=1.0.6",
]

[project.urls]
Homepage = "https://github.com/yourusername/agentatlas"
Issues = "https://github.com/yourusername/agentatlas/issues"

[tool.setuptools.packages.find]
where = ["."]
include = ["agentatlas*"]