agentatlas-0.1.0.tar.gz

+++ PKG-INFO
@@ -0,0 +1,92 @@
+ Metadata-Version: 2.4
+ Name: agentatlas
+ Version: 0.1.0
+ Summary: Shared browser interaction schema registry for AI agents. Reduces LLM token usage by 80-100% on known sites.
+ License: MIT
+ Project-URL: Homepage, https://github.com/yourusername/agentatlas
+ Project-URL: Issues, https://github.com/yourusername/agentatlas/issues
+ Keywords: ai,agents,browser-automation,llm,playwright
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: supabase>=2.6.0
+ Requires-Dist: python-dotenv>=1.0.1
+ Requires-Dist: playwright>=1.44.0
+ Requires-Dist: openai>=1.109.1
+ Requires-Dist: playwright-stealth>=1.0.6
+
+ # AgentAtlas
+
+ **Shared browser interaction schema registry for AI agents.**
+
+ Reduces LLM token usage by 80-100% on known sites by storing and sharing site interaction schemas across all users.
+
+ ## How it works
+ ```
+ First user → LLM learns the site → saved to shared registry
+ Every user after → 0 tokens, instant response
+ ```
+
+ ## Benchmark results (real data)
+
+ | | Without AgentAtlas | With AgentAtlas |
+ |---|---|---|
+ | Tokens | 2,597 | 0-445 |
+ | Cost | $0.018 | $0.000-$0.002 |
+ | Time | 19s | 0.2-12s |
+ | Real URLs | ❌ | ✅ |
+
+ **82.9% token reduction** when the LLM is still needed. **100% reduction** for repeat workflows.
+
+ ## Install
+ ```bash
+ pip install agentatlas
+ playwright install chromium
+ ```
+
+ ## Usage
+ ```python
+ from agentatlas.atlas import Atlas
+
+ atlas = Atlas()
+
+ # Get schema for any site
+ # Found in registry → 0 tokens
+ # New site → learns once, saves for everyone
+ schema = await atlas.get_schema(
+     site="greenhouse.io",
+     url="https://boards.greenhouse.io/anthropic"
+ )
+
+ # Pass the compact schema to YOUR LLM
+ # 150-500 tokens instead of 50,000
+ print(schema.elements)
+ print(schema.tokens_used)  # 0 if registry hit
+ print(schema.source)       # "registry" or "llm_learned"
+ ```
+
+ ## Environment variables
+ ```bash
+ SUPABASE_URL=your_supabase_url
+ SUPABASE_SERVICE_ROLE_KEY=your_key
+ OPENAI_API_KEY=your_key
+ ```
+
+ ## The flywheel
+ ```
+ More developers use AgentAtlas
+
+ More new sites get learned automatically
+
+ Registry grows → higher hit rate
+
+ Fewer tokens burned across the whole network
+
+ Cheaper + faster → more developers adopt
+ ```
+
+ ## License
+
+ MIT
+++ README.md
@@ -0,0 +1,73 @@
+ # AgentAtlas
+
+ **Shared browser interaction schema registry for AI agents.**
+
+ Reduces LLM token usage by 80-100% on known sites by storing and sharing site interaction schemas across all users.
+
+ ## How it works
+ ```
+ First user → LLM learns the site → saved to shared registry
+ Every user after → 0 tokens, instant response
+ ```
+
+ ## Benchmark results (real data)
+
+ | | Without AgentAtlas | With AgentAtlas |
+ |---|---|---|
+ | Tokens | 2,597 | 0-445 |
+ | Cost | $0.018 | $0.000-$0.002 |
+ | Time | 19s | 0.2-12s |
+ | Real URLs | ❌ | ✅ |
+
+ **82.9% token reduction** when the LLM is still needed. **100% reduction** for repeat workflows.
+
+ ## Install
+ ```bash
+ pip install agentatlas
+ playwright install chromium
+ ```
+
+ ## Usage
+ ```python
+ from agentatlas.atlas import Atlas
+
+ atlas = Atlas()
+
+ # Get schema for any site
+ # Found in registry → 0 tokens
+ # New site → learns once, saves for everyone
+ schema = await atlas.get_schema(
+     site="greenhouse.io",
+     url="https://boards.greenhouse.io/anthropic"
+ )
+
+ # Pass the compact schema to YOUR LLM
+ # 150-500 tokens instead of 50,000
+ print(schema.elements)
+ print(schema.tokens_used)  # 0 if registry hit
+ print(schema.source)       # "registry" or "llm_learned"
+ ```
+
+ ## Environment variables
+ ```bash
+ SUPABASE_URL=your_supabase_url
+ SUPABASE_SERVICE_ROLE_KEY=your_key
+ OPENAI_API_KEY=your_key
+ ```
+
+ ## The flywheel
+ ```
+ More developers use AgentAtlas
+
+ More new sites get learned automatically
+
+ Registry grows → higher hit rate
+
+ Fewer tokens burned across the whole network
+
+ Cheaper + faster → more developers adopt
+ ```
+
+ ## License
+
+ MIT
+++ agentatlas/__init__.py
@@ -0,0 +1,4 @@
+ from agentatlas.atlas import Atlas, SiteSchema
+
+ __all__ = ["Atlas", "SiteSchema"]
+ __version__ = "0.1.0"
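Because `__init__.py` re-exports the two public names, the SDK can also be imported from the package root rather than the `agentatlas.atlas` submodule that the README uses; a minimal sketch:

```python
from agentatlas import Atlas, SiteSchema

# Construction assumes SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY and
# OPENAI_API_KEY are set in the environment (or in a local .env file).
atlas = Atlas()
```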
+++ agentatlas/atlas.py
@@ -0,0 +1,403 @@
+ """
+ atlas.py — The complete AgentAtlas SDK
+
+ Flow:
+ 1. Developer calls get_schema(site, url)
+ 2. Check DB → found? return immediately (0 tokens)
+ 3. Not found? → crawl page → LLM labels elements → save to DB → return
+ 4. Every developer after gets it free
+
+ This is the flywheel.
+ """
+
+ import re
+ import json
+ import os
+ import asyncio
+ from dataclasses import dataclass
+ from urllib.parse import urlparse
+ from dotenv import load_dotenv
+ from playwright.async_api import async_playwright
+ try:
+     from playwright_stealth import stealth_async
+ except ImportError:
+     stealth_async = None
+ from openai import OpenAI
+ from agentatlas.supabase_client import get_supabase
+
+ load_dotenv()
+
+
+ # ─────────────────────────────────────────────────────────────
+ # What gets returned to the developer
+ # ─────────────────────────────────────────────────────────────
+ @dataclass
+ class SiteSchema:
+     site: str
+     url: str
+     route_key: str
+     status: str        # "found" | "learned" | "not_found"
+     confidence: float
+     elements: dict     # what the developer's LLM reads
+     source: str        # "registry" | "llm_learned" | "not_found"
+     tokens_used: int   # 0 if registry hit, >0 if LLM had to learn it
+     message: str       # human-readable explanation
+
+
+ # ─────────────────────────────────────────────────────────────
+ # Main SDK class
+ # ─────────────────────────────────────────────────────────────
+ class Atlas:
+     def __init__(self):
+         self.sb = get_supabase()
+         self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+     # ─────────────────────────────────────────────
+     # PUBLIC: main entry point
+     # ─────────────────────────────────────────────
+     async def get_schema(self, site: str, url: str) -> SiteSchema:
+         """
+         Get the interaction schema for a site/page.
+
+         If found in registry → returns immediately, 0 tokens
+         If not found → crawls page, learns schema,
+                        saves to DB for future users,
+                        returns schema
+         """
+
+         print(f"\n[AgentAtlas] Looking up: {site}")
+
+         # ── STEP 1: check registry ──────────────────────────
+         schema = await self._fetch_from_registry(site, url)
+         if schema:
+             print("[AgentAtlas] ✅ Registry hit — 0 tokens used")
+             return SiteSchema(
+                 site=site, url=url,
+                 route_key=schema["route_key"],
+                 status="found",
+                 confidence=schema["confidence"],
+                 elements=schema["elements"],
+                 source="registry",
+                 tokens_used=0,
+                 message="Schema found in registry. No LLM used."
+             )
+
+         # ── STEP 2: not found → crawl + learn ───────────────
+         print("[AgentAtlas] ⚠ Not in registry. Crawling and learning...")
+         learned = await self._learn_site(site, url)
+
+         if not learned:
+             return SiteSchema(
+                 site=site, url=url,
+                 route_key="unknown",
+                 status="not_found",
+                 confidence=0.0,
+                 elements={},
+                 source="not_found",
+                 tokens_used=0,
+                 message="Could not learn site. Page may be blocked or empty."
+             )
+
+         # ── STEP 3: save to DB for future users ─────────────
+         await self._save_to_registry(site, url, learned)
+         print("[AgentAtlas] 💾 Saved to registry — next user gets this free")
+
+         return SiteSchema(
+             site=site, url=url,
+             route_key=learned["route_key"],
+             status="learned",
+             confidence=0.6,
+             elements=learned["elements"],
+             source="llm_learned",
+             tokens_used=learned["tokens_used"],
+             message=f"Schema learned and saved. Tokens used: {learned['tokens_used']}. Free for all future users."
+         )
+
+
+     # ─────────────────────────────────────────────
+     # PRIVATE: fetch from registry
+     # ─────────────────────────────────────────────
+     async def _fetch_from_registry(self, site: str, url: str) -> dict | None:
+         # look up site
+         site_row = self.sb.table("sites")\
+             .select("id")\
+             .eq("domain", site)\
+             .limit(1).execute().data
+         if not site_row:
+             return None
+         site_id = site_row[0]["id"]
+
+         # match url to route
+         routes = self.sb.table("page_routes")\
+             .select("id, route_key, path_pattern")\
+             .eq("site_id", site_id)\
+             .execute().data
+         matched = self._match_route(url, routes)
+         if not matched:
+             return None
+
+         # fetch best active playbook
+         playbooks = self.sb.table("playbooks")\
+             .select("payload, confidence")\
+             .eq("site_id", site_id)\
+             .eq("route_id", matched["id"])\
+             .eq("status", "active")\
+             .order("confidence", desc=True)\
+             .limit(1).execute().data
+         if not playbooks:
+             return None
+
+         payload = playbooks[0]["payload"]
+         elements = self._build_elements(payload)
+         if not elements:
+             return None
+
+         return {
+             "route_key" : matched["route_key"],
+             "confidence": playbooks[0]["confidence"],
+             "elements"  : elements,
+         }
+
+
+     # ─────────────────────────────────────────────
+     # PRIVATE: crawl page + ask LLM to label
+     # ─────────────────────────────────────────────
+     async def _learn_site(self, site: str, url: str) -> dict | None:
+         # crawl the page
+         elements_raw = await self._crawl_page(url)
+         if not elements_raw or len(elements_raw) < 10:
+             print(f"[AgentAtlas] ❌ Too few elements ({len(elements_raw) if elements_raw else 0}) — page may be blocked")
+             return None
+
+         print(f"[AgentAtlas] 🔍 Crawled {len(elements_raw)} elements — asking LLM to label...")
+
+         # ask LLM to label elements
+         prompt = f"""
+         You are building a browser automation schema for {site}.
+         Below are raw DOM elements from the page at: {url}
+
+         Label the KEY interactive elements found on this page.
+         Be specific about what each element is for.
+
+         Return ONLY valid JSON — no markdown, no explanation:
+         {{
+           "route_key": "job_list | job_detail | search | product | home | other",
+           "elements": {{
+             "element_purpose": {{
+               "type": "css | role | text | aria_label | data_testid",
+               "selector": "the actual selector value",
+               "confidence": 0.0
+             }}
+           }}
+         }}
+
+         Rules:
+         - Only include elements you are confident about (confidence >= 0.5)
+         - Use stable selectors (aria-label, data-testid, name attr, semantic text)
+         - Avoid fragile selectors (hashed CSS classes like _abc123)
+         - Include 3-8 elements maximum — only the important ones
+
+         DOM elements:
+         {json.dumps(elements_raw[:80], indent=1)}
+         """
+
+         response = self.client.chat.completions.create(
+             model="gpt-4o",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0,
+             response_format={"type": "json_object"},
+         )
+
+         tokens_used = response.usage.total_tokens
+         raw = response.choices[0].message.content.strip()
+         labeled = json.loads(raw)
+
+         print(f"[AgentAtlas] 🤖 LLM labeled {len(labeled.get('elements', {}))} elements ({tokens_used} tokens)")
+
+         return {
+             "route_key"  : labeled.get("route_key", "unknown"),
+             "elements"   : labeled.get("elements", {}),
+             "tokens_used": tokens_used,
+             "raw_payload": labeled,
+         }
+
+
+     # ─────────────────────────────────────────────
+     # PRIVATE: save learned schema to DB
+     # ─────────────────────────────────────────────
+     async def _save_to_registry(self, site: str, url: str, learned: dict):
+         try:
+             # upsert site
+             self.sb.table("sites").upsert(
+                 {"domain": site, "display_name": site},
+                 on_conflict="domain"
+             ).execute()
+
+             site_row = self.sb.table("sites")\
+                 .select("id").eq("domain", site)\
+                 .limit(1).execute().data
+             site_id = site_row[0]["id"]
+
+             # detect route from url path
+             path = urlparse(url).path or "/"
+             route_key = learned.get("route_key", "unknown")
+
+             # upsert route
+             self.sb.table("page_routes").upsert(
+                 {
+                     "site_id"     : site_id,
+                     "route_key"   : route_key,
+                     "path_pattern": f"^{re.escape(path)}",
+                     "example_url" : url,
+                 },
+                 on_conflict="site_id,route_key"
+             ).execute()
+
+             route_row = self.sb.table("page_routes")\
+                 .select("id")\
+                 .eq("site_id", site_id)\
+                 .eq("route_key", route_key)\
+                 .limit(1).execute().data
+             route_id = route_row[0]["id"]
+
+             # upsert task (generic)
+             self.sb.table("tasks").upsert(
+                 {"task_key": "generic_extract", "description": "Generic extraction task"},
+                 on_conflict="task_key"
+             ).execute()
+
+             task_row = self.sb.table("tasks")\
+                 .select("id").eq("task_key", "generic_extract")\
+                 .limit(1).execute().data
+             task_id = task_row[0]["id"]
+
+             # build locators payload from labeled elements
+             locators = {}
+             for purpose, info in learned.get("elements", {}).items():
+                 if info.get("confidence", 0) >= 0.5:
+                     locators[purpose] = [{
+                         "type"      : info.get("type"),
+                         "value"     : info.get("selector"),
+                         "priority"  : 1,
+                         "confidence": info.get("confidence"),
+                     }]
+
+             # upsert playbook
+             self.sb.table("playbooks").upsert(
+                 {
+                     "site_id"    : site_id,
+                     "route_id"   : route_id,
+                     "task_id"    : task_id,
+                     "variant_key": "desktop_enUS_loggedout",
+                     "version"    : 1,
+                     "status"     : "active",
+                     "confidence" : 0.6,
+                     "ttl_days"   : 14,
+                     "payload"    : {
+                         "locators"          : locators,
+                         "fingerprint_source": "llm_learned",
+                         "source_url"        : url,
+                     },
+                 },
+                 on_conflict="site_id,route_id,task_id,variant_key,version"
+             ).execute()
+
+         except Exception as e:
+             print(f"[AgentAtlas] ⚠ Save failed: {e}")
+
+
+     # ─────────────────────────────────────────────
+     # PRIVATE: crawl a page with Playwright
+     # ─────────────────────────────────────────────
+     async def _crawl_page(self, url: str) -> list[dict]:
+         try:
+             async with async_playwright() as p:
+                 browser = await p.chromium.launch(
+                     headless=False,
+                     args=["--disable-blink-features=AutomationControlled", "--no-sandbox"]
+                 )
+                 context = await browser.new_context(
+                     user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+                     viewport={"width": 1280, "height": 800},
+                     locale="en-US",
+                     timezone_id="America/New_York",
+                 )
+                 page = await context.new_page()
+
+                 if stealth_async:
+                     await stealth_async(page)
+                     print("[AgentAtlas] 🥷 Stealth mode active")
+
+                 await page.route("**/*.{png,jpg,jpeg,gif,svg,woff,woff2}", lambda r: r.abort())
+
+                 try:
+                     await page.goto(url, wait_until="networkidle", timeout=30000)
+                 except Exception:
+                     pass
+
+                 for wait_ms in [2000, 2000, 2000]:
+                     await page.wait_for_timeout(wait_ms)
+                     count = await page.evaluate("() => document.querySelectorAll('a, button').length")
+                     print(f"[AgentAtlas] ⏳ Elements so far: {count}")
+                     if count > 10:
+                         break
+
+                 elements = await page.evaluate("""
+                     () => {
+                         const sel = 'a, button, input, select, textarea, h1, h2, h3, [role=button], [role=link], [role=listitem]'
+                         return Array.from(document.querySelectorAll(sel))
+                             .filter(el => el.offsetParent !== null)
+                             .slice(0, 120)
+                             .map(el => ({
+                                 tag: el.tagName.toLowerCase(),
+                                 role: el.getAttribute('role'),
+                                 aria_label: el.getAttribute('aria-label'),
+                                 placeholder: el.getAttribute('placeholder'),
+                                 data_testid: el.getAttribute('data-testid'),
+                                 name: el.getAttribute('name'),
+                                 text: el.innerText?.trim().slice(0, 100),
+                                 href: el.href || null,
+                                 type: el.type || null,
+                             }))
+                     }
+                 """)
+                 await browser.close()
+                 return elements
+         except Exception as e:
+             print(f"[AgentAtlas] Crawl error: {e}")
+             return []
+
+     # ─────────────────────────────────────────────
+     # PRIVATE: match url to stored route pattern
+     # ─────────────────────────────────────────────
+     def _match_route(self, url: str, routes: list) -> dict | None:
+         try:
+             path = urlparse(url).path
+         except Exception:
+             path = url
+         for route in routes:
+             try:
+                 if re.search(route["path_pattern"], path):
+                     return route
+             except Exception:
+                 continue
+         return routes[0] if routes else None
+
+
+     # ─────────────────────────────────────────────
+     # PRIVATE: build clean elements dict from payload
+     # ─────────────────────────────────────────────
+     def _build_elements(self, payload: dict) -> dict:
+         locators = payload.get("locators", {})
+         elements = {}
+         for purpose, locs in locators.items():
+             if not locs:
+                 continue
+             best = sorted(locs, key=lambda x: x.get("priority", 99))[0]
+             elements[purpose] = {
+                 "type"      : best.get("type"),
+                 "selector"  : best.get("value"),
+                 "confidence": best.get("confidence", 0.5),
+             }
+         return elements
+
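A minimal end-to-end sketch of the `Atlas` class above, assuming the three environment variables from the README are set; the example site and URL are the ones the package itself uses. Note that `_crawl_page` launches Chromium with `headless=False`, so the learning path opens a visible browser window. `get_schema` is a coroutine, so it runs under `asyncio`:

```python
import asyncio
from agentatlas.atlas import Atlas

async def main():
    atlas = Atlas()
    schema = await atlas.get_schema(
        site="greenhouse.io",
        url="https://boards.greenhouse.io/anthropic",
    )
    # First caller pays the LLM cost (source == "llm_learned");
    # everyone after hits the registry (source == "registry", 0 tokens).
    print(schema.status, schema.source, schema.tokens_used)
    for purpose, locator in schema.elements.items():
        print(purpose, "→", locator["type"], locator["selector"])

asyncio.run(main())
```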
+++ agentatlas/crawler.py
@@ -0,0 +1,164 @@
+ """
+ crawler.py - v2
+ Same as v1, but with a better wait strategy for JS-heavy / bot-protected sites.
+ """
+
+ import asyncio
+ import json
+ import os
+ from dotenv import load_dotenv
+ from playwright.async_api import async_playwright
+ from openai import OpenAI
+ from agentatlas.supabase_client import get_supabase
+
+ load_dotenv()
+
+ SEED_TARGETS = [
+     {"site": "greenhouse.io", "url": "https://boards.greenhouse.io/anthropic", "route_key": "job_list"},
+     {"site": "lever.co", "url": "https://jobs.lever.co/netflix", "route_key": "job_list"},
+     {"site": "ashbyhq.com", "url": "https://jobs.ashbyhq.com/openai", "route_key": "job_list"},
+     {"site": "smartrecruiters.com", "url": "https://www.smartrecruiters.com/Spotify/jobs", "route_key": "job_list"},
+ ]
+
+ async def extract_elements(url: str) -> list[dict]:
+     async with async_playwright() as p:
+         browser = await p.chromium.launch(headless=True)
+         context = await browser.new_context(
+             user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+             viewport={"width": 1280, "height": 800},
+             locale="en-US",
+             timezone_id="America/New_York",
+         )
+         page = await context.new_page()
+
+         # Block images/fonts to speed up load
+         await page.route("**/*.{png,jpg,jpeg,gif,svg,woff,woff2}", lambda r: r.abort())
+
+         try:
+             await page.goto(url, wait_until="domcontentloaded", timeout=30000)
+         except Exception:
+             pass  # continue even if timeout, grab what loaded
+
+         # Wait progressively — up to 8 seconds for JS content to render
+         for wait_ms in [2000, 3000, 3000]:
+             await page.wait_for_timeout(wait_ms)
+             count = await page.evaluate("() => document.querySelectorAll('a, button').length")
+             if count > 10:
+                 break
+
+         elements = await page.evaluate("""
+             () => {
+                 const sel = 'a, button, input, select, textarea, h1, h2, h3, [role=button], [role=link], [role=listitem]'
+                 return Array.from(document.querySelectorAll(sel))
+                     .filter(el => el.offsetParent !== null)
+                     .slice(0, 120)
+                     .map(el => ({
+                         tag: el.tagName.toLowerCase(),
+                         role: el.getAttribute('role'),
+                         aria_label: el.getAttribute('aria-label'),
+                         placeholder: el.getAttribute('placeholder'),
+                         data_testid: el.getAttribute('data-testid'),
+                         name: el.getAttribute('name'),
+                         text: el.innerText?.trim().slice(0, 100),
+                         href: el.href || null,
+                         type: el.type || null,
+                         class_hint: el.className?.toString().slice(0, 60),
+                     }))
+             }
+         """)
+
+         await browser.close()
+         print(f" → {len(elements)} elements found on {url}")
+
+         if len(elements) < 10:
+             print(" ⚠ Low element count — likely bot-blocked or JS-heavy. Skipping LLM labeling.")
+             return []
+
+         return elements
+
+ def label_elements(elements: list[dict], site: str, route_key: str) -> dict:
+     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+     prompt = f"""
+     You are building a browser automation schema for {site} ({route_key} page).
+     Below are raw DOM elements extracted from the page.
+
+     Your job: identify the KEY elements needed for these tasks:
+     - extract_job_list: find job title links, location text, pagination
+     - extract_job_detail: find title h1, description block, apply button
+     - start_application: find the apply/submit button or link
+
+     Return ONLY valid JSON in this exact shape, no explanation, no markdown:
+     {{
+       "job_title_links": {{ "type": "...", "value": "...", "confidence": 0.0 }},
+       "location_text": {{ "type": "...", "value": "...", "confidence": 0.0 }},
+       "apply_button": {{ "type": "...", "value": "...", "confidence": 0.0 }},
+       "job_heading": {{ "type": "...", "value": "...", "confidence": 0.0 }},
+       "description_block": {{ "type": "...", "value": "...", "confidence": 0.0 }}
+     }}
+
+     For "type" use one of: css, role, text, aria_label, data_testid, placeholder
+     For "value" use the actual selector value (e.g. "a[data-mapped='true']" or "Apply now")
+
+     DOM elements:
+     {json.dumps(elements[:80], indent=1)}
+     """
+
+     response = client.chat.completions.create(
+         model="gpt-4o",
+         messages=[{"role": "user", "content": prompt}],
+         temperature=0,
+         response_format={"type": "json_object"},
+     )
+
+     return json.loads(response.choices[0].message.content.strip())
+
+ def update_playbook_payload(site_domain: str, route_key: str, labeled: dict):
+     sb = get_supabase()
+
+     site = sb.table("sites").select("id").eq("domain", site_domain).limit(1).execute().data
+     if not site:
+         print(f" ⚠ Site not found: {site_domain}")
+         return
+     site_id = site[0]["id"]
+
+     route = sb.table("page_routes").select("id").eq("site_id", site_id).eq("route_key", route_key).limit(1).execute().data
+     if not route:
+         print(f" ⚠ Route not found: {route_key}")
+         return
+     route_id = route[0]["id"]
+
+     locators = {}
+     for purpose, locator in labeled.items():
+         if locator.get("confidence", 0) >= 0.4:
+             locators[purpose] = [
+                 {"type": locator["type"], "value": locator["value"], "priority": 1, "confidence": locator["confidence"]}
+             ]
+
+     playbooks = sb.table("playbooks").select("id, payload").eq("site_id", site_id).eq("route_id", route_id).execute().data
+     for pb in playbooks:
+         updated_payload = {**pb["payload"], "locators": locators, "fingerprint_source": "crawled"}
+         sb.table("playbooks").update({
+             "payload": updated_payload,
+             "confidence": 0.6,
+             "status": "active",
+         }).eq("id", pb["id"]).execute()
+
+     print(f" ✅ Updated {len(playbooks)} playbook(s) for {site_domain}/{route_key}")
+
+ async def main():
+     for target in SEED_TARGETS:
+         print(f"\n🔍 Crawling {target['site']} — {target['url']}")
+         try:
+             elements = await extract_elements(target["url"])
+             if not elements:
+                 print(" ⏭ Skipped — not enough elements to label reliably")
+                 continue
+             labeled = label_elements(elements, target["site"], target["route_key"])
+             print(f" GPT-4o labeled {len(labeled)} purposes")
+             update_playbook_payload(target["site"], target["route_key"], labeled)
+         except Exception as e:
+             print(f" ❌ Failed: {e}")
+
+ if __name__ == "__main__":
+     asyncio.run(main())
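The seed script drives itself through `main()`, but its three stages can also be run individually; a sketch using one of the `SEED_TARGETS` entries, assuming the same Supabase and OpenAI credentials are configured:

```python
import asyncio
from agentatlas.crawler import extract_elements, label_elements, update_playbook_payload

# One SEED_TARGETS entry, run stage by stage instead of via main()
url = "https://jobs.lever.co/netflix"
elements = asyncio.run(extract_elements(url))  # crawl the visible DOM elements
if elements:
    labeled = label_elements(elements, site="lever.co", route_key="job_list")
    update_playbook_payload("lever.co", "job_list", labeled)
```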
+++ agentatlas/executor.py
@@ -0,0 +1,53 @@
+ """
+ executor.py
+ Takes a playbook from Supabase and executes it deterministically.
+ No LLM calls. Handles scrolling for lazy-loaded content.
+ """
+
+ import asyncio
+ from playwright.async_api import async_playwright
+
+ async def scroll_to_bottom(page, max_scrolls: int = 10):
+     """Scroll down to trigger lazy-loaded content."""
+     for _ in range(max_scrolls):
+         prev_height = await page.evaluate("document.body.scrollHeight")
+         await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+         await page.wait_for_timeout(800)
+         new_height = await page.evaluate("document.body.scrollHeight")
+         if new_height == prev_height:
+             break  # nothing more to load
+
+ async def execute_job_list(url: str, selector: str) -> list[dict]:
+     async with async_playwright() as p:
+         browser = await p.chromium.launch(headless=True)
+         context = await browser.new_context(
+             user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/124 Safari/537.36",
+             viewport={"width": 1280, "height": 800},
+         )
+         page = await context.new_page()
+         await page.goto(url, wait_until="networkidle", timeout=30000)
+         await page.wait_for_timeout(2000)
+
+         # Scroll to load all jobs
+         await scroll_to_bottom(page)
+
+         elements = await page.query_selector_all(selector)
+         jobs = []
+         for el in elements:
+             raw_text = (await el.inner_text()).strip()
+             href = await el.get_attribute("href")
+
+             # Split "Job Title\n\nLocation" into separate fields
+             parts = [p.strip() for p in raw_text.split("\n") if p.strip()]
+             title = parts[0] if parts else raw_text
+             location = parts[1] if len(parts) > 1 else ""
+
+             if title and href:
+                 jobs.append({
+                     "title"   : title,
+                     "location": location,
+                     "url"     : href if href.startswith("http") else f"https://boards.greenhouse.io{href}"
+                 })
+
+         await browser.close()
+         return jobs
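A usage sketch for the executor; the selector below is an illustrative stand-in for a locator value that would normally come out of a stored playbook. Note that the relative-href fallback hardcodes `https://boards.greenhouse.io`, so `execute_job_list` is effectively Greenhouse-specific as written:

```python
import asyncio
from agentatlas.executor import execute_job_list

# Hypothetical selector: in practice this comes from a playbook's locators
jobs = asyncio.run(execute_job_list(
    url="https://boards.greenhouse.io/anthropic",
    selector="a[href*='/jobs/']",
))
for job in jobs[:5]:
    print(job["title"], "|", job["location"], "|", job["url"])
```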
+++ agentatlas/schemas.py
@@ -0,0 +1,149 @@
+ ATS_SITES = [
+     {"domain": "greenhouse.io", "display_name": "Greenhouse", "requires_login": False, "anti_bot_hint": "low/medium"},
+     {"domain": "lever.co", "display_name": "Lever", "requires_login": False, "anti_bot_hint": "low/medium"},
+     {"domain": "myworkdayjobs.com", "display_name": "Workday Jobs", "requires_login": False, "anti_bot_hint": "medium/high"},
+     {"domain": "icims.com", "display_name": "iCIMS", "requires_login": False, "anti_bot_hint": "medium"},
+     {"domain": "taleo.net", "display_name": "Taleo", "requires_login": False, "anti_bot_hint": "medium/high"},
+     {"domain": "smartrecruiters.com", "display_name": "SmartRecruiters", "requires_login": False, "anti_bot_hint": "low/medium"},
+     {"domain": "ashbyhq.com", "display_name": "Ashby", "requires_login": False, "anti_bot_hint": "low/medium"},
+ ]
+
+ TASKS = [
+     {
+         "task_key": "extract_job_list",
+         "description": "Extract a list of jobs from a job listing page.",
+         "input_schema": {"type": "object", "properties": {"url": {"type": "string"}}, "required": ["url"]},
+         "output_schema": {
+             "type": "object",
+             "properties": {
+                 "jobs": {
+                     "type": "array",
+                     "items": {
+                         "type": "object",
+                         "properties": {"title": {"type": "string"}, "location": {"type": "string"}, "url": {"type": "string"}},
+                         "required": ["title", "url"],
+                     },
+                 }
+             },
+             "required": ["jobs"],
+         },
+     },
+     {
+         "task_key": "extract_job_detail",
+         "description": "Extract structured job detail from a job detail page.",
+         "input_schema": {"type": "object", "properties": {"url": {"type": "string"}}, "required": ["url"]},
+         "output_schema": {
+             "type": "object",
+             "properties": {
+                 "title": {"type": "string"},
+                 "company": {"type": "string"},
+                 "location": {"type": "string"},
+                 "description_text": {"type": "string"},
+                 "apply_url": {"type": "string"},
+             },
+             "required": ["title", "description_text"],
+         },
+     },
+     {
+         "task_key": "start_application",
+         "description": "Navigate from job detail page to application form page (or external apply).",
+         "input_schema": {"type": "object", "properties": {"url": {"type": "string"}}, "required": ["url"]},
+         "output_schema": {"type": "object", "properties": {"application_url": {"type": "string"}}, "required": ["application_url"]},
+     },
+ ]
+
+ ROUTES_BY_DOMAIN = {
+     "greenhouse.io": [
+         {"route_key": "job_list", "path_pattern": r"^/[^/]+/?$"},
+         {"route_key": "job_detail", "path_pattern": r"^/[^/]+/jobs/\d+.*$"},
+     ],
+     "lever.co": [
+         {"route_key": "job_list", "path_pattern": r"^/[^/]+/?$"},
+         {"route_key": "job_detail", "path_pattern": r"^/[^/]+/[^/]+/?$"},
+     ],
+     "myworkdayjobs.com": [
+         {"route_key": "job_list", "path_pattern": r"^/[^/]+/[^/]+/.*$"},
+         {"route_key": "job_detail", "path_pattern": r"^/[^/]+/[^/]+/job/.*$"},
+     ],
+     "icims.com": [
+         {"route_key": "job_list", "path_pattern": r"^/jobs/.*$"},
+         {"route_key": "job_detail", "path_pattern": r"^/jobs/\d+/.*$"},
+     ],
+     "taleo.net": [
+         {"route_key": "job_list", "path_pattern": r"^/careersection/.*$"},
+         {"route_key": "job_detail", "path_pattern": r"^/careersection/.*?/jobdetail\.ftl.*$"},
+     ],
+     "smartrecruiters.com": [
+         {"route_key": "job_list", "path_pattern": r"^/[^/]+/jobs.*$"},
+         {"route_key": "job_detail", "path_pattern": r"^/[^/]+/job/.*$"},
+     ],
+     "ashbyhq.com": [
+         {"route_key": "job_list", "path_pattern": r"^/[^/]+/jobs.*$"},
+         {"route_key": "job_detail", "path_pattern": r"^/[^/]+/jobs/.*$"},
+     ],
+ }
+
+ def default_playbook_payload(task_key: str) -> dict:
+     """
+     Starter payload template (generic). You’ll refine per ATS platform later.
+     """
+     if task_key == "extract_job_list":
+         return {
+             "context": {"viewport": "desktop", "locale": "en-US", "logged_in": False},
+             "fingerprint": {"dom_hash": None, "anchors": ["Jobs", "Apply", "Location"]},
+             "preconditions": [{"type": "cookie_banner", "action": "accept_if_present"}],
+             "locators": {
+                 "job_links": [
+                     {"type": "role", "value": "link", "priority": 1},
+                     {"type": "heuristic", "value": "links_that_look_like_job_details()", "priority": 2},
+                 ]
+             },
+             "extraction": [{"field": "jobs", "from": "job_links", "transform": "extract_title_location_url"}],
+             "validation": [{"field": "jobs", "op": "min_items", "value": 1}],
+             "network": None,
+         }
+
+     if task_key == "extract_job_detail":
+         return {
+             "context": {"viewport": "desktop", "locale": "en-US", "logged_in": False},
+             "fingerprint": {"dom_hash": None, "anchors": ["Job Description", "Responsibilities", "Apply"]},
+             "preconditions": [{"type": "cookie_banner", "action": "accept_if_present"}],
+             "locators": {
+                 "title": [
+                     {"type": "role", "value": "heading", "priority": 1},
+                     {"type": "css", "value": "h1", "priority": 2},
+                 ],
+                 "description": [{"type": "heuristic", "value": "main_content_text_block()", "priority": 1}],
+                 "apply_link": [
+                     {"type": "text", "value": "Apply", "priority": 1},
+                     {"type": "role", "value": "link[name~='Apply']", "priority": 2},
+                     {"type": "role", "value": "button[name~='Apply']", "priority": 3},
+                 ],
+             },
+             "extraction": [
+                 {"field": "title", "from": "title.text"},
+                 {"field": "description_text", "from": "description.text"},
+                 {"field": "apply_url", "from": "apply_link.href", "optional": True},
+             ],
+             "validation": [{"field": "title", "op": "non_empty"}, {"field": "description_text", "op": "min_len", "value": 200}],
+             "network": None,
+         }
+
+     if task_key == "start_application":
+         return {
+             "context": {"viewport": "desktop", "locale": "en-US", "logged_in": False},
+             "fingerprint": {"dom_hash": None, "anchors": ["Apply", "Submit", "Continue"]},
+             "preconditions": [{"type": "cookie_banner", "action": "accept_if_present"}],
+             "locators": {
+                 "apply_link": [
+                     {"type": "text", "value": "Apply", "priority": 1},
+                     {"type": "role", "value": "link[name~='Apply']", "priority": 2},
+                     {"type": "role", "value": "button[name~='Apply']", "priority": 3},
+                 ]
+             },
+             "extraction": [{"field": "application_url", "from": "apply_link.href_or_navigation_url"}],
+             "validation": [{"field": "application_url", "op": "regex", "value": r"^https?://"}],
+             "network": None,
+         }
+
+     return {}
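The route patterns above are plain regexes matched against the URL path (see `_match_route` in atlas.py), and the payload templates carry prioritized locator lists; a small sketch exercising both, with a made-up path for illustration:

```python
import re
from agentatlas.schemas import ROUTES_BY_DOMAIN, default_playbook_payload

# Match a hypothetical Greenhouse job URL path against the stored routes
path = "/anthropic/jobs/4012345"
for route in ROUTES_BY_DOMAIN["greenhouse.io"]:
    if re.search(route["path_pattern"], path):
        print("matched:", route["route_key"])  # → job_detail

# Locators are tried in priority order until one resolves
payload = default_playbook_payload("extract_job_detail")
print([loc["type"] for loc in payload["locators"]["apply_link"]])
# → ['text', 'role', 'role']
```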
+++ agentatlas/seed_registry.py
@@ -0,0 +1,100 @@
+ from dotenv import load_dotenv
+ from agentatlas.supabase_client import get_supabase
+ from agentatlas.schemas import ATS_SITES, TASKS, ROUTES_BY_DOMAIN, default_playbook_payload
+
+ load_dotenv()
+
+ def get_id_by_unique(sb, table: str, col: str, value: str):
+     res = sb.table(table).select("id").eq(col, value).limit(1).execute()
+     data = res.data or []
+     return data[0]["id"] if data else None
+
+ def main():
+     sb = get_supabase()
+
+     # 1) Upsert tasks
+     print("Seeding tasks...")
+     for t in TASKS:
+         sb.table("tasks").upsert(
+             {
+                 "task_key": t["task_key"],
+                 "description": t.get("description"),
+                 "input_schema": t.get("input_schema", {}),
+                 "output_schema": t.get("output_schema", {}),
+             },
+             on_conflict="task_key",
+         ).execute()
+
+     # 2) Upsert sites, routes, playbooks
+     print("Seeding sites, routes, playbooks...")
+     for s in ATS_SITES:
+         sb.table("sites").upsert(
+             {
+                 "domain": s["domain"],
+                 "display_name": s.get("display_name"),
+                 "base_url": s.get("base_url"),
+                 "requires_login": s.get("requires_login", False),
+                 "anti_bot_hint": s.get("anti_bot_hint"),
+                 "notes": s.get("notes"),
+             },
+             on_conflict="domain",
+         ).execute()
+
+         site_id = get_id_by_unique(sb, "sites", "domain", s["domain"])
+         if not site_id:
+             raise RuntimeError(f"Failed to fetch site_id for domain={s['domain']}")
+
+         routes = ROUTES_BY_DOMAIN.get(s["domain"], [])
+         for r in routes:
+             sb.table("page_routes").upsert(
+                 {
+                     "site_id": site_id,
+                     "route_key": r["route_key"],
+                     "path_pattern": r["path_pattern"],
+                     "example_url": r.get("example_url"),
+                 },
+                 on_conflict="site_id,route_key",
+             ).execute()
+
+             route_id = (
+                 sb.table("page_routes")
+                 .select("id")
+                 .eq("site_id", site_id)
+                 .eq("route_key", r["route_key"])
+                 .limit(1)
+                 .execute()
+                 .data[0]["id"]
+             )
+
+             for task in TASKS:
+                 task_key = task["task_key"]
+
+                 # Map which tasks apply to which routes
+                 if r["route_key"] == "job_list" and task_key not in ("extract_job_list",):
+                     continue
+                 if r["route_key"] == "job_detail" and task_key not in ("extract_job_detail", "start_application"):
+                     continue
+
+                 task_id = get_id_by_unique(sb, "tasks", "task_key", task_key)
+                 if not task_id:
+                     raise RuntimeError(f"Failed to fetch task_id for task_key={task_key}")
+
+                 sb.table("playbooks").upsert(
+                     {
+                         "site_id": site_id,
+                         "route_id": route_id,
+                         "task_id": task_id,
+                         "variant_key": "desktop_enUS_loggedout",
+                         "version": 1,
+                         "status": "experimental",
+                         "confidence": 0.2,
+                         "ttl_days": 14,
+                         "payload": default_playbook_payload(task_key),
+                     },
+                     on_conflict="site_id,route_id,task_id,variant_key,version",
+                 ).execute()
+
+     print("✅ Seed complete.")
+
+ if __name__ == "__main__":
+     main()
+++ agentatlas/supabase_client.py
@@ -0,0 +1,9 @@
+ import os
+ from supabase import create_client, Client
+
+ def get_supabase() -> Client:
+     url = os.getenv("SUPABASE_URL", "").strip()
+     key = os.getenv("SUPABASE_SERVICE_ROLE_KEY", "").strip()
+     if not url or not key:
+         raise RuntimeError("Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY in environment.")
+     return create_client(url, key)
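A quick sketch of the client helper in use; the query mirrors the `table(...).select(...)` calls used elsewhere in the package:

```python
from agentatlas.supabase_client import get_supabase

# Raises RuntimeError at startup if credentials are missing,
# rather than failing on the first query
sb = get_supabase()
rows = sb.table("sites").select("domain, display_name").limit(5).execute().data
print(rows)
```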
+++ agentatlas.egg-info/PKG-INFO
@@ -0,0 +1,92 @@
+ Metadata-Version: 2.4
+ Name: agentatlas
+ Version: 0.1.0
+ Summary: Shared browser interaction schema registry for AI agents. Reduces LLM token usage by 80-100% on known sites.
+ License: MIT
+ Project-URL: Homepage, https://github.com/yourusername/agentatlas
+ Project-URL: Issues, https://github.com/yourusername/agentatlas/issues
+ Keywords: ai,agents,browser-automation,llm,playwright
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: supabase>=2.6.0
+ Requires-Dist: python-dotenv>=1.0.1
+ Requires-Dist: playwright>=1.44.0
+ Requires-Dist: openai>=1.109.1
+ Requires-Dist: playwright-stealth>=1.0.6
+
+ # AgentAtlas
+
+ **Shared browser interaction schema registry for AI agents.**
+
+ Reduces LLM token usage by 80-100% on known sites by storing and sharing site interaction schemas across all users.
+
+ ## How it works
+ ```
+ First user → LLM learns the site → saved to shared registry
+ Every user after → 0 tokens, instant response
+ ```
+
+ ## Benchmark results (real data)
+
+ | | Without AgentAtlas | With AgentAtlas |
+ |---|---|---|
+ | Tokens | 2,597 | 0-445 |
+ | Cost | $0.018 | $0.000-$0.002 |
+ | Time | 19s | 0.2-12s |
+ | Real URLs | ❌ | ✅ |
+
+ **82.9% token reduction** when the LLM is still needed. **100% reduction** for repeat workflows.
+
+ ## Install
+ ```bash
+ pip install agentatlas
+ playwright install chromium
+ ```
+
+ ## Usage
+ ```python
+ from agentatlas.atlas import Atlas
+
+ atlas = Atlas()
+
+ # Get schema for any site
+ # Found in registry → 0 tokens
+ # New site → learns once, saves for everyone
+ schema = await atlas.get_schema(
+     site="greenhouse.io",
+     url="https://boards.greenhouse.io/anthropic"
+ )
+
+ # Pass the compact schema to YOUR LLM
+ # 150-500 tokens instead of 50,000
+ print(schema.elements)
+ print(schema.tokens_used)  # 0 if registry hit
+ print(schema.source)       # "registry" or "llm_learned"
+ ```
+
+ ## Environment variables
+ ```bash
+ SUPABASE_URL=your_supabase_url
+ SUPABASE_SERVICE_ROLE_KEY=your_key
+ OPENAI_API_KEY=your_key
+ ```
+
+ ## The flywheel
+ ```
+ More developers use AgentAtlas
+
+ More new sites get learned automatically
+
+ Registry grows → higher hit rate
+
+ Fewer tokens burned across the whole network
+
+ Cheaper + faster → more developers adopt
+ ```
+
+ ## License
+
+ MIT
+++ agentatlas.egg-info/SOURCES.txt
@@ -0,0 +1,14 @@
+ README.md
+ pyproject.toml
+ agentatlas/__init__.py
+ agentatlas/atlas.py
+ agentatlas/crawler.py
+ agentatlas/executor.py
+ agentatlas/schemas.py
+ agentatlas/seed_registry.py
+ agentatlas/supabase_client.py
+ agentatlas.egg-info/PKG-INFO
+ agentatlas.egg-info/SOURCES.txt
+ agentatlas.egg-info/dependency_links.txt
+ agentatlas.egg-info/requires.txt
+ agentatlas.egg-info/top_level.txt
+++ agentatlas.egg-info/requires.txt
@@ -0,0 +1,5 @@
+ supabase>=2.6.0
+ python-dotenv>=1.0.1
+ playwright>=1.44.0
+ openai>=1.109.1
+ playwright-stealth>=1.0.6
+++ agentatlas.egg-info/top_level.txt
@@ -0,0 +1 @@
+ agentatlas
+++ pyproject.toml
@@ -0,0 +1,32 @@
+ [build-system]
+ requires = ["setuptools>=68", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "agentatlas"
+ version = "0.1.0"
+ description = "Shared browser interaction schema registry for AI agents. Reduces LLM token usage by 80-100% on known sites."
+ readme = "README.md"
+ license = {text = "MIT"}
+ requires-python = ">=3.10"
+ keywords = ["ai", "agents", "browser-automation", "llm", "playwright"]
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "License :: OSI Approved :: MIT License",
+     "Operating System :: OS Independent",
+ ]
+ dependencies = [
+     "supabase>=2.6.0",
+     "python-dotenv>=1.0.1",
+     "playwright>=1.44.0",
+     "openai>=1.109.1",
+     "playwright-stealth>=1.0.6",
+ ]
+
+ [project.urls]
+ Homepage = "https://github.com/yourusername/agentatlas"
+ Issues = "https://github.com/yourusername/agentatlas/issues"
+
+ [tool.setuptools.packages.find]
+ where = ["."]
+ include = ["agentatlas*"]
+++ setup.cfg
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+