@pencil-agent/nano-pencil 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. package/README.md +267 -267
  2. package/dist/build-meta.json +3 -3
  3. package/dist/core/export-html/AGENT.md +11 -11
  4. package/dist/core/export-html/template.css +971 -971
  5. package/dist/core/export-html/template.html +54 -54
  6. package/dist/core/mcp/mcp-client.d.ts +3 -1
  7. package/dist/core/mcp/mcp-client.js +6 -6
  8. package/dist/core/mcp/mcp-config.d.ts +3 -3
  9. package/dist/core/mcp/mcp-config.js +1 -1
  10. package/dist/core/mcp/mcp-manager.d.ts +5 -1
  11. package/dist/core/mcp/mcp-manager.js +1 -1
  12. package/dist/core/platform/config/resource-loader.d.ts +2 -0
  13. package/dist/core/platform/config/resource-loader.js +2 -2
  14. package/dist/core/runtime/agent-session.d.ts +12 -0
  15. package/dist/core/runtime/agent-session.js +8 -8
  16. package/dist/core/runtime/sdk.d.ts +8 -0
  17. package/dist/core/runtime/sdk.js +1 -1
  18. package/dist/extensions/builtin/AGENT.md +115 -115
  19. package/dist/extensions/builtin/browser/AGENT.md +17 -17
  20. package/dist/extensions/builtin/browser/agent-workspace/agent_helpers.py +12 -12
  21. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/amazon/product-search.md +198 -198
  22. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/archive-org/scraping.md +341 -341
  23. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv/scraping.md +311 -311
  24. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv-bulk/scraping.md +333 -333
  25. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/atlas/overview.md +70 -70
  26. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/booking-com/scraping.md +578 -578
  27. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/capterra/scraping.md +440 -440
  28. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/centilebrain/generate-estimates.md +110 -110
  29. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coingecko/scraping.md +325 -325
  30. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coinmarketcap/scraping.md +463 -463
  31. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coursera/scraping.md +360 -360
  32. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/craigslist/scraping.md +390 -390
  33. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/crossref/scraping.md +568 -568
  34. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/dev-to/scraping.md +323 -323
  35. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/duckduckgo/scraping.md +349 -349
  36. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/ebay/scraping.md +435 -435
  37. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/etsy/scraping.md +506 -506
  38. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/eventbrite/scraping.md +363 -363
  39. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/expedia/automation.md +168 -168
  40. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/groups.md +236 -236
  41. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/pages.md +295 -295
  42. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/framer/editor.md +108 -108
  43. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/fred/scraping.md +493 -493
  44. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/g2/scraping.md +580 -580
  45. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/genius/scraping.md +511 -511
  46. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/repo-actions.md +65 -65
  47. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/scraping.md +184 -184
  48. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/glassdoor/scraping.md +543 -543
  49. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gmail/compose.md +122 -122
  50. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/goodreads/scraping.md +461 -461
  51. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gutenberg/scraping.md +383 -383
  52. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/hackernews/scraping.md +243 -243
  53. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/howlongtobeat/scraping.md +473 -473
  54. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/imdb/scraping.md +271 -271
  55. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/itch-io/scraping.md +436 -436
  56. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/job-boards/indeed-glassdoor.md +1021 -1021
  57. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/letterboxd/scraping.md +349 -349
  58. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/linkedin/invitation-manager.md +109 -109
  59. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/loom/folder-enumeration.md +170 -170
  60. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/macrotrends/scraping.md +537 -537
  61. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/article-hydration.md +120 -120
  62. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/scraping.md +414 -414
  63. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/metacritic/scraping.md +477 -477
  64. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/musicbrainz/scraping.md +478 -478
  65. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/nasa/scraping.md +339 -339
  66. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/news-aggregation/multi-source.md +205 -205
  67. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/open-library/scraping.md +472 -472
  68. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openalex/scraping.md +470 -470
  69. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openstreetmap/scraping.md +490 -490
  70. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/package-registries/npm-pypi.md +478 -478
  71. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/polymarket/scraping.md +234 -234
  72. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/producthunt/scraping.md +307 -307
  73. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/pubmed/scraping.md +421 -421
  74. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/quora/scraping.md +364 -364
  75. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rawg/scraping.md +352 -352
  76. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/reddit/scraping.md +124 -124
  77. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rest-countries/scraping.md +233 -233
  78. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/sec-edgar/scraping.md +361 -361
  79. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/README.md +36 -36
  80. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/embedded-apps.md +72 -72
  81. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/knowledge-base.md +109 -109
  82. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/polaris-inputs.md +137 -137
  83. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/soundcloud/scraping.md +362 -362
  84. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/spotify/scraping.md +339 -339
  85. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/stackoverflow/scraping.md +435 -435
  86. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/steam/scraping.md +575 -575
  87. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/substack/scraping.md +338 -338
  88. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/thetechgeeks/pricing.md +52 -52
  89. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tiktok/upload.md +107 -107
  90. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tradingview/scraping.md +309 -309
  91. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trello/boards-and-lists.md +88 -88
  92. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trustpilot/scraping.md +375 -375
  93. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/walmart/scraping.md +444 -444
  94. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wayback-machine/scraping.md +306 -306
  95. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/weather/scraping.md +398 -398
  96. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wellfound/scraping.md +596 -596
  97. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/world-bank/scraping.md +356 -356
  98. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/xiaohongshu/scraping.md +84 -84
  99. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/youtube/scraping.md +418 -418
  100. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/zillow/scraping.md +433 -433
  101. package/dist/extensions/builtin/browser/browser.md +73 -73
  102. package/dist/extensions/builtin/browser/install.md +142 -142
  103. package/dist/extensions/builtin/browser/interaction-skills/connection.md +48 -48
  104. package/dist/extensions/builtin/browser/interaction-skills/cookies.md +3 -3
  105. package/dist/extensions/builtin/browser/interaction-skills/cross-origin-iframes.md +3 -3
  106. package/dist/extensions/builtin/browser/interaction-skills/dialogs.md +64 -64
  107. package/dist/extensions/builtin/browser/interaction-skills/downloads.md +3 -3
  108. package/dist/extensions/builtin/browser/interaction-skills/drag-and-drop.md +3 -3
  109. package/dist/extensions/builtin/browser/interaction-skills/dropdowns.md +3 -3
  110. package/dist/extensions/builtin/browser/interaction-skills/iframes.md +3 -3
  111. package/dist/extensions/builtin/browser/interaction-skills/network-requests.md +3 -3
  112. package/dist/extensions/builtin/browser/interaction-skills/print-as-pdf.md +3 -3
  113. package/dist/extensions/builtin/browser/interaction-skills/profile-sync.md +90 -90
  114. package/dist/extensions/builtin/browser/interaction-skills/screenshots.md +17 -17
  115. package/dist/extensions/builtin/browser/interaction-skills/scrolling.md +3 -3
  116. package/dist/extensions/builtin/browser/interaction-skills/shadow-dom.md +3 -3
  117. package/dist/extensions/builtin/browser/interaction-skills/tabs.md +69 -69
  118. package/dist/extensions/builtin/browser/interaction-skills/uploads.md +1 -1
  119. package/dist/extensions/builtin/browser/interaction-skills/viewport.md +3 -3
  120. package/dist/extensions/builtin/browser/src/browser_harness/AGENT.md +15 -15
  121. package/dist/extensions/builtin/browser/src/browser_harness/__init__.py +8 -8
  122. package/dist/extensions/builtin/browser/src/browser_harness/_ipc.py +90 -90
  123. package/dist/extensions/builtin/browser/src/browser_harness/admin.py +722 -722
  124. package/dist/extensions/builtin/browser/src/browser_harness/daemon.py +328 -328
  125. package/dist/extensions/builtin/browser/src/browser_harness/helpers.py +396 -396
  126. package/dist/extensions/builtin/browser/src/browser_harness/run.py +103 -103
  127. package/dist/extensions/builtin/discipline/skills/brainstorming/SKILL.md +33 -33
  128. package/dist/extensions/builtin/discipline/skills/executing-plans/SKILL.md +25 -25
  129. package/dist/extensions/builtin/discipline/skills/finishing-development-branch/SKILL.md +25 -25
  130. package/dist/extensions/builtin/discipline/skills/receiving-code-review/SKILL.md +22 -22
  131. package/dist/extensions/builtin/discipline/skills/requesting-code-review/SKILL.md +31 -31
  132. package/dist/extensions/builtin/discipline/skills/systematic-debugging/SKILL.md +28 -28
  133. package/dist/extensions/builtin/discipline/skills/test-driven-development/SKILL.md +32 -32
  134. package/dist/extensions/builtin/discipline/skills/using-git-worktrees/SKILL.md +25 -25
  135. package/dist/extensions/builtin/discipline/skills/verification-before-completion/SKILL.md +27 -27
  136. package/dist/extensions/builtin/discipline/skills/writing-plans/SKILL.md +26 -26
  137. package/dist/extensions/builtin/goal/README.md +67 -67
  138. package/dist/extensions/builtin/grub/README.md +112 -112
  139. package/dist/extensions/builtin/link-world/agent-workspace/README.md +16 -16
  140. package/dist/extensions/builtin/link-world/internet-search/internet-search.md +65 -65
  141. package/dist/extensions/builtin/link-world/link-world-agent.md +82 -82
  142. package/dist/extensions/builtin/link-world/linkworld.md +313 -313
  143. package/dist/extensions/builtin/link-world/network-routing/network-routing.md +67 -67
  144. package/dist/extensions/builtin/loop/README.md +92 -92
  145. package/dist/extensions/builtin/mcp/figma-design.md +68 -68
  146. package/dist/extensions/builtin/mcp/mcp-management.md +85 -85
  147. package/dist/extensions/builtin/recap/AGENT.md +15 -15
  148. package/dist/extensions/builtin/sal/README.md +72 -72
  149. package/dist/extensions/builtin/security-audit/README.md +289 -289
  150. package/dist/extensions/builtin/team/AGENT.md +112 -112
  151. package/dist/extensions/builtin/team/TESTING.md +299 -299
  152. package/dist/extensions/builtin/token-save/README.md +56 -56
  153. package/dist/extensions/optional/AGENT.md +10 -10
  154. package/dist/modes/interactive/interactive-mode.js +36 -36
  155. package/dist/modes/interactive/theme/dark.json +85 -85
  156. package/dist/modes/interactive/theme/light.json +84 -84
  157. package/dist/modes/interactive/theme/theme-schema.json +335 -335
  158. package/dist/modes/interactive/theme/warm.json +81 -81
  159. package/dist/node_modules/@pencil-agent/agent-core/dist/agent-loop.js +3 -2
  160. package/dist/node_modules/@pencil-agent/agent-core/dist/structured-adaptive-agent-loop.js +2 -1
  161. package/dist/node_modules/@pencil-agent/ai/dist/cli.js +0 -0
  162. package/docs/cc-agent-design.md +1297 -0
  163. package/docs/cc-tui-design.md +1333 -0
  164. package/docs/codex-goal-command-impl.md +1055 -1055
  165. package/docs/codex-goal-vs-grub.md +500 -500
  166. package/docs/custom-provider.md +27 -27
  167. package/docs/extensions.md +27 -27
  168. package/docs/keybindings.md +27 -27
  169. package/docs/loop 重构完成总结.md +250 -250
  170. package/docs/loop 重构完成报告.md +122 -122
  171. package/docs/loop 重构方案.md +1222 -1222
  172. package/docs/loop 重构方案实现报告.md +158 -158
  173. package/docs/loop 重构方案对比分析.md +128 -128
  174. package/docs/loop 重构计划.md +320 -320
  175. package/docs/loop-usage-examples.md +214 -214
  176. package/docs/models.md +27 -27
  177. package/docs/nanoPencil-学习计划.md +170 -0
  178. package/docs/packages.md +27 -27
  179. package/docs/pi-design-philosophy.md +457 -457
  180. package/docs/planmode.md +1987 -1987
  181. package/docs/prompt-templates.md +27 -27
  182. package/docs/providers.md +27 -27
  183. package/docs/scan-report.md +3820 -0
  184. package/docs/sdk.md +27 -27
  185. package/docs/skills.md +27 -27
  186. package/docs/themes.md +27 -27
  187. package/docs/tui.md +27 -27
  188. package/docs//345/257/271/346/240/207Claude-Code.md +1775 -0
  189. package/docs//351/230/277/351/207/214/345/267/264/345/267/264/350/264/242/346/212/245/345/210/206/346/236/220/344/271/246.md +261 -0
  190. package/package.json +190 -190
  191. package/docs/ACP/345/215/217/350/256/256/351/233/206/346/210/220/345/274/200/345/217/221/346/226/207/346/241/243.md +0 -851
  192. package/docs/SDK-TESTING.md +0 -364
  193. package/docs/mem-core/346/212/200/346/234/257/346/226/207/346/241/243.md +0 -593
  194. package/docs/startup-performance-optimization.md +0 -301
  195. package/docs/认知地图.md +0 -47
@@ -1,596 +1,596 @@
1
- # Wellfound (AngelList) — Startup Jobs & Company Profiles
2
-
3
- Field-tested against wellfound.com on 2026-04-18.
4
- All confirmed via live HTTP probes and response header analysis.
5
-
6
- ---
7
-
8
- ## Anti-bot verdict: browser required, no http_get workaround exists
9
-
10
- **`http_get` returns HTTP 403 on every Wellfound URL** — the sole exception is `robots.txt`.
11
-
12
- Tested endpoints (all 403):
13
- - `/company/stripe`
14
- - `/jobs`
15
- - `/jobs?role=engineer&location=remote`
16
- - `/company/stripe/jobs`
17
- - `/sitemap.xml`, `/sitemap_index.xml`
18
- - `/jobs.rss`
19
- - `POST /graphql` (HTTP 403, Cloudflare managed challenge)
20
-
21
- Old AngelList public API (`api.angel.co/1/...`) returns `404 Not Found` — permanently shut down.
22
-
23
- **Dual anti-bot stack confirmed from response headers:**
24
-
25
- | Layer | System | Evidence |
26
- |-------|--------|----------|
27
- | Page GETs | DataDome | `X-DataDome: protected`, `X-DD-B: 2`, `Set-Cookie: datadome=...` |
28
- | API POSTs | Cloudflare Bot Management | `Cf-Mitigated: challenge` |
29
-
30
- The 403 response body contains a DataDome captcha challenge script (`geo.captcha-delivery.com`) AND an embedded Cloudflare challenge (`window.__CF$cv$params`). Both fire simultaneously. Neither cookie can be replayed — both are TLS-fingerprint-bound.
31
-
32
- **Use `new_tab()` + `wait()` exclusively. Never use `http_get` for Wellfound.**
33
-
34
- ---
35
-
36
- ## Tech stack (confirmed from response headers)
37
-
38
- Wellfound is a **Ruby on Rails + React + Apollo GraphQL** hybrid app — NOT a pure Next.js app.
39
-
40
- Confirmed headers from `robots.txt` (the only accessible endpoint):
41
- ```
42
- x-runtime: 0.006700 → Rails rack middleware timer
43
- x-request-id: 4645fd66... → Rails request ID
44
- x-xss-protection: 1; mode=block → Rails security defaults
45
- Set-Cookie: _wellfound=... → Rails session cookie
46
- Server: cloudflare → Cloudflare CDN
47
- ```
48
-
49
- Implications:
50
- - **`__NEXT_DATA__` is NOT present** — not a Next.js app
51
- - **`window.__APOLLO_STATE__` or `window.gon` may be present** — check these instead
52
- - CSRF token is in a `<meta name="csrf-token">` tag (Rails default)
53
- - Session cookie is `_wellfound=...` for anonymous sessions; login sessions add `_wellfound_session=...`
54
-
55
- ---
56
-
57
- ## Do this first: open in new tab, wait for DataDome to resolve
58
-
59
- ```python
60
- new_tab("https://wellfound.com/company/stripe")
61
- wait_for_load()
62
- wait(5) # DataDome JS fingerprinting runs ~2-4s after readyState=complete
63
- ```
64
-
65
- Verify you are past the DataDome challenge before extracting:
66
-
67
- ```python
68
- title = js("document.title")
69
- url = page_info()["url"]
70
-
71
- if "wellfound.com" not in url or not title or "Just a moment" in title:
72
- # DataDome or CF challenge did not resolve — wait longer
73
- wait(8)
74
- title = js("document.title")
75
- if "Just a moment" in title or not title:
76
- capture_screenshot("/tmp/wellfound_block.png")
77
- raise RuntimeError("DataDome/CF challenge did not resolve — see screenshot")
78
- ```
79
-
80
- DataDome resolves **silently** in a real Chrome session via CDP — no user interaction required.
81
- The challenge is a JS fingerprint check that passes automatically when running in a real browser.
82
-
83
- ---
84
-
85
- ## URL patterns
86
-
87
- | Goal | URL |
88
- |------|-----|
89
- | Company profile | `https://wellfound.com/company/{slug}` |
90
- | Company jobs | `https://wellfound.com/company/{slug}/jobs` |
91
- | Company culture | `https://wellfound.com/company/{slug}/culture` |
92
- | Job board (all) | `https://wellfound.com/jobs` |
93
- | Job board filtered | `https://wellfound.com/jobs` — then use UI filters (query params are disallowed by robots.txt) |
94
- | Investor profile | `https://wellfound.com/investor/{slug}` |
95
- | User profile | `https://wellfound.com/u/{username}` (disallowed by robots.txt, login wall) |
96
-
97
- **Note on query params:** `robots.txt` disallows `?role=*`, `?jobId=*`, `?jobSlug=*`, `?location=*`.
98
- Wellfound enforces these with login walls or redirects for most filtered job searches.
99
-
100
- ---
101
-
102
- ## Workflow 1: Company profile — name, description, team size, funding, tags
103
-
104
- Navigate to the company page and extract structured data. Most fields are visible without login.
105
-
106
- ```python
107
- import json
108
-
109
- new_tab("https://wellfound.com/company/stripe")
110
- wait_for_load()
111
- wait(5)
112
-
113
- # Check for Apollo state (Rails + React app, not Next.js)
114
- # Wellfound embeds data in window.gon or inline script tags
115
- apollo_raw = js("""
116
- (function() {
117
- // Try window.__APOLLO_STATE__ (Apollo Client cache)
118
- if (window.__APOLLO_STATE__) return JSON.stringify(window.__APOLLO_STATE__);
119
- // Try window.gon (Rails Gon gem)
120
- if (window.gon) return JSON.stringify(window.gon);
121
- // Try inline <script> tags containing startup data
122
- var scripts = Array.from(document.querySelectorAll('script:not([src])'));
123
- for (var s of scripts) {
124
- var t = s.textContent || '';
125
- if (t.includes('"name"') && t.includes('"description"') && t.includes('teamSize')) {
126
- return t.substring(0, 5000);
127
- }
128
- }
129
- return null;
130
- })()
131
- """)
132
-
133
- if apollo_raw:
134
- try:
135
- data = json.loads(apollo_raw)
136
- # Apollo State: look for Startup:{id} keys
137
- for key, val in data.items():
138
- if key.startswith("Startup:") and isinstance(val, dict):
139
- print("Company:", val.get("name"))
140
- print("Description:", val.get("description") or val.get("highConcept"))
141
- print("Team size:", val.get("teamSize"))
142
- print("Total raised:", val.get("totalRaised"))
143
- print("Hiring:", val.get("hiring"))
144
- print(json.dumps(data, indent=2)[:3000])
145
- except json.JSONDecodeError:
146
- # Raw script tag — parse key fields with regex
147
- import re
148
- name = re.search(r'"name"\s*:\s*"([^"]+)"', apollo_raw)
149
- desc = re.search(r'"description"\s*:\s*"([^"]+)"', apollo_raw)
150
- print("Name:", name.group(1) if name else "not found")
151
- print("Desc:", desc.group(1) if desc else "not found")
152
- ```
153
-
154
- If the structured data path fails, fall back to DOM extraction:
155
-
156
- ```python
157
- # DOM extraction — company profile page
158
- profile = js("""
159
- (function() {
160
- // Company name — first h1 on the page
161
- var nameEl = document.querySelector('h1');
162
-
163
- // Description — first substantial paragraph or div with class containing 'description'
164
- var descEl = (
165
- document.querySelector('[class*="description"]') ||
166
- document.querySelector('[class*="about"]') ||
167
- document.querySelector('p[class*="startupDescription"]')
168
- );
169
-
170
- // Tags — market/role tags are links with /jobs?role= or /location/ in href
171
- // Wellfound uses Tailwind (no stable class names) — use href pattern
172
- var roleLinks = Array.from(document.querySelectorAll('a[href*="/jobs?role="]')).map(a => a.innerText.trim());
173
- var locationLinks = Array.from(document.querySelectorAll('a[href*="/location/"]')).map(a => a.innerText.trim());
174
-
175
- // Team size / funding — look in page text for patterns
176
- var bodyText = document.body.innerText;
177
-
178
- // Company size: "11-50 employees" or "51-200 people" pattern
179
- var sizeMatch = bodyText.match(/(\d+[-–]\d+)\s+(employees|people)/i);
180
- var teamSize = sizeMatch ? sizeMatch[0] : null;
181
-
182
- // Funding: "$X.XM" or "Raised $X" pattern
183
- var fundingMatch = bodyText.match(/\$[\d,.]+[KMBkm]\s*(raised|in funding|Series [A-Z])?/i);
184
- var funding = fundingMatch ? fundingMatch[0] : null;
185
-
186
- // Stage: "Series A", "Seed", "Series B", etc.
187
- var stageMatch = bodyText.match(/\b(Seed|Series [A-Z]\+?|Pre-seed|Angel|Late Stage|Public)\b/);
188
- var stage = stageMatch ? stageMatch[0] : null;
189
-
190
- return JSON.stringify({
191
- name: nameEl ? nameEl.innerText.trim() : null,
192
- desc: descEl ? descEl.innerText.trim().substring(0, 500) : null,
193
- teamSize: teamSize,
194
- funding: funding,
195
- stage: stage,
196
- roles: roleLinks.slice(0, 10),
197
- locations: locationLinks.slice(0, 5),
198
- });
199
- })()
200
- """)
201
-
202
- data = json.loads(profile)
203
- print(json.dumps(data, indent=2))
204
- ```
205
-
206
- ---
207
-
208
- ## Workflow 2: Company jobs listing
209
-
210
- ```python
211
- import json
212
-
213
- company_slug = "stripe"
214
- new_tab(f"https://wellfound.com/company/{company_slug}/jobs")
215
- wait_for_load()
216
- wait(5)
217
-
218
- jobs = js("""
219
- (function() {
220
- // Job listing cards — Wellfound uses role="listitem" or li elements in job list
221
- var cards = document.querySelectorAll('[data-test^="StartupJobListing"], li[class*="job"], div[class*="JobListing"]');
222
- if (!cards.length) {
223
- // Broad fallback: all anchor tags with /jobs/ in href
224
- var links = Array.from(document.querySelectorAll('a[href*="/jobs/"]'));
225
- return JSON.stringify(links.map(a => ({
226
- title: a.innerText.trim().split('\\n')[0],
227
- href: a.href,
228
- })).filter(j => j.title && j.title.length > 2).slice(0, 30));
229
- }
230
- return JSON.stringify(Array.from(cards).map(card => {
231
- var titleEl = card.querySelector('h2, h3, [class*="title"], [class*="jobTitle"]');
232
- var locEl = card.querySelector('[class*="location"], [class*="Location"]');
233
- var compEl = card.querySelector('[class*="salary"], [class*="comp"], [class*="equity"]');
234
- var linkEl = card.querySelector('a[href*="/jobs/"]');
235
- return {
236
- title: titleEl ? titleEl.innerText.trim() : '',
237
- location: locEl ? locEl.innerText.trim() : '',
238
- comp: compEl ? compEl.innerText.trim() : '',
239
- href: linkEl ? linkEl.href : '',
240
- };
241
- }).filter(j => j.title));
242
- })()
243
- """)
244
-
245
- results = json.loads(jobs)
246
- print(f"Found {len(results)} jobs")
247
- for j in results:
248
- print(f" {j['title']} | {j.get('location','?')} | {j.get('comp','?')}")
249
- ```
250
-
251
- ---
252
-
253
- ## Workflow 3: Job board — browse all jobs
254
-
255
- The main `/jobs` page shows a curated job feed. Filters are not accessible via URL params (DataDome blocks `?role=...`). Use the UI dropdown filters after loading the page.
256
-
257
- ```python
258
- import json
259
-
260
- new_tab("https://wellfound.com/jobs")
261
- wait_for_load()
262
- wait(5)
263
-
264
- # Extract visible job cards
265
- jobs = js("""
266
- (function() {
267
- // Job cards on the main /jobs board
268
- var cards = document.querySelectorAll(
269
- '[data-test*="job"], [class*="JobCard"], [class*="jobListing"], ' +
270
- 'li[class*="job"], article[class*="job"]'
271
- );
272
- if (!cards.length) {
273
- // Fallback: links to job detail pages
274
- var links = Array.from(document.querySelectorAll('a[href*="/company/"][href*="/jobs/"]'));
275
- return JSON.stringify(links.map(a => ({
276
- href: a.href,
277
- text: a.innerText.trim().substring(0, 100),
278
- })).slice(0, 30));
279
- }
280
- return JSON.stringify(Array.from(cards).map(card => {
281
- var titleEl = card.querySelector('h2, h3, [class*="title"]');
282
- var companyEl = card.querySelector('[class*="company"], [class*="startup"]');
283
- var locEl = card.querySelector('[class*="location"]');
284
- var linkEl = card.querySelector('a[href*="/jobs/"]');
285
- return {
286
- title: titleEl ? titleEl.innerText.trim() : '',
287
- company: companyEl ? companyEl.innerText.trim() : '',
288
- location: locEl ? locEl.innerText.trim() : '',
289
- href: linkEl ? linkEl.href : '',
290
- };
291
- }).filter(j => j.title));
292
- })()
293
- """)
294
-
295
- results = json.loads(jobs)
296
- print(f"Found {len(results)} jobs")
297
- ```
298
-
299
- ---
300
-
301
- ## Workflow 4: GraphQL API (authenticated sessions only)
302
-
303
- Wellfound's GraphQL endpoint (`/graphql`) requires:
304
- 1. A valid `_wellfound` session cookie from a real browser load
305
- 2. A CSRF token from the page's `<meta name="csrf-token">` tag
306
- 3. Cloudflare Bot Management to have passed (only happens in a real Chrome session)
307
-
308
- **This approach only works from inside a browser session (after navigating to any Wellfound page).**
309
-
310
- ```python
311
- import json
312
-
313
- # Step 1: Load any Wellfound page so the session cookie + DataDome cookie are set
314
- new_tab("https://wellfound.com/")
315
- wait_for_load()
316
- wait(5)
317
-
318
- # Step 2: Extract CSRF token from meta tag
319
- csrf = js("document.querySelector('meta[name=\"csrf-token\"]') ? document.querySelector('meta[name=\"csrf-token\"]').getAttribute('content') : null")
320
- if not csrf:
321
- raise RuntimeError("CSRF token not found — page may not have loaded correctly")
322
-
323
- print(f"CSRF token: {csrf[:20]}...")
324
-
325
- # Step 3: Execute GraphQL query via fetch() from within the browser
326
- # This uses the browser's existing cookies automatically
327
- result = js(f"""
328
- (async function() {{
329
- try {{
330
- var resp = await fetch('/graphql', {{
331
- method: 'POST',
332
- credentials: 'include',
333
- headers: {{
334
- 'Content-Type': 'application/json',
335
- 'Accept': 'application/json',
336
- 'x-csrf-token': '{csrf}',
337
- 'x-requested-with': 'XMLHttpRequest',
338
- }},
339
- body: JSON.stringify({{
340
- query: `query StartupShow($slug: String!) {{
341
- startup(slug: $slug) {{
342
- id
343
- name
344
- description: highConcept
345
- productDesc
346
- teamSize
347
- locations {{ displayName }}
348
- markets {{ displayName }}
349
- totalRaised
350
- fundingStage
351
- badges
352
- hiring
353
- jobListingsCount
354
- }}
355
- }}`,
356
- variables: {{ slug: "stripe" }}
357
- }})
358
- }});
359
- var data = await resp.json();
360
- return JSON.stringify(data);
361
- }} catch(e) {{
362
- return JSON.stringify({{error: e.message}});
363
- }}
364
- }})()
365
- """)
366
-
367
- # js() with async returns a Promise — use js_async() if available, or eval trick:
368
- # Note: the above may return None if js() doesn't await Promises.
369
- # Use this pattern instead if js() doesn't handle async:
370
- result_sync = js("""
371
- var done = false, out = null;
372
- fetch('/graphql', {
373
- method: 'POST',
374
- credentials: 'include',
375
- headers: {
376
- 'Content-Type': 'application/json',
377
- 'Accept': 'application/json',
378
- 'x-csrf-token': document.querySelector('meta[name="csrf-token"]').content,
379
- 'x-requested-with': 'XMLHttpRequest',
380
- },
381
- body: JSON.stringify({
382
- query: '{ __typename }',
383
- })
384
- }).then(r => r.json()).then(d => { window._wf_gql_result = JSON.stringify(d); });
385
- 'pending'
386
- """)
387
- # Wait for async result
388
- import time; time.sleep(3)
389
- gql_result = js("window._wf_gql_result || null")
390
- if gql_result:
391
- data = json.loads(gql_result)
392
- print("GraphQL response:", json.dumps(data, indent=2)[:1000])
393
- ```
394
-
395
- ### Known GraphQL operations
396
-
397
- | Operation | Purpose |
398
- |-----------|---------|
399
- | `StartupShow` | Full company profile (name, desc, funding, team size, markets) |
400
- | `JobListingsIndex` | Paginated job board |
401
- | `JobSearch` | Filtered job search by role/location |
402
- | `UserProfile` | User/candidate profile |
403
- | `InvestorShow` | VC/investor profile |
404
-
405
- ---
406
-
407
- ## Handling the login wall
408
-
409
- Wellfound shows a sign-in modal on:
410
- - Job detail pages (immediately or after 2-3 seconds)
411
- - Candidate profile pages (immediately)
412
- - Some company pages after scrolling
413
-
414
- Company overview pages typically show content without login. Job listings require login to see full details and apply.
415
-
416
- ```python
417
- def dismiss_wellfound_login_modal():
418
- """Close the Wellfound sign-in modal. Safe to call if no modal is present."""
419
- closed = js("""
420
- (function() {
421
- var selectors = [
422
- 'button[aria-label="Close"]',
423
- 'button[class*="close"]',
424
- 'button[class*="Close"]',
425
- '[data-test="close-modal"]',
426
- '[aria-label="Dismiss"]',
427
- 'button[class*="dismiss"]',
428
- // Wellfound-specific: modal overlay dismiss
429
- 'div[class*="Modal"] button[type="button"]',
430
- ];
431
- for (var s of selectors) {
432
- var btn = document.querySelector(s);
433
- if (btn && btn.offsetParent !== null) {
434
- btn.click();
435
- return s;
436
- }
437
- }
438
- // Try pressing Escape
439
- document.dispatchEvent(new KeyboardEvent('keydown', {key: 'Escape', keyCode: 27, bubbles: true}));
440
- return 'escape';
441
- })()
442
- """)
443
- if closed:
444
- wait(1)
445
- return closed
446
- ```
447
-
448
- ---
449
-
450
- ## Detecting DataDome / challenge page
451
-
452
- After `new_tab()` + `wait(5)`, verify you are on a real Wellfound page:
453
-
454
- ```python
455
- def wellfound_is_blocked() -> bool:
456
- """True if DataDome or Cloudflare challenge is still showing."""
457
- title = js("document.title") or ""
458
- url = page_info()["url"]
459
- # DataDome challenge page has no useful title; CF shows "Just a moment..."
460
- blocked = (
461
- "Just a moment" in title or
462
- "wellfound.com" not in url or
463
- "captcha-delivery.com" in js("document.body.innerHTML or ''") or
464
- not title
465
- )
466
- return blocked
467
-
468
- # Usage
469
- new_tab("https://wellfound.com/company/stripe")
470
- wait_for_load()
471
- wait(5)
472
-
473
- if wellfound_is_blocked():
474
- wait(8) # DataDome sometimes needs up to 10s total
475
- if wellfound_is_blocked():
476
- capture_screenshot("/tmp/wellfound_blocked.png")
477
- raise RuntimeError("DataDome/CF challenge did not resolve — see /tmp/wellfound_blocked.png")
478
- ```
479
-
480
- ---
481
-
482
- ## Key selectors reference
483
-
484
- Wellfound uses **Tailwind CSS** — no stable semantic class names. These patterns are robust:
485
-
486
- | Target | Selector strategy |
487
- |--------|------------------|
488
- | Company name | `h1` (first on page) |
489
- | Company description | `[class*="description"]`, `[class*="about"]` |
490
- | Team size | Text search: `/\d+[-–]\d+\s+(employees\|people)/i` |
491
- | Funding amount | Text search: `/\$[\d,.]+[KMBkm]/i` |
492
- | Funding stage | Text search: `/\b(Seed\|Series [A-Z]\+?\|Pre-seed\|Late Stage)\b/` |
493
- | Role/market tags | `a[href*="/jobs?role="]` |
494
- | Location tags | `a[href*="/location/"]` |
495
- | Job cards | `a[href*="/company/"][href*="/jobs/"]` (broad fallback) |
496
- | Job title | `h2`, `h3`, `[class*="title"]` within card |
497
- | CSRF token | `meta[name="csrf-token"]` |
498
- | Login modal | `button[aria-label="Close"]`, Escape key |
499
-
500
- ---
501
-
502
- ## Common pitfalls
503
-
504
- 1. **`http_get` is permanently blocked.** DataDome intercepts all non-browser HTTP requests with
505
- a 403 + captcha challenge. No User-Agent, header combination, or cookie replay works.
506
- `api.angel.co` is HTTP 404 (shut down). Use `new_tab()` exclusively.
507
-
508
- 2. **NOT a Next.js app.** Wellfound is Ruby on Rails + React. There is no `__NEXT_DATA__` JSON
509
- blob. Look for `window.__APOLLO_STATE__`, `window.gon`, or inline `<script>` tags instead.
510
-
511
- 3. **`wait(5)` minimum after `wait_for_load()`.** DataDome runs JS fingerprinting probes for
512
- 2-4 seconds after `readyState = complete`. Extracting before this resolves returns the challenge
513
- page HTML, not real content.
514
-
515
- 4. **Tailwind CSS — no stable class names.** Wellfound uses Tailwind utility classes. Never
516
- hardcode a specific class name. Use `href` attribute patterns, `data-test` attributes if present,
517
- or semantic element selectors (`h1`, `h2`, `li`, `article`).
518
-
519
- 5. **GraphQL requires both CSRF token AND browser session cookies.** The CSRF token is a
520
- per-session value from `<meta name="csrf-token">`. Cloudflare Bot Management blocks
521
- `POST /graphql` from non-browser sessions. Always fire GraphQL via `fetch()` inside the
522
- browser session (not from Python's `http_get`).
523
-
524
- 6. **`?role=` and `?location=` params are robots.txt-disallowed.** Wellfound may redirect or
525
- show a login wall for filtered job search URLs. Load `/jobs` unfiltered and use in-page
526
- UI filters (dropdowns) to narrow results.
527
-
528
- 7. **Login wall on job details and user profiles.** Company overview pages load without login.
529
- Individual job detail pages, and all `/u/{username}` profiles, hit a login modal immediately.
530
- Call `dismiss_wellfound_login_modal()` right after `wait(5)` on these pages.
531
-
532
- 8. **Rate limiting.** After ~5-10 rapid page navigations DataDome may harden. Use `wait(3)` between
533
- `goto_url()` calls. If you get a captcha that does not auto-resolve, wait 30-60 seconds.
534
-
535
- 9. **`new_tab()` over `goto_url()` for the first Wellfound page.** `goto_url()` in an existing tab
536
- may inherit a stale DataDome fingerprint. `new_tab()` gives a clean origin context that
537
- DataDome processes cleanly.
538
-
539
- ---
540
-
541
- ## Anti-bot response identification
542
-
543
- What you see in the 403 body when NOT in a browser:
544
-
545
- ```html
546
- <!-- DataDome challenge (page GETs) -->
547
- <script>var dd={'rt':'c','cid':'...','t':'bv','host':'geo.captcha-delivery.com',...}</script>
548
- <script src="https://ct.captcha-delivery.com/c.js"></script>
549
- <!-- rt='c' = captcha required; rt='i' = invisible solve; rt='b' = blocked -->
550
-
551
- <!-- Cloudflare challenge (API POSTs) -->
552
- <title>Just a moment...</title>
553
- <script>window.__CF$cv$params={r:'...',t:'...'}</script>
554
- ```
555
-
556
- In a real Chrome browser, both challenges resolve automatically without user interaction.
557
-
558
- ---
559
-
560
- ## Minimal working example
561
-
562
- ```python
563
- import json
564
-
565
- # Open Wellfound company page
566
- new_tab("https://wellfound.com/company/openai")
567
- wait_for_load()
568
- wait(5)
569
-
570
- # Verify not blocked
571
- title = js("document.title")
572
- assert "Just a moment" not in (title or ""), f"Still on challenge page: {title}"
573
-
574
- # Extract company overview
575
- data = js("""
576
- (function() {
577
- var name = document.querySelector('h1');
578
- var bodyText = document.body.innerText;
579
- var sizeMatch = bodyText.match(/(\\d+[-\\u2013]\\d+)\\s+(employees|people)/i);
580
- var fundingMatch = bodyText.match(/\\$[\\d,.]+[KMBkm](?:\\s+(?:raised|total))?/i);
581
- var stageMatch = bodyText.match(/\\b(Seed|Series [A-Z]\\+?|Pre-seed|Late Stage|Public)\\b/);
582
- var tags = Array.from(document.querySelectorAll('a[href*="/jobs?role="]')).map(a => a.innerText.trim());
583
- var locs = Array.from(document.querySelectorAll('a[href*="/location/"]')).map(a => a.innerText.trim());
584
- return JSON.stringify({
585
- name: name ? name.innerText.trim() : null,
586
- teamSize: sizeMatch ? sizeMatch[0] : null,
587
- funding: fundingMatch ? fundingMatch[0] : null,
588
- stage: stageMatch ? stageMatch[0] : null,
589
- roles: tags.slice(0, 8),
590
- locations: locs.slice(0, 5),
591
- });
592
- })()
593
- """)
594
-
595
- print(json.dumps(json.loads(data), indent=2))
596
- ```
1
+ # Wellfound (AngelList) — Startup Jobs & Company Profiles
2
+
3
+ Field-tested against wellfound.com on 2026-04-18.
4
+ All confirmed via live HTTP probes and response header analysis.
5
+
6
+ ---
7
+
8
+ ## Anti-bot verdict: browser required, no http_get workaround exists
9
+
10
+ **`http_get` returns HTTP 403 on every Wellfound URL without exception** (except `robots.txt`).
11
+
12
+ Tested endpoints (all 403):
13
+ - `/company/stripe`
14
+ - `/jobs`
15
+ - `/jobs?role=engineer&location=remote`
16
+ - `/company/stripe/jobs`
17
+ - `/sitemap.xml`, `/sitemap_index.xml`
18
+ - `/jobs.rss`
19
+ - `POST /graphql` (HTTP 403, Cloudflare managed challenge)
20
+
21
+ Old AngelList public API (`api.angel.co/1/...`) returns `404 Not Found` — permanently shut down.
22
+
23
+ **Dual anti-bot stack confirmed from response headers:**
24
+
25
+ | Layer | System | Evidence |
26
+ |-------|--------|----------|
27
+ | Page GETs | DataDome | `X-DataDome: protected`, `X-DD-B: 2`, `Set-Cookie: datadome=...` |
28
+ | API POSTs | Cloudflare Bot Management | `Cf-Mitigated: challenge` |
29
+
30
+ The 403 response body contains a DataDome captcha challenge script (`geo.captcha-delivery.com`) AND an embedded Cloudflare challenge (`window.__CF$cv$params`). Both fire simultaneously. Neither cookie can be replayed — both are TLS-fingerprint-bound.
31
+
32
+ **Use `new_tab()` + `wait()` exclusively. Never use `http_get` for Wellfound.**
33
+
34
+ ---
35
+
36
+ ## Tech stack (confirmed from response headers)
37
+
38
+ Wellfound is a **Ruby on Rails + React + Apollo GraphQL** hybrid app — NOT a pure Next.js app.
39
+
40
+ Confirmed headers from `robots.txt` (the only accessible endpoint):
41
+ ```
42
+ x-runtime: 0.006700 → Rails rack middleware timer
43
+ x-request-id: 4645fd66... → Rails request ID
44
+ x-xss-protection: 1; mode=block → Rails security defaults
45
+ Set-Cookie: _wellfound=... → Rails session cookie
46
+ Server: cloudflare → Cloudflare CDN
47
+ ```
48
+
49
+ Implications:
50
+ - **`__NEXT_DATA__` is NOT present** — not a Next.js app
51
+ - **`window.__APOLLO_STATE__` or `window.gon` may be present** — check these instead
52
+ - CSRF token is in a `<meta name="csrf-token">` tag (Rails default)
53
+ - Session cookie is `_wellfound=...` for anonymous sessions; login sessions add `_wellfound_session=...`
54
+
55
+ ---
56
+
57
+ ## Do this first: open in new tab, wait for DataDome to resolve
58
+
59
+ ```python
60
+ new_tab("https://wellfound.com/company/stripe")
61
+ wait_for_load()
62
+ wait(5) # DataDome JS fingerprinting runs ~2-4s after readyState=complete
63
+ ```
64
+
65
+ Verify you are past the DataDome challenge before extracting:
66
+
67
+ ```python
68
+ title = js("document.title")
69
+ url = page_info()["url"]
70
+
71
+ if "wellfound.com" not in url or not title or "Just a moment" in title:
72
+ # DataDome or CF challenge did not resolve — wait longer
73
+ wait(8)
74
+ title = js("document.title")
75
+ if not title or "Just a moment" in title:
76
+ capture_screenshot("/tmp/wellfound_block.png")
77
+ raise RuntimeError("DataDome/CF challenge did not resolve — see screenshot")
78
+ ```
79
+
80
+ DataDome resolves **silently** in a real Chrome session via CDP — no user interaction required.
81
+ The challenge is a JS fingerprint check that passes automatically when running in a real browser.
82
+
83
+ ---
84
+
85
+ ## URL patterns
86
+
87
+ | Goal | URL |
88
+ |------|-----|
89
+ | Company profile | `https://wellfound.com/company/{slug}` |
90
+ | Company jobs | `https://wellfound.com/company/{slug}/jobs` |
91
+ | Company culture | `https://wellfound.com/company/{slug}/culture` |
92
+ | Job board (all) | `https://wellfound.com/jobs` |
93
+ | Job board filtered | `https://wellfound.com/jobs` — then use UI filters (query params are disallowed by robots.txt) |
94
+ | Investor profile | `https://wellfound.com/investor/{slug}` |
95
+ | User profile | `https://wellfound.com/u/{username}` (disallowed by robots.txt, login wall) |
96
+
97
+ **Note on query params:** `robots.txt` disallows `?role=*`, `?jobId=*`, `?jobSlug=*`, `?location=*`.
98
+ Wellfound enforces these with login walls or redirects for most filtered job searches.
99
+
100
+ ---
101
+
102
+ ## Workflow 1: Company profile — name, description, team size, funding, tags
103
+
104
+ Navigate to the company page and extract structured data. Most fields are visible without login.
105
+
106
+ ```python
107
+ import json
108
+
109
+ new_tab("https://wellfound.com/company/stripe")
110
+ wait_for_load()
111
+ wait(5)
112
+
113
+ # Check for Apollo state (Rails + React app, not Next.js)
114
+ # Wellfound embeds data in window.gon or inline script tags
115
+ apollo_raw = js("""
116
+ (function() {
117
+ // Try window.__APOLLO_STATE__ (Apollo Client cache)
118
+ if (window.__APOLLO_STATE__) return JSON.stringify(window.__APOLLO_STATE__);
119
+ // Try window.gon (Rails Gon gem)
120
+ if (window.gon) return JSON.stringify(window.gon);
121
+ // Try inline <script> tags containing startup data
122
+ var scripts = Array.from(document.querySelectorAll('script:not([src])'));
123
+ for (var s of scripts) {
124
+ var t = s.textContent || '';
125
+ if (t.includes('"name"') && t.includes('"description"') && t.includes('teamSize')) {
126
+ return t.substring(0, 5000);
127
+ }
128
+ }
129
+ return null;
130
+ })()
131
+ """)
132
+
133
+ if apollo_raw:
134
+ try:
135
+ data = json.loads(apollo_raw)
136
+ # Apollo State: look for Startup:{id} keys
137
+ for key, val in data.items():
138
+ if key.startswith("Startup:") and isinstance(val, dict):
139
+ print("Company:", val.get("name"))
140
+ print("Description:", val.get("description") or val.get("highConcept"))
141
+ print("Team size:", val.get("teamSize"))
142
+ print("Total raised:", val.get("totalRaised"))
143
+ print("Hiring:", val.get("hiring"))
144
+ print(json.dumps(data, indent=2)[:3000])
145
+ except json.JSONDecodeError:
146
+ # Raw script tag — parse key fields with regex
147
+ import re
148
+ name = re.search(r'"name"\s*:\s*"([^"]+)"', apollo_raw)
149
+ desc = re.search(r'"description"\s*:\s*"([^"]+)"', apollo_raw)
150
+ print("Name:", name.group(1) if name else "not found")
151
+ print("Desc:", desc.group(1) if desc else "not found")
152
+ ```
153
+
154
+ If the structured data path fails, fall back to DOM extraction:
155
+
156
+ ```python
157
+ # DOM extraction — company profile page
158
+ profile = js("""
159
+ (function() {
160
+ // Company name — first h1 on the page
161
+ var nameEl = document.querySelector('h1');
162
+
163
+ // Description — first substantial paragraph or div with class containing 'description'
164
+ var descEl = (
165
+ document.querySelector('[class*="description"]') ||
166
+ document.querySelector('[class*="about"]') ||
167
+ document.querySelector('p[class*="startupDescription"]')
168
+ );
169
+
170
+ // Tags — market/role tags are links with /jobs?role= or /location/ in href
171
+ // Wellfound uses Tailwind (no stable class names) — use href pattern
172
+ var roleLinks = Array.from(document.querySelectorAll('a[href*="/jobs?role="]')).map(a => a.innerText.trim());
173
+ var locationLinks = Array.from(document.querySelectorAll('a[href*="/location/"]')).map(a => a.innerText.trim());
174
+
175
+ // Team size / funding — look in page text for patterns
176
+ var bodyText = document.body.innerText;
177
+
178
+ // Company size: "11-50 employees" or "51-200 people" pattern
179
+ var sizeMatch = bodyText.match(/(\\d+[-–]\\d+)\\s+(employees|people)/i);
180
+ var teamSize = sizeMatch ? sizeMatch[0] : null;
181
+
182
+ // Funding: "$X.XM" or "Raised $X" pattern
183
+ var fundingMatch = bodyText.match(/\\$[\\d,.]+[KMBkm]\\s*(raised|in funding|Series [A-Z])?/i);
184
+ var funding = fundingMatch ? fundingMatch[0] : null;
185
+
186
+ // Stage: "Series A", "Seed", "Series B", etc.
187
+ var stageMatch = bodyText.match(/\\b(Seed|Series [A-Z]\\+?|Pre-seed|Angel|Late Stage|Public)\\b/);
188
+ var stage = stageMatch ? stageMatch[0] : null;
189
+
190
+ return JSON.stringify({
191
+ name: nameEl ? nameEl.innerText.trim() : null,
192
+ desc: descEl ? descEl.innerText.trim().substring(0, 500) : null,
193
+ teamSize: teamSize,
194
+ funding: funding,
195
+ stage: stage,
196
+ roles: roleLinks.slice(0, 10),
197
+ locations: locationLinks.slice(0, 5),
198
+ });
199
+ })()
200
+ """)
201
+
202
+ data = json.loads(profile)
203
+ print(json.dumps(data, indent=2))
204
+ ```
205
+
206
+ ---
207
+
208
+ ## Workflow 2: Company jobs listing
209
+
210
+ ```python
211
+ import json
212
+
213
+ company_slug = "stripe"
214
+ new_tab(f"https://wellfound.com/company/{company_slug}/jobs")
215
+ wait_for_load()
216
+ wait(5)
217
+
218
+ jobs = js("""
219
+ (function() {
220
+ // Job listing cards — try data-test hooks first, then li/div elements whose class name mentions "job"/"JobListing"
221
+ var cards = document.querySelectorAll('[data-test^="StartupJobListing"], li[class*="job"], div[class*="JobListing"]');
222
+ if (!cards.length) {
223
+ // Broad fallback: all anchor tags with /jobs/ in href
224
+ var links = Array.from(document.querySelectorAll('a[href*="/jobs/"]'));
225
+ return JSON.stringify(links.map(a => ({
226
+ title: a.innerText.trim().split('\\n')[0],
227
+ href: a.href,
228
+ })).filter(j => j.title && j.title.length > 2).slice(0, 30));
229
+ }
230
+ return JSON.stringify(Array.from(cards).map(card => {
231
+ var titleEl = card.querySelector('h2, h3, [class*="title"], [class*="jobTitle"]');
232
+ var locEl = card.querySelector('[class*="location"], [class*="Location"]');
233
+ var compEl = card.querySelector('[class*="salary"], [class*="comp"], [class*="equity"]');
234
+ var linkEl = card.querySelector('a[href*="/jobs/"]');
235
+ return {
236
+ title: titleEl ? titleEl.innerText.trim() : '',
237
+ location: locEl ? locEl.innerText.trim() : '',
238
+ comp: compEl ? compEl.innerText.trim() : '',
239
+ href: linkEl ? linkEl.href : '',
240
+ };
241
+ }).filter(j => j.title));
242
+ })()
243
+ """)
244
+
245
+ results = json.loads(jobs)
246
+ print(f"Found {len(results)} jobs")
247
+ for j in results:
248
+ print(f" {j['title']} | {j.get('location','?')} | {j.get('comp','?')}")
249
+ ```
250
+
251
+ ---
252
+
253
+ ## Workflow 3: Job board — browse all jobs
254
+
255
+ The main `/jobs` page shows a curated job feed. Do not rely on URL params for filtering: `?role=...` and `?location=...` are disallowed by robots.txt and usually hit a login wall or redirect. Use the UI dropdown filters after loading the page.
256
+
257
+ ```python
258
+ import json
259
+
260
+ new_tab("https://wellfound.com/jobs")
261
+ wait_for_load()
262
+ wait(5)
263
+
264
+ # Extract visible job cards
265
+ jobs = js("""
266
+ (function() {
267
+ // Job cards on the main /jobs board
268
+ var cards = document.querySelectorAll(
269
+ '[data-test*="job"], [class*="JobCard"], [class*="jobListing"], ' +
270
+ 'li[class*="job"], article[class*="job"]'
271
+ );
272
+ if (!cards.length) {
273
+ // Fallback: links to job detail pages
274
+ var links = Array.from(document.querySelectorAll('a[href*="/company/"][href*="/jobs/"]'));
275
+ return JSON.stringify(links.map(a => ({
276
+ href: a.href,
277
+ text: a.innerText.trim().substring(0, 100),
278
+ })).slice(0, 30));
279
+ }
280
+ return JSON.stringify(Array.from(cards).map(card => {
281
+ var titleEl = card.querySelector('h2, h3, [class*="title"]');
282
+ var companyEl = card.querySelector('[class*="company"], [class*="startup"]');
283
+ var locEl = card.querySelector('[class*="location"]');
284
+ var linkEl = card.querySelector('a[href*="/jobs/"]');
285
+ return {
286
+ title: titleEl ? titleEl.innerText.trim() : '',
287
+ company: companyEl ? companyEl.innerText.trim() : '',
288
+ location: locEl ? locEl.innerText.trim() : '',
289
+ href: linkEl ? linkEl.href : '',
290
+ };
291
+ }).filter(j => j.title));
292
+ })()
293
+ """)
294
+
295
+ results = json.loads(jobs)
296
+ print(f"Found {len(results)} jobs")
297
+ ```
298
+
299
+ ---
300
+
301
+ ## Workflow 4: GraphQL API (authenticated sessions only)
302
+
303
+ Wellfound's GraphQL endpoint (`/graphql`) requires:
304
+ 1. A valid `_wellfound` session cookie from a real browser load
305
+ 2. A CSRF token from the page's `<meta name="csrf-token">` tag
306
+ 3. Cloudflare Bot Management to have passed (only happens in a real Chrome session)
307
+
308
+ **This approach only works from inside a browser session (after navigating to any Wellfound page).**
309
+
310
+ ```python
311
+ import json
312
+
313
+ # Step 1: Load any Wellfound page so the session cookie + DataDome cookie are set
314
+ new_tab("https://wellfound.com/")
315
+ wait_for_load()
316
+ wait(5)
317
+
318
+ # Step 2: Extract CSRF token from meta tag
319
+ csrf = js("document.querySelector('meta[name=\"csrf-token\"]') ? document.querySelector('meta[name=\"csrf-token\"]').getAttribute('content') : null")
320
+ if not csrf:
321
+ raise RuntimeError("CSRF token not found — page may not have loaded correctly")
322
+
323
+ print(f"CSRF token: {csrf[:20]}...")
324
+
325
+ # Step 3: Execute GraphQL query via fetch() from within the browser
326
+ # This uses the browser's existing cookies automatically
327
+ result = js(f"""
328
+ (async function() {{
329
+ try {{
330
+ var resp = await fetch('/graphql', {{
331
+ method: 'POST',
332
+ credentials: 'include',
333
+ headers: {{
334
+ 'Content-Type': 'application/json',
335
+ 'Accept': 'application/json',
336
+ 'x-csrf-token': '{csrf}',
337
+ 'x-requested-with': 'XMLHttpRequest',
338
+ }},
339
+ body: JSON.stringify({{
340
+ query: `query StartupShow($slug: String!) {{
341
+ startup(slug: $slug) {{
342
+ id
343
+ name
344
+ description: highConcept
345
+ productDesc
346
+ teamSize
347
+ locations {{ displayName }}
348
+ markets {{ displayName }}
349
+ totalRaised
350
+ fundingStage
351
+ badges
352
+ hiring
353
+ jobListingsCount
354
+ }}
355
+ }}`,
356
+ variables: {{ slug: "stripe" }}
357
+ }})
358
+ }});
359
+ var data = await resp.json();
360
+ return JSON.stringify(data);
361
+ }} catch(e) {{
362
+ return JSON.stringify({{error: e.message}});
363
+ }}
364
+ }})()
365
+ """)
366
+
367
+ # js() with async returns a Promise — use js_async() if available, or eval trick:
368
+ # Note: the above may return None if js() doesn't await Promises.
369
+ # Use this pattern instead if js() doesn't handle async:
370
+ result_sync = js("""
371
+ var done = false, out = null;
372
+ fetch('/graphql', {
373
+ method: 'POST',
374
+ credentials: 'include',
375
+ headers: {
376
+ 'Content-Type': 'application/json',
377
+ 'Accept': 'application/json',
378
+ 'x-csrf-token': document.querySelector('meta[name="csrf-token"]').content,
379
+ 'x-requested-with': 'XMLHttpRequest',
380
+ },
381
+ body: JSON.stringify({
382
+ query: '{ __typename }',
383
+ })
384
+ }).then(r => r.json()).then(d => { window._wf_gql_result = JSON.stringify(d); });
385
+ 'pending'
386
+ """)
387
+ # Wait for async result
388
+ import time; time.sleep(3)
389
+ gql_result = js("window._wf_gql_result || null")
390
+ if gql_result:
391
+ data = json.loads(gql_result)
392
+ print("GraphQL response:", json.dumps(data, indent=2)[:1000])
393
+ ```
394
+
395
+ ### Known GraphQL operations
396
+
397
+ | Operation | Purpose |
398
+ |-----------|---------|
399
+ | `StartupShow` | Full company profile (name, desc, funding, team size, markets) |
400
+ | `JobListingsIndex` | Paginated job board |
401
+ | `JobSearch` | Filtered job search by role/location |
402
+ | `UserProfile` | User/candidate profile |
403
+ | `InvestorShow` | VC/investor profile |
404
+
405
+ ---
406
+
407
+ ## Handling the login wall
408
+
409
+ Wellfound shows a sign-in modal on:
410
+ - Job detail pages (immediately or after 2-3 seconds)
411
+ - Candidate profile pages (immediately)
412
+ - Some company pages after scrolling
413
+
414
+ Company overview pages typically show content without login. Job listings require login to see full details and apply.
415
+
416
+ ```python
417
+ def dismiss_wellfound_login_modal():
418
+ """Close the Wellfound sign-in modal. Safe to call if no modal is present."""
419
+ closed = js("""
420
+ (function() {
421
+ var selectors = [
422
+ 'button[aria-label="Close"]',
423
+ 'button[class*="close"]',
424
+ 'button[class*="Close"]',
425
+ '[data-test="close-modal"]',
426
+ '[aria-label="Dismiss"]',
427
+ 'button[class*="dismiss"]',
428
+ // Wellfound-specific: modal overlay dismiss
429
+ 'div[class*="Modal"] button[type="button"]',
430
+ ];
431
+ for (var s of selectors) {
432
+ var btn = document.querySelector(s);
433
+ if (btn && btn.offsetParent !== null) {
434
+ btn.click();
435
+ return s;
436
+ }
437
+ }
438
+ // Try pressing Escape
439
+ document.dispatchEvent(new KeyboardEvent('keydown', {key: 'Escape', keyCode: 27, bubbles: true}));
440
+ return 'escape';
441
+ })()
442
+ """)
443
+ if closed:
444
+ wait(1)
445
+ return closed
446
+ ```
447
+
448
+ ---
449
+
450
+ ## Detecting DataDome / challenge page
451
+
452
+ After `new_tab()` + `wait(5)`, verify you are on a real Wellfound page:
453
+
454
+ ```python
455
+ def wellfound_is_blocked() -> bool:
456
+ """True if DataDome or Cloudflare challenge is still showing."""
457
+ title = js("document.title") or ""
458
+ url = page_info()["url"]
459
+ # DataDome challenge page has no useful title; CF shows "Just a moment..."
460
+ blocked = (
461
+ "Just a moment" in title or
462
+ "wellfound.com" not in url or
463
+ "captcha-delivery.com" in (js("document.body.innerHTML") or "") or
464
+ not title
465
+ )
466
+ return blocked
467
+
468
+ # Usage
469
+ new_tab("https://wellfound.com/company/stripe")
470
+ wait_for_load()
471
+ wait(5)
472
+
473
+ if wellfound_is_blocked():
474
+ wait(8) # DataDome sometimes needs up to 10s total
475
+ if wellfound_is_blocked():
476
+ capture_screenshot("/tmp/wellfound_blocked.png")
477
+ raise RuntimeError("DataDome/CF challenge did not resolve — see /tmp/wellfound_blocked.png")
478
+ ```
479
+
480
+ ---
481
+
482
+ ## Key selectors reference
483
+
484
+ Wellfound uses **Tailwind CSS** — no stable semantic class names. These patterns are robust:
485
+
486
+ | Target | Selector strategy |
487
+ |--------|------------------|
488
+ | Company name | `h1` (first on page) |
489
+ | Company description | `[class*="description"]`, `[class*="about"]` |
490
+ | Team size | Text search: `/\d+[-–]\d+\s+(employees\|people)/i` |
491
+ | Funding amount | Text search: `/\$[\d,.]+[KMBkm]/i` |
492
+ | Funding stage | Text search: `/\b(Seed\|Series [A-Z]\+?\|Pre-seed\|Late Stage)\b/` |
493
+ | Role/market tags | `a[href*="/jobs?role="]` |
494
+ | Location tags | `a[href*="/location/"]` |
495
+ | Job cards | `a[href*="/company/"][href*="/jobs/"]` (broad fallback) |
496
+ | Job title | `h2`, `h3`, `[class*="title"]` within card |
497
+ | CSRF token | `meta[name="csrf-token"]` |
498
+ | Login modal | `button[aria-label="Close"]`, Escape key |
499
+
500
+ ---
501
+
502
+ ## Common pitfalls
503
+
504
+ 1. **`http_get` is permanently blocked.** DataDome intercepts all non-browser HTTP requests with
505
+ a 403 + captcha challenge. No User-Agent, header combination, or cookie replay works.
506
+ `api.angel.co` is HTTP 404 (shut down). Use `new_tab()` exclusively.
507
+
508
+ 2. **NOT a Next.js app.** Wellfound is Ruby on Rails + React. There is no `__NEXT_DATA__` JSON
509
+ blob. Look for `window.__APOLLO_STATE__`, `window.gon`, or inline `<script>` tags instead.
510
+
511
+ 3. **`wait(5)` minimum after `wait_for_load()`.** DataDome runs JS fingerprinting probes for
512
+ 2-4 seconds after `readyState = complete`. Extracting before this resolves returns the challenge
513
+ page HTML, not real content.
514
+
515
+ 4. **Tailwind CSS — no stable class names.** Wellfound uses Tailwind utility classes. Never
516
+ hardcode a specific class name. Use `href` attribute patterns, `data-test` attributes if present,
517
+ or semantic element selectors (`h1`, `h2`, `li`, `article`).
518
+
519
+ 5. **GraphQL requires both CSRF token AND browser session cookies.** The CSRF token is a
520
+ per-session value from `<meta name="csrf-token">`. Cloudflare Bot Management blocks
521
+ `POST /graphql` from non-browser sessions. Always fire GraphQL via `fetch()` inside the
522
+ browser session (not from Python's `http_get`).
523
+
524
+ 6. **`?role=` and `?location=` params are robots.txt-disallowed.** Wellfound may redirect or
525
+ show a login wall for filtered job search URLs. Load `/jobs` unfiltered and use in-page
526
+ UI filters (dropdowns) to narrow results.
527
+
528
+ 7. **Login wall on job details and user profiles.** Company overview pages load without login.
529
+ Individual job detail pages, and all `/u/{username}` profiles, hit a login modal immediately.
530
+ Call `dismiss_wellfound_login_modal()` right after `wait(5)` on these pages.
531
+
532
+ 8. **Rate limiting.** After ~5-10 rapid page navigations DataDome may harden. Use `wait(3)` between
533
+ `goto_url()` calls. If you get a captcha that does not auto-resolve, wait 30-60 seconds.
534
+
535
+ 9. **`new_tab()` over `goto_url()` for the first Wellfound page.** `goto_url()` in an existing tab
536
+ may inherit a stale DataDome fingerprint. `new_tab()` gives a clean origin context that
537
+ DataDome processes cleanly.
538
+
539
+ ---
540
+
541
+ ## Anti-bot response identification
542
+
543
+ What you see in the 403 body when NOT in a browser:
544
+
545
+ ```html
546
+ <!-- DataDome challenge (page GETs) -->
547
+ <script>var dd={'rt':'c','cid':'...','t':'bv','host':'geo.captcha-delivery.com',...}</script>
548
+ <script src="https://ct.captcha-delivery.com/c.js"></script>
549
+ <!-- rt='c' = captcha required; rt='i' = invisible solve; rt='b' = blocked -->
550
+
551
+ <!-- Cloudflare challenge (API POSTs) -->
552
+ <title>Just a moment...</title>
553
+ <script>window.__CF$cv$params={r:'...',t:'...'}</script>
554
+ ```
555
+
556
+ In a real Chrome browser, both challenges resolve automatically without user interaction.
557
+
558
+ ---
559
+
560
+ ## Minimal working example
561
+
562
+ ```python
563
+ import json
564
+
565
+ # Open Wellfound company page
566
+ new_tab("https://wellfound.com/company/openai")
567
+ wait_for_load()
568
+ wait(5)
569
+
570
+ # Verify not blocked
571
+ title = js("document.title")
572
+ assert "Just a moment" not in (title or ""), f"Still on challenge page: {title}"
573
+
574
+ # Extract company overview
575
+ data = js("""
576
+ (function() {
577
+ var name = document.querySelector('h1');
578
+ var bodyText = document.body.innerText;
579
+ var sizeMatch = bodyText.match(/(\\d+[-\\u2013]\\d+)\\s+(employees|people)/i);
580
+ var fundingMatch = bodyText.match(/\\$[\\d,.]+[KMBkm](?:\\s+(?:raised|total))?/i);
581
+ var stageMatch = bodyText.match(/\\b(Seed|Series [A-Z]\\+?|Pre-seed|Late Stage|Public)\\b/);
582
+ var tags = Array.from(document.querySelectorAll('a[href*="/jobs?role="]')).map(a => a.innerText.trim());
583
+ var locs = Array.from(document.querySelectorAll('a[href*="/location/"]')).map(a => a.innerText.trim());
584
+ return JSON.stringify({
585
+ name: name ? name.innerText.trim() : null,
586
+ teamSize: sizeMatch ? sizeMatch[0] : null,
587
+ funding: fundingMatch ? fundingMatch[0] : null,
588
+ stage: stageMatch ? stageMatch[0] : null,
589
+ roles: tags.slice(0, 8),
590
+ locations: locs.slice(0, 5),
591
+ });
592
+ })()
593
+ """)
594
+
595
+ print(json.dumps(json.loads(data), indent=2))
596
+ ```