@pencil-agent/nano-pencil 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +267 -267
  2. package/dist/build-meta.json +3 -3
  3. package/dist/core/export-html/AGENT.md +11 -11
  4. package/dist/core/export-html/template.css +971 -971
  5. package/dist/core/export-html/template.html +54 -54
  6. package/dist/core/model/custom-providers.js +1 -1
  7. package/dist/core/model-registry.js +5 -5
  8. package/dist/extensions/builtin/AGENT.md +115 -115
  9. package/dist/extensions/builtin/browser/AGENT.md +17 -17
  10. package/dist/extensions/builtin/browser/agent-workspace/agent_helpers.py +12 -12
  11. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/amazon/product-search.md +198 -198
  12. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/archive-org/scraping.md +341 -341
  13. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv/scraping.md +311 -311
  14. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv-bulk/scraping.md +333 -333
  15. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/atlas/overview.md +70 -70
  16. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/booking-com/scraping.md +578 -578
  17. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/capterra/scraping.md +440 -440
  18. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/centilebrain/generate-estimates.md +110 -110
  19. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coingecko/scraping.md +325 -325
  20. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coinmarketcap/scraping.md +463 -463
  21. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coursera/scraping.md +360 -360
  22. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/craigslist/scraping.md +390 -390
  23. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/crossref/scraping.md +568 -568
  24. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/dev-to/scraping.md +323 -323
  25. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/duckduckgo/scraping.md +349 -349
  26. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/ebay/scraping.md +435 -435
  27. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/etsy/scraping.md +506 -506
  28. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/eventbrite/scraping.md +363 -363
  29. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/expedia/automation.md +168 -168
  30. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/groups.md +236 -236
  31. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/pages.md +295 -295
  32. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/framer/editor.md +108 -108
  33. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/fred/scraping.md +493 -493
  34. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/g2/scraping.md +580 -580
  35. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/genius/scraping.md +511 -511
  36. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/repo-actions.md +65 -65
  37. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/scraping.md +184 -184
  38. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/glassdoor/scraping.md +543 -543
  39. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gmail/compose.md +122 -122
  40. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/goodreads/scraping.md +461 -461
  41. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gutenberg/scraping.md +383 -383
  42. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/hackernews/scraping.md +243 -243
  43. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/howlongtobeat/scraping.md +473 -473
  44. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/imdb/scraping.md +271 -271
  45. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/itch-io/scraping.md +436 -436
  46. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/job-boards/indeed-glassdoor.md +1021 -1021
  47. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/letterboxd/scraping.md +349 -349
  48. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/linkedin/invitation-manager.md +109 -109
  49. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/loom/folder-enumeration.md +170 -170
  50. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/macrotrends/scraping.md +537 -537
  51. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/article-hydration.md +120 -120
  52. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/scraping.md +414 -414
  53. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/metacritic/scraping.md +477 -477
  54. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/musicbrainz/scraping.md +478 -478
  55. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/nasa/scraping.md +339 -339
  56. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/news-aggregation/multi-source.md +205 -205
  57. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/open-library/scraping.md +472 -472
  58. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openalex/scraping.md +470 -470
  59. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openstreetmap/scraping.md +490 -490
  60. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/package-registries/npm-pypi.md +478 -478
  61. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/polymarket/scraping.md +234 -234
  62. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/producthunt/scraping.md +307 -307
  63. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/pubmed/scraping.md +421 -421
  64. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/quora/scraping.md +364 -364
  65. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rawg/scraping.md +352 -352
  66. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/reddit/scraping.md +124 -124
  67. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rest-countries/scraping.md +233 -233
  68. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/sec-edgar/scraping.md +361 -361
  69. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/README.md +36 -36
  70. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/embedded-apps.md +72 -72
  71. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/knowledge-base.md +109 -109
  72. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/polaris-inputs.md +137 -137
  73. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/soundcloud/scraping.md +362 -362
  74. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/spotify/scraping.md +339 -339
  75. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/stackoverflow/scraping.md +435 -435
  76. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/steam/scraping.md +575 -575
  77. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/substack/scraping.md +338 -338
  78. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/thetechgeeks/pricing.md +52 -52
  79. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tiktok/upload.md +107 -107
  80. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tradingview/scraping.md +309 -309
  81. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trello/boards-and-lists.md +88 -88
  82. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trustpilot/scraping.md +375 -375
  83. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/walmart/scraping.md +444 -444
  84. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wayback-machine/scraping.md +306 -306
  85. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/weather/scraping.md +398 -398
  86. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wellfound/scraping.md +596 -596
  87. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/world-bank/scraping.md +356 -356
  88. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/xiaohongshu/scraping.md +84 -84
  89. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/youtube/scraping.md +418 -418
  90. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/zillow/scraping.md +433 -433
  91. package/dist/extensions/builtin/browser/browser.md +73 -73
  92. package/dist/extensions/builtin/browser/install.md +142 -142
  93. package/dist/extensions/builtin/browser/interaction-skills/connection.md +48 -48
  94. package/dist/extensions/builtin/browser/interaction-skills/cookies.md +3 -3
  95. package/dist/extensions/builtin/browser/interaction-skills/cross-origin-iframes.md +3 -3
  96. package/dist/extensions/builtin/browser/interaction-skills/dialogs.md +64 -64
  97. package/dist/extensions/builtin/browser/interaction-skills/downloads.md +3 -3
  98. package/dist/extensions/builtin/browser/interaction-skills/drag-and-drop.md +3 -3
  99. package/dist/extensions/builtin/browser/interaction-skills/dropdowns.md +3 -3
  100. package/dist/extensions/builtin/browser/interaction-skills/iframes.md +3 -3
  101. package/dist/extensions/builtin/browser/interaction-skills/network-requests.md +3 -3
  102. package/dist/extensions/builtin/browser/interaction-skills/print-as-pdf.md +3 -3
  103. package/dist/extensions/builtin/browser/interaction-skills/profile-sync.md +90 -90
  104. package/dist/extensions/builtin/browser/interaction-skills/screenshots.md +17 -17
  105. package/dist/extensions/builtin/browser/interaction-skills/scrolling.md +3 -3
  106. package/dist/extensions/builtin/browser/interaction-skills/shadow-dom.md +3 -3
  107. package/dist/extensions/builtin/browser/interaction-skills/tabs.md +69 -69
  108. package/dist/extensions/builtin/browser/interaction-skills/uploads.md +1 -1
  109. package/dist/extensions/builtin/browser/interaction-skills/viewport.md +3 -3
  110. package/dist/extensions/builtin/browser/src/browser_harness/AGENT.md +15 -15
  111. package/dist/extensions/builtin/browser/src/browser_harness/__init__.py +8 -8
  112. package/dist/extensions/builtin/browser/src/browser_harness/_ipc.py +90 -90
  113. package/dist/extensions/builtin/browser/src/browser_harness/admin.py +722 -722
  114. package/dist/extensions/builtin/browser/src/browser_harness/daemon.py +328 -328
  115. package/dist/extensions/builtin/browser/src/browser_harness/helpers.py +396 -396
  116. package/dist/extensions/builtin/browser/src/browser_harness/run.py +103 -103
  117. package/dist/extensions/builtin/discipline/skills/brainstorming/SKILL.md +33 -33
  118. package/dist/extensions/builtin/discipline/skills/executing-plans/SKILL.md +25 -25
  119. package/dist/extensions/builtin/discipline/skills/finishing-development-branch/SKILL.md +25 -25
  120. package/dist/extensions/builtin/discipline/skills/receiving-code-review/SKILL.md +22 -22
  121. package/dist/extensions/builtin/discipline/skills/requesting-code-review/SKILL.md +31 -31
  122. package/dist/extensions/builtin/discipline/skills/systematic-debugging/SKILL.md +28 -28
  123. package/dist/extensions/builtin/discipline/skills/test-driven-development/SKILL.md +32 -32
  124. package/dist/extensions/builtin/discipline/skills/using-git-worktrees/SKILL.md +25 -25
  125. package/dist/extensions/builtin/discipline/skills/verification-before-completion/SKILL.md +27 -27
  126. package/dist/extensions/builtin/discipline/skills/writing-plans/SKILL.md +26 -26
  127. package/dist/extensions/builtin/goal/README.md +67 -67
  128. package/dist/extensions/builtin/grub/README.md +112 -112
  129. package/dist/extensions/builtin/link-world/agent-workspace/README.md +16 -16
  130. package/dist/extensions/builtin/link-world/internet-search/internet-search.md +65 -65
  131. package/dist/extensions/builtin/link-world/link-world-agent.md +82 -82
  132. package/dist/extensions/builtin/link-world/linkworld.md +313 -313
  133. package/dist/extensions/builtin/link-world/network-routing/network-routing.md +67 -67
  134. package/dist/extensions/builtin/loop/README.md +92 -92
  135. package/dist/extensions/builtin/mcp/figma-design.md +68 -68
  136. package/dist/extensions/builtin/mcp/mcp-management.md +85 -85
  137. package/dist/extensions/builtin/recap/AGENT.md +15 -15
  138. package/dist/extensions/builtin/sal/README.md +72 -72
  139. package/dist/extensions/builtin/security-audit/README.md +289 -289
  140. package/dist/extensions/builtin/team/AGENT.md +112 -112
  141. package/dist/extensions/builtin/team/TESTING.md +299 -299
  142. package/dist/extensions/builtin/token-save/README.md +56 -56
  143. package/dist/extensions/optional/AGENT.md +10 -10
  144. package/dist/modes/interactive/controllers/input-submit-controller.js +2 -2
  145. package/dist/modes/interactive/controllers/stream-render-controller.js +2 -2
  146. package/dist/modes/interactive/interactive-mode.js +19 -19
  147. package/dist/modes/interactive/theme/dark.json +85 -85
  148. package/dist/modes/interactive/theme/light.json +84 -84
  149. package/dist/modes/interactive/theme/theme-schema.json +335 -335
  150. package/dist/modes/interactive/theme/warm.json +81 -81
  151. package/dist/node_modules/@pencil-agent/ai/dist/cli.js +0 -0
  152. package/dist/node_modules/@pencil-agent/ai/dist/models.generated.js +1 -1
  153. package/docs/ACP/345/215/217/350/256/256/351/233/206/346/210/220/345/274/200/345/217/221/346/226/207/346/241/243.md +851 -0
  154. package/docs/SDK-TESTING.md +364 -0
  155. package/docs/codex-goal-command-impl.md +1055 -1055
  156. package/docs/codex-goal-vs-grub.md +500 -500
  157. package/docs/custom-provider.md +27 -27
  158. package/docs/extensions.md +27 -27
  159. package/docs/keybindings.md +27 -27
  160. package/docs/loop 重构完成总结.md +250 -250
  161. package/docs/loop 重构完成报告.md +122 -122
  162. package/docs/loop 重构方案.md +1222 -1222
  163. package/docs/loop 重构方案实现报告.md +158 -158
  164. package/docs/loop 重构方案对比分析.md +128 -128
  165. package/docs/loop 重构计划.md +320 -320
  166. package/docs/loop-usage-examples.md +214 -214
  167. package/docs/mem-core/346/212/200/346/234/257/346/226/207/346/241/243.md +593 -0
  168. package/docs/models.md +27 -27
  169. package/docs/packages.md +27 -27
  170. package/docs/pi-design-philosophy.md +457 -457
  171. package/docs/planmode.md +1987 -1987
  172. package/docs/prompt-templates.md +27 -27
  173. package/docs/providers.md +27 -27
  174. package/docs/sdk.md +27 -27
  175. package/docs/skills.md +27 -27
  176. package/docs/startup-performance-optimization.md +301 -0
  177. package/docs/themes.md +27 -27
  178. package/docs/tui.md +27 -27
  179. package/docs/认知地图.md +47 -0
  180. package/package.json +190 -190
  181. package/docs/cc-agent-design.md +0 -1297
  182. package/docs/cc-tui-design.md +0 -1333
  183. package/docs/nanoPencil-学习计划.md +0 -170
  184. package/docs/scan-report.md +0 -3820
  185. package/docs//345/257/271/346/240/207Claude-Code.md +0 -1775
  186. package/docs//351/230/277/351/207/214/345/267/264/345/267/264/350/264/242/346/212/245/345/210/206/346/236/220/344/271/246.md +0 -261
@@ -1,596 +1,596 @@
1
- # Wellfound (AngelList) — Startup Jobs & Company Profiles
2
-
3
- Field-tested against wellfound.com on 2026-04-18.
4
- All confirmed via live HTTP probes and response header analysis.
5
-
6
- ---
7
-
8
- ## Anti-bot verdict: browser required, no http_get workaround exists
9
-
10
- **`http_get` returns HTTP 403 on every Wellfound URL without exception** (except `robots.txt`).
11
-
12
- Tested endpoints (all 403):
13
- - `/company/stripe`
14
- - `/jobs`
15
- - `/jobs?role=engineer&location=remote`
16
- - `/company/stripe/jobs`
17
- - `/sitemap.xml`, `/sitemap_index.xml`
18
- - `/jobs.rss`
19
- - `POST /graphql` (HTTP 403, Cloudflare managed challenge)
20
-
21
- Old AngelList public API (`api.angel.co/1/...`) returns `404 Not Found` — permanently shut down.
22
-
23
- **Dual anti-bot stack confirmed from response headers:**
24
-
25
- | Layer | System | Evidence |
26
- |-------|--------|----------|
27
- | Page GETs | DataDome | `X-DataDome: protected`, `X-DD-B: 2`, `Set-Cookie: datadome=...` |
28
- | API POSTs | Cloudflare Bot Management | `Cf-Mitigated: challenge` |
29
-
30
- The 403 response body contains a DataDome captcha challenge script (`geo.captcha-delivery.com`) AND an embedded Cloudflare challenge (`window.__CF$cv$params`). Both fire simultaneously. Neither cookie can be replayed — both are TLS-fingerprint-bound.
31
-
32
- **Use `new_tab()` + `wait()` exclusively. Never use `http_get` for Wellfound.**
33
-
34
- ---
35
-
36
- ## Tech stack (confirmed from response headers)
37
-
38
- Wellfound is a **Ruby on Rails + React + Apollo GraphQL** hybrid app — NOT a pure Next.js app.
39
-
40
- Confirmed headers from `robots.txt` (the only accessible endpoint):
41
- ```
42
- x-runtime: 0.006700 → Rails rack middleware timer
43
- x-request-id: 4645fd66... → Rails request ID
44
- x-xss-protection: 1; mode=block → Rails security defaults
45
- Set-Cookie: _wellfound=... → Rails session cookie
46
- Server: cloudflare → Cloudflare CDN
47
- ```
48
-
49
- Implications:
50
- - **`__NEXT_DATA__` is NOT present** — not a Next.js app
51
- - **`window.__APOLLO_STATE__` or `window.gon` may be present** — check these instead
52
- - CSRF token is in a `<meta name="csrf-token">` tag (Rails default)
53
- - Session cookie is `_wellfound=...` for anonymous sessions; login sessions add `_wellfound_session=...`
54
-
55
- ---
56
-
57
- ## Do this first: open in new tab, wait for DataDome to resolve
58
-
59
- ```python
60
- new_tab("https://wellfound.com/company/stripe")
61
- wait_for_load()
62
- wait(5) # DataDome JS fingerprinting runs ~2-4s after readyState=complete
63
- ```
64
-
65
- Verify you are past the DataDome challenge before extracting:
66
-
67
- ```python
68
- title = js("document.title")
69
- url = page_info()["url"]
70
-
71
- if "wellfound.com" not in url or not title or "Just a moment" in title:
72
- # DataDome or CF challenge did not resolve — wait longer
73
- wait(8)
74
- title = js("document.title")
75
- if "Just a moment" in title or not title:
76
- capture_screenshot("/tmp/wellfound_block.png")
77
- raise RuntimeError("DataDome/CF challenge did not resolve — see screenshot")
78
- ```
79
-
80
- DataDome resolves **silently** in a real Chrome session via CDP — no user interaction required.
81
- The challenge is a JS fingerprint check that passes automatically when running in a real browser.
82
-
83
- ---
84
-
85
- ## URL patterns
86
-
87
- | Goal | URL |
88
- |------|-----|
89
- | Company profile | `https://wellfound.com/company/{slug}` |
90
- | Company jobs | `https://wellfound.com/company/{slug}/jobs` |
91
- | Company culture | `https://wellfound.com/company/{slug}/culture` |
92
- | Job board (all) | `https://wellfound.com/jobs` |
93
- | Job board filtered | `https://wellfound.com/jobs` — then use UI filters (query params are disallowed by robots.txt) |
94
- | Investor profile | `https://wellfound.com/investor/{slug}` |
95
- | User profile | `https://wellfound.com/u/{username}` (disallowed by robots.txt, login wall) |
96
-
97
- **Note on query params:** `robots.txt` disallows `?role=*`, `?jobId=*`, `?jobSlug=*`, `?location=*`.
98
- Wellfound enforces these with login walls or redirects for most filtered job searches.
99
-
100
- ---
101
-
102
- ## Workflow 1: Company profile — name, description, team size, funding, tags
103
-
104
- Navigate to the company page and extract structured data. Most fields are visible without login.
105
-
106
- ```python
107
- import json
108
-
109
- new_tab("https://wellfound.com/company/stripe")
110
- wait_for_load()
111
- wait(5)
112
-
113
- # Check for Apollo state (Rails + React app, not Next.js)
114
- # Wellfound embeds data in window.gon or inline script tags
115
- apollo_raw = js("""
116
- (function() {
117
- // Try window.__APOLLO_STATE__ (Apollo Client cache)
118
- if (window.__APOLLO_STATE__) return JSON.stringify(window.__APOLLO_STATE__);
119
- // Try window.gon (Rails Gon gem)
120
- if (window.gon) return JSON.stringify(window.gon);
121
- // Try inline <script> tags containing startup data
122
- var scripts = Array.from(document.querySelectorAll('script:not([src])'));
123
- for (var s of scripts) {
124
- var t = s.textContent || '';
125
- if (t.includes('"name"') && t.includes('"description"') && t.includes('teamSize')) {
126
- return t.substring(0, 5000);
127
- }
128
- }
129
- return null;
130
- })()
131
- """)
132
-
133
- if apollo_raw:
134
- try:
135
- data = json.loads(apollo_raw)
136
- # Apollo State: look for Startup:{id} keys
137
- for key, val in data.items():
138
- if key.startswith("Startup:") and isinstance(val, dict):
139
- print("Company:", val.get("name"))
140
- print("Description:", val.get("description") or val.get("highConcept"))
141
- print("Team size:", val.get("teamSize"))
142
- print("Total raised:", val.get("totalRaised"))
143
- print("Hiring:", val.get("hiring"))
144
- print(json.dumps(data, indent=2)[:3000])
145
- except json.JSONDecodeError:
146
- # Raw script tag — parse key fields with regex
147
- import re
148
- name = re.search(r'"name"\s*:\s*"([^"]+)"', apollo_raw)
149
- desc = re.search(r'"description"\s*:\s*"([^"]+)"', apollo_raw)
150
- print("Name:", name.group(1) if name else "not found")
151
- print("Desc:", desc.group(1) if desc else "not found")
152
- ```
153
-
154
- If the structured data path fails, fall back to DOM extraction:
155
-
156
- ```python
157
- # DOM extraction — company profile page
158
- profile = js("""
159
- (function() {
160
- // Company name — first h1 on the page
161
- var nameEl = document.querySelector('h1');
162
-
163
- // Description — first substantial paragraph or div with class containing 'description'
164
- var descEl = (
165
- document.querySelector('[class*="description"]') ||
166
- document.querySelector('[class*="about"]') ||
167
- document.querySelector('p[class*="startupDescription"]')
168
- );
169
-
170
- // Tags — market/role tags are links with /jobs?role= or /location/ in href
171
- // Wellfound uses Tailwind (no stable class names) — use href pattern
172
- var roleLinks = Array.from(document.querySelectorAll('a[href*="/jobs?role="]')).map(a => a.innerText.trim());
173
- var locationLinks = Array.from(document.querySelectorAll('a[href*="/location/"]')).map(a => a.innerText.trim());
174
-
175
- // Team size / funding — look in page text for patterns
176
- var bodyText = document.body.innerText;
177
-
178
- // Company size: "11-50 employees" or "51-200 people" pattern
179
- var sizeMatch = bodyText.match(/(\d+[-–]\d+)\s+(employees|people)/i);
180
- var teamSize = sizeMatch ? sizeMatch[0] : null;
181
-
182
- // Funding: "$X.XM" or "Raised $X" pattern
183
- var fundingMatch = bodyText.match(/\$[\d,.]+[KMBkm]\s*(raised|in funding|Series [A-Z])?/i);
184
- var funding = fundingMatch ? fundingMatch[0] : null;
185
-
186
- // Stage: "Series A", "Seed", "Series B", etc.
187
- var stageMatch = bodyText.match(/\b(Seed|Series [A-Z]\+?|Pre-seed|Angel|Late Stage|Public)\b/);
188
- var stage = stageMatch ? stageMatch[0] : null;
189
-
190
- return JSON.stringify({
191
- name: nameEl ? nameEl.innerText.trim() : null,
192
- desc: descEl ? descEl.innerText.trim().substring(0, 500) : null,
193
- teamSize: teamSize,
194
- funding: funding,
195
- stage: stage,
196
- roles: roleLinks.slice(0, 10),
197
- locations: locationLinks.slice(0, 5),
198
- });
199
- })()
200
- """)
201
-
202
- data = json.loads(profile)
203
- print(json.dumps(data, indent=2))
204
- ```
205
-
206
- ---
207
-
208
- ## Workflow 2: Company jobs listing
209
-
210
- ```python
211
- import json
212
-
213
- company_slug = "stripe"
214
- new_tab(f"https://wellfound.com/company/{company_slug}/jobs")
215
- wait_for_load()
216
- wait(5)
217
-
218
- jobs = js("""
219
- (function() {
220
- // Job listing cards — Wellfound uses role="listitem" or li elements in job list
221
- var cards = document.querySelectorAll('[data-test^="StartupJobListing"], li[class*="job"], div[class*="JobListing"]');
222
- if (!cards.length) {
223
- // Broad fallback: all anchor tags with /jobs/ in href
224
- var links = Array.from(document.querySelectorAll('a[href*="/jobs/"]'));
225
- return JSON.stringify(links.map(a => ({
226
- title: a.innerText.trim().split('\\n')[0],
227
- href: a.href,
228
- })).filter(j => j.title && j.title.length > 2).slice(0, 30));
229
- }
230
- return JSON.stringify(Array.from(cards).map(card => {
231
- var titleEl = card.querySelector('h2, h3, [class*="title"], [class*="jobTitle"]');
232
- var locEl = card.querySelector('[class*="location"], [class*="Location"]');
233
- var compEl = card.querySelector('[class*="salary"], [class*="comp"], [class*="equity"]');
234
- var linkEl = card.querySelector('a[href*="/jobs/"]');
235
- return {
236
- title: titleEl ? titleEl.innerText.trim() : '',
237
- location: locEl ? locEl.innerText.trim() : '',
238
- comp: compEl ? compEl.innerText.trim() : '',
239
- href: linkEl ? linkEl.href : '',
240
- };
241
- }).filter(j => j.title));
242
- })()
243
- """)
244
-
245
- results = json.loads(jobs)
246
- print(f"Found {len(results)} jobs")
247
- for j in results:
248
- print(f" {j['title']} | {j.get('location','?')} | {j.get('comp','?')}")
249
- ```
250
-
251
- ---
252
-
253
- ## Workflow 3: Job board — browse all jobs
254
-
255
- The main `/jobs` page shows a curated job feed. Filters are not accessible via URL params (DataDome blocks `?role=...`). Use the UI dropdown filters after loading the page.
256
-
257
- ```python
258
- import json
259
-
260
- new_tab("https://wellfound.com/jobs")
261
- wait_for_load()
262
- wait(5)
263
-
264
- # Extract visible job cards
265
- jobs = js("""
266
- (function() {
267
- // Job cards on the main /jobs board
268
- var cards = document.querySelectorAll(
269
- '[data-test*="job"], [class*="JobCard"], [class*="jobListing"], ' +
270
- 'li[class*="job"], article[class*="job"]'
271
- );
272
- if (!cards.length) {
273
- // Fallback: links to job detail pages
274
- var links = Array.from(document.querySelectorAll('a[href*="/company/"][href*="/jobs/"]'));
275
- return JSON.stringify(links.map(a => ({
276
- href: a.href,
277
- text: a.innerText.trim().substring(0, 100),
278
- })).slice(0, 30));
279
- }
280
- return JSON.stringify(Array.from(cards).map(card => {
281
- var titleEl = card.querySelector('h2, h3, [class*="title"]');
282
- var companyEl = card.querySelector('[class*="company"], [class*="startup"]');
283
- var locEl = card.querySelector('[class*="location"]');
284
- var linkEl = card.querySelector('a[href*="/jobs/"]');
285
- return {
286
- title: titleEl ? titleEl.innerText.trim() : '',
287
- company: companyEl ? companyEl.innerText.trim() : '',
288
- location: locEl ? locEl.innerText.trim() : '',
289
- href: linkEl ? linkEl.href : '',
290
- };
291
- }).filter(j => j.title));
292
- })()
293
- """)
294
-
295
- results = json.loads(jobs)
296
- print(f"Found {len(results)} jobs")
297
- ```
298
-
299
- ---
300
-
301
- ## Workflow 4: GraphQL API (authenticated sessions only)
302
-
303
- Wellfound's GraphQL endpoint (`/graphql`) requires:
304
- 1. A valid `_wellfound` session cookie from a real browser load
305
- 2. A CSRF token from the page's `<meta name="csrf-token">` tag
306
- 3. Cloudflare Bot Management to have passed (only happens in a real Chrome session)
307
-
308
- **This approach only works from inside a browser session (after navigating to any Wellfound page).**
309
-
310
- ```python
311
- import json
312
-
313
- # Step 1: Load any Wellfound page so the session cookie + DataDome cookie are set
314
- new_tab("https://wellfound.com/")
315
- wait_for_load()
316
- wait(5)
317
-
318
- # Step 2: Extract CSRF token from meta tag
319
- csrf = js("document.querySelector('meta[name=\"csrf-token\"]') ? document.querySelector('meta[name=\"csrf-token\"]').getAttribute('content') : null")
320
- if not csrf:
321
- raise RuntimeError("CSRF token not found — page may not have loaded correctly")
322
-
323
- print(f"CSRF token: {csrf[:20]}...")
324
-
325
- # Step 3: Execute GraphQL query via fetch() from within the browser
326
- # This uses the browser's existing cookies automatically
327
- result = js(f"""
328
- (async function() {{
329
- try {{
330
- var resp = await fetch('/graphql', {{
331
- method: 'POST',
332
- credentials: 'include',
333
- headers: {{
334
- 'Content-Type': 'application/json',
335
- 'Accept': 'application/json',
336
- 'x-csrf-token': '{csrf}',
337
- 'x-requested-with': 'XMLHttpRequest',
338
- }},
339
- body: JSON.stringify({{
340
- query: `query StartupShow($slug: String!) {{
341
- startup(slug: $slug) {{
342
- id
343
- name
344
- description: highConcept
345
- productDesc
346
- teamSize
347
- locations {{ displayName }}
348
- markets {{ displayName }}
349
- totalRaised
350
- fundingStage
351
- badges
352
- hiring
353
- jobListingsCount
354
- }}
355
- }}`,
356
- variables: {{ slug: "stripe" }}
357
- }})
358
- }});
359
- var data = await resp.json();
360
- return JSON.stringify(data);
361
- }} catch(e) {{
362
- return JSON.stringify({{error: e.message}});
363
- }}
364
- }})()
365
- """)
366
-
367
- # js() with async returns a Promise — use js_async() if available, or eval trick:
368
- # Note: the above may return None if js() doesn't await Promises.
369
- # Use this pattern instead if js() doesn't handle async:
370
- result_sync = js("""
371
- var done = false, out = null;
372
- fetch('/graphql', {
373
- method: 'POST',
374
- credentials: 'include',
375
- headers: {
376
- 'Content-Type': 'application/json',
377
- 'Accept': 'application/json',
378
- 'x-csrf-token': document.querySelector('meta[name="csrf-token"]').content,
379
- 'x-requested-with': 'XMLHttpRequest',
380
- },
381
- body: JSON.stringify({
382
- query: '{ __typename }',
383
- })
384
- }).then(r => r.json()).then(d => { window._wf_gql_result = JSON.stringify(d); });
385
- 'pending'
386
- """)
387
- # Wait for async result
388
- import time; time.sleep(3)
389
- gql_result = js("window._wf_gql_result || null")
390
- if gql_result:
391
- data = json.loads(gql_result)
392
- print("GraphQL response:", json.dumps(data, indent=2)[:1000])
393
- ```
394
-
395
- ### Known GraphQL operations
396
-
397
- | Operation | Purpose |
398
- |-----------|---------|
399
- | `StartupShow` | Full company profile (name, desc, funding, team size, markets) |
400
- | `JobListingsIndex` | Paginated job board |
401
- | `JobSearch` | Filtered job search by role/location |
402
- | `UserProfile` | User/candidate profile |
403
- | `InvestorShow` | VC/investor profile |
404
-
405
- ---
406
-
407
- ## Handling the login wall
408
-
409
- Wellfound shows a sign-in modal on:
410
- - Job detail pages (immediately or after 2-3 seconds)
411
- - Candidate profile pages (immediately)
412
- - Some company pages after scrolling
413
-
414
- Company overview pages typically show content without login. Job listings require login to see full details and apply.
415
-
416
- ```python
417
- def dismiss_wellfound_login_modal():
418
- """Close the Wellfound sign-in modal. Safe to call if no modal is present."""
419
- closed = js("""
420
- (function() {
421
- var selectors = [
422
- 'button[aria-label="Close"]',
423
- 'button[class*="close"]',
424
- 'button[class*="Close"]',
425
- '[data-test="close-modal"]',
426
- '[aria-label="Dismiss"]',
427
- 'button[class*="dismiss"]',
428
- // Wellfound-specific: modal overlay dismiss
429
- 'div[class*="Modal"] button[type="button"]',
430
- ];
431
- for (var s of selectors) {
432
- var btn = document.querySelector(s);
433
- if (btn && btn.offsetParent !== null) {
434
- btn.click();
435
- return s;
436
- }
437
- }
438
- // Try pressing Escape
439
- document.dispatchEvent(new KeyboardEvent('keydown', {key: 'Escape', keyCode: 27, bubbles: true}));
440
- return 'escape';
441
- })()
442
- """)
443
- if closed:
444
- wait(1)
445
- return closed
446
- ```
447
-
448
- ---
449
-
450
- ## Detecting DataDome / challenge page
451
-
452
- After `new_tab()` + `wait(5)`, verify you are on a real Wellfound page:
453
-
454
- ```python
455
- def wellfound_is_blocked() -> bool:
456
- """True if DataDome or Cloudflare challenge is still showing."""
457
- title = js("document.title") or ""
458
- url = page_info()["url"]
459
- # DataDome challenge page has no useful title; CF shows "Just a moment..."
460
- blocked = (
461
- "Just a moment" in title or
462
- "wellfound.com" not in url or
463
- "captcha-delivery.com" in js("document.body.innerHTML or ''") or
464
- not title
465
- )
466
- return blocked
467
-
468
- # Usage
469
- new_tab("https://wellfound.com/company/stripe")
470
- wait_for_load()
471
- wait(5)
472
-
473
- if wellfound_is_blocked():
474
- wait(8) # DataDome sometimes needs up to 10s total
475
- if wellfound_is_blocked():
476
- capture_screenshot("/tmp/wellfound_blocked.png")
477
- raise RuntimeError("DataDome/CF challenge did not resolve — see /tmp/wellfound_blocked.png")
478
- ```
479
-
480
- ---
481
-
482
- ## Key selectors reference
483
-
484
- Wellfound uses **Tailwind CSS** — no stable semantic class names. These patterns are robust:
485
-
486
- | Target | Selector strategy |
487
- |--------|------------------|
488
- | Company name | `h1` (first on page) |
489
- | Company description | `[class*="description"]`, `[class*="about"]` |
490
- | Team size | Text search: `/\d+[-–]\d+\s+(employees\|people)/i` |
491
- | Funding amount | Text search: `/\$[\d,.]+[KMBkm]/i` |
492
- | Funding stage | Text search: `/\b(Seed\|Series [A-Z]\+?\|Pre-seed\|Late Stage)\b/` |
493
- | Role/market tags | `a[href*="/jobs?role="]` |
494
- | Location tags | `a[href*="/location/"]` |
495
- | Job cards | `a[href*="/company/"][href*="/jobs/"]` (broad fallback) |
496
- | Job title | `h2`, `h3`, `[class*="title"]` within card |
497
- | CSRF token | `meta[name="csrf-token"]` |
498
- | Login modal | `button[aria-label="Close"]`, Escape key |
499
-
500
- ---
501
-
502
- ## Common pitfalls
503
-
504
- 1. **`http_get` is permanently blocked.** DataDome intercepts all non-browser HTTP requests with
505
- a 403 + captcha challenge. No User-Agent, header combination, or cookie replay works.
506
- `api.angel.co` is HTTP 404 (shut down). Use `new_tab()` exclusively.
507
-
508
- 2. **NOT a Next.js app.** Wellfound is Ruby on Rails + React. There is no `__NEXT_DATA__` JSON
509
- blob. Look for `window.__APOLLO_STATE__`, `window.gon`, or inline `<script>` tags instead.
510
-
511
- 3. **`wait(5)` minimum after `wait_for_load()`.** DataDome runs JS fingerprinting probes for
512
- 2-4 seconds after `readyState = complete`. Extracting before this resolves returns the challenge
513
- page HTML, not real content.
514
-
515
- 4. **Tailwind CSS — no stable class names.** Wellfound uses Tailwind utility classes. Never
516
- hardcode a specific class name. Use `href` attribute patterns, `data-test` attributes if present,
517
- or semantic element selectors (`h1`, `h2`, `li`, `article`).
518
-
519
- 5. **GraphQL requires both CSRF token AND browser session cookies.** The CSRF token is a
520
- per-session value from `<meta name="csrf-token">`. Cloudflare Bot Management blocks
521
- `POST /graphql` from non-browser sessions. Always fire GraphQL via `fetch()` inside the
522
- browser session (not from Python's `http_get`).
523
-
524
- 6. **`?role=` and `?location=` params are robots.txt-disallowed.** Wellfound may redirect or
525
- show a login wall for filtered job search URLs. Load `/jobs` unfiltered and use in-page
526
- UI filters (dropdowns) to narrow results.
527
-
528
- 7. **Login wall on job details and user profiles.** Company overview pages load without login.
529
- Individual job detail pages, and all `/u/{username}` profiles, hit a login modal immediately.
530
- Call `dismiss_wellfound_login_modal()` right after `wait(5)` on these pages.
531
-
532
- 8. **Rate limiting.** After ~5-10 rapid page navigations DataDome may harden. Use `wait(3)` between
533
- `goto_url()` calls. If you get a captcha that does not auto-resolve, wait 30-60 seconds.
534
-
535
- 9. **`new_tab()` over `goto_url()` for the first Wellfound page.** `goto_url()` in an existing tab
536
- may inherit a stale DataDome fingerprint. `new_tab()` gives a clean origin context that
537
- DataDome processes cleanly.
538
-
539
- ---
540
-
541
- ## Anti-bot response identification
542
-
543
- What you see in the 403 body when NOT in a browser:
544
-
545
- ```html
546
- <!-- DataDome challenge (page GETs) -->
547
- <script>var dd={'rt':'c','cid':'...','t':'bv','host':'geo.captcha-delivery.com',...}</script>
548
- <script src="https://ct.captcha-delivery.com/c.js"></script>
549
- <!-- rt='c' = captcha required; rt='i' = invisible solve; rt='b' = blocked -->
550
-
551
- <!-- Cloudflare challenge (API POSTs) -->
552
- <title>Just a moment...</title>
553
- <script>window.__CF$cv$params={r:'...',t:'...'}</script>
554
- ```
555
-
556
- In a real Chrome browser, both challenges resolve automatically without user interaction.
557
-
558
- ---
559
-
560
- ## Minimal working example
561
-
562
- ```python
563
- import json
564
-
565
- # Open Wellfound company page
566
- new_tab("https://wellfound.com/company/openai")
567
- wait_for_load()
568
- wait(5)
569
-
570
- # Verify not blocked
571
- title = js("document.title")
572
- assert "Just a moment" not in (title or ""), f"Still on challenge page: {title}"
573
-
574
- # Extract company overview
575
- data = js("""
576
- (function() {
577
- var name = document.querySelector('h1');
578
- var bodyText = document.body.innerText;
579
- var sizeMatch = bodyText.match(/(\\d+[-\\u2013]\\d+)\\s+(employees|people)/i);
580
- var fundingMatch = bodyText.match(/\\$[\\d,.]+[KMBkm](?:\\s+(?:raised|total))?/i);
581
- var stageMatch = bodyText.match(/\\b(Seed|Series [A-Z]\\+?|Pre-seed|Late Stage|Public)\\b/);
582
- var tags = Array.from(document.querySelectorAll('a[href*="/jobs?role="]')).map(a => a.innerText.trim());
583
- var locs = Array.from(document.querySelectorAll('a[href*="/location/"]')).map(a => a.innerText.trim());
584
- return JSON.stringify({
585
- name: name ? name.innerText.trim() : null,
586
- teamSize: sizeMatch ? sizeMatch[0] : null,
587
- funding: fundingMatch ? fundingMatch[0] : null,
588
- stage: stageMatch ? stageMatch[0] : null,
589
- roles: tags.slice(0, 8),
590
- locations: locs.slice(0, 5),
591
- });
592
- })()
593
- """)
594
-
595
- print(json.dumps(json.loads(data), indent=2))
596
- ```
1
+ # Wellfound (AngelList) — Startup Jobs & Company Profiles
2
+
3
+ Field-tested against wellfound.com on 2026-04-18.
4
+ All confirmed via live HTTP probes and response header analysis.
5
+
6
+ ---
7
+
8
+ ## Anti-bot verdict: browser required, no http_get workaround exists
9
+
10
+ **`http_get` returns HTTP 403 on every Wellfound URL without exception** (except `robots.txt`).
11
+
12
+ Tested endpoints (all 403):
13
+ - `/company/stripe`
14
+ - `/jobs`
15
+ - `/jobs?role=engineer&location=remote`
16
+ - `/company/stripe/jobs`
17
+ - `/sitemap.xml`, `/sitemap_index.xml`
18
+ - `/jobs.rss`
19
+ - `POST /graphql` (HTTP 403, Cloudflare managed challenge)
20
+
21
+ Old AngelList public API (`api.angel.co/1/...`) returns `404 Not Found` — permanently shut down.
22
+
23
+ **Dual anti-bot stack confirmed from response headers:**
24
+
25
+ | Layer | System | Evidence |
26
+ |-------|--------|----------|
27
+ | Page GETs | DataDome | `X-DataDome: protected`, `X-DD-B: 2`, `Set-Cookie: datadome=...` |
28
+ | API POSTs | Cloudflare Bot Management | `Cf-Mitigated: challenge` |
29
+
30
+ The 403 response body contains a DataDome captcha challenge script (`geo.captcha-delivery.com`) AND an embedded Cloudflare challenge (`window.__CF$cv$params`). Both fire simultaneously. Neither cookie can be replayed — both are TLS-fingerprint-bound.
31
+
32
+ **Use `new_tab()` + `wait()` exclusively. Never use `http_get` for Wellfound.**
33
+
34
+ ---
35
+
36
+ ## Tech stack (confirmed from response headers)
37
+
38
+ Wellfound is a **Ruby on Rails + React + Apollo GraphQL** hybrid app — NOT a Next.js app.
39
+
40
+ Confirmed headers from `robots.txt` (the only accessible endpoint):
41
+ ```
42
+ x-runtime: 0.006700 → Rails rack middleware timer
43
+ x-request-id: 4645fd66... → Rails request ID
44
+ x-xss-protection: 1; mode=block → Rails security defaults
45
+ Set-Cookie: _wellfound=... → Rails session cookie
46
+ Server: cloudflare → Cloudflare CDN
47
+ ```
48
+
49
+ Implications:
50
+ - **`__NEXT_DATA__` is NOT present** — not a Next.js app
51
+ - **`window.__APOLLO_STATE__` or `window.gon` may be present** — check these instead
52
+ - CSRF token is in a `<meta name="csrf-token">` tag (Rails default)
53
+ - Session cookie is `_wellfound=...` for anonymous sessions; login sessions add `_wellfound_session=...`
54
+
55
+ ---
56
+
57
+ ## Do this first: open in new tab, wait for DataDome to resolve
58
+
59
+ ```python
60
+ new_tab("https://wellfound.com/company/stripe")
61
+ wait_for_load()
62
+ wait(5) # DataDome JS fingerprinting runs ~2-4s after readyState=complete
63
+ ```
64
+
65
+ Verify you are past the DataDome challenge before extracting:
66
+
67
+ ```python
68
+ title = js("document.title")
69
+ url = page_info()["url"]
70
+
71
+ if "wellfound.com" not in url or not title or "Just a moment" in title:
72
+ # DataDome or CF challenge did not resolve — wait longer
73
+ wait(8)
74
+ title = js("document.title")
75
+ if "Just a moment" in title or not title:
76
+ capture_screenshot("/tmp/wellfound_block.png")
77
+ raise RuntimeError("DataDome/CF challenge did not resolve — see screenshot")
78
+ ```
79
+
80
+ DataDome resolves **silently** in a real Chrome session via CDP — no user interaction required.
81
+ The challenge is a JS fingerprint check that passes automatically when running in a real browser.
82
+
83
+ ---
84
+
85
+ ## URL patterns
86
+
87
+ | Goal | URL |
88
+ |------|-----|
89
+ | Company profile | `https://wellfound.com/company/{slug}` |
90
+ | Company jobs | `https://wellfound.com/company/{slug}/jobs` |
91
+ | Company culture | `https://wellfound.com/company/{slug}/culture` |
92
+ | Job board (all) | `https://wellfound.com/jobs` |
93
+ | Job board filtered | `https://wellfound.com/jobs` — then use UI filters (query params are disallowed by robots.txt) |
94
+ | Investor profile | `https://wellfound.com/investor/{slug}` |
95
+ | User profile | `https://wellfound.com/u/{username}` (disallowed by robots.txt, login wall) |
96
+
97
+ **Note on query params:** `robots.txt` disallows `?role=*`, `?jobId=*`, `?jobSlug=*`, `?location=*`.
98
+ Wellfound enforces these with login walls or redirects for most filtered job searches.
99
+
100
+ ---
101
+
102
+ ## Workflow 1: Company profile — name, description, team size, funding, tags
103
+
104
+ Navigate to the company page and extract structured data. Most fields are visible without login.
105
+
106
+ ```python
107
+ import json
108
+
109
+ new_tab("https://wellfound.com/company/stripe")
110
+ wait_for_load()
111
+ wait(5)
112
+
113
+ # Check for Apollo state (Rails + React app, not Next.js)
114
+ # Wellfound embeds data in window.gon or inline script tags
115
+ apollo_raw = js("""
116
+ (function() {
117
+ // Try window.__APOLLO_STATE__ (Apollo Client cache)
118
+ if (window.__APOLLO_STATE__) return JSON.stringify(window.__APOLLO_STATE__);
119
+ // Try window.gon (Rails Gon gem)
120
+ if (window.gon) return JSON.stringify(window.gon);
121
+ // Try inline <script> tags containing startup data
122
+ var scripts = Array.from(document.querySelectorAll('script:not([src])'));
123
+ for (var s of scripts) {
124
+ var t = s.textContent || '';
125
+ if (t.includes('"name"') && t.includes('"description"') && t.includes('teamSize')) {
126
+ return t.substring(0, 5000);
127
+ }
128
+ }
129
+ return null;
130
+ })()
131
+ """)
132
+
133
+ if apollo_raw:
134
+ try:
135
+ data = json.loads(apollo_raw)
136
+ # Apollo State: look for Startup:{id} keys
137
+ for key, val in data.items():
138
+ if key.startswith("Startup:") and isinstance(val, dict):
139
+ print("Company:", val.get("name"))
140
+ print("Description:", val.get("description") or val.get("highConcept"))
141
+ print("Team size:", val.get("teamSize"))
142
+ print("Total raised:", val.get("totalRaised"))
143
+ print("Hiring:", val.get("hiring"))
144
+ print(json.dumps(data, indent=2)[:3000])
145
+ except json.JSONDecodeError:
146
+ # Raw script tag — parse key fields with regex
147
+ import re
148
+ name = re.search(r'"name"\s*:\s*"([^"]+)"', apollo_raw)
149
+ desc = re.search(r'"description"\s*:\s*"([^"]+)"', apollo_raw)
150
+ print("Name:", name.group(1) if name else "not found")
151
+ print("Desc:", desc.group(1) if desc else "not found")
152
+ ```
153
+
154
+ If the structured data path fails, fall back to DOM extraction:
155
+
156
+ ```python
157
+ # DOM extraction — company profile page
158
+ profile = js(r"""
159
+ (function() {
160
+ // Company name — first h1 on the page
161
+ var nameEl = document.querySelector('h1');
162
+
163
+ // Description — first substantial paragraph or div with class containing 'description'
164
+ var descEl = (
165
+ document.querySelector('[class*="description"]') ||
166
+ document.querySelector('[class*="about"]') ||
167
+ document.querySelector('p[class*="startupDescription"]')
168
+ );
169
+
170
+ // Tags — market/role tags are links with /jobs?role= or /location/ in href
171
+ // Wellfound uses Tailwind (no stable class names) — use href pattern
172
+ var roleLinks = Array.from(document.querySelectorAll('a[href*="/jobs?role="]')).map(a => a.innerText.trim());
173
+ var locationLinks = Array.from(document.querySelectorAll('a[href*="/location/"]')).map(a => a.innerText.trim());
174
+
175
+ // Team size / funding — look in page text for patterns
176
+ var bodyText = document.body.innerText;
177
+
178
+ // Company size: "11-50 employees" or "51-200 people" pattern
179
+ var sizeMatch = bodyText.match(/(\d+[-–]\d+)\s+(employees|people)/i);
180
+ var teamSize = sizeMatch ? sizeMatch[0] : null;
181
+
182
+ // Funding: "$X.XM" or "Raised $X" pattern
183
+ var fundingMatch = bodyText.match(/\$[\d,.]+[KMBkm]\s*(raised|in funding|Series [A-Z])?/i);
184
+ var funding = fundingMatch ? fundingMatch[0] : null;
185
+
186
+ // Stage: "Series A", "Seed", "Series B", etc.
187
+ var stageMatch = bodyText.match(/\b(Seed|Series [A-Z]\+?|Pre-seed|Angel|Late Stage|Public)\b/);
188
+ var stage = stageMatch ? stageMatch[0] : null;
189
+
190
+ return JSON.stringify({
191
+ name: nameEl ? nameEl.innerText.trim() : null,
192
+ desc: descEl ? descEl.innerText.trim().substring(0, 500) : null,
193
+ teamSize: teamSize,
194
+ funding: funding,
195
+ stage: stage,
196
+ roles: roleLinks.slice(0, 10),
197
+ locations: locationLinks.slice(0, 5),
198
+ });
199
+ })()
200
+ """)
201
+
202
+ data = json.loads(profile)
203
+ print(json.dumps(data, indent=2))
204
+ ```
205
+
206
+ ---
207
+
208
+ ## Workflow 2: Company jobs listing
209
+
210
+ ```python
211
+ import json
212
+
213
+ company_slug = "stripe"
214
+ new_tab(f"https://wellfound.com/company/{company_slug}/jobs")
215
+ wait_for_load()
216
+ wait(5)
217
+
218
+ jobs = js("""
219
+ (function() {
220
+ // Job listing cards — prefer data-test hooks, then class-name heuristics on li/div elements
221
+ var cards = document.querySelectorAll('[data-test^="StartupJobListing"], li[class*="job"], div[class*="JobListing"]');
222
+ if (!cards.length) {
223
+ // Broad fallback: all anchor tags with /jobs/ in href
224
+ var links = Array.from(document.querySelectorAll('a[href*="/jobs/"]'));
225
+ return JSON.stringify(links.map(a => ({
226
+ title: a.innerText.trim().split('\\n')[0],
227
+ href: a.href,
228
+ })).filter(j => j.title && j.title.length > 2).slice(0, 30));
229
+ }
230
+ return JSON.stringify(Array.from(cards).map(card => {
231
+ var titleEl = card.querySelector('h2, h3, [class*="title"], [class*="jobTitle"]');
232
+ var locEl = card.querySelector('[class*="location"], [class*="Location"]');
233
+ var compEl = card.querySelector('[class*="salary"], [class*="comp"], [class*="equity"]');
234
+ var linkEl = card.querySelector('a[href*="/jobs/"]');
235
+ return {
236
+ title: titleEl ? titleEl.innerText.trim() : '',
237
+ location: locEl ? locEl.innerText.trim() : '',
238
+ comp: compEl ? compEl.innerText.trim() : '',
239
+ href: linkEl ? linkEl.href : '',
240
+ };
241
+ }).filter(j => j.title));
242
+ })()
243
+ """)
244
+
245
+ results = json.loads(jobs)
246
+ print(f"Found {len(results)} jobs")
247
+ for j in results:
248
+ print(f" {j['title']} | {j.get('location','?')} | {j.get('comp','?')}")
249
+ ```
250
+
251
+ ---
252
+
253
+ ## Workflow 3: Job board — browse all jobs
254
+
255
+ The main `/jobs` page shows a curated job feed. Do not filter via URL params: `?role=...` and similar query strings are disallowed by `robots.txt` and usually trigger a login wall or redirect. Use the UI dropdown filters after loading the page.
256
+
257
+ ```python
258
+ import json
259
+
260
+ new_tab("https://wellfound.com/jobs")
261
+ wait_for_load()
262
+ wait(5)
263
+
264
+ # Extract visible job cards
265
+ jobs = js("""
266
+ (function() {
267
+ // Job cards on the main /jobs board
268
+ var cards = document.querySelectorAll(
269
+ '[data-test*="job"], [class*="JobCard"], [class*="jobListing"], ' +
270
+ 'li[class*="job"], article[class*="job"]'
271
+ );
272
+ if (!cards.length) {
273
+ // Fallback: links to job detail pages
274
+ var links = Array.from(document.querySelectorAll('a[href*="/company/"][href*="/jobs/"]'));
275
+ return JSON.stringify(links.map(a => ({
276
+ href: a.href,
277
+ text: a.innerText.trim().substring(0, 100),
278
+ })).slice(0, 30));
279
+ }
280
+ return JSON.stringify(Array.from(cards).map(card => {
281
+ var titleEl = card.querySelector('h2, h3, [class*="title"]');
282
+ var companyEl = card.querySelector('[class*="company"], [class*="startup"]');
283
+ var locEl = card.querySelector('[class*="location"]');
284
+ var linkEl = card.querySelector('a[href*="/jobs/"]');
285
+ return {
286
+ title: titleEl ? titleEl.innerText.trim() : '',
287
+ company: companyEl ? companyEl.innerText.trim() : '',
288
+ location: locEl ? locEl.innerText.trim() : '',
289
+ href: linkEl ? linkEl.href : '',
290
+ };
291
+ }).filter(j => j.title));
292
+ })()
293
+ """)
294
+
295
+ results = json.loads(jobs)
296
+ print(f"Found {len(results)} jobs")
297
+ ```
298
+
299
+ ---
300
+
301
+ ## Workflow 4: GraphQL API (authenticated sessions only)
302
+
303
+ Wellfound's GraphQL endpoint (`/graphql`) requires:
304
+ 1. A valid `_wellfound` session cookie from a real browser load
305
+ 2. A CSRF token from the page's `<meta name="csrf-token">` tag
306
+ 3. Cloudflare Bot Management to have passed (only happens in a real Chrome session)
307
+
308
+ **This approach only works from inside a browser session (after navigating to any Wellfound page).**
309
+
310
+ ```python
311
+ import json
312
+
313
+ # Step 1: Load any Wellfound page so the session cookie + DataDome cookie are set
314
+ new_tab("https://wellfound.com/")
315
+ wait_for_load()
316
+ wait(5)
317
+
318
+ # Step 2: Extract CSRF token from meta tag
319
+ csrf = js("document.querySelector('meta[name=\"csrf-token\"]') ? document.querySelector('meta[name=\"csrf-token\"]').getAttribute('content') : null")
320
+ if not csrf:
321
+ raise RuntimeError("CSRF token not found — page may not have loaded correctly")
322
+
323
+ print(f"CSRF token: {csrf[:20]}...")
324
+
325
+ # Step 3: Execute GraphQL query via fetch() from within the browser
326
+ # This uses the browser's existing cookies automatically
327
+ result = js(f"""
328
+ (async function() {{
329
+ try {{
330
+ var resp = await fetch('/graphql', {{
331
+ method: 'POST',
332
+ credentials: 'include',
333
+ headers: {{
334
+ 'Content-Type': 'application/json',
335
+ 'Accept': 'application/json',
336
+ 'x-csrf-token': '{csrf}',
337
+ 'x-requested-with': 'XMLHttpRequest',
338
+ }},
339
+ body: JSON.stringify({{
340
+ query: `query StartupShow($slug: String!) {{
341
+ startup(slug: $slug) {{
342
+ id
343
+ name
344
+ description: highConcept
345
+ productDesc
346
+ teamSize
347
+ locations {{ displayName }}
348
+ markets {{ displayName }}
349
+ totalRaised
350
+ fundingStage
351
+ badges
352
+ hiring
353
+ jobListingsCount
354
+ }}
355
+ }}`,
356
+ variables: {{ slug: "stripe" }}
357
+ }})
358
+ }});
359
+ var data = await resp.json();
360
+ return JSON.stringify(data);
361
+ }} catch(e) {{
362
+ return JSON.stringify({{error: e.message}});
363
+ }}
364
+ }})()
365
+ """)
366
+
367
+ # js() with async returns a Promise — use js_async() if available, or eval trick:
368
+ # Note: the above may return None if js() doesn't await Promises.
369
+ # Use this pattern instead if js() doesn't handle async:
370
+ result_sync = js("""
371
+ var done = false, out = null;
372
+ fetch('/graphql', {
373
+ method: 'POST',
374
+ credentials: 'include',
375
+ headers: {
376
+ 'Content-Type': 'application/json',
377
+ 'Accept': 'application/json',
378
+ 'x-csrf-token': document.querySelector('meta[name="csrf-token"]').content,
379
+ 'x-requested-with': 'XMLHttpRequest',
380
+ },
381
+ body: JSON.stringify({
382
+ query: '{ __typename }',
383
+ })
384
+ }).then(r => r.json()).then(d => { window._wf_gql_result = JSON.stringify(d); });
385
+ 'pending'
386
+ """)
387
+ # Wait for async result
388
+ import time; time.sleep(3)
389
+ gql_result = js("window._wf_gql_result || null")
390
+ if gql_result:
391
+ data = json.loads(gql_result)
392
+ print("GraphQL response:", json.dumps(data, indent=2)[:1000])
393
+ ```
394
+
395
+ ### Known GraphQL operations
396
+
397
+ | Operation | Purpose |
398
+ |-----------|---------|
399
+ | `StartupShow` | Full company profile (name, desc, funding, team size, markets) |
400
+ | `JobListingsIndex` | Paginated job board |
401
+ | `JobSearch` | Filtered job search by role/location |
402
+ | `UserProfile` | User/candidate profile |
403
+ | `InvestorShow` | VC/investor profile |
404
+
405
+ ---
406
+
407
+ ## Handling the login wall
408
+
409
+ Wellfound shows a sign-in modal on:
410
+ - Job detail pages (immediately or after 2-3 seconds)
411
+ - Candidate profile pages (immediately)
412
+ - Some company pages after scrolling
413
+
414
+ Company overview pages typically show content without login. Job listings require login to see full details and apply.
415
+
416
+ ```python
417
+ def dismiss_wellfound_login_modal():
418
+ """Close the Wellfound sign-in modal. Safe to call if no modal is present."""
419
+ closed = js("""
420
+ (function() {
421
+ var selectors = [
422
+ 'button[aria-label="Close"]',
423
+ 'button[class*="close"]',
424
+ 'button[class*="Close"]',
425
+ '[data-test="close-modal"]',
426
+ '[aria-label="Dismiss"]',
427
+ 'button[class*="dismiss"]',
428
+ // Wellfound-specific: modal overlay dismiss
429
+ 'div[class*="Modal"] button[type="button"]',
430
+ ];
431
+ for (var s of selectors) {
432
+ var btn = document.querySelector(s);
433
+ if (btn && btn.offsetParent !== null) {
434
+ btn.click();
435
+ return s;
436
+ }
437
+ }
438
+ // Try pressing Escape
439
+ document.dispatchEvent(new KeyboardEvent('keydown', {key: 'Escape', keyCode: 27, bubbles: true}));
440
+ return 'escape';
441
+ })()
442
+ """)
443
+ if closed:
444
+ wait(1)
445
+ return closed
446
+ ```
447
+
448
+ ---
449
+
450
+ ## Detecting DataDome / challenge page
451
+
452
+ After `new_tab()` + `wait(5)`, verify you are on a real Wellfound page:
453
+
454
+ ```python
455
+ def wellfound_is_blocked() -> bool:
456
+ """True if DataDome or Cloudflare challenge is still showing."""
457
+ title = js("document.title") or ""
458
+ url = page_info()["url"]
459
+ # DataDome challenge page has no useful title; CF shows "Just a moment..."
460
+ blocked = (
461
+ "Just a moment" in title or
462
+ "wellfound.com" not in url or
463
+ "captcha-delivery.com" in (js("document.body ? document.body.innerHTML : ''") or "") or
464
+ not title
465
+ )
466
+ return blocked
467
+
468
+ # Usage
469
+ new_tab("https://wellfound.com/company/stripe")
470
+ wait_for_load()
471
+ wait(5)
472
+
473
+ if wellfound_is_blocked():
474
+ wait(8) # DataDome sometimes needs up to 10s total
475
+ if wellfound_is_blocked():
476
+ capture_screenshot("/tmp/wellfound_blocked.png")
477
+ raise RuntimeError("DataDome/CF challenge did not resolve — see /tmp/wellfound_blocked.png")
478
+ ```
479
+
480
+ ---
481
+
482
+ ## Key selectors reference
483
+
484
+ Wellfound uses **Tailwind CSS** — no stable semantic class names. These patterns are robust:
485
+
486
+ | Target | Selector strategy |
487
+ |--------|------------------|
488
+ | Company name | `h1` (first on page) |
489
+ | Company description | `[class*="description"]`, `[class*="about"]` |
490
+ | Team size | Text search: `/\d+[-–]\d+\s+(employees\|people)/i` |
491
+ | Funding amount | Text search: `/\$[\d,.]+[KMBkm]/i` |
492
+ | Funding stage | Text search: `/\b(Seed\|Series [A-Z]\+?\|Pre-seed\|Late Stage)\b/` |
493
+ | Role/market tags | `a[href*="/jobs?role="]` |
494
+ | Location tags | `a[href*="/location/"]` |
495
+ | Job cards | `a[href*="/company/"][href*="/jobs/"]` (broad fallback) |
496
+ | Job title | `h2`, `h3`, `[class*="title"]` within card |
497
+ | CSRF token | `meta[name="csrf-token"]` |
498
+ | Login modal | `button[aria-label="Close"]`, Escape key |
499
+
500
+ ---
501
+
502
+ ## Common pitfalls
503
+
504
+ 1. **`http_get` is permanently blocked.** DataDome intercepts all non-browser HTTP requests with
505
+ a 403 + captcha challenge. No User-Agent, header combination, or cookie replay works.
506
+ `api.angel.co` is HTTP 404 (shut down). Use `new_tab()` exclusively.
507
+
508
+ 2. **NOT a Next.js app.** Wellfound is Ruby on Rails + React. There is no `__NEXT_DATA__` JSON
509
+ blob. Look for `window.__APOLLO_STATE__`, `window.gon`, or inline `<script>` tags instead.
510
+
511
+ 3. **`wait(5)` minimum after `wait_for_load()`.** DataDome runs JS fingerprinting probes for
512
+ 2-4 seconds after `readyState = complete`. Extracting before this resolves returns the challenge
513
+ page HTML, not real content.
514
+
515
+ 4. **Tailwind CSS — no stable class names.** Wellfound uses Tailwind utility classes. Never
516
+ hardcode a specific class name. Use `href` attribute patterns, `data-test` attributes if present,
517
+ or semantic element selectors (`h1`, `h2`, `li`, `article`).
518
+
519
+ 5. **GraphQL requires both CSRF token AND browser session cookies.** The CSRF token is a
520
+ per-session value from `<meta name="csrf-token">`. Cloudflare Bot Management blocks
521
+ `POST /graphql` from non-browser sessions. Always fire GraphQL via `fetch()` inside the
522
+ browser session (not from Python's `http_get`).
523
+
524
+ 6. **`?role=` and `?location=` params are robots.txt-disallowed.** Wellfound may redirect or
525
+ show a login wall for filtered job search URLs. Load `/jobs` unfiltered and use in-page
526
+ UI filters (dropdowns) to narrow results.
527
+
528
+ 7. **Login wall on job details and user profiles.** Company overview pages load without login.
529
+ Individual job detail pages, and all `/u/{username}` profiles, hit a login modal immediately.
530
+ Call `dismiss_wellfound_login_modal()` right after `wait(5)` on these pages.
531
+
532
+ 8. **Rate limiting.** After ~5-10 rapid page navigations DataDome may harden. Use `wait(3)` between
533
+ `goto_url()` calls. If you get a captcha that does not auto-resolve, wait 30-60 seconds.
534
+
535
+ 9. **`new_tab()` over `goto_url()` for the first Wellfound page.** `goto_url()` in an existing tab
536
+ may inherit a stale DataDome fingerprint. `new_tab()` gives a clean origin context that
537
+ DataDome processes cleanly.
538
+
539
+ ---
540
+
541
+ ## Anti-bot response identification
542
+
543
+ What you see in the 403 body when NOT in a browser:
544
+
545
+ ```html
546
+ <!-- DataDome challenge (page GETs) -->
547
+ <script>var dd={'rt':'c','cid':'...','t':'bv','host':'geo.captcha-delivery.com',...}</script>
548
+ <script src="https://ct.captcha-delivery.com/c.js"></script>
549
+ <!-- rt='c' = captcha required; rt='i' = invisible solve; rt='b' = blocked -->
550
+
551
+ <!-- Cloudflare challenge (API POSTs) -->
552
+ <title>Just a moment...</title>
553
+ <script>window.__CF$cv$params={r:'...',t:'...'}</script>
554
+ ```
555
+
556
+ In a real Chrome browser, both challenges resolve automatically without user interaction.
557
+
558
+ ---
559
+
560
+ ## Minimal working example
561
+
562
+ ```python
563
+ import json
564
+
565
+ # Open Wellfound company page
566
+ new_tab("https://wellfound.com/company/openai")
567
+ wait_for_load()
568
+ wait(5)
569
+
570
+ # Verify not blocked
571
+ title = js("document.title")
572
+ assert "Just a moment" not in (title or ""), f"Still on challenge page: {title}"
573
+
574
+ # Extract company overview
575
+ data = js("""
576
+ (function() {
577
+ var name = document.querySelector('h1');
578
+ var bodyText = document.body.innerText;
579
+ var sizeMatch = bodyText.match(/(\\d+[-\\u2013]\\d+)\\s+(employees|people)/i);
580
+ var fundingMatch = bodyText.match(/\\$[\\d,.]+[KMBkm](?:\\s+(?:raised|total))?/i);
581
+ var stageMatch = bodyText.match(/\\b(Seed|Series [A-Z]\\+?|Pre-seed|Late Stage|Public)\\b/);
582
+ var tags = Array.from(document.querySelectorAll('a[href*="/jobs?role="]')).map(a => a.innerText.trim());
583
+ var locs = Array.from(document.querySelectorAll('a[href*="/location/"]')).map(a => a.innerText.trim());
584
+ return JSON.stringify({
585
+ name: name ? name.innerText.trim() : null,
586
+ teamSize: sizeMatch ? sizeMatch[0] : null,
587
+ funding: fundingMatch ? fundingMatch[0] : null,
588
+ stage: stageMatch ? stageMatch[0] : null,
589
+ roles: tags.slice(0, 8),
590
+ locations: locs.slice(0, 5),
591
+ });
592
+ })()
593
+ """)
594
+
595
+ print(json.dumps(json.loads(data), indent=2))
596
+ ```