jobcrawl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/.prettierrc.json +10 -0
  2. package/CHANGELOG.md +40 -0
  3. package/README.md +232 -0
  4. package/dist/core/aggregators/yc.d.ts +7 -0
  5. package/dist/core/aggregators/yc.js +320 -0
  6. package/dist/core/browser.d.ts +30 -0
  7. package/dist/core/browser.js +196 -0
  8. package/dist/core/cache.d.ts +13 -0
  9. package/dist/core/cache.js +41 -0
  10. package/dist/core/detect-provider.d.ts +7 -0
  11. package/dist/core/detect-provider.js +125 -0
  12. package/dist/core/discover-careers.d.ts +18 -0
  13. package/dist/core/discover-careers.js +92 -0
  14. package/dist/core/extract-jobs.d.ts +14 -0
  15. package/dist/core/extract-jobs.js +36 -0
  16. package/dist/core/fetch-page.d.ts +11 -0
  17. package/dist/core/fetch-page.js +39 -0
  18. package/dist/core/format-output.d.ts +2 -0
  19. package/dist/core/format-output.js +59 -0
  20. package/dist/core/match-jobs.d.ts +6 -0
  21. package/dist/core/match-jobs.js +43 -0
  22. package/dist/core/providers/ashby.d.ts +6 -0
  23. package/dist/core/providers/ashby.js +58 -0
  24. package/dist/core/providers/generic.d.ts +6 -0
  25. package/dist/core/providers/generic.js +294 -0
  26. package/dist/core/providers/greenhouse.d.ts +6 -0
  27. package/dist/core/providers/greenhouse.js +47 -0
  28. package/dist/core/providers/lever.d.ts +7 -0
  29. package/dist/core/providers/lever.js +60 -0
  30. package/dist/core/providers/yc.d.ts +7 -0
  31. package/dist/core/providers/yc.js +320 -0
  32. package/dist/core/resolve-iframe.d.ts +6 -0
  33. package/dist/core/resolve-iframe.js +51 -0
  34. package/dist/core/save-raw.d.ts +4 -0
  35. package/dist/core/save-raw.js +13 -0
  36. package/dist/data/companies.d.ts +9 -0
  37. package/dist/data/companies.js +2849 -0
  38. package/dist/entrypoints/cli/app.d.ts +3 -0
  39. package/dist/entrypoints/cli/app.js +91 -0
  40. package/dist/entrypoints/cli/components/crawl-view.d.ts +1 -0
  41. package/dist/entrypoints/cli/components/crawl-view.js +94 -0
  42. package/dist/entrypoints/cli/components/discover-view.d.ts +1 -0
  43. package/dist/entrypoints/cli/components/discover-view.js +67 -0
  44. package/dist/entrypoints/cli/crawl-aggregators.d.ts +26 -0
  45. package/dist/entrypoints/cli/crawl-aggregators.js +76 -0
  46. package/dist/entrypoints/cli/crawl-url.d.ts +26 -0
  47. package/dist/entrypoints/cli/crawl-url.js +54 -0
  48. package/dist/entrypoints/cli/crawl.d.ts +32 -0
  49. package/dist/entrypoints/cli/crawl.js +108 -0
  50. package/dist/entrypoints/cli/discover.d.ts +10 -0
  51. package/dist/entrypoints/cli/discover.js +69 -0
  52. package/dist/entrypoints/cli/index.d.ts +2 -0
  53. package/dist/entrypoints/cli/index.js +197 -0
  54. package/dist/entrypoints/cli/init.d.ts +9 -0
  55. package/dist/entrypoints/cli/init.js +94 -0
  56. package/dist/entrypoints/cli/plain.d.ts +6 -0
  57. package/dist/entrypoints/cli/plain.js +77 -0
  58. package/dist/events.d.ts +114 -0
  59. package/dist/events.js +17 -0
  60. package/dist/orchestrators/crawl-all.d.ts +2 -0
  61. package/dist/orchestrators/crawl-all.js +66 -0
  62. package/dist/orchestrators/discover-all.d.ts +10 -0
  63. package/dist/orchestrators/discover-all.js +39 -0
  64. package/dist/threads/pool.d.ts +5 -0
  65. package/dist/threads/pool.js +23 -0
  66. package/dist/threads/process-url.d.ts +9 -0
  67. package/dist/threads/process-url.js +229 -0
  68. package/dist/types/index.d.ts +83 -0
  69. package/dist/types/index.js +6 -0
  70. package/dist/utils/config.d.ts +17 -0
  71. package/dist/utils/config.js +57 -0
  72. package/dist/utils/google-search.d.ts +19 -0
  73. package/dist/utils/google-search.js +139 -0
  74. package/dist/utils/llm.d.ts +8 -0
  75. package/dist/utils/llm.js +25 -0
  76. package/package.json +42 -0
  77. package/src/core/aggregators/yc.ts +415 -0
  78. package/src/core/browser.ts +239 -0
  79. package/src/core/detect-provider.ts +162 -0
  80. package/src/core/discover-careers.ts +117 -0
  81. package/src/core/extract-jobs.ts +50 -0
  82. package/src/core/fetch-page.ts +41 -0
  83. package/src/core/format-output.ts +80 -0
  84. package/src/core/match-jobs.ts +56 -0
  85. package/src/core/providers/ashby.ts +84 -0
  86. package/src/core/providers/generic.ts +332 -0
  87. package/src/core/providers/greenhouse.ts +74 -0
  88. package/src/core/providers/lever.ts +90 -0
  89. package/src/core/resolve-iframe.ts +59 -0
  90. package/src/core/save-raw.ts +18 -0
  91. package/src/data/companies.ts +2859 -0
  92. package/src/entrypoints/cli/app.tsx +173 -0
  93. package/src/entrypoints/cli/components/crawl-view.tsx +163 -0
  94. package/src/entrypoints/cli/components/discover-view.tsx +138 -0
  95. package/src/entrypoints/cli/crawl-aggregators.ts +112 -0
  96. package/src/entrypoints/cli/crawl-url.ts +87 -0
  97. package/src/entrypoints/cli/crawl.ts +163 -0
  98. package/src/entrypoints/cli/discover.ts +96 -0
  99. package/src/entrypoints/cli/index.ts +252 -0
  100. package/src/entrypoints/cli/init.ts +117 -0
  101. package/src/entrypoints/cli/plain.ts +104 -0
  102. package/src/events.ts +79 -0
  103. package/src/orchestrators/crawl-all.ts +96 -0
  104. package/src/orchestrators/discover-all.ts +61 -0
  105. package/src/threads/pool.ts +29 -0
  106. package/src/threads/process-url.ts +312 -0
  107. package/src/types/index.ts +110 -0
  108. package/src/utils/config.ts +79 -0
  109. package/src/utils/google-search.ts +155 -0
  110. package/src/utils/llm.ts +33 -0
  111. package/test/integration/process-url.test.ts +301 -0
  112. package/test/integration/providers/ashby.test.ts +163 -0
  113. package/test/integration/providers/greenhouse.test.ts +191 -0
  114. package/test/integration/providers/lever.test.ts +188 -0
  115. package/test/unit/config.test.ts +64 -0
  116. package/test/unit/detect-provider.test.ts +165 -0
  117. package/test/unit/events.test.ts +104 -0
  118. package/test/unit/format-output.test.ts +165 -0
  119. package/test/unit/match-jobs.test.ts +257 -0
  120. package/test/unit/pool.test.ts +74 -0
  121. package/test/unit/providers/generic.test.ts +139 -0
  122. package/test/unit/resolve-iframe.test.ts +100 -0
  123. package/tsconfig.json +19 -0
  124. package/vitest.config.ts +7 -0
@@ -0,0 +1,10 @@
1
+ {
2
+ "semi": true,
3
+ "trailingComma": "es5",
4
+ "singleQuote": false,
5
+ "printWidth": 80,
6
+ "tabWidth": 2,
7
+ "useTabs": false,
8
+ "arrowParens": "always",
9
+ "endOfLine": "lf"
10
+ }
package/CHANGELOG.md ADDED
@@ -0,0 +1,40 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ ## [0.1.0] - 2026-04-02
6
+
7
+ Initial release of jobcrawl — a CLI tool that discovers and crawls company career pages to find matching job listings.
8
+
9
+ ### Added
10
+
11
+ - **Project scaffold** — TypeScript project with Prettier, ESM modules, and core dependencies.
12
+ - **Type system** — Shared type definitions (`Company`, `Job`, `CrawlResult`, `AggregatorResult`) and a typed event bus for inter-module communication.
13
+ - **ATS provider extractors** — Built-in support for Greenhouse, Lever, and Ashby career pages with structured job extraction.
14
+ - **Generic provider** — Fallback extractor using LLM-powered container-based extraction strategy for non-standard career pages.
15
+ - **Provider detection** — Automatic ATS provider identification from page HTML and URL patterns.
16
+ - **iframe resolution** — Detects and resolves iframe-embedded career pages to their source URLs.
17
+ - **Career page discovery** — Finds career pages via Google search and heuristic URL guessing (e.g. `/careers`, `/jobs`).
18
+ - **Job matching** — LLM-based relevance scoring to filter jobs against user-defined search criteria.
19
+ - **Crawl orchestration** — Bounded concurrency pool for parallel crawling with progress tracking.
20
+ - **YC aggregator** — Scrapes Y Combinator's Work at a Startup with extended search filters (role, location, industry) and raw data capture.
21
+ - **Aggregator architecture** — Separated aggregator type from direct providers, with dedicated CLI commands and crawl orchestration.
22
+ - **Browser rendering** — Integrated `agent-browser` for JavaScript-rendered pages that can't be crawled with static HTTP fetches.
23
+ - **Companies dataset** — Built-in dataset of ~2,800 companies for `init` config generation.
24
+ - **CLI with Ink UI** — Interactive terminal interface with step-based thread progress, plus plain-text output mode.
25
+ - **CLI commands** — `init`, `crawl`, `crawl-url`, `crawl-aggregators`, and `discover` commands.
26
+ - **Config system** — YAML-based configuration for companies, search criteria, and crawl settings.
27
+ - **Output formatting** — Structured JSON and human-readable output for crawl results.
28
+ - **Raw data capture** — Optional saving of raw HTML/JSON responses for debugging.
29
+ - **Test suite** — Unit tests (detect-provider, match-jobs, resolve-iframe, config, events, format-output, pool, generic provider) and integration tests (process-url, Ashby, Greenhouse, Lever providers).
30
+
31
+ ### Fixed
32
+
33
+ - Job deduplication now uses URL+title composite key instead of URL alone, preventing false duplicates across pages.
34
+ - File permissions restricted on credentials and config directory during `init`.
35
+
36
+ ### Changed
37
+
38
+ - Removed `dotenv` dependency — environment variables are expected to be set externally.
39
+ - Rewrote CrawlView component with step-based thread progress for clearer status reporting.
40
+ - Rewrote README to cover aggregators, browser rendering, and the new config format.
package/README.md ADDED
@@ -0,0 +1,232 @@
1
+ # jobcrawl
2
+
3
+ CLI tool that crawls career pages for jobs matching your search criteria. Supports Greenhouse, Ashby, and Lever via their public JSON APIs, Y Combinator's Work at a Startup via Algolia, and custom career pages via browser rendering + DOM heuristics.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm install
9
+ npm run build
10
+ ```
11
+
12
+ Requires Node.js 20+. Optional, for extra features:
13
+
14
+ - `ANTHROPIC_API_KEY` — for the `discover` command's LLM features
15
+ - `agent-browser` — for JS-rendered career pages (`npm i -g agent-browser`)
16
+
17
+ ## Quick start
18
+
19
+ ```bash
20
+ # Create config at ~/.jobcrawl/config.yaml + credentials.json
21
+ jobcrawl init
22
+
23
+ # Add your YC Algolia credentials to ~/.jobcrawl/credentials.json
24
+ # Then crawl
25
+ jobcrawl crawl --keywords "engineer" --output table
26
+
27
+ # Or search YC startups directly
28
+ jobcrawl crawl-aggregators yc --keywords "engineer" --role engineering
29
+ ```
30
+
31
+ ## Commands
32
+
33
+ ### `init` — Create config files
34
+
35
+ Creates `~/.jobcrawl/config.yaml` (pre-populated with 200+ companies), `~/.jobcrawl/credentials.json` (for aggregator API keys), and `~/.jobcrawl/raw/` directory.
36
+
37
+ ```bash
38
+ jobcrawl init # Create default config
39
+ jobcrawl init --force # Overwrite existing config
40
+ ```
41
+
42
+ ### `crawl` — Crawl career pages
43
+
44
+ ```bash
45
+ # Uses ~/.jobcrawl/config.yaml by default
46
+ jobcrawl crawl --keywords "engineer" --output table
47
+
48
+ # From a specific config file
49
+ jobcrawl crawl --file config.yaml --keywords "engineer"
50
+
51
+ # From inline URLs
52
+ jobcrawl crawl --urls https://boards.greenhouse.io/anthropic https://jobs.ashbyhq.com/openai \
53
+ --keywords "senior" "engineer" --remote
54
+
55
+ # Include YC aggregator alongside company targets
56
+ jobcrawl crawl --aggregators yc --keywords "engineer" --company-stage seed
57
+
58
+ # Pipe to jq
59
+ jobcrawl crawl --keywords "staff" | jq '.[].title'
60
+ ```
61
+
62
+ ### `crawl-url` — Crawl a single URL
63
+
64
+ ```bash
65
+ jobcrawl crawl-url https://boards.greenhouse.io/anthropic --keywords "engineer" --output table
66
+ ```
67
+
68
+ ### `crawl-aggregators` — Search aggregator sources
69
+
70
+ Searches across multiple companies in one query. Currently supports YC (Work at a Startup).
71
+
72
+ ```bash
73
+ # Search YC startups
74
+ jobcrawl crawl-aggregators yc --keywords "engineer" --output table
75
+
76
+ # With extended filters
77
+ jobcrawl crawl-aggregators yc --keywords "engineer" --role engineering --role-type backend \
78
+ --company-stage seed --has-salary --visa-sponsorship
79
+ ```
80
+
81
+ Requires Algolia credentials in `~/.jobcrawl/credentials.json` or via env vars `YC_ALGOLIA_APP_ID` and `YC_ALGOLIA_API_KEY`.
82
+
83
+ ### `discover` — Find career pages from company names
84
+
85
+ ```bash
86
+ # Output URLs (one per line)
87
+ jobcrawl discover --companies "Anthropic" "OpenAI" "Mistral AI"
88
+
89
+ # Save as YAML config
90
+ jobcrawl discover --companies "Anthropic" "OpenAI" --output yaml -o ~/.jobcrawl/config.yaml
91
+
92
+ # Pipe discover into crawl
93
+ jobcrawl discover --companies "Anthropic" | jobcrawl crawl --keywords "engineer"
94
+ ```
95
+
96
+ ### `detect` — Identify ATS provider
97
+
98
+ ```bash
99
+ jobcrawl detect https://boards.greenhouse.io/anthropic
100
+ # → { "provider": "greenhouse", "boardToken": "anthropic" }
101
+ ```
102
+
103
+ ### `match` — Filter a saved jobs file
104
+
105
+ ```bash
106
+ jobcrawl crawl-url https://jobs.ashbyhq.com/openai -o jobs.json
107
+ jobcrawl match jobs.json --keywords "senior" --remote --output table
108
+ ```
109
+
110
+ ## Search criteria flags
111
+
112
+ | Flag | Description |
113
+ |------|-------------|
114
+ | `--keywords <terms...>` | Job title keywords (any match) |
115
+ | `--exclude <terms...>` | Exclude jobs matching these keywords |
116
+ | `--location <location>` | Location substring filter |
117
+ | `--remote` | Only remote jobs |
118
+ | `--onsite` | Only onsite jobs |
119
+ | `--hybrid` | Only hybrid jobs |
120
+ | `--department <depts...>` | Department filter |
121
+ | `--role <roles...>` | Role category: engineering, design, product, science, sales, marketing, etc. |
122
+ | `--role-type <types...>` | Specialization: backend, frontend, full-stack, ML, DevOps, etc. |
123
+ | `--job-type <types...>` | fulltime, internship, contract |
124
+ | `--company-stage <stages...>` | seed, series-a, growth, scale |
125
+ | `--industry <industries...>` | Industry filter |
126
+ | `--company-size <sizes...>` | 1-10, 11-50, 51-300, 301+ |
127
+ | `--has-salary` | Only jobs with salary listed |
128
+ | `--has-equity` | Only jobs with equity |
129
+ | `--has-interview-process` | Only jobs with interview process listed |
130
+ | `--visa-sponsorship` | Jobs offering visa sponsorship |
131
+
132
+ ## Output formats
133
+
134
+ `--output json` (default), `table`, `markdown`, `csv`
135
+
136
+ Write to file with `-o <file>`. Save raw API responses with `--save-raw` (stored in `~/.jobcrawl/raw/`).
137
+
138
+ ## Config file
139
+
140
+ Located at `~/.jobcrawl/config.yaml`. Created by `jobcrawl init`.
141
+
142
+ ```yaml
143
+ # Aggregators (cross-company search engines)
144
+ aggregators:
145
+ - type: yc
146
+ enabled: true
147
+
148
+ # Companies — slug-based (recommended) or URL-based
149
+ companies:
150
+ # --- AI Foundation Models ---
151
+ - company: Anthropic
152
+ slug: anthropic
153
+ provider: greenhouse
154
+ - company: OpenAI
155
+ slug: openai
156
+ provider: ashby
157
+
158
+ # URL-based (also works)
159
+ - url: https://mistral.ai/careers/
160
+ company: Mistral AI
161
+
162
+ defaults:
163
+ concurrency: 5
164
+ ```
165
+
166
+ **Slug-based targets** (recommended): jobcrawl auto-probes Greenhouse, Ashby, and Lever APIs using the slug. Add an optional `provider` hint to skip probing, or a `fallback` URL for when no ATS API matches.
167
+
168
+ **URL-based targets**: directly probe the URL, falling back to browser rendering for JS-rendered pages.
169
+
170
+ Target resolution order:
171
+ 1. `--urls` flag (inline URLs)
172
+ 2. `--file` flag (explicit config file)
173
+ 3. `~/.jobcrawl/config.yaml` (default config)
174
+ 4. `stdin` (piped input)
175
+
176
+ ## Credentials
177
+
178
+ `~/.jobcrawl/credentials.json` stores API keys for aggregators:
179
+
180
+ ```json
181
+ {
182
+ "yc": {
183
+ "algoliaAppId": "YOUR_APP_ID",
184
+ "algoliaApiKey": "YOUR_API_KEY"
185
+ }
186
+ }
187
+ ```
188
+
189
+ Alternatively, set `YC_ALGOLIA_APP_ID` and `YC_ALGOLIA_API_KEY` environment variables.
190
+
191
+ ## How it works
192
+
193
+ **Three-tier extraction:**
194
+
195
+ 1. **Tier 1 (API-first)** — Quick HTTP probe detects ATS signals (Greenhouse, Ashby, Lever iframes/embeds), then calls their public JSON APIs directly. No browser needed. Sub-second.
196
+
197
+ 2. **Tier 2 (Browser rendering)** — For custom career pages, renders with agent-browser (Chrome via CDP), then extracts jobs via DOM heuristics: link-based extraction for pages with job links, or container-based extraction for pages with structured job cards.
198
+
199
+ 3. **Tier 3 (Aggregator)** — Searches cross-company job engines like YC's Work at a Startup (Algolia API). Runs in parallel with target crawling; results are deduplicated.
200
+
201
+ **Architecture:**
202
+
203
+ ```
204
+ ~/.jobcrawl/
205
+ ├── config.yaml # Targets + aggregators + defaults
206
+ ├── credentials.json # API keys for aggregators
207
+ └── raw/ # Saved raw API responses
208
+
209
+ src/
210
+ ├── events.ts # Typed EventBus (pub/sub)
211
+ ├── types/ # Shared type definitions
212
+ ├── data/ # Curated companies dataset
213
+ ├── core/ # Pure primitives (fetch, detect, extract, match)
214
+ │ ├── providers/ # Greenhouse, Ashby, Lever, generic extractors
215
+ │ ├── aggregators/ # YC (Algolia)
216
+ │ └── browser.ts # agent-browser wrapper (render, click-capture)
217
+ ├── threads/ # Per-URL pipeline + bounded concurrency pool
218
+ ├── orchestrators/ # Multi-URL + aggregator coordination
219
+ ├── utils/ # Config loading, LLM, web search
220
+ └── entrypoints/cli/ # Commander.js + Ink terminal UI
221
+ └── components/ # CrawlView, DiscoverView (React/Ink)
222
+ ```
223
+
224
+ ## Supported providers
225
+
226
+ | Provider | Detection | Extraction |
227
+ |----------|-----------|------------|
228
+ | Greenhouse | URL, iframe, `#grnhse_app`, embed script | Public JSON API |
229
+ | Ashby | URL, iframe, embed markers | Public JSON API |
230
+ | Lever | URL, iframe, `data-lever-*` | Public JSON API |
231
+ | YC (Work at a Startup) | Aggregator config | Algolia search API |
232
+ | Custom | Fallback | Browser render + DOM heuristics |
@@ -0,0 +1,7 @@
1
import type { Job, SearchCriteria } from "../../types/index.js";
/**
 * Fetch jobs from YC's Work at a Startup via Algolia.
 * Maps SearchCriteria to Algolia filters. Falls back to URL query params
 * for backward compatibility with direct WaaS URLs.
 *
 * @param sourceUrl - WaaS listing URL; its query string supplies legacy filter values.
 * @param criteria - User search criteria translated into Algolia facet filters.
 * @param saveRaw - When true, the raw Algolia hits are persisted for debugging.
 * @returns Jobs deduplicated by Algolia objectID across keyword queries and pages.
 */
export declare function extractYcJobs(sourceUrl: string, criteria: SearchCriteria, saveRaw?: boolean): Promise<Job[]>;
@@ -0,0 +1,320 @@
1
+ import { createHash } from "node:crypto";
2
+ import { readFileSync, existsSync } from "node:fs";
3
+ import { homedir } from "node:os";
4
+ import { join } from "node:path";
5
+ import { saveRawResponse } from "../save-raw.js";
6
// Page size for each Algolia request; 100 keeps responses small while
// limiting round trips during pagination.
const HITS_PER_PAGE = 100;
// Index sorted newest-first — used when browsing without a keyword query.
const INDEX_BY_DATE = "WaaSPublicCompanyJob_created_at_desc_production";
// Relevance-ranked index — used when a keyword query is present.
const INDEX_BY_RELEVANCE = "WaaSPublicCompanyJob_production";
// --- Value mapping tables ---
// Each table maps jobcrawl's user-facing SearchCriteria values (keys) to the
// short codes stored in the WaaS Algolia index (values). Values missing from
// a table pass through unchanged (see mapValues).
const ROLE_MAP = {
    engineering: "eng",
    design: "design",
    product: "product",
    science: "science",
    sales: "sales",
    marketing: "marketing",
    support: "support",
    operations: "operations",
    "recruiting-hr": "recruiting",
    finance: "finance",
    legal: "legal",
};
// Specializations within the "eng" role (Algolia field: eng_type).
const ENG_TYPE_MAP = {
    android: "android",
    backend: "be",
    "data-science": "data_sci",
    devops: "devops",
    "embedded-systems": "embedded",
    "engineering-manager": "eng_mgmt",
    frontend: "fe",
    "full-stack": "fs",
    ios: "ios",
    "machine-learning": "ml",
    "qa-engineer": "qa",
    robotics: "robotics",
    hardware: "hw",
    electrical: "electrical",
    mechanical: "mechanical",
    bioengineering: "bio",
    "chemical-engineering": "chemical",
};
// Specializations within the "design" role (Algolia field: design_type).
const DESIGN_TYPE_MAP = {
    "web-design": "web",
    "mobile-design": "mobile",
    "product-design": "product",
    "ui-ux": "ui_ux",
    "user-research": "user_research",
    "brand-graphic-design": "brand_graphic",
    illustration: "illustration",
    animation: "animation",
    hardware: "hardware",
    "3d-ar-vr": "ar_vr",
    "design-manager": "design_mgmt",
};
// Specializations within the "science" role (Algolia field: science_type).
const SCIENCE_TYPE_MAP = {
    biology: "bio",
    biotechnology: "biotech",
    chemistry: "chem",
    genetics: "genetics",
    healthcare: "health",
    immunology: "immuno",
    laboratory: "lab",
    oncology: "onc",
    pharmacology: "pharma",
    "process-engineer": "process",
    research: "research",
};
const JOB_TYPE_MAP = {
    fulltime: "fulltime",
    internship: "intern",
    contract: "contract",
};
const COMPANY_STAGE_MAP = {
    seed: "seed",
    "series-a": "series_a",
    growth: "growth",
    scale: "scale",
};
// WaaS encodes work mode in a single `remote` field:
// "only" = remote-only, "no" = onsite, "yes" = remote allowed (hybrid).
const WORK_MODE_MAP = {
    remote: "only",
    onsite: "no",
    hybrid: "yes",
};
// The Algolia field holding role specializations depends on the role itself.
const ROLE_TYPE_FIELD_MAP = {
    eng: "eng_type",
    design: "design_type",
    science: "science_type",
};
// Which specialization table applies for each role.
const ROLE_TYPE_VALUE_MAP = {
    eng: ENG_TYPE_MAP,
    design: DESIGN_TYPE_MAP,
    science: SCIENCE_TYPE_MAP,
};
// Company-size buckets map to complete Algolia numeric-range clauses, not
// plain facet values — they are joined verbatim in buildFilters.
const COMPANY_SIZE_MAP = {
    "1-10": "company_team_size <= 10",
    "11-50": "company_team_size: 11 TO 50",
    "51-300": "company_team_size: 51 TO 300",
    "301+": "company_team_size >= 301",
};
100
+ // --- Filter helpers ---
101
/**
 * Translate user-facing criteria values into Algolia codes via a lookup
 * table. Values with no table entry pass through untouched.
 */
function mapValues(values, map) {
    const translated = [];
    for (const value of values) {
        translated.push(map[value] ?? value);
    }
    return translated;
}
104
/**
 * Build a parenthesized Algolia OR clause, e.g. `(role:eng OR role:design)`.
 * When `quoted` is true each value is wrapped in double quotes.
 */
function orFilter(field, values, quoted = false) {
    const clauses = [];
    for (const value of values) {
        clauses.push(quoted ? `${field}:"${value}"` : `${field}:${value}`);
    }
    return "(" + clauses.join(" OR ") + ")";
}
108
/**
 * Build the Algolia `filters` expression from SearchCriteria, falling back
 * to the source URL's query parameters (legacy WaaS links) when a criteria
 * field is absent. Each criterion contributes one parenthesized clause; the
 * clauses are AND-ed together.
 */
function buildFilters(criteria, sourceUrl) {
    const params = new URL(sourceUrl).searchParams;
    const filters = [];
    // role — criteria wins; otherwise a single `role` URL param.
    const roles = criteria.role
        ? mapValues(criteria.role, ROLE_MAP)
        : fallbackArray(params, "role");
    if (roles.length > 0)
        filters.push(orFilter("role", roles));
    // roleType → the Algolia field depends on the role (eng_type/design_type/
    // science_type). Roles without a specialization field are skipped.
    // NOTE(review): criteria.roleType without criteria.role is silently
    // ignored here (only the URL `role_type` param is consulted) — confirm
    // this is intended.
    if (criteria.roleType && criteria.role) {
        const algoliaRoles = mapValues(criteria.role, ROLE_MAP);
        for (const algoliaRole of algoliaRoles) {
            const field = ROLE_TYPE_FIELD_MAP[algoliaRole];
            const valueMap = ROLE_TYPE_VALUE_MAP[algoliaRole];
            if (field && valueMap) {
                const mapped = mapValues(criteria.roleType, valueMap);
                filters.push(orFilter(field, mapped));
            }
        }
    }
    else {
        // Legacy URL param only targets eng_type.
        const roleTypeParam = params.get("role_type");
        if (roleTypeParam && roleTypeParam !== "any")
            filters.push(`(eng_type:${roleTypeParam})`);
    }
    // workMode → WaaS `remote` facet ("only"/"no"/"yes"; see WORK_MODE_MAP).
    const remoteModes = criteria.workMode
        ? mapValues(criteria.workMode, WORK_MODE_MAP)
        : fallbackArray(params, "remote");
    if (remoteModes.length > 0)
        filters.push(orFilter("remote", remoteModes));
    // location — exact (quoted) match on locations_for_search.
    const location = criteria.location ?? params.get("locations");
    if (location && location !== "any")
        filters.push(`(locations_for_search:"${location}")`);
    // jobType — values are quoted in the clause.
    const jobTypes = criteria.jobType
        ? mapValues(criteria.jobType, JOB_TYPE_MAP)
        : fallbackArray(params, "jobType");
    if (jobTypes.length > 0)
        filters.push(orFilter("job_type", jobTypes, true));
    // minExperience — numbers stringified for the facet clause.
    const minExps = criteria.minExperience
        ? criteria.minExperience.map(String)
        : fallbackArray(params, "minExperience");
    if (minExps.length > 0)
        filters.push(orFilter("min_experience", minExps));
    // companyStage
    const stages = criteria.companyStage
        ? mapValues(criteria.companyStage, COMPANY_STAGE_MAP)
        : fallbackArray(params, "companyStage");
    if (stages.length > 0)
        filters.push(orFilter("company_waas_stage", stages));
    // industry — quoted; values are passed through as-is (no mapping table).
    const industries = criteria.industry ?? fallbackQuotedArray(params, "industry");
    if (industries.length > 0)
        filters.push(orFilter("company_parent_sector", industries, true));
    // companySize → each bucket is a complete numeric-range clause
    // (see COMPANY_SIZE_MAP); unknown buckets are dropped by filter(Boolean).
    if (criteria.companySize && criteria.companySize.length > 0) {
        const rangeClauses = criteria.companySize
            .map((s) => COMPANY_SIZE_MAP[s])
            .filter(Boolean);
        if (rangeClauses.length > 0)
            filters.push(`(${rangeClauses.join(" OR ")})`);
    }
    // Boolean flags — only added when requested (no URL fallback).
    // hasSalary
    if (criteria.hasSalary)
        filters.push("(has_salary:true)");
    // hasEquity
    if (criteria.hasEquity)
        filters.push("(has_equity:true)");
    // hasInterviewProcess
    if (criteria.hasInterviewProcess)
        filters.push("(has_interview_process:true)");
    // visaSponsorship — "none" (no visa needed) or "possible" both qualify.
    if (criteria.visaSponsorship)
        filters.push("(us_visa_required:none OR us_visa_required:possible)");
    return filters.join(" AND ");
}
188
/**
 * Read a single legacy WaaS query parameter and return it as a one-element
 * array, or an empty array when the parameter is absent, empty, or set to
 * the "any" wildcard.
 */
function fallbackArray(params, key) {
    const raw = params.get(key);
    const isWildcard = raw === "any";
    return raw && !isWildcard ? [raw] : [];
}
194
/**
 * Legacy query-param fallback for values that will be quoted in the filter
 * string. Parsing is identical to fallbackArray (the quoting itself happens
 * later, in orFilter), so delegate instead of duplicating the logic.
 */
function fallbackQuotedArray(params, key) {
    return fallbackArray(params, key);
}
200
+ // --- Algolia API ---
201
/**
 * Execute one page of an Algolia multi-query request against the WaaS
 * application and return the parsed JSON response.
 *
 * The request shape deliberately mirrors the Algolia browser client used by
 * workatastartup.com: credentials travel as query-string parameters, and the
 * body is JSON even though the content-type says x-www-form-urlencoded —
 * NOTE(review): this matches the observed Algolia JS client wire format;
 * confirm before "fixing" the content-type.
 *
 * @throws Error when the HTTP response is not 2xx.
 */
async function fetchPage(appId, apiKey, query, filters, indexName, page) {
    // Per-index search parameters, serialized into the `params` string that
    // Algolia expects inside each request object.
    const algoliaParams = new URLSearchParams({
        query,
        page: String(page),
        filters,
        attributesToRetrieve: JSON.stringify(["*"]),
        attributesToHighlight: JSON.stringify([]),
        attributesToSnippet: JSON.stringify([]),
        hitsPerPage: String(HITS_PER_PAGE),
        clickAnalytics: "true",
        distinct: "true",
    });
    // DSN endpoint with app id / API key passed as query params, as the
    // official JS client does.
    const url = `https://${appId}-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=${encodeURIComponent("Algolia for JavaScript (3.35.1); Browser")}&x-algolia-application-id=${appId}&x-algolia-api-key=${apiKey}`;
    const response = await fetch(url, {
        method: "POST",
        headers: {
            accept: "application/json",
            "content-type": "application/x-www-form-urlencoded",
            // Presented as the WaaS site itself; the search key is public.
            Origin: "https://www.workatastartup.com",
            Referer: "https://www.workatastartup.com/",
        },
        body: JSON.stringify({
            requests: [
                {
                    indexName,
                    params: algoliaParams.toString(),
                },
            ],
        }),
    });
    if (!response.ok) {
        throw new Error(`Algolia API returned ${response.status}`);
    }
    return (await response.json());
}
236
+ // --- Job mapping ---
237
/**
 * Convert one raw Algolia hit into the shared Job shape.
 * The id is a short, stable hash of objectID + title so repeated crawls
 * produce the same identifier for the same posting.
 */
function mapHit(hit, sourceUrl) {
    // Stable 12-hex-char identifier derived from the hit itself.
    const hasher = createHash("sha256");
    hasher.update(`${hit.objectID}:${hit.title}:yc`);
    const id = hasher.digest("hex").slice(0, 12);
    // Prefer the path the index provides; otherwise construct the canonical
    // WaaS job URL from the object id.
    // NOTE(review): search_path may be site-relative — confirm it is absolute.
    let jobUrl = hit.search_path;
    if (jobUrl == null) {
        jobUrl = `https://www.workatastartup.com/jobs/${hit.objectID}`;
    }
    return {
        id,
        title: hit.title,
        company: hit.company_name,
        location: hit.locations_for_search?.[0] ?? null,
        workMode: inferWorkMode(hit.remote),
        department: hit.role ?? null,
        url: jobUrl,
        sourceUrl,
        provider: "yc",
        // Truncated preview only; the full payload is kept in `raw`.
        description: hit.description?.slice(0, 200) ?? null,
        postedAt: hit.created_at ?? null,
        extractedAt: new Date().toISOString(),
        raw: hit,
    };
}
260
/**
 * Translate the WaaS Algolia `remote` facet back into a jobcrawl work mode.
 *
 * This is the inverse of WORK_MODE_MAP above ("only" = remote, "yes" =
 * hybrid/remote-allowed, "no" = onsite). The previous implementation
 * contradicted that table: it returned "remote" for "yes" and null for
 * "no". Unknown or missing values yield null.
 */
function inferWorkMode(remote) {
    if (!remote)
        return null;
    switch (remote.toLowerCase()) {
        case "only":
            return "remote";
        case "yes":
            return "hybrid";
        case "no":
            return "onsite";
        default:
            return null;
    }
}
270
+ // --- Entry point ---
271
/**
 * Fetch jobs from YC's Work at a Startup via Algolia.
 * Maps SearchCriteria to Algolia filters. Falls back to URL query params
 * for backward compatibility with direct WaaS URLs.
 *
 * @param sourceUrl WaaS listing URL; its query string supplies legacy filters.
 * @param criteria  Search criteria; `keywords` may be empty or absent.
 * @param saveRaw   When true, persist the raw Algolia hits via saveRawResponse.
 * @returns Jobs deduplicated by objectID across all keyword queries and pages.
 * @throws Error when no Algolia credentials can be found.
 */
export async function extractYcJobs(sourceUrl, criteria, saveRaw = false) {
    // Credentials: environment variables win; otherwise fall back to the
    // init-generated credentials file.
    let appId = process.env.YC_ALGOLIA_APP_ID;
    let apiKey = process.env.YC_ALGOLIA_API_KEY;
    if (!appId || !apiKey) {
        const credPath = join(homedir(), ".jobcrawl", "credentials.json");
        if (existsSync(credPath)) {
            // NOTE(review): JSON.parse result is unvalidated — a malformed
            // credentials file throws here; shape assumed to be { yc: {...} }.
            const creds = JSON.parse(readFileSync(credPath, "utf-8"));
            appId = appId || creds.yc?.algoliaAppId;
            apiKey = apiKey || creds.yc?.algoliaApiKey;
        }
    }
    if (!appId || !apiKey) {
        throw new Error("YC provider requires credentials. Run `jobcrawl init` and add your Algolia keys to ~/.jobcrawl/credentials.json");
    }
    // Robustness fix: tolerate a missing keywords array (previously threw a
    // TypeError on `criteria.keywords.length`). An empty query string means
    // "browse everything", served from the date-sorted index.
    const keywords = criteria.keywords ?? [];
    const queries = keywords.length > 0 ? keywords : [""];
    const filters = buildFilters(criteria, sourceUrl);
    // One Algolia query per keyword; dedupe by objectID across queries/pages.
    const jobMap = new Map();
    const allHits = [];
    const collect = (hits) => {
        for (const hit of hits) {
            if (!jobMap.has(hit.objectID)) {
                jobMap.set(hit.objectID, mapHit(hit, sourceUrl));
                allHits.push(hit);
            }
        }
    };
    for (const query of queries) {
        const indexName = query ? INDEX_BY_RELEVANCE : INDEX_BY_DATE;
        // Pages are fetched sequentially to stay gentle on the API; the first
        // response tells us how many pages exist.
        const first = await fetchPage(appId, apiKey, query, filters, indexName, 0);
        const result = first.results[0];
        collect(result.hits);
        for (let page = 1; page < result.nbPages; page++) {
            const resp = await fetchPage(appId, apiKey, query, filters, indexName, page);
            collect(resp.results[0].hits);
        }
    }
    if (saveRaw)
        await saveRawResponse("yc", "workatastartup", allHits);
    return [...jobMap.values()];
}
@@ -0,0 +1,30 @@
1
import type { Job } from "../types/index.js";
/** Error raised when the `agent-browser` tool cannot be used (e.g. not installed). */
export declare class BrowserNotAvailableError extends Error {
    constructor();
}
/**
 * Wrapper around an agent-browser session: opens pages, reads the rendered
 * DOM, and performs click/navigation actions on JS-rendered career pages.
 */
export declare class BrowserSession {
    private name;
    private opened;
    private networkTimeout;
    constructor(opts?: {
        networkTimeout?: number;
    });
    /** Run a local DOM command (no timeout). */
    private run;
    /** Run a network-dependent command (open, wait) with a timeout. */
    private runNetwork;
    /** Navigate the session to `url`. */
    open(url: string): Promise<void>;
    /** Return the current page's rendered HTML. */
    getHtml(): Promise<string>;
    /** Return the session's current URL (after any navigation). */
    getUrl(): Promise<string>;
    /** Click the element matching the given text. */
    clickByText(text: string): Promise<void>;
    /**
     * Find an element by its exact text content, walk up `level` ancestors,
     * and click that parent. Uses eval with base64 to avoid shell quoting.
     */
    clickNthParent(text: string, level: number): Promise<void>;
    /** Navigate back in the session's history. */
    back(): Promise<void>;
    /** Close the session and release the browser. */
    close(): Promise<void>;
}
/**
 * Resolve each job's URL by interacting with the listing page in `session`.
 * NOTE(review): the click/bubble strategy (and the meaning of
 * `maxBubbleLevels`) is implemented in browser.js — see that file.
 */
export declare function resolveJobUrls(jobs: Job[], listingUrl: string, session: BrowserSession, opts?: {
    maxBubbleLevels?: number;
}): Promise<Job[]>;