jobcrawl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc.json +10 -0
- package/CHANGELOG.md +40 -0
- package/README.md +232 -0
- package/dist/core/aggregators/yc.d.ts +7 -0
- package/dist/core/aggregators/yc.js +320 -0
- package/dist/core/browser.d.ts +30 -0
- package/dist/core/browser.js +196 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +41 -0
- package/dist/core/detect-provider.d.ts +7 -0
- package/dist/core/detect-provider.js +125 -0
- package/dist/core/discover-careers.d.ts +18 -0
- package/dist/core/discover-careers.js +92 -0
- package/dist/core/extract-jobs.d.ts +14 -0
- package/dist/core/extract-jobs.js +36 -0
- package/dist/core/fetch-page.d.ts +11 -0
- package/dist/core/fetch-page.js +39 -0
- package/dist/core/format-output.d.ts +2 -0
- package/dist/core/format-output.js +59 -0
- package/dist/core/match-jobs.d.ts +6 -0
- package/dist/core/match-jobs.js +43 -0
- package/dist/core/providers/ashby.d.ts +6 -0
- package/dist/core/providers/ashby.js +58 -0
- package/dist/core/providers/generic.d.ts +6 -0
- package/dist/core/providers/generic.js +294 -0
- package/dist/core/providers/greenhouse.d.ts +6 -0
- package/dist/core/providers/greenhouse.js +47 -0
- package/dist/core/providers/lever.d.ts +7 -0
- package/dist/core/providers/lever.js +60 -0
- package/dist/core/providers/yc.d.ts +7 -0
- package/dist/core/providers/yc.js +320 -0
- package/dist/core/resolve-iframe.d.ts +6 -0
- package/dist/core/resolve-iframe.js +51 -0
- package/dist/core/save-raw.d.ts +4 -0
- package/dist/core/save-raw.js +13 -0
- package/dist/data/companies.d.ts +9 -0
- package/dist/data/companies.js +2849 -0
- package/dist/entrypoints/cli/app.d.ts +3 -0
- package/dist/entrypoints/cli/app.js +91 -0
- package/dist/entrypoints/cli/components/crawl-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/crawl-view.js +94 -0
- package/dist/entrypoints/cli/components/discover-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/discover-view.js +67 -0
- package/dist/entrypoints/cli/crawl-aggregators.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-aggregators.js +76 -0
- package/dist/entrypoints/cli/crawl-url.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-url.js +54 -0
- package/dist/entrypoints/cli/crawl.d.ts +32 -0
- package/dist/entrypoints/cli/crawl.js +108 -0
- package/dist/entrypoints/cli/discover.d.ts +10 -0
- package/dist/entrypoints/cli/discover.js +69 -0
- package/dist/entrypoints/cli/index.d.ts +2 -0
- package/dist/entrypoints/cli/index.js +197 -0
- package/dist/entrypoints/cli/init.d.ts +9 -0
- package/dist/entrypoints/cli/init.js +94 -0
- package/dist/entrypoints/cli/plain.d.ts +6 -0
- package/dist/entrypoints/cli/plain.js +77 -0
- package/dist/events.d.ts +114 -0
- package/dist/events.js +17 -0
- package/dist/orchestrators/crawl-all.d.ts +2 -0
- package/dist/orchestrators/crawl-all.js +66 -0
- package/dist/orchestrators/discover-all.d.ts +10 -0
- package/dist/orchestrators/discover-all.js +39 -0
- package/dist/threads/pool.d.ts +5 -0
- package/dist/threads/pool.js +23 -0
- package/dist/threads/process-url.d.ts +9 -0
- package/dist/threads/process-url.js +229 -0
- package/dist/types/index.d.ts +83 -0
- package/dist/types/index.js +6 -0
- package/dist/utils/config.d.ts +17 -0
- package/dist/utils/config.js +57 -0
- package/dist/utils/google-search.d.ts +19 -0
- package/dist/utils/google-search.js +139 -0
- package/dist/utils/llm.d.ts +8 -0
- package/dist/utils/llm.js +25 -0
- package/package.json +42 -0
- package/src/core/aggregators/yc.ts +415 -0
- package/src/core/browser.ts +239 -0
- package/src/core/detect-provider.ts +162 -0
- package/src/core/discover-careers.ts +117 -0
- package/src/core/extract-jobs.ts +50 -0
- package/src/core/fetch-page.ts +41 -0
- package/src/core/format-output.ts +80 -0
- package/src/core/match-jobs.ts +56 -0
- package/src/core/providers/ashby.ts +84 -0
- package/src/core/providers/generic.ts +332 -0
- package/src/core/providers/greenhouse.ts +74 -0
- package/src/core/providers/lever.ts +90 -0
- package/src/core/resolve-iframe.ts +59 -0
- package/src/core/save-raw.ts +18 -0
- package/src/data/companies.ts +2859 -0
- package/src/entrypoints/cli/app.tsx +173 -0
- package/src/entrypoints/cli/components/crawl-view.tsx +163 -0
- package/src/entrypoints/cli/components/discover-view.tsx +138 -0
- package/src/entrypoints/cli/crawl-aggregators.ts +112 -0
- package/src/entrypoints/cli/crawl-url.ts +87 -0
- package/src/entrypoints/cli/crawl.ts +163 -0
- package/src/entrypoints/cli/discover.ts +96 -0
- package/src/entrypoints/cli/index.ts +252 -0
- package/src/entrypoints/cli/init.ts +117 -0
- package/src/entrypoints/cli/plain.ts +104 -0
- package/src/events.ts +79 -0
- package/src/orchestrators/crawl-all.ts +96 -0
- package/src/orchestrators/discover-all.ts +61 -0
- package/src/threads/pool.ts +29 -0
- package/src/threads/process-url.ts +312 -0
- package/src/types/index.ts +110 -0
- package/src/utils/config.ts +79 -0
- package/src/utils/google-search.ts +155 -0
- package/src/utils/llm.ts +33 -0
- package/test/integration/process-url.test.ts +301 -0
- package/test/integration/providers/ashby.test.ts +163 -0
- package/test/integration/providers/greenhouse.test.ts +191 -0
- package/test/integration/providers/lever.test.ts +188 -0
- package/test/unit/config.test.ts +64 -0
- package/test/unit/detect-provider.test.ts +165 -0
- package/test/unit/events.test.ts +104 -0
- package/test/unit/format-output.test.ts +165 -0
- package/test/unit/match-jobs.test.ts +257 -0
- package/test/unit/pool.test.ts +74 -0
- package/test/unit/providers/generic.test.ts +139 -0
- package/test/unit/resolve-iframe.test.ts +100 -0
- package/tsconfig.json +19 -0
- package/vitest.config.ts +7 -0
package/.prettierrc.json
ADDED
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
## [0.1.0] - 2026-04-02
|
|
6
|
+
|
|
7
|
+
Initial release of jobcrawl — a CLI tool that discovers and crawls company career pages to find matching job listings.
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
|
|
11
|
+
- **Project scaffold** — TypeScript project with Prettier, ESM modules, and core dependencies.
|
|
12
|
+
- **Type system** — Shared type definitions (`Company`, `Job`, `CrawlResult`, `AggregatorResult`) and a typed event bus for inter-module communication.
|
|
13
|
+
- **ATS provider extractors** — Built-in support for Greenhouse, Lever, and Ashby career pages with structured job extraction.
|
|
14
|
+
- **Generic provider** — Fallback extractor using LLM-powered container-based extraction strategy for non-standard career pages.
|
|
15
|
+
- **Provider detection** — Automatic ATS provider identification from page HTML and URL patterns.
|
|
16
|
+
- **iframe resolution** — Detects and resolves iframe-embedded career pages to their source URLs.
|
|
17
|
+
- **Career page discovery** — Finds career pages via Google search and heuristic URL guessing (e.g. `/careers`, `/jobs`).
|
|
18
|
+
- **Job matching** — LLM-based relevance scoring to filter jobs against user-defined search criteria.
|
|
19
|
+
- **Crawl orchestration** — Bounded concurrency pool for parallel crawling with progress tracking.
|
|
20
|
+
- **YC aggregator** — Scrapes Y Combinator's Work at a Startup with extended search filters (role, location, industry) and raw data capture.
|
|
21
|
+
- **Aggregator architecture** — Separated aggregator type from direct providers, with dedicated CLI commands and crawl orchestration.
|
|
22
|
+
- **Browser rendering** — Integrated `agent-browser` for JavaScript-rendered pages that can't be crawled with static HTTP fetches.
|
|
23
|
+
- **Companies dataset** — Built-in dataset of ~2,800 companies for `init` config generation.
|
|
24
|
+
- **CLI with Ink UI** — Interactive terminal interface with step-based thread progress, plus plain-text output mode.
|
|
25
|
+
- **CLI commands** — `init`, `crawl`, `crawl-url`, `crawl-aggregators`, and `discover` commands.
|
|
26
|
+
- **Config system** — YAML-based configuration for companies, search criteria, and crawl settings.
|
|
27
|
+
- **Output formatting** — Structured JSON and human-readable output for crawl results.
|
|
28
|
+
- **Raw data capture** — Optional saving of raw HTML/JSON responses for debugging.
|
|
29
|
+
- **Test suite** — Unit tests (detect-provider, match-jobs, resolve-iframe, config, events, format-output, pool, generic provider) and integration tests (process-url, Ashby, Greenhouse, Lever providers).
|
|
30
|
+
|
|
31
|
+
### Fixed
|
|
32
|
+
|
|
33
|
+
- Job deduplication now uses URL+title composite key instead of URL alone, preventing false duplicates across pages.
|
|
34
|
+
- File permissions restricted on credentials and config directory during `init`.
|
|
35
|
+
|
|
36
|
+
### Changed
|
|
37
|
+
|
|
38
|
+
- Removed `dotenv` dependency — environment variables are expected to be set externally.
|
|
39
|
+
- Rewrote CrawlView component with step-based thread progress for clearer status reporting.
|
|
40
|
+
- Rewrote README to cover aggregators, browser rendering, and the new config format.
|
package/README.md
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# jobcrawl
|
|
2
|
+
|
|
3
|
+
CLI tool that crawls career pages for jobs matching your search criteria. Supports Greenhouse, Ashby, and Lever via their public JSON APIs, Y Combinator's Work at a Startup via Algolia, and custom career pages via browser rendering + DOM heuristics.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install
|
|
9
|
+
npm run build
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Requires Node.js 20+. Optional setup for extra features:
|
|
13
|
+
|
|
14
|
+
- `ANTHROPIC_API_KEY` — for the `discover` command's LLM features
|
|
15
|
+
- `agent-browser` — for JS-rendered career pages (`npm i -g agent-browser`)
|
|
16
|
+
|
|
17
|
+
## Quick start
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
# Create config at ~/.jobcrawl/config.yaml + credentials.json
|
|
21
|
+
jobcrawl init
|
|
22
|
+
|
|
23
|
+
# Add your YC Algolia credentials to ~/.jobcrawl/credentials.json
|
|
24
|
+
# Then crawl
|
|
25
|
+
jobcrawl crawl --keywords "engineer" --output table
|
|
26
|
+
|
|
27
|
+
# Or search YC startups directly
|
|
28
|
+
jobcrawl crawl-aggregators yc --keywords "engineer" --role engineering
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Commands
|
|
32
|
+
|
|
33
|
+
### `init` — Create config files
|
|
34
|
+
|
|
35
|
+
Creates `~/.jobcrawl/config.yaml` (pre-populated with 200+ companies), `~/.jobcrawl/credentials.json` (for aggregator API keys), and `~/.jobcrawl/raw/` directory.
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
jobcrawl init # Create default config
|
|
39
|
+
jobcrawl init --force # Overwrite existing config
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### `crawl` — Crawl career pages
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# Uses ~/.jobcrawl/config.yaml by default
|
|
46
|
+
jobcrawl crawl --keywords "engineer" --output table
|
|
47
|
+
|
|
48
|
+
# From a specific config file
|
|
49
|
+
jobcrawl crawl --file config.yaml --keywords "engineer"
|
|
50
|
+
|
|
51
|
+
# From inline URLs
|
|
52
|
+
jobcrawl crawl --urls https://boards.greenhouse.io/anthropic https://jobs.ashbyhq.com/openai \
|
|
53
|
+
--keywords "senior" "engineer" --remote
|
|
54
|
+
|
|
55
|
+
# Include YC aggregator alongside company targets
|
|
56
|
+
jobcrawl crawl --aggregators yc --keywords "engineer" --company-stage seed
|
|
57
|
+
|
|
58
|
+
# Pipe to jq
|
|
59
|
+
jobcrawl crawl --keywords "staff" | jq '.[].title'
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### `crawl-url` — Crawl a single URL
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
jobcrawl crawl-url https://boards.greenhouse.io/anthropic --keywords "engineer" --output table
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### `crawl-aggregators` — Search aggregator sources
|
|
69
|
+
|
|
70
|
+
Searches across multiple companies in one query. Currently supports YC (Work at a Startup).
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# Search YC startups
|
|
74
|
+
jobcrawl crawl-aggregators yc --keywords "engineer" --output table
|
|
75
|
+
|
|
76
|
+
# With extended filters
|
|
77
|
+
jobcrawl crawl-aggregators yc --keywords "engineer" --role engineering --role-type backend \
|
|
78
|
+
--company-stage seed --has-salary --visa-sponsorship
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Requires Algolia credentials in `~/.jobcrawl/credentials.json` or via env vars `YC_ALGOLIA_APP_ID` and `YC_ALGOLIA_API_KEY`.
|
|
82
|
+
|
|
83
|
+
### `discover` — Find career pages from company names
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Output URLs (one per line)
|
|
87
|
+
jobcrawl discover --companies "Anthropic" "OpenAI" "Mistral AI"
|
|
88
|
+
|
|
89
|
+
# Save as YAML config
|
|
90
|
+
jobcrawl discover --companies "Anthropic" "OpenAI" --output yaml -o ~/.jobcrawl/config.yaml
|
|
91
|
+
|
|
92
|
+
# Pipe discover into crawl
|
|
93
|
+
jobcrawl discover --companies "Anthropic" | jobcrawl crawl --keywords "engineer"
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### `detect` — Identify ATS provider
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
jobcrawl detect https://boards.greenhouse.io/anthropic
|
|
100
|
+
# → { "provider": "greenhouse", "boardToken": "anthropic" }
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### `match` — Filter a saved jobs file
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
jobcrawl crawl-url https://jobs.ashbyhq.com/openai -o jobs.json
|
|
107
|
+
jobcrawl match jobs.json --keywords "senior" --remote --output table
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Search criteria flags
|
|
111
|
+
|
|
112
|
+
| Flag | Description |
|
|
113
|
+
|------|-------------|
|
|
114
|
+
| `--keywords <terms...>` | Job title keywords (any match) |
|
|
115
|
+
| `--exclude <terms...>` | Exclude jobs matching these keywords |
|
|
116
|
+
| `--location <location>` | Location substring filter |
|
|
117
|
+
| `--remote` | Only remote jobs |
|
|
118
|
+
| `--onsite` | Only onsite jobs |
|
|
119
|
+
| `--hybrid` | Only hybrid jobs |
|
|
120
|
+
| `--department <depts...>` | Department filter |
|
|
121
|
+
| `--role <roles...>` | Role category: engineering, design, product, science, sales, marketing, etc. |
|
|
122
|
+
| `--role-type <types...>` | Specialization: backend, frontend, full-stack, ML, DevOps, etc. |
|
|
123
|
+
| `--job-type <types...>` | fulltime, internship, contract |
|
|
124
|
+
| `--company-stage <stages...>` | seed, series-a, growth, scale |
|
|
125
|
+
| `--industry <industries...>` | Industry filter |
|
|
126
|
+
| `--company-size <sizes...>` | 1-10, 11-50, 51-300, 301+ |
|
|
127
|
+
| `--has-salary` | Only jobs with salary listed |
|
|
128
|
+
| `--has-equity` | Only jobs with equity |
|
|
129
|
+
| `--has-interview-process` | Only jobs with interview process listed |
|
|
130
|
+
| `--visa-sponsorship` | Jobs offering visa sponsorship |
|
|
131
|
+
|
|
132
|
+
## Output formats
|
|
133
|
+
|
|
134
|
+
`--output json` (default), `table`, `markdown`, `csv`
|
|
135
|
+
|
|
136
|
+
Write to file with `-o <file>`. Save raw API responses with `--save-raw` (stored in `~/.jobcrawl/raw/`).
|
|
137
|
+
|
|
138
|
+
## Config file
|
|
139
|
+
|
|
140
|
+
Located at `~/.jobcrawl/config.yaml`. Created by `jobcrawl init`.
|
|
141
|
+
|
|
142
|
+
```yaml
|
|
143
|
+
# Aggregators (cross-company search engines)
|
|
144
|
+
aggregators:
|
|
145
|
+
- type: yc
|
|
146
|
+
enabled: true
|
|
147
|
+
|
|
148
|
+
# Companies — slug-based (recommended) or URL-based
|
|
149
|
+
companies:
|
|
150
|
+
# --- AI Foundation Models ---
|
|
151
|
+
- company: Anthropic
|
|
152
|
+
slug: anthropic
|
|
153
|
+
provider: greenhouse
|
|
154
|
+
- company: OpenAI
|
|
155
|
+
slug: openai
|
|
156
|
+
provider: ashby
|
|
157
|
+
|
|
158
|
+
# URL-based (also works)
|
|
159
|
+
- url: https://mistral.ai/careers/
|
|
160
|
+
company: Mistral AI
|
|
161
|
+
|
|
162
|
+
defaults:
|
|
163
|
+
concurrency: 5
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
**Slug-based targets** (recommended): jobcrawl auto-probes Greenhouse, Ashby, and Lever APIs using the slug. Add an optional `provider` hint to skip probing, or a `fallback` URL for when no ATS API matches.
|
|
167
|
+
|
|
168
|
+
**URL-based targets**: directly probe the URL, falling back to browser rendering for JS-rendered pages.
|
|
169
|
+
|
|
170
|
+
Target resolution order:
|
|
171
|
+
1. `--urls` flag (inline URLs)
|
|
172
|
+
2. `--file` flag (explicit config file)
|
|
173
|
+
3. `~/.jobcrawl/config.yaml` (default config)
|
|
174
|
+
4. `stdin` (piped input)
|
|
175
|
+
|
|
176
|
+
## Credentials
|
|
177
|
+
|
|
178
|
+
`~/.jobcrawl/credentials.json` stores API keys for aggregators:
|
|
179
|
+
|
|
180
|
+
```json
|
|
181
|
+
{
|
|
182
|
+
"yc": {
|
|
183
|
+
"algoliaAppId": "YOUR_APP_ID",
|
|
184
|
+
"algoliaApiKey": "YOUR_API_KEY"
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Alternatively, set `YC_ALGOLIA_APP_ID` and `YC_ALGOLIA_API_KEY` environment variables.
|
|
190
|
+
|
|
191
|
+
## How it works
|
|
192
|
+
|
|
193
|
+
**Three-tier extraction:**
|
|
194
|
+
|
|
195
|
+
1. **Tier 1 (API-first)** — Quick HTTP probe detects ATS signals (Greenhouse, Ashby, Lever iframes/embeds), then calls their public JSON APIs directly. No browser needed. Sub-second.
|
|
196
|
+
|
|
197
|
+
2. **Tier 2 (Browser rendering)** — For custom career pages, renders with agent-browser (Chrome via CDP), then extracts jobs via DOM heuristics: link-based extraction for pages with job links, or container-based extraction for pages with structured job cards.
|
|
198
|
+
|
|
199
|
+
3. **Tier 3 (Aggregator)** — Searches cross-company job engines like YC's Work at a Startup (Algolia API). Runs in parallel with target crawling; results are deduplicated.
|
|
200
|
+
|
|
201
|
+
**Architecture:**
|
|
202
|
+
|
|
203
|
+
```
|
|
204
|
+
~/.jobcrawl/
|
|
205
|
+
├── config.yaml # Targets + aggregators + defaults
|
|
206
|
+
├── credentials.json # API keys for aggregators
|
|
207
|
+
└── raw/ # Saved raw API responses
|
|
208
|
+
|
|
209
|
+
src/
|
|
210
|
+
├── events.ts # Typed EventBus (pub/sub)
|
|
211
|
+
├── types/ # Shared type definitions
|
|
212
|
+
├── data/ # Curated companies dataset
|
|
213
|
+
├── core/ # Pure primitives (fetch, detect, extract, match)
|
|
214
|
+
│ ├── providers/ # Greenhouse, Ashby, Lever, generic extractors
|
|
215
|
+
│ ├── aggregators/ # YC (Algolia)
|
|
216
|
+
│ └── browser.ts # agent-browser wrapper (render, click-capture)
|
|
217
|
+
├── threads/ # Per-URL pipeline + bounded concurrency pool
|
|
218
|
+
├── orchestrators/ # Multi-URL + aggregator coordination
|
|
219
|
+
├── utils/ # Config loading, LLM, web search
|
|
220
|
+
└── entrypoints/cli/ # Commander.js + Ink terminal UI
|
|
221
|
+
└── components/ # CrawlView, DiscoverView (React/Ink)
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## Supported providers
|
|
225
|
+
|
|
226
|
+
| Provider | Detection | Extraction |
|
|
227
|
+
|----------|-----------|------------|
|
|
228
|
+
| Greenhouse | URL, iframe, `#grnhse_app`, embed script | Public JSON API |
|
|
229
|
+
| Ashby | URL, iframe, embed markers | Public JSON API |
|
|
230
|
+
| Lever | URL, iframe, `data-lever-*` | Public JSON API |
|
|
231
|
+
| YC (Work at a Startup) | Aggregator config | Algolia search API |
|
|
232
|
+
| Custom | Fallback | Browser render + DOM heuristics |
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { Job, SearchCriteria } from "../../types/index.js";
|
|
2
|
+
/**
 * Fetch jobs from YC's Work at a Startup via Algolia.
 * Maps SearchCriteria to Algolia filters. Falls back to URL query params
 * for backward compatibility with direct WaaS URLs.
 *
 * @param sourceUrl - Work at a Startup URL; its query params (role, remote,
 *   locations, …) are used as filter fallbacks when `criteria` omits a field.
 * @param criteria - Search criteria translated into Algolia facet filters;
 *   each keyword is issued as a separate query.
 * @param saveRaw - When true, the raw Algolia hits are persisted for debugging.
 * @returns Jobs deduplicated by Algolia objectID across all keyword queries.
 */
export declare function extractYcJobs(sourceUrl: string, criteria: SearchCriteria, saveRaw?: boolean): Promise<Job[]>;
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { readFileSync, existsSync } from "node:fs";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import { saveRawResponse } from "../save-raw.js";
|
|
6
|
+
// Maximum hits requested per Algolia result page.
const HITS_PER_PAGE = 100;
// Index sorted by creation date — used when there is no keyword query.
const INDEX_BY_DATE = "WaaSPublicCompanyJob_created_at_desc_production";
// Relevance-ranked index — used when a keyword query is present.
const INDEX_BY_RELEVANCE = "WaaSPublicCompanyJob_production";
// --- Value mapping tables ---
// Each table maps jobcrawl's user-facing SearchCriteria values (keys) to the
// facet values used by WaaS's Algolia index (values). Unmapped inputs are
// passed through unchanged by mapValues().
const ROLE_MAP = {
    engineering: "eng",
    design: "design",
    product: "product",
    science: "science",
    sales: "sales",
    marketing: "marketing",
    support: "support",
    operations: "operations",
    "recruiting-hr": "recruiting",
    finance: "finance",
    legal: "legal",
};
// Specializations for role "eng" (facet field eng_type).
const ENG_TYPE_MAP = {
    android: "android",
    backend: "be",
    "data-science": "data_sci",
    devops: "devops",
    "embedded-systems": "embedded",
    "engineering-manager": "eng_mgmt",
    frontend: "fe",
    "full-stack": "fs",
    ios: "ios",
    "machine-learning": "ml",
    "qa-engineer": "qa",
    robotics: "robotics",
    hardware: "hw",
    electrical: "electrical",
    mechanical: "mechanical",
    bioengineering: "bio",
    "chemical-engineering": "chemical",
};
// Specializations for role "design" (facet field design_type).
const DESIGN_TYPE_MAP = {
    "web-design": "web",
    "mobile-design": "mobile",
    "product-design": "product",
    "ui-ux": "ui_ux",
    "user-research": "user_research",
    "brand-graphic-design": "brand_graphic",
    illustration: "illustration",
    animation: "animation",
    hardware: "hardware",
    "3d-ar-vr": "ar_vr",
    "design-manager": "design_mgmt",
};
// Specializations for role "science" (facet field science_type).
const SCIENCE_TYPE_MAP = {
    biology: "bio",
    biotechnology: "biotech",
    chemistry: "chem",
    genetics: "genetics",
    healthcare: "health",
    immunology: "immuno",
    laboratory: "lab",
    oncology: "onc",
    pharmacology: "pharma",
    "process-engineer": "process",
    research: "research",
};
const JOB_TYPE_MAP = {
    fulltime: "fulltime",
    internship: "intern",
    contract: "contract",
};
const COMPANY_STAGE_MAP = {
    seed: "seed",
    "series-a": "series_a",
    growth: "growth",
    scale: "scale",
};
// workMode → WaaS "remote" facet: "only" = remote-only, "no" = onsite,
// "yes" = open to remote (hybrid).
const WORK_MODE_MAP = {
    remote: "only",
    onsite: "no",
    hybrid: "yes",
};
// Which facet field holds the specialization for a given (Algolia) role value.
const ROLE_TYPE_FIELD_MAP = {
    eng: "eng_type",
    design: "design_type",
    science: "science_type",
};
// Which specialization table applies for a given (Algolia) role value.
const ROLE_TYPE_VALUE_MAP = {
    eng: ENG_TYPE_MAP,
    design: DESIGN_TYPE_MAP,
    science: SCIENCE_TYPE_MAP,
};
// Company-size buckets expressed as Algolia numeric range filter clauses.
const COMPANY_SIZE_MAP = {
    "1-10": "company_team_size <= 10",
    "11-50": "company_team_size: 11 TO 50",
    "51-300": "company_team_size: 51 TO 300",
    "301+": "company_team_size >= 301",
};
|
|
100
|
+
// --- Filter helpers ---
|
|
101
|
+
/**
 * Translate user-facing values to Algolia facet values via a lookup table.
 * Values without a table entry pass through unchanged.
 */
function mapValues(values, map) {
    const translated = [];
    for (const value of values) {
        translated.push(map[value] ?? value);
    }
    return translated;
}
|
|
104
|
+
/**
 * Build an Algolia OR filter clause for one field, e.g. `(role:eng OR role:design)`.
 * When `quoted` is set, each value is wrapped in double quotes (needed for
 * values containing spaces).
 */
function orFilter(field, values, quoted = false) {
    const render = quoted
        ? (value) => `${field}:"${value}"`
        : (value) => `${field}:${value}`;
    return `(${values.map(render).join(" OR ")})`;
}
|
|
108
|
+
/**
 * Translate SearchCriteria into a single Algolia `filters` expression.
 * For most fields, an explicit criteria value wins; otherwise the equivalent
 * query param from the WaaS `sourceUrl` is used as a fallback (backward
 * compatibility with pasted workatastartup.com URLs). Individual clauses are
 * OR-joined within a field and AND-joined across fields.
 */
function buildFilters(criteria, sourceUrl) {
    const params = new URL(sourceUrl).searchParams;
    const filters = [];
    // role — mapped through ROLE_MAP, falling back to the ?role= param.
    const roles = criteria.role
        ? mapValues(criteria.role, ROLE_MAP)
        : fallbackArray(params, "role");
    if (roles.length > 0)
        filters.push(orFilter("role", roles));
    // roleType — the facet field depends on the role (eng_type/design_type/
    // science_type), so it can only be applied when a role is also given.
    if (criteria.roleType && criteria.role) {
        const algoliaRoles = mapValues(criteria.role, ROLE_MAP);
        for (const algoliaRole of algoliaRoles) {
            const field = ROLE_TYPE_FIELD_MAP[algoliaRole];
            const valueMap = ROLE_TYPE_VALUE_MAP[algoliaRole];
            // Roles without specializations (e.g. sales) are skipped silently.
            if (field && valueMap) {
                const mapped = mapValues(criteria.roleType, valueMap);
                filters.push(orFilter(field, mapped));
            }
        }
    }
    else {
        // URL fallback: WaaS only uses ?role_type= for engineering roles.
        const roleTypeParam = params.get("role_type");
        if (roleTypeParam && roleTypeParam !== "any")
            filters.push(`(eng_type:${roleTypeParam})`);
    }
    // workMode → WaaS "remote" facet ("only"/"yes"/"no" via WORK_MODE_MAP).
    const remoteModes = criteria.workMode
        ? mapValues(criteria.workMode, WORK_MODE_MAP)
        : fallbackArray(params, "remote");
    if (remoteModes.length > 0)
        filters.push(orFilter("remote", remoteModes));
    // location — single substring value; "any" means unfiltered.
    const location = criteria.location ?? params.get("locations");
    if (location && location !== "any")
        filters.push(`(locations_for_search:"${location}")`);
    // jobType — quoted because the facet values are stored as strings.
    const jobTypes = criteria.jobType
        ? mapValues(criteria.jobType, JOB_TYPE_MAP)
        : fallbackArray(params, "jobType");
    if (jobTypes.length > 0)
        filters.push(orFilter("job_type", jobTypes, true));
    // minExperience — numeric values stringified for the filter expression.
    const minExps = criteria.minExperience
        ? criteria.minExperience.map(String)
        : fallbackArray(params, "minExperience");
    if (minExps.length > 0)
        filters.push(orFilter("min_experience", minExps));
    // companyStage (seed/series-a/growth/scale).
    const stages = criteria.companyStage
        ? mapValues(criteria.companyStage, COMPANY_STAGE_MAP)
        : fallbackArray(params, "companyStage");
    if (stages.length > 0)
        filters.push(orFilter("company_waas_stage", stages));
    // industry — passed through verbatim, quoted (values may contain spaces).
    const industries = criteria.industry ?? fallbackQuotedArray(params, "industry");
    if (industries.length > 0)
        filters.push(orFilter("company_parent_sector", industries, true));
    // companySize — each bucket is a pre-built numeric range clause; unknown
    // buckets are dropped via filter(Boolean). No URL fallback for this one.
    if (criteria.companySize && criteria.companySize.length > 0) {
        const rangeClauses = criteria.companySize
            .map((s) => COMPANY_SIZE_MAP[s])
            .filter(Boolean);
        if (rangeClauses.length > 0)
            filters.push(`(${rangeClauses.join(" OR ")})`);
    }
    // Boolean flags — only added when requested (absence means "don't care").
    if (criteria.hasSalary)
        filters.push("(has_salary:true)");
    if (criteria.hasEquity)
        filters.push("(has_equity:true)");
    if (criteria.hasInterviewProcess)
        filters.push("(has_interview_process:true)");
    // visaSponsorship — "none" (no visa needed) or "possible" both qualify.
    if (criteria.visaSponsorship)
        filters.push("(us_visa_required:none OR us_visa_required:possible)");
    return filters.join(" AND ");
}
|
|
188
|
+
/**
 * Read a single URL query param as a one-element array.
 * A missing/empty value or the sentinel "any" means "no filter" → [].
 */
function fallbackArray(params, key) {
    const raw = params.get(key);
    return raw && raw !== "any" ? [raw] : [];
}
|
|
194
|
+
/**
 * Same contract as fallbackArray: read one URL query param, treating a
 * missing/empty value or "any" as "no filter". Kept as a separate function so
 * call sites signal that the value will later be quoted inside the filter.
 */
function fallbackQuotedArray(params, key) {
    const raw = params.get(key);
    if (!raw || raw === "any") {
        return [];
    }
    return [raw];
}
|
|
200
|
+
// --- Algolia API ---
|
|
201
|
+
/**
 * POST one multi-queries request to Algolia's DSN endpoint and return the
 * parsed JSON response. Requests HITS_PER_PAGE hits for the given page of the
 * given index, spoofing the workatastartup.com origin headers.
 * Throws when Algolia responds with a non-2xx status.
 */
async function fetchPage(appId, apiKey, query, filters, indexName, page) {
    const requestParams = {
        query,
        page: String(page),
        filters,
        attributesToRetrieve: JSON.stringify(["*"]),
        attributesToHighlight: JSON.stringify([]),
        attributesToSnippet: JSON.stringify([]),
        hitsPerPage: String(HITS_PER_PAGE),
        clickAnalytics: "true",
        distinct: "true",
    };
    const agent = encodeURIComponent("Algolia for JavaScript (3.35.1); Browser");
    const endpoint = `https://${appId}-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=${agent}&x-algolia-application-id=${appId}&x-algolia-api-key=${apiKey}`;
    const payload = {
        requests: [
            {
                indexName,
                params: new URLSearchParams(requestParams).toString(),
            },
        ],
    };
    const response = await fetch(endpoint, {
        method: "POST",
        headers: {
            accept: "application/json",
            "content-type": "application/x-www-form-urlencoded",
            Origin: "https://www.workatastartup.com",
            Referer: "https://www.workatastartup.com/",
        },
        body: JSON.stringify(payload),
    });
    if (!response.ok) {
        throw new Error(`Algolia API returned ${response.status}`);
    }
    return (await response.json());
}
|
|
236
|
+
// --- Job mapping ---
|
|
237
|
+
/**
 * Convert one raw Algolia hit into the normalized job record.
 * The id is a stable 12-hex-char SHA-256 digest of objectID + title + "yc",
 * so repeated crawls produce the same id for the same posting.
 */
function mapHit(hit, sourceUrl) {
    const hasher = createHash("sha256");
    hasher.update(`${hit.objectID}:${hit.title}:yc`);
    const id = hasher.digest("hex").slice(0, 12);
    // Prefer the search_path Algolia supplies; otherwise build the canonical
    // workatastartup.com job URL from the object id.
    let jobUrl = hit.search_path;
    if (jobUrl == null) {
        jobUrl = `https://www.workatastartup.com/jobs/${hit.objectID}`;
    }
    return {
        id,
        title: hit.title,
        company: hit.company_name,
        location: hit.locations_for_search?.[0] ?? null,
        workMode: inferWorkMode(hit.remote),
        department: hit.role ?? null,
        url: jobUrl,
        sourceUrl,
        provider: "yc",
        // Truncated to 200 chars — full hit is preserved under `raw`.
        description: hit.description?.slice(0, 200) ?? null,
        postedAt: hit.created_at ?? null,
        extractedAt: new Date().toISOString(),
        raw: hit,
    };
}
|
|
260
|
+
/**
 * Translate a WaaS "remote" facet value back into a workMode.
 * Inverse of WORK_MODE_MAP (remote→"only", onsite→"no", hybrid→"yes").
 * Bug fix: "yes" was previously reported as "remote", but per WORK_MODE_MAP
 * "yes" is the facet value the hybrid filter selects, so it is now reported
 * as "hybrid".
 * Returns null for "no", missing, or unrecognized values.
 */
function inferWorkMode(remote) {
    if (!remote)
        return null;
    const r = remote.toLowerCase();
    if (r === "only")
        return "remote";
    if (r === "yes")
        return "hybrid";
    // "no" and anything unexpected: work mode unknown/not remote.
    return null;
}
|
|
270
|
+
// --- Entry point ---
|
|
271
|
+
/**
 * Fetch jobs from YC's Work at a Startup via Algolia.
 * Maps SearchCriteria to Algolia filters. Falls back to URL query params
 * for backward compatibility with direct WaaS URLs.
 *
 * Credentials are resolved from YC_ALGOLIA_APP_ID / YC_ALGOLIA_API_KEY env
 * vars first, then from ~/.jobcrawl/credentials.json; throws if neither is
 * configured. Each keyword runs as its own query (relevance index when a
 * keyword is present, date index otherwise) and results are deduplicated by
 * Algolia objectID across queries and pages.
 *
 * @param sourceUrl  WaaS URL whose query params serve as filter fallbacks.
 * @param criteria   Search criteria mapped onto Algolia facet filters.
 * @param saveRaw    When true, raw hits are persisted via saveRawResponse.
 * @returns Deduplicated jobs.
 * @throws Error when no Algolia credentials are available or the API fails.
 */
export async function extractYcJobs(sourceUrl, criteria, saveRaw = false) {
    // Env vars take precedence over the credentials file.
    let appId = process.env.YC_ALGOLIA_APP_ID;
    let apiKey = process.env.YC_ALGOLIA_API_KEY;
    if (!appId || !apiKey) {
        const credPath = join(homedir(), ".jobcrawl", "credentials.json");
        if (existsSync(credPath)) {
            const creds = JSON.parse(readFileSync(credPath, "utf-8"));
            appId = appId || creds.yc?.algoliaAppId;
            apiKey = apiKey || creds.yc?.algoliaApiKey;
        }
    }
    if (!appId || !apiKey) {
        throw new Error("YC provider requires credentials. Run `jobcrawl init` and add your Algolia keys to ~/.jobcrawl/credentials.json");
    }
    // No keywords → single empty query against the date-sorted index.
    const queries = criteria.keywords.length > 0
        ? criteria.keywords
        : [""];
    const filters = buildFilters(criteria, sourceUrl);
    const jobMap = new Map();
    const allHits = [];
    // Fold a page of hits into the result set, deduplicating on objectID
    // (shared by the first-page and pagination paths below).
    const ingest = (hits) => {
        for (const hit of hits) {
            if (!jobMap.has(hit.objectID)) {
                jobMap.set(hit.objectID, mapHit(hit, sourceUrl));
                allHits.push(hit);
            }
        }
    };
    for (const query of queries) {
        const indexName = query ? INDEX_BY_RELEVANCE : INDEX_BY_DATE;
        // First request also tells us the total page count (nbPages).
        const first = await fetchPage(appId, apiKey, query, filters, indexName, 0);
        const result = first.results[0];
        ingest(result.hits);
        // Remaining pages are fetched sequentially to avoid hammering the API.
        for (let page = 1; page < result.nbPages; page++) {
            const resp = await fetchPage(appId, apiKey, query, filters, indexName, page);
            ingest(resp.results[0].hits);
        }
    }
    if (saveRaw)
        await saveRawResponse("yc", "workatastartup", allHits);
    return [...jobMap.values()];
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { Job } from "../types/index.js";
|
|
2
|
+
/** Thrown when browser rendering is requested but the `agent-browser`
 *  integration is not available — presumably when the optional global
 *  install is missing; confirm against browser.ts. */
export declare class BrowserNotAvailableError extends Error {
    constructor();
}
/**
 * Wrapper around an agent-browser session, used to render JS-heavy career
 * pages and interact with them (navigation, HTML capture, clicking).
 */
export declare class BrowserSession {
    private name; // NOTE(review): likely a session identifier — confirm in browser.ts
    private opened; // tracks whether the session has been opened
    private networkTimeout; // timeout for network-bound commands — unit (ms?) TODO confirm
    constructor(opts?: {
        networkTimeout?: number;
    });
    /** Run a local DOM command (no timeout). */
    private run;
    /** Run a network-dependent command (open, wait) with a timeout. */
    private runNetwork;
    open(url: string): Promise<void>;
    getHtml(): Promise<string>;
    getUrl(): Promise<string>;
    clickByText(text: string): Promise<void>;
    /**
     * Find an element by its exact text content, walk up `level` ancestors,
     * and click that parent. Uses eval with base64 to avoid shell quoting.
     */
    clickNthParent(text: string, level: number): Promise<void>;
    back(): Promise<void>;
    close(): Promise<void>;
}
/**
 * Resolve each job's detail URL using the rendered listing page.
 * NOTE(review): exact click/bubble semantics inferred from the signature —
 * confirm against the implementation in browser.ts.
 */
export declare function resolveJobUrls(jobs: Job[], listingUrl: string, session: BrowserSession, opts?: {
    maxBubbleLevels?: number;
}): Promise<Job[]>;
|