jobcrawl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc.json +10 -0
- package/CHANGELOG.md +40 -0
- package/README.md +232 -0
- package/dist/core/aggregators/yc.d.ts +7 -0
- package/dist/core/aggregators/yc.js +320 -0
- package/dist/core/browser.d.ts +30 -0
- package/dist/core/browser.js +196 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +41 -0
- package/dist/core/detect-provider.d.ts +7 -0
- package/dist/core/detect-provider.js +125 -0
- package/dist/core/discover-careers.d.ts +18 -0
- package/dist/core/discover-careers.js +92 -0
- package/dist/core/extract-jobs.d.ts +14 -0
- package/dist/core/extract-jobs.js +36 -0
- package/dist/core/fetch-page.d.ts +11 -0
- package/dist/core/fetch-page.js +39 -0
- package/dist/core/format-output.d.ts +2 -0
- package/dist/core/format-output.js +59 -0
- package/dist/core/match-jobs.d.ts +6 -0
- package/dist/core/match-jobs.js +43 -0
- package/dist/core/providers/ashby.d.ts +6 -0
- package/dist/core/providers/ashby.js +58 -0
- package/dist/core/providers/generic.d.ts +6 -0
- package/dist/core/providers/generic.js +294 -0
- package/dist/core/providers/greenhouse.d.ts +6 -0
- package/dist/core/providers/greenhouse.js +47 -0
- package/dist/core/providers/lever.d.ts +7 -0
- package/dist/core/providers/lever.js +60 -0
- package/dist/core/providers/yc.d.ts +7 -0
- package/dist/core/providers/yc.js +320 -0
- package/dist/core/resolve-iframe.d.ts +6 -0
- package/dist/core/resolve-iframe.js +51 -0
- package/dist/core/save-raw.d.ts +4 -0
- package/dist/core/save-raw.js +13 -0
- package/dist/data/companies.d.ts +9 -0
- package/dist/data/companies.js +2849 -0
- package/dist/entrypoints/cli/app.d.ts +3 -0
- package/dist/entrypoints/cli/app.js +91 -0
- package/dist/entrypoints/cli/components/crawl-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/crawl-view.js +94 -0
- package/dist/entrypoints/cli/components/discover-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/discover-view.js +67 -0
- package/dist/entrypoints/cli/crawl-aggregators.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-aggregators.js +76 -0
- package/dist/entrypoints/cli/crawl-url.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-url.js +54 -0
- package/dist/entrypoints/cli/crawl.d.ts +32 -0
- package/dist/entrypoints/cli/crawl.js +108 -0
- package/dist/entrypoints/cli/discover.d.ts +10 -0
- package/dist/entrypoints/cli/discover.js +69 -0
- package/dist/entrypoints/cli/index.d.ts +2 -0
- package/dist/entrypoints/cli/index.js +197 -0
- package/dist/entrypoints/cli/init.d.ts +9 -0
- package/dist/entrypoints/cli/init.js +94 -0
- package/dist/entrypoints/cli/plain.d.ts +6 -0
- package/dist/entrypoints/cli/plain.js +77 -0
- package/dist/events.d.ts +114 -0
- package/dist/events.js +17 -0
- package/dist/orchestrators/crawl-all.d.ts +2 -0
- package/dist/orchestrators/crawl-all.js +66 -0
- package/dist/orchestrators/discover-all.d.ts +10 -0
- package/dist/orchestrators/discover-all.js +39 -0
- package/dist/threads/pool.d.ts +5 -0
- package/dist/threads/pool.js +23 -0
- package/dist/threads/process-url.d.ts +9 -0
- package/dist/threads/process-url.js +229 -0
- package/dist/types/index.d.ts +83 -0
- package/dist/types/index.js +6 -0
- package/dist/utils/config.d.ts +17 -0
- package/dist/utils/config.js +57 -0
- package/dist/utils/google-search.d.ts +19 -0
- package/dist/utils/google-search.js +139 -0
- package/dist/utils/llm.d.ts +8 -0
- package/dist/utils/llm.js +25 -0
- package/package.json +42 -0
- package/src/core/aggregators/yc.ts +415 -0
- package/src/core/browser.ts +239 -0
- package/src/core/detect-provider.ts +162 -0
- package/src/core/discover-careers.ts +117 -0
- package/src/core/extract-jobs.ts +50 -0
- package/src/core/fetch-page.ts +41 -0
- package/src/core/format-output.ts +80 -0
- package/src/core/match-jobs.ts +56 -0
- package/src/core/providers/ashby.ts +84 -0
- package/src/core/providers/generic.ts +332 -0
- package/src/core/providers/greenhouse.ts +74 -0
- package/src/core/providers/lever.ts +90 -0
- package/src/core/resolve-iframe.ts +59 -0
- package/src/core/save-raw.ts +18 -0
- package/src/data/companies.ts +2859 -0
- package/src/entrypoints/cli/app.tsx +173 -0
- package/src/entrypoints/cli/components/crawl-view.tsx +163 -0
- package/src/entrypoints/cli/components/discover-view.tsx +138 -0
- package/src/entrypoints/cli/crawl-aggregators.ts +112 -0
- package/src/entrypoints/cli/crawl-url.ts +87 -0
- package/src/entrypoints/cli/crawl.ts +163 -0
- package/src/entrypoints/cli/discover.ts +96 -0
- package/src/entrypoints/cli/index.ts +252 -0
- package/src/entrypoints/cli/init.ts +117 -0
- package/src/entrypoints/cli/plain.ts +104 -0
- package/src/events.ts +79 -0
- package/src/orchestrators/crawl-all.ts +96 -0
- package/src/orchestrators/discover-all.ts +61 -0
- package/src/threads/pool.ts +29 -0
- package/src/threads/process-url.ts +312 -0
- package/src/types/index.ts +110 -0
- package/src/utils/config.ts +79 -0
- package/src/utils/google-search.ts +155 -0
- package/src/utils/llm.ts +33 -0
- package/test/integration/process-url.test.ts +301 -0
- package/test/integration/providers/ashby.test.ts +163 -0
- package/test/integration/providers/greenhouse.test.ts +191 -0
- package/test/integration/providers/lever.test.ts +188 -0
- package/test/unit/config.test.ts +64 -0
- package/test/unit/detect-provider.test.ts +165 -0
- package/test/unit/events.test.ts +104 -0
- package/test/unit/format-output.test.ts +165 -0
- package/test/unit/match-jobs.test.ts +257 -0
- package/test/unit/pool.test.ts +74 -0
- package/test/unit/providers/generic.test.ts +139 -0
- package/test/unit/resolve-iframe.test.ts +100 -0
- package/tsconfig.json +19 -0
- package/vitest.config.ts +7 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import { execFile as execFileCb } from "node:child_process";
|
|
2
|
+
import { promisify } from "node:util";
|
|
3
|
+
import { createHash } from "node:crypto";
|
|
4
|
+
import { existsSync } from "node:fs";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { setTimeout as delay } from "node:timers/promises";
|
|
7
|
+
const execFile = promisify(execFileCb);
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Errors
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
/**
 * Raised when the `agent-browser` CLI cannot be located, neither in the
 * project-local node_modules nor on the PATH. Callers may catch this to
 * fall back to a plain HTTP fetch.
 */
export class BrowserNotAvailableError extends Error {
    // Stable discriminator so callers can check `err.name` without importing us.
    name = "BrowserNotAvailableError";
    constructor() {
        super("agent-browser is not installed. Install with: npm i -g agent-browser");
    }
}
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Binary resolution (cached)
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Memoised path to the agent-browser executable (resolved once per process).
let cachedBin = null;
/**
 * Locate the agent-browser CLI, preferring a project-local install over a
 * global one on the PATH. The result is cached for the process lifetime.
 * @throws {BrowserNotAvailableError} when the binary cannot be found.
 */
async function resolveBin() {
    if (cachedBin !== null) {
        return cachedBin;
    }
    // 1. Project-local binary under node_modules/.bin takes precedence.
    const localBin = join(process.cwd(), "node_modules", ".bin", "agent-browser");
    if (existsSync(localBin)) {
        cachedBin = localBin;
        return cachedBin;
    }
    // 2. Global install: probe the PATH by asking for the version.
    try {
        await execFile("agent-browser", ["--version"], { timeout: 5000 });
    }
    catch {
        throw new BrowserNotAvailableError();
    }
    cachedBin = "agent-browser";
    return cachedBin;
}
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
// BrowserSession
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
// Default time budget for commands that must wait on the network (open, wait).
const DEFAULT_NETWORK_TIMEOUT = 120_000; // 2 min for network-dependent commands
/**
 * Thin wrapper around the `agent-browser` CLI. Each instance owns a uniquely
 * named session (`jc-<8 hex chars>`) so parallel crawls do not share browser
 * state. Every method shells out to the CLI via execFile.
 */
export class BrowserSession {
    // Session name passed to every CLI invocation via --session.
    name;
    // Set once open() succeeds; close() is a no-op until then.
    opened = false;
    // Timeout (ms) applied to network-dependent commands; see runNetwork().
    networkTimeout;
    constructor(opts) {
        // Pseudo-random 8-hex-char id derived from wall clock + Math.random().
        const id = createHash("sha256")
            .update(`${Date.now()}:${Math.random()}`)
            .digest("hex")
            .slice(0, 8);
        this.name = `jc-${id}`;
        this.networkTimeout = opts?.networkTimeout ?? DEFAULT_NETWORK_TIMEOUT;
    }
    /** Run a local DOM command (no timeout). Returns trimmed stdout. */
    async run(...args) {
        const bin = await resolveBin();
        // maxBuffer raised to 20 MB — a full-page outerHTML dump can exceed the default.
        const { stdout } = await execFile(bin, ["--session", this.name, ...args], { maxBuffer: 20 * 1024 * 1024 });
        return stdout.trimEnd();
    }
    /** Run a network-dependent command (open, wait) with a timeout. */
    async runNetwork(...args) {
        const bin = await resolveBin();
        const { stdout } = await execFile(bin, ["--session", this.name, ...args], { timeout: this.networkTimeout, maxBuffer: 20 * 1024 * 1024 });
        return stdout.trimEnd();
    }
    /** Navigate to `url` and block until the page's network goes idle. */
    async open(url) {
        await this.runNetwork("open", url);
        await this.runNetwork("wait", "--load", "networkidle");
        this.opened = true;
    }
    /** Serialized outerHTML of the current document. */
    async getHtml() {
        return this.run("eval", "document.documentElement.outerHTML");
    }
    /** Current page URL (reflects client-side navigation). */
    async getUrl() {
        return this.run("get", "url");
    }
    /** Click the element whose text content exactly equals `text`. */
    async clickByText(text) {
        await this.run("find", "text", text, "click", "--exact");
    }
    /**
     * Find an element by its exact text content, walk up `level` ancestors,
     * and click that parent. Uses eval with base64 to avoid shell quoting.
     */
    async clickNthParent(text, level) {
        // The script below runs inside the page. `text` is embedded via
        // JSON.stringify so arbitrary job titles cannot break the script.
        const js = `(() => {
  const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
  let node;
  while (node = walker.nextNode()) {
    if (node.textContent.trim() === ${JSON.stringify(text)}) {
      let target = node.parentElement;
      for (let i = 0; i < ${level} && target && target.parentElement; i++) {
        target = target.parentElement;
      }
      if (target) { target.click(); return "clicked"; }
    }
  }
  return "not found";
})()`;
        // -b: pass the script base64-encoded so the shell never sees it raw.
        await this.run("eval", "-b", Buffer.from(js).toString("base64"));
    }
    /** Navigate back one history entry. */
    async back() {
        await this.run("back");
    }
    /** Close the session if it was ever opened. Failures are swallowed. */
    async close() {
        if (!this.opened)
            return;
        try {
            await this.run("close");
        }
        catch {
            // Best-effort cleanup
        }
        this.opened = false;
    }
}
|
|
118
|
+
// ---------------------------------------------------------------------------
|
|
119
|
+
// resolveJobUrls — click each job card to capture individual URLs
|
|
120
|
+
// ---------------------------------------------------------------------------
|
|
121
|
+
const DEFAULT_MAX_BUBBLE_LEVELS = 5;
|
|
122
|
+
/**
|
|
123
|
+
* Try clicking at various DOM levels to find which one navigates.
|
|
124
|
+
* Returns the level that worked (0 = direct text, 1-N = Nth parent),
|
|
125
|
+
* or -1 if nothing navigated.
|
|
126
|
+
*/
|
|
127
|
+
async function probeClickLevel(title, listingUrl, session, maxLevels) {
|
|
128
|
+
// Level 0: direct text click
|
|
129
|
+
try {
|
|
130
|
+
await session.clickByText(title);
|
|
131
|
+
await delay(800);
|
|
132
|
+
const url = await session.getUrl();
|
|
133
|
+
if (url !== listingUrl)
|
|
134
|
+
return 0;
|
|
135
|
+
}
|
|
136
|
+
catch {
|
|
137
|
+
// Element not found by text — continue to parent probing
|
|
138
|
+
}
|
|
139
|
+
// Levels 1-N: walk up parent elements
|
|
140
|
+
for (let level = 1; level <= maxLevels; level++) {
|
|
141
|
+
try {
|
|
142
|
+
await session.clickNthParent(title, level);
|
|
143
|
+
await delay(800);
|
|
144
|
+
const url = await session.getUrl();
|
|
145
|
+
if (url !== listingUrl)
|
|
146
|
+
return level;
|
|
147
|
+
}
|
|
148
|
+
catch {
|
|
149
|
+
continue;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
return -1;
|
|
153
|
+
}
|
|
154
|
+
export async function resolveJobUrls(jobs, listingUrl, session, opts) {
|
|
155
|
+
const maxLevels = opts?.maxBubbleLevels ?? DEFAULT_MAX_BUBBLE_LEVELS;
|
|
156
|
+
const resolved = [...jobs];
|
|
157
|
+
const candidates = resolved.filter((j) => j.url === listingUrl);
|
|
158
|
+
if (candidates.length === 0)
|
|
159
|
+
return resolved;
|
|
160
|
+
// Probe the first job to discover which click level causes navigation
|
|
161
|
+
const probeJob = candidates[0];
|
|
162
|
+
const level = await probeClickLevel(probeJob.title, listingUrl, session, maxLevels);
|
|
163
|
+
if (level === -1) {
|
|
164
|
+
// No click level navigated — this page doesn't link to individual jobs
|
|
165
|
+
return resolved;
|
|
166
|
+
}
|
|
167
|
+
// Capture the URL from the probe click
|
|
168
|
+
const probeUrl = await session.getUrl();
|
|
169
|
+
const probeIdx = resolved.indexOf(probeJob);
|
|
170
|
+
resolved[probeIdx] = { ...probeJob, url: probeUrl };
|
|
171
|
+
await session.open(listingUrl);
|
|
172
|
+
// Apply the discovered level to remaining jobs
|
|
173
|
+
for (let i = 0; i < resolved.length; i++) {
|
|
174
|
+
const job = resolved[i];
|
|
175
|
+
if (job.url !== listingUrl)
|
|
176
|
+
continue; // already resolved or different URL
|
|
177
|
+
try {
|
|
178
|
+
if (level === 0) {
|
|
179
|
+
await session.clickByText(job.title);
|
|
180
|
+
}
|
|
181
|
+
else {
|
|
182
|
+
await session.clickNthParent(job.title, level);
|
|
183
|
+
}
|
|
184
|
+
await delay(800);
|
|
185
|
+
const afterUrl = await session.getUrl();
|
|
186
|
+
if (afterUrl !== listingUrl) {
|
|
187
|
+
resolved[i] = { ...job, url: afterUrl };
|
|
188
|
+
await session.open(listingUrl);
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
catch {
|
|
192
|
+
continue;
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
return resolved;
|
|
196
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Provider } from "../types/index.js";
/** One cached provider-detection result for a company slug. */
interface CacheEntry {
    /** The ATS provider detected for this slug. */
    provider: Provider;
    /** ISO-8601 timestamp of when detection happened; used for TTL expiry. */
    discoveredAt: string;
}
/** On-disk cache shape (stored at ~/.jobcrawl/cache.json). */
interface ProviderCache {
    /** Schema version; the loader discards caches with any other version. */
    version: 1;
    /** Map of company slug -> cached detection. */
    entries: Record<string, CacheEntry>;
}
/** Load the provider cache from disk; any failure yields an empty cache. */
export declare function loadCache(): Promise<ProviderCache>;
/** Return the cached provider for a slug, or null when absent or expired. */
export declare function getCachedProvider(slug: string): Promise<Provider | null>;
/** Record the detected provider for a slug and persist the cache to disk. */
export declare function setCachedProvider(slug: string, provider: Provider): Promise<void>;
export {};
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
2
|
+
import { readFile, writeFile } from "node:fs/promises";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
// The cache lives under the user's home directory: ~/.jobcrawl/cache.json
const CACHE_FILE = join(homedir(), ".jobcrawl", "cache.json");
/** Produce a fresh, empty cache structure (schema version 1). */
function emptyCache() {
    return {
        version: 1,
        entries: {},
    };
}
|
|
9
|
+
/**
 * Read the provider cache from disk. Any failure — missing file, invalid
 * JSON, or an unexpected schema version — yields a fresh empty cache
 * instead of throwing.
 */
export async function loadCache() {
    if (!existsSync(CACHE_FILE)) {
        return emptyCache();
    }
    try {
        const parsed = JSON.parse(await readFile(CACHE_FILE, "utf-8"));
        // Only schema version 1 is understood; anything else is discarded.
        return parsed.version === 1 ? parsed : emptyCache();
    }
    catch {
        return emptyCache();
    }
}
|
|
23
|
+
/**
 * Look up the cached provider for a company slug. Entries older than
 * seven days are treated as expired and ignored.
 */
export async function getCachedProvider(slug) {
    const SEVEN_DAYS_MS = 7 * 24 * 60 * 60 * 1000;
    const { entries } = await loadCache();
    const entry = entries[slug];
    if (!entry) {
        return null;
    }
    const age = Date.now() - new Date(entry.discoveredAt).getTime();
    return age > SEVEN_DAYS_MS ? null : entry.provider;
}
|
|
34
|
+
/**
 * Record the detected provider for a slug and persist the cache to disk.
 *
 * Fix: on a fresh machine ~/.jobcrawl does not exist, so the writeFile
 * previously failed with ENOENT; the directory is now created on demand.
 */
export async function setCachedProvider(slug, provider) {
    const cache = await loadCache();
    cache.entries[slug] = {
        provider,
        discoveredAt: new Date().toISOString(),
    };
    // Ensure the cache directory exists before writing (first-run case).
    const { mkdir } = await import("node:fs/promises");
    await mkdir(join(homedir(), ".jobcrawl"), { recursive: true });
    await writeFile(CACHE_FILE, JSON.stringify(cache, null, 2) + "\n");
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { ProviderDetection } from "../types/index.js";
/**
 * Detect ATS provider from HTML content and URL.
 * Checks for iframes, embed scripts, and DOM signatures.
 * Extracts board token/slug for API calls when possible.
 * URL patterns are checked first; HTML signatures are the fallback, so
 * custom career domains with embedded ATS widgets are still recognised.
 */
export declare function detectProvider(html: string, url: string): ProviderDetection;
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
 * Detect ATS provider from HTML content and URL.
 * Checks for iframes, embed scripts, and DOM signatures.
 * Extracts board token/slug for API calls when possible.
 */
export function detectProvider(html, url) {
    // Direct ATS board URLs are the cheapest and most reliable signal.
    const fromUrl = detectFromUrl(url);
    if (fromUrl.provider !== "unknown") {
        return fromUrl;
    }
    // Otherwise scan the markup for embedded ATS widgets on custom domains.
    return detectFromHtml(html);
}
|
|
14
|
+
/**
 * Match the URL against known ATS hosting domains and extract the board
 * token embedded in the hostname or path.
 *
 * Fix: the Workday and BambooHR checks used substring matching
 * (`hostname.includes(...)`), which also accepted hostnames such as
 * "acme.bamboohr.com.evil.example"; they now require a proper domain
 * suffix via endsWith.
 */
function detectFromUrl(url) {
    const u = new URL(url);
    // Greenhouse: boards.greenhouse.io/{token} or job-boards.greenhouse.io/{token}
    if (u.hostname === "boards.greenhouse.io" ||
        u.hostname === "job-boards.greenhouse.io") {
        const token = u.pathname.split("/").filter(Boolean)[0];
        if (token)
            return { provider: "greenhouse", boardToken: token };
    }
    // Lever: jobs.lever.co/{company}
    if (u.hostname === "jobs.lever.co") {
        const company = u.pathname.split("/").filter(Boolean)[0];
        if (company)
            return { provider: "lever", boardToken: company };
    }
    // Ashby: jobs.ashbyhq.com/{company}
    if (u.hostname === "jobs.ashbyhq.com") {
        const company = u.pathname.split("/").filter(Boolean)[0];
        if (company)
            return { provider: "ashby", boardToken: company };
    }
    // Workday: {company}.wd{n}.myworkdayjobs.com — tenant is the first label
    if (u.hostname.endsWith(".myworkdayjobs.com")) {
        const tenant = u.hostname.split(".")[0];
        if (tenant)
            return { provider: "workday", boardToken: tenant };
    }
    // BambooHR: {company}.bamboohr.com
    if (u.hostname.endsWith(".bamboohr.com")) {
        const company = u.hostname.split(".")[0];
        if (company)
            return { provider: "bamboohr", boardToken: company };
    }
    return { provider: "unknown", boardToken: null };
}
|
|
49
|
+
/**
 * Scan raw HTML for embedded ATS widgets (iframes, embed scripts, known DOM
 * hooks) and extract the board token where possible. Providers are checked
 * in order: Greenhouse, Lever, Ashby, Workday, BambooHR; the first match wins.
 */
function detectFromHtml(html) {
    // Greenhouse embed: <div id="grnhse_app"> or greenhouse embed script
    const greenhouseIframe = html.match(/iframe[^>]+src=["']([^"']*(?:boards|job-boards)\.greenhouse\.io[^"']*)/i);
    if (greenhouseIframe) {
        const token = extractTokenFromUrl(greenhouseIframe[1], "boards.greenhouse.io");
        if (token)
            return { provider: "greenhouse", boardToken: token };
    }
    // Direct embed script: api.greenhouse.io/v1/boards/{token}
    const greenhouseEmbed = html.match(/src=["'][^"']*api\.greenhouse\.io\/v1\/boards\/([^/"']+)/i);
    if (greenhouseEmbed) {
        return { provider: "greenhouse", boardToken: greenhouseEmbed[1] };
    }
    if (html.includes('id="grnhse_app"') || html.includes("id='grnhse_app'")) {
        // Look for the greenhouse board token in embed script
        const tokenMatch = html.match(/Grnhse\.Settings\.board_token\s*=\s*["']([^"']+)/);
        if (tokenMatch) {
            return { provider: "greenhouse", boardToken: tokenMatch[1] };
        }
        // Fallback: token in the job_board embed script's ?for= query param
        const scriptSrc = html.match(/src=["'][^"']*(?:boards|job-boards)\.greenhouse\.io\/embed\/job_board\/js\?for=([^"'&]+)/i);
        if (scriptSrc) {
            return { provider: "greenhouse", boardToken: scriptSrc[1] };
        }
    }
    // Lever embed
    const leverIframe = html.match(/iframe[^>]+src=["']([^"']*jobs\.lever\.co[^"']*)/i);
    if (leverIframe) {
        const token = extractTokenFromUrl(leverIframe[1], "jobs.lever.co");
        if (token)
            return { provider: "lever", boardToken: token };
    }
    if (html.includes("data-lever-")) {
        // Lever postings widget: the company slug rides on a data attribute.
        const leverCompany = html.match(/data-lever-company=["']([^"']+)/i);
        if (leverCompany) {
            return { provider: "lever", boardToken: leverCompany[1] };
        }
    }
    // Ashby embed
    const ashbyIframe = html.match(/iframe[^>]+src=["']([^"']*jobs\.ashbyhq\.com[^"']*)/i);
    if (ashbyIframe) {
        const token = extractTokenFromUrl(ashbyIframe[1], "jobs.ashbyhq.com");
        if (token)
            return { provider: "ashby", boardToken: token };
    }
    // Ashby embed script pattern
    const ashbyScript = html.match(/src=["'][^"']*ashbyhq\.com\/[^"']*["'][^>]*data-ashby-job-board-id=["']([^"']+)/i);
    if (ashbyScript) {
        return { provider: "ashby", boardToken: ashbyScript[1] };
    }
    // Workday embed — tenant is the first label: {tenant}.wd{n}.myworkdayjobs.com
    const workdayIframe = html.match(/iframe[^>]+src=["']([^"']*\.myworkdayjobs\.com[^"']*)/i);
    if (workdayIframe) {
        const tenant = workdayIframe[1].match(/\/\/([^.]+)\.wd\d+\.myworkdayjobs/);
        if (tenant)
            return { provider: "workday", boardToken: tenant[1] };
    }
    // BambooHR embed — company is the first hostname label.
    const bambooIframe = html.match(/iframe[^>]+src=["']([^"']*\.bamboohr\.com[^"']*)/i);
    if (bambooIframe) {
        const company = bambooIframe[1].match(/\/\/([^.]+)\.bamboohr\.com/);
        if (company)
            return { provider: "bamboohr", boardToken: company[1] };
    }
    return { provider: "unknown", boardToken: null };
}
|
|
113
|
+
/**
 * Pull the first path segment (the board token) out of an iframe src, but
 * only when the src actually points at the expected ATS hostname.
 * Protocol-relative sources ("//host/...") are normalised to https first.
 * Returns null for foreign hosts, empty paths, or unparseable URLs.
 */
function extractTokenFromUrl(iframeSrc, hostname) {
    const absolute = iframeSrc.startsWith("//") ? `https:${iframeSrc}` : iframeSrc;
    try {
        const parsed = new URL(absolute);
        if (parsed.hostname !== hostname) {
            return null;
        }
        const [first] = parsed.pathname.split("/").filter(Boolean);
        return first ?? null;
    }
    catch {
        // Invalid URL
        return null;
    }
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/** Outcome of a career-page discovery attempt for one company. */
export interface DiscoverResult {
    /** Company name as supplied by the caller. */
    company: string;
    /** Discovered career-page URL, or null when none was found. */
    url: string | null;
    /** Whether the URL has been verified to look like a real career page. */
    verified: boolean;
    /** Human-readable failure reason, or null on success. */
    error: string | null;
}
/**
 * Discover the career page URL for a company.
 *
 * Strategy (in order):
 * 1. Web search (DDG) + LLM to pick the best URL
 * 2. Heuristic URL guessing ({company}.com/careers) + HTTP HEAD check
 */
export declare function discoverCareerPage(company: string): Promise<DiscoverResult>;
/**
 * Verify that a discovered URL is a real career page.
 * Returns false on any fetch failure rather than throwing.
 */
export declare function verifyCareerPage(url: string): Promise<boolean>;
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { webSearch, guessCareerUrls, } from "../utils/google-search.js";
|
|
2
|
+
import { askClaude } from "../utils/llm.js";
|
|
3
|
+
import { probePage } from "./fetch-page.js";
|
|
4
|
+
/**
 * Discover the career page URL for a company.
 *
 * Strategy (in order):
 * 1. Web search (DDG) + LLM to pick the best URL
 * 2. Heuristic URL guessing ({company}.com/careers) + HTTP HEAD check
 */
export async function discoverCareerPage(company) {
    // All failure shapes share the same structure; only the message varies.
    const failure = (error) => ({ company, url: null, verified: false, error });
    try {
        // Strategy 1: search the web and let the LLM pick the official page.
        const hits = await webSearch(`"${company}" careers jobs`);
        if (hits.length > 0) {
            const picked = await pickCareerUrl(company, hits);
            if (picked) {
                return { company, url: picked, verified: false, error: null };
            }
        }
        // Strategy 2: guess common career-page URLs and take the first hit.
        const guesses = await guessCareerUrls(company);
        if (guesses.length > 0) {
            return { company, url: guesses[0].url, verified: false, error: null };
        }
        return failure("Could not find career page");
    }
    catch (err) {
        return failure(err instanceof Error ? err.message : String(err));
    }
}
|
|
47
|
+
/**
 * Verify that a discovered URL is a real career page.
 * Fetches the page and counts well-known hiring-related signals in the
 * lower-cased HTML; two or more matches count as verified. Any fetch
 * failure yields false.
 */
export async function verifyCareerPage(url) {
    const signals = [
        "careers",
        "jobs",
        "openings",
        "positions",
        "join our team",
        "join us",
        "we're hiring",
        "open roles",
        "apply now",
        "job-boards.greenhouse.io",
        "boards.greenhouse.io",
        "jobs.lever.co",
        "jobs.ashbyhq.com",
        "grnhse_app",
    ];
    try {
        const { html } = await probePage(url);
        const haystack = html.toLowerCase();
        let hits = 0;
        for (const signal of signals) {
            if (haystack.includes(signal)) {
                hits++;
            }
        }
        return hits >= 2;
    }
    catch {
        return false;
    }
}
|
|
77
|
+
/**
 * Ask the LLM to choose the official careers URL from web-search results.
 * Returns the first URL found in the model's response, or null when none.
 */
async function pickCareerUrl(company, results) {
    // Number each result so the model can reference them unambiguously.
    const formatted = results
        .map((r, i) => `${i + 1}. ${r.title}\n URL: ${r.url}\n ${r.snippet}`)
        .join("\n\n");
    const prompt = `Given these search results for "${company} careers", return ONLY the URL of their official careers/jobs page. Not individual job postings or third-party job boards (Indeed, LinkedIn, Glassdoor).

Search results:
${formatted}

Respond with ONLY the URL, nothing else. If you can't determine it, respond with "null".`;
    const answer = await askClaude(prompt);
    const found = answer.match(/https?:\/\/[^\s"'<>]+/);
    return found ? found[0] : null;
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { Job, Provider, Aggregator, SearchCriteria } from "../types/index.js";
/**
 * Extract jobs via ATS JSON API. Used when provider + boardToken are known.
 * Criteria is passed through so providers can apply server-side filtering.
 * @throws when no API extractor exists for the given provider.
 */
export declare function extractViaApi(provider: Provider, boardToken: string, sourceUrl: string, criteria: SearchCriteria, saveRaw?: boolean): Promise<Job[]>;
/**
 * Extract jobs via aggregator. Aggregators search across multiple companies.
 */
export declare function extractViaAggregator(aggregator: Aggregator, criteria: SearchCriteria, saveRaw?: boolean): Promise<Job[]>;
/**
 * Extract jobs from rendered HTML. Used when no ATS API is available.
 */
export declare function extractFromHtml(html: string, sourceUrl: string): Job[];
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { extractGreenhouseJobs } from "./providers/greenhouse.js";
|
|
2
|
+
import { extractAshbyJobs } from "./providers/ashby.js";
|
|
3
|
+
import { extractLeverJobs } from "./providers/lever.js";
|
|
4
|
+
import { extractYcJobs } from "./aggregators/yc.js";
|
|
5
|
+
import { extractGenericJobs } from "./providers/generic.js";
|
|
6
|
+
/**
|
|
7
|
+
* Extract jobs via ATS JSON API. Used when provider + boardToken are known.
|
|
8
|
+
* Criteria is passed through so providers can apply server-side filtering.
|
|
9
|
+
*/
|
|
10
|
+
export async function extractViaApi(provider, boardToken, sourceUrl, criteria, saveRaw = false) {
|
|
11
|
+
switch (provider) {
|
|
12
|
+
case "greenhouse":
|
|
13
|
+
return extractGreenhouseJobs(boardToken, sourceUrl, criteria, saveRaw);
|
|
14
|
+
case "ashby":
|
|
15
|
+
return extractAshbyJobs(boardToken, sourceUrl, criteria, saveRaw);
|
|
16
|
+
case "lever":
|
|
17
|
+
return extractLeverJobs(boardToken, sourceUrl, criteria, saveRaw);
|
|
18
|
+
default:
|
|
19
|
+
throw new Error(`No API extractor for provider "${provider}"`);
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Extract jobs via aggregator. Aggregators search across multiple companies.
|
|
24
|
+
*/
|
|
25
|
+
export async function extractViaAggregator(aggregator, criteria, saveRaw = false) {
|
|
26
|
+
switch (aggregator) {
|
|
27
|
+
case "yc":
|
|
28
|
+
return extractYcJobs("https://workatastartup.com", criteria, saveRaw);
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
/**
 * Extract jobs from rendered HTML. Used when no ATS API is available.
 * Thin delegation to the generic DOM-scraping extractor.
 */
export function extractFromHtml(html, sourceUrl) {
    const jobs = extractGenericJobs(html, sourceUrl);
    return jobs;
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { FetchResult } from "../types/index.js";
/**
 * Quick HTTP probe — simple GET, no browser.
 * Enough to detect ATS signals in static HTML.
 * Follows redirects; the returned finalUrl reflects the post-redirect location.
 */
export declare function probePage(url: string): Promise<FetchResult>;
/**
 * Full JS rendering via agent-browser.
 * Falls back to probePage when agent-browser is not installed.
 */
export declare function renderPage(url: string): Promise<FetchResult>;
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { BrowserSession, BrowserNotAvailableError } from "./browser.js";
|
|
2
|
+
/**
 * Quick HTTP probe — simple GET, no browser.
 * Enough to detect ATS signals in static HTML.
 */
export async function probePage(url) {
    // Present a mainstream desktop UA; some career sites block default agents.
    const headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    };
    // Follow redirects so finalUrl reflects where the page actually lives.
    const response = await fetch(url, { headers, redirect: "follow" });
    return { html: await response.text(), finalUrl: response.url };
}
|
|
16
|
+
/**
 * Full JS rendering via agent-browser.
 * Falls back to probePage when agent-browser is not installed.
 * The session is always closed, even on failure.
 */
export async function renderPage(url) {
    const session = new BrowserSession();
    try {
        await session.open(url);
        const html = await session.getHtml();
        const finalUrl = await session.getUrl();
        return { html, finalUrl };
    }
    catch (err) {
        // Degrade gracefully to a plain HTTP fetch when the browser is absent.
        if (err instanceof BrowserNotAvailableError) {
            return probePage(url);
        }
        throw err;
    }
    finally {
        await session.close();
    }
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
 * Render jobs in the requested output format (json, table, csv, markdown).
 *
 * Fix: an unrecognised format previously fell off the end of the switch and
 * returned undefined; it now throws a descriptive error.
 */
export function formatOutput(jobs, format) {
    switch (format) {
        case "json":
            return JSON.stringify(jobs, null, 2);
        case "table":
            return formatTable(jobs);
        case "csv":
            return formatCsv(jobs);
        case "markdown":
            return formatMarkdown(jobs);
        default:
            throw new Error(`Unknown output format "${format}"`);
    }
}
|
|
13
|
+
/**
 * Render jobs as an aligned ASCII table. Each column is sized to its widest
 * cell (capped at 50 characters); longer cells are truncated.
 */
function formatTable(jobs) {
    if (jobs.length === 0) {
        return "No matching jobs found.";
    }
    const headers = ["Company", "Title", "Location", "Department", "URL"];
    const rows = jobs.map((j) => [
        j.company,
        j.title,
        j.location ?? "-",
        j.department ?? "-",
        j.url,
    ]);
    // Column widths: widest of header and all cells, capped at 50 chars.
    const widths = headers.map((header, col) => {
        const widest = rows.reduce((w, row) => Math.max(w, row[col].length), header.length);
        return Math.min(50, widest);
    });
    const pad = (cell, col) => cell.slice(0, widths[col]).padEnd(widths[col]);
    const lines = [
        headers.map(pad).join(" | "),
        widths.map((w) => "-".repeat(w)).join(" | "),
        ...rows.map((row) => row.map(pad).join(" | ")),
    ];
    return lines.join("\n");
}
|
|
31
|
+
/** Render jobs as CSV with a fixed header row; null fields become "". */
function formatCsv(jobs) {
    const headers = "company,title,location,department,workMode,url,postedAt";
    const lines = jobs.map((j) => {
        const fields = [
            j.company,
            j.title,
            j.location ?? "",
            j.department ?? "",
            j.workMode ?? "",
            j.url,
            j.postedAt ?? "",
        ];
        return fields.map(csvEscape).join(",");
    });
    return [headers, ...lines].join("\n");
}
|
|
44
|
+
/**
 * Quote a CSV field when it contains a comma, quote, or newline, doubling
 * any embedded quotes (RFC 4180 style). Other values pass through untouched.
 */
function csvEscape(value) {
    const needsQuoting = /[",\n]/.test(value);
    if (!needsQuoting) {
        return value;
    }
    return `"${value.replace(/"/g, '""')}"`;
}
|
|
50
|
+
/**
 * Render jobs as a GitHub-flavored markdown table.
 *
 * Fix: cell values containing "|" previously split the table cell; pipes in
 * free-text fields are now escaped as "\|".
 */
function formatMarkdown(jobs) {
    if (jobs.length === 0)
        return "No matching jobs found.";
    // Escape pipes so free-text fields cannot break the table layout.
    const esc = (value) => value.replace(/\|/g, "\\|");
    const lines = [
        "| Company | Title | Location | Department |",
        "| --- | --- | --- | --- |",
        ...jobs.map((j) => `| ${esc(j.company)} | [${esc(j.title)}](${j.url}) | ${esc(j.location ?? "-")} | ${esc(j.department ?? "-")} |`),
    ];
    return lines.join("\n");
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { Job, SearchCriteria } from "../types/index.js";
/**
 * Filter jobs against search criteria. Case-insensitive substring matching.
 * Jobs with null fields are included (don't exclude for missing data).
 *
 * @param jobs - Candidate jobs to filter.
 * @param criteria - Search criteria to match against.
 * @returns The subset of `jobs` that satisfies `criteria`.
 */
export declare function matchJobs(jobs: Job[], criteria: SearchCriteria): Job[];
|