jobcrawl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc.json +10 -0
- package/CHANGELOG.md +40 -0
- package/README.md +232 -0
- package/dist/core/aggregators/yc.d.ts +7 -0
- package/dist/core/aggregators/yc.js +320 -0
- package/dist/core/browser.d.ts +30 -0
- package/dist/core/browser.js +196 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +41 -0
- package/dist/core/detect-provider.d.ts +7 -0
- package/dist/core/detect-provider.js +125 -0
- package/dist/core/discover-careers.d.ts +18 -0
- package/dist/core/discover-careers.js +92 -0
- package/dist/core/extract-jobs.d.ts +14 -0
- package/dist/core/extract-jobs.js +36 -0
- package/dist/core/fetch-page.d.ts +11 -0
- package/dist/core/fetch-page.js +39 -0
- package/dist/core/format-output.d.ts +2 -0
- package/dist/core/format-output.js +59 -0
- package/dist/core/match-jobs.d.ts +6 -0
- package/dist/core/match-jobs.js +43 -0
- package/dist/core/providers/ashby.d.ts +6 -0
- package/dist/core/providers/ashby.js +58 -0
- package/dist/core/providers/generic.d.ts +6 -0
- package/dist/core/providers/generic.js +294 -0
- package/dist/core/providers/greenhouse.d.ts +6 -0
- package/dist/core/providers/greenhouse.js +47 -0
- package/dist/core/providers/lever.d.ts +7 -0
- package/dist/core/providers/lever.js +60 -0
- package/dist/core/providers/yc.d.ts +7 -0
- package/dist/core/providers/yc.js +320 -0
- package/dist/core/resolve-iframe.d.ts +6 -0
- package/dist/core/resolve-iframe.js +51 -0
- package/dist/core/save-raw.d.ts +4 -0
- package/dist/core/save-raw.js +13 -0
- package/dist/data/companies.d.ts +9 -0
- package/dist/data/companies.js +2849 -0
- package/dist/entrypoints/cli/app.d.ts +3 -0
- package/dist/entrypoints/cli/app.js +91 -0
- package/dist/entrypoints/cli/components/crawl-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/crawl-view.js +94 -0
- package/dist/entrypoints/cli/components/discover-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/discover-view.js +67 -0
- package/dist/entrypoints/cli/crawl-aggregators.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-aggregators.js +76 -0
- package/dist/entrypoints/cli/crawl-url.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-url.js +54 -0
- package/dist/entrypoints/cli/crawl.d.ts +32 -0
- package/dist/entrypoints/cli/crawl.js +108 -0
- package/dist/entrypoints/cli/discover.d.ts +10 -0
- package/dist/entrypoints/cli/discover.js +69 -0
- package/dist/entrypoints/cli/index.d.ts +2 -0
- package/dist/entrypoints/cli/index.js +197 -0
- package/dist/entrypoints/cli/init.d.ts +9 -0
- package/dist/entrypoints/cli/init.js +94 -0
- package/dist/entrypoints/cli/plain.d.ts +6 -0
- package/dist/entrypoints/cli/plain.js +77 -0
- package/dist/events.d.ts +114 -0
- package/dist/events.js +17 -0
- package/dist/orchestrators/crawl-all.d.ts +2 -0
- package/dist/orchestrators/crawl-all.js +66 -0
- package/dist/orchestrators/discover-all.d.ts +10 -0
- package/dist/orchestrators/discover-all.js +39 -0
- package/dist/threads/pool.d.ts +5 -0
- package/dist/threads/pool.js +23 -0
- package/dist/threads/process-url.d.ts +9 -0
- package/dist/threads/process-url.js +229 -0
- package/dist/types/index.d.ts +83 -0
- package/dist/types/index.js +6 -0
- package/dist/utils/config.d.ts +17 -0
- package/dist/utils/config.js +57 -0
- package/dist/utils/google-search.d.ts +19 -0
- package/dist/utils/google-search.js +139 -0
- package/dist/utils/llm.d.ts +8 -0
- package/dist/utils/llm.js +25 -0
- package/package.json +42 -0
- package/src/core/aggregators/yc.ts +415 -0
- package/src/core/browser.ts +239 -0
- package/src/core/detect-provider.ts +162 -0
- package/src/core/discover-careers.ts +117 -0
- package/src/core/extract-jobs.ts +50 -0
- package/src/core/fetch-page.ts +41 -0
- package/src/core/format-output.ts +80 -0
- package/src/core/match-jobs.ts +56 -0
- package/src/core/providers/ashby.ts +84 -0
- package/src/core/providers/generic.ts +332 -0
- package/src/core/providers/greenhouse.ts +74 -0
- package/src/core/providers/lever.ts +90 -0
- package/src/core/resolve-iframe.ts +59 -0
- package/src/core/save-raw.ts +18 -0
- package/src/data/companies.ts +2859 -0
- package/src/entrypoints/cli/app.tsx +173 -0
- package/src/entrypoints/cli/components/crawl-view.tsx +163 -0
- package/src/entrypoints/cli/components/discover-view.tsx +138 -0
- package/src/entrypoints/cli/crawl-aggregators.ts +112 -0
- package/src/entrypoints/cli/crawl-url.ts +87 -0
- package/src/entrypoints/cli/crawl.ts +163 -0
- package/src/entrypoints/cli/discover.ts +96 -0
- package/src/entrypoints/cli/index.ts +252 -0
- package/src/entrypoints/cli/init.ts +117 -0
- package/src/entrypoints/cli/plain.ts +104 -0
- package/src/events.ts +79 -0
- package/src/orchestrators/crawl-all.ts +96 -0
- package/src/orchestrators/discover-all.ts +61 -0
- package/src/threads/pool.ts +29 -0
- package/src/threads/process-url.ts +312 -0
- package/src/types/index.ts +110 -0
- package/src/utils/config.ts +79 -0
- package/src/utils/google-search.ts +155 -0
- package/src/utils/llm.ts +33 -0
- package/test/integration/process-url.test.ts +301 -0
- package/test/integration/providers/ashby.test.ts +163 -0
- package/test/integration/providers/greenhouse.test.ts +191 -0
- package/test/integration/providers/lever.test.ts +188 -0
- package/test/unit/config.test.ts +64 -0
- package/test/unit/detect-provider.test.ts +165 -0
- package/test/unit/events.test.ts +104 -0
- package/test/unit/format-output.test.ts +165 -0
- package/test/unit/match-jobs.test.ts +257 -0
- package/test/unit/pool.test.ts +74 -0
- package/test/unit/providers/generic.test.ts +139 -0
- package/test/unit/resolve-iframe.test.ts +100 -0
- package/tsconfig.json +19 -0
- package/vitest.config.ts +7 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import type { ProviderDetection } from "../types/index.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Identify which ATS (applicant tracking system) serves a careers page.
 *
 * The page URL is inspected first, because a direct ATS board URL is the
 * cheapest signal. Only when the URL is inconclusive is the HTML scanned for
 * embedded iframes, embed scripts and DOM signatures.
 *
 * @param html - Markup of the page, used only when the URL does not identify a provider.
 * @param url - Absolute URL of the page.
 * @returns The detected provider and its board token, or provider "unknown"
 *   with a null token when neither the URL nor the HTML matches.
 */
export function detectProvider(html: string, url: string): ProviderDetection {
  const fromUrl = detectFromUrl(url);
  return fromUrl.provider === "unknown" ? detectFromHtml(html) : fromUrl;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Detect an ATS provider from the page URL alone.
 *
 * Recognised shapes:
 * - Greenhouse: boards.greenhouse.io/{token} or job-boards.greenhouse.io/{token}
 * - Lever:      jobs.lever.co/{company}
 * - Ashby:      jobs.ashbyhq.com/{company}
 * - Workday:    {tenant}.wd{n}.myworkdayjobs.com
 * - BambooHR:   {company}.bamboohr.com
 *
 * @param url - Absolute URL; `new URL` throws a TypeError if it cannot be parsed.
 * @returns The provider and board token, or provider "unknown" with a null token.
 */
function detectFromUrl(url: string): ProviderDetection {
  const { hostname, pathname } = new URL(url);

  // First non-empty path segment, e.g. "/acme/jobs/1" -> "acme".
  const firstPathSegment = pathname.split("/").filter(Boolean)[0];
  // Left-most DNS label, e.g. "acme.wd5.myworkdayjobs.com" -> "acme".
  const firstHostLabel = hostname.split(".")[0];

  if (
    hostname === "boards.greenhouse.io" ||
    hostname === "job-boards.greenhouse.io"
  ) {
    if (firstPathSegment) {
      return { provider: "greenhouse", boardToken: firstPathSegment };
    }
  }

  if (hostname === "jobs.lever.co" && firstPathSegment) {
    return { provider: "lever", boardToken: firstPathSegment };
  }

  if (hostname === "jobs.ashbyhq.com" && firstPathSegment) {
    return { provider: "ashby", boardToken: firstPathSegment };
  }

  // Suffix match rather than substring match: a substring test would also
  // accept look-alike hosts such as "acme.myworkdayjobs.com.evil.example".
  if (hostname.endsWith(".myworkdayjobs.com") && firstHostLabel) {
    return { provider: "workday", boardToken: firstHostLabel };
  }

  if (hostname.endsWith(".bamboohr.com") && firstHostLabel) {
    return { provider: "bamboohr", boardToken: firstHostLabel };
  }

  return { provider: "unknown", boardToken: null };
}
|
|
55
|
+
|
|
56
|
+
/**
 * Detect an embedded ATS from page markup (used for careers pages hosted on a
 * company's own domain).
 *
 * Signatures are checked in a fixed order and the first one that yields a
 * board token wins: Greenhouse (iframe, API script, `grnhse_app` container),
 * Lever (iframe, `data-lever-company`), Ashby (iframe, embed script),
 * Workday (iframe), BambooHR (iframe).
 *
 * @param html - Raw page markup.
 * @returns The provider and board token, or provider "unknown" with a null token.
 */
function detectFromHtml(html: string): ProviderDetection {
  // First capture group of `pattern` in the page, or null when it does not match.
  const capture = (pattern: RegExp): string | null =>
    html.match(pattern)?.[1] ?? null;

  // --- Greenhouse -----------------------------------------------------------
  const greenhouseIframeSrc = capture(
    /iframe[^>]+src=["']([^"']*(?:boards|job-boards)\.greenhouse\.io[^"']*)/i
  );
  if (greenhouseIframeSrc) {
    const token = extractGreenhouseIframeToken(greenhouseIframeSrc);
    if (token) return { provider: "greenhouse", boardToken: token };
  }

  const greenhouseApiToken = capture(
    /src=["'][^"']*api\.greenhouse\.io\/v1\/boards\/([^/"']+)/i
  );
  if (greenhouseApiToken) {
    return { provider: "greenhouse", boardToken: greenhouseApiToken };
  }

  if (html.includes('id="grnhse_app"') || html.includes("id='grnhse_app'")) {
    // The container alone carries no token; look for it in the embed settings
    // or in the embed script URL.
    const settingsToken = capture(
      /Grnhse\.Settings\.board_token\s*=\s*["']([^"']+)/
    );
    if (settingsToken) {
      return { provider: "greenhouse", boardToken: settingsToken };
    }
    const scriptToken = capture(
      /src=["'][^"']*(?:boards|job-boards)\.greenhouse\.io\/embed\/job_board\/js\?for=([^"'&]+)/i
    );
    if (scriptToken) {
      return { provider: "greenhouse", boardToken: scriptToken };
    }
  }

  // --- Lever ----------------------------------------------------------------
  const leverIframeSrc = capture(
    /iframe[^>]+src=["']([^"']*jobs\.lever\.co[^"']*)/i
  );
  if (leverIframeSrc) {
    const token = extractTokenFromUrl(leverIframeSrc, "jobs.lever.co");
    if (token) return { provider: "lever", boardToken: token };
  }

  if (html.includes("data-lever-")) {
    const leverCompany = capture(/data-lever-company=["']([^"']+)/i);
    if (leverCompany) return { provider: "lever", boardToken: leverCompany };
  }

  // --- Ashby ----------------------------------------------------------------
  const ashbyIframeSrc = capture(
    /iframe[^>]+src=["']([^"']*jobs\.ashbyhq\.com[^"']*)/i
  );
  if (ashbyIframeSrc) {
    const token = extractTokenFromUrl(ashbyIframeSrc, "jobs.ashbyhq.com");
    if (token) return { provider: "ashby", boardToken: token };
  }

  const ashbyBoardId = capture(
    /src=["'][^"']*ashbyhq\.com\/[^"']*["'][^>]*data-ashby-job-board-id=["']([^"']+)/i
  );
  if (ashbyBoardId) return { provider: "ashby", boardToken: ashbyBoardId };

  // --- Workday --------------------------------------------------------------
  const workdayIframeSrc = capture(
    /iframe[^>]+src=["']([^"']*\.myworkdayjobs\.com[^"']*)/i
  );
  if (workdayIframeSrc) {
    const tenant = workdayIframeSrc.match(
      /\/\/([^.]+)\.wd\d+\.myworkdayjobs/
    )?.[1];
    if (tenant) return { provider: "workday", boardToken: tenant };
  }

  // --- BambooHR -------------------------------------------------------------
  const bambooIframeSrc = capture(
    /iframe[^>]+src=["']([^"']*\.bamboohr\.com[^"']*)/i
  );
  if (bambooIframeSrc) {
    const company = bambooIframeSrc.match(/\/\/([^.]+)\.bamboohr\.com/)?.[1];
    if (company) return { provider: "bamboohr", boardToken: company };
  }

  return { provider: "unknown", boardToken: null };
}

/**
 * Parse an iframe `src`, accepting protocol-relative values ("//host/path").
 *
 * @returns The parsed URL, or null when the value is not a valid absolute URL.
 */
function parseIframeSrc(iframeSrc: string): URL | null {
  try {
    return new URL(
      iframeSrc.startsWith("//") ? `https:${iframeSrc}` : iframeSrc
    );
  } catch {
    // Relative or malformed src: nothing to extract.
    return null;
  }
}

/**
 * Extract the board token from a Greenhouse iframe URL.
 *
 * Handles both board hosts matched by the iframe pattern, and the embed form
 * `/embed/job_board?for={token}` where the token is a query parameter rather
 * than the first path segment.
 */
function extractGreenhouseIframeToken(iframeSrc: string): string | null {
  const url = parseIframeSrc(iframeSrc);
  if (!url) return null;
  if (
    url.hostname !== "boards.greenhouse.io" &&
    url.hostname !== "job-boards.greenhouse.io"
  ) {
    return null;
  }

  const forParam = url.searchParams.get("for");
  if (forParam) return forParam;

  const firstSegment = url.pathname.split("/").filter(Boolean)[0];
  // "embed" is the embed endpoint prefix, never a board token.
  return firstSegment && firstSegment !== "embed" ? firstSegment : null;
}

/**
 * Extract the first path segment of an iframe URL, provided the URL points at
 * the expected host.
 *
 * @param iframeSrc - Value of the iframe `src` attribute (may be protocol-relative).
 * @param hostname - Exact hostname the URL must have.
 * @returns The first non-empty path segment, or null if the host differs, the
 *   path is empty, or the URL is invalid.
 */
function extractTokenFromUrl(
  iframeSrc: string,
  hostname: string
): string | null {
  const url = parseIframeSrc(iframeSrc);
  if (!url || url.hostname !== hostname) return null;
  const parts = url.pathname.split("/").filter(Boolean);
  return parts[0] || null;
}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import {
|
|
2
|
+
webSearch,
|
|
3
|
+
guessCareerUrls,
|
|
4
|
+
type SearchResult,
|
|
5
|
+
} from "../utils/google-search.js";
|
|
6
|
+
import { askClaude } from "../utils/llm.js";
|
|
7
|
+
import { probePage } from "./fetch-page.js";
|
|
8
|
+
|
|
9
|
+
/** Outcome of a career-page discovery attempt for a single company. */
export interface DiscoverResult {
  /** Company name the discovery was run for. */
  company: string;
  /** Career page URL that was found, or null when nothing was found. */
  url: string | null;
  /** Whether the URL has been verified as a career page. */
  verified: boolean;
  /** Failure description, or null when a URL was found. */
  error: string | null;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Find the career page URL for a company.
 *
 * Two strategies are tried in order:
 *  1. A web search, with the LLM choosing the best URL among the results.
 *  2. Heuristic URL guessing via `guessCareerUrls`; the first guess is used.
 *
 * The function never rejects: any error raised along the way is reported
 * through the `error` field. The result is always marked `verified: false`;
 * use `verifyCareerPage` to confirm a URL.
 *
 * @param company - Company name to search for.
 * @returns The discovered URL, or a null URL with an error message.
 */
export async function discoverCareerPage(
  company: string
): Promise<DiscoverResult> {
  // Every outcome shares the same shape; only url and error vary.
  const outcome = (
    url: string | null,
    error: string | null
  ): DiscoverResult => ({ company, url, verified: false, error });

  try {
    // Strategy 1: web search + LLM pick
    const hits = await webSearch(`"${company}" careers jobs`);
    if (hits.length > 0) {
      const picked = await pickCareerUrl(company, hits);
      if (picked) return outcome(picked, null);
    }

    // Strategy 2: heuristic guesses
    const candidates = await guessCareerUrls(company);
    if (candidates.length > 0) return outcome(candidates[0].url, null);

    return outcome(null, "Could not find career page");
  } catch (failure) {
    const message =
      failure instanceof Error ? failure.message : String(failure);
    return outcome(null, message);
  }
}
|
|
63
|
+
|
|
64
|
+
/**
 * Check whether a URL looks like a real career page.
 *
 * The page is fetched with `probePage` and its lower-cased HTML is scanned for
 * hiring-related phrases and ATS host names. The page passes when at least two
 * distinct signals occur.
 *
 * @param url - Candidate career page URL.
 * @returns True when two or more signals are found; false otherwise,
 *   including when the fetch fails.
 */
export async function verifyCareerPage(url: string): Promise<boolean> {
  const requiredSignals = 2;
  const signals = [
    "careers",
    "jobs",
    "openings",
    "positions",
    "join our team",
    "join us",
    "we're hiring",
    "open roles",
    "apply now",
    "job-boards.greenhouse.io",
    "boards.greenhouse.io",
    "jobs.lever.co",
    "jobs.ashbyhq.com",
    "grnhse_app",
  ];

  try {
    const page = await probePage(url);
    const haystack = page.html.toLowerCase();

    let found = 0;
    for (const signal of signals) {
      if (!haystack.includes(signal)) continue;
      found += 1;
      // Stop as soon as the threshold is reached.
      if (found >= requiredSignals) return true;
    }
    return false;
  } catch {
    // An unreachable page cannot be verified.
    return false;
  }
}
|
|
95
|
+
|
|
96
|
+
/**
 * Ask the LLM to choose the official careers page among search results.
 *
 * The reply is expected to be a bare URL or the word "null", but models
 * sometimes wrap the URL in prose or markdown. The first http(s) URL in the
 * reply is therefore extracted, stripped of trailing punctuation that belongs
 * to the surrounding text, and accepted only if it parses as a URL.
 *
 * @param company - Company name, interpolated into the prompt.
 * @param results - Search results offered to the model.
 * @returns The chosen URL, or null when the reply contains no usable URL.
 */
async function pickCareerUrl(
  company: string,
  results: SearchResult[]
): Promise<string | null> {
  const formatted = results
    .map((r, i) => `${i + 1}. ${r.title}\n URL: ${r.url}\n ${r.snippet}`)
    .join("\n\n");

  const prompt = `Given these search results for "${company} careers", return ONLY the URL of their official careers/jobs page. Not individual job postings or third-party job boards (Indeed, LinkedIn, Glassdoor).

Search results:
${formatted}

Respond with ONLY the URL, nothing else. If you can't determine it, respond with "null".`;

  const response = await askClaude(prompt);

  const urlMatch = response.match(/https?:\/\/[^\s"'<>]+/);
  if (!urlMatch) return null;

  const candidate = stripTrailingPunctuation(urlMatch[0]);
  try {
    // Validation only; the string is returned exactly as the model wrote it.
    new URL(candidate);
  } catch {
    return null;
  }
  return candidate;
}

/**
 * Remove sentence or markdown punctuation that trails a URL found in free
 * text, e.g. "https://x.com/careers." or "(https://x.com/careers)".
 * A closing bracket is kept when the URL contains its opening counterpart.
 */
function stripTrailingPunctuation(url: string): string {
  let end = url.length;
  while (end > 0) {
    const last = url.charAt(end - 1);
    const head = url.slice(0, end - 1);
    const isSentencePunctuation = ".,;:!?*`".includes(last);
    const isUnbalancedCloser =
      (last === ")" && !head.includes("(")) ||
      (last === "]" && !head.includes("["));
    if (!isSentencePunctuation && !isUnbalancedCloser) break;
    end -= 1;
  }
  return url.slice(0, end);
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import type { Job, Provider, Aggregator, SearchCriteria } from "../types/index.js";
|
|
2
|
+
import { extractGreenhouseJobs } from "./providers/greenhouse.js";
|
|
3
|
+
import { extractAshbyJobs } from "./providers/ashby.js";
|
|
4
|
+
import { extractLeverJobs } from "./providers/lever.js";
|
|
5
|
+
import { extractYcJobs } from "./aggregators/yc.js";
|
|
6
|
+
import { extractGenericJobs } from "./providers/generic.js";
|
|
7
|
+
|
|
8
|
+
/**
 * Fetch jobs through a provider's JSON API, for use once the provider and its
 * board token are known.
 *
 * The search criteria are handed to the provider extractor unchanged so that
 * it may filter on the server side.
 *
 * @param provider - ATS provider to query.
 * @param boardToken - Board token / company slug for that provider.
 * @param sourceUrl - Page the board was discovered on.
 * @param criteria - Search criteria forwarded to the extractor.
 * @param saveRaw - Forwarded to the extractor; defaults to false.
 * @returns The jobs returned by the provider extractor.
 * @throws Error when no API extractor exists for `provider`.
 */
export async function extractViaApi(
  provider: Provider,
  boardToken: string,
  sourceUrl: string,
  criteria: SearchCriteria,
  saveRaw = false
): Promise<Job[]> {
  if (provider === "greenhouse") {
    return extractGreenhouseJobs(boardToken, sourceUrl, criteria, saveRaw);
  }
  if (provider === "ashby") {
    return extractAshbyJobs(boardToken, sourceUrl, criteria, saveRaw);
  }
  if (provider === "lever") {
    return extractLeverJobs(boardToken, sourceUrl, criteria, saveRaw);
  }
  throw new Error(`No API extractor for provider "${provider}"`);
}
|
|
30
|
+
|
|
31
|
+
/**
 * Fetch jobs from an aggregator, i.e. a source that lists openings across
 * many companies.
 *
 * @param aggregator - Aggregator to query.
 * @param criteria - Search criteria forwarded to the aggregator extractor.
 * @param saveRaw - Forwarded to the extractor; defaults to false.
 * @returns The jobs returned by the aggregator extractor.
 * @throws Error when no extractor exists for `aggregator`. Without this, an
 *   unexpected runtime value would resolve to `undefined` despite the
 *   `Job[]` return type.
 */
export async function extractViaAggregator(
  aggregator: Aggregator,
  criteria: SearchCriteria,
  saveRaw = false
): Promise<Job[]> {
  switch (aggregator) {
    case "yc":
      return extractYcJobs("https://workatastartup.com", criteria, saveRaw);
    default:
      throw new Error(`No extractor for aggregator "${String(aggregator)}"`);
  }
}
|
|
44
|
+
|
|
45
|
+
/**
 * Pull jobs out of rendered HTML, for pages where no ATS API is available.
 * Delegates to the generic HTML extractor.
 *
 * @param html - Rendered page markup.
 * @param sourceUrl - URL the markup was fetched from.
 * @returns The jobs found by the generic extractor.
 */
export function extractFromHtml(html: string, sourceUrl: string): Job[] {
  const jobs = extractGenericJobs(html, sourceUrl);
  return jobs;
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { BrowserSession, BrowserNotAvailableError } from "./browser.js";
|
|
2
|
+
import type { FetchResult } from "../types/index.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Lightweight page fetch: one plain HTTP GET, no browser and no JavaScript
 * execution. Sufficient for spotting ATS signals in static HTML.
 *
 * Redirects are followed and a desktop Chrome User-Agent is sent. The HTTP
 * status is not inspected, so the body of an error page is returned as-is.
 *
 * @param url - Page to fetch.
 * @returns The response body and the URL reached after redirects.
 */
export async function probePage(url: string): Promise<FetchResult> {
  const browserLikeHeaders = {
    "User-Agent":
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  };

  const reply = await fetch(url, {
    headers: browserLikeHeaders,
    redirect: "follow",
  });

  const body = await reply.text();
  return { html: body, finalUrl: reply.url };
}
|
|
19
|
+
|
|
20
|
+
/**
 * Fetch a page with full JavaScript rendering through a browser session.
 *
 * When the browser reports that it is not available
 * (`BrowserNotAvailableError`), the plain HTTP probe is used instead. Any
 * other error is rethrown. The session is closed in every case.
 *
 * @param url - Page to render.
 * @returns The rendered HTML and the session's final URL, or the probe result
 *   when falling back.
 */
export async function renderPage(url: string): Promise<FetchResult> {
  const browser = new BrowserSession();
  try {
    await browser.open(url);
    const [html, finalUrl] = await Promise.all([
      browser.getHtml(),
      browser.getUrl(),
    ]);
    return { html, finalUrl };
  } catch (failure) {
    // Only a missing browser triggers the fallback; real failures propagate.
    if (!(failure instanceof BrowserNotAvailableError)) throw failure;
    return probePage(url);
  } finally {
    await browser.close();
  }
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import type { Job, OutputFormat } from "../types/index.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Render a list of jobs in the requested output format.
 *
 * @param jobs - Jobs to render.
 * @param format - One of "json", "table", "csv" or "markdown".
 * @returns The rendered text. JSON output is pretty-printed with a two-space
 *   indent; the other formats are produced by their dedicated formatter.
 */
export function formatOutput(jobs: Job[], format: OutputFormat): string {
  switch (format) {
    case "json": {
      const indent = 2;
      return JSON.stringify(jobs, null, indent);
    }
    case "table": {
      return formatTable(jobs);
    }
    case "csv": {
      return formatCsv(jobs);
    }
    case "markdown": {
      return formatMarkdown(jobs);
    }
  }
}
|
|
18
|
+
|
|
19
|
+
/**
 * Render jobs as a fixed-width, pipe-separated text table.
 *
 * Each column is as wide as its longest value (header included), capped at
 * 50 characters; longer cells are truncated to the column width. Missing
 * location or department values are shown as "-".
 *
 * @param jobs - Jobs to render.
 * @returns Header line, dashed separator and one line per job, or
 *   "No matching jobs found." for an empty list.
 */
function formatTable(jobs: Job[]): string {
  if (jobs.length === 0) return "No matching jobs found.";

  const maxColumnWidth = 50;
  const columnTitles = ["Company", "Title", "Location", "Department", "URL"];
  const records = jobs.map((job) => [
    job.company,
    job.title,
    job.location ?? "-",
    job.department ?? "-",
    job.url,
  ]);

  // Widest entry per column, header included, capped at the maximum.
  const columnWidths = columnTitles.map((title, col) => {
    let widest = title.length;
    for (const record of records) {
      widest = Math.max(widest, record[col].length);
    }
    return Math.min(maxColumnWidth, widest);
  });

  // Truncate and pad every cell to its column width.
  const renderLine = (cells: string[]): string =>
    cells
      .map((cell, col) =>
        cell.slice(0, columnWidths[col]).padEnd(columnWidths[col])
      )
      .join(" | ");

  const rule = columnWidths.map((width) => "-".repeat(width)).join(" | ");

  const output = [renderLine(columnTitles), rule];
  for (const record of records) output.push(renderLine(record));
  return output.join("\n");
}
|
|
44
|
+
|
|
45
|
+
/**
 * Render jobs as CSV with a header row.
 *
 * Columns: company, title, location, department, workMode, url, postedAt.
 * Null values become empty fields and every field goes through `csvEscape`.
 *
 * @param jobs - Jobs to render.
 * @returns The header line followed by one line per job, joined with "\n".
 *   An empty list yields the header line only.
 */
function formatCsv(jobs: Job[]): string {
  const columnNames = [
    "company",
    "title",
    "location",
    "department",
    "workMode",
    "url",
    "postedAt",
  ];

  const output = [columnNames.join(",")];
  for (const job of jobs) {
    const fields = [
      job.company,
      job.title,
      job.location ?? "",
      job.department ?? "",
      job.workMode ?? "",
      job.url,
      job.postedAt ?? "",
    ];
    output.push(fields.map((field) => csvEscape(field)).join(","));
  }
  return output.join("\n");
}
|
|
60
|
+
|
|
61
|
+
/**
 * Escape a single CSV field.
 *
 * A field is wrapped in double quotes when it contains a comma, a double
 * quote, or a line break (LF or CR); embedded double quotes are doubled.
 * A bare carriage return also requires quoting, otherwise readers may treat
 * it as a record separator.
 *
 * @param value - Raw field value.
 * @returns The value, quoted and escaped if necessary.
 */
function csvEscape(value: string): string {
  if (/[",\r\n]/.test(value)) {
    return `"${value.replace(/"/g, '""')}"`;
  }
  return value;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Render jobs as a Markdown table with columns Company, Title (linked to the
 * job URL), Location and Department.
 *
 * Cell text is sanitised so that it cannot break the table: pipe characters
 * are backslash-escaped and line breaks are replaced by spaces. Pipes in the
 * URL are percent-encoded. Missing location or department values are shown
 * as "-".
 *
 * @param jobs - Jobs to render.
 * @returns The Markdown table, or "No matching jobs found." for an empty list.
 */
function formatMarkdown(jobs: Job[]): string {
  if (jobs.length === 0) return "No matching jobs found.";

  // A raw "|" ends the cell and a raw newline ends the row.
  const cell = (text: string): string =>
    text.replace(/\r\n|\r|\n/g, " ").replace(/\|/g, "\\|");
  const link = (url: string): string => url.replace(/\|/g, "%7C");

  const lines = [
    "| Company | Title | Location | Department |",
    "| --- | --- | --- | --- |",
    ...jobs.map(
      (j) =>
        `| ${cell(j.company)} | [${cell(j.title)}](${link(j.url)}) | ${cell(j.location ?? "-")} | ${cell(j.department ?? "-")} |`
    ),
  ];
  return lines.join("\n");
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type { Job, SearchCriteria } from "../types/index.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Filter jobs against search criteria using case-insensitive substring
 * matching.
 *
 * Rules, all of which must pass:
 * - keywords: at least one must occur in the title (skipped when none given).
 * - excludeKeywords: none may occur in the title.
 * - location: must occur in the job location.
 * - workMode: the job's work mode must be in the allowed list.
 * - departments: at least one must occur in the job department.
 *
 * A job whose location, work mode or department is missing is never excluded
 * by the corresponding rule. Blank criteria terms (empty or whitespace-only)
 * are ignored: a blank term is a substring of every title, so a blank exclude
 * keyword would otherwise drop every job.
 *
 * @param jobs - Jobs to filter.
 * @param criteria - Search criteria.
 * @returns The jobs that satisfy every rule, in their original order.
 */
export function matchJobs(jobs: Job[], criteria: SearchCriteria): Job[] {
  // Lower-case once up front and drop blank terms.
  const normalize = (
    terms: readonly string[] | null | undefined
  ): string[] =>
    (terms ?? [])
      .map((term) => term.toLowerCase())
      .filter((term) => term.trim().length > 0);

  const keywords = normalize(criteria.keywords);
  const excluded = normalize(criteria.excludeKeywords);
  const departments = normalize(criteria.departments);
  const wantedLocation = criteria.location
    ? criteria.location.toLowerCase()
    : null;
  const allowedModes = criteria.workMode ?? [];
  const needsTitle = keywords.length > 0 || excluded.length > 0;

  return jobs.filter((job) => {
    if (needsTitle) {
      const title = job.title.toLowerCase();
      if (keywords.length > 0 && !keywords.some((kw) => title.includes(kw))) {
        return false;
      }
      if (excluded.some((kw) => title.includes(kw))) return false;
    }

    if (
      wantedLocation &&
      job.location &&
      !job.location.toLowerCase().includes(wantedLocation)
    ) {
      return false;
    }

    if (
      allowedModes.length > 0 &&
      job.workMode &&
      !allowedModes.includes(job.workMode)
    ) {
      return false;
    }

    if (departments.length > 0 && job.department) {
      const dept = job.department.toLowerCase();
      if (!departments.some((d) => dept.includes(d))) return false;
    }

    return true;
  });
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import type { Job, SearchCriteria } from "../../types/index.js";
|
|
3
|
+
import { saveRawResponse } from "../save-raw.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Shape of one job entry in the Ashby posting API response, limited to the
 * fields this module declares. The shape is assumed, not validated at runtime.
 */
interface AshbyJob {
  id: string;
  /** Job title, copied to `Job.title`. */
  title: string;
  /** Preferred source for `Job.department`. */
  department: string | null;
  /** Used for `Job.department` when `department` is missing. */
  team: string | null;
  employmentType: string | null;
  /** Copied to `Job.location`; also scanned when inferring the work mode. */
  location: string | null;
  /** When true, the job is treated as remote regardless of other fields. */
  isRemote: boolean | null;
  /** Free-text workplace type, scanned for "remote", "hybrid", "onsite"/"on-site". */
  workplaceType: string | null;
  /** Public posting URL, copied to `Job.url`. */
  jobUrl: string;
  /** Copied to `Job.postedAt`. */
  publishedAt: string | null;
  /** Plain-text description; only its first 200 characters are kept. */
  descriptionPlain?: string;
}
|
|
18
|
+
|
|
19
|
+
/** Top-level payload expected from the Ashby job-board endpoint. */
interface AshbyResponse {
  /** All postings of the board. */
  jobs: AshbyJob[];
}
|
|
22
|
+
|
|
23
|
+
/**
 * Fetch all jobs of a board from Ashby's public posting API and convert them
 * to the common `Job` shape.
 *
 * The request is sent without credentials and the endpoint returns the whole
 * board in one response. Filtering is not done here; `_criteria` is accepted
 * only to keep the signature aligned with the other provider extractors.
 *
 * @param boardToken - Ashby board identifier; also used as the company name.
 * @param sourceUrl - Page the board was discovered on, stored on every job.
 * @param _criteria - Unused.
 * @param saveRaw - When true, the raw payload is stored via `saveRawResponse`.
 * @returns One `Job` per posting.
 * @throws Error when the API responds with a non-2xx status, or when the
 *   payload does not contain a `jobs` array.
 */
export async function extractAshbyJobs(
  boardToken: string,
  sourceUrl: string,
  _criteria: SearchCriteria,
  saveRaw = false
): Promise<Job[]> {
  const apiUrl = `https://api.ashbyhq.com/posting-api/job-board/${boardToken}`;
  const response = await fetch(apiUrl);

  if (!response.ok) {
    throw new Error(
      `Ashby API returned ${response.status} for board "${boardToken}"`
    );
  }

  // The cast is unchecked; the shape is verified below before use.
  const data = (await response.json()) as AshbyResponse;

  // Saved before validation so that an unexpected payload can be inspected.
  if (saveRaw) await saveRawResponse("ashby", boardToken, data);

  if (!Array.isArray(data?.jobs)) {
    throw new Error(
      `Ashby API returned an unexpected payload for board "${boardToken}" (missing "jobs" array)`
    );
  }

  return data.jobs.map((aj): Job => {
    // Short, stable identifier derived from URL, title and board.
    const id = createHash("sha256")
      .update(`${aj.jobUrl}:${aj.title}:${boardToken}`)
      .digest("hex")
      .slice(0, 12);

    return {
      id,
      title: aj.title,
      company: boardToken,
      location: aj.location ?? null,
      workMode: inferWorkMode(aj),
      department: aj.department ?? aj.team ?? null,
      url: aj.jobUrl,
      sourceUrl,
      provider: "ashby",
      description: aj.descriptionPlain?.slice(0, 200) ?? null,
      postedAt: aj.publishedAt ?? null,
      extractedAt: new Date().toISOString(),
      raw: aj as unknown as Record<string, unknown>,
    };
  });
}
|
|
69
|
+
|
|
70
|
+
/**
 * Derive the work mode of an Ashby posting.
 *
 * Signals are consulted in order of precedence:
 *  1. `isRemote === true` means "remote".
 *  2. `workplaceType` containing "remote", "hybrid", or "onsite"/"on-site".
 *  3. `location` containing "remote" or "hybrid".
 * Matching is case-insensitive; a field that matches nothing falls through to
 * the next signal.
 *
 * @param job - Ashby posting.
 * @returns The inferred mode, or null when no signal matches.
 */
function inferWorkMode(job: AshbyJob): "remote" | "onsite" | "hybrid" | null {
  type Mode = "remote" | "onsite" | "hybrid";

  // First rule whose keyword occurs in the text decides the mode.
  const classify = (
    text: string,
    rules: ReadonlyArray<readonly [string, Mode]>
  ): Mode | null => {
    for (const [keyword, mode] of rules) {
      if (text.includes(keyword)) return mode;
    }
    return null;
  };

  if (job.isRemote === true) return "remote";

  const workplace = job.workplaceType ? job.workplaceType.toLowerCase() : "";
  const fromWorkplace = classify(workplace, [
    ["remote", "remote"],
    ["hybrid", "hybrid"],
    ["onsite", "onsite"],
    ["on-site", "onsite"],
  ]);
  if (fromWorkplace) return fromWorkplace;

  const place = job.location ? job.location.toLowerCase() : "";
  return classify(place, [
    ["remote", "remote"],
    ["hybrid", "hybrid"],
  ]);
}
|