jobcrawl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc.json +10 -0
- package/CHANGELOG.md +40 -0
- package/README.md +232 -0
- package/dist/core/aggregators/yc.d.ts +7 -0
- package/dist/core/aggregators/yc.js +320 -0
- package/dist/core/browser.d.ts +30 -0
- package/dist/core/browser.js +196 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +41 -0
- package/dist/core/detect-provider.d.ts +7 -0
- package/dist/core/detect-provider.js +125 -0
- package/dist/core/discover-careers.d.ts +18 -0
- package/dist/core/discover-careers.js +92 -0
- package/dist/core/extract-jobs.d.ts +14 -0
- package/dist/core/extract-jobs.js +36 -0
- package/dist/core/fetch-page.d.ts +11 -0
- package/dist/core/fetch-page.js +39 -0
- package/dist/core/format-output.d.ts +2 -0
- package/dist/core/format-output.js +59 -0
- package/dist/core/match-jobs.d.ts +6 -0
- package/dist/core/match-jobs.js +43 -0
- package/dist/core/providers/ashby.d.ts +6 -0
- package/dist/core/providers/ashby.js +58 -0
- package/dist/core/providers/generic.d.ts +6 -0
- package/dist/core/providers/generic.js +294 -0
- package/dist/core/providers/greenhouse.d.ts +6 -0
- package/dist/core/providers/greenhouse.js +47 -0
- package/dist/core/providers/lever.d.ts +7 -0
- package/dist/core/providers/lever.js +60 -0
- package/dist/core/providers/yc.d.ts +7 -0
- package/dist/core/providers/yc.js +320 -0
- package/dist/core/resolve-iframe.d.ts +6 -0
- package/dist/core/resolve-iframe.js +51 -0
- package/dist/core/save-raw.d.ts +4 -0
- package/dist/core/save-raw.js +13 -0
- package/dist/data/companies.d.ts +9 -0
- package/dist/data/companies.js +2849 -0
- package/dist/entrypoints/cli/app.d.ts +3 -0
- package/dist/entrypoints/cli/app.js +91 -0
- package/dist/entrypoints/cli/components/crawl-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/crawl-view.js +94 -0
- package/dist/entrypoints/cli/components/discover-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/discover-view.js +67 -0
- package/dist/entrypoints/cli/crawl-aggregators.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-aggregators.js +76 -0
- package/dist/entrypoints/cli/crawl-url.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-url.js +54 -0
- package/dist/entrypoints/cli/crawl.d.ts +32 -0
- package/dist/entrypoints/cli/crawl.js +108 -0
- package/dist/entrypoints/cli/discover.d.ts +10 -0
- package/dist/entrypoints/cli/discover.js +69 -0
- package/dist/entrypoints/cli/index.d.ts +2 -0
- package/dist/entrypoints/cli/index.js +197 -0
- package/dist/entrypoints/cli/init.d.ts +9 -0
- package/dist/entrypoints/cli/init.js +94 -0
- package/dist/entrypoints/cli/plain.d.ts +6 -0
- package/dist/entrypoints/cli/plain.js +77 -0
- package/dist/events.d.ts +114 -0
- package/dist/events.js +17 -0
- package/dist/orchestrators/crawl-all.d.ts +2 -0
- package/dist/orchestrators/crawl-all.js +66 -0
- package/dist/orchestrators/discover-all.d.ts +10 -0
- package/dist/orchestrators/discover-all.js +39 -0
- package/dist/threads/pool.d.ts +5 -0
- package/dist/threads/pool.js +23 -0
- package/dist/threads/process-url.d.ts +9 -0
- package/dist/threads/process-url.js +229 -0
- package/dist/types/index.d.ts +83 -0
- package/dist/types/index.js +6 -0
- package/dist/utils/config.d.ts +17 -0
- package/dist/utils/config.js +57 -0
- package/dist/utils/google-search.d.ts +19 -0
- package/dist/utils/google-search.js +139 -0
- package/dist/utils/llm.d.ts +8 -0
- package/dist/utils/llm.js +25 -0
- package/package.json +42 -0
- package/src/core/aggregators/yc.ts +415 -0
- package/src/core/browser.ts +239 -0
- package/src/core/detect-provider.ts +162 -0
- package/src/core/discover-careers.ts +117 -0
- package/src/core/extract-jobs.ts +50 -0
- package/src/core/fetch-page.ts +41 -0
- package/src/core/format-output.ts +80 -0
- package/src/core/match-jobs.ts +56 -0
- package/src/core/providers/ashby.ts +84 -0
- package/src/core/providers/generic.ts +332 -0
- package/src/core/providers/greenhouse.ts +74 -0
- package/src/core/providers/lever.ts +90 -0
- package/src/core/resolve-iframe.ts +59 -0
- package/src/core/save-raw.ts +18 -0
- package/src/data/companies.ts +2859 -0
- package/src/entrypoints/cli/app.tsx +173 -0
- package/src/entrypoints/cli/components/crawl-view.tsx +163 -0
- package/src/entrypoints/cli/components/discover-view.tsx +138 -0
- package/src/entrypoints/cli/crawl-aggregators.ts +112 -0
- package/src/entrypoints/cli/crawl-url.ts +87 -0
- package/src/entrypoints/cli/crawl.ts +163 -0
- package/src/entrypoints/cli/discover.ts +96 -0
- package/src/entrypoints/cli/index.ts +252 -0
- package/src/entrypoints/cli/init.ts +117 -0
- package/src/entrypoints/cli/plain.ts +104 -0
- package/src/events.ts +79 -0
- package/src/orchestrators/crawl-all.ts +96 -0
- package/src/orchestrators/discover-all.ts +61 -0
- package/src/threads/pool.ts +29 -0
- package/src/threads/process-url.ts +312 -0
- package/src/types/index.ts +110 -0
- package/src/utils/config.ts +79 -0
- package/src/utils/google-search.ts +155 -0
- package/src/utils/llm.ts +33 -0
- package/test/integration/process-url.test.ts +301 -0
- package/test/integration/providers/ashby.test.ts +163 -0
- package/test/integration/providers/greenhouse.test.ts +191 -0
- package/test/integration/providers/lever.test.ts +188 -0
- package/test/unit/config.test.ts +64 -0
- package/test/unit/detect-provider.test.ts +165 -0
- package/test/unit/events.test.ts +104 -0
- package/test/unit/format-output.test.ts +165 -0
- package/test/unit/match-jobs.test.ts +257 -0
- package/test/unit/pool.test.ts +74 -0
- package/test/unit/providers/generic.test.ts +139 -0
- package/test/unit/resolve-iframe.test.ts +100 -0
- package/tsconfig.json +19 -0
- package/vitest.config.ts +7 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import React from "react";
|
|
3
|
+
import { render } from "ink";
|
|
4
|
+
import { Command } from "commander";
|
|
5
|
+
import { bus } from "../../events.js";
|
|
6
|
+
import { mountPlain } from "./plain.js";
|
|
7
|
+
import { App } from "./app.js";
|
|
8
|
+
import { crawlUrlCommand } from "./crawl-url.js";
|
|
9
|
+
import { crawlCommand } from "./crawl.js";
|
|
10
|
+
import { crawlAggregatorsCommand } from "./crawl-aggregators.js";
|
|
11
|
+
import { discoverCommand } from "./discover.js";
|
|
12
|
+
import { initCommand } from "./init.js";
|
|
13
|
+
/**
 * Lifecycle manager. Mounts Ink (TTY) or plain text (piped) subscriber,
 * executes the command, and handles teardown.
 */
function wrap(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
fn) {
    return (...args) => {
        if (!process.stdout.isTTY) {
            // Piped: plain text subscriber renders all bus events.
            const unsubscribe = mountPlain();
            fn(...args)
                .then(() => {
                bus.emit("done", { exitCode: 0 });
            })
                .catch((err) => {
                const message = err instanceof Error ? err.message : String(err);
                bus.emit("error", { message });
                bus.emit("done", { exitCode: 1 });
            })
                .finally(() => {
                unsubscribe();
            });
            return;
        }
        // Interactive: mount Ink (handles run + done + cleanup internally)
        const instance = render(React.createElement(App, { run: () => fn(...args) }));
        const handleDone = () => {
            bus.off("done", handleDone);
            // Brief delay lets Ink paint its final frame before unmounting.
            setTimeout(() => instance.unmount(), 32);
        };
        bus.on("done", handleDone);
    };
}
|
|
50
|
+
/**
 * Attach the shared job-filter options to a commander command.
 * Returns the command so callers can keep chaining.
 */
function addFilterOptions(cmd) {
    const filterOptions = [
        ["--role <roles...>", "Role filter (e.g. engineering, design)"],
        ["--role-type <types...>", "Role sub-type (e.g. backend, frontend)"],
        ["--job-type <types...>", "Job type (e.g. fulltime, internship)"],
        ["--min-experience <years...>", "Min years experience (e.g. 0, 3)"],
        ["--company-stage <stages...>", "Company stage (e.g. seed, growth)"],
        ["--industry <industries...>", "Industry filter"],
        ["--company-size <sizes...>", "Company size (e.g. 1-10, 11-50)"],
        ["--has-salary", "Only jobs with salary listed"],
        ["--has-equity", "Only jobs with equity"],
        ["--has-interview-process", "Only jobs with interview process"],
        ["--visa-sponsorship", "Only jobs not requiring US visa"],
    ];
    let result = cmd;
    for (const [flags, description] of filterOptions) {
        result = result.option(flags, description);
    }
    return result;
}
|
|
64
|
+
// ---------------------------------------------------------------------------
// CLI wiring. Every subcommand action is passed through `wrap`, so its events
// reach either the Ink UI (interactive TTY) or the plain-text subscriber
// (piped output).
// ---------------------------------------------------------------------------
const program = new Command("jobcrawl")
    // Fix: report the actual package version (0.1.0), not a stale "1.0.0".
    // Keep in sync with package.json.
    .version("0.1.0")
    .description("Crawl career pages for jobs matching your search criteria");
// crawl-url: crawl a single career page.
addFilterOptions(program
    .command("crawl-url <url>")
    .description("Crawl a single career page URL and return matching jobs")
    .option("--keywords <terms...>", "Job title keywords to match")
    .option("--exclude <terms...>", "Keywords to exclude")
    .option("--location <location>", "Location filter")
    .option("--remote", "Only remote jobs")
    .option("--onsite", "Only onsite jobs")
    .option("--hybrid", "Only hybrid jobs")
    .option("--department <depts...>", "Department filter")
    .option("--output <format>", "Output format: json, table, markdown, csv", "json")
    .option("-o, --out <file>", "Write output to file")
    .option("--save-raw", "Save raw API responses to ~/.jobcrawl/raw/")).action(wrap(async (url, opts) => {
    await crawlUrlCommand(url, opts);
}));
// crawl: crawl many targets from flags and/or a config file.
addFilterOptions(program
    .command("crawl")
    .description("Crawl multiple career pages and return matching jobs")
    .option("--urls <urls...>", "Career page URLs to crawl")
    .option("--file <path>", "Config file with targets (YAML/JSON)")
    .option("--keywords <terms...>", "Job title keywords to match")
    .option("--exclude <terms...>", "Keywords to exclude")
    .option("--location <location>", "Location filter")
    .option("--remote", "Only remote jobs")
    .option("--onsite", "Only onsite jobs")
    .option("--hybrid", "Only hybrid jobs")
    .option("--department <depts...>", "Department filter")
    .option("--output <format>", "Output format: json, table, markdown, csv", "json")
    .option("-o, --out <file>", "Write output to file")
    .option("--concurrency <n>", "Max concurrent crawls", "5")
    .option("--save-raw", "Save raw API responses to ~/.jobcrawl/raw/")
    .option("--aggregators <names...>", "Also run aggregators (e.g. yc)")
    .option("--network-timeout <ms>", "Timeout for browser network commands (ms)")
    .option("--max-bubble-levels <n>", "Max parent levels to try when clicking job cards")).action(wrap(async (opts) => {
    await crawlCommand(opts);
}));
// crawl-aggregators: aggregator-only crawl.
addFilterOptions(program
    .command("crawl-aggregators <aggregators...>")
    .description("Crawl aggregator sources (e.g. yc) for matching jobs")
    .option("--keywords <terms...>", "Job title keywords to match")
    .option("--exclude <terms...>", "Keywords to exclude")
    .option("--location <location>", "Location filter")
    .option("--remote", "Only remote jobs")
    .option("--onsite", "Only onsite jobs")
    .option("--hybrid", "Only hybrid jobs")
    .option("--department <depts...>", "Department filter")
    .option("--output <format>", "Output format: json, table, markdown, csv", "json")
    .option("-o, --out <file>", "Write output to file")
    .option("--save-raw", "Save raw API responses to ~/.jobcrawl/raw/")).action(wrap(async (aggregators, opts) => {
    await crawlAggregatorsCommand(aggregators, opts);
}));
// discover: company names -> career page URLs.
program
    .command("discover")
    .description("Find career page URLs from company names")
    .option("--companies <names...>", "Company names to search for")
    .option("--file <path>", "File with company names (one per line)")
    .option("--output <format>", "Output format: urls, yaml, json", "urls")
    .option("-o, --out <file>", "Write output to file")
    .option("--verify", "Verify each discovered URL is a real career page")
    .option("--concurrency <n>", "Max concurrent searches", "3")
    .action(wrap(async (opts) => {
    await discoverCommand(opts);
}));
// init: scaffold ~/.jobcrawl/ config + credentials.
program
    .command("init")
    .description("Create config file at ~/.jobcrawl/config.yaml")
    .option("--force", "Overwrite existing config")
    .action(wrap(async (opts) => {
    await initCommand(opts);
}));
// detect: report which ATS provider a page uses. Heavy modules are
// dynamically imported so other commands don't pay their load cost.
program
    .command("detect <url>")
    .description("Detect which ATS provider a career page uses")
    .action(wrap(async (url) => {
    const { probePage } = await import("../../core/fetch-page.js");
    const { detectProvider } = await import("../../core/detect-provider.js");
    const { html, finalUrl } = await probePage(url);
    const result = detectProvider(html, finalUrl);
    bus.emit("output:json", { data: { url, finalUrl, ...result } });
}));
// match: offline filtering of a previously saved jobs JSON file.
addFilterOptions(program
    .command("match <file>")
    .description("Filter a jobs JSON file against search criteria")
    .option("--keywords <terms...>", "Job title keywords to match")
    .option("--exclude <terms...>", "Keywords to exclude")
    .option("--location <location>", "Location filter")
    .option("--remote", "Only remote jobs")
    .option("--onsite", "Only onsite jobs")
    .option("--hybrid", "Only hybrid jobs")
    .option("--department <depts...>", "Department filter")
    .option("--output <format>", "Output format: json, table, markdown, csv", "json")).action(wrap(async (file, opts) => {
    const { readFile } = await import("node:fs/promises");
    const { matchJobs } = await import("../../core/match-jobs.js");
    const { formatOutput } = await import("../../core/format-output.js");
    const jobs = JSON.parse(await readFile(file, "utf-8"));
    // Translate CLI flags into the matcher's criteria shape.
    // `null` consistently means "no filter on this dimension".
    const criteria = {
        keywords: opts.keywords ?? [],
        excludeKeywords: opts.exclude ?? [],
        location: opts.location ?? null,
        workMode: (() => {
            const modes = [
                ...(opts.remote ? ["remote"] : []),
                ...(opts.onsite ? ["onsite"] : []),
                ...(opts.hybrid ? ["hybrid"] : []),
            ];
            return modes.length ? modes : null;
        })(),
        departments: opts.department ?? null,
        role: opts.role ?? null,
        roleType: opts.roleType ?? null,
        jobType: opts.jobType ?? null,
        minExperience: opts.minExperience
            ? opts.minExperience.map((v) => parseInt(v, 10))
            : null,
        companyStage: opts.companyStage ?? null,
        industry: opts.industry ?? null,
        companySize: opts.companySize ?? null,
        hasSalary: opts.hasSalary ?? null,
        hasEquity: opts.hasEquity ?? null,
        hasInterviewProcess: opts.hasInterviewProcess ?? null,
        visaSponsorship: opts.visaSponsorship ?? null,
    };
    const matched = matchJobs(jobs, criteria);
    const output = formatOutput(matched, opts.output ?? "json");
    bus.emit("output:text", { text: output });
}));
// Handle exit: record the exit code instead of calling process.exit(),
// letting the process terminate naturally.
bus.on("done", (p) => {
    process.exitCode = p.exitCode;
});
program.parse();
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
declare const CONFIG_DIR: string;
|
|
2
|
+
declare const CONFIG_FILE: string;
|
|
3
|
+
declare const CREDENTIALS_FILE: string;
|
|
4
|
+
declare const RAW_DIR: string;
|
|
5
|
+
export { CONFIG_DIR, CONFIG_FILE, CREDENTIALS_FILE, RAW_DIR };
|
|
6
|
+
interface InitOptions {
|
|
7
|
+
force?: boolean;
|
|
8
|
+
}
|
|
9
|
+
export declare function initCommand(opts: InitOptions): Promise<void>;
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
2
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import { bus } from "../../events.js";
|
|
6
|
+
import { companies } from "../../data/companies.js";
|
|
7
|
+
// Root directory for all jobcrawl state: ~/.jobcrawl
const CONFIG_DIR = join(homedir(), ".jobcrawl");
// Main YAML config (aggregators, companies, defaults); written by `jobcrawl init`.
const CONFIG_FILE = join(CONFIG_DIR, "config.yaml");
// Credentials skeleton (YC Algolia keys); created below with mode 0o600.
const CREDENTIALS_FILE = join(CONFIG_DIR, "credentials.json");
// Destination for raw API responses when --save-raw is passed.
const RAW_DIR = join(CONFIG_DIR, "raw");
|
|
11
|
+
/**
 * Build the default ~/.jobcrawl/config.yaml contents as one string.
 *
 * Layout: commented usage header, an `aggregators:` section (YC enabled),
 * the bundled company list grouped by section banners, then `defaults:`.
 *
 * NOTE(review): the YAML nesting depends on the exact leading spaces inside
 * these string literals — confirm they match what utils/config parses.
 */
function generateDefaultConfig() {
    const lines = [
        "# jobcrawl config — auto-generated by `jobcrawl init`",
        "#",
        "# Aggregators (cross-company search engines):",
        "# Run with: jobcrawl crawl-aggregators yc",
        "# Or alongside companies: jobcrawl crawl --aggregators yc",
        "",
        "aggregators:",
        " - type: yc",
        " enabled: true",
        "",
        "# Companies (add your own or modify existing ones):",
        "#",
        "# Slug-based (recommended):",
        "# - company: Company Name",
        "# slug: board-slug",
        "# provider: greenhouse | ashby | lever (optional — auto-detected if omitted)",
        "# fallback: https://... (optional — used when no ATS API matches)",
        "#",
        "# URL-based:",
        "# - url: https://example.com/careers",
        "# company: Company Name",
        "",
        "companies:",
    ];
    // Emit a `# --- <section> ---` banner whenever the section changes
    // (companies are assumed pre-grouped by section — TODO confirm).
    let currentSection = "";
    for (const c of companies) {
        if (c.section !== currentSection) {
            currentSection = c.section;
            if (currentSection) {
                lines.push("");
                lines.push(` # --- ${currentSection} ---`);
            }
        }
        lines.push(` - company: ${c.company}`);
        lines.push(` slug: ${c.slug}`);
        // provider/fallback are optional per-company fields.
        if (c.provider) {
            lines.push(` provider: ${c.provider}`);
        }
        if (c.fallback) {
            lines.push(` fallback: ${c.fallback}`);
        }
    }
    lines.push("");
    lines.push("defaults:");
    lines.push(" concurrency: 5");
    lines.push("");
    // The trailing "" makes join() end the file with a newline.
    return lines.join("\n");
}
|
|
61
|
+
export { CONFIG_DIR, CONFIG_FILE, CREDENTIALS_FILE, RAW_DIR };
|
|
62
|
+
export async function initCommand(opts) {
|
|
63
|
+
const created = [];
|
|
64
|
+
const skipped = [];
|
|
65
|
+
await mkdir(RAW_DIR, { recursive: true, mode: 0o700 });
|
|
66
|
+
created.push(`${RAW_DIR}/`);
|
|
67
|
+
if (!existsSync(CONFIG_FILE) || opts.force) {
|
|
68
|
+
await writeFile(CONFIG_FILE, generateDefaultConfig());
|
|
69
|
+
created.push(CONFIG_FILE);
|
|
70
|
+
}
|
|
71
|
+
else {
|
|
72
|
+
skipped.push(CONFIG_FILE);
|
|
73
|
+
}
|
|
74
|
+
if (!existsSync(CREDENTIALS_FILE) || opts.force) {
|
|
75
|
+
const defaultCredentials = {
|
|
76
|
+
yc: {
|
|
77
|
+
algoliaAppId: "",
|
|
78
|
+
algoliaApiKey: "",
|
|
79
|
+
},
|
|
80
|
+
};
|
|
81
|
+
await writeFile(CREDENTIALS_FILE, JSON.stringify(defaultCredentials, null, 2) + "\n", { mode: 0o600 });
|
|
82
|
+
created.push(CREDENTIALS_FILE);
|
|
83
|
+
}
|
|
84
|
+
else {
|
|
85
|
+
skipped.push(CREDENTIALS_FILE);
|
|
86
|
+
}
|
|
87
|
+
const lines = [];
|
|
88
|
+
for (const f of created)
|
|
89
|
+
lines.push(`Created ${f}`);
|
|
90
|
+
for (const f of skipped)
|
|
91
|
+
lines.push(`Skipped ${f} (already exists)`);
|
|
92
|
+
lines.push("", `Add your YC Algolia credentials to ${CREDENTIALS_FILE}, then run:`, "", " jobcrawl crawl --keywords \"engineer\"");
|
|
93
|
+
bus.emit("output:message", { text: lines.join("\n"), style: "success" });
|
|
94
|
+
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { bus } from "../../events.js";
|
|
2
|
+
/**
 * Mount plain-text event subscribers for piped (non-TTY) output.
 * Data goes to stdout, status/progress to stderr.
 * Returns a cleanup function that unsubscribes all handlers.
 */
export function mountPlain() {
    const subscriptions = [];
    const subscribe = (event, handler) => {
        bus.on(event, handler);
        subscriptions.push([event, handler]);
    };
    const toErr = (line) => process.stderr.write(line + "\n");
    const toOut = (line) => process.stdout.write(line + "\n");
    // Status events → stderr
    subscribe("url:fetching", (p) => toErr(` [${p.urlId}] ${p.company}: fetching`));
    subscribe("url:detecting", (p) => toErr(` [${p.urlId}] provider: ${p.provider}`));
    subscribe("url:extracting", (p) => {
        const count = p.jobCount !== undefined ? ` (${p.jobCount} jobs)` : "";
        toErr(` [${p.urlId}] extracting${count}`);
    });
    subscribe("url:done", (p) => toErr(` [${p.urlId}] ${p.company}: ${p.matched}/${p.total} matched`));
    subscribe("url:failed", (p) => toErr(` [${p.urlId}] ${p.company}: FAILED — ${p.error}`));
    subscribe("crawl:start", (p) => toErr(`Crawling ${p.total} URLs...`));
    subscribe("crawl:complete", (p) => toErr(`Done. ${p.totalMatched} jobs matched across ${p.totalUrls} URLs (${p.failedUrls} failed)`));
    // Discover events → stderr
    subscribe("discover:searching", (p) => toErr(` Searching for ${p.company}...`));
    subscribe("discover:found", (p) => toErr(` ✓ ${p.company} → ${p.url}`));
    subscribe("discover:not-found", (p) => toErr(` ✗ ${p.company}: ${p.reason}`));
    // Output events → stdout
    subscribe("output:json", (p) => toOut(JSON.stringify(p.data, null, 2)));
    subscribe("output:text", (p) => toOut(p.text));
    subscribe("output:table", (p) => {
        // Simple table rendering: size each column to its widest cell.
        const widths = p.headers.map((h, i) => Math.max(h.length, ...p.rows.map((r) => (r[i] ?? "").length)));
        const headerLine = p.headers.map((h, i) => h.padEnd(widths[i])).join(" | ");
        const separator = widths.map((w) => "-".repeat(w)).join("-+-");
        process.stdout.write(headerLine + "\n" + separator + "\n");
        for (const row of p.rows) {
            process.stdout.write(row.map((c, i) => (c ?? "").padEnd(widths[i])).join(" | ") + "\n");
        }
        if (p.footer)
            toOut(p.footer);
    });
    subscribe("output:message", (p) => toErr(p.text));
    subscribe("error", (p) => toErr(`Error: ${p.message}`));
    // Cleanup: detach everything attached above.
    return () => {
        for (const [event, handler] of subscriptions) {
            bus.off(event, handler);
        }
    };
}
|
package/dist/events.d.ts
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
 * Central event map: every event name the CLI can publish on the shared bus,
 * keyed to its payload shape. Producers (orchestrators, commands) emit these;
 * consumers (Ink UI or the plain-text subscriber) render them.
 */
export type Events = {
    /** A company-name search has started. */
    "discover:searching": {
        company: string;
    };
    /** A career-page URL was found for a company. */
    "discover:found": {
        company: string;
        url: string;
    };
    /** No usable career-page URL; `reason` is human-readable. */
    "discover:not-found": {
        company: string;
        reason: string;
    };
    /** A found URL is being verified as a real career page. */
    "discover:verifying": {
        company: string;
        url: string;
    };
    /** Discovery finished; counts across all companies. */
    "discover:complete": {
        found: number;
        notFound: number;
        total: number;
    };
    /** A crawl run began over `total` URLs. */
    "crawl:start": {
        total: number;
    };
    /** Fetching one target URL. `urlId` correlates all later url:* events. */
    "url:fetching": {
        urlId: string;
        url: string;
        company: string;
    };
    /** ATS provider detected for this target. */
    "url:detecting": {
        urlId: string;
        provider: string;
    };
    /** Extracting jobs; `jobCount` is present once known. */
    "url:extracting": {
        urlId: string;
        jobCount?: number;
    };
    /** Matching jobs against criteria: `matched` of `total`. */
    "url:matching": {
        urlId: string;
        matched: number;
        total: number;
    };
    /** Target finished: `matched` of `total` jobs kept. */
    "url:done": {
        urlId: string;
        company: string;
        matched: number;
        total: number;
    };
    /** Target failed; `error` is a human-readable message. */
    "url:failed": {
        urlId: string;
        company: string;
        error: string;
    };
    /** Rendering the page in a browser. */
    "url:rendering": {
        urlId: string;
        url: string;
    };
    /** Resolving `count` candidate URLs from the page. */
    "url:resolving-urls": {
        urlId: string;
        count: number;
    };
    /** Probing a configured target against a specific ATS provider. */
    "target:probing": {
        urlId: string;
        provider: string;
    };
    /** An aggregator (e.g. "yc") started. */
    "aggregator:start": {
        type: string;
    };
    /** An aggregator finished with `jobCount` matched jobs. */
    "aggregator:done": {
        type: string;
        jobCount: number;
    };
    /** An aggregator failed; other sources continue. */
    "aggregator:failed": {
        type: string;
        error: string;
    };
    /** Whole crawl finished; aggregate totals. */
    "crawl:complete": {
        totalJobs: number;
        totalMatched: number;
        totalUrls: number;
        failedUrls: number;
    };
    /** Final JSON payload for stdout. */
    "output:json": {
        data: unknown;
    };
    /** Tabular output: headers, row cells, optional footer line. */
    "output:table": {
        headers: string[];
        rows: string[][];
        footer?: string;
    };
    /** Pre-formatted text for stdout. */
    "output:text": {
        text: string;
    };
    /** Human-facing status message (stderr in plain mode). */
    "output:message": {
        text: string;
        style?: "success" | "warning" | "dim";
    };
    /** A fatal error message. */
    error: {
        message: string;
    };
    /** Command finished; consumers should tear down and exit with this code. */
    done: {
        exitCode: number;
    };
};
/** Union of all event names. */
type EventName = keyof Events;
/** Type-safe facade over a Node EventEmitter (implementation in events.js). */
declare class TypedEventBus {
    private emitter;
    emit<K extends EventName>(event: K, payload: Events[K]): void;
    on<K extends EventName>(event: K, handler: (payload: Events[K]) => void): void;
    off<K extends EventName>(event: K, handler: (payload: Events[K]) => void): void;
    removeAllListeners(): void;
}
/** Process-wide singleton bus shared by the whole CLI. */
export declare const bus: TypedEventBus;
export {};
|
package/dist/events.js
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { EventEmitter } from "node:events";
|
|
2
|
+
class TypedEventBus {
|
|
3
|
+
emitter = new EventEmitter();
|
|
4
|
+
emit(event, payload) {
|
|
5
|
+
this.emitter.emit(event, payload);
|
|
6
|
+
}
|
|
7
|
+
on(event, handler) {
|
|
8
|
+
this.emitter.on(event, handler);
|
|
9
|
+
}
|
|
10
|
+
off(event, handler) {
|
|
11
|
+
this.emitter.off(event, handler);
|
|
12
|
+
}
|
|
13
|
+
removeAllListeners() {
|
|
14
|
+
this.emitter.removeAllListeners();
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
export const bus = new TypedEventBus();
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { bus } from "../events.js";
|
|
2
|
+
import { runPool } from "../threads/pool.js";
|
|
3
|
+
import { processTarget } from "../threads/process-url.js";
|
|
4
|
+
import { extractViaAggregator } from "../core/extract-jobs.js";
|
|
5
|
+
import { matchJobs } from "../core/match-jobs.js";
|
|
6
|
+
/**
 * Run each aggregator in sequence, matching its extracted jobs against
 * `criteria`. A failing aggregator is reported on the bus and skipped;
 * results from the others are still accumulated.
 */
async function runAggregators(aggregators, criteria, saveRaw) {
    const collected = [];
    for (const type of aggregators) {
        bus.emit("aggregator:start", { type });
        try {
            const extracted = await extractViaAggregator(type, criteria, saveRaw);
            const matched = matchJobs(extracted, criteria);
            bus.emit("aggregator:done", { type, jobCount: matched.length });
            collected.push(...matched);
        }
        catch (err) {
            bus.emit("aggregator:failed", {
                type,
                error: err instanceof Error ? err.message : String(err),
            });
        }
    }
    return collected;
}
|
|
23
|
+
/**
 * Remove duplicate jobs, keyed by (url, title).
 * When both a provider result and a YC aggregator result share a key, the
 * provider result wins — provider data is richer. Within the same tier,
 * the first occurrence is kept.
 */
function dedup(jobs) {
    const byKey = new Map();
    for (const job of jobs) {
        const key = `${job.url}::${job.title}`;
        const current = byKey.get(key);
        // Store when the slot is empty, or overwrite an aggregator ("yc") entry.
        if (current === undefined || current.provider === "yc") {
            byKey.set(key, job);
        }
    }
    return [...byKey.values()];
}
|
|
35
|
+
/**
 * Crawl every target (and any requested aggregators) with bounded
 * concurrency, then merge and dedup the results.
 * Emits crawl:start / crawl:complete on the bus.
 * Returns matched jobs, the raw per-target results, and total duration.
 */
export async function crawlAll(targets, criteria, options, aggregators) {
    const startedAt = Date.now();
    bus.emit("crawl:start", { total: targets.length });
    const tasks = targets.map((target) => () => processTarget(target, criteria, {
        saveRaw: options.saveRaw,
        browser: options.browser,
    }));
    // Aggregators and per-target crawls run in parallel.
    const [aggregatorJobs, results] = await Promise.all([
        aggregators && aggregators.length > 0
            ? runAggregators(aggregators, criteria, options.saveRaw)
            : Promise.resolve([]),
        runPool(tasks, options.concurrency),
    ]);
    const matchedTargetJobs = [];
    const everyTargetJob = [];
    let failedCount = 0;
    for (const result of results) {
        matchedTargetJobs.push(...result.jobs);
        everyTargetJob.push(...result.allJobs);
        if (result.error !== null) {
            failedCount += 1;
        }
    }
    // Merge and dedup aggregator + target jobs (provider results win).
    const jobs = dedup([...matchedTargetJobs, ...aggregatorJobs]);
    const allJobs = dedup([...everyTargetJob, ...aggregatorJobs]);
    bus.emit("crawl:complete", {
        totalJobs: allJobs.length,
        totalMatched: jobs.length,
        totalUrls: targets.length,
        failedUrls: failedCount,
    });
    return {
        jobs,
        results,
        totalDurationMs: Date.now() - startedAt,
    };
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { type DiscoverResult } from "../core/discover-careers.js";
/** Aggregate outcome of discovering career pages for a batch of companies. */
export interface DiscoverAllResult {
    /** One entry per input company (order not guaranteed — TODO confirm). */
    results: DiscoverResult[];
    /** Companies whose career-page URL was found (url !== null). */
    found: number;
    /** Companies with no usable URL (url === null). */
    notFound: number;
}
/**
 * Discover career-page URLs for `companies` with bounded concurrency.
 * `verify` additionally checks each found URL is a real career page.
 */
export declare function discoverAll(companies: string[], options: {
    concurrency: number;
    verify: boolean;
}): Promise<DiscoverAllResult>;
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { bus } from "../events.js";
|
|
2
|
+
import { runPool } from "../threads/pool.js";
|
|
3
|
+
import { discoverCareerPage, verifyCareerPage, } from "../core/discover-careers.js";
|
|
4
|
+
/**
 * Discover career-page URLs for many companies with bounded concurrency.
 * Progress is reported on the bus; returns per-company results plus
 * found/not-found counts. When `options.verify` is set, a found URL that
 * fails verification is downgraded to a not-found result.
 */
export async function discoverAll(companies, options) {
    const discoverOne = async (company) => {
        bus.emit("discover:searching", { company });
        const result = await discoverCareerPage(company);
        if (!result.url) {
            bus.emit("discover:not-found", {
                company,
                reason: result.error ?? "Not found",
            });
            return result;
        }
        if (options.verify) {
            bus.emit("discover:verifying", { company, url: result.url });
            const valid = await verifyCareerPage(result.url);
            if (!valid) {
                bus.emit("discover:not-found", {
                    company,
                    reason: "URL found but failed verification",
                });
                return { ...result, url: null, error: "Failed verification" };
            }
        }
        bus.emit("discover:found", { company, url: result.url });
        return result;
    };
    const tasks = companies.map((company) => () => discoverOne(company));
    const results = await runPool(tasks, options.concurrency);
    const found = results.filter((r) => r.url !== null).length;
    const notFound = results.length - found;
    bus.emit("discover:complete", {
        found,
        notFound,
        total: companies.length,
    });
    return { results, found, notFound };
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Bounded concurrency pool. Runs up to `concurrency` tasks simultaneously.
 *
 * Fix: results are now returned in the same order as `tasks` rather than in
 * completion order, so callers get deterministic output regardless of how
 * long each task takes.
 *
 * @param tasks zero-argument functions, each returning a promise
 * @param concurrency maximum number of tasks in flight at once
 * @returns the tasks' resolved values, in input order
 */
export async function runPool(tasks, concurrency) {
    const results = new Array(tasks.length);
    const executing = new Set();
    for (const [index, task] of tasks.entries()) {
        const p = task()
            .then((result) => {
            // Store at the task's original index to preserve input order.
            results[index] = result;
        })
            .finally(() => {
            executing.delete(p);
        });
        executing.add(p);
        // Throttle: wait for some in-flight task before starting the next.
        if (executing.size >= concurrency) {
            await Promise.race(executing);
        }
    }
    await Promise.all(executing);
    return results;
}
|