jobcrawl 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prettierrc.json +10 -0
- package/CHANGELOG.md +40 -0
- package/README.md +232 -0
- package/dist/core/aggregators/yc.d.ts +7 -0
- package/dist/core/aggregators/yc.js +320 -0
- package/dist/core/browser.d.ts +30 -0
- package/dist/core/browser.js +196 -0
- package/dist/core/cache.d.ts +13 -0
- package/dist/core/cache.js +41 -0
- package/dist/core/detect-provider.d.ts +7 -0
- package/dist/core/detect-provider.js +125 -0
- package/dist/core/discover-careers.d.ts +18 -0
- package/dist/core/discover-careers.js +92 -0
- package/dist/core/extract-jobs.d.ts +14 -0
- package/dist/core/extract-jobs.js +36 -0
- package/dist/core/fetch-page.d.ts +11 -0
- package/dist/core/fetch-page.js +39 -0
- package/dist/core/format-output.d.ts +2 -0
- package/dist/core/format-output.js +59 -0
- package/dist/core/match-jobs.d.ts +6 -0
- package/dist/core/match-jobs.js +43 -0
- package/dist/core/providers/ashby.d.ts +6 -0
- package/dist/core/providers/ashby.js +58 -0
- package/dist/core/providers/generic.d.ts +6 -0
- package/dist/core/providers/generic.js +294 -0
- package/dist/core/providers/greenhouse.d.ts +6 -0
- package/dist/core/providers/greenhouse.js +47 -0
- package/dist/core/providers/lever.d.ts +7 -0
- package/dist/core/providers/lever.js +60 -0
- package/dist/core/providers/yc.d.ts +7 -0
- package/dist/core/providers/yc.js +320 -0
- package/dist/core/resolve-iframe.d.ts +6 -0
- package/dist/core/resolve-iframe.js +51 -0
- package/dist/core/save-raw.d.ts +4 -0
- package/dist/core/save-raw.js +13 -0
- package/dist/data/companies.d.ts +9 -0
- package/dist/data/companies.js +2849 -0
- package/dist/entrypoints/cli/app.d.ts +3 -0
- package/dist/entrypoints/cli/app.js +91 -0
- package/dist/entrypoints/cli/components/crawl-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/crawl-view.js +94 -0
- package/dist/entrypoints/cli/components/discover-view.d.ts +1 -0
- package/dist/entrypoints/cli/components/discover-view.js +67 -0
- package/dist/entrypoints/cli/crawl-aggregators.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-aggregators.js +76 -0
- package/dist/entrypoints/cli/crawl-url.d.ts +26 -0
- package/dist/entrypoints/cli/crawl-url.js +54 -0
- package/dist/entrypoints/cli/crawl.d.ts +32 -0
- package/dist/entrypoints/cli/crawl.js +108 -0
- package/dist/entrypoints/cli/discover.d.ts +10 -0
- package/dist/entrypoints/cli/discover.js +69 -0
- package/dist/entrypoints/cli/index.d.ts +2 -0
- package/dist/entrypoints/cli/index.js +197 -0
- package/dist/entrypoints/cli/init.d.ts +9 -0
- package/dist/entrypoints/cli/init.js +94 -0
- package/dist/entrypoints/cli/plain.d.ts +6 -0
- package/dist/entrypoints/cli/plain.js +77 -0
- package/dist/events.d.ts +114 -0
- package/dist/events.js +17 -0
- package/dist/orchestrators/crawl-all.d.ts +2 -0
- package/dist/orchestrators/crawl-all.js +66 -0
- package/dist/orchestrators/discover-all.d.ts +10 -0
- package/dist/orchestrators/discover-all.js +39 -0
- package/dist/threads/pool.d.ts +5 -0
- package/dist/threads/pool.js +23 -0
- package/dist/threads/process-url.d.ts +9 -0
- package/dist/threads/process-url.js +229 -0
- package/dist/types/index.d.ts +83 -0
- package/dist/types/index.js +6 -0
- package/dist/utils/config.d.ts +17 -0
- package/dist/utils/config.js +57 -0
- package/dist/utils/google-search.d.ts +19 -0
- package/dist/utils/google-search.js +139 -0
- package/dist/utils/llm.d.ts +8 -0
- package/dist/utils/llm.js +25 -0
- package/package.json +42 -0
- package/src/core/aggregators/yc.ts +415 -0
- package/src/core/browser.ts +239 -0
- package/src/core/detect-provider.ts +162 -0
- package/src/core/discover-careers.ts +117 -0
- package/src/core/extract-jobs.ts +50 -0
- package/src/core/fetch-page.ts +41 -0
- package/src/core/format-output.ts +80 -0
- package/src/core/match-jobs.ts +56 -0
- package/src/core/providers/ashby.ts +84 -0
- package/src/core/providers/generic.ts +332 -0
- package/src/core/providers/greenhouse.ts +74 -0
- package/src/core/providers/lever.ts +90 -0
- package/src/core/resolve-iframe.ts +59 -0
- package/src/core/save-raw.ts +18 -0
- package/src/data/companies.ts +2859 -0
- package/src/entrypoints/cli/app.tsx +173 -0
- package/src/entrypoints/cli/components/crawl-view.tsx +163 -0
- package/src/entrypoints/cli/components/discover-view.tsx +138 -0
- package/src/entrypoints/cli/crawl-aggregators.ts +112 -0
- package/src/entrypoints/cli/crawl-url.ts +87 -0
- package/src/entrypoints/cli/crawl.ts +163 -0
- package/src/entrypoints/cli/discover.ts +96 -0
- package/src/entrypoints/cli/index.ts +252 -0
- package/src/entrypoints/cli/init.ts +117 -0
- package/src/entrypoints/cli/plain.ts +104 -0
- package/src/events.ts +79 -0
- package/src/orchestrators/crawl-all.ts +96 -0
- package/src/orchestrators/discover-all.ts +61 -0
- package/src/threads/pool.ts +29 -0
- package/src/threads/process-url.ts +312 -0
- package/src/types/index.ts +110 -0
- package/src/utils/config.ts +79 -0
- package/src/utils/google-search.ts +155 -0
- package/src/utils/llm.ts +33 -0
- package/test/integration/process-url.test.ts +301 -0
- package/test/integration/providers/ashby.test.ts +163 -0
- package/test/integration/providers/greenhouse.test.ts +191 -0
- package/test/integration/providers/lever.test.ts +188 -0
- package/test/unit/config.test.ts +64 -0
- package/test/unit/detect-provider.test.ts +165 -0
- package/test/unit/events.test.ts +104 -0
- package/test/unit/format-output.test.ts +165 -0
- package/test/unit/match-jobs.test.ts +257 -0
- package/test/unit/pool.test.ts +74 -0
- package/test/unit/providers/generic.test.ts +139 -0
- package/test/unit/resolve-iframe.test.ts +100 -0
- package/tsconfig.json +19 -0
- package/vitest.config.ts +7 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import { bus } from "../../events.js";
|
|
4
|
+
import { crawlAll } from "../../orchestrators/crawl-all.js";
|
|
5
|
+
import { loadConfig, parseUrlList } from "../../utils/config.js";
|
|
6
|
+
import { formatOutput } from "../../core/format-output.js";
|
|
7
|
+
import { CONFIG_FILE } from "./init.js";
|
|
8
|
+
import type { Config } from "../../utils/config.js";
|
|
9
|
+
import type {
|
|
10
|
+
Target,
|
|
11
|
+
Aggregator,
|
|
12
|
+
SearchCriteria,
|
|
13
|
+
OutputFormat,
|
|
14
|
+
} from "../../types/index.js";
|
|
15
|
+
|
|
16
|
+
/** CLI flags accepted by `jobcrawl crawl` (raw commander option values). */
interface CrawlOptions {
  /** Career page URLs passed via --urls. */
  urls?: string[];
  /** Path to a YAML/JSON config file with targets (--file). */
  file?: string;
  /** Aggregator names to run alongside companies (--aggregators, e.g. "yc"). */
  aggregators?: string[];
  /** Job title keywords to match (--keywords). */
  keywords?: string[];
  /** Keywords to exclude (--exclude). */
  exclude?: string[];
  /** Location filter (--location). */
  location?: string;
  // Work-mode flags; combined into one workMode array by buildWorkMode().
  remote?: boolean;
  onsite?: boolean;
  hybrid?: boolean;
  /** Department filter (--department). */
  department?: string[];
  // Shared filter flags registered by addFilterOptions() in the CLI entry.
  role?: string[];
  roleType?: string[];
  jobType?: string[];
  /** Minimum years of experience; numeric strings parsed with parseInt. */
  minExperience?: string[];
  companyStage?: string[];
  industry?: string[];
  companySize?: string[];
  hasSalary?: boolean;
  hasEquity?: boolean;
  hasInterviewProcess?: boolean;
  visaSponsorship?: boolean;
  /** Output format (--output): json, table, markdown, csv. */
  output?: OutputFormat;
  /** File to write output to (-o/--out); emitted on the bus when omitted. */
  out?: string;
  /** Max concurrent crawls (commander delivers numeric flags as strings). */
  concurrency?: string;
  /** Save raw API responses to ~/.jobcrawl/raw/ (--save-raw). */
  saveRaw?: boolean;
  /** Browser network command timeout in ms, as a numeric string. */
  networkTimeout?: string;
  /** Max parent levels to bubble when clicking job cards, numeric string. */
  maxBubbleLevels?: string;
}
|
|
45
|
+
|
|
46
|
+
export async function crawlCommand(opts: CrawlOptions): Promise<void> {
|
|
47
|
+
const { targets, config } = await resolveTargets(opts);
|
|
48
|
+
|
|
49
|
+
if (targets.length === 0) {
|
|
50
|
+
throw new Error(
|
|
51
|
+
"No targets provided. Use --urls, --file, or run `jobcrawl init` to set up default targets."
|
|
52
|
+
);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
const criteria: SearchCriteria = {
|
|
56
|
+
keywords: opts.keywords ?? [],
|
|
57
|
+
excludeKeywords: opts.exclude ?? [],
|
|
58
|
+
location: opts.location ?? null,
|
|
59
|
+
workMode: buildWorkMode(opts),
|
|
60
|
+
departments: opts.department ?? null,
|
|
61
|
+
role: opts.role ?? null,
|
|
62
|
+
roleType: opts.roleType ?? null,
|
|
63
|
+
jobType: opts.jobType ?? null,
|
|
64
|
+
minExperience: opts.minExperience
|
|
65
|
+
? opts.minExperience.map((v) => parseInt(v, 10))
|
|
66
|
+
: null,
|
|
67
|
+
companyStage: opts.companyStage ?? null,
|
|
68
|
+
industry: opts.industry ?? null,
|
|
69
|
+
companySize: opts.companySize ?? null,
|
|
70
|
+
hasSalary: opts.hasSalary ?? null,
|
|
71
|
+
hasEquity: opts.hasEquity ?? null,
|
|
72
|
+
hasInterviewProcess: opts.hasInterviewProcess ?? null,
|
|
73
|
+
visaSponsorship: opts.visaSponsorship ?? null,
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
const concurrency = opts.concurrency ? parseInt(opts.concurrency, 10) : 5;
|
|
77
|
+
const format = opts.output ?? "json";
|
|
78
|
+
|
|
79
|
+
// Parse --aggregators flag
|
|
80
|
+
const aggregators: Aggregator[] | undefined = opts.aggregators?.map((name) => {
|
|
81
|
+
if (name === "yc") return name;
|
|
82
|
+
throw new Error(`Unknown aggregator: "${name}". Available: yc`);
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
// Browser options: CLI flags override config defaults
|
|
86
|
+
const configBrowser = config?.defaults?.browser;
|
|
87
|
+
const browser = {
|
|
88
|
+
networkTimeout: opts.networkTimeout
|
|
89
|
+
? parseInt(opts.networkTimeout, 10)
|
|
90
|
+
: configBrowser?.networkTimeout,
|
|
91
|
+
maxBubbleLevels: opts.maxBubbleLevels
|
|
92
|
+
? parseInt(opts.maxBubbleLevels, 10)
|
|
93
|
+
: configBrowser?.maxBubbleLevels,
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
const result = await crawlAll(targets, criteria, {
|
|
97
|
+
concurrency,
|
|
98
|
+
saveRaw: opts.saveRaw,
|
|
99
|
+
browser,
|
|
100
|
+
}, aggregators);
|
|
101
|
+
|
|
102
|
+
const output = formatOutput(result.jobs, format);
|
|
103
|
+
|
|
104
|
+
if (opts.out) {
|
|
105
|
+
const { writeFile } = await import("node:fs/promises");
|
|
106
|
+
await writeFile(opts.out, output + "\n");
|
|
107
|
+
bus.emit("output:message", {
|
|
108
|
+
text: `Wrote ${result.jobs.length} jobs to ${opts.out}`,
|
|
109
|
+
style: "success",
|
|
110
|
+
});
|
|
111
|
+
} else {
|
|
112
|
+
bus.emit("output:text", { text: output });
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
async function resolveTargets(
|
|
117
|
+
opts: CrawlOptions
|
|
118
|
+
): Promise<{ targets: Target[]; config: Config | null }> {
|
|
119
|
+
const targets: Target[] = [];
|
|
120
|
+
let config: Config | null = null;
|
|
121
|
+
|
|
122
|
+
// From --urls flag
|
|
123
|
+
if (opts.urls) {
|
|
124
|
+
for (const url of opts.urls) {
|
|
125
|
+
targets.push({ url });
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// From --file flag
|
|
130
|
+
if (opts.file) {
|
|
131
|
+
config = await loadConfig(opts.file);
|
|
132
|
+
targets.push(...config.companies);
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
// From ~/.jobcrawl/config.yaml (default config)
|
|
136
|
+
if (targets.length === 0 && existsSync(CONFIG_FILE)) {
|
|
137
|
+
config = await loadConfig(CONFIG_FILE);
|
|
138
|
+
targets.push(...config.companies);
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
// Always load config for defaults even when using --urls
|
|
142
|
+
if (!config && existsSync(CONFIG_FILE)) {
|
|
143
|
+
config = await loadConfig(CONFIG_FILE);
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
// From stdin (if not a TTY and no other input)
|
|
147
|
+
if (targets.length === 0 && !process.stdin.isTTY) {
|
|
148
|
+
const input = await readFile("/dev/stdin", "utf-8");
|
|
149
|
+
targets.push(...parseUrlList(input));
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
return { targets, config };
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
function buildWorkMode(
|
|
156
|
+
opts: CrawlOptions
|
|
157
|
+
): ("remote" | "onsite" | "hybrid")[] | null {
|
|
158
|
+
const modes: ("remote" | "onsite" | "hybrid")[] = [];
|
|
159
|
+
if (opts.remote) modes.push("remote");
|
|
160
|
+
if (opts.onsite) modes.push("onsite");
|
|
161
|
+
if (opts.hybrid) modes.push("hybrid");
|
|
162
|
+
return modes.length > 0 ? modes : null;
|
|
163
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import { bus } from "../../events.js";
|
|
3
|
+
import { discoverAll } from "../../orchestrators/discover-all.js";
|
|
4
|
+
import type { UrlTarget } from "../../types/index.js";
|
|
5
|
+
import yaml from "js-yaml";
|
|
6
|
+
|
|
7
|
+
/** CLI flags accepted by `jobcrawl discover` (raw commander option values). */
interface DiscoverOptions {
  /** Company names passed via --companies. */
  companies?: string[];
  /** File with one company name per line (--file); "#" lines are comments. */
  file?: string;
  /** Output format (--output); defaults to "urls". */
  output?: "urls" | "yaml" | "json";
  /** File to write output to (-o/--out); emitted on the bus when omitted. */
  out?: string;
  /** Verify each discovered URL is a real career page (--verify). */
  verify?: boolean;
  /** Max concurrent searches (commander delivers numeric flags as strings). */
  concurrency?: string;
}
|
|
15
|
+
|
|
16
|
+
export async function discoverCommand(opts: DiscoverOptions): Promise<void> {
|
|
17
|
+
const companies = await resolveCompanies(opts);
|
|
18
|
+
|
|
19
|
+
if (companies.length === 0) {
|
|
20
|
+
throw new Error("No company names provided. Use --companies or --file.");
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
const concurrency = opts.concurrency ? parseInt(opts.concurrency, 10) : 3;
|
|
24
|
+
const format = opts.output ?? "urls";
|
|
25
|
+
|
|
26
|
+
const result = await discoverAll(companies, {
|
|
27
|
+
concurrency,
|
|
28
|
+
verify: opts.verify ?? false,
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
const found = result.results.filter((r) => r.url !== null);
|
|
32
|
+
const output = formatDiscoverOutput(found, format);
|
|
33
|
+
|
|
34
|
+
if (opts.out) {
|
|
35
|
+
const { writeFile } = await import("node:fs/promises");
|
|
36
|
+
await writeFile(opts.out, output + "\n");
|
|
37
|
+
bus.emit("output:message", {
|
|
38
|
+
text: `Wrote ${found.length} targets to ${opts.out}`,
|
|
39
|
+
style: "success",
|
|
40
|
+
});
|
|
41
|
+
} else {
|
|
42
|
+
bus.emit("output:text", { text: output });
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
async function resolveCompanies(opts: DiscoverOptions): Promise<string[]> {
|
|
47
|
+
const companies: string[] = [];
|
|
48
|
+
|
|
49
|
+
if (opts.companies) {
|
|
50
|
+
companies.push(...opts.companies);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
if (opts.file) {
|
|
54
|
+
const content = await readFile(opts.file, "utf-8");
|
|
55
|
+
const lines = content
|
|
56
|
+
.split("\n")
|
|
57
|
+
.map((l) => l.trim())
|
|
58
|
+
.filter((l) => l.length > 0 && !l.startsWith("#"));
|
|
59
|
+
companies.push(...lines);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// stdin if no other input and not a TTY
|
|
63
|
+
if (companies.length === 0 && !process.stdin.isTTY) {
|
|
64
|
+
const input = await readFile("/dev/stdin", "utf-8");
|
|
65
|
+
const lines = input
|
|
66
|
+
.split("\n")
|
|
67
|
+
.map((l) => l.trim())
|
|
68
|
+
.filter((l) => l.length > 0 && !l.startsWith("#"));
|
|
69
|
+
companies.push(...lines);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
return companies;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
function formatDiscoverOutput(
|
|
76
|
+
found: Array<{ company: string; url: string | null }>,
|
|
77
|
+
format: "urls" | "yaml" | "json"
|
|
78
|
+
): string {
|
|
79
|
+
const targets: UrlTarget[] = found
|
|
80
|
+
.filter((r): r is { company: string; url: string } => r.url !== null)
|
|
81
|
+
.map((r) => ({ url: r.url, company: r.company }));
|
|
82
|
+
|
|
83
|
+
switch (format) {
|
|
84
|
+
case "urls":
|
|
85
|
+
return targets.map((t) => t.url).join("\n");
|
|
86
|
+
|
|
87
|
+
case "json":
|
|
88
|
+
return JSON.stringify(targets, null, 2);
|
|
89
|
+
|
|
90
|
+
case "yaml":
|
|
91
|
+
return yaml.dump({
|
|
92
|
+
targets: targets.map((t) => ({ url: t.url, company: t.company })),
|
|
93
|
+
defaults: { concurrency: 5 },
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
}
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import React from "react";
|
|
4
|
+
import { render } from "ink";
|
|
5
|
+
import { Command } from "commander";
|
|
6
|
+
import { bus } from "../../events.js";
|
|
7
|
+
import { mountPlain } from "./plain.js";
|
|
8
|
+
import { App } from "./app.js";
|
|
9
|
+
import { crawlUrlCommand } from "./crawl-url.js";
|
|
10
|
+
import { crawlCommand } from "./crawl.js";
|
|
11
|
+
import { crawlAggregatorsCommand } from "./crawl-aggregators.js";
|
|
12
|
+
import { discoverCommand } from "./discover.js";
|
|
13
|
+
import { initCommand } from "./init.js";
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Lifecycle manager. Mounts Ink (TTY) or plain text (piped) subscriber,
|
|
17
|
+
* executes the command, and handles teardown.
|
|
18
|
+
*/
|
|
19
|
+
function wrap(
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  fn: (...args: any[]) => Promise<void>
): (...args: unknown[]) => void {
  return (...args: unknown[]) => {
    if (process.stdout.isTTY) {
      // Interactive: mount Ink (handles run + done + cleanup internally)
      // `run` is a zero-arg thunk so the App component decides when to start.
      const run = () => fn(...args);
      const instance = render(React.createElement(App, { run }));
      const onDone = () => {
        // One-shot listener: detach first so we never unmount twice.
        bus.off("done", onDone);
        // Short delay lets Ink flush its final frame before unmounting.
        // NOTE(review): 32ms ≈ two frames at 60fps — presumably tuned
        // empirically; confirm against Ink's render cadence.
        setTimeout(() => instance.unmount(), 32);
      };
      bus.on("done", onDone);
    } else {
      // Piped: plain text subscriber
      const cleanup = mountPlain();

      fn(...args)
        .then(() => {
          bus.emit("done", { exitCode: 0 });
        })
        .catch((err: unknown) => {
          // Surface the failure on the bus instead of crashing the process;
          // non-Error throwables are stringified.
          bus.emit("error", {
            message: err instanceof Error ? err.message : String(err),
          });
          bus.emit("done", { exitCode: 1 });
        })
        .finally(() => {
          // Always detach the plain-text subscriber, success or failure.
          cleanup();
        });
    }
  };
}
|
|
53
|
+
|
|
54
|
+
function addFilterOptions(cmd: Command): Command {
|
|
55
|
+
return cmd
|
|
56
|
+
.option("--role <roles...>", "Role filter (e.g. engineering, design)")
|
|
57
|
+
.option("--role-type <types...>", "Role sub-type (e.g. backend, frontend)")
|
|
58
|
+
.option("--job-type <types...>", "Job type (e.g. fulltime, internship)")
|
|
59
|
+
.option("--min-experience <years...>", "Min years experience (e.g. 0, 3)")
|
|
60
|
+
.option("--company-stage <stages...>", "Company stage (e.g. seed, growth)")
|
|
61
|
+
.option("--industry <industries...>", "Industry filter")
|
|
62
|
+
.option("--company-size <sizes...>", "Company size (e.g. 1-10, 11-50)")
|
|
63
|
+
.option("--has-salary", "Only jobs with salary listed")
|
|
64
|
+
.option("--has-equity", "Only jobs with equity")
|
|
65
|
+
.option("--has-interview-process", "Only jobs with interview process")
|
|
66
|
+
.option("--visa-sponsorship", "Only jobs not requiring US visa");
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
const program = new Command("jobcrawl")
|
|
70
|
+
.version("1.0.0")
|
|
71
|
+
.description("Crawl career pages for jobs matching your search criteria");
|
|
72
|
+
|
|
73
|
+
addFilterOptions(
|
|
74
|
+
program
|
|
75
|
+
.command("crawl-url <url>")
|
|
76
|
+
.description("Crawl a single career page URL and return matching jobs")
|
|
77
|
+
.option("--keywords <terms...>", "Job title keywords to match")
|
|
78
|
+
.option("--exclude <terms...>", "Keywords to exclude")
|
|
79
|
+
.option("--location <location>", "Location filter")
|
|
80
|
+
.option("--remote", "Only remote jobs")
|
|
81
|
+
.option("--onsite", "Only onsite jobs")
|
|
82
|
+
.option("--hybrid", "Only hybrid jobs")
|
|
83
|
+
.option("--department <depts...>", "Department filter")
|
|
84
|
+
.option(
|
|
85
|
+
"--output <format>",
|
|
86
|
+
"Output format: json, table, markdown, csv",
|
|
87
|
+
"json"
|
|
88
|
+
)
|
|
89
|
+
.option("-o, --out <file>", "Write output to file")
|
|
90
|
+
.option("--save-raw", "Save raw API responses to ~/.jobcrawl/raw/")
|
|
91
|
+
).action(
|
|
92
|
+
wrap(async (url: string, opts: Record<string, unknown>) => {
|
|
93
|
+
await crawlUrlCommand(url as string, opts);
|
|
94
|
+
})
|
|
95
|
+
);
|
|
96
|
+
|
|
97
|
+
// `jobcrawl crawl` — the main multi-target command. Targets come from
// --urls, --file, the default config, or stdin (resolved in crawl.ts).
addFilterOptions(
  program
    .command("crawl")
    .description("Crawl multiple career pages and return matching jobs")
    .option("--urls <urls...>", "Career page URLs to crawl")
    .option("--file <path>", "Config file with targets (YAML/JSON)")
    .option("--keywords <terms...>", "Job title keywords to match")
    .option("--exclude <terms...>", "Keywords to exclude")
    .option("--location <location>", "Location filter")
    .option("--remote", "Only remote jobs")
    .option("--onsite", "Only onsite jobs")
    .option("--hybrid", "Only hybrid jobs")
    .option("--department <depts...>", "Department filter")
    .option(
      "--output <format>",
      "Output format: json, table, markdown, csv",
      "json"
    )
    .option("-o, --out <file>", "Write output to file")
    .option("--concurrency <n>", "Max concurrent crawls", "5")
    .option("--save-raw", "Save raw API responses to ~/.jobcrawl/raw/")
    .option("--aggregators <names...>", "Also run aggregators (e.g. yc)")
    .option("--network-timeout <ms>", "Timeout for browser network commands (ms)")
    .option("--max-bubble-levels <n>", "Max parent levels to try when clicking job cards")
).action(
  wrap(async (opts: Record<string, unknown>) => {
    await crawlCommand(opts);
  })
);
|
|
126
|
+
|
|
127
|
+
// `jobcrawl crawl-aggregators <aggregators...>` — crawl only aggregator
// sources (currently "yc"), without any per-company targets.
addFilterOptions(
  program
    .command("crawl-aggregators <aggregators...>")
    .description("Crawl aggregator sources (e.g. yc) for matching jobs")
    .option("--keywords <terms...>", "Job title keywords to match")
    .option("--exclude <terms...>", "Keywords to exclude")
    .option("--location <location>", "Location filter")
    .option("--remote", "Only remote jobs")
    .option("--onsite", "Only onsite jobs")
    .option("--hybrid", "Only hybrid jobs")
    .option("--department <depts...>", "Department filter")
    .option(
      "--output <format>",
      "Output format: json, table, markdown, csv",
      "json"
    )
    .option("-o, --out <file>", "Write output to file")
    .option("--save-raw", "Save raw API responses to ~/.jobcrawl/raw/")
).action(
  wrap(async (aggregators: string[], opts: Record<string, unknown>) => {
    await crawlAggregatorsCommand(aggregators, opts);
  })
);
|
|
150
|
+
|
|
151
|
+
// `jobcrawl discover` — resolve company names to career-page URLs.
program
  .command("discover")
  .description("Find career page URLs from company names")
  .option("--companies <names...>", "Company names to search for")
  .option("--file <path>", "File with company names (one per line)")
  .option("--output <format>", "Output format: urls, yaml, json", "urls")
  .option("-o, --out <file>", "Write output to file")
  .option("--verify", "Verify each discovered URL is a real career page")
  .option("--concurrency <n>", "Max concurrent searches", "3")
  .action(
    wrap(async (opts: Record<string, unknown>) => {
      await discoverCommand(opts);
    })
  );
|
|
165
|
+
|
|
166
|
+
// `jobcrawl init` — scaffold ~/.jobcrawl (config, credentials, raw dir).
program
  .command("init")
  .description("Create config file at ~/.jobcrawl/config.yaml")
  .option("--force", "Overwrite existing config")
  .action(
    wrap(async (opts: Record<string, unknown>) => {
      await initCommand(opts);
    })
  );
|
|
175
|
+
|
|
176
|
+
// `jobcrawl detect <url>` — one-shot ATS provider detection. The heavy
// modules are imported lazily so other subcommands start fast.
program
  .command("detect <url>")
  .description("Detect which ATS provider a career page uses")
  .action(
    wrap(async (url: string) => {
      const { probePage } = await import("../../core/fetch-page.js");
      const { detectProvider } = await import("../../core/detect-provider.js");
      // probePage follows redirects; report both the input and final URL.
      const { html, finalUrl } = await probePage(url);
      const result = detectProvider(html, finalUrl);
      bus.emit("output:json", { data: { url, finalUrl, ...result } });
    })
  );
|
|
188
|
+
|
|
189
|
+
// `jobcrawl match <file>` — offline filtering of a previously saved jobs
// JSON file, re-using the same filter flags as the crawl commands.
addFilterOptions(
  program
    .command("match <file>")
    .description("Filter a jobs JSON file against search criteria")
    .option("--keywords <terms...>", "Job title keywords to match")
    .option("--exclude <terms...>", "Keywords to exclude")
    .option("--location <location>", "Location filter")
    .option("--remote", "Only remote jobs")
    .option("--onsite", "Only onsite jobs")
    .option("--hybrid", "Only hybrid jobs")
    .option("--department <depts...>", "Department filter")
    .option(
      "--output <format>",
      "Output format: json, table, markdown, csv",
      "json"
    )
).action(
  wrap(async (file: string, opts: Record<string, unknown>) => {
    // Lazy imports keep CLI startup fast for other subcommands.
    const { readFile } = await import("node:fs/promises");
    const { matchJobs } = await import("../../core/match-jobs.js");
    const { formatOutput } = await import("../../core/format-output.js");
    // NOTE(review): the parsed JSON is handed to matchJobs unvalidated —
    // presumably the file was produced by a prior crawl run; confirm.
    const jobs = JSON.parse(await readFile(file, "utf-8"));
    // Rebuild the search criteria from the untyped commander option bag.
    const criteria = {
      keywords: (opts.keywords as string[]) ?? [],
      excludeKeywords: (opts.exclude as string[]) ?? [],
      location: (opts.location as string) ?? null,
      // Fold the three work-mode flags into one array (null = no filter).
      workMode: (() => {
        const modes = [
          ...(opts.remote ? ["remote" as const] : []),
          ...(opts.onsite ? ["onsite" as const] : []),
          ...(opts.hybrid ? ["hybrid" as const] : []),
        ];
        return modes.length ? modes : null;
      })(),
      departments: (opts.department as string[]) ?? null,
      role: (opts.role as string[]) ?? null,
      roleType: (opts.roleType as string[]) ?? null,
      jobType: (opts.jobType as string[]) ?? null,
      minExperience: opts.minExperience
        ? (opts.minExperience as string[]).map((v) => parseInt(v, 10))
        : null,
      companyStage: (opts.companyStage as string[]) ?? null,
      industry: (opts.industry as string[]) ?? null,
      companySize: (opts.companySize as string[]) ?? null,
      hasSalary: (opts.hasSalary as boolean) ?? null,
      hasEquity: (opts.hasEquity as boolean) ?? null,
      hasInterviewProcess: (opts.hasInterviewProcess as boolean) ?? null,
      visaSponsorship: (opts.visaSponsorship as boolean) ?? null,
    };
    const matched = matchJobs(jobs, criteria);
    const output = formatOutput(
      matched,
      (opts.output as string as "json") ?? "json"
    );
    bus.emit("output:text", { text: output });
  })
);
|
|
246
|
+
|
|
247
|
+
// Handle exit: set process.exitCode (rather than calling process.exit)
// so pending stdout writes flush before the process terminates.
bus.on("done", (p) => {
  process.exitCode = p.exitCode;
});

program.parse();
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
2
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
3
|
+
import { homedir } from "node:os";
|
|
4
|
+
import { join } from "node:path";
|
|
5
|
+
import { bus } from "../../events.js";
|
|
6
|
+
import { companies } from "../../data/companies.js";
|
|
7
|
+
|
|
8
|
+
// Per-user state lives under ~/.jobcrawl/.
const CONFIG_DIR = join(homedir(), ".jobcrawl");
// Default target/filter configuration (YAML), written by `jobcrawl init`.
const CONFIG_FILE = join(CONFIG_DIR, "config.yaml");
// API credentials (e.g. YC Algolia keys); created with mode 0600 below.
const CREDENTIALS_FILE = join(CONFIG_DIR, "credentials.json");
// Destination for --save-raw API response dumps.
const RAW_DIR = join(CONFIG_DIR, "raw");
+
|
|
13
|
+
function generateDefaultConfig(): string {
|
|
14
|
+
const lines: string[] = [
|
|
15
|
+
"# jobcrawl config — auto-generated by `jobcrawl init`",
|
|
16
|
+
"#",
|
|
17
|
+
"# Aggregators (cross-company search engines):",
|
|
18
|
+
"# Run with: jobcrawl crawl-aggregators yc",
|
|
19
|
+
"# Or alongside companies: jobcrawl crawl --aggregators yc",
|
|
20
|
+
"",
|
|
21
|
+
"aggregators:",
|
|
22
|
+
" - type: yc",
|
|
23
|
+
" enabled: true",
|
|
24
|
+
"",
|
|
25
|
+
"# Companies (add your own or modify existing ones):",
|
|
26
|
+
"#",
|
|
27
|
+
"# Slug-based (recommended):",
|
|
28
|
+
"# - company: Company Name",
|
|
29
|
+
"# slug: board-slug",
|
|
30
|
+
"# provider: greenhouse | ashby | lever (optional — auto-detected if omitted)",
|
|
31
|
+
"# fallback: https://... (optional — used when no ATS API matches)",
|
|
32
|
+
"#",
|
|
33
|
+
"# URL-based:",
|
|
34
|
+
"# - url: https://example.com/careers",
|
|
35
|
+
"# company: Company Name",
|
|
36
|
+
"",
|
|
37
|
+
"companies:",
|
|
38
|
+
];
|
|
39
|
+
|
|
40
|
+
let currentSection = "";
|
|
41
|
+
|
|
42
|
+
for (const c of companies) {
|
|
43
|
+
if (c.section !== currentSection) {
|
|
44
|
+
currentSection = c.section;
|
|
45
|
+
if (currentSection) {
|
|
46
|
+
lines.push("");
|
|
47
|
+
lines.push(` # --- ${currentSection} ---`);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
lines.push(` - company: ${c.company}`);
|
|
52
|
+
lines.push(` slug: ${c.slug}`);
|
|
53
|
+
if (c.provider) {
|
|
54
|
+
lines.push(` provider: ${c.provider}`);
|
|
55
|
+
}
|
|
56
|
+
if (c.fallback) {
|
|
57
|
+
lines.push(` fallback: ${c.fallback}`);
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
lines.push("");
|
|
62
|
+
lines.push("defaults:");
|
|
63
|
+
lines.push(" concurrency: 5");
|
|
64
|
+
lines.push("");
|
|
65
|
+
|
|
66
|
+
return lines.join("\n");
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// Shared path constants consumed by other commands (crawl defaults,
// --save-raw dumps, YC aggregator credentials).
export { CONFIG_DIR, CONFIG_FILE, CREDENTIALS_FILE, RAW_DIR };
|
|
70
|
+
|
|
71
|
+
/** CLI flags accepted by `jobcrawl init`. */
interface InitOptions {
  /** Overwrite existing config/credentials files (--force). */
  force?: boolean;
}
|
|
74
|
+
|
|
75
|
+
export async function initCommand(opts: InitOptions): Promise<void> {
|
|
76
|
+
const created: string[] = [];
|
|
77
|
+
const skipped: string[] = [];
|
|
78
|
+
|
|
79
|
+
await mkdir(RAW_DIR, { recursive: true, mode: 0o700 });
|
|
80
|
+
created.push(`${RAW_DIR}/`);
|
|
81
|
+
|
|
82
|
+
if (!existsSync(CONFIG_FILE) || opts.force) {
|
|
83
|
+
await writeFile(CONFIG_FILE, generateDefaultConfig());
|
|
84
|
+
created.push(CONFIG_FILE);
|
|
85
|
+
} else {
|
|
86
|
+
skipped.push(CONFIG_FILE);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
if (!existsSync(CREDENTIALS_FILE) || opts.force) {
|
|
90
|
+
const defaultCredentials = {
|
|
91
|
+
yc: {
|
|
92
|
+
algoliaAppId: "",
|
|
93
|
+
algoliaApiKey: "",
|
|
94
|
+
},
|
|
95
|
+
};
|
|
96
|
+
await writeFile(
|
|
97
|
+
CREDENTIALS_FILE,
|
|
98
|
+
JSON.stringify(defaultCredentials, null, 2) + "\n",
|
|
99
|
+
{ mode: 0o600 },
|
|
100
|
+
);
|
|
101
|
+
created.push(CREDENTIALS_FILE);
|
|
102
|
+
} else {
|
|
103
|
+
skipped.push(CREDENTIALS_FILE);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
const lines: string[] = [];
|
|
107
|
+
for (const f of created) lines.push(`Created ${f}`);
|
|
108
|
+
for (const f of skipped) lines.push(`Skipped ${f} (already exists)`);
|
|
109
|
+
lines.push(
|
|
110
|
+
"",
|
|
111
|
+
`Add your YC Algolia credentials to ${CREDENTIALS_FILE}, then run:`,
|
|
112
|
+
"",
|
|
113
|
+
" jobcrawl crawl --keywords \"engineer\"",
|
|
114
|
+
);
|
|
115
|
+
|
|
116
|
+
bus.emit("output:message", { text: lines.join("\n"), style: "success" });
|
|
117
|
+
}
|