jobcrawl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. package/.prettierrc.json +10 -0
  2. package/CHANGELOG.md +40 -0
  3. package/README.md +232 -0
  4. package/dist/core/aggregators/yc.d.ts +7 -0
  5. package/dist/core/aggregators/yc.js +320 -0
  6. package/dist/core/browser.d.ts +30 -0
  7. package/dist/core/browser.js +196 -0
  8. package/dist/core/cache.d.ts +13 -0
  9. package/dist/core/cache.js +41 -0
  10. package/dist/core/detect-provider.d.ts +7 -0
  11. package/dist/core/detect-provider.js +125 -0
  12. package/dist/core/discover-careers.d.ts +18 -0
  13. package/dist/core/discover-careers.js +92 -0
  14. package/dist/core/extract-jobs.d.ts +14 -0
  15. package/dist/core/extract-jobs.js +36 -0
  16. package/dist/core/fetch-page.d.ts +11 -0
  17. package/dist/core/fetch-page.js +39 -0
  18. package/dist/core/format-output.d.ts +2 -0
  19. package/dist/core/format-output.js +59 -0
  20. package/dist/core/match-jobs.d.ts +6 -0
  21. package/dist/core/match-jobs.js +43 -0
  22. package/dist/core/providers/ashby.d.ts +6 -0
  23. package/dist/core/providers/ashby.js +58 -0
  24. package/dist/core/providers/generic.d.ts +6 -0
  25. package/dist/core/providers/generic.js +294 -0
  26. package/dist/core/providers/greenhouse.d.ts +6 -0
  27. package/dist/core/providers/greenhouse.js +47 -0
  28. package/dist/core/providers/lever.d.ts +7 -0
  29. package/dist/core/providers/lever.js +60 -0
  30. package/dist/core/providers/yc.d.ts +7 -0
  31. package/dist/core/providers/yc.js +320 -0
  32. package/dist/core/resolve-iframe.d.ts +6 -0
  33. package/dist/core/resolve-iframe.js +51 -0
  34. package/dist/core/save-raw.d.ts +4 -0
  35. package/dist/core/save-raw.js +13 -0
  36. package/dist/data/companies.d.ts +9 -0
  37. package/dist/data/companies.js +2849 -0
  38. package/dist/entrypoints/cli/app.d.ts +3 -0
  39. package/dist/entrypoints/cli/app.js +91 -0
  40. package/dist/entrypoints/cli/components/crawl-view.d.ts +1 -0
  41. package/dist/entrypoints/cli/components/crawl-view.js +94 -0
  42. package/dist/entrypoints/cli/components/discover-view.d.ts +1 -0
  43. package/dist/entrypoints/cli/components/discover-view.js +67 -0
  44. package/dist/entrypoints/cli/crawl-aggregators.d.ts +26 -0
  45. package/dist/entrypoints/cli/crawl-aggregators.js +76 -0
  46. package/dist/entrypoints/cli/crawl-url.d.ts +26 -0
  47. package/dist/entrypoints/cli/crawl-url.js +54 -0
  48. package/dist/entrypoints/cli/crawl.d.ts +32 -0
  49. package/dist/entrypoints/cli/crawl.js +108 -0
  50. package/dist/entrypoints/cli/discover.d.ts +10 -0
  51. package/dist/entrypoints/cli/discover.js +69 -0
  52. package/dist/entrypoints/cli/index.d.ts +2 -0
  53. package/dist/entrypoints/cli/index.js +197 -0
  54. package/dist/entrypoints/cli/init.d.ts +9 -0
  55. package/dist/entrypoints/cli/init.js +94 -0
  56. package/dist/entrypoints/cli/plain.d.ts +6 -0
  57. package/dist/entrypoints/cli/plain.js +77 -0
  58. package/dist/events.d.ts +114 -0
  59. package/dist/events.js +17 -0
  60. package/dist/orchestrators/crawl-all.d.ts +2 -0
  61. package/dist/orchestrators/crawl-all.js +66 -0
  62. package/dist/orchestrators/discover-all.d.ts +10 -0
  63. package/dist/orchestrators/discover-all.js +39 -0
  64. package/dist/threads/pool.d.ts +5 -0
  65. package/dist/threads/pool.js +23 -0
  66. package/dist/threads/process-url.d.ts +9 -0
  67. package/dist/threads/process-url.js +229 -0
  68. package/dist/types/index.d.ts +83 -0
  69. package/dist/types/index.js +6 -0
  70. package/dist/utils/config.d.ts +17 -0
  71. package/dist/utils/config.js +57 -0
  72. package/dist/utils/google-search.d.ts +19 -0
  73. package/dist/utils/google-search.js +139 -0
  74. package/dist/utils/llm.d.ts +8 -0
  75. package/dist/utils/llm.js +25 -0
  76. package/package.json +42 -0
  77. package/src/core/aggregators/yc.ts +415 -0
  78. package/src/core/browser.ts +239 -0
  79. package/src/core/detect-provider.ts +162 -0
  80. package/src/core/discover-careers.ts +117 -0
  81. package/src/core/extract-jobs.ts +50 -0
  82. package/src/core/fetch-page.ts +41 -0
  83. package/src/core/format-output.ts +80 -0
  84. package/src/core/match-jobs.ts +56 -0
  85. package/src/core/providers/ashby.ts +84 -0
  86. package/src/core/providers/generic.ts +332 -0
  87. package/src/core/providers/greenhouse.ts +74 -0
  88. package/src/core/providers/lever.ts +90 -0
  89. package/src/core/resolve-iframe.ts +59 -0
  90. package/src/core/save-raw.ts +18 -0
  91. package/src/data/companies.ts +2859 -0
  92. package/src/entrypoints/cli/app.tsx +173 -0
  93. package/src/entrypoints/cli/components/crawl-view.tsx +163 -0
  94. package/src/entrypoints/cli/components/discover-view.tsx +138 -0
  95. package/src/entrypoints/cli/crawl-aggregators.ts +112 -0
  96. package/src/entrypoints/cli/crawl-url.ts +87 -0
  97. package/src/entrypoints/cli/crawl.ts +163 -0
  98. package/src/entrypoints/cli/discover.ts +96 -0
  99. package/src/entrypoints/cli/index.ts +252 -0
  100. package/src/entrypoints/cli/init.ts +117 -0
  101. package/src/entrypoints/cli/plain.ts +104 -0
  102. package/src/events.ts +79 -0
  103. package/src/orchestrators/crawl-all.ts +96 -0
  104. package/src/orchestrators/discover-all.ts +61 -0
  105. package/src/threads/pool.ts +29 -0
  106. package/src/threads/process-url.ts +312 -0
  107. package/src/types/index.ts +110 -0
  108. package/src/utils/config.ts +79 -0
  109. package/src/utils/google-search.ts +155 -0
  110. package/src/utils/llm.ts +33 -0
  111. package/test/integration/process-url.test.ts +301 -0
  112. package/test/integration/providers/ashby.test.ts +163 -0
  113. package/test/integration/providers/greenhouse.test.ts +191 -0
  114. package/test/integration/providers/lever.test.ts +188 -0
  115. package/test/unit/config.test.ts +64 -0
  116. package/test/unit/detect-provider.test.ts +165 -0
  117. package/test/unit/events.test.ts +104 -0
  118. package/test/unit/format-output.test.ts +165 -0
  119. package/test/unit/match-jobs.test.ts +257 -0
  120. package/test/unit/pool.test.ts +74 -0
  121. package/test/unit/providers/generic.test.ts +139 -0
  122. package/test/unit/resolve-iframe.test.ts +100 -0
  123. package/tsconfig.json +19 -0
  124. package/vitest.config.ts +7 -0
@@ -0,0 +1,9 @@
import { type Target, type UrlTarget, type SearchCriteria, type CrawlResult, type BrowserOptions } from "../types/index.js";
/**
 * Crawl a single careers URL and return matched jobs.
 * Resolves with an error-bearing CrawlResult instead of rejecting.
 */
export declare function processUrl(target: UrlTarget, criteria: SearchCriteria, options?: {
    saveRaw?: boolean;
    browser?: BrowserOptions;
}): Promise<CrawlResult>;
/**
 * Crawl any target: slug targets probe ATS providers directly,
 * URL targets go through the processUrl pipeline.
 */
export declare function processTarget(target: Target, criteria: SearchCriteria, options?: {
    saveRaw?: boolean;
    browser?: BrowserOptions;
}): Promise<CrawlResult>;
@@ -0,0 +1,229 @@
1
+ import { createHash } from "node:crypto";
2
+ import { setTimeout as delay } from "node:timers/promises";
3
+ import { bus } from "../events.js";
4
+ import { probePage } from "../core/fetch-page.js";
5
+ import { BrowserSession, BrowserNotAvailableError, resolveJobUrls, } from "../core/browser.js";
6
+ import { detectProvider } from "../core/detect-provider.js";
7
+ import { extractViaApi, extractFromHtml } from "../core/extract-jobs.js";
8
+ import { matchJobs } from "../core/match-jobs.js";
9
+ import { isSlugTarget, } from "../types/index.js";
/** Yield to the event loop so Ink can re-render between steps. */
const tick = () => delay(0);
/** Derive a short, stable identifier for a URL (sha256 hex prefix). */
function urlId(url) {
    const digest = createHash("sha256").update(url).digest("hex");
    return `u_${digest.slice(0, 5)}`;
}
/**
 * Crawl a single careers URL and return matched jobs.
 *
 * Pipeline: probe via plain HTTP GET → detect the ATS provider from the
 * static HTML → extract via the provider's JSON API (Tier 1) or fall back
 * to a rendered-browser HTML extraction (Tier 2) → filter with matchJobs.
 *
 * Progress events ("url:fetching", "url:detecting", "url:extracting",
 * "url:matching", then "url:done" or "url:failed") are emitted on the shared
 * bus, each followed by `await tick()` so the Ink UI can re-render.
 *
 * Never rejects: any error is caught and reported as a CrawlResult with
 * `error` set and empty job lists.
 */
export async function processUrl(target, criteria, options) {
    const id = urlId(target.url);
    // Display name: explicit override, otherwise the URL's hostname.
    const company = target.company ?? new URL(target.url).hostname;
    const start = Date.now();
    try {
        // Step 1: Quick probe — simple HTTP GET (no browser)
        bus.emit("url:fetching", { urlId: id, url: target.url, company });
        await tick();
        const probe = await probePage(target.url);
        // Step 2: Detect provider from static HTML
        const detection = detectProvider(probe.html, probe.finalUrl);
        bus.emit("url:detecting", {
            urlId: id,
            provider: detection.provider,
        });
        await tick();
        let jobs;
        if (detection.provider !== "unknown" && detection.boardToken) {
            // Tier 1: ATS JSON API — no browser needed
            bus.emit("url:extracting", { urlId: id });
            await tick();
            try {
                jobs = await extractViaApi(detection.provider, detection.boardToken, target.url, criteria, options?.saveRaw);
            }
            catch {
                // API failed, fall through to Tier 2
                jobs = await fallbackToHtml(id, target.url, company, criteria, options?.saveRaw, options?.browser);
            }
        }
        else {
            // Tier 2: Render with agent-browser, then extract
            jobs = await fallbackToHtml(id, target.url, company, criteria, options?.saveRaw, options?.browser);
        }
        // Apply company name override
        if (target.company) {
            for (const job of jobs) {
                job.company = target.company;
            }
        }
        // Match against criteria
        const matched = matchJobs(jobs, criteria);
        bus.emit("url:matching", {
            urlId: id,
            matched: matched.length,
            total: jobs.length,
        });
        await tick();
        bus.emit("url:done", {
            urlId: id,
            company,
            matched: matched.length,
            total: jobs.length,
        });
        return {
            url: target.url,
            provider: detection.provider,
            jobs: matched,
            allJobs: jobs,
            error: null,
            durationMs: Date.now() - start,
        };
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        bus.emit("url:failed", { urlId: id, company, error: message });
        // Error result still carries timing; provider is unknown at this point.
        return {
            url: target.url,
            provider: "unknown",
            jobs: [],
            allJobs: [],
            error: message,
            durationMs: Date.now() - start,
        };
    }
}
/** Canonical public board URL for each probeable ATS provider. */
const PROVIDER_SOURCE_URLS = {
    greenhouse(slug) {
        return `https://boards.greenhouse.io/${slug}`;
    },
    ashby(slug) {
        return `https://jobs.ashbyhq.com/${slug}`;
    },
    lever(slug) {
        return `https://jobs.lever.co/${slug}`;
    },
};
/** Providers to try, in order, when a slug target has no provider hint. */
const PROBE_ORDER = ["greenhouse", "ashby", "lever"];
/**
 * Dispatch a crawl target to the right pipeline: slug targets go through
 * the ATS-probing path, URL targets through the fetch/detect/extract path.
 */
export async function processTarget(target, criteria, options) {
    return isSlugTarget(target)
        ? processSlug(target, criteria, options)
        : processUrl(target, criteria, options);
}
/**
 * Crawl a slug-style target by probing ATS providers directly.
 *
 * Tries each candidate provider's JSON API with the target's slug — the
 * hinted provider if given, otherwise every provider in PROBE_ORDER — and
 * returns results from the first probe that succeeds. If all probes fail,
 * falls back to crawling `target.fallback` as a plain URL when provided.
 *
 * Emits the same bus events as processUrl, plus "target:probing" when a
 * provider probe fails and the next one is tried. Never rejects.
 */
async function processSlug(target, criteria, options) {
    const id = "s_" + createHash("sha256").update(target.slug).digest("hex").slice(0, 5);
    const company = target.company;
    const start = Date.now();
    try {
        // Build provider order: hint first, then probe all
        const providers = [];
        if (target.provider) {
            providers.push(target.provider);
        }
        else {
            for (const p of PROBE_ORDER) {
                providers.push(p);
            }
        }
        // Try each provider
        for (const provider of providers) {
            // Public board URL for display/events; falls back to the bare slug
            // for providers without a known URL pattern.
            const sourceUrl = PROVIDER_SOURCE_URLS[provider]?.(target.slug) ?? target.slug;
            bus.emit("url:fetching", { urlId: id, url: sourceUrl, company });
            await tick();
            bus.emit("url:detecting", { urlId: id, provider });
            await tick();
            bus.emit("url:extracting", { urlId: id });
            await tick();
            try {
                const jobs = await extractViaApi(provider, target.slug, sourceUrl, criteria, options?.saveRaw);
                // Apply company name
                for (const job of jobs) {
                    job.company = company;
                }
                const matched = matchJobs(jobs, criteria);
                bus.emit("url:matching", { urlId: id, matched: matched.length, total: jobs.length });
                await tick();
                bus.emit("url:done", { urlId: id, company, matched: matched.length, total: jobs.length });
                return {
                    url: sourceUrl,
                    provider,
                    jobs: matched,
                    allJobs: jobs,
                    error: null,
                    durationMs: Date.now() - start,
                };
            }
            catch {
                // This provider didn't work, try next
                bus.emit("target:probing", { urlId: id, provider });
                await tick();
            }
        }
        // All providers failed — try fallback URL
        if (target.fallback) {
            return processUrl({ url: target.fallback, company: target.company }, criteria, options);
        }
        // No fallback
        const message = `No ATS provider found for slug "${target.slug}" and no fallback URL provided`;
        bus.emit("url:failed", { urlId: id, company, error: message });
        return {
            url: target.slug,
            provider: "unknown",
            jobs: [],
            allJobs: [],
            error: message,
            durationMs: Date.now() - start,
        };
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        bus.emit("url:failed", { urlId: id, company, error: message });
        return {
            url: target.slug,
            provider: "unknown",
            jobs: [],
            allJobs: [],
            error: message,
            durationMs: Date.now() - start,
        };
    }
}
/**
 * Tier 2 extraction: render the page in a browser session and extract jobs
 * from the resulting HTML.
 *
 * After rendering, provider detection is re-run on the rendered HTML —
 * client-side apps often reveal their ATS only after hydration — and if an
 * ATS is found the JSON API path is used instead. When all extracted jobs
 * share the page URL (container extraction), per-job URLs are resolved via
 * click-capture with resolveJobUrls.
 *
 * If no browser is available (BrowserNotAvailableError), degrades to a plain
 * HTTP GET plus generic HTML extraction. Other errors propagate. The browser
 * session is always closed, even on error.
 */
async function fallbackToHtml(id, url, company, criteria, saveRaw, browser) {
    const session = new BrowserSession({
        networkTimeout: browser?.networkTimeout,
    });
    try {
        bus.emit("url:rendering", { urlId: id, url });
        await tick();
        await session.open(url);
        const [html, finalUrl] = await Promise.all([
            session.getHtml(),
            session.getUrl(),
        ]);
        // Re-detect on rendered HTML
        const recheck = detectProvider(html, finalUrl);
        if (recheck.provider !== "unknown" && recheck.boardToken) {
            bus.emit("url:detecting", { urlId: id, provider: recheck.provider });
            await tick();
            return extractViaApi(recheck.provider, recheck.boardToken, url, criteria, saveRaw);
        }
        bus.emit("url:extracting", { urlId: id });
        await tick();
        let jobs = extractFromHtml(html, finalUrl);
        // Click-capture: resolve individual URLs for container-extracted jobs
        const allShareSourceUrl = jobs.length > 0 && jobs.every((j) => j.url === finalUrl);
        if (allShareSourceUrl) {
            bus.emit("url:resolving-urls", { urlId: id, count: jobs.length });
            await tick();
            jobs = await resolveJobUrls(jobs, finalUrl, session, {
                maxBubbleLevels: browser?.maxBubbleLevels,
            });
        }
        return jobs;
    }
    catch (err) {
        if (err instanceof BrowserNotAvailableError) {
            // Graceful fallback: simple HTTP GET + generic HTML extraction.
            // Skip ATS re-detection — the caller already tried the API path.
            bus.emit("url:fetching", { urlId: id, url, company });
            await tick();
            const probe = await probePage(url);
            bus.emit("url:extracting", { urlId: id });
            await tick();
            return extractFromHtml(probe.html, url);
        }
        throw err;
    }
    finally {
        await session.close();
    }
}
@@ -0,0 +1,83 @@
/** ATS platforms the crawler can extract from. */
export type Provider = "greenhouse" | "lever" | "ashby" | "workday" | "bamboohr" | "workable" | "generic";
/** Multi-company job boards crawled as a whole. */
export type Aggregator = "yc";
/** Per-aggregator entry in the user config. */
export interface AggregatorConfig {
    type: Aggregator;
    enabled: boolean;
}
/** A single normalized job posting, regardless of source. */
export interface Job {
    id: string;
    title: string;
    company: string;
    location: string | null;
    workMode: "remote" | "onsite" | "hybrid" | null;
    department: string | null;
    /** Link to the individual posting. */
    url: string;
    /** Page or endpoint the job was extracted from. */
    sourceUrl: string;
    provider: Provider | Aggregator | "unknown";
    description: string | null;
    postedAt: string | null;
    extractedAt: string;
    /** Raw provider payload, when available. */
    raw: Record<string, unknown> | null;
}
/** User-supplied filters for matching jobs; null fields mean "no filter". */
export interface SearchCriteria {
    keywords: string[];
    excludeKeywords: string[];
    location: string | null;
    workMode: ("remote" | "onsite" | "hybrid")[] | null;
    departments: string[] | null;
    role: string[] | null;
    roleType: string[] | null;
    jobType: string[] | null;
    minExperience: number[] | null;
    companyStage: string[] | null;
    industry: string[] | null;
    companySize: string[] | null;
    hasSalary: boolean | null;
    hasEquity: boolean | null;
    hasInterviewProcess: boolean | null;
    visaSponsorship: boolean | null;
}
/** Target addressed by an ATS board slug, with optional provider hint. */
export interface SlugTarget {
    company: string;
    slug: string;
    provider?: Provider;
    /** Careers URL to crawl if no ATS provider responds for the slug. */
    fallback?: string;
}
/** Target addressed by a careers-page URL. */
export interface UrlTarget {
    url: string;
    company?: string;
}
export type Target = SlugTarget | UrlTarget;
export declare function isSlugTarget(t: Target): t is SlugTarget;
export declare function isUrlTarget(t: Target): t is UrlTarget;
/** Tuning knobs for the rendered-browser fallback. */
export interface BrowserOptions {
    networkTimeout?: number;
    maxBubbleLevels?: number;
}
export interface CrawlOptions {
    concurrency: number;
    saveRaw?: boolean;
    browser?: BrowserOptions;
}
/** Outcome of crawling one target; `error` is non-null on failure. */
export interface CrawlResult {
    url: string;
    provider: Provider | "unknown";
    /** Jobs that passed the SearchCriteria filter. */
    jobs: Job[];
    /** Every job extracted, before filtering. */
    allJobs: Job[];
    error: string | null;
    durationMs: number;
}
export interface CrawlAllResult {
    jobs: Job[];
    results: CrawlResult[];
    totalDurationMs: number;
}
/** Result of ATS detection; boardToken is null when none was found. */
export interface ProviderDetection {
    provider: Provider | "unknown";
    boardToken: string | null;
}
/** Fetched page HTML plus the post-redirect URL. */
export interface FetchResult {
    html: string;
    finalUrl: string;
}
export type OutputFormat = "json" | "table" | "markdown" | "csv";
@@ -0,0 +1,6 @@
/** Type guard: target addressed by an ATS board slug. */
export function isSlugTarget(t) {
    return Reflect.has(t, "slug");
}
/** Type guard: target addressed by a careers-page URL. */
export function isUrlTarget(t) {
    return Reflect.has(t, "url");
}
@@ -0,0 +1,17 @@
import type { Target, AggregatorConfig } from "../types/index.js";
/** Shape of the user's jobcrawl config file (YAML or JSON). */
export interface Config {
    aggregators?: AggregatorConfig[];
    /** Companies/URLs to crawl. */
    companies: Target[];
    /** Optional default runtime settings. */
    defaults?: {
        concurrency?: number;
        browser?: {
            networkTimeout?: number;
            maxBubbleLevels?: number;
        };
    };
}
/** Load and validate a config file; `.json` parses as JSON, otherwise YAML. */
export declare function loadConfig(filePath: string): Promise<Config>;
/**
 * Parse URLs from stdin (newline-delimited).
 */
export declare function parseUrlList(input: string): Target[];
@@ -0,0 +1,57 @@
1
+ import { readFile } from "node:fs/promises";
2
+ import yaml from "js-yaml";
3
+ import { z } from "zod/v4";
// Slug target: company crawled via its ATS board slug, with an optional
// provider hint and a fallback careers URL.
const SlugTargetSchema = z.object({
    company: z.string(),
    slug: z.string(),
    provider: z
        .enum(["greenhouse", "lever", "ashby", "workday", "bamboohr", "workable"])
        .optional(),
    fallback: z.url().optional(),
});
// URL target: crawl a careers page directly; company name is optional.
const UrlTargetSchema = z.object({
    url: z.url(),
    company: z.string().optional(),
});
const TargetSchema = z.union([SlugTargetSchema, UrlTargetSchema]);
// Aggregators default to enabled when listed.
const AggregatorSchema = z.object({
    type: z.enum(["yc"]),
    enabled: z.boolean().default(true),
});
const BrowserSchema = z.object({
    networkTimeout: z.number().int().positive().optional(),
    maxBubbleLevels: z.number().int().min(0).max(10).optional(),
});
// Top-level config schema; `companies` is the only required key.
const ConfigSchema = z.object({
    aggregators: z.array(AggregatorSchema).optional(),
    companies: z.array(TargetSchema),
    defaults: z
        .object({
        concurrency: z.number().int().positive().optional(),
        browser: BrowserSchema.optional(),
    })
        .optional(),
});
/**
 * Load and validate a config file.
 * Files ending in `.json` are parsed as JSON; everything else as YAML.
 * Throws when the file cannot be read, parsed, or fails schema validation.
 */
export async function loadConfig(filePath) {
    const content = await readFile(filePath, "utf-8");
    let parsed;
    if (filePath.endsWith(".json")) {
        parsed = JSON.parse(content);
    }
    else {
        // YAML (default for .yaml, .yml, or anything else)
        parsed = yaml.load(content);
    }
    const result = ConfigSchema.parse(parsed);
    return result;
}
/**
 * Parse URLs from stdin (newline-delimited).
 */
export function parseUrlList(input) {
    const targets = [];
    for (const rawLine of input.split("\n")) {
        const line = rawLine.trim();
        // Skip blank lines and #-comments.
        if (line.length === 0 || line.startsWith("#")) {
            continue;
        }
        targets.push({ url: line });
    }
    return targets;
}
@@ -0,0 +1,19 @@
/** A single web-search hit. */
export interface SearchResult {
    title: string;
    url: string;
    /** Result snippet (may be empty, e.g. for heuristic hits). */
    snippet: string;
}
/**
 * Web search via DuckDuckGo HTML lite.
 * Falls back to heuristic URL guessing if DDG blocks the request.
 *
 * NOTE: Both Google and DDG block plain HTTP scraping with captchas.
 * For reliable search, this needs agent-browser (real Chrome).
 * The heuristic fallback covers most tech companies.
 */
export declare function webSearch(query: string): Promise<SearchResult[]>;
/**
 * Heuristic career URL guessing for common tech companies.
 * Tries common career page patterns and verifies they return 200.
 */
export declare function guessCareerUrls(company: string): Promise<SearchResult[]>;
@@ -0,0 +1,139 @@
1
+ import { probePage } from "../core/fetch-page.js";
2
+ import { parse } from "node-html-parser";
/**
 * Web search via DuckDuckGo HTML lite.
 * Falls back to heuristic URL guessing if DDG blocks the request.
 *
 * NOTE: Both Google and DDG block plain HTTP scraping with captchas.
 * For reliable search, this needs agent-browser (real Chrome).
 * The heuristic fallback covers most tech companies.
 */
export async function webSearch(query) {
    let results = [];
    try {
        results = await ddgSearch(query);
    }
    catch {
        // DDG blocked or unreachable — deliberately return no results
        // rather than propagating; callers treat [] as "search unavailable".
    }
    return results;
}
/**
 * Scrape DuckDuckGo's HTML endpoint for search results.
 *
 * Parses `a.result__a` links out of html.duckduckgo.com, unwraps DDG's
 * `/l/?uddg=` redirect links, drops duckduckgo.com-internal and malformed
 * URLs, dedupes by URL, and returns at most 10 results with title + snippet.
 * Throws if the page fetch itself fails (caller treats that as "blocked").
 */
async function ddgSearch(query) {
    const encoded = encodeURIComponent(query);
    const url = `https://html.duckduckgo.com/html/?q=${encoded}`;
    const { html } = await probePage(url);
    const root = parse(html);
    const results = [];
    // DDG lite results use .result__a links
    const resultLinks = root.querySelectorAll("a.result__a");
    for (const link of resultLinks) {
        const href = link.getAttribute("href") ?? "";
        // Only absolute or protocol-relative links are usable.
        if (!href.startsWith("http") && !href.startsWith("//"))
            continue;
        let targetUrl;
        if (href.includes("duckduckgo.com/l/?")) {
            // Redirect link: the real destination is in the `uddg` query param.
            try {
                const params = new URL(href.startsWith("//") ? `https:${href}` : href)
                    .searchParams;
                const uddg = params.get("uddg");
                if (!uddg)
                    continue;
                targetUrl = uddg;
            }
            catch {
                continue;
            }
        }
        else {
            targetUrl = href.startsWith("//") ? `https:${href}` : href;
        }
        // Drop malformed URLs and DDG-internal destinations.
        try {
            const parsed = new URL(targetUrl);
            if (parsed.hostname.includes("duckduckgo.com"))
                continue;
        }
        catch {
            continue;
        }
        const title = link.textContent.trim();
        if (!title || title.length < 3)
            continue;
        // Snippet lives in a sibling element inside the enclosing .result div.
        const resultDiv = link.closest(".result");
        const snippetEl = resultDiv?.querySelector(".result__snippet");
        const snippet = snippetEl?.textContent?.trim().slice(0, 200) ?? "";
        if (results.some((r) => r.url === targetUrl))
            continue;
        results.push({ title, url: targetUrl, snippet });
    }
    return results.slice(0, 10);
}
/**
 * Heuristic career URL guessing for common tech companies.
 * Tries common career page patterns and verifies they return 200.
 *
 * Candidates are built from a guessed domain (/careers and /jobs variants)
 * plus well-known ATS board URLs derived from the company name. Each is
 * probed with a HEAD request (5s timeout, redirects followed); the first
 * 2xx hit wins and its post-redirect URL is returned. Returns [] when
 * nothing responds.
 */
export async function guessCareerUrls(company) {
    const domain = guessDomain(company);
    const candidates = [
        `https://www.${domain}/careers`,
        `https://${domain}/careers`,
        `https://www.${domain}/careers/`,
        `https://${domain}/careers/`,
        `https://www.${domain}/jobs`,
        `https://${domain}/jobs`,
        // ATS board URLs for well-known companies
        `https://jobs.ashbyhq.com/${company.toLowerCase().replace(/\s+/g, "")}`,
        `https://boards.greenhouse.io/${company.toLowerCase().replace(/\s+/g, "")}`,
        `https://jobs.lever.co/${company.toLowerCase().replace(/\s+/g, "")}`,
    ];
    const results = [];
    for (const url of candidates) {
        try {
            const response = await fetch(url, {
                method: "HEAD",
                redirect: "follow",
                headers: {
                    // Browser-like UA — some sites reject default fetch agents.
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                },
                signal: AbortSignal.timeout(5000),
            });
            if (response.ok) {
                results.push({
                    title: `${company} Careers`,
                    url: response.url, // use final URL after redirects
                    snippet: "",
                });
                break; // first hit is enough
            }
        }
        catch {
            // timeout or network error, try next
        }
    }
    return results;
}
/**
 * Guess a company's primary web domain from its name.
 *
 * Known tech-company names map to curated domains; anything else is
 * normalized (lowercased, non-alphanumerics stripped) with ".com" appended.
 *
 * Fix: look up overrides with Object.hasOwn instead of a truthiness check,
 * so names that collide with Object.prototype keys (e.g. "constructor")
 * cannot return an inherited non-string value.
 */
function guessDomain(company) {
    // Common tech company name → domain mappings
    const overrides = {
        openai: "openai.com",
        deepmind: "deepmind.google",
        "deep mind": "deepmind.google",
        xai: "x.ai",
        "x.ai": "x.ai",
        ai21: "ai21.com",
        "ai21 labs": "ai21.com",
        "hugging face": "huggingface.co",
        huggingface: "huggingface.co",
        "eleven labs": "elevenlabs.io",
        elevenlabs: "elevenlabs.io",
        "luma ai": "lumalabs.ai",
        "together ai": "together.ai",
        "character.ai": "character.ai",
        "character ai": "character.ai",
        "stability ai": "stability.ai",
        "mistral ai": "mistral.ai",
        "inflection ai": "inflection.ai",
    };
    const lower = company.toLowerCase();
    // Own-key check only — avoids matching inherited prototype properties.
    if (Object.hasOwn(overrides, lower))
        return overrides[lower];
    // Default: lowercase, remove spaces/punctuation, add .com
    return lower.replace(/[^a-z0-9]/g, "") + ".com";
}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Ask Claude to extract a structured answer from context.
3
+ * Uses Haiku for cheap, fast classification tasks.
4
+ */
5
+ export declare function askClaude(prompt: string, options?: {
6
+ model?: string;
7
+ maxTokens?: number;
8
+ }): Promise<string>;
@@ -0,0 +1,25 @@
1
+ import Anthropic from "@anthropic-ai/sdk";
// Lazily-created singleton Anthropic client (credentials come from the SDK's
// default environment handling).
let client = null;
/** Return the shared client, creating it on first use. */
function getClient() {
    if (!client) {
        client = new Anthropic();
    }
    return client;
}
/**
 * Ask Claude to extract a structured answer from context.
 * Uses Haiku for cheap, fast classification tasks.
 *
 * @param {string} prompt - User message sent verbatim to the model.
 * @param {{model?: string, maxTokens?: number}} [options] - Model id override
 *   (default "claude-haiku-4-5-20251001") and max output tokens (default 256).
 * @returns {Promise<string>} Trimmed text of the first text block in the reply.
 * @throws {Error} If the response contains no text block.
 */
export async function askClaude(prompt, options) {
    const anthropic = getClient();
    const response = await anthropic.messages.create({
        model: options?.model ?? "claude-haiku-4-5-20251001",
        max_tokens: options?.maxTokens ?? 256,
        messages: [{ role: "user", content: prompt }],
    });
    // Fix: `response.content[0]` threw an opaque TypeError when content was
    // empty and missed text when the first block was non-text; scan for the
    // first text block instead.
    const block = response.content.find((b) => b.type === "text");
    if (block) {
        return block.text.trim();
    }
    throw new Error("Unexpected response format from Claude API");
}
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "jobcrawl",
3
+ "version": "0.1.0",
4
+ "description": "Find your next role",
5
+ "license": "ISC",
6
+ "author": "",
7
+ "type": "module",
8
+ "bin": {
9
+ "jobcrawl": "dist/entrypoints/cli/index.js"
10
+ },
11
+ "scripts": {
12
+ "build": "tsc",
13
+ "postbuild": "chmod +x dist/entrypoints/cli/index.js",
14
+ "dev": "tsx src/entrypoints/cli/index.ts",
15
+ "format": "prettier --write .",
16
+ "format:check": "prettier --check .",
17
+ "test": "vitest run",
18
+ "test:watch": "vitest"
19
+ },
20
+ "dependencies": {
21
+ "@anthropic-ai/sdk": "^0.81.0",
22
+ "commander": "^14.0.3",
23
+ "ink": "^6.8.0",
24
+ "ink-spinner": "^5.0.0",
25
+ "js-yaml": "^4.1.1",
26
+ "node-html-parser": "^7.1.0",
27
+ "react": "^19.2.4",
28
+ "zod": "^4.3.6"
29
+ },
30
+ "optionalDependencies": {
31
+ "agent-browser": "^0.23.4"
32
+ },
33
+ "devDependencies": {
34
+ "@types/js-yaml": "^4.0.9",
35
+ "@types/node": "^25.5.0",
36
+ "@types/react": "^19.2.14",
37
+ "prettier": "^3.8.1",
38
+ "tsx": "^4.21.0",
39
+ "typescript": "^6.0.2",
40
+ "vitest": "^4.1.2"
41
+ }
42
+ }