webpeel 0.8.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. package/README.md +39 -5
  2. package/dist/cli.js +1299 -85
  3. package/dist/cli.js.map +1 -1
  4. package/dist/core/application-tracker.d.ts +85 -0
  5. package/dist/core/application-tracker.d.ts.map +1 -0
  6. package/dist/core/application-tracker.js +184 -0
  7. package/dist/core/application-tracker.js.map +1 -0
  8. package/dist/core/apply.d.ts +163 -0
  9. package/dist/core/apply.d.ts.map +1 -0
  10. package/dist/core/apply.js +817 -0
  11. package/dist/core/apply.js.map +1 -0
  12. package/dist/core/branding.d.ts +1 -1
  13. package/dist/core/branding.d.ts.map +1 -1
  14. package/dist/core/budget.d.ts +43 -0
  15. package/dist/core/budget.d.ts.map +1 -0
  16. package/dist/core/budget.js +325 -0
  17. package/dist/core/budget.js.map +1 -0
  18. package/dist/core/challenge-detection.d.ts +27 -0
  19. package/dist/core/challenge-detection.d.ts.map +1 -0
  20. package/dist/core/challenge-detection.js +436 -0
  21. package/dist/core/challenge-detection.js.map +1 -0
  22. package/dist/core/change-tracking.d.ts.map +1 -1
  23. package/dist/core/change-tracking.js +10 -1
  24. package/dist/core/change-tracking.js.map +1 -1
  25. package/dist/core/crawler.d.ts.map +1 -1
  26. package/dist/core/crawler.js +17 -4
  27. package/dist/core/crawler.js.map +1 -1
  28. package/dist/core/diff.d.ts +62 -0
  29. package/dist/core/diff.d.ts.map +1 -0
  30. package/dist/core/diff.js +289 -0
  31. package/dist/core/diff.js.map +1 -0
  32. package/dist/core/extract-listings.d.ts +39 -0
  33. package/dist/core/extract-listings.d.ts.map +1 -0
  34. package/dist/core/extract-listings.js +331 -0
  35. package/dist/core/extract-listings.js.map +1 -0
  36. package/dist/core/extract.d.ts.map +1 -1
  37. package/dist/core/extract.js +15 -2
  38. package/dist/core/extract.js.map +1 -1
  39. package/dist/core/fetcher.d.ts +29 -3
  40. package/dist/core/fetcher.d.ts.map +1 -1
  41. package/dist/core/fetcher.js +158 -20
  42. package/dist/core/fetcher.js.map +1 -1
  43. package/dist/core/human.d.ts +176 -0
  44. package/dist/core/human.d.ts.map +1 -0
  45. package/dist/core/human.js +681 -0
  46. package/dist/core/human.js.map +1 -0
  47. package/dist/core/jobs.d.ts +12 -2
  48. package/dist/core/jobs.d.ts.map +1 -1
  49. package/dist/core/jobs.js +124 -2
  50. package/dist/core/jobs.js.map +1 -1
  51. package/dist/core/map.d.ts.map +1 -1
  52. package/dist/core/map.js +14 -2
  53. package/dist/core/map.js.map +1 -1
  54. package/dist/core/paginate.d.ts +32 -0
  55. package/dist/core/paginate.d.ts.map +1 -0
  56. package/dist/core/paginate.js +107 -0
  57. package/dist/core/paginate.js.map +1 -0
  58. package/dist/core/rate-governor.d.ts +81 -0
  59. package/dist/core/rate-governor.d.ts.map +1 -0
  60. package/dist/core/rate-governor.js +238 -0
  61. package/dist/core/rate-governor.js.map +1 -0
  62. package/dist/core/search-provider.d.ts +5 -0
  63. package/dist/core/search-provider.d.ts.map +1 -1
  64. package/dist/core/search-provider.js +81 -2
  65. package/dist/core/search-provider.js.map +1 -1
  66. package/dist/core/site-search.d.ts +45 -0
  67. package/dist/core/site-search.d.ts.map +1 -0
  68. package/dist/core/site-search.js +253 -0
  69. package/dist/core/site-search.js.map +1 -0
  70. package/dist/core/strategies.d.ts +8 -0
  71. package/dist/core/strategies.d.ts.map +1 -1
  72. package/dist/core/strategies.js +185 -45
  73. package/dist/core/strategies.js.map +1 -1
  74. package/dist/core/strategy-hooks.d.ts +6 -0
  75. package/dist/core/strategy-hooks.d.ts.map +1 -1
  76. package/dist/core/strategy-hooks.js.map +1 -1
  77. package/dist/core/table-format.d.ts +31 -0
  78. package/dist/core/table-format.d.ts.map +1 -0
  79. package/dist/core/table-format.js +147 -0
  80. package/dist/core/table-format.js.map +1 -0
  81. package/dist/core/user-agents.d.ts +58 -0
  82. package/dist/core/user-agents.d.ts.map +1 -0
  83. package/dist/core/user-agents.js +159 -0
  84. package/dist/core/user-agents.js.map +1 -0
  85. package/dist/core/watch.d.ts +100 -0
  86. package/dist/core/watch.d.ts.map +1 -0
  87. package/dist/core/watch.js +368 -0
  88. package/dist/core/watch.js.map +1 -0
  89. package/dist/index.d.ts +13 -2
  90. package/dist/index.d.ts.map +1 -1
  91. package/dist/index.js +41 -4
  92. package/dist/index.js.map +1 -1
  93. package/dist/mcp/server.js +3 -0
  94. package/dist/mcp/server.js.map +1 -1
  95. package/dist/types.d.ts +73 -0
  96. package/dist/types.d.ts.map +1 -1
  97. package/dist/types.js.map +1 -1
  98. package/llms.txt +1 -1
  99. package/package.json +3 -3
package/dist/cli.js CHANGED
@@ -18,6 +18,8 @@ import { writeFileSync, readFileSync } from 'fs';
18
18
  import { peel, peelBatch, cleanup } from './index.js';
19
19
  import { checkUsage, showUsageFooter, handleLogin, handleLogout, handleUsage, loadConfig, saveConfig } from './cli-auth.js';
20
20
  import { getCache, setCache, parseTTL, clearCache, cacheStats } from './cache.js';
21
+ import { estimateTokens } from './core/markdown.js';
22
+ import { distillToBudget, budgetListings } from './core/budget.js';
21
23
  const program = new Command();
22
24
  // Read version from package.json dynamically
23
25
  import { fileURLToPath } from 'url';
@@ -115,6 +117,8 @@ function parseActions(actionStrings) {
115
117
  return { type: 'hover', selector: value };
116
118
  case 'waitFor':
117
119
  return { type: 'waitForSelector', selector: value };
120
+ case 'wait-for':
121
+ return { type: 'waitForSelector', selector: value, timeout: 10000 };
118
122
  case 'screenshot':
119
123
  return { type: 'screenshot' };
120
124
  default:
@@ -130,7 +134,7 @@ program
130
134
  .option('--html', 'Output raw HTML instead of markdown')
131
135
  .option('--text', 'Output plain text instead of markdown')
132
136
  .option('--json', 'Output as JSON')
133
- .option('-t, --timeout <ms>', 'Request timeout (ms)', parseInt, 30000)
137
+ .option('-t, --timeout <ms>', 'Request timeout (ms)', (v) => parseInt(v, 10), 30000)
134
138
  .option('--ua <agent>', 'Custom user agent')
135
139
  .option('-s, --silent', 'Silent mode (no spinner)')
136
140
  .option('--screenshot [path]', 'Take a screenshot (optionally save to file path)')
@@ -142,7 +146,8 @@ program
142
146
  .option('--only-main-content', 'Shortcut for --include-tags main,article')
143
147
  .option('-H, --header <header...>', 'Custom headers (e.g., "Authorization: Bearer token")')
144
148
  .option('--cookie <cookie...>', 'Cookies to set (e.g., "session=abc123")')
145
- .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d")')
149
+ .option('--cache <ttl>', 'Cache results locally (e.g., "5m", "1h", "1d") — default: 5m')
150
+ .option('--no-cache', 'Disable automatic caching for this request')
146
151
  .option('--links', 'Output only the links found on the page')
147
152
  .option('--images', 'Output image URLs from the page')
148
153
  .option('--meta', 'Output only the page metadata (title, description, author, etc.)')
@@ -155,57 +160,112 @@ program
155
160
  .option('--location <country>', 'ISO country code for geo-targeting (e.g., "US", "DE", "JP")')
156
161
  .option('--language <lang>', 'Language preference (e.g., "en", "de", "ja")')
157
162
  .option('--max-tokens <n>', 'Maximum token count for output (truncate if exceeded)', parseInt)
163
+ .option('--budget <n>', 'Smart token budget — distill content to fit within N tokens (heuristic, no LLM key needed)', parseInt)
164
+ .option('--extract-all', 'Auto-detect and extract repeated listing items (e.g., search results)')
165
+ .option('--scroll-extract [count]', 'Scroll page N times to load lazy content, then extract (implies --render)', (v) => parseInt(v, 10))
166
+ .option('--csv', 'Output extraction results as CSV')
167
+ .option('--table', 'Output extraction results as a formatted table')
168
+ .option('--pages <n>', 'Follow pagination "Next" links for N pages (max 10)', (v) => parseInt(v, 10))
169
+ .option('--profile <path>', 'Use a persistent browser profile directory (cookies/sessions survive between calls)')
170
+ .option('--headed', 'Run browser in headed (visible) mode — useful for profile setup and debugging')
171
+ .option('--agent', 'Agent mode: sets --json, --silent, --extract-all, and --budget 4000 (override with --budget N)')
158
172
  .action(async (url, options) => {
159
- if (!url) {
160
- console.error('Error: URL is required\n');
161
- program.help();
173
+ // --agent sets sensible defaults for AI agents; explicit flags override
174
+ if (options.agent) {
175
+ if (!options.json)
176
+ options.json = true;
177
+ if (!options.silent)
178
+ options.silent = true;
179
+ if (!options.extractAll)
180
+ options.extractAll = true;
181
+ if (options.budget === undefined)
182
+ options.budget = 4000;
183
+ }
184
+ const isJson = options.json;
185
+ // --- #5: Concise error for missing URL (no help dump) ---
186
+ if (!url || url.trim() === '') {
187
+ if (isJson) {
188
+ await writeStdout(JSON.stringify({ error: 'URL is required', code: 'URL_REQUIRED' }) + '\n');
189
+ }
190
+ else {
191
+ console.error('Error: URL is required');
192
+ console.error('Usage: webpeel <url> [options]');
193
+ console.error('Run "webpeel --help" for full usage.');
194
+ }
195
+ process.exit(1);
196
+ }
197
+ // --- #6: Helper to output JSON errors and exit ---
198
+ function exitWithJsonError(message, code) {
199
+ if (isJson) {
200
+ process.stdout.write(JSON.stringify({ error: message, code }) + '\n');
201
+ }
202
+ else {
203
+ console.error(`Error: ${message}`);
204
+ }
162
205
  process.exit(1);
163
206
  }
164
207
  // SECURITY: Enhanced URL validation
165
208
  if (url.length > 2048) {
166
- console.error('Error: URL too long (max 2048 characters)');
167
- process.exit(1);
209
+ exitWithJsonError('URL too long (max 2048 characters)', 'INVALID_URL');
168
210
  }
169
211
  // Check for control characters
170
212
  if (/[\x00-\x1F\x7F]/.test(url)) {
171
- console.error('Error: URL contains invalid control characters');
172
- process.exit(1);
213
+ exitWithJsonError('URL contains invalid control characters', 'INVALID_URL');
173
214
  }
174
215
  // Validate URL format
175
216
  try {
176
217
  const parsed = new URL(url);
177
218
  if (!['http:', 'https:'].includes(parsed.protocol)) {
178
- console.error('Error: Only HTTP and HTTPS protocols are allowed');
179
- process.exit(1);
219
+ exitWithJsonError('Only HTTP and HTTPS protocols are allowed', 'INVALID_URL');
180
220
  }
181
221
  }
182
222
  catch {
183
- console.error(`Error: Invalid URL format: ${url}`);
184
- process.exit(1);
223
+ exitWithJsonError(`Invalid URL format: ${url}`, 'INVALID_URL');
185
224
  }
186
225
  const useStealth = options.stealth || false;
187
226
  // Check usage quota
188
227
  const usageCheck = await checkUsage();
189
228
  if (!usageCheck.allowed) {
229
+ if (isJson) {
230
+ await writeStdout(JSON.stringify({ error: usageCheck.message, code: 'BLOCKED' }) + '\n');
231
+ process.exit(1);
232
+ }
190
233
  console.error(usageCheck.message);
191
234
  process.exit(1);
192
235
  }
193
236
  // Check cache first (before spinner/network)
237
+ // Default: 5m TTL for all CLI fetches unless --no-cache is set
194
238
  let cacheTtlMs;
195
- if (options.cache) {
239
+ const cacheDisabled = options.cache === false; // --no-cache sets options.cache to false
240
+ const explicitTtl = typeof options.cache === 'string' ? options.cache : undefined;
241
+ if (!cacheDisabled) {
242
+ const ttlStr = explicitTtl || '5m';
196
243
  try {
197
- cacheTtlMs = parseTTL(options.cache);
244
+ cacheTtlMs = parseTTL(ttlStr);
198
245
  }
199
246
  catch (e) {
200
- console.error(`Error: ${e.message}`);
201
- process.exit(1);
247
+ exitWithJsonError(e.message, 'FETCH_FAILED');
202
248
  }
203
- const cached = getCache(url, { render: options.render, stealth: options.stealth, selector: options.selector, format: options.html ? 'html' : options.text ? 'text' : 'markdown' });
204
- if (cached) {
249
+ const cacheOptions = {
250
+ render: options.render,
251
+ stealth: options.stealth,
252
+ selector: options.selector,
253
+ format: options.html ? 'html' : options.text ? 'text' : 'markdown',
254
+ budget: null, // Budget excluded from cache key — cache stores full content
255
+ };
256
+ const cachedResult = getCache(url, cacheOptions);
257
+ if (cachedResult) {
205
258
  if (!options.silent) {
206
- console.error(`\x1b[36m⚡ Cache hit\x1b[0m (TTL: ${options.cache})`);
259
+ console.error(`\x1b[36m⚡ Cache hit\x1b[0m (TTL: ${ttlStr})`);
260
+ }
261
+ // Apply budget to cached content (cache stores full, budget is post-process)
262
+ if (options.budget && options.budget > 0 && cachedResult.content) {
263
+ const { distillToBudget } = await import('./core/budget.js');
264
+ const fmt = options.text ? 'text' : 'markdown';
265
+ cachedResult.content = distillToBudget(cachedResult.content, options.budget, fmt);
266
+ cachedResult.tokens = Math.ceil(cachedResult.content.length / 4);
207
267
  }
208
- outputResult(cached, options);
268
+ await outputResult(cachedResult, options, { cached: true });
209
269
  process.exit(0);
210
270
  }
211
271
  }
@@ -213,8 +273,7 @@ program
213
273
  try {
214
274
  // Validate options
215
275
  if (options.wait && (options.wait < 0 || options.wait > 60000)) {
216
- console.error('Error: Wait time must be between 0 and 60000ms');
217
- process.exit(1);
276
+ throw Object.assign(new Error('Wait time must be between 0 and 60000ms'), { _code: 'FETCH_FAILED' });
218
277
  }
219
278
  // Parse custom headers
220
279
  let headers;
@@ -223,9 +282,7 @@ program
223
282
  for (const header of options.header) {
224
283
  const colonIndex = header.indexOf(':');
225
284
  if (colonIndex === -1) {
226
- console.error(`Error: Invalid header format: ${header}`);
227
- console.error('Expected format: "Key: Value"');
228
- process.exit(1);
285
+ throw Object.assign(new Error(`Invalid header format: ${header}. Expected "Key: Value"`), { _code: 'FETCH_FAILED' });
229
286
  }
230
287
  const key = header.slice(0, colonIndex).trim();
231
288
  const value = header.slice(colonIndex + 1).trim();
@@ -239,8 +296,7 @@ program
239
296
  actions = parseActions(options.action);
240
297
  }
241
298
  catch (e) {
242
- console.error(`Error: ${e.message}`);
243
- process.exit(1);
299
+ throw Object.assign(new Error(e.message), { _code: 'FETCH_FAILED' });
244
300
  }
245
301
  }
246
302
  // Parse extract
@@ -254,8 +310,7 @@ program
254
310
  llmBaseUrl: process.env.WEBPEEL_LLM_BASE_URL || 'https://api.openai.com/v1',
255
311
  };
256
312
  if (!extract.llmApiKey) {
257
- console.error('Error: --llm-extract requires OPENAI_API_KEY environment variable');
258
- process.exit(1);
313
+ throw Object.assign(new Error('--llm-extract requires OPENAI_API_KEY environment variable'), { _code: 'FETCH_FAILED' });
259
314
  }
260
315
  }
261
316
  else if (options.extract) {
@@ -264,15 +319,13 @@ program
264
319
  extract = { selectors: JSON.parse(options.extract) };
265
320
  }
266
321
  catch {
267
- console.error('Error: --extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\')');
268
- process.exit(1);
322
+ throw Object.assign(new Error('--extract must be valid JSON (e.g., \'{"title": "h1", "price": ".price"}\')'), { _code: 'FETCH_FAILED' });
269
323
  }
270
324
  }
271
325
  // Validate maxTokens
272
326
  if (options.maxTokens !== undefined) {
273
327
  if (isNaN(options.maxTokens) || options.maxTokens < 100) {
274
- console.error('Error: --max-tokens must be at least 100');
275
- process.exit(1);
328
+ throw Object.assign(new Error('--max-tokens must be at least 100'), { _code: 'FETCH_FAILED' });
276
329
  }
277
330
  }
278
331
  // Parse include-tags and exclude-tags
@@ -301,7 +354,20 @@ program
301
354
  // Build peel options
302
355
  // --stealth auto-enables --render (stealth requires browser)
303
356
  // --action auto-enables --render (actions require browser)
304
- const useRender = options.render || options.stealth || (actions && actions.length > 0) || false;
357
+ // --scroll-extract implies --render (needs browser)
358
+ const scrollExtractCount = options.scrollExtract !== undefined
359
+ ? (typeof options.scrollExtract === 'number' ? options.scrollExtract : 3)
360
+ : 0;
361
+ const useRender = options.render || options.stealth || (actions && actions.length > 0) || scrollExtractCount > 0 || false;
362
+ // Inject scroll actions when --scroll-extract is used
363
+ if (scrollExtractCount > 0) {
364
+ const scrollActions = [];
365
+ for (let i = 0; i < scrollExtractCount; i++) {
366
+ scrollActions.push({ type: 'scroll', to: 'bottom' });
367
+ scrollActions.push({ type: 'wait', ms: 1500 });
368
+ }
369
+ actions = actions ? [...actions, ...scrollActions] : scrollActions;
370
+ }
305
371
  const peelOptions = {
306
372
  render: useRender,
307
373
  stealth: options.stealth || false,
@@ -319,16 +385,20 @@ program
319
385
  raw: options.raw || false,
320
386
  actions,
321
387
  maxTokens: options.maxTokens,
388
+ // Note: budget is applied AFTER caching (so cache stores full content)
389
+ // We pass it to peel() for programmatic API compatibility, but the CLI
390
+ // also applies it post-fetch (see below) to ensure cache stores full result.
322
391
  extract,
323
392
  images: options.images || false,
324
393
  location: locationOptions,
394
+ profileDir: options.profile || undefined,
395
+ headed: options.headed || false,
325
396
  };
326
397
  // Add summary option if requested
327
398
  if (options.summary) {
328
399
  const llmApiKey = options.llmKey || process.env.OPENAI_API_KEY;
329
400
  if (!llmApiKey) {
330
- console.error('Error: --summary requires --llm-key or OPENAI_API_KEY environment variable');
331
- process.exit(1);
401
+ throw Object.assign(new Error('--summary requires --llm-key or OPENAI_API_KEY environment variable'), { _code: 'FETCH_FAILED' });
332
402
  }
333
403
  peelOptions.summary = true;
334
404
  peelOptions.llm = {
@@ -371,12 +441,162 @@ program
371
441
  delete result.screenshot;
372
442
  }
373
443
  }
374
- // Store in cache if caching is enabled
375
- if (cacheTtlMs) {
376
- setCache(url, result, cacheTtlMs, { render: options.render, stealth: useStealth, selector: options.selector, format: peelOptions.format });
444
+ // Store full result in cache (before budget distillation so cache is reusable)
445
+ if (cacheTtlMs && !cacheDisabled) {
446
+ setCache(url, result, cacheTtlMs, {
447
+ render: options.render,
448
+ stealth: useStealth,
449
+ selector: options.selector,
450
+ format: peelOptions.format,
451
+ budget: null, // Budget excluded — cache stores full content, budget applied post-cache
452
+ });
453
+ }
454
+ // Apply smart budget distillation AFTER caching (cache always stores full content)
455
+ // When --agent is set, always apply budget even with --extract-all (listings will be budgeted
456
+ // separately, but if no listings are found the content itself still needs trimming).
457
+ const skipBudgetForExtract = (options.extractAll || options.scrollExtract !== undefined) && !options.agent;
458
+ let contentTruncated = false;
459
+ if (options.budget && options.budget > 0 && !skipBudgetForExtract) {
460
+ const budgetFormat = peelOptions.format === 'text' ? 'text' : 'markdown';
461
+ const distilled = distillToBudget(result.content, options.budget, budgetFormat);
462
+ if (distilled !== result.content) {
463
+ contentTruncated = true;
464
+ result.content = distilled;
465
+ result.tokens = estimateTokens(distilled);
466
+ }
467
+ }
468
+ // --- #4: Content quality warning ---
469
+ const isHtmlContent = result.contentType ? result.contentType.toLowerCase().includes('html') : true;
470
+ const isRedirect = false; // peel() follows redirects — final result is always 200
471
+ if (result.tokens < 20 && !useRender && isHtmlContent && !isRedirect) {
472
+ const warningMsg = `Low content detected (${result.tokens} tokens). Try: webpeel ${url} --render`;
473
+ if (isJson) {
474
+ result.warning = warningMsg;
475
+ }
476
+ else {
477
+ console.error(`⚠ ${warningMsg}`);
478
+ }
479
+ }
480
+ // --- Extract-all / pagination / output formatting ---
481
+ const wantsExtractAll = options.extractAll || options.scrollExtract !== undefined;
482
+ const pagesCount = Math.min(Math.max(options.pages || 1, 1), 10);
483
+ if (wantsExtractAll) {
484
+ const { extractListings } = await import('./core/extract-listings.js');
485
+ const { findNextPageUrl } = await import('./core/paginate.js');
486
+ // We need the raw HTML for extraction. Re-fetch with format=html if needed.
487
+ let allListings = [];
488
+ // Fetch HTML for extraction
489
+ const htmlResult = peelOptions.format === 'html'
490
+ ? result
491
+ : await peel(url, { ...peelOptions, format: 'html', maxTokens: undefined });
492
+ allListings.push(...extractListings(htmlResult.content, result.url));
493
+ // Pagination: follow "Next" links
494
+ if (pagesCount > 1) {
495
+ let currentHtml = htmlResult.content;
496
+ let currentUrl = result.url;
497
+ for (let page = 1; page < pagesCount; page++) {
498
+ const nextUrl = findNextPageUrl(currentHtml, currentUrl);
499
+ if (!nextUrl)
500
+ break;
501
+ try {
502
+ const nextResult = await peel(nextUrl, { ...peelOptions, format: 'html', maxTokens: undefined });
503
+ const pageListings = extractListings(nextResult.content, nextResult.url);
504
+ allListings.push(...pageListings);
505
+ currentHtml = nextResult.content;
506
+ currentUrl = nextResult.url;
507
+ }
508
+ catch {
509
+ break; // Stop paginating on error
510
+ }
511
+ }
512
+ }
513
+ // Apply budget to listings if requested
514
+ let listingsTruncated = false;
515
+ let totalAvailableListings;
516
+ if (options.budget && options.budget > 0 && allListings.length > 0) {
517
+ const { maxItems, truncated, totalAvailable } = budgetListings(allListings.length, options.budget);
518
+ if (truncated) {
519
+ listingsTruncated = true;
520
+ totalAvailableListings = totalAvailable;
521
+ allListings = allListings.slice(0, maxItems);
522
+ }
523
+ }
524
+ // Output based on format flags
525
+ if (options.csv) {
526
+ const csvOutput = formatListingsCsv(allListings);
527
+ await writeStdout(csvOutput);
528
+ }
529
+ else if (options.table) {
530
+ const { formatTable } = await import('./core/table-format.js');
531
+ const tableRows = allListings.map(item => {
532
+ const row = {};
533
+ for (const [k, v] of Object.entries(item)) {
534
+ if (v !== undefined)
535
+ row[k] = v;
536
+ }
537
+ return row;
538
+ });
539
+ await writeStdout(formatTable(tableRows) + '\n');
540
+ }
541
+ else if (isJson) {
542
+ // Use unified envelope for JSON output
543
+ const structured = allListings;
544
+ const envelope = buildEnvelope(result, {
545
+ cached: false,
546
+ structured,
547
+ truncated: listingsTruncated || undefined,
548
+ totalAvailable: totalAvailableListings,
549
+ });
550
+ // Also include legacy fields for backward compat
551
+ envelope.listings = allListings;
552
+ envelope.count = allListings.length;
553
+ await writeStdout(JSON.stringify(envelope, null, 2) + '\n');
554
+ }
555
+ else {
556
+ // Formatted text output
557
+ if (allListings.length === 0) {
558
+ await writeStdout('No listings found.\n');
559
+ }
560
+ else {
561
+ const truncNote = listingsTruncated && totalAvailableListings
562
+ ? ` (${totalAvailableListings} total — budget limited to ${allListings.length})`
563
+ : '';
564
+ await writeStdout(`Found ${allListings.length} listings${truncNote}:\n\n`);
565
+ allListings.forEach((item, i) => {
566
+ const pricePart = item.price ? ` — ${item.price}` : '';
567
+ const line = `${i + 1}. ${item.title}${pricePart}\n`;
568
+ process.stdout.write(line);
569
+ if (item.link) {
570
+ process.stdout.write(` ${item.link}\n`);
571
+ }
572
+ process.stdout.write('\n');
573
+ });
574
+ }
575
+ }
576
+ }
577
+ else if (options.csv || options.table) {
578
+ // CSV / table output for --extract (CSS selector extraction)
579
+ if (result.extracted) {
580
+ const rows = normaliseExtractedToRows(result.extracted);
581
+ if (options.csv) {
582
+ await writeStdout(formatListingsCsv(rows));
583
+ }
584
+ else {
585
+ const { formatTable } = await import('./core/table-format.js');
586
+ await writeStdout(formatTable(rows) + '\n');
587
+ }
588
+ }
589
+ else {
590
+ console.error('--csv / --table require --extract-all or --extract to produce structured data.');
591
+ }
592
+ }
593
+ else {
594
+ // Output results (default path)
595
+ await outputResult(result, options, {
596
+ cached: false,
597
+ truncated: contentTruncated || undefined,
598
+ });
377
599
  }
378
- // Output results
379
- await outputResult(result, options);
380
600
  // Clean up and exit
381
601
  await cleanup();
382
602
  process.exit(0);
@@ -385,6 +605,14 @@ program
385
605
  if (spinner) {
386
606
  spinner.fail('Failed to fetch');
387
607
  }
608
+ // --- #6: Consistent JSON error output ---
609
+ if (isJson) {
610
+ const errMsg = error instanceof Error ? error.message : 'Unknown error';
611
+ const errCode = classifyErrorCode(error);
612
+ await writeStdout(JSON.stringify({ error: errMsg, code: errCode }) + '\n');
613
+ await cleanup();
614
+ process.exit(1);
615
+ }
388
616
  if (error instanceof Error) {
389
617
  console.error(`\nError: ${error.message}`);
390
618
  // Provide actionable hints based on error type
@@ -418,22 +646,122 @@ program
418
646
  // Search command
419
647
  program
420
648
  .command('search <query>')
421
- .description('Search the web (DuckDuckGo by default, or Brave with --provider brave)')
649
+ .description('Search the web (DuckDuckGo by default, or use --site for site-specific search)')
422
650
  .option('-n, --count <n>', 'Number of results (1-10)', '5')
651
+ .option('--top <n>', 'Limit results (alias for --count)')
423
652
  .option('--provider <provider>', 'Search provider: duckduckgo (default) or brave')
424
653
  .option('--search-api-key <key>', 'API key for the search provider (or env WEBPEEL_BRAVE_API_KEY)')
654
+ .option('--site <site>', 'Search a specific site (e.g. ebay, amazon, github). Run "webpeel sites" for full list.')
425
655
  .option('--json', 'Output as JSON')
656
+ .option('--urls-only', 'Output only URLs, one per line (pipe-friendly)')
657
+ .option('--table', 'Output site-search results as a formatted table (requires --site)')
658
+ .option('--csv', 'Output site-search results as CSV (requires --site)')
659
+ .option('--budget <n>', 'Token budget for site-search result content', parseInt)
426
660
  .option('-s, --silent', 'Silent mode')
427
661
  .action(async (query, options) => {
428
662
  const isJson = options.json;
429
663
  const isSilent = options.silent;
430
- const count = parseInt(options.count) || 5;
664
+ // --top overrides --count when both are provided
665
+ const count = parseInt(options.top ?? options.count) || 5;
431
666
  // Check usage quota
432
667
  const usageCheck = await checkUsage();
433
668
  if (!usageCheck.allowed) {
434
669
  console.error(usageCheck.message);
435
670
  process.exit(1);
436
671
  }
672
+ // ── --site: site-specific structured search ───────────────────────────
673
+ if (options.site) {
674
+ const spinner = isSilent ? null : ora(`Searching ${options.site}...`).start();
675
+ try {
676
+ const { buildSiteSearchUrl } = await import('./core/site-search.js');
677
+ const siteResult = buildSiteSearchUrl(options.site, query);
678
+ // Fetch the raw HTML (needed for listing extraction)
679
+ const htmlResult = await peel(siteResult.url, {
680
+ format: 'html',
681
+ timeout: 30000,
682
+ });
683
+ if (spinner) {
684
+ spinner.succeed(`Fetched ${siteResult.site} in ${htmlResult.elapsed}ms`);
685
+ }
686
+ // Extract listings from the HTML
687
+ const { extractListings } = await import('./core/extract-listings.js');
688
+ let listings = extractListings(htmlResult.content, siteResult.url);
689
+ // Apply budget if requested
690
+ if (options.budget && options.budget > 0 && listings.length > 0) {
691
+ const { budgetListings } = await import('./core/budget.js');
692
+ const { maxItems } = budgetListings(listings.length, options.budget);
693
+ listings = listings.slice(0, maxItems);
694
+ }
695
+ // Show usage footer
696
+ if (usageCheck.usageInfo && !isSilent) {
697
+ showUsageFooter(usageCheck.usageInfo, usageCheck.isAnonymous || false, false);
698
+ }
699
+ // Output
700
+ if (options.csv) {
701
+ const rows = listings.map(item => {
702
+ const row = {};
703
+ for (const [k, v] of Object.entries(item)) {
704
+ if (v !== undefined)
705
+ row[k] = v;
706
+ }
707
+ return row;
708
+ });
709
+ await writeStdout(formatListingsCsv(rows));
710
+ }
711
+ else if (options.table) {
712
+ const { formatTable } = await import('./core/table-format.js');
713
+ const rows = listings.map(item => {
714
+ const row = {};
715
+ for (const [k, v] of Object.entries(item)) {
716
+ if (v !== undefined)
717
+ row[k] = v;
718
+ }
719
+ return row;
720
+ });
721
+ await writeStdout(formatTable(rows) + '\n');
722
+ }
723
+ else if (isJson) {
724
+ const envelope = {
725
+ site: siteResult.site,
726
+ query: siteResult.query,
727
+ url: siteResult.url,
728
+ count: listings.length,
729
+ items: listings,
730
+ elapsed: htmlResult.elapsed,
731
+ };
732
+ await writeStdout(JSON.stringify(envelope, null, 2) + '\n');
733
+ }
734
+ else {
735
+ if (listings.length === 0) {
736
+ await writeStdout('No listings found.\n');
737
+ }
738
+ else {
739
+ await writeStdout(`Found ${listings.length} listings on ${siteResult.site}:\n\n`);
740
+ for (const [i, item] of listings.entries()) {
741
+ const pricePart = item.price ? ` — ${item.price}` : '';
742
+ process.stdout.write(`${i + 1}. ${item.title}${pricePart}\n`);
743
+ if (item.link)
744
+ process.stdout.write(` ${item.link}\n`);
745
+ process.stdout.write('\n');
746
+ }
747
+ }
748
+ }
749
+ await cleanup();
750
+ process.exit(0);
751
+ }
752
+ catch (error) {
753
+ if (spinner)
754
+ spinner.fail('Site search failed');
755
+ if (error instanceof Error) {
756
+ console.error(`\nError: ${error.message}`);
757
+ }
758
+ else {
759
+ console.error('\nError: Unknown error occurred');
760
+ }
761
+ await cleanup();
762
+ process.exit(1);
763
+ }
764
+ }
437
765
  const spinner = isSilent ? null : ora('Searching...').start();
438
766
  try {
439
767
  const { getSearchProvider } = await import('./core/search-provider.js');
@@ -456,16 +784,15 @@ program
456
784
  if (usageCheck.usageInfo && !isSilent) {
457
785
  showUsageFooter(usageCheck.usageInfo, usageCheck.isAnonymous || false, false);
458
786
  }
459
- if (isJson) {
787
+ if (options.urlsOnly) {
788
+ // Pipe-friendly: one URL per line
789
+ for (const result of results) {
790
+ await writeStdout(result.url + '\n');
791
+ }
792
+ }
793
+ else if (isJson) {
460
794
  const jsonStr = JSON.stringify(results, null, 2);
461
- await new Promise((resolve, reject) => {
462
- process.stdout.write(jsonStr + '\n', (err) => {
463
- if (err)
464
- reject(err);
465
- else
466
- resolve();
467
- });
468
- });
795
+ await writeStdout(jsonStr + '\n');
469
796
  }
470
797
  else {
471
798
  for (const result of results) {
@@ -497,6 +824,44 @@ program
497
824
  process.exit(1);
498
825
  }
499
826
  });
// Sites command — list all supported site templates.
// Prints the templates returned by listSites(), optionally filtered by
// --category, either as raw JSON (--json) or as a listing grouped by category.
program
  .command('sites')
  .description('List all sites supported by "webpeel search --site <site>"')
  .option('--json', 'Output as JSON')
  .option('--category <cat>', 'Filter by category (shopping, social, tech, jobs, general, real-estate, food)')
  .action(async (options) => {
    const { listSites } = await import('./core/site-search.js');
    let sites = listSites();
    if (options.category) {
      sites = sites.filter((s) => s.category === options.category);
    }
    if (options.json) {
      await writeStdout(JSON.stringify(sites, null, 2) + '\n');
      process.exit(0);
    }
    // Group by category for pretty output
    const byCategory = new Map();
    for (const site of sites) {
      if (!byCategory.has(site.category)) {
        byCategory.set(site.category, []);
      }
      byCategory.get(site.category).push(site);
    }
    // Known categories are printed in this fixed order. Categories that are
    // not in the list are appended afterwards (in first-seen order) so that
    // their sites are not silently left out of the listing.
    const categoryOrder = ['shopping', 'general', 'social', 'tech', 'jobs', 'real-estate', 'food'];
    const sortedCategories = [
      ...categoryOrder.filter((c) => byCategory.has(c)),
      ...[...byCategory.keys()].filter((c) => !categoryOrder.includes(c)),
    ];
    console.log('\nWebPeel Site-Aware Search — supported sites\n');
    console.log('Usage: webpeel search --site <id> "<query>"\n');
    for (const cat of sortedCategories) {
      const catSites = byCategory.get(cat);
      // String() guards against a template whose category is not a string.
      const catName = String(cat);
      const label = catName.charAt(0).toUpperCase() + catName.slice(1);
      console.log(` ${label}:`);
      for (const s of catSites) {
        console.log(` ${s.id.padEnd(16)} ${s.name}`);
      }
      console.log('');
    }
    process.exit(0);
  });
500
865
  // Batch command
501
866
  program
502
867
  .command('batch [file]')
@@ -632,12 +997,12 @@ program
632
997
  program
633
998
  .command('crawl <url>')
634
999
  .description('Crawl a website starting from a URL')
635
- .option('--max-pages <number>', 'Maximum number of pages to crawl (default: 10, max: 100)', parseInt, 10)
636
- .option('--max-depth <number>', 'Maximum depth to crawl (default: 2, max: 5)', parseInt, 2)
1000
+ .option('--max-pages <number>', 'Maximum number of pages to crawl (default: 10, max: 100)', (v) => parseInt(v, 10), 10)
1001
+ .option('--max-depth <number>', 'Maximum depth to crawl (default: 2, max: 5)', (v) => parseInt(v, 10), 2)
637
1002
  .option('--allowed-domains <domains...>', 'Only crawl these domains (default: same as starting URL)')
638
1003
  .option('--exclude <patterns...>', 'Exclude URLs matching these regex patterns')
639
1004
  .option('--ignore-robots', 'Ignore robots.txt (default: respect robots.txt)')
640
- .option('--rate-limit <ms>', 'Rate limit between requests in ms (default: 1000)', parseInt, 1000)
1005
+ .option('--rate-limit <ms>', 'Rate limit between requests in ms (default: 1000)', (v) => parseInt(v, 10), 1000)
641
1006
  .option('-r, --render', 'Use headless browser for all pages')
642
1007
  .option('--stealth', 'Use stealth mode for all pages')
643
1008
  .option('-s, --silent', 'Silent mode (no spinner)')
@@ -710,7 +1075,7 @@ program
710
1075
  .description('Discover all URLs on a domain (sitemap + crawl)')
711
1076
  .option('--no-sitemap', 'Skip sitemap.xml discovery')
712
1077
  .option('--no-crawl', 'Skip homepage crawl')
713
- .option('--max <n>', 'Maximum URLs to discover (default: 5000)', parseInt, 5000)
1078
+ .option('--max <n>', 'Maximum URLs to discover (default: 5000)', (v) => parseInt(v, 10), 5000)
714
1079
  .option('--include <patterns...>', 'Include only URLs matching these regex patterns')
715
1080
  .option('--exclude <patterns...>', 'Exclude URLs matching these regex patterns')
716
1081
  .option('--json', 'Output as JSON')
@@ -751,6 +1116,177 @@ program
751
1116
  process.exit(1);
752
1117
  }
753
1118
  });
// Watch command - monitor a URL for changes / assertion failures.
// Validates the URL, interval, numeric options and assertions up front, then
// hands a plain options object to watch() from ./core/watch.js. Exits 1 on any
// validation error or when watch() rejects, 0 when watch() resolves.
program
  .command('watch <url>')
  .description('Monitor a URL for changes and assertion failures')
  .option('--interval <duration>', 'Check interval (e.g. 30s, 5m, 1h)', '5m')
  .option('--assert <condition...>', 'Assertion(s) to check (e.g. "status=200" "body.health=ok")')
  .option('--webhook <url>', 'POST this URL on assertion failure or content change')
  .option('-t, --timeout <ms>', 'Per-request timeout in ms', (v) => parseInt(v, 10), 10000)
  .option('--max-checks <n>', 'Stop after N checks (default: unlimited)', (v) => parseInt(v, 10))
  .option('--json', 'Output each check as NDJSON to stdout')
  .option('-s, --silent', 'Only output on failures/changes')
  .option('-r, --render', 'Use browser rendering for checks')
  .action(async (url, options) => {
    const { watch: runWatch, parseDuration, parseAssertion } = await import('./core/watch.js');
    // Thrown values are not guaranteed to be Error instances.
    const messageOf = (e) => (e instanceof Error ? e.message : String(e));
    // Validate URL
    try {
      const parsed = new URL(url);
      if (!['http:', 'https:'].includes(parsed.protocol)) {
        console.error('Error: Only HTTP and HTTPS protocols are allowed');
        process.exit(1);
      }
    } catch {
      console.error(`Error: Invalid URL format: ${url}`);
      process.exit(1);
    }
    // Parse interval
    let intervalMs;
    try {
      intervalMs = parseDuration(options.interval);
    } catch (e) {
      console.error(`Error: ${messageOf(e)}`);
      process.exit(1);
    }
    // The option parsers use parseInt, which yields NaN for non-numeric input.
    // Reject that here instead of passing NaN on to the watcher.
    if (Number.isNaN(options.timeout) || options.timeout < 0) {
      console.error('Error: --timeout must be a non-negative integer (milliseconds)');
      process.exit(1);
    }
    if (options.maxChecks !== undefined && (Number.isNaN(options.maxChecks) || options.maxChecks < 0)) {
      console.error('Error: --max-checks must be a non-negative integer');
      process.exit(1);
    }
    // Parse assertions
    const assertions = [];
    if (options.assert && Array.isArray(options.assert)) {
      for (const expr of options.assert) {
        try {
          assertions.push(parseAssertion(expr));
        } catch (e) {
          console.error(`Error: ${messageOf(e)}`);
          process.exit(1);
        }
      }
    }
    // The banner goes to stderr, so stdout carries only check output.
    if (!options.json && !options.silent) {
      const intervalLabel = options.interval;
      const assertLabel = assertions.length > 0
        ? ` with ${assertions.length} assertion(s)`
        : '';
      process.stderr.write(`Watching ${url} every ${intervalLabel}${assertLabel}. Press Ctrl+C to stop.\n`);
    }
    const watchOptions = {
      url,
      intervalMs,
      assertions,
      webhookUrl: options.webhook,
      timeout: options.timeout,
      maxChecks: options.maxChecks,
      render: options.render || false,
      json: options.json || false,
      silent: options.silent || false,
    };
    try {
      await runWatch(watchOptions);
    } catch (error) {
      console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
      process.exit(1);
    }
    process.exit(0);
  });
// Diff command - semantic diff against last snapshot.
// Validates the URL, calls diffUrl() from ./core/diff.js and prints the result
// either as JSON (--json) or as a human-readable change list. In JSON mode
// errors are also written to stdout as { error, code } objects. Exits 0 after
// a completed diff, 1 on an invalid URL or when diffUrl() rejects.
//
// NOTE(review): --last and --against are declared below but this handler never
// reads options.last / options.against, so --against currently has no effect
// here. Confirm how diffUrl() expects the comparison target before wiring it.
program
  .command('diff <url>')
  .description('Show semantic diff between current content and the last tracked snapshot')
  .option('--last', 'Compare against last tracked snapshot (default)')
  .option('--against <snapshot-url>', 'Compare against the snapshot stored for a different URL')
  .option('--fields <fields>', 'For JSON responses: only diff these fields (comma-separated dot-notation)')
  .option('--json', 'Output diff as JSON')
  .option('-r, --render', 'Use browser rendering')
  .option('-t, --timeout <ms>', 'Request timeout in ms', (v) => parseInt(v, 10), 30000)
  .option('-s, --silent', 'Silent mode (no spinner)')
  .action(async (url, options) => {
    const isJson = options.json;
    // Validate URL
    try {
      const parsed = new URL(url);
      if (!['http:', 'https:'].includes(parsed.protocol)) {
        if (isJson) {
          await writeStdout(JSON.stringify({ error: 'Only HTTP and HTTPS protocols are allowed', code: 'INVALID_URL' }) + '\n');
        } else {
          console.error('Error: Only HTTP and HTTPS protocols are allowed');
        }
        process.exit(1);
      }
    } catch {
      if (isJson) {
        await writeStdout(JSON.stringify({ error: `Invalid URL format: ${url}`, code: 'INVALID_URL' }) + '\n');
      } else {
        console.error(`Error: Invalid URL format: ${url}`);
      }
      process.exit(1);
    }
    const spinner = options.silent ? null : ora('Fetching and diffing...').start();
    try {
      const { diffUrl } = await import('./core/diff.js');
      // "a, b ,,c" -> ['a', 'b', 'c']; undefined when --fields was not given.
      const fields = options.fields
        ? options.fields.split(',').map((f) => f.trim()).filter(Boolean)
        : undefined;
      const result = await diffUrl(url, {
        render: options.render || false,
        timeout: options.timeout,
        fields,
      });
      if (spinner) {
        // Report the outcome. The previous wording, "completed in <status>",
        // was left over from an elapsed-time style message.
        spinner.succeed(`Diff completed: ${result.changed ? 'CHANGED' : 'no change'}`);
      }
      if (isJson) {
        await writeStdout(JSON.stringify(result, null, 2) + '\n');
      } else {
        // Human-readable output
        const ago = result.previousTimestamp
          ? formatRelativeTime(new Date(result.previousTimestamp))
          : 'unknown';
        console.log(`\nComparing ${result.url} (now vs ${ago})\n`);
        if (!result.changed) {
          console.log(' No changes detected.');
        } else {
          for (const change of result.changes) {
            const label = change.field ?? change.path ?? '(unknown)';
            if (change.type === 'modified') {
              console.log(` Modified: ${label} ${change.before} → ${change.after}`);
            } else if (change.type === 'added') {
              console.log(` Added: ${label} ${change.after}`);
            } else if (change.type === 'removed') {
              console.log(` Removed: ${label} ${change.before}`);
            }
          }
        }
        console.log(`\nSummary: ${result.summary}`);
      }
      await cleanup();
      process.exit(0);
    } catch (error) {
      if (spinner) {
        spinner.fail('Diff failed');
      }
      if (isJson) {
        await writeStdout(JSON.stringify({
          error: error instanceof Error ? error.message : 'Unknown error',
          code: 'FETCH_FAILED',
        }) + '\n');
      } else {
        console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
      }
      await cleanup();
      process.exit(1);
    }
  });
754
1290
  program
755
1291
  .command('login')
756
1292
  .description('Authenticate the CLI with your API key')
@@ -987,33 +1523,46 @@ program
987
1523
  // Track command - track changes on a URL
988
1524
  program
989
1525
  .command('track <url>')
990
- .description('Track changes on a URL (returns fingerprint for change detection)')
1526
+ .description('Track changes on a URL (saves snapshot for use with `webpeel diff`)')
991
1527
  .option('-s, --silent', 'Silent mode (no spinner)')
992
1528
  .option('--json', 'Output as JSON')
1529
+ .option('-r, --render', 'Use browser rendering')
993
1530
  .action(async (url, options) => {
994
1531
  const spinner = options.silent ? null : ora('Fetching and tracking...').start();
995
1532
  try {
996
- const result = await peel(url);
1533
+ // changeTracking: true saves the snapshot to ~/.webpeel/snapshots/ so that
1534
+ // `webpeel diff` can compare against it later.
1535
+ const result = await peel(url, {
1536
+ render: options.render || false,
1537
+ changeTracking: true,
1538
+ });
997
1539
  if (spinner) {
998
1540
  spinner.succeed(`Tracked in ${result.elapsed}ms`);
999
1541
  }
1542
+ const changeStatus = result.changeTracking?.changeStatus ?? 'new';
1543
+ const previousScrapeAt = result.changeTracking?.previousScrapeAt ?? null;
1000
1544
  if (options.json) {
1001
- console.log(JSON.stringify({
1545
+ await writeStdout(JSON.stringify({
1002
1546
  url: result.url,
1003
1547
  title: result.title,
1004
1548
  fingerprint: result.fingerprint,
1005
1549
  tokens: result.tokens,
1006
1550
  contentType: result.contentType,
1551
+ changeStatus,
1552
+ previousScrapeAt,
1007
1553
  lastChecked: new Date().toISOString(),
1008
- }, null, 2));
1554
+ }, null, 2) + '\n');
1009
1555
  }
1010
1556
  else {
1011
1557
  console.log(`URL: ${result.url}`);
1012
1558
  console.log(`Title: ${result.title}`);
1013
1559
  console.log(`Fingerprint: ${result.fingerprint}`);
1014
1560
  console.log(`Tokens: ${result.tokens}`);
1561
+ console.log(`Status: ${changeStatus}`);
1562
+ if (previousScrapeAt)
1563
+ console.log(`Previous check: ${previousScrapeAt}`);
1015
1564
  console.log(`Last checked: ${new Date().toISOString()}`);
1016
- console.log('\nSave this fingerprint to detect future changes.');
1565
+ console.log('\nSnapshot saved. Run `webpeel diff <url> --last` to compare future changes.');
1017
1566
  }
1018
1567
  await cleanup();
1019
1568
  process.exit(0);
@@ -1145,25 +1694,39 @@ program
1145
1694
  process.exit(1);
1146
1695
  }
1147
1696
  });
// ── Jobs command group ─────────────────────────────────────────────────────
// Parent command for the job-board subcommands registered on `jobsCmd`.
// `webpeel jobs <keywords>` is kept as a shorthand: when a positional keyword
// argument is present the shared search routine runs; with no argument the
// group's help text is printed and the process exits 0.
const jobsCmd = program
  .command('jobs')
  .description('Job board operations: search listings and auto-apply (LinkedIn, Indeed, Glassdoor, Upwork)')
  .argument('[keywords]', 'Search keywords — shorthand for "jobs search <keywords>"')
  .option('-l, --location <location>', 'Location filter')
  .option('-s, --source <source>', 'Job board: glassdoor, indeed, linkedin, or upwork (default: linkedin)', 'linkedin')
  .option('-n, --limit <number>', 'Max results (default: 25)', '25')
  .option('-d, --details <number>', 'Fetch full details for top N results (default: 0)', '0')
  .option('--json', 'Output raw JSON')
  .option('--timeout <ms>', 'Request timeout in ms (default: 30000)', '30000')
  .option('--silent', 'Silent mode (no spinner)')
  .action(async (keywords, options) => {
    if (keywords) {
      // Shorthand form: delegate to the shared search logic.
      await runJobSearch(keywords, options);
      return;
    }
    // No keywords given: show usage for the command group.
    jobsCmd.help();
    process.exit(0);
  });
1718
+ // ── Shared job-search logic (used by both `jobs` default and `jobs search`) ───
1719
+ async function runJobSearch(keywords, options) {
1160
1720
  const spinner = options.silent ? null : ora('Searching jobs...').start();
1161
1721
  try {
1162
1722
  const { searchJobs } = await import('./core/jobs.js');
1163
- const source = (['glassdoor', 'indeed', 'linkedin'].includes(options.source) ? options.source : 'linkedin');
1164
- const limit = Math.min(Math.max(parseInt(options.limit, 10) || 25, 1), 100);
1165
- const fetchDetails = Math.min(Math.max(parseInt(options.details, 10) || 0, 0), limit);
1166
- const timeout = parseInt(options.timeout, 10) || 30000;
1723
+ const VALID_SOURCES = ['glassdoor', 'indeed', 'linkedin', 'upwork'];
1724
+ const source = (VALID_SOURCES.includes((options.source ?? 'linkedin'))
1725
+ ? options.source
1726
+ : 'linkedin');
1727
+ const limit = Math.min(Math.max(parseInt(options.limit ?? '25', 10) || 25, 1), 100);
1728
+ const fetchDetails = Math.min(Math.max(parseInt(options.details ?? '0', 10) || 0, 0), limit);
1729
+ const timeout = parseInt(options.timeout ?? '30000', 10) || 30000;
1167
1730
  const result = await searchJobs({
1168
1731
  keywords,
1169
1732
  location: options.location,
@@ -1174,12 +1737,10 @@ program
1174
1737
  });
1175
1738
  if (spinner)
1176
1739
  spinner.stop();
1177
- // --json: raw output
1178
1740
  if (options.json) {
1179
1741
  await writeStdout(JSON.stringify(result, null, 2) + '\n');
1180
1742
  process.exit(0);
1181
1743
  }
1182
- // Formatted table output
1183
1744
  const totalLabel = result.totalFound >= 1000
1184
1745
  ? `${(result.totalFound / 1000).toFixed(0).replace(/\.0$/, '')}k+`
1185
1746
  : String(result.totalFound);
@@ -1189,7 +1750,6 @@ program
1189
1750
  console.log(' No jobs found.\n');
1190
1751
  process.exit(0);
1191
1752
  }
1192
- // Column widths
1193
1753
  const colNum = 3;
1194
1754
  const colTitle = 40;
1195
1755
  const colCompany = 18;
@@ -1198,18 +1758,15 @@ program
1198
1758
  const colPosted = 10;
1199
1759
  const pad = (s, w) => s.length > w ? s.slice(0, w - 1) + '…' : s.padEnd(w);
1200
1760
  const rpad = (s, w) => s.padStart(w);
1201
- // Header
1202
- console.log(` ${rpad('#', colNum)} ${pad('Title', colTitle)} ${pad('Company', colCompany)} ${pad('Location', colLocation)} ${pad('Salary', colSalary)} ${pad('Posted', colPosted)}`);
1203
- // Rows
1761
+ console.log(` ${rpad('#', colNum)} ${pad('Title', colTitle)} ${pad('Company', colCompany)} ${pad('Location', colLocation)} ${pad('Salary/Budget', colSalary)} ${pad('Posted', colPosted)}`);
1204
1762
  result.jobs.forEach((job, i) => {
1205
- const title = job.title + (job.remote ? ' 🏠' : '');
1206
- console.log(` ${rpad(String(i + 1), colNum)} ${pad(title, colTitle)} ${pad(job.company, colCompany)} ${pad(job.location, colLocation)} ${pad(job.salary || '', colSalary)} ${pad(job.postedAt || '', colPosted)}`);
1763
+ const titleStr = job.title + (job.remote ? ' 🏠' : '');
1764
+ const salaryStr = job.salary ?? ('budget' in job ? job.budget : '') ?? '';
1765
+ console.log(` ${rpad(String(i + 1), colNum)} ${pad(titleStr, colTitle)} ${pad(job.company, colCompany)} ${pad(job.location, colLocation)} ${pad(salaryStr, colSalary)} ${pad(job.postedAt ?? '', colPosted)}`);
1207
1766
  });
1208
- // Footer
1209
1767
  const timeSec = (result.timeTakenMs / 1000).toFixed(1);
1210
1768
  const detailsNote = fetchDetails > 0 ? ` | Details: ${result.detailsFetched} fetched` : '';
1211
1769
  console.log(`\nFetched ${result.jobs.length} jobs in ${timeSec}s${detailsNote}\n`);
1212
- // Detailed job cards (when --details > 0)
1213
1770
  const detailedJobs = result.jobs.filter((j) => 'description' in j);
1214
1771
  for (let i = 0; i < detailedJobs.length; i++) {
1215
1772
  const job = detailedJobs[i];
@@ -1251,7 +1808,238 @@ program
1251
1808
  }
1252
1809
  catch (error) {
1253
1810
  if (spinner)
1254
- spinner.fail('Job search failed');
1811
+ spinner.fail?.('Job search failed');
1812
+ console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
1813
+ process.exit(1);
1814
+ }
1815
+ }
// jobs search <keywords> — explicit subcommand.
// Accepts the same options as the `jobs` shorthand and forwards straight to
// runJobSearch(), so both entry points share one implementation.
jobsCmd
  .command('search <keywords>')
  .alias('s')
  .description('Search job boards for listings (LinkedIn, Indeed, Glassdoor, Upwork)')
  .option('-l, --location <location>', 'Location filter')
  .option('-s, --source <source>', 'Job board: glassdoor, indeed, linkedin, or upwork (default: linkedin)', 'linkedin')
  .option('-n, --limit <number>', 'Max results (default: 25)', '25')
  .option('-d, --details <number>', 'Fetch full details for top N results (default: 0)', '0')
  .option('--json', 'Output raw JSON')
  .option('--timeout <ms>', 'Request timeout in ms (default: 30000)', '30000')
  .option('--silent', 'Silent mode (no spinner)')
  .action((keywords, options) => runJobSearch(keywords, options));
// ── jobs apply <url> ─────────────────────────────────────────────────────────
// Stealth automated job application using human behavior simulation.
// Loads the applicant profile JSON, calls applyToJob() from ./core/apply.js and
// prints either the raw result (--json) or a short summary. Exits 1 when the
// profile cannot be loaded, when applyToJob() rejects, or when the result
// carries an `error`; exits 0 otherwise.
jobsCmd
  .command('apply <url>')
  .description('Stealth automated job application using human behavior simulation')
  .option('--profile <path>', 'Path to profile JSON file', `${process.env.HOME ?? '~'}/.webpeel/profile.json`)
  .option('--resume <path>', 'Path to resume PDF (overrides profile.resumePath)')
  .option('--mode <mode>', 'Submission mode: auto | review | dry-run (default: review)', 'review')
  .option('--session-dir <path>', 'Browser session directory (preserves login cookies)')
  .option('--llm-key <key>', 'LLM API key for custom question answers')
  .option('--llm-provider <name>', 'LLM provider: openai | anthropic (default: openai)', 'openai')
  .option('--daily-limit <n>', 'Max applications per day (default: 8)', '8')
  .option('--no-warmup', 'Skip browsing warmup phase')
  .option('--json', 'Output result as JSON')
  .option('--silent', 'Minimal output')
  .action(async (url, options) => {
    const isSilent = options.silent;
    const isJson = options.json;
    // An unrecognised --mode falls back to 'review'.
    const mode = ['auto', 'review', 'dry-run'].includes(options.mode)
      ? options.mode
      : 'review';
    if (!isSilent) {
      console.log(`\n🤖 WebPeel Auto-Apply — mode: ${mode}`);
      console.log(` URL: ${url}\n`);
    }
    // Load profile.
    // The option default is a literal "~/..." when HOME is unset (e.g. on
    // Windows), and fs does not expand "~", so resolve it against the OS home
    // directory here.
    const { homedir } = await import('os');
    const { join } = await import('path');
    const profilePath = options.profile === '~' || options.profile.startsWith('~/')
      ? join(homedir(), options.profile.slice(1))
      : options.profile;
    let profile;
    try {
      const raw = readFileSync(profilePath, 'utf-8');
      profile = JSON.parse(raw);
    } catch (error) {
      console.error(`Error: Could not load profile from ${profilePath}`);
      // Surface the cause so a missing file can be told apart from bad JSON.
      console.error(`Reason: ${error instanceof Error ? error.message : String(error)}`);
      console.error(`Run "webpeel jobs apply-setup" to create a profile.`);
      process.exit(1);
    }
    if (options.resume) {
      profile.resumePath = options.resume;
    }
    const spinner = isSilent ? null : ora('Applying...').start();
    try {
      const { applyToJob } = await import('./core/apply.js');
      const result = await applyToJob({
        url,
        profile,
        mode,
        sessionDir: options.sessionDir,
        llmKey: options.llmKey,
        llmProvider: options.llmProvider,
        dailyLimit: parseInt(options.dailyLimit, 10) || 8,
        // commander sets options.warmup to false only when --no-warmup is given.
        warmup: options.warmup !== false,
        onProgress: isSilent
          ? undefined
          : (event) => {
            if (spinner) {
              spinner.text = `[${event.stage}] ${event.message}`;
            } else {
              console.log(` [${event.stage}] ${event.message}`);
            }
          },
      });
      if (spinner) {
        spinner.stop();
      }
      if (isJson) {
        await writeStdout(JSON.stringify(result, null, 2) + '\n');
        process.exit(result.error ? 1 : 0);
      }
      const statusIcon = result.submitted ? '✅' : result.error ? '❌' : '📋';
      console.log(`\n${statusIcon} ${result.submitted
        ? 'Application submitted!'
        : result.error
          ? `Error: ${result.error}`
          : 'Application completed (not submitted)'}`);
      if (result.job.title || result.job.company) {
        // Build the line from the parts that exist so a missing title is not
        // printed as "undefined".
        const jobLine = [
          result.job.title,
          result.job.company ? `@ ${result.job.company}` : '',
        ].filter(Boolean).join(' ');
        console.log(` ${jobLine}`);
      }
      console.log(`\n Fields filled: ${result.fieldsFilled}`);
      if (result.llmAnswers > 0) {
        console.log(` LLM answers: ${result.llmAnswers}`);
      }
      if (result.fieldsSkipped.length > 0) {
        console.log(` Skipped: ${result.fieldsSkipped.join(', ')}`);
      }
      if (result.warnings.length > 0 && !isSilent) {
        console.log(`\n Warnings:`);
        result.warnings.forEach((w) => console.log(` ⚠️ ${w}`));
      }
      console.log(` Time: ${(result.elapsed / 1000).toFixed(1)}s\n`);
      process.exit(result.error ? 1 : 0);
    } catch (error) {
      if (spinner) {
        spinner.fail('Application failed');
      }
      console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
      process.exit(1);
    }
  });
// ── jobs apply-setup ─────────────────────────────────────────────────────────
// Interactive wizard to create ~/.webpeel/profile.json.
// Asks a fixed series of questions on stdin, builds the profile object and
// writes it as pretty-printed JSON under the user's home directory. Exits 1 if
// anything throws while prompting or writing.
jobsCmd
  .command('apply-setup')
  .description('Interactive setup wizard — creates ~/.webpeel/profile.json')
  .action(async () => {
    const { createInterface } = await import('readline');
    const rl = createInterface({ input: process.stdin, output: process.stdout });
    // Promise wrapper around rl.question that resolves with the trimmed answer.
    const ask = (q) => new Promise((resolve) => rl.question(q, (ans) => resolve(ans.trim())));
    console.log('\n🤖 WebPeel Apply Setup — Create your applicant profile\n');
    console.log('This creates ~/.webpeel/profile.json used by "webpeel jobs apply".\n');
    try {
      const name = await ask('Full name: ');
      const email = await ask('Email address: ');
      const phone = await ask('Phone number: ');
      const linkedin = await ask('LinkedIn URL (optional, press Enter to skip): ');
      const website = await ask('Portfolio/website URL (optional): ');
      const location = await ask('City, State (e.g. San Francisco, CA): ');
      const workAuth = await ask('Work authorization (e.g. US Citizen, Permanent Resident, H-1B, Need Sponsorship): ');
      const yearsExp = await ask('Years of experience: ');
      const currentTitle = await ask('Current/most recent job title: ');
      const skills = await ask('Skills (comma-separated, e.g. TypeScript, React, Node.js): ');
      const education = await ask('Education (e.g. B.S. Computer Science, MIT): ');
      const resumePath = await ask('Path to resume PDF (e.g. /Users/you/resume.pdf): ');
      const summary = await ask('Professional summary (1-3 sentences): ');
      const salaryMin = await ask('Minimum desired salary (optional, e.g. 120000): ');
      const salaryMax = await ask('Maximum desired salary (optional, e.g. 180000): ');
      const relocate = await ask('Willing to relocate? (y/n): ');
      const sponsorship = await ask('Need visa sponsorship? (y/n): ');
      rl.close();
      // Keep the salary range only when both bounds parse as numbers.
      // JSON.stringify turns NaN into null, which would otherwise end up in
      // the saved profile for input such as "120k".
      const minSalary = Number.parseInt(salaryMin, 10);
      const maxSalary = Number.parseInt(salaryMax, 10);
      const hasSalaryRange = Number.isFinite(minSalary) && Number.isFinite(maxSalary);
      const profileData = {
        name,
        email,
        phone,
        // Optional fields are omitted when left blank.
        ...(linkedin ? { linkedin } : {}),
        ...(website ? { website } : {}),
        location,
        workAuthorization: workAuth,
        yearsExperience: parseInt(yearsExp, 10) || 0,
        currentTitle,
        skills: skills.split(',').map((s) => s.trim()).filter(Boolean),
        education,
        resumePath,
        summary,
        ...(hasSalaryRange
          ? { salaryRange: { min: minSalary, max: maxSalary } }
          : {}),
        willingToRelocate: relocate.toLowerCase().startsWith('y'),
        needsSponsorship: sponsorship.toLowerCase().startsWith('y'),
      };
      const { mkdirSync: mk, writeFileSync: wf } = await import('fs');
      const { join: j } = await import('path');
      const { homedir: hd } = await import('os');
      const webpeelDir = j(hd(), '.webpeel');
      // recursive: true is a no-op when the directory already exists.
      mk(webpeelDir, { recursive: true });
      const profilePath = j(webpeelDir, 'profile.json');
      wf(profilePath, JSON.stringify(profileData, null, 2), 'utf-8');
      console.log(`\n✅ Profile saved to: ${profilePath}`);
      console.log('\nNext steps:');
      console.log(' 1. Apply to a job: webpeel jobs apply https://linkedin.com/jobs/view/...');
      console.log(' (First run opens a browser — log in to LinkedIn, then the session is saved)\n');
    } catch (error) {
      rl.close();
      console.error(`\nError: ${error instanceof Error ? error.message : 'Unknown error'}`);
      process.exit(1);
    }
  });
1995
+ // ── jobs apply-history ───────────────────────────────────────────────────────
1996
+ // View application history from ~/.webpeel/applications.json
1997
+ jobsCmd
1998
+ .command('apply-history')
1999
+ .description('View application history from ~/.webpeel/applications.json')
2000
+ .option('--json', 'Output as JSON')
2001
+ .option('--limit <n>', 'Number of recent applications to show (default: 20)', '20')
2002
+ .action(async (options) => {
2003
+ const isJson = options.json;
2004
+ const limit = parseInt(options.limit, 10) || 20;
2005
+ try {
2006
+ const { loadApplications } = await import('./core/apply.js');
2007
+ const allApps = loadApplications();
2008
+ const apps = allApps.slice().reverse().slice(0, limit);
2009
+ if (isJson) {
2010
+ await writeStdout(JSON.stringify(apps, null, 2) + '\n');
2011
+ process.exit(0);
2012
+ }
2013
+ if (apps.length === 0) {
2014
+ console.log('\nNo applications yet. Use "webpeel jobs apply <url>" to start.\n');
2015
+ process.exit(0);
2016
+ }
2017
+ console.log(`\n📋 Application History (${apps.length} of ${allApps.length} total)\n`);
2018
+ const colDate = 22;
2019
+ const colStatus = 10;
2020
+ const colTitle = 35;
2021
+ const colCompany = 20;
2022
+ const colMode = 8;
2023
+ const pad = (s, w) => (s.length > w ? s.slice(0, w - 1) + '…' : s.padEnd(w));
2024
+ console.log(` ${pad('Applied', colDate)} ${pad('Status', colStatus)} ${pad('Title', colTitle)} ${pad('Company', colCompany)} ${pad('Mode', colMode)}`);
2025
+ console.log(` ${'-'.repeat(colDate)} ${'-'.repeat(colStatus)} ${'-'.repeat(colTitle)} ${'-'.repeat(colCompany)} ${'-'.repeat(colMode)}`);
2026
+ for (const app of apps) {
2027
+ const date = new Date(app.appliedAt).toLocaleString('en-US', {
2028
+ month: 'short',
2029
+ day: 'numeric',
2030
+ year: 'numeric',
2031
+ hour: '2-digit',
2032
+ minute: '2-digit',
2033
+ });
2034
+ const statusEmoji = { applied: '📤', interview: '🎯', offer: '🎉', rejected: '❌', withdrawn: '🚫' }[app.status] ?? '';
2035
+ console.log(` ${pad(date, colDate)} ${pad(`${statusEmoji} ${app.status}`, colStatus)} ${pad(app.title, colTitle)} ${pad(app.company, colCompany)} ${pad(app.mode, colMode)}`);
2036
+ }
2037
+ const today = new Date().toISOString().slice(0, 10);
2038
+ const todayCount = allApps.filter(a => a.appliedAt.startsWith(today)).length;
2039
+ console.log(`\n Today: ${todayCount} application(s)\n`);
2040
+ process.exit(0);
2041
+ }
2042
+ catch (error) {
1255
2043
  console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
1256
2044
  process.exit(1);
1257
2045
  }
@@ -1451,7 +2239,7 @@ program
1451
2239
  .option('--format <fmt>', 'Image format: png (default) or jpeg', 'png')
1452
2240
  .option('--quality <n>', 'JPEG quality 1-100 (ignored for PNG)', parseInt)
1453
2241
  .option('-w, --wait <ms>', 'Wait time after page load (ms)', parseInt)
1454
- .option('-t, --timeout <ms>', 'Request timeout (ms)', parseInt, 30000)
2242
+ .option('-t, --timeout <ms>', 'Request timeout (ms)', (v) => parseInt(v, 10), 30000)
1455
2243
  .option('--stealth', 'Use stealth mode to bypass bot detection')
1456
2244
  .option('--action <actions...>', 'Page actions before screenshot (e.g., "click:.btn" "wait:2000")')
1457
2245
  .option('-o, --output <path>', 'Output file path (default: screenshot.png)')
@@ -1558,11 +2346,376 @@ program
1558
2346
  process.exit(1);
1559
2347
  }
1560
2348
  });
2349
+ // ── Top-level Apply command group ──────────────────────────────────────────
2350
+ //
2351
+ // webpeel apply submit <url> — submit a job application (alias: s)
2352
+ // webpeel apply init — interactive profile setup wizard
2353
+ // webpeel apply status — show application stats
2354
+ // webpeel apply list — list tracked applications (with filters)
2355
+ // webpeel apply rate — show rate-governor status
2356
+ const applyCmd = program
2357
+ .command('apply')
2358
+ .description('Auto-apply pipeline: submit applications, track history, manage rate limits');
2359
+ // apply submit <url> (alias: s) — auto-apply to a job posting
2360
applyCmd
    .command('submit <url>')
    .description('Auto-apply to a job posting')
    .alias('s')
    .option('--profile-path <path>', 'Path to apply profile JSON', `${process.env.HOME ?? '~'}/.webpeel/profile.json`)
    .option('--browser-profile <path>', 'Path to persistent browser data dir', `${process.env.HOME ?? '~'}/.webpeel/browser-profile`)
    .option('--headed', 'Run browser visibly (default for apply)')
    .option('--headless', 'Run browser invisibly')
    .option('--confirm', 'Pause for confirmation before submit (default: true)')
    .option('--no-confirm', 'Skip confirmation, auto-submit')
    .option('--dry-run', 'Go through flow but do not submit')
    .option('--generate-cover', 'Generate tailored cover letter (needs OPENAI_API_KEY)')
    .option('--timeout <ms>', 'Timeout in ms (default: 300000)', '300000')
    .option('--json', 'Output result as JSON')
    .option('--silent', 'Silent mode')
    /**
     * Loads the applicant profile, runs the apply pipeline for `url`, and
     * reports the outcome (JSON or human-readable). Always terminates the
     * process: exit code 0 when the result is submitted without error, 1 otherwise.
     *
     * NOTE(review): --headed, --headless and --generate-cover are parsed but
     * not forwarded to applyToJob here — confirm whether that is intended.
     */
    .action(async (url, options) => {
        const isSilent = options.silent;
        const isJson = options.json;

        // Load profile
        const profilePath = options.profilePath;
        let profile;
        try {
            const raw = readFileSync(profilePath, 'utf-8');
            profile = JSON.parse(raw);
        }
        catch {
            // Missing file and malformed JSON are reported the same way.
            const msg = `Could not load profile from ${profilePath}. Run "webpeel apply init" to create one.`;
            if (isJson) {
                await writeStdout(JSON.stringify({ error: msg }) + '\n');
            }
            else {
                console.error(`Error: ${msg}`);
            }
            process.exit(1);
        }

        // Commander stores a negatable flag under its positive name:
        // `--no-confirm` yields `options.confirm === false`; there is no
        // `options.noConfirm` property. Unset or `--confirm` keeps review mode.
        const skipConfirmation = options.confirm === false;
        let mode = 'review';
        if (options.dryRun) {
            mode = 'dry-run';
        }
        else if (skipConfirmation) {
            mode = 'auto';
        }

        const spinner = isSilent ? null : ora('Applying...').start();
        try {
            const { applyToJob } = await import('./core/apply.js');
            const result = await applyToJob({
                url,
                profile,
                // Use sessionDir for persistent session storage (renamed from browserProfile)
                sessionDir: options.browserProfile,
                mode,
                // Fall back to 5 minutes when the value is not a usable number.
                timeout: parseInt(options.timeout, 10) || 300_000,
            });
            if (spinner)
                spinner.stop();

            // Normalize result to a consistent output shape
            // NOTE(review): a dry run presumably returns submitted=false, which
            // is reported as a failure below (exit code 1) — confirm intent.
            const success = result.submitted && !result.error;
            const jobTitle = result.job?.title ?? '';
            const jobCompany = result.job?.company ?? '';
            if (isJson) {
                await writeStdout(JSON.stringify(result, null, 2) + '\n');
                process.exit(success ? 0 : 1);
            }
            const icon = success ? '✅' : '❌';
            console.log(`\n${icon} ${success ? 'Application submitted!' : `Failed: ${result.error ?? 'Unknown error'}`}`);
            if (jobTitle)
                console.log(` ${jobTitle}${jobCompany ? ` @ ${jobCompany}` : ''}`);
            if (options.dryRun)
                console.log(' (Dry run — not submitted)');
            console.log(` Time: ${(result.elapsed / 1000).toFixed(1)}s\n`);
            process.exit(success ? 0 : 1);
        }
        catch (error) {
            if (spinner)
                spinner.fail('Application failed');
            const msg = error instanceof Error ? error.message : 'Unknown error';
            if (isJson) {
                await writeStdout(JSON.stringify({ error: msg }) + '\n');
            }
            else {
                console.error(`Error: ${msg}`);
            }
            process.exit(1);
        }
    });
2439
+ // apply init — interactive profile setup
2440
applyCmd
    .command('init')
    .description('Interactive profile setup — creates ~/.webpeel/profile.json')
    /**
     * Prompts for applicant details on stdin and writes them as JSON to
     * <home>/.webpeel/profile.json, creating the directory when needed.
     * Optional answers left blank are omitted from the saved profile.
     */
    .action(async () => {
        const readline = await import('readline');
        const prompt = readline.createInterface({ input: process.stdin, output: process.stdout });
        // Ask one question and resolve with the trimmed answer.
        const ask = (question) => new Promise((resolve) => {
            prompt.question(question, (answer) => {
                resolve(answer.trim());
            });
        });

        console.log('\n🤖 WebPeel Apply Setup — Create your applicant profile\n');
        console.log('This creates ~/.webpeel/profile.json used by "webpeel apply submit".\n');
        try {
            const name = await ask('Full name: ');
            const email = await ask('Email address: ');
            const phone = await ask('Phone number (optional): ');
            const resumePath = await ask('Path to resume PDF (e.g. /Users/you/resume.pdf): ');
            const currentTitle = await ask('Current/most recent job title: ');
            const yearsExp = await ask('Years of experience: ');
            const skills = await ask('Skills (comma-separated, e.g. TypeScript, React, Node.js): ');
            const education = await ask('Education (e.g. B.S. Computer Science, MIT): ');
            const location = await ask('City, State (e.g. San Francisco, CA): ');
            const workAuth = await ask('Work authorization (e.g. US Citizen, Permanent Resident, H-1B, Need Sponsorship): ');
            const linkedinUrl = await ask('LinkedIn URL (optional): ');
            const websiteUrl = await ask('Portfolio/website URL (optional): ');
            const desiredSalary = await ask('Desired salary (optional, e.g. $150,000): ');
            prompt.close();

            const fs = await import('fs');
            const path = await import('path');
            const os = await import('os');
            const configDir = path.join(os.homedir(), '.webpeel');
            fs.mkdirSync(configDir, { recursive: true });

            // Keys are added in a fixed order so the saved JSON layout is stable;
            // blank optional answers are skipped entirely.
            const profile = { name, email };
            if (phone) {
                profile.phone = phone;
            }
            profile.resumePath = resumePath;
            profile.currentTitle = currentTitle;
            profile.yearsExperience = parseInt(yearsExp, 10) || 0;
            profile.skills = skills
                .split(',')
                .map((entry) => entry.trim())
                .filter(Boolean);
            profile.education = education;
            profile.location = location;
            profile.workAuthorization = workAuth;
            if (linkedinUrl) {
                profile.linkedinUrl = linkedinUrl;
            }
            if (websiteUrl) {
                profile.websiteUrl = websiteUrl;
            }
            if (desiredSalary) {
                profile.desiredSalary = desiredSalary;
            }

            const profilePath = path.join(configDir, 'profile.json');
            fs.writeFileSync(profilePath, JSON.stringify(profile, null, 2), 'utf-8');

            console.log(`\n✅ Profile saved to: ${profilePath}`);
            console.log('\nNext steps:');
            console.log(' • Apply to a job: webpeel apply submit <url>');
            console.log(' • Dry run first: webpeel apply submit <url> --dry-run');
            console.log(' • View stats: webpeel apply status\n');
        }
        catch (error) {
            prompt.close();
            const reason = error instanceof Error ? error.message : 'Unknown error';
            console.error(`\nError: ${reason}`);
            process.exit(1);
        }
    });
2498
+ // apply status — application stats summary
2499
applyCmd
    .command('status')
    .description('Show application stats')
    .option('--json', 'Output as JSON')
    /**
     * Prints aggregate application statistics from ApplicationTracker,
     * either as JSON or as a short text report, then exits.
     */
    .action(async (options) => {
        try {
            const trackerModule = await import('./core/application-tracker.js');
            const stats = new trackerModule.ApplicationTracker().stats();

            if (options.json) {
                await writeStdout(JSON.stringify(stats, null, 2) + '\n');
                process.exit(0);
            }

            // Print a heading followed by one "label count" line per entry;
            // nothing is printed for an empty breakdown.
            const printBreakdown = (heading, counts) => {
                if (Object.keys(counts).length === 0) {
                    return;
                }
                console.log(heading);
                Object.entries(counts).forEach(([label, count]) => {
                    console.log(` ${label.padEnd(12)} ${count}`);
                });
            };

            console.log('\n📊 Application Stats\n');
            console.log(` Total: ${stats.total}`);
            console.log(` Today: ${stats.today}`);
            console.log(` This week: ${stats.thisWeek}`);
            printBreakdown('\n By Platform:', stats.byPlatform);
            printBreakdown('\n By Status:', stats.byStatus);
            console.log('');
            process.exit(0);
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : 'Unknown error';
            console.error(`Error: ${reason}`);
            process.exit(1);
        }
    });
2536
+ // apply list — list applications with optional filters
2537
applyCmd
    .command('list')
    .description('List tracked applications')
    .option('--platform <platform>', 'Filter by platform (e.g. linkedin, upwork)')
    .option('--status <status>', 'Filter by status (applied, interview, rejected, offer, ...)')
    .option('--since <date>', 'Filter to applications on or after this date (YYYY-MM-DD)')
    .option('--json', 'Output as JSON')
    .option('--limit <n>', 'Max records to show (default: 50)', '50')
    /**
     * Lists tracked applications, optionally filtered by platform, status and
     * start date, as JSON or as a fixed-width table. Exits 0 on success, 1 on error.
     */
    .action(async (options) => {
        try {
            const { ApplicationTracker } = await import('./core/application-tracker.js');
            const tracker = new ApplicationTracker();

            // Only a positive integer is a usable limit. A negative value would
            // otherwise reach slice(0, -n) and drop the LAST n records instead.
            const requestedLimit = Number.parseInt(options.limit, 10);
            const limit = Number.isInteger(requestedLimit) && requestedLimit > 0 ? requestedLimit : 50;

            const records = tracker.list({
                platform: options.platform,
                status: options.status,
                since: options.since,
            }).slice(0, limit);

            if (options.json) {
                await writeStdout(JSON.stringify(records, null, 2) + '\n');
                process.exit(0);
            }
            if (records.length === 0) {
                console.log('\nNo applications found.\n');
                process.exit(0);
            }

            console.log(`\n📋 Applications (${records.length})\n`);
            const colDate = 12;
            const colStatus = 10;
            const colTitle = 35;
            const colCompany = 20;
            // Fit a cell to `w` columns: truncate with an ellipsis or right-pad.
            // Missing fields render as blanks instead of throwing.
            const pad = (s, w) => {
                const text = String(s ?? '');
                return text.length > w ? text.slice(0, w - 1) + '…' : text.padEnd(w);
            };
            console.log(` ${'Date'.padEnd(colDate)} ${'Status'.padEnd(colStatus)} ${'Title'.padEnd(colTitle)} ${'Company'.padEnd(colCompany)}`);
            console.log(` ${'-'.repeat(colDate)} ${'-'.repeat(colStatus)} ${'-'.repeat(colTitle)} ${'-'.repeat(colCompany)}`);
            for (const r of records) {
                // Keep only the first 10 characters of appliedAt (the date
                // part if it is an ISO timestamp — TODO confirm format).
                const dateStr = String(r.appliedAt ?? '').slice(0, 10);
                console.log(` ${pad(dateStr, colDate)} ${pad(r.status, colStatus)} ${pad(r.title, colTitle)} ${pad(r.company, colCompany)}`);
            }
            console.log('');
            process.exit(0);
        }
        catch (error) {
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            process.exit(1);
        }
    });
2583
+ // apply rate — rate governor status
2584
applyCmd
    .command('rate')
    .description('Show rate governor status (daily limits, cooldown, next allowed time)')
    .option('--json', 'Output as JSON')
    .option('--reset-cooldown', 'Clear any active cooldown (manual override)')
    /**
     * Shows the RateGovernor state and configuration, or clears the cooldown
     * when --reset-cooldown is given. Exits 0 on success, 1 on error.
     */
    .action(async (options) => {
        try {
            const { RateGovernor, formatDuration } = await import('./core/rate-governor.js');
            const governor = new RateGovernor();

            if (options.resetCooldown) {
                governor.resetCooldown();
                console.log('✅ Cooldown cleared.');
                process.exit(0);
            }

            const state = governor.getState();
            const config = governor.getConfig();
            const check = governor.canApply();

            if (options.json) {
                await writeStdout(JSON.stringify({
                    state,
                    config,
                    canApply: check.allowed,
                    reason: check.reason,
                    waitMs: check.waitMs,
                    nextDelayMs: governor.getNextDelay(),
                }, null, 2) + '\n');
                process.exit(0);
            }

            console.log('\n⏱ Rate Governor Status\n');
            console.log(` Today's applications: ${state.todayCount} / ${config.maxPerDay}`);
            console.log(` Total applications: ${state.totalApplications}`);
            console.log(` Can apply now: ${check.allowed ? '✅ Yes' : '❌ No'}`);
            if (!check.allowed && check.reason) {
                console.log(` Reason: ${check.reason}`);
            }
            if (!check.allowed && check.waitMs) {
                console.log(` Wait time: ${formatDuration(check.waitMs)}`);
            }
            // cooldownUntil is compared against Date.now(), so it is treated as
            // an epoch-ms deadline. Report the cooldown only while that deadline
            // is still in the future; a stale, already-elapsed value used to be
            // shown as "Active" with zero time remaining.
            const cooldownRemaining = state.cooldownUntil - Date.now();
            if (state.cooldownUntil > 0 && cooldownRemaining > 0) {
                console.log(` Cooldown: Active (${formatDuration(cooldownRemaining)} remaining)`);
            }
            console.log(` Min delay: ${formatDuration(config.minDelayMs)}`);
            console.log(` Max delay: ${formatDuration(config.maxDelayMs)}`);
            console.log(` Active hours: ${config.activeHours[0]}:00 – ${config.activeHours[1]}:00`);
            console.log(` Weekdays only: ${config.weekdaysOnly ? 'Yes' : 'No'}`);
            console.log('');
            process.exit(0);
        }
        catch (error) {
            console.error(`Error: ${error instanceof Error ? error.message : 'Unknown error'}`);
            process.exit(1);
        }
    });
1561
2638
  program.parse();
1562
2639
  // ============================================================
2640
+ // Time formatting helper
2641
+ // ============================================================
2642
/**
 * Format a past Date relative to now (e.g. "2h ago", "5m ago").
 *
 * Each unit is rounded to the nearest whole number. A timestamp that lies in
 * the future (for example because of clock skew) is treated as "now" rather
 * than producing a negative value such as "-5s ago".
 *
 * @param {Date} past - The moment to describe.
 * @returns {string} One of "<n>s ago", "<n>m ago", "<n>h ago" or "<n>d ago".
 */
function formatRelativeTime(past) {
    // Clamp so that future timestamps never yield negative durations.
    const diffMs = Math.max(0, Date.now() - past.getTime());
    const diffSec = Math.round(diffMs / 1000);
    if (diffSec < 60) {
        return `${diffSec}s ago`;
    }
    const diffMin = Math.round(diffSec / 60);
    if (diffMin < 60) {
        return `${diffMin}m ago`;
    }
    const diffHr = Math.round(diffMin / 60);
    if (diffHr < 24) {
        return `${diffHr}h ago`;
    }
    const diffDay = Math.round(diffHr / 24);
    return `${diffDay}d ago`;
}
2659
+ // ============================================================
2660
+ // Error classification for JSON error output (#6)
2661
+ // ============================================================
2662
/**
 * Map an arbitrary thrown value to a stable error-code string for JSON output.
 *
 * Resolution order:
 *   1. Non-Error values                      -> 'FETCH_FAILED'
 *   2. A truthy custom `_code` on the error  -> returned as-is
 *   3. Error name / message heuristics       -> TIMEOUT, BLOCKED, DNS_FAILED, INVALID_URL
 *   4. System error codes on the error or its direct `cause`
 *      (e.g. ETIMEDOUT, ENOTFOUND) — these only apply when no heuristic
 *      matched, so previously classified errors keep their classification
 *   5. Anything else                         -> 'FETCH_FAILED'
 *
 * @param {unknown} error - The caught value.
 * @returns {string} The error code.
 */
function classifyErrorCode(error) {
    if (!(error instanceof Error)) {
        return 'FETCH_FAILED';
    }
    // Check for our custom _code first (set in pre-fetch validation)
    if (error._code) {
        return error._code;
    }
    const msg = error.message.toLowerCase();
    const name = error.name || '';
    if (name === 'TimeoutError' || msg.includes('timeout') || msg.includes('timed out')) {
        return 'TIMEOUT';
    }
    if (name === 'BlockedError' || msg.includes('blocked') || msg.includes('403') || msg.includes('cloudflare')) {
        return 'BLOCKED';
    }
    // NOTE(review): 'not found' also matches HTTP "404 Not Found" style
    // messages, which are then reported as DNS_FAILED — confirm this is intended.
    if (msg.includes('enotfound') || msg.includes('getaddrinfo') || msg.includes('dns resolution failed') || msg.includes('not found')) {
        return 'DNS_FAILED';
    }
    if (msg.includes('invalid url') || msg.includes('invalid hostname') || msg.includes('only http')) {
        return 'INVALID_URL';
    }
    // Fallback: system error codes. Socket timeouts read
    // "connect ETIMEDOUT <addr>", which matches none of the message checks
    // above, and wrappers such as "fetch failed" keep the real code on `cause`.
    const systemCodes = [error.code, error.cause?.code]
        .filter((code) => typeof code === 'string')
        .map((code) => code.toUpperCase());
    const hasCode = (...candidates) => systemCodes.some((code) => candidates.includes(code));
    if (hasCode('ETIMEDOUT', 'ESOCKETTIMEDOUT', 'UND_ERR_CONNECT_TIMEOUT', 'UND_ERR_HEADERS_TIMEOUT')) {
        return 'TIMEOUT';
    }
    if (hasCode('ENOTFOUND', 'EAI_AGAIN')) {
        return 'DNS_FAILED';
    }
    if (hasCode('ERR_INVALID_URL')) {
        return 'INVALID_URL';
    }
    return 'FETCH_FAILED';
}
2684
/**
 * Build a unified PeelEnvelope from a PeelResult.
 *
 * All existing PeelResult fields are spread first (backward compatibility),
 * then canonical envelope fields override/extend them.
 *
 * @param {object} result - The PeelResult; `url`, `content`, `title`,
 *   `metadata`, `tokens` and `elapsed` are read from it.
 * @param {object} [extra={}] - Optional envelope additions: `cached`,
 *   `structured`, `truncated`, `totalAvailable`. May be omitted, matching the
 *   `extra = {}` default that outputResult uses.
 * @returns {object} The envelope object.
 */
function buildEnvelope(result, extra = {}) {
    const envelope = {
        // Spread all PeelResult fields for backward compatibility
        ...result,
        // Required envelope fields (override PeelResult where they overlap)
        url: result.url,
        // The envelope status is always written as 200 here.
        status: 200,
        content: result.content,
        metadata: {
            // A `title` inside result.metadata overrides the top-level title.
            title: result.title,
            ...result.metadata,
        },
        tokens: result.tokens,
        cached: extra.cached ?? false,
        elapsed: result.elapsed,
    };
    // Optional envelope fields — only include when meaningful
    if (extra.structured !== undefined) {
        envelope.structured = extra.structured;
    }
    if (extra.truncated) {
        envelope.truncated = true;
    }
    if (extra.totalAvailable !== undefined) {
        envelope.totalAvailable = extra.totalAvailable;
    }
    return envelope;
}
2715
+ // ============================================================
1563
2716
  // Shared output helper
1564
2717
  // ============================================================
1565
- async function outputResult(result, options) {
2718
+ async function outputResult(result, options, extra = {}) {
1566
2719
  // --links: output only links
1567
2720
  if (options.links) {
1568
2721
  if (options.json) {
@@ -1603,6 +2756,7 @@ async function outputResult(result, options) {
1603
2756
  method: result.method,
1604
2757
  elapsed: result.elapsed,
1605
2758
  tokens: result.tokens,
2759
+ cached: extra.cached ?? false,
1606
2760
  ...result.metadata,
1607
2761
  };
1608
2762
  if (options.json) {
@@ -1624,12 +2778,14 @@ async function outputResult(result, options) {
1624
2778
  console.log(`Method: ${meta.method}`);
1625
2779
  console.log(`Elapsed: ${meta.elapsed}ms`);
1626
2780
  console.log(`Tokens: ${meta.tokens}`);
2781
+ console.log(`Cached: ${meta.cached}`);
1627
2782
  }
1628
2783
  return;
1629
2784
  }
1630
2785
  // Default: full output
1631
2786
  if (options.json) {
1632
- await writeStdout(JSON.stringify(result, null, 2) + '\n');
2787
+ const envelope = buildEnvelope(result, extra);
2788
+ await writeStdout(JSON.stringify(envelope, null, 2) + '\n');
1633
2789
  }
1634
2790
  else {
1635
2791
  await writeStdout(result.content + '\n');
@@ -1645,6 +2801,64 @@ function writeStdout(data) {
1645
2801
  });
1646
2802
  });
1647
2803
  }
2804
/**
 * Convert an array of listing items to CSV.
 *
 * The header is the union of keys that hold a non-undefined value in at least
 * one item, in first-seen order. Every data cell is double-quoted with
 * embedded quotes doubled; null/undefined cells become "". Header cells are
 * left bare unless they contain a quote, comma or line break, in which case
 * they are quoted the same way so the header row stays parseable.
 *
 * @param {Array<object>} items - Listing records.
 * @returns {string} CSV text ending in a newline, or '' for an empty array.
 */
function formatListingsCsv(items) {
    if (items.length === 0) {
        return '';
    }
    // Collect all keys
    const keySet = new Set();
    for (const item of items) {
        for (const key of Object.keys(item)) {
            if (item[key] !== undefined) {
                keySet.add(key);
            }
        }
    }
    const keys = Array.from(keySet);
    const quote = (text) => `"${text.replace(/"/g, '""')}"`;
    const escapeCell = (value) => (value === undefined || value === null ? '""' : quote(String(value)));
    // Plain headers stay unquoted to keep the existing output; only headers
    // that would otherwise break the row are quoted.
    const escapeHeader = (key) => (/[",\r\n]/.test(key) ? quote(key) : key);
    const lines = [keys.map(escapeHeader).join(',')];
    for (const item of items) {
        lines.push(keys.map((key) => escapeCell(item[key])).join(','));
    }
    return lines.join('\n') + '\n';
}
2834
/**
 * Normalise the result of --extract (which may be a flat object or contain
 * arrays) into an array of row objects suitable for CSV / table rendering.
 *
 * When every value is an array, the arrays are zipped column-wise into rows.
 * The row count is the length of the LONGEST array, so no extracted value is
 * dropped when columns differ in length; missing cells are `undefined`.
 * Otherwise the whole object is returned as a single row. Non-null values are
 * converted with String(); null/undefined become `undefined`.
 *
 * @param {object} extracted - Mapping of field name to value or array of values.
 * @returns {Array<object>} Row objects keyed by field name.
 */
function normaliseExtractedToRows(extracted) {
    const toCell = (value) => (value != null ? String(value) : undefined);
    const columns = Object.entries(extracted);
    const allArrays = columns.length > 0 && columns.every(([, value]) => Array.isArray(value));
    if (!allArrays) {
        // Otherwise treat as a single row
        return [Object.fromEntries(columns.map(([key, value]) => [key, toCell(value)]))];
    }
    // Size by the longest column rather than the first one, which silently
    // truncated longer columns.
    const rowCount = Math.max(...columns.map(([, values]) => values.length));
    return Array.from({ length: rowCount }, (_, index) => Object.fromEntries(columns.map(([key, values]) => [key, toCell(values[index])])));
}
1648
2862
  // Helper function to extract colors from content
1649
2863
  function extractColors(content) {
1650
2864
  const colors = [];