@xyleapp/cli 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/xyle.mjs CHANGED
@@ -8,7 +8,7 @@ const program = new Command();
8
8
  program
9
9
  .name("xyle")
10
10
  .description("SEO & AEO Intelligence Engine CLI")
11
- .version("0.10.0");
11
+ .version("0.11.0");
12
12
 
13
13
  registerCommands(program);
14
14
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@xyleapp/cli",
3
- "version": "0.10.0",
3
+ "version": "0.11.0",
4
4
  "description": "CLI for the Xyle SEO & AEO Intelligence Engine",
5
5
  "type": "module",
6
6
  "bin": {
package/src/api.mjs CHANGED
@@ -165,7 +165,10 @@ export function startSiteCrawl(seedUrl, config = {}) {
165
165
  }
166
166
 
167
167
  export function getSiteCrawlStatus(jobId) {
168
- return request("GET", `/site-crawl/${jobId}`, { timeout: 15000 });
168
+ // 60s timeout: the API container runs the crawl loop on the same event
169
+ // loop as the HTTP handler, so individual status requests can spike in
170
+ // latency when the crawl is hot. 15s was too tight.
171
+ return request("GET", `/site-crawl/${jobId}`, { timeout: 60000 });
169
172
  }
170
173
 
171
174
  export function getSiteCrawlPages(jobId, { limit = 50, offset = 0, filter } = {}) {
package/src/commands.mjs CHANGED
@@ -884,18 +884,20 @@ export function registerCommands(program) {
884
884
  .command("start")
885
885
  .description("Start a full-site crawl")
886
886
  .argument("<url>", "Seed URL to crawl")
887
- .option("--max-pages <n>", "Maximum pages to crawl", "500")
888
- .option("--max-depth <n>", "Maximum crawl depth", "5")
889
- .option("--no-robots", "Ignore robots.txt")
890
- .option("--no-js", "Skip JavaScript rendering")
887
+ .option("--max-pages <n>", "Maximum pages to crawl (default 50, max 500)", "50")
888
+ .option("--max-depth <n>", "Maximum crawl depth (default 3, max 10)", "3")
889
+ .option("--render-js", "Enable JavaScript rendering (slower, for SPA sites)")
890
+ .option("--include-subdomains", "Follow links to subdomains of the seed host")
891
891
  .option("--json", "Output as JSON (no live polling)")
892
892
  .action(async (url, opts) => {
893
893
  try {
894
+ // NOTE: robots.txt is always respected server-side. The crawler
895
+ // identifies as Xyle-Crawler/<version> and cannot be spoofed.
894
896
  const config = {
895
897
  max_pages: parseInt(opts.maxPages, 10),
896
898
  max_depth: parseInt(opts.maxDepth, 10),
897
- respect_robots: opts.robots !== false,
898
- render_js: opts.js !== false,
899
+ render_js: opts.renderJs === true,
900
+ include_subdomains: opts.includeSubdomains === true,
899
901
  };
900
902
  const data = await startSiteCrawl(url, config);
901
903
  const jobId = data.job_id;
@@ -907,20 +909,41 @@ export function registerCommands(program) {
907
909
 
908
910
  console.log(`\x1b[36mCrawl started:\x1b[0m ${jobId}`);
909
911
 
910
- // Poll every 2 seconds
912
+ // Poll every 2 seconds. Progress bar denominator is `max_pages`
913
+ // (the hard cap), not `pages_discovered` — which is the frontier
914
+ // size and can explode on link-heavy sites, making the bar look
915
+ // broken. We also tolerate a few transient poll failures (the API
916
+ // container is busy running the crawl on the same event loop so
917
+ // individual status requests can spike latency).
911
918
  const POLL_MS = 2000;
912
- let prev = 0;
919
+ const MAX_POLL_ERRORS = 5;
920
+ const target = config.max_pages;
921
+ let prev = -1;
922
+ let pollErrors = 0;
913
923
  while (true) {
914
924
  await new Promise((r) => setTimeout(r, POLL_MS));
915
- const status = await getSiteCrawlStatus(jobId);
925
+ let status;
926
+ try {
927
+ status = await getSiteCrawlStatus(jobId);
928
+ pollErrors = 0;
929
+ } catch (err) {
930
+ pollErrors += 1;
931
+ if (pollErrors >= MAX_POLL_ERRORS) {
932
+ process.stdout.write("\n");
933
+ console.log(`\x1b[31mLost connection while polling.\x1b[0m The job may still be running server-side.`);
934
+ console.log(`Re-check with: xyle site-crawl status ${jobId}`);
935
+ return;
936
+ }
937
+ continue;
938
+ }
916
939
  const crawled = status.pages_crawled || 0;
917
- const discovered = status.pages_discovered || 0;
918
940
  const errors = status.errors_count || 0;
919
941
 
920
942
  if (crawled !== prev) {
921
- const pct = discovered > 0 ? Math.round((crawled / discovered) * 100) : 0;
922
- const bar = "\u2588".repeat(Math.round(pct / 5)) + "\u2591".repeat(20 - Math.round(pct / 5));
923
- process.stdout.write(`\r ${bar} ${pct}% ${crawled}/${discovered} pages ${errors} errors`);
943
+ const pct = Math.min(100, Math.round((crawled / target) * 100));
944
+ const filled = Math.round(pct / 5);
945
+ const bar = "\u2588".repeat(filled) + "\u2591".repeat(20 - filled);
946
+ process.stdout.write(`\r ${bar} ${pct}% ${crawled}/${target} pages ${errors} errors `);
924
947
  prev = crawled;
925
948
  }
926
949