npm - @xyleapp/cli - Versions diffs - 0.10.0 → 0.11.0 - Mend

@xyleapp/cli 0.10.0 → 0.11.0

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (4)

package/bin/xyle.mjs CHANGED Viewed

@@ -8,7 +8,7 @@ const program = new Command();
 program
   .name("xyle")
   .description("SEO & AEO Intelligence Engine CLI")
-  .version("0.10.0");
+  .version("0.11.0");
 registerCommands(program);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@xyleapp/cli",
-  "version": "0.10.0",
+  "version": "0.11.0",
   "description": "CLI for the Xyle SEO & AEO Intelligence Engine",
   "type": "module",
   "bin": {

package/src/api.mjs CHANGED Viewed

@@ -165,7 +165,10 @@ export function startSiteCrawl(seedUrl, config = {}) {
 }
 export function getSiteCrawlStatus(jobId) {
-  return request("GET", `/site-crawl/${jobId}`, { timeout: 15000 });
+  // 60s timeout: the API container runs the crawl loop on the same event
+  // loop as the HTTP handler, so individual status requests can spike in
+  // latency when the crawl is hot. 15s was too tight.
+  return request("GET", `/site-crawl/${jobId}`, { timeout: 60000 });
 }
 export function getSiteCrawlPages(jobId, { limit = 50, offset = 0, filter } = {}) {

package/src/commands.mjs CHANGED Viewed

@@ -884,18 +884,20 @@ export function registerCommands(program) {
     .command("start")
     .description("Start a full-site crawl")
     .argument("<url>", "Seed URL to crawl")
-    .option("--max-pages <n>", "Maximum pages to crawl", "500")
-    .option("--max-depth <n>", "Maximum crawl depth", "5")
-    .option("--no-robots", "Ignore robots.txt")
-    .option("--no-js", "Skip JavaScript rendering")
+    .option("--max-pages <n>", "Maximum pages to crawl (default 50, max 500)", "50")
+    .option("--max-depth <n>", "Maximum crawl depth (default 3, max 10)", "3")
+    .option("--render-js", "Enable JavaScript rendering (slower, for SPA sites)")
+    .option("--include-subdomains", "Follow links to subdomains of the seed host")
     .option("--json", "Output as JSON (no live polling)")
     .action(async (url, opts) => {
       try {
+        // NOTE: robots.txt is always respected server-side. The crawler
+        // identifies as Xyle-Crawler/<version> and cannot be spoofed.
         const config = {
           max_pages: parseInt(opts.maxPages, 10),
           max_depth: parseInt(opts.maxDepth, 10),
-          respect_robots: opts.robots !== false,
-          render_js: opts.js !== false,
+          render_js: opts.renderJs === true,
+          include_subdomains: opts.includeSubdomains === true,
         };
         const data = await startSiteCrawl(url, config);
         const jobId = data.job_id;
@@ -907,20 +909,41 @@ export function registerCommands(program) {
         console.log(`\x1b[36mCrawl started:\x1b[0m ${jobId}`);
-        // Poll every 2 seconds
+        // Poll every 2 seconds. Progress bar denominator is `max_pages`
+        // (the hard cap), not `pages_discovered` — which is the frontier
+        // size and can explode on link-heavy sites, making the bar look
+        // broken. We also tolerate a few transient poll failures (the API
+        // container is busy running the crawl on the same event loop so
+        // individual status requests can spike latency).
         const POLL_MS = 2000;
-        let prev = 0;
+        const MAX_POLL_ERRORS = 5;
+        const target = config.max_pages;
+        let prev = -1;
+        let pollErrors = 0;
         while (true) {
           await new Promise((r) => setTimeout(r, POLL_MS));
-          const status = await getSiteCrawlStatus(jobId);
+          let status;
+          try {
+            status = await getSiteCrawlStatus(jobId);
+            pollErrors = 0;
+          } catch (err) {
+            pollErrors += 1;
+            if (pollErrors >= MAX_POLL_ERRORS) {
+              process.stdout.write("\n");
+              console.log(`\x1b[31mLost connection while polling.\x1b[0m The job may still be running server-side.`);
+              console.log(`Re-check with: xyle site-crawl status ${jobId}`);
+              return;
+            }
+            continue;
+          }
           const crawled = status.pages_crawled || 0;
-          const discovered = status.pages_discovered || 0;
           const errors = status.errors_count || 0;
           if (crawled !== prev) {
-            const pct = discovered > 0 ? Math.round((crawled / discovered) * 100) : 0;
-            const bar = "\u2588".repeat(Math.round(pct / 5)) + "\u2591".repeat(20 - Math.round(pct / 5));
-            process.stdout.write(`\r  ${bar} ${pct}%  ${crawled}/${discovered} pages  ${errors} errors`);
+            const pct = Math.min(100, Math.round((crawled / target) * 100));
+            const filled = Math.round(pct / 5);
+            const bar = "\u2588".repeat(filled) + "\u2591".repeat(20 - filled);
+            process.stdout.write(`\r  ${bar} ${pct}%  ${crawled}/${target} pages  ${errors} errors  `);
             prev = crawled;
           }