npm - @xyleapp/cli - Versions diffs - 0.10.0 → 0.12.0 - Mend

@xyleapp/cli 0.10.0 → 0.12.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

package/bin/xyle.mjs CHANGED Viewed

@@ -8,7 +8,7 @@ const program = new Command();
 program
   .name("xyle")
   .description("SEO & AEO Intelligence Engine CLI")
-  .version("0.10.0");
+  .version("0.12.0");
 registerCommands(program);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@xyleapp/cli",
-  "version": "0.10.0",
+  "version": "0.12.0",
   "description": "CLI for the Xyle SEO & AEO Intelligence Engine",
   "type": "module",
   "bin": {

package/src/api.mjs CHANGED Viewed

@@ -165,7 +165,10 @@ export function startSiteCrawl(seedUrl, config = {}) {
 }
 export function getSiteCrawlStatus(jobId) {
-  return request("GET", `/site-crawl/${jobId}`, { timeout: 15000 });
+  // 60s timeout: the API container runs the crawl loop on the same event
+  // loop as the HTTP handler, so individual status requests can spike in
+  // latency when the crawl is hot. 15s was too tight.
+  return request("GET", `/site-crawl/${jobId}`, { timeout: 60000 });
 }
 export function getSiteCrawlPages(jobId, { limit = 50, offset = 0, filter } = {}) {

package/src/commands.mjs CHANGED Viewed

@@ -884,18 +884,20 @@ export function registerCommands(program) {
     .command("start")
     .description("Start a full-site crawl")
     .argument("<url>", "Seed URL to crawl")
-    .option("--max-pages <n>", "Maximum pages to crawl", "500")
-    .option("--max-depth <n>", "Maximum crawl depth", "5")
-    .option("--no-robots", "Ignore robots.txt")
-    .option("--no-js", "Skip JavaScript rendering")
+    .option("--max-pages <n>", "Maximum pages to crawl (default 50, max 500)", "50")
+    .option("--max-depth <n>", "Maximum crawl depth (default 3, max 10)", "3")
+    .option("--render-js", "Enable JavaScript rendering (slower, for SPA sites)")
+    .option("--include-subdomains", "Follow links to subdomains of the seed host")
     .option("--json", "Output as JSON (no live polling)")
     .action(async (url, opts) => {
       try {
+        // NOTE: robots.txt is always respected server-side. The crawler
+        // identifies as Xyle-Crawler/<version> and cannot be spoofed.
         const config = {
           max_pages: parseInt(opts.maxPages, 10),
           max_depth: parseInt(opts.maxDepth, 10),
-          respect_robots: opts.robots !== false,
-          render_js: opts.js !== false,
+          render_js: opts.renderJs === true,
+          include_subdomains: opts.includeSubdomains === true,
         };
         const data = await startSiteCrawl(url, config);
         const jobId = data.job_id;
@@ -907,20 +909,41 @@ export function registerCommands(program) {
         console.log(`\x1b[36mCrawl started:\x1b[0m ${jobId}`);
-        // Poll every 2 seconds
+        // Poll every 2 seconds. Progress bar denominator is `max_pages`
+        // (the hard cap), not `pages_discovered` — which is the frontier
+        // size and can explode on link-heavy sites, making the bar look
+        // broken. We also tolerate a few transient poll failures (the API
+        // container is busy running the crawl on the same event loop so
+        // individual status requests can spike latency).
         const POLL_MS = 2000;
-        let prev = 0;
+        const MAX_POLL_ERRORS = 5;
+        const target = config.max_pages;
+        let prev = -1;
+        let pollErrors = 0;
         while (true) {
           await new Promise((r) => setTimeout(r, POLL_MS));
-          const status = await getSiteCrawlStatus(jobId);
+          let status;
+          try {
+            status = await getSiteCrawlStatus(jobId);
+            pollErrors = 0;
+          } catch (err) {
+            pollErrors += 1;
+            if (pollErrors >= MAX_POLL_ERRORS) {
+              process.stdout.write("\n");
+              console.log(`\x1b[31mLost connection while polling.\x1b[0m The job may still be running server-side.`);
+              console.log(`Re-check with: xyle site-crawl status ${jobId}`);
+              return;
+            }
+            continue;
+          }
           const crawled = status.pages_crawled || 0;
-          const discovered = status.pages_discovered || 0;
           const errors = status.errors_count || 0;
           if (crawled !== prev) {
-            const pct = discovered > 0 ? Math.round((crawled / discovered) * 100) : 0;
-            const bar = "\u2588".repeat(Math.round(pct / 5)) + "\u2591".repeat(20 - Math.round(pct / 5));
-            process.stdout.write(`\r  ${bar} ${pct}%  ${crawled}/${discovered} pages  ${errors} errors`);
+            const pct = Math.min(100, Math.round((crawled / target) * 100));
+            const filled = Math.round(pct / 5);
+            const bar = "\u2588".repeat(filled) + "\u2591".repeat(20 - filled);
+            process.stdout.write(`\r  ${bar} ${pct}%  ${crawled}/${target} pages  ${errors} errors  `);
             prev = crawled;
           }

package/src/seed.mjs CHANGED Viewed

@@ -114,6 +114,21 @@ npx @xyleapp/cli <command> [options]
 Always use \`--json\` when parsing output programmatically.
+## Sites Hub & Auto-Discovery (Dashboard)
+Every account has a per-site hub in the dashboard at \`/dashboard/sites/[siteId]\` with four tabs: Overview, Crawl, Analysis, and Competitors. Use it when the CLI alone can't help.
+**Why this matters:** \`xyle competitors --query\` needs a query string from Search Console. For freshly onboarded sites (no GSC sync yet), there's no meaningful query to pass. The dashboard fills this gap with **Gemini-grounded auto-discovery**: one click finds the top 5–10 direct competitors for a domain, enriches each with a lightweight crawl, and links them to the site.
+**When to point the user to the Sites Hub:**
+- **New site, no GSC data** → tell the user to open \`/dashboard/sites/[siteId]\` → Competitors tab → "Discover via Gemini". Come back to the CLI after it finishes to continue the audit.
+- **User asks "who are my competitors?" without naming a query** → Sites Hub, not \`xyle competitors --query\`.
+- **User wants a persistent per-site view** → Sites Hub has all four tabs in one place.
+**Rate limits:** 10 discoveries/user/day, 1 run/site/6h. If the user hits a 429 there's nothing to fix, just wait.
+**Not in the CLI (yet):** Auto-discovery is dashboard-only. There is no \`xyle competitors discover\` subcommand. When a user is onboarding a fresh site from the CLI and has no query data, explicitly direct them to the dashboard instead of trying to improvise.
 ## Strategic Workflows
 ### 1. Full-Site Audit (Screaming Frog replacement)
@@ -196,6 +211,8 @@ When the user asks something SEO-related, route to the right workflow:
 | "What content am I missing?" / "Find gaps" | Content Gap Sprint | Ready to create, need briefs |
 | "Why is my CTR low?" | Page Optimization (CTR focus) | Likely a title/meta problem |
 | "Help me rank for [query]" | Page Optimization + Competitor Analysis | Need to see what's working for competitors |
+| "I just onboarded my site, what now?" / "No queries yet" | Sites Hub → Discover via Gemini | Bootstrap competitor context before GSC catches up |
+| "Who are my competitors?" (no query named) | Sites Hub → Discover via Gemini | No GSC query means \`competitors --query\` can't help |
 **After every analysis, proactively recommend the next step.** Don't just present data — interpret it and suggest action.
@@ -208,6 +225,7 @@ When the user asks something SEO-related, route to the right workflow:
 - Classify queries by intent before suggesting optimizations
 - Consider the user's ICP when recommending content topics
 - Use \`status\` first to verify API connectivity
+- **For fresh sites with no GSC history, point the user to the Sites Hub (\`/dashboard/sites/[id]\`) to run Gemini auto-discovery** — don't try to improvise competitor context with \`competitors --query\` when there's no query to pass
 **Don't:**
 - Optimize for zero-impression queries (no audience there)