@xyleapp/cli 0.10.0 → 0.12.0

This diff shows the published contents of the two package versions as they appear in the public registry. It is provided for informational purposes only.
package/bin/xyle.mjs CHANGED
@@ -8,7 +8,7 @@ const program = new Command();
 program
   .name("xyle")
   .description("SEO & AEO Intelligence Engine CLI")
-  .version("0.10.0");
+  .version("0.12.0");

 registerCommands(program);

package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@xyleapp/cli",
-  "version": "0.10.0",
+  "version": "0.12.0",
   "description": "CLI for the Xyle SEO & AEO Intelligence Engine",
   "type": "module",
   "bin": {
package/src/api.mjs CHANGED
@@ -165,7 +165,10 @@ export function startSiteCrawl(seedUrl, config = {}) {
 }

 export function getSiteCrawlStatus(jobId) {
-  return request("GET", `/site-crawl/${jobId}`, { timeout: 15000 });
+  // 60s timeout: the API container runs the crawl loop on the same event
+  // loop as the HTTP handler, so individual status requests can spike in
+  // latency when the crawl is hot. 15s was too tight.
+  return request("GET", `/site-crawl/${jobId}`, { timeout: 60000 });
 }

 export function getSiteCrawlPages(jobId, { limit = 50, offset = 0, filter } = {}) {
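
The hunk above changes only the timeout argument; `request()` itself is not part of this diff. For context, here is a minimal sketch of how such a helper is typically built on fetch plus AbortController, so that the `timeout` option becomes a hard cancel rather than a hint. The base URL, the env var, and the error handling below are assumptions for illustration, not code from the package:

// Hypothetical sketch of a timeout-aware request() helper (Node 18+).
// Only the call shape request(method, path, { timeout }) is taken from
// the diff; everything else is assumed.
const BASE_URL = process.env.XYLE_API_URL ?? "http://localhost:8000"; // assumed

export async function request(method, path, { timeout = 15000, body } = {}) {
  // AbortController cancels the in-flight fetch when the timer fires,
  // which is what makes the 15s -> 60s bump meaningful.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);
  try {
    const res = await fetch(`${BASE_URL}${path}`, {
      method,
      signal: controller.signal,
      headers: body ? { "content-type": "application/json" } : undefined,
      body: body ? JSON.stringify(body) : undefined,
    });
    if (!res.ok) throw new Error(`${method} ${path} failed: ${res.status}`);
    return await res.json();
  } finally {
    clearTimeout(timer);
  }
}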
package/src/commands.mjs CHANGED
@@ -884,18 +884,20 @@ export function registerCommands(program) {
     .command("start")
     .description("Start a full-site crawl")
     .argument("<url>", "Seed URL to crawl")
-    .option("--max-pages <n>", "Maximum pages to crawl", "500")
-    .option("--max-depth <n>", "Maximum crawl depth", "5")
-    .option("--no-robots", "Ignore robots.txt")
-    .option("--no-js", "Skip JavaScript rendering")
+    .option("--max-pages <n>", "Maximum pages to crawl (default 50, max 500)", "50")
+    .option("--max-depth <n>", "Maximum crawl depth (default 3, max 10)", "3")
+    .option("--render-js", "Enable JavaScript rendering (slower, for SPA sites)")
+    .option("--include-subdomains", "Follow links to subdomains of the seed host")
     .option("--json", "Output as JSON (no live polling)")
     .action(async (url, opts) => {
       try {
+        // NOTE: robots.txt is always respected server-side. The crawler
+        // identifies as Xyle-Crawler/<version> and cannot be spoofed.
        const config = {
          max_pages: parseInt(opts.maxPages, 10),
          max_depth: parseInt(opts.maxDepth, 10),
-          respect_robots: opts.robots !== false,
-          render_js: opts.js !== false,
+          render_js: opts.renderJs === true,
+          include_subdomains: opts.includeSubdomains === true,
        };
        const data = await startSiteCrawl(url, config);
        const jobId = data.job_id;
@@ -907,20 +909,41 @@ export function registerCommands(program) {

        console.log(`\x1b[36mCrawl started:\x1b[0m ${jobId}`);

-        // Poll every 2 seconds
+        // Poll every 2 seconds. Progress bar denominator is `max_pages`
+        // (the hard cap), not `pages_discovered` — which is the frontier
+        // size and can explode on link-heavy sites, making the bar look
+        // broken. We also tolerate a few transient poll failures (the API
+        // container is busy running the crawl on the same event loop so
+        // individual status requests can spike latency).
        const POLL_MS = 2000;
-        let prev = 0;
+        const MAX_POLL_ERRORS = 5;
+        const target = config.max_pages;
+        let prev = -1;
+        let pollErrors = 0;
        while (true) {
          await new Promise((r) => setTimeout(r, POLL_MS));
-          const status = await getSiteCrawlStatus(jobId);
+          let status;
+          try {
+            status = await getSiteCrawlStatus(jobId);
+            pollErrors = 0;
+          } catch (err) {
+            pollErrors += 1;
+            if (pollErrors >= MAX_POLL_ERRORS) {
+              process.stdout.write("\n");
+              console.log(`\x1b[31mLost connection while polling.\x1b[0m The job may still be running server-side.`);
+              console.log(`Re-check with: xyle site-crawl status ${jobId}`);
+              return;
+            }
+            continue;
+          }
          const crawled = status.pages_crawled || 0;
-          const discovered = status.pages_discovered || 0;
          const errors = status.errors_count || 0;

          if (crawled !== prev) {
-            const pct = discovered > 0 ? Math.round((crawled / discovered) * 100) : 0;
-            const bar = "\u2588".repeat(Math.round(pct / 5)) + "\u2591".repeat(20 - Math.round(pct / 5));
-            process.stdout.write(`\r  ${bar} ${pct}% ${crawled}/${discovered} pages  ${errors} errors`);
+            const pct = Math.min(100, Math.round((crawled / target) * 100));
+            const filled = Math.round(pct / 5);
+            const bar = "\u2588".repeat(filled) + "\u2591".repeat(20 - filled);
+            process.stdout.write(`\r  ${bar} ${pct}% ${crawled}/${target} pages  ${errors} errors  `);
            prev = crawled;
          }

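
The denominator change in the progress bar is easy to sanity-check in isolation. This self-contained sketch (the sample numbers are invented) reproduces the new math and the failure mode it fixes:

// Standalone demo of the new bar math from the hunk above.
function renderBar(crawled, target) {
  const pct = Math.min(100, Math.round((crawled / target) * 100));
  const filled = Math.round(pct / 5); // 20-cell bar, 5% per cell
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(20 - filled)} ${pct}%`;
}

// Two successive polls (max_pages = 50) while the frontier explodes from
// 40 to 400 discovered links:
console.log(renderBar(25, 50)); // ██████████░░░░░░░░░░ 50%
console.log(renderBar(30, 50)); // ████████████░░░░░░░░ 60%
// The old formula at the same two polls: 25/40 = 63%, then 30/400 = 8%,
// so the bar jumped backwards even though the crawl moved forward.

Dividing by the fixed `max_pages` cap keeps the bar monotone; dividing by the frontier count never could.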
package/src/seed.mjs CHANGED
@@ -114,6 +114,21 @@ npx @xyleapp/cli <command> [options]

 Always use \`--json\` when parsing output programmatically.

+## Sites Hub & Auto-Discovery (Dashboard)
+
+Every account has a per-site hub in the dashboard at \`/dashboard/sites/[siteId]\` with four tabs: Overview, Crawl, Analysis, and Competitors. Use it when the CLI alone can't help.
+
+**Why this matters:** \`xyle competitors --query\` needs a query string from Search Console. For freshly onboarded sites (no GSC sync yet), there's no meaningful query to pass. The dashboard fills this gap with **Gemini-grounded auto-discovery**: one click finds the top 5–10 direct competitors for a domain, enriches each with a lightweight crawl, and links them to the site.
+
+**When to point the user to the Sites Hub:**
+- **New site, no GSC data** → tell the user to open \`/dashboard/sites/[siteId]\` → Competitors tab → "Discover via Gemini". Come back to the CLI after it finishes to continue the audit.
+- **User asks "who are my competitors?" without naming a query** → Sites Hub, not \`xyle competitors --query\`.
+- **User wants a persistent per-site view** → Sites Hub has all four tabs in one place.
+
+**Rate limits:** 10 discoveries/user/day, 1 run/site/6h. If the user hits a 429 there's nothing to fix, just wait.
+
+**Not in the CLI (yet):** Auto-discovery is dashboard-only. There is no \`xyle competitors discover\` subcommand. When a user is onboarding a fresh site from the CLI and has no query data, explicitly direct them to the dashboard instead of trying to improvise.
+
 ## Strategic Workflows

 ### 1. Full-Site Audit (Screaming Frog replacement)
@@ -196,6 +211,8 @@ When the user asks something SEO-related, route to the right workflow:
 | "What content am I missing?" / "Find gaps" | Content Gap Sprint | Ready to create, need briefs |
 | "Why is my CTR low?" | Page Optimization (CTR focus) | Likely a title/meta problem |
 | "Help me rank for [query]" | Page Optimization + Competitor Analysis | Need to see what's working for competitors |
+| "I just onboarded my site, what now?" / "No queries yet" | Sites Hub → Discover via Gemini | Bootstrap competitor context before GSC catches up |
+| "Who are my competitors?" (no query named) | Sites Hub → Discover via Gemini | No GSC query means \`competitors --query\` can't help |

 **After every analysis, proactively recommend the next step.** Don't just present data — interpret it and suggest action.

@@ -208,6 +225,7 @@ When the user asks something SEO-related, route to the right workflow:
 - Classify queries by intent before suggesting optimizations
 - Consider the user's ICP when recommending content topics
 - Use \`status\` first to verify API connectivity
+- **For fresh sites with no GSC history, point the user to the Sites Hub (\`/dashboard/sites/[id]\`) to run Gemini auto-discovery** — don't try to improvise competitor context with \`competitors --query\` when there's no query to pass

 **Don't:**
 - Optimize for zero-impression queries (no audience there)
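
A closing note on the seed prompt's "Always use \`--json\`" rule: it is what makes the CLI safe to drive from scripts. Below is a minimal sketch of consuming the status payload programmatically. Whether \`site-crawl status\` accepts \`--json\`, and the exact payload shape, are assumptions inferred from the polling code above; only the \`pages_crawled\` and \`errors_count\` field names appear in this diff:

// Hypothetical script: check a crawl job from Node instead of polling live.
// Usage: node check-crawl.mjs <jobId>
import { execFile } from "node:child_process";
import { promisify } from "node:util";

const run = promisify(execFile);
const jobId = process.argv[2];

// Assumes the status subcommand honors --json like `start` does.
const { stdout } = await run("xyle", ["site-crawl", "status", jobId, "--json"]);
const status = JSON.parse(stdout);
console.log(`crawled=${status.pages_crawled ?? 0} errors=${status.errors_count ?? 0}`);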