@xyleapp/cli 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/xyle.mjs +1 -1
- package/package.json +1 -1
- package/src/api.mjs +4 -1
- package/src/commands.mjs +36 -13
- package/src/seed.mjs +18 -0
package/bin/xyle.mjs
CHANGED
package/package.json
CHANGED
package/src/api.mjs
CHANGED
|
@@ -165,7 +165,10 @@ export function startSiteCrawl(seedUrl, config = {}) {
|
|
|
165
165
|
}
|
|
166
166
|
|
|
167
167
|
export function getSiteCrawlStatus(jobId) {
|
|
168
|
-
|
|
168
|
+
// 60s timeout: the API container runs the crawl loop on the same event
|
|
169
|
+
// loop as the HTTP handler, so individual status requests can spike in
|
|
170
|
+
// latency when the crawl is hot. 15s was too tight.
|
|
171
|
+
return request("GET", `/site-crawl/${jobId}`, { timeout: 60000 });
|
|
169
172
|
}
|
|
170
173
|
|
|
171
174
|
export function getSiteCrawlPages(jobId, { limit = 50, offset = 0, filter } = {}) {
|
package/src/commands.mjs
CHANGED
|
@@ -884,18 +884,20 @@ export function registerCommands(program) {
|
|
|
884
884
|
.command("start")
|
|
885
885
|
.description("Start a full-site crawl")
|
|
886
886
|
.argument("<url>", "Seed URL to crawl")
|
|
887
|
-
.option("--max-pages <n>", "Maximum pages to crawl", "
|
|
888
|
-
.option("--max-depth <n>", "Maximum crawl depth", "
|
|
889
|
-
.option("--
|
|
890
|
-
.option("--
|
|
887
|
+
.option("--max-pages <n>", "Maximum pages to crawl (default 50, max 500)", "50")
|
|
888
|
+
.option("--max-depth <n>", "Maximum crawl depth (default 3, max 10)", "3")
|
|
889
|
+
.option("--render-js", "Enable JavaScript rendering (slower, for SPA sites)")
|
|
890
|
+
.option("--include-subdomains", "Follow links to subdomains of the seed host")
|
|
891
891
|
.option("--json", "Output as JSON (no live polling)")
|
|
892
892
|
.action(async (url, opts) => {
|
|
893
893
|
try {
|
|
894
|
+
// NOTE: robots.txt is always respected server-side. The crawler
|
|
895
|
+
// identifies as Xyle-Crawler/<version> and cannot be spoofed.
|
|
894
896
|
const config = {
|
|
895
897
|
max_pages: parseInt(opts.maxPages, 10),
|
|
896
898
|
max_depth: parseInt(opts.maxDepth, 10),
|
|
897
|
-
|
|
898
|
-
|
|
899
|
+
render_js: opts.renderJs === true,
|
|
900
|
+
include_subdomains: opts.includeSubdomains === true,
|
|
899
901
|
};
|
|
900
902
|
const data = await startSiteCrawl(url, config);
|
|
901
903
|
const jobId = data.job_id;
|
|
@@ -907,20 +909,41 @@ export function registerCommands(program) {
|
|
|
907
909
|
|
|
908
910
|
console.log(`\x1b[36mCrawl started:\x1b[0m ${jobId}`);
|
|
909
911
|
|
|
910
|
-
// Poll every 2 seconds
|
|
912
|
+
// Poll every 2 seconds. Progress bar denominator is `max_pages`
|
|
913
|
+
// (the hard cap), not `pages_discovered` — which is the frontier
|
|
914
|
+
// size and can explode on link-heavy sites, making the bar look
|
|
915
|
+
// broken. We also tolerate a few transient poll failures (the API
|
|
916
|
+
// container is busy running the crawl on the same event loop so
|
|
917
|
+
// individual status requests can spike in latency).
|
|
911
918
|
const POLL_MS = 2000;
|
|
912
|
-
|
|
919
|
+
const MAX_POLL_ERRORS = 5;
|
|
920
|
+
const target = config.max_pages;
|
|
921
|
+
let prev = -1;
|
|
922
|
+
let pollErrors = 0;
|
|
913
923
|
while (true) {
|
|
914
924
|
await new Promise((r) => setTimeout(r, POLL_MS));
|
|
915
|
-
|
|
925
|
+
let status;
|
|
926
|
+
try {
|
|
927
|
+
status = await getSiteCrawlStatus(jobId);
|
|
928
|
+
pollErrors = 0;
|
|
929
|
+
} catch (err) {
|
|
930
|
+
pollErrors += 1;
|
|
931
|
+
if (pollErrors >= MAX_POLL_ERRORS) {
|
|
932
|
+
process.stdout.write("\n");
|
|
933
|
+
console.log(`\x1b[31mLost connection while polling.\x1b[0m The job may still be running server-side.`);
|
|
934
|
+
console.log(`Re-check with: xyle site-crawl status ${jobId}`);
|
|
935
|
+
return;
|
|
936
|
+
}
|
|
937
|
+
continue;
|
|
938
|
+
}
|
|
916
939
|
const crawled = status.pages_crawled || 0;
|
|
917
|
-
const discovered = status.pages_discovered || 0;
|
|
918
940
|
const errors = status.errors_count || 0;
|
|
919
941
|
|
|
920
942
|
if (crawled !== prev) {
|
|
921
|
-
const pct =
|
|
922
|
-
const
|
|
923
|
-
|
|
943
|
+
const pct = Math.min(100, Math.round((crawled / target) * 100));
|
|
944
|
+
const filled = Math.round(pct / 5);
|
|
945
|
+
const bar = "\u2588".repeat(filled) + "\u2591".repeat(20 - filled);
|
|
946
|
+
process.stdout.write(`\r ${bar} ${pct}% ${crawled}/${target} pages ${errors} errors `);
|
|
924
947
|
prev = crawled;
|
|
925
948
|
}
|
|
926
949
|
|
package/src/seed.mjs
CHANGED
|
@@ -114,6 +114,21 @@ npx @xyleapp/cli <command> [options]
|
|
|
114
114
|
|
|
115
115
|
Always use \`--json\` when parsing output programmatically.
|
|
116
116
|
|
|
117
|
+
## Sites Hub & Auto-Discovery (Dashboard)
|
|
118
|
+
|
|
119
|
+
Every account has a per-site hub in the dashboard at \`/dashboard/sites/[siteId]\` with four tabs: Overview, Crawl, Analysis, and Competitors. Use it when the CLI alone can't help.
|
|
120
|
+
|
|
121
|
+
**Why this matters:** \`xyle competitors --query\` needs a query string from Google Search Console (GSC). For freshly onboarded sites (no GSC sync yet), there's no meaningful query to pass. The dashboard fills this gap with **Gemini-grounded auto-discovery**: one click finds the top 5–10 direct competitors for a domain, enriches each with a lightweight crawl, and links them to the site.
|
|
122
|
+
|
|
123
|
+
**When to point the user to the Sites Hub:**
|
|
124
|
+
- **New site, no GSC data** → tell the user to open \`/dashboard/sites/[siteId]\` → Competitors tab → "Discover via Gemini". Come back to the CLI after it finishes to continue the audit.
|
|
125
|
+
- **User asks "who are my competitors?" without naming a query** → Sites Hub, not \`xyle competitors --query\`.
|
|
126
|
+
- **User wants a persistent per-site view** → Sites Hub has all four tabs in one place.
|
|
127
|
+
|
|
128
|
+
**Rate limits:** 10 discoveries per user per day, and 1 run per site every 6 hours. If the user hits a 429, there's nothing to fix — just wait for the limit to reset.
|
|
129
|
+
|
|
130
|
+
**Not in the CLI (yet):** Auto-discovery is dashboard-only. There is no \`xyle competitors discover\` subcommand. When a user is onboarding a fresh site from the CLI and has no query data, explicitly direct them to the dashboard instead of trying to improvise.
|
|
131
|
+
|
|
117
132
|
## Strategic Workflows
|
|
118
133
|
|
|
119
134
|
### 1. Full-Site Audit (Screaming Frog replacement)
|
|
@@ -196,6 +211,8 @@ When the user asks something SEO-related, route to the right workflow:
|
|
|
196
211
|
| "What content am I missing?" / "Find gaps" | Content Gap Sprint | Ready to create, need briefs |
|
|
197
212
|
| "Why is my CTR low?" | Page Optimization (CTR focus) | Likely a title/meta problem |
|
|
198
213
|
| "Help me rank for [query]" | Page Optimization + Competitor Analysis | Need to see what's working for competitors |
|
|
214
|
+
| "I just onboarded my site, what now?" / "No queries yet" | Sites Hub → Discover via Gemini | Bootstrap competitor context before GSC catches up |
|
|
215
|
+
| "Who are my competitors?" (no query named) | Sites Hub → Discover via Gemini | No GSC query means \`competitors --query\` can't help |
|
|
199
216
|
|
|
200
217
|
**After every analysis, proactively recommend the next step.** Don't just present data — interpret it and suggest action.
|
|
201
218
|
|
|
@@ -208,6 +225,7 @@ When the user asks something SEO-related, route to the right workflow:
|
|
|
208
225
|
- Classify queries by intent before suggesting optimizations
|
|
209
226
|
- Consider the user's ICP when recommending content topics
|
|
210
227
|
- Use \`status\` first to verify API connectivity
|
|
228
|
+
- **For fresh sites with no GSC history, point the user to the Sites Hub (\`/dashboard/sites/[siteId]\`) to run Gemini auto-discovery** — don't try to improvise competitor context with \`competitors --query\` when there's no query to pass
|
|
211
229
|
|
|
212
230
|
**Don't:**
|
|
213
231
|
- Optimize for zero-impression queries (no audience there)
|