@xyleapp/cli 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/xyle.mjs +1 -1
- package/package.json +1 -1
- package/src/api.mjs +4 -1
- package/src/commands.mjs +36 -13
package/bin/xyle.mjs
CHANGED
package/package.json
CHANGED
package/src/api.mjs
CHANGED
|
@@ -165,7 +165,10 @@ export function startSiteCrawl(seedUrl, config = {}) {
|
|
|
165
165
|
}
|
|
166
166
|
|
|
167
167
|
export function getSiteCrawlStatus(jobId) {
|
|
168
|
-
|
|
168
|
+
// 60s timeout: the API container runs the crawl loop on the same event
|
|
169
|
+
// loop as the HTTP handler, so individual status requests can spike in
|
|
170
|
+
// latency when the crawl is hot. 15s was too tight.
|
|
171
|
+
return request("GET", `/site-crawl/${jobId}`, { timeout: 60000 });
|
|
169
172
|
}
|
|
170
173
|
|
|
171
174
|
export function getSiteCrawlPages(jobId, { limit = 50, offset = 0, filter } = {}) {
|
package/src/commands.mjs
CHANGED
|
@@ -884,18 +884,20 @@ export function registerCommands(program) {
|
|
|
884
884
|
.command("start")
|
|
885
885
|
.description("Start a full-site crawl")
|
|
886
886
|
.argument("<url>", "Seed URL to crawl")
|
|
887
|
-
.option("--max-pages <n>", "Maximum pages to crawl", "
|
|
888
|
-
.option("--max-depth <n>", "Maximum crawl depth", "
|
|
889
|
-
.option("--
|
|
890
|
-
.option("--
|
|
887
|
+
.option("--max-pages <n>", "Maximum pages to crawl (default 50, max 500)", "50")
|
|
888
|
+
.option("--max-depth <n>", "Maximum crawl depth (default 3, max 10)", "3")
|
|
889
|
+
.option("--render-js", "Enable JavaScript rendering (slower, for SPA sites)")
|
|
890
|
+
.option("--include-subdomains", "Follow links to subdomains of the seed host")
|
|
891
891
|
.option("--json", "Output as JSON (no live polling)")
|
|
892
892
|
.action(async (url, opts) => {
|
|
893
893
|
try {
|
|
894
|
+
// NOTE: robots.txt is always respected server-side. The crawler
|
|
895
|
+
// identifies as Xyle-Crawler/<version> and cannot be spoofed.
|
|
894
896
|
const config = {
|
|
895
897
|
max_pages: parseInt(opts.maxPages, 10),
|
|
896
898
|
max_depth: parseInt(opts.maxDepth, 10),
|
|
897
|
-
|
|
898
|
-
|
|
899
|
+
render_js: opts.renderJs === true,
|
|
900
|
+
include_subdomains: opts.includeSubdomains === true,
|
|
899
901
|
};
|
|
900
902
|
const data = await startSiteCrawl(url, config);
|
|
901
903
|
const jobId = data.job_id;
|
|
@@ -907,20 +909,41 @@ export function registerCommands(program) {
|
|
|
907
909
|
|
|
908
910
|
console.log(`\x1b[36mCrawl started:\x1b[0m ${jobId}`);
|
|
909
911
|
|
|
910
|
-
// Poll every 2 seconds
|
|
912
|
+
// Poll every 2 seconds. Progress bar denominator is `max_pages`
|
|
913
|
+
// (the hard cap), not `pages_discovered` — which is the frontier
|
|
914
|
+
// size and can explode on link-heavy sites, making the bar look
|
|
915
|
+
// broken. We also tolerate a few transient poll failures (the API
|
|
916
|
+
// container is busy running the crawl on the same event loop so
|
|
917
|
+
// individual status requests can spike latency).
|
|
911
918
|
const POLL_MS = 2000;
|
|
912
|
-
|
|
919
|
+
const MAX_POLL_ERRORS = 5;
|
|
920
|
+
const target = config.max_pages;
|
|
921
|
+
let prev = -1;
|
|
922
|
+
let pollErrors = 0;
|
|
913
923
|
while (true) {
|
|
914
924
|
await new Promise((r) => setTimeout(r, POLL_MS));
|
|
915
|
-
|
|
925
|
+
let status;
|
|
926
|
+
try {
|
|
927
|
+
status = await getSiteCrawlStatus(jobId);
|
|
928
|
+
pollErrors = 0;
|
|
929
|
+
} catch (err) {
|
|
930
|
+
pollErrors += 1;
|
|
931
|
+
if (pollErrors >= MAX_POLL_ERRORS) {
|
|
932
|
+
process.stdout.write("\n");
|
|
933
|
+
console.log(`\x1b[31mLost connection while polling.\x1b[0m The job may still be running server-side.`);
|
|
934
|
+
console.log(`Re-check with: xyle site-crawl status ${jobId}`);
|
|
935
|
+
return;
|
|
936
|
+
}
|
|
937
|
+
continue;
|
|
938
|
+
}
|
|
916
939
|
const crawled = status.pages_crawled || 0;
|
|
917
|
-
const discovered = status.pages_discovered || 0;
|
|
918
940
|
const errors = status.errors_count || 0;
|
|
919
941
|
|
|
920
942
|
if (crawled !== prev) {
|
|
921
|
-
const pct =
|
|
922
|
-
const
|
|
923
|
-
|
|
943
|
+
const pct = Math.min(100, Math.round((crawled / target) * 100));
|
|
944
|
+
const filled = Math.round(pct / 5);
|
|
945
|
+
const bar = "\u2588".repeat(filled) + "\u2591".repeat(20 - filled);
|
|
946
|
+
process.stdout.write(`\r ${bar} ${pct}% ${crawled}/${target} pages ${errors} errors `);
|
|
924
947
|
prev = crawled;
|
|
925
948
|
}
|
|
926
949
|
|