npm - @xbrowser/cli - Versions diffs - 1.7.0 → 1.7.2 - Mend

@xbrowser/cli 1.7.0 → 1.7.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/cli.js CHANGED Viewed

@@ -2794,6 +2794,24 @@ async function extractLinks(page, origin) {
     }).filter(Boolean);
   }, origin);
 }
+async function detectSpaRoutes(page, origin) {
+  return page.evaluate((evalOrigin) => {
+    const routeSet = /* @__PURE__ */ new Set();
+    try {
+      const scripts = document.querySelectorAll("script");
+      const allContent = Array.from(scripts).map((s) => s.textContent || "").join("\n");
+      const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
+      let match;
+      while ((match = pathRegex.exec(allContent)) !== null) {
+        const path3 = match[1];
+        if (path3.includes(":") || path3.includes("*") || routeSet.has(path3)) continue;
+        routeSet.add(path3);
+      }
+    } catch {
+    }
+    return Array.from(routeSet).map((path3) => `${evalOrigin.replace(/\/$/, "")}${path3}`);
+  }, origin);
+}
 function parseRobotsTxt(text) {
   const rules = [];
   let inRelevantBlock = false;
@@ -2933,6 +2951,7 @@ var crawlCommand = registerCommand({
     allowSubdomains: z17.boolean().default(false),
     allowExternalLinks: z17.boolean().default(false),
     allowBackwardCrawling: z17.boolean().default(false),
+    enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
     format: z17.enum(["markdown", "html"]).default("markdown"),
     onlyMainContent: z17.boolean().default(true),
     concurrency: z17.number().default(3),
@@ -2949,6 +2968,7 @@ var crawlCommand = registerCommand({
       allowSubdomains: p.allowSubdomains,
       allowExternalLinks: p.allowExternalLinks,
       allowBackwardCrawling: p.allowBackwardCrawling,
+      enableSpa: p.enableSpa,
       format: p.format,
       onlyMainContent: p.onlyMainContent,
       concurrency: Math.min(Math.max(p.concurrency, 1), 10),
@@ -2981,6 +3001,22 @@ var crawlCommand = registerCommand({
         const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
         results.push({ url: seedPage.url(), title, content });
         const firstLinks = await extractLinks(seedPage, startUrl.origin);
+        if (options.enableSpa) {
+          const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
+          for (const route2 of spaRoutes) {
+            try {
+              const absNorm = normalizeUrl(stripHashAnchorQuery(route2));
+              if (!visited.has(absNorm) && !shouldSkipUrl(route2)) {
+                queue.push({ url: stripHashAnchorQuery(route2), depth: 1 });
+              }
+            } catch {
+            }
+          }
+          if (options.verbose && spaRoutes.length > 0) {
+            process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
+`);
+          }
+        }
         for (const link of firstLinks) {
           try {
             const absolute = new URL(link, seedPage.url()).href;

package/dist/daemon-main.js CHANGED Viewed

@@ -2761,6 +2761,24 @@ async function extractLinks(page, origin) {
     }).filter(Boolean);
   }, origin);
 }
+async function detectSpaRoutes(page, origin) {
+  return page.evaluate((evalOrigin) => {
+    const routeSet = /* @__PURE__ */ new Set();
+    try {
+      const scripts = document.querySelectorAll("script");
+      const allContent = Array.from(scripts).map((s) => s.textContent || "").join("\n");
+      const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
+      let match;
+      while ((match = pathRegex.exec(allContent)) !== null) {
+        const path2 = match[1];
+        if (path2.includes(":") || path2.includes("*") || routeSet.has(path2)) continue;
+        routeSet.add(path2);
+      }
+    } catch {
+    }
+    return Array.from(routeSet).map((path2) => `${evalOrigin.replace(/\/$/, "")}${path2}`);
+  }, origin);
+}
 function parseRobotsTxt(text) {
   const rules = [];
   let inRelevantBlock = false;
@@ -2900,6 +2918,7 @@ var crawlCommand = registerCommand({
     allowSubdomains: z17.boolean().default(false),
     allowExternalLinks: z17.boolean().default(false),
     allowBackwardCrawling: z17.boolean().default(false),
+    enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
     format: z17.enum(["markdown", "html"]).default("markdown"),
     onlyMainContent: z17.boolean().default(true),
     concurrency: z17.number().default(3),
@@ -2916,6 +2935,7 @@ var crawlCommand = registerCommand({
       allowSubdomains: p.allowSubdomains,
       allowExternalLinks: p.allowExternalLinks,
       allowBackwardCrawling: p.allowBackwardCrawling,
+      enableSpa: p.enableSpa,
       format: p.format,
       onlyMainContent: p.onlyMainContent,
       concurrency: Math.min(Math.max(p.concurrency, 1), 10),
@@ -2948,6 +2968,22 @@ var crawlCommand = registerCommand({
         const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
         results.push({ url: seedPage.url(), title, content });
         const firstLinks = await extractLinks(seedPage, startUrl.origin);
+        if (options.enableSpa) {
+          const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
+          for (const route of spaRoutes) {
+            try {
+              const absNorm = normalizeUrl(stripHashAnchorQuery(route));
+              if (!visited.has(absNorm) && !shouldSkipUrl(route)) {
+                queue.push({ url: stripHashAnchorQuery(route), depth: 1 });
+              }
+            } catch {
+            }
+          }
+          if (options.verbose && spaRoutes.length > 0) {
+            process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
+`);
+          }
+        }
         for (const link of firstLinks) {
           try {
             const absolute = new URL(link, seedPage.url()).href;

package/dist/index.js CHANGED Viewed

@@ -2834,6 +2834,24 @@ async function extractLinks(page, origin) {
     }).filter(Boolean);
   }, origin);
 }
+async function detectSpaRoutes(page, origin) {
+  return page.evaluate((evalOrigin) => {
+    const routeSet = /* @__PURE__ */ new Set();
+    try {
+      const scripts = document.querySelectorAll("script");
+      const allContent = Array.from(scripts).map((s) => s.textContent || "").join("\n");
+      const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
+      let match;
+      while ((match = pathRegex.exec(allContent)) !== null) {
+        const path5 = match[1];
+        if (path5.includes(":") || path5.includes("*") || routeSet.has(path5)) continue;
+        routeSet.add(path5);
+      }
+    } catch {
+    }
+    return Array.from(routeSet).map((path5) => `${evalOrigin.replace(/\/$/, "")}${path5}`);
+  }, origin);
+}
 function parseRobotsTxt(text) {
   const rules = [];
   let inRelevantBlock = false;
@@ -2973,6 +2991,7 @@ var crawlCommand = registerCommand({
     allowSubdomains: z17.boolean().default(false),
     allowExternalLinks: z17.boolean().default(false),
     allowBackwardCrawling: z17.boolean().default(false),
+    enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
     format: z17.enum(["markdown", "html"]).default("markdown"),
     onlyMainContent: z17.boolean().default(true),
     concurrency: z17.number().default(3),
@@ -2989,6 +3008,7 @@ var crawlCommand = registerCommand({
       allowSubdomains: p.allowSubdomains,
       allowExternalLinks: p.allowExternalLinks,
       allowBackwardCrawling: p.allowBackwardCrawling,
+      enableSpa: p.enableSpa,
       format: p.format,
       onlyMainContent: p.onlyMainContent,
       concurrency: Math.min(Math.max(p.concurrency, 1), 10),
@@ -3021,6 +3041,22 @@ var crawlCommand = registerCommand({
         const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
         results.push({ url: seedPage.url(), title, content });
         const firstLinks = await extractLinks(seedPage, startUrl.origin);
+        if (options.enableSpa) {
+          const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
+          for (const route2 of spaRoutes) {
+            try {
+              const absNorm = normalizeUrl(stripHashAnchorQuery(route2));
+              if (!visited.has(absNorm) && !shouldSkipUrl(route2)) {
+                queue.push({ url: stripHashAnchorQuery(route2), depth: 1 });
+              }
+            } catch {
+            }
+          }
+          if (options.verbose && spaRoutes.length > 0) {
+            process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
+`);
+          }
+        }
         for (const link of firstLinks) {
           try {
             const absolute = new URL(link, seedPage.url()).href;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@xbrowser/cli",
-  "version": "1.7.0",
+  "version": "1.7.2",
   "description": "Browser automation CLI for web scraping, headless browsing, SEO analysis, and AI agent workflows. A command-line alternative to Playwright, Puppeteer, and Selenium.",
   "type": "module",
   "bin": {