@xbrowser/cli 1.7.0 → 1.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -2794,6 +2794,24 @@ async function extractLinks(page, origin) {
2794
2794
  }).filter(Boolean);
2795
2795
  }, origin);
2796
2796
  }
2797
/**
 * Heuristically discovers candidate client-side (SPA) routes by scanning the
 * text of every <script> element on the page for quoted, absolute-looking
 * paths such as "/about" or '/users/list'.
 *
 * @param {object} page - Page handle exposing `evaluate(fn, arg)`; the callback
 *   reads `document`, so it is expected to run inside the page
 *   (presumably a Puppeteer/Playwright-style page - confirm against caller).
 * @param {string} origin - Origin prepended to every detected path; a single
 *   trailing slash is dropped before joining.
 * @returns {Promise<string[]>} De-duplicated absolute URLs in first-seen order.
 *   An empty array is returned when nothing is found or the scan fails.
 */
async function detectSpaRoutes(page, origin) {
  // Everything the callback needs is passed in or declared inside it: it must
  // stay self-contained because it is handed to page.evaluate.
  return page.evaluate((evalOrigin) => {
    const routeSet = new Set();
    try {
      const scripts = document.querySelectorAll("script");
      const allContent = Array.from(scripts, (s) => s.textContent || "").join("\n");
      // A quoted string that starts with exactly one "/" followed by path
      // characters. The (?!\/) lookahead rejects protocol-relative URLs such as
      // "//cdn/lib", which would otherwise be glued onto the origin as a bogus
      // route. Parameterized patterns ("/users/:id", "/files/*") can never
      // match because ":" and "*" are outside the character class, so no
      // extra filtering is required; the Set handles duplicates.
      const pathRegex = /['"`](\/(?!\/)[a-zA-Z0-9_\-/]+)['"`]/g;
      let match;
      while ((match = pathRegex.exec(allContent)) !== null) {
        routeSet.add(match[1]);
      }
    } catch {
      // Best-effort detection: a failure while reading scripts must not abort
      // the crawl, so fall through and return whatever was collected.
    }
    // Trim the origin once instead of once per route.
    const base = evalOrigin.replace(/\/$/, "");
    return Array.from(routeSet, (routePath) => `${base}${routePath}`);
  }, origin);
}
2797
2815
  function parseRobotsTxt(text) {
2798
2816
  const rules = [];
2799
2817
  let inRelevantBlock = false;
@@ -2933,6 +2951,7 @@ var crawlCommand = registerCommand({
2933
2951
  allowSubdomains: z17.boolean().default(false),
2934
2952
  allowExternalLinks: z17.boolean().default(false),
2935
2953
  allowBackwardCrawling: z17.boolean().default(false),
2954
+ enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
2936
2955
  format: z17.enum(["markdown", "html"]).default("markdown"),
2937
2956
  onlyMainContent: z17.boolean().default(true),
2938
2957
  concurrency: z17.number().default(3),
@@ -2949,6 +2968,7 @@ var crawlCommand = registerCommand({
2949
2968
  allowSubdomains: p.allowSubdomains,
2950
2969
  allowExternalLinks: p.allowExternalLinks,
2951
2970
  allowBackwardCrawling: p.allowBackwardCrawling,
2971
+ enableSpa: p.enableSpa,
2952
2972
  format: p.format,
2953
2973
  onlyMainContent: p.onlyMainContent,
2954
2974
  concurrency: Math.min(Math.max(p.concurrency, 1), 10),
@@ -2981,6 +3001,22 @@ var crawlCommand = registerCommand({
2981
3001
  const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
2982
3002
  results.push({ url: seedPage.url(), title, content });
2983
3003
  const firstLinks = await extractLinks(seedPage, startUrl.origin);
3004
+ if (options.enableSpa) {
3005
+ const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
3006
+ for (const route2 of spaRoutes) {
3007
+ try {
3008
+ const absNorm = normalizeUrl(stripHashAnchorQuery(route2));
3009
+ if (!visited.has(absNorm) && !shouldSkipUrl(route2)) {
3010
+ queue.push({ url: stripHashAnchorQuery(route2), depth: 1 });
3011
+ }
3012
+ } catch {
3013
+ }
3014
+ }
3015
+ if (options.verbose && spaRoutes.length > 0) {
3016
+ process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
3017
+ `);
3018
+ }
3019
+ }
2984
3020
  for (const link of firstLinks) {
2985
3021
  try {
2986
3022
  const absolute = new URL(link, seedPage.url()).href;
@@ -2761,6 +2761,24 @@ async function extractLinks(page, origin) {
2761
2761
  }).filter(Boolean);
2762
2762
  }, origin);
2763
2763
  }
2764
/**
 * Heuristically discovers candidate client-side (SPA) routes by scanning the
 * text of every <script> element on the page for quoted, absolute-looking
 * paths such as "/about" or '/users/list'.
 *
 * @param {object} page - Page handle exposing `evaluate(fn, arg)`; the callback
 *   reads `document`, so it is expected to run inside the page
 *   (presumably a Puppeteer/Playwright-style page - confirm against caller).
 * @param {string} origin - Origin prepended to every detected path; a single
 *   trailing slash is dropped before joining.
 * @returns {Promise<string[]>} De-duplicated absolute URLs in first-seen order.
 *   An empty array is returned when nothing is found or the scan fails.
 */
async function detectSpaRoutes(page, origin) {
  // Everything the callback needs is passed in or declared inside it: it must
  // stay self-contained because it is handed to page.evaluate.
  return page.evaluate((evalOrigin) => {
    const routeSet = new Set();
    try {
      const scripts = document.querySelectorAll("script");
      const allContent = Array.from(scripts, (s) => s.textContent || "").join("\n");
      // A quoted string that starts with exactly one "/" followed by path
      // characters. The (?!\/) lookahead rejects protocol-relative URLs such as
      // "//cdn/lib", which would otherwise be glued onto the origin as a bogus
      // route. Parameterized patterns ("/users/:id", "/files/*") can never
      // match because ":" and "*" are outside the character class, so no
      // extra filtering is required; the Set handles duplicates.
      const pathRegex = /['"`](\/(?!\/)[a-zA-Z0-9_\-/]+)['"`]/g;
      let match;
      while ((match = pathRegex.exec(allContent)) !== null) {
        routeSet.add(match[1]);
      }
    } catch {
      // Best-effort detection: a failure while reading scripts must not abort
      // the crawl, so fall through and return whatever was collected.
    }
    // Trim the origin once instead of once per route.
    const base = evalOrigin.replace(/\/$/, "");
    return Array.from(routeSet, (routePath) => `${base}${routePath}`);
  }, origin);
}
2764
2782
  function parseRobotsTxt(text) {
2765
2783
  const rules = [];
2766
2784
  let inRelevantBlock = false;
@@ -2900,6 +2918,7 @@ var crawlCommand = registerCommand({
2900
2918
  allowSubdomains: z17.boolean().default(false),
2901
2919
  allowExternalLinks: z17.boolean().default(false),
2902
2920
  allowBackwardCrawling: z17.boolean().default(false),
2921
+ enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
2903
2922
  format: z17.enum(["markdown", "html"]).default("markdown"),
2904
2923
  onlyMainContent: z17.boolean().default(true),
2905
2924
  concurrency: z17.number().default(3),
@@ -2916,6 +2935,7 @@ var crawlCommand = registerCommand({
2916
2935
  allowSubdomains: p.allowSubdomains,
2917
2936
  allowExternalLinks: p.allowExternalLinks,
2918
2937
  allowBackwardCrawling: p.allowBackwardCrawling,
2938
+ enableSpa: p.enableSpa,
2919
2939
  format: p.format,
2920
2940
  onlyMainContent: p.onlyMainContent,
2921
2941
  concurrency: Math.min(Math.max(p.concurrency, 1), 10),
@@ -2948,6 +2968,22 @@ var crawlCommand = registerCommand({
2948
2968
  const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
2949
2969
  results.push({ url: seedPage.url(), title, content });
2950
2970
  const firstLinks = await extractLinks(seedPage, startUrl.origin);
2971
+ if (options.enableSpa) {
2972
+ const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
2973
+ for (const route of spaRoutes) {
2974
+ try {
2975
+ const absNorm = normalizeUrl(stripHashAnchorQuery(route));
2976
+ if (!visited.has(absNorm) && !shouldSkipUrl(route)) {
2977
+ queue.push({ url: stripHashAnchorQuery(route), depth: 1 });
2978
+ }
2979
+ } catch {
2980
+ }
2981
+ }
2982
+ if (options.verbose && spaRoutes.length > 0) {
2983
+ process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
2984
+ `);
2985
+ }
2986
+ }
2951
2987
  for (const link of firstLinks) {
2952
2988
  try {
2953
2989
  const absolute = new URL(link, seedPage.url()).href;
package/dist/index.js CHANGED
@@ -2834,6 +2834,24 @@ async function extractLinks(page, origin) {
2834
2834
  }).filter(Boolean);
2835
2835
  }, origin);
2836
2836
  }
2837
/**
 * Heuristically discovers candidate client-side (SPA) routes by scanning the
 * text of every <script> element on the page for quoted, absolute-looking
 * paths such as "/about" or '/users/list'.
 *
 * @param {object} page - Page handle exposing `evaluate(fn, arg)`; the callback
 *   reads `document`, so it is expected to run inside the page
 *   (presumably a Puppeteer/Playwright-style page - confirm against caller).
 * @param {string} origin - Origin prepended to every detected path; a single
 *   trailing slash is dropped before joining.
 * @returns {Promise<string[]>} De-duplicated absolute URLs in first-seen order.
 *   An empty array is returned when nothing is found or the scan fails.
 */
async function detectSpaRoutes(page, origin) {
  // Everything the callback needs is passed in or declared inside it: it must
  // stay self-contained because it is handed to page.evaluate.
  return page.evaluate((evalOrigin) => {
    const routeSet = new Set();
    try {
      const scripts = document.querySelectorAll("script");
      const allContent = Array.from(scripts, (s) => s.textContent || "").join("\n");
      // A quoted string that starts with exactly one "/" followed by path
      // characters. The (?!\/) lookahead rejects protocol-relative URLs such as
      // "//cdn/lib", which would otherwise be glued onto the origin as a bogus
      // route. Parameterized patterns ("/users/:id", "/files/*") can never
      // match because ":" and "*" are outside the character class, so no
      // extra filtering is required; the Set handles duplicates.
      const pathRegex = /['"`](\/(?!\/)[a-zA-Z0-9_\-/]+)['"`]/g;
      let match;
      while ((match = pathRegex.exec(allContent)) !== null) {
        routeSet.add(match[1]);
      }
    } catch {
      // Best-effort detection: a failure while reading scripts must not abort
      // the crawl, so fall through and return whatever was collected.
    }
    // Trim the origin once instead of once per route.
    const base = evalOrigin.replace(/\/$/, "");
    return Array.from(routeSet, (routePath) => `${base}${routePath}`);
  }, origin);
}
2837
2855
  function parseRobotsTxt(text) {
2838
2856
  const rules = [];
2839
2857
  let inRelevantBlock = false;
@@ -2973,6 +2991,7 @@ var crawlCommand = registerCommand({
2973
2991
  allowSubdomains: z17.boolean().default(false),
2974
2992
  allowExternalLinks: z17.boolean().default(false),
2975
2993
  allowBackwardCrawling: z17.boolean().default(false),
2994
+ enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
2976
2995
  format: z17.enum(["markdown", "html"]).default("markdown"),
2977
2996
  onlyMainContent: z17.boolean().default(true),
2978
2997
  concurrency: z17.number().default(3),
@@ -2989,6 +3008,7 @@ var crawlCommand = registerCommand({
2989
3008
  allowSubdomains: p.allowSubdomains,
2990
3009
  allowExternalLinks: p.allowExternalLinks,
2991
3010
  allowBackwardCrawling: p.allowBackwardCrawling,
3011
+ enableSpa: p.enableSpa,
2992
3012
  format: p.format,
2993
3013
  onlyMainContent: p.onlyMainContent,
2994
3014
  concurrency: Math.min(Math.max(p.concurrency, 1), 10),
@@ -3021,6 +3041,22 @@ var crawlCommand = registerCommand({
3021
3041
  const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
3022
3042
  results.push({ url: seedPage.url(), title, content });
3023
3043
  const firstLinks = await extractLinks(seedPage, startUrl.origin);
3044
+ if (options.enableSpa) {
3045
+ const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
3046
+ for (const route2 of spaRoutes) {
3047
+ try {
3048
+ const absNorm = normalizeUrl(stripHashAnchorQuery(route2));
3049
+ if (!visited.has(absNorm) && !shouldSkipUrl(route2)) {
3050
+ queue.push({ url: stripHashAnchorQuery(route2), depth: 1 });
3051
+ }
3052
+ } catch {
3053
+ }
3054
+ }
3055
+ if (options.verbose && spaRoutes.length > 0) {
3056
+ process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
3057
+ `);
3058
+ }
3059
+ }
3024
3060
  for (const link of firstLinks) {
3025
3061
  try {
3026
3062
  const absolute = new URL(link, seedPage.url()).href;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@xbrowser/cli",
3
- "version": "1.7.0",
3
+ "version": "1.7.2",
4
4
  "description": "Browser automation CLI for web scraping, headless browsing, SEO analysis, and AI agent workflows. A command-line alternative to Playwright, Puppeteer, and Selenium.",
5
5
  "type": "module",
6
6
  "bin": {