@xbrowser/cli 1.7.0 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +36 -0
- package/dist/daemon-main.js +36 -0
- package/dist/index.js +36 -0
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -2794,6 +2794,24 @@ async function extractLinks(page, origin) {
|
|
|
2794
2794
|
}).filter(Boolean);
|
|
2795
2795
|
}, origin);
|
|
2796
2796
|
}
|
|
2797
|
+
async function detectSpaRoutes(page, origin) {
|
|
2798
|
+
return page.evaluate((evalOrigin) => {
|
|
2799
|
+
const routeSet = /* @__PURE__ */ new Set();
|
|
2800
|
+
try {
|
|
2801
|
+
const scripts = document.querySelectorAll("script");
|
|
2802
|
+
const allContent = Array.from(scripts).map((s) => s.textContent || "").join("\n");
|
|
2803
|
+
const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
|
|
2804
|
+
let match;
|
|
2805
|
+
while ((match = pathRegex.exec(allContent)) !== null) {
|
|
2806
|
+
const path3 = match[1];
|
|
2807
|
+
if (path3.includes(":") || path3.includes("*") || routeSet.has(path3)) continue;
|
|
2808
|
+
routeSet.add(path3);
|
|
2809
|
+
}
|
|
2810
|
+
} catch {
|
|
2811
|
+
}
|
|
2812
|
+
return Array.from(routeSet).map((path3) => `${evalOrigin.replace(/\/$/, "")}${path3}`);
|
|
2813
|
+
}, origin);
|
|
2814
|
+
}
|
|
2797
2815
|
function parseRobotsTxt(text) {
|
|
2798
2816
|
const rules = [];
|
|
2799
2817
|
let inRelevantBlock = false;
|
|
@@ -2933,6 +2951,7 @@ var crawlCommand = registerCommand({
|
|
|
2933
2951
|
allowSubdomains: z17.boolean().default(false),
|
|
2934
2952
|
allowExternalLinks: z17.boolean().default(false),
|
|
2935
2953
|
allowBackwardCrawling: z17.boolean().default(false),
|
|
2954
|
+
enableSpa: z17.boolean().default(false).describe("Detect SPA (Vue/React) routes from router config"),
|
|
2936
2955
|
format: z17.enum(["markdown", "html"]).default("markdown"),
|
|
2937
2956
|
onlyMainContent: z17.boolean().default(true),
|
|
2938
2957
|
concurrency: z17.number().default(3),
|
|
@@ -2949,6 +2968,7 @@ var crawlCommand = registerCommand({
|
|
|
2949
2968
|
allowSubdomains: p.allowSubdomains,
|
|
2950
2969
|
allowExternalLinks: p.allowExternalLinks,
|
|
2951
2970
|
allowBackwardCrawling: p.allowBackwardCrawling,
|
|
2971
|
+
enableSpa: p.enableSpa,
|
|
2952
2972
|
format: p.format,
|
|
2953
2973
|
onlyMainContent: p.onlyMainContent,
|
|
2954
2974
|
concurrency: Math.min(Math.max(p.concurrency, 1), 10),
|
|
@@ -2981,6 +3001,22 @@ var crawlCommand = registerCommand({
|
|
|
2981
3001
|
const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
|
|
2982
3002
|
results.push({ url: seedPage.url(), title, content });
|
|
2983
3003
|
const firstLinks = await extractLinks(seedPage, startUrl.origin);
|
|
3004
|
+
if (options.enableSpa) {
|
|
3005
|
+
const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
|
|
3006
|
+
for (const route2 of spaRoutes) {
|
|
3007
|
+
try {
|
|
3008
|
+
const absNorm = normalizeUrl(stripHashAnchorQuery(route2));
|
|
3009
|
+
if (!visited.has(absNorm) && !shouldSkipUrl(route2)) {
|
|
3010
|
+
queue.push({ url: stripHashAnchorQuery(route2), depth: 1 });
|
|
3011
|
+
}
|
|
3012
|
+
} catch {
|
|
3013
|
+
}
|
|
3014
|
+
}
|
|
3015
|
+
if (options.verbose && spaRoutes.length > 0) {
|
|
3016
|
+
process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
|
|
3017
|
+
`);
|
|
3018
|
+
}
|
|
3019
|
+
}
|
|
2984
3020
|
for (const link of firstLinks) {
|
|
2985
3021
|
try {
|
|
2986
3022
|
const absolute = new URL(link, seedPage.url()).href;
|
package/dist/daemon-main.js
CHANGED
|
@@ -2761,6 +2761,24 @@ async function extractLinks(page, origin) {
|
|
|
2761
2761
|
}).filter(Boolean);
|
|
2762
2762
|
}, origin);
|
|
2763
2763
|
}
|
|
2764
|
+
async function detectSpaRoutes(page, origin) {
|
|
2765
|
+
return page.evaluate((evalOrigin) => {
|
|
2766
|
+
const routeSet = /* @__PURE__ */ new Set();
|
|
2767
|
+
try {
|
|
2768
|
+
const scripts = document.querySelectorAll("script");
|
|
2769
|
+
const allContent = Array.from(scripts).map((s) => s.textContent || "").join("\n");
|
|
2770
|
+
const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
|
|
2771
|
+
let match;
|
|
2772
|
+
while ((match = pathRegex.exec(allContent)) !== null) {
|
|
2773
|
+
const path2 = match[1];
|
|
2774
|
+
if (path2.includes(":") || path2.includes("*") || routeSet.has(path2)) continue;
|
|
2775
|
+
routeSet.add(path2);
|
|
2776
|
+
}
|
|
2777
|
+
} catch {
|
|
2778
|
+
}
|
|
2779
|
+
return Array.from(routeSet).map((path2) => `${evalOrigin.replace(/\/$/, "")}${path2}`);
|
|
2780
|
+
}, origin);
|
|
2781
|
+
}
|
|
2764
2782
|
function parseRobotsTxt(text) {
|
|
2765
2783
|
const rules = [];
|
|
2766
2784
|
let inRelevantBlock = false;
|
|
@@ -2900,6 +2918,7 @@ var crawlCommand = registerCommand({
|
|
|
2900
2918
|
allowSubdomains: z17.boolean().default(false),
|
|
2901
2919
|
allowExternalLinks: z17.boolean().default(false),
|
|
2902
2920
|
allowBackwardCrawling: z17.boolean().default(false),
|
|
2921
|
+
enableSpa: z17.boolean().default(false).describe("Detect SPA (Vue/React) routes from router config"),
|
|
2903
2922
|
format: z17.enum(["markdown", "html"]).default("markdown"),
|
|
2904
2923
|
onlyMainContent: z17.boolean().default(true),
|
|
2905
2924
|
concurrency: z17.number().default(3),
|
|
@@ -2916,6 +2935,7 @@ var crawlCommand = registerCommand({
|
|
|
2916
2935
|
allowSubdomains: p.allowSubdomains,
|
|
2917
2936
|
allowExternalLinks: p.allowExternalLinks,
|
|
2918
2937
|
allowBackwardCrawling: p.allowBackwardCrawling,
|
|
2938
|
+
enableSpa: p.enableSpa,
|
|
2919
2939
|
format: p.format,
|
|
2920
2940
|
onlyMainContent: p.onlyMainContent,
|
|
2921
2941
|
concurrency: Math.min(Math.max(p.concurrency, 1), 10),
|
|
@@ -2948,6 +2968,22 @@ var crawlCommand = registerCommand({
|
|
|
2948
2968
|
const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
|
|
2949
2969
|
results.push({ url: seedPage.url(), title, content });
|
|
2950
2970
|
const firstLinks = await extractLinks(seedPage, startUrl.origin);
|
|
2971
|
+
if (options.enableSpa) {
|
|
2972
|
+
const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
|
|
2973
|
+
for (const route of spaRoutes) {
|
|
2974
|
+
try {
|
|
2975
|
+
const absNorm = normalizeUrl(stripHashAnchorQuery(route));
|
|
2976
|
+
if (!visited.has(absNorm) && !shouldSkipUrl(route)) {
|
|
2977
|
+
queue.push({ url: stripHashAnchorQuery(route), depth: 1 });
|
|
2978
|
+
}
|
|
2979
|
+
} catch {
|
|
2980
|
+
}
|
|
2981
|
+
}
|
|
2982
|
+
if (options.verbose && spaRoutes.length > 0) {
|
|
2983
|
+
process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
|
|
2984
|
+
`);
|
|
2985
|
+
}
|
|
2986
|
+
}
|
|
2951
2987
|
for (const link of firstLinks) {
|
|
2952
2988
|
try {
|
|
2953
2989
|
const absolute = new URL(link, seedPage.url()).href;
|
package/dist/index.js
CHANGED
|
@@ -2834,6 +2834,24 @@ async function extractLinks(page, origin) {
|
|
|
2834
2834
|
}).filter(Boolean);
|
|
2835
2835
|
}, origin);
|
|
2836
2836
|
}
|
|
2837
|
+
async function detectSpaRoutes(page, origin) {
|
|
2838
|
+
return page.evaluate((evalOrigin) => {
|
|
2839
|
+
const routeSet = /* @__PURE__ */ new Set();
|
|
2840
|
+
try {
|
|
2841
|
+
const scripts = document.querySelectorAll("script");
|
|
2842
|
+
const allContent = Array.from(scripts).map((s) => s.textContent || "").join("\n");
|
|
2843
|
+
const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
|
|
2844
|
+
let match;
|
|
2845
|
+
while ((match = pathRegex.exec(allContent)) !== null) {
|
|
2846
|
+
const path5 = match[1];
|
|
2847
|
+
if (path5.includes(":") || path5.includes("*") || routeSet.has(path5)) continue;
|
|
2848
|
+
routeSet.add(path5);
|
|
2849
|
+
}
|
|
2850
|
+
} catch {
|
|
2851
|
+
}
|
|
2852
|
+
return Array.from(routeSet).map((path5) => `${evalOrigin.replace(/\/$/, "")}${path5}`);
|
|
2853
|
+
}, origin);
|
|
2854
|
+
}
|
|
2837
2855
|
function parseRobotsTxt(text) {
|
|
2838
2856
|
const rules = [];
|
|
2839
2857
|
let inRelevantBlock = false;
|
|
@@ -2973,6 +2991,7 @@ var crawlCommand = registerCommand({
|
|
|
2973
2991
|
allowSubdomains: z17.boolean().default(false),
|
|
2974
2992
|
allowExternalLinks: z17.boolean().default(false),
|
|
2975
2993
|
allowBackwardCrawling: z17.boolean().default(false),
|
|
2994
|
+
enableSpa: z17.boolean().default(false).describe("Detect SPA (Vue/React) routes from router config"),
|
|
2976
2995
|
format: z17.enum(["markdown", "html"]).default("markdown"),
|
|
2977
2996
|
onlyMainContent: z17.boolean().default(true),
|
|
2978
2997
|
concurrency: z17.number().default(3),
|
|
@@ -2989,6 +3008,7 @@ var crawlCommand = registerCommand({
|
|
|
2989
3008
|
allowSubdomains: p.allowSubdomains,
|
|
2990
3009
|
allowExternalLinks: p.allowExternalLinks,
|
|
2991
3010
|
allowBackwardCrawling: p.allowBackwardCrawling,
|
|
3011
|
+
enableSpa: p.enableSpa,
|
|
2992
3012
|
format: p.format,
|
|
2993
3013
|
onlyMainContent: p.onlyMainContent,
|
|
2994
3014
|
concurrency: Math.min(Math.max(p.concurrency, 1), 10),
|
|
@@ -3021,6 +3041,22 @@ var crawlCommand = registerCommand({
|
|
|
3021
3041
|
const content = options.format === "html" ? html : htmlToMarkdown(html, { onlyMainContent: options.onlyMainContent });
|
|
3022
3042
|
results.push({ url: seedPage.url(), title, content });
|
|
3023
3043
|
const firstLinks = await extractLinks(seedPage, startUrl.origin);
|
|
3044
|
+
if (options.enableSpa) {
|
|
3045
|
+
const spaRoutes = await detectSpaRoutes(seedPage, startUrl.origin);
|
|
3046
|
+
for (const route2 of spaRoutes) {
|
|
3047
|
+
try {
|
|
3048
|
+
const absNorm = normalizeUrl(stripHashAnchorQuery(route2));
|
|
3049
|
+
if (!visited.has(absNorm) && !shouldSkipUrl(route2)) {
|
|
3050
|
+
queue.push({ url: stripHashAnchorQuery(route2), depth: 1 });
|
|
3051
|
+
}
|
|
3052
|
+
} catch {
|
|
3053
|
+
}
|
|
3054
|
+
}
|
|
3055
|
+
if (options.verbose && spaRoutes.length > 0) {
|
|
3056
|
+
process.stderr.write(`[SPA] Detected ${spaRoutes.length} SPA routes
|
|
3057
|
+
`);
|
|
3058
|
+
}
|
|
3059
|
+
}
|
|
3024
3060
|
for (const link of firstLinks) {
|
|
3025
3061
|
try {
|
|
3026
3062
|
const absolute = new URL(link, seedPage.url()).href;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@xbrowser/cli",
|
|
3
|
-
"version": "1.7.0",
|
|
3
|
+
"version": "1.7.1",
|
|
4
4
|
"description": "Browser automation CLI for web scraping, headless browsing, SEO analysis, and AI agent workflows. A command-line alternative to Playwright, Puppeteer, and Selenium.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|