@xbrowser/cli 1.7.1 → 1.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +50 -13
- package/dist/daemon-main.js +50 -13
- package/dist/index.js +50 -13
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -2795,22 +2795,59 @@ async function extractLinks(page, origin) {
|
|
|
2795
2795
|
}, origin);
|
|
2796
2796
|
}
|
|
2797
2797
|
async function detectSpaRoutes(page, origin) {
|
|
2798
|
-
|
|
2799
|
-
|
|
2798
|
+
const routeSet = /* @__PURE__ */ new Set();
|
|
2799
|
+
const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
|
|
2800
|
+
const isParamRoute = (p) => p.includes(":") || p.includes("*");
|
|
2801
|
+
function extractPaths(source) {
|
|
2802
|
+
let match;
|
|
2803
|
+
while ((match = pathRegex.exec(source)) !== null) {
|
|
2804
|
+
const path3 = match[1];
|
|
2805
|
+
if (!isParamRoute(path3)) routeSet.add(path3);
|
|
2806
|
+
}
|
|
2807
|
+
}
|
|
2808
|
+
const scriptData = await page.evaluate(() => {
|
|
2809
|
+
const scripts = Array.from(document.querySelectorAll("script"));
|
|
2810
|
+
return {
|
|
2811
|
+
inlineContent: scripts.map((s) => s.textContent || "").join("\n"),
|
|
2812
|
+
externalUrls: scripts.map((s) => s.src).filter((src) => src && !src.includes("analytics") && !src.includes("google") && !src.includes("baidu"))
|
|
2813
|
+
};
|
|
2814
|
+
});
|
|
2815
|
+
const { inlineContent, externalUrls } = scriptData;
|
|
2816
|
+
extractPaths(inlineContent);
|
|
2817
|
+
for (const src of externalUrls) {
|
|
2800
2818
|
try {
|
|
2801
|
-
const
|
|
2802
|
-
const
|
|
2803
|
-
|
|
2804
|
-
|
|
2805
|
-
|
|
2806
|
-
const path3 = match[1];
|
|
2807
|
-
if (path3.includes(":") || path3.includes("*") || routeSet.has(path3)) continue;
|
|
2808
|
-
routeSet.add(path3);
|
|
2819
|
+
const absoluteSrc = src.startsWith("http") ? src : new URL(src, page.url()).href;
|
|
2820
|
+
const resp = await fetch(absoluteSrc, { signal: AbortSignal.timeout(5e3) });
|
|
2821
|
+
if (resp.ok) {
|
|
2822
|
+
const text = await resp.text();
|
|
2823
|
+
extractPaths(text);
|
|
2809
2824
|
}
|
|
2810
2825
|
} catch {
|
|
2811
2826
|
}
|
|
2812
|
-
|
|
2813
|
-
|
|
2827
|
+
}
|
|
2828
|
+
try {
|
|
2829
|
+
const vueRoutes = await page.evaluate((evalOrigin) => {
|
|
2830
|
+
const routes2 = [];
|
|
2831
|
+
const win = window;
|
|
2832
|
+
const vueApp = win.__vue_app__;
|
|
2833
|
+
const gp = vueApp?.config?.globalProperties;
|
|
2834
|
+
const router = gp?.$router;
|
|
2835
|
+
const routeList = router?.options?.routes;
|
|
2836
|
+
if (routeList) {
|
|
2837
|
+
for (const r of routeList) {
|
|
2838
|
+
if (r.path && !r.path.includes(":") && r.path !== "/" && r.path !== "") {
|
|
2839
|
+
routes2.push(`${evalOrigin.replace(/\/$/, "")}/#${r.path}`);
|
|
2840
|
+
}
|
|
2841
|
+
}
|
|
2842
|
+
}
|
|
2843
|
+
return routes2;
|
|
2844
|
+
}, origin);
|
|
2845
|
+
for (const r of vueRoutes) routeSet.add(r);
|
|
2846
|
+
} catch {
|
|
2847
|
+
}
|
|
2848
|
+
return Array.from(routeSet).map(
|
|
2849
|
+
(p) => p.startsWith("http") ? p : `${origin.replace(/\/$/, "")}${p.startsWith("/") ? "" : "/"}${p}`
|
|
2850
|
+
);
|
|
2814
2851
|
}
|
|
2815
2852
|
function parseRobotsTxt(text) {
|
|
2816
2853
|
const rules = [];
|
|
@@ -2951,7 +2988,7 @@ var crawlCommand = registerCommand({
|
|
|
2951
2988
|
allowSubdomains: z17.boolean().default(false),
|
|
2952
2989
|
allowExternalLinks: z17.boolean().default(false),
|
|
2953
2990
|
allowBackwardCrawling: z17.boolean().default(false),
|
|
2954
|
-
enableSpa: z17.boolean().default(
|
|
2991
|
+
enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
|
|
2955
2992
|
format: z17.enum(["markdown", "html"]).default("markdown"),
|
|
2956
2993
|
onlyMainContent: z17.boolean().default(true),
|
|
2957
2994
|
concurrency: z17.number().default(3),
|
package/dist/daemon-main.js
CHANGED
|
@@ -2762,22 +2762,59 @@ async function extractLinks(page, origin) {
|
|
|
2762
2762
|
}, origin);
|
|
2763
2763
|
}
|
|
2764
2764
|
async function detectSpaRoutes(page, origin) {
|
|
2765
|
-
|
|
2766
|
-
|
|
2765
|
+
const routeSet = /* @__PURE__ */ new Set();
|
|
2766
|
+
const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
|
|
2767
|
+
const isParamRoute = (p) => p.includes(":") || p.includes("*");
|
|
2768
|
+
function extractPaths(source) {
|
|
2769
|
+
let match;
|
|
2770
|
+
while ((match = pathRegex.exec(source)) !== null) {
|
|
2771
|
+
const path2 = match[1];
|
|
2772
|
+
if (!isParamRoute(path2)) routeSet.add(path2);
|
|
2773
|
+
}
|
|
2774
|
+
}
|
|
2775
|
+
const scriptData = await page.evaluate(() => {
|
|
2776
|
+
const scripts = Array.from(document.querySelectorAll("script"));
|
|
2777
|
+
return {
|
|
2778
|
+
inlineContent: scripts.map((s) => s.textContent || "").join("\n"),
|
|
2779
|
+
externalUrls: scripts.map((s) => s.src).filter((src) => src && !src.includes("analytics") && !src.includes("google") && !src.includes("baidu"))
|
|
2780
|
+
};
|
|
2781
|
+
});
|
|
2782
|
+
const { inlineContent, externalUrls } = scriptData;
|
|
2783
|
+
extractPaths(inlineContent);
|
|
2784
|
+
for (const src of externalUrls) {
|
|
2767
2785
|
try {
|
|
2768
|
-
const
|
|
2769
|
-
const
|
|
2770
|
-
|
|
2771
|
-
|
|
2772
|
-
|
|
2773
|
-
const path2 = match[1];
|
|
2774
|
-
if (path2.includes(":") || path2.includes("*") || routeSet.has(path2)) continue;
|
|
2775
|
-
routeSet.add(path2);
|
|
2786
|
+
const absoluteSrc = src.startsWith("http") ? src : new URL(src, page.url()).href;
|
|
2787
|
+
const resp = await fetch(absoluteSrc, { signal: AbortSignal.timeout(5e3) });
|
|
2788
|
+
if (resp.ok) {
|
|
2789
|
+
const text = await resp.text();
|
|
2790
|
+
extractPaths(text);
|
|
2776
2791
|
}
|
|
2777
2792
|
} catch {
|
|
2778
2793
|
}
|
|
2779
|
-
|
|
2780
|
-
|
|
2794
|
+
}
|
|
2795
|
+
try {
|
|
2796
|
+
const vueRoutes = await page.evaluate((evalOrigin) => {
|
|
2797
|
+
const routes = [];
|
|
2798
|
+
const win = window;
|
|
2799
|
+
const vueApp = win.__vue_app__;
|
|
2800
|
+
const gp = vueApp?.config?.globalProperties;
|
|
2801
|
+
const router = gp?.$router;
|
|
2802
|
+
const routeList = router?.options?.routes;
|
|
2803
|
+
if (routeList) {
|
|
2804
|
+
for (const r of routeList) {
|
|
2805
|
+
if (r.path && !r.path.includes(":") && r.path !== "/" && r.path !== "") {
|
|
2806
|
+
routes.push(`${evalOrigin.replace(/\/$/, "")}/#${r.path}`);
|
|
2807
|
+
}
|
|
2808
|
+
}
|
|
2809
|
+
}
|
|
2810
|
+
return routes;
|
|
2811
|
+
}, origin);
|
|
2812
|
+
for (const r of vueRoutes) routeSet.add(r);
|
|
2813
|
+
} catch {
|
|
2814
|
+
}
|
|
2815
|
+
return Array.from(routeSet).map(
|
|
2816
|
+
(p) => p.startsWith("http") ? p : `${origin.replace(/\/$/, "")}${p.startsWith("/") ? "" : "/"}${p}`
|
|
2817
|
+
);
|
|
2781
2818
|
}
|
|
2782
2819
|
function parseRobotsTxt(text) {
|
|
2783
2820
|
const rules = [];
|
|
@@ -2918,7 +2955,7 @@ var crawlCommand = registerCommand({
|
|
|
2918
2955
|
allowSubdomains: z17.boolean().default(false),
|
|
2919
2956
|
allowExternalLinks: z17.boolean().default(false),
|
|
2920
2957
|
allowBackwardCrawling: z17.boolean().default(false),
|
|
2921
|
-
enableSpa: z17.boolean().default(
|
|
2958
|
+
enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
|
|
2922
2959
|
format: z17.enum(["markdown", "html"]).default("markdown"),
|
|
2923
2960
|
onlyMainContent: z17.boolean().default(true),
|
|
2924
2961
|
concurrency: z17.number().default(3),
|
package/dist/index.js
CHANGED
|
@@ -2835,22 +2835,59 @@ async function extractLinks(page, origin) {
|
|
|
2835
2835
|
}, origin);
|
|
2836
2836
|
}
|
|
2837
2837
|
async function detectSpaRoutes(page, origin) {
|
|
2838
|
-
|
|
2839
|
-
|
|
2838
|
+
const routeSet = /* @__PURE__ */ new Set();
|
|
2839
|
+
const pathRegex = /['"`](\/[a-zA-Z0-9_\-/]+)['"`]/g;
|
|
2840
|
+
const isParamRoute = (p) => p.includes(":") || p.includes("*");
|
|
2841
|
+
function extractPaths(source) {
|
|
2842
|
+
let match;
|
|
2843
|
+
while ((match = pathRegex.exec(source)) !== null) {
|
|
2844
|
+
const path5 = match[1];
|
|
2845
|
+
if (!isParamRoute(path5)) routeSet.add(path5);
|
|
2846
|
+
}
|
|
2847
|
+
}
|
|
2848
|
+
const scriptData = await page.evaluate(() => {
|
|
2849
|
+
const scripts = Array.from(document.querySelectorAll("script"));
|
|
2850
|
+
return {
|
|
2851
|
+
inlineContent: scripts.map((s) => s.textContent || "").join("\n"),
|
|
2852
|
+
externalUrls: scripts.map((s) => s.src).filter((src) => src && !src.includes("analytics") && !src.includes("google") && !src.includes("baidu"))
|
|
2853
|
+
};
|
|
2854
|
+
});
|
|
2855
|
+
const { inlineContent, externalUrls } = scriptData;
|
|
2856
|
+
extractPaths(inlineContent);
|
|
2857
|
+
for (const src of externalUrls) {
|
|
2840
2858
|
try {
|
|
2841
|
-
const
|
|
2842
|
-
const
|
|
2843
|
-
|
|
2844
|
-
|
|
2845
|
-
|
|
2846
|
-
const path5 = match[1];
|
|
2847
|
-
if (path5.includes(":") || path5.includes("*") || routeSet.has(path5)) continue;
|
|
2848
|
-
routeSet.add(path5);
|
|
2859
|
+
const absoluteSrc = src.startsWith("http") ? src : new URL(src, page.url()).href;
|
|
2860
|
+
const resp = await fetch(absoluteSrc, { signal: AbortSignal.timeout(5e3) });
|
|
2861
|
+
if (resp.ok) {
|
|
2862
|
+
const text = await resp.text();
|
|
2863
|
+
extractPaths(text);
|
|
2849
2864
|
}
|
|
2850
2865
|
} catch {
|
|
2851
2866
|
}
|
|
2852
|
-
|
|
2853
|
-
|
|
2867
|
+
}
|
|
2868
|
+
try {
|
|
2869
|
+
const vueRoutes = await page.evaluate((evalOrigin) => {
|
|
2870
|
+
const routes2 = [];
|
|
2871
|
+
const win = window;
|
|
2872
|
+
const vueApp = win.__vue_app__;
|
|
2873
|
+
const gp = vueApp?.config?.globalProperties;
|
|
2874
|
+
const router = gp?.$router;
|
|
2875
|
+
const routeList = router?.options?.routes;
|
|
2876
|
+
if (routeList) {
|
|
2877
|
+
for (const r of routeList) {
|
|
2878
|
+
if (r.path && !r.path.includes(":") && r.path !== "/" && r.path !== "") {
|
|
2879
|
+
routes2.push(`${evalOrigin.replace(/\/$/, "")}/#${r.path}`);
|
|
2880
|
+
}
|
|
2881
|
+
}
|
|
2882
|
+
}
|
|
2883
|
+
return routes2;
|
|
2884
|
+
}, origin);
|
|
2885
|
+
for (const r of vueRoutes) routeSet.add(r);
|
|
2886
|
+
} catch {
|
|
2887
|
+
}
|
|
2888
|
+
return Array.from(routeSet).map(
|
|
2889
|
+
(p) => p.startsWith("http") ? p : `${origin.replace(/\/$/, "")}${p.startsWith("/") ? "" : "/"}${p}`
|
|
2890
|
+
);
|
|
2854
2891
|
}
|
|
2855
2892
|
function parseRobotsTxt(text) {
|
|
2856
2893
|
const rules = [];
|
|
@@ -2991,7 +3028,7 @@ var crawlCommand = registerCommand({
|
|
|
2991
3028
|
allowSubdomains: z17.boolean().default(false),
|
|
2992
3029
|
allowExternalLinks: z17.boolean().default(false),
|
|
2993
3030
|
allowBackwardCrawling: z17.boolean().default(false),
|
|
2994
|
-
enableSpa: z17.boolean().default(
|
|
3031
|
+
enableSpa: z17.boolean().default(true).describe("Disable to skip SPA route detection"),
|
|
2995
3032
|
format: z17.enum(["markdown", "html"]).default("markdown"),
|
|
2996
3033
|
onlyMainContent: z17.boolean().default(true),
|
|
2997
3034
|
concurrency: z17.number().default(3),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@xbrowser/cli",
|
|
3
|
-
"version": "1.7.1",
|
|
3
|
+
"version": "1.7.3",
|
|
4
4
|
"description": "Browser automation CLI for web scraping, headless browsing, SEO analysis, and AI agent workflows. A command-line alternative to Playwright, Puppeteer, and Selenium.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|