@opendesign-plus/geo-scripts 0.0.1-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/generate-llms-full.d.ts +76 -0
- package/dist/generate-llms-full.js +238 -0
- package/dist/generate-robots-txt.d.ts +74 -0
- package/dist/generate-robots-txt.js +68 -0
- package/dist/vitepress-sitemap-transformer.d.ts +32 -0
- package/dist/vitepress-sitemap-transformer.js +29 -0
- package/package.json +40 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* generate-llms-full.ts - 生成 llms-full.txt 文件
|
|
3
|
+
*
|
|
4
|
+
* 从网站sitemap或本地HTML目录抓取页面内容,转换为Markdown格式,
|
|
5
|
+
* 生成适合AI大语言模型读取的 llms-full.txt 文件。
|
|
6
|
+
*
|
|
7
|
+
* 支持两种调用方式:
|
|
8
|
+
*
|
|
9
|
+
* 1. 命令行调用:
|
|
10
|
+
* npx tsx generate-llms-full.ts [options]
|
|
11
|
+
*
|
|
12
|
+
* 选项:
|
|
13
|
+
* -s, --site <url> 网站根目录地址
|
|
14
|
+
* --html-dir <dirpath> 本地HTML构建产物目录(从本地文件读取,不传的话则遍历网站sitemap)
|
|
15
|
+
* --prefix <string> 生成文件的自定义开头内容
|
|
16
|
+
* --spa 是否为SPA应用,使用Playwright渲染
|
|
17
|
+
* --selector <selector> HTML元素选择器,默认 'main'
|
|
18
|
+
* --rmc --remove-class <...> 转md时移除的HTML元素class选择器(逗号分隔或传数组)
|
|
19
|
+
* --rme --remove-elements <...> 转md时移除的HTML元素标签(逗号分隔或传数组)
|
|
20
|
+
* -o, --output <file> 输出文件路径,默认 './llms-full.txt'
|
|
21
|
+
* -e, --exclude <patterns...> 排除URL的正则表达式模式
|
|
22
|
+
* -c, --concurrency <number> 并发数,默认 '5'
|
|
23
|
+
*
|
|
24
|
+
* 示例:
|
|
25
|
+
* # 从网站sitemap抓取
|
|
26
|
+
* npx tsx generate-llms-full.ts -s https://example.com --spa -c 10
|
|
27
|
+
*
|
|
28
|
+
* # 从本地HTML目录读取
|
|
29
|
+
* npx tsx generate-llms-full.ts --html-dir ./dist -s https://example.com
|
|
30
|
+
*
|
|
31
|
+
* # 排除特定URL
|
|
32
|
+
* npx tsx generate-llms-full.ts -s https://example.com -e "/admin" -e "/login"
|
|
33
|
+
*
|
|
34
|
+
* 2. 模块导入调用:
|
|
35
|
+
* import generateLLMsFull from '@opendesign-plus/geo-scripts/generate-llms-full';
|
|
36
|
+
*
|
|
37
|
+
* generateLLMsFull({
|
|
38
|
+
* site: 'https://example.com',
|
|
39
|
+
* spa: true,
|
|
40
|
+
* selector: 'main',
|
|
41
|
+
* concurrency: '10',
|
|
42
|
+
* output: './llms-full.txt'
|
|
43
|
+
* });
|
|
44
|
+
*
|
|
45
|
+
* // 从本地HTML目录读取
|
|
46
|
+
* generateLLMsFull({
|
|
47
|
+
* htmlDir: './dist',
|
|
48
|
+
* site: 'https://example.com', // 用于生成完整URL
|
|
49
|
+
* selector: 'article',
|
|
50
|
+
* output: './llms-full.txt'
|
|
51
|
+
* });
|
|
52
|
+
*/
|
|
53
|
+
export interface GenerateLLMsFullOptions {
|
|
54
|
+
/** 网站根目录地址 */
|
|
55
|
+
site?: string;
|
|
56
|
+
/** 生成文件的自定义开头内容 */
|
|
57
|
+
prefix?: string;
|
|
58
|
+
/** 是否为SPA应用,使用Playwright渲染 */
|
|
59
|
+
spa?: boolean;
|
|
60
|
+
/** HTML元素选择器,默认 'main' */
|
|
61
|
+
selector?: string;
|
|
62
|
+
/** 转md时移除的HTML元素class选择器(逗号分隔或传数组) */
|
|
63
|
+
removeClass?: string | string[];
|
|
64
|
+
/** 转md时移除的HTML元素标签(逗号分隔或传数组) */
|
|
65
|
+
removeElements?: string | string[];
|
|
66
|
+
/** 本地HTML构建产物目录(从本地文件读取,不传的话则遍历网站sitemap) */
|
|
67
|
+
htmlDir?: string;
|
|
68
|
+
/** 输出文件路径,默认 './llms-full.txt' */
|
|
69
|
+
output?: string;
|
|
70
|
+
/** 排除URL的正则表达式模式 */
|
|
71
|
+
exclude?: (string | RegExp)[];
|
|
72
|
+
/** 并发数,默认 '5' */
|
|
73
|
+
concurrency?: string | number;
|
|
74
|
+
}
|
|
75
|
+
declare function generateLLMsFull(options: GenerateLLMsFullOptions): Promise<void>;
|
|
76
|
+
export default generateLLMsFull;
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
import { chromium as x } from "playwright-core";
|
|
2
|
+
import $ from "turndown";
|
|
3
|
+
import b from "p-limit";
|
|
4
|
+
import { createWriteStream as v, readdirSync as C, existsSync as L } from "node:fs";
|
|
5
|
+
import { resolve as S, join as d, relative as A } from "node:path";
|
|
6
|
+
import { fileURLToPath as P } from "node:url";
|
|
7
|
+
import { pipeline as T } from "node:stream/promises";
|
|
8
|
+
import w from "node:process";
|
|
9
|
+
import { Command as k } from "commander";
|
|
10
|
+
import { JSDOM as R } from "jsdom";
|
|
11
|
+
import { readFile as M } from "node:fs/promises";
|
|
12
|
+
// Shared Turndown converter used for every page: ATX headings ("# Title")
// and "-" bullets keep the generated Markdown uniform.
const h = new $({
  headingStyle: "atx",
  bulletListMarker: "-"
});

// Images are dropped entirely from the Markdown output.
h.addRule("remove-imgs", {
  filter: ["img"],
  replacement: () => ""
});

// Anchors that wrap a card (an element with class "o-card-title") are turned
// into a Markdown link labelled with the card title; every other anchor is
// reduced to its already-converted inner content.
h.addRule("links", {
  filter: ["a"],
  replacement: (e, s) => {
    const titleNode = s.querySelector(".o-card-title");
    const title = titleNode ? titleNode.textContent : undefined;
    if (!title) {
      return e;
    }
    const href = s.getAttribute("href");
    // Markdown link syntax is [text](url); without an href only the title is kept.
    return href ? `[${title}](${href}) ` : `${title} `;
  }
});

// Tables are flattened to pipe tables. The first row is treated as the
// header, so a separator row is emitted right after it.
h.addRule("table", {
  filter: "table",
  replacement: (e, s) => {
    // A raw newline or "|" inside a cell would break the row, so whitespace
    // is collapsed and pipes are escaped.
    const cellText = (cell) => (cell.textContent || "").replace(/\s+/g, " ").trim().replace(/\|/g, "\\|");
    let markdown = "";
    Array.from(s.querySelectorAll("tr")).forEach((row, rowIndex) => {
      const cells = Array.from(row.querySelectorAll("th, td"));
      markdown += "| " + cells.map(cellText).join(" | ") + " |\n";
      if (rowIndex === 0) {
        markdown += "| " + cells.map(() => "---").join(" | ") + " |\n";
      }
    });
    return markdown;
  }
});
|
|
46
|
+
/**
 * Launch a headless Chromium instance for rendering SPA pages.
 *
 * A fixed list of well-known Chrome/Chromium install locations is probed
 * first. When none of them exists, `executablePath` is left undefined so
 * Playwright falls back to its own bundled browser.
 *
 * @returns The launched Playwright browser.
 */
async function E() {
  const knownLocations = [
    "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
    "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
    "/usr/bin/chromium-browser",
    "/usr/bin/chromium",
    "/usr/bin/google-chrome",
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
  ];
  // First existing path wins; probing stops as soon as one is found.
  const executablePath = knownLocations.find((candidate) => L(candidate));
  if (executablePath) {
    console.log(`使用浏览器:${executablePath}`);
  } else {
    console.log("⚠ 未找到系统 Chrome,尝试使用 playwright 自带 Chromium...");
    console.log(" 如果失败,请运行:npx playwright install chromium");
  }
  const browser = await x.launch({
    headless: true,
    executablePath,
    args: ["--no-sandbox", "--disable-setuid-sandbox"]
  });
  return browser;
}
|
|
67
|
+
/**
 * Discover the page URLs advertised by a site's sitemaps.
 *
 * Sitemap locations are read from the `Sitemap:` lines of `robots.txt`;
 * when robots.txt is unavailable or lists none, `<site>sitemap.xml` is used.
 * Every sitemap document is classified on its own: a sitemap index is
 * expanded recursively, a plain sitemap contributes its `<loc>` entries.
 * Documents that cannot be fetched contribute nothing.
 *
 * @param e Site root URL. It must already end with "/" because
 *          "robots.txt" / "sitemap.xml" are appended verbatim.
 * @returns De-duplicated page URLs in discovery order (empty when no
 *          sitemap could be read).
 */
async function F(e) {
  const sitemapUrls = [];
  try {
    const robots = await fetch(`${e}robots.txt`);
    if (!robots.ok) {
      console.log(`Fetch robots.txt failed ${robots.status}`);
    } else {
      // trim() also removes the "\r" left behind by CRLF line endings.
      for (const rawLine of (await robots.text()).split("\n")) {
        const line = rawLine.trim();
        if (/^sitemap:/i.test(line)) {
          const location = line.replace(/^sitemap:\s*/i, "");
          if (location) {
            sitemapUrls.push(location);
          }
        }
      }
    }
  } catch {
    // Network failure while reading robots.txt: use the default location below.
  }
  if (sitemapUrls.length === 0) {
    sitemapUrls.push(`${e}sitemap.xml`);
  }

  // URLs inside sitemap XML are entity-escaped; decode the predefined XML
  // entities in a single pass so "&amp;lt;" is not decoded twice.
  const xmlEntities = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  const decodeXml = (text) => text.replace(/&(amp|lt|gt|quot|apos);/g, (match, name) => xmlEntities[name]);
  const extractLocs = (xml) => [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)]
    .map((match) => decodeXml(match[1].trim()))
    .filter(Boolean);

  const fetchText = async (url) => {
    try {
      const response = await fetch(url);
      return response.ok ? await response.text() : "";
    } catch {
      return "";
    }
  };

  // Guards against sitemap indexes that (directly or indirectly) list themselves.
  const visited = new Set();
  const collectPages = async (sitemapUrl) => {
    if (visited.has(sitemapUrl)) {
      return [];
    }
    visited.add(sitemapUrl);
    const xml = await fetchText(sitemapUrl);
    if (!xml) {
      return [];
    }
    const locations = extractLocs(xml);
    if (!xml.includes("sitemapindex")) {
      return locations;
    }
    // In an index, every <loc> is itself a sitemap to expand.
    const nested = await Promise.all(locations.map((location) => collectPages(location)));
    return nested.flat();
  };

  const pages = (await Promise.all(sitemapUrls.map((url) => collectPages(url)))).flat();
  return [...new Set(pages)];
}
|
|
116
|
+
/**
 * Generate an llms-full.txt file: collect pages, extract the selected
 * element from each, convert it to Markdown and stream the sections into
 * the output file in input order.
 *
 * Pages come from one of two sources:
 *  - `htmlDir` set: every `*.html` file below that directory is read from disk;
 *  - otherwise: URLs are discovered through the site's sitemap and fetched
 *    (rendered with a headless browser when `spa` is set).
 *
 * @param e Options object (see GenerateLLMsFullOptions): site, prefix, spa,
 *          selector, removeClass, removeElements, htmlDir, output, exclude,
 *          concurrency.
 * @returns Resolves when the output file has been written. Returns early
 *          (writing nothing) when neither a valid `site` nor `htmlDir` is
 *          given, or when the sitemap yields no URLs.
 */
async function H(e) {
  const toList = (value) => Array.isArray(value) ? value : value.split(",").map((item) => item.trim());

  // NOTE: these filters are registered on the module-level Turndown
  // instance, so they accumulate if this function is called repeatedly.
  if (e.removeElements) {
    const tagNames = new Set(toList(e.removeElements));
    h.remove((node) => tagNames.has(node.tagName.toLowerCase()));
  }
  if (e.removeClass) {
    const classNames = toList(e.removeClass);
    h.remove((node) => classNames.some((className) => node.classList.contains(className)));
  }

  const excludePatterns = e.exclude && e.exclude.length
    ? e.exclude.map((pattern) => pattern instanceof RegExp ? pattern : new RegExp(pattern))
    : null;
  // Paths are normalised to forward slashes before matching. lastIndex is
  // reset because a caller-supplied /g or /y RegExp keeps state between
  // test() calls and would otherwise give alternating results.
  const isExcluded = (value) => {
    if (!excludePatterns) {
      return false;
    }
    const normalized = value.replace(/\\/g, "/");
    return excludePatterns.some((pattern) => {
      pattern.lastIndex = 0;
      return pattern.test(normalized);
    });
  };

  const htmlRoot = e.htmlDir ? S(e.htmlDir) : "";
  let origin = "";
  if (e.site) {
    try {
      origin = new URL(e.site).origin;
    } catch {
      origin = "";
    }
  }
  if (!origin && !htmlRoot) {
    console.log("请指定 --site 或 --html-dir");
    return;
  }

  let targets = [];
  if (htmlRoot) {
    // Files of a directory are listed before descending into its
    // sub-directories; excluded entries (files or whole directories) are skipped.
    const collectHtmlFiles = (dir, found = []) => {
      const subDirs = [];
      for (const entry of C(dir, { withFileTypes: true })) {
        const fullPath = d(dir, entry.name);
        if (isExcluded(fullPath)) {
          continue;
        }
        if (entry.isFile() && entry.name.endsWith(".html")) {
          found.push(fullPath);
        }
        if (entry.isDirectory()) {
          subDirs.push(fullPath);
        }
      }
      for (const subDir of subDirs) {
        collectHtmlFiles(subDir, found);
      }
      return found;
    };
    targets = collectHtmlFiles(htmlRoot);
  } else {
    const siteRoot = e.site.endsWith("/") ? e.site : e.site + "/";
    console.log(`站点: ${siteRoot}`);
    console.log(`SPA模式: ${e.spa ? "是" : "否"}`);
    console.log(`选择器: ${e.selector}`);
    console.log(`并发数: ${e.concurrency}`);
    targets = (await F(siteRoot)).filter((url) => !isExcluded(url));
    if (!targets.length) {
      console.log("sitemap empty");
      return;
    }
  }
  console.log(`共需处理 ${targets.length} 个URL`);

  // p-limit rejects anything that is not a positive integer (or Infinity);
  // fall back to the documented default instead of failing mid-stream.
  const requestedConcurrency = typeof e.concurrency == "number" ? e.concurrency : parseInt(e.concurrency || "5", 10);
  const concurrencyIsValid = (Number.isInteger(requestedConcurrency) || requestedConcurrency === Infinity) && requestedConcurrency > 0;
  if (!concurrencyIsValid) {
    console.warn(`Invalid concurrency "${e.concurrency}", falling back to 5`);
  }
  const limit = b(concurrencyIsValid ? requestedConcurrency : 5);

  const selector = e.selector || "main";
  const browser = !htmlRoot && e.spa ? await E() : null;

  // Returns the selected element's innerHTML, null when the element is
  // missing, or undefined when the page could not be fetched (already logged).
  const loadSelectedHtml = async (target) => {
    if (browser) {
      const page = await browser.newPage();
      try {
        await page.goto(target, { waitUntil: "networkidle" });
        return await page.evaluate((sel) => {
          const element = document.querySelector(sel);
          return element ? element.innerHTML : null;
        }, selector);
      } finally {
        // Always release the tab, even when navigation or evaluation fails.
        await page.close();
      }
    }
    let html = "";
    if (htmlRoot) {
      html = await M(target, "utf-8");
    } else {
      const response = await fetch(target);
      if (!response.ok) {
        console.error(`Failed to fetch ${target}: ${response.status}`);
        return undefined;
      }
      html = await response.text();
    }
    const element = new R(html).window.document.querySelector(selector);
    return element ? element.innerHTML : null;
  };

  // Map a local file to its public path: "index.html" -> "/",
  // "docs/index.html" -> "/docs/", "about.html" -> "/about".
  const toPublicPath = (filePath) => {
    let route = A(htmlRoot, filePath).replace(/\\/g, "/").replace(/\.html$/, "");
    if (route === "index") {
      route = "/";
    } else if (route.endsWith("/index")) {
      route = route.slice(0, -5);
    } else {
      route = "/" + route;
    }
    return route.startsWith("/") ? route : "/" + route;
  };

  const formatSection = (heading, markdown) => `\n---\n${heading}\n---\n\n` + markdown + `\n`;

  // Convert one page into its output section; errors are logged and the
  // page is skipped so a single failure never aborts the whole run.
  const renderSection = async (target) => {
    try {
      console.log(`Processing: ${target}`);
      const selectedHtml = await loadSelectedHtml(target);
      if (selectedHtml === undefined) {
        return;
      }
      if (!selectedHtml) {
        console.log(`No <${e.selector || "main"}> element found on ${target}`);
        return;
      }
      const markdown = h.turndown(selectedHtml);
      console.log(`✓ ${target}`);
      if (htmlRoot) {
        const route = toPublicPath(target);
        return formatSection(origin ? origin + route : route, markdown);
      }
      return formatSection(target, markdown);
    } catch (error) {
      console.error(`Error processing ${target}:`, error instanceof Error ? error.message : error);
    }
  };

  try {
    await T(
      async function* () {
        if (e.prefix) {
          yield e.prefix;
        }
        // All tasks are queued up front (bounded by `limit`) and consumed in
        // order, so sections appear in the same order as `targets`.
        const tasks = targets.map((target) => limit(() => renderSection(target)));
        for await (const section of tasks) {
          if (section) {
            yield section;
          }
        }
      },
      v(e.output || d(w.cwd(), "llms-full.txt"))
    );
  } finally {
    // Close the browser even when writing the output fails.
    if (browser) {
      await browser.close();
    }
  }
}
|
|
230
|
+
// CLI entry point: runs only when this file is the script passed to node,
// not when it is imported as a module.
if (w.argv[1] === P(import.meta.url)) {
  const program = new k();
  program
    .option("-s, --site <url>", "网站URL")
    .option("--prefix <string>", "生成文件的自定义开头")
    .option("--spa", "是否为SPA应用,使用Playwright渲染", false)
    .option("--selector <selector>", "HTML元素选择器", "main")
    .option("--rmc --remove-class <selector, selector, ...>", "转md时移除的HTML元素class选择器")
    .option("--rme --remove-elements <tag, tag, ...>", "转md时移除的HTML元素选择器")
    .option("--html-dir <dirpath>", "html构建产物目录地址")
    .option("-o --output <file>", "输出文件路径")
    .option("-e, --exclude <patterns...>", "排除URL的正则表达式模式")
    .option("-c, --concurrency <number>", "并发数", "5")
    .parse();
  // Report failures explicitly and signal them through the exit code
  // instead of leaving the promise floating.
  H(program.opts()).catch((error) => {
    console.error(error);
    w.exitCode = 1;
  });
}
|
|
236
|
+
export {
|
|
237
|
+
H as default
|
|
238
|
+
};
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* generate-robots-txt.ts - 生成 robots.txt 文件
|
|
3
|
+
*
|
|
4
|
+
* 支持两种调用方式:
|
|
5
|
+
*
|
|
6
|
+
* 1. 命令行调用:
|
|
7
|
+
* node generate-robots-txt.ts [options]
|
|
8
|
+
*
|
|
9
|
+
* 选项:
|
|
10
|
+
* -u, --user-agent <string> User-agent,默认 '*'
|
|
11
|
+
* -a, --allow <paths...> 允许的路径,可多次使用
|
|
12
|
+
* -d, --disallow <paths...> 禁止的路径,可多次使用
|
|
13
|
+
* -s, --sitemap <url> Sitemap URL
|
|
14
|
+
* --crawl-delay <seconds> 爬取延迟秒数
|
|
15
|
+
* --host <host> 主机地址
|
|
16
|
+
* -o, --output <file> 输出文件路径,默认 './robots.txt'
|
|
17
|
+
* -r, --rule <json> 多规则模式,JSON字符串,可多次使用
|
|
18
|
+
*
|
|
19
|
+
* 示例:
|
|
20
|
+
* # 简单模式
|
|
21
|
+
* node generate-robots-txt.ts -d /admin -d /private -s https://example.com/sitemap.xml
|
|
22
|
+
*
|
|
23
|
+
* # 多规则模式
|
|
24
|
+
* node generate-robots-txt.ts \
|
|
25
|
+
* -r '{"userAgent":"Googlebot","allow":["/"],"disallow":["/private"]}' \
|
|
26
|
+
* -r '{"userAgent":"Bingbot","disallow":["/admin"],"crawlDelay":5}' \
|
|
27
|
+
* -s https://example.com/sitemap.xml
|
|
28
|
+
*
|
|
29
|
+
* 2. 模块导入调用:
|
|
30
|
+
* import generateRobotsTxt from '@opendesign-plus/geo-scripts/generate-robots-txt';
|
|
31
|
+
*
|
|
32
|
+
* // 简单模式
|
|
33
|
+
* generateRobotsTxt({
|
|
34
|
+
* userAgent: '*',
|
|
35
|
+
* allow: ['/public'],
|
|
36
|
+
* disallow: ['/admin', '/private'],
|
|
37
|
+
* sitemap: 'https://example.com/sitemap.xml',
|
|
38
|
+
* crawlDelay: 10,
|
|
39
|
+
* host: 'example.com',
|
|
40
|
+
* output: './robots.txt'
|
|
41
|
+
* });
|
|
42
|
+
*
|
|
43
|
+
* // 多规则模式
|
|
44
|
+
* generateRobotsTxt({
|
|
45
|
+
* rules: [
|
|
46
|
+
* { userAgent: 'Googlebot', allow: ['/'], disallow: ['/private'] },
|
|
47
|
+
* { userAgent: 'Bingbot', disallow: ['/admin'], crawlDelay: 5 }
|
|
48
|
+
* ],
|
|
49
|
+
* sitemap: 'https://example.com/sitemap.xml',
|
|
50
|
+
* output: './robots.txt'
|
|
51
|
+
* });
|
|
52
|
+
*/
|
|
53
|
+
export interface RobotsTxtRule {
|
|
54
|
+
userAgent?: string;
|
|
55
|
+
allow?: string[];
|
|
56
|
+
disallow?: string[];
|
|
57
|
+
crawlDelay?: number;
|
|
58
|
+
}
|
|
59
|
+
export interface RobotsTxtConfig {
|
|
60
|
+
userAgent?: string;
|
|
61
|
+
allow?: string[];
|
|
62
|
+
disallow?: string[];
|
|
63
|
+
sitemap?: string;
|
|
64
|
+
crawlDelay?: number;
|
|
65
|
+
host?: string;
|
|
66
|
+
rules?: RobotsTxtRule[];
|
|
67
|
+
output?: string;
|
|
68
|
+
}
|
|
69
|
+
export interface RobotsTxtResult {
|
|
70
|
+
output: string;
|
|
71
|
+
content: string;
|
|
72
|
+
}
|
|
73
|
+
declare function generateRobotsTxt(config: RobotsTxtConfig): RobotsTxtResult;
|
|
74
|
+
export default generateRobotsTxt;
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { writeFileSync as m } from "node:fs";
|
|
2
|
+
import { resolve as w } from "node:path";
|
|
3
|
+
import { fileURLToPath as g } from "node:url";
|
|
4
|
+
import c from "node:process";
|
|
5
|
+
import { Command as d } from "commander";
|
|
6
|
+
/**
 * Build a robots.txt document and write it to disk.
 *
 * Two modes are supported:
 *  - multi-rule mode: when `rules` is non-empty, one group is emitted per
 *    rule and the top-level userAgent/allow/disallow/crawlDelay are ignored;
 *  - simple mode: a single group is built from the top-level fields.
 *
 * Each group is written as a `User-agent` line immediately followed by its
 * directives, with a blank line only between groups. A rule without a
 * `userAgent` falls back to "*" so its directives are never emitted without
 * an owning `User-agent` line.
 *
 * @param p Configuration (see RobotsTxtConfig). `output` defaults to
 *          `<cwd>/robots.txt`.
 * @returns `{ output, content }`: the path written and the generated text.
 */
function $(p) {
  const {
    userAgent: defaultAgent = "*",
    allow: allowPaths = [],
    disallow: disallowPaths = [],
    sitemap,
    crawlDelay,
    host,
    rules = [],
    output = w(c.cwd(), "robots.txt")
  } = p;
  const lines = [];

  // Append one group followed by the blank line that separates it from the next.
  const appendGroup = (agent, allow, disallow, delay) => {
    lines.push(`User-agent: ${agent}`);
    for (const path of allow) {
      lines.push(`Allow: ${path}`);
    }
    for (const path of disallow) {
      lines.push(`Disallow: ${path}`);
    }
    if (delay !== undefined) {
      lines.push(`Crawl-delay: ${delay}`);
    }
    lines.push("");
  };

  if (rules.length > 0) {
    for (const rule of rules) {
      appendGroup(rule.userAgent || "*", rule.allow || [], rule.disallow || [], rule.crawlDelay);
    }
  } else {
    appendGroup(defaultAgent, allowPaths, disallowPaths, crawlDelay);
  }

  if (sitemap) {
    lines.push(`Sitemap: ${sitemap}`);
  }
  if (host) {
    lines.push(`Host: ${host}`);
  }

  // trim() removes the trailing group separator; the file ends with exactly one newline.
  const content = lines.join("\n").trim() + "\n";
  m(output, content, "utf-8");
  return { output, content };
}
|
|
42
|
+
// CLI entry point: runs only when this file is the script passed to node,
// not when it is imported as a module.
if (c.argv[1] === g(import.meta.url)) {
  // Accumulates -r/--rule values; each value is a JSON object (or array of
  // objects). Invalid JSON is reported and skipped.
  const collectRule = (raw, collected) => {
    try {
      return collected.concat(JSON.parse(raw));
    } catch {
      console.error(`无效的JSON规则: ${raw}`);
      return collected;
    }
  };
  // commander calls option parsers as (value, previousValue). Passing
  // parseInt directly would use the previous value as the radix, so parse
  // explicitly in base 10 and drop non-numeric input.
  const parseCrawlDelay = (raw) => {
    const seconds = Number.parseInt(raw, 10);
    if (Number.isNaN(seconds)) {
      console.error(`无效的爬取延迟秒数: ${raw}`);
      return undefined;
    }
    return seconds;
  };

  const program = new d();
  program
    .option("-u, --user-agent <string>", "User-agent", "*")
    .option("-a, --allow <paths...>", "允许的路径")
    .option("-d, --disallow <paths...>", "禁止的路径")
    .option("-s, --sitemap <url>", "Sitemap URL")
    .option("--crawl-delay <seconds>", "爬取延迟秒数", parseCrawlDelay)
    .option("--host <host>", "主机地址")
    .option("-o, --output <file>", "输出文件路径", w(c.cwd(), "robots.txt"))
    .option("-r, --rule <json>", "多规则模式,传入JSON字符串,可多次使用", collectRule, [])
    .parse();

  const options = program.opts();
  const config = {
    userAgent: options.userAgent,
    allow: options.allow || [],
    disallow: options.disallow || [],
    sitemap: options.sitemap,
    crawlDelay: options.crawlDelay,
    host: options.host,
    output: options.output
  };
  // Only switch to multi-rule mode when at least one rule was parsed.
  if (options.rule && options.rule.length > 0) {
    config.rules = options.rule;
  }
  const result = $(config);
  console.log(`robots.txt 已生成: ${result.output}`);
}
|
|
66
|
+
export {
|
|
67
|
+
$ as default
|
|
68
|
+
};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
type Awaitable<T> = T | Promise<T>;
|
|
2
|
+
/**
|
|
3
|
+
* 用于Vitepress项目的config.sitemap.transformItems
|
|
4
|
+
*
|
|
5
|
+
* ```js
|
|
6
|
+
* export default defineConfig({
|
|
7
|
+
* // ...
|
|
8
|
+
* sitemap: {
|
|
9
|
+
* hostname: 'https://example.com',
|
|
10
|
+
* transformItems: sitemapItemTransformer(
|
|
11
|
+
* (items) => {
|
|
12
|
+
* // 处理
|
|
13
|
+
* return items;
|
|
14
|
+
* },
|
|
15
|
+
* {
|
|
16
|
+
* 'zh/': {
|
|
17
|
+
* priority: 1
|
|
18
|
+
* },
|
|
19
|
+
* '^zh/blog/.*': { // 以^开头的键将被解析为正则表达式来匹配url
|
|
20
|
+
* priority: 0.5
|
|
21
|
+
* }
|
|
22
|
+
* }
|
|
23
|
+
* )
|
|
24
|
+
* }
|
|
25
|
+
* })
|
|
26
|
+
* ```
|
|
27
|
+
*
|
|
28
|
+
* @param steps 变长参数,值可以是函数,或者一个对象
|
|
29
|
+
* @returns
|
|
30
|
+
*/
|
|
31
|
+
export default function sitemapItemTransformer(...steps: (Record<string, any> | ((sitemapItems: any[]) => Awaitable<any[]>))[]): (items: any[]) => Promise<any[]>;
|
|
32
|
+
export {};
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
 * Compose a VitePress `sitemap.transformItems` handler from a list of steps.
 *
 * Steps run in order, each receiving the previous step's result:
 *  - a function is awaited and its return value becomes the new item list;
 *  - an object maps URLs to extra fields merged into matching items. Keys
 *    starting with "^" are compiled as regular expressions and tested
 *    against `item.url` (first matching pattern wins); other keys must equal
 *    `item.url` exactly. Exact-key fields override pattern fields.
 * Any other value (including null/undefined) is skipped.
 *
 * @param n Steps: functions and/or override objects.
 * @returns An async function that maps a sitemap item array to the transformed array.
 */
function u(...n) {
  return async (f) => {
    let items = f;
    for (const step of n) {
      if (typeof step == "function") {
        items = await step(items);
        continue;
      }
      // typeof null is "object", so null must be excluded explicitly.
      if (typeof step != "object" || step === null) {
        continue;
      }
      const patternOverrides = Object.entries(step)
        .filter(([key]) => key.startsWith("^"))
        .map(([key, fields]) => [new RegExp(key), fields]);
      items = items.map((item) => {
        const matched = patternOverrides.find(([pattern]) => pattern.test(item.url));
        return {
          ...item,
          ...(matched ? matched[1] : undefined),
          ...step[item.url]
        };
      });
    }
    return items;
  };
}
|
|
27
|
+
export {
|
|
28
|
+
u as default
|
|
29
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@opendesign-plus/geo-scripts",
|
|
3
|
+
"version": "0.0.1-rc.1",
|
|
4
|
+
"description": "",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"publishConfig": {
|
|
7
|
+
"access": "public",
|
|
8
|
+
"registry": "https://registry.npmjs.org/"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"dist"
|
|
12
|
+
],
|
|
13
|
+
"exports": {
|
|
14
|
+
"./*": {
|
|
15
|
+
"types": "./dist/*.d.ts",
|
|
16
|
+
"import": "./dist/*.js"
|
|
17
|
+
}
|
|
18
|
+
},
|
|
19
|
+
"keywords": [],
|
|
20
|
+
"author": "",
|
|
21
|
+
"license": "ISC",
|
|
22
|
+
"dependencies": {
|
|
23
|
+
"p-limit": "^7.3.0",
|
|
24
|
+
"playwright-core": "latest",
|
|
25
|
+
"turndown": "latest",
|
|
26
|
+
"commander": "^13.1.0",
|
|
27
|
+
"jsdom": "^24.0.0"
|
|
28
|
+
},
|
|
29
|
+
"devDependencies": {
|
|
30
|
+
"@types/turndown": "latest",
|
|
31
|
+
"@types/node": "^22.13.1",
|
|
32
|
+
"@types/jsdom": "latest",
|
|
33
|
+
"vite": "^6.2.3",
|
|
34
|
+
"vite-plugin-dts": "^4.5.3",
|
|
35
|
+
"typescript": "~5.8.2"
|
|
36
|
+
},
|
|
37
|
+
"scripts": {
|
|
38
|
+
"build": "vite build"
|
|
39
|
+
}
|
|
40
|
+
}
|