easy-sitemap-generator 0.1.13 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/bin/cli.js +2 -2
- package/lib/sitemapGenerator.js +194 -108
- package/package.json +7 -7
- package/utils/kleur.js +12 -5
- package/utils/xml.js +3 -9
package/LICENSE
CHANGED
package/bin/cli.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
1
|
+
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
const { logError } = require('../utils/kleur.js');
|
|
4
4
|
const { generate } = require('../lib/sitemapGenerator.js');
|
|
@@ -10,7 +10,7 @@ if (!urlArg) {
|
|
|
10
10
|
process.exit(1);
|
|
11
11
|
}
|
|
12
12
|
|
|
13
|
-
generate(urlArg.
|
|
13
|
+
generate(urlArg.slice('--url='.length)).catch(err => {
|
|
14
14
|
logError(err);
|
|
15
15
|
process.exit(2);
|
|
16
16
|
});
|
package/lib/sitemapGenerator.js
CHANGED
|
@@ -1,108 +1,194 @@
|
|
|
1
|
-
const
|
|
2
|
-
const
|
|
3
|
-
const
|
|
4
|
-
const {
|
|
5
|
-
const {
|
|
6
|
-
const {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
const
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
}
|
|
1
|
+
const fs = require('node:fs/promises');
|
|
2
|
+
const path = require('node:path');
|
|
3
|
+
const { JSDOM } = require('jsdom');
|
|
4
|
+
const { axios, version } = require('../services/axios.js');
|
|
5
|
+
const { escapeXml, normalizeUrl, calculatePriority } = require('../utils/xml.js');
|
|
6
|
+
const { logInfo, logSuccess, logError, logWarning, logInfoStart, logInfoAppend } = require('../utils/kleur.js');
|
|
7
|
+
|
|
8
|
+
// Substrings that disqualify a URL from the crawl; shouldIncludeUrl tests them
// with String.prototype.includes against the full URL.
const IGNORED_PATTERNS = [
  'cdn-cgi',
  '?referrer=',
  '&referrer=',
  '/signin/v2/usernamerecovery',
  '/lifecycle/flows/signup',
  'join?return_to=',
];

// Base wait in milliseconds before retrying a rate-limited (429) request;
// fetchUrl doubles it on every further attempt.
const BASE_DELAY = 12 * 1000;

// Largest number of entries written to a single sitemap file; generate splits
// the output into numbered parts plus an index above this.
const MAX_URLS = 50 * 1000;
|
|
11
|
+
|
|
12
|
+
/**
 * Decides whether a discovered URL belongs to the site section being crawled.
 *
 * A URL is accepted when it sits under `baseUrl`, contains none of the
 * IGNORED_PATTERNS substrings, and has the same origin as the base.
 *
 * @param {string} url - Absolute URL to test.
 * @param {string} baseUrl - URL the crawl was started from.
 * @param {string} baseOrigin - Origin of `baseUrl`.
 * @param {string|null} [urlOrigin=null] - Origin of `url` when the caller has
 *   already parsed it; when omitted the URL is parsed here.
 * @returns {boolean} True when the URL should be crawled.
 */
const shouldIncludeUrl = (url, baseUrl, baseOrigin, urlOrigin = null) => {
  if (!url.startsWith(baseUrl)) return false;

  // A bare prefix match would let a base of ".../blog" admit ".../blog-archive".
  // Unless the base already ends a segment (or carries a query string), the
  // character right after the prefix must start a new path segment, a query or
  // a fragment.
  if (!baseUrl.endsWith('/') && !baseUrl.includes('?')) {
    const next = url.charAt(baseUrl.length);
    if (next !== '' && next !== '/' && next !== '?' && next !== '#') return false;
  }

  if (IGNORED_PATTERNS.some(pattern => url.includes(pattern))) return false;

  try {
    return (urlOrigin ?? new URL(url).origin) === baseOrigin;
  } catch {
    // `url` could not be parsed, so it cannot be on the base origin.
    return false;
  }
};
|
|
21
|
+
|
|
22
|
+
// Resolves (with no value) once `ms` milliseconds have elapsed.
const delay = ms => new Promise(resolve => {
  setTimeout(resolve, ms);
});

// Formats a Date as an ISO 8601 UTC timestamp without the millisecond part,
// e.g. "2024-01-02T03:04:05Z". toISOString() always ends in ".sssZ", so the
// last five characters are swapped for a plain "Z".
const formatIso = date => {
  const iso = date.toISOString();
  return `${iso.slice(0, -5)}Z`;
};

// Current time in the same second-precision format.
const nowIso = () => {
  const now = new Date();
  return formatIso(now);
};
|
|
25
|
+
|
|
26
|
+
/**
 * Fetches one URL and reports the outcome on the console.
 *
 * A 429 response is retried after BASE_DELAY * 2^retries milliseconds, but only
 * up to `maxRetries` times; previously the retry chain had no upper bound, so a
 * permanently rate-limited URL stalled the crawl forever with ever longer waits.
 *
 * @param {string} url - URL to request.
 * @param {number} [retries=0] - Number of 429 retries already made for this URL.
 * @param {number} [maxRetries=5] - Retries allowed before the URL is skipped.
 * @returns {Promise<object|null>} The response when its status is 200,
 *   otherwise null (non-200 status, HTTP error, network error, or retries
 *   exhausted).
 */
const fetchUrl = async (url, retries = 0, maxRetries = 5) => {
  try {
    // Written without a trailing newline; the caller or a branch below finishes the line.
    logInfoStart(`GET ${url}`);

    const res = await axios.get(url);
    if (res.status === 200) return res;

    process.stdout.write('\n');
    logWarning(`Non-200 status code (${res.status}) for URL: ${url}. Skipping...`);
    return null;
  } catch (err) {
    // Terminate the pending "GET ..." line before printing the diagnostic.
    process.stdout.write('\n');

    // No response object: the request never completed (DNS, timeout, ...).
    if (!err.response) {
      logError(`Failed to fetch ${url}. Unknown error: ${err.message}. Skipping...`);
      return null;
    }

    const statusCode = err.response.status;

    if (statusCode === 429) {
      if (retries >= maxRetries) {
        logError(`429: Rate limit still in effect after ${retries} retries. Skipping...`);
        return null;
      }

      const delayTime = BASE_DELAY * (2 ** retries);
      logWarning(`429: Rate limit hit! Retrying in ${(delayTime / 1000).toFixed(2)}s... (Attempt ${retries + 1})`);
      await delay(delayTime);
      return fetchUrl(url, retries + 1, maxRetries);
    }

    if (statusCode === 404) {
      logWarning('404: Not Found');
      return null;
    }

    logError(`${statusCode}: Failed to fetch! Skipping...`);
    return null;
  }
};
|
|
60
|
+
|
|
61
|
+
/**
 * Parses one fetched page and extracts what the crawler needs from it.
 *
 * Relative hrefs are resolved against the page they appear on (`pageUrl`), not
 * against the site base URL; otherwise "c.html" found on "/a/b/" would be
 * crawled as "/c.html".
 * NOTE(review): `pageUrl` is the requested URL; a `<base href>` element or an
 * HTTP redirect would change the effective base and is not accounted for here.
 *
 * @param {object} res - Response returned by fetchUrl (`data`, `headers`).
 * @param {string} pageUrl - URL the response was requested from.
 * @param {string} baseUrl - URL the crawl was started from.
 * @param {string} baseOrigin - Origin of `baseUrl`.
 * @returns {{canonical: string}|{canonical: null, links: Set<string>, rawLastMod: (string|null|undefined)}}
 *   Either the in-scope canonical URL the page defers to, or the page's
 *   in-scope links plus its raw last-modified value.
 */
const readPage = (res, pageUrl, baseUrl, baseOrigin) => {
  const dom = new JSDOM(res.data);
  try {
    const { document } = dom.window;

    // Require the href attribute: a canonical <link> without one would otherwise
    // be resolved from `null` and point at ".../null".
    const canonicalHref = document.querySelector('link[rel="canonical"][href]')?.getAttribute('href');
    if (canonicalHref != null) {
      try {
        const canonical = new URL(canonicalHref, pageUrl);
        canonical.hash = '';
        if (canonical.href !== pageUrl && shouldIncludeUrl(canonical.href, baseUrl, baseOrigin, canonical.origin)) {
          return { canonical: canonical.href };
        }
      } catch {
        // Unparsable canonical href: keep treating the page as its own canonical.
      }
    }

    const links = new Set();
    for (const anchor of document.querySelectorAll('a[href]')) {
      try {
        const resolved = new URL(anchor.getAttribute('href'), pageUrl);
        resolved.hash = '';
        if (shouldIncludeUrl(resolved.href, baseUrl, baseOrigin, resolved.origin)) links.add(resolved.href);
      } catch {
        // Malformed href: there is nothing to crawl for this anchor.
      }
    }

    const rawLastMod = res.headers['last-modified']
      ?? document.querySelector('meta[property="article:modified_time"]')?.getAttribute('content')
      ?? document.querySelector('meta[name="last-modified"]')?.getAttribute('content');

    return { canonical: null, links, rawLastMod };
  } finally {
    // Always release the JSDOM window, including when parsing throws.
    dom.window.close();
  }
};

/**
 * Breadth-first crawl starting at `startUrl`, recording every successfully
 * fetched in-scope page in `visitedUrls`.
 *
 * Pages whose canonical link points at a different in-scope URL are not
 * recorded; the canonical URL is queued instead.
 *
 * @param {string} startUrl - First URL to fetch.
 * @param {string} baseUrl - URL that bounds the crawl (see shouldIncludeUrl).
 * @param {string} baseOrigin - Origin of `baseUrl`.
 * @param {Map<string, {url: string, lastmod: string, priority: number}>} visitedUrls
 *   Filled in place, keyed by page URL.
 * @returns {Promise<void>}
 */
const crawl = async (startUrl, baseUrl, baseOrigin, visitedUrls) => {
  const queued = new Set();
  const queue = [];

  const enqueue = url => {
    if (queued.has(url)) return;
    queued.add(url);
    queue.push(url);
  };

  enqueue(normalizeUrl(startUrl));

  // A moving cursor replaces queue.shift(), which re-indexes the array on every call.
  for (let head = 0; head < queue.length; head += 1) {
    const normalizedUrl = queue[head];

    const res = await fetchUrl(normalizedUrl);
    if (!res) continue;

    const page = readPage(res, normalizedUrl, baseUrl, baseOrigin);
    if (page.canonical) {
      logInfoAppend(`GET ${normalizedUrl} (canonical → ${page.canonical}, skipped)`);
      enqueue(page.canonical);
      continue;
    }

    // An unparsable date yields an Invalid Date whose toISOString() throws a
    // RangeError; fall back to "now" instead of aborting the whole crawl.
    const parsed = page.rawLastMod ? new Date(page.rawLastMod) : null;
    const lastmodDate = parsed && !Number.isNaN(parsed.getTime()) ? parsed : new Date();

    visitedUrls.set(normalizedUrl, {
      url: normalizedUrl,
      lastmod: formatIso(lastmodDate),
      priority: calculatePriority(normalizedUrl, baseUrl),
    });

    logInfoAppend(`GET ${normalizedUrl} (${page.links.size} urls)`);

    for (const link of page.links) enqueue(link);
  }
};
|
|
128
|
+
|
|
129
|
+
/**
 * Renders a `<urlset>` sitemap document for the given entries.
 *
 * NOTE(review): indentation inside the generated XML is reproduced exactly as
 * it appeared in the reviewed listing, which may have collapsed whitespace —
 * confirm against the published file.
 *
 * @param {Array<{url: string, priority: number, lastmod: string}>} urls - Entries to list.
 * @returns {string} The sitemap XML.
 */
const buildSitemapContent = urls => {
  const renderEntry = ({ url, priority, lastmod }) => [
    ' <url>',
    `<loc>${escapeXml(url)}</loc>`,
    `<lastmod>${lastmod}</lastmod>`,
    `<priority>${priority.toFixed(2)}</priority>`,
    '</url>',
  ].join('\n');

  return [
    '<?xml version="1.0" encoding="UTF-8"?>',
    `<!-- Generated by https://github.com/sefinek/easy-sitemap-generator v${version} at ${nowIso()} -->`,
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">',
    urls.map(renderEntry).join('\n'),
    '</urlset>',
  ].join('\n');
};
|
|
138
|
+
|
|
139
|
+
/**
 * Renders a `<sitemapindex>` document that points at the individual sitemap
 * part files.
 *
 * NOTE(review): indentation inside the generated XML is reproduced exactly as
 * it appeared in the reviewed listing, which may have collapsed whitespace —
 * confirm against the published file.
 *
 * @param {Array<{loc: string, lastmod: string}>} sitemapLocs - Location and timestamp of each part.
 * @returns {string} The sitemap index XML.
 */
const buildIndexContent = sitemapLocs => {
  const renderPart = ({ loc, lastmod }) => [
    ' <sitemap>',
    `<loc>${escapeXml(loc)}</loc>`,
    `<lastmod>${lastmod}</lastmod>`,
    '</sitemap>',
  ].join('\n');

  return [
    '<?xml version="1.0" encoding="UTF-8"?>',
    `<!-- Generated by https://github.com/sefinek/easy-sitemap-generator v${version} at ${nowIso()} -->`,
    '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    sitemapLocs.map(renderPart).join('\n'),
    '</sitemapindex>',
  ].join('\n');
};
|
|
147
|
+
|
|
148
|
+
/**
 * Crawls a site and writes its sitemap to disk.
 *
 * Up to MAX_URLS entries are written as a single file at `destination`. Above
 * that, the entries are split into numbered part files next to it
 * ("<name>-1<ext>", "<name>-2<ext>", ...) and `destination` itself becomes a
 * sitemap index listing the parts.
 *
 * The destination directory is created when missing; previously a path such as
 * "out/sitemap.xml" failed with ENOENT only after the whole crawl had finished.
 *
 * @param {string} baseUrl - Absolute URL to start from; also bounds the crawl.
 * @param {string} [destination='sitemap.xml'] - Output path, resolved against the working directory.
 * @returns {Promise<string>} The XML written to `destination` (sitemap or index).
 * @throws {TypeError} When `baseUrl` is not a valid absolute URL.
 */
const generate = async (baseUrl, destination = 'sitemap.xml') => {
  logInfo(`Starting crawl for base URL: ${baseUrl}`);

  const { origin: baseOrigin } = new URL(baseUrl);
  const visitedUrls = new Map();
  await crawl(baseUrl, baseUrl, baseOrigin, visitedUrls);

  logInfo(`Generating sitemap with ${visitedUrls.size} URLs...`);

  // Highest priority first; Array.prototype.sort is stable, so entries of equal
  // priority keep their crawl order.
  const urls = Array.from(visitedUrls.values())
    .filter(entry => entry.lastmod != null && entry.priority != null)
    .sort((a, b) => b.priority - a.priority);

  const output = path.resolve(destination);
  const dir = path.dirname(output);
  await fs.mkdir(dir, { recursive: true });

  if (urls.length <= MAX_URLS) {
    const content = buildSitemapContent(urls);
    await fs.writeFile(output, content, 'utf8');
    logSuccess(`Sitemap generated at ${output}`);
    return content;
  }

  logWarning(`Found ${urls.length} URLs — exceeds the ${MAX_URLS} limit. Splitting into multiple sitemap files...`);

  const ext = path.extname(destination);
  const base = path.basename(destination, ext);
  const timestamp = nowIso();

  const totalParts = Math.ceil(urls.length / MAX_URLS);
  const sitemapLocs = [];
  for (let i = 0, part = 1; i < urls.length; i += MAX_URLS, part++) {
    const filename = `${base}-${part}${ext}`;
    const filepath = path.join(dir, filename);
    const content = buildSitemapContent(urls.slice(i, i + MAX_URLS));
    await fs.writeFile(filepath, content, 'utf8');
    logSuccess(`Sitemap part ${part}/${totalParts} written to ${filepath}`);
    // NOTE(review): the index assumes the part files will be served from the
    // site root (`<origin>/<filename>`) — confirm this matches the deployment.
    sitemapLocs.push({ loc: `${baseOrigin}/${filename}`, lastmod: timestamp });
  }

  const indexContent = buildIndexContent(sitemapLocs);
  await fs.writeFile(output, indexContent, 'utf8');
  logSuccess(`Sitemap index written to ${output}`);

  return indexContent;
};
|
|
193
|
+
|
|
194
|
+
module.exports = { generate, version };
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "easy-sitemap-generator",
|
|
3
|
-
"version": "0.1
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "Easy and free sitemap.xml file generator without any restrictions for your website.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"sitemap",
|
|
@@ -43,16 +43,16 @@
|
|
|
43
43
|
"README.md"
|
|
44
44
|
],
|
|
45
45
|
"scripts": {
|
|
46
|
-
"
|
|
47
|
-
"
|
|
46
|
+
"m": "ncu -u && npm install && npm update",
|
|
47
|
+
"test": "echo \"Error: no test specified\" && exit 1"
|
|
48
48
|
},
|
|
49
49
|
"dependencies": {
|
|
50
|
-
"axios": "^1.
|
|
51
|
-
"jsdom": "^
|
|
50
|
+
"axios": "^1.16.1",
|
|
51
|
+
"jsdom": "^29.1.1",
|
|
52
52
|
"kleur": "^4.1.5"
|
|
53
53
|
},
|
|
54
54
|
"devDependencies": {
|
|
55
|
-
"@eslint/js": "^
|
|
56
|
-
"globals": "^
|
|
55
|
+
"@eslint/js": "^10.0.1",
|
|
56
|
+
"globals": "^17.6.0"
|
|
57
57
|
}
|
|
58
58
|
}
|
package/utils/kleur.js
CHANGED
|
@@ -1,8 +1,15 @@
|
|
|
1
1
|
const kleur = require('kleur');
|
|
2
2
|
|
|
3
|
-
const
|
|
4
|
-
const
|
|
5
|
-
const
|
|
6
|
-
const
|
|
3
|
+
const P_INFO = kleur.blue().bold('[INFO]: ');
|
|
4
|
+
const P_SUCCESS = kleur.green().bold('[SUCCESS]: ');
|
|
5
|
+
const P_ERROR = kleur.red().bold('[ERROR]: ');
|
|
6
|
+
const P_WARN = kleur.yellow().bold('[WARN]: ');
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
const logInfo = msg => console.log(P_INFO + msg);
|
|
9
|
+
const logSuccess = msg => console.log(P_SUCCESS + msg);
|
|
10
|
+
const logError = msg => console.error(P_ERROR + msg);
|
|
11
|
+
const logWarning = msg => console.warn(P_WARN + msg);
|
|
12
|
+
const logInfoStart = msg => process.stdout.write(P_INFO + msg);
|
|
13
|
+
const logInfoAppend = msg => process.stdout.write(`\r\x1b[K${P_INFO}${msg}\n`);
|
|
14
|
+
|
|
15
|
+
module.exports = { logInfo, logSuccess, logError, logWarning, logInfoStart, logInfoAppend };
|
package/utils/xml.js
CHANGED
|
@@ -1,9 +1,5 @@
|
|
|
1
|
-
const
|
|
2
|
-
|
|
3
|
-
.replace(/</g, '<')
|
|
4
|
-
.replace(/>/g, '>')
|
|
5
|
-
.replace(/"/g, '"')
|
|
6
|
-
.replace(/'/g, ''');
|
|
1
|
+
// The five characters XML reserves in text and attribute values, mapped to
// their predefined entities. The replacement values must be the entity
// references themselves; mapping each character to itself escapes nothing, and
// an unescaped "'" inside a single-quoted literal is a syntax error.
const XML_ESCAPE = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&apos;',
};

/**
 * Escapes a string for safe inclusion in XML text or attribute values.
 *
 * All five characters are replaced in a single pass, so the "&" of an entity
 * produced here is never escaped a second time.
 *
 * @param {string} str - Raw text, e.g. a URL destined for a `<loc>` element.
 * @returns {string} The text with &, <, >, " and ' replaced by entities.
 */
const escapeXml = str => str.replace(/[&<>"']/g, ch => XML_ESCAPE[ch]);
|
|
7
3
|
|
|
8
4
|
const normalizeUrl = url => {
|
|
9
5
|
const parsedUrl = new URL(url);
|
|
@@ -19,9 +15,7 @@ const calculatePriority = (url, baseUrl) => {
|
|
|
19
15
|
if (depth === 0) return 1.0;
|
|
20
16
|
if (depth === 1) return 0.85;
|
|
21
17
|
if (depth === 2) return hasQuery ? 0.54 : 0.74;
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
return 0.5;
|
|
18
|
+
return hasQuery ? 0.34 : 0.44;
|
|
25
19
|
};
|
|
26
20
|
|
|
27
21
|
module.exports = { escapeXml, normalizeUrl, calculatePriority };
|