sitemap-xml-parser 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +81 -58
- package/bin/cli.js +58 -17
- package/index.d.ts +4 -4
- package/lib/sitemap.js +10 -7
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -8,6 +8,68 @@ Parses sitemap XML files and returns all listed URLs. Supports sitemap index fil
|
|
|
8
8
|
npm install sitemap-xml-parser
|
|
9
9
|
```
|
|
10
10
|
|
|
11
|
+
## CLI
|
|
12
|
+
|
|
13
|
+
Run without installing via `npx`:
|
|
14
|
+
|
|
15
|
+
```sh
|
|
16
|
+
npx sitemap-xml-parser <url> [options]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Or, after installing globally (`npm install -g sitemap-xml-parser`):
|
|
20
|
+
|
|
21
|
+
```sh
|
|
22
|
+
sitemap-xml-parser <url> [options]
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Fetched URLs are printed to stdout, one per line. Errors are printed to stderr. See [Options](#options) for available flags.
|
|
26
|
+
|
|
27
|
+
## Examples
|
|
28
|
+
|
|
29
|
+
```sh
|
|
30
|
+
# Print all URLs
|
|
31
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml
|
|
32
|
+
|
|
33
|
+
# Count URLs
|
|
34
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --count
|
|
35
|
+
|
|
36
|
+
# Filter by substring
|
|
37
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --filter "blog"
|
|
38
|
+
|
|
39
|
+
# Filter by regular expression
|
|
40
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --filter-regex "blog/[0-9]{4}/"
|
|
41
|
+
|
|
42
|
+
# Filter and count
|
|
43
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --filter "blog" --count
|
|
44
|
+
|
|
45
|
+
# Output as TSV
|
|
46
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --tsv > urls.tsv
|
|
47
|
+
|
|
48
|
+
# Save URLs to a file, errors to a log
|
|
49
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml > urls.txt 2> errors.log
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Options
|
|
53
|
+
|
|
54
|
+
| Option | Type | Default | Description |
|
|
55
|
+
|---------------------|------------|---------|-----------------------------------------------------------------------------|
|
|
56
|
+
| `delay`             | `number`   | `1000`  | Milliseconds to wait between batches when following a sitemap index. `limit` child sitemaps are fetched in parallel per batch; after each batch completes, the process waits `delay` ms before starting the next. Set to `0` to disable. CLI: `--delay` |
|
|
57
|
+
| `limit` | `number` | `10` | Number of child sitemaps to fetch concurrently per batch. CLI: `--limit` |
|
|
58
|
+
| `timeout` | `number` | `30000` | Milliseconds before a request is aborted. CLI: `--timeout` |
|
|
59
|
+
| `onError`           | `function` | —       | Called as `onError(url, error)` when a URL fails. The failing URL is skipped whether or not a callback is provided. **Library only.** |
|
|
60
|
+
| `onEntry` | `function` | — | Called as `onEntry(entry)` each time a URL entry is parsed. `entry` has the same shape as the objects returned by `fetch()`. **Library only.** |
|
|
61
|
+
| `filter` | `string` | — | Only output URLs whose `loc` contains the given string (substring match). Can be combined with `--count` or `--tsv`. **CLI only.** |
|
|
62
|
+
| `filter-regex` | `string` | — | Only output URLs whose `loc` matches the given regular expression (evaluated with `new RegExp(value)`). Invalid patterns exit with a non-zero code and an error on stderr. Can be combined with `--count` or `--tsv`. **CLI only.** |
|
|
63
|
+
| `tsv` | — | — | Output results as tab-separated values. Prints a header row (`loc`, `lastmod`, `changefreq`, `priority`) followed by one row per entry. Missing fields are output as empty strings. **CLI only.** |
|
|
64
|
+
| `count` | — | — | Print only the total number of URLs instead of listing them. **CLI only.** |
|
|
65
|
+
|
|
66
|
+
## Features
|
|
67
|
+
|
|
68
|
+
- Follows sitemap index files recursively, including nested indexes (an index within an index)
|
|
69
|
+
- Automatically decompresses gzip: supports both `.gz` URLs and `Content-Encoding: gzip` responses
|
|
70
|
+
- Batch processing: fetches `limit` child sitemaps in parallel per batch, then waits `delay` ms after each batch completes
|
|
71
|
+
- Automatically follows redirects (301/302/303/307/308) up to 5 hops; errors beyond that are reported via `onError`
|
|
72
|
+
|
|
11
73
|
## Usage
|
|
12
74
|
|
|
13
75
|
```js
|
|
@@ -18,11 +80,24 @@ const parser = new SitemapXMLParser('https://example.com/sitemap.xml');
|
|
|
18
80
|
(async () => {
|
|
19
81
|
const urls = await parser.fetch();
|
|
20
82
|
urls.forEach(entry => {
|
|
21
|
-
console.log(entry.loc
|
|
83
|
+
console.log(entry.loc);
|
|
22
84
|
});
|
|
23
85
|
})();
|
|
24
86
|
```
|
|
25
87
|
|
|
88
|
+
Or with ES modules:
|
|
89
|
+
|
|
90
|
+
```js
|
|
91
|
+
import SitemapXMLParser from 'sitemap-xml-parser';
|
|
92
|
+
|
|
93
|
+
const parser = new SitemapXMLParser('https://example.com/sitemap.xml');
|
|
94
|
+
|
|
95
|
+
const urls = await parser.fetch();
|
|
96
|
+
urls.forEach(entry => {
|
|
97
|
+
console.log(entry.loc);
|
|
98
|
+
});
|
|
99
|
+
```
|
|
100
|
+
|
|
26
101
|
### Error handling with `onError`
|
|
27
102
|
|
|
28
103
|
Failed URLs (network errors, non-2xx responses, malformed XML) are skipped by default. Provide an `onError` callback to inspect them:
|
|
@@ -35,17 +110,6 @@ const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
|
|
|
35
110
|
});
|
|
36
111
|
```
|
|
37
112
|
|
|
38
|
-
## Options
|
|
39
|
-
|
|
40
|
-
| Option | Type | Default | Description |
|
|
41
|
-
|-------------|------------|---------|-----------------------------------------------------------------------------|
|
|
42
|
-
| `delay` | `number` | `3000` | Milliseconds to wait between batches when following a sitemap index. Default is 3000 to avoid overloading the target server; set to `0` to disable. CLI: `--delay` |
|
|
43
|
-
| `limit` | `number` | `5` | Number of child sitemaps to fetch concurrently per batch. CLI: `--limit` |
|
|
44
|
-
| `timeout` | `number` | `30000` | Milliseconds before a request is aborted. CLI: `--timeout` |
|
|
45
|
-
| `onError` | `function` | — | Called as `onError(url, error)` when a URL fails. The URL is skipped regardless. **Library only.** |
|
|
46
|
-
| `onEntry` | `function` | — | Called as `onEntry(entry)` each time a URL entry is parsed. `entry` has the same shape as the objects returned by `fetch()`. **Library only.** |
|
|
47
|
-
| `tsv` | — | — | Output results as tab-separated values. Prints a header row (`loc`, `lastmod`, `changefreq`, `priority`) followed by one row per entry. Missing fields are output as empty strings. **CLI only.** |
|
|
48
|
-
|
|
49
113
|
## Return value
|
|
50
114
|
|
|
51
115
|
`fetch()` resolves to an array of URL entry objects. Each object reflects the fields present in the sitemap:
|
|
@@ -53,57 +117,16 @@ const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
|
|
|
53
117
|
```js
|
|
54
118
|
[
|
|
55
119
|
{
|
|
56
|
-
loc:
|
|
57
|
-
lastmod:
|
|
58
|
-
changefreq:
|
|
59
|
-
priority:
|
|
120
|
+
loc: 'https://example.com/page1',
|
|
121
|
+
lastmod: '2024-01-01',
|
|
122
|
+
changefreq: 'weekly',
|
|
123
|
+
priority: '0.8',
|
|
60
124
|
},
|
|
61
125
|
// ...
|
|
62
126
|
]
|
|
63
127
|
```
|
|
64
128
|
|
|
65
|
-
|
|
129
|
+
`loc` is always a string. Use `entry.loc` to get the URL. Optional fields (`lastmod`, `changefreq`, `priority`) are strings when present, or `undefined` when absent from the source XML.
|
|
66
130
|
|
|
67
131
|
Fields other than `loc` (`lastmod`, `changefreq`, `priority`, etc.) are included only when present in the source XML.
|
|
68
132
|
|
|
69
|
-
## CLI
|
|
70
|
-
|
|
71
|
-
Run without installing via `npx`:
|
|
72
|
-
|
|
73
|
-
```sh
|
|
74
|
-
npx sitemap-xml-parser <url> [options]
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
Or, after installing globally (`npm install -g sitemap-xml-parser`):
|
|
78
|
-
|
|
79
|
-
```sh
|
|
80
|
-
sitemap-xml-parser <url> [options]
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
Fetched URLs are printed to stdout, one per line. Errors are printed to stderr. See [Options](#options) for available flags.
|
|
84
|
-
|
|
85
|
-
### Examples
|
|
86
|
-
|
|
87
|
-
```sh
|
|
88
|
-
# Print all URLs
|
|
89
|
-
npx sitemap-xml-parser https://example.com/sitemap.xml
|
|
90
|
-
|
|
91
|
-
# No delay, higher concurrency
|
|
92
|
-
npx sitemap-xml-parser https://example.com/sitemap.xml --delay 0 --limit 10
|
|
93
|
-
|
|
94
|
-
# Save URLs to a file, errors to a log
|
|
95
|
-
npx sitemap-xml-parser https://example.com/sitemap.xml > urls.txt 2> errors.log
|
|
96
|
-
|
|
97
|
-
# Custom timeout
|
|
98
|
-
npx sitemap-xml-parser https://example.com/sitemap.xml --timeout 10000
|
|
99
|
-
|
|
100
|
-
# Output as TSV (includes lastmod, changefreq, priority)
|
|
101
|
-
npx sitemap-xml-parser https://example.com/sitemap.xml --tsv
|
|
102
|
-
|
|
103
|
-
# Save TSV to a file
|
|
104
|
-
npx sitemap-xml-parser https://example.com/sitemap.xml --tsv > urls.tsv
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
## Limitations
|
|
108
|
-
|
|
109
|
-
- **HTTP redirects are followed up to 5 times.** Status codes 301, 302, 303, 307, and 308 are handled automatically by following the `Location` header (relative URLs are resolved against the current URL). If the redirect chain exceeds 5 hops, an error is raised via `onError`.
|
package/bin/cli.js
CHANGED
|
@@ -8,20 +8,26 @@ function printUsage() {
|
|
|
8
8
|
'Usage: sitemap-xml-parser <url> [options]',
|
|
9
9
|
'',
|
|
10
10
|
'Options:',
|
|
11
|
-
' --delay <ms>
|
|
12
|
-
' --limit <n>
|
|
13
|
-
' --timeout <ms>
|
|
14
|
-
' --
|
|
15
|
-
' --
|
|
11
|
+
' --delay <ms> Delay between batches in milliseconds (default: 1000)',
|
|
12
|
+
' --limit <n> Concurrent fetches per batch (default: 10)',
|
|
13
|
+
' --timeout <ms> Request timeout in milliseconds (default: 30000)',
|
|
14
|
+
' --filter <str> Only output URLs that contain <str>',
|
|
15
|
+
' --filter-regex <regex> Only output URLs matching the given regular expression',
|
|
16
|
+
' --tsv Output as tab-separated values with a header row',
|
|
17
|
+
' --count Print only the total number of URLs',
|
|
18
|
+
' --help Show this help message',
|
|
16
19
|
'',
|
|
17
20
|
].join('\n'));
|
|
18
21
|
}
|
|
19
22
|
|
|
20
23
|
function parseArgs(argv) {
|
|
21
24
|
const args = argv.slice(2);
|
|
22
|
-
const opts = { delay:
|
|
25
|
+
const opts = { delay: 1000, limit: 10, timeout: 30000 };
|
|
23
26
|
let url = null;
|
|
24
27
|
let tsv = false;
|
|
28
|
+
let count = false;
|
|
29
|
+
let filter = null;
|
|
30
|
+
let filterRegex = null;
|
|
25
31
|
|
|
26
32
|
for (let i = 0; i < args.length; i++) {
|
|
27
33
|
const arg = args[i];
|
|
@@ -30,6 +36,25 @@ function parseArgs(argv) {
|
|
|
30
36
|
process.exit(0);
|
|
31
37
|
} else if (arg === '--tsv') {
|
|
32
38
|
tsv = true;
|
|
39
|
+
} else if (arg === '--count') {
|
|
40
|
+
count = true;
|
|
41
|
+
} else if (arg === '--filter') {
|
|
42
|
+
if (++i >= args.length) {
|
|
43
|
+
process.stderr.write(`Error: --filter requires a value\n`);
|
|
44
|
+
process.exit(1);
|
|
45
|
+
}
|
|
46
|
+
filter = args[i];
|
|
47
|
+
} else if (arg === '--filter-regex') {
|
|
48
|
+
if (++i >= args.length) {
|
|
49
|
+
process.stderr.write(`Error: --filter-regex requires a value\n`);
|
|
50
|
+
process.exit(1);
|
|
51
|
+
}
|
|
52
|
+
try {
|
|
53
|
+
filterRegex = new RegExp(args[i]);
|
|
54
|
+
} catch (e) {
|
|
55
|
+
process.stderr.write(`Error: --filter-regex invalid regular expression: ${e.message}\n`);
|
|
56
|
+
process.exit(1);
|
|
57
|
+
}
|
|
33
58
|
} else if (arg === '--delay') {
|
|
34
59
|
if (++i >= args.length) {
|
|
35
60
|
process.stderr.write(`Error: --delay requires a value\n`);
|
|
@@ -80,33 +105,48 @@ function parseArgs(argv) {
|
|
|
80
105
|
process.exit(1);
|
|
81
106
|
}
|
|
82
107
|
|
|
83
|
-
return { url, opts, tsv };
|
|
108
|
+
return { url, opts, tsv, count, filter, filterRegex };
|
|
84
109
|
}
|
|
85
110
|
|
|
86
111
|
(async () => {
|
|
87
|
-
const { url, opts, tsv } = parseArgs(process.argv);
|
|
112
|
+
const { url, opts, tsv, count, filter, filterRegex } = parseArgs(process.argv);
|
|
88
113
|
|
|
89
114
|
const red = process.stderr.isTTY ? '\x1b[31m' : '';
|
|
90
115
|
const reset = process.stderr.isTTY ? '\x1b[0m' : '';
|
|
91
116
|
|
|
92
|
-
if (tsv) {
|
|
117
|
+
if (tsv && !count) {
|
|
93
118
|
process.stdout.write('loc\tlastmod\tchangefreq\tpriority\n');
|
|
94
119
|
}
|
|
95
120
|
|
|
96
121
|
let hasError = false;
|
|
122
|
+
let filteredCount = 0;
|
|
123
|
+
|
|
124
|
+
const hasFilter = filter !== null || filterRegex !== null;
|
|
125
|
+
|
|
126
|
+
// onEntry is only skipped when count mode has no filter (result.length is sufficient).
|
|
127
|
+
const needsOnEntry = !count || hasFilter;
|
|
128
|
+
|
|
97
129
|
const parser = new SitemapXMLParser(url, {
|
|
98
130
|
...opts,
|
|
99
|
-
onEntry: (entry) => {
|
|
131
|
+
onEntry: needsOnEntry ? (entry) => {
|
|
132
|
+
const loc = entry.loc ?? '';
|
|
133
|
+
if (filter !== null && !loc.includes(filter)) return;
|
|
134
|
+
if (filterRegex !== null && !filterRegex.test(loc)) return;
|
|
135
|
+
|
|
136
|
+
if (count) {
|
|
137
|
+
filteredCount++;
|
|
138
|
+
return;
|
|
139
|
+
}
|
|
140
|
+
|
|
100
141
|
if (tsv) {
|
|
101
|
-
const
|
|
102
|
-
const
|
|
103
|
-
const
|
|
104
|
-
const priority = entry.priority?.[0] ?? '';
|
|
142
|
+
const lastmod = entry.lastmod ?? '';
|
|
143
|
+
const changefreq = entry.changefreq ?? '';
|
|
144
|
+
const priority = entry.priority ?? '';
|
|
105
145
|
process.stdout.write(`${loc}\t${lastmod}\t${changefreq}\t${priority}\n`);
|
|
106
146
|
} else {
|
|
107
|
-
process.stdout.write(
|
|
147
|
+
process.stdout.write(loc + '\n');
|
|
108
148
|
}
|
|
109
|
-
},
|
|
149
|
+
} : null,
|
|
110
150
|
onError: (failedUrl, err) => {
|
|
111
151
|
hasError = true;
|
|
112
152
|
const msg = err.message.replace(/\r?\n/g, ' ').trim();
|
|
@@ -114,6 +154,7 @@ function parseArgs(argv) {
|
|
|
114
154
|
},
|
|
115
155
|
});
|
|
116
156
|
|
|
117
|
-
await parser.fetch();
|
|
157
|
+
const result = await parser.fetch();
|
|
158
|
+
if (count) process.stdout.write((hasFilter ? filteredCount : result.length) + '\n');
|
|
118
159
|
if (hasError) process.exit(1);
|
|
119
160
|
})();
|
package/index.d.ts
CHANGED
package/lib/sitemap.js
CHANGED
|
@@ -10,13 +10,13 @@ const { URL } = require('url');
|
|
|
10
10
|
class SitemapXMLParser {
|
|
11
11
|
constructor(url, options = {}) {
|
|
12
12
|
this.siteMapUrl = url;
|
|
13
|
-
this.delayTime = options.delay ??
|
|
14
|
-
this.limit = options.limit ??
|
|
13
|
+
this.delayTime = options.delay ?? 1000;
|
|
14
|
+
this.limit = options.limit ?? 10;
|
|
15
15
|
this.timeout = options.timeout ?? 30000;
|
|
16
16
|
this.onError = options.onError || null;
|
|
17
17
|
this.onEntry = options.onEntry || null;
|
|
18
18
|
this.urlArray = [];
|
|
19
|
-
this.parser = new xml2js.Parser();
|
|
19
|
+
this.parser = new xml2js.Parser({ explicitArray: false });
|
|
20
20
|
}
|
|
21
21
|
|
|
22
22
|
async fetch() {
|
|
@@ -35,7 +35,8 @@ class SitemapXMLParser {
|
|
|
35
35
|
*/
|
|
36
36
|
async getURLFromXML(xml) {
|
|
37
37
|
if (xml.sitemapindex && xml.sitemapindex.sitemap) {
|
|
38
|
-
const
|
|
38
|
+
const sitemapList = [].concat(xml.sitemapindex.sitemap);
|
|
39
|
+
const urls = sitemapList.map(s => s.loc).filter(Boolean);
|
|
39
40
|
|
|
40
41
|
for (let i = 0; i < urls.length; i += this.limit) {
|
|
41
42
|
const chunk = urls.slice(i, i + this.limit);
|
|
@@ -55,8 +56,9 @@ class SitemapXMLParser {
|
|
|
55
56
|
}
|
|
56
57
|
|
|
57
58
|
if (xml.urlset && xml.urlset.url) {
|
|
58
|
-
|
|
59
|
-
|
|
59
|
+
const urlList = [].concat(xml.urlset.url);
|
|
60
|
+
for (const entry of urlList) {
|
|
61
|
+
if (entry && entry.loc) {
|
|
60
62
|
this.urlArray.push(entry);
|
|
61
63
|
if (this.onEntry) this.onEntry(entry);
|
|
62
64
|
}
|
|
@@ -120,10 +122,11 @@ class SitemapXMLParser {
|
|
|
120
122
|
return;
|
|
121
123
|
}
|
|
122
124
|
const chunks = [];
|
|
125
|
+
const contentEncoding = res.headers['content-encoding'];
|
|
123
126
|
res.on('data', chunk => chunks.push(chunk));
|
|
124
127
|
res.on('end', () => {
|
|
125
128
|
const buf = Buffer.concat(chunks);
|
|
126
|
-
if (ext === '.gz') {
|
|
129
|
+
if (ext === '.gz' || contentEncoding === 'gzip') {
|
|
127
130
|
zlib.gunzip(buf, (err, result) => {
|
|
128
131
|
if (err) {
|
|
129
132
|
failOnce(originalUrl, err);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sitemap-xml-parser",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"types": "index.d.ts",
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
"scripts": {
|
|
20
20
|
"test": "node test/test.js"
|
|
21
21
|
},
|
|
22
|
-
"keywords": ["sitemap", "xml", "parse", "gzip", "sitemap-index", "cli"],
|
|
22
|
+
"keywords": ["sitemap", "xml", "parse", "gzip", "sitemap-index", "cli", "tsv"],
|
|
23
23
|
"author": "shinkawax",
|
|
24
24
|
"license": "MIT",
|
|
25
25
|
"repository": {
|