sitemap-xml-parser 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -36,6 +36,9 @@ npx sitemap-xml-parser https://example.com/sitemap.xml --count
36
36
  # Filter by substring
37
37
  npx sitemap-xml-parser https://example.com/sitemap.xml --filter "blog"
38
38
 
39
+ # Filter by regular expression
40
+ npx sitemap-xml-parser https://example.com/sitemap.xml --filter-regex "blog/[0-9]{4}/"
41
+
39
42
  # Filter and count
40
43
  npx sitemap-xml-parser https://example.com/sitemap.xml --filter "blog" --count
41
44
 
@@ -48,16 +51,24 @@ npx sitemap-xml-parser https://example.com/sitemap.xml > urls.txt 2> errors.log
48
51
 
49
52
  ## Options
50
53
 
51
- | Option | Type | Default | Description |
52
- |-------------|------------|---------|-----------------------------------------------------------------------------|
53
- | `delay` | `number` | `1000` | Milliseconds to wait between batches when following a sitemap index. Default is 1000 to avoid overloading the target server; set to `0` to disable. CLI: `--delay` |
54
- | `limit` | `number` | `10` | Number of child sitemaps to fetch concurrently per batch. CLI: `--limit` |
55
- | `timeout` | `number` | `30000` | Milliseconds before a request is aborted. CLI: `--timeout` |
56
- | `onError` | `function` | — | Called as `onError(url, error)` when a URL fails. The URL is skipped regardless. **Library only.** |
57
- | `onEntry` | `function` | — | Called as `onEntry(entry)` each time a URL entry is parsed. `entry` has the same shape as the objects returned by `fetch()`. **Library only.** |
58
- | `filter` | `string` | — | Only output URLs whose `loc` contains the given string (substring match). Can be combined with `--count` or `--tsv`. **CLI only.** |
59
- | `tsv` | | — | Output results as tab-separated values. Prints a header row (`loc`, `lastmod`, `changefreq`, `priority`) followed by one row per entry. Missing fields are output as empty strings. **CLI only.** |
60
- | `count` | — | — | Print only the total number of URLs instead of listing them. **CLI only.** |
54
+ | Option | Type | Default | Description |
55
+ |---------------------|------------|---------|-----------------------------------------------------------------------------|
56
+ | `delay` | `number` | `1000` | Milliseconds to wait between batches when following a sitemap index. `limit` URLs are fetched in parallel per batch; after each batch completes, the process waits `delay` ms before starting the next. Set to `0` to disable. CLI: `--delay` |
57
+ | `limit` | `number` | `10` | Number of child sitemaps to fetch concurrently per batch. CLI: `--limit` |
58
+ | `timeout` | `number` | `30000` | Milliseconds before a request is aborted. CLI: `--timeout` |
59
+ | `onError` | `function` | — | Called as `onError(url, error)` when a URL fails. The URL is skipped regardless. **Library only.** |
60
+ | `onEntry` | `function` | — | Called as `onEntry(entry)` each time a URL entry is parsed. `entry` has the same shape as the objects returned by `fetch()`. **Library only.** |
61
+ | `filter` | `string` | — | Only output URLs whose `loc` contains the given string (substring match). Can be combined with `--count` or `--tsv`. **CLI only.** |
62
+ | `filter-regex` | `string` | — | Only output URLs whose `loc` matches the given regular expression (compiled with `new RegExp(value)` and tested against `loc`, so the pattern may match anywhere in the URL unless anchored with `^`/`$`). Invalid patterns exit with a non-zero code and an error on stderr. Can be combined with `--count` or `--tsv`. **CLI only.** |
63
+ | `tsv` | — | — | Output results as tab-separated values. Prints a header row (`loc`, `lastmod`, `changefreq`, `priority`) followed by one row per entry. Missing fields are output as empty strings. **CLI only.** |
64
+ | `count` | — | — | Print only the total number of URLs instead of listing them. **CLI only.** |
65
+
66
+ ## Features
67
+
68
+ - Follows Sitemap Index files recursively, including nested indexes (Index within an Index)
69
+ - Automatically decompresses gzip: supports both `.gz` URLs and `Content-Encoding: gzip` responses
70
+ - Batch processing: fetches `limit` child sitemaps in parallel per batch, then waits `delay` ms after each batch completes
71
+ - Automatically follows redirects (301/302/303/307/308) up to 5 hops; a redirect chain longer than that is treated as an error and reported via `onError`
61
72
 
62
73
  ## Usage
63
74
 
@@ -69,11 +80,24 @@ const parser = new SitemapXMLParser('https://example.com/sitemap.xml');
69
80
  (async () => {
70
81
  const urls = await parser.fetch();
71
82
  urls.forEach(entry => {
72
- console.log(entry.loc[0]);
83
+ console.log(entry.loc);
73
84
  });
74
85
  })();
75
86
  ```
76
87
 
88
+ Or with ES modules:
89
+
90
+ ```js
91
+ import SitemapXMLParser from 'sitemap-xml-parser';
92
+
93
+ const parser = new SitemapXMLParser('https://example.com/sitemap.xml');
94
+
95
+ const urls = await parser.fetch();
96
+ urls.forEach(entry => {
97
+ console.log(entry.loc);
98
+ });
99
+ ```
100
+
77
101
  ### Error handling with `onError`
78
102
 
79
103
  Failed URLs (network errors, non-2xx responses, malformed XML) are skipped by default. Provide an `onError` callback to inspect them:
@@ -93,19 +117,16 @@ const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
93
117
  ```js
94
118
  [
95
119
  {
96
- loc: ['https://example.com/page1'],
97
- lastmod: ['2024-01-01'],
98
- changefreq: ['weekly'],
99
- priority: ['0.8'],
120
+ loc: 'https://example.com/page1',
121
+ lastmod: '2024-01-01',
122
+ changefreq: 'weekly',
123
+ priority: '0.8',
100
124
  },
101
125
  // ...
102
126
  ]
103
127
  ```
104
128
 
105
- All field values are arrays (xml2js convention). Use `entry.loc[0]` to get the URL string, `entry.lastmod?.[0]` for optional fields, and so on.
129
+ `loc` is always a string. Use `entry.loc` to get the URL. Optional fields (`lastmod`, `changefreq`, `priority`) are strings when present, or `undefined` when absent from the source XML.
106
130
 
107
131
  Fields other than `loc` (`lastmod`, `changefreq`, `priority`, etc.) are included only when present in the source XML.
108
132
 
109
- ## Limitations
110
-
111
- - **HTTP redirects are followed up to 5 times.** Status codes 301, 302, 303, 307, and 308 are handled automatically by following the `Location` header (relative URLs are resolved against the current URL). If the redirect chain exceeds 5 hops, an error is raised via `onError`.
package/bin/cli.js CHANGED
@@ -8,13 +8,14 @@ function printUsage() {
8
8
  'Usage: sitemap-xml-parser <url> [options]',
9
9
  '',
10
10
  'Options:',
11
- ' --delay <ms> Delay between batches in milliseconds (default: 1000)',
12
- ' --limit <n> Concurrent fetches per batch (default: 10)',
13
- ' --timeout <ms> Request timeout in milliseconds (default: 30000)',
14
- ' --filter <str> Only output URLs that contain <str>',
15
- ' --tsv Output as tab-separated values with a header row',
16
- ' --count Print only the total number of URLs',
17
- ' --help Show this help message',
11
+ ' --delay <ms> Delay between batches in milliseconds (default: 1000)',
12
+ ' --limit <n> Concurrent fetches per batch (default: 10)',
13
+ ' --timeout <ms> Request timeout in milliseconds (default: 30000)',
14
+ ' --filter <str> Only output URLs that contain <str>',
15
+ ' --filter-regex <regex> Only output URLs matching the given regular expression',
16
+ ' --tsv Output as tab-separated values with a header row',
17
+ ' --count Print only the total number of URLs',
18
+ ' --help Show this help message',
18
19
  '',
19
20
  ].join('\n'));
20
21
  }
@@ -26,6 +27,7 @@ function parseArgs(argv) {
26
27
  let tsv = false;
27
28
  let count = false;
28
29
  let filter = null;
30
+ let filterRegex = null;
29
31
 
30
32
  for (let i = 0; i < args.length; i++) {
31
33
  const arg = args[i];
@@ -42,6 +44,17 @@ function parseArgs(argv) {
42
44
  process.exit(1);
43
45
  }
44
46
  filter = args[i];
47
+ } else if (arg === '--filter-regex') {
48
+ if (++i >= args.length) {
49
+ process.stderr.write(`Error: --filter-regex requires a value\n`);
50
+ process.exit(1);
51
+ }
52
+ try {
53
+ filterRegex = new RegExp(args[i]);
54
+ } catch (e) {
55
+ process.stderr.write(`Error: --filter-regex invalid regular expression: ${e.message}\n`);
56
+ process.exit(1);
57
+ }
45
58
  } else if (arg === '--delay') {
46
59
  if (++i >= args.length) {
47
60
  process.stderr.write(`Error: --delay requires a value\n`);
@@ -92,11 +105,11 @@ function parseArgs(argv) {
92
105
  process.exit(1);
93
106
  }
94
107
 
95
- return { url, opts, tsv, count, filter };
108
+ return { url, opts, tsv, count, filter, filterRegex };
96
109
  }
97
110
 
98
111
  (async () => {
99
- const { url, opts, tsv, count, filter } = parseArgs(process.argv);
112
+ const { url, opts, tsv, count, filter, filterRegex } = parseArgs(process.argv);
100
113
 
101
114
  const red = process.stderr.isTTY ? '\x1b[31m' : '';
102
115
  const reset = process.stderr.isTTY ? '\x1b[0m' : '';
@@ -108,14 +121,17 @@ function parseArgs(argv) {
108
121
  let hasError = false;
109
122
  let filteredCount = 0;
110
123
 
124
+ const hasFilter = filter !== null || filterRegex !== null;
125
+
111
126
  // onEntry is only skipped when count mode has no filter (result.length is sufficient).
112
- const needsOnEntry = !count || filter !== null;
127
+ const needsOnEntry = !count || hasFilter;
113
128
 
114
129
  const parser = new SitemapXMLParser(url, {
115
130
  ...opts,
116
131
  onEntry: needsOnEntry ? (entry) => {
117
- const loc = entry.loc?.[0] ?? '';
132
+ const loc = entry.loc ?? '';
118
133
  if (filter !== null && !loc.includes(filter)) return;
134
+ if (filterRegex !== null && !filterRegex.test(loc)) return;
119
135
 
120
136
  if (count) {
121
137
  filteredCount++;
@@ -123,9 +139,9 @@ function parseArgs(argv) {
123
139
  }
124
140
 
125
141
  if (tsv) {
126
- const lastmod = entry.lastmod?.[0] ?? '';
127
- const changefreq = entry.changefreq?.[0] ?? '';
128
- const priority = entry.priority?.[0] ?? '';
142
+ const lastmod = entry.lastmod ?? '';
143
+ const changefreq = entry.changefreq ?? '';
144
+ const priority = entry.priority ?? '';
129
145
  process.stdout.write(`${loc}\t${lastmod}\t${changefreq}\t${priority}\n`);
130
146
  } else {
131
147
  process.stdout.write(loc + '\n');
@@ -139,6 +155,6 @@ function parseArgs(argv) {
139
155
  });
140
156
 
141
157
  const result = await parser.fetch();
142
- if (count) process.stdout.write((filter !== null ? filteredCount : result.length) + '\n');
158
+ if (count) process.stdout.write((hasFilter ? filteredCount : result.length) + '\n');
143
159
  if (hasError) process.exit(1);
144
160
  })();
package/index.d.ts CHANGED
@@ -1,8 +1,8 @@
1
1
  export interface SitemapEntry {
2
- loc: string[];
3
- lastmod?: string[];
4
- changefreq?: string[];
5
- priority?: string[];
2
+ loc: string;
3
+ lastmod?: string;
4
+ changefreq?: string;
5
+ priority?: string;
6
6
  }
7
7
 
8
8
  export interface SitemapOptions {
package/lib/sitemap.js CHANGED
@@ -16,7 +16,7 @@ class SitemapXMLParser {
16
16
  this.onError = options.onError || null;
17
17
  this.onEntry = options.onEntry || null;
18
18
  this.urlArray = [];
19
- this.parser = new xml2js.Parser();
19
+ this.parser = new xml2js.Parser({ explicitArray: false });
20
20
  }
21
21
 
22
22
  async fetch() {
@@ -35,7 +35,8 @@ class SitemapXMLParser {
35
35
  */
36
36
  async getURLFromXML(xml) {
37
37
  if (xml.sitemapindex && xml.sitemapindex.sitemap) {
38
- const urls = xml.sitemapindex.sitemap.map(s => s.loc?.[0]).filter(Boolean);
38
+ const sitemapList = [].concat(xml.sitemapindex.sitemap);
39
+ const urls = sitemapList.map(s => s.loc).filter(Boolean);
39
40
 
40
41
  for (let i = 0; i < urls.length; i += this.limit) {
41
42
  const chunk = urls.slice(i, i + this.limit);
@@ -55,8 +56,9 @@ class SitemapXMLParser {
55
56
  }
56
57
 
57
58
  if (xml.urlset && xml.urlset.url) {
58
- for (const entry of xml.urlset.url) {
59
- if (entry && entry.loc?.[0]) {
59
+ const urlList = [].concat(xml.urlset.url);
60
+ for (const entry of urlList) {
61
+ if (entry && entry.loc) {
60
62
  this.urlArray.push(entry);
61
63
  if (this.onEntry) this.onEntry(entry);
62
64
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sitemap-xml-parser",
3
- "version": "1.2.0",
3
+ "version": "1.3.0",
4
4
  "description": "Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.",
5
5
  "main": "index.js",
6
6
  "types": "index.d.ts",