sitemap-xml-parser 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,10 +13,7 @@ npm install sitemap-xml-parser
13
13
  ```js
14
14
  const SitemapXMLParser = require('sitemap-xml-parser');
15
15
 
16
- const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
17
- delay: 3000,
18
- limit: 5,
19
- });
16
+ const parser = new SitemapXMLParser('https://example.com/sitemap.xml');
20
17
 
21
18
  (async () => {
22
19
  const urls = await parser.fetch();
@@ -42,12 +39,12 @@ const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
42
39
 
43
40
  | Option | Type | Default | Description |
44
41
  |-------------|------------|---------|-----------------------------------------------------------------------------|
45
- | `delay` | `number` | `3000` | Milliseconds to wait between batches when following a sitemap index. CLI: `--delay` |
42
+ | `delay` | `number` | `3000` | Milliseconds to wait between batches when following a sitemap index. Default is 3000 to avoid overloading the target server; set to `0` to disable. CLI: `--delay` |
46
43
  | `limit` | `number` | `5` | Number of child sitemaps to fetch concurrently per batch. CLI: `--limit` |
47
44
  | `timeout` | `number` | `30000` | Milliseconds before a request is aborted. CLI: `--timeout` |
48
- | `onError` | `function` | — | Called as `onError(url, error)` when a URL fails. The URL is skipped regardless. **API only.** |
49
- | `--help` | | — | Prints usage information and exits. **CLI only.** |
50
- | `--timeout` | — | — | Same as the `timeout` option above, in milliseconds. **CLI only.** |
45
+ | `onError` | `function` | — | Called as `onError(url, error)` when a URL fails. The URL is skipped regardless. **Library only.** |
46
+ | `onEntry` | `function` | — | Called as `onEntry(entry)` each time a URL entry is parsed. `entry` has the same shape as the objects returned by `fetch()`. **Library only.** |
47
+ | `--tsv` | — | — | Output results as tab-separated values. Prints a header row (`loc`, `lastmod`, `changefreq`, `priority`) followed by one row per entry. Missing fields are output as empty strings. **CLI only.** |
51
48
 
52
49
  ## Return value
53
50
 
@@ -65,6 +62,8 @@ const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
65
62
  ]
66
63
  ```
67
64
 
65
+ All field values are arrays (xml2js convention). Use `entry.loc[0]` to get the URL string, `entry.lastmod?.[0]` for optional fields, and so on.
66
+
68
67
  Fields other than `loc` (`lastmod`, `changefreq`, `priority`, etc.) are included only when present in the source XML.
69
68
 
70
69
  ## CLI
@@ -97,8 +96,14 @@ npx sitemap-xml-parser https://example.com/sitemap.xml > urls.txt 2> errors.log
97
96
 
98
97
  # Custom timeout
99
98
  npx sitemap-xml-parser https://example.com/sitemap.xml --timeout 10000
99
+
100
+ # Output as TSV (includes lastmod, changefreq, priority)
101
+ npx sitemap-xml-parser https://example.com/sitemap.xml --tsv
102
+
103
+ # Save TSV to a file
104
+ npx sitemap-xml-parser https://example.com/sitemap.xml --tsv > urls.tsv
100
105
  ```
101
106
 
102
107
  ## Limitations
103
108
 
104
- - **HTTP redirects are not followed.** Responses with status codes 301, 302, or other 3xx are treated as errors. If your sitemap URL redirects, use the final destination URL directly.
109
+ - **HTTP redirects are followed up to 5 times.** Status codes 301, 302, 303, 307, and 308 are handled automatically by following the `Location` header (relative URLs are resolved against the current URL). If the redirect chain exceeds 5 hops, the URL is skipped and the error is reported via `onError`.
package/bin/cli.js CHANGED
@@ -11,6 +11,7 @@ function printUsage() {
11
11
  ' --delay <ms> Delay between batches in milliseconds (default: 3000)',
12
12
  ' --limit <n> Concurrent fetches per batch (default: 5)',
13
13
  ' --timeout <ms> Request timeout in milliseconds (default: 30000)',
14
+ ' --tsv Output as tab-separated values with a header row',
14
15
  ' --help Show this help message',
15
16
  '',
16
17
  ].join('\n'));
@@ -20,12 +21,15 @@ function parseArgs(argv) {
20
21
  const args = argv.slice(2);
21
22
  const opts = { delay: 3000, limit: 5, timeout: 30000 };
22
23
  let url = null;
24
+ let tsv = false;
23
25
 
24
26
  for (let i = 0; i < args.length; i++) {
25
27
  const arg = args[i];
26
28
  if (arg === '--help' || arg === '-h') {
27
29
  printUsage();
28
30
  process.exit(0);
31
+ } else if (arg === '--tsv') {
32
+ tsv = true;
29
33
  } else if (arg === '--delay') {
30
34
  if (++i >= args.length) {
31
35
  process.stderr.write(`Error: --delay requires a value\n`);
@@ -76,24 +80,40 @@ function parseArgs(argv) {
76
80
  process.exit(1);
77
81
  }
78
82
 
79
- return { url, opts };
83
+ return { url, opts, tsv };
80
84
  }
81
85
 
82
86
  (async () => {
83
- const { url, opts } = parseArgs(process.argv);
87
+ const { url, opts, tsv } = parseArgs(process.argv);
88
+
89
+ const red = process.stderr.isTTY ? '\x1b[31m' : '';
90
+ const reset = process.stderr.isTTY ? '\x1b[0m' : '';
91
+
92
+ if (tsv) {
93
+ process.stdout.write('loc\tlastmod\tchangefreq\tpriority\n');
94
+ }
84
95
 
85
96
  let hasError = false;
86
97
  const parser = new SitemapXMLParser(url, {
87
98
  ...opts,
99
+ onEntry: (entry) => {
100
+ if (tsv) {
101
+ const loc = entry.loc?.[0] ?? '';
102
+ const lastmod = entry.lastmod?.[0] ?? '';
103
+ const changefreq = entry.changefreq?.[0] ?? '';
104
+ const priority = entry.priority?.[0] ?? '';
105
+ process.stdout.write(`${loc}\t${lastmod}\t${changefreq}\t${priority}\n`);
106
+ } else {
107
+ process.stdout.write(entry.loc[0] + '\n');
108
+ }
109
+ },
88
110
  onError: (failedUrl, err) => {
89
111
  hasError = true;
90
- process.stderr.write(`Error: ${failedUrl} ${err.message}\n`);
112
+ const msg = err.message.replace(/\r?\n/g, ' ').trim();
113
+ process.stderr.write(`${red}Error: ${failedUrl} — ${msg}${reset}\n`);
91
114
  },
92
115
  });
93
116
 
94
- const entries = await parser.fetch();
95
- for (const entry of entries) {
96
- process.stdout.write(entry.loc[0] + '\n');
97
- }
117
+ await parser.fetch();
98
118
  if (hasError) process.exit(1);
99
119
  })();
package/index.d.ts ADDED
@@ -0,0 +1,19 @@
1
+ export interface SitemapEntry {
2
+ loc: string[];
3
+ lastmod?: string[];
4
+ changefreq?: string[];
5
+ priority?: string[];
6
+ }
7
+
8
+ export interface SitemapOptions {
9
+ delay?: number;
10
+ limit?: number;
11
+ timeout?: number;
12
+ onError?: (url: string, error: Error) => void;
13
+ onEntry?: (entry: SitemapEntry) => void;
14
+ }
15
+
16
+ export default class SitemapXMLParser {
17
+ constructor(url: string, options?: SitemapOptions);
18
+ fetch(): Promise<SitemapEntry[]>;
19
+ }
package/lib/sitemap.js CHANGED
@@ -14,6 +14,7 @@ class SitemapXMLParser {
14
14
  this.limit = options.limit ?? 5;
15
15
  this.timeout = options.timeout ?? 30000;
16
16
  this.onError = options.onError || null;
17
+ this.onEntry = options.onEntry || null;
17
18
  this.urlArray = [];
18
19
  this.parser = new xml2js.Parser();
19
20
  }
@@ -57,6 +58,7 @@ class SitemapXMLParser {
57
58
  for (const entry of xml.urlset.url) {
58
59
  if (entry && entry.loc?.[0]) {
59
60
  this.urlArray.push(entry);
61
+ if (this.onEntry) this.onEntry(entry);
60
62
  }
61
63
  }
62
64
  }
@@ -64,28 +66,57 @@ class SitemapXMLParser {
64
66
 
65
67
  /**
66
68
  * Fetch body from URL using http/https.
69
+ * Follows redirects (301/302/303/307/308) up to 5 times.
67
70
  * Decompresses gzip automatically when the URL ends with .gz.
68
71
  * Returns null and calls onError on failure.
69
72
  */
70
73
  getBodyFromURL(url) {
74
+ return this._fetchWithRedirect(url, url, 0);
75
+ }
76
+
77
+ _fetchWithRedirect(originalUrl, currentUrl, redirectCount) {
71
78
  return new Promise((resolve) => {
79
+ let settled = false;
80
+ const failOnce = (url, err) => {
81
+ if (settled) return;
82
+ settled = true;
83
+ this._handleError(url, err);
84
+ resolve(null);
85
+ };
86
+
72
87
  let parsedUrl;
73
88
  try {
74
- parsedUrl = new URL(url);
89
+ parsedUrl = new URL(currentUrl);
75
90
  } catch (err) {
76
- this._handleError(url, err);
77
- resolve(null);
91
+ failOnce(originalUrl, err);
78
92
  return;
79
93
  }
80
94
 
81
95
  const ext = path.extname(parsedUrl.pathname);
82
96
  const transport = parsedUrl.protocol === 'https:' ? https : http;
83
97
 
84
- const req = transport.get(url, (res) => {
98
+ const req = transport.get(currentUrl, (res) => {
99
+ const REDIRECT_CODES = [301, 302, 303, 307, 308];
100
+ if (REDIRECT_CODES.includes(res.statusCode)) {
101
+ res.resume();
102
+ const location = res.headers['location'];
103
+ if (!location) {
104
+ failOnce(originalUrl, new Error(`HTTP ${res.statusCode} with no Location header`));
105
+ return;
106
+ }
107
+ if (redirectCount >= 5) {
108
+ failOnce(originalUrl, new Error('Too many redirects (max 5)'));
109
+ return;
110
+ }
111
+ settled = true;
112
+ const nextUrl = new URL(location, currentUrl).href;
113
+ resolve(this._fetchWithRedirect(originalUrl, nextUrl, redirectCount + 1));
114
+ return;
115
+ }
116
+
85
117
  if (res.statusCode < 200 || res.statusCode >= 300) {
86
118
  res.resume();
87
- this._handleError(url, new Error(`HTTP ${res.statusCode}`));
88
- resolve(null);
119
+ failOnce(originalUrl, new Error(`HTTP ${res.statusCode}`));
89
120
  return;
90
121
  }
91
122
  const chunks = [];
@@ -95,19 +126,19 @@ class SitemapXMLParser {
95
126
  if (ext === '.gz') {
96
127
  zlib.gunzip(buf, (err, result) => {
97
128
  if (err) {
98
- this._handleError(url, err);
99
- resolve(null);
129
+ failOnce(originalUrl, err);
100
130
  } else {
131
+ settled = true;
101
132
  resolve(result.toString());
102
133
  }
103
134
  });
104
135
  } else {
136
+ settled = true;
105
137
  resolve(buf.toString());
106
138
  }
107
139
  });
108
140
  res.on('error', (err) => {
109
- this._handleError(url, err);
110
- resolve(null);
141
+ failOnce(originalUrl, err);
111
142
  });
112
143
  });
113
144
 
@@ -116,8 +147,7 @@ class SitemapXMLParser {
116
147
  });
117
148
 
118
149
  req.on('error', (err) => {
119
- this._handleError(url, err);
120
- resolve(null);
150
+ failOnce(originalUrl, err);
121
151
  });
122
152
  });
123
153
  }
package/package.json CHANGED
@@ -1,18 +1,20 @@
1
1
  {
2
2
  "name": "sitemap-xml-parser",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "description": "Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.",
5
5
  "main": "index.js",
6
+ "types": "index.d.ts",
6
7
  "bin": {
7
8
  "sitemap-xml-parser": "bin/cli.js"
8
9
  },
9
10
  "files": [
10
11
  "index.js",
12
+ "index.d.ts",
11
13
  "lib",
12
14
  "bin"
13
15
  ],
14
16
  "engines": {
15
- "node": ">=18"
17
+ "node": ">=20"
16
18
  },
17
19
  "scripts": {
18
20
  "test": "node test/test.js"