sitemap-xml-parser 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2019 shinkawax
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -1,25 +1,104 @@
1
1
  # sitemap-xml-parser
2
2
 
3
- ## installation
3
+ Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.
4
+
5
+ ## Installation
4
6
 
5
7
  ```
6
8
  npm install sitemap-xml-parser
7
9
  ```
8
10
 
9
- ## example
11
+ ## Usage
10
12
 
11
- ```
13
+ ```js
12
14
  const SitemapXMLParser = require('sitemap-xml-parser');
13
15
 
14
- const url = 'something sitemap url';
15
- const options = {
16
+ const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
16
17
  delay: 3000,
17
- limit: 5
18
- };
18
+ limit: 5,
19
+ });
20
+
21
+ (async () => {
22
+ const urls = await parser.fetch();
23
+ urls.forEach(entry => {
24
+ console.log(entry.loc[0]);
25
+ });
26
+ })();
27
+ ```
28
+
29
+ ### Error handling with `onError`
19
30
 
20
- const sitemapXMLParser = new SitemapXMLParser(url, options);
31
+ Failed URLs (network errors, non-2xx responses, malformed XML) are always skipped, and by default this happens silently. Provide an `onError` callback to inspect them:
21
32
 
22
- sitemapXMLParser.fetch().then(result => {
23
- console.log(result);
33
+ ```js
34
+ const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
35
+ onError: (url, err) => {
36
+ console.error(`Skipped ${url}: ${err.message}`);
37
+ },
24
38
  });
25
39
  ```
40
+
41
+ ## Options
42
+
43
+ | Option | Type | Default | Description |
44
+ |-------------|------------|---------|-----------------------------------------------------------------------------|
45
+ | `delay` | `number` | `3000` | Milliseconds to wait between batches when following a sitemap index. CLI: `--delay` |
46
+ | `limit` | `number` | `5` | Number of child sitemaps to fetch concurrently per batch. CLI: `--limit` |
47
+ | `timeout` | `number` | `30000` | Milliseconds before a request is aborted. CLI: `--timeout` |
48
+ | `onError` | `function` | — | Called as `onError(url, error)` when a URL fails. The URL is skipped regardless. **API only.** |
49
+ | `--help` | — | — | Prints usage information and exits. **CLI only.** |
50
+ | `--timeout` | — | — | Same as the `timeout` option above, in milliseconds. **CLI only.** |
51
+
52
+ ## Return value
53
+
54
+ `fetch()` resolves to an array of URL entry objects. Each object reflects the fields present in the sitemap:
55
+
56
+ ```js
57
+ [
58
+ {
59
+ loc: ['https://example.com/page1'],
60
+ lastmod: ['2024-01-01'],
61
+ changefreq: ['weekly'],
62
+ priority: ['0.8'],
63
+ },
64
+ // ...
65
+ ]
66
+ ```
67
+
68
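+ Fields other than `loc` (`lastmod`, `changefreq`, `priority`, etc.) are included only when present in the source XML. Every field value is an array, so read the URL as `entry.loc[0]`. Entries without a `loc` are omitted from the result.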
69
+
70
+ ## CLI
71
+
72
+ Run without installing via `npx`:
73
+
74
+ ```sh
75
+ npx sitemap-xml-parser <url> [options]
76
+ ```
77
+
78
+ Or, after installing globally (`npm install -g sitemap-xml-parser`):
79
+
80
+ ```sh
81
+ sitemap-xml-parser <url> [options]
82
+ ```
83
+
84
+ Fetched URLs are printed to stdout, one per line. Errors are printed to stderr. See [Options](#options) for available flags.
85
+
86
+ ### Examples
87
+
88
+ ```sh
89
+ # Print all URLs
90
+ npx sitemap-xml-parser https://example.com/sitemap.xml
91
+
92
+ # No delay, higher concurrency
93
+ npx sitemap-xml-parser https://example.com/sitemap.xml --delay 0 --limit 10
94
+
95
+ # Save URLs to a file, errors to a log
96
+ npx sitemap-xml-parser https://example.com/sitemap.xml > urls.txt 2> errors.log
97
+
98
+ # Custom timeout
99
+ npx sitemap-xml-parser https://example.com/sitemap.xml --timeout 10000
100
+ ```
101
+
102
+ ## Limitations
103
+
104
+ - **HTTP redirects are not followed.** Responses with status codes 301, 302, or other 3xx are treated as errors. If your sitemap URL redirects, use the final destination URL directly.
package/bin/cli.js ADDED
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ const SitemapXMLParser = require('../index.js');
5
+
6
+ function printUsage() {
7
+ process.stdout.write([
8
+ 'Usage: sitemap-xml-parser <url> [options]',
9
+ '',
10
+ 'Options:',
11
+ ' --delay <ms> Delay between batches in milliseconds (default: 3000)',
12
+ ' --limit <n> Concurrent fetches per batch (default: 5)',
13
+ ' --timeout <ms> Request timeout in milliseconds (default: 30000)',
14
+ ' --help Show this help message',
15
+ '',
16
+ ].join('\n'));
17
+ }
18
+
19
+ function parseArgs(argv) {
20
+ const args = argv.slice(2);
21
+ const opts = { delay: 3000, limit: 5, timeout: 30000 };
22
+ let url = null;
23
+
24
+ for (let i = 0; i < args.length; i++) {
25
+ const arg = args[i];
26
+ if (arg === '--help' || arg === '-h') {
27
+ printUsage();
28
+ process.exit(0);
29
+ } else if (arg === '--delay') {
30
+ if (++i >= args.length) {
31
+ process.stderr.write(`Error: --delay requires a value\n`);
32
+ process.exit(1);
33
+ }
34
+ const val = Number(args[i]);
35
+ if (!Number.isFinite(val) || val < 0) {
36
+ process.stderr.write(`Error: --delay must be a non-negative number\n`);
37
+ process.exit(1);
38
+ }
39
+ opts.delay = val;
40
+ } else if (arg === '--limit') {
41
+ if (++i >= args.length) {
42
+ process.stderr.write(`Error: --limit requires a value\n`);
43
+ process.exit(1);
44
+ }
45
+ const val = Number(args[i]);
46
+ if (!Number.isInteger(val) || val < 1) {
47
+ process.stderr.write(`Error: --limit must be a positive integer\n`);
48
+ process.exit(1);
49
+ }
50
+ opts.limit = val;
51
+ } else if (arg === '--timeout') {
52
+ if (++i >= args.length) {
53
+ process.stderr.write(`Error: --timeout requires a value\n`);
54
+ process.exit(1);
55
+ }
56
+ const val = Number(args[i]);
57
+ if (!Number.isFinite(val) || val < 0) {
58
+ process.stderr.write(`Error: --timeout must be a non-negative number\n`);
59
+ process.exit(1);
60
+ }
61
+ opts.timeout = val;
62
+ } else if (arg.startsWith('--')) {
63
+ process.stderr.write(`Error: unknown option ${arg}\n`);
64
+ process.exit(1);
65
+ } else {
66
+ if (url !== null) {
67
+ process.stderr.write(`Error: unexpected argument: ${arg}\n`);
68
+ process.exit(1);
69
+ }
70
+ url = arg;
71
+ }
72
+ }
73
+
74
+ if (!url) {
75
+ printUsage();
76
+ process.exit(1);
77
+ }
78
+
79
+ return { url, opts };
80
+ }
81
+
82
+ (async () => {
83
+ const { url, opts } = parseArgs(process.argv);
84
+
85
+ let hasError = false;
86
+ const parser = new SitemapXMLParser(url, {
87
+ ...opts,
88
+ onError: (failedUrl, err) => {
89
+ hasError = true;
90
+ process.stderr.write(`Error: ${failedUrl} — ${err.message}\n`);
91
+ },
92
+ });
93
+
94
+ const entries = await parser.fetch();
95
+ for (const entry of entries) {
96
+ process.stdout.write(entry.loc[0] + '\n');
97
+ }
98
+ if (hasError) process.exit(1);
99
+ })();
package/lib/sitemap.js CHANGED
@@ -1,130 +1,150 @@
1
1
  'use strict';
2
2
 
3
- const request = require('request');
3
+ const http = require('http');
4
+ const https = require('https');
4
5
  const xml2js = require('xml2js');
5
- const parser = new xml2js.Parser();
6
- const bluebird = require('bluebird');
7
- const promiseMap = bluebird.map;
8
- const delay = bluebird.delay;
9
- const Url = require('url');
10
6
  const path = require('path');
11
- const zlib = require("zlib");
7
+ const zlib = require('zlib');
8
+ const { URL } = require('url');
12
9
 
13
10
  class SitemapXMLParser {
14
- constructor(url, options) {
11
+ constructor(url, options = {}) {
15
12
  this.siteMapUrl = url;
16
- this.delayTime = options.delay ? options.delay : 3000;
17
- this.limit = options.limit ? options.limit : 5;
13
+ this.delayTime = options.delay ?? 3000;
14
+ this.limit = options.limit ?? 5;
15
+ this.timeout = options.timeout ?? 30000;
16
+ this.onError = options.onError || null;
18
17
  this.urlArray = [];
18
+ this.parser = new xml2js.Parser();
19
19
  }
20
20
 
21
21
  async fetch() {
22
- //トップページのXMLを取得
22
+ this.urlArray = [];
23
23
  const indexBody = await this.getBodyFromURL(this.siteMapUrl);
24
- const indexXML = await this.executeParseXml(indexBody);
25
- //URL一覧を取得
26
- await this.getURLFromXML(indexXML)
27
- //サイトマップの一覧
24
+ if (indexBody === null) return this.urlArray;
25
+ const indexXML = await this.executeParseXml(this.siteMapUrl, indexBody);
26
+ if (indexXML === null) return this.urlArray;
27
+ await this.getURLFromXML(indexXML);
28
28
  return this.urlArray;
29
- };
30
-
31
-
32
- async getURLFromURL(url) {
33
- let body = await this.getBodyFromURL(url);
34
- let sitemapData = await this.executeParseXml(body);
35
- await this.getURLFromXML(sitemapData);
36
- return delay(this.delayTime);
37
29
  }
38
30
 
39
31
  /**
40
- * サイトマップ一覧からURLを取得する
41
- * サイトマップインデックスファイルの場合は、リンク先にアクセスしてURLを集める
42
- * @param {*} xml
32
+ * Collect URLs from parsed XML.
33
+ * If the XML is a sitemap index, follow each child sitemap.
43
34
  */
44
35
  async getURLFromXML(xml) {
45
- let sitemapIndexData = [];
46
- if (xml.sitemapindex
47
- && xml.sitemapindex.sitemap
48
- ) {
49
- //サイトマップインデックスファイルの場合
50
- for (let i = 0; i < Object.keys(xml.sitemapindex.sitemap).length; i++) {
51
- sitemapIndexData.push(
52
- {
53
- url: xml.sitemapindex.sitemap[i].loc[0],
54
- this: this
55
- //TODO promiseMapの引数が1つ?のため一緒の配列にthisを入れる 本来不要
56
- //promiseMapへは参照渡しになっているので
57
- //promiseMap内でのthisの値を変更すればpromiseMap外でもthisの値は変更される
58
- }
36
+ if (xml.sitemapindex && xml.sitemapindex.sitemap) {
37
+ const urls = xml.sitemapindex.sitemap.map(s => s.loc?.[0]).filter(Boolean);
38
+
39
+ for (let i = 0; i < urls.length; i += this.limit) {
40
+ const chunk = urls.slice(i, i + this.limit);
41
+ await Promise.all(
42
+ chunk.map(async (url) => {
43
+ const body = await this.getBodyFromURL(url);
44
+ if (body === null) return;
45
+ const sitemapData = await this.executeParseXml(url, body);
46
+ if (sitemapData === null) return;
47
+ await this.getURLFromXML(sitemapData);
48
+ })
59
49
  );
50
+ if (i + this.limit < urls.length) {
51
+ await this._delay(this.delayTime);
52
+ }
60
53
  }
61
-
62
- //各サイトマップインデックスファィルにアクセスしてURL一覧を取得する
63
- //Limitに指定された数で同時に処理を行う
64
- await promiseMap(
65
- sitemapIndexData,
66
- async (data) => {
67
- let body = await data.this.getBodyFromURL(data.url);
68
- let sitemapData = await data.this.executeParseXml(body);
69
- await data.this.getURLFromXML(sitemapData);
70
- return delay(data.this.delayTime);
71
- },
72
- { concurrency: this.limit }
73
- )
74
54
  }
75
55
 
76
- if (xml.urlset
77
- && xml.urlset.url
78
- ) {
79
- //サイトマップの場合 取得した一覧を追加
80
- for (let i = 0; i < Object.keys(xml.urlset.url).length; i++) {
81
- if (xml.urlset.url[i]) {
82
- this.urlArray.push(xml.urlset.url[i]);
56
+ if (xml.urlset && xml.urlset.url) {
57
+ for (const entry of xml.urlset.url) {
58
+ if (entry && entry.loc?.[0]) {
59
+ this.urlArray.push(entry);
83
60
  }
84
61
  }
85
62
  }
86
63
  }
87
64
 
88
65
  /**
89
- * URLからbodyを取得する
90
- * 拡張子がgzファィルの場合は解凍する
91
- * @param {*} url
66
+ * Fetch body from URL using http/https.
67
+ * Decompresses gzip automatically when the URL ends with .gz.
68
+ * Returns null and calls onError on failure.
92
69
  */
93
- async getBodyFromURL(url) {
94
- console.log(url + ' Access');
95
- return new Promise(resolve => {
96
- //拡張子がgzでないか確認する
97
- let urlParse = Url.parse(url);
98
- let ext = path.extname(urlParse.path);
99
- if (ext == '.gz') {
100
- request(url, { encoding: null }, function (error, response, body) {
101
- zlib.gunzip(body, function (error, result) {
102
- console.log(url + ' Get');
103
- resolve(result.toString());
104
- });
70
+ getBodyFromURL(url) {
71
+ return new Promise((resolve) => {
72
+ let parsedUrl;
73
+ try {
74
+ parsedUrl = new URL(url);
75
+ } catch (err) {
76
+ this._handleError(url, err);
77
+ resolve(null);
78
+ return;
79
+ }
80
+
81
+ const ext = path.extname(parsedUrl.pathname);
82
+ const transport = parsedUrl.protocol === 'https:' ? https : http;
83
+
84
+ const req = transport.get(url, (res) => {
85
+ if (res.statusCode < 200 || res.statusCode >= 300) {
86
+ res.resume();
87
+ this._handleError(url, new Error(`HTTP ${res.statusCode}`));
88
+ resolve(null);
89
+ return;
90
+ }
91
+ const chunks = [];
92
+ res.on('data', chunk => chunks.push(chunk));
93
+ res.on('end', () => {
94
+ const buf = Buffer.concat(chunks);
95
+ if (ext === '.gz') {
96
+ zlib.gunzip(buf, (err, result) => {
97
+ if (err) {
98
+ this._handleError(url, err);
99
+ resolve(null);
100
+ } else {
101
+ resolve(result.toString());
102
+ }
103
+ });
104
+ } else {
105
+ resolve(buf.toString());
106
+ }
105
107
  });
106
- } else {
107
- request(url, function (error, response, body) {
108
- console.log(url + ' Get');
109
- resolve(body.toString());
108
+ res.on('error', (err) => {
109
+ this._handleError(url, err);
110
+ resolve(null);
110
111
  });
111
- }
112
+ });
113
+
114
+ req.setTimeout(this.timeout, () => {
115
+ req.destroy(new Error(`Timeout after ${this.timeout}ms`));
116
+ });
117
+
118
+ req.on('error', (err) => {
119
+ this._handleError(url, err);
120
+ resolve(null);
121
+ });
112
122
  });
113
123
  }
114
124
 
115
-
116
125
  /**
117
- * 実際にXMLのパースを行う関数
118
- * @param {*} value
126
+ * Parse XML string. Returns null and calls onError on parse failure.
119
127
  */
120
- async executeParseXml(xml) {
121
- return new Promise(resolve => {
122
- parser.parseString(xml, (error, result) => {
123
- resolve(result);
128
+ executeParseXml(url, xml) {
129
+ return new Promise((resolve) => {
130
+ this.parser.parseString(xml, (err, result) => {
131
+ if (err) {
132
+ this._handleError(url, err);
133
+ resolve(null);
134
+ } else {
135
+ resolve(result);
136
+ }
124
137
  });
125
- })
138
+ });
139
+ }
140
+
141
+ _handleError(url, err) {
142
+ if (this.onError) this.onError(url, err);
143
+ }
144
+
145
+ _delay(ms) {
146
+ return new Promise(resolve => setTimeout(resolve, ms));
126
147
  }
127
148
  }
128
149
 
129
150
  module.exports = SitemapXMLParser;
130
- module.exports.default = SitemapXMLParser;
package/package.json CHANGED
@@ -1,32 +1,34 @@
1
1
  {
2
2
  "name": "sitemap-xml-parser",
3
- "version": "0.0.1",
3
+ "version": "1.0.0",
4
+ "description": "Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.",
4
5
  "main": "index.js",
5
- "private": false,
6
- "devDependencies": {},
6
+ "bin": {
7
+ "sitemap-xml-parser": "bin/cli.js"
8
+ },
9
+ "files": [
10
+ "index.js",
11
+ "lib",
12
+ "bin"
13
+ ],
14
+ "engines": {
15
+ "node": ">=18"
16
+ },
7
17
  "scripts": {
8
- "test": "echo \"Error: no test specified\" && exit 1"
18
+ "test": "node test/test.js"
9
19
  },
20
+ "keywords": ["sitemap", "xml", "parse", "gzip", "sitemap-index", "cli"],
21
+ "author": "shinkawax",
22
+ "license": "MIT",
10
23
  "repository": {
11
24
  "type": "git",
12
25
  "url": "git+https://github.com/shinkawax/sitemap-xml-parser.git"
13
26
  },
14
- "keywords": [
15
- "sitemap",
16
- "xml",
17
- "parse",
18
- "gz"
19
- ],
20
- "author": "shinkawax",
21
- "license": "MIT",
22
27
  "bugs": {
23
28
  "url": "https://github.com/shinkawax/sitemap-xml-parser/issues"
24
29
  },
25
30
  "homepage": "https://github.com/shinkawax/sitemap-xml-parser#readme",
26
- "description": "It parses xml based on sitemap.xml and gets all files described in sitemap. Supports gz format",
27
31
  "dependencies": {
28
- "bluebird": "^3.5.5",
29
- "request": "^2.88.0",
30
- "xml2js": "^0.4.19"
32
+ "xml2js": "^0.6.2"
31
33
  }
32
34
  }