sitemap-xml-parser 0.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +86 -33
- package/bin/cli.js +119 -0
- package/index.d.ts +19 -0
- package/lib/sitemap.js +139 -86
- package/package.json +20 -16
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2019 shinkawax
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -1,56 +1,109 @@
|
|
|
1
1
|
# sitemap-xml-parser
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
4
6
|
|
|
5
7
|
```
|
|
6
8
|
npm install sitemap-xml-parser
|
|
7
9
|
```
|
|
8
10
|
|
|
9
|
-
##
|
|
11
|
+
## Usage
|
|
10
12
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
+
```js
|
|
14
|
+
const SitemapXMLParser = require('sitemap-xml-parser');
|
|
13
15
|
|
|
14
|
-
|
|
16
|
+
const parser = new SitemapXMLParser('https://example.com/sitemap.xml');
|
|
15
17
|
|
|
18
|
+
(async () => {
|
|
19
|
+
const urls = await parser.fetch();
|
|
20
|
+
urls.forEach(entry => {
|
|
21
|
+
console.log(entry.loc[0]);
|
|
22
|
+
});
|
|
23
|
+
})();
|
|
16
24
|
```
|
|
17
|
-
const SitemapXMLParser = require('sitemap-xml-parser');
|
|
18
25
|
|
|
19
|
-
|
|
26
|
+
### Error handling with `onError`
|
|
20
27
|
|
|
21
|
-
|
|
22
|
-
If sitemapindex (link of xml or gz file) is written in sitemap, the URL will be accessed.
|
|
23
|
-
You can optionally specify the number of concurrent accesses and the number of milliseconds after processing and access to resume processing after a delay.
|
|
24
|
-
*/
|
|
28
|
+
Failed URLs (network errors, non-2xx responses, malformed XML) are skipped by default. Provide an `onError` callback to inspect them:
|
|
25
29
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
};
|
|
30
|
+
```js
|
|
31
|
+
const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
|
|
32
|
+
onError: (url, err) => {
|
|
33
|
+
console.error(`Skipped ${url}: ${err.message}`);
|
|
34
|
+
},
|
|
35
|
+
});
|
|
36
|
+
```
|
|
30
37
|
|
|
31
|
-
|
|
38
|
+
## Options
|
|
32
39
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
40
|
+
| Option | Type | Default | Description |
|
|
41
|
+
|-------------|------------|---------|-----------------------------------------------------------------------------|
|
|
42
|
+
| `delay` | `number` | `3000` | Milliseconds to wait between batches when following a sitemap index. Default is 3000 to avoid overloading the target server; set to `0` to disable. CLI: `--delay` |
|
|
43
|
+
| `limit` | `number` | `5` | Number of child sitemaps to fetch concurrently per batch. CLI: `--limit` |
|
|
44
|
+
| `timeout` | `number` | `30000` | Milliseconds before a request is aborted. CLI: `--timeout` |
|
|
45
|
+
| `onError` | `function` | — | Called as `onError(url, error)` when a URL fails. The URL is skipped regardless. **Library only.** |
|
|
46
|
+
| `onEntry` | `function` | — | Called as `onEntry(entry)` each time a URL entry is parsed. `entry` has the same shape as the objects returned by `fetch()`. **Library only.** |
|
|
47
|
+
| `tsv` | — | — | Output results as tab-separated values. Prints a header row (`loc`, `lastmod`, `changefreq`, `priority`) followed by one row per entry. Missing fields are output as empty strings. **CLI only.** |
|
|
36
48
|
|
|
49
|
+
## Return value
|
|
37
50
|
|
|
38
|
-
|
|
39
|
-
Returns
|
|
51
|
+
`fetch()` resolves to an array of URL entry objects. Each object reflects the fields present in the sitemap:
|
|
40
52
|
|
|
53
|
+
```js
|
|
54
|
+
[
|
|
41
55
|
{
|
|
42
|
-
loc:
|
|
43
|
-
lastmod:
|
|
44
|
-
changefreq: [
|
|
45
|
-
priority:
|
|
46
|
-
},
|
|
47
|
-
{
|
|
48
|
-
loc: [ --- ],
|
|
49
|
-
lastmod: [ --- ],
|
|
50
|
-
changefreq: [ --- ],
|
|
51
|
-
priority: [ --- ]
|
|
56
|
+
loc: ['https://example.com/page1'],
|
|
57
|
+
lastmod: ['2024-01-01'],
|
|
58
|
+
changefreq: ['weekly'],
|
|
59
|
+
priority: ['0.8'],
|
|
52
60
|
},
|
|
53
|
-
...
|
|
54
|
-
|
|
61
|
+
// ...
|
|
62
|
+
]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
All field values are arrays (xml2js convention). Use `entry.loc[0]` to get the URL string, `entry.lastmod?.[0]` for optional fields, and so on.
|
|
66
|
+
|
|
67
|
+
Fields other than `loc` (`lastmod`, `changefreq`, `priority`, etc.) are included only when present in the source XML.
|
|
55
68
|
|
|
69
|
+
## CLI
|
|
70
|
+
|
|
71
|
+
Run without installing via `npx`:
|
|
72
|
+
|
|
73
|
+
```sh
|
|
74
|
+
npx sitemap-xml-parser <url> [options]
|
|
56
75
|
```
|
|
76
|
+
|
|
77
|
+
Or, after installing globally (`npm install -g sitemap-xml-parser`):
|
|
78
|
+
|
|
79
|
+
```sh
|
|
80
|
+
sitemap-xml-parser <url> [options]
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Fetched URLs are printed to stdout, one per line. Errors are printed to stderr. See [Options](#options) for available flags.
|
|
84
|
+
|
|
85
|
+
### Examples
|
|
86
|
+
|
|
87
|
+
```sh
|
|
88
|
+
# Print all URLs
|
|
89
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml
|
|
90
|
+
|
|
91
|
+
# No delay, higher concurrency
|
|
92
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --delay 0 --limit 10
|
|
93
|
+
|
|
94
|
+
# Save URLs to a file, errors to a log
|
|
95
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml > urls.txt 2> errors.log
|
|
96
|
+
|
|
97
|
+
# Custom timeout
|
|
98
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --timeout 10000
|
|
99
|
+
|
|
100
|
+
# Output as TSV (includes lastmod, changefreq, priority)
|
|
101
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --tsv
|
|
102
|
+
|
|
103
|
+
# Save TSV to a file
|
|
104
|
+
npx sitemap-xml-parser https://example.com/sitemap.xml --tsv > urls.tsv
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Limitations
|
|
108
|
+
|
|
109
|
+
- **HTTP redirects are followed up to 5 times.** Status codes 301, 302, 303, 307, and 308 are handled automatically by following the `Location` header (relative URLs are resolved against the current URL). If the redirect chain exceeds 5 hops, the URL is skipped and the error is reported through `onError`.
|
package/bin/cli.js
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
const SitemapXMLParser = require('../index.js');
|
|
5
|
+
|
|
6
|
+
/**
 * Write the CLI help text to standard output.
 *
 * The option list mirrors the flags handled by the argument parser in this
 * file, including the short `-h` alias for `--help`. The text ends with a
 * newline so the shell prompt starts on a fresh line.
 *
 * @returns {void}
 */
function printUsage() {
  const helpLines = [
    'Usage: sitemap-xml-parser <url> [options]',
    '',
    'Options:',
    '  --delay <ms>     Delay between batches in milliseconds (default: 3000)',
    '  --limit <n>      Concurrent fetches per batch (default: 5)',
    '  --timeout <ms>   Request timeout in milliseconds (default: 30000)',
    '  --tsv            Output as tab-separated values with a header row',
    '  -h, --help       Show this help message',
    // The empty last element makes join() emit a trailing newline.
    '',
  ];
  process.stdout.write(helpLines.join('\n'));
}
|
|
19
|
+
|
|
20
|
+
/**
 * Parse the command-line arguments of the CLI.
 *
 * Recognised arguments: exactly one positional `<url>`, the numeric flags
 * `--delay`, `--limit` and `--timeout` (each followed by a value), the
 * boolean flag `--tsv`, and `--help` / `-h`.
 *
 * On any invalid input an `Error: ...` line is written to stderr and the
 * process exits with status 1. `--help` prints the usage text and exits with
 * status 0. When no URL is given, the usage text is printed and the process
 * exits with status 1.
 *
 * @param {string[]} argv - Full argument vector (`process.argv`); the first
 *   two elements (runtime and script path) are ignored.
 * @returns {{ url: string, opts: { delay: number, limit: number, timeout: number }, tsv: boolean }}
 *   The sitemap URL, the numeric parser options (with defaults applied) and
 *   whether TSV output was requested.
 */
function parseArgs(argv) {
  // All numeric flags share one code path; each entry names the option key,
  // the acceptance rule and the wording used in the error message.
  const numericFlags = {
    '--delay': {
      key: 'delay',
      isValid: (n) => Number.isFinite(n) && n >= 0,
      requirement: 'a non-negative number',
    },
    '--limit': {
      key: 'limit',
      isValid: (n) => Number.isInteger(n) && n >= 1,
      requirement: 'a positive integer',
    },
    '--timeout': {
      key: 'timeout',
      isValid: (n) => Number.isFinite(n) && n >= 0,
      requirement: 'a non-negative number',
    },
  };

  const fail = (message) => {
    process.stderr.write(`Error: ${message}\n`);
    process.exit(1);
  };

  const args = argv.slice(2);
  const opts = { delay: 3000, limit: 5, timeout: 30000 };
  let url = null;
  let tsv = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    if (arg === '--help' || arg === '-h') {
      printUsage();
      process.exit(0);
    } else if (arg === '--tsv') {
      tsv = true;
    } else if (Object.hasOwn(numericFlags, arg)) {
      const { key, isValid, requirement } = numericFlags[arg];
      if (++i >= args.length) {
        fail(`${arg} requires a value`);
      }
      const raw = args[i] ?? '';
      // Number('') and Number('   ') are 0, which would silently turn an
      // empty value into "no delay" / "no timeout"; treat blank as invalid.
      const val = raw.trim() === '' ? Number.NaN : Number(raw);
      if (!isValid(val)) {
        fail(`${arg} must be ${requirement}`);
      }
      opts[key] = val;
    } else if (arg.startsWith('-')) {
      // Covers both unknown long options and unknown short options, so a
      // mistyped flag such as "-x" is not mistaken for the sitemap URL.
      fail(`unknown option ${arg}`);
    } else {
      if (url !== null) {
        fail(`unexpected argument: ${arg}`);
      }
      url = arg;
    }
  }

  if (!url) {
    printUsage();
    process.exit(1);
  }

  return { url, opts, tsv };
}
|
|
85
|
+
|
|
86
|
+
/**
 * CLI entry point.
 *
 * Parses the command line, streams every sitemap entry to stdout as it is
 * discovered (one URL per line, or one TSV row per entry with `--tsv`) and
 * writes one line per failed URL to stderr. The exit status is 1 when any
 * URL failed or an unexpected error occurred, otherwise 0.
 */
(async () => {
  const { url, opts, tsv } = parseArgs(process.argv);

  // Colour error lines only when stderr is an interactive terminal.
  const red = process.stderr.isTTY ? '\x1b[31m' : '';
  const reset = process.stderr.isTTY ? '\x1b[0m' : '';

  // Output is line- and tab-delimited, so field text must not contain those
  // separators itself. Field values come straight from the sitemap XML and
  // may carry surrounding or embedded whitespace.
  const toSingleLine = (value) => String(value ?? '').replace(/[\t\r\n]+/g, ' ').trim();

  if (tsv) {
    process.stdout.write('loc\tlastmod\tchangefreq\tpriority\n');
  }

  let hasError = false;
  const parser = new SitemapXMLParser(url, {
    ...opts,
    onEntry: (entry) => {
      if (tsv) {
        const cells = [
          entry.loc?.[0],
          entry.lastmod?.[0],
          entry.changefreq?.[0],
          entry.priority?.[0],
        ].map(toSingleLine);
        process.stdout.write(`${cells.join('\t')}\n`);
      } else {
        process.stdout.write(`${toSingleLine(entry.loc[0])}\n`);
      }
    },
    onError: (failedUrl, err) => {
      hasError = true;
      const msg = String(err?.message ?? err).replace(/\r?\n/g, ' ').trim();
      process.stderr.write(`${red}Error: ${failedUrl} — ${msg}${reset}\n`);
    },
  });

  await parser.fetch();

  // Set the exit code instead of calling process.exit(): a forced exit can
  // discard output that is still queued on stdout.
  if (hasError) {
    process.exitCode = 1;
  }
})().catch((err) => {
  // Anything that escapes the parser's own error reporting ends up here
  // instead of surfacing as an unhandled promise rejection.
  const msg = String(err?.message ?? err).replace(/\r?\n/g, ' ').trim();
  process.stderr.write(`Error: ${msg}\n`);
  process.exitCode = 1;
});
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * One `<url>` entry of a sitemap.
 *
 * Every field value is an array (the README describes this as the xml2js
 * convention); use `entry.loc[0]` to read the URL string. Fields other than
 * `loc` are present only when the source XML contains them.
 */
export interface SitemapEntry {
  /** Page URL; entries without a `loc` value are not reported by the parser. */
  loc: string[];
  /** Last-modification date as written in the sitemap, when present. */
  lastmod?: string[];
  /** Change-frequency hint as written in the sitemap, when present. */
  changefreq?: string[];
  /** Priority as written in the sitemap (a string such as `'0.8'`), when present. */
  priority?: string[];
}

/** Options accepted by the `SitemapXMLParser` constructor. */
export interface SitemapOptions {
  /** Milliseconds to wait between batches when following a sitemap index (default 3000). */
  delay?: number;
  /** Number of child sitemaps fetched concurrently per batch (default 5). */
  limit?: number;
  /** Request timeout in milliseconds (default 30000). */
  timeout?: number;
  /** Called for every URL that could not be fetched or parsed; the URL is skipped either way. */
  onError?: (url: string, error: Error) => void;
  /** Called each time a URL entry is parsed, with the same object later returned by `fetch()`. */
  onEntry?: (entry: SitemapEntry) => void;
}

// NOTE(review): the implementation in lib/sitemap.js assigns the class to
// `module.exports` and no longer sets `module.exports.default`; if index.js
// re-exports it unchanged, `export =` would describe the runtime shape more
// accurately than `export default` — confirm against index.js.
/** Fetches a sitemap (or sitemap index) and collects all listed URL entries. */
export default class SitemapXMLParser {
  /**
   * @param url Address of the sitemap or sitemap index to read.
   * @param options Optional tuning and callbacks; see `SitemapOptions`.
   */
  constructor(url: string, options?: SitemapOptions);
  /** Resolves to every entry found; failed URLs are skipped and reported through `onError`. */
  fetch(): Promise<SitemapEntry[]>;
}
|
package/lib/sitemap.js
CHANGED
|
@@ -1,127 +1,180 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
const
|
|
3
|
+
const http = require('http');
|
|
4
|
+
const https = require('https');
|
|
4
5
|
const xml2js = require('xml2js');
|
|
5
|
-
const parser = new xml2js.Parser();
|
|
6
|
-
const bluebird = require('bluebird');
|
|
7
|
-
const promiseMap = bluebird.map;
|
|
8
|
-
const delay = bluebird.delay;
|
|
9
|
-
const Url = require('url');
|
|
10
6
|
const path = require('path');
|
|
11
|
-
const zlib = require(
|
|
7
|
+
const zlib = require('zlib');
|
|
8
|
+
const { URL } = require('url');
|
|
12
9
|
|
|
13
10
|
/**
 * Fetches a sitemap (or sitemap index) over HTTP(S) and collects every
 * `<url>` entry it lists. Child sitemaps referenced by an index are fetched
 * in batches. Failures never reject: the affected URL is skipped and
 * reported through the optional `onError` callback.
 */
class SitemapXMLParser {
  /**
   * @param {string} url - Address of the sitemap or sitemap index.
   * @param {object} [options]
   * @param {number} [options.delay=3000] - Milliseconds to wait between batches.
   * @param {number} [options.limit=5] - Child sitemaps fetched concurrently
   *   per batch; values below 1 (or non-numeric values) are treated as 1.
   * @param {number} [options.timeout=30000] - Request timeout in milliseconds.
   * @param {(url: string, error: Error) => void} [options.onError] - Called
   *   for every URL that fails.
   * @param {(entry: object) => void} [options.onEntry] - Called for every
   *   parsed URL entry.
   */
  constructor(url, options = {}) {
    // Tolerate an explicit `null` options argument as well as `undefined`.
    const settings = options ?? {};
    this.siteMapUrl = url;
    this.delayTime = settings.delay ?? 3000;
    this.limit = settings.limit ?? 5;
    this.timeout = settings.timeout ?? 30000;
    this.onError = settings.onError || null;
    this.onEntry = settings.onEntry || null;
    this.urlArray = [];
    this.parser = new xml2js.Parser();
  }

  /**
   * Fetch the configured sitemap and everything it references.
   *
   * @returns {Promise<object[]>} All collected URL entries. The array is
   *   empty when the root sitemap cannot be fetched or parsed.
   */
  async fetch() {
    // Start from a clean slate so repeated calls do not accumulate entries.
    this.urlArray = [];
    const indexBody = await this.getBodyFromURL(this.siteMapUrl);
    if (indexBody === null) return this.urlArray;
    const indexXML = await this.executeParseXml(this.siteMapUrl, indexBody);
    if (indexXML == null) return this.urlArray;
    await this.getURLFromXML(indexXML, new Set([this.siteMapUrl]));
    return this.urlArray;
  }

  /**
   * Collect URLs from parsed XML.
   * If the XML is a sitemap index, follow each child sitemap.
   *
   * @param {object} xml - Parsed XML document as produced by executeParseXml.
   * @param {Set<string>} [ancestors] - URLs of the sitemap indexes that led
   *   to this document; used to stop an index that references itself or
   *   one of its parents from recursing forever.
   * @returns {Promise<void>}
   */
  async getURLFromXML(xml, ancestors = new Set()) {
    const childSitemaps = xml?.sitemapindex?.sitemap;
    if (Array.isArray(childSitemaps)) {
      const urls = childSitemaps.map((s) => s?.loc?.[0]).filter(Boolean);
      const batchSize = this._batchSize();

      for (let i = 0; i < urls.length; i += batchSize) {
        const chunk = urls.slice(i, i + batchSize);
        await Promise.all(chunk.map((url) => this._followChildSitemap(url, ancestors)));
        // Pause between batches only, not after the final one.
        if (i + batchSize < urls.length) {
          await this._delay(this.delayTime);
        }
      }
    }

    const entries = xml?.urlset?.url;
    if (Array.isArray(entries)) {
      for (const entry of entries) {
        if (entry && entry.loc?.[0]) {
          this.urlArray.push(entry);
          if (this.onEntry) this.onEntry(entry);
        }
      }
    }
  }

  /**
   * Number of child sitemaps to fetch per batch. Always at least 1, because
   * a step of 0, a negative step or NaN would keep the batching loop from
   * ever advancing.
   *
   * @returns {number}
   */
  _batchSize() {
    const requested = Math.trunc(Number(this.limit));
    return requested >= 1 ? requested : 1;
  }

  /**
   * Fetch, parse and process one child sitemap of an index.
   *
   * @param {string} url - Child sitemap URL taken from the index.
   * @param {Set<string>} ancestors - URLs of the enclosing sitemap indexes.
   * @returns {Promise<void>}
   */
  async _followChildSitemap(url, ancestors) {
    if (ancestors.has(url)) {
      this._handleError(url, new Error('Sitemap index references itself or one of its parents'));
      return;
    }
    const body = await this.getBodyFromURL(url);
    if (body === null) return;
    const sitemapData = await this.executeParseXml(url, body);
    if (sitemapData == null) return;
    await this.getURLFromXML(sitemapData, new Set(ancestors).add(url));
  }

  /**
   * Fetch body from URL using http/https.
   * Follows redirects (301/302/303/307/308) up to 5 times.
   * Decompresses gzip automatically when the URL ends with .gz.
   * Returns null and calls onError on failure.
   *
   * @param {string} url
   * @returns {Promise<string|null>}
   */
  getBodyFromURL(url) {
    return this._fetchWithRedirect(url, url, 0);
  }

  /**
   * Perform one GET request and follow a redirect response recursively.
   *
   * @param {string} originalUrl - URL the caller asked for; this is the URL
   *   reported to onError, whichever hop fails.
   * @param {string} currentUrl - URL requested on this hop.
   * @param {number} redirectCount - Redirects already followed.
   * @returns {Promise<string|null>} Response body, or null on failure. The
   *   promise never rejects.
   */
  _fetchWithRedirect(originalUrl, currentUrl, redirectCount) {
    const REDIRECT_CODES = [301, 302, 303, 307, 308];
    const MAX_REDIRECTS = 5;

    return new Promise((resolve) => {
      // Several events (timeout, request error, response error) can signal
      // the same failure; only the first outcome is reported.
      let settled = false;
      const failOnce = (url, err) => {
        if (settled) return;
        settled = true;
        this._handleError(url, err);
        resolve(null);
      };
      const succeedOnce = (value) => {
        if (settled) return;
        settled = true;
        resolve(value);
      };

      let parsedUrl;
      try {
        parsedUrl = new URL(currentUrl);
      } catch (err) {
        failOnce(originalUrl, err);
        return;
      }

      // Only http(s) can be fetched. Rejecting other schemes here keeps the
      // failure on the onError path instead of an exception thrown by get().
      if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
        failOnce(originalUrl, new Error(`Unsupported protocol: ${parsedUrl.protocol}`));
        return;
      }

      const ext = path.extname(parsedUrl.pathname);
      const transport = parsedUrl.protocol === 'https:' ? https : http;

      const onResponse = (res) => {
        if (REDIRECT_CODES.includes(res.statusCode)) {
          // Drain the body so the socket is released.
          res.resume();
          const location = res.headers['location'];
          if (!location) {
            failOnce(originalUrl, new Error(`HTTP ${res.statusCode} with no Location header`));
            return;
          }
          if (redirectCount >= MAX_REDIRECTS) {
            failOnce(originalUrl, new Error('Too many redirects (max 5)'));
            return;
          }
          let nextUrl;
          try {
            // Relative Location values are resolved against the current URL.
            nextUrl = new URL(location, currentUrl).href;
          } catch (err) {
            // A malformed Location header must not escape this callback as
            // an uncaught exception or leave the promise pending.
            failOnce(originalUrl, err);
            return;
          }
          succeedOnce(this._fetchWithRedirect(originalUrl, nextUrl, redirectCount + 1));
          return;
        }

        if (res.statusCode < 200 || res.statusCode >= 300) {
          res.resume();
          failOnce(originalUrl, new Error(`HTTP ${res.statusCode}`));
          return;
        }

        const chunks = [];
        res.on('data', (chunk) => chunks.push(chunk));
        res.on('end', () => {
          const buf = Buffer.concat(chunks);
          if (ext !== '.gz') {
            succeedOnce(buf.toString());
            return;
          }
          zlib.gunzip(buf, (err, result) => {
            if (err) {
              failOnce(originalUrl, err);
            } else {
              succeedOnce(result.toString());
            }
          });
        });
        res.on('error', (err) => {
          failOnce(originalUrl, err);
        });
      };

      let req;
      try {
        req = transport.get(currentUrl, onResponse);
      } catch (err) {
        // get() can throw synchronously for requests it refuses to build.
        failOnce(originalUrl, err);
        return;
      }

      req.setTimeout(this.timeout, () => {
        // Destroying the request raises its 'error' event, handled below.
        req.destroy(new Error(`Timeout after ${this.timeout}ms`));
      });

      req.on('error', (err) => {
        failOnce(originalUrl, err);
      });
    });
  }

  /**
   * Parse XML string. Returns null and calls onError on parse failure.
   *
   * @param {string} url - URL the XML came from; passed to onError.
   * @param {string} xml - XML text.
   * @returns {Promise<object|null>}
   */
  executeParseXml(url, xml) {
    return new Promise((resolve) => {
      let reported = false;
      const fail = (err) => {
        if (reported) return;
        reported = true;
        this._handleError(url, err);
        resolve(null);
      };
      try {
        this.parser.parseString(xml, (err, result) => {
          if (err) {
            fail(err);
          } else {
            resolve(result);
          }
        });
      } catch (err) {
        fail(err);
      }
    });
  }

  /**
   * Report a failed URL to the onError callback, when one was supplied.
   *
   * @param {string} url
   * @param {Error} err
   */
  _handleError(url, err) {
    if (this.onError) this.onError(url, err);
  }

  /**
   * @param {number} ms - Milliseconds to wait.
   * @returns {Promise<void>} Resolves after the given time.
   */
  _delay(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
|
|
125
179
|
|
|
126
180
|
module.exports = SitemapXMLParser;
|
|
127
|
-
module.exports.default = SitemapXMLParser;
|
package/package.json
CHANGED
|
@@ -1,32 +1,36 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sitemap-xml-parser",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.",
|
|
4
5
|
"main": "index.js",
|
|
5
|
-
"
|
|
6
|
-
"
|
|
6
|
+
"types": "index.d.ts",
|
|
7
|
+
"bin": {
|
|
8
|
+
"sitemap-xml-parser": "bin/cli.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"index.js",
|
|
12
|
+
"index.d.ts",
|
|
13
|
+
"lib",
|
|
14
|
+
"bin"
|
|
15
|
+
],
|
|
16
|
+
"engines": {
|
|
17
|
+
"node": ">=20"
|
|
18
|
+
},
|
|
7
19
|
"scripts": {
|
|
8
|
-
"test": "
|
|
20
|
+
"test": "node test/test.js"
|
|
9
21
|
},
|
|
22
|
+
"keywords": ["sitemap", "xml", "parse", "gzip", "sitemap-index", "cli"],
|
|
23
|
+
"author": "shinkawax",
|
|
24
|
+
"license": "MIT",
|
|
10
25
|
"repository": {
|
|
11
26
|
"type": "git",
|
|
12
27
|
"url": "git+https://github.com/shinkawax/sitemap-xml-parser.git"
|
|
13
28
|
},
|
|
14
|
-
"keywords": [
|
|
15
|
-
"sitemap",
|
|
16
|
-
"xml",
|
|
17
|
-
"parse",
|
|
18
|
-
"gz"
|
|
19
|
-
],
|
|
20
|
-
"author": "shinkawax",
|
|
21
|
-
"license": "MIT",
|
|
22
29
|
"bugs": {
|
|
23
30
|
"url": "https://github.com/shinkawax/sitemap-xml-parser/issues"
|
|
24
31
|
},
|
|
25
32
|
"homepage": "https://github.com/shinkawax/sitemap-xml-parser#readme",
|
|
26
|
-
"description": "It parses xml based on sitemap.xml and gets all files described in sitemap. Supports gz format",
|
|
27
33
|
"dependencies": {
|
|
28
|
-
"
|
|
29
|
-
"request": "^2.88.0",
|
|
30
|
-
"xml2js": "^0.4.19"
|
|
34
|
+
"xml2js": "^0.6.2"
|
|
31
35
|
}
|
|
32
36
|
}
|