npm - sitemap-xml-parser - Versions diffs - 0.0.1 → 1.0.0 - Mend

sitemap-xml-parser 0.0.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/LICENSE ADDED

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2019 shinkawax
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md CHANGED

@@ -1,25 +1,104 @@
 # sitemap-xml-parser
-## installation
+Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.
+## Installation
 ```
 npm install sitemap-xml-parser
 ```
-## example
+## Usage
-```
+```js
 const SitemapXMLParser = require('sitemap-xml-parser');
-const url = 'something sitemap url';
-const options = {
+const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
     delay: 3000,
-    limit: 5
-};
+    limit: 5,
+});
+(async () => {
+    const urls = await parser.fetch();
+    urls.forEach(entry => {
+        console.log(entry.loc[0]);
+    });
+})();
+```
+### Error handling with `onError`
-const sitemapXMLParser = new SitemapXMLParser(url, options);
+Failed URLs (network errors, non-2xx responses, malformed XML) are skipped by default. Provide an `onError` callback to inspect them:
-sitemapXMLParser.fetch().then(result => {
-    console.log(result);
+```js
+const parser = new SitemapXMLParser('https://example.com/sitemap.xml', {
+    onError: (url, err) => {
+        console.error(`Skipped ${url}: ${err.message}`);
+    },
 });
 ```
+## Options
+| Option      | Type       | Default | Description                                                                 |
+|-------------|------------|---------|-----------------------------------------------------------------------------|
+| `delay`     | `number`   | `3000`  | Milliseconds to wait between batches when following a sitemap index. CLI: `--delay`   |
+| `limit`     | `number`   | `5`     | Number of child sitemaps to fetch concurrently per batch. CLI: `--limit`              |
+| `timeout`   | `number`   | `30000` | Milliseconds before a request is aborted. CLI: `--timeout`                            |
+| `onError`   | `function` | —       | Called as `onError(url, error)` when a URL fails. The URL is skipped regardless. **API only.** |
+| `--help`    | —          | —       | Prints usage information and exits. **CLI only.**                           |
+| `--timeout` | —          | —       | Same as the `timeout` option above, in milliseconds. **CLI only.**          |
+## Return value
+`fetch()` resolves to an array of URL entry objects. Each object reflects the fields present in the sitemap:
+```js
+[
+  {
+    loc:        ['https://example.com/page1'],
+    lastmod:    ['2024-01-01'],
+    changefreq: ['weekly'],
+    priority:   ['0.8'],
+  },
+  // ...
+]
+```
+Fields other than `loc` (`lastmod`, `changefreq`, `priority`, etc.) are included only when present in the source XML.
+## CLI
+Run without installing via `npx`:
+```sh
+npx sitemap-xml-parser <url> [options]
+```
+Or, after installing globally (`npm install -g sitemap-xml-parser`):
+```sh
+sitemap-xml-parser <url> [options]
+```
+Fetched URLs are printed to stdout, one per line. Errors are printed to stderr. See [Options](#options) for available flags.
+### Examples
+```sh
+# Print all URLs
+npx sitemap-xml-parser https://example.com/sitemap.xml
+# No delay, higher concurrency
+npx sitemap-xml-parser https://example.com/sitemap.xml --delay 0 --limit 10
+# Save URLs to a file, errors to a log
+npx sitemap-xml-parser https://example.com/sitemap.xml > urls.txt 2> errors.log
+# Custom timeout
+npx sitemap-xml-parser https://example.com/sitemap.xml --timeout 10000
+```
+## Limitations
+- **HTTP redirects are not followed.** Responses with status codes 301, 302, or other 3xx are treated as errors. If your sitemap URL redirects, use the final destination URL directly.

package/bin/cli.js ADDED

@@ -0,0 +1,99 @@
+#!/usr/bin/env node
+'use strict';
+const SitemapXMLParser = require('../index.js');
+function printUsage() {
+    process.stdout.write([
+        'Usage: sitemap-xml-parser <url> [options]',
+        '',
+        'Options:',
+        '  --delay <ms>    Delay between batches in milliseconds (default: 3000)',
+        '  --limit <n>     Concurrent fetches per batch (default: 5)',
+        '  --timeout <ms>  Request timeout in milliseconds (default: 30000)',
+        '  --help          Show this help message',
+        '',
+    ].join('\n'));
+}
+function parseArgs(argv) {
+    const args = argv.slice(2);
+    const opts = { delay: 3000, limit: 5, timeout: 30000 };
+    let url = null;
+    for (let i = 0; i < args.length; i++) {
+        const arg = args[i];
+        if (arg === '--help' || arg === '-h') {
+            printUsage();
+            process.exit(0);
+        } else if (arg === '--delay') {
+            if (++i >= args.length) {
+                process.stderr.write(`Error: --delay requires a value\n`);
+                process.exit(1);
+            }
+            const val = Number(args[i]);
+            if (!Number.isFinite(val) || val < 0) {
+                process.stderr.write(`Error: --delay must be a non-negative number\n`);
+                process.exit(1);
+            }
+            opts.delay = val;
+        } else if (arg === '--limit') {
+            if (++i >= args.length) {
+                process.stderr.write(`Error: --limit requires a value\n`);
+                process.exit(1);
+            }
+            const val = Number(args[i]);
+            if (!Number.isInteger(val) || val < 1) {
+                process.stderr.write(`Error: --limit must be a positive integer\n`);
+                process.exit(1);
+            }
+            opts.limit = val;
+        } else if (arg === '--timeout') {
+            if (++i >= args.length) {
+                process.stderr.write(`Error: --timeout requires a value\n`);
+                process.exit(1);
+            }
+            const val = Number(args[i]);
+            if (!Number.isFinite(val) || val < 0) {
+                process.stderr.write(`Error: --timeout must be a non-negative number\n`);
+                process.exit(1);
+            }
+            opts.timeout = val;
+        } else if (arg.startsWith('--')) {
+            process.stderr.write(`Error: unknown option ${arg}\n`);
+            process.exit(1);
+        } else {
+            if (url !== null) {
+                process.stderr.write(`Error: unexpected argument: ${arg}\n`);
+                process.exit(1);
+            }
+            url = arg;
+        }
+    }
+    if (!url) {
+        printUsage();
+        process.exit(1);
+    }
+    return { url, opts };
+}
+(async () => {
+    const { url, opts } = parseArgs(process.argv);
+    let hasError = false;
+    const parser = new SitemapXMLParser(url, {
+        ...opts,
+        onError: (failedUrl, err) => {
+            hasError = true;
+            process.stderr.write(`Error: ${failedUrl} — ${err.message}\n`);
+        },
+    });
+    const entries = await parser.fetch();
+    for (const entry of entries) {
+        process.stdout.write(entry.loc[0] + '\n');
+    }
+    if (hasError) process.exit(1);
+})();

package/lib/sitemap.js CHANGED

@@ -1,130 +1,150 @@
 'use strict';
-const request = require('request');
+const http = require('http');
+const https = require('https');
 const xml2js = require('xml2js');
-const parser = new xml2js.Parser();
-const bluebird = require('bluebird');
-const promiseMap = bluebird.map;
-const delay = bluebird.delay;
-const Url = require('url');
 const path = require('path');
-const zlib = require("zlib");
+const zlib = require('zlib');
+const { URL } = require('url');
 class SitemapXMLParser {
-    constructor(url, options) {
+    constructor(url, options = {}) {
         this.siteMapUrl = url;
-        this.delayTime = options.delay ? options.delay : 3000;
-        this.limit = options.limit ? options.limit : 5;
+        this.delayTime = options.delay ?? 3000;
+        this.limit = options.limit ?? 5;
+        this.timeout = options.timeout ?? 30000;
+        this.onError = options.onError || null;
         this.urlArray = [];
+        this.parser = new xml2js.Parser();
     }
     async fetch() {
-        //トップページのXMLを取得
+        this.urlArray = [];
         const indexBody = await this.getBodyFromURL(this.siteMapUrl);
-        const indexXML = await this.executeParseXml(indexBody);
-        //URL一覧を取得
-        await this.getURLFromXML(indexXML)
-        //サイトマップの一覧
+        if (indexBody === null) return this.urlArray;
+        const indexXML = await this.executeParseXml(this.siteMapUrl, indexBody);
+        if (indexXML === null) return this.urlArray;
+        await this.getURLFromXML(indexXML);
         return this.urlArray;
-    };
-    async getURLFromURL(url) {
-        let body = await this.getBodyFromURL(url);
-        let sitemapData = await this.executeParseXml(body);
-        await this.getURLFromXML(sitemapData);
-        return delay(this.delayTime);
     }
     /**
-     * サイトマップ一覧からURLを取得する
-     * サイトマップインデックスファイルの場合は、リンク先にアクセスしてURLを集める
-     * @param {*} xml
+     * Collect URLs from parsed XML.
+     * If the XML is a sitemap index, follow each child sitemap.
      */
     async getURLFromXML(xml) {
-        let sitemapIndexData = [];
-        if (xml.sitemapindex
-            && xml.sitemapindex.sitemap
-        ) {
-            //サイトマップインデックスファイルの場合
-            for (let i = 0; i < Object.keys(xml.sitemapindex.sitemap).length; i++) {
-                sitemapIndexData.push(
-                    {
-                        url: xml.sitemapindex.sitemap[i].loc[0],
-                        this: this
-                        //TODO promiseMapの引数が1つ?のため一緒の配列にthisを入れる 本来不要
-                        //promiseMapへは参照渡しになっているので
-                        //promiseMap内でのthisの値を変更すればpromiseMap外でもthisの値は変更される
-                    }
+        if (xml.sitemapindex && xml.sitemapindex.sitemap) {
+            const urls = xml.sitemapindex.sitemap.map(s => s.loc?.[0]).filter(Boolean);
+            for (let i = 0; i < urls.length; i += this.limit) {
+                const chunk = urls.slice(i, i + this.limit);
+                await Promise.all(
+                    chunk.map(async (url) => {
+                        const body = await this.getBodyFromURL(url);
+                        if (body === null) return;
+                        const sitemapData = await this.executeParseXml(url, body);
+                        if (sitemapData === null) return;
+                        await this.getURLFromXML(sitemapData);
+                    })
                 );
+                if (i + this.limit < urls.length) {
+                    await this._delay(this.delayTime);
+                }
             }
-            //各サイトマップインデックスファィルにアクセスしてURL一覧を取得する
-            //Limitに指定された数で同時に処理を行う
-            await promiseMap(
-                sitemapIndexData,
-                async (data) => {
-                    let body = await data.this.getBodyFromURL(data.url);
-                    let sitemapData = await data.this.executeParseXml(body);
-                    await data.this.getURLFromXML(sitemapData);
-                    return delay(data.this.delayTime);
-                },
-                { concurrency: this.limit }
-            )
         }
-        if (xml.urlset
-            && xml.urlset.url
-        ) {
-            //サイトマップの場合　取得した一覧を追加
-            for (let i = 0; i < Object.keys(xml.urlset.url).length; i++) {
-                if (xml.urlset.url[i]) {
-                    this.urlArray.push(xml.urlset.url[i]);
+        if (xml.urlset && xml.urlset.url) {
+            for (const entry of xml.urlset.url) {
+                if (entry && entry.loc?.[0]) {
+                    this.urlArray.push(entry);
                 }
             }
         }
     }
     /**
-     * URLからbodyを取得する
-     * 拡張子がgzファィルの場合は解凍する
-     * @param {*} url
+     * Fetch body from URL using http/https.
+     * Decompresses gzip automatically when the URL ends with .gz.
+     * Returns null and calls onError on failure.
      */
-    async getBodyFromURL(url) {
-        console.log(url + ' Access');
-        return new Promise(resolve => {
-            //拡張子がgzでないか確認する
-            let urlParse = Url.parse(url);
-            let ext = path.extname(urlParse.path);
-            if (ext == '.gz') {
-                request(url, { encoding: null }, function (error, response, body) {
-                    zlib.gunzip(body, function (error, result) {
-                        console.log(url + ' Get');
-                        resolve(result.toString());
-                    });
+    getBodyFromURL(url) {
+        return new Promise((resolve) => {
+            let parsedUrl;
+            try {
+                parsedUrl = new URL(url);
+            } catch (err) {
+                this._handleError(url, err);
+                resolve(null);
+                return;
+            }
+            const ext = path.extname(parsedUrl.pathname);
+            const transport = parsedUrl.protocol === 'https:' ? https : http;
+            const req = transport.get(url, (res) => {
+                if (res.statusCode < 200 || res.statusCode >= 300) {
+                    res.resume();
+                    this._handleError(url, new Error(`HTTP ${res.statusCode}`));
+                    resolve(null);
+                    return;
+                }
+                const chunks = [];
+                res.on('data', chunk => chunks.push(chunk));
+                res.on('end', () => {
+                    const buf = Buffer.concat(chunks);
+                    if (ext === '.gz') {
+                        zlib.gunzip(buf, (err, result) => {
+                            if (err) {
+                                this._handleError(url, err);
+                                resolve(null);
+                            } else {
+                                resolve(result.toString());
+                            }
+                        });
+                    } else {
+                        resolve(buf.toString());
+                    }
                 });
-            } else {
-                request(url, function (error, response, body) {
-                    console.log(url + ' Get');
-                    resolve(body.toString());
+                res.on('error', (err) => {
+                    this._handleError(url, err);
+                    resolve(null);
                 });
-            }
+            });
+            req.setTimeout(this.timeout, () => {
+                req.destroy(new Error(`Timeout after ${this.timeout}ms`));
+            });
+            req.on('error', (err) => {
+                this._handleError(url, err);
+                resolve(null);
+            });
         });
     }
     /**
-     * 実際にXMLのパースを行う関数
-     * @param {*} value
+     * Parse XML string. Returns null and calls onError on parse failure.
      */
-    async executeParseXml(xml) {
-        return new Promise(resolve => {
-            parser.parseString(xml, (error, result) => {
-                resolve(result);
+    executeParseXml(url, xml) {
+        return new Promise((resolve) => {
+            this.parser.parseString(xml, (err, result) => {
+                if (err) {
+                    this._handleError(url, err);
+                    resolve(null);
+                } else {
+                    resolve(result);
+                }
             });
-        })
+        });
+    }
+    _handleError(url, err) {
+        if (this.onError) this.onError(url, err);
+    }
+    _delay(ms) {
+        return new Promise(resolve => setTimeout(resolve, ms));
     }
 }
 module.exports = SitemapXMLParser;
-module.exports.default = SitemapXMLParser;

package/package.json CHANGED

@@ -1,32 +1,34 @@
 {
   "name": "sitemap-xml-parser",
-  "version": "0.0.1",
+  "version": "1.0.0",
+  "description": "Parses sitemap XML files and returns all listed URLs. Supports sitemap index files and gzip (.gz) compression.",
   "main": "index.js",
-  "private": false,
-  "devDependencies": {},
+  "bin": {
+    "sitemap-xml-parser": "bin/cli.js"
+  },
+  "files": [
+    "index.js",
+    "lib",
+    "bin"
+  ],
+  "engines": {
+    "node": ">=18"
+  },
   "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "test": "node test/test.js"
   },
+  "keywords": ["sitemap", "xml", "parse", "gzip", "sitemap-index", "cli"],
+  "author": "shinkawax",
+  "license": "MIT",
   "repository": {
     "type": "git",
     "url": "git+https://github.com/shinkawax/sitemap-xml-parser.git"
   },
-  "keywords": [
-    "sitemap",
-    "xml",
-    "parse",
-    "gz"
-  ],
-  "author": "shinkawax",
-  "license": "MIT",
   "bugs": {
     "url": "https://github.com/shinkawax/sitemap-xml-parser/issues"
   },
   "homepage": "https://github.com/shinkawax/sitemap-xml-parser#readme",
-  "description": "It parses xml based on sitemap.xml and gets all files described in sitemap. Supports gz format",
   "dependencies": {
-    "bluebird": "^3.5.5",
-    "request": "^2.88.0",
-    "xml2js": "^0.4.19"
+    "xml2js": "^0.6.2"
   }
 }