es6-crawler-detect 3.2.0 → 3.3.0

This diff shows the changes between two publicly released versions of this package, as published to their respective registries. It is provided for informational purposes only.
@@ -1,62 +1,62 @@
- 'use strict';
-
- const Provider = require('./provider');
-
- class Exclusions extends Provider {
-   constructor() {
-     super();
-
-     this.data = [
-       'Safari.[\\d\\.]*',
-       'Firefox.[\\d\\.]*',
-       ' Chrome.[\\d\\.]*',
-       'Chromium.[\\d\\.]*',
-       'MSIE.[\\d\\.]',
-       'Opera\\/[\\d\\.]*',
-       'Mozilla.[\\d\\.]*',
-       'AppleWebKit.[\\d\\.]*',
-       'Trident.[\\d\\.]*',
-       'Windows NT.[\\d\\.]*',
-       'Android [\\d\\.]*',
-       'Macintosh.',
-       'Ubuntu',
-       'Linux',
-       '[ ]Intel',
-       'Mac OS X [\\d_]*',
-       '(like )?Gecko(.[\\d\\.]*)?',
-       'KHTML,',
-       'CriOS.[\\d\\.]*',
-       'CPU iPhone OS ([0-9_])* like Mac OS X',
-       'CPU OS ([0-9_])* like Mac OS X',
-       'iPod',
-       'compatible',
-       'x86_..',
-       'i686',
-       'x64',
-       'X11',
-       'rv:[\\d\\.]*',
-       'Version.[\\d\\.]*',
-       'WOW64',
-       'Win64',
-       'Dalvik.[\\d\\.]*',
-       ' \\.NET CLR [\\d\\.]*',
-       'Presto.[\\d\\.]*',
-       'Media Center PC',
-       'BlackBerry',
-       'Build',
-       'Opera Mini\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\/\\d{1,2}\\.',
-       'Opera',
-       ' \\.NET[\\d\\.]*',
-       'cubot',
-       '; M bot',
-       '; CRONO',
-       '; B bot',
-       '; IDbot',
-       '; ID bot',
-       '; POWER BOT',
-       ';',
-     ];
-   }
- }
-
- module.exports = Exclusions;
+ 'use strict';
+
+ const Provider = require('./provider');
+
+ class Exclusions extends Provider {
+   constructor() {
+     super();
+
+     this.data = [
+       'Safari.[\\d\\.]*',
+       'Firefox.[\\d\\.]*',
+       ' Chrome.[\\d\\.]*',
+       'Chromium.[\\d\\.]*',
+       'MSIE.[\\d\\.]',
+       'Opera\\/[\\d\\.]*',
+       'Mozilla.[\\d\\.]*',
+       'AppleWebKit.[\\d\\.]*',
+       'Trident.[\\d\\.]*',
+       'Windows NT.[\\d\\.]*',
+       'Android [\\d\\.]*',
+       'Macintosh.',
+       'Ubuntu',
+       'Linux',
+       '[ ]Intel',
+       'Mac OS X [\\d_]*',
+       '(like )?Gecko(.[\\d\\.]*)?',
+       'KHTML,',
+       'CriOS.[\\d\\.]*',
+       'CPU iPhone OS ([0-9_])* like Mac OS X',
+       'CPU OS ([0-9_])* like Mac OS X',
+       'iPod',
+       'compatible',
+       'x86_..',
+       'i686',
+       'x64',
+       'X11',
+       'rv:[\\d\\.]*',
+       'Version.[\\d\\.]*',
+       'WOW64',
+       'Win64',
+       'Dalvik.[\\d\\.]*',
+       ' \\.NET CLR [\\d\\.]*',
+       'Presto.[\\d\\.]*',
+       'Media Center PC',
+       'BlackBerry',
+       'Build',
+       'Opera Mini\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\/\\d{1,2}\\.',
+       'Opera',
+       ' \\.NET[\\d\\.]*',
+       'cubot',
+       '; M bot',
+       '; CRONO',
+       '; B bot',
+       '; IDbot',
+       '; ID bot',
+       '; POWER BOT',
+       'OCTOPUS-CORE',
+     ];
+   }
+ }
+
+ module.exports = Exclusions;
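
The only data change here is at the tail of the exclusions list: the catch-all `';'` entry is replaced by `'OCTOPUS-CORE'`, so a bare semicolon is no longer stripped from user-agent strings. As the next hunk shows, these patterns are joined into a single alternation and erased from the user agent before the crawler patterns are tested. A minimal sketch of that stripping step, using a few illustrative patterns rather than the full list:

```js
// The exclusions remove known browser/OS tokens; whatever survives is what
// the crawler regex actually sees. (3.3.0 compiles the list with 'gi'.)
const exclusions = ['Mozilla.[\\d\\.]*', 'AppleWebKit.[\\d\\.]*', ' Chrome.[\\d\\.]*', 'Safari.[\\d\\.]*'];
const compiled = new RegExp(exclusions.join('|'), 'gi');

const ua = 'Mozilla/5.0 AppleWebKit/537.36 Chrome/87.0 Safari/537.36';
console.log(ua.replace(compiled, '').trim()); // '' -> nothing left, so not a crawler
```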
@@ -1,135 +1,130 @@
- 'use strict';
-
- const Crawlers = require('./crawler/crawlers');
- const Exclusions = require('./crawler/exclusions');
- const Headers = require('./crawler/headers');
-
- class Crawler {
-   constructor(request, headers, userAgent) {
-     /**
-      * Init classes
-      */
-     this._init();
-
-     /**
-      * This request must be an object
-      */
-     this.request = typeof request === 'object' ? request : {};
-
-     // The regex-list must not be used with g-flag!
-     // See: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
-     this.compiledRegexList = this.compileRegex(this.crawlers.getAll(), 'i');
-
-     // The exclusions should be used with g-flag in order to remove each value.
-     this.compiledExclusions = this.compileRegex(this.exclusions.getAll(), 'g');
-
-     /**
-      * Set http headers
-      */
-     this.setHttpHeaders(headers);
-
-     /**
-      * Set userAgent
-      */
-     this.userAgent = this.setUserAgent(userAgent);
-   }
-
-   /**
-    * Init Classes Instances
-    */
-   _init() {
-     this.crawlers = new Crawlers();
-     this.headers = new Headers();
-     this.exclusions = new Exclusions();
-   }
-
-   compileRegex(patterns, flags) {
-     return new RegExp(patterns.join('|').trim(), flags);
-   }
-
-   /**
-    * Set HTTP headers.
-    */
-   setHttpHeaders(headers) {
-     // Use the Request headers if httpHeaders is not defined
-     if (typeof headers === 'undefined' || Object.keys(headers).length === 0) {
-       headers = Object.keys(this.request).length ? this.request.headers : {};
-     }
-
-     // Clear existing headers.
-     this.httpHeaders = [];
-
-     // Only save HTTP headers.
-     for (const key in headers) {
-       this.httpHeaders[key] = headers[key];
-     }
-   }
-
-   /**
-    * Set user agent
-    */
-   setUserAgent(userAgent) {
-     if (
-       typeof userAgent === 'undefined' ||
-       userAgent === null ||
-       !userAgent.length
-     ) {
-       for (const header of this.getUaHttpHeaders()) {
-         if (Object.keys(this.httpHeaders).indexOf(header.toLowerCase()) >= 0) {
-           userAgent += this.httpHeaders[header] + ' ';
-         }
-       }
-     }
-
-     return userAgent;
-   }
-
-   /**
-    * Get user agent headers
-    */
-   getUaHttpHeaders() {
-     return this.headers.getAll();
-   }
-
-   /**
-    * Check user agent string against the regex.
-    */
-   isCrawler(userAgent = undefined) {
-     if (Buffer.byteLength(userAgent || '', 'utf8') > 4096) {
-       return false;
-     }
-
-     var agent =
-       typeof userAgent === 'undefined' || userAgent === null
-         ? this.userAgent
-         : userAgent;
-
-     // test on compiled regx
-     agent = agent.replace(this.compiledExclusions, '');
-
-     if (agent.trim().length === 0) {
-       return false;
-     }
-
-     var matches = this.compiledRegexList.exec(agent.trim());
-
-     if (matches) {
-       this.matches = matches;
-     }
-
-     return matches !== null ? (matches.length ? true : false) : false;
-   }
-
-   /**
-    * Return the matches.
-    */
-   getMatches() {
-     return this.matches !== undefined
-       ? this.matches.length
-         ? this.matches[0]
-         : null
-       : {};
-   }
- }
-
- module.exports = Crawler;
+ 'use strict';
+
+ const Crawlers = require('./crawler/crawlers');
+ const Exclusions = require('./crawler/exclusions');
+ const Headers = require('./crawler/headers');
+
+ class Crawler {
+   constructor(request, headers, userAgent) {
+     /**
+      * Init classes
+      */
+     this._init();
+
+     /**
+      * This request must be an object
+      */
+     this.request = typeof request === 'object' ? request : {};
+
+     // The regex-list must not be used with g-flag!
+     // See: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
+     this.compiledRegexList = this.compileRegex(this.crawlers.getAll(), 'i');
+
+     // The exclusions should be used with g-flag in order to remove each value.
+     this.compiledExclusions = this.compileRegex(this.exclusions.getAll(), 'gi');
+
+     /**
+      * Set http headers
+      */
+     this.setHttpHeaders(headers);
+
+     /**
+      * Set userAgent
+      */
+     this.userAgent = this.setUserAgent(userAgent);
+   }
+
+   /**
+    * Init Classes Instances
+    */
+   _init() {
+     this.crawlers = new Crawlers();
+     this.headers = new Headers();
+     this.exclusions = new Exclusions();
+   }
+
+   compileRegex(patterns, flags) {
+     return new RegExp(patterns.join('|'), flags);
+   }
+
+   /**
+    * Set HTTP headers.
+    */
+   setHttpHeaders(headers) {
+     // Use the Request headers if httpHeaders is not defined
+     if (typeof headers === 'undefined' || Object.keys(headers).length === 0) {
+       headers = Object.keys(this.request).length ? this.request.headers : {};
+     }
+
+     // Save the headers.
+     this.httpHeaders = headers;
+   }
+
+   /**
+    * Set user agent
+    */
+   setUserAgent(userAgent) {
+     if (
+       typeof userAgent === 'undefined' ||
+       userAgent === null ||
+       !userAgent.length
+     ) {
+       for (const header of this.getUaHttpHeaders()) {
+         if (Object.keys(this.httpHeaders).indexOf(header.toLowerCase()) >= 0) {
+           userAgent += this.httpHeaders[header.toLowerCase()] + ' ';
+         }
+       }
+     }
+
+     return userAgent;
+   }
+
+   /**
+    * Get user agent headers
+    */
+   getUaHttpHeaders() {
+     return this.headers.getAll();
+   }
+
+   /**
+    * Check user agent string against the regex.
+    */
+   isCrawler(userAgent = undefined) {
+     if (Buffer.byteLength(userAgent || '', 'utf8') > 4096) {
+       return false;
+     }
+
+     var agent =
+       typeof userAgent === 'undefined' || userAgent === null
+         ? this.userAgent
+         : userAgent;
+
+     // test on compiled regx
+     agent = agent.replace(this.compiledExclusions, '');
+
+     if (agent.trim().length === 0) {
+       return false;
+     }
+
+     var matches = this.compiledRegexList.exec(agent);
+
+     if (matches) {
+       this.matches = matches;
+     }
+
+     return matches !== null ? (matches.length ? true : false) : false;
+   }
+
+   /**
+    * Return the matches.
+    */
+   getMatches() {
+     return this.matches !== undefined
+       ? this.matches.length
+         ? this.matches[0]
+         : null
+       : {};
+   }
+ }
+
+ module.exports = Crawler;
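
Three behavioral fixes land in this file. The exclusions are now compiled case-insensitively (`'gi'`), `compileRegex` no longer trims the joined pattern (see the test hunk below), and `setHttpHeaders` stores the headers object as-is instead of copying keys into an array. The last one matters for `setUserAgent`: the old code checked for the lowercased header name but then read `this.httpHeaders[header]` with the original casing, yielding `undefined` whenever the two differed; 3.3.0 reads `this.httpHeaders[header.toLowerCase()]`, so a user agent can actually be assembled from request headers. A usage sketch, requiring the class by its source path as the tests do (the package's public entry point may expose it differently):

```js
const Crawler = require('./src/lib/crawler');

// Explicit user-agent string:
const detector = new Crawler();
console.log(detector.isCrawler('Googlebot/2.1 (+http://www.google.com/bot.html)')); // true
console.log(detector.getMatches()); // the matched substring, e.g. 'Googlebot'

// Headers only -- works in 3.3.0 thanks to the consistent lowercase lookup:
const viaHeaders = new Crawler(null, { 'user-agent': 'curl/7.73.0', accept: '*/*' });
console.log(viaHeaders.isCrawler()); // true, per the new test suite
```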
@@ -1,52 +1,94 @@
- var assert = require('assert');
-
- const Crawler = require('../../src/lib/crawler');
-
- describe('crawler', () => {
-   var crawler = new Crawler();
-
-   it('will identify crawlers correctly on subsequent calls', () => {
-     assert.strictEqual(crawler.isCrawler('Zombie.js'), true);
-     assert.strictEqual(
-       crawler.isCrawler('Zombie.js'),
-       true,
-       'crawler was not identified on subsequent call'
-     );
-   });
-
-   it('will identify telegram bot', () => {
-     assert.strictEqual(
-       crawler.isCrawler('TelegramBot (like TwitterBot)'),
-       true
-     );
-   });
-
-   describe('regex-compilation', () => {
-     it('will join list of patterns with pipes', () => {
-       assert.strictEqual(
-         crawler.compileRegex(['some', 'patterns']).source,
-         'some|patterns'
-       );
-       assert.strictEqual(crawler.compileRegex(['single']).source, 'single');
-       assert.strictEqual(
-         crawler.compileRegex([' remove-whitespaces ']).source,
-         'remove-whitespaces'
-       );
-     });
-
-     it('will accept regex-flags for compilation', () => {
-       var patterns = ['some', 'patterns'];
-       assert.strictEqual(crawler.compileRegex(patterns, 'g').flags, 'g');
-       assert.strictEqual(crawler.compileRegex(patterns, 'i').flags, 'i');
-     });
-
-     it('should be case insensitive', () => {
-       assert.strictEqual(crawler.isCrawler('Facebot\\1.0'), true);
-       assert.strictEqual(
-         crawler.getMatches('Facebot\\1.0'),
-         'Facebot',
-         'Crawler was not able to indentify crawler correctly'
-       );
-     });
-   });
- });
+ const readline = require('readline');
+ const fs = require('fs');
+ const assert = require('assert');
+ const Crawler = require('../../src/lib/crawler');
+
+ describe('crawler', () => {
+   var crawler = new Crawler();
+
+   describe('regex-compilation', () => {
+     it('will join list of patterns with pipes', () => {
+       assert.strictEqual(
+         crawler.compileRegex(['some', 'patterns']).source,
+         'some|patterns'
+       );
+       assert.strictEqual(crawler.compileRegex(['single']).source, 'single');
+     });
+
+     it('keeps the whitespace', () => {
+       assert.strictEqual(
+         crawler.compileRegex([' keep-whitespaces ']).source,
+         ' keep-whitespaces '
+       );
+     });
+
+     it('will accept regex-flags for compilation', () => {
+       var patterns = ['some', 'patterns'];
+       assert.strictEqual(crawler.compileRegex(patterns, 'g').flags, 'g');
+       assert.strictEqual(crawler.compileRegex(patterns, 'i').flags, 'i');
+     });
+
+     it('should be case insensitive', () => {
+       assert.strictEqual(crawler.isCrawler('Facebot\\1.0'), true);
+       assert.strictEqual(
+         crawler.getMatches('Facebot\\1.0'),
+         'Facebot',
+         'Crawler was not able to indentify crawler correctly'
+       );
+     });
+   });
+
+   describe('crawler-identification', () => {
+     it('should be able to identify crawlers', async () => {
+       const rl = readline.createInterface({
+         input: fs.createReadStream('./test/lib/database/crawlers.txt'),
+         crlfDelay: Infinity,
+       });
+
+       for await (const line of rl) {
+         assert.strictEqual(
+           crawler.isCrawler(line),
+           true,
+           `${line} is not a crawler`
+         );
+       }
+
+       rl.close();
+     });
+
+     it('should be able to identify devices', async () => {
+       const rl = readline.createInterface({
+         input: fs.createReadStream('./test/lib/database/devices.txt'),
+         crlfDelay: Infinity,
+       });
+
+       for await (const line of rl) {
+         assert.strictEqual(
+           crawler.isCrawler(line),
+           false,
+           `${line} is not a device`
+         );
+       }
+
+       rl.close();
+     });
+
+     it('should identify the crawler from a given headers', async () => {
+       crawler = new Crawler(null, {
+         host: '127.0.0.1:3000',
+         'user-agent': 'curl/7.73.0',
+         accept: '*/*',
+       });
+
+       assert.strictEqual(crawler.isCrawler(), true);
+     });
+
+     it('should identify the crawler from request headers', async () => {
+       crawler = new Crawler({
+         headers: { 'user-agent': 'curl/7.73.0', accept: '*/*' },
+       });
+
+       assert.strictEqual(crawler.isCrawler(), true);
+     });
+   });
+ });
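
The reworked suite drops the ad-hoc Zombie.js and TelegramBot cases in favor of streaming two fixture databases (`crawlers.txt`, `devices.txt`) line by line and asserting every entry. It also replaces the old `remove-whitespaces` expectation with `keeps the whitespace`, pinning down the `compileRegex` change above: trimming the joined source would strip a deliberate leading or trailing space from whichever pattern sits at either end of the list. A sketch of why that matters, with illustrative patterns:

```js
// Patterns such as ' Chrome.[\\d\\.]*' rely on their leading space to match
// a separate token rather than the middle of another word.
const patterns = [' Chrome.[\\d\\.]*', 'Safari.[\\d\\.]*'];

const trimmed = new RegExp(patterns.join('|').trim(), 'i'); // 3.2.0 behavior
const kept = new RegExp(patterns.join('|'), 'i');           // 3.3.0 behavior

console.log(trimmed.test('XChrome/87')); // true -- space lost, matches mid-word
console.log(kept.test('XChrome/87'));    // false -- still requires the space
```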