es6-crawler-detect 3.2.0 → 3.3.0

This diff shows the changes between two publicly released versions of this package, as published to their respective registries. It is provided for informational purposes only.
@@ -1,62 +1,62 @@
- 'use strict';
-
- const Provider = require('./provider');
-
- class Exclusions extends Provider {
-   constructor() {
-     super();
-
-     this.data = [
-       'Safari.[\\d\\.]*',
-       'Firefox.[\\d\\.]*',
-       ' Chrome.[\\d\\.]*',
-       'Chromium.[\\d\\.]*',
-       'MSIE.[\\d\\.]',
-       'Opera\\/[\\d\\.]*',
-       'Mozilla.[\\d\\.]*',
-       'AppleWebKit.[\\d\\.]*',
-       'Trident.[\\d\\.]*',
-       'Windows NT.[\\d\\.]*',
-       'Android [\\d\\.]*',
-       'Macintosh.',
-       'Ubuntu',
-       'Linux',
-       '[ ]Intel',
-       'Mac OS X [\\d_]*',
-       '(like )?Gecko(.[\\d\\.]*)?',
-       'KHTML,',
-       'CriOS.[\\d\\.]*',
-       'CPU iPhone OS ([0-9_])* like Mac OS X',
-       'CPU OS ([0-9_])* like Mac OS X',
-       'iPod',
-       'compatible',
-       'x86_..',
-       'i686',
-       'x64',
-       'X11',
-       'rv:[\\d\\.]*',
-       'Version.[\\d\\.]*',
-       'WOW64',
-       'Win64',
-       'Dalvik.[\\d\\.]*',
-       ' \\.NET CLR [\\d\\.]*',
-       'Presto.[\\d\\.]*',
-       'Media Center PC',
-       'BlackBerry',
-       'Build',
-       'Opera Mini\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\/\\d{1,2}\\.',
-       'Opera',
-       ' \\.NET[\\d\\.]*',
-       'cubot',
-       '; M bot',
-       '; CRONO',
-       '; B bot',
-       '; IDbot',
-       '; ID bot',
-       '; POWER BOT',
-       ';',
-     ];
-   }
- }
-
- module.exports = Exclusions;
+ 'use strict';
+
+ const Provider = require('./provider');
+
+ class Exclusions extends Provider {
+   constructor() {
+     super();
+
+     this.data = [
+       'Safari.[\\d\\.]*',
+       'Firefox.[\\d\\.]*',
+       ' Chrome.[\\d\\.]*',
+       'Chromium.[\\d\\.]*',
+       'MSIE.[\\d\\.]',
+       'Opera\\/[\\d\\.]*',
+       'Mozilla.[\\d\\.]*',
+       'AppleWebKit.[\\d\\.]*',
+       'Trident.[\\d\\.]*',
+       'Windows NT.[\\d\\.]*',
+       'Android [\\d\\.]*',
+       'Macintosh.',
+       'Ubuntu',
+       'Linux',
+       '[ ]Intel',
+       'Mac OS X [\\d_]*',
+       '(like )?Gecko(.[\\d\\.]*)?',
+       'KHTML,',
+       'CriOS.[\\d\\.]*',
+       'CPU iPhone OS ([0-9_])* like Mac OS X',
+       'CPU OS ([0-9_])* like Mac OS X',
+       'iPod',
+       'compatible',
+       'x86_..',
+       'i686',
+       'x64',
+       'X11',
+       'rv:[\\d\\.]*',
+       'Version.[\\d\\.]*',
+       'WOW64',
+       'Win64',
+       'Dalvik.[\\d\\.]*',
+       ' \\.NET CLR [\\d\\.]*',
+       'Presto.[\\d\\.]*',
+       'Media Center PC',
+       'BlackBerry',
+       'Build',
+       'Opera Mini\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\/\\d{1,2}\\.',
+       'Opera',
+       ' \\.NET[\\d\\.]*',
+       'cubot',
+       '; M bot',
+       '; CRONO',
+       '; B bot',
+       '; IDbot',
+       '; ID bot',
+       '; POWER BOT',
+       'OCTOPUS-CORE',
+     ];
+   }
+ }
+
+ module.exports = Exclusions;
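
The only data change here is at the tail of the exclusions list: the catch-all `';'` entry is replaced by `'OCTOPUS-CORE'`, so a bare semicolon is no longer stripped from user-agent strings. As the next hunk shows, these patterns are joined into a single alternation and erased from the user agent before the crawler patterns are tested. A minimal sketch of that stripping step, using a few illustrative patterns rather than the full list:

```js
// The exclusions remove known browser/OS tokens; whatever survives is what
// the crawler regex actually sees. (3.3.0 compiles the list with 'gi'.)
const exclusions = ['Mozilla.[\\d\\.]*', 'AppleWebKit.[\\d\\.]*', ' Chrome.[\\d\\.]*', 'Safari.[\\d\\.]*'];
const compiled = new RegExp(exclusions.join('|'), 'gi');

const ua = 'Mozilla/5.0 AppleWebKit/537.36 Chrome/87.0 Safari/537.36';
console.log(ua.replace(compiled, '').trim()); // '' -> nothing left, so not a crawler
```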
@@ -1,135 +1,130 @@
- 'use strict';
-
- const Crawlers = require('./crawler/crawlers');
- const Exclusions = require('./crawler/exclusions');
- const Headers = require('./crawler/headers');
-
- class Crawler {
-   constructor(request, headers, userAgent) {
-     /**
-      * Init classes
-      */
-     this._init();
-
-     /**
-      * This request must be an object
-      */
-     this.request = typeof request === 'object' ? request : {};
-
-     // The regex-list must not be used with g-flag!
-     // See: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
-     this.compiledRegexList = this.compileRegex(this.crawlers.getAll(), 'i');
-
-     // The exclusions should be used with g-flag in order to remove each value.
-     this.compiledExclusions = this.compileRegex(this.exclusions.getAll(), 'g');
-
-     /**
-      * Set http headers
-      */
-     this.setHttpHeaders(headers);
-
-     /**
-      * Set userAgent
-      */
-     this.userAgent = this.setUserAgent(userAgent);
-   }
-
-   /**
-    * Init Classes Instances
-    */
-   _init() {
-     this.crawlers = new Crawlers();
-     this.headers = new Headers();
-     this.exclusions = new Exclusions();
-   }
-
-   compileRegex(patterns, flags) {
-     return new RegExp(patterns.join('|').trim(), flags);
-   }
-
-   /**
-    * Set HTTP headers.
-    */
-   setHttpHeaders(headers) {
-     // Use the Request headers if httpHeaders is not defined
-     if (typeof headers === 'undefined' || Object.keys(headers).length === 0) {
-       headers = Object.keys(this.request).length ? this.request.headers : {};
-     }
-
-     // Clear existing headers.
-     this.httpHeaders = [];
-
-     // Only save HTTP headers.
-     for (const key in headers) {
-       this.httpHeaders[key] = headers[key];
-     }
-   }
-
-   /**
-    * Set user agent
-    */
-   setUserAgent(userAgent) {
-     if (
-       typeof userAgent === 'undefined' ||
-       userAgent === null ||
-       !userAgent.length
-     ) {
-       for (const header of this.getUaHttpHeaders()) {
-         if (Object.keys(this.httpHeaders).indexOf(header.toLowerCase()) >= 0) {
-           userAgent += this.httpHeaders[header] + ' ';
-         }
-       }
-     }
-
-     return userAgent;
-   }
-
-   /**
-    * Get user agent headers
-    */
-   getUaHttpHeaders() {
-     return this.headers.getAll();
-   }
-
-   /**
-    * Check user agent string against the regex.
-    */
-   isCrawler(userAgent = undefined) {
-     if (Buffer.byteLength(userAgent || '', 'utf8') > 4096) {
-       return false;
-     }
-
-     var agent =
-       typeof userAgent === 'undefined' || userAgent === null
-         ? this.userAgent
-         : userAgent;
-
-     // test on compiled regx
-     agent = agent.replace(this.compiledExclusions, '');
-
-     if (agent.trim().length === 0) {
-       return false;
-     }
-
-     var matches = this.compiledRegexList.exec(agent.trim());
-
-     if (matches) {
-       this.matches = matches;
-     }
-
-     return matches !== null ? (matches.length ? true : false) : false;
-   }
-
-   /**
-    * Return the matches.
-    */
-   getMatches() {
-     return this.matches !== undefined
-       ? this.matches.length
-         ? this.matches[0]
-         : null
-       : {};
-   }
- }
-
- module.exports = Crawler;
+ 'use strict';
+
+ const Crawlers = require('./crawler/crawlers');
+ const Exclusions = require('./crawler/exclusions');
+ const Headers = require('./crawler/headers');
+
+ class Crawler {
+   constructor(request, headers, userAgent) {
+     /**
+      * Init classes
+      */
+     this._init();
+
+     /**
+      * This request must be an object
+      */
+     this.request = typeof request === 'object' ? request : {};
+
+     // The regex-list must not be used with g-flag!
+     // See: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
+     this.compiledRegexList = this.compileRegex(this.crawlers.getAll(), 'i');
+
+     // The exclusions should be used with g-flag in order to remove each value.
+     this.compiledExclusions = this.compileRegex(this.exclusions.getAll(), 'gi');
+
+     /**
+      * Set http headers
+      */
+     this.setHttpHeaders(headers);
+
+     /**
+      * Set userAgent
+      */
+     this.userAgent = this.setUserAgent(userAgent);
+   }
+
+   /**
+    * Init Classes Instances
+    */
+   _init() {
+     this.crawlers = new Crawlers();
+     this.headers = new Headers();
+     this.exclusions = new Exclusions();
+   }
+
+   compileRegex(patterns, flags) {
+     return new RegExp(patterns.join('|'), flags);
+   }
+
+   /**
+    * Set HTTP headers.
+    */
+   setHttpHeaders(headers) {
+     // Use the Request headers if httpHeaders is not defined
+     if (typeof headers === 'undefined' || Object.keys(headers).length === 0) {
+       headers = Object.keys(this.request).length ? this.request.headers : {};
+     }
+
+     // Save the headers.
+     this.httpHeaders = headers;
+   }
+
+   /**
+    * Set user agent
+    */
+   setUserAgent(userAgent) {
+     if (
+       typeof userAgent === 'undefined' ||
+       userAgent === null ||
+       !userAgent.length
+     ) {
+       for (const header of this.getUaHttpHeaders()) {
+         if (Object.keys(this.httpHeaders).indexOf(header.toLowerCase()) >= 0) {
+           userAgent += this.httpHeaders[header.toLowerCase()] + ' ';
+         }
+       }
+     }
+
+     return userAgent;
+   }
+
+   /**
+    * Get user agent headers
+    */
+   getUaHttpHeaders() {
+     return this.headers.getAll();
+   }
+
+   /**
+    * Check user agent string against the regex.
+    */
+   isCrawler(userAgent = undefined) {
+     if (Buffer.byteLength(userAgent || '', 'utf8') > 4096) {
+       return false;
+     }
+
+     var agent =
+       typeof userAgent === 'undefined' || userAgent === null
+         ? this.userAgent
+         : userAgent;
+
+     // test on compiled regx
+     agent = agent.replace(this.compiledExclusions, '');
+
+     if (agent.trim().length === 0) {
+       return false;
+     }
+
+     var matches = this.compiledRegexList.exec(agent);
+
+     if (matches) {
+       this.matches = matches;
+     }
+
+     return matches !== null ? (matches.length ? true : false) : false;
+   }
+
+   /**
+    * Return the matches.
+    */
+   getMatches() {
+     return this.matches !== undefined
+       ? this.matches.length
+         ? this.matches[0]
+         : null
+       : {};
+   }
+ }
+
+ module.exports = Crawler;
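
Three behavioral fixes land in this file. The exclusions are now compiled case-insensitively (`'gi'`), `compileRegex` no longer trims the joined pattern (see the test hunk below), and `setHttpHeaders` stores the headers object as-is instead of copying keys into an array. The last one matters for `setUserAgent`: the old code checked for the lowercased header name but then read `this.httpHeaders[header]` with the original casing, yielding `undefined` whenever the two differed; 3.3.0 reads `this.httpHeaders[header.toLowerCase()]`, so a user agent can actually be assembled from request headers. A usage sketch, requiring the class by its source path as the tests do (the package's public entry point may expose it differently):

```js
const Crawler = require('./src/lib/crawler');

// Explicit user-agent string:
const detector = new Crawler();
console.log(detector.isCrawler('Googlebot/2.1 (+http://www.google.com/bot.html)')); // true
console.log(detector.getMatches()); // the matched substring, e.g. 'Googlebot'

// Headers only -- works in 3.3.0 thanks to the consistent lowercase lookup:
const viaHeaders = new Crawler(null, { 'user-agent': 'curl/7.73.0', accept: '*/*' });
console.log(viaHeaders.isCrawler()); // true, per the new test suite
```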
@@ -1,52 +1,94 @@
- var assert = require('assert');
-
- const Crawler = require('../../src/lib/crawler');
-
- describe('crawler', () => {
-   var crawler = new Crawler();
-
-   it('will identify crawlers correctly on subsequent calls', () => {
-     assert.strictEqual(crawler.isCrawler('Zombie.js'), true);
-     assert.strictEqual(
-       crawler.isCrawler('Zombie.js'),
-       true,
-       'crawler was not identified on subsequent call'
-     );
-   });
-
-   it('will identify telegram bot', () => {
-     assert.strictEqual(
-       crawler.isCrawler('TelegramBot (like TwitterBot)'),
-       true
-     );
-   });
-
-   describe('regex-compilation', () => {
-     it('will join list of patterns with pipes', () => {
-       assert.strictEqual(
-         crawler.compileRegex(['some', 'patterns']).source,
-         'some|patterns'
-       );
-       assert.strictEqual(crawler.compileRegex(['single']).source, 'single');
-       assert.strictEqual(
-         crawler.compileRegex([' remove-whitespaces ']).source,
-         'remove-whitespaces'
-       );
-     });
-
-     it('will accept regex-flags for compilation', () => {
-       var patterns = ['some', 'patterns'];
-       assert.strictEqual(crawler.compileRegex(patterns, 'g').flags, 'g');
-       assert.strictEqual(crawler.compileRegex(patterns, 'i').flags, 'i');
-     });
-
-     it('should be case insensitive', () => {
-       assert.strictEqual(crawler.isCrawler('Facebot\\1.0'), true);
-       assert.strictEqual(
-         crawler.getMatches('Facebot\\1.0'),
-         'Facebot',
-         'Crawler was not able to indentify crawler correctly'
-       );
-     });
-   });
- });
+ const readline = require('readline');
+ const fs = require('fs');
+ const assert = require('assert');
+ const Crawler = require('../../src/lib/crawler');
+
+ describe('crawler', () => {
+   var crawler = new Crawler();
+
+   describe('regex-compilation', () => {
+     it('will join list of patterns with pipes', () => {
+       assert.strictEqual(
+         crawler.compileRegex(['some', 'patterns']).source,
+         'some|patterns'
+       );
+       assert.strictEqual(crawler.compileRegex(['single']).source, 'single');
+     });
+
+     it('keeps the whitespace', () => {
+       assert.strictEqual(
+         crawler.compileRegex([' keep-whitespaces ']).source,
+         ' keep-whitespaces '
+       );
+     });
+
+     it('will accept regex-flags for compilation', () => {
+       var patterns = ['some', 'patterns'];
+       assert.strictEqual(crawler.compileRegex(patterns, 'g').flags, 'g');
+       assert.strictEqual(crawler.compileRegex(patterns, 'i').flags, 'i');
+     });
+
+     it('should be case insensitive', () => {
+       assert.strictEqual(crawler.isCrawler('Facebot\\1.0'), true);
+       assert.strictEqual(
+         crawler.getMatches('Facebot\\1.0'),
+         'Facebot',
+         'Crawler was not able to indentify crawler correctly'
+       );
+     });
+   });
+
+   describe('crawler-identification', () => {
+     it('should be able to identify crawlers', async () => {
+       const rl = readline.createInterface({
+         input: fs.createReadStream('./test/lib/database/crawlers.txt'),
+         crlfDelay: Infinity,
+       });
+
+       for await (const line of rl) {
+         assert.strictEqual(
+           crawler.isCrawler(line),
+           true,
+           `${line} is not a crawler`
+         );
+       }
+
+       rl.close();
+     });
+
+     it('should be able to identify devices', async () => {
+       const rl = readline.createInterface({
+         input: fs.createReadStream('./test/lib/database/devices.txt'),
+         crlfDelay: Infinity,
+       });
+
+       for await (const line of rl) {
+         assert.strictEqual(
+           crawler.isCrawler(line),
+           false,
+           `${line} is not a device`
+         );
+       }
+
+       rl.close();
+     });
+
+     it('should identify the crawler from a given headers', async () => {
+       crawler = new Crawler(null, {
+         host: '127.0.0.1:3000',
+         'user-agent': 'curl/7.73.0',
+         accept: '*/*',
+       });
+
+       assert.strictEqual(crawler.isCrawler(), true);
+     });
+
+     it('should identify the crawler from request headers', async () => {
+       crawler = new Crawler({
+         headers: { 'user-agent': 'curl/7.73.0', accept: '*/*' },
+       });
+
+       assert.strictEqual(crawler.isCrawler(), true);
+     });
+   });
+ });
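
The reworked suite drops the ad-hoc Zombie.js and TelegramBot cases in favor of streaming two fixture databases (`crawlers.txt`, `devices.txt`) line by line and asserting every entry. It also replaces the old `remove-whitespaces` expectation with `keeps the whitespace`, pinning down the `compileRegex` change above: trimming the joined source would strip a deliberate leading or trailing space from whichever pattern sits at either end of the list. A sketch of why that matters, with illustrative patterns:

```js
// Patterns such as ' Chrome.[\\d\\.]*' rely on their leading space to match
// a separate token rather than the middle of another word.
const patterns = [' Chrome.[\\d\\.]*', 'Safari.[\\d\\.]*'];

const trimmed = new RegExp(patterns.join('|').trim(), 'i'); // 3.2.0 behavior
const kept = new RegExp(patterns.join('|'), 'i');           // 3.3.0 behavior

console.log(trimmed.test('XChrome/87')); // true -- space lost, matches mid-word
console.log(kept.test('XChrome/87'));    // false -- still requires the space
```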