es6-crawler-detect 4.0.0 → 4.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/package.json +8 -2
- package/.babelrc +0 -19
- package/.github/FUNDING.yml +0 -12
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -41
- package/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
- package/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +0 -29
- package/.mocharc.json +0 -5
- package/.prettierrc.json +0 -5
- package/CODE_OF_CONDUCT.md +0 -76
- package/eslint.config.mjs +0 -50
- package/example/node/index.html +0 -22
- package/example/node/package-lock.json +0 -708
- package/example/node/package.json +0 -11
- package/example/node/server.js +0 -38
- package/src/index.ts +0 -18
- package/src/lib/crawler/crawlers.ts +0 -1397
- package/src/lib/crawler/exclusions.ts +0 -56
- package/src/lib/crawler/headers.ts +0 -18
- package/src/lib/crawler/provider.ts +0 -3
- package/src/lib/crawler.ts +0 -128
- package/src/types.ts +0 -15
- package/test/lib/crawler.spec.ts +0 -116
- package/test/lib/database/crawlers.txt +0 -3651
- package/test/lib/database/devices.txt +0 -165636
- package/tsconfig.json +0 -28
package/src/lib/crawler/exclusions.ts
DELETED
@@ -1,56 +0,0 @@
-import { Provider } from './provider';
-
-export class Exclusions implements Provider {
-  getAll(): string[] {
-    return [
-      'Safari.[\\d\\.]*',
-      'Firefox.[\\d\\.]*',
-      ' Chrome.[\\d\\.]*',
-      'Chromium.[\\d\\.]*',
-      'MSIE.[\\d\\.]',
-      'Opera\\/[\\d\\.]*',
-      'Mozilla.[\\d\\.]*',
-      'AppleWebKit.[\\d\\.]*',
-      'Trident.[\\d\\.]*',
-      'Windows NT.[\\d\\.]*',
-      'Android [\\d\\.]*',
-      'Macintosh.',
-      'Ubuntu',
-      'Linux',
-      '[ ]Intel',
-      'Mac OS X [\\d_]*',
-      '(like )?Gecko(.[\\d\\.]*)?',
-      'KHTML,',
-      'CriOS.[\\d\\.]*',
-      'CPU iPhone OS ([0-9_])* like Mac OS X',
-      'CPU OS ([0-9_])* like Mac OS X',
-      'iPod',
-      'compatible',
-      'x86_..',
-      'i686',
-      'x64',
-      'X11',
-      'rv:[\\d\\.]*',
-      'Version.[\\d\\.]*',
-      'WOW64',
-      'Win64',
-      'Dalvik.[\\d\\.]*',
-      ' \\.NET CLR [\\d\\.]*',
-      'Presto.[\\d\\.]*',
-      'Media Center PC',
-      'BlackBerry',
-      'Build',
-      'Opera Mini\\/\\d{1,2}\\.\\d{1,2}\\.[\\d\\.]*\\/\\d{1,2}\\.',
-      'Opera',
-      ' \\.NET[\\d\\.]*',
-      'cubot',
-      '; M bot',
-      '; CRONO',
-      '; B bot',
-      '; IDbot',
-      '; ID bot',
-      '; POWER BOT',
-      'OCTOPUS-CORE',
-    ];
-  }
-}
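The exclusion patterns above are not crawler signatures; they list common browser and OS tokens that the (also removed) crawler.ts strips out of the user-agent string before testing the crawler patterns. A minimal TypeScript sketch of that stripping step; the import path mirrors the removed source layout and is an assumption:

```typescript
// Sketch only: the import path mirrors the removed source layout.
import { Exclusions } from './src/lib/crawler/exclusions';

// crawler.ts compiled the exclusion list with the 'gi' flags so every
// occurrence of a known browser/OS token is removed from the UA string.
const compiledExclusions = new RegExp(new Exclusions().getAll().join('|'), 'gi');

const ua =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// The crawler patterns are then tested against this stripped remainder;
// if only whitespace is left, isCrawler() returns false early.
const remainder = ua.replace(compiledExclusions, '');
console.log(JSON.stringify(remainder.trim()));
```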
package/src/lib/crawler/headers.ts
DELETED
@@ -1,18 +0,0 @@
-import { Provider } from './provider';
-
-export class Headers implements Provider {
-  getAll(): string[] {
-    return [
-      'USER-AGENT',
-      'X-OPERAMINI-PHONE-UA',
-      'X-DEVICE-USER-AGENT',
-      'X-ORIGINAL-USER-AGENT',
-      'X-SKYFIRE-PHONE',
-      'X-BOLT-PHONE-UA',
-      'DEVICE-STOCK-UA',
-      'X-UCBROWSER-DEVICE-UA',
-      'FROM',
-      'X-SCANNER',
-    ];
-  }
-}
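These header names are the candidates the removed crawler.ts concatenates into a user-agent string when no explicit UA is supplied. A small sketch of that lookup, again assuming the removed module layout is importable:

```typescript
import { IncomingHttpHeaders } from 'http2';
// Sketch only: the import path mirrors the removed source layout.
import { Headers } from './src/lib/crawler/headers';

// Mirror of the removed setUserAgent(): every candidate header that is present
// (lower-cased lookup) is appended to the UA string, separated by spaces.
function userAgentFromHeaders(httpHeaders: IncomingHttpHeaders): string {
  let userAgent = '';
  for (const header of new Headers().getAll()) {
    const value = httpHeaders[header.toLowerCase()];
    if (value === undefined) continue;
    const flat = Array.isArray(value) ? value.join(' ') : value;
    userAgent += (userAgent.length > 0 ? ' ' : '') + flat;
  }
  return userAgent;
}

console.log(userAgentFromHeaders({ 'user-agent': 'curl/7.73.0', from: 'example@example.com' }));
```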
package/src/lib/crawler.ts
DELETED
@@ -1,128 +0,0 @@
-import { Request } from 'express-serve-static-core';
-import { IncomingHttpHeaders } from 'http2';
-
-import { Crawlers } from './crawler/crawlers';
-import { Exclusions } from './crawler/exclusions';
-import { Headers } from './crawler/headers';
-
-export class Crawler {
-  private crawlers: Crawlers;
-  private headers: Headers;
-  private exclusions: Exclusions;
-
-  private request: Request | NodeJS.Dict<string | string[]>;
-  private compiledRegexList: RegExp;
-  private compiledExclusions: RegExp;
-  private httpHeaders: string | string[] | IncomingHttpHeaders;
-  private userAgent: string;
-  private matches?: RegExpExecArray | null;
-
-  constructor(
-    request?: Request,
-    headers?: IncomingHttpHeaders,
-    userAgent?: string
-  ) {
-    this.crawlers = new Crawlers();
-    this.headers = new Headers();
-    this.exclusions = new Exclusions();
-
-    this.request = request ?? {};
-
-    // The regex-list must not be used with g-flag!
-    // See: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
-    this.compiledRegexList = this.compileRegex(this.crawlers.getAll(), 'i');
-
-    // The exclusions should be used with g-flag in order to remove each value.
-    this.compiledExclusions = this.compileRegex(this.exclusions.getAll(), 'gi');
-
-    this.httpHeaders = this.setHttpHeaders(headers);
-    this.userAgent = this.setUserAgent(userAgent);
-  }
-
-  public compileRegex(patterns: string[], flags?: string): RegExp {
-    return new RegExp(patterns.join('|'), flags);
-  }
-
-  private setHttpHeaders(
-    headers?: string | string[] | IncomingHttpHeaders
-  ): string | string[] | IncomingHttpHeaders {
-    // Use the Request headers if httpHeaders is not defined
-    if (!headers || Object.keys(headers).length === 0) {
-      if (Object.keys(this.request).length) {
-        if (this.request.headers) {
-          return this.request.headers;
-        }
-      }
-
-      return '';
-    }
-
-    // Save the headers.
-    return headers;
-  }
-
-  private setUserAgent(userAgent?: string): string {
-    if (!userAgent?.length) {
-      userAgent = '';
-      for (const header of this.getUaHttpHeaders()) {
-        if (
-          typeof this.httpHeaders === 'object' &&
-          !Array.isArray(this.httpHeaders)
-        ) {
-          if (Object.hasOwn(this.httpHeaders, header.toLowerCase())) {
-            const headerValue = this.httpHeaders[header.toLowerCase()];
-
-            if (typeof headerValue === 'string') {
-              const separator = userAgent.length > 0 ? ' ' : '';
-              userAgent += separator + headerValue;
-            } else if (Array.isArray(headerValue)) {
-              const separator = userAgent.length > 0 ? ' ' : '';
-              userAgent += separator + headerValue.join(' ');
-            }
-          }
-        }
-      }
-    }
-
-    return userAgent;
-  }
-
-  private getUaHttpHeaders() {
-    return this.headers.getAll();
-  }
-
-  public getMatches(): string | null | object {
-    if (this.matches !== undefined) {
-      if (this.matches?.length) {
-        return this.matches[0];
-      }
-
-      return null;
-    }
-
-    return {};
-  }
-
-  public isCrawler(userAgent?: string): boolean {
-    if (Buffer.byteLength(userAgent ?? '', 'utf8') > 4096) {
-      return false;
-    }
-
-    let agent = userAgent ?? this.userAgent;
-
-    // test on compiled regx
-    agent = agent.replace(this.compiledExclusions, '');
-
-    if (agent.trim().length === 0) {
-      return false;
-    }
-
-    const matches = this.compiledRegexList.exec(agent);
-
-    if (matches) {
-      this.matches = matches;
-    }
-
-    return matches !== null && matches.length > 0;
-  }
-}
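A short usage sketch of the removed Crawler class, consistent with the expectations in the test suite further down (Facebot and curl user-agents). It assumes the package entry point re-exports Crawler, as the removed src/index.ts and the tests suggest:

```typescript
// Assumption: the published entry point re-exports Crawler.
import { Crawler } from 'es6-crawler-detect';

// Standalone use: pass the UA string directly to isCrawler().
const detector = new Crawler();
console.log(detector.isCrawler('Facebot\\1.0')); // true, per the removed test suite
console.log(detector.getMatches());              // 'Facebot', the first matched substring

// Header-based use: the UA is assembled from the header names in headers.ts.
const fromHeaders = new Crawler(undefined, {
  host: '127.0.0.1:3000',
  'user-agent': 'curl/7.73.0',
  accept: '*/*',
});
console.log(fromHeaders.isCrawler()); // true, per the removed test suite
```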
package/src/types.ts
DELETED
@@ -1,15 +0,0 @@
-import { NextFunction, Request, Response } from 'express-serve-static-core';
-
-import { Crawler } from './lib/crawler';
-
-export type Middleware = (
-  request: Request,
-  response: Response,
-  next: NextFunction
-) => Promise<Response>;
-
-declare module 'express-serve-static-core' {
-  interface Request {
-    Crawler: Crawler;
-  }
-}
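The module augmentation above adds a `Crawler` property to Express's `Request`. A hypothetical wiring consistent with it (route names and port are illustrative only; the removed example/node/server.js is not reproduced here):

```typescript
// Hypothetical sketch, not taken from the package sources.
import express from 'express';
import { Crawler } from 'es6-crawler-detect';

const app = express();

// Attach a Crawler built from the incoming request, matching the
// Request augmentation declared in the removed types.ts.
app.use((request, response, next) => {
  request.Crawler = new Crawler(request);
  next();
});

app.get('/', (request, response) => {
  response.json({
    isCrawler: request.Crawler.isCrawler(),
    match: request.Crawler.getMatches(),
  });
});

app.listen(3000);
```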
package/test/lib/crawler.spec.ts
DELETED
@@ -1,116 +0,0 @@
-import assert from 'assert';
-import { Request } from 'express-serve-static-core';
-import * as fs from 'fs';
-import * as readline from 'readline';
-
-import { Crawler } from '../../src';
-
-declare module 'express-serve-static-core' {
-  interface Request {
-    Crawler: Crawler;
-  }
-}
-
-let crawler = new Crawler();
-
-describe('regex-compilation', () => {
-  it('will join list of patterns with pipes', () => {
-    assert.strictEqual(
-      crawler.compileRegex(['some', 'patterns']).source,
-      'some|patterns'
-    );
-    assert.strictEqual(crawler.compileRegex(['single']).source, 'single');
-  });
-
-  it('keeps the whitespace', () => {
-    assert.strictEqual(
-      crawler.compileRegex([' keep-whitespaces ']).source,
-      ' keep-whitespaces '
-    );
-  });
-
-  it('will accept regex-flags for compilation', () => {
-    const patterns = ['some', 'patterns'];
-    assert.strictEqual(crawler.compileRegex(patterns, 'g').flags, 'g');
-    assert.strictEqual(crawler.compileRegex(patterns, 'i').flags, 'i');
-  });
-
-  it('should be case insensitive', () => {
-    assert.strictEqual(crawler.isCrawler('Facebot\\1.0'), true);
-    assert.strictEqual(
-      crawler.getMatches(),
-      'Facebot',
-      'Crawler was not able to identify crawler correctly'
-    );
-  });
-});
-
-describe('crawler-identification', () => {
-  it('should be able to identify crawlers', async () => {
-    const rl = readline.createInterface({
-      input: fs.createReadStream('./test/lib/database/crawlers.txt'),
-      crlfDelay: Infinity,
-    });
-
-    for await (const line of rl) {
-      assert.strictEqual(
-        crawler.isCrawler(line),
-        true,
-        `${line} is not a crawler`
-      );
-    }
-
-    rl.close();
-  });
-
-  it('should be able to identify devices', async () => {
-    const rl = readline.createInterface({
-      input: fs.createReadStream('./test/lib/database/devices.txt'),
-      crlfDelay: Infinity,
-    });
-
-    for await (const line of rl) {
-      assert.strictEqual(
-        crawler.isCrawler(line),
-        false,
-        `${line} is not a device`
-      );
-    }
-
-    rl.close();
-  });
-
-  it('should identify the crawler from given headers', async () => {
-    crawler = new Crawler(undefined, {
-      host: '127.0.0.1:3000',
-      'user-agent': 'curl/7.73.0',
-      accept: '*/*',
-    });
-
-    assert.strictEqual(crawler.isCrawler(), true);
-  });
-
-  it('should identify the crawler from request headers', async () => {
-    crawler = new Crawler({
-      headers: { 'user-agent': 'curl/7.73.0', accept: '*/*' },
-    } as Request);
-
-    assert.strictEqual(crawler.isCrawler(), true);
-  });
-
-  it('should identify the crawler from request headers with exact pattern', async () => {
-    crawler = new Crawler({
-      headers: { 'user-agent': 'b0t', accept: '*/*' },
-    } as Request);
-
-    assert.strictEqual(crawler.isCrawler(), true);
-  });
-
-  it('should not throw an exception on empty request header', async () => {
-    crawler = new Crawler({
-      headers: { accept: '*/*' },
-    } as Request);
-
-    assert.doesNotThrow(() => crawler.isCrawler());
-  });
-});