@nitpicker/crawler 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/CHANGELOG.md +0 -16
- package/src/archive/__mock__/.gitignore +0 -3
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +0 -337
- package/src/archive/archive.ts +0 -408
- package/src/archive/database.spec.ts +0 -469
- package/src/archive/database.ts +0 -1059
- package/src/archive/debug.ts +0 -10
- package/src/archive/filesystem/append-text.spec.ts +0 -26
- package/src/archive/filesystem/append-text.ts +0 -16
- package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
- package/src/archive/filesystem/copy-dir-sync.ts +0 -10
- package/src/archive/filesystem/copy-dir.spec.ts +0 -33
- package/src/archive/filesystem/copy-dir.ts +0 -14
- package/src/archive/filesystem/exists.spec.ts +0 -33
- package/src/archive/filesystem/exists.ts +0 -10
- package/src/archive/filesystem/get-file-list.spec.ts +0 -37
- package/src/archive/filesystem/get-file-list.ts +0 -13
- package/src/archive/filesystem/index.ts +0 -17
- package/src/archive/filesystem/is-dir.spec.ts +0 -29
- package/src/archive/filesystem/is-dir.ts +0 -11
- package/src/archive/filesystem/mkdir.spec.ts +0 -37
- package/src/archive/filesystem/mkdir.ts +0 -16
- package/src/archive/filesystem/output-json.spec.ts +0 -34
- package/src/archive/filesystem/output-json.ts +0 -16
- package/src/archive/filesystem/output-text.spec.ts +0 -31
- package/src/archive/filesystem/output-text.ts +0 -35
- package/src/archive/filesystem/read-json.spec.ts +0 -26
- package/src/archive/filesystem/read-json.ts +0 -12
- package/src/archive/filesystem/read-text.spec.ts +0 -25
- package/src/archive/filesystem/read-text.ts +0 -11
- package/src/archive/filesystem/readline.spec.ts +0 -29
- package/src/archive/filesystem/readline.ts +0 -30
- package/src/archive/filesystem/remove.spec.ts +0 -34
- package/src/archive/filesystem/remove.ts +0 -11
- package/src/archive/filesystem/rename.spec.ts +0 -46
- package/src/archive/filesystem/rename.ts +0 -21
- package/src/archive/filesystem/tar.spec.ts +0 -33
- package/src/archive/filesystem/tar.ts +0 -27
- package/src/archive/filesystem/untar.spec.ts +0 -34
- package/src/archive/filesystem/untar.ts +0 -36
- package/src/archive/index.ts +0 -13
- package/src/archive/page.spec.ts +0 -368
- package/src/archive/page.ts +0 -420
- package/src/archive/resource.spec.ts +0 -101
- package/src/archive/resource.ts +0 -73
- package/src/archive/safe-path.spec.ts +0 -44
- package/src/archive/safe-path.ts +0 -18
- package/src/archive/types.ts +0 -227
- package/src/crawler/clear-destination-cache.spec.ts +0 -20
- package/src/crawler/clear-destination-cache.ts +0 -9
- package/src/crawler/crawler.ts +0 -873
- package/src/crawler/decompose-url.spec.ts +0 -48
- package/src/crawler/decompose-url.ts +0 -90
- package/src/crawler/destination-cache.spec.ts +0 -23
- package/src/crawler/destination-cache.ts +0 -8
- package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
- package/src/crawler/detect-pagination-pattern.ts +0 -66
- package/src/crawler/fetch-destination.ts +0 -257
- package/src/crawler/fetch-robots-txt.spec.ts +0 -83
- package/src/crawler/fetch-robots-txt.ts +0 -91
- package/src/crawler/find-best-matching-scope.spec.ts +0 -39
- package/src/crawler/find-best-matching-scope.ts +0 -57
- package/src/crawler/generate-predicted-urls.spec.ts +0 -42
- package/src/crawler/generate-predicted-urls.ts +0 -34
- package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
- package/src/crawler/handle-ignore-and-skip.ts +0 -30
- package/src/crawler/handle-resource-response.spec.ts +0 -45
- package/src/crawler/handle-resource-response.ts +0 -21
- package/src/crawler/handle-scrape-end.spec.ts +0 -109
- package/src/crawler/handle-scrape-end.ts +0 -115
- package/src/crawler/handle-scrape-error.spec.ts +0 -105
- package/src/crawler/handle-scrape-error.ts +0 -58
- package/src/crawler/index.ts +0 -2
- package/src/crawler/inject-scope-auth.spec.ts +0 -36
- package/src/crawler/inject-scope-auth.ts +0 -27
- package/src/crawler/is-external-url.spec.ts +0 -31
- package/src/crawler/is-external-url.ts +0 -17
- package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
- package/src/crawler/is-in-any-lower-layer.ts +0 -22
- package/src/crawler/link-list.spec.ts +0 -355
- package/src/crawler/link-list.ts +0 -275
- package/src/crawler/link-to-page-data.spec.ts +0 -133
- package/src/crawler/link-to-page-data.ts +0 -34
- package/src/crawler/net-timeout-error.spec.ts +0 -25
- package/src/crawler/net-timeout-error.ts +0 -11
- package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
- package/src/crawler/protocol-agnostic-key.ts +0 -11
- package/src/crawler/reconstruct-url.spec.ts +0 -37
- package/src/crawler/reconstruct-url.ts +0 -37
- package/src/crawler/robots-checker.spec.ts +0 -104
- package/src/crawler/robots-checker.ts +0 -73
- package/src/crawler/should-discard-predicted.spec.ts +0 -125
- package/src/crawler/should-discard-predicted.ts +0 -33
- package/src/crawler/should-skip-url.spec.ts +0 -77
- package/src/crawler/should-skip-url.ts +0 -37
- package/src/crawler/types.ts +0 -146
- package/src/crawler-orchestrator.ts +0 -401
- package/src/debug.ts +0 -10
- package/src/index.ts +0 -25
- package/src/types.ts +0 -30
- package/src/utils/array/each-splitted.spec.ts +0 -38
- package/src/utils/array/each-splitted.ts +0 -19
- package/src/utils/array/index.ts +0 -1
- package/src/utils/debug.ts +0 -6
- package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
- package/src/utils/error/dom-evaluation-error.ts +0 -6
- package/src/utils/error/error-emitter.spec.ts +0 -78
- package/src/utils/error/error-emitter.ts +0 -44
- package/src/utils/error/index.ts +0 -3
- package/src/utils/index.ts +0 -5
- package/src/utils/object/clean-object.spec.ts +0 -24
- package/src/utils/object/clean-object.ts +0 -13
- package/src/utils/object/index.ts +0 -1
- package/src/utils/types/index.ts +0 -1
- package/src/utils/types/types.ts +0 -65
- package/tsconfig.json +0 -11
- package/tsconfig.tsbuildinfo +0 -1
|
@@ -1,133 +0,0 @@
|
|
|
1
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
2
|
-
import { describe, it, expect } from 'vitest';
|
|
3
|
-
|
|
4
|
-
import { linkToPageData } from './link-to-page-data.js';
|
|
5
|
-
|
|
6
|
-
/**
 * Builds a link fixture for the specs in this file.
 *
 * The fixture points at `https://example.com/page` and defaults to an
 * internal (`isExternal: false`), lower-layer (`isLowerLayer: true`) link.
 * @param overrides - Properties spread last, so they replace the defaults or add extra fields such as `dest`.
 * @returns The fixture object with `overrides` applied.
 */
function createLink(overrides: Record<string, unknown> = {}) {
	return {
		url: parseUrl('https://example.com/page')!,
		isExternal: false,
		isLowerLayer: true,
		...overrides,
	};
}
|
|
19
|
-
|
|
20
|
-
describe('linkToPageData', () => {
	// Builds a `dest` payload with status 200/OK and every nullable field set
	// to null, so each case only spells out the fields it exercises.
	const destWith = (fields: Record<string, unknown> = {}) => ({
		redirectPaths: [],
		status: 200,
		statusText: 'OK',
		contentType: null,
		contentLength: null,
		responseHeaders: null,
		...fields,
	});

	// Converts a fixture link built from `overrides` in one step.
	const convert = (overrides?: Record<string, unknown>) =>
		linkToPageData(createLink(overrides));

	it('converts link without dest to default PageData', () => {
		const link = createLink();
		const pageData = linkToPageData(link);
		expect(pageData.url).toBe(link.url);
		expect(pageData.status).toBe(-1);
		expect(pageData.statusText).toBe('UnknownError');
		expect(pageData.contentType).toBeNull();
		expect(pageData.contentLength).toBeNull();
		expect(pageData.responseHeaders).toBeNull();
		expect(pageData.redirectPaths).toStrictEqual([]);
		expect(pageData.meta.title).toBe('');
		expect(pageData.anchorList).toStrictEqual([]);
		expect(pageData.imageList).toStrictEqual([]);
		expect(pageData.html).toBe('');
		expect(pageData.isSkipped).toBe(false);
	});

	it('sets isTarget=true for internal link', () => {
		const pageData = convert({ isExternal: false });
		expect(pageData.isTarget).toBe(true);
		expect(pageData.isExternal).toBe(false);
	});

	it('sets isTarget=false for external link', () => {
		const pageData = convert({ isExternal: true });
		expect(pageData.isTarget).toBe(false);
		expect(pageData.isExternal).toBe(true);
	});

	it('uses dest data when available', () => {
		const pageData = convert({
			dest: destWith({
				redirectPaths: ['/redirect'],
				contentType: 'text/html',
				contentLength: 1024,
				responseHeaders: { 'content-type': 'text/html' },
				title: 'Test Page',
			}),
		});
		expect(pageData.status).toBe(200);
		expect(pageData.statusText).toBe('OK');
		expect(pageData.contentType).toBe('text/html');
		expect(pageData.contentLength).toBe(1024);
		expect(pageData.redirectPaths).toStrictEqual(['/redirect']);
		expect(pageData.meta.title).toBe('Test Page');
	});

	it('uses dest title when provided', () => {
		const pageData = convert({
			dest: destWith({
				contentType: 'text/html',
				contentLength: 0,
				responseHeaders: {},
				title: 'My Title',
			}),
		});
		expect(pageData.meta.title).toBe('My Title');
	});

	it('falls back to empty title when dest has no title', () => {
		const pageData = convert({ dest: destWith() });
		expect(pageData.meta.title).toBe('');
	});

	// The next two cases pin the `||` fallback: a falsy-but-present value is
	// replaced by the default exactly as if it were missing.
	it('records current behavior: status 0 becomes -1 via || operator', () => {
		const pageData = convert({ dest: destWith({ status: 0 }) });
		expect(pageData.status).toBe(-1);
	});

	it('records current behavior: contentLength 0 becomes null via || operator', () => {
		const pageData = convert({ dest: destWith({ contentLength: 0 }) });
		expect(pageData.contentLength).toBeNull();
	});
});
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
import type { Link, PageData } from '../utils/index.js';
|
|
2
|
-
|
|
3
|
-
/**
 * Builds a {@link PageData} record from a {@link Link} without any scraped content.
 *
 * Response metadata is copied from `link.dest` when present. Page content
 * fields are always empty: `anchorList` and `imageList` are `[]`, `html` is
 * `''`, and `isSkipped` is `false`.
 *
 * Every `dest` field is read with `||`, so a missing *or falsy* value falls
 * back to its default: `status` → `-1`, `statusText` → `'UnknownError'`,
 * `contentType` / `contentLength` / `responseHeaders` → `null`,
 * `redirectPaths` → `[]`, `title` → `''`. In particular a `status` of `0`
 * becomes `-1` and a `contentLength` of `0` becomes `null`.
 * @param link - The link to convert; `dest` may be absent.
 * @returns A PageData whose `isTarget` is the negation of `link.isExternal`.
 */
export function linkToPageData(link: Link): PageData {
	const { url, isExternal, dest } = link;

	return {
		url,
		redirectPaths: dest?.redirectPaths || [],
		isTarget: !isExternal,
		isExternal,
		status: dest?.status || -1,
		statusText: dest?.statusText || 'UnknownError',
		contentType: dest?.contentType || null,
		contentLength: dest?.contentLength || null,
		responseHeaders: dest?.responseHeaders || null,
		meta: {
			title: dest?.title || '',
		},
		anchorList: [],
		imageList: [],
		html: '',
		isSkipped: false,
	};
}
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
|
|
3
|
-
import NetTimeoutError from './net-timeout-error.js';
|
|
4
|
-
|
|
5
|
-
describe('NetTimeoutError', () => {
	it('has the name NetTimeoutError', () => {
		const { name } = new NetTimeoutError();
		expect(name).toBe('NetTimeoutError');
	});

	it('includes the URL in the message when provided', () => {
		const { message } = new NetTimeoutError('https://example.com/');
		expect(message).toBe('Timeout: https://example.com/');
	});

	it('uses a generic message when no URL is provided', () => {
		const { message } = new NetTimeoutError();
		expect(message).toBe('Timeout');
	});

	it('is an instance of Error', () => {
		expect(new NetTimeoutError()).toBeInstanceOf(Error);
	});
});
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
/**
 * Error representing a network request that timed out.
 *
 * The message is `Timeout: <url>` when a non-empty URL is supplied and the
 * bare string `Timeout` otherwise. `name` is fixed to `'NetTimeoutError'`
 * so the error can be identified by name.
 */
export default class NetTimeoutError extends Error {
	override name = 'NetTimeoutError';

	/**
	 * @param url - URL of the request that timed out; omit it (or pass an empty string) for the generic message.
	 */
	constructor(url?: string) {
		const message = url ? `Timeout: ${url}` : 'Timeout';
		super(message);
	}
}
|
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
|
|
3
|
-
import { protocolAgnosticKey } from './protocol-agnostic-key.js';
|
|
4
|
-
|
|
5
|
-
describe('protocolAgnosticKey', () => {
	it('strips https: from URL', () => {
		const key = protocolAgnosticKey('https://example.com/page');
		expect(key).toBe('//example.com/page');
	});

	it('strips http: from URL', () => {
		const key = protocolAgnosticKey('http://example.com/page');
		expect(key).toBe('//example.com/page');
	});

	it('produces the same key for http and https', () => {
		const secureKey = protocolAgnosticKey('https://example.com/page');
		expect(protocolAgnosticKey('http://example.com/page')).toBe(secureKey);
	});

	it('handles URLs with ports', () => {
		const key = protocolAgnosticKey('https://example.com:8080/page');
		expect(key).toBe('//example.com:8080/page');
	});

	it('does not strip non-HTTP protocols', () => {
		const ftpUrl = 'ftp://example.com/file';
		expect(protocolAgnosticKey(ftpUrl)).toBe('ftp://example.com/file');
	});

	it('handles URL without path', () => {
		const key = protocolAgnosticKey('https://example.com');
		expect(key).toBe('//example.com');
	});

	it('handles withoutHashAndAuth style strings', () => {
		// withoutHashAndAuth produces strings like "https://example.com/path"
		const [httpKey, httpsKey] = ['http://example.com/path', 'https://example.com/path'].map(
			(url) => protocolAgnosticKey(url),
		);
		expect(httpKey).toBe(httpsKey);
	});
});
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
/**
 * Returns a URL string with a leading `http:` / `https:` scheme removed.
 *
 * The result works as a key under which the HTTP and HTTPS variants of the
 * same URL compare equal. The scheme is matched case-insensitively because
 * URI schemes are case-insensitive, so `HTTP://…` and `http://…` produce the
 * same key. Any other scheme (e.g. `ftp:`) is left untouched, and only a
 * scheme at the very start of the string is removed.
 * @param url - A URL string (e.g. `"https://example.com/page"`)
 * @returns The URL without its protocol prefix (e.g. `"//example.com/page"`)
 */
export function protocolAgnosticKey(url: string): string {
	return url.replace(/^https?:/i, '');
}
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
-
|
|
3
|
-
import { decomposeUrl } from './decompose-url.js';
|
|
4
|
-
import { reconstructUrl } from './reconstruct-url.js';
|
|
5
|
-
|
|
6
|
-
describe('reconstructUrl', () => {
	it('replaces a path segment token', () => {
		const rebuilt = reconstructUrl(decomposeUrl('//example.com/page/3')!, 1, '4');
		expect(rebuilt).toBe('//example.com/page/4');
	});

	it('replaces a query value token', () => {
		const rebuilt = reconstructUrl(decomposeUrl('//example.com/list?p=2&sort=name')!, 1, '3');
		expect(rebuilt).toBe('//example.com/list?p=3&sort=name');
	});

	it('preserves protocol', () => {
		const rebuilt = reconstructUrl(decomposeUrl('https://example.com/page/1')!, 1, '2');
		expect(rebuilt).toBe('https://example.com/page/2');
	});

	it('encodes special characters in query keys and values', () => {
		const source = decomposeUrl('//example.com/search?q=hello&lang=en')!;
		// queryKeys are sorted: ['lang', 'q'], tokenIndex 1 replaces lang's value
		const rebuilt = reconstructUrl(source, 1, 'a&b=c');
		expect(rebuilt).toContain('lang=a%26b%3Dc');
	});

	it('encodes unicode characters in query values', () => {
		const replacement = '日本語';
		const rebuilt = reconstructUrl(decomposeUrl('//example.com/search?q=test')!, 1, replacement);
		expect(rebuilt).toContain(encodeURIComponent('日本語'));
	});
});
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
import type { DecomposedUrl } from './decompose-url.js';
|
|
2
|
-
|
|
3
|
-
/**
 * Rebuilds a URL string from its decomposed parts, swapping in one new token.
 *
 * Tokens are indexed across the path segments first and the query values
 * after them: an index below `pathSegments.length` replaces a path segment,
 * any other index replaces the query value at `tokenIndex - pathSegments.length`.
 * The input arrays are copied, never mutated.
 *
 * Path segments are joined verbatim with `/`; query keys and values are
 * passed through `encodeURIComponent`, and a key without a matching value
 * gets an empty value.
 * @param decomposed - The decomposed URL to reconstruct
 * @param tokenIndex - Index in the combined token array (path segments + query values)
 * @param newValue - The replacement value for the token at `tokenIndex`
 * @returns The reconstructed URL string (`<protocol>//<host>[/path][?query]`)
 */
export function reconstructUrl(
	decomposed: DecomposedUrl,
	tokenIndex: number,
	newValue: string,
): string {
	const pathCount = decomposed.pathSegments.length;
	const segments = [...decomposed.pathSegments];
	const values = [...decomposed.queryValues];

	// Write the replacement into whichever copy the combined index falls in.
	if (tokenIndex < pathCount) {
		segments[tokenIndex] = newValue;
	} else {
		values[tokenIndex - pathCount] = newValue;
	}

	const parts = [`${decomposed.protocol}//${decomposed.host}`];

	if (segments.length > 0) {
		parts.push(`/${segments.join('/')}`);
	}

	if (decomposed.queryKeys.length > 0) {
		const query = decomposed.queryKeys
			.map((key, index) => {
				const value = values[index] ?? '';
				return `${encodeURIComponent(key)}=${encodeURIComponent(value)}`;
			})
			.join('&');
		parts.push(`?${query}`);
	}

	return parts.join('');
}
|
|
@@ -1,104 +0,0 @@
|
|
|
1
|
-
import http from 'node:http';
|
|
2
|
-
|
|
3
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
4
|
-
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
|
5
|
-
|
|
6
|
-
import { RobotsChecker } from './robots-checker.js';
|
|
7
|
-
|
|
8
|
-
// robots.txt served by the shared fixture server: everyone is barred from
// /secret/, and the `Nitpicker` agent group bars /admin/.
const ROBOTS_TXT = `
User-agent: *
Disallow: /secret/
Allow: /

User-agent: Nitpicker
Disallow: /admin/
`;

// Fixture HTTP server shared by the cases below, plus the port it listens on.
let server: http.Server;
let port: number;

beforeAll(async () => {
	server = http.createServer((req, res) => {
		const isRobotsRequest = req.url === '/robots.txt';
		if (!isRobotsRequest) {
			// Any other path answers with a plain 200.
			res.writeHead(200);
			res.end('ok');
			return;
		}
		res.writeHead(200, { 'Content-Type': 'text/plain' });
		res.end(ROBOTS_TXT);
	});

	// Port 0 lets the OS pick a free port; wait until the server is listening.
	await new Promise<void>((resolve) => {
		server.listen(0, () => resolve());
	});

	const address = server.address();
	port = typeof address === 'object' && address ? address.port : 0;
});

afterAll(async () => {
	await new Promise<void>((resolve) => {
		server.close(() => resolve());
	});
});
|
|
42
|
-
|
|
43
|
-
describe('RobotsChecker', () => {
	/**
	 * Starts `target` on an OS-assigned port.
	 * @param target - The server to start.
	 * @returns The port it listens on, or 0 when the address is not an object.
	 */
	async function listenOnFreePort(target: http.Server): Promise<number> {
		await new Promise<void>((resolve) => {
			target.listen(0, () => resolve());
		});
		const address = target.address();
		return typeof address === 'object' && address ? address.port : 0;
	}

	/**
	 * Stops `target`.
	 * @param target - The server to close.
	 * @returns A promise resolved from the `close` callback.
	 */
	function closeServer(target: http.Server): Promise<void> {
		return new Promise<void>((resolve) => {
			target.close(() => resolve());
		});
	}

	it('blocks URLs disallowed by robots.txt', async () => {
		const checker = new RobotsChecker('Nitpicker', true);
		const url = parseUrl(`http://127.0.0.1:${port}/admin/settings`)!;
		const result = await checker.isAllowed(url);
		expect(result).toBe(false);
	});

	it('allows URLs permitted by robots.txt', async () => {
		const checker = new RobotsChecker('Nitpicker', true);
		const url = parseUrl(`http://127.0.0.1:${port}/public/page`)!;
		const result = await checker.isAllowed(url);
		expect(result).toBe(true);
	});

	it('always allows when disabled', async () => {
		const checker = new RobotsChecker('Nitpicker', false);
		const url = parseUrl(`http://127.0.0.1:${port}/admin/settings`)!;
		const result = await checker.isAllowed(url);
		expect(result).toBe(true);
	});

	it('allows non-HTTP URLs', async () => {
		const checker = new RobotsChecker('Nitpicker', true);
		const url = parseUrl('mailto:test@example.com')!;
		const result = await checker.isAllowed(url);
		expect(result).toBe(true);
	});

	it('allows all URLs when robots.txt is absent', async () => {
		// A server that answers 404 to everything, including /robots.txt.
		const noRobotsServer = http.createServer((_req, res) => {
			res.writeHead(404);
			res.end();
		});
		const noRobotsPort = await listenOnFreePort(noRobotsServer);

		try {
			const checker = new RobotsChecker('Nitpicker', true);
			const url = parseUrl(`http://127.0.0.1:${noRobotsPort}/secret/page`)!;
			const result = await checker.isAllowed(url);
			expect(result).toBe(true);
		} finally {
			await closeServer(noRobotsServer);
		}
	});

	it('caches robots.txt per origin', async () => {
		// A dedicated server counts /robots.txt requests so the case can prove
		// that the second check is answered from the cache.
		let robotsTxtRequests = 0;
		const countingServer = http.createServer((req, res) => {
			if (req.url === '/robots.txt') {
				robotsTxtRequests++;
				res.writeHead(200, { 'Content-Type': 'text/plain' });
				res.end(ROBOTS_TXT);
				return;
			}
			res.writeHead(200);
			res.end('ok');
		});
		const countingPort = await listenOnFreePort(countingServer);

		try {
			const checker = new RobotsChecker('Nitpicker', true);
			const url1 = parseUrl(`http://127.0.0.1:${countingPort}/admin/page1`)!;
			const url2 = parseUrl(`http://127.0.0.1:${countingPort}/admin/page2`)!;

			await checker.isAllowed(url1);
			const requestsAfterFirstCheck = robotsTxtRequests;

			const result = await checker.isAllowed(url2);
			expect(result).toBe(false);
			// The first check must have fetched robots.txt ...
			expect(requestsAfterFirstCheck).toBeGreaterThan(0);
			// ... and the second check on the same origin must not fetch again.
			expect(robotsTxtRequests).toBe(requestsAfterFirstCheck);
		} finally {
			await closeServer(countingServer);
		}
	});
});
|
|
@@ -1,73 +0,0 @@
|
|
|
1
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
2
|
-
|
|
3
|
-
import { crawlerLog } from '../debug.js';
|
|
4
|
-
|
|
5
|
-
import { fetchRobotsTxt } from './fetch-robots-txt.js';
|
|
6
|
-
|
|
7
|
-
/**
 * Builds an origin string of the form `<protocol>//<hostname>[:<port>]`
 * from an ExURL (e.g. `https://example.com:8080`).
 *
 * The `:<port>` suffix is added only when `url.port` is truthy.
 * @param url - The extended URL.
 * @returns The origin string.
 */
function getOrigin(url: ExURL): string {
	const portSuffix = url.port ? `:${url.port}` : '';
	return `${url.protocol}//${url.hostname}${portSuffix}`;
}
|
|
15
|
-
|
|
16
|
-
/**
 * Checks whether a URL is allowed by the site's robots.txt rules.
 *
 * robots.txt is fetched lazily per origin and cached. The cache stores the
 * fetch itself (not just its result), so concurrent checks against the
 * same origin share a single request.
 * When disabled (i.e., `ignoreRobots` mode), all URLs are allowed.
 */
export class RobotsChecker {
	/**
	 * robots.txt fetch per origin, in flight or settled. A fetch that
	 * resolves to a falsy value is treated as "no rules" for that origin.
	 */
	readonly #cache = new Map<string, ReturnType<typeof fetchRobotsTxt>>();
	/** When `false`, robots.txt checking is disabled and all URLs are allowed. */
	readonly #enabled: boolean;
	/** User-Agent string passed to `fetchRobotsTxt` and used for rule matching. */
	readonly #userAgent: string;

	/**
	 * Create a new RobotsChecker.
	 * @param userAgent - User-Agent string for rule matching and fetching robots.txt.
	 * @param enabled - Whether robots.txt checking is enabled. When `false`, {@link isAllowed} always returns `true`.
	 */
	constructor(userAgent: string, enabled: boolean) {
		this.#userAgent = userAgent;
		this.#enabled = enabled;
	}

	/**
	 * Check whether the given URL is allowed by the site's robots.txt.
	 *
	 * Returns `true` without fetching when checking is disabled or the URL
	 * is not HTTP(S). Otherwise robots.txt for the URL's origin is fetched on
	 * first access and reused afterwards. The URL is blocked only when the
	 * parsed rules answer exactly `false`; any other answer allows it.
	 *
	 * If the fetch rejects, the rejection propagates to every caller that
	 * was waiting on it and the cache entry is dropped, so a later call
	 * retries the fetch.
	 * @param url - The URL to check.
	 * @returns `true` if the URL is allowed, `false` if blocked.
	 */
	async isAllowed(url: ExURL): Promise<boolean> {
		if (!this.#enabled) {
			return true;
		}

		if (!url.isHTTP) {
			return true;
		}

		const origin = getOrigin(url);
		if (!this.#cache.has(origin)) {
			crawlerLog('Fetching robots.txt for %s', origin);
			// Store the pending fetch *before* awaiting it, so a concurrent
			// call for the same origin finds the entry and does not fetch again.
			this.#cache.set(origin, fetchRobotsTxt(origin, this.#userAgent));
		}

		const pending = this.#cache.get(origin);
		let robot: Awaited<ReturnType<typeof fetchRobotsTxt>> | undefined;
		try {
			robot = await pending;
		} catch (error) {
			// Do not keep a failed fetch cached; only evict our own entry in
			// case a retry has already replaced it.
			if (this.#cache.get(origin) === pending) {
				this.#cache.delete(origin);
			}
			throw error;
		}

		if (!robot) {
			return true;
		}

		const allowed = robot.isAllowed(url.href, this.#userAgent);
		return allowed !== false;
	}
}
|
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
import type { ScrapeResult } from '@d-zero/beholder';
|
|
2
|
-
|
|
3
|
-
import { describe, expect, it } from 'vitest';
|
|
4
|
-
|
|
5
|
-
import { shouldDiscardPredicted } from './should-discard-predicted.js';
|
|
6
|
-
|
|
7
|
-
describe('shouldDiscardPredicted', () => {
	/**
	 * Builds a `success` scrape result fixture. Fields not listed in `page`
	 * use the values shared by most cases: not a target, zero content length,
	 * empty title and empty HTML.
	 * @param page - The per-case page fields.
	 * @param page.status
	 * @param page.statusText
	 * @param page.isTarget
	 * @param page.contentLength
	 * @param page.title
	 * @param page.html
	 */
	function successResult(page: {
		status: number;
		statusText: string;
		isTarget?: boolean;
		contentLength?: number;
		title?: string;
		html?: string;
	}): ScrapeResult {
		return {
			type: 'success',
			resources: [],
			pageData: {
				url: {} as never,
				redirectPaths: [],
				isTarget: page.isTarget ?? false,
				isExternal: false,
				status: page.status,
				statusText: page.statusText,
				contentType: 'text/html',
				contentLength: page.contentLength ?? 0,
				responseHeaders: {},
				meta: { title: page.title ?? '' },
				anchorList: [],
				imageList: [],
				html: page.html ?? '',
				isSkipped: false,
			},
		};
	}

	// An `error` result is discarded.
	it('type=error は破棄する', () => {
		const result: ScrapeResult = {
			type: 'error',
			resources: [],
			error: { name: 'Error', message: 'connection refused', shutdown: false },
		};
		expect(shouldDiscardPredicted(result)).toBe(true);
	});

	// A `success` result with status 404 is discarded.
	it('type=success, status=404 は破棄する', () => {
		const result = successResult({ status: 404, statusText: 'Not Found' });
		expect(shouldDiscardPredicted(result)).toBe(true);
	});

	// A `success` result with status 500 is discarded.
	it('type=success, status=500 は破棄する', () => {
		const result = successResult({ status: 500, statusText: 'Internal Server Error' });
		expect(shouldDiscardPredicted(result)).toBe(true);
	});

	// A `success` result with status 200 is kept.
	it('type=success, status=200 は保持する', () => {
		const result = successResult({
			status: 200,
			statusText: 'OK',
			isTarget: true,
			contentLength: 1024,
			title: 'Test',
			html: '<html></html>',
		});
		expect(shouldDiscardPredicted(result)).toBe(false);
	});

	// A `success` result with status 301 is kept.
	it('type=success, status=301 は保持する', () => {
		const result = successResult({ status: 301, statusText: 'Moved Permanently' });
		expect(shouldDiscardPredicted(result)).toBe(false);
	});

	// A `skipped` result is discarded.
	it('type=skipped は破棄する', () => {
		const result: ScrapeResult = {
			type: 'skipped',
			resources: [],
			ignored: {
				url: {} as never,
				matchedText: 'keyword',
				excludeKeywords: ['keyword'],
			},
		};
		expect(shouldDiscardPredicted(result)).toBe(true);
	});
});
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import type { ScrapeResult } from '@d-zero/beholder';
|
|
2
|
-
|
|
3
|
-
import { isError } from '@d-zero/beholder';
|
|
4
|
-
|
|
5
|
-
/**
 * Decides whether the scrape result of a predicted URL should be thrown away.
 *
 * Only a `success` result can be kept, and only when it carries `pageData`
 * whose status is not classified as an error by `isError`:
 * - any type other than `success` (`error`, `skipped`, or anything unexpected) → discard
 * - `success` without `pageData` → discard
 * - `success` with `pageData` → discard exactly when `isError(pageData.status)` says so
 * @param result - The scrape result for the predicted URL
 * @returns `true` if the result should be discarded
 */
export function shouldDiscardPredicted(result: ScrapeResult): boolean {
	if (result.type !== 'success') {
		return true;
	}

	const { pageData } = result;
	return pageData ? isError(pageData.status) : true;
}
|