@nitpicker/crawler 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/CHANGELOG.md +0 -16
- package/src/archive/__mock__/.gitignore +0 -3
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +0 -337
- package/src/archive/archive.ts +0 -408
- package/src/archive/database.spec.ts +0 -469
- package/src/archive/database.ts +0 -1059
- package/src/archive/debug.ts +0 -10
- package/src/archive/filesystem/append-text.spec.ts +0 -26
- package/src/archive/filesystem/append-text.ts +0 -16
- package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
- package/src/archive/filesystem/copy-dir-sync.ts +0 -10
- package/src/archive/filesystem/copy-dir.spec.ts +0 -33
- package/src/archive/filesystem/copy-dir.ts +0 -14
- package/src/archive/filesystem/exists.spec.ts +0 -33
- package/src/archive/filesystem/exists.ts +0 -10
- package/src/archive/filesystem/get-file-list.spec.ts +0 -37
- package/src/archive/filesystem/get-file-list.ts +0 -13
- package/src/archive/filesystem/index.ts +0 -17
- package/src/archive/filesystem/is-dir.spec.ts +0 -29
- package/src/archive/filesystem/is-dir.ts +0 -11
- package/src/archive/filesystem/mkdir.spec.ts +0 -37
- package/src/archive/filesystem/mkdir.ts +0 -16
- package/src/archive/filesystem/output-json.spec.ts +0 -34
- package/src/archive/filesystem/output-json.ts +0 -16
- package/src/archive/filesystem/output-text.spec.ts +0 -31
- package/src/archive/filesystem/output-text.ts +0 -35
- package/src/archive/filesystem/read-json.spec.ts +0 -26
- package/src/archive/filesystem/read-json.ts +0 -12
- package/src/archive/filesystem/read-text.spec.ts +0 -25
- package/src/archive/filesystem/read-text.ts +0 -11
- package/src/archive/filesystem/readline.spec.ts +0 -29
- package/src/archive/filesystem/readline.ts +0 -30
- package/src/archive/filesystem/remove.spec.ts +0 -34
- package/src/archive/filesystem/remove.ts +0 -11
- package/src/archive/filesystem/rename.spec.ts +0 -46
- package/src/archive/filesystem/rename.ts +0 -21
- package/src/archive/filesystem/tar.spec.ts +0 -33
- package/src/archive/filesystem/tar.ts +0 -27
- package/src/archive/filesystem/untar.spec.ts +0 -34
- package/src/archive/filesystem/untar.ts +0 -36
- package/src/archive/index.ts +0 -13
- package/src/archive/page.spec.ts +0 -368
- package/src/archive/page.ts +0 -420
- package/src/archive/resource.spec.ts +0 -101
- package/src/archive/resource.ts +0 -73
- package/src/archive/safe-path.spec.ts +0 -44
- package/src/archive/safe-path.ts +0 -18
- package/src/archive/types.ts +0 -227
- package/src/crawler/clear-destination-cache.spec.ts +0 -20
- package/src/crawler/clear-destination-cache.ts +0 -9
- package/src/crawler/crawler.ts +0 -873
- package/src/crawler/decompose-url.spec.ts +0 -48
- package/src/crawler/decompose-url.ts +0 -90
- package/src/crawler/destination-cache.spec.ts +0 -23
- package/src/crawler/destination-cache.ts +0 -8
- package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
- package/src/crawler/detect-pagination-pattern.ts +0 -66
- package/src/crawler/fetch-destination.ts +0 -257
- package/src/crawler/fetch-robots-txt.spec.ts +0 -83
- package/src/crawler/fetch-robots-txt.ts +0 -91
- package/src/crawler/find-best-matching-scope.spec.ts +0 -39
- package/src/crawler/find-best-matching-scope.ts +0 -57
- package/src/crawler/generate-predicted-urls.spec.ts +0 -42
- package/src/crawler/generate-predicted-urls.ts +0 -34
- package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
- package/src/crawler/handle-ignore-and-skip.ts +0 -30
- package/src/crawler/handle-resource-response.spec.ts +0 -45
- package/src/crawler/handle-resource-response.ts +0 -21
- package/src/crawler/handle-scrape-end.spec.ts +0 -109
- package/src/crawler/handle-scrape-end.ts +0 -115
- package/src/crawler/handle-scrape-error.spec.ts +0 -105
- package/src/crawler/handle-scrape-error.ts +0 -58
- package/src/crawler/index.ts +0 -2
- package/src/crawler/inject-scope-auth.spec.ts +0 -36
- package/src/crawler/inject-scope-auth.ts +0 -27
- package/src/crawler/is-external-url.spec.ts +0 -31
- package/src/crawler/is-external-url.ts +0 -17
- package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
- package/src/crawler/is-in-any-lower-layer.ts +0 -22
- package/src/crawler/link-list.spec.ts +0 -355
- package/src/crawler/link-list.ts +0 -275
- package/src/crawler/link-to-page-data.spec.ts +0 -133
- package/src/crawler/link-to-page-data.ts +0 -34
- package/src/crawler/net-timeout-error.spec.ts +0 -25
- package/src/crawler/net-timeout-error.ts +0 -11
- package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
- package/src/crawler/protocol-agnostic-key.ts +0 -11
- package/src/crawler/reconstruct-url.spec.ts +0 -37
- package/src/crawler/reconstruct-url.ts +0 -37
- package/src/crawler/robots-checker.spec.ts +0 -104
- package/src/crawler/robots-checker.ts +0 -73
- package/src/crawler/should-discard-predicted.spec.ts +0 -125
- package/src/crawler/should-discard-predicted.ts +0 -33
- package/src/crawler/should-skip-url.spec.ts +0 -77
- package/src/crawler/should-skip-url.ts +0 -37
- package/src/crawler/types.ts +0 -146
- package/src/crawler-orchestrator.ts +0 -401
- package/src/debug.ts +0 -10
- package/src/index.ts +0 -25
- package/src/types.ts +0 -30
- package/src/utils/array/each-splitted.spec.ts +0 -38
- package/src/utils/array/each-splitted.ts +0 -19
- package/src/utils/array/index.ts +0 -1
- package/src/utils/debug.ts +0 -6
- package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
- package/src/utils/error/dom-evaluation-error.ts +0 -6
- package/src/utils/error/error-emitter.spec.ts +0 -78
- package/src/utils/error/error-emitter.ts +0 -44
- package/src/utils/error/index.ts +0 -3
- package/src/utils/index.ts +0 -5
- package/src/utils/object/clean-object.spec.ts +0 -24
- package/src/utils/object/clean-object.ts +0 -13
- package/src/utils/object/index.ts +0 -1
- package/src/utils/types/index.ts +0 -1
- package/src/utils/types/types.ts +0 -65
- package/tsconfig.json +0 -11
- package/tsconfig.tsbuildinfo +0 -1
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
import http from 'node:http';
|
|
2
|
-
|
|
3
|
-
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
|
|
4
|
-
|
|
5
|
-
import { fetchRobotsTxt } from './fetch-robots-txt.js';
|
|
6
|
-
|
|
7
|
-
const ROBOTS_TXT = `
|
|
8
|
-
User-agent: *
|
|
9
|
-
Disallow: /secret/
|
|
10
|
-
Allow: /public/
|
|
11
|
-
|
|
12
|
-
User-agent: Nitpicker
|
|
13
|
-
Disallow: /admin/
|
|
14
|
-
`;
|
|
15
|
-
|
|
16
|
-
let server: http.Server;
|
|
17
|
-
let port: number;
|
|
18
|
-
|
|
19
|
-
beforeAll(async () => {
|
|
20
|
-
server = http.createServer((req, res) => {
|
|
21
|
-
if (req.url === '/robots.txt') {
|
|
22
|
-
res.writeHead(200, { 'Content-Type': 'text/plain' });
|
|
23
|
-
res.end(ROBOTS_TXT);
|
|
24
|
-
} else {
|
|
25
|
-
res.writeHead(404);
|
|
26
|
-
res.end();
|
|
27
|
-
}
|
|
28
|
-
});
|
|
29
|
-
await new Promise<void>((resolve) => {
|
|
30
|
-
server.listen(0, () => resolve());
|
|
31
|
-
});
|
|
32
|
-
const address = server.address();
|
|
33
|
-
port = typeof address === 'object' && address ? address.port : 0;
|
|
34
|
-
});
|
|
35
|
-
|
|
36
|
-
afterAll(async () => {
|
|
37
|
-
await new Promise<void>((resolve) => {
|
|
38
|
-
server.close(() => resolve());
|
|
39
|
-
});
|
|
40
|
-
});
|
|
41
|
-
|
|
42
|
-
describe('fetchRobotsTxt', () => {
|
|
43
|
-
it('returns a parsed Robot object for a valid robots.txt', async () => {
|
|
44
|
-
const robot = await fetchRobotsTxt(`http://127.0.0.1:${port}`);
|
|
45
|
-
expect(robot).not.toBeNull();
|
|
46
|
-
expect(robot!.isAllowed(`http://127.0.0.1:${port}/public/page`, '*')).toBe(true);
|
|
47
|
-
expect(robot!.isAllowed(`http://127.0.0.1:${port}/secret/page`, '*')).toBe(false);
|
|
48
|
-
});
|
|
49
|
-
|
|
50
|
-
it('respects user-agent specific rules', async () => {
|
|
51
|
-
const robot = await fetchRobotsTxt(`http://127.0.0.1:${port}`);
|
|
52
|
-
expect(robot).not.toBeNull();
|
|
53
|
-
expect(robot!.isAllowed(`http://127.0.0.1:${port}/admin/page`, 'Nitpicker')).toBe(
|
|
54
|
-
false,
|
|
55
|
-
);
|
|
56
|
-
});
|
|
57
|
-
|
|
58
|
-
it('returns null when robots.txt does not exist', async () => {
|
|
59
|
-
const noRobotsServer = http.createServer((_req, res) => {
|
|
60
|
-
res.writeHead(404);
|
|
61
|
-
res.end();
|
|
62
|
-
});
|
|
63
|
-
await new Promise<void>((resolve) => {
|
|
64
|
-
noRobotsServer.listen(0, () => resolve());
|
|
65
|
-
});
|
|
66
|
-
const address = noRobotsServer.address();
|
|
67
|
-
const noRobotsPort = typeof address === 'object' && address ? address.port : 0;
|
|
68
|
-
|
|
69
|
-
try {
|
|
70
|
-
const robot = await fetchRobotsTxt(`http://127.0.0.1:${noRobotsPort}`);
|
|
71
|
-
expect(robot).toBeNull();
|
|
72
|
-
} finally {
|
|
73
|
-
await new Promise<void>((resolve) => {
|
|
74
|
-
noRobotsServer.close(() => resolve());
|
|
75
|
-
});
|
|
76
|
-
}
|
|
77
|
-
});
|
|
78
|
-
|
|
79
|
-
it('returns null when the server is unreachable', async () => {
|
|
80
|
-
const robot = await fetchRobotsTxt('http://127.0.0.1:1');
|
|
81
|
-
expect(robot).toBeNull();
|
|
82
|
-
});
|
|
83
|
-
});
|
|
@@ -1,91 +0,0 @@
|
|
|
1
|
-
import type { FollowResponse } from 'follow-redirects';
|
|
2
|
-
import type { IncomingMessage } from 'node:http';
|
|
3
|
-
|
|
4
|
-
import { createRequire } from 'node:module';
|
|
5
|
-
|
|
6
|
-
import redirects from 'follow-redirects';
|
|
7
|
-
|
|
8
|
-
/**
 * Minimal typing for the object returned by `robots-parser` after parsing
 * a robots.txt file. Only the members declared here are typed.
 */
interface RobotsResult {
	/**
	 * Check if a URL may be crawled by a given user-agent.
	 * @param url - The URL to check.
	 * @param ua - The user-agent string to match against.
	 * @returns `true` if crawling is allowed, `false` if it is disallowed.
	 *   Per the robots-parser documentation, `undefined` means the URL is not
	 *   valid for this robots.txt (it does not mean "no matching rule").
	 */
	isAllowed(url: string, ua?: string): boolean | undefined;
	/**
	 * Check if a URL must not be crawled by a given user-agent.
	 * @param url - The URL to check.
	 * @param ua - The user-agent string to match against.
	 * @returns `true` if crawling is disallowed, `false` if it is allowed.
	 *   Per the robots-parser documentation, `undefined` means the URL is not
	 *   valid for this robots.txt (it does not mean "no matching rule").
	 */
	isDisallowed(url: string, ua?: string): boolean | undefined;
	/**
	 * Get the crawl delay for a given user-agent.
	 * @param ua - The user-agent string to match against.
	 * @returns The crawl delay in seconds, or `undefined` if none is specified
	 *   for that user-agent.
	 */
	getCrawlDelay(ua?: string): number | undefined;
	/**
	 * Get the sitemaps listed in robots.txt.
	 * @returns An array of sitemap URLs.
	 */
	getSitemaps(): string[];
}
|
|
38
|
-
|
|
39
|
-
const require = createRequire(import.meta.url);
|
|
40
|
-
const robotsParser = require('robots-parser') as (
|
|
41
|
-
url: string,
|
|
42
|
-
robotstxt: string,
|
|
43
|
-
) => RobotsResult;
|
|
44
|
-
|
|
45
|
-
/**
 * Fetches and parses the robots.txt file for a given origin URL.
 *
 * Sends an HTTP(S) GET request to `{origin}/robots.txt` through
 * `follow-redirects` and parses the response body with `robots-parser`.
 * The promise never rejects: every failure path resolves to `null`.
 *
 * Resolves to `null` when:
 * - the response status code is not 200,
 * - the request or the response stream emits an error,
 * - the request times out (10 second `timeout` option),
 * - the request cannot even be created (e.g. `origin` is not a valid URL),
 * - parsing the body throws.
 * @param origin - The origin URL (e.g., `https://example.com`). A single
 *   trailing slash is tolerated.
 * @param userAgent - Optional User-Agent string to send with the request.
 * @returns A parsed RobotsResult instance, or `null` if robots.txt is unavailable.
 */
export async function fetchRobotsTxt(
	origin: string,
	userAgent?: string,
): Promise<RobotsResult | null> {
	// Avoid requesting `https://example.com//robots.txt` when the caller
	// passes an origin that already ends with a slash.
	const base = origin.endsWith('/') ? origin.slice(0, -1) : origin;
	const robotsUrl = `${base}/robots.txt`;

	return new Promise((resolve) => {
		// Several events can fire for one request (e.g. `timeout` followed by
		// the `error` caused by `destroy()`); only the first outcome counts.
		let settled = false;
		const settle = (result: RobotsResult | null): void => {
			if (settled) {
				return;
			}
			settled = true;
			resolve(result);
		};

		// URL schemes are case-insensitive, so `HTTPS://…` must pick the https client.
		const protocol = robotsUrl.toLowerCase().startsWith('https:')
			? redirects.https
			: redirects.http;

		try {
			const req = protocol.get(
				robotsUrl,
				{
					headers: userAgent ? { 'User-Agent': userAgent } : {},
					timeout: 10_000,
				},
				(res: IncomingMessage & FollowResponse) => {
					if (res.statusCode !== 200) {
						// Drain the body so the socket can be released.
						res.resume();
						settle(null);
						return;
					}
					const chunks: Buffer[] = [];
					res.on('data', (chunk: Buffer) => chunks.push(chunk));
					res.on('end', () => {
						try {
							const body = Buffer.concat(chunks).toString('utf8');
							settle(robotsParser(robotsUrl, body));
						} catch {
							settle(null);
						}
					});
					res.on('error', () => settle(null));
				},
			);
			req.on('error', () => settle(null));
			req.on('timeout', () => {
				req.destroy();
				settle(null);
			});
		} catch {
			// `get()` throws synchronously for input it cannot turn into a
			// request (such as a malformed URL). Without this guard the promise
			// would reject instead of resolving to `null` as documented.
			settle(null);
		}
	});
}
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
2
|
-
import { describe, it, expect } from 'vitest';
|
|
3
|
-
|
|
4
|
-
import { findBestMatchingScope } from './find-best-matching-scope.js';
|
|
5
|
-
|
|
6
|
-
describe('findBestMatchingScope', () => {
|
|
7
|
-
it('returns the deepest matching scope URL', () => {
|
|
8
|
-
const url = parseUrl('https://example.com/blog/post/1')!;
|
|
9
|
-
const scopes = [
|
|
10
|
-
parseUrl('https://example.com/blog')!,
|
|
11
|
-
parseUrl('https://example.com/blog/post')!,
|
|
12
|
-
];
|
|
13
|
-
const result = findBestMatchingScope(url, scopes);
|
|
14
|
-
expect(result).not.toBeNull();
|
|
15
|
-
expect(result!.pathname).toBe('/blog/post');
|
|
16
|
-
});
|
|
17
|
-
|
|
18
|
-
it('returns null when no scope matches', () => {
|
|
19
|
-
const url = parseUrl('https://other.com/page')!;
|
|
20
|
-
const scopes = [parseUrl('https://example.com/')!];
|
|
21
|
-
const result = findBestMatchingScope(url, scopes);
|
|
22
|
-
expect(result).toBeNull();
|
|
23
|
-
});
|
|
24
|
-
|
|
25
|
-
it('returns null for empty scopes array', () => {
|
|
26
|
-
const url = parseUrl('https://example.com/page')!;
|
|
27
|
-
const result = findBestMatchingScope(url, []);
|
|
28
|
-
expect(result).toBeNull();
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
it('does not match root scope without trailing slash against subpath', () => {
|
|
32
|
-
// Root URL https://example.com has paths [''], while /page has paths ['page']
|
|
33
|
-
// isPathMatch(['page'], ['']) fails because 'page' !== ''
|
|
34
|
-
const url = parseUrl('https://example.com/page')!;
|
|
35
|
-
const scopes = [parseUrl('https://example.com')!];
|
|
36
|
-
const result = findBestMatchingScope(url, scopes);
|
|
37
|
-
expect(result).toBeNull();
|
|
38
|
-
});
|
|
39
|
-
});
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
2
|
-
|
|
3
|
-
/**
 * Pick, from a list of scope URLs, the deepest one that contains the given URL.
 *
 * A scope is a candidate only when its hostname equals the URL's hostname and
 * `isPathMatch` reports that the scope's path segments lead the URL's path
 * segments. Among candidates the one with the largest `depth` wins; on a tie
 * the earlier entry in `scopes` is kept.
 * @param url - The parsed URL to match against scope URLs.
 * @param scopes - The list of scope URLs to search.
 * @returns The best-matching scope URL, or `null` if none match.
 */
export function findBestMatchingScope(
	url: ExURL,
	scopes: readonly ExURL[],
): ExURL | null {
	let deepest: ExURL | null = null;

	for (const candidate of scopes) {
		const sameHost = candidate.hostname === url.hostname;
		if (!sameHost || !isPathMatch(url.paths, candidate.paths)) {
			continue;
		}

		// -1 is the baseline used before any candidate has been accepted.
		const depthToBeat = deepest === null ? -1 : deepest.depth;
		if (candidate.depth > depthToBeat) {
			deepest = candidate;
		}
	}

	return deepest;
}
|
|
34
|
-
|
|
35
|
-
/**
 * Tell whether `basePaths` is a leading run of `targetPaths`.
 *
 * The target matches when it has at least as many segments as the base and
 * every base segment is strictly equal to the target segment at the same index.
 * An empty base therefore matches any target.
 * @param targetPaths - The path segments of the URL being checked.
 * @param basePaths - The path segments of the scope URL to match against.
 * @returns `true` if the target path starts with or equals the base path.
 */
function isPathMatch(targetPaths: string[], basePaths: string[]): boolean {
	const targetIsLongEnough = targetPaths.length >= basePaths.length;
	return (
		targetIsLongEnough &&
		basePaths.every((segment, index) => targetPaths[index] === segment)
	);
}
|
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
import { describe, expect, it } from 'vitest';
|
|
2
|
-
|
|
3
|
-
import { generatePredictedUrls } from './generate-predicted-urls.js';
|
|
4
|
-
|
|
5
|
-
describe('generatePredictedUrls', () => {
|
|
6
|
-
it('step=1, count=3 で3つのURLを生成する', () => {
|
|
7
|
-
const pattern = { tokenIndex: 1, step: 1, currentNumber: 3 };
|
|
8
|
-
const urls = generatePredictedUrls(pattern, '//example.com/page/3', 3);
|
|
9
|
-
expect(urls).toEqual([
|
|
10
|
-
'//example.com/page/4',
|
|
11
|
-
'//example.com/page/5',
|
|
12
|
-
'//example.com/page/6',
|
|
13
|
-
]);
|
|
14
|
-
});
|
|
15
|
-
|
|
16
|
-
it('step=10, count=2 で2つのURLを生成する', () => {
|
|
17
|
-
const pattern = { tokenIndex: 1, step: 10, currentNumber: 20 };
|
|
18
|
-
const urls = generatePredictedUrls(pattern, '//example.com/page/20', 2);
|
|
19
|
-
expect(urls).toEqual(['//example.com/page/30', '//example.com/page/40']);
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
it('クエリパターンのURLを生成する', () => {
|
|
23
|
-
const pattern = { tokenIndex: 1, step: 1, currentNumber: 2 };
|
|
24
|
-
const urls = generatePredictedUrls(pattern, '//example.com/list?p=2&sort=name', 2);
|
|
25
|
-
expect(urls).toEqual([
|
|
26
|
-
'//example.com/list?p=3&sort=name',
|
|
27
|
-
'//example.com/list?p=4&sort=name',
|
|
28
|
-
]);
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
it('count=0 で空配列を返す', () => {
|
|
32
|
-
const pattern = { tokenIndex: 1, step: 1, currentNumber: 3 };
|
|
33
|
-
const urls = generatePredictedUrls(pattern, '//example.com/page/3', 0);
|
|
34
|
-
expect(urls).toEqual([]);
|
|
35
|
-
});
|
|
36
|
-
|
|
37
|
-
it('深いパスのURLを生成する', () => {
|
|
38
|
-
const pattern = { tokenIndex: 2, step: 1, currentNumber: 3 };
|
|
39
|
-
const urls = generatePredictedUrls(pattern, '//example.com/a/b/3/c', 2);
|
|
40
|
-
expect(urls).toEqual(['//example.com/a/b/4/c', '//example.com/a/b/5/c']);
|
|
41
|
-
});
|
|
42
|
-
});
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
import type { PaginationPattern } from './types.js';
|
|
2
|
-
|
|
3
|
-
import { decomposeUrl } from './decompose-url.js';
|
|
4
|
-
import { reconstructUrl } from './reconstruct-url.js';
|
|
5
|
-
|
|
6
|
-
/**
 * Extrapolate a pagination pattern into the next `count` URLs.
 *
 * `currentUrl` is split with `decomposeUrl`; for each k in 1..count the token
 * at `pattern.tokenIndex` is replaced, via `reconstructUrl`, by
 * `pattern.currentNumber + pattern.step * k` rendered as a string.
 * @param pattern - The pagination pattern (token index, step and current number).
 * @param currentUrl - The URL to extrapolate from.
 * @param count - Number of predicted URLs to generate.
 * @returns The predicted URL strings in ascending k order, or an empty array
 *   when `count` is not positive or `currentUrl` cannot be decomposed.
 */
export function generatePredictedUrls(
	pattern: PaginationPattern,
	currentUrl: string,
	count: number,
): string[] {
	if (count <= 0) {
		return [];
	}

	const parts = decomposeUrl(currentUrl);
	if (!parts) {
		return [];
	}

	const { currentNumber, step, tokenIndex } = pattern;
	return Array.from({ length: count }, (_, offset) => {
		const predictedNumber = currentNumber + step * (offset + 1);
		return reconstructUrl(parts, tokenIndex, String(predictedNumber));
	});
}
|
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
2
|
-
import { describe, it, expect, vi } from 'vitest';
|
|
3
|
-
|
|
4
|
-
import { handleIgnoreAndSkip } from './handle-ignore-and-skip.js';
|
|
5
|
-
|
|
6
|
-
describe('handleIgnoreAndSkip', () => {
|
|
7
|
-
it('calls linkList.done and returns the link when URL is in queue', () => {
|
|
8
|
-
const url = parseUrl('https://example.com/page')!;
|
|
9
|
-
const mockLink = { url, isExternal: false, isLowerLayer: false };
|
|
10
|
-
const linkList = {
|
|
11
|
-
done: vi.fn().mockReturnValue(mockLink),
|
|
12
|
-
};
|
|
13
|
-
const scope = new Map();
|
|
14
|
-
const options = {
|
|
15
|
-
interval: 0,
|
|
16
|
-
parallels: 1,
|
|
17
|
-
recursive: true,
|
|
18
|
-
fromList: false,
|
|
19
|
-
captureImages: false,
|
|
20
|
-
executablePath: null,
|
|
21
|
-
fetchExternal: false,
|
|
22
|
-
scope: [],
|
|
23
|
-
excludes: [],
|
|
24
|
-
excludeKeywords: [],
|
|
25
|
-
excludeUrls: [] as readonly string[],
|
|
26
|
-
maxExcludedDepth: 0,
|
|
27
|
-
retry: 0,
|
|
28
|
-
verbose: false,
|
|
29
|
-
disableQueries: false,
|
|
30
|
-
};
|
|
31
|
-
|
|
32
|
-
const result = handleIgnoreAndSkip(url, linkList as never, scope, options);
|
|
33
|
-
|
|
34
|
-
expect(linkList.done).toHaveBeenCalledWith(url, scope, {}, options);
|
|
35
|
-
expect(result).toBe(mockLink);
|
|
36
|
-
});
|
|
37
|
-
|
|
38
|
-
it('returns null when URL is not in queue', () => {
|
|
39
|
-
const url = parseUrl('https://example.com/page')!;
|
|
40
|
-
const linkList = {
|
|
41
|
-
done: vi.fn().mockReturnValue(null),
|
|
42
|
-
};
|
|
43
|
-
const scope = new Map();
|
|
44
|
-
const options = {
|
|
45
|
-
interval: 0,
|
|
46
|
-
parallels: 1,
|
|
47
|
-
recursive: true,
|
|
48
|
-
fromList: false,
|
|
49
|
-
captureImages: false,
|
|
50
|
-
executablePath: null,
|
|
51
|
-
fetchExternal: false,
|
|
52
|
-
scope: [],
|
|
53
|
-
excludes: [],
|
|
54
|
-
excludeKeywords: [],
|
|
55
|
-
excludeUrls: [] as readonly string[],
|
|
56
|
-
maxExcludedDepth: 0,
|
|
57
|
-
retry: 0,
|
|
58
|
-
verbose: false,
|
|
59
|
-
disableQueries: false,
|
|
60
|
-
};
|
|
61
|
-
|
|
62
|
-
const result = handleIgnoreAndSkip(url, linkList as never, scope, options);
|
|
63
|
-
|
|
64
|
-
expect(result).toBeNull();
|
|
65
|
-
});
|
|
66
|
-
});
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
import type LinkList from './link-list.js';
|
|
2
|
-
import type { CrawlerOptions } from './types.js';
|
|
3
|
-
import type { Link } from '../utils/index.js';
|
|
4
|
-
import type { ExURL } from '@d-zero/shared/parse-url';
|
|
5
|
-
|
|
6
|
-
import { crawlerLog } from '../debug.js';
|
|
7
|
-
|
|
8
|
-
/**
 * Record a URL that was ignored or skipped instead of being scraped.
 *
 * Calls `linkList.done` with an empty page-data object, and writes a
 * "Skipped URL" debug log entry when that call yields a link.
 * @param url - The URL that was skipped.
 * @param linkList - The link list managing the crawl queue.
 * @param scope - Map of hostnames to their scope URLs.
 * @param options - Crawler configuration options.
 * @returns Whatever `linkList.done` returned: the {@link Link}, or `null`.
 */
export function handleIgnoreAndSkip(
	url: ExURL,
	linkList: LinkList,
	scope: ReadonlyMap<string, readonly ExURL[]>,
	options: CrawlerOptions,
): Link | null {
	const link = linkList.done(url, scope, {}, options);
	if (!link) {
		return link;
	}

	crawlerLog('Skipped URL: %s', url.href);
	return link;
}
|
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
2
|
-
import { describe, it, expect } from 'vitest';
|
|
3
|
-
|
|
4
|
-
import { handleResourceResponse } from './handle-resource-response.js';
|
|
5
|
-
|
|
6
|
-
describe('handleResourceResponse', () => {
|
|
7
|
-
it('returns isNew: true for a newly seen resource', () => {
|
|
8
|
-
const resources = new Set<string>();
|
|
9
|
-
const resource = {
|
|
10
|
-
url: parseUrl('https://example.com/style.css')!,
|
|
11
|
-
isExternal: false,
|
|
12
|
-
status: 200,
|
|
13
|
-
statusText: 'OK',
|
|
14
|
-
contentType: 'text/css',
|
|
15
|
-
contentLength: 1024,
|
|
16
|
-
compress: '' as const,
|
|
17
|
-
cdn: '' as const,
|
|
18
|
-
responseHeaders: {},
|
|
19
|
-
};
|
|
20
|
-
|
|
21
|
-
const result = handleResourceResponse(resource, resources);
|
|
22
|
-
|
|
23
|
-
expect(result.isNew).toBe(true);
|
|
24
|
-
expect(resources.has('https://example.com/style.css')).toBe(true);
|
|
25
|
-
});
|
|
26
|
-
|
|
27
|
-
it('returns isNew: false for a duplicate resource', () => {
|
|
28
|
-
const resources = new Set<string>(['https://example.com/style.css']);
|
|
29
|
-
const resource = {
|
|
30
|
-
url: parseUrl('https://example.com/style.css')!,
|
|
31
|
-
isExternal: false,
|
|
32
|
-
status: 200,
|
|
33
|
-
statusText: 'OK',
|
|
34
|
-
contentType: 'text/css',
|
|
35
|
-
contentLength: 1024,
|
|
36
|
-
compress: '' as const,
|
|
37
|
-
cdn: '' as const,
|
|
38
|
-
responseHeaders: {},
|
|
39
|
-
};
|
|
40
|
-
|
|
41
|
-
const result = handleResourceResponse(resource, resources);
|
|
42
|
-
|
|
43
|
-
expect(result.isNew).toBe(false);
|
|
44
|
-
});
|
|
45
|
-
});
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
import type { Resource } from '../utils/index.js';
|
|
2
|
-
|
|
3
|
-
/**
 * Register a network resource and report whether it is a first sighting.
 *
 * Resources are identified by `resource.url.withoutHash`. A key that is not
 * yet in `resources` is added to it (the set is mutated in place).
 * @param resource - The captured network resource data.
 * @param resources - The set of already-known resource URLs (without hash).
 * @returns An object whose `isNew` is `true` only the first time a key is seen.
 */
export function handleResourceResponse(
	resource: Resource,
	resources: Set<string>,
): { isNew: boolean } {
	const key = resource.url.withoutHash;

	if (resources.has(key)) {
		return { isNew: false };
	}

	resources.add(key);
	return { isNew: true };
}
|
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
import type { CrawlerOptions } from './types.js';
|
|
2
|
-
import type { AnchorData, PageData } from '../utils/index.js';
|
|
3
|
-
|
|
4
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
5
|
-
import { describe, it, expect, vi } from 'vitest';
|
|
6
|
-
|
|
7
|
-
import { handleScrapeEnd } from './handle-scrape-end.js';
|
|
8
|
-
|
|
9
|
-
const defaultOptions: CrawlerOptions = {
|
|
10
|
-
interval: 0,
|
|
11
|
-
parallels: 1,
|
|
12
|
-
recursive: true,
|
|
13
|
-
fromList: false,
|
|
14
|
-
captureImages: false,
|
|
15
|
-
executablePath: null,
|
|
16
|
-
fetchExternal: false,
|
|
17
|
-
scope: ['https://example.com/'],
|
|
18
|
-
excludes: [],
|
|
19
|
-
excludeKeywords: [],
|
|
20
|
-
excludeUrls: [],
|
|
21
|
-
maxExcludedDepth: 0,
|
|
22
|
-
retry: 0,
|
|
23
|
-
verbose: false,
|
|
24
|
-
disableQueries: false,
|
|
25
|
-
};
|
|
26
|
-
|
|
27
|
-
/**
 * Build a `PageData` fixture for the tests in this file.
 *
 * The defaults describe `https://example.com/page` answered with
 * `200 OK` / `text/html`, with no redirects, images or anchors.
 * @param overrides - Fields that replace the defaults; applied last.
 * @returns A new `PageData` object on every call.
 */
function createMockResult(overrides?: Partial<PageData>): PageData {
	const defaults: PageData = {
		url: parseUrl('https://example.com/page')!,
		isTarget: true,
		isExternal: false,
		redirectPaths: [],
		status: 200,
		statusText: 'OK',
		contentType: 'text/html',
		contentLength: 1000,
		responseHeaders: {},
		meta: { title: 'Test' },
		imageList: [],
		anchorList: [] as AnchorData[],
		html: '<html></html>',
		isSkipped: false,
	};

	return { ...defaults, ...overrides };
}
|
|
50
|
-
|
|
51
|
-
describe('handleScrapeEnd', () => {
|
|
52
|
-
it('marks URL as done in the link list', () => {
|
|
53
|
-
const result = createMockResult();
|
|
54
|
-
const mockLink = { url: result.url, isExternal: false, isLowerLayer: false };
|
|
55
|
-
const linkList = {
|
|
56
|
-
done: vi.fn().mockReturnValue(mockLink),
|
|
57
|
-
isMetadataOnly: vi.fn().mockReturnValue(false),
|
|
58
|
-
};
|
|
59
|
-
const scope = new Map([['example.com', [parseUrl('https://example.com/')!]]]);
|
|
60
|
-
const addUrl = vi.fn();
|
|
61
|
-
|
|
62
|
-
const { link, isExternal } = handleScrapeEnd(
|
|
63
|
-
result,
|
|
64
|
-
linkList as never,
|
|
65
|
-
scope,
|
|
66
|
-
defaultOptions,
|
|
67
|
-
addUrl,
|
|
68
|
-
);
|
|
69
|
-
|
|
70
|
-
expect(linkList.done).toHaveBeenCalledOnce();
|
|
71
|
-
expect(link).toBe(mockLink);
|
|
72
|
-
expect(isExternal).toBe(false);
|
|
73
|
-
});
|
|
74
|
-
|
|
75
|
-
it('skips anchor processing in title-only mode', () => {
|
|
76
|
-
const anchor = { href: parseUrl('https://example.com/other')!, textContent: 'link' };
|
|
77
|
-
const result = createMockResult({ anchorList: [anchor as AnchorData] });
|
|
78
|
-
const linkList = {
|
|
79
|
-
done: vi.fn().mockReturnValue(null),
|
|
80
|
-
isMetadataOnly: vi.fn().mockReturnValue(true),
|
|
81
|
-
};
|
|
82
|
-
const scope = new Map([['example.com', [parseUrl('https://example.com/')!]]]);
|
|
83
|
-
const addUrl = vi.fn();
|
|
84
|
-
|
|
85
|
-
handleScrapeEnd(result, linkList as never, scope, defaultOptions, addUrl);
|
|
86
|
-
|
|
87
|
-
expect(addUrl).not.toHaveBeenCalled();
|
|
88
|
-
});
|
|
89
|
-
|
|
90
|
-
it('returns isExternal: true for external pages', () => {
|
|
91
|
-
const result = createMockResult({ isExternal: true });
|
|
92
|
-
const linkList = {
|
|
93
|
-
done: vi.fn().mockReturnValue(null),
|
|
94
|
-
isMetadataOnly: vi.fn().mockReturnValue(false),
|
|
95
|
-
};
|
|
96
|
-
const scope = new Map();
|
|
97
|
-
const addUrl = vi.fn();
|
|
98
|
-
|
|
99
|
-
const { isExternal } = handleScrapeEnd(
|
|
100
|
-
result,
|
|
101
|
-
linkList as never,
|
|
102
|
-
scope,
|
|
103
|
-
defaultOptions,
|
|
104
|
-
addUrl,
|
|
105
|
-
);
|
|
106
|
-
|
|
107
|
-
expect(isExternal).toBe(true);
|
|
108
|
-
});
|
|
109
|
-
});
|