@nitpicker/crawler 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/lib/archive/archive-accessor.d.ts +6 -1
  2. package/lib/archive/archive-accessor.js +7 -0
  3. package/lib/archive/database.js +2 -1
  4. package/package.json +5 -2
  5. package/CHANGELOG.md +0 -16
  6. package/src/archive/__mock__/.gitignore +0 -3
  7. package/src/archive/__mock__/mock.sqlite +0 -0
  8. package/src/archive/archive-accessor.ts +0 -337
  9. package/src/archive/archive.ts +0 -408
  10. package/src/archive/database.spec.ts +0 -469
  11. package/src/archive/database.ts +0 -1059
  12. package/src/archive/debug.ts +0 -10
  13. package/src/archive/filesystem/append-text.spec.ts +0 -26
  14. package/src/archive/filesystem/append-text.ts +0 -16
  15. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  16. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  17. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  18. package/src/archive/filesystem/copy-dir.ts +0 -14
  19. package/src/archive/filesystem/exists.spec.ts +0 -33
  20. package/src/archive/filesystem/exists.ts +0 -10
  21. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  22. package/src/archive/filesystem/get-file-list.ts +0 -13
  23. package/src/archive/filesystem/index.ts +0 -17
  24. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  25. package/src/archive/filesystem/is-dir.ts +0 -11
  26. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  27. package/src/archive/filesystem/mkdir.ts +0 -16
  28. package/src/archive/filesystem/output-json.spec.ts +0 -34
  29. package/src/archive/filesystem/output-json.ts +0 -16
  30. package/src/archive/filesystem/output-text.spec.ts +0 -31
  31. package/src/archive/filesystem/output-text.ts +0 -35
  32. package/src/archive/filesystem/read-json.spec.ts +0 -26
  33. package/src/archive/filesystem/read-json.ts +0 -12
  34. package/src/archive/filesystem/read-text.spec.ts +0 -25
  35. package/src/archive/filesystem/read-text.ts +0 -11
  36. package/src/archive/filesystem/readline.spec.ts +0 -29
  37. package/src/archive/filesystem/readline.ts +0 -30
  38. package/src/archive/filesystem/remove.spec.ts +0 -34
  39. package/src/archive/filesystem/remove.ts +0 -11
  40. package/src/archive/filesystem/rename.spec.ts +0 -46
  41. package/src/archive/filesystem/rename.ts +0 -21
  42. package/src/archive/filesystem/tar.spec.ts +0 -33
  43. package/src/archive/filesystem/tar.ts +0 -27
  44. package/src/archive/filesystem/untar.spec.ts +0 -34
  45. package/src/archive/filesystem/untar.ts +0 -36
  46. package/src/archive/index.ts +0 -13
  47. package/src/archive/page.spec.ts +0 -368
  48. package/src/archive/page.ts +0 -420
  49. package/src/archive/resource.spec.ts +0 -101
  50. package/src/archive/resource.ts +0 -73
  51. package/src/archive/safe-path.spec.ts +0 -44
  52. package/src/archive/safe-path.ts +0 -18
  53. package/src/archive/types.ts +0 -227
  54. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  55. package/src/crawler/clear-destination-cache.ts +0 -9
  56. package/src/crawler/crawler.ts +0 -873
  57. package/src/crawler/decompose-url.spec.ts +0 -48
  58. package/src/crawler/decompose-url.ts +0 -90
  59. package/src/crawler/destination-cache.spec.ts +0 -23
  60. package/src/crawler/destination-cache.ts +0 -8
  61. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  62. package/src/crawler/detect-pagination-pattern.ts +0 -66
  63. package/src/crawler/fetch-destination.ts +0 -257
  64. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  65. package/src/crawler/fetch-robots-txt.ts +0 -91
  66. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  67. package/src/crawler/find-best-matching-scope.ts +0 -57
  68. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  69. package/src/crawler/generate-predicted-urls.ts +0 -34
  70. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  71. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  72. package/src/crawler/handle-resource-response.spec.ts +0 -45
  73. package/src/crawler/handle-resource-response.ts +0 -21
  74. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  75. package/src/crawler/handle-scrape-end.ts +0 -115
  76. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  77. package/src/crawler/handle-scrape-error.ts +0 -58
  78. package/src/crawler/index.ts +0 -2
  79. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  80. package/src/crawler/inject-scope-auth.ts +0 -27
  81. package/src/crawler/is-external-url.spec.ts +0 -31
  82. package/src/crawler/is-external-url.ts +0 -17
  83. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  84. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  85. package/src/crawler/link-list.spec.ts +0 -355
  86. package/src/crawler/link-list.ts +0 -275
  87. package/src/crawler/link-to-page-data.spec.ts +0 -133
  88. package/src/crawler/link-to-page-data.ts +0 -34
  89. package/src/crawler/net-timeout-error.spec.ts +0 -25
  90. package/src/crawler/net-timeout-error.ts +0 -11
  91. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  92. package/src/crawler/protocol-agnostic-key.ts +0 -11
  93. package/src/crawler/reconstruct-url.spec.ts +0 -37
  94. package/src/crawler/reconstruct-url.ts +0 -37
  95. package/src/crawler/robots-checker.spec.ts +0 -104
  96. package/src/crawler/robots-checker.ts +0 -73
  97. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  98. package/src/crawler/should-discard-predicted.ts +0 -33
  99. package/src/crawler/should-skip-url.spec.ts +0 -77
  100. package/src/crawler/should-skip-url.ts +0 -37
  101. package/src/crawler/types.ts +0 -146
  102. package/src/crawler-orchestrator.ts +0 -401
  103. package/src/debug.ts +0 -10
  104. package/src/index.ts +0 -25
  105. package/src/types.ts +0 -30
  106. package/src/utils/array/each-splitted.spec.ts +0 -38
  107. package/src/utils/array/each-splitted.ts +0 -19
  108. package/src/utils/array/index.ts +0 -1
  109. package/src/utils/debug.ts +0 -6
  110. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  111. package/src/utils/error/dom-evaluation-error.ts +0 -6
  112. package/src/utils/error/error-emitter.spec.ts +0 -78
  113. package/src/utils/error/error-emitter.ts +0 -44
  114. package/src/utils/error/index.ts +0 -3
  115. package/src/utils/index.ts +0 -5
  116. package/src/utils/object/clean-object.spec.ts +0 -24
  117. package/src/utils/object/clean-object.ts +0 -13
  118. package/src/utils/object/index.ts +0 -1
  119. package/src/utils/types/index.ts +0 -1
  120. package/src/utils/types/types.ts +0 -65
  121. package/tsconfig.json +0 -11
  122. package/tsconfig.tsbuildinfo +0 -1
@@ -1,48 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
-
3
- import { decomposeUrl } from './decompose-url.js';
4
-
5
/**
 * Unit tests for `decomposeUrl`: full URLs, protocol-agnostic URLs, hosts with
 * a port, invalid input, and URLs that lack a path and/or a query.
 */
describe('decomposeUrl', () => {
	it('decomposes a full URL with path and query', () => {
		const decomposed = decomposeUrl('https://example.com/page/2?sort=name&p=1');
		expect(decomposed).not.toBeNull();

		const { host, pathSegments, queryKeys, queryValues, protocol } = decomposed!;
		expect(host).toBe('example.com');
		expect(pathSegments).toEqual(['page', '2']);
		// Keys are sorted, and values follow the order of their keys.
		expect(queryKeys).toEqual(['p', 'sort']);
		expect(queryValues).toEqual(['1', 'name']);
		expect(protocol).toBe('https:');
	});

	it('decomposes a protocol-agnostic URL', () => {
		const decomposed = decomposeUrl('//example.com/page/1');
		expect(decomposed).not.toBeNull();

		const { host, pathSegments, protocol } = decomposed!;
		expect(host).toBe('example.com');
		expect(pathSegments).toEqual(['page', '1']);
		expect(protocol).toBe('');
	});

	it('decomposes URL with port', () => {
		const decomposed = decomposeUrl('//example.com:8080/page/1');
		expect(decomposed).not.toBeNull();
		expect(decomposed!.host).toBe('example.com:8080');
	});

	it('returns null for invalid URL format', () => {
		const decomposed = decomposeUrl('not-a-url');
		expect(decomposed).toBeNull();
	});

	it('handles URL with query only (no path)', () => {
		const decomposed = decomposeUrl('//example.com?offset=0');
		expect(decomposed).not.toBeNull();

		const { pathSegments, queryKeys, queryValues } = decomposed!;
		expect(pathSegments).toEqual([]);
		expect(queryKeys).toEqual(['offset']);
		expect(queryValues).toEqual(['0']);
	});

	it('handles URL with no path and no query', () => {
		const decomposed = decomposeUrl('//example.com');
		expect(decomposed).not.toBeNull();

		const { pathSegments, queryKeys } = decomposed!;
		expect(pathSegments).toEqual([]);
		expect(queryKeys).toEqual([]);
	});
});
@@ -1,90 +0,0 @@
1
/**
 * A URL broken down into tokens that can be compared one by one.
 * Pagination detection compares two of these to find the single token that
 * changed between two URLs.
 */
export interface DecomposedUrl {
	/** Host part, with the port when one is present (e.g. `"example.com:8080"`). */
	host: string;
	/** The path split on `/`; `/page/2` becomes `["page", "2"]`, no path becomes `[]`. */
	pathSegments: string[];
	/** Query parameter keys, in sorted order. */
	queryKeys: string[];
	/** Query parameter values, ordered to line up with {@link DecomposedUrl.queryKeys}. */
	queryValues: string[];
	/** The protocol prefix such as `"https:"`, or `""` for a protocol-agnostic URL. */
	protocol: string;
}
17
-
18
/**
 * Decomposes a URL string into its constituent tokens for comparison.
 *
 * Accepts full URLs (`http://host/path?q=v`, `https://host/path?q=v`) and
 * protocol-agnostic URLs (`//host/path?q=v`). Query parameters are sorted by
 * key (using `localeCompare`) so that parameter order does not affect
 * comparison.
 *
 * The query string is separated at the first `?` BEFORE the host is separated
 * from the path. A `/` that appears inside the query (for example
 * `//example.com?next=/home`) is therefore never mistaken for the start of the
 * path.
 *
 * No percent-decoding is performed and a `#fragment` is not treated specially;
 * callers are expected to pass URLs without a hash.
 * @param url - The URL string to decompose
 * @returns The decomposed URL, or `null` if the string does not start with
 * `//`, `http://` or `https://`
 */
export function decomposeUrl(url: string): DecomposedUrl | null {
	// Strip the optional protocol and the mandatory "//" prefix.
	const protoMatch = /^(https?:)?\/\//.exec(url);
	if (!protoMatch) return null;
	const protocol = protoMatch[1] ?? '';
	const rest = url.slice(protoMatch[0].length);

	// Split off the query first: everything after the first "?" is query text,
	// even if it contains "/" characters.
	const qmarkIdx = rest.indexOf('?');
	const hostAndPath = qmarkIdx === -1 ? rest : rest.slice(0, qmarkIdx);
	const queryPart = qmarkIdx === -1 ? '' : rest.slice(qmarkIdx + 1);

	// Within the remainder, the first "/" separates the host from the path.
	const slashIdx = hostAndPath.indexOf('/');
	const host = slashIdx === -1 ? hostAndPath : hostAndPath.slice(0, slashIdx);
	const pathPart = slashIdx === -1 ? '' : hostAndPath.slice(slashIdx + 1);

	const pathSegments = pathPart ? pathPart.split('/') : [];

	// Parse the query into [key, value] pairs; a pair without "=" has an empty value.
	const queryPairs: [string, string][] = [];
	if (queryPart) {
		for (const pair of queryPart.split('&')) {
			const eqIdx = pair.indexOf('=');
			if (eqIdx === -1) {
				queryPairs.push([pair, '']);
			} else {
				queryPairs.push([pair.slice(0, eqIdx), pair.slice(eqIdx + 1)]);
			}
		}
	}
	queryPairs.sort((a, b) => a[0].localeCompare(b[0]));

	return {
		host,
		pathSegments,
		queryKeys: queryPairs.map(([k]) => k),
		queryValues: queryPairs.map(([, v]) => v),
		protocol,
	};
}
@@ -1,23 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
-
3
- import { destinationCache } from './destination-cache.js';
4
-
5
/**
 * Sanity checks for the shared `destinationCache` map. Each test removes the
 * entries it adds so the module-level cache is left empty.
 */
describe('destinationCache', () => {
	it('is a Map instance', () => {
		expect(destinationCache).toBeInstanceOf(Map);
	});

	it('supports set and get operations', () => {
		const key = 'test-key';
		destinationCache.set(key, new Error('test'));

		expect(destinationCache.has(key)).toBe(true);
		expect(destinationCache.get(key)).toBeInstanceOf(Error);

		// Clean up so the entry does not leak into other tests.
		destinationCache.delete(key);
	});

	it('supports clear operation', () => {
		const entries = [
			['key1', 'a'],
			['key2', 'b'],
		] as const;
		for (const [key, message] of entries) {
			destinationCache.set(key, new Error(message));
		}

		destinationCache.clear();

		expect(destinationCache.size).toBe(0);
	});
});
@@ -1,8 +0,0 @@
1
- import type { PageData } from '@d-zero/beholder';
2
-
3
/**
 * Module-level, in-memory cache of destination lookups.
 *
 * `fetchDestination` keys entries by the URL without its hash, adding a
 * `:title` suffix for lookups that also extract the page title. Each value is
 * either the {@link PageData} that was obtained or the {@link Error} the lookup
 * failed with, so both outcomes are replayed without a second request.
 */
export const destinationCache: Map<string, PageData | Error> = new Map();
@@ -1,169 +0,0 @@
1
- import { describe, expect, it } from 'vitest';
2
-
3
- import { detectPaginationPattern } from './detect-pagination-pattern.js';
4
-
5
/**
 * Table-driven tests for `detectPaginationPattern`.
 *
 * Each row is `[test title, previous URL, current URL, expected pattern]`;
 * rows in the "null" table omit the expectation because the result must be
 * `null`. Test titles are kept verbatim.
 */
describe('detectPaginationPattern', () => {
	type Pattern = { tokenIndex: number; step: number; currentNumber: number };
	type DetectedCase = readonly [
		title: string,
		prevUrl: string,
		currentUrl: string,
		expected: Pattern,
	];
	type RejectedCase = readonly [title: string, prevUrl: string, currentUrl: string];

	/** Registers one test per row asserting that the expected pattern is detected. */
	const registerDetected = (cases: readonly DetectedCase[]) => {
		for (const [title, prevUrl, currentUrl, expected] of cases) {
			it(title, () => {
				const result = detectPaginationPattern(prevUrl, currentUrl);
				expect(result).toEqual(expected);
			});
		}
	};

	/** Registers one test per row asserting that no pattern is detected. */
	const registerRejected = (cases: readonly RejectedCase[]) => {
		for (const [title, prevUrl, currentUrl] of cases) {
			it(title, () => {
				expect(detectPaginationPattern(prevUrl, currentUrl)).toBeNull();
			});
		}
	};

	// Successful detection cases.
	describe('正常検出ケース', () => {
		registerDetected([
			[
				'パスセグメントの数値差異を検出する',
				'//example.com/page/2',
				'//example.com/page/3',
				{ tokenIndex: 1, step: 1, currentNumber: 3 },
			],
			[
				'末尾パスの数値差異を検出する',
				'//example.com/blog/2',
				'//example.com/blog/3',
				{ tokenIndex: 1, step: 1, currentNumber: 3 },
			],
			[
				'深いパスの数値差異を検出する',
				'//example.com/a/b/2/c',
				'//example.com/a/b/3/c',
				{ tokenIndex: 2, step: 1, currentNumber: 3 },
			],
			[
				'クエリパラメータの数値差異を検出する',
				'//example.com/list?p=1&sort=name',
				'//example.com/list?p=2&sort=name',
				{ tokenIndex: 1, step: 1, currentNumber: 2 },
			],
			[
				'step > 1 を検出する',
				'//example.com/page/10',
				'//example.com/page/20',
				{ tokenIndex: 1, step: 10, currentNumber: 20 },
			],
			[
				'0始まりページネーションを検出する',
				'//example.com/page/0',
				'//example.com/page/1',
				{ tokenIndex: 1, step: 1, currentNumber: 1 },
			],
			[
				'大きい数値でも検出する',
				'//example.com/items/100',
				'//example.com/items/101',
				{ tokenIndex: 1, step: 1, currentNumber: 101 },
			],
			[
				'ポート付きURLを検出する',
				'//example.com:8080/page/1',
				'//example.com:8080/page/2',
				{ tokenIndex: 1, step: 1, currentNumber: 2 },
			],
			[
				'クエリのみ（パスなし）を検出する',
				'//example.com?offset=0',
				'//example.com?offset=10',
				{ tokenIndex: 0, step: 10, currentNumber: 10 },
			],
		]);
	});

	// Cases where no pagination pattern may be reported.
	describe('null を返すケース', () => {
		registerRejected([
			['ホスト名が異なる場合', '//example.com/page/2', '//other.com/page/3'],
			['パスの長さが異なる場合', '//example.com/page/2', '//example.com/page/2/extra'],
			['非数値の差異がある場合', '//example.com/page/a', '//example.com/page/b'],
			['複数箇所の数値差異がある場合', '//example.com/1/page/2', '//example.com/2/page/3'],
			['数値 + 非数値の差異がある場合', '//example.com/a/2', '//example.com/b/3'],
			['step が 0（同一URL）の場合', '//example.com/page/3', '//example.com/page/3'],
			['step が負（デクリメント）の場合', '//example.com/page/5', '//example.com/page/3'],
			[
				'クエリのキーセットが異なる場合',
				'//example.com/list?p=1&a=x',
				'//example.com/list?p=2&b=y',
			],
			[
				'クエリのキー数が異なる場合',
				'//example.com/list?p=1',
				'//example.com/list?p=2&extra=1',
			],
			[
				'プロトコル以外完全一致（数値差異なし）の場合',
				'//example.com/about',
				'//example.com/about',
			],
			['空パス同士の場合', '//example.com', '//example.com'],
		]);
	});

	// Boundary-value cases.
	describe('境界値ケース', () => {
		registerDetected([
			[
				'非常に大きな数値でも動作する',
				'//example.com/page/999999',
				'//example.com/page/1000000',
				{ tokenIndex: 1, step: 1, currentNumber: 1_000_000 },
			],
			[
				'パス内に固定数値セグメントがあっても変化箇所のみ検出する',
				'//example.com/v2/page/3',
				'//example.com/v2/page/4',
				{ tokenIndex: 2, step: 1, currentNumber: 4 },
			],
		]);
	});
});
@@ -1,66 +0,0 @@
1
- import type { PaginationPattern } from './types.js';
2
-
3
- import { decomposeUrl } from './decompose-url.js';
4
-
5
/**
 * Matches a token written as a plain decimal integer (optionally negative).
 * `Number()` alone is too permissive for this purpose: it converts `""` and
 * whitespace to `0` and also accepts hex (`0x10`) and exponent (`1e3`) forms.
 */
const INTEGER_TOKEN = /^-?\d+$/;

/**
 * Compares two consecutive URL strings and detects a single-token numeric
 * pagination pattern (e.g. `/page/1` → `/page/2`, or `?p=1` → `?p=2`).
 *
 * The algorithm decomposes each URL into tokens (path segments followed by
 * query values sorted by key), then requires that:
 * - host, path segment count and query keys are identical,
 * - exactly one token differs,
 * - both differing tokens are plain decimal integers within the safe integer
 *   range, and
 * - the step from previous to current is positive.
 *
 * Empty tokens (`/page/` or `?p=`), hex, exponent and other non-decimal
 * spellings are NOT treated as numbers, so URLs such as `/page/` → `/page/1`
 * are not reported as pagination.
 *
 * WHY single-token constraint: Multi-token differences (e.g. both path and query
 * changing) indicate different routes rather than pagination, so they are rejected.
 * The protocol is not compared.
 * @param prevUrl - The previously pushed URL (protocol-agnostic, without hash/auth)
 * @param currentUrl - The newly discovered URL
 * @returns The detected pattern (`tokenIndex` is the index into the combined
 * path-segment + query-value token list), or `null` if no pagination pattern was found
 */
export function detectPaginationPattern(
	prevUrl: string,
	currentUrl: string,
): PaginationPattern | null {
	const prev = decomposeUrl(prevUrl);
	const curr = decomposeUrl(currentUrl);
	if (!prev || !curr) return null;

	// Host (including port) must match
	if (prev.host !== curr.host) return null;

	// Path segment count must match
	if (prev.pathSegments.length !== curr.pathSegments.length) return null;

	// Query key sets must match in count and identity
	if (prev.queryKeys.length !== curr.queryKeys.length) return null;
	for (let i = 0; i < prev.queryKeys.length; i++) {
		if (prev.queryKeys[i] !== curr.queryKeys[i]) return null;
	}

	// Build combined token arrays: path segments + query values (sorted by key)
	const prevTokens = [...prev.pathSegments, ...prev.queryValues];
	const currTokens = [...curr.pathSegments, ...curr.queryValues];

	let diffIndex = -1;
	for (const [i, prevToken] of prevTokens.entries()) {
		if (prevToken !== currTokens[i]) {
			if (diffIndex !== -1) return null; // more than one difference
			diffIndex = i;
		}
	}

	if (diffIndex === -1) return null; // identical URLs

	const prevToken = prevTokens[diffIndex] ?? '';
	const currToken = currTokens[diffIndex] ?? '';
	// Validate the spelling before converting; see INTEGER_TOKEN for why.
	if (!INTEGER_TOKEN.test(prevToken) || !INTEGER_TOKEN.test(currToken)) return null;

	const prevNum = Number(prevToken);
	const currNum = Number(currToken);
	// Beyond the safe range the subtraction below would no longer be exact.
	if (!Number.isSafeInteger(prevNum) || !Number.isSafeInteger(currNum)) return null;

	const step = currNum - prevNum;
	if (step <= 0) return null;

	return {
		tokenIndex: diffIndex,
		step,
		currentNumber: currNum,
	};
}
@@ -1,257 +0,0 @@
1
- import type { PageData } from '@d-zero/beholder';
2
- import type { ExURL } from '@d-zero/shared/parse-url';
3
- import type { FollowResponse, RedirectableRequest } from 'follow-redirects';
4
- import type { ClientRequest, IncomingMessage, RequestOptions } from 'node:http';
5
-
6
- import { delay } from '@d-zero/shared/delay';
7
- import redirects from 'follow-redirects';
8
-
9
- import { destinationCache } from './destination-cache.js';
10
- import NetTimeoutError from './net-timeout-error.js';
11
-
12
/**
 * Input for {@link fetchDestination}.
 */
export interface FetchDestinationParams {
	/** The extended URL whose destination is looked up. */
	readonly url: ExURL;
	/** `true` when the URL lies outside the crawl scope. */
	readonly isExternal: boolean;
	/**
	 * HTTP method for the request; `"HEAD"` when omitted.
	 * Ignored when `options.titleBytesLimit` is set, because that mode always uses GET.
	 */
	readonly method?: string;
	/** Optional behaviour switches. */
	readonly options?: {
		/**
		 * Enables title extraction: the request is made with GET and at most
		 * this many bytes of the response body are read while looking for an
		 * HTML `<title>` tag.
		 */
		titleBytesLimit?: number;
	};
	/** Value for the `User-Agent` request header. */
	readonly userAgent?: string;
}
33
-
34
/** Maximum time to wait for a destination request before giving up. */
const DESTINATION_TIMEOUT_MS = 10 * 1000;

/**
 * Fetches the destination metadata for a URL using an HTTP HEAD request (or GET as fallback).
 *
 * Results, including failures, are cached in memory so that repeated calls
 * for the same URL (without hash) return or throw immediately. Lookups made
 * with `options.titleBytesLimit` are cached separately (key suffix `:title`)
 * and always use GET.
 *
 * The request races against a 10-second timeout; if the server does not
 * respond in time, a {@link NetTimeoutError} is thrown. The timeout timer is
 * cancelled as soon as the race settles so that finished lookups do not leave
 * a pending timer behind.
 *
 * If the server returns 405 (Method Not Allowed), 501 (Not Implemented), or 503
 * (Service Unavailable) for a HEAD request, the request is automatically retried with GET.
 * @param params - Parameters containing URL, external flag, method, options, and optional User-Agent.
 * @returns The page metadata obtained from the HTTP response.
 * @throws {NetTimeoutError} If the request exceeds the 10-second timeout.
 * @throws {Error} If the HTTP request fails for any other reason.
 */
export async function fetchDestination(
	params: FetchDestinationParams,
): Promise<PageData> {
	const { url, isExternal, method = 'HEAD', options, userAgent } = params;
	const titleBytesLimit = options?.titleBytesLimit;
	const cacheKey = titleBytesLimit == null ? url.withoutHash : `${url.withoutHash}:title`;

	// Replay a previous outcome (success or failure) without a new request.
	const cached = destinationCache.get(cacheKey);
	if (cached !== undefined) {
		if (cached instanceof Error) {
			throw cached;
		}
		return cached;
	}

	// Title extraction needs a response body, so it always uses GET.
	const effectiveMethod = titleBytesLimit == null ? method : 'GET';

	let timer: ReturnType<typeof setTimeout> | undefined;
	const timeout = new Promise<NetTimeoutError>((resolve) => {
		timer = setTimeout(() => {
			resolve(new NetTimeoutError(url.href));
		}, DESTINATION_TIMEOUT_MS);
	});

	let result: PageData | Error;
	try {
		result = await Promise.race([
			// Turn a rejection into a value so that failures can be cached too.
			_fetchHead(url, isExternal, effectiveMethod, titleBytesLimit, userAgent).catch(
				(error: unknown) => (error instanceof Error ? error : new Error(String(error))),
			),
			timeout,
		]);
	} finally {
		// Without this the timer stays pending for the full 10 seconds after
		// every lookup and keeps the event loop alive.
		clearTimeout(timer);
	}

	destinationCache.set(cacheKey, result);
	if (result instanceof Error) {
		throw result;
	}

	return result;
}
82
-
83
/** Matches the first `<title>` element in an HTML fragment; group 1 is its text. */
const TITLE_PATTERN = /<title[^>]*>([\s\S]*?)<\/title>/i;

/**
 * Response statuses for which a non-GET request is retried with GET.
 * `label` is the prefix of the error raised when the request was already a
 * GET; `waitMs` is the pause before the retry.
 */
const GET_FALLBACKS: ReadonlyMap<
	number,
	{ readonly label: string; readonly waitMs: number }
> = new Map([
	[405, { label: 'Method Not Allowed', waitMs: 0 }],
	[501, { label: 'Method Not Implemented', waitMs: 5 * 1000 }],
	[503, { label: 'Retrying failed', waitMs: 5 * 1000 }],
]);

/**
 * Performs the actual HTTP request to retrieve page metadata.
 *
 * Handles both HTTP and HTTPS protocols via `follow-redirects`, records the
 * redirect chain, and falls back to GET (through {@link fetchDestination},
 * keeping the same User-Agent) when a non-GET request is answered with 405,
 * 501 or 503. When the request already was a GET, those statuses reject.
 *
 * In title mode (`titleBytesLimit` set) the body is requested uncompressed
 * (`Accept-Encoding: identity`), because the raw bytes are searched for a
 * `<title>` tag without any decompression step.
 * @param url - The extended URL to request.
 * @param isExternal - Whether the URL is external to the crawl scope.
 * @param method - The HTTP method (`"HEAD"` or `"GET"`).
 * @param titleBytesLimit - When set, reads up to this many bytes from the response body
 * to extract a `<title>` tag, then destroys the connection.
 * @param userAgent - Optional User-Agent string to send with the request.
 * @returns A promise resolving to {@link PageData} with response metadata.
 */
async function _fetchHead(
	url: ExURL,
	isExternal: boolean,
	method: string,
	titleBytesLimit?: number,
	userAgent?: string,
) {
	return new Promise<PageData>((resolve, reject) => {
		const hostHeader = url.port ? `${url.hostname}:${url.port}` : url.hostname;
		const request: RequestOptions = {
			protocol: url.protocol,
			hostname: url.hostname,
			port: url.port || undefined,
			// NOTE(review): only the pathname is sent, so any query string is
			// dropped from the request — confirm this is intended.
			path: url.pathname,
			method,
			headers: {
				host: hostHeader,
				...(userAgent ? { 'User-Agent': userAgent } : {}),
				Connection: 'keep-alive',
				Pragma: 'no-cache',
				'Cache-Control': 'no-cache',
				'Upgrade-Insecure-Requests': 1,
				Accept:
					'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
				// The body is never decompressed here, so only advertise
				// compression when the body is going to be discarded.
				'Accept-Encoding': titleBytesLimit == null ? 'gzip, deflate' : 'identity',
				'Accept-Language':
					'ja,en;q=0.9,zh;q=0.8,en-US;q=0.7,pl;q=0.6,de;q=0.5,zh-CN;q=0.4,zh-TW;q=0.3,th;q=0.2,ko;q=0.1,fr;q=0.1',
				// Range: url.extname?.toLowerCase() === 'pdf' ? 'bytes=0-0' : undefined,
			},
		};

		if (url.username && url.password) {
			request.auth = `${url.username}:${url.password}`;
		}

		let req: RedirectableRequest<ClientRequest, IncomingMessage>;
		// Set right before an intentional req.destroy() so the resulting
		// errors are not reported as failures.
		let destroyed = false;

		const response = (res: IncomingMessage & FollowResponse) => {
			/** Builds the result object from the response headers and the given title. */
			const buildPageData = (title: string): PageData => {
				const redirectPaths = res.redirects.map((r) => r.url);
				const parsedLength = Number.parseInt(res.headers['content-length'] || '', 10);
				const contentLength = Number.isFinite(parsedLength) ? parsedLength : null;
				return {
					url,
					isTarget: !isExternal,
					isExternal,
					redirectPaths,
					status: res.statusCode || 0,
					statusText: res.statusMessage || '',
					contentType: res.headers['content-type']?.split(';')[0] || null,
					contentLength,
					responseHeaders: res.headers,
					meta: { title },
					imageList: [],
					anchorList: [],
					html: '',
					isSkipped: false,
				};
			};

			// A failure while reading the body would otherwise leave the
			// promise pending until the caller's timeout fires.
			res.on('error', (error) => {
				if (destroyed) return;
				reject(error);
			});

			if (titleBytesLimit == null) {
				// Drain the body so that 'end' is emitted.
				res.on('data', () => {});
				res.on('end', () => {
					const rep = buildPageData('');
					const fallback = GET_FALLBACKS.get(rep.status);
					if (!fallback) {
						resolve(rep);
						return;
					}
					if (method === 'GET') {
						// Already the fallback method: nothing left to try.
						reject(new Error(`${fallback.label}: ${url.href} ${rep.statusText}`));
						return;
					}
					const retryWithGet = async (): Promise<PageData> => {
						if (fallback.waitMs > 0) {
							await delay(fallback.waitMs);
						}
						return fetchDestination({
							url,
							isExternal,
							method: 'GET',
							...(userAgent ? { userAgent } : {}),
						});
					};
					retryWithGet().then(resolve, reject);
				});
				return;
			}

			// Title mode: accumulate the body until a <title> is found, the
			// byte limit is reached, or the stream ends.
			const byteLimit = titleBytesLimit;
			const chunks: Buffer[] = [];
			let totalBytes = 0;
			let settled = false;

			/** Returns the trimmed title found in the data so far, or `null` if none yet. */
			const findTitle = (): string | null => {
				const match = TITLE_PATTERN.exec(Buffer.concat(chunks).toString('utf8'));
				return match ? (match[1]?.trim() ?? '') : null;
			};

			/** Resolves with the given title and tears down the connection. */
			const settleAndAbort = (title: string) => {
				settled = true;
				resolve(buildPageData(title));
				destroyed = true;
				req.destroy();
			};

			res.on('data', (chunk: Buffer) => {
				if (settled) return;
				chunks.push(chunk);
				totalBytes += chunk.length;

				const title = findTitle();
				if (title !== null) {
					settleAndAbort(title);
					return;
				}

				// Reached byte limit without finding title
				if (totalBytes >= byteLimit) {
					settleAndAbort('');
				}
			});
			res.on('end', () => {
				if (settled) return;
				settled = true;
				// Stream ended before limit — use whatever title the data contains
				resolve(buildPageData(findTitle() ?? ''));
			});
		};

		if (url.protocol === 'https:') {
			req = redirects.https.request(request, response);
		} else {
			req = redirects.http.request(request, response);
		}
		req.on('error', (error) => {
			// Ignore errors caused by intentional req.destroy()
			if (destroyed) return;
			reject(error);
		});
		req.end();
	});
}