@nitpicker/crawler 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/lib/archive/archive-accessor.d.ts +6 -1
  2. package/lib/archive/archive-accessor.js +7 -0
  3. package/lib/archive/database.js +2 -1
  4. package/package.json +5 -2
  5. package/CHANGELOG.md +0 -16
  6. package/src/archive/__mock__/.gitignore +0 -3
  7. package/src/archive/__mock__/mock.sqlite +0 -0
  8. package/src/archive/archive-accessor.ts +0 -337
  9. package/src/archive/archive.ts +0 -408
  10. package/src/archive/database.spec.ts +0 -469
  11. package/src/archive/database.ts +0 -1059
  12. package/src/archive/debug.ts +0 -10
  13. package/src/archive/filesystem/append-text.spec.ts +0 -26
  14. package/src/archive/filesystem/append-text.ts +0 -16
  15. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  16. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  17. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  18. package/src/archive/filesystem/copy-dir.ts +0 -14
  19. package/src/archive/filesystem/exists.spec.ts +0 -33
  20. package/src/archive/filesystem/exists.ts +0 -10
  21. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  22. package/src/archive/filesystem/get-file-list.ts +0 -13
  23. package/src/archive/filesystem/index.ts +0 -17
  24. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  25. package/src/archive/filesystem/is-dir.ts +0 -11
  26. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  27. package/src/archive/filesystem/mkdir.ts +0 -16
  28. package/src/archive/filesystem/output-json.spec.ts +0 -34
  29. package/src/archive/filesystem/output-json.ts +0 -16
  30. package/src/archive/filesystem/output-text.spec.ts +0 -31
  31. package/src/archive/filesystem/output-text.ts +0 -35
  32. package/src/archive/filesystem/read-json.spec.ts +0 -26
  33. package/src/archive/filesystem/read-json.ts +0 -12
  34. package/src/archive/filesystem/read-text.spec.ts +0 -25
  35. package/src/archive/filesystem/read-text.ts +0 -11
  36. package/src/archive/filesystem/readline.spec.ts +0 -29
  37. package/src/archive/filesystem/readline.ts +0 -30
  38. package/src/archive/filesystem/remove.spec.ts +0 -34
  39. package/src/archive/filesystem/remove.ts +0 -11
  40. package/src/archive/filesystem/rename.spec.ts +0 -46
  41. package/src/archive/filesystem/rename.ts +0 -21
  42. package/src/archive/filesystem/tar.spec.ts +0 -33
  43. package/src/archive/filesystem/tar.ts +0 -27
  44. package/src/archive/filesystem/untar.spec.ts +0 -34
  45. package/src/archive/filesystem/untar.ts +0 -36
  46. package/src/archive/index.ts +0 -13
  47. package/src/archive/page.spec.ts +0 -368
  48. package/src/archive/page.ts +0 -420
  49. package/src/archive/resource.spec.ts +0 -101
  50. package/src/archive/resource.ts +0 -73
  51. package/src/archive/safe-path.spec.ts +0 -44
  52. package/src/archive/safe-path.ts +0 -18
  53. package/src/archive/types.ts +0 -227
  54. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  55. package/src/crawler/clear-destination-cache.ts +0 -9
  56. package/src/crawler/crawler.ts +0 -873
  57. package/src/crawler/decompose-url.spec.ts +0 -48
  58. package/src/crawler/decompose-url.ts +0 -90
  59. package/src/crawler/destination-cache.spec.ts +0 -23
  60. package/src/crawler/destination-cache.ts +0 -8
  61. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  62. package/src/crawler/detect-pagination-pattern.ts +0 -66
  63. package/src/crawler/fetch-destination.ts +0 -257
  64. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  65. package/src/crawler/fetch-robots-txt.ts +0 -91
  66. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  67. package/src/crawler/find-best-matching-scope.ts +0 -57
  68. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  69. package/src/crawler/generate-predicted-urls.ts +0 -34
  70. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  71. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  72. package/src/crawler/handle-resource-response.spec.ts +0 -45
  73. package/src/crawler/handle-resource-response.ts +0 -21
  74. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  75. package/src/crawler/handle-scrape-end.ts +0 -115
  76. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  77. package/src/crawler/handle-scrape-error.ts +0 -58
  78. package/src/crawler/index.ts +0 -2
  79. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  80. package/src/crawler/inject-scope-auth.ts +0 -27
  81. package/src/crawler/is-external-url.spec.ts +0 -31
  82. package/src/crawler/is-external-url.ts +0 -17
  83. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  84. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  85. package/src/crawler/link-list.spec.ts +0 -355
  86. package/src/crawler/link-list.ts +0 -275
  87. package/src/crawler/link-to-page-data.spec.ts +0 -133
  88. package/src/crawler/link-to-page-data.ts +0 -34
  89. package/src/crawler/net-timeout-error.spec.ts +0 -25
  90. package/src/crawler/net-timeout-error.ts +0 -11
  91. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  92. package/src/crawler/protocol-agnostic-key.ts +0 -11
  93. package/src/crawler/reconstruct-url.spec.ts +0 -37
  94. package/src/crawler/reconstruct-url.ts +0 -37
  95. package/src/crawler/robots-checker.spec.ts +0 -104
  96. package/src/crawler/robots-checker.ts +0 -73
  97. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  98. package/src/crawler/should-discard-predicted.ts +0 -33
  99. package/src/crawler/should-skip-url.spec.ts +0 -77
  100. package/src/crawler/should-skip-url.ts +0 -37
  101. package/src/crawler/types.ts +0 -146
  102. package/src/crawler-orchestrator.ts +0 -401
  103. package/src/debug.ts +0 -10
  104. package/src/index.ts +0 -25
  105. package/src/types.ts +0 -30
  106. package/src/utils/array/each-splitted.spec.ts +0 -38
  107. package/src/utils/array/each-splitted.ts +0 -19
  108. package/src/utils/array/index.ts +0 -1
  109. package/src/utils/debug.ts +0 -6
  110. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  111. package/src/utils/error/dom-evaluation-error.ts +0 -6
  112. package/src/utils/error/error-emitter.spec.ts +0 -78
  113. package/src/utils/error/error-emitter.ts +0 -44
  114. package/src/utils/error/index.ts +0 -3
  115. package/src/utils/index.ts +0 -5
  116. package/src/utils/object/clean-object.spec.ts +0 -24
  117. package/src/utils/object/clean-object.ts +0 -13
  118. package/src/utils/object/index.ts +0 -1
  119. package/src/utils/types/index.ts +0 -1
  120. package/src/utils/types/types.ts +0 -65
  121. package/tsconfig.json +0 -11
  122. package/tsconfig.tsbuildinfo +0 -1
@@ -1,11 +0,0 @@
1
- import { promises as fs } from 'node:fs';
2
-
3
/**
 * Deletes whatever exists at the given path, descending into directories.
 *
 * Delegates to `fs.rm` with `recursive: true`. The `force` option is not set,
 * so the returned promise rejects when nothing exists at `dirPath`.
 * @param dirPath - Path of the file or directory to delete.
 */
export async function remove(dirPath: string) {
  const rmOptions = { recursive: true };
  await fs.rm(dirPath, rmOptions);
}
@@ -1,46 +0,0 @@
1
- import { writeFileSync, readFileSync, existsSync, mkdirSync, rmSync } from 'node:fs';
2
- import { tmpdir } from 'node:os';
3
- import path from 'node:path';
4
-
5
- import { afterEach, beforeEach, describe, expect, it } from 'vitest';
6
-
7
- import { rename } from './rename.js';
8
-
9
describe('rename', () => {
  // Scratch directory recreated before, and wiped after, every case.
  const testDir = path.join(tmpdir(), 'nitpicker-test-rename');
  // Every case moves `old.txt` onto `new.txt` inside the scratch directory.
  const oldPath = path.join(testDir, 'old.txt');
  const newPath = path.join(testDir, 'new.txt');

  beforeEach(() => {
    mkdirSync(testDir, { recursive: true });
  });

  afterEach(() => {
    rmSync(testDir, { recursive: true, force: true });
  });

  it('renames a file', async () => {
    writeFileSync(oldPath, 'data');

    await rename(oldPath, newPath);

    expect(existsSync(newPath)).toBe(true);
    expect(existsSync(oldPath)).toBe(false);
  });

  it('overrides existing file when override is true', async () => {
    // The destination already holds different content that must be replaced.
    writeFileSync(oldPath, 'new data');
    writeFileSync(newPath, 'old data');

    await rename(oldPath, newPath, true);

    const movedContent = readFileSync(newPath, 'utf8');
    expect(movedContent).toBe('new data');
  });

  it('succeeds with override true when destination does not exist', async () => {
    writeFileSync(oldPath, 'data');

    await rename(oldPath, newPath, true);

    expect(existsSync(newPath)).toBe(true);
    const movedContent = readFileSync(newPath, 'utf8');
    expect(movedContent).toBe('data');
  });
});
@@ -1,21 +0,0 @@
1
- import { promises as fs } from 'node:fs';
2
-
3
- import { remove } from './remove.js';
4
-
5
/**
 * Renames (moves) a file or directory from one path to another.
 *
 * If `override` is `true`, the destination is removed before renaming without
 * first checking whether it exists. This avoids a TOCTOU race condition
 * between checking existence and performing the removal. Only the
 * "destination does not exist" failure (`ENOENT`) of that removal is ignored;
 * any other removal error (for example a permission problem) is rethrown so
 * the real cause is not hidden behind a later, less specific rename failure.
 *
 * NOTE(review): when `override` is `false` nothing is removed up front, but the
 * Node.js documentation for `fs.rename` states that an existing file at
 * `newPath` is overwritten anyway and only an existing directory raises an
 * error. Confirm whether callers rely on `false` meaning "never overwrite".
 * @param oldPath - The current path of the file or directory.
 * @param newPath - The new path for the file or directory.
 * @param override - Whether to remove an existing destination before renaming. Defaults to `false`.
 * @throws Rethrows any non-`ENOENT` error from removing the destination, and
 * any error raised by the underlying `fs.rename` call.
 */
export async function rename(oldPath: string, newPath: string, override = false) {
  if (override) {
    try {
      await remove(newPath);
    } catch (error: unknown) {
      // A missing destination is the expected "nothing to clear" case;
      // everything else is a genuine failure and must surface.
      const code = (error as { code?: unknown } | null | undefined)?.code;
      if (code !== 'ENOENT') {
        throw error;
      }
    }
  }

  await fs.rename(oldPath, newPath);
}
@@ -1,33 +0,0 @@
1
- import { writeFileSync, existsSync, mkdirSync, rmSync } from 'node:fs';
2
- import { tmpdir } from 'node:os';
3
- import path from 'node:path';
4
-
5
- import { afterEach, beforeEach, describe, expect, it } from 'vitest';
6
-
7
- import { tar } from './tar.js';
8
- import { untar } from './untar.js';
9
-
10
describe('tar', () => {
  // Layout: <tmp>/nitpicker-test-tar/{source,extract}
  const testDir = path.join(tmpdir(), 'nitpicker-test-tar');
  const srcDir = path.join(testDir, 'source');
  const extractDir = path.join(testDir, 'extract');

  beforeEach(() => {
    for (const dir of [srcDir, extractDir]) {
      mkdirSync(dir, { recursive: true });
    }
  });

  afterEach(() => {
    rmSync(testDir, { recursive: true, force: true });
  });

  it('creates and extracts a tar archive', async () => {
    const tarPath = path.join(testDir, 'archive.tar');
    writeFileSync(path.join(srcDir, 'test.txt'), 'hello tar');

    await tar(srcDir, tarPath);
    expect(existsSync(tarPath)).toBe(true);

    // The archive stores entries under the directory name, hence `source/`.
    await untar(tarPath, { cwd: extractDir });
    const extractedFile = path.join(extractDir, 'source', 'test.txt');
    expect(existsSync(extractedFile)).toBe(true);
  });
});
@@ -1,27 +0,0 @@
1
- import path from 'node:path';
2
-
3
- import { create } from 'tar';
4
-
5
/**
 * Packs a directory into an uncompressed TAR file.
 *
 * The archive is created with the directory's parent as the working
 * directory, and the single entry passed to `create` is the path of `dir`
 * relative to that parent, so archived paths begin with the directory's
 * own name rather than its full location.
 * @param dir - The absolute path of the directory to archive.
 * @param outputPath - The file path where the TAR archive will be written.
 * @returns Whatever `create` from the `tar` package returns for these options
 * (a promise that settles once the archive has been written).
 */
export function tar(dir: string, outputPath: string) {
  const parentDir = path.dirname(dir);
  const entryName = path.relative(parentDir, dir);
  const entries = [entryName];

  return create(
    {
      file: outputPath,
      cwd: parentDir,
      gzip: false,
      preservePaths: false,
    },
    entries,
  );
}
@@ -1,34 +0,0 @@
1
- import { writeFileSync, readFileSync, existsSync, mkdirSync, rmSync } from 'node:fs';
2
- import { tmpdir } from 'node:os';
3
- import path from 'node:path';
4
-
5
- import { afterEach, beforeEach, describe, expect, it } from 'vitest';
6
-
7
- import { tar } from './tar.js';
8
- import { untar } from './untar.js';
9
-
10
describe('untar', () => {
  // Layout: <tmp>/nitpicker-test-untar/{source,extract}
  const testDir = path.join(tmpdir(), 'nitpicker-test-untar');
  const srcDir = path.join(testDir, 'source');
  const extractDir = path.join(testDir, 'extract');

  beforeEach(() => {
    for (const dir of [srcDir, extractDir]) {
      mkdirSync(dir, { recursive: true });
    }
  });

  afterEach(() => {
    rmSync(testDir, { recursive: true, force: true });
  });

  it('extracts specific files from tar archive', async () => {
    const tarPath = path.join(testDir, 'archive.tar');
    writeFileSync(path.join(srcDir, 'a.txt'), 'aaa');
    writeFileSync(path.join(srcDir, 'b.txt'), 'bbb');
    await tar(srcDir, tarPath);

    // Ask for just one of the two archived files.
    await untar(tarPath, { cwd: extractDir, fileList: ['source/a.txt'] });

    const extractedFile = path.join(extractDir, 'source', 'a.txt');
    expect(existsSync(extractedFile)).toBe(true);
    expect(readFileSync(extractedFile, 'utf8')).toBe('aaa');
  });
});
@@ -1,36 +0,0 @@
1
- import { extract } from 'tar';
2
-
3
/**
 * Unpacks entries from a TAR archive.
 *
 * Extraction runs with `newer: true`, so an archive entry is only written
 * when it is newer than a file already present at the target location, and
 * with `noMtime: true` and `preservePaths: false`.
 * @param tarFilePath - The path to the TAR archive to extract.
 * @param options - Optional extraction settings.
 * @param options.cwd - The directory to extract into. When omitted, `cwd` is
 * passed to `extract` as `undefined`.
 * @param options.fileList - Paths inside the archive to extract. When omitted,
 * an empty list is passed to `extract`, which extracts every entry.
 * @returns Whatever `extract` from the `tar` package returns for these options
 * (a promise that settles once extraction is complete).
 */
export function untar(
  tarFilePath: string,
  options?: {
    /** Directory that receives the extracted entries. */
    cwd?: string;
    /** Paths inside the archive that should be extracted. */
    fileList?: string[];
  },
) {
  const targetCwd = options?.cwd;
  const entries = options?.fileList ?? [];

  return extract(
    {
      file: tarFilePath,
      cwd: targetCwd,
      newer: true,
      noMtime: true,
      preservePaths: false,
    },
    entries,
  );
}
@@ -1,13 +0,0 @@
1
- // Archive storage and retrieval layer for Nitpicker crawl data.
2
- //
3
- // This package provides the `Archive` class for creating, reading, and writing
4
- // `.nitpicker` archive files that store crawl results in a SQLite database along with
5
- // optional HTML snapshots. It also exports the `ArchiveAccessor` for read-only
6
- // access, `Page` and `Resource` model classes, and all related types.
7
- export * from './archive-accessor.js';
8
- export type { Redirect, Referrer, Anchor, StaticPageData } from './page.js';
9
- export { default as Page } from './page.js';
10
- export { default as Resource } from './resource.js';
11
- export * from './types.js';
12
-
13
- export { default } from './archive.js';
@@ -1,368 +0,0 @@
1
- import type { DB_Anchor, DB_Page, DB_Redirect, DB_Referrer } from './types.js';
2
-
3
- import { describe, it, expect, vi } from 'vitest';
4
-
5
- import Page from './page.js';
6
-
7
/**
 * Builds a stand-in for ArchiveAccessor whose lookup methods are `vi.fn()` stubs.
 *
 * By default the anchor and referrer lookups resolve to an empty array and the
 * HTML lookup resolves to `null`; entries in `overrides` replace or extend them.
 * @param overrides - Optional method overrides.
 * @returns A mock ArchiveAccessor.
 */
function createMockArchive(overrides: Record<string, unknown> = {}) {
  const defaultStubs = {
    getAnchorsOnPage: vi.fn().mockResolvedValue([]),
    getHtmlOfPage: vi.fn().mockResolvedValue(null),
    getReferrersOfPage: vi.fn().mockResolvedValue([]),
  };

  return { ...defaultStubs, ...overrides };
}
20
-
21
/**
 * Produces a DB_Page row fixture: a scraped, internal, 200 OK `text/html`
 * page at `https://example.com/`, with any field replaceable by the caller.
 * @param overrides - Optional field overrides.
 * @returns A DB_Page object.
 */
function createRawPage(overrides: Partial<DB_Page> = {}): DB_Page {
  const defaults: DB_Page = {
    // Identity and crawl state
    id: 1,
    url: 'https://example.com/',
    redirectDestId: null,
    scraped: 1,
    isTarget: 1,
    isExternal: 0,
    // HTTP response
    status: 200,
    statusText: 'OK',
    contentType: 'text/html',
    contentLength: 5000,
    responseHeaders: '{"content-type":"text/html"}',
    // Document metadata
    lang: 'ja',
    title: 'Example Page',
    description: 'A test page',
    keywords: 'test,example',
    noindex: 0,
    nofollow: 0,
    noarchive: 0,
    canonical: 'https://example.com/',
    alternate: null,
    // Social metadata
    og_type: 'website',
    og_title: 'Example',
    og_site_name: 'Example Site',
    og_description: 'OG description',
    og_url: 'https://example.com/',
    og_image: 'https://example.com/image.png',
    twitter_card: 'summary',
    // Stored artifacts and skip bookkeeping
    networkLogs: null,
    html: 'pages/1.html',
    isSkipped: 0,
    skipReason: null,
    order: 0,
  };

  return { ...defaults, ...overrides };
}
63
-
64
describe('Page', () => {
  /**
   * Builds a Page backed by a fresh mock archive and a raw row fixture.
   * @param raw - Field overrides applied to the default raw page row.
   * @returns The Page under test.
   */
  const pageFrom = (raw: Partial<DB_Page> = {}) =>
    new Page(createMockArchive() as never, createRawPage(raw));

  /**
   * Builds a DB_Referrer row fixture pointing at `https://example.com/`.
   * @param overrides - Field overrides for the referrer row.
   * @returns A DB_Referrer object.
   */
  const makeReferrer = (overrides: Partial<DB_Referrer> = {}): DB_Referrer => ({
    pageId: 1,
    url: 'https://example.com/',
    through: 'https://example.com/',
    throughId: 2,
    hash: null,
    textContent: 'text',
    ...overrides,
  });

  describe('getters', () => {
    it('returns url as ExURL', () => {
      const { url } = pageFrom();
      expect(url.href).toBe('https://example.com');
    });

    it('returns title from raw data', () => {
      expect(pageFrom().title).toBe('Example Page');
    });

    it('returns empty string for null title', () => {
      expect(pageFrom({ title: null }).title).toBe('');
    });

    it('returns status from raw data', () => {
      expect(pageFrom().status).toBe(200);
    });

    it('returns isExternal as false when flag is 0', () => {
      const subject = pageFrom({ isExternal: 0 });
      expect(subject.isExternal).toBe(false);
    });

    it('returns isExternal as true when flag is 1', () => {
      const subject = pageFrom({ isExternal: 1 });
      expect(subject.isExternal).toBe(true);
    });

    it('returns isSkipped as false when flag is 0', () => {
      const subject = pageFrom({ isSkipped: 0 });
      expect(subject.isSkipped).toBe(false);
    });

    it('returns isSkipped as true when flag is 1', () => {
      const subject = pageFrom({ isSkipped: 1 });
      expect(subject.isSkipped).toBe(true);
    });

    it('returns skipReason from raw data', () => {
      const subject = pageFrom({ isSkipped: 1, skipReason: 'blocked by robots.txt' });
      expect(subject.skipReason).toBe('blocked by robots.txt');
    });

    it('returns null skipReason for non-skipped pages', () => {
      expect(pageFrom().skipReason).toBeNull();
    });

    it('returns isTarget as boolean', () => {
      expect(pageFrom({ isTarget: 1 }).isTarget).toBe(true);
    });

    it('returns noindex/nofollow/noarchive as booleans', () => {
      const subject = pageFrom({ noindex: 1, nofollow: 1, noarchive: 1 });
      expect(subject.noindex).toBe(true);
      expect(subject.nofollow).toBe(true);
      expect(subject.noarchive).toBe(true);
    });

    it('returns og_* and twitter_card from raw data', () => {
      const subject = pageFrom();
      expect(subject.og_type).toBe('website');
      expect(subject.og_title).toBe('Example');
      expect(subject.og_site_name).toBe('Example Site');
      expect(subject.og_description).toBe('OG description');
      expect(subject.og_url).toBe('https://example.com/');
      expect(subject.og_image).toBe('https://example.com/image.png');
      expect(subject.twitter_card).toBe('summary');
    });

    it('parses responseHeaders from JSON string', () => {
      const headers = pageFrom().responseHeaders;
      expect(headers).toEqual({ 'content-type': 'text/html' });
    });

    it('returns empty object for invalid responseHeaders JSON', () => {
      const headers = pageFrom({ responseHeaders: 'not-json' }).responseHeaders;
      expect(headers).toEqual({});
    });
  });

  describe('isPage / isInternalPage', () => {
    it('returns true for text/html content type', () => {
      const subject = pageFrom({ contentType: 'text/html' });
      expect(subject.isPage()).toBe(true);
    });

    it('returns true for text/html with extra whitespace', () => {
      const subject = pageFrom({ contentType: ' text/html ' });
      expect(subject.isPage()).toBe(true);
    });

    it('returns false for non-html content type', () => {
      const subject = pageFrom({ contentType: 'application/json' });
      expect(subject.isPage()).toBe(false);
    });

    it('returns false for null content type', () => {
      const subject = pageFrom({ contentType: null });
      expect(subject.isPage()).toBe(false);
    });

    it('isInternalPage returns true for internal HTML pages', () => {
      const subject = pageFrom({ contentType: 'text/html', isExternal: 0 });
      expect(subject.isInternalPage()).toBe(true);
    });

    it('isInternalPage returns false for external pages', () => {
      const subject = pageFrom({ contentType: 'text/html', isExternal: 1 });
      expect(subject.isInternalPage()).toBe(false);
    });
  });

  describe('redirectFrom', () => {
    it('maps rawRedirects to Redirect[]', () => {
      const rawRedirects: DB_Redirect[] = [
        { pageId: 1, from: 'https://old.com/', fromId: 10 },
      ];
      const subject = new Page(createMockArchive() as never, createRawPage(), rawRedirects);
      expect(subject.redirectFrom).toEqual([{ url: 'https://old.com/', pageId: 10 }]);
    });

    it('returns empty array when no redirects', () => {
      expect(pageFrom().redirectFrom).toEqual([]);
    });
  });

  describe('getAnchors', () => {
    it('returns pre-loaded anchors without querying archive', async () => {
      const aboutAnchor: DB_Anchor = {
        pageId: 1,
        url: 'https://example.com/about',
        href: '/about',
        isExternal: 0,
        title: null,
        status: 200,
        statusText: 'OK',
        contentType: 'text/html',
        hash: null,
        textContent: 'About',
      };
      const archive = createMockArchive();
      const subject = new Page(archive as never, createRawPage(), [], [aboutAnchor]);

      const anchors = await subject.getAnchors();

      expect(anchors).toHaveLength(1);
      expect(anchors[0].url).toBe('https://example.com/about');
      expect(anchors[0].isExternal).toBe(false);
      expect(archive.getAnchorsOnPage).not.toHaveBeenCalled();
    });

    it('queries archive when no pre-loaded anchors', async () => {
      const getAnchorsOnPage = vi.fn().mockResolvedValue([{ url: 'https://example.com/a' }]);
      const archive = createMockArchive({ getAnchorsOnPage });
      const subject = new Page(archive as never, createRawPage({ id: 5 }));

      const anchors = await subject.getAnchors();

      expect(anchors).toHaveLength(1);
      expect(archive.getAnchorsOnPage).toHaveBeenCalledWith(5);
    });
  });

  describe('getReferrers', () => {
    it('returns pre-loaded referrers without querying archive', async () => {
      const rawReferrers: DB_Referrer[] = [
        makeReferrer({
          url: 'https://example.com/home',
          through: 'https://example.com/home',
          textContent: 'link text',
        }),
      ];
      const archive = createMockArchive();
      // Anchors are explicitly left unset so only the referrers are pre-loaded.
      const subject = new Page(
        archive as never,
        createRawPage(),
        [],
        undefined,
        rawReferrers,
      );

      const referrers = await subject.getReferrers();

      expect(referrers).toHaveLength(1);
      expect(referrers[0].textContent).toBe('link text');
      expect(archive.getReferrersOfPage).not.toHaveBeenCalled();
    });

    it('defaults textContent to empty string for null', async () => {
      const rawReferrers: DB_Referrer[] = [makeReferrer({ textContent: null })];
      const subject = new Page(
        createMockArchive() as never,
        createRawPage(),
        [],
        undefined,
        rawReferrers,
      );

      const referrers = await subject.getReferrers();

      expect(referrers[0].textContent).toBe('');
    });

    it('queries archive when no pre-loaded referrers', async () => {
      const getReferrersOfPage = vi.fn().mockResolvedValue([]);
      const archive = createMockArchive({ getReferrersOfPage });
      const subject = new Page(archive as never, createRawPage({ id: 7 }));

      await subject.getReferrers();

      expect(archive.getReferrersOfPage).toHaveBeenCalledWith(7);
    });
  });

  describe('getHtml', () => {
    it('delegates to archive with raw html path', async () => {
      const getHtmlOfPage = vi.fn().mockResolvedValue('<html></html>');
      const archive = createMockArchive({ getHtmlOfPage });
      const subject = new Page(archive as never, createRawPage({ html: 'pages/1.html' }));

      const html = await subject.getHtml();

      expect(html).toBe('<html></html>');
      expect(archive.getHtmlOfPage).toHaveBeenCalledWith('pages/1.html');
    });
  });

  describe('getRequests', () => {
    it('always queries archive even when pre-loaded referrers exist', async () => {
      const rawReferrers: DB_Referrer[] = [makeReferrer()];
      const getReferrersOfPage = vi.fn().mockResolvedValue([]);
      const archive = createMockArchive({ getReferrersOfPage });
      const subject = new Page(
        archive as never,
        createRawPage({ id: 3 }),
        [],
        undefined,
        rawReferrers,
      );

      await subject.getRequests();

      expect(archive.getReferrersOfPage).toHaveBeenCalledWith(3);
    });
  });
});