@nitpicker/crawler 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/package.json +5 -2
  2. package/CHANGELOG.md +0 -16
  3. package/src/archive/__mock__/.gitignore +0 -3
  4. package/src/archive/__mock__/mock.sqlite +0 -0
  5. package/src/archive/archive-accessor.ts +0 -337
  6. package/src/archive/archive.ts +0 -408
  7. package/src/archive/database.spec.ts +0 -469
  8. package/src/archive/database.ts +0 -1059
  9. package/src/archive/debug.ts +0 -10
  10. package/src/archive/filesystem/append-text.spec.ts +0 -26
  11. package/src/archive/filesystem/append-text.ts +0 -16
  12. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  13. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  14. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  15. package/src/archive/filesystem/copy-dir.ts +0 -14
  16. package/src/archive/filesystem/exists.spec.ts +0 -33
  17. package/src/archive/filesystem/exists.ts +0 -10
  18. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  19. package/src/archive/filesystem/get-file-list.ts +0 -13
  20. package/src/archive/filesystem/index.ts +0 -17
  21. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  22. package/src/archive/filesystem/is-dir.ts +0 -11
  23. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  24. package/src/archive/filesystem/mkdir.ts +0 -16
  25. package/src/archive/filesystem/output-json.spec.ts +0 -34
  26. package/src/archive/filesystem/output-json.ts +0 -16
  27. package/src/archive/filesystem/output-text.spec.ts +0 -31
  28. package/src/archive/filesystem/output-text.ts +0 -35
  29. package/src/archive/filesystem/read-json.spec.ts +0 -26
  30. package/src/archive/filesystem/read-json.ts +0 -12
  31. package/src/archive/filesystem/read-text.spec.ts +0 -25
  32. package/src/archive/filesystem/read-text.ts +0 -11
  33. package/src/archive/filesystem/readline.spec.ts +0 -29
  34. package/src/archive/filesystem/readline.ts +0 -30
  35. package/src/archive/filesystem/remove.spec.ts +0 -34
  36. package/src/archive/filesystem/remove.ts +0 -11
  37. package/src/archive/filesystem/rename.spec.ts +0 -46
  38. package/src/archive/filesystem/rename.ts +0 -21
  39. package/src/archive/filesystem/tar.spec.ts +0 -33
  40. package/src/archive/filesystem/tar.ts +0 -27
  41. package/src/archive/filesystem/untar.spec.ts +0 -34
  42. package/src/archive/filesystem/untar.ts +0 -36
  43. package/src/archive/index.ts +0 -13
  44. package/src/archive/page.spec.ts +0 -368
  45. package/src/archive/page.ts +0 -420
  46. package/src/archive/resource.spec.ts +0 -101
  47. package/src/archive/resource.ts +0 -73
  48. package/src/archive/safe-path.spec.ts +0 -44
  49. package/src/archive/safe-path.ts +0 -18
  50. package/src/archive/types.ts +0 -227
  51. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  52. package/src/crawler/clear-destination-cache.ts +0 -9
  53. package/src/crawler/crawler.ts +0 -873
  54. package/src/crawler/decompose-url.spec.ts +0 -48
  55. package/src/crawler/decompose-url.ts +0 -90
  56. package/src/crawler/destination-cache.spec.ts +0 -23
  57. package/src/crawler/destination-cache.ts +0 -8
  58. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  59. package/src/crawler/detect-pagination-pattern.ts +0 -66
  60. package/src/crawler/fetch-destination.ts +0 -257
  61. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  62. package/src/crawler/fetch-robots-txt.ts +0 -91
  63. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  64. package/src/crawler/find-best-matching-scope.ts +0 -57
  65. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  66. package/src/crawler/generate-predicted-urls.ts +0 -34
  67. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  68. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  69. package/src/crawler/handle-resource-response.spec.ts +0 -45
  70. package/src/crawler/handle-resource-response.ts +0 -21
  71. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  72. package/src/crawler/handle-scrape-end.ts +0 -115
  73. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  74. package/src/crawler/handle-scrape-error.ts +0 -58
  75. package/src/crawler/index.ts +0 -2
  76. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  77. package/src/crawler/inject-scope-auth.ts +0 -27
  78. package/src/crawler/is-external-url.spec.ts +0 -31
  79. package/src/crawler/is-external-url.ts +0 -17
  80. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  81. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  82. package/src/crawler/link-list.spec.ts +0 -355
  83. package/src/crawler/link-list.ts +0 -275
  84. package/src/crawler/link-to-page-data.spec.ts +0 -133
  85. package/src/crawler/link-to-page-data.ts +0 -34
  86. package/src/crawler/net-timeout-error.spec.ts +0 -25
  87. package/src/crawler/net-timeout-error.ts +0 -11
  88. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  89. package/src/crawler/protocol-agnostic-key.ts +0 -11
  90. package/src/crawler/reconstruct-url.spec.ts +0 -37
  91. package/src/crawler/reconstruct-url.ts +0 -37
  92. package/src/crawler/robots-checker.spec.ts +0 -104
  93. package/src/crawler/robots-checker.ts +0 -73
  94. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  95. package/src/crawler/should-discard-predicted.ts +0 -33
  96. package/src/crawler/should-skip-url.spec.ts +0 -77
  97. package/src/crawler/should-skip-url.ts +0 -37
  98. package/src/crawler/types.ts +0 -146
  99. package/src/crawler-orchestrator.ts +0 -401
  100. package/src/debug.ts +0 -10
  101. package/src/index.ts +0 -25
  102. package/src/types.ts +0 -30
  103. package/src/utils/array/each-splitted.spec.ts +0 -38
  104. package/src/utils/array/each-splitted.ts +0 -19
  105. package/src/utils/array/index.ts +0 -1
  106. package/src/utils/debug.ts +0 -6
  107. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  108. package/src/utils/error/dom-evaluation-error.ts +0 -6
  109. package/src/utils/error/error-emitter.spec.ts +0 -78
  110. package/src/utils/error/error-emitter.ts +0 -44
  111. package/src/utils/error/index.ts +0 -3
  112. package/src/utils/index.ts +0 -5
  113. package/src/utils/object/clean-object.spec.ts +0 -24
  114. package/src/utils/object/clean-object.ts +0 -13
  115. package/src/utils/object/index.ts +0 -1
  116. package/src/utils/types/index.ts +0 -1
  117. package/src/utils/types/types.ts +0 -65
  118. package/tsconfig.json +0 -11
  119. package/tsconfig.tsbuildinfo +0 -1
@@ -1,420 +0,0 @@
1
- import type { ArchiveAccessor } from './archive-accessor.js';
2
- import type { DB_Anchor, DB_Page, DB_Redirect, DB_Referrer } from './types.js';
3
-
4
- import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
5
-
/**
 * A crawled page stored in the archive.
 *
 * Exposes the page's stored metadata (title, HTTP status, SEO tags, ...),
 * its relationships (anchors, referrers, redirects) and its HTML snapshot.
 * Instances are created by {@link ArchiveAccessor.getPages} or
 * {@link ArchiveAccessor.getPagesWithRefs}.
 */
export default class Page {
	/**
	 * URLs that redirect to this page, each paired with the ID of its page.
	 * Empty when no redirect records were supplied to the constructor.
	 */
	readonly redirectFrom: Redirect[];
	#archive: ArchiveAccessor;
	#disableQueries: boolean;
	#raw: DB_Page;
	#rawAnchors: DB_Anchor[] | null;
	#rawReferrers: DB_Referrer[] | null;

	/**
	 * The alternate URL from the `<link rel="alternate">` tag, or null if not present.
	 */
	get alternate() {
		return this.#raw.alternate;
	}

	/**
	 * The canonical URL from the `<link rel="canonical">` tag, or null if not present.
	 */
	get canonical() {
		return this.#raw.canonical;
	}

	/**
	 * The content length of the HTTP response in bytes, or null if unknown.
	 */
	get contentLength() {
		return this.#raw.contentLength;
	}

	/**
	 * The MIME content type of the HTTP response (e.g., `"text/html"`), or null if unknown.
	 */
	get contentType() {
		return this.#raw.contentType;
	}

	/**
	 * The meta description content, or null if not present.
	 */
	get description() {
		return this.#raw.description;
	}

	/**
	 * Whether this page is on an external domain (outside the crawl scope).
	 * The stored flag is coerced to a boolean.
	 */
	get isExternal() {
		return !!this.#raw.isExternal;
	}

	/**
	 * Whether this page was skipped during crawling.
	 * The stored flag is coerced to a boolean.
	 */
	get isSkipped() {
		return !!this.#raw.isSkipped;
	}

	/**
	 * Whether this page was a crawl target (as opposed to being discovered incidentally).
	 * The stored flag is coerced to a boolean.
	 */
	get isTarget() {
		return !!this.#raw.isTarget;
	}

	/**
	 * The reason this page was skipped during crawling, or null if it was not skipped.
	 */
	get skipReason() {
		return this.#raw.skipReason;
	}

	/**
	 * The meta keywords content, or null if not present.
	 */
	get keywords() {
		return this.#raw.keywords;
	}

	/**
	 * The `lang` attribute value from the HTML element, or null if not present.
	 */
	get lang() {
		return this.#raw.lang;
	}

	/**
	 * Whether the noarchive robots directive is set.
	 */
	get noarchive() {
		return !!this.#raw.noarchive;
	}

	/**
	 * Whether the nofollow robots directive is set.
	 */
	get nofollow() {
		return !!this.#raw.nofollow;
	}

	/**
	 * Whether the noindex robots directive is set.
	 */
	get noindex() {
		return !!this.#raw.noindex;
	}

	/**
	 * The Open Graph description (`og:description`), or null if not present.
	 */
	get og_description() {
		return this.#raw.og_description;
	}

	/**
	 * The Open Graph image URL (`og:image`), or null if not present.
	 */
	get og_image() {
		return this.#raw.og_image;
	}

	/**
	 * The Open Graph site name (`og:site_name`), or null if not present.
	 */
	get og_site_name() {
		return this.#raw.og_site_name;
	}

	/**
	 * The Open Graph title (`og:title`), or null if not present.
	 */
	get og_title() {
		return this.#raw.og_title;
	}

	/**
	 * The Open Graph type (`og:type`), or null if not present.
	 */
	get og_type() {
		return this.#raw.og_type;
	}

	/**
	 * The Open Graph URL (`og:url`), or null if not present.
	 */
	get og_url() {
		return this.#raw.og_url;
	}

	/**
	 * The HTTP response headers, parsed from the JSON text stored on the row.
	 *
	 * Returns an empty object when the stored value is not valid JSON or when
	 * it parses to anything other than a plain object (`null`, a primitive or
	 * an array), so callers always receive a usable record.
	 */
	get responseHeaders(): Record<string, string> {
		try {
			const parsed: unknown = JSON.parse(this.#raw.responseHeaders);
			// JSON.parse happily returns null / primitives / arrays; none of them
			// is a header record, so they fall through to the empty default.
			if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
				return parsed as Record<string, string>;
			}
		} catch {
			// Malformed JSON is treated the same as missing headers.
		}
		return {};
	}

	/**
	 * The HTTP response status code, or null if the page has not been fetched.
	 */
	get status() {
		return this.#raw.status;
	}

	/**
	 * The HTTP response status text (e.g., `"OK"`, `"Not Found"`), or null if not fetched.
	 */
	get statusText() {
		return this.#raw.statusText;
	}

	/**
	 * The page title from the `<title>` element.
	 * Falls back to an empty string when the stored title is empty or missing.
	 */
	get title() {
		return this.#raw.title || '';
	}

	/**
	 * The Twitter Card type (`twitter:card`), or null if not present.
	 */
	get twitter_card() {
		return this.#raw.twitter_card;
	}

	/**
	 * The stored URL of this page, parsed with `parseUrl` using this page's
	 * `disableQueries` setting.
	 *
	 * NOTE(review): the result is non-null asserted, i.e. the stored URL is
	 * assumed to always be parseable — confirm against the writer side.
	 */
	get url() {
		return parseUrl(this.#raw.url, {
			disableQueries: this.#disableQueries,
		})!;
	}

	/**
	 * Creates a new Page instance.
	 * @param archive - The ArchiveAccessor used for lazy-loading relationships.
	 * @param raw - The raw database row for this page.
	 * @param rawRedirects - Pre-loaded redirect records; treated as empty when omitted.
	 * @param rawAnchors - Pre-loaded anchor records, or undefined for lazy loading.
	 * @param rawReferrers - Pre-loaded referrer records, or undefined for lazy loading.
	 * @param disableQueries - Forwarded to the URL parser by {@link Page.url}; defaults to `false`.
	 */
	constructor(
		archive: ArchiveAccessor,
		raw: DB_Page,
		rawRedirects?: DB_Redirect[],
		rawAnchors?: DB_Anchor[],
		rawReferrers?: DB_Referrer[],
		disableQueries?: boolean,
	) {
		this.#archive = archive;
		this.#raw = raw;
		this.redirectFrom = (rawRedirects ?? []).map((r) => ({
			url: r.from,
			pageId: r.fromId,
		}));
		this.#rawAnchors = rawAnchors ?? null;
		this.#rawReferrers = rawReferrers ?? null;
		this.#disableQueries = disableQueries ?? false;
	}

	/**
	 * Retrieves the anchors (outgoing links) found on this page.
	 * Uses pre-loaded data if available, otherwise asks the archive accessor.
	 * @returns An array of {@link Anchor} objects representing the links on this page.
	 */
	async getAnchors(): Promise<Anchor[]> {
		if (this.#rawAnchors) {
			return this.#rawAnchors.map((a) => ({
				url: a.url,
				href: a.href,
				isExternal: !!a.isExternal,
				title: a.title,
				status: a.status,
				statusText: a.statusText,
				contentType: a.contentType,
				hash: a.hash,
				textContent: a.textContent,
			}));
		}
		return this.#archive.getAnchorsOnPage(this.#raw.id);
	}

	/**
	 * Reads the HTML snapshot content of this page from the archive.
	 * @returns Whatever the accessor's `getHtmlOfPage` yields for this row's `html` value.
	 */
	async getHtml() {
		return this.#archive.getHtmlOfPage(this.#raw.html);
	}

	/**
	 * Retrieves the referrers (incoming links) pointing to this page.
	 * Uses pre-loaded data if available, otherwise asks the archive accessor.
	 * Pre-loaded records with an empty text content are normalised to `''`.
	 * @returns An array of {@link Referrer} objects representing pages that link to this page.
	 */
	async getReferrers(): Promise<Referrer[]> {
		if (this.#rawReferrers) {
			return this.#rawReferrers.map((r) => ({
				url: r.url,
				through: r.through,
				throughId: r.throughId,
				hash: r.hash,
				textContent: r.textContent || '',
			}));
		}
		return this.#archive.getReferrersOfPage(this.#raw.id);
	}

	/**
	 * Retrieves the referrers of this page straight from the archive accessor.
	 * Unlike {@link getReferrers}, pre-loaded data is never used.
	 *
	 * NOTE(review): this issues the same accessor call as the fallback path of
	 * {@link getReferrers} — confirm that is the intended source for "requests".
	 * @returns An array of {@link Referrer} objects.
	 */
	async getRequests(): Promise<Referrer[]> {
		return this.#archive.getReferrersOfPage(this.#raw.id);
	}

	/**
	 * Checks whether this page is an internal HTML page (not external and has `text/html` content type).
	 * @returns `true` if this is an internal HTML page, `false` otherwise.
	 */
	isInternalPage() {
		return this.isPage() && !this.isExternal;
	}

	/**
	 * Checks whether this entry represents an HTML page.
	 * The comparison ignores letter case and surrounding whitespace.
	 * @returns `true` if the content type is `text/html`, `false` otherwise.
	 */
	isPage() {
		const type = this.contentType || '';
		return type.toLowerCase().trim() === 'text/html';
	}

	/**
	 * Serializes the page data to a plain JSON object,
	 * including resolved anchors and referrers.
	 *
	 * The resolved anchors and referrers are stored under the keys
	 * `getAnchors` and `getReferrers` respectively.
	 * @returns A plain object containing all page metadata and relationships.
	 */
	async toJSON() {
		return {
			url: this.url.href,
			title: this.title,
			status: this.status,
			statusText: this.statusText,
			contentType: this.contentType,
			contentLength: this.contentLength,
			responseHeaders: this.responseHeaders,
			isExternal: this.isExternal,
			isSkipped: this.isSkipped,
			skipReason: this.skipReason,
			isTarget: this.isTarget,
			lang: this.lang,
			description: this.description,
			keywords: this.keywords,
			noindex: this.noindex,
			nofollow: this.nofollow,
			noarchive: this.noarchive,
			canonical: this.canonical,
			alternate: this.alternate,
			twitter_card: this.twitter_card,
			og_site_name: this.og_site_name,
			og_url: this.og_url,
			og_title: this.og_title,
			og_description: this.og_description,
			og_type: this.og_type,
			og_image: this.og_image,
			redirectFrom: this.redirectFrom,
			isPage: this.isPage(),
			isInternalPage: this.isInternalPage(),
			getAnchors: await this.getAnchors(),
			getReferrers: await this.getReferrers(),
		};
	}
}
361
-
/**
 * Unwraps one level of promise: yields the fulfilment type of a
 * promise-like `TValue`, or `TValue` itself when it is not promise-like.
 */
type PromiseType<TValue> = TValue extends PromiseLike<infer TResolved>
	? TResolved
	: TValue;
366
-
/** Signature of the serializer whose resolved result defines the static shape. */
type PageToJSON = Page['toJSON'];

/**
 * The plain, serialized form of a {@link Page}: the value that
 * {@link Page.toJSON} resolves to.
 */
export type StaticPageData = PromiseType<ReturnType<PageToJSON>>;
371
-
/**
 * An incoming link: a page that links to the page in question.
 */
export interface Referrer {
	/** URL of the page that contains the link. */
	url: string;
	/** URL the referral passes through; can differ from the target because of redirects. */
	through: string;
	/** Page ID that corresponds to {@link Referrer.through}. */
	throughId: number;
	/** Fragment (hash) part of the referring link; null when the link has none. */
	hash: string | null;
	/** Text content of the anchor element that makes the referral. */
	textContent: string;
}
387
-
/**
 * An outgoing link: one anchor element discovered on a page.
 */
export interface Anchor {
	/** Destination URL after resolution. */
	url: string;
	/** The anchor's `href` attribute exactly as written. */
	href: string;
	/** True when the destination is on an external domain. */
	isExternal: boolean;
	/** The anchor's `title` attribute; null when absent. */
	title: string | null;
	/** HTTP status code of the destination; null until it has been fetched. */
	status: number | null;
	/** HTTP status text of the destination; null until it has been fetched. */
	statusText: string | null;
	/** Content type of the destination; null until it has been fetched. */
	contentType: string | null;
	/** Fragment (hash) part of the link; null when the link has none. */
	hash: string | null;
	/** Text content of the anchor element; null when it is empty. */
	textContent: string | null;
}
411
-
/**
 * A redirect source: a page whose URL redirects to the page in question.
 */
export interface Redirect {
	/** URL of the page the redirect starts from. */
	url: string;
	/** Database ID of that source page. */
	pageId: number;
}
@@ -1,101 +0,0 @@
1
- import type { DB_Resource } from './types.js';
2
-
3
- import { describe, it, expect, vi } from 'vitest';
4
-
5
- import Resource from './resource.js';
6
-
/**
 * Builds a stand-in for ArchiveAccessor whose `getReferrersOfResource`
 * is a `vi.fn()` stub resolving to an empty array.
 * @param overrides - Properties merged over the defaults; later keys win.
 * @returns The mock archive object.
 */
function createMockArchive(overrides: Record<string, unknown> = {}) {
	const defaults = {
		getReferrersOfResource: vi.fn().mockResolvedValue([]),
	};
	return { ...defaults, ...overrides };
}
18
-
/**
 * Produces a DB_Resource row for tests: a fixed baseline record with the
 * given fields laid over it.
 * @param overrides - Fields that replace the baseline values.
 * @returns The merged DB_Resource object.
 */
function createRawResource(overrides: Partial<DB_Resource> = {}): DB_Resource {
	const baseline: DB_Resource = {
		id: 1,
		url: 'https://example.com/style.css',
		isExternal: 0,
		status: 200,
		statusText: 'OK',
		contentType: 'text/css',
		contentLength: 1024,
		compress: 'gzip',
		cdn: 0,
		responseHeaders: null,
	};
	return { ...baseline, ...overrides };
}
39
-
describe('Resource', () => {
	/**
	 * Builds a Resource on top of a default mock archive.
	 * @param fields - Row fields that differ from the baseline resource.
	 * @returns The Resource under test.
	 */
	const resourceWith = (fields: Partial<DB_Resource> = {}) =>
		new Resource(createMockArchive() as never, createRawResource(fields));

	it('exposes url getter', () => {
		expect(resourceWith().url).toBe('https://example.com/style.css');
	});

	it('exposes status getter', () => {
		expect(resourceWith().status).toBe(200);
	});

	it('exposes statusText getter', () => {
		expect(resourceWith().statusText).toBe('OK');
	});

	it('exposes contentType getter', () => {
		expect(resourceWith().contentType).toBe('text/css');
	});

	it('exposes contentLength getter', () => {
		expect(resourceWith().contentLength).toBe(1024);
	});

	it('returns false for isExternal when flag is 0', () => {
		expect(resourceWith({ isExternal: 0 }).isExternal).toBe(false);
	});

	it('returns true for isExternal when flag is 1', () => {
		expect(resourceWith({ isExternal: 1 }).isExternal).toBe(true);
	});

	it('getReferrers delegates to archive', async () => {
		// The archive is built separately here so the stub can be inspected afterwards.
		const expected = ['https://a.com/', 'https://b.com/'];
		const mockArchive = createMockArchive({
			getReferrersOfResource: vi.fn().mockResolvedValue(expected),
		});
		const resource = new Resource(mockArchive as never, createRawResource({ id: 42 }));

		await expect(resource.getReferrers()).resolves.toEqual([
			'https://a.com/',
			'https://b.com/',
		]);
		expect(mockArchive.getReferrersOfResource).toHaveBeenCalledWith(42);
	});

	it('returns null for status when not fetched', () => {
		expect(resourceWith({ status: null }).status).toBeNull();
	});
});
@@ -1,73 +0,0 @@
1
- import type { ArchiveAccessor } from './archive-accessor.js';
2
- import type { DB_Resource } from './types.js';
3
-
/**
 * A sub-resource (CSS, JS, image, font, etc.) stored in the archive.
 *
 * Wraps one raw resource row and exposes its HTTP metadata through getters;
 * referrer lookups are forwarded to the archive accessor.
 * Instances are created by {@link ArchiveAccessor.getResources}.
 */
export default class Resource {
	#raw: DB_Resource;
	#archive: ArchiveAccessor;

	/**
	 * Creates a new Resource instance.
	 * @param archive - The ArchiveAccessor used for querying referrer data.
	 * @param raw - The raw database row for this resource.
	 */
	constructor(archive: ArchiveAccessor, raw: DB_Resource) {
		this.#raw = raw;
		this.#archive = archive;
	}

	/** The URL of the resource. */
	get url() {
		const { url } = this.#raw;
		return url;
	}

	/** The HTTP response status code, or null if not yet fetched. */
	get status() {
		const { status } = this.#raw;
		return status;
	}

	/** The HTTP response status text (e.g., `"OK"`, `"Not Found"`), or null if not yet fetched. */
	get statusText() {
		const { statusText } = this.#raw;
		return statusText;
	}

	/** The MIME content type of the resource (e.g., `"text/css"`, `"application/javascript"`), or null if unknown. */
	get contentType() {
		const { contentType } = this.#raw;
		return contentType;
	}

	/** The content length of the resource in bytes, or null if unknown. */
	get contentLength() {
		const { contentLength } = this.#raw;
		return contentLength;
	}

	/** Whether this resource is hosted on an external domain (stored flag coerced to a boolean). */
	get isExternal() {
		return Boolean(this.#raw.isExternal);
	}

	/**
	 * Looks up the pages that reference this resource by passing this
	 * resource's row ID to the archive accessor.
	 * @returns Whatever `getReferrersOfResource` resolves to for this ID.
	 */
	async getReferrers() {
		const resourceId = this.#raw.id;
		return this.#archive.getReferrersOfResource(resourceId);
	}
}
@@ -1,44 +0,0 @@
1
- import path from 'node:path';
2
-
3
- import { describe, it, expect } from 'vitest';
4
-
5
- import { safePath } from './safe-path.js';
6
-
describe('safePath', () => {
	// Every case resolves against the same base directory.
	const base = '/tmp/archive';
	const TRAVERSAL = 'Path traversal detected';

	it('resolves a valid path within the base directory', () => {
		const expected = path.resolve(base, 'data', 'file.json');
		expect(safePath(base, 'data', 'file.json')).toBe(expected);
	});

	it('throws on path traversal with ..', () => {
		const attempt = () => safePath(base, '..', 'etc', 'passwd');
		expect(attempt).toThrow(TRAVERSAL);
	});

	it('throws on absolute path that escapes base', () => {
		const attempt = () => safePath(base, '/etc/passwd');
		expect(attempt).toThrow(TRAVERSAL);
	});

	it('throws on deeply nested traversal', () => {
		const attempt = () => safePath(base, 'a', '..', '..', 'secret');
		expect(attempt).toThrow(TRAVERSAL);
	});

	it('allows paths that resolve to the base itself', () => {
		expect(safePath(base, '.')).toBe(path.resolve(base));
	});

	it('allows nested directories within base', () => {
		const segments = ['deep', 'nested', 'dir', 'file.txt'];
		expect(safePath(base, ...segments)).toBe(path.resolve(base, ...segments));
	});
});
@@ -1,18 +0,0 @@
1
- import path from 'node:path';
2
-
/**
 * Resolves path segments against a base directory and rejects any result
 * that lands outside that directory (path traversal guard).
 *
 * Containment is decided from the path *relative* to the base rather than by
 * string-prefix matching, so it also works when the base is the filesystem
 * root (where `base + path.sep` would never be a prefix of a child path).
 * @param base - The base directory that all paths must stay within.
 * @param segments - Path segments to resolve relative to the base.
 * @returns The resolved absolute path (the base itself is allowed).
 * @throws {Error} If the resolved path escapes the base directory.
 */
export function safePath(base: string, ...segments: string[]): string {
	const resolvedBase = path.resolve(base);
	const resolved = path.resolve(base, ...segments);
	const relative = path.relative(resolvedBase, resolved);
	const escapesBase =
		// Climbs out of the base: exactly ".." or a path starting with "../".
		// A child merely *named* "..foo" is still inside and must pass.
		relative === '..' ||
		relative.startsWith(`..${path.sep}`) ||
		// No relative route exists (e.g. a different drive on Windows).
		path.isAbsolute(relative);
	if (escapesBase) {
		throw new Error(`Path traversal detected: ${segments.join('/')}`);
	}
	return resolved;
}