@nitpicker/crawler 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/package.json +5 -2
  2. package/CHANGELOG.md +0 -16
  3. package/src/archive/__mock__/.gitignore +0 -3
  4. package/src/archive/__mock__/mock.sqlite +0 -0
  5. package/src/archive/archive-accessor.ts +0 -337
  6. package/src/archive/archive.ts +0 -408
  7. package/src/archive/database.spec.ts +0 -469
  8. package/src/archive/database.ts +0 -1059
  9. package/src/archive/debug.ts +0 -10
  10. package/src/archive/filesystem/append-text.spec.ts +0 -26
  11. package/src/archive/filesystem/append-text.ts +0 -16
  12. package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
  13. package/src/archive/filesystem/copy-dir-sync.ts +0 -10
  14. package/src/archive/filesystem/copy-dir.spec.ts +0 -33
  15. package/src/archive/filesystem/copy-dir.ts +0 -14
  16. package/src/archive/filesystem/exists.spec.ts +0 -33
  17. package/src/archive/filesystem/exists.ts +0 -10
  18. package/src/archive/filesystem/get-file-list.spec.ts +0 -37
  19. package/src/archive/filesystem/get-file-list.ts +0 -13
  20. package/src/archive/filesystem/index.ts +0 -17
  21. package/src/archive/filesystem/is-dir.spec.ts +0 -29
  22. package/src/archive/filesystem/is-dir.ts +0 -11
  23. package/src/archive/filesystem/mkdir.spec.ts +0 -37
  24. package/src/archive/filesystem/mkdir.ts +0 -16
  25. package/src/archive/filesystem/output-json.spec.ts +0 -34
  26. package/src/archive/filesystem/output-json.ts +0 -16
  27. package/src/archive/filesystem/output-text.spec.ts +0 -31
  28. package/src/archive/filesystem/output-text.ts +0 -35
  29. package/src/archive/filesystem/read-json.spec.ts +0 -26
  30. package/src/archive/filesystem/read-json.ts +0 -12
  31. package/src/archive/filesystem/read-text.spec.ts +0 -25
  32. package/src/archive/filesystem/read-text.ts +0 -11
  33. package/src/archive/filesystem/readline.spec.ts +0 -29
  34. package/src/archive/filesystem/readline.ts +0 -30
  35. package/src/archive/filesystem/remove.spec.ts +0 -34
  36. package/src/archive/filesystem/remove.ts +0 -11
  37. package/src/archive/filesystem/rename.spec.ts +0 -46
  38. package/src/archive/filesystem/rename.ts +0 -21
  39. package/src/archive/filesystem/tar.spec.ts +0 -33
  40. package/src/archive/filesystem/tar.ts +0 -27
  41. package/src/archive/filesystem/untar.spec.ts +0 -34
  42. package/src/archive/filesystem/untar.ts +0 -36
  43. package/src/archive/index.ts +0 -13
  44. package/src/archive/page.spec.ts +0 -368
  45. package/src/archive/page.ts +0 -420
  46. package/src/archive/resource.spec.ts +0 -101
  47. package/src/archive/resource.ts +0 -73
  48. package/src/archive/safe-path.spec.ts +0 -44
  49. package/src/archive/safe-path.ts +0 -18
  50. package/src/archive/types.ts +0 -227
  51. package/src/crawler/clear-destination-cache.spec.ts +0 -20
  52. package/src/crawler/clear-destination-cache.ts +0 -9
  53. package/src/crawler/crawler.ts +0 -873
  54. package/src/crawler/decompose-url.spec.ts +0 -48
  55. package/src/crawler/decompose-url.ts +0 -90
  56. package/src/crawler/destination-cache.spec.ts +0 -23
  57. package/src/crawler/destination-cache.ts +0 -8
  58. package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
  59. package/src/crawler/detect-pagination-pattern.ts +0 -66
  60. package/src/crawler/fetch-destination.ts +0 -257
  61. package/src/crawler/fetch-robots-txt.spec.ts +0 -83
  62. package/src/crawler/fetch-robots-txt.ts +0 -91
  63. package/src/crawler/find-best-matching-scope.spec.ts +0 -39
  64. package/src/crawler/find-best-matching-scope.ts +0 -57
  65. package/src/crawler/generate-predicted-urls.spec.ts +0 -42
  66. package/src/crawler/generate-predicted-urls.ts +0 -34
  67. package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
  68. package/src/crawler/handle-ignore-and-skip.ts +0 -30
  69. package/src/crawler/handle-resource-response.spec.ts +0 -45
  70. package/src/crawler/handle-resource-response.ts +0 -21
  71. package/src/crawler/handle-scrape-end.spec.ts +0 -109
  72. package/src/crawler/handle-scrape-end.ts +0 -115
  73. package/src/crawler/handle-scrape-error.spec.ts +0 -105
  74. package/src/crawler/handle-scrape-error.ts +0 -58
  75. package/src/crawler/index.ts +0 -2
  76. package/src/crawler/inject-scope-auth.spec.ts +0 -36
  77. package/src/crawler/inject-scope-auth.ts +0 -27
  78. package/src/crawler/is-external-url.spec.ts +0 -31
  79. package/src/crawler/is-external-url.ts +0 -17
  80. package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
  81. package/src/crawler/is-in-any-lower-layer.ts +0 -22
  82. package/src/crawler/link-list.spec.ts +0 -355
  83. package/src/crawler/link-list.ts +0 -275
  84. package/src/crawler/link-to-page-data.spec.ts +0 -133
  85. package/src/crawler/link-to-page-data.ts +0 -34
  86. package/src/crawler/net-timeout-error.spec.ts +0 -25
  87. package/src/crawler/net-timeout-error.ts +0 -11
  88. package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
  89. package/src/crawler/protocol-agnostic-key.ts +0 -11
  90. package/src/crawler/reconstruct-url.spec.ts +0 -37
  91. package/src/crawler/reconstruct-url.ts +0 -37
  92. package/src/crawler/robots-checker.spec.ts +0 -104
  93. package/src/crawler/robots-checker.ts +0 -73
  94. package/src/crawler/should-discard-predicted.spec.ts +0 -125
  95. package/src/crawler/should-discard-predicted.ts +0 -33
  96. package/src/crawler/should-skip-url.spec.ts +0 -77
  97. package/src/crawler/should-skip-url.ts +0 -37
  98. package/src/crawler/types.ts +0 -146
  99. package/src/crawler-orchestrator.ts +0 -401
  100. package/src/debug.ts +0 -10
  101. package/src/index.ts +0 -25
  102. package/src/types.ts +0 -30
  103. package/src/utils/array/each-splitted.spec.ts +0 -38
  104. package/src/utils/array/each-splitted.ts +0 -19
  105. package/src/utils/array/index.ts +0 -1
  106. package/src/utils/debug.ts +0 -6
  107. package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
  108. package/src/utils/error/dom-evaluation-error.ts +0 -6
  109. package/src/utils/error/error-emitter.spec.ts +0 -78
  110. package/src/utils/error/error-emitter.ts +0 -44
  111. package/src/utils/error/index.ts +0 -3
  112. package/src/utils/index.ts +0 -5
  113. package/src/utils/object/clean-object.spec.ts +0 -24
  114. package/src/utils/object/clean-object.ts +0 -13
  115. package/src/utils/object/index.ts +0 -1
  116. package/src/utils/types/index.ts +0 -1
  117. package/src/utils/types/types.ts +0 -65
  118. package/tsconfig.json +0 -11
  119. package/tsconfig.tsbuildinfo +0 -1
@@ -1,227 +0,0 @@
1
- import type { ParseURLOptions } from '@d-zero/shared/parse-url';
2
-
3
- /**
4
- * Event map for database-related events emitted by the Database and ArchiveAccessor classes.
5
- */
6
- export interface DatabaseEvent {
7
- /** An error that occurred during a database operation. */
8
- error: Error;
9
- }
10
-
11
- /**
12
- * Configuration stored in the archive database's `info` table.
13
- * Represents all crawling options that were used for the crawl session.
14
- */
15
- export interface Config extends Required<Pick<ParseURLOptions, 'disableQueries'>> {
16
- /** The starting URL for the crawl. */
17
- baseUrl: string;
18
- /** Maximum directory depth for excluded paths. */
19
- maxExcludedDepth: number;
20
- /** URL patterns defining the crawl scope. */
21
- scope: string[];
22
- /** Keywords used to exclude pages from crawling. */
23
- excludeKeywords: string[];
24
- /** URL patterns to exclude from crawling. */
25
- excludes: string[];
26
- /** URL prefixes to exclude from crawling. */
27
- excludeUrls: string[];
28
- /** Whether to fetch external (off-site) pages. */
29
- fetchExternal: boolean;
30
- /** Whether the crawl was initiated from a URL list rather than recursive discovery. */
31
- fromList: boolean;
32
- /** Whether to collect image data during crawling. */
33
- image: boolean;
34
- /** Interval in milliseconds between requests. */
35
- interval: number;
36
- /** The name identifier for this crawl session. */
37
- name: string;
38
- /** Number of parallel crawling processes. */
39
- parallels: number;
40
- /** Whether to recursively follow links. */
41
- recursive: boolean;
42
- /** Maximum number of retry attempts per URL on scrape failure. */
43
- retry: number;
44
- /** The version of Nitpicker that created this archive. */
45
- version: string;
46
-
47
- /** User-Agent string used for HTTP requests. */
48
- userAgent: string;
49
-
50
- /** Whether robots.txt restrictions were ignored during crawling. */
51
- ignoreRobots: boolean;
52
- }
53
-
54
- /**
55
- * Filter type for querying pages from the database.
56
- *
57
- * - `'page'` - HTML pages that are crawl targets
58
- * - `'page-included-no-target'` - All HTML pages, including non-target pages
59
- * - `'external-page'` - HTML pages on external domains
60
- * - `'internal-page'` - HTML pages on the crawled domain
61
- * - `'no-page'` - Non-HTML resources (e.g., images, PDFs)
62
- * - `'external-no-page'` - External non-HTML resources
63
- * - `'internal-no-page'` - Internal non-HTML resources
64
- */
65
- export type PageFilter =
66
- | 'page'
67
- | 'page-included-no-target'
68
- | 'external-page'
69
- | 'internal-page'
70
- | 'no-page'
71
- | 'external-no-page'
72
- | 'internal-no-page';
73
-
74
- /**
75
- * Raw database row representing a crawled page in the `pages` table.
76
- */
77
- export interface DB_Page {
78
- /** Auto-incremented primary key. */
79
- id: number;
80
- /** The canonical URL of the page. */
81
- url: string;
82
- /** Foreign key to the redirect destination page, or null if not redirected. */
83
- redirectDestId: number | null;
84
- /** Whether the page has been scraped (1) or is still pending (0). */
85
- scraped: 0 | 1;
86
- /** Whether the page is a crawl target (1) or discovered incidentally (0). */
87
- isTarget: 0 | 1;
88
- /** Whether the page is on an external domain (1) or internal (0). */
89
- isExternal: 0 | 1;
90
- /** HTTP response status code, or null if not yet fetched. */
91
- status: number | null;
92
- /** HTTP response status text (e.g., "OK", "Not Found"), or null if not yet fetched. */
93
- statusText: string | null;
94
- /** MIME content type of the response (e.g., "text/html"), or null if unknown. */
95
- contentType: string | null;
96
- /** Content length in bytes, or null if unknown. */
97
- contentLength: number | null;
98
- /** JSON-serialized HTTP response headers. */
99
- responseHeaders: string;
100
- /** The `lang` attribute value from the HTML element, or null if not present. */
101
- lang: string | null;
102
- /** The page title from the `<title>` element, or null if not present. */
103
- title: string | null;
104
- /** The meta description content, or null if not present. */
105
- description: string | null;
106
- /** The meta keywords content, or null if not present. */
107
- keywords: string | null;
108
- /** Whether the noindex robots directive is set (SQLite INTEGER 0/1). */
109
- noindex: number | null;
110
- /** Whether the nofollow robots directive is set (SQLite INTEGER 0/1). */
111
- nofollow: number | null;
112
- /** Whether the noarchive robots directive is set (SQLite INTEGER 0/1). */
113
- noarchive: number | null;
114
- /** The canonical URL from `<link rel="canonical">`, or null if not present. */
115
- canonical: string | null;
116
- /** The alternate URL from `<link rel="alternate">`, or null if not present. */
117
- alternate: string | null;
118
- /** The Open Graph type (`og:type`), or null if not present. */
119
- og_type: string | null;
120
- /** The Open Graph title (`og:title`), or null if not present. */
121
- og_title: string | null;
122
- /** The Open Graph site name (`og:site_name`), or null if not present. */
123
- og_site_name: string | null;
124
- /** The Open Graph description (`og:description`), or null if not present. */
125
- og_description: string | null;
126
- /** The Open Graph URL (`og:url`), or null if not present. */
127
- og_url: string | null;
128
- /** The Open Graph image URL (`og:image`), or null if not present. */
129
- og_image: string | null;
130
- /** The Twitter Card type (`twitter:card`), or null if not present. */
131
- twitter_card: string | null;
132
- /** JSON-serialized network logs captured during scraping, or null if not collected. */
133
- networkLogs: string | null;
134
- /** Relative file path to the saved HTML snapshot, or null if not saved. */
135
- html: string | null;
136
- /** Whether the page was skipped during crawling (1) or processed normally (0). */
137
- isSkipped: 0 | 1;
138
- /** The reason the page was skipped, or null if it was not skipped. */
139
- skipReason: string | null;
140
- /** The natural URL sort order index, or null if not yet assigned. */
141
- order: number | null;
142
- }
143
-
144
- /**
145
- * Raw database row representing a redirect relationship.
146
- * Maps a source page to its redirect destination.
147
- */
148
- export interface DB_Redirect {
149
- /** The ID of the destination page after redirect. */
150
- pageId: number;
151
- /** The URL that was redirected from. */
152
- from: string;
153
- /** The page ID of the source URL that was redirected. */
154
- fromId: number;
155
- }
156
-
157
- /**
158
- * Raw database row representing an anchor (link) found on a page.
159
- * Combines data from the `anchors` table and the linked `pages` table.
160
- */
161
- export interface DB_Anchor {
162
- /** The ID of the page that contains this anchor. */
163
- pageId: number;
164
- /** The resolved destination URL of the anchor. */
165
- url: string;
166
- /** The original href attribute value of the anchor element. */
167
- href: string;
168
- /** Whether the anchor points to an external domain (1) or internal (0). */
169
- isExternal: 0 | 1;
170
- /** The title attribute of the anchor element, or null if not present. */
171
- title: string | null;
172
- /** The HTTP status code of the linked page, or null if not yet fetched. */
173
- status: number | null;
174
- /** The HTTP status text of the linked page, or null if not yet fetched. */
175
- statusText: string | null;
176
- /** The content type of the linked page, or null if not yet fetched. */
177
- contentType: string | null;
178
- /** The URL fragment (hash) portion of the link, or null if not present. */
179
- hash: string | null;
180
- /** The text content of the anchor element, or null if empty. */
181
- textContent: string | null;
182
- }
183
-
184
- /**
185
- * Raw database row representing a referrer relationship.
186
- * Indicates which page links to which other page, potentially through redirects.
187
- */
188
- export interface DB_Referrer {
189
- /** The ID of the page being referred to. */
190
- pageId: number;
191
- /** The URL of the referring page. */
192
- url: string;
193
- /** The URL through which the referral passes (may differ from url due to redirects). */
194
- through: string;
195
- /** The page ID of the through URL. */
196
- throughId: number;
197
- /** The URL fragment (hash) of the referring link, or null if not present. */
198
- hash: string | null;
199
- /** The text content of the referring anchor element, or null if empty. */
200
- textContent: string | null;
201
- }
202
-
203
- /**
204
- * Raw database row representing a sub-resource (CSS, JS, image, etc.) in the `resources` table.
205
- */
206
- export interface DB_Resource {
207
- /** Auto-incremented primary key. */
208
- id: number;
209
- /** The URL of the resource. */
210
- url: string;
211
- /** Whether the resource is hosted on an external domain (1) or internal (0). */
212
- isExternal: 0 | 1;
213
- /** HTTP response status code, or null if not yet fetched. */
214
- status: number | null;
215
- /** HTTP response status text, or null if not yet fetched. */
216
- statusText: string | null;
217
- /** MIME content type of the resource, or null if unknown. */
218
- contentType: string | null;
219
- /** Content length in bytes, or null if unknown. */
220
- contentLength: number | null;
221
- /** Compression encoding (e.g., "gzip", "br"), or 0 if not compressed. */
222
- compress: string | 0;
223
- /** CDN provider identifier, or 0 if not served from a CDN. */
224
- cdn: string | 0;
225
- /** JSON-serialized HTTP response headers, or null if not available. */
226
- responseHeaders: string | null;
227
- }
@@ -1,20 +0,0 @@
1
- import { describe, it, expect } from 'vitest';
2
-
3
- import { clearDestinationCache } from './clear-destination-cache.js';
4
- import { destinationCache } from './destination-cache.js';
5
-
6
- describe('clearDestinationCache', () => {
7
- it('clears all entries from the destination cache', () => {
8
- destinationCache.set('https://example.com/', new Error('test'));
9
- expect(destinationCache.size).toBe(1);
10
-
11
- clearDestinationCache();
12
-
13
- expect(destinationCache.size).toBe(0);
14
- });
15
-
16
- it('does not throw when cache is already empty', () => {
17
- destinationCache.clear();
18
- expect(() => clearDestinationCache()).not.toThrow();
19
- });
20
- });
@@ -1,9 +0,0 @@
1
- import { destinationCache } from './destination-cache.js';
2
-
3
- /**
4
- * Clears the in-memory cache of HTTP request results.
5
- * Should be called between crawl sessions to prevent memory leaks.
6
- */
7
- export function clearDestinationCache() {
8
- destinationCache.clear();
9
- }