@nitpicker/crawler 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/CHANGELOG.md +0 -16
- package/src/archive/__mock__/.gitignore +0 -3
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +0 -337
- package/src/archive/archive.ts +0 -408
- package/src/archive/database.spec.ts +0 -469
- package/src/archive/database.ts +0 -1059
- package/src/archive/debug.ts +0 -10
- package/src/archive/filesystem/append-text.spec.ts +0 -26
- package/src/archive/filesystem/append-text.ts +0 -16
- package/src/archive/filesystem/copy-dir-sync.spec.ts +0 -27
- package/src/archive/filesystem/copy-dir-sync.ts +0 -10
- package/src/archive/filesystem/copy-dir.spec.ts +0 -33
- package/src/archive/filesystem/copy-dir.ts +0 -14
- package/src/archive/filesystem/exists.spec.ts +0 -33
- package/src/archive/filesystem/exists.ts +0 -10
- package/src/archive/filesystem/get-file-list.spec.ts +0 -37
- package/src/archive/filesystem/get-file-list.ts +0 -13
- package/src/archive/filesystem/index.ts +0 -17
- package/src/archive/filesystem/is-dir.spec.ts +0 -29
- package/src/archive/filesystem/is-dir.ts +0 -11
- package/src/archive/filesystem/mkdir.spec.ts +0 -37
- package/src/archive/filesystem/mkdir.ts +0 -16
- package/src/archive/filesystem/output-json.spec.ts +0 -34
- package/src/archive/filesystem/output-json.ts +0 -16
- package/src/archive/filesystem/output-text.spec.ts +0 -31
- package/src/archive/filesystem/output-text.ts +0 -35
- package/src/archive/filesystem/read-json.spec.ts +0 -26
- package/src/archive/filesystem/read-json.ts +0 -12
- package/src/archive/filesystem/read-text.spec.ts +0 -25
- package/src/archive/filesystem/read-text.ts +0 -11
- package/src/archive/filesystem/readline.spec.ts +0 -29
- package/src/archive/filesystem/readline.ts +0 -30
- package/src/archive/filesystem/remove.spec.ts +0 -34
- package/src/archive/filesystem/remove.ts +0 -11
- package/src/archive/filesystem/rename.spec.ts +0 -46
- package/src/archive/filesystem/rename.ts +0 -21
- package/src/archive/filesystem/tar.spec.ts +0 -33
- package/src/archive/filesystem/tar.ts +0 -27
- package/src/archive/filesystem/untar.spec.ts +0 -34
- package/src/archive/filesystem/untar.ts +0 -36
- package/src/archive/index.ts +0 -13
- package/src/archive/page.spec.ts +0 -368
- package/src/archive/page.ts +0 -420
- package/src/archive/resource.spec.ts +0 -101
- package/src/archive/resource.ts +0 -73
- package/src/archive/safe-path.spec.ts +0 -44
- package/src/archive/safe-path.ts +0 -18
- package/src/archive/types.ts +0 -227
- package/src/crawler/clear-destination-cache.spec.ts +0 -20
- package/src/crawler/clear-destination-cache.ts +0 -9
- package/src/crawler/crawler.ts +0 -873
- package/src/crawler/decompose-url.spec.ts +0 -48
- package/src/crawler/decompose-url.ts +0 -90
- package/src/crawler/destination-cache.spec.ts +0 -23
- package/src/crawler/destination-cache.ts +0 -8
- package/src/crawler/detect-pagination-pattern.spec.ts +0 -169
- package/src/crawler/detect-pagination-pattern.ts +0 -66
- package/src/crawler/fetch-destination.ts +0 -257
- package/src/crawler/fetch-robots-txt.spec.ts +0 -83
- package/src/crawler/fetch-robots-txt.ts +0 -91
- package/src/crawler/find-best-matching-scope.spec.ts +0 -39
- package/src/crawler/find-best-matching-scope.ts +0 -57
- package/src/crawler/generate-predicted-urls.spec.ts +0 -42
- package/src/crawler/generate-predicted-urls.ts +0 -34
- package/src/crawler/handle-ignore-and-skip.spec.ts +0 -66
- package/src/crawler/handle-ignore-and-skip.ts +0 -30
- package/src/crawler/handle-resource-response.spec.ts +0 -45
- package/src/crawler/handle-resource-response.ts +0 -21
- package/src/crawler/handle-scrape-end.spec.ts +0 -109
- package/src/crawler/handle-scrape-end.ts +0 -115
- package/src/crawler/handle-scrape-error.spec.ts +0 -105
- package/src/crawler/handle-scrape-error.ts +0 -58
- package/src/crawler/index.ts +0 -2
- package/src/crawler/inject-scope-auth.spec.ts +0 -36
- package/src/crawler/inject-scope-auth.ts +0 -27
- package/src/crawler/is-external-url.spec.ts +0 -31
- package/src/crawler/is-external-url.ts +0 -17
- package/src/crawler/is-in-any-lower-layer.spec.ts +0 -31
- package/src/crawler/is-in-any-lower-layer.ts +0 -22
- package/src/crawler/link-list.spec.ts +0 -355
- package/src/crawler/link-list.ts +0 -275
- package/src/crawler/link-to-page-data.spec.ts +0 -133
- package/src/crawler/link-to-page-data.ts +0 -34
- package/src/crawler/net-timeout-error.spec.ts +0 -25
- package/src/crawler/net-timeout-error.ts +0 -11
- package/src/crawler/protocol-agnostic-key.spec.ts +0 -40
- package/src/crawler/protocol-agnostic-key.ts +0 -11
- package/src/crawler/reconstruct-url.spec.ts +0 -37
- package/src/crawler/reconstruct-url.ts +0 -37
- package/src/crawler/robots-checker.spec.ts +0 -104
- package/src/crawler/robots-checker.ts +0 -73
- package/src/crawler/should-discard-predicted.spec.ts +0 -125
- package/src/crawler/should-discard-predicted.ts +0 -33
- package/src/crawler/should-skip-url.spec.ts +0 -77
- package/src/crawler/should-skip-url.ts +0 -37
- package/src/crawler/types.ts +0 -146
- package/src/crawler-orchestrator.ts +0 -401
- package/src/debug.ts +0 -10
- package/src/index.ts +0 -25
- package/src/types.ts +0 -30
- package/src/utils/array/each-splitted.spec.ts +0 -38
- package/src/utils/array/each-splitted.ts +0 -19
- package/src/utils/array/index.ts +0 -1
- package/src/utils/debug.ts +0 -6
- package/src/utils/error/dom-evaluation-error.spec.ts +0 -20
- package/src/utils/error/dom-evaluation-error.ts +0 -6
- package/src/utils/error/error-emitter.spec.ts +0 -78
- package/src/utils/error/error-emitter.ts +0 -44
- package/src/utils/error/index.ts +0 -3
- package/src/utils/index.ts +0 -5
- package/src/utils/object/clean-object.spec.ts +0 -24
- package/src/utils/object/clean-object.ts +0 -13
- package/src/utils/object/index.ts +0 -1
- package/src/utils/types/index.ts +0 -1
- package/src/utils/types/types.ts +0 -65
- package/tsconfig.json +0 -11
- package/tsconfig.tsbuildinfo +0 -1
package/src/archive/database.ts
DELETED
|
@@ -1,1059 +0,0 @@
|
|
|
1
|
-
import type {
|
|
2
|
-
Config,
|
|
3
|
-
DB_Anchor,
|
|
4
|
-
DB_Page,
|
|
5
|
-
DB_Redirect,
|
|
6
|
-
DB_Referrer,
|
|
7
|
-
DB_Resource,
|
|
8
|
-
DatabaseEvent,
|
|
9
|
-
PageFilter,
|
|
10
|
-
} from './types.js';
|
|
11
|
-
import type { PageData, Resource } from '../utils/index.js';
|
|
12
|
-
import type { RetryDecoratorOptions } from '@d-zero/shared/retry';
|
|
13
|
-
import type { Knex } from 'knex';
|
|
14
|
-
|
|
15
|
-
import path from 'node:path';
|
|
16
|
-
|
|
17
|
-
import { tryParseUrl as parseUrl } from '@d-zero/shared/parse-url';
|
|
18
|
-
import { retry } from '@d-zero/shared/retry';
|
|
19
|
-
import { pathComparator } from '@d-zero/shared/sort/path';
|
|
20
|
-
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
21
|
-
import knex from 'knex';
|
|
22
|
-
|
|
23
|
-
import { ErrorEmitter, eachSplitted } from '../utils/index.js';
|
|
24
|
-
|
|
25
|
-
import { dbLog } from './debug.js';
|
|
26
|
-
import { mkdir } from './filesystem/index.js';
|
|
27
|
-
|
|
28
|
-
const retrySetting: RetryDecoratorOptions = {
|
|
29
|
-
interval: 300,
|
|
30
|
-
retries: 3,
|
|
31
|
-
};
|
|
32
|
-
|
|
33
|
-
/**
|
|
34
|
-
* Low-level database abstraction layer for the archive's SQLite database.
|
|
35
|
-
*
|
|
36
|
-
* Manages the `pages`, `anchors`, `images`, `resources`, and `resources-referrers`
|
|
37
|
-
* tables. All public methods that perform database queries use the `@retryable`
|
|
38
|
-
* decorator for automatic retry on transient failures, and `@ErrorEmitter` to
|
|
39
|
-
* propagate errors as events.
|
|
40
|
-
*
|
|
41
|
-
* Use the static {@link Database.connect} factory method to create instances.
|
|
42
|
-
* The constructor is private.
|
|
43
|
-
*/
|
|
44
|
-
export class Database extends EventEmitter<DatabaseEvent> {
|
|
45
|
-
/** The Knex query builder instance connected to the SQLite database. */
|
|
46
|
-
#instance: Knex;
|
|
47
|
-
/** Absolute path to the working directory, used for resolving relative snapshot paths. */
|
|
48
|
-
#workingDir: string;
|
|
49
|
-
// eslint-disable-next-line no-restricted-syntax
|
|
50
|
-
private constructor(options: DatabaseOption) {
|
|
51
|
-
super();
|
|
52
|
-
this.#workingDir = options.workingDir;
|
|
53
|
-
switch (options.type) {
|
|
54
|
-
case 'sqlite3': {
|
|
55
|
-
this.#instance = knex({
|
|
56
|
-
client: options.type,
|
|
57
|
-
connection: {
|
|
58
|
-
filename: options.filename,
|
|
59
|
-
},
|
|
60
|
-
useNullAsDefault: true,
|
|
61
|
-
pool: {
|
|
62
|
-
acquireTimeoutMillis: 600_000,
|
|
63
|
-
},
|
|
64
|
-
});
|
|
65
|
-
break;
|
|
66
|
-
}
|
|
67
|
-
case 'mysql': {
|
|
68
|
-
throw new Error("Don't support MySQL yet.");
|
|
69
|
-
}
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
/**
|
|
74
|
-
* Adds the `order` column to the `pages` table for URL sort ordering.
|
|
75
|
-
* @deprecated Since v0.1.x. The column is now created during table initialization.
|
|
76
|
-
* @returns The result of the schema alteration.
|
|
77
|
-
*/
|
|
78
|
-
async addOrderField() {
|
|
79
|
-
return await this.#instance.schema.table('pages', (t) => {
|
|
80
|
-
t.integer('order').unsigned().nullable().defaultTo(null);
|
|
81
|
-
});
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
/**
|
|
85
|
-
* Forces a WAL checkpoint, writing all pending WAL data back to the main database file.
|
|
86
|
-
* Uses TRUNCATE mode to reset the WAL file to zero bytes after checkpointing.
|
|
87
|
-
* This ensures the database is fully self-contained in `db.sqlite` before archiving.
|
|
88
|
-
*/
|
|
89
|
-
async checkpoint() {
|
|
90
|
-
await this.#instance.raw('PRAGMA wal_checkpoint(TRUNCATE)');
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
async destroy() {
|
|
94
|
-
await this.#instance.destroy();
|
|
95
|
-
}
|
|
96
|
-
/**
|
|
97
|
-
* Retrieves all anchors (outgoing links) on a specific page.
|
|
98
|
-
* Joins the `anchors` table with the `pages` table to resolve link destinations.
|
|
99
|
-
* @param pageId - The database ID of the page whose anchors to retrieve.
|
|
100
|
-
* @returns An array of anchor records with resolved URL, title, status, and content type.
|
|
101
|
-
*/
|
|
102
|
-
@ErrorEmitter()
|
|
103
|
-
@retry(retrySetting)
|
|
104
|
-
async getAnchorsOnPage(pageId: number) {
|
|
105
|
-
const res = await this.#instance
|
|
106
|
-
.select(
|
|
107
|
-
'pages.url',
|
|
108
|
-
'pages.title',
|
|
109
|
-
'pages.status',
|
|
110
|
-
'pages.statusText',
|
|
111
|
-
'pages.contentType',
|
|
112
|
-
'anchors.hash',
|
|
113
|
-
'anchors.textContent',
|
|
114
|
-
)
|
|
115
|
-
.from('anchors')
|
|
116
|
-
.join('pages', 'anchors.hrefId', '=', 'pages.id')
|
|
117
|
-
.where('anchors.pageId', pageId);
|
|
118
|
-
return res;
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
/**
|
|
122
|
-
* Retrieves the base URL of the crawl session from the `info` table.
|
|
123
|
-
* @returns The base URL string.
|
|
124
|
-
* @throws {Error} If no base URL is found in the database.
|
|
125
|
-
*/
|
|
126
|
-
@ErrorEmitter()
|
|
127
|
-
@retry(retrySetting)
|
|
128
|
-
async getBaseUrl() {
|
|
129
|
-
const selected = await this.#instance.select('baseUrl').from<Config>('info');
|
|
130
|
-
if (!selected[0]) {
|
|
131
|
-
throw new Error('No baseUrl');
|
|
132
|
-
}
|
|
133
|
-
const [{ baseUrl }] = selected;
|
|
134
|
-
return baseUrl || '';
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
/**
|
|
138
|
-
* Retrieves the full crawl configuration from the `info` table.
|
|
139
|
-
* Deserializes JSON-encoded fields (`excludes`, `excludeKeywords`, `scope`).
|
|
140
|
-
* @returns The parsed {@link Config} object.
|
|
141
|
-
* @throws {Error} If no configuration is found in the database.
|
|
142
|
-
*/
|
|
143
|
-
@ErrorEmitter()
|
|
144
|
-
@retry(retrySetting)
|
|
145
|
-
async getConfig() {
|
|
146
|
-
const [config] = await this.#instance.select('*').from<Config>('info');
|
|
147
|
-
if (!config) {
|
|
148
|
-
throw new Error('No config');
|
|
149
|
-
}
|
|
150
|
-
const opt: Config = {
|
|
151
|
-
...config,
|
|
152
|
-
excludes: getJSON<string[]>(config.excludes, []),
|
|
153
|
-
excludeKeywords: getJSON<string[]>(config.excludeKeywords, []),
|
|
154
|
-
excludeUrls: getJSON<string[]>(config.excludeUrls, []),
|
|
155
|
-
scope: getJSON<string[]>(config.scope, []),
|
|
156
|
-
retry: config.retry ?? 3,
|
|
157
|
-
};
|
|
158
|
-
// @ts-expect-error
|
|
159
|
-
delete opt.id;
|
|
160
|
-
dbLog('Table `info`: %O => %O', config, opt);
|
|
161
|
-
return opt;
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
/**
|
|
165
|
-
* Retrieves the current crawling state by listing scraped and pending URLs.
|
|
166
|
-
* @returns An object with `scraped` (completed URLs) and `pending` (remaining URLs) arrays.
|
|
167
|
-
*/
|
|
168
|
-
@ErrorEmitter()
|
|
169
|
-
@retry(retrySetting)
|
|
170
|
-
async getCrawlingState() {
|
|
171
|
-
const ex = (r: { url: string }) => r.url;
|
|
172
|
-
const $scraped = await this.#instance
|
|
173
|
-
.select('url')
|
|
174
|
-
.from<DB_Page>('pages')
|
|
175
|
-
.where('scraped', 1);
|
|
176
|
-
const scraped = $scraped.map(ex);
|
|
177
|
-
const $pending = await this.#instance
|
|
178
|
-
.select('url')
|
|
179
|
-
.from<DB_Page>('pages')
|
|
180
|
-
.where('scraped', 0);
|
|
181
|
-
const pending = $pending.map(ex);
|
|
182
|
-
return {
|
|
183
|
-
scraped,
|
|
184
|
-
pending,
|
|
185
|
-
};
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
/**
|
|
189
|
-
* Retrieves the HTML snapshot file path for a specific page.
|
|
190
|
-
* @param pageId - The database ID of the page.
|
|
191
|
-
* @returns The relative file path to the HTML snapshot, or null if not saved.
|
|
192
|
-
*/
|
|
193
|
-
@ErrorEmitter()
|
|
194
|
-
@retry(retrySetting)
|
|
195
|
-
async getHtmlPathOnPage(pageId: number) {
|
|
196
|
-
return await this.#instance.transaction(async (trx) => {
|
|
197
|
-
const [{ html }] = await trx
|
|
198
|
-
.select('html')
|
|
199
|
-
.from<DB_Page>('pages')
|
|
200
|
-
.where('id', pageId);
|
|
201
|
-
return html || null;
|
|
202
|
-
});
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
/**
|
|
206
|
-
* Retrieves the crawl session name from the `info` table.
|
|
207
|
-
* @returns The name string.
|
|
208
|
-
* @throws {Error} If no name is found in the database.
|
|
209
|
-
*/
|
|
210
|
-
@ErrorEmitter()
|
|
211
|
-
@retry(retrySetting)
|
|
212
|
-
async getName() {
|
|
213
|
-
const selected = await this.#instance.select('name').from<Config>('info');
|
|
214
|
-
if (!selected[0]) {
|
|
215
|
-
throw new Error('No name');
|
|
216
|
-
}
|
|
217
|
-
const [{ name }] = selected;
|
|
218
|
-
return name;
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
/**
|
|
222
|
-
* Counts the total number of pages in the database.
|
|
223
|
-
* @returns The total page count.
|
|
224
|
-
* @throws {Error} If the count query fails.
|
|
225
|
-
*/
|
|
226
|
-
@ErrorEmitter()
|
|
227
|
-
@retry(retrySetting)
|
|
228
|
-
async getPageCount() {
|
|
229
|
-
const selected = await this.#instance.count('id').from<DB_Page>('pages');
|
|
230
|
-
if (!selected[0]) {
|
|
231
|
-
throw new Error('No count');
|
|
232
|
-
}
|
|
233
|
-
// @ts-expect-error
|
|
234
|
-
const count: number = selected[0]['count(`id`)'];
|
|
235
|
-
dbLog('Number of pages: %d', count);
|
|
236
|
-
return count;
|
|
237
|
-
}
|
|
238
|
-
|
|
239
|
-
/**
|
|
240
|
-
* Retrieves pages from the database with optional filtering, pagination via offset and limit.
|
|
241
|
-
* @param filter - An optional {@link PageFilter} to narrow results by content type and origin.
|
|
242
|
-
* @param offset - The number of rows to skip. Defaults to `0`.
|
|
243
|
-
* @param limit - The maximum number of rows to return. Defaults to `100000`.
|
|
244
|
-
* @returns An array of raw {@link DB_Page} rows.
|
|
245
|
-
*/
|
|
246
|
-
@ErrorEmitter()
|
|
247
|
-
@retry(retrySetting)
|
|
248
|
-
async getPages(filter?: PageFilter, offset = 0, limit = 100_000) {
|
|
249
|
-
const q = this.#instance.select('*').from<DB_Page>('pages');
|
|
250
|
-
switch (filter) {
|
|
251
|
-
case 'page': {
|
|
252
|
-
return q
|
|
253
|
-
.where({
|
|
254
|
-
contentType: 'text/html',
|
|
255
|
-
isTarget: 1,
|
|
256
|
-
})
|
|
257
|
-
.limit(limit)
|
|
258
|
-
.offset(offset);
|
|
259
|
-
}
|
|
260
|
-
case 'page-included-no-target': {
|
|
261
|
-
return q
|
|
262
|
-
.where({
|
|
263
|
-
contentType: 'text/html',
|
|
264
|
-
})
|
|
265
|
-
.limit(limit)
|
|
266
|
-
.offset(offset);
|
|
267
|
-
}
|
|
268
|
-
case 'external-page': {
|
|
269
|
-
return q
|
|
270
|
-
.where({
|
|
271
|
-
contentType: 'text/html',
|
|
272
|
-
isExternal: 1,
|
|
273
|
-
})
|
|
274
|
-
.limit(limit)
|
|
275
|
-
.offset(offset);
|
|
276
|
-
}
|
|
277
|
-
case 'internal-page': {
|
|
278
|
-
return q
|
|
279
|
-
.where({
|
|
280
|
-
contentType: 'text/html',
|
|
281
|
-
isExternal: 0,
|
|
282
|
-
})
|
|
283
|
-
.limit(limit)
|
|
284
|
-
.offset(offset);
|
|
285
|
-
}
|
|
286
|
-
case 'no-page': {
|
|
287
|
-
return q
|
|
288
|
-
.whereNull('contentType')
|
|
289
|
-
.orWhereNot({
|
|
290
|
-
contentType: 'text/html',
|
|
291
|
-
})
|
|
292
|
-
.limit(limit)
|
|
293
|
-
.offset(offset);
|
|
294
|
-
}
|
|
295
|
-
case 'external-no-page': {
|
|
296
|
-
return q
|
|
297
|
-
.where((qb) => {
|
|
298
|
-
qb.whereNull('contentType').orWhereNot({
|
|
299
|
-
contentType: 'text/html',
|
|
300
|
-
});
|
|
301
|
-
})
|
|
302
|
-
.andWhere({
|
|
303
|
-
isExternal: 1,
|
|
304
|
-
})
|
|
305
|
-
.limit(limit)
|
|
306
|
-
.offset(offset);
|
|
307
|
-
}
|
|
308
|
-
case 'internal-no-page': {
|
|
309
|
-
return q
|
|
310
|
-
.where((qb) => {
|
|
311
|
-
qb.whereNull('contentType').orWhereNot({
|
|
312
|
-
contentType: 'text/html',
|
|
313
|
-
});
|
|
314
|
-
})
|
|
315
|
-
.andWhere({
|
|
316
|
-
isExternal: 0,
|
|
317
|
-
})
|
|
318
|
-
.limit(limit)
|
|
319
|
-
.offset(offset);
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
|
-
return q.limit(limit).offset(offset);
|
|
323
|
-
}
|
|
324
|
-
|
|
325
|
-
/**
|
|
326
|
-
* Retrieves pages along with their related redirect, anchor, and referrer data.
|
|
327
|
-
* Results are ordered by the natural URL sort order. Only non-redirected pages are returned.
|
|
328
|
-
* @param offset - The number of rows to skip.
|
|
329
|
-
* @param limit - The maximum number of pages to return.
|
|
330
|
-
* @returns An object containing `pages`, `redirects`, `anchors`, and `referrers` arrays.
|
|
331
|
-
*/
|
|
332
|
-
@ErrorEmitter()
|
|
333
|
-
@retry(retrySetting)
|
|
334
|
-
async getPagesWithRels(offset: number, limit: number) {
|
|
335
|
-
await this.addOrderField().catch((error) => error);
|
|
336
|
-
await this.setUrlOrder();
|
|
337
|
-
dbLog('Get Pages');
|
|
338
|
-
const pages = await this.#instance
|
|
339
|
-
.select('*')
|
|
340
|
-
.from<DB_Page>('pages')
|
|
341
|
-
.orderByRaw('`order` ASC NULLS LAST')
|
|
342
|
-
.whereNull('redirectDestId')
|
|
343
|
-
.limit(limit)
|
|
344
|
-
.offset(offset);
|
|
345
|
-
|
|
346
|
-
// When empty
|
|
347
|
-
if (pages.length === 0) {
|
|
348
|
-
return {
|
|
349
|
-
pages: [],
|
|
350
|
-
redirects: [],
|
|
351
|
-
referrers: [],
|
|
352
|
-
anchors: [],
|
|
353
|
-
};
|
|
354
|
-
}
|
|
355
|
-
|
|
356
|
-
dbLog('Get Pages: Redirects');
|
|
357
|
-
const redirects: DB_Redirect[] = await this.#instance
|
|
358
|
-
.with('limitedPages', limitedPageIds(limit, offset))
|
|
359
|
-
.with('redirect', redirectTable(false))
|
|
360
|
-
.select('id as pageId', 'from', 'fromId')
|
|
361
|
-
.from('redirect')
|
|
362
|
-
// Filter
|
|
363
|
-
.join('limitedPages', 'redirect.toId', '=', 'limitedPages.id')
|
|
364
|
-
// Sort
|
|
365
|
-
.orderBy('id', 'asc');
|
|
366
|
-
|
|
367
|
-
dbLog('Get Pages: Anchors');
|
|
368
|
-
const anchors: DB_Anchor[] = await this.#instance
|
|
369
|
-
.with('limitedPages', limitedPageIds(limit, offset))
|
|
370
|
-
.with('redirect', redirectTable())
|
|
371
|
-
.select(
|
|
372
|
-
'limitedPages.id as pageId',
|
|
373
|
-
'href.url',
|
|
374
|
-
'redirect.from as href',
|
|
375
|
-
'href.isExternal',
|
|
376
|
-
'href.title',
|
|
377
|
-
'href.status',
|
|
378
|
-
'href.statusText',
|
|
379
|
-
'href.contentType',
|
|
380
|
-
'anchors.hash',
|
|
381
|
-
'anchors.textContent',
|
|
382
|
-
)
|
|
383
|
-
.from('anchors')
|
|
384
|
-
// Filters
|
|
385
|
-
.join('limitedPages', 'anchors.pageId', '=', 'limitedPages.id')
|
|
386
|
-
// Resolves redirect
|
|
387
|
-
.join('redirect', 'anchors.hrefId', '=', 'redirect.fromId')
|
|
388
|
-
// Target
|
|
389
|
-
.join('pages as href', 'redirect.toId', '=', 'href.id')
|
|
390
|
-
// Sort
|
|
391
|
-
.orderBy('anchors.id', 'asc');
|
|
392
|
-
|
|
393
|
-
dbLog('Get Pages: Referrers');
|
|
394
|
-
const referrers: DB_Referrer[] = await this.#instance
|
|
395
|
-
.with('limitedPages', limitedPageIds(limit, offset))
|
|
396
|
-
.with('redirect', redirectTable())
|
|
397
|
-
.select(
|
|
398
|
-
'redirect.toId as pageId',
|
|
399
|
-
'referrer.url',
|
|
400
|
-
'redirect.from as through',
|
|
401
|
-
'redirect.fromId as throughId',
|
|
402
|
-
'anchors.hash',
|
|
403
|
-
'anchors.textContent',
|
|
404
|
-
)
|
|
405
|
-
.from('anchors')
|
|
406
|
-
// Resolves redirect
|
|
407
|
-
.join('redirect', 'anchors.hrefId', '=', 'redirect.fromId')
|
|
408
|
-
// Referrer
|
|
409
|
-
.join('pages as referrer', 'anchors.pageId', '=', 'referrer.id')
|
|
410
|
-
// Filters
|
|
411
|
-
.join('limitedPages', 'redirect.toId', '=', 'limitedPages.id')
|
|
412
|
-
// Sort
|
|
413
|
-
.orderBy('anchors.id', 'asc');
|
|
414
|
-
|
|
415
|
-
dbLog('Get Pages: Done');
|
|
416
|
-
return {
|
|
417
|
-
pages,
|
|
418
|
-
redirects,
|
|
419
|
-
anchors,
|
|
420
|
-
referrers,
|
|
421
|
-
};
|
|
422
|
-
}
|
|
423
|
-
|
|
424
|
-
/**
|
|
425
|
-
* Retrieves redirect sources for the given page IDs in bulk.
|
|
426
|
-
* @param pageIds - The database IDs of the destination pages.
|
|
427
|
-
* @returns An array of {@link DB_Redirect} records mapping destination pages to their redirect sources.
|
|
428
|
-
*/
|
|
429
|
-
@ErrorEmitter()
|
|
430
|
-
@retry(retrySetting)
|
|
431
|
-
async getRedirectsForPages(pageIds: number[]): Promise<DB_Redirect[]> {
|
|
432
|
-
if (pageIds.length === 0) return [];
|
|
433
|
-
return this.#instance
|
|
434
|
-
.select('redirectDestId as pageId', 'url as from', 'id as fromId')
|
|
435
|
-
.from('pages')
|
|
436
|
-
.whereIn('redirectDestId', pageIds);
|
|
437
|
-
}
|
|
438
|
-
/**
|
|
439
|
-
* Retrieves pages that link to a specific page (incoming links / referrers).
|
|
440
|
-
* @param pageId - The database ID of the target page.
|
|
441
|
-
* @returns An array of referrer records with URL, hash, and text content.
|
|
442
|
-
*/
|
|
443
|
-
@ErrorEmitter()
|
|
444
|
-
@retry(retrySetting)
|
|
445
|
-
async getReferrersOfPage(pageId: number) {
|
|
446
|
-
const res = await this.#instance
|
|
447
|
-
.select('pages.url', 'anchors.hash', 'anchors.textContent')
|
|
448
|
-
.from('anchors')
|
|
449
|
-
.join('pages', 'anchors.pageId', '=', 'pages.id')
|
|
450
|
-
.where('anchors.hrefId', pageId);
|
|
451
|
-
return res;
|
|
452
|
-
}
|
|
453
|
-
|
|
454
|
-
/**
|
|
455
|
-
* Retrieves the page URLs that reference a specific resource.
|
|
456
|
-
* @param id - The database ID of the resource.
|
|
457
|
-
* @returns An array of page URL strings that reference the resource.
|
|
458
|
-
*/
|
|
459
|
-
@ErrorEmitter()
|
|
460
|
-
@retry(retrySetting)
|
|
461
|
-
async getReferrersOfResource(id: number): Promise<string[]> {
|
|
462
|
-
const res = await this.#instance
|
|
463
|
-
.select('pages.url')
|
|
464
|
-
.from('resources-referrers')
|
|
465
|
-
.join('resources', 'resources.id', '=', 'resources-referrers.resourceId')
|
|
466
|
-
.join('pages', 'pages.id', '=', 'resources-referrers.pageId')
|
|
467
|
-
.where('resources.id', id);
|
|
468
|
-
return res.map((r) => r.url);
|
|
469
|
-
}
|
|
470
|
-
|
|
471
|
-
/**
|
|
472
|
-
* Retrieves all sub-resources from the `resources` table.
|
|
473
|
-
* @returns An array of raw {@link DB_Resource} rows.
|
|
474
|
-
*/
|
|
475
|
-
@ErrorEmitter()
|
|
476
|
-
@retry(retrySetting)
|
|
477
|
-
async getResources() {
|
|
478
|
-
return this.#instance.select('*').from<DB_Resource>('resources');
|
|
479
|
-
}
|
|
480
|
-
|
|
481
|
-
/**
|
|
482
|
-
* Retrieves a flat list of all resource URLs from the `resources` table.
|
|
483
|
-
* @returns An array of resource URL strings.
|
|
484
|
-
*/
|
|
485
|
-
@ErrorEmitter()
|
|
486
|
-
@retry(retrySetting)
|
|
487
|
-
async getResourceUrlList() {
|
|
488
|
-
const res = await this.#instance.select('url').from<DB_Resource>('resources');
|
|
489
|
-
return res.map((r) => r.url);
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
/**
|
|
493
|
-
* Inserts a sub-resource into the `resources` table.
|
|
494
|
-
* Ignores duplicate URLs (uses `ON CONFLICT IGNORE`).
|
|
495
|
-
* @param resource - The resource data to insert.
|
|
496
|
-
*/
|
|
497
|
-
@ErrorEmitter()
|
|
498
|
-
@retry(retrySetting)
|
|
499
|
-
async insertResource(resource: Resource) {
|
|
500
|
-
await this.#instance
|
|
501
|
-
.from<DB_Resource>('resources')
|
|
502
|
-
.insert({
|
|
503
|
-
url: resource.url.href,
|
|
504
|
-
isExternal: resource.isExternal ? 1 : 0,
|
|
505
|
-
status: resource.status,
|
|
506
|
-
statusText: resource.statusText,
|
|
507
|
-
contentType: resource.contentType,
|
|
508
|
-
contentLength: resource.contentLength,
|
|
509
|
-
compress: resource.compress || 0,
|
|
510
|
-
cdn: resource.cdn || 0,
|
|
511
|
-
responseHeaders: JSON.stringify(resource.headers),
|
|
512
|
-
})
|
|
513
|
-
.onConflict('url')
|
|
514
|
-
.ignore();
|
|
515
|
-
}
|
|
516
|
-
|
|
517
|
-
/**
|
|
518
|
-
* Inserts a referrer relationship between a resource and a page into the
|
|
519
|
-
* `resources-referrers` table. Silently skips if the resource is not found.
|
|
520
|
-
* @param src - The URL of the resource.
|
|
521
|
-
* @param pageUrl - The URL of the page that references the resource.
|
|
522
|
-
*/
|
|
523
|
-
@ErrorEmitter()
|
|
524
|
-
@retry(retrySetting)
|
|
525
|
-
async insertResourceReferrers(src: string, pageUrl: string) {
|
|
526
|
-
const selected = await this.#instance
|
|
527
|
-
.select('id')
|
|
528
|
-
.from<DB_Resource>('resources')
|
|
529
|
-
.where('url', src);
|
|
530
|
-
if (!selected[0]) {
|
|
531
|
-
// Ignore when the resource is not found
|
|
532
|
-
return;
|
|
533
|
-
}
|
|
534
|
-
const [{ id: resourceId }] = selected;
|
|
535
|
-
const pageId = await this.#getIdByUrl(pageUrl);
|
|
536
|
-
await this.#instance('resources-referrers').insert({
|
|
537
|
-
resourceId,
|
|
538
|
-
pageId,
|
|
539
|
-
});
|
|
540
|
-
}
|
|
541
|
-
|
|
542
|
-
/**
|
|
543
|
-
* Stores the crawl configuration in the `info` table.
|
|
544
|
-
* Serializes array fields (`excludes`, `excludeKeywords`, `scope`) as JSON strings.
|
|
545
|
-
* @param config - The {@link Config} object to store.
|
|
546
|
-
*/
|
|
547
|
-
@ErrorEmitter()
|
|
548
|
-
@retry(retrySetting)
|
|
549
|
-
async setConfig(config: Config) {
|
|
550
|
-
return this.#instance.from<Config>('info').insert({
|
|
551
|
-
...config,
|
|
552
|
-
// @ts-expect-error
|
|
553
|
-
excludes: JSON.stringify(config.excludes),
|
|
554
|
-
// @ts-expect-error
|
|
555
|
-
excludeKeywords: JSON.stringify(config.excludeKeywords),
|
|
556
|
-
// @ts-expect-error
|
|
557
|
-
excludeUrls: JSON.stringify(config.excludeUrls),
|
|
558
|
-
// @ts-expect-error
|
|
559
|
-
scope: JSON.stringify(config.scope),
|
|
560
|
-
});
|
|
561
|
-
}
|
|
562
|
-
|
|
563
|
-
/**
|
|
564
|
-
* Marks a page as skipped in the database with the given reason.
|
|
565
|
-
* Creates the page row if it does not already exist.
|
|
566
|
-
* @param url - The URL of the skipped page.
|
|
567
|
-
* @param reason - The reason the page was skipped.
|
|
568
|
-
* @param isExternal - Whether the page is on an external domain. Defaults to `false`.
|
|
569
|
-
*/
|
|
570
|
-
@ErrorEmitter()
|
|
571
|
-
@retry(retrySetting)
|
|
572
|
-
async setSkippedPage(url: string, reason: string, isExternal = false) {
|
|
573
|
-
const pageId = await this.#getIdByUrl(url, isExternal ? 1 : 0);
|
|
574
|
-
await this.#instance<DB_Page>('pages')
|
|
575
|
-
.where('id', pageId)
|
|
576
|
-
.update({
|
|
577
|
-
scraped: 1,
|
|
578
|
-
isExternal: isExternal ? 1 : 0,
|
|
579
|
-
isSkipped: 1,
|
|
580
|
-
skipReason: reason,
|
|
581
|
-
});
|
|
582
|
-
}
|
|
583
|
-
|
|
584
|
-
/**
|
|
585
|
-
* Assigns natural URL sort order values to all internal pages.
|
|
586
|
-
* Pages are sorted using {@link pathComparator} and assigned sequential order numbers.
|
|
587
|
-
*/
|
|
588
|
-
async setUrlOrder() {
|
|
589
|
-
dbLog('Set URL Order');
|
|
590
|
-
const res = await this.#instance
|
|
591
|
-
.select('id', 'url')
|
|
592
|
-
.from<DB_Page>('pages')
|
|
593
|
-
.where('isExternal', '=', 0);
|
|
594
|
-
const sorted = res.toSorted((a, b) => pathComparator(a.url, b.url));
|
|
595
|
-
|
|
596
|
-
// Batch update using chunked CASE statements to avoid N+1 queries
|
|
597
|
-
const BATCH_SIZE = 500;
|
|
598
|
-
for (let i = 0; i < sorted.length; i += BATCH_SIZE) {
|
|
599
|
-
const batch = sorted.slice(i, i + BATCH_SIZE);
|
|
600
|
-
const ids = batch.map((row) => row.id);
|
|
601
|
-
const bindings: (string | number)[] = [];
|
|
602
|
-
const cases = batch
|
|
603
|
-
.map((row, j) => {
|
|
604
|
-
bindings.push(row.id, i + j + 1);
|
|
605
|
-
return 'WHEN ? THEN ?';
|
|
606
|
-
})
|
|
607
|
-
.join(' ');
|
|
608
|
-
const placeholders = ids.map(() => '?').join(',');
|
|
609
|
-
await this.#instance.raw(
|
|
610
|
-
`UPDATE pages SET \`order\` = CASE id ${cases} END WHERE id IN (${placeholders})`,
|
|
611
|
-
[...bindings, ...ids],
|
|
612
|
-
);
|
|
613
|
-
}
|
|
614
|
-
}
|
|
615
|
-
|
|
616
|
-
/**
|
|
617
|
-
* Inserts or updates a crawled page in the database, including its redirect chain,
|
|
618
|
-
* anchors, and images. Optionally creates an HTML snapshot file path entry.
|
|
619
|
-
* @param page - The page data to store.
|
|
620
|
-
* @param snapshotDir - The directory for saving HTML snapshots, or null to skip snapshots.
|
|
621
|
-
* @param isTarget - Whether this page is a crawl target.
|
|
622
|
-
* @returns An object with the optional `html` snapshot file path and the page's database `pageId`.
|
|
623
|
-
*/
|
|
624
|
-
@ErrorEmitter()
@retry(retrySetting)
async updatePage(
  page: PageData,
  snapshotDir: string | null,
  isTarget: boolean,
): Promise<{
  html?: string | undefined;
  pageId: number;
}> {
  // When the page was reached through redirects, the last entry of
  // `redirectPaths` is the final destination URL; the originally requested
  // URL is pushed onto the front of the chain so it is recorded as a
  // redirect source like the intermediate hops.
  let destUrl = page.url.withoutHashAndAuth;
  const redirectPaths = [...page.redirectPaths];
  if (redirectPaths.length > 0) {
    destUrl = redirectPaths.pop()!;
    redirectPaths.unshift(page.url.withoutHashAndAuth);
  }

  const destUrlObject = parseUrl(destUrl);

  if (!destUrlObject) {
    throw new Error(`Failed to parse URL: ${destUrl}`);
  }

  // All writes happen inside one transaction so a failed crawl never leaves
  // a half-written page (page row without its anchors/images) behind.
  return await this.#instance.transaction(async (trx) => {
    // Upsert the destination page row itself (keyed by the final URL).
    const pageId = await this.#insertPage(
      {
        ...page,
        url: destUrlObject,
      },
      isTarget,
      trx,
    );

    // Mark every URL in the redirect chain as scraped and point it at the
    // destination row via `redirectDestId`.
    for (const redirect of redirectPaths) {
      dbLog('Set redirected url: %s -> %s', redirect, destUrl);
      const redirectId = await this.#getIdByUrl(redirect, undefined, trx);
      await trx<DB_Page>('pages')
        .where('id', redirectId)
        .update({
          scraped: 1,
          redirectDestId: pageId,
          isExternal: page.isExternal ? 1 : 0,
        });
    }
    // Snapshot path is only assigned for crawl targets when a snapshot
    // directory was provided; otherwise the result carries just the pageId.
    let snapshot: { html?: string; pageId: number } = { pageId };
    if (isTarget && snapshotDir) {
      snapshot = await this.#updateSnapshotPath(pageId, snapshotDir, trx);
    }
    // Resolve (or create) a page row for every anchor target so anchors can
    // reference `pages.id` rather than raw URLs.
    const anchors = await Promise.all(
      page.anchorList.map(async (anchor) => {
        const hrefId = await this.#getIdByUrl(
          anchor.href.withoutHashAndAuth,
          anchor.isExternal ? 1 : 0,
          trx,
        );
        return {
          pageId,
          hrefId,
          hash: anchor.href.hash,
          textContent: anchor.textContent,
        };
      }),
    );
    dbLog('Insert anchors.length: %d', anchors.length);
    // Batch inserts in chunks of 100 rows to stay under SQLite's
    // bound-parameter limits on wide multi-row inserts.
    if (anchors.length > 0) {
      await eachSplitted(anchors, 100, async (_anchors) => {
        await trx('anchors').insert(_anchors);
      });
    }
    const images = page.imageList.map((image) => ({
      pageId,
      ...image,
    }));
    dbLog('Insert images.length: %d', images.length);
    if (images.length > 0) {
      await eachSplitted(images, 100, async (_images) => {
        await trx('images').insert(_images);
      });
    }
    return snapshot;
  });
}
|
|
706
|
-
|
|
707
|
-
/**
|
|
708
|
-
* Returns the database ID for a URL, creating a new page row if needed.
|
|
709
|
-
* Uses `ON CONFLICT IGNORE` to handle race conditions in concurrent inserts.
|
|
710
|
-
* @param url
|
|
711
|
-
* @param isExternal
|
|
712
|
-
* @param trx
|
|
713
|
-
*/
|
|
714
|
-
async #getIdByUrl(url: string, isExternal?: 0 | 1, trx?: Knex.Transaction) {
|
|
715
|
-
const qb = trx ?? this.#instance;
|
|
716
|
-
const [record] = await qb.select('id').from<DB_Page>('pages').where('url', url);
|
|
717
|
-
// Must use `?` because it may be `undefined`
|
|
718
|
-
const pageId = record?.id ?? Number.NaN;
|
|
719
|
-
if (Number.isFinite(pageId)) {
|
|
720
|
-
return pageId;
|
|
721
|
-
}
|
|
722
|
-
const insertedRows = await qb<DB_Page>('pages')
|
|
723
|
-
.insert({
|
|
724
|
-
url,
|
|
725
|
-
scraped: 0,
|
|
726
|
-
isTarget: 0,
|
|
727
|
-
...(isExternal != null && { isExternal }),
|
|
728
|
-
})
|
|
729
|
-
.onConflict('url')
|
|
730
|
-
.ignore();
|
|
731
|
-
const [insertedId] = insertedRows;
|
|
732
|
-
if (!insertedId) {
|
|
733
|
-
// onConflict.ignore() returns 0 on race condition — re-select
|
|
734
|
-
const [existing] = await qb.select('id').from<DB_Page>('pages').where('url', url);
|
|
735
|
-
if (existing?.id) {
|
|
736
|
-
return existing.id;
|
|
737
|
-
}
|
|
738
|
-
throw new Error(`Failed to insert a new page: ${url}`);
|
|
739
|
-
}
|
|
740
|
-
return insertedId;
|
|
741
|
-
}
|
|
742
|
-
|
|
743
|
-
/**
 * Initializes the database schema if tables do not exist.
 * Enables WAL journal mode and foreign keys, then creates all tables
 * (`info`, `pages`, `anchors`, `images`, `resources`, `resources-referrers`).
 * The presence of the `info` table is used as the "already initialized" marker.
 */
async #init() {
  const isExists = await this.#instance.schema.hasTable('info');
  if (isExists) {
    return;
  }

  // Enable WAL mode and foreign keys for better performance and data integrity
  await this.#instance.raw('PRAGMA journal_mode = WAL');
  await this.#instance.raw('PRAGMA foreign_keys = ON');

  await this.#instance.schema
    // Single-row table holding the crawl configuration for this archive.
    .createTable('info', (t) => {
      t.increments('id');
      t.string('version');
      t.string('name');
      t.string('baseUrl');
      t.boolean('recursive');
      t.boolean('useSubprocess');
      t.integer('interval');
      t.boolean('image');
      t.boolean('fetchExternal');
      t.integer('parallels');
      t.json('scope');
      t.json('excludes');
      t.json('excludeKeywords');
      t.json('excludeUrls');
      t.integer('maxExcludedDepth');
      t.integer('retry');
      t.boolean('fromList');
      t.boolean('disableQueries');
    })
    // One row per discovered URL; redirects self-reference via redirectDestId.
    .createTable('pages', (t) => {
      t.increments('id');
      // 8190 accommodates very long URLs while keeping the column indexable.
      t.string('url', 8190).notNullable().unique();
      t.integer('redirectDestId').unsigned().references('pages.id').defaultTo(null);
      t.boolean('scraped').notNullable();
      t.boolean('isTarget').notNullable();
      t.boolean('isExternal');
      t.integer('status');
      t.string('statusText');
      t.string('contentType').nullable();
      t.integer('contentLength').unsigned().nullable();
      t.json('responseHeaders').nullable();
      t.string('lang');
      t.string('title');
      t.string('description');
      t.string('keywords');
      t.boolean('noindex');
      t.boolean('nofollow');
      t.boolean('noarchive');
      t.string('canonical');
      t.string('alternate');
      t.string('og_type');
      t.string('og_title');
      t.string('og_site_name');
      t.string('og_description');
      t.string('og_url');
      t.string('og_image');
      t.string('twitter_card');
      // Relative path of the saved HTML snapshot (see #updateSnapshotPath).
      t.string('html');
      t.boolean('isSkipped');
      t.string('skipReason');
      t.integer('order').unsigned().nullable();

      t.index('isExternal');
      t.index('contentType');
      t.index('scraped');
      t.index('redirectDestId');
      t.index('order');
    })
    // Links found on a page; both ends reference rows in `pages`.
    .createTable('anchors', (t) => {
      t.increments('id');
      t.integer('pageId').notNullable().unsigned().references('pages.id');
      t.integer('hrefId').notNullable().unsigned().references('pages.id');
      t.string('hash');
      t.string('textContent').nullable();

      t.index('pageId');
      t.index('hrefId');
    })
    // Images found on a page, including rendered and natural dimensions.
    .createTable('images', (t) => {
      t.increments('id');
      t.integer('pageId').notNullable().unsigned().references('pages.id');
      t.string('src', 8190);
      t.string('currentSrc', 8190);
      t.string('alt');
      t.float('width').unsigned().notNullable();
      t.float('height').unsigned().notNullable();
      t.integer('naturalWidth').unsigned().notNullable();
      t.integer('naturalHeight').unsigned().notNullable();
      t.boolean('isLazy');
      t.integer('viewportWidth').unsigned().notNullable();
      t.string('sourceCode');

      t.index('pageId');
    })
    // Sub-resources (scripts, styles, media, …) fetched by crawled pages.
    .createTable('resources', (t) => {
      t.increments('id');
      t.string('url', 8190).notNullable().unique();
      t.boolean('isExternal');
      t.integer('status');
      t.string('statusText');
      t.string('contentType').nullable();
      t.integer('contentLength').unsigned().nullable();
      t.string('compress').nullable();
      t.string('cdn').nullable();
      t.json('responseHeaders').nullable();
    })
    // Many-to-many join: which pages referenced which resources.
    .createTable('resources-referrers', (t) => {
      t.increments('id');
      t.integer('resourceId').notNullable().unsigned().references('resources.id');
      t.integer('pageId').notNullable().unsigned().references('pages.id');

      t.unique(['resourceId', 'pageId']);
      t.index('resourceId');
      t.index('pageId');
    });
}
|
|
866
|
-
|
|
867
|
-
/**
 * Upserts page data into the `pages` table (inserts if new, updates if existing).
 * The row is looked up (or created as a placeholder) by URL first, then the
 * crawled metadata is written onto it.
 * @param page - The crawled page data to persist.
 * @param isTarget - Whether this page is a crawl target.
 * @param trx - Optional transaction to run the queries in.
 * @returns The database ID of the upserted page row.
 */
async #insertPage(page: PageData, isTarget: boolean, trx?: Knex.Transaction) {
  const qb = trx ?? this.#instance;
  // Guarantees a row exists for the URL before the update below.
  const pageId = await this.#getIdByUrl(page.url.withoutHashAndAuth, undefined, trx);
  await qb('pages')
    .where('id', pageId)
    .update({
      scraped: true,
      isTarget,
      isExternal: page.isExternal,
      status: page.status,
      statusText: page.statusText,
      contentType: page.contentType,
      contentLength: page.contentLength,
      // Headers are stored serialized; read back with JSON.parse.
      responseHeaders: JSON.stringify(page.responseHeaders),
      lang: page.meta.lang,
      title: page.meta.title,
      description: page.meta.description,
      keywords: page.meta.keywords,
      noindex: page.meta.noindex,
      nofollow: page.meta.nofollow,
      noarchive: page.meta.noarchive,
      canonical: page.meta.canonical,
      alternate: page.meta.alternate,
      // Open Graph / Twitter meta keys map to snake_case columns.
      og_type: page.meta['og:type'],
      og_title: page.meta['og:title'],
      og_site_name: page.meta['og:site_name'],
      og_description: page.meta['og:description'],
      og_url: page.meta['og:url'],
      og_image: page.meta['og:image'],
      twitter_card: page.meta['twitter:card'],
      isSkipped: page.isSkipped,
    });
  return pageId;
}
|
|
907
|
-
|
|
908
|
-
/**
|
|
909
|
-
* Assigns and persists the HTML snapshot file path for a page.
|
|
910
|
-
* @param pageId
|
|
911
|
-
* @param snapshotDir
|
|
912
|
-
* @param trx
|
|
913
|
-
*/
|
|
914
|
-
async #updateSnapshotPath(pageId: number, snapshotDir: string, trx?: Knex.Transaction) {
|
|
915
|
-
const qb = trx ?? this.#instance;
|
|
916
|
-
const snapshotHtmlPath = path.resolve(snapshotDir, `${pageId}.html`);
|
|
917
|
-
const snapshotRelHtmlPath = path.relative(this.#workingDir, snapshotHtmlPath);
|
|
918
|
-
await qb('pages').where('id', pageId).update({
|
|
919
|
-
html: snapshotRelHtmlPath,
|
|
920
|
-
});
|
|
921
|
-
return {
|
|
922
|
-
html: snapshotHtmlPath,
|
|
923
|
-
pageId,
|
|
924
|
-
};
|
|
925
|
-
}
|
|
926
|
-
|
|
927
|
-
/**
|
|
928
|
-
* Creates and initializes a new Database instance.
|
|
929
|
-
* Creates the parent directory for the database file if needed,
|
|
930
|
-
* establishes the connection, and initializes tables if they do not exist.
|
|
931
|
-
* @param options - The database connection options specifying the type and file path.
|
|
932
|
-
* @returns A fully initialized Database instance.
|
|
933
|
-
*/
|
|
934
|
-
static async connect(options: DatabaseOption) {
|
|
935
|
-
switch (options.type) {
|
|
936
|
-
case 'sqlite3': {
|
|
937
|
-
mkdir(options.filename);
|
|
938
|
-
break;
|
|
939
|
-
}
|
|
940
|
-
}
|
|
941
|
-
const db = new Database(options);
|
|
942
|
-
await db.#init();
|
|
943
|
-
return db;
|
|
944
|
-
}
|
|
945
|
-
}
|
|
946
|
-
|
|
947
|
-
// ----- ----- ----- ----- -----
|
|
948
|
-
//
|
|
949
|
-
// Common Queries
|
|
950
|
-
//
|
|
951
|
-
// ----- ----- ----- ----- -----
|
|
952
|
-
|
|
953
|
-
/**
|
|
954
|
-
* Returns a Knex subquery builder that selects page IDs with pagination,
|
|
955
|
-
* ordered by the `order` column (nulls last), excluding redirected pages.
|
|
956
|
-
* @param limit - The maximum number of page IDs to return.
|
|
957
|
-
* @param offset - The number of page IDs to skip before returning results.
|
|
958
|
-
*/
|
|
959
|
-
function limitedPageIds(limit: number, offset: number) {
|
|
960
|
-
return async (qb: Knex.QueryBuilder<Record<string, unknown>, unknown>) => {
|
|
961
|
-
await qb
|
|
962
|
-
.select('id')
|
|
963
|
-
.from<DB_Page>('pages')
|
|
964
|
-
.orderByRaw('`order` ASC NULLS LAST')
|
|
965
|
-
.whereNull('redirectDestId')
|
|
966
|
-
.limit(limit)
|
|
967
|
-
.offset(offset);
|
|
968
|
-
};
|
|
969
|
-
}
|
|
970
|
-
|
|
971
|
-
/**
|
|
972
|
-
* Returns a Knex subquery builder that joins pages with their redirect destinations.
|
|
973
|
-
* When `includeNull` is true, also includes pages without redirects (self-referencing).
|
|
974
|
-
* @param includeNull - Whether to include non-redirected pages in the result. Defaults to `true`.
|
|
975
|
-
*/
|
|
976
|
-
function redirectTable(includeNull = true) {
|
|
977
|
-
return async (qb: Knex.QueryBuilder<Record<string, unknown>, unknown>) => {
|
|
978
|
-
const list = qb
|
|
979
|
-
.select('A.id as fromId', 'A.url as from', 'B.url as to', 'B.id as toId')
|
|
980
|
-
.from('pages as A')
|
|
981
|
-
.join('pages as B', (j) => {
|
|
982
|
-
j.on('A.redirectDestId', '=', 'B.id').andOnNotNull('A.redirectDestId');
|
|
983
|
-
});
|
|
984
|
-
if (includeNull) {
|
|
985
|
-
await list.union(async (qb) => {
|
|
986
|
-
await qb
|
|
987
|
-
.select('A.id as fromId', 'A.url as from', 'A.url as to', 'A.id as toId')
|
|
988
|
-
.from('pages as A')
|
|
989
|
-
.whereNull('A.redirectDestId');
|
|
990
|
-
});
|
|
991
|
-
}
|
|
992
|
-
};
|
|
993
|
-
}
|
|
994
|
-
|
|
995
|
-
// ----- ----- ----- ----- -----
|
|
996
|
-
//
|
|
997
|
-
// Utils
|
|
998
|
-
//
|
|
999
|
-
// ----- ----- ----- ----- -----
|
|
1000
|
-
|
|
1001
|
-
/**
|
|
1002
|
-
* Safely parses a JSON string, returning a fallback value if parsing fails or the input is not a string.
|
|
1003
|
-
* @param data - The data to parse. Only string values are parsed; other types return the fallback.
|
|
1004
|
-
* @param fallback - The value to return if parsing fails or the result is falsy.
|
|
1005
|
-
* @returns The parsed JSON value, or the fallback.
|
|
1006
|
-
*/
|
|
1007
|
-
function getJSON<T>(data: unknown, fallback: T): T {
|
|
1008
|
-
try {
|
|
1009
|
-
if (typeof data === 'string') {
|
|
1010
|
-
const result = JSON.parse(data);
|
|
1011
|
-
if (result) {
|
|
1012
|
-
return result;
|
|
1013
|
-
}
|
|
1014
|
-
return fallback;
|
|
1015
|
-
}
|
|
1016
|
-
} catch {
|
|
1017
|
-
// void
|
|
1018
|
-
}
|
|
1019
|
-
|
|
1020
|
-
return fallback;
|
|
1021
|
-
}
|
|
1022
|
-
|
|
1023
|
-
// ----- ----- ----- ----- -----
|
|
1024
|
-
//
|
|
1025
|
-
// Types
|
|
1026
|
-
//
|
|
1027
|
-
// ----- ----- ----- ----- -----
|
|
1028
|
-
|
|
1029
|
-
/**
 * Base options shared by all database connection configurations.
 */
type AbsDatabaseOption = {
  /** The working directory for the database (used for resolving relative paths). */
  workingDir: string;
};

/**
 * Union type for all supported database connection options.
 * Discriminated by the `type` literal field.
 */
type DatabaseOption = DatabaseSqlite3Option | DatabaseMySqlOption;

/**
 * Connection options for a SQLite3 database.
 */
type DatabaseSqlite3Option = AbsDatabaseOption & {
  /** The database type identifier (discriminant). */
  type: 'sqlite3';
  /** The absolute file path to the SQLite database file. */
  filename: string;
};

/**
 * Connection options for a MySQL database.
 * Note: MySQL support is not yet implemented.
 */
type DatabaseMySqlOption = AbsDatabaseOption & {
  /** The database type identifier (discriminant). */
  type: 'mysql';
};
|