@nitpicker/crawler 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/LICENSE +191 -0
- package/README.md +13 -0
- package/lib/archive/archive-accessor.d.ts +107 -0
- package/lib/archive/archive-accessor.js +264 -0
- package/lib/archive/archive.d.ts +174 -0
- package/lib/archive/archive.js +331 -0
- package/lib/archive/database.d.ts +207 -0
- package/lib/archive/database.js +972 -0
- package/lib/archive/debug.d.ts +8 -0
- package/lib/archive/debug.js +9 -0
- package/lib/archive/filesystem/append-text.d.ts +9 -0
- package/lib/archive/filesystem/append-text.js +14 -0
- package/lib/archive/filesystem/copy-dir-sync.d.ts +6 -0
- package/lib/archive/filesystem/copy-dir-sync.js +9 -0
- package/lib/archive/filesystem/copy-dir.d.ts +7 -0
- package/lib/archive/filesystem/copy-dir.js +13 -0
- package/lib/archive/filesystem/exists.d.ts +6 -0
- package/lib/archive/filesystem/exists.js +9 -0
- package/lib/archive/filesystem/get-file-list.d.ts +8 -0
- package/lib/archive/filesystem/get-file-list.js +12 -0
- package/lib/archive/filesystem/index.d.ts +17 -0
- package/lib/archive/filesystem/index.js +17 -0
- package/lib/archive/filesystem/is-dir.d.ts +6 -0
- package/lib/archive/filesystem/is-dir.js +10 -0
- package/lib/archive/filesystem/mkdir.d.ts +8 -0
- package/lib/archive/filesystem/mkdir.js +15 -0
- package/lib/archive/filesystem/output-json.d.ts +9 -0
- package/lib/archive/filesystem/output-json.js +14 -0
- package/lib/archive/filesystem/output-text.d.ts +11 -0
- package/lib/archive/filesystem/output-text.js +32 -0
- package/lib/archive/filesystem/read-json.d.ts +7 -0
- package/lib/archive/filesystem/read-json.js +11 -0
- package/lib/archive/filesystem/read-text.d.ts +6 -0
- package/lib/archive/filesystem/read-text.js +10 -0
- package/lib/archive/filesystem/readline.d.ts +11 -0
- package/lib/archive/filesystem/readline.js +26 -0
- package/lib/archive/filesystem/remove.d.ts +5 -0
- package/lib/archive/filesystem/remove.js +10 -0
- package/lib/archive/filesystem/rename.d.ts +11 -0
- package/lib/archive/filesystem/rename.js +18 -0
- package/lib/archive/filesystem/tar.d.ts +11 -0
- package/lib/archive/filesystem/tar.js +22 -0
- package/lib/archive/filesystem/untar.d.ts +20 -0
- package/lib/archive/filesystem/untar.js +24 -0
- package/lib/archive/filesystem/utils.d.ts +109 -0
- package/lib/archive/filesystem/utils.js +185 -0
- package/lib/archive/filesystem/zip.d.ts +29 -0
- package/lib/archive/filesystem/zip.js +53 -0
- package/lib/archive/index.d.ts +6 -0
- package/lib/archive/index.js +11 -0
- package/lib/archive/page.d.ts +263 -0
- package/lib/archive/page.js +316 -0
- package/lib/archive/resource.d.ts +46 -0
- package/lib/archive/resource.js +62 -0
- package/lib/archive/safe-path.d.ts +9 -0
- package/lib/archive/safe-path.js +17 -0
- package/lib/archive/types.d.ts +210 -0
- package/lib/archive/types.js +1 -0
- package/lib/crawler/clear-destination-cache.d.ts +5 -0
- package/lib/crawler/clear-destination-cache.js +8 -0
- package/lib/crawler/crawler.d.ts +73 -0
- package/lib/crawler/crawler.js +748 -0
- package/lib/crawler/decompose-url.d.ts +25 -0
- package/lib/crawler/decompose-url.js +71 -0
- package/lib/crawler/destination-cache.d.ts +7 -0
- package/lib/crawler/destination-cache.js +6 -0
- package/lib/crawler/detect-pagination-pattern.d.ts +16 -0
- package/lib/crawler/detect-pagination-pattern.js +61 -0
- package/lib/crawler/fetch-destination.d.ts +38 -0
- package/lib/crawler/fetch-destination.js +208 -0
- package/lib/crawler/fetch-robots-txt.d.ts +42 -0
- package/lib/crawler/fetch-robots-txt.js +44 -0
- package/lib/crawler/find-best-matching-scope.d.ts +12 -0
- package/lib/crawler/find-best-matching-scope.js +46 -0
- package/lib/crawler/generate-predicted-urls.d.ts +13 -0
- package/lib/crawler/generate-predicted-urls.js +27 -0
- package/lib/crawler/handle-ignore-and-skip.d.ts +16 -0
- package/lib/crawler/handle-ignore-and-skip.js +19 -0
- package/lib/crawler/handle-resource-response.d.ts +13 -0
- package/lib/crawler/handle-resource-response.js +16 -0
- package/lib/crawler/handle-scrape-end.d.ts +24 -0
- package/lib/crawler/handle-scrape-end.js +82 -0
- package/lib/crawler/handle-scrape-error.d.ts +37 -0
- package/lib/crawler/handle-scrape-error.js +38 -0
- package/lib/crawler/index.d.ts +2 -0
- package/lib/crawler/index.js +2 -0
- package/lib/crawler/inject-scope-auth.d.ts +11 -0
- package/lib/crawler/inject-scope-auth.js +21 -0
- package/lib/crawler/is-external-url.d.ts +11 -0
- package/lib/crawler/is-external-url.js +12 -0
- package/lib/crawler/is-in-any-lower-layer.d.ts +13 -0
- package/lib/crawler/is-in-any-lower-layer.js +15 -0
- package/lib/crawler/link-list.d.ts +112 -0
- package/lib/crawler/link-list.js +248 -0
- package/lib/crawler/link-to-page-data.d.ts +14 -0
- package/lib/crawler/link-to-page-data.js +32 -0
- package/lib/crawler/net-timeout-error.d.ts +9 -0
- package/lib/crawler/net-timeout-error.js +11 -0
- package/lib/crawler/network.d.ts +30 -0
- package/lib/crawler/network.js +226 -0
- package/lib/crawler/protocol-agnostic-key.d.ts +9 -0
- package/lib/crawler/protocol-agnostic-key.js +11 -0
- package/lib/crawler/reconstruct-url.d.ts +10 -0
- package/lib/crawler/reconstruct-url.js +28 -0
- package/lib/crawler/result-handler.d.ts +118 -0
- package/lib/crawler/result-handler.js +153 -0
- package/lib/crawler/robots-checker.d.ts +26 -0
- package/lib/crawler/robots-checker.js +62 -0
- package/lib/crawler/should-discard-predicted.d.ts +14 -0
- package/lib/crawler/should-discard-predicted.js +31 -0
- package/lib/crawler/should-skip-url.d.ts +23 -0
- package/lib/crawler/should-skip-url.js +15 -0
- package/lib/crawler/speculative-pagination.d.ts +52 -0
- package/lib/crawler/speculative-pagination.js +215 -0
- package/lib/crawler/types.d.ts +119 -0
- package/lib/crawler/types.js +1 -0
- package/lib/crawler/url-filter.d.ts +56 -0
- package/lib/crawler/url-filter.js +110 -0
- package/lib/crawler-orchestrator.d.ts +142 -0
- package/lib/crawler-orchestrator.js +309 -0
- package/lib/debug.d.ts +8 -0
- package/lib/debug.js +9 -0
- package/lib/index.d.ts +16 -0
- package/lib/index.js +18 -0
- package/lib/qzilla.d.ts +136 -0
- package/lib/qzilla.js +292 -0
- package/lib/types.d.ts +27 -0
- package/lib/types.js +1 -0
- package/lib/utils/array/each-splitted.d.ts +10 -0
- package/lib/utils/array/each-splitted.js +14 -0
- package/lib/utils/array/index.d.ts +1 -0
- package/lib/utils/array/index.js +1 -0
- package/lib/utils/async/index.d.ts +1 -0
- package/lib/utils/async/index.js +1 -0
- package/lib/utils/debug.d.ts +5 -0
- package/lib/utils/debug.js +5 -0
- package/lib/utils/error/dom-evaluation-error.d.ts +7 -0
- package/lib/utils/error/dom-evaluation-error.js +7 -0
- package/lib/utils/error/error-emitter.d.ts +18 -0
- package/lib/utils/error/error-emitter.js +29 -0
- package/lib/utils/error/index.d.ts +3 -0
- package/lib/utils/error/index.js +2 -0
- package/lib/utils/event-emitter/index.d.ts +6 -0
- package/lib/utils/event-emitter/index.js +6 -0
- package/lib/utils/index.d.ts +5 -0
- package/lib/utils/index.js +5 -0
- package/lib/utils/network/index.d.ts +1 -0
- package/lib/utils/network/index.js +1 -0
- package/lib/utils/object/clean-object.d.ts +8 -0
- package/lib/utils/object/clean-object.js +13 -0
- package/lib/utils/object/index.d.ts +1 -0
- package/lib/utils/object/index.js +1 -0
- package/lib/utils/path/index.d.ts +1 -0
- package/lib/utils/path/index.js +1 -0
- package/lib/utils/path/safe-filepath.d.ts +7 -0
- package/lib/utils/path/safe-filepath.js +12 -0
- package/lib/utils/regexp/index.d.ts +1 -0
- package/lib/utils/regexp/index.js +1 -0
- package/lib/utils/retryable/index.d.ts +2 -0
- package/lib/utils/retryable/index.js +1 -0
- package/lib/utils/sort/index.d.ts +14 -0
- package/lib/utils/sort/index.js +61 -0
- package/lib/utils/sort/remove-matches.d.ts +9 -0
- package/lib/utils/sort/remove-matches.js +23 -0
- package/lib/utils/types/index.d.ts +1 -0
- package/lib/utils/types/index.js +1 -0
- package/lib/utils/types/types.d.ts +46 -0
- package/lib/utils/types/types.js +1 -0
- package/lib/utils/url/index.d.ts +5 -0
- package/lib/utils/url/index.js +5 -0
- package/lib/utils/url/is-lower-layer.d.ts +15 -0
- package/lib/utils/url/is-lower-layer.js +55 -0
- package/lib/utils/url/parse-url.d.ts +11 -0
- package/lib/utils/url/parse-url.js +20 -0
- package/lib/utils/url/path-match.d.ts +11 -0
- package/lib/utils/url/path-match.js +18 -0
- package/lib/utils/url/sort-url.d.ts +10 -0
- package/lib/utils/url/sort-url.js +24 -0
- package/lib/utils/url/url-partial-match.d.ts +11 -0
- package/lib/utils/url/url-partial-match.js +32 -0
- package/package.json +49 -0
- package/src/archive/__mock__/.gitignore +3 -0
- package/src/archive/__mock__/mock.sqlite +0 -0
- package/src/archive/archive-accessor.ts +337 -0
- package/src/archive/archive.ts +408 -0
- package/src/archive/database.spec.ts +469 -0
- package/src/archive/database.ts +1059 -0
- package/src/archive/debug.ts +10 -0
- package/src/archive/filesystem/append-text.spec.ts +26 -0
- package/src/archive/filesystem/append-text.ts +16 -0
- package/src/archive/filesystem/copy-dir-sync.spec.ts +27 -0
- package/src/archive/filesystem/copy-dir-sync.ts +10 -0
- package/src/archive/filesystem/copy-dir.spec.ts +33 -0
- package/src/archive/filesystem/copy-dir.ts +14 -0
- package/src/archive/filesystem/exists.spec.ts +33 -0
- package/src/archive/filesystem/exists.ts +10 -0
- package/src/archive/filesystem/get-file-list.spec.ts +37 -0
- package/src/archive/filesystem/get-file-list.ts +13 -0
- package/src/archive/filesystem/index.ts +17 -0
- package/src/archive/filesystem/is-dir.spec.ts +29 -0
- package/src/archive/filesystem/is-dir.ts +11 -0
- package/src/archive/filesystem/mkdir.spec.ts +37 -0
- package/src/archive/filesystem/mkdir.ts +16 -0
- package/src/archive/filesystem/output-json.spec.ts +34 -0
- package/src/archive/filesystem/output-json.ts +16 -0
- package/src/archive/filesystem/output-text.spec.ts +31 -0
- package/src/archive/filesystem/output-text.ts +35 -0
- package/src/archive/filesystem/read-json.spec.ts +26 -0
- package/src/archive/filesystem/read-json.ts +12 -0
- package/src/archive/filesystem/read-text.spec.ts +25 -0
- package/src/archive/filesystem/read-text.ts +11 -0
- package/src/archive/filesystem/readline.spec.ts +29 -0
- package/src/archive/filesystem/readline.ts +30 -0
- package/src/archive/filesystem/remove.spec.ts +34 -0
- package/src/archive/filesystem/remove.ts +11 -0
- package/src/archive/filesystem/rename.spec.ts +46 -0
- package/src/archive/filesystem/rename.ts +21 -0
- package/src/archive/filesystem/tar.spec.ts +33 -0
- package/src/archive/filesystem/tar.ts +27 -0
- package/src/archive/filesystem/untar.spec.ts +34 -0
- package/src/archive/filesystem/untar.ts +36 -0
- package/src/archive/index.ts +13 -0
- package/src/archive/page.spec.ts +368 -0
- package/src/archive/page.ts +420 -0
- package/src/archive/resource.spec.ts +101 -0
- package/src/archive/resource.ts +73 -0
- package/src/archive/safe-path.spec.ts +44 -0
- package/src/archive/safe-path.ts +18 -0
- package/src/archive/types.ts +227 -0
- package/src/crawler/clear-destination-cache.spec.ts +20 -0
- package/src/crawler/clear-destination-cache.ts +9 -0
- package/src/crawler/crawler.ts +873 -0
- package/src/crawler/decompose-url.spec.ts +48 -0
- package/src/crawler/decompose-url.ts +90 -0
- package/src/crawler/destination-cache.spec.ts +23 -0
- package/src/crawler/destination-cache.ts +8 -0
- package/src/crawler/detect-pagination-pattern.spec.ts +169 -0
- package/src/crawler/detect-pagination-pattern.ts +66 -0
- package/src/crawler/fetch-destination.ts +257 -0
- package/src/crawler/fetch-robots-txt.spec.ts +83 -0
- package/src/crawler/fetch-robots-txt.ts +91 -0
- package/src/crawler/find-best-matching-scope.spec.ts +39 -0
- package/src/crawler/find-best-matching-scope.ts +57 -0
- package/src/crawler/generate-predicted-urls.spec.ts +42 -0
- package/src/crawler/generate-predicted-urls.ts +34 -0
- package/src/crawler/handle-ignore-and-skip.spec.ts +66 -0
- package/src/crawler/handle-ignore-and-skip.ts +30 -0
- package/src/crawler/handle-resource-response.spec.ts +45 -0
- package/src/crawler/handle-resource-response.ts +21 -0
- package/src/crawler/handle-scrape-end.spec.ts +109 -0
- package/src/crawler/handle-scrape-end.ts +115 -0
- package/src/crawler/handle-scrape-error.spec.ts +105 -0
- package/src/crawler/handle-scrape-error.ts +58 -0
- package/src/crawler/index.ts +2 -0
- package/src/crawler/inject-scope-auth.spec.ts +36 -0
- package/src/crawler/inject-scope-auth.ts +27 -0
- package/src/crawler/is-external-url.spec.ts +31 -0
- package/src/crawler/is-external-url.ts +17 -0
- package/src/crawler/is-in-any-lower-layer.spec.ts +31 -0
- package/src/crawler/is-in-any-lower-layer.ts +22 -0
- package/src/crawler/link-list.spec.ts +355 -0
- package/src/crawler/link-list.ts +275 -0
- package/src/crawler/link-to-page-data.spec.ts +133 -0
- package/src/crawler/link-to-page-data.ts +34 -0
- package/src/crawler/net-timeout-error.spec.ts +25 -0
- package/src/crawler/net-timeout-error.ts +11 -0
- package/src/crawler/protocol-agnostic-key.spec.ts +40 -0
- package/src/crawler/protocol-agnostic-key.ts +11 -0
- package/src/crawler/reconstruct-url.spec.ts +37 -0
- package/src/crawler/reconstruct-url.ts +37 -0
- package/src/crawler/robots-checker.spec.ts +104 -0
- package/src/crawler/robots-checker.ts +73 -0
- package/src/crawler/should-discard-predicted.spec.ts +125 -0
- package/src/crawler/should-discard-predicted.ts +33 -0
- package/src/crawler/should-skip-url.spec.ts +77 -0
- package/src/crawler/should-skip-url.ts +37 -0
- package/src/crawler/types.ts +146 -0
- package/src/crawler-orchestrator.ts +401 -0
- package/src/debug.ts +10 -0
- package/src/index.ts +25 -0
- package/src/types.ts +30 -0
- package/src/utils/array/each-splitted.spec.ts +38 -0
- package/src/utils/array/each-splitted.ts +19 -0
- package/src/utils/array/index.ts +1 -0
- package/src/utils/debug.ts +6 -0
- package/src/utils/error/dom-evaluation-error.spec.ts +20 -0
- package/src/utils/error/dom-evaluation-error.ts +6 -0
- package/src/utils/error/error-emitter.spec.ts +78 -0
- package/src/utils/error/error-emitter.ts +44 -0
- package/src/utils/error/index.ts +3 -0
- package/src/utils/index.ts +5 -0
- package/src/utils/object/clean-object.spec.ts +24 -0
- package/src/utils/object/clean-object.ts +13 -0
- package/src/utils/object/index.ts +1 -0
- package/src/utils/types/index.ts +1 -0
- package/src/utils/types/types.ts +65 -0
- package/tsconfig.json +11 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import type { Config, DB_Anchor, DB_Page, DB_Redirect, DB_Referrer, DB_Resource, DatabaseEvent, PageFilter } from './types.js';
|
|
2
|
+
import type { PageData, Resource } from '../utils/index.js';
|
|
3
|
+
import { TypedAwaitEventEmitter as EventEmitter } from '@d-zero/shared/typed-await-event-emitter';
|
|
4
|
+
/**
|
|
5
|
+
* Low-level database abstraction layer for the archive's SQLite database.
|
|
6
|
+
*
|
|
7
|
+
* Manages the `info`, `pages`, `anchors`, `images`, `resources`, and `resources-referrers`
|
|
8
|
+
* tables. All public methods that perform database queries use the `@retryable`
|
|
9
|
+
* decorator for automatic retry on transient failures, and `@ErrorEmitter` to
|
|
10
|
+
* propagate errors as events.
|
|
11
|
+
*
|
|
12
|
+
* Use the static {@link Database.connect} factory method to create instances.
|
|
13
|
+
* The constructor is private.
|
|
14
|
+
*/
|
|
15
|
+
export declare class Database extends EventEmitter<DatabaseEvent> {
|
|
16
|
+
#private;
|
|
17
|
+
private constructor();
|
|
18
|
+
/**
|
|
19
|
+
* Adds the `order` column to the `pages` table for URL sort ordering.
|
|
20
|
+
* @deprecated Since v0.1.x. The column is now created during table initialization.
|
|
21
|
+
* @returns A promise that resolves (with no value) once the schema alteration has completed.
|
|
22
|
+
*/
|
|
23
|
+
addOrderField(): Promise<void>;
|
|
24
|
+
/**
|
|
25
|
+
* Forces a WAL checkpoint, writing all pending WAL data back to the main database file.
|
|
26
|
+
* Uses TRUNCATE mode to reset the WAL file to zero bytes after checkpointing.
|
|
27
|
+
* This ensures the database is fully self-contained in `db.sqlite` before archiving.
|
|
28
|
+
*/
|
|
29
|
+
checkpoint(): Promise<void>;
|
|
30
|
+
/**
 * NOTE(review): this member is undocumented in the declaration file.
 * Presumably tears down this Database instance (e.g. closes the underlying
 * connection) — confirm against the implementation before relying on it.
 * @returns A promise that resolves with no value.
 */
destroy(): Promise<void>;
|
|
31
|
+
/**
|
|
32
|
+
* Retrieves all anchors (outgoing links) on a specific page.
|
|
33
|
+
* Joins the `anchors` table with the `pages` table to resolve link destinations.
|
|
34
|
+
* @param pageId - The database ID of the page whose anchors to retrieve.
|
|
35
|
+
* @returns An array of anchor records with resolved URL, title, status, and content type.
|
|
36
|
+
*/
|
|
37
|
+
getAnchorsOnPage(pageId: number): Promise<any[]>;
|
|
38
|
+
/**
|
|
39
|
+
* Retrieves the base URL of the crawl session from the `info` table.
|
|
40
|
+
* @returns The base URL string.
|
|
41
|
+
* @throws {Error} If no base URL is found in the database.
|
|
42
|
+
*/
|
|
43
|
+
getBaseUrl(): Promise<any>;
|
|
44
|
+
/**
|
|
45
|
+
* Retrieves the full crawl configuration from the `info` table.
|
|
46
|
+
* Deserializes JSON-encoded fields (`excludes`, `excludeKeywords`, `scope`).
|
|
47
|
+
* @returns The parsed {@link Config} object.
|
|
48
|
+
* @throws {Error} If no configuration is found in the database.
|
|
49
|
+
*/
|
|
50
|
+
getConfig(): Promise<Config>;
|
|
51
|
+
/**
|
|
52
|
+
* Retrieves the current crawling state by listing scraped and pending URLs.
|
|
53
|
+
* @returns An object with `scraped` (completed URLs) and `pending` (remaining URLs) arrays.
|
|
54
|
+
*/
|
|
55
|
+
getCrawlingState(): Promise<{
|
|
56
|
+
scraped: string[];
|
|
57
|
+
pending: string[];
|
|
58
|
+
}>;
|
|
59
|
+
/**
|
|
60
|
+
* Retrieves the HTML snapshot file path for a specific page.
|
|
61
|
+
* @param pageId - The database ID of the page.
|
|
62
|
+
* @returns The relative file path to the HTML snapshot, or null if not saved.
|
|
63
|
+
*/
|
|
64
|
+
getHtmlPathOnPage(pageId: number): Promise<any>;
|
|
65
|
+
/**
|
|
66
|
+
* Retrieves the crawl session name from the `info` table.
|
|
67
|
+
* @returns The name string.
|
|
68
|
+
* @throws {Error} If no name is found in the database.
|
|
69
|
+
*/
|
|
70
|
+
getName(): Promise<any>;
|
|
71
|
+
/**
|
|
72
|
+
* Counts the total number of pages in the database.
|
|
73
|
+
* @returns The total page count.
|
|
74
|
+
* @throws {Error} If the count query fails.
|
|
75
|
+
*/
|
|
76
|
+
getPageCount(): Promise<number>;
|
|
77
|
+
/**
|
|
78
|
+
* Retrieves pages from the database, with optional filtering and with pagination via offset and limit.
|
|
79
|
+
* @param filter - An optional {@link PageFilter} to narrow results by content type and origin.
|
|
80
|
+
* @param offset - The number of rows to skip. Defaults to `0`.
|
|
81
|
+
* @param limit - The maximum number of rows to return. Defaults to `100000`.
|
|
82
|
+
* @returns An array of raw {@link DB_Page} rows.
|
|
83
|
+
*/
|
|
84
|
+
getPages(filter?: PageFilter, offset?: number, limit?: number): Promise<DB_Page[]>;
|
|
85
|
+
/**
|
|
86
|
+
* Retrieves pages along with their related redirect, anchor, and referrer data.
|
|
87
|
+
* Results are ordered by the natural URL sort order. Only non-redirected pages are returned.
|
|
88
|
+
* @param offset - The number of rows to skip.
|
|
89
|
+
* @param limit - The maximum number of pages to return.
|
|
90
|
+
* @returns An object containing `pages`, `redirects`, `anchors`, and `referrers` arrays.
|
|
91
|
+
*/
|
|
92
|
+
getPagesWithRels(offset: number, limit: number): Promise<{
|
|
93
|
+
pages: DB_Page[];
|
|
94
|
+
redirects: DB_Redirect[];
|
|
95
|
+
anchors: DB_Anchor[];
|
|
96
|
+
referrers: DB_Referrer[];
|
|
97
|
+
}>;
|
|
98
|
+
/**
|
|
99
|
+
* Retrieves redirect sources for the given page IDs in bulk.
|
|
100
|
+
* @param pageIds - The database IDs of the destination pages.
|
|
101
|
+
* @returns An array of {@link DB_Redirect} records mapping destination pages to their redirect sources.
|
|
102
|
+
*/
|
|
103
|
+
getRedirectsForPages(pageIds: number[]): Promise<DB_Redirect[]>;
|
|
104
|
+
/**
|
|
105
|
+
* Retrieves pages that link to a specific page (incoming links / referrers).
|
|
106
|
+
* @param pageId - The database ID of the target page.
|
|
107
|
+
* @returns An array of referrer records with URL, hash, and text content.
|
|
108
|
+
*/
|
|
109
|
+
getReferrersOfPage(pageId: number): Promise<any[]>;
|
|
110
|
+
/**
|
|
111
|
+
* Retrieves the page URLs that reference a specific resource.
|
|
112
|
+
* @param id - The database ID of the resource.
|
|
113
|
+
* @returns An array of page URL strings that reference the resource.
|
|
114
|
+
*/
|
|
115
|
+
getReferrersOfResource(id: number): Promise<string[]>;
|
|
116
|
+
/**
|
|
117
|
+
* Retrieves all sub-resources from the `resources` table.
|
|
118
|
+
* @returns An array of raw {@link DB_Resource} rows.
|
|
119
|
+
*/
|
|
120
|
+
getResources(): Promise<DB_Resource[]>;
|
|
121
|
+
/**
|
|
122
|
+
* Retrieves a flat list of all resource URLs from the `resources` table.
|
|
123
|
+
* @returns An array of resource URL strings.
|
|
124
|
+
*/
|
|
125
|
+
getResourceUrlList(): Promise<any[]>;
|
|
126
|
+
/**
|
|
127
|
+
* Inserts a sub-resource into the `resources` table.
|
|
128
|
+
* Ignores duplicate URLs (uses `ON CONFLICT IGNORE`).
|
|
129
|
+
* @param resource - The resource data to insert.
|
|
130
|
+
*/
|
|
131
|
+
insertResource(resource: Resource): Promise<void>;
|
|
132
|
+
/**
|
|
133
|
+
* Inserts a referrer relationship between a resource and a page into the
|
|
134
|
+
* `resources-referrers` table. Silently skips if the resource is not found.
|
|
135
|
+
* @param src - The URL of the resource.
|
|
136
|
+
* @param pageUrl - The URL of the page that references the resource.
|
|
137
|
+
*/
|
|
138
|
+
insertResourceReferrers(src: string, pageUrl: string): Promise<void>;
|
|
139
|
+
/**
|
|
140
|
+
* Stores the crawl configuration in the `info` table.
|
|
141
|
+
* Serializes array fields (`excludes`, `excludeKeywords`, `scope`) as JSON strings.
|
|
142
|
+
* @param config - The {@link Config} object to store.
|
|
143
|
+
*/
|
|
144
|
+
setConfig(config: Config): Promise<Config[]>;
|
|
145
|
+
/**
|
|
146
|
+
* Marks a page as skipped in the database with the given reason.
|
|
147
|
+
* Creates the page row if it does not already exist.
|
|
148
|
+
* @param url - The URL of the skipped page.
|
|
149
|
+
* @param reason - The reason the page was skipped.
|
|
150
|
+
* @param isExternal - Whether the page is on an external domain. Defaults to `false`.
|
|
151
|
+
*/
|
|
152
|
+
setSkippedPage(url: string, reason: string, isExternal?: boolean): Promise<void>;
|
|
153
|
+
/**
|
|
154
|
+
* Assigns natural URL sort order values to all internal pages.
|
|
155
|
+
* Pages are sorted using {@link pathComparator} and assigned sequential order numbers.
|
|
156
|
+
*/
|
|
157
|
+
setUrlOrder(): Promise<void>;
|
|
158
|
+
/**
|
|
159
|
+
* Inserts or updates a crawled page in the database, including its redirect chain,
|
|
160
|
+
* anchors, and images. Optionally creates an HTML snapshot file path entry.
|
|
161
|
+
* @param page - The page data to store.
|
|
162
|
+
* @param snapshotDir - The directory for saving HTML snapshots, or null to skip snapshots.
|
|
163
|
+
* @param isTarget - Whether this page is a crawl target.
|
|
164
|
+
* @returns An object with the optional `html` snapshot file path and the page's database `pageId`.
|
|
165
|
+
*/
|
|
166
|
+
updatePage(page: PageData, snapshotDir: string | null, isTarget: boolean): Promise<{
|
|
167
|
+
html?: string | undefined;
|
|
168
|
+
pageId: number;
|
|
169
|
+
}>;
|
|
170
|
+
/**
|
|
171
|
+
* Creates and initializes a new Database instance.
|
|
172
|
+
* Creates the parent directory for the database file if needed,
|
|
173
|
+
* establishes the connection, and initializes tables if they do not exist.
|
|
174
|
+
* @param options - The database connection options specifying the type and file path.
|
|
175
|
+
* @returns A fully initialized Database instance.
|
|
176
|
+
*/
|
|
177
|
+
static connect(options: DatabaseOption): Promise<Database>;
|
|
178
|
+
}
|
|
179
|
+
/**
|
|
180
|
+
* Base options shared by all database connection configurations.
|
|
181
|
+
*/
|
|
182
|
+
type AbsDatabaseOption = {
|
|
183
|
+
/** The working directory for the database (used for resolving relative paths). */
|
|
184
|
+
workingDir: string;
|
|
185
|
+
};
|
|
186
|
+
/**
|
|
187
|
+
* Union type for all supported database connection options.
|
|
188
|
+
*/
|
|
189
|
+
type DatabaseOption = DatabaseSqlite3Option | DatabaseMySqlOption;
|
|
190
|
+
/**
|
|
191
|
+
* Connection options for a SQLite3 database.
|
|
192
|
+
*/
|
|
193
|
+
type DatabaseSqlite3Option = AbsDatabaseOption & {
|
|
194
|
+
/** The database type identifier. */
|
|
195
|
+
type: 'sqlite3';
|
|
196
|
+
/** The absolute file path to the SQLite database file. */
|
|
197
|
+
filename: string;
|
|
198
|
+
};
|
|
199
|
+
/**
|
|
200
|
+
* Connection options for a MySQL database.
|
|
201
|
+
* Note: MySQL support is not yet implemented.
|
|
202
|
+
*/
|
|
203
|
+
type DatabaseMySqlOption = AbsDatabaseOption & {
|
|
204
|
+
/** The database type identifier. */
|
|
205
|
+
type: 'mysql';
|
|
206
|
+
};
|
|
207
|
+
export {};
|