@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -8,36 +8,38 @@ export class PageRepository {
|
|
|
8
8
|
INSERT INTO pages (
|
|
9
9
|
site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
|
|
10
10
|
http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
|
|
11
|
-
|
|
12
|
-
redirect_chain, bytes_received, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
11
|
+
noindex, nofollow, security_error, retries, depth,
|
|
12
|
+
discovered_via_sitemap, redirect_chain, bytes_received, is_internal, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
13
13
|
updated_at
|
|
14
14
|
) VALUES (
|
|
15
15
|
@site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
|
|
16
16
|
@http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
|
|
17
|
-
@
|
|
18
|
-
@redirect_chain, @bytes_received, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
17
|
+
@noindex, @nofollow, @security_error, @retries, @depth,
|
|
18
|
+
@discovered_via_sitemap, @redirect_chain, @bytes_received, @is_internal, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
19
19
|
datetime('now')
|
|
20
20
|
)
|
|
21
21
|
ON CONFLICT(site_id, normalized_url) DO UPDATE SET
|
|
22
|
+
first_seen_snapshot_id = COALESCE(pages.first_seen_snapshot_id, excluded.first_seen_snapshot_id),
|
|
22
23
|
last_seen_snapshot_id = excluded.last_seen_snapshot_id,
|
|
23
|
-
http_status = excluded.http_status,
|
|
24
|
-
canonical_url = excluded.canonical_url,
|
|
25
|
-
content_hash = excluded.content_hash,
|
|
26
|
-
simhash = excluded.simhash,
|
|
27
|
-
etag = excluded.etag,
|
|
28
|
-
last_modified = excluded.last_modified,
|
|
29
|
-
html = excluded.html,
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
redirect_chain = excluded.redirect_chain,
|
|
37
|
-
bytes_received = excluded.bytes_received,
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
24
|
+
http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
|
|
25
|
+
canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
|
|
26
|
+
content_hash = COALESCE(excluded.content_hash, pages.content_hash),
|
|
27
|
+
simhash = COALESCE(excluded.simhash, pages.simhash),
|
|
28
|
+
etag = COALESCE(excluded.etag, pages.etag),
|
|
29
|
+
last_modified = COALESCE(excluded.last_modified, pages.last_modified),
|
|
30
|
+
html = COALESCE(excluded.html, pages.html),
|
|
31
|
+
noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
|
|
32
|
+
nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
|
|
33
|
+
security_error = COALESCE(excluded.security_error, pages.security_error),
|
|
34
|
+
retries = MAX(pages.retries, excluded.retries),
|
|
35
|
+
depth = MIN(pages.depth, excluded.depth),
|
|
36
|
+
discovered_via_sitemap = MAX(pages.discovered_via_sitemap, excluded.discovered_via_sitemap),
|
|
37
|
+
redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
|
|
38
|
+
bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
|
|
39
|
+
is_internal = COALESCE(excluded.is_internal, pages.is_internal),
|
|
40
|
+
crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
|
|
41
|
+
crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
|
|
42
|
+
trap_type = COALESCE(excluded.trap_type, pages.trap_type),
|
|
41
43
|
updated_at = datetime('now')
|
|
42
44
|
`);
|
|
43
45
|
this.getIdStmt = this.db.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?');
|
|
@@ -55,14 +57,15 @@ export class PageRepository {
|
|
|
55
57
|
etag: page.etag ?? null,
|
|
56
58
|
last_modified: page.last_modified ?? null,
|
|
57
59
|
html: page.html ?? null,
|
|
58
|
-
soft404_score: page.soft404_score ?? null,
|
|
59
60
|
noindex: page.noindex ?? 0,
|
|
60
61
|
nofollow: page.nofollow ?? 0,
|
|
61
62
|
security_error: page.security_error ?? null,
|
|
62
63
|
retries: page.retries ?? 0,
|
|
63
64
|
depth: page.depth ?? 0,
|
|
65
|
+
discovered_via_sitemap: page.discovered_via_sitemap ?? 0,
|
|
64
66
|
redirect_chain: page.redirect_chain ?? null,
|
|
65
67
|
bytes_received: page.bytes_received ?? null,
|
|
68
|
+
is_internal: page.is_internal ?? 1,
|
|
66
69
|
crawl_trap_flag: page.crawl_trap_flag ?? 0,
|
|
67
70
|
crawl_trap_risk: page.crawl_trap_risk ?? null,
|
|
68
71
|
trap_type: page.trap_type ?? null,
|
|
@@ -83,11 +86,152 @@ export class PageRepository {
|
|
|
83
86
|
getPage(siteId, url) {
|
|
84
87
|
return this.db.prepare('SELECT * FROM pages WHERE site_id = ? AND normalized_url = ?').get(siteId, url);
|
|
85
88
|
}
|
|
86
|
-
|
|
87
|
-
|
|
89
|
+
getPagesByUrls(siteId, urls) {
|
|
90
|
+
if (urls.length === 0)
|
|
91
|
+
return [];
|
|
92
|
+
const chunkSize = 900;
|
|
93
|
+
const results = [];
|
|
94
|
+
for (let i = 0; i < urls.length; i += chunkSize) {
|
|
95
|
+
const chunk = urls.slice(i, i + chunkSize);
|
|
96
|
+
const placeholders = chunk.map(() => '?').join(',');
|
|
97
|
+
const chunkResults = this.db.prepare(`SELECT * FROM pages WHERE site_id = ? AND normalized_url IN (${placeholders})`).all(siteId, ...chunk);
|
|
98
|
+
results.push(...chunkResults);
|
|
99
|
+
}
|
|
100
|
+
return results;
|
|
101
|
+
}
|
|
102
|
+
upsertMany(pages) {
|
|
103
|
+
if (pages.length === 0)
|
|
104
|
+
return new Map();
|
|
105
|
+
const upsertStmtWithReturn = this.db.prepare(`
|
|
106
|
+
INSERT INTO pages (
|
|
107
|
+
site_id, normalized_url, first_seen_snapshot_id, last_seen_snapshot_id,
|
|
108
|
+
http_status, canonical_url, content_hash, simhash, etag, last_modified, html,
|
|
109
|
+
noindex, nofollow, security_error, retries, depth,
|
|
110
|
+
discovered_via_sitemap, redirect_chain, bytes_received, is_internal, crawl_trap_flag, crawl_trap_risk, trap_type,
|
|
111
|
+
updated_at
|
|
112
|
+
) VALUES (
|
|
113
|
+
@site_id, @normalized_url, @first_seen_snapshot_id, @last_seen_snapshot_id,
|
|
114
|
+
@http_status, @canonical_url, @content_hash, @simhash, @etag, @last_modified, @html,
|
|
115
|
+
@noindex, @nofollow, @security_error, @retries, @depth,
|
|
116
|
+
@discovered_via_sitemap, @redirect_chain, @bytes_received, @is_internal, @crawl_trap_flag, @crawl_trap_risk, @trap_type,
|
|
117
|
+
datetime('now')
|
|
118
|
+
)
|
|
119
|
+
ON CONFLICT(site_id, normalized_url) DO UPDATE SET
|
|
120
|
+
first_seen_snapshot_id = COALESCE(pages.first_seen_snapshot_id, excluded.first_seen_snapshot_id),
|
|
121
|
+
last_seen_snapshot_id = excluded.last_seen_snapshot_id,
|
|
122
|
+
http_status = CASE WHEN excluded.http_status != 0 THEN excluded.http_status ELSE pages.http_status END,
|
|
123
|
+
canonical_url = COALESCE(excluded.canonical_url, pages.canonical_url),
|
|
124
|
+
content_hash = COALESCE(excluded.content_hash, pages.content_hash),
|
|
125
|
+
simhash = COALESCE(excluded.simhash, pages.simhash),
|
|
126
|
+
etag = COALESCE(excluded.etag, pages.etag),
|
|
127
|
+
last_modified = COALESCE(excluded.last_modified, pages.last_modified),
|
|
128
|
+
html = COALESCE(excluded.html, pages.html),
|
|
129
|
+
noindex = CASE WHEN excluded.http_status != 0 THEN excluded.noindex ELSE pages.noindex END,
|
|
130
|
+
nofollow = CASE WHEN excluded.http_status != 0 THEN excluded.nofollow ELSE pages.nofollow END,
|
|
131
|
+
security_error = COALESCE(excluded.security_error, pages.security_error),
|
|
132
|
+
retries = MAX(pages.retries, excluded.retries),
|
|
133
|
+
depth = MIN(pages.depth, excluded.depth),
|
|
134
|
+
discovered_via_sitemap = MAX(pages.discovered_via_sitemap, excluded.discovered_via_sitemap),
|
|
135
|
+
redirect_chain = COALESCE(excluded.redirect_chain, pages.redirect_chain),
|
|
136
|
+
bytes_received = COALESCE(excluded.bytes_received, pages.bytes_received),
|
|
137
|
+
is_internal = COALESCE(excluded.is_internal, pages.is_internal),
|
|
138
|
+
crawl_trap_flag = MAX(pages.crawl_trap_flag, excluded.crawl_trap_flag),
|
|
139
|
+
crawl_trap_risk = COALESCE(excluded.crawl_trap_risk, pages.crawl_trap_risk),
|
|
140
|
+
trap_type = COALESCE(excluded.trap_type, pages.trap_type),
|
|
141
|
+
updated_at = datetime('now')
|
|
142
|
+
RETURNING id
|
|
143
|
+
`);
|
|
144
|
+
const urlToId = new Map();
|
|
145
|
+
const tx = this.db.transaction((pagesBatch) => {
|
|
146
|
+
for (const page of pagesBatch) {
|
|
147
|
+
const params = {
|
|
148
|
+
site_id: page.site_id,
|
|
149
|
+
normalized_url: page.normalized_url,
|
|
150
|
+
first_seen_snapshot_id: page.first_seen_snapshot_id ?? page.last_seen_snapshot_id,
|
|
151
|
+
last_seen_snapshot_id: page.last_seen_snapshot_id,
|
|
152
|
+
http_status: page.http_status ?? null,
|
|
153
|
+
canonical_url: page.canonical_url ?? null,
|
|
154
|
+
content_hash: page.content_hash ?? null,
|
|
155
|
+
simhash: page.simhash ?? null,
|
|
156
|
+
etag: page.etag ?? null,
|
|
157
|
+
last_modified: page.last_modified ?? null,
|
|
158
|
+
html: page.html ?? null,
|
|
159
|
+
noindex: page.noindex ?? 0,
|
|
160
|
+
nofollow: page.nofollow ?? 0,
|
|
161
|
+
security_error: page.security_error ?? null,
|
|
162
|
+
retries: page.retries ?? 0,
|
|
163
|
+
depth: page.depth ?? 0,
|
|
164
|
+
discovered_via_sitemap: page.discovered_via_sitemap ?? 0,
|
|
165
|
+
redirect_chain: page.redirect_chain ?? null,
|
|
166
|
+
bytes_received: page.bytes_received ?? null,
|
|
167
|
+
is_internal: page.is_internal ?? 1,
|
|
168
|
+
crawl_trap_flag: page.crawl_trap_flag ?? 0,
|
|
169
|
+
crawl_trap_risk: page.crawl_trap_risk ?? null,
|
|
170
|
+
trap_type: page.trap_type ?? null,
|
|
171
|
+
};
|
|
172
|
+
const row = upsertStmtWithReturn.get(params);
|
|
173
|
+
urlToId.set(page.normalized_url, row.id);
|
|
174
|
+
}
|
|
175
|
+
});
|
|
176
|
+
tx(pages);
|
|
177
|
+
return urlToId;
|
|
178
|
+
}
|
|
179
|
+
getPagesBySnapshot(snapshotId, runType = 'completed') {
|
|
180
|
+
if (runType === 'single') {
|
|
181
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').all(snapshotId);
|
|
182
|
+
}
|
|
183
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId);
|
|
184
|
+
}
|
|
185
|
+
getPagesIdentityBySnapshot(snapshotId) {
|
|
186
|
+
// For identities, always loading all up to this point is fine for the crawler to map URLs to IDs.
|
|
187
|
+
return this.db.prepare('SELECT p.id, p.normalized_url FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').all(snapshotId, snapshotId);
|
|
188
|
+
}
|
|
189
|
+
getPagesIteratorBySnapshot(snapshotId, runType = 'completed') {
|
|
190
|
+
if (runType === 'single') {
|
|
191
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN metrics m ON p.id = m.page_id WHERE m.snapshot_id = ?').iterate(snapshotId);
|
|
192
|
+
}
|
|
193
|
+
return this.db.prepare('SELECT p.* FROM pages p JOIN snapshots s ON p.site_id = s.site_id WHERE s.id = ? AND COALESCE(p.first_seen_snapshot_id, p.last_seen_snapshot_id) <= ?').iterate(snapshotId, snapshotId);
|
|
88
194
|
}
|
|
89
195
|
getIdByUrl(siteId, url) {
|
|
90
196
|
const row = this.getIdStmt.get(siteId, url);
|
|
91
197
|
return row?.id;
|
|
92
198
|
}
|
|
199
|
+
reconcileInternalUrls(siteId, siteOrigin) {
|
|
200
|
+
const origin = siteOrigin.replace(/\/+$/, '');
|
|
201
|
+
const tx = this.db.transaction(() => {
|
|
202
|
+
const rows = this.db
|
|
203
|
+
.prepare("SELECT id, normalized_url FROM pages WHERE site_id = ? AND (normalized_url LIKE 'http://%' OR normalized_url LIKE 'https://%')")
|
|
204
|
+
.all(siteId);
|
|
205
|
+
for (const row of rows) {
|
|
206
|
+
let parsed;
|
|
207
|
+
try {
|
|
208
|
+
parsed = new URL(row.normalized_url);
|
|
209
|
+
}
|
|
210
|
+
catch {
|
|
211
|
+
continue;
|
|
212
|
+
}
|
|
213
|
+
if (parsed.origin !== origin) {
|
|
214
|
+
continue;
|
|
215
|
+
}
|
|
216
|
+
const targetPath = `${parsed.pathname}${parsed.search}`;
|
|
217
|
+
if (targetPath === row.normalized_url) {
|
|
218
|
+
continue;
|
|
219
|
+
}
|
|
220
|
+
const existing = this.db
|
|
221
|
+
.prepare('SELECT id FROM pages WHERE site_id = ? AND normalized_url = ?')
|
|
222
|
+
.get(siteId, targetPath);
|
|
223
|
+
if (existing && existing.id !== row.id) {
|
|
224
|
+
this.db.prepare('UPDATE edges SET source_page_id = ? WHERE source_page_id = ?').run(existing.id, row.id);
|
|
225
|
+
this.db.prepare('UPDATE edges SET target_page_id = ? WHERE target_page_id = ?').run(existing.id, row.id);
|
|
226
|
+
this.db.prepare('UPDATE OR IGNORE metrics SET page_id = ? WHERE page_id = ?').run(existing.id, row.id);
|
|
227
|
+
this.db.prepare('DELETE FROM metrics WHERE page_id = ?').run(row.id);
|
|
228
|
+
this.db.prepare('DELETE FROM pages WHERE id = ?').run(row.id);
|
|
229
|
+
}
|
|
230
|
+
else {
|
|
231
|
+
this.db.prepare('UPDATE pages SET normalized_url = ? WHERE id = ?').run(targetPath, row.id);
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
});
|
|
235
|
+
tx();
|
|
236
|
+
}
|
|
93
237
|
}
|
|
@@ -2,6 +2,8 @@ import { Database } from 'better-sqlite3';
|
|
|
2
2
|
export interface Site {
|
|
3
3
|
id: number;
|
|
4
4
|
domain: string;
|
|
5
|
+
preferred_url: string | null;
|
|
6
|
+
ssl: number | null;
|
|
5
7
|
created_at: string;
|
|
6
8
|
settings_json: string | null;
|
|
7
9
|
is_active: number;
|
|
@@ -9,7 +11,14 @@ export interface Site {
|
|
|
9
11
|
export declare class SiteRepository {
|
|
10
12
|
private db;
|
|
11
13
|
constructor(db: Database);
|
|
14
|
+
getSiteById(id: number): Site | undefined;
|
|
12
15
|
getSite(domain: string): Site | undefined;
|
|
16
|
+
getAllSites(): Site[];
|
|
13
17
|
createSite(domain: string): number;
|
|
18
|
+
updateSitePreference(id: number, prefs: {
|
|
19
|
+
preferred_url: string;
|
|
20
|
+
ssl: number;
|
|
21
|
+
}): void;
|
|
14
22
|
firstOrCreateSite(domain: string): Site;
|
|
23
|
+
deleteSite(id: number): void;
|
|
15
24
|
}
|
|
@@ -3,14 +3,24 @@ export class SiteRepository {
|
|
|
3
3
|
constructor(db) {
|
|
4
4
|
this.db = db;
|
|
5
5
|
}
|
|
6
|
+
getSiteById(id) {
|
|
7
|
+
return this.db.prepare('SELECT * FROM sites WHERE id = ?').get(id);
|
|
8
|
+
}
|
|
6
9
|
getSite(domain) {
|
|
7
10
|
return this.db.prepare('SELECT * FROM sites WHERE domain = ?').get(domain);
|
|
8
11
|
}
|
|
12
|
+
getAllSites() {
|
|
13
|
+
return this.db.prepare('SELECT * FROM sites ORDER BY domain ASC').all();
|
|
14
|
+
}
|
|
9
15
|
createSite(domain) {
|
|
10
16
|
const stmt = this.db.prepare('INSERT INTO sites (domain) VALUES (?)');
|
|
11
17
|
const info = stmt.run(domain);
|
|
12
18
|
return info.lastInsertRowid;
|
|
13
19
|
}
|
|
20
|
+
updateSitePreference(id, prefs) {
|
|
21
|
+
const stmt = this.db.prepare('UPDATE sites SET preferred_url = ?, ssl = ? WHERE id = ?');
|
|
22
|
+
stmt.run(prefs.preferred_url, prefs.ssl, id);
|
|
23
|
+
}
|
|
14
24
|
firstOrCreateSite(domain) {
|
|
15
25
|
let site = this.getSite(domain);
|
|
16
26
|
if (!site) {
|
|
@@ -19,4 +29,7 @@ export class SiteRepository {
|
|
|
19
29
|
}
|
|
20
30
|
return site;
|
|
21
31
|
}
|
|
32
|
+
deleteSite(id) {
|
|
33
|
+
this.db.prepare('DELETE FROM sites WHERE id = ?').run(id);
|
|
34
|
+
}
|
|
22
35
|
}
|
|
@@ -2,11 +2,11 @@ import { Database } from 'better-sqlite3';
|
|
|
2
2
|
export interface Snapshot {
|
|
3
3
|
id: number;
|
|
4
4
|
site_id: number;
|
|
5
|
-
|
|
5
|
+
run_type: 'completed' | 'incremental' | 'single';
|
|
6
6
|
created_at: string;
|
|
7
7
|
node_count: number;
|
|
8
8
|
edge_count: number;
|
|
9
|
-
status: 'running' | 'completed' | 'failed';
|
|
9
|
+
status: 'queued' | 'running' | 'completed' | 'failed' | 'cancelled';
|
|
10
10
|
limit_reached: number;
|
|
11
11
|
health_score: number | null;
|
|
12
12
|
orphan_count: number | null;
|
|
@@ -15,8 +15,17 @@ export interface Snapshot {
|
|
|
15
15
|
export declare class SnapshotRepository {
|
|
16
16
|
private db;
|
|
17
17
|
constructor(db: Database);
|
|
18
|
-
createSnapshot(siteId: number,
|
|
19
|
-
getLatestSnapshot(siteId: number, status?: '
|
|
20
|
-
|
|
18
|
+
createSnapshot(siteId: number, runType: Snapshot['run_type'], status?: Snapshot['status']): number;
|
|
19
|
+
getLatestSnapshot(siteId: number, status?: Snapshot['status'], includeSingle?: boolean): Snapshot | undefined;
|
|
20
|
+
touchSnapshot(id: number): void;
|
|
21
|
+
getSnapshotCount(siteId: number): number;
|
|
22
|
+
/**
|
|
23
|
+
* Returns true if the site has ever had a completed full or incremental crawl.
|
|
24
|
+
* Single snapshots (from page --live) do NOT count as a "first crawl".
|
|
25
|
+
*/
|
|
26
|
+
hasFullCrawl(siteId: number): boolean;
|
|
27
|
+
updateSnapshotStatus(id: number, status: Snapshot['status'], stats?: Partial<Snapshot>): void;
|
|
21
28
|
getSnapshot(id: number): Snapshot | undefined;
|
|
29
|
+
deleteSnapshot(id: number): void;
|
|
30
|
+
pruneSnapshots(siteId: number, maxSnapshots: number, maxSingleSnapshots: number, protectedSnapshotId?: number): void;
|
|
22
31
|
}
|
|
@@ -3,21 +3,39 @@ export class SnapshotRepository {
|
|
|
3
3
|
constructor(db) {
|
|
4
4
|
this.db = db;
|
|
5
5
|
}
|
|
6
|
-
createSnapshot(siteId,
|
|
7
|
-
const stmt = this.db.prepare('INSERT INTO snapshots (site_id,
|
|
8
|
-
const info = stmt.run(siteId,
|
|
6
|
+
createSnapshot(siteId, runType, status = 'running') {
|
|
7
|
+
const stmt = this.db.prepare('INSERT INTO snapshots (site_id, run_type, status) VALUES (?, ?, ?)');
|
|
8
|
+
const info = stmt.run(siteId, runType, status);
|
|
9
9
|
return info.lastInsertRowid;
|
|
10
10
|
}
|
|
11
|
-
getLatestSnapshot(siteId, status) {
|
|
11
|
+
getLatestSnapshot(siteId, status, includeSingle = false) {
|
|
12
12
|
let sql = 'SELECT * FROM snapshots WHERE site_id = ?';
|
|
13
|
+
if (!includeSingle) {
|
|
14
|
+
sql += ' AND run_type != \'single\'';
|
|
15
|
+
}
|
|
13
16
|
const params = [siteId];
|
|
14
17
|
if (status) {
|
|
15
18
|
sql += ' AND status = ?';
|
|
16
19
|
params.push(status);
|
|
17
20
|
}
|
|
18
|
-
sql += ' ORDER BY created_at DESC LIMIT 1';
|
|
21
|
+
sql += ' ORDER BY created_at DESC, id DESC LIMIT 1';
|
|
19
22
|
return this.db.prepare(sql).get(...params);
|
|
20
23
|
}
|
|
24
|
+
touchSnapshot(id) {
|
|
25
|
+
this.db.prepare(`UPDATE snapshots SET created_at = datetime('now') WHERE id = ?`).run(id);
|
|
26
|
+
}
|
|
27
|
+
getSnapshotCount(siteId) {
|
|
28
|
+
const result = this.db.prepare('SELECT COUNT(*) as count FROM snapshots WHERE site_id = ?').get(siteId);
|
|
29
|
+
return result.count;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Returns true if the site has ever had a completed full or incremental crawl.
|
|
33
|
+
* Single snapshots (from page --live) do NOT count as a "first crawl".
|
|
34
|
+
*/
|
|
35
|
+
hasFullCrawl(siteId) {
|
|
36
|
+
const result = this.db.prepare(`SELECT COUNT(*) as count FROM snapshots WHERE site_id = ? AND run_type IN ('completed', 'incremental') AND status = 'completed'`).get(siteId);
|
|
37
|
+
return result.count > 0;
|
|
38
|
+
}
|
|
21
39
|
updateSnapshotStatus(id, status, stats = {}) {
|
|
22
40
|
const sets = ['status = ?'];
|
|
23
41
|
const params = [status];
|
|
@@ -52,4 +70,45 @@ export class SnapshotRepository {
|
|
|
52
70
|
getSnapshot(id) {
|
|
53
71
|
return this.db.prepare('SELECT * FROM snapshots WHERE id = ?').get(id);
|
|
54
72
|
}
|
|
73
|
+
deleteSnapshot(id) {
|
|
74
|
+
const tx = this.db.transaction(() => {
|
|
75
|
+
// Unlink pages from this snapshot to prevent FK constraint violations or data inconsistencies
|
|
76
|
+
this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
|
|
77
|
+
this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
|
|
78
|
+
// Cleanup: Delete pages that are no longer referenced by any snapshot
|
|
79
|
+
this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
|
|
80
|
+
// Delete the snapshot
|
|
81
|
+
this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
|
|
82
|
+
});
|
|
83
|
+
tx();
|
|
84
|
+
}
|
|
85
|
+
pruneSnapshots(siteId, maxSnapshots, maxSingleSnapshots, protectedSnapshotId) {
|
|
86
|
+
const tx = this.db.transaction(() => {
|
|
87
|
+
const singlesToDelete = this.db.prepare(`
|
|
88
|
+
SELECT id
|
|
89
|
+
FROM snapshots
|
|
90
|
+
WHERE site_id = ? AND run_type = 'single'
|
|
91
|
+
ORDER BY created_at DESC, id DESC
|
|
92
|
+
LIMIT -1 OFFSET ?
|
|
93
|
+
`).all(siteId, Math.max(0, maxSingleSnapshots));
|
|
94
|
+
const fullToDelete = this.db.prepare(`
|
|
95
|
+
SELECT id
|
|
96
|
+
FROM snapshots
|
|
97
|
+
WHERE site_id = ? AND run_type IN ('completed', 'incremental')
|
|
98
|
+
ORDER BY created_at DESC, id DESC
|
|
99
|
+
LIMIT -1 OFFSET ?
|
|
100
|
+
`).all(siteId, Math.max(0, maxSnapshots));
|
|
101
|
+
const ids = [...singlesToDelete, ...fullToDelete]
|
|
102
|
+
.map(r => r.id)
|
|
103
|
+
.filter(id => id !== protectedSnapshotId);
|
|
104
|
+
for (const id of ids) {
|
|
105
|
+
// Inline delete logic to keep operation inside this transaction.
|
|
106
|
+
this.db.prepare('UPDATE pages SET first_seen_snapshot_id = NULL WHERE first_seen_snapshot_id = ?').run(id);
|
|
107
|
+
this.db.prepare('UPDATE pages SET last_seen_snapshot_id = NULL WHERE last_seen_snapshot_id = ?').run(id);
|
|
108
|
+
this.db.prepare('DELETE FROM pages WHERE first_seen_snapshot_id IS NULL AND last_seen_snapshot_id IS NULL').run();
|
|
109
|
+
this.db.prepare('DELETE FROM snapshots WHERE id = ?').run(id);
|
|
110
|
+
}
|
|
111
|
+
});
|
|
112
|
+
tx();
|
|
113
|
+
}
|
|
55
114
|
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Options accepted by `resetCrawlith`.
 *
 * - `reportsDir`: when set, the reset resolves this path and removes the
 *   directory recursively; when omitted, no reports directory is touched.
 * - `dryRun`: when true, the reset returns immediately without changing
 *   anything. Defaults to false.
 */
export interface ResetOptions {
|
|
2
|
+
reportsDir?: string;
|
|
3
|
+
dryRun?: boolean;
|
|
4
|
+
}
|
|
5
|
+
/**
|
|
6
|
+
* Completely resets the Crawlith state.
|
|
7
|
+
* Deletes the database, clears all locks, and optionally wipes the reports directory.
*
* @param options - Reset options. With `dryRun: true` the call returns without
*   performing any of the steps above; `reportsDir` selects the reports
*   directory to remove (none is removed when it is omitted).
* @returns A promise that settles once the reset steps have finished.
|
|
8
|
+
*/
|
|
9
|
+
export declare function resetCrawlith(options?: ResetOptions): Promise<void>;
|
package/dist/db/reset.js
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import os from 'node:os';
|
|
4
|
+
import { closeDb, getDb, getDbPath } from './index.js';
|
|
5
|
+
import { LockManager } from '../lock/lockManager.js';
|
|
6
|
+
/**
|
|
7
|
+
* Completely resets the Crawlith state.
|
|
8
|
+
* Deletes the database, clears all locks, and optionally wipes the reports directory.
|
|
9
|
+
*/
|
|
10
|
+
export async function resetCrawlith(options = {}) {
|
|
11
|
+
const { reportsDir, dryRun = false } = options;
|
|
12
|
+
if (dryRun) {
|
|
13
|
+
return;
|
|
14
|
+
}
|
|
15
|
+
// 1. Close database connection to release file handles
|
|
16
|
+
closeDb();
|
|
17
|
+
// 2. Clear all locks
|
|
18
|
+
await LockManager.clearAllLocks();
|
|
19
|
+
// 3. Remove the entire state directory (includes DB)
|
|
20
|
+
const dbPath = getDbPath();
|
|
21
|
+
if (dbPath !== ':memory:') {
|
|
22
|
+
const crawlithDir = path.join(os.homedir(), '.crawlith');
|
|
23
|
+
await fs.rm(crawlithDir, { recursive: true, force: true });
|
|
24
|
+
}
|
|
25
|
+
// 4. Remove reports directory if specified
|
|
26
|
+
if (reportsDir) {
|
|
27
|
+
const resolvedReportsDir = path.resolve(reportsDir);
|
|
28
|
+
await fs.rm(resolvedReportsDir, { recursive: true, force: true });
|
|
29
|
+
}
|
|
30
|
+
// 5. Re-initialize database to ensure schema is fresh for next use
|
|
31
|
+
getDb();
|
|
32
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { Database, Statement } from 'better-sqlite3';
|
|
2
|
+
/**
 * Holder for prepared statements built from the database handle passed to the
 * constructor.
 *
 * According to the compiled implementation shipped with this declaration:
 * - `getPageIdByUrl`: binds (snapshot id, normalized URL) and selects
 *   `pages.id` within the site that owns that snapshot.
 * - `insertPluginReport`: `INSERT OR REPLACE` into `plugin_reports`; binds
 *   seven values (snapshot_id, plugin_name, data, total_score, score_count,
 *   score_weight_sum, score_calculated_at).
 * - `getPluginReport`: binds (snapshot id, plugin name) and selects `data`
 *   from the row with the latest `created_at`.
 * - `deleteSnapshotPlugins`: deletes every `plugin_reports` row of a snapshot.
 * - `getSnapshot`: selects only the `id` column of a snapshot by id
 *   (presumably used as an existence check; confirm against callers).
 * - `getMigration` / `insertMigration`: look up / insert a `plugin_migrations`
 *   row by `plugin_name`.
 */
export declare class Statements {
|
|
3
|
+
private db;
|
|
4
|
+
getPageIdByUrl: Statement;
|
|
5
|
+
insertPluginReport: Statement;
|
|
6
|
+
getPluginReport: Statement;
|
|
7
|
+
deleteSnapshotPlugins: Statement;
|
|
8
|
+
getSnapshot: Statement;
|
|
9
|
+
getMigration: Statement;
|
|
10
|
+
insertMigration: Statement;
|
|
11
|
+
constructor(db: Database);
|
|
12
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
export class Statements {
|
|
2
|
+
db;
|
|
3
|
+
getPageIdByUrl;
|
|
4
|
+
insertPluginReport;
|
|
5
|
+
getPluginReport;
|
|
6
|
+
deleteSnapshotPlugins;
|
|
7
|
+
getSnapshot;
|
|
8
|
+
getMigration;
|
|
9
|
+
insertMigration;
|
|
10
|
+
constructor(db) {
|
|
11
|
+
this.db = db;
|
|
12
|
+
this.getPageIdByUrl = this.db.prepare(`
|
|
13
|
+
SELECT id FROM pages
|
|
14
|
+
WHERE site_id = (SELECT site_id FROM snapshots WHERE id = ?)
|
|
15
|
+
AND normalized_url = ?
|
|
16
|
+
`);
|
|
17
|
+
this.insertPluginReport = this.db.prepare(`
|
|
18
|
+
INSERT OR REPLACE INTO plugin_reports
|
|
19
|
+
(snapshot_id, plugin_name, data, total_score, score_count, score_weight_sum, score_calculated_at)
|
|
20
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
21
|
+
`);
|
|
22
|
+
this.getPluginReport = this.db.prepare(`
|
|
23
|
+
SELECT data FROM plugin_reports
|
|
24
|
+
WHERE snapshot_id = ? AND plugin_name = ?
|
|
25
|
+
ORDER BY created_at DESC LIMIT 1
|
|
26
|
+
`);
|
|
27
|
+
this.deleteSnapshotPlugins = this.db.prepare(`
|
|
28
|
+
DELETE FROM plugin_reports WHERE snapshot_id = ?
|
|
29
|
+
`);
|
|
30
|
+
this.getSnapshot = this.db.prepare(`
|
|
31
|
+
SELECT id FROM snapshots WHERE id = ?
|
|
32
|
+
`);
|
|
33
|
+
this.getMigration = this.db.prepare(`
|
|
34
|
+
SELECT plugin_name FROM plugin_migrations WHERE plugin_name = ?
|
|
35
|
+
`);
|
|
36
|
+
this.insertMigration = this.db.prepare(`
|
|
37
|
+
INSERT INTO plugin_migrations (plugin_name) VALUES (?)
|
|
38
|
+
`);
|
|
39
|
+
}
|
|
40
|
+
}
|
package/dist/diff/compare.d.ts
CHANGED
|
@@ -12,11 +12,6 @@ export interface DiffResult {
|
|
|
12
12
|
oldCanonical: string | null;
|
|
13
13
|
newCanonical: string | null;
|
|
14
14
|
}[];
|
|
15
|
-
changedDuplicateGroup: {
|
|
16
|
-
url: string;
|
|
17
|
-
oldGroup: string | null;
|
|
18
|
-
newGroup: string | null;
|
|
19
|
-
}[];
|
|
20
15
|
metricDeltas: {
|
|
21
16
|
structuralEntropy: number;
|
|
22
17
|
orphanCount: number;
|
package/dist/diff/compare.js
CHANGED
|
@@ -6,7 +6,6 @@ export function compareGraphs(oldGraph, newGraph) {
|
|
|
6
6
|
const removedUrls = [];
|
|
7
7
|
const changedStatus = [];
|
|
8
8
|
const changedCanonical = [];
|
|
9
|
-
const changedDuplicateGroup = [];
|
|
10
9
|
// Added & Changed
|
|
11
10
|
for (const [url, newNode] of newNodes) {
|
|
12
11
|
const oldNode = oldNodes.get(url);
|
|
@@ -26,16 +25,6 @@ export function compareGraphs(oldGraph, newGraph) {
|
|
|
26
25
|
newCanonical: newNode.canonical || null
|
|
27
26
|
});
|
|
28
27
|
}
|
|
29
|
-
// Changed Duplicate Group
|
|
30
|
-
const oldGroup = oldNode.duplicateClusterId || null;
|
|
31
|
-
const newGroup = newNode.duplicateClusterId || null;
|
|
32
|
-
if (oldGroup !== newGroup) {
|
|
33
|
-
changedDuplicateGroup.push({
|
|
34
|
-
url,
|
|
35
|
-
oldGroup,
|
|
36
|
-
newGroup
|
|
37
|
-
});
|
|
38
|
-
}
|
|
39
28
|
}
|
|
40
29
|
}
|
|
41
30
|
// Removed
|
|
@@ -58,7 +47,6 @@ export function compareGraphs(oldGraph, newGraph) {
|
|
|
58
47
|
removedUrls,
|
|
59
48
|
changedStatus,
|
|
60
49
|
changedCanonical,
|
|
61
|
-
changedDuplicateGroup,
|
|
62
50
|
metricDeltas
|
|
63
51
|
};
|
|
64
52
|
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { Graph } from '../graph/graph.js';
|
|
2
|
+
/**
 * Options for `DiffService.compare`.
 *
 * NOTE(review): the compiled `compare` shipped with this declaration names its
 * options parameter `_options` and never reads it, so `onlyCritical` appears
 * to have no effect in this build. Confirm before relying on it.
 */
export interface DiffOptions {
|
|
3
|
+
onlyCritical?: boolean;
|
|
4
|
+
}
|
|
5
|
+
/**
 * Result of comparing two crawl graphs by URL.
 *
 * - `newPages`: URLs present in the new graph but not in the old one.
 * - `removedPages`: URLs present in the old graph but not in the new one.
 * - `changedPages`: URLs present in both graphs with at least one detected
 *   difference; `changes` holds one human-readable entry per difference and
 *   `severity` the highest level among them.
 */
export interface SnapshotDiff {
|
|
6
|
+
newPages: string[];
|
|
7
|
+
removedPages: string[];
|
|
8
|
+
changedPages: {
|
|
9
|
+
url: string;
|
|
10
|
+
changes: string[];
|
|
11
|
+
severity: 'low' | 'medium' | 'high';
|
|
12
|
+
}[];
|
|
13
|
+
}
|
|
14
|
+
/**
 * Computes a `SnapshotDiff` between two graphs.
 *
 * `compare` accepts `undefined` for `oldGraph`; in the compiled implementation
 * shipped with this declaration that case reports every URL of `newGraph` as
 * a new page and nothing as removed or changed.
 */
export declare class DiffService {
|
|
15
|
+
compare(oldGraph: Graph | undefined, newGraph: Graph, _options?: DiffOptions): SnapshotDiff;
|
|
16
|
+
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
export class DiffService {
|
|
2
|
+
compare(oldGraph, newGraph, _options = {}) {
|
|
3
|
+
if (!oldGraph) {
|
|
4
|
+
return {
|
|
5
|
+
newPages: Array.from(newGraph.nodes.keys()),
|
|
6
|
+
removedPages: [],
|
|
7
|
+
changedPages: []
|
|
8
|
+
};
|
|
9
|
+
}
|
|
10
|
+
const oldUrls = new Set(oldGraph.nodes.keys());
|
|
11
|
+
const newUrls = new Set(newGraph.nodes.keys());
|
|
12
|
+
const newPages = Array.from(newUrls).filter(u => !oldUrls.has(u));
|
|
13
|
+
const removedPages = Array.from(oldUrls).filter(u => !newUrls.has(u));
|
|
14
|
+
const changedPages = [];
|
|
15
|
+
for (const url of newUrls) {
|
|
16
|
+
if (oldUrls.has(url)) {
|
|
17
|
+
const oldNode = oldGraph.nodes.get(url);
|
|
18
|
+
const newNode = newGraph.nodes.get(url);
|
|
19
|
+
const changes = [];
|
|
20
|
+
let severity = 'low';
|
|
21
|
+
if (oldNode.status !== newNode.status) {
|
|
22
|
+
changes.push(`status: ${oldNode.status} -> ${newNode.status}`);
|
|
23
|
+
severity = 'high';
|
|
24
|
+
}
|
|
25
|
+
if (oldNode.contentHash !== newNode.contentHash) {
|
|
26
|
+
changes.push('content changed');
|
|
27
|
+
if (severity !== 'high')
|
|
28
|
+
severity = 'medium';
|
|
29
|
+
}
|
|
30
|
+
if (oldNode.noindex !== newNode.noindex) {
|
|
31
|
+
changes.push(`noindex: ${oldNode.noindex} -> ${newNode.noindex}`);
|
|
32
|
+
severity = 'high';
|
|
33
|
+
}
|
|
34
|
+
if (changes.length > 0) {
|
|
35
|
+
changedPages.push({ url, changes, severity });
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
return { newPages, removedPages, changedPages };
|
|
40
|
+
}
|
|
41
|
+
}
|