@crawlith/core 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analyze.d.ts +29 -8
- package/dist/analysis/analyze.js +325 -221
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +4 -1
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/crawler/crawl.d.ts +2 -2
- package/dist/crawler/crawler.d.ts +17 -5
- package/dist/crawler/crawler.js +259 -94
- package/dist/crawler/fetcher.d.ts +1 -1
- package/dist/crawler/fetcher.js +6 -6
- package/dist/crawler/metricsRunner.d.ts +21 -1
- package/dist/crawler/metricsRunner.js +181 -60
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +4 -1
- package/dist/crawler/sitemap.js +24 -18
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +15 -32
- package/dist/db/index.d.ts +9 -1
- package/dist/db/index.js +39 -31
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +5 -0
- package/dist/db/repositories/EdgeRepository.js +7 -0
- package/dist/db/repositories/MetricsRepository.d.ts +13 -8
- package/dist/db/repositories/MetricsRepository.js +14 -6
- package/dist/db/repositories/PageRepository.d.ts +5 -3
- package/dist/db/repositories/PageRepository.js +68 -17
- package/dist/db/repositories/SiteRepository.d.ts +6 -0
- package/dist/db/repositories/SiteRepository.js +4 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +12 -5
- package/dist/db/repositories/SnapshotRepository.js +48 -10
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +8 -0
- package/dist/graph/graph.d.ts +20 -42
- package/dist/graph/graph.js +12 -16
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +19 -15
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -93
- package/dist/index.d.ts +27 -9
- package/dist/index.js +27 -9
- package/dist/lock/lockManager.d.ts +1 -0
- package/dist/lock/lockManager.js +15 -0
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +17 -11
- package/dist/scoring/health.js +183 -140
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +10 -4
- package/CHANGELOG.md +0 -13
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -221
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -302
- package/dist/scoring/hits.d.ts +0 -10
- package/dist/scoring/hits.js +0 -131
- package/scripts/copy-assets.js +0 -37
- package/src/analysis/analysis_list.html +0 -35
- package/src/analysis/analysis_page.html +0 -123
- package/src/analysis/analyze.ts +0 -505
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -66
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/analysis/templates.ts +0 -9
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -171
- package/src/crawler/crawl.ts +0 -9
- package/src/crawler/crawler.ts +0 -601
- package/src/crawler/extract.ts +0 -39
- package/src/crawler/fetcher.ts +0 -251
- package/src/crawler/metricsRunner.ts +0 -137
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -76
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -135
- package/src/db/index.ts +0 -75
- package/src/db/repositories/EdgeRepository.ts +0 -43
- package/src/db/repositories/MetricsRepository.ts +0 -63
- package/src/db/repositories/PageRepository.ts +0 -228
- package/src/db/repositories/SiteRepository.ts +0 -43
- package/src/db/repositories/SnapshotRepository.ts +0 -99
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/events.ts +0 -16
- package/src/graph/cluster.ts +0 -246
- package/src/graph/duplicate.ts +0 -350
- package/src/graph/graph.ts +0 -192
- package/src/graph/metrics.ts +0 -125
- package/src/graph/pagerank.ts +0 -126
- package/src/graph/simhash.ts +0 -76
- package/src/index.ts +0 -33
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -132
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/crawl.html +0 -879
- package/src/report/crawlExport.ts +0 -58
- package/src/report/crawl_template.ts +0 -9
- package/src/report/html.ts +0 -27
- package/src/scoring/health.ts +0 -241
- package/src/scoring/hits.ts +0 -153
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -142
- package/tests/analyze.integration.test.ts +0 -133
- package/tests/analyze_markdown.test.ts +0 -98
- package/tests/audit/audit.test.ts +0 -101
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -111
- package/tests/clustering.test.ts +0 -118
- package/tests/clustering_risk.test.ts +0 -118
- package/tests/crawler.test.ts +0 -364
- package/tests/db/index.test.ts +0 -134
- package/tests/db/repositories.test.ts +0 -115
- package/tests/db.test.ts +0 -159
- package/tests/db_repos.test.ts +0 -72
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/extract.test.ts +0 -86
- package/tests/fetcher.test.ts +0 -110
- package/tests/fetcher_safety.test.ts +0 -91
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/graph/graph.test.ts +0 -100
- package/tests/graphLoader.test.ts +0 -124
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -59
- package/tests/ipGuard.test.ts +0 -73
- package/tests/lock/lockManager.test.ts +0 -198
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -88
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -77
- package/tests/renderAnalysisCsv.test.ts +0 -183
- package/tests/safety.test.ts +0 -126
- package/tests/scope.test.ts +0 -84
- package/tests/scoring.test.ts +0 -60
- package/tests/sitemap.test.ts +0 -100
- package/tests/soft404.test.ts +0 -41
- package/tests/ssrf_fix.test.ts +0 -69
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
|
@@ -1,3 +1,23 @@
|
|
|
1
1
|
import { EngineContext } from '../events.js';
|
|
2
2
|
import { Graph } from '../graph/graph.js';
|
|
3
|
-
export
|
|
3
|
+
export interface PostCrawlOptions {
|
|
4
|
+
context?: EngineContext;
|
|
5
|
+
limitReached?: boolean;
|
|
6
|
+
graphInstance?: Graph;
|
|
7
|
+
clustering?: boolean;
|
|
8
|
+
clusterThreshold?: number;
|
|
9
|
+
minClusterSize?: number;
|
|
10
|
+
health?: boolean;
|
|
11
|
+
computePagerank?: boolean;
|
|
12
|
+
computeHits?: boolean;
|
|
13
|
+
heading?: boolean;
|
|
14
|
+
orphans?: boolean;
|
|
15
|
+
orphanSeverity?: 'low' | 'medium' | 'high' | boolean;
|
|
16
|
+
includeSoftOrphans?: boolean;
|
|
17
|
+
minInbound?: number;
|
|
18
|
+
rootOrigin?: string;
|
|
19
|
+
}
|
|
20
|
+
export declare function runPostCrawlMetrics(snapshotId: number, maxDepth: number, options?: PostCrawlOptions): {
|
|
21
|
+
metrics: any;
|
|
22
|
+
healthData?: any;
|
|
23
|
+
} | undefined;
|
|
@@ -3,11 +3,22 @@ import { loadGraphFromSnapshot } from '../db/graphLoader.js';
|
|
|
3
3
|
import { MetricsRepository } from '../db/repositories/MetricsRepository.js';
|
|
4
4
|
import { SnapshotRepository } from '../db/repositories/SnapshotRepository.js';
|
|
5
5
|
import { PageRepository } from '../db/repositories/PageRepository.js';
|
|
6
|
-
import { computePageRank } from '../graph/pagerank.js';
|
|
7
6
|
import { calculateMetrics } from '../graph/metrics.js';
|
|
8
|
-
import {
|
|
9
|
-
import {
|
|
10
|
-
|
|
7
|
+
import { PageRankService } from '../graph/pagerank.js';
|
|
8
|
+
import { HITSService } from '../graph/hits.js';
|
|
9
|
+
import { TrapDetector } from './trap.js';
|
|
10
|
+
import { ClusteringService } from '../analysis/clustering.js';
|
|
11
|
+
import { DuplicateService } from '../analysis/duplicate.js';
|
|
12
|
+
import { annotateOrphans } from '../analysis/orphan.js';
|
|
13
|
+
import { Soft404Service } from '../analysis/soft404.js';
|
|
14
|
+
import { HeadingHealthService } from '../analysis/heading.js';
|
|
15
|
+
import { HealthService } from '../scoring/health.js';
|
|
16
|
+
import { analyzeContent } from '../analysis/content.js';
|
|
17
|
+
import { load } from 'cheerio';
|
|
18
|
+
export function runPostCrawlMetrics(snapshotId, maxDepth, options = {}) {
|
|
19
|
+
const context = options.context;
|
|
20
|
+
const limitReached = options.limitReached || false;
|
|
21
|
+
const graphInstance = options.graphInstance;
|
|
11
22
|
const db = getDb();
|
|
12
23
|
const metricsRepo = new MetricsRepository(db);
|
|
13
24
|
const snapshotRepo = new SnapshotRepository(db);
|
|
@@ -21,62 +32,160 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, context, limitReached
|
|
|
21
32
|
else {
|
|
22
33
|
if (event.type === 'error')
|
|
23
34
|
console.error(event.message);
|
|
24
|
-
else if (event.type !== 'debug')
|
|
25
|
-
|
|
35
|
+
else if (event.type !== 'debug') {
|
|
36
|
+
const out = event.message || event.phase;
|
|
37
|
+
if (out)
|
|
38
|
+
console.log(out);
|
|
39
|
+
}
|
|
26
40
|
}
|
|
27
41
|
};
|
|
28
42
|
const snapshot = snapshotRepo.getSnapshot(snapshotId);
|
|
29
43
|
if (!snapshot) {
|
|
30
44
|
emit({ type: 'error', message: `Snapshot ${snapshotId} not found` });
|
|
31
|
-
return;
|
|
45
|
+
return undefined;
|
|
32
46
|
}
|
|
33
47
|
if (!graphInstance) {
|
|
34
48
|
emit({ type: 'metrics:start', phase: 'Loading graph' });
|
|
35
49
|
}
|
|
36
|
-
emit({ type: 'metrics:start', phase: '
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
50
|
+
emit({ type: 'metrics:start', phase: 'Running core algorithms' });
|
|
51
|
+
// 1. Graph Algorithms
|
|
52
|
+
const prResults = options.computePagerank ? new PageRankService().evaluate(graph) : new Map();
|
|
53
|
+
const hitsResults = options.computeHits ? new HITSService().evaluate(graph, { iterations: 20 }) : new Map();
|
|
54
|
+
// 2. Crawler Safety
|
|
55
|
+
new TrapDetector().analyze(graph);
|
|
56
|
+
// 3. Analysis / Intelligence
|
|
57
|
+
if (options.clustering) {
|
|
58
|
+
const contentClusters = new ClusteringService().detectContentClusters(graph, options.clusterThreshold, options.minClusterSize);
|
|
59
|
+
if (contentClusters.length > 0) {
|
|
60
|
+
const insertCluster = db.prepare(`
|
|
61
|
+
INSERT OR REPLACE INTO content_clusters (id, snapshot_id, count, primary_url, risk, shared_path_prefix)
|
|
62
|
+
VALUES (@id, @snapshot_id, @count, @primary_url, @risk, @shared_path_prefix)
|
|
63
|
+
`);
|
|
64
|
+
const insertContentTx = db.transaction((clusters) => {
|
|
65
|
+
for (const c of clusters) {
|
|
66
|
+
insertCluster.run({
|
|
67
|
+
id: c.id,
|
|
68
|
+
snapshot_id: snapshotId,
|
|
69
|
+
count: c.count,
|
|
70
|
+
primary_url: c.primaryUrl,
|
|
71
|
+
risk: c.risk,
|
|
72
|
+
shared_path_prefix: c.sharedPathPrefix ?? null
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
});
|
|
76
|
+
insertContentTx(contentClusters);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
const duplicateClusters = new DuplicateService().detectDuplicates(graph, { collapse: false });
|
|
80
|
+
if (duplicateClusters.length > 0) {
|
|
81
|
+
const insertCluster = db.prepare(`
|
|
82
|
+
INSERT OR REPLACE INTO duplicate_clusters (id, snapshot_id, type, size, representative, severity)
|
|
83
|
+
VALUES (@id, @snapshot_id, @type, @size, @representative, @severity)
|
|
84
|
+
`);
|
|
85
|
+
const insertDuplicateTx = db.transaction((clusters) => {
|
|
86
|
+
for (const c of clusters) {
|
|
87
|
+
insertCluster.run({
|
|
88
|
+
id: c.id,
|
|
89
|
+
snapshot_id: snapshotId,
|
|
90
|
+
type: c.type, // valid: 'exact' | 'near' | 'template_heavy'
|
|
91
|
+
size: c.size,
|
|
92
|
+
representative: c.representative,
|
|
93
|
+
severity: c.severity || 'low'
|
|
94
|
+
});
|
|
95
|
+
}
|
|
96
|
+
});
|
|
97
|
+
insertDuplicateTx(duplicateClusters);
|
|
98
|
+
}
|
|
99
|
+
let annotatedNodes = [];
|
|
100
|
+
if (options.orphans) {
|
|
101
|
+
const orphanOptions = {
|
|
102
|
+
enabled: true,
|
|
103
|
+
severityEnabled: !!options.orphanSeverity || options.orphanSeverity === undefined,
|
|
104
|
+
includeSoftOrphans: options.includeSoftOrphans ?? true,
|
|
105
|
+
minInbound: options.minInbound ?? 2
|
|
106
|
+
};
|
|
107
|
+
annotatedNodes = annotateOrphans(graph.getNodes(), graph.getEdges(), orphanOptions);
|
|
108
|
+
}
|
|
109
|
+
const soft404Service = new Soft404Service();
|
|
110
|
+
const headingService = new HeadingHealthService();
|
|
111
|
+
// Pre-calculate heading health for all nodes with HTML
|
|
112
|
+
let headingPayloads = new Map();
|
|
113
|
+
if (options.heading) {
|
|
114
|
+
const result = headingService.evaluateNodes(graph.getNodes());
|
|
115
|
+
headingPayloads = result.payloadsByUrl;
|
|
116
|
+
}
|
|
117
|
+
// Apply signals to nodes
|
|
118
|
+
for (const node of graph.getNodes()) {
|
|
119
|
+
const pr = prResults.get(node.url);
|
|
120
|
+
if (pr)
|
|
121
|
+
node.pagerankScore = pr.score;
|
|
122
|
+
const hits = hitsResults.get(node.url);
|
|
123
|
+
if (hits) {
|
|
124
|
+
node.authScore = hits.authority_score;
|
|
125
|
+
node.hubScore = hits.hub_score;
|
|
126
|
+
node.linkRole = hits.link_role;
|
|
127
|
+
}
|
|
128
|
+
if (options.orphans) {
|
|
129
|
+
const annotated = annotatedNodes.find((n) => n.url === node.url);
|
|
130
|
+
if (annotated) {
|
|
131
|
+
node.orphanScore = annotated.orphanSeverity;
|
|
132
|
+
node.orphanType = annotated.orphanType;
|
|
133
|
+
node.impactLevel = annotated.impactLevel;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
if (options.heading) {
|
|
137
|
+
const heading = headingPayloads.get(node.url);
|
|
138
|
+
if (heading) {
|
|
139
|
+
node.headingScore = heading.score;
|
|
140
|
+
node.headingData = JSON.stringify(heading);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
if (node.html) {
|
|
144
|
+
const soft404 = soft404Service.analyze(node.html, node.outLinks);
|
|
145
|
+
node.soft404Score = soft404.score;
|
|
146
|
+
const $ = load(node.html);
|
|
147
|
+
const content = analyzeContent($);
|
|
148
|
+
node.wordCount = content.wordCount;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
40
151
|
emit({ type: 'metrics:start', phase: 'Updating metrics in DB' });
|
|
41
|
-
const nodes = graph.getNodes();
|
|
42
152
|
// Pre-fetch all page IDs to avoid N+1 queries
|
|
43
|
-
|
|
44
|
-
const pages = pageRepo.getPagesIdentityBySnapshot(snapshotId);
|
|
153
|
+
const pagesIdentity = pageRepo.getPagesIdentityBySnapshot(snapshotId);
|
|
45
154
|
const urlToId = new Map();
|
|
46
|
-
for (const p of
|
|
155
|
+
for (const p of pagesIdentity) {
|
|
47
156
|
urlToId.set(p.normalized_url, p.id);
|
|
48
157
|
}
|
|
49
|
-
const
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
158
|
+
const metricsToSave = graph.getNodes().map(node => {
|
|
159
|
+
const pageId = urlToId.get(node.url);
|
|
160
|
+
if (!pageId)
|
|
161
|
+
return null;
|
|
162
|
+
return {
|
|
163
|
+
snapshot_id: snapshotId,
|
|
164
|
+
page_id: pageId,
|
|
165
|
+
crawl_status: node.crawlStatus ?? null,
|
|
166
|
+
word_count: node.wordCount ?? null,
|
|
167
|
+
thin_content_score: node.thinContentScore ?? null,
|
|
168
|
+
external_link_ratio: node.externalLinkRatio ?? null,
|
|
169
|
+
pagerank_score: node.pagerankScore ?? null,
|
|
170
|
+
hub_score: node.hubScore ?? null,
|
|
171
|
+
auth_score: node.authScore ?? null,
|
|
172
|
+
link_role: node.linkRole ?? null,
|
|
173
|
+
duplicate_cluster_id: node.duplicateClusterId ?? null,
|
|
174
|
+
duplicate_type: node.duplicateType ?? null,
|
|
175
|
+
cluster_id: node.clusterId ?? null,
|
|
176
|
+
soft404_score: node.soft404Score ?? null,
|
|
177
|
+
heading_score: node.headingScore ?? null,
|
|
178
|
+
orphan_score: node.orphanScore ?? null,
|
|
179
|
+
orphan_type: node.orphanType ?? null,
|
|
180
|
+
impact_level: node.impactLevel ?? null,
|
|
181
|
+
heading_data: node.headingData ?? null,
|
|
182
|
+
is_cluster_primary: node.isClusterPrimary ? 1 : 0
|
|
183
|
+
};
|
|
184
|
+
}).filter(m => m !== null);
|
|
185
|
+
metricsRepo.insertMany(metricsToSave);
|
|
186
|
+
// Update page-level metadata in transaction
|
|
57
187
|
const tx = db.transaction(() => {
|
|
58
|
-
for (const node of
|
|
59
|
-
const pageId = urlToId.get(node.url);
|
|
60
|
-
if (!pageId)
|
|
61
|
-
continue;
|
|
62
|
-
metricsRepo.insertMetrics({
|
|
63
|
-
snapshot_id: snapshotId,
|
|
64
|
-
page_id: pageId,
|
|
65
|
-
authority_score: node.authorityScore ?? null,
|
|
66
|
-
hub_score: node.hubScore ?? null,
|
|
67
|
-
pagerank: node.pageRank ?? null,
|
|
68
|
-
pagerank_score: node.pageRankScore ?? null,
|
|
69
|
-
link_role: node.linkRole ?? null,
|
|
70
|
-
crawl_status: node.crawlStatus ?? null,
|
|
71
|
-
word_count: node.wordCount ?? null,
|
|
72
|
-
thin_content_score: node.thinContentScore ?? null,
|
|
73
|
-
external_link_ratio: node.externalLinkRatio ?? null,
|
|
74
|
-
orphan_score: node.orphanScore ?? null,
|
|
75
|
-
duplicate_cluster_id: node.duplicateClusterId ?? null,
|
|
76
|
-
duplicate_type: node.duplicateType ?? null,
|
|
77
|
-
is_cluster_primary: node.isClusterPrimary ? 1 : 0
|
|
78
|
-
});
|
|
79
|
-
// Update page-level crawl trap data
|
|
188
|
+
for (const node of graph.getNodes()) {
|
|
80
189
|
if (node.crawlTrapFlag || node.redirectChain?.length || node.bytesReceived) {
|
|
81
190
|
pageRepo.upsertPage({
|
|
82
191
|
site_id: snapshot.site_id,
|
|
@@ -90,28 +199,40 @@ export function runPostCrawlMetrics(snapshotId, maxDepth, context, limitReached
|
|
|
90
199
|
});
|
|
91
200
|
}
|
|
92
201
|
}
|
|
93
|
-
// Save duplicate clusters
|
|
94
|
-
for (const cluster of graph.duplicateClusters) {
|
|
95
|
-
clusterStmt.run(cluster.id, snapshotId, cluster.type, cluster.size, cluster.representative, cluster.severity);
|
|
96
|
-
}
|
|
97
|
-
// Save content clusters
|
|
98
|
-
for (const cluster of graph.contentClusters) {
|
|
99
|
-
contentStmt.run(cluster.id, snapshotId, cluster.count, cluster.primaryUrl, cluster.risk, cluster.sharedPathPrefix ?? null);
|
|
100
|
-
}
|
|
101
202
|
});
|
|
102
203
|
tx();
|
|
103
204
|
emit({ type: 'metrics:start', phase: 'Computing aggregate stats' });
|
|
104
205
|
const metrics = calculateMetrics(graph, maxDepth);
|
|
105
|
-
//
|
|
106
|
-
|
|
107
|
-
|
|
206
|
+
// Compute health score if enabled
|
|
207
|
+
let healthScore = null;
|
|
208
|
+
if (options.health) {
|
|
209
|
+
try {
|
|
210
|
+
const rootOrigin = options.rootOrigin ?? '';
|
|
211
|
+
const healthService = new HealthService();
|
|
212
|
+
const issues = healthService.collectCrawlIssues(graph, metrics, rootOrigin);
|
|
213
|
+
const breakdown = healthService.calculateHealthScore(metrics.totalPages, issues);
|
|
214
|
+
healthScore = breakdown.score;
|
|
215
|
+
}
|
|
216
|
+
catch (e) {
|
|
217
|
+
emit({ type: 'error', message: 'Error computing health score', error: e });
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
const thinContentCount = graph.getNodes().filter(n => n.wordCount !== undefined && n.wordCount < 200 && n.status === 200).length;
|
|
221
|
+
const orphanCount = metrics.orphanPages.length;
|
|
108
222
|
snapshotRepo.updateSnapshotStatus(snapshotId, 'completed', {
|
|
109
223
|
node_count: metrics.totalPages,
|
|
110
224
|
edge_count: metrics.totalEdges,
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
225
|
+
limit_reached: limitReached ? 1 : 0,
|
|
226
|
+
thin_content_count: thinContentCount,
|
|
227
|
+
orphan_count: orphanCount,
|
|
228
|
+
...(healthScore !== null ? { health_score: healthScore } : {})
|
|
115
229
|
});
|
|
116
230
|
emit({ type: 'metrics:complete', durationMs: 0 });
|
|
231
|
+
return {
|
|
232
|
+
metrics,
|
|
233
|
+
healthData: healthScore !== null ? {
|
|
234
|
+
health: new HealthService().calculateHealthScore(metrics.totalPages, new HealthService().collectCrawlIssues(graph, metrics, options.rootOrigin ?? '')),
|
|
235
|
+
issues: new HealthService().collectCrawlIssues(graph, metrics, options.rootOrigin ?? '')
|
|
236
|
+
} : undefined
|
|
237
|
+
};
|
|
117
238
|
}
|
|
@@ -3,5 +3,46 @@
|
|
|
3
3
|
*/
|
|
4
4
|
export interface NormalizeOptions {
|
|
5
5
|
stripQuery?: boolean;
|
|
6
|
+
toPath?: boolean;
|
|
6
7
|
}
|
|
7
8
|
export declare function normalizeUrl(input: string, base: string, options?: NormalizeOptions): string | null;
|
|
9
|
+
/**
|
|
10
|
+
* Utility for converting between absolute URLs and relative paths
|
|
11
|
+
* primarily used for database storage.
|
|
12
|
+
*/
|
|
13
|
+
export declare class UrlUtil {
|
|
14
|
+
/**
|
|
15
|
+
* Extract a stable domain key from a URL/domain input.
|
|
16
|
+
* Examples:
|
|
17
|
+
* - "https://www.example.com/a" -> "example.com"
|
|
18
|
+
* - "example.com" -> "example.com"
|
|
19
|
+
*/
|
|
20
|
+
static extractDomain(input: string): string;
|
|
21
|
+
/**
|
|
22
|
+
* Resolve a site's absolute origin from persisted site fields.
|
|
23
|
+
*/
|
|
24
|
+
static resolveSiteOrigin(site: {
|
|
25
|
+
domain: string;
|
|
26
|
+
preferred_url?: string | null;
|
|
27
|
+
ssl?: number | null;
|
|
28
|
+
}): string;
|
|
29
|
+
/**
|
|
30
|
+
* Converts a full URL to a root-relative path if it matches the origin.
|
|
31
|
+
* If it doesn't match the origin, it's considered external and kept absolute.
|
|
32
|
+
*/
|
|
33
|
+
static toPath(urlStr: string, origin: string): string;
|
|
34
|
+
/**
|
|
35
|
+
* Converts a root-relative path back to an absolute URL relative to the origin.
|
|
36
|
+
* If the input is already an absolute URL, it is returned as-is.
|
|
37
|
+
*/
|
|
38
|
+
static toAbsolute(pathOrUrl: string, origin: string): string;
|
|
39
|
+
/**
|
|
40
|
+
* Determines if a URL (or path) is internal relative to the origin.
|
|
41
|
+
*/
|
|
42
|
+
static isInternal(pathOrUrl: string, origin: string): boolean;
|
|
43
|
+
/**
|
|
44
|
+
* Build normalized lookup candidates for querying pages table.
|
|
45
|
+
* Returns path/absolute/original variants in priority order, deduplicated.
|
|
46
|
+
*/
|
|
47
|
+
static toLookupCandidates(input: string, origin: string): string[];
|
|
48
|
+
}
|
|
@@ -10,7 +10,7 @@ const TRACKING_PARAMS = new Set([
|
|
|
10
10
|
]);
|
|
11
11
|
const SKIP_EXTENSIONS = new Set([
|
|
12
12
|
'.pdf', '.jpg', '.png', '.svg', '.webp', '.gif',
|
|
13
|
-
'.zip', '.xml', '.json', '.mp4'
|
|
13
|
+
'.zip', '.xml', '.json', '.mp4', '.avif', '.ics'
|
|
14
14
|
]);
|
|
15
15
|
export function normalizeUrl(input, base, options = {}) {
|
|
16
16
|
try {
|
|
@@ -71,6 +71,7 @@ export function normalizeUrl(input, base, options = {}) {
|
|
|
71
71
|
pathname = pathname.slice(0, -1);
|
|
72
72
|
}
|
|
73
73
|
u.pathname = pathname;
|
|
74
|
+
const finalUrl = u.toString();
|
|
74
75
|
// 9. Skip non-HTML assets by extension
|
|
75
76
|
const lastDotIndex = u.pathname.lastIndexOf('.');
|
|
76
77
|
if (lastDotIndex !== -1) {
|
|
@@ -79,10 +80,125 @@ export function normalizeUrl(input, base, options = {}) {
|
|
|
79
80
|
return null;
|
|
80
81
|
}
|
|
81
82
|
}
|
|
82
|
-
// 10. Return
|
|
83
|
-
|
|
83
|
+
// 10. Return path if requested
|
|
84
|
+
if (options.toPath) {
|
|
85
|
+
return u.pathname + u.search;
|
|
86
|
+
}
|
|
87
|
+
// 11. Return final string
|
|
88
|
+
return finalUrl;
|
|
84
89
|
}
|
|
85
90
|
catch (_e) {
|
|
86
91
|
return null;
|
|
87
92
|
}
|
|
88
93
|
}
|
|
94
|
+
/**
|
|
95
|
+
* Utility for converting between absolute URLs and relative paths
|
|
96
|
+
* primarily used for database storage.
|
|
97
|
+
*/
|
|
98
|
+
export class UrlUtil {
|
|
99
|
+
/**
|
|
100
|
+
* Extract a stable domain key from a URL/domain input.
|
|
101
|
+
* Examples:
|
|
102
|
+
* - "https://www.example.com/a" -> "example.com"
|
|
103
|
+
* - "example.com" -> "example.com"
|
|
104
|
+
*/
|
|
105
|
+
static extractDomain(input) {
|
|
106
|
+
const trimmed = input.trim();
|
|
107
|
+
if (!trimmed)
|
|
108
|
+
return '';
|
|
109
|
+
try {
|
|
110
|
+
const direct = new URL(trimmed);
|
|
111
|
+
return direct.hostname.toLowerCase().replace(/^www\./, '');
|
|
112
|
+
}
|
|
113
|
+
catch {
|
|
114
|
+
// fall through
|
|
115
|
+
}
|
|
116
|
+
try {
|
|
117
|
+
const withProtocol = new URL(`https://${trimmed}`);
|
|
118
|
+
return withProtocol.hostname.toLowerCase().replace(/^www\./, '');
|
|
119
|
+
}
|
|
120
|
+
catch {
|
|
121
|
+
return trimmed.toLowerCase().replace(/^www\./, '');
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
/**
|
|
125
|
+
* Resolve a site's absolute origin from persisted site fields.
|
|
126
|
+
*/
|
|
127
|
+
static resolveSiteOrigin(site) {
|
|
128
|
+
if (site.preferred_url) {
|
|
129
|
+
try {
|
|
130
|
+
return new URL(site.preferred_url).origin;
|
|
131
|
+
}
|
|
132
|
+
catch {
|
|
133
|
+
// fall through to domain+ssl fallback
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
const protocol = site.ssl === 0 ? 'http' : 'https';
|
|
137
|
+
return `${protocol}://${site.domain}`;
|
|
138
|
+
}
|
|
139
|
+
/**
|
|
140
|
+
* Converts a full URL to a root-relative path if it matches the origin.
|
|
141
|
+
* If it doesn't match the origin, it's considered external and kept absolute.
|
|
142
|
+
*/
|
|
143
|
+
static toPath(urlStr, origin) {
|
|
144
|
+
try {
|
|
145
|
+
const url = new URL(urlStr);
|
|
146
|
+
const originUrl = new URL(origin);
|
|
147
|
+
if (url.origin === originUrl.origin) {
|
|
148
|
+
return url.pathname + url.search;
|
|
149
|
+
}
|
|
150
|
+
return urlStr;
|
|
151
|
+
}
|
|
152
|
+
catch {
|
|
153
|
+
return urlStr;
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
/**
|
|
157
|
+
* Converts a root-relative path back to an absolute URL relative to the origin.
|
|
158
|
+
* If the input is already an absolute URL, it is returned as-is.
|
|
159
|
+
*/
|
|
160
|
+
static toAbsolute(pathOrUrl, origin) {
|
|
161
|
+
if (pathOrUrl.startsWith('http://') || pathOrUrl.startsWith('https://')) {
|
|
162
|
+
return pathOrUrl;
|
|
163
|
+
}
|
|
164
|
+
try {
|
|
165
|
+
return new URL(pathOrUrl, origin).toString();
|
|
166
|
+
}
|
|
167
|
+
catch {
|
|
168
|
+
return pathOrUrl;
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
/**
|
|
172
|
+
* Determines if a URL (or path) is internal relative to the origin.
|
|
173
|
+
*/
|
|
174
|
+
static isInternal(pathOrUrl, origin) {
|
|
175
|
+
if (!pathOrUrl.startsWith('http'))
|
|
176
|
+
return true;
|
|
177
|
+
try {
|
|
178
|
+
const url = new URL(pathOrUrl);
|
|
179
|
+
const originUrl = new URL(origin);
|
|
180
|
+
return url.origin === originUrl.origin;
|
|
181
|
+
}
|
|
182
|
+
catch {
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
/**
|
|
187
|
+
* Build normalized lookup candidates for querying pages table.
|
|
188
|
+
* Returns path/absolute/original variants in priority order, deduplicated.
|
|
189
|
+
*/
|
|
190
|
+
static toLookupCandidates(input, origin) {
|
|
191
|
+
const candidates = new Set();
|
|
192
|
+
const raw = input.trim();
|
|
193
|
+
if (!raw)
|
|
194
|
+
return [];
|
|
195
|
+
const absolute = normalizeUrl(raw, origin, { stripQuery: false }) || UrlUtil.toAbsolute(raw, origin);
|
|
196
|
+
const path = normalizeUrl(raw, origin, { stripQuery: false, toPath: true }) || UrlUtil.toPath(raw, origin);
|
|
197
|
+
const absolutePath = normalizeUrl(absolute, '', { stripQuery: false, toPath: true }) || UrlUtil.toPath(absolute, origin);
|
|
198
|
+
candidates.add(path);
|
|
199
|
+
candidates.add(absolute);
|
|
200
|
+
candidates.add(absolutePath);
|
|
201
|
+
candidates.add(raw);
|
|
202
|
+
return Array.from(candidates).filter(Boolean);
|
|
203
|
+
}
|
|
204
|
+
}
|
package/dist/crawler/parser.d.ts
CHANGED
|
@@ -11,12 +11,10 @@ export interface ParseResult {
|
|
|
11
11
|
contentHash: string;
|
|
12
12
|
simhash?: string;
|
|
13
13
|
uniqueTokenRatio?: number;
|
|
14
|
-
soft404Score: number;
|
|
15
|
-
soft404Signals: string[];
|
|
16
14
|
}
|
|
17
15
|
export declare class Parser {
|
|
18
16
|
/**
|
|
19
17
|
* Parses HTML content to extract metadata and links.
|
|
20
18
|
*/
|
|
21
|
-
parse(html: string, baseUrl: string,
|
|
19
|
+
parse(html: string, baseUrl: string, _status: number): ParseResult;
|
|
22
20
|
}
|
package/dist/crawler/parser.js
CHANGED
|
@@ -6,7 +6,7 @@ export class Parser {
|
|
|
6
6
|
/**
|
|
7
7
|
* Parses HTML content to extract metadata and links.
|
|
8
8
|
*/
|
|
9
|
-
parse(html, baseUrl,
|
|
9
|
+
parse(html, baseUrl, _status) {
|
|
10
10
|
const $ = cheerio.load(html);
|
|
11
11
|
// 1. Robots Meta
|
|
12
12
|
let noindex = false;
|
|
@@ -97,51 +97,6 @@ export class Parser {
|
|
|
97
97
|
const uniqueTokens = new Set(tokens);
|
|
98
98
|
const uniqueTokenRatio = tokens.length > 0 ? (uniqueTokens.size / tokens.length) : 0;
|
|
99
99
|
const simhash = SimHash.generate(tokens).toString();
|
|
100
|
-
// 5. Soft 404 Detection
|
|
101
|
-
let soft404Score = 0;
|
|
102
|
-
const soft404Signals = [];
|
|
103
|
-
if (status === 200) {
|
|
104
|
-
const title = $('title').text().toLowerCase();
|
|
105
|
-
const h1Text = $('h1').first().text().toLowerCase();
|
|
106
|
-
const bodyText = cleanText.toLowerCase();
|
|
107
|
-
const errorPatterns = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];
|
|
108
|
-
// Pattern checks
|
|
109
|
-
for (const pattern of errorPatterns) {
|
|
110
|
-
if (title.includes(pattern)) {
|
|
111
|
-
soft404Score += 0.4;
|
|
112
|
-
soft404Signals.push(`title_pattern_${pattern.replace(/\s+/g, '_')}`);
|
|
113
|
-
break;
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
for (const pattern of errorPatterns) {
|
|
117
|
-
if (h1Text.includes(pattern)) {
|
|
118
|
-
soft404Score += 0.3;
|
|
119
|
-
soft404Signals.push(`h1_pattern_${pattern.replace(/\s+/g, '_')}`);
|
|
120
|
-
break;
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
|
|
124
|
-
soft404Score += 0.2;
|
|
125
|
-
soft404Signals.push('body_error_phrase');
|
|
126
|
-
}
|
|
127
|
-
// Content length check (Word count approximation)
|
|
128
|
-
const words = cleanText.split(/\s+/).filter(w => w.length > 0);
|
|
129
|
-
if (words.length < 50) {
|
|
130
|
-
soft404Score += 0.3;
|
|
131
|
-
soft404Signals.push('very_low_word_count');
|
|
132
|
-
}
|
|
133
|
-
else if (words.length < 150) {
|
|
134
|
-
soft404Score += 0.1;
|
|
135
|
-
soft404Signals.push('low_word_count');
|
|
136
|
-
}
|
|
137
|
-
// Link count check
|
|
138
|
-
if (links.size === 0) {
|
|
139
|
-
soft404Score += 0.2;
|
|
140
|
-
soft404Signals.push('no_outbound_links');
|
|
141
|
-
}
|
|
142
|
-
// Cap at 1.0
|
|
143
|
-
soft404Score = Math.min(1.0, soft404Score);
|
|
144
|
-
}
|
|
145
100
|
return {
|
|
146
101
|
links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
|
|
147
102
|
html: html, // pass raw HTML for analysis
|
|
@@ -150,9 +105,7 @@ export class Parser {
|
|
|
150
105
|
nofollow,
|
|
151
106
|
contentHash,
|
|
152
107
|
simhash,
|
|
153
|
-
uniqueTokenRatio
|
|
154
|
-
soft404Score,
|
|
155
|
-
soft404Signals
|
|
108
|
+
uniqueTokenRatio
|
|
156
109
|
};
|
|
157
110
|
}
|
|
158
111
|
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { Fetcher } from './fetcher.js';
import { Site } from '../db/repositories/SiteRepository.js';
/** Result of resolving user input to a crawlable URL plus its persisted site record. */
export interface ResolvedUrl {
    url: string;
    site: Site;
}
export declare class UrlResolver {
    private siteRepo;
    constructor();
    /**
     * Resolves a raw URL or bare domain to a concrete URL and its Site row,
     * probing HTTPS/HTTP when the protocol is omitted.
     */
    resolve(inputUrl: string, fetcher: Fetcher): Promise<ResolvedUrl>;
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { SiteRepository } from '../db/repositories/SiteRepository.js';
import { getDb } from '../db/index.js';
/**
 * Resolves user-supplied input (full URL or bare domain) into a concrete
 * crawl URL plus its persisted Site record. When the protocol is omitted,
 * probes HTTPS first, then HTTP, and caches the discovered preference.
 */
export class UrlResolver {
    siteRepo;
    constructor() {
        this.siteRepo = new SiteRepository(getDb());
    }
    /**
     * @param {string} inputUrl - Full URL or bare domain (e.g. "example.com").
     * @param {object} fetcher - Fetcher used to probe schemes when the protocol is unknown.
     * @returns {Promise<{url: string, site: object}>} Resolved URL and site row.
     * @throws {Error} When the input cannot be parsed as a URL or domain.
     */
    async resolve(inputUrl, fetcher) {
        const hasProtocol = inputUrl.startsWith('http://') || inputUrl.startsWith('https://');
        const workingUrl = hasProtocol ? inputUrl : `https://${inputUrl}`;
        let hostname;
        try {
            hostname = new URL(workingUrl).hostname;
        }
        catch {
            throw new Error(`Invalid URL or domain: ${inputUrl}`);
        }
        const domain = hostname.replace(/^www\./, '');
        let site = this.siteRepo.firstOrCreateSite(domain);
        if (hasProtocol) {
            // Explicit protocol: honor it, persisting it as the preference on first sight.
            if (site.ssl === null) {
                this.siteRepo.updateSitePreference(site.id, {
                    preferred_url: inputUrl,
                    ssl: inputUrl.startsWith('https:') ? 1 : 0
                });
                site = this.siteRepo.getSiteById(site.id);
            }
            return { url: inputUrl, site };
        }
        // Protocol omitted: reuse a stored preference when available.
        if (site.ssl !== null && site.preferred_url) {
            return { url: site.preferred_url, site };
        }
        // No stored preference: probe HTTPS first, then fall back to HTTP.
        for (const scheme of ['https', 'http']) {
            const probed = await this.#probe(`${scheme}://${hostname}/`, fetcher, site.id);
            if (probed) {
                return probed;
            }
        }
        // BUGFIX: previously a reachable-but-erroring site (non-2xx/3xx on both
        // probes, no throw) fell through into the has-protocol branch and
        // persisted the protocol-less input as preferred_url. Default to the
        // HTTPS form of the input instead, without storing a preference.
        return { url: workingUrl, site };
    }
    /**
     * Probes one candidate URL. On a 2xx/3xx response, stores the discovered
     * preference (final URL + SSL flag) and returns the refreshed result;
     * returns null on network failure or a non-OK status.
     */
    async #probe(candidateUrl, fetcher, siteId) {
        try {
            const res = await fetcher.fetch(candidateUrl);
            if (typeof res.status === 'number' && res.status >= 200 && res.status < 400) {
                this.siteRepo.updateSitePreference(siteId, {
                    preferred_url: res.finalUrl,
                    ssl: res.finalUrl.startsWith('https:') ? 1 : 0
                });
                return { url: res.finalUrl, site: this.siteRepo.getSiteById(siteId) };
            }
        }
        catch {
            // Network failure on this scheme: caller tries the next candidate.
        }
        return null;
    }
}
|