@crawlith/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +70 -0
- package/dist/analysis/analysis_list.html +35 -0
- package/dist/analysis/analysis_page.html +123 -0
- package/dist/analysis/analyze.d.ts +40 -5
- package/dist/analysis/analyze.js +395 -347
- package/dist/analysis/clustering.d.ts +23 -0
- package/dist/analysis/clustering.js +206 -0
- package/dist/analysis/content.d.ts +1 -1
- package/dist/analysis/content.js +11 -5
- package/dist/analysis/duplicate.d.ts +34 -0
- package/dist/analysis/duplicate.js +305 -0
- package/dist/analysis/heading.d.ts +116 -0
- package/dist/analysis/heading.js +356 -0
- package/dist/analysis/images.d.ts +1 -1
- package/dist/analysis/images.js +6 -5
- package/dist/analysis/links.d.ts +1 -1
- package/dist/analysis/links.js +8 -8
- package/dist/{scoring/orphanSeverity.d.ts → analysis/orphan.d.ts} +12 -23
- package/dist/{scoring/orphanSeverity.js → analysis/orphan.js} +9 -3
- package/dist/analysis/scoring.js +11 -2
- package/dist/analysis/seo.d.ts +8 -4
- package/dist/analysis/seo.js +41 -30
- package/dist/analysis/soft404.d.ts +17 -0
- package/dist/analysis/soft404.js +62 -0
- package/dist/analysis/structuredData.d.ts +1 -1
- package/dist/analysis/structuredData.js +5 -4
- package/dist/analysis/templates.d.ts +2 -0
- package/dist/analysis/templates.js +7 -0
- package/dist/application/index.d.ts +2 -0
- package/dist/application/index.js +2 -0
- package/dist/application/usecase.d.ts +3 -0
- package/dist/application/usecase.js +1 -0
- package/dist/application/usecases.d.ts +114 -0
- package/dist/application/usecases.js +201 -0
- package/dist/audit/index.js +1 -1
- package/dist/audit/transport.d.ts +1 -1
- package/dist/audit/transport.js +5 -4
- package/dist/audit/types.d.ts +1 -0
- package/dist/constants.d.ts +17 -0
- package/dist/constants.js +23 -0
- package/dist/core/scope/scopeManager.js +3 -0
- package/dist/core/security/ipGuard.d.ts +11 -0
- package/dist/core/security/ipGuard.js +71 -3
- package/dist/crawler/crawl.d.ts +4 -22
- package/dist/crawler/crawl.js +4 -335
- package/dist/crawler/crawler.d.ts +87 -0
- package/dist/crawler/crawler.js +683 -0
- package/dist/crawler/extract.d.ts +4 -1
- package/dist/crawler/extract.js +7 -2
- package/dist/crawler/fetcher.d.ts +2 -1
- package/dist/crawler/fetcher.js +26 -11
- package/dist/crawler/metricsRunner.d.ts +23 -1
- package/dist/crawler/metricsRunner.js +202 -72
- package/dist/crawler/normalize.d.ts +41 -0
- package/dist/crawler/normalize.js +119 -3
- package/dist/crawler/parser.d.ts +1 -3
- package/dist/crawler/parser.js +2 -49
- package/dist/crawler/resolver.d.ts +11 -0
- package/dist/crawler/resolver.js +67 -0
- package/dist/crawler/sitemap.d.ts +6 -0
- package/dist/crawler/sitemap.js +27 -17
- package/dist/crawler/trap.d.ts +5 -1
- package/dist/crawler/trap.js +23 -2
- package/dist/db/CrawlithDB.d.ts +110 -0
- package/dist/db/CrawlithDB.js +500 -0
- package/dist/db/graphLoader.js +42 -30
- package/dist/db/index.d.ts +11 -0
- package/dist/db/index.js +41 -29
- package/dist/db/migrations.d.ts +2 -0
- package/dist/db/{schema.js → migrations.js} +90 -43
- package/dist/db/pluginRegistry.d.ts +9 -0
- package/dist/db/pluginRegistry.js +19 -0
- package/dist/db/repositories/EdgeRepository.d.ts +13 -0
- package/dist/db/repositories/EdgeRepository.js +20 -0
- package/dist/db/repositories/MetricsRepository.d.ts +16 -8
- package/dist/db/repositories/MetricsRepository.js +28 -7
- package/dist/db/repositories/PageRepository.d.ts +15 -2
- package/dist/db/repositories/PageRepository.js +169 -25
- package/dist/db/repositories/SiteRepository.d.ts +9 -0
- package/dist/db/repositories/SiteRepository.js +13 -0
- package/dist/db/repositories/SnapshotRepository.d.ts +14 -5
- package/dist/db/repositories/SnapshotRepository.js +64 -5
- package/dist/db/reset.d.ts +9 -0
- package/dist/db/reset.js +32 -0
- package/dist/db/statements.d.ts +12 -0
- package/dist/db/statements.js +40 -0
- package/dist/diff/compare.d.ts +0 -5
- package/dist/diff/compare.js +0 -12
- package/dist/diff/service.d.ts +16 -0
- package/dist/diff/service.js +41 -0
- package/dist/domain/index.d.ts +4 -0
- package/dist/domain/index.js +4 -0
- package/dist/events.d.ts +56 -0
- package/dist/events.js +1 -0
- package/dist/graph/graph.d.ts +36 -42
- package/dist/graph/graph.js +26 -17
- package/dist/graph/hits.d.ts +23 -0
- package/dist/graph/hits.js +111 -0
- package/dist/graph/metrics.d.ts +0 -4
- package/dist/graph/metrics.js +25 -9
- package/dist/graph/pagerank.d.ts +17 -4
- package/dist/graph/pagerank.js +126 -91
- package/dist/graph/simhash.d.ts +6 -0
- package/dist/graph/simhash.js +14 -0
- package/dist/index.d.ts +29 -8
- package/dist/index.js +29 -8
- package/dist/lock/hashKey.js +1 -1
- package/dist/lock/lockManager.d.ts +5 -1
- package/dist/lock/lockManager.js +38 -13
- package/dist/plugin-system/plugin-cli.d.ts +10 -0
- package/dist/plugin-system/plugin-cli.js +31 -0
- package/dist/plugin-system/plugin-config.d.ts +16 -0
- package/dist/plugin-system/plugin-config.js +36 -0
- package/dist/plugin-system/plugin-loader.d.ts +17 -0
- package/dist/plugin-system/plugin-loader.js +122 -0
- package/dist/plugin-system/plugin-registry.d.ts +25 -0
- package/dist/plugin-system/plugin-registry.js +167 -0
- package/dist/plugin-system/plugin-types.d.ts +205 -0
- package/dist/plugin-system/plugin-types.js +1 -0
- package/dist/ports/index.d.ts +9 -0
- package/dist/ports/index.js +1 -0
- package/{src/report/sitegraph_template.ts → dist/report/crawl.html} +330 -81
- package/dist/report/crawlExport.d.ts +3 -0
- package/dist/report/{sitegraphExport.js → crawlExport.js} +3 -3
- package/dist/report/crawl_template.d.ts +1 -0
- package/dist/report/crawl_template.js +7 -0
- package/dist/report/export.d.ts +3 -0
- package/dist/report/export.js +81 -0
- package/dist/report/html.js +15 -216
- package/dist/report/insight.d.ts +27 -0
- package/dist/report/insight.js +103 -0
- package/dist/scoring/health.d.ts +56 -0
- package/dist/scoring/health.js +213 -0
- package/dist/utils/chalk.d.ts +6 -0
- package/dist/utils/chalk.js +41 -0
- package/dist/utils/secureConfig.d.ts +23 -0
- package/dist/utils/secureConfig.js +128 -0
- package/package.json +12 -6
- package/CHANGELOG.md +0 -7
- package/dist/db/schema.d.ts +0 -2
- package/dist/graph/cluster.d.ts +0 -6
- package/dist/graph/cluster.js +0 -173
- package/dist/graph/duplicate.d.ts +0 -10
- package/dist/graph/duplicate.js +0 -251
- package/dist/report/sitegraphExport.d.ts +0 -3
- package/dist/report/sitegraph_template.d.ts +0 -1
- package/dist/report/sitegraph_template.js +0 -630
- package/dist/scoring/hits.d.ts +0 -9
- package/dist/scoring/hits.js +0 -111
- package/src/analysis/analyze.ts +0 -548
- package/src/analysis/content.ts +0 -62
- package/src/analysis/images.ts +0 -28
- package/src/analysis/links.ts +0 -41
- package/src/analysis/scoring.ts +0 -59
- package/src/analysis/seo.ts +0 -82
- package/src/analysis/structuredData.ts +0 -62
- package/src/audit/dns.ts +0 -49
- package/src/audit/headers.ts +0 -98
- package/src/audit/index.ts +0 -66
- package/src/audit/scoring.ts +0 -232
- package/src/audit/transport.ts +0 -258
- package/src/audit/types.ts +0 -102
- package/src/core/network/proxyAdapter.ts +0 -21
- package/src/core/network/rateLimiter.ts +0 -39
- package/src/core/network/redirectController.ts +0 -47
- package/src/core/network/responseLimiter.ts +0 -34
- package/src/core/network/retryPolicy.ts +0 -57
- package/src/core/scope/domainFilter.ts +0 -45
- package/src/core/scope/scopeManager.ts +0 -52
- package/src/core/scope/subdomainPolicy.ts +0 -39
- package/src/core/security/ipGuard.ts +0 -92
- package/src/crawler/crawl.ts +0 -382
- package/src/crawler/extract.ts +0 -34
- package/src/crawler/fetcher.ts +0 -233
- package/src/crawler/metricsRunner.ts +0 -124
- package/src/crawler/normalize.ts +0 -108
- package/src/crawler/parser.ts +0 -190
- package/src/crawler/sitemap.ts +0 -73
- package/src/crawler/trap.ts +0 -96
- package/src/db/graphLoader.ts +0 -105
- package/src/db/index.ts +0 -70
- package/src/db/repositories/EdgeRepository.ts +0 -29
- package/src/db/repositories/MetricsRepository.ts +0 -49
- package/src/db/repositories/PageRepository.ts +0 -128
- package/src/db/repositories/SiteRepository.ts +0 -32
- package/src/db/repositories/SnapshotRepository.ts +0 -74
- package/src/db/schema.ts +0 -177
- package/src/diff/compare.ts +0 -84
- package/src/graph/cluster.ts +0 -192
- package/src/graph/duplicate.ts +0 -286
- package/src/graph/graph.ts +0 -172
- package/src/graph/metrics.ts +0 -110
- package/src/graph/pagerank.ts +0 -125
- package/src/graph/simhash.ts +0 -61
- package/src/index.ts +0 -30
- package/src/lock/hashKey.ts +0 -51
- package/src/lock/lockManager.ts +0 -124
- package/src/lock/pidCheck.ts +0 -13
- package/src/report/html.ts +0 -227
- package/src/report/sitegraphExport.ts +0 -58
- package/src/scoring/hits.ts +0 -131
- package/src/scoring/orphanSeverity.ts +0 -176
- package/src/utils/version.ts +0 -18
- package/tests/__snapshots__/orphanSeverity.test.ts.snap +0 -49
- package/tests/analysis.unit.test.ts +0 -98
- package/tests/analyze.integration.test.ts +0 -98
- package/tests/audit/dns.test.ts +0 -31
- package/tests/audit/headers.test.ts +0 -45
- package/tests/audit/scoring.test.ts +0 -133
- package/tests/audit/security.test.ts +0 -12
- package/tests/audit/transport.test.ts +0 -112
- package/tests/clustering.test.ts +0 -118
- package/tests/crawler.test.ts +0 -358
- package/tests/db.test.ts +0 -159
- package/tests/diff.test.ts +0 -67
- package/tests/duplicate.test.ts +0 -110
- package/tests/fetcher.test.ts +0 -106
- package/tests/fetcher_safety.test.ts +0 -85
- package/tests/fixtures/analyze-crawl.json +0 -26
- package/tests/hits.test.ts +0 -134
- package/tests/html_report.test.ts +0 -58
- package/tests/lock/lockManager.test.ts +0 -138
- package/tests/metrics.test.ts +0 -196
- package/tests/normalize.test.ts +0 -101
- package/tests/orphanSeverity.test.ts +0 -160
- package/tests/pagerank.test.ts +0 -98
- package/tests/parser.test.ts +0 -117
- package/tests/proxy_safety.test.ts +0 -57
- package/tests/redirect_safety.test.ts +0 -73
- package/tests/safety.test.ts +0 -114
- package/tests/scope.test.ts +0 -66
- package/tests/scoring.test.ts +0 -59
- package/tests/sitemap.test.ts +0 -88
- package/tests/soft404.test.ts +0 -41
- package/tests/trap.test.ts +0 -39
- package/tests/visualization_data.test.ts +0 -46
- package/tsconfig.json +0 -11
package/dist/crawler/parser.js
CHANGED
|
@@ -6,7 +6,7 @@ export class Parser {
|
|
|
6
6
|
/**
|
|
7
7
|
* Parses HTML content to extract metadata and links.
|
|
8
8
|
*/
|
|
9
|
-
parse(html, baseUrl,
|
|
9
|
+
parse(html, baseUrl, _status) {
|
|
10
10
|
const $ = cheerio.load(html);
|
|
11
11
|
// 1. Robots Meta
|
|
12
12
|
let noindex = false;
|
|
@@ -97,51 +97,6 @@ export class Parser {
|
|
|
97
97
|
const uniqueTokens = new Set(tokens);
|
|
98
98
|
const uniqueTokenRatio = tokens.length > 0 ? (uniqueTokens.size / tokens.length) : 0;
|
|
99
99
|
const simhash = SimHash.generate(tokens).toString();
|
|
100
|
-
// 5. Soft 404 Detection
|
|
101
|
-
let soft404Score = 0;
|
|
102
|
-
const soft404Signals = [];
|
|
103
|
-
if (status === 200) {
|
|
104
|
-
const title = $('title').text().toLowerCase();
|
|
105
|
-
const h1Text = $('h1').first().text().toLowerCase();
|
|
106
|
-
const bodyText = cleanText.toLowerCase();
|
|
107
|
-
const errorPatterns = ['404', 'not found', 'error', 'doesn\'t exist', 'unavailable', 'invalid'];
|
|
108
|
-
// Pattern checks
|
|
109
|
-
for (const pattern of errorPatterns) {
|
|
110
|
-
if (title.includes(pattern)) {
|
|
111
|
-
soft404Score += 0.4;
|
|
112
|
-
soft404Signals.push(`title_pattern_${pattern.replace(/\s+/g, '_')}`);
|
|
113
|
-
break;
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
for (const pattern of errorPatterns) {
|
|
117
|
-
if (h1Text.includes(pattern)) {
|
|
118
|
-
soft404Score += 0.3;
|
|
119
|
-
soft404Signals.push(`h1_pattern_${pattern.replace(/\s+/g, '_')}`);
|
|
120
|
-
break;
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
if (bodyText.includes('page not found') || bodyText.includes('404 error')) {
|
|
124
|
-
soft404Score += 0.2;
|
|
125
|
-
soft404Signals.push('body_error_phrase');
|
|
126
|
-
}
|
|
127
|
-
// Content length check (Word count approximation)
|
|
128
|
-
const words = cleanText.split(/\s+/).filter(w => w.length > 0);
|
|
129
|
-
if (words.length < 50) {
|
|
130
|
-
soft404Score += 0.3;
|
|
131
|
-
soft404Signals.push('very_low_word_count');
|
|
132
|
-
}
|
|
133
|
-
else if (words.length < 150) {
|
|
134
|
-
soft404Score += 0.1;
|
|
135
|
-
soft404Signals.push('low_word_count');
|
|
136
|
-
}
|
|
137
|
-
// Link count check
|
|
138
|
-
if (links.size === 0) {
|
|
139
|
-
soft404Score += 0.2;
|
|
140
|
-
soft404Signals.push('no_outbound_links');
|
|
141
|
-
}
|
|
142
|
-
// Cap at 1.0
|
|
143
|
-
soft404Score = Math.min(1.0, soft404Score);
|
|
144
|
-
}
|
|
145
100
|
return {
|
|
146
101
|
links: Array.from(links.entries()).map(([url, weight]) => ({ url, weight })),
|
|
147
102
|
html: html, // pass raw HTML for analysis
|
|
@@ -150,9 +105,7 @@ export class Parser {
|
|
|
150
105
|
nofollow,
|
|
151
106
|
contentHash,
|
|
152
107
|
simhash,
|
|
153
|
-
uniqueTokenRatio
|
|
154
|
-
soft404Score,
|
|
155
|
-
soft404Signals
|
|
108
|
+
uniqueTokenRatio
|
|
156
109
|
};
|
|
157
110
|
}
|
|
158
111
|
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { Fetcher } from './fetcher.js';
|
|
2
|
+
import { Site } from '../db/repositories/SiteRepository.js';
|
|
3
|
+
export interface ResolvedUrl {
|
|
4
|
+
url: string;
|
|
5
|
+
site: Site;
|
|
6
|
+
}
|
|
7
|
+
export declare class UrlResolver {
|
|
8
|
+
private siteRepo;
|
|
9
|
+
constructor();
|
|
10
|
+
resolve(inputUrl: string, fetcher: Fetcher): Promise<ResolvedUrl>;
|
|
11
|
+
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { SiteRepository } from '../db/repositories/SiteRepository.js';
|
|
2
|
+
import { getDb } from '../db/index.js';
|
|
3
|
+
export class UrlResolver {
|
|
4
|
+
siteRepo;
|
|
5
|
+
constructor() {
|
|
6
|
+
this.siteRepo = new SiteRepository(getDb());
|
|
7
|
+
}
|
|
8
|
+
async resolve(inputUrl, fetcher) {
|
|
9
|
+
const hasProtocol = inputUrl.startsWith('http://') || inputUrl.startsWith('https://');
|
|
10
|
+
const workingUrl = hasProtocol ? inputUrl : `https://${inputUrl}`;
|
|
11
|
+
let hostname;
|
|
12
|
+
try {
|
|
13
|
+
hostname = new URL(workingUrl).hostname;
|
|
14
|
+
}
|
|
15
|
+
catch {
|
|
16
|
+
throw new Error(`Invalid URL or domain: ${inputUrl}`);
|
|
17
|
+
}
|
|
18
|
+
const domain = hostname.replace(/^www\./, '');
|
|
19
|
+
let site = this.siteRepo.firstOrCreateSite(domain);
|
|
20
|
+
// If protocol was omitted, we use our discovery logic or stored preference
|
|
21
|
+
if (!hasProtocol) {
|
|
22
|
+
if (site.ssl !== null && site.preferred_url) {
|
|
23
|
+
return {
|
|
24
|
+
url: site.preferred_url,
|
|
25
|
+
site
|
|
26
|
+
};
|
|
27
|
+
}
|
|
28
|
+
// No protocol provided and no stored preference: Probe HTTPS first
|
|
29
|
+
try {
|
|
30
|
+
const res = await fetcher.fetch(`https://${hostname}/`);
|
|
31
|
+
if (typeof res.status === 'number' && res.status >= 200 && res.status < 400) {
|
|
32
|
+
const isSsl = res.finalUrl.startsWith('https:') ? 1 : 0;
|
|
33
|
+
this.siteRepo.updateSitePreference(site.id, { preferred_url: res.finalUrl, ssl: isSsl });
|
|
34
|
+
// Refresh site object
|
|
35
|
+
site = this.siteRepo.getSiteById(site.id);
|
|
36
|
+
return { url: res.finalUrl, site };
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
catch {
|
|
40
|
+
// Fallback to HTTP
|
|
41
|
+
}
|
|
42
|
+
// Try HTTP
|
|
43
|
+
try {
|
|
44
|
+
const res = await fetcher.fetch(`http://${hostname}/`);
|
|
45
|
+
if (typeof res.status === 'number' && res.status >= 200 && res.status < 400) {
|
|
46
|
+
const isSsl = res.finalUrl.startsWith('https:') ? 1 : 0;
|
|
47
|
+
this.siteRepo.updateSitePreference(site.id, { preferred_url: res.finalUrl, ssl: isSsl });
|
|
48
|
+
site = this.siteRepo.getSiteById(site.id);
|
|
49
|
+
return { url: res.finalUrl, site };
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
catch {
|
|
53
|
+
// If both fail, we still default to the provided input as https
|
|
54
|
+
return { url: workingUrl, site };
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
// Protocol was provided, we just return it but ensure site is in sync if it's the first time
|
|
58
|
+
if (site.ssl === null) {
|
|
59
|
+
this.siteRepo.updateSitePreference(site.id, {
|
|
60
|
+
preferred_url: inputUrl,
|
|
61
|
+
ssl: inputUrl.startsWith('https:') ? 1 : 0
|
|
62
|
+
});
|
|
63
|
+
site = this.siteRepo.getSiteById(site.id);
|
|
64
|
+
}
|
|
65
|
+
return { url: inputUrl, site };
|
|
66
|
+
}
|
|
67
|
+
}
|
|
@@ -1,4 +1,10 @@
|
|
|
1
|
+
import { EngineContext } from '../events.js';
|
|
2
|
+
import { Fetcher } from './fetcher.js';
|
|
1
3
|
export declare class Sitemap {
|
|
4
|
+
private context?;
|
|
5
|
+
private fetcher?;
|
|
6
|
+
private userAgent;
|
|
7
|
+
constructor(context?: EngineContext | undefined, fetcher?: Fetcher | undefined, userAgent?: string);
|
|
2
8
|
/**
|
|
3
9
|
* Fetches and parses a sitemap (or sitemap index) to extract URLs.
|
|
4
10
|
* Recursively handles sitemap indexes with loop detection and depth limits.
|
package/dist/crawler/sitemap.js
CHANGED
|
@@ -1,7 +1,19 @@
|
|
|
1
|
-
import { request } from 'undici';
|
|
2
1
|
import * as cheerio from 'cheerio';
|
|
2
|
+
import pLimit from 'p-limit';
|
|
3
3
|
import { normalizeUrl } from './normalize.js';
|
|
4
|
+
import { DEFAULTS } from '../constants.js';
|
|
4
5
|
export class Sitemap {
|
|
6
|
+
context;
|
|
7
|
+
fetcher;
|
|
8
|
+
userAgent = DEFAULTS.USER_AGENT;
|
|
9
|
+
constructor(context, fetcher, userAgent) {
|
|
10
|
+
this.context = context;
|
|
11
|
+
this.fetcher = fetcher;
|
|
12
|
+
if (userAgent)
|
|
13
|
+
this.userAgent = userAgent;
|
|
14
|
+
else if (fetcher)
|
|
15
|
+
this.userAgent = fetcher.userAgent;
|
|
16
|
+
}
|
|
5
17
|
/**
|
|
6
18
|
* Fetches and parses a sitemap (or sitemap index) to extract URLs.
|
|
7
19
|
* Recursively handles sitemap indexes with loop detection and depth limits.
|
|
@@ -20,14 +32,16 @@ export class Sitemap {
|
|
|
20
32
|
if (visited.size > 50)
|
|
21
33
|
return;
|
|
22
34
|
try {
|
|
23
|
-
const res =
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
35
|
+
const res = this.fetcher
|
|
36
|
+
? await this.fetcher.fetch(url, { maxBytes: 5000000 })
|
|
37
|
+
: await (async () => {
|
|
38
|
+
const { request } = await import('undici');
|
|
39
|
+
const r = await request(url, { headers: { 'User-Agent': this.userAgent } });
|
|
40
|
+
const b = await r.body.text();
|
|
41
|
+
return { status: r.statusCode, body: b };
|
|
42
|
+
})();
|
|
43
|
+
if (typeof res.status === 'number' && res.status >= 200 && res.status < 300) {
|
|
44
|
+
const xml = res.body;
|
|
31
45
|
// Basic validation: must verify it looks like XML
|
|
32
46
|
if (!xml.trim().startsWith('<'))
|
|
33
47
|
return;
|
|
@@ -41,10 +55,9 @@ export class Sitemap {
|
|
|
41
55
|
if (loc)
|
|
42
56
|
childSitemaps.push(loc);
|
|
43
57
|
});
|
|
44
|
-
// Process children
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
}
|
|
58
|
+
// Process children concurrently but with a limit to avoid massive concurrency spike
|
|
59
|
+
const limit = pLimit(10);
|
|
60
|
+
await Promise.all(childSitemaps.map(childUrl => limit(() => this.processSitemap(childUrl, visited, urls))));
|
|
48
61
|
}
|
|
49
62
|
else {
|
|
50
63
|
// It's a URL Set
|
|
@@ -59,12 +72,9 @@ export class Sitemap {
|
|
|
59
72
|
});
|
|
60
73
|
}
|
|
61
74
|
}
|
|
62
|
-
else {
|
|
63
|
-
await res.body.dump();
|
|
64
|
-
}
|
|
65
75
|
}
|
|
66
76
|
catch (e) {
|
|
67
|
-
|
|
77
|
+
this.context?.emit({ type: 'warn', message: `Failed to fetch sitemap ${url} (${String(e)})`, context: e });
|
|
68
78
|
}
|
|
69
79
|
}
|
|
70
80
|
}
|
package/dist/crawler/trap.d.ts
CHANGED
|
@@ -16,7 +16,11 @@ export declare class TrapDetector {
|
|
|
16
16
|
/**
|
|
17
17
|
* Checks if a URL represents a potential crawl trap.
|
|
18
18
|
*/
|
|
19
|
-
checkTrap(rawUrl: string, _depth: number): TrapResult;
|
|
19
|
+
checkTrap(rawUrl: string, _depth: number, isInternal?: boolean): TrapResult;
|
|
20
|
+
/**
|
|
21
|
+
* Iterates over all nodes in the graph and flags potential traps.
|
|
22
|
+
*/
|
|
23
|
+
analyze(graph: any): void;
|
|
20
24
|
/**
|
|
21
25
|
* Resets internal state (useful for multi-crawl sessions if needed)
|
|
22
26
|
*/
|
package/dist/crawler/trap.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
export class TrapDetector {
|
|
2
2
|
pathCounters = new Map();
|
|
3
3
|
paginationCounters = new Map();
|
|
4
|
-
sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token']);
|
|
4
|
+
sessionParams = new Set(['sid', 'session', 'phpsessid', 'sessid', 'token', 'intended']);
|
|
5
5
|
// Configurable thresholds
|
|
6
6
|
PARAM_EXPLOSION_THRESHOLD = 30;
|
|
7
7
|
PAGINATION_THRESHOLD = 50;
|
|
@@ -14,7 +14,12 @@ export class TrapDetector {
|
|
|
14
14
|
/**
|
|
15
15
|
* Checks if a URL represents a potential crawl trap.
|
|
16
16
|
*/
|
|
17
|
-
checkTrap(rawUrl, _depth) {
|
|
17
|
+
checkTrap(rawUrl, _depth, isInternal = true) {
|
|
18
|
+
// If it's not internal (e.g., social sharing links), we don't flag it as a trap
|
|
19
|
+
// that affects our crawl health, even though technically it might have many params.
|
|
20
|
+
if (!isInternal) {
|
|
21
|
+
return { risk: 0, type: null };
|
|
22
|
+
}
|
|
18
23
|
let risk = 0;
|
|
19
24
|
let type = null;
|
|
20
25
|
try {
|
|
@@ -68,6 +73,22 @@ export class TrapDetector {
|
|
|
68
73
|
}
|
|
69
74
|
return { risk, type };
|
|
70
75
|
}
|
|
76
|
+
/**
|
|
77
|
+
* Iterates over all nodes in the graph and flags potential traps.
|
|
78
|
+
*/
|
|
79
|
+
analyze(graph) {
|
|
80
|
+
const nodes = graph.getNodes();
|
|
81
|
+
for (const node of nodes) {
|
|
82
|
+
if (node.status === 200 || node.status === 0) {
|
|
83
|
+
const res = this.checkTrap(node.url, node.depth || 0, !!node.isInternal);
|
|
84
|
+
if (res.risk > 0.4) {
|
|
85
|
+
node.crawlTrapFlag = true;
|
|
86
|
+
node.crawlTrapRisk = res.risk;
|
|
87
|
+
node.trapType = res.type;
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
71
92
|
/**
|
|
72
93
|
* Resets internal state (useful for multi-crawl sessions if needed)
|
|
73
94
|
*/
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import Database from 'better-sqlite3';
|
|
2
|
+
import type { CrawlithPlugin } from '../plugin-system/plugin-types.js';
|
|
3
|
+
export declare class CrawlithDB {
|
|
4
|
+
private db;
|
|
5
|
+
private statements;
|
|
6
|
+
private registry;
|
|
7
|
+
/**
|
|
8
|
+
* @internal
|
|
9
|
+
* Dangerous: Returns the raw better-sqlite3 instance.
|
|
10
|
+
* Core only. Plugins must never use this.
|
|
11
|
+
*/
|
|
12
|
+
unsafeGetRawDb(): Database.Database;
|
|
13
|
+
private _pluginName?;
|
|
14
|
+
private _snapshotId?;
|
|
15
|
+
/** Whether live fallback is allowed (from --live flag). Core-controlled. */
|
|
16
|
+
private _live;
|
|
17
|
+
/** Whether this plugin makes network calls. Core-controlled via plugin.storage.fetchMode. */
|
|
18
|
+
private _fetchMode;
|
|
19
|
+
constructor(dbPath: string);
|
|
20
|
+
/**
|
|
21
|
+
* Schema API
|
|
22
|
+
*/
|
|
23
|
+
get schema(): {
|
|
24
|
+
define: (columns: Record<string, string>) => void;
|
|
25
|
+
};
|
|
26
|
+
/**
|
|
27
|
+
* Fluent Data API (URL-scoped rows)
|
|
28
|
+
*/
|
|
29
|
+
get data(): {
|
|
30
|
+
save: <T>(input: {
|
|
31
|
+
url: string;
|
|
32
|
+
data: T;
|
|
33
|
+
}) => void;
|
|
34
|
+
find: <T>(url: string, options?: {
|
|
35
|
+
maxAge?: string | number;
|
|
36
|
+
global?: boolean;
|
|
37
|
+
}) => T | null;
|
|
38
|
+
all: <T>() => T[];
|
|
39
|
+
/**
|
|
40
|
+
* Cache-first with live fallback. Core-enforced pattern:
|
|
41
|
+
* 1. If cached data exists → return it (always, regardless of age)
|
|
42
|
+
* 2. If no cache + fetchMode='network' + live=false → return null (skip)
|
|
43
|
+
* 3. If no cache + (fetchMode='local' OR live=true) → call fetchFn, save, return
|
|
44
|
+
*
|
|
45
|
+
* Plugin authors NEVER touch ctx.live — the core injects it via scope().
|
|
46
|
+
*/
|
|
47
|
+
getOrFetch: <T>(url: string, fetchFn: () => Promise<T>) => Promise<T | null>;
|
|
48
|
+
};
|
|
49
|
+
/**
|
|
50
|
+
* Report API (Global snapshot summary)
|
|
51
|
+
*/
|
|
52
|
+
get report(): {
|
|
53
|
+
save: (summary: any, optionalScores?: {
|
|
54
|
+
totalScore?: number;
|
|
55
|
+
scoreCount?: number;
|
|
56
|
+
scoreWeightSum?: number;
|
|
57
|
+
scoreCalculatedAt?: string;
|
|
58
|
+
}) => void;
|
|
59
|
+
find: <T>() => T | null;
|
|
60
|
+
};
|
|
61
|
+
initialize(): void;
|
|
62
|
+
/**
|
|
63
|
+
* Create a scoped instance for a specific plugin.
|
|
64
|
+
* Also bakes in live + fetchMode so getOrFetch() can enforce the protocol
|
|
65
|
+
* without exposing those controls to the plugin author.
|
|
66
|
+
*/
|
|
67
|
+
scope(pluginName: string, snapshotId?: number | string, options?: {
|
|
68
|
+
live?: boolean;
|
|
69
|
+
fetchMode?: 'local' | 'network';
|
|
70
|
+
}): CrawlithDB;
|
|
71
|
+
registerPluginDataSchema(pluginNameOrColumns: string | Record<string, string>, extraColumns?: Record<string, string>): void;
|
|
72
|
+
/** @deprecated Use registerPluginDataSchema */
|
|
73
|
+
registerPluginMigration(pluginName: string, migrationSQL: string): void;
|
|
74
|
+
getPageIdByUrl(snapshotId: number | string, url: string): number | null;
|
|
75
|
+
insertPluginReport(input: {
|
|
76
|
+
snapshotId?: number | string;
|
|
77
|
+
pluginName?: string;
|
|
78
|
+
summary: unknown;
|
|
79
|
+
totalScore?: number;
|
|
80
|
+
scoreCount?: number;
|
|
81
|
+
scoreWeightSum?: number;
|
|
82
|
+
scoreCalculatedAt?: string;
|
|
83
|
+
}): void;
|
|
84
|
+
insertPluginRow<T>(input: {
|
|
85
|
+
tableName?: string;
|
|
86
|
+
snapshotId?: number | string;
|
|
87
|
+
url: string;
|
|
88
|
+
data: T;
|
|
89
|
+
}): void;
|
|
90
|
+
getPluginReport(snapshotId?: number | string, pluginName?: string): unknown | null;
|
|
91
|
+
getPluginRows<T>(tableName?: string, snapshotId?: number | string): T[];
|
|
92
|
+
getPluginRow<T>(tableNameOrUrl: string, snapshotId?: number | string, url?: string, options?: {
|
|
93
|
+
maxAge?: string | number;
|
|
94
|
+
global?: boolean;
|
|
95
|
+
}): T | null;
|
|
96
|
+
private _parseDuration;
|
|
97
|
+
private _parseRow;
|
|
98
|
+
deleteSnapshotPlugins(snapshotId: number | string): void;
|
|
99
|
+
private _getOrFetch;
|
|
100
|
+
aggregateScoreProviders(snapshotId: number | string, plugins: CrawlithPlugin[]): void;
|
|
101
|
+
runInTransaction(fn: () => void): void;
|
|
102
|
+
private _resolveTableName;
|
|
103
|
+
/** Converts a plugin name to its canonical SQLite table name, sanitizing invalid characters. */
|
|
104
|
+
private _toTableName;
|
|
105
|
+
close(): void;
|
|
106
|
+
private _isMigrationExecuted;
|
|
107
|
+
private _assertSnapshotExists;
|
|
108
|
+
private _assertTableRegistered;
|
|
109
|
+
private _assertOwnership;
|
|
110
|
+
}
|