s3db.js 18.0.11-next.1534f717 → 18.0.11-next.e8e71b5b
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/clients/recker-http-handler.js +56 -8
- package/dist/clients/recker-http-handler.js.map +1 -1
- package/dist/concerns/high-performance-inserter.js +6 -34
- package/dist/concerns/high-performance-inserter.js.map +1 -1
- package/dist/concerns/id/alphabets.js +150 -0
- package/dist/concerns/id/alphabets.js.map +1 -0
- package/dist/concerns/id/entropy.js +243 -0
- package/dist/concerns/id/entropy.js.map +1 -0
- package/dist/concerns/id/generators/nanoid.js +74 -0
- package/dist/concerns/id/generators/nanoid.js.map +1 -0
- package/dist/concerns/id/generators/sid.js +73 -0
- package/dist/concerns/id/generators/sid.js.map +1 -0
- package/dist/concerns/id/generators/ulid.js +208 -0
- package/dist/concerns/id/generators/ulid.js.map +1 -0
- package/dist/concerns/id/generators/uuid-v7.js +150 -0
- package/dist/concerns/id/generators/uuid-v7.js.map +1 -0
- package/dist/concerns/id/index.js +74 -0
- package/dist/concerns/id/index.js.map +1 -0
- package/dist/concerns/plugin-storage.js +114 -0
- package/dist/concerns/plugin-storage.js.map +1 -1
- package/dist/concerns/s3-errors.js +72 -0
- package/dist/concerns/s3-errors.js.map +1 -0
- package/dist/concerns/s3-key.js +54 -0
- package/dist/concerns/s3-key.js.map +1 -0
- package/dist/concerns/safe-merge.js +47 -0
- package/dist/concerns/safe-merge.js.map +1 -0
- package/dist/core/resource-config-validator.js +12 -2
- package/dist/core/resource-config-validator.js.map +1 -1
- package/dist/core/resource-partitions.class.js +12 -1
- package/dist/core/resource-partitions.class.js.map +1 -1
- package/dist/core/resource-persistence.class.js +41 -12
- package/dist/core/resource-persistence.class.js.map +1 -1
- package/dist/core/resource-query.class.js +21 -47
- package/dist/core/resource-query.class.js.map +1 -1
- package/dist/database/database-connection.class.js +3 -6
- package/dist/database/database-connection.class.js.map +1 -1
- package/dist/database/database-plugins.class.js +7 -13
- package/dist/database/database-plugins.class.js.map +1 -1
- package/dist/plugins/concerns/s3-mutex.class.js +155 -0
- package/dist/plugins/concerns/s3-mutex.class.js.map +1 -0
- package/dist/plugins/eventual-consistency/consolidation.js +4 -7
- package/dist/plugins/eventual-consistency/consolidation.js.map +1 -1
- package/dist/plugins/eventual-consistency/garbage-collection.js +3 -6
- package/dist/plugins/eventual-consistency/garbage-collection.js.map +1 -1
- package/dist/plugins/queue-consumer.plugin.js +10 -16
- package/dist/plugins/queue-consumer.plugin.js.map +1 -1
- package/dist/plugins/recon/managers/scheduler-manager.js +3 -5
- package/dist/plugins/recon/managers/scheduler-manager.js.map +1 -1
- package/dist/plugins/recon/stages/recker-asn-stage.js +279 -0
- package/dist/plugins/recon/stages/recker-asn-stage.js.map +1 -0
- package/dist/plugins/recon/stages/recker-dns-stage.js +227 -0
- package/dist/plugins/recon/stages/recker-dns-stage.js.map +1 -0
- package/dist/plugins/recon/stages/recker-scrape-stage.js +369 -0
- package/dist/plugins/recon/stages/recker-scrape-stage.js.map +1 -0
- package/dist/plugins/replicator.plugin.js +13 -31
- package/dist/plugins/replicator.plugin.js.map +1 -1
- package/dist/plugins/replicators/base-replicator.class.js +10 -23
- package/dist/plugins/replicators/base-replicator.class.js.map +1 -1
- package/dist/plugins/spider/recker-link-discoverer.js +544 -0
- package/dist/plugins/spider/recker-link-discoverer.js.map +1 -0
- package/dist/plugins/spider/recker-llms-validator.js +334 -0
- package/dist/plugins/spider/recker-llms-validator.js.map +1 -0
- package/dist/plugins/spider/recker-robots-validator.js +336 -0
- package/dist/plugins/spider/recker-robots-validator.js.map +1 -0
- package/dist/plugins/spider/recker-security-adapter.js +325 -0
- package/dist/plugins/spider/recker-security-adapter.js.map +1 -0
- package/dist/plugins/spider/recker-seo-adapter.js +399 -0
- package/dist/plugins/spider/recker-seo-adapter.js.map +1 -0
- package/dist/plugins/spider/recker-sitemap-validator.js +406 -0
- package/dist/plugins/spider/recker-sitemap-validator.js.map +1 -0
- package/dist/resource.class.js +2 -0
- package/dist/resource.class.js.map +1 -1
- package/dist/s3db.cjs +444 -219
- package/dist/s3db.cjs.map +1 -1
- package/dist/s3db.es.js +445 -220
- package/dist/s3db.es.js.map +1 -1
- package/dist/stream/resource-reader.class.js +5 -7
- package/dist/stream/resource-reader.class.js.map +1 -1
- package/dist/stream/resource-writer.class.js +5 -7
- package/dist/stream/resource-writer.class.js.map +1 -1
- package/dist/tasks/tasks-pool.class.js +31 -0
- package/dist/tasks/tasks-pool.class.js.map +1 -1
- package/dist/types/clients/recker-http-handler.d.ts +1 -0
- package/dist/types/clients/recker-http-handler.d.ts.map +1 -1
- package/dist/types/clients/types.d.ts +14 -0
- package/dist/types/clients/types.d.ts.map +1 -1
- package/dist/types/concerns/high-performance-inserter.d.ts.map +1 -1
- package/dist/types/concerns/id/alphabets.d.ts +125 -0
- package/dist/types/concerns/id/alphabets.d.ts.map +1 -0
- package/dist/types/concerns/id/entropy.d.ts +84 -0
- package/dist/types/concerns/id/entropy.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/nanoid.d.ts +46 -0
- package/dist/types/concerns/id/generators/nanoid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/sid.d.ts +45 -0
- package/dist/types/concerns/id/generators/sid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/ulid.d.ts +71 -0
- package/dist/types/concerns/id/generators/ulid.d.ts.map +1 -0
- package/dist/types/concerns/id/generators/uuid-v7.d.ts +60 -0
- package/dist/types/concerns/id/generators/uuid-v7.d.ts.map +1 -0
- package/dist/types/concerns/id/index.d.ts +51 -0
- package/dist/types/concerns/id/index.d.ts.map +1 -0
- package/dist/types/concerns/plugin-storage.d.ts +25 -0
- package/dist/types/concerns/plugin-storage.d.ts.map +1 -1
- package/dist/types/concerns/s3-errors.d.ts +20 -0
- package/dist/types/concerns/s3-errors.d.ts.map +1 -0
- package/dist/types/concerns/s3-key.d.ts +30 -0
- package/dist/types/concerns/s3-key.d.ts.map +1 -0
- package/dist/types/concerns/safe-merge.d.ts +22 -0
- package/dist/types/concerns/safe-merge.d.ts.map +1 -0
- package/dist/types/core/resource-config-validator.d.ts.map +1 -1
- package/dist/types/core/resource-partitions.class.d.ts.map +1 -1
- package/dist/types/core/resource-persistence.class.d.ts.map +1 -1
- package/dist/types/core/resource-query.class.d.ts.map +1 -1
- package/dist/types/database/database-connection.class.d.ts.map +1 -1
- package/dist/types/database/database-plugins.class.d.ts.map +1 -1
- package/dist/types/plugins/concerns/s3-mutex.class.d.ts +30 -0
- package/dist/types/plugins/concerns/s3-mutex.class.d.ts.map +1 -0
- package/dist/types/plugins/eventual-consistency/consolidation.d.ts.map +1 -1
- package/dist/types/plugins/eventual-consistency/garbage-collection.d.ts.map +1 -1
- package/dist/types/plugins/queue-consumer.plugin.d.ts.map +1 -1
- package/dist/types/plugins/recon/managers/scheduler-manager.d.ts.map +1 -1
- package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts +90 -0
- package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts.map +1 -0
- package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts +125 -0
- package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts.map +1 -0
- package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts +96 -0
- package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts.map +1 -0
- package/dist/types/plugins/replicator.plugin.d.ts.map +1 -1
- package/dist/types/plugins/replicators/base-replicator.class.d.ts.map +1 -1
- package/dist/types/plugins/spider/recker-link-discoverer.d.ts +54 -0
- package/dist/types/plugins/spider/recker-link-discoverer.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-llms-validator.d.ts +105 -0
- package/dist/types/plugins/spider/recker-llms-validator.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-robots-validator.d.ts +92 -0
- package/dist/types/plugins/spider/recker-robots-validator.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-security-adapter.d.ts +83 -0
- package/dist/types/plugins/spider/recker-security-adapter.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-seo-adapter.d.ts +187 -0
- package/dist/types/plugins/spider/recker-seo-adapter.d.ts.map +1 -0
- package/dist/types/plugins/spider/recker-sitemap-validator.d.ts +121 -0
- package/dist/types/plugins/spider/recker-sitemap-validator.d.ts.map +1 -0
- package/dist/types/resource.class.d.ts.map +1 -1
- package/dist/types/stream/resource-reader.class.d.ts.map +1 -1
- package/dist/types/stream/resource-writer.class.d.ts.map +1 -1
- package/dist/types/tasks/tasks-pool.class.d.ts +23 -0
- package/dist/types/tasks/tasks-pool.class.d.ts.map +1 -1
- package/mcp/prompts/index.ts +275 -0
- package/mcp/resources/index.ts +322 -0
- package/mcp/tools/plugins.ts +1137 -0
- package/mcp/tools/streams.ts +340 -0
- package/package.json +20 -22
- package/src/clients/recker-http-handler.ts +74 -8
- package/src/clients/types.ts +14 -0
- package/src/concerns/high-performance-inserter.ts +18 -57
- package/src/concerns/id/alphabets.ts +175 -0
- package/src/concerns/id/entropy.ts +286 -0
- package/src/concerns/id/generators/sid.ts +90 -0
- package/src/concerns/id/generators/ulid.ts +249 -0
- package/src/concerns/id/generators/uuid-v7.ts +179 -0
- package/src/concerns/id/index.ts +167 -0
- package/src/concerns/plugin-storage.ts +144 -0
- package/src/concerns/s3-errors.ts +97 -0
- package/src/concerns/s3-key.ts +62 -0
- package/src/concerns/safe-merge.ts +60 -0
- package/src/core/resource-config-validator.ts +9 -2
- package/src/core/resource-partitions.class.ts +14 -1
- package/src/core/resource-persistence.class.ts +47 -13
- package/src/core/resource-query.class.ts +21 -46
- package/src/database/database-connection.class.ts +7 -6
- package/src/database/database-plugins.class.ts +15 -13
- package/src/plugins/concerns/s3-mutex.class.ts +228 -0
- package/src/plugins/eventual-consistency/consolidation.ts +8 -7
- package/src/plugins/eventual-consistency/garbage-collection.ts +7 -6
- package/src/plugins/queue-consumer.plugin.ts +21 -19
- package/src/plugins/recon/managers/scheduler-manager.ts +7 -5
- package/src/plugins/recon/stages/recker-asn-stage.ts +385 -0
- package/src/plugins/recon/stages/recker-dns-stage.ts +360 -0
- package/src/plugins/recon/stages/recker-scrape-stage.ts +509 -0
- package/src/plugins/replicator.plugin.ts +41 -35
- package/src/plugins/replicators/base-replicator.class.ts +17 -23
- package/src/plugins/spider/recker-link-discoverer.ts +645 -0
- package/src/plugins/spider/recker-llms-validator.ts +500 -0
- package/src/plugins/spider/recker-robots-validator.ts +473 -0
- package/src/plugins/spider/recker-security-adapter.ts +489 -0
- package/src/plugins/spider/recker-seo-adapter.ts +605 -0
- package/src/plugins/spider/recker-sitemap-validator.ts +621 -0
- package/src/resource.class.ts +2 -0
- package/src/stream/resource-reader.class.ts +10 -8
- package/src/stream/resource-writer.class.ts +10 -8
- package/src/tasks/tasks-pool.class.ts +46 -0
|
@@ -0,0 +1,621 @@
|
|
|
1
|
+
import { gunzipSync } from 'zlib';
|
|
2
|
+
import { createHttpClient } from '#src/concerns/http-client.js';
|
|
3
|
+
import type { CrawlContext } from './crawl-context.js';
|
|
4
|
+
import type {
|
|
5
|
+
SitemapParserConfig,
|
|
6
|
+
SitemapEntry,
|
|
7
|
+
SitemapImage,
|
|
8
|
+
SitemapVideo,
|
|
9
|
+
ParseOptions,
|
|
10
|
+
SitemapStats,
|
|
11
|
+
ProbeResult,
|
|
12
|
+
FetcherResult
|
|
13
|
+
} from './sitemap-parser.js';
|
|
14
|
+
|
|
15
|
+
/**
 * One URL record as it appears in `ReckerSitemapParseResult.urls`.
 * Only `loc` is mandatory; every extension (images, videos, news,
 * alternates) is optional.
 */
type ReckerSitemapUrl = {
  loc: string;
  lastmod?: string;
  changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
  priority?: number;
  /** Image extension entries attached to this URL. */
  images?: {
    loc: string;
    caption?: string;
    title?: string;
  }[];
  /** Video extension entries attached to this URL. */
  videos?: {
    thumbnailLoc: string;
    title: string;
    description: string;
    contentLoc?: string;
    playerLoc?: string;
  }[];
  /** News extension metadata for this URL. */
  news?: {
    publicationName: string;
    publicationLanguage: string;
    publicationDate: string;
    title: string;
  };
  /** Alternate-language links (hreflang/href pairs) for this URL. */
  alternates?: {
    hreflang: string;
    href: string;
  }[];
};
|
|
43
|
+
|
|
44
|
+
/** One child-sitemap reference as it appears in `ReckerSitemapParseResult.sitemaps`. */
type ReckerSitemapIndex = {
  /** Location of the referenced sitemap. */
  loc: string;
  /** Last-modified value of the referenced sitemap, when present. */
  lastmod?: string;
};
|
|
48
|
+
|
|
49
|
+
/**
 * Return shape of the `parseSitemap` function (see `ReckerParseSitemap`).
 * `urls` carries URL records and `sitemaps` carries child-sitemap references.
 */
type ReckerSitemapParseResult = {
  type: 'urlset' | 'sitemapindex' | 'unknown';
  valid: boolean;
  errors: string[];
  warnings: string[];
  urls: ReckerSitemapUrl[];
  sitemaps: ReckerSitemapIndex[];
  urlCount: number;
  size: number;
  compressed: boolean;
};
|
|
60
|
+
|
|
61
|
+
/** A single finding listed in `ReckerSitemapValidationResult.issues`. */
type ReckerSitemapValidationIssue = {
  /** Severity of the finding. */
  type: 'error' | 'warning' | 'info';
  code: string;
  message: string;
  /** URL the finding relates to, when one is given. */
  url?: string;
  recommendation?: string;
};
|
|
68
|
+
|
|
69
|
+
/**
 * Return shape of the `validateSitemap` function (see `ReckerValidateSitemap`):
 * an overall verdict, the individual issues, and the underlying parse result.
 */
type ReckerSitemapValidationResult = {
  valid: boolean;
  issues: ReckerSitemapValidationIssue[];
  parseResult: ReckerSitemapParseResult;
};
|
|
74
|
+
|
|
75
|
+
/** Signature this module expects for the dynamically imported `parseSitemap` function. */
type ReckerParseSitemap = (
  content: string,
  compressed?: boolean
) => ReckerSitemapParseResult;
|
|
76
|
+
/** Signature this module expects for the dynamically imported `validateSitemap` function. */
type ReckerValidateSitemap = (
  content: string,
  baseUrl?: string
) => ReckerSitemapValidationResult;
|
|
77
|
+
/**
 * Signature this module expects for the dynamically imported
 * `discoverSitemaps` function. It resolves to a list of sitemap URLs.
 * The optional `fetcher` lets the caller supply how URLs are retrieved.
 */
type ReckerDiscoverSitemaps = (
  baseUrl: string,
  robotsTxtContent?: string,
  fetcher?: (url: string) => Promise<{ status: number; text: string }>
) => Promise<string[]>;
|
|
82
|
+
/**
 * Signature this module expects for the dynamically imported
 * `fetchAndValidateSitemap` function. The resolved value is a validation
 * result extended with `exists` and an optional HTTP `status`.
 */
type ReckerFetchAndValidateSitemap = (
  url: string,
  fetcher?: (
    url: string
  ) => Promise<{ status: number; text: string; headers?: Record<string, string> }>
) => Promise<ReckerSitemapValidationResult & { exists: boolean; status?: number }>;
|
|
86
|
+
|
|
87
|
+
/** Value stored in the validator's per-URL cache. */
interface CacheEntry {
  /** Entries returned to the caller for this sitemap URL. */
  entries: SitemapEntry[];
  parseResult: ReckerSitemapParseResult | null;
  validationResult: ReckerSitemapValidationResult | null;
  /** `Date.now()` value captured when the entry was stored; used for expiry. */
  timestamp: number;
  /** Sitemap type reported by the parse result. */
  format: string;
}
|
|
94
|
+
|
|
95
|
+
/** Minimal HTTP client surface this module relies on: a single `get` call. */
interface HttpClient {
  get(url: string): Promise<HttpResponse>;
}
|
|
98
|
+
|
|
99
|
+
/** Minimal response surface this module reads from an `HttpClient.get` result. */
interface HttpResponse {
  ok: boolean;
  status: number;
  headers: Headers;
  /** Body decoded as text. */
  text(): Promise<string>;
  /** Raw body bytes. */
  arrayBuffer(): Promise<ArrayBuffer>;
}
|
|
106
|
+
|
|
107
|
+
/** Minimal header accessor used by `HttpResponse`; only lookup by name is needed. */
interface Headers {
  get(name: string): string | null;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Flattened validation summary returned by `ReckerSitemapValidator.validate`.
 * `valid` and `issues` come from the validation result; the remaining
 * fields are copied from the parse result.
 */
export interface SitemapValidationDetails {
  valid: boolean;
  issues: ReckerSitemapValidationIssue[];
  type: 'urlset' | 'sitemapindex' | 'unknown';
  urlCount: number;
  size: number;
  compressed: boolean;
  errors: string[];
  warnings: string[];
}
|
|
121
|
+
|
|
122
|
+
/**
 * `SitemapEntry` enriched with the optional news and alternate-language
 * data carried over from a `ReckerSitemapUrl`.
 */
export interface SitemapEntryExtended extends SitemapEntry {
  /** News extension metadata, when the source URL record had it. */
  news?: {
    publicationName: string;
    publicationLanguage: string;
    publicationDate: string;
    title: string;
  };
  /** Alternate-language links, when the source URL record had any. */
  alternates?: {
    hreflang: string;
    href: string;
  }[];
}
|
|
134
|
+
|
|
135
|
+
export class ReckerSitemapValidator {
|
|
136
|
+
private config: Required<Omit<SitemapParserConfig, 'context' | 'fetcher'>> & {
|
|
137
|
+
context: CrawlContext | null;
|
|
138
|
+
fetcher: ((url: string) => Promise<FetcherResult>) | null;
|
|
139
|
+
};
|
|
140
|
+
private _context: CrawlContext | null;
|
|
141
|
+
private cache: Map<string, CacheEntry>;
|
|
142
|
+
private fetcher: ((url: string) => Promise<FetcherResult>) | null;
|
|
143
|
+
private _httpClient: HttpClient | null;
|
|
144
|
+
private stats: {
|
|
145
|
+
sitemapsParsed: number;
|
|
146
|
+
urlsExtracted: number;
|
|
147
|
+
errors: number;
|
|
148
|
+
};
|
|
149
|
+
|
|
150
|
+
private reckerAvailable: boolean | null = null;
|
|
151
|
+
private parseSitemap: ReckerParseSitemap | null = null;
|
|
152
|
+
private validateSitemap: ReckerValidateSitemap | null = null;
|
|
153
|
+
private discoverSitemaps: ReckerDiscoverSitemaps | null = null;
|
|
154
|
+
private fetchAndValidateSitemap: ReckerFetchAndValidateSitemap | null = null;
|
|
155
|
+
private fallbackParser: import('./sitemap-parser.js').SitemapParser | null = null;
|
|
156
|
+
|
|
157
|
+
constructor(config: SitemapParserConfig = {}) {
|
|
158
|
+
this.config = {
|
|
159
|
+
userAgent: config.userAgent || 's3db-spider',
|
|
160
|
+
fetchTimeout: config.fetchTimeout || 30000,
|
|
161
|
+
maxSitemaps: config.maxSitemaps || 50,
|
|
162
|
+
maxUrls: config.maxUrls || 50000,
|
|
163
|
+
followSitemapIndex: config.followSitemapIndex !== false,
|
|
164
|
+
cacheTimeout: config.cacheTimeout || 3600000,
|
|
165
|
+
context: config.context || null,
|
|
166
|
+
fetcher: config.fetcher || null
|
|
167
|
+
};
|
|
168
|
+
|
|
169
|
+
this._context = this.config.context;
|
|
170
|
+
this.cache = new Map();
|
|
171
|
+
this.fetcher = this.config.fetcher;
|
|
172
|
+
this._httpClient = null;
|
|
173
|
+
|
|
174
|
+
this.stats = {
|
|
175
|
+
sitemapsParsed: 0,
|
|
176
|
+
urlsExtracted: 0,
|
|
177
|
+
errors: 0
|
|
178
|
+
};
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
private async _checkReckerAvailability(): Promise<boolean> {
|
|
182
|
+
if (this.reckerAvailable !== null) {
|
|
183
|
+
return this.reckerAvailable;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
try {
|
|
187
|
+
const sitemapModule = await import('recker/seo/validators/sitemap');
|
|
188
|
+
this.parseSitemap = sitemapModule.parseSitemap;
|
|
189
|
+
this.validateSitemap = sitemapModule.validateSitemap;
|
|
190
|
+
this.discoverSitemaps = sitemapModule.discoverSitemaps;
|
|
191
|
+
this.fetchAndValidateSitemap = sitemapModule.fetchAndValidateSitemap;
|
|
192
|
+
this.reckerAvailable = true;
|
|
193
|
+
return true;
|
|
194
|
+
} catch {
|
|
195
|
+
this.reckerAvailable = false;
|
|
196
|
+
return false;
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
private async _getFallbackParser(): Promise<import('./sitemap-parser.js').SitemapParser> {
|
|
201
|
+
if (!this.fallbackParser) {
|
|
202
|
+
const { SitemapParser } = await import('./sitemap-parser.js');
|
|
203
|
+
this.fallbackParser = new SitemapParser(this.config);
|
|
204
|
+
}
|
|
205
|
+
return this.fallbackParser;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
setFetcher(fetcher: (url: string) => Promise<FetcherResult>): void {
|
|
209
|
+
this.fetcher = fetcher;
|
|
210
|
+
if (this.fallbackParser) {
|
|
211
|
+
this.fallbackParser.setFetcher(fetcher);
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
async parse(sitemapUrl: string, options: ParseOptions = {}): Promise<SitemapEntry[]> {
|
|
216
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
217
|
+
|
|
218
|
+
if (!isReckerAvailable) {
|
|
219
|
+
const fallback = await this._getFallbackParser();
|
|
220
|
+
return fallback.parse(sitemapUrl, options);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
const opts = {
|
|
224
|
+
recursive: options.recursive !== false,
|
|
225
|
+
maxDepth: options.maxDepth || 3,
|
|
226
|
+
_depth: options._depth || 0
|
|
227
|
+
};
|
|
228
|
+
|
|
229
|
+
const cached = this.cache.get(sitemapUrl);
|
|
230
|
+
if (cached && Date.now() - cached.timestamp < this.config.cacheTimeout) {
|
|
231
|
+
return cached.entries;
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
if (opts._depth > opts.maxDepth) {
|
|
235
|
+
return [];
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
if (this.stats.urlsExtracted >= this.config.maxUrls) {
|
|
239
|
+
return [];
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
try {
|
|
243
|
+
const { content, compressed } = await this._fetch(sitemapUrl);
|
|
244
|
+
|
|
245
|
+
const parseResult = this.parseSitemap!(content, compressed);
|
|
246
|
+
const validationResult = this.validateSitemap!(content, sitemapUrl);
|
|
247
|
+
|
|
248
|
+
let entries: SitemapEntry[] = [];
|
|
249
|
+
|
|
250
|
+
if (parseResult.type === 'sitemapindex' && opts.recursive) {
|
|
251
|
+
entries = await this._parseReckerIndex(parseResult, opts);
|
|
252
|
+
} else {
|
|
253
|
+
entries = this._mapReckerUrlsToEntries(parseResult.urls, 'sitemap');
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
this.stats.sitemapsParsed++;
|
|
257
|
+
this.stats.urlsExtracted += entries.length;
|
|
258
|
+
|
|
259
|
+
const cacheEntry: CacheEntry = {
|
|
260
|
+
entries,
|
|
261
|
+
parseResult,
|
|
262
|
+
validationResult,
|
|
263
|
+
timestamp: Date.now(),
|
|
264
|
+
format: parseResult.type
|
|
265
|
+
};
|
|
266
|
+
this.cache.set(sitemapUrl, cacheEntry);
|
|
267
|
+
|
|
268
|
+
return entries;
|
|
269
|
+
|
|
270
|
+
} catch (error) {
|
|
271
|
+
this.stats.errors++;
|
|
272
|
+
throw error;
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
private async _parseReckerIndex(
|
|
277
|
+
parseResult: ReckerSitemapParseResult,
|
|
278
|
+
opts: { recursive: boolean; maxDepth: number; _depth: number }
|
|
279
|
+
): Promise<SitemapEntry[]> {
|
|
280
|
+
if (!opts.recursive) {
|
|
281
|
+
return parseResult.sitemaps.map(s => ({
|
|
282
|
+
url: s.loc,
|
|
283
|
+
lastmod: s.lastmod || null,
|
|
284
|
+
source: 'sitemap-index',
|
|
285
|
+
type: 'sitemap'
|
|
286
|
+
}));
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
const allEntries: SitemapEntry[] = [];
|
|
290
|
+
const sitemapsToProcess = parseResult.sitemaps.slice(0, this.config.maxSitemaps);
|
|
291
|
+
|
|
292
|
+
for (const sitemap of sitemapsToProcess) {
|
|
293
|
+
if (this.stats.urlsExtracted >= this.config.maxUrls) break;
|
|
294
|
+
|
|
295
|
+
try {
|
|
296
|
+
const entries = await this.parse(sitemap.loc, {
|
|
297
|
+
...opts,
|
|
298
|
+
_depth: opts._depth + 1
|
|
299
|
+
});
|
|
300
|
+
allEntries.push(...entries);
|
|
301
|
+
} catch {
|
|
302
|
+
this.stats.errors++;
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
return allEntries;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
private _mapReckerUrlsToEntries(urls: ReckerSitemapUrl[], source: string): SitemapEntry[] {
|
|
310
|
+
return urls.slice(0, this.config.maxUrls - this.stats.urlsExtracted).map(url => {
|
|
311
|
+
const entry: SitemapEntryExtended = {
|
|
312
|
+
url: url.loc,
|
|
313
|
+
lastmod: url.lastmod || null,
|
|
314
|
+
changefreq: url.changefreq || null,
|
|
315
|
+
priority: url.priority ?? null,
|
|
316
|
+
source,
|
|
317
|
+
images: url.images?.map(img => ({
|
|
318
|
+
url: img.loc,
|
|
319
|
+
title: img.title || null,
|
|
320
|
+
caption: img.caption || null
|
|
321
|
+
})) as SitemapImage[],
|
|
322
|
+
videos: url.videos?.map(vid => ({
|
|
323
|
+
url: vid.contentLoc || vid.playerLoc || null,
|
|
324
|
+
thumbnailUrl: vid.thumbnailLoc || null,
|
|
325
|
+
title: vid.title || null,
|
|
326
|
+
description: vid.description || null
|
|
327
|
+
})) as SitemapVideo[]
|
|
328
|
+
};
|
|
329
|
+
|
|
330
|
+
if (url.news) {
|
|
331
|
+
entry.news = url.news;
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
if (url.alternates && url.alternates.length > 0) {
|
|
335
|
+
entry.alternates = url.alternates;
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
return entry;
|
|
339
|
+
});
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
private async _getHttpClient(): Promise<HttpClient> {
|
|
343
|
+
if (!this._httpClient) {
|
|
344
|
+
const baseConfig = this._context
|
|
345
|
+
? this._context.getHttpClientConfig('https://example.com')
|
|
346
|
+
: {
|
|
347
|
+
headers: {
|
|
348
|
+
'User-Agent': this.config.userAgent
|
|
349
|
+
}
|
|
350
|
+
};
|
|
351
|
+
|
|
352
|
+
this._httpClient = await createHttpClient({
|
|
353
|
+
...baseConfig,
|
|
354
|
+
timeout: this.config.fetchTimeout,
|
|
355
|
+
retry: {
|
|
356
|
+
maxAttempts: 2,
|
|
357
|
+
delay: 1000,
|
|
358
|
+
backoff: 'exponential',
|
|
359
|
+
retryAfter: true,
|
|
360
|
+
retryOn: [429, 500, 502, 503, 504]
|
|
361
|
+
}
|
|
362
|
+
}) as unknown as HttpClient;
|
|
363
|
+
}
|
|
364
|
+
return this._httpClient;
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
private async _fetch(url: string): Promise<{ content: string; compressed: boolean }> {
|
|
368
|
+
let content: string | Buffer;
|
|
369
|
+
let compressed = false;
|
|
370
|
+
|
|
371
|
+
if (this.fetcher) {
|
|
372
|
+
const result = await this.fetcher(url);
|
|
373
|
+
content = result.content || (result as unknown as string);
|
|
374
|
+
} else {
|
|
375
|
+
const client = await this._getHttpClient();
|
|
376
|
+
const response = await client.get(url);
|
|
377
|
+
|
|
378
|
+
if (this._context) {
|
|
379
|
+
this._context.processResponse(
|
|
380
|
+
response as unknown as Parameters<typeof this._context.processResponse>[0],
|
|
381
|
+
url
|
|
382
|
+
);
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
if (!response.ok) {
|
|
386
|
+
throw new Error(`HTTP ${response.status}`);
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
const contentType = response.headers.get('content-type') || '';
|
|
390
|
+
|
|
391
|
+
if (url.endsWith('.gz') || contentType.includes('gzip')) {
|
|
392
|
+
const buffer = await response.arrayBuffer();
|
|
393
|
+
content = this._decompress(Buffer.from(buffer));
|
|
394
|
+
compressed = true;
|
|
395
|
+
} else {
|
|
396
|
+
content = await response.text();
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
if (Buffer.isBuffer(content)) {
|
|
401
|
+
if (content[0] === 0x1f && content[1] === 0x8b) {
|
|
402
|
+
content = this._decompress(content);
|
|
403
|
+
compressed = true;
|
|
404
|
+
} else {
|
|
405
|
+
content = content.toString('utf-8');
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
return { content: content as string, compressed };
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
private _decompress(buffer: Buffer): string {
|
|
413
|
+
try {
|
|
414
|
+
return gunzipSync(buffer).toString('utf-8');
|
|
415
|
+
} catch (error) {
|
|
416
|
+
throw new Error(`Failed to decompress gzip: ${(error as Error).message}`);
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
getStats(): SitemapStats {
|
|
421
|
+
return {
|
|
422
|
+
...this.stats,
|
|
423
|
+
cacheSize: this.cache.size
|
|
424
|
+
};
|
|
425
|
+
}
|
|
426
|
+
|
|
427
|
+
clearCache(url?: string): void {
|
|
428
|
+
if (url) {
|
|
429
|
+
this.cache.delete(url);
|
|
430
|
+
} else {
|
|
431
|
+
this.cache.clear();
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
resetStats(): void {
|
|
436
|
+
this.stats = {
|
|
437
|
+
sitemapsParsed: 0,
|
|
438
|
+
urlsExtracted: 0,
|
|
439
|
+
errors: 0
|
|
440
|
+
};
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
async discoverFromRobotsTxt(robotsTxtUrl: string): Promise<string[]> {
|
|
444
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
445
|
+
|
|
446
|
+
if (!isReckerAvailable) {
|
|
447
|
+
const fallback = await this._getFallbackParser();
|
|
448
|
+
return fallback.discoverFromRobotsTxt(robotsTxtUrl);
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
try {
|
|
452
|
+
const baseUrl = new URL(robotsTxtUrl).origin;
|
|
453
|
+
const { content } = await this._fetch(robotsTxtUrl);
|
|
454
|
+
|
|
455
|
+
return await this.discoverSitemaps!(baseUrl, content, async (url) => {
|
|
456
|
+
const { content: text } = await this._fetch(url);
|
|
457
|
+
return { status: 200, text };
|
|
458
|
+
});
|
|
459
|
+
} catch {
|
|
460
|
+
return [];
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
async probeCommonLocations(baseUrl: string): Promise<ProbeResult[]> {
|
|
465
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
466
|
+
|
|
467
|
+
if (!isReckerAvailable) {
|
|
468
|
+
const fallback = await this._getFallbackParser();
|
|
469
|
+
return fallback.probeCommonLocations(baseUrl);
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
const commonPaths = [
|
|
473
|
+
'/sitemap.xml',
|
|
474
|
+
'/sitemap_index.xml',
|
|
475
|
+
'/sitemap.xml.gz',
|
|
476
|
+
'/sitemaps/sitemap.xml',
|
|
477
|
+
'/sitemap.txt',
|
|
478
|
+
'/feed.xml',
|
|
479
|
+
'/rss.xml',
|
|
480
|
+
'/atom.xml',
|
|
481
|
+
'/feed',
|
|
482
|
+
'/rss'
|
|
483
|
+
];
|
|
484
|
+
|
|
485
|
+
const results: ProbeResult[] = [];
|
|
486
|
+
|
|
487
|
+
for (const path of commonPaths) {
|
|
488
|
+
const url = baseUrl.replace(/\/$/, '') + path;
|
|
489
|
+
|
|
490
|
+
try {
|
|
491
|
+
const { content, compressed } = await this._fetch(url);
|
|
492
|
+
const parseResult = this.parseSitemap!(content, compressed);
|
|
493
|
+
|
|
494
|
+
results.push({
|
|
495
|
+
url,
|
|
496
|
+
exists: true,
|
|
497
|
+
format: parseResult.type
|
|
498
|
+
});
|
|
499
|
+
} catch {
|
|
500
|
+
results.push({
|
|
501
|
+
url,
|
|
502
|
+
exists: false
|
|
503
|
+
});
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
return results;
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
async validate(sitemapUrl: string): Promise<SitemapValidationDetails | null> {
|
|
511
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
512
|
+
|
|
513
|
+
if (!isReckerAvailable) {
|
|
514
|
+
return null;
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
try {
|
|
518
|
+
let cached = this.cache.get(sitemapUrl);
|
|
519
|
+
|
|
520
|
+
if (!cached || Date.now() - cached.timestamp >= this.config.cacheTimeout) {
|
|
521
|
+
await this.parse(sitemapUrl);
|
|
522
|
+
cached = this.cache.get(sitemapUrl);
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
if (!cached?.validationResult || !cached?.parseResult) {
|
|
526
|
+
return null;
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
return {
|
|
530
|
+
valid: cached.validationResult.valid,
|
|
531
|
+
issues: cached.validationResult.issues,
|
|
532
|
+
type: cached.parseResult.type,
|
|
533
|
+
urlCount: cached.parseResult.urlCount,
|
|
534
|
+
size: cached.parseResult.size,
|
|
535
|
+
compressed: cached.parseResult.compressed,
|
|
536
|
+
errors: cached.parseResult.errors,
|
|
537
|
+
warnings: cached.parseResult.warnings
|
|
538
|
+
};
|
|
539
|
+
} catch {
|
|
540
|
+
return null;
|
|
541
|
+
}
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
async validateContent(content: string, baseUrl?: string): Promise<ReckerSitemapValidationResult | null> {
|
|
545
|
+
const isReckerAvailable = await this._checkReckerAvailability();
|
|
546
|
+
|
|
547
|
+
if (!isReckerAvailable || !this.validateSitemap) {
|
|
548
|
+
return null;
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
return this.validateSitemap(content, baseUrl);
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
parseContent(content: string, compressed?: boolean): ReckerSitemapParseResult | null {
|
|
555
|
+
if (!this.reckerAvailable || !this.parseSitemap) {
|
|
556
|
+
return null;
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
return this.parseSitemap(content, compressed);
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
async getValidationIssues(sitemapUrl: string): Promise<ReckerSitemapValidationIssue[]> {
|
|
563
|
+
const validation = await this.validate(sitemapUrl);
|
|
564
|
+
return validation?.issues || [];
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
async getNewsEntries(sitemapUrl: string): Promise<SitemapEntryExtended[]> {
|
|
568
|
+
const entries = await this.parse(sitemapUrl);
|
|
569
|
+
return (entries as SitemapEntryExtended[]).filter(e => e.news);
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
/**
 * Group parsed sitemap entries by the `hreflang` of their alternates.
 *
 * An entry is appended once per alternate it declares, so an entry with
 * several alternates appears under several keys (and more than once under the
 * same key if it repeats an `hreflang`).
 *
 * @param sitemapUrl - URL passed to `this.parse`.
 * @returns Map from `hreflang` value to the entries that declare it.
 */
async getAlternateLanguages(sitemapUrl: string): Promise<Map<string, SitemapEntryExtended[]>> {
  const parsed = (await this.parse(sitemapUrl)) as SitemapEntryExtended[];
  const grouped = new Map<string, SitemapEntryExtended[]>();

  for (const entry of parsed) {
    // Entries without alternates contribute nothing.
    if (!entry.alternates) {
      continue;
    }

    for (const alt of entry.alternates) {
      const bucket = grouped.get(alt.hreflang);

      if (bucket) {
        bucket.push(entry);
      } else {
        grouped.set(alt.hreflang, [entry]);
      }
    }
  }

  return grouped;
}
|
|
590
|
+
|
|
591
|
+
/**
 * Discover sitemap URLs for a site from robots.txt and from location probing,
 * running both lookups concurrently.
 *
 * @param baseUrl - Site base URL; a single trailing slash is stripped before
 *                  `/robots.txt` is appended.
 * @returns `fromRobots` (result of `discoverFromRobotsTxt`), `fromProbing`
 *          (every probe result, found or not) and `all` (de-duplicated union
 *          of `fromRobots` and the URLs of probes whose `exists` is truthy,
 *          robots.txt entries first).
 */
async discoverAll(baseUrl: string): Promise<{
  fromRobots: string[];
  fromProbing: ProbeResult[];
  all: string[];
}> {
  // NOTE(review): only one trailing slash is removed, and a baseUrl that
  // carries a path produces a robots.txt URL under that path rather than at
  // the origin root — confirm callers always pass an origin.
  const trimmedBase = baseUrl.replace(/\/$/, '');
  const robotsTxtUrl = `${trimmedBase}/robots.txt`;

  // Start both lookups before awaiting so they run concurrently.
  const robotsLookup = this.discoverFromRobotsTxt(robotsTxtUrl);
  const probeLookup = this.probeCommonLocations(baseUrl);
  const [fromRobots, fromProbing] = await Promise.all([robotsLookup, probeLookup]);

  const probedUrls: string[] = [];
  fromProbing.forEach((probe) => {
    if (probe.exists) {
      probedUrls.push(probe.url);
    }
  });

  // A Set keeps first-seen order while dropping duplicates.
  const unique = new Set([...fromRobots, ...probedUrls]);
  const all = Array.from(unique);

  return { fromRobots, fromProbing, all };
}
|
|
615
|
+
|
|
616
|
+
/**
 * Report whether recker has been flagged as available.
 *
 * @returns `true` only when `reckerAvailable` is strictly `true`; any other
 *          value (including an unset flag) yields `false`.
 */
isReckerEnabled(): boolean {
  const { reckerAvailable } = this;
  return reckerAvailable === true;
}
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
export default ReckerSitemapValidator;
|
package/src/resource.class.ts
CHANGED
|
@@ -22,6 +22,7 @@ import tryFn, { tryFnSync } from './concerns/try-fn.js';
|
|
|
22
22
|
import { ResourceReader, ResourceWriter } from './stream/index.js';
|
|
23
23
|
import { getBehavior, DEFAULT_BEHAVIOR } from './behaviors/index.js';
|
|
24
24
|
import { idGenerator as defaultIdGenerator } from './concerns/id.js';
|
|
25
|
+
import { validateS3KeySegment } from './concerns/s3-key.js';
|
|
25
26
|
import { ResourceError, PartitionError } from './errors.js';
|
|
26
27
|
import { createLogger, type Logger, type LogLevel as LoggerLogLevel } from './concerns/logger.js';
|
|
27
28
|
import { validateResourceConfig } from './core/resource-config-validator.js';
|
|
@@ -782,6 +783,7 @@ export class Resource extends AsyncEventEmitter implements Disposable {
|
|
|
782
783
|
}
|
|
783
784
|
|
|
784
785
|
/**
 * Build the storage key for a record of this resource.
 *
 * The id is first passed to `validateS3KeySegment` (presumably rejecting
 * values unsafe for use as a key segment — confirm in concerns/s3-key).
 *
 * @param id - Record identifier.
 * @returns Key joined from `resource=<name>`, `data` and `id=<id>`.
 */
getResourceKey(id: string): string {
  validateS3KeySegment(id, 'id');

  return join('resource=' + this.name, 'data', `id=${id}`);
}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import EventEmitter from "events";
|
|
2
2
|
import { Transform, TransformCallback } from "stream";
|
|
3
|
-
import { PromisePool } from "@supercharge/promise-pool";
|
|
4
3
|
|
|
5
4
|
import { ResourceIdsPageReader } from "./resource-ids-page-reader.class.js";
|
|
5
|
+
import { TasksPool } from '../tasks/tasks-pool.class.js';
|
|
6
6
|
import tryFn from "../concerns/try-fn.js";
|
|
7
7
|
import { StreamError } from '../errors.js';
|
|
8
8
|
|
|
@@ -86,16 +86,18 @@ export class ResourceReader extends EventEmitter {
|
|
|
86
86
|
|
|
87
87
|
async _transform(chunk: string[], _encoding: BufferEncoding, callback: TransformCallback): Promise<void> {
|
|
88
88
|
const [, err] = await tryFn(async () => {
|
|
89
|
-
await
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
this.emit("error", error, content);
|
|
93
|
-
})
|
|
94
|
-
.process(async (id) => {
|
|
89
|
+
await TasksPool.map(
|
|
90
|
+
chunk,
|
|
91
|
+
async (id) => {
|
|
95
92
|
const data = await this.resource.get(id);
|
|
96
93
|
this.transform.push(data);
|
|
97
94
|
return data;
|
|
98
|
-
}
|
|
95
|
+
},
|
|
96
|
+
{
|
|
97
|
+
concurrency: this.concurrency,
|
|
98
|
+
onItemError: (error, id) => this.emit("error", error, id)
|
|
99
|
+
}
|
|
100
|
+
);
|
|
99
101
|
});
|
|
100
102
|
callback(err as Error | null | undefined);
|
|
101
103
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import EventEmitter from "events";
|
|
2
2
|
import { Writable, WritableOptions } from 'stream';
|
|
3
|
-
import {
|
|
3
|
+
import { TasksPool } from '../tasks/tasks-pool.class.js';
|
|
4
4
|
import tryFn from "../concerns/try-fn.js";
|
|
5
5
|
|
|
6
6
|
interface S3Client {
|
|
@@ -81,12 +81,9 @@ export class ResourceWriter extends EventEmitter {
|
|
|
81
81
|
while (this.buffer.length > 0) {
|
|
82
82
|
const batch = this.buffer.splice(0, this.batchSize);
|
|
83
83
|
const [ok, err] = await tryFn(async () => {
|
|
84
|
-
await
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
this.emit("error", error, content);
|
|
88
|
-
})
|
|
89
|
-
.process(async (item) => {
|
|
84
|
+
await TasksPool.map(
|
|
85
|
+
batch,
|
|
86
|
+
async (item) => {
|
|
90
87
|
const [insertOk, insertErr, result] = await tryFn(async () => {
|
|
91
88
|
const res = await this.resource.insert(item);
|
|
92
89
|
return res;
|
|
@@ -96,7 +93,12 @@ export class ResourceWriter extends EventEmitter {
|
|
|
96
93
|
return null;
|
|
97
94
|
}
|
|
98
95
|
return result;
|
|
99
|
-
}
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
concurrency: this.concurrency,
|
|
99
|
+
onItemError: (error, item) => this.emit("error", error, item)
|
|
100
|
+
}
|
|
101
|
+
);
|
|
100
102
|
});
|
|
101
103
|
if (!ok) {
|
|
102
104
|
this.emit('error', err);
|