s3db.js 18.0.11-next.1534f717 → 18.0.11-next.e8e71b5b

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. package/dist/clients/recker-http-handler.js +56 -8
  2. package/dist/clients/recker-http-handler.js.map +1 -1
  3. package/dist/concerns/high-performance-inserter.js +6 -34
  4. package/dist/concerns/high-performance-inserter.js.map +1 -1
  5. package/dist/concerns/id/alphabets.js +150 -0
  6. package/dist/concerns/id/alphabets.js.map +1 -0
  7. package/dist/concerns/id/entropy.js +243 -0
  8. package/dist/concerns/id/entropy.js.map +1 -0
  9. package/dist/concerns/id/generators/nanoid.js +74 -0
  10. package/dist/concerns/id/generators/nanoid.js.map +1 -0
  11. package/dist/concerns/id/generators/sid.js +73 -0
  12. package/dist/concerns/id/generators/sid.js.map +1 -0
  13. package/dist/concerns/id/generators/ulid.js +208 -0
  14. package/dist/concerns/id/generators/ulid.js.map +1 -0
  15. package/dist/concerns/id/generators/uuid-v7.js +150 -0
  16. package/dist/concerns/id/generators/uuid-v7.js.map +1 -0
  17. package/dist/concerns/id/index.js +74 -0
  18. package/dist/concerns/id/index.js.map +1 -0
  19. package/dist/concerns/plugin-storage.js +114 -0
  20. package/dist/concerns/plugin-storage.js.map +1 -1
  21. package/dist/concerns/s3-errors.js +72 -0
  22. package/dist/concerns/s3-errors.js.map +1 -0
  23. package/dist/concerns/s3-key.js +54 -0
  24. package/dist/concerns/s3-key.js.map +1 -0
  25. package/dist/concerns/safe-merge.js +47 -0
  26. package/dist/concerns/safe-merge.js.map +1 -0
  27. package/dist/core/resource-config-validator.js +12 -2
  28. package/dist/core/resource-config-validator.js.map +1 -1
  29. package/dist/core/resource-partitions.class.js +12 -1
  30. package/dist/core/resource-partitions.class.js.map +1 -1
  31. package/dist/core/resource-persistence.class.js +41 -12
  32. package/dist/core/resource-persistence.class.js.map +1 -1
  33. package/dist/core/resource-query.class.js +21 -47
  34. package/dist/core/resource-query.class.js.map +1 -1
  35. package/dist/database/database-connection.class.js +3 -6
  36. package/dist/database/database-connection.class.js.map +1 -1
  37. package/dist/database/database-plugins.class.js +7 -13
  38. package/dist/database/database-plugins.class.js.map +1 -1
  39. package/dist/plugins/concerns/s3-mutex.class.js +155 -0
  40. package/dist/plugins/concerns/s3-mutex.class.js.map +1 -0
  41. package/dist/plugins/eventual-consistency/consolidation.js +4 -7
  42. package/dist/plugins/eventual-consistency/consolidation.js.map +1 -1
  43. package/dist/plugins/eventual-consistency/garbage-collection.js +3 -6
  44. package/dist/plugins/eventual-consistency/garbage-collection.js.map +1 -1
  45. package/dist/plugins/queue-consumer.plugin.js +10 -16
  46. package/dist/plugins/queue-consumer.plugin.js.map +1 -1
  47. package/dist/plugins/recon/managers/scheduler-manager.js +3 -5
  48. package/dist/plugins/recon/managers/scheduler-manager.js.map +1 -1
  49. package/dist/plugins/recon/stages/recker-asn-stage.js +279 -0
  50. package/dist/plugins/recon/stages/recker-asn-stage.js.map +1 -0
  51. package/dist/plugins/recon/stages/recker-dns-stage.js +227 -0
  52. package/dist/plugins/recon/stages/recker-dns-stage.js.map +1 -0
  53. package/dist/plugins/recon/stages/recker-scrape-stage.js +369 -0
  54. package/dist/plugins/recon/stages/recker-scrape-stage.js.map +1 -0
  55. package/dist/plugins/replicator.plugin.js +13 -31
  56. package/dist/plugins/replicator.plugin.js.map +1 -1
  57. package/dist/plugins/replicators/base-replicator.class.js +10 -23
  58. package/dist/plugins/replicators/base-replicator.class.js.map +1 -1
  59. package/dist/plugins/spider/recker-link-discoverer.js +544 -0
  60. package/dist/plugins/spider/recker-link-discoverer.js.map +1 -0
  61. package/dist/plugins/spider/recker-llms-validator.js +334 -0
  62. package/dist/plugins/spider/recker-llms-validator.js.map +1 -0
  63. package/dist/plugins/spider/recker-robots-validator.js +336 -0
  64. package/dist/plugins/spider/recker-robots-validator.js.map +1 -0
  65. package/dist/plugins/spider/recker-security-adapter.js +325 -0
  66. package/dist/plugins/spider/recker-security-adapter.js.map +1 -0
  67. package/dist/plugins/spider/recker-seo-adapter.js +399 -0
  68. package/dist/plugins/spider/recker-seo-adapter.js.map +1 -0
  69. package/dist/plugins/spider/recker-sitemap-validator.js +406 -0
  70. package/dist/plugins/spider/recker-sitemap-validator.js.map +1 -0
  71. package/dist/resource.class.js +2 -0
  72. package/dist/resource.class.js.map +1 -1
  73. package/dist/s3db.cjs +444 -219
  74. package/dist/s3db.cjs.map +1 -1
  75. package/dist/s3db.es.js +445 -220
  76. package/dist/s3db.es.js.map +1 -1
  77. package/dist/stream/resource-reader.class.js +5 -7
  78. package/dist/stream/resource-reader.class.js.map +1 -1
  79. package/dist/stream/resource-writer.class.js +5 -7
  80. package/dist/stream/resource-writer.class.js.map +1 -1
  81. package/dist/tasks/tasks-pool.class.js +31 -0
  82. package/dist/tasks/tasks-pool.class.js.map +1 -1
  83. package/dist/types/clients/recker-http-handler.d.ts +1 -0
  84. package/dist/types/clients/recker-http-handler.d.ts.map +1 -1
  85. package/dist/types/clients/types.d.ts +14 -0
  86. package/dist/types/clients/types.d.ts.map +1 -1
  87. package/dist/types/concerns/high-performance-inserter.d.ts.map +1 -1
  88. package/dist/types/concerns/id/alphabets.d.ts +125 -0
  89. package/dist/types/concerns/id/alphabets.d.ts.map +1 -0
  90. package/dist/types/concerns/id/entropy.d.ts +84 -0
  91. package/dist/types/concerns/id/entropy.d.ts.map +1 -0
  92. package/dist/types/concerns/id/generators/nanoid.d.ts +46 -0
  93. package/dist/types/concerns/id/generators/nanoid.d.ts.map +1 -0
  94. package/dist/types/concerns/id/generators/sid.d.ts +45 -0
  95. package/dist/types/concerns/id/generators/sid.d.ts.map +1 -0
  96. package/dist/types/concerns/id/generators/ulid.d.ts +71 -0
  97. package/dist/types/concerns/id/generators/ulid.d.ts.map +1 -0
  98. package/dist/types/concerns/id/generators/uuid-v7.d.ts +60 -0
  99. package/dist/types/concerns/id/generators/uuid-v7.d.ts.map +1 -0
  100. package/dist/types/concerns/id/index.d.ts +51 -0
  101. package/dist/types/concerns/id/index.d.ts.map +1 -0
  102. package/dist/types/concerns/plugin-storage.d.ts +25 -0
  103. package/dist/types/concerns/plugin-storage.d.ts.map +1 -1
  104. package/dist/types/concerns/s3-errors.d.ts +20 -0
  105. package/dist/types/concerns/s3-errors.d.ts.map +1 -0
  106. package/dist/types/concerns/s3-key.d.ts +30 -0
  107. package/dist/types/concerns/s3-key.d.ts.map +1 -0
  108. package/dist/types/concerns/safe-merge.d.ts +22 -0
  109. package/dist/types/concerns/safe-merge.d.ts.map +1 -0
  110. package/dist/types/core/resource-config-validator.d.ts.map +1 -1
  111. package/dist/types/core/resource-partitions.class.d.ts.map +1 -1
  112. package/dist/types/core/resource-persistence.class.d.ts.map +1 -1
  113. package/dist/types/core/resource-query.class.d.ts.map +1 -1
  114. package/dist/types/database/database-connection.class.d.ts.map +1 -1
  115. package/dist/types/database/database-plugins.class.d.ts.map +1 -1
  116. package/dist/types/plugins/concerns/s3-mutex.class.d.ts +30 -0
  117. package/dist/types/plugins/concerns/s3-mutex.class.d.ts.map +1 -0
  118. package/dist/types/plugins/eventual-consistency/consolidation.d.ts.map +1 -1
  119. package/dist/types/plugins/eventual-consistency/garbage-collection.d.ts.map +1 -1
  120. package/dist/types/plugins/queue-consumer.plugin.d.ts.map +1 -1
  121. package/dist/types/plugins/recon/managers/scheduler-manager.d.ts.map +1 -1
  122. package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts +90 -0
  123. package/dist/types/plugins/recon/stages/recker-asn-stage.d.ts.map +1 -0
  124. package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts +125 -0
  125. package/dist/types/plugins/recon/stages/recker-dns-stage.d.ts.map +1 -0
  126. package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts +96 -0
  127. package/dist/types/plugins/recon/stages/recker-scrape-stage.d.ts.map +1 -0
  128. package/dist/types/plugins/replicator.plugin.d.ts.map +1 -1
  129. package/dist/types/plugins/replicators/base-replicator.class.d.ts.map +1 -1
  130. package/dist/types/plugins/spider/recker-link-discoverer.d.ts +54 -0
  131. package/dist/types/plugins/spider/recker-link-discoverer.d.ts.map +1 -0
  132. package/dist/types/plugins/spider/recker-llms-validator.d.ts +105 -0
  133. package/dist/types/plugins/spider/recker-llms-validator.d.ts.map +1 -0
  134. package/dist/types/plugins/spider/recker-robots-validator.d.ts +92 -0
  135. package/dist/types/plugins/spider/recker-robots-validator.d.ts.map +1 -0
  136. package/dist/types/plugins/spider/recker-security-adapter.d.ts +83 -0
  137. package/dist/types/plugins/spider/recker-security-adapter.d.ts.map +1 -0
  138. package/dist/types/plugins/spider/recker-seo-adapter.d.ts +187 -0
  139. package/dist/types/plugins/spider/recker-seo-adapter.d.ts.map +1 -0
  140. package/dist/types/plugins/spider/recker-sitemap-validator.d.ts +121 -0
  141. package/dist/types/plugins/spider/recker-sitemap-validator.d.ts.map +1 -0
  142. package/dist/types/resource.class.d.ts.map +1 -1
  143. package/dist/types/stream/resource-reader.class.d.ts.map +1 -1
  144. package/dist/types/stream/resource-writer.class.d.ts.map +1 -1
  145. package/dist/types/tasks/tasks-pool.class.d.ts +23 -0
  146. package/dist/types/tasks/tasks-pool.class.d.ts.map +1 -1
  147. package/mcp/prompts/index.ts +275 -0
  148. package/mcp/resources/index.ts +322 -0
  149. package/mcp/tools/plugins.ts +1137 -0
  150. package/mcp/tools/streams.ts +340 -0
  151. package/package.json +20 -22
  152. package/src/clients/recker-http-handler.ts +74 -8
  153. package/src/clients/types.ts +14 -0
  154. package/src/concerns/high-performance-inserter.ts +18 -57
  155. package/src/concerns/id/alphabets.ts +175 -0
  156. package/src/concerns/id/entropy.ts +286 -0
  157. package/src/concerns/id/generators/sid.ts +90 -0
  158. package/src/concerns/id/generators/ulid.ts +249 -0
  159. package/src/concerns/id/generators/uuid-v7.ts +179 -0
  160. package/src/concerns/id/index.ts +167 -0
  161. package/src/concerns/plugin-storage.ts +144 -0
  162. package/src/concerns/s3-errors.ts +97 -0
  163. package/src/concerns/s3-key.ts +62 -0
  164. package/src/concerns/safe-merge.ts +60 -0
  165. package/src/core/resource-config-validator.ts +9 -2
  166. package/src/core/resource-partitions.class.ts +14 -1
  167. package/src/core/resource-persistence.class.ts +47 -13
  168. package/src/core/resource-query.class.ts +21 -46
  169. package/src/database/database-connection.class.ts +7 -6
  170. package/src/database/database-plugins.class.ts +15 -13
  171. package/src/plugins/concerns/s3-mutex.class.ts +228 -0
  172. package/src/plugins/eventual-consistency/consolidation.ts +8 -7
  173. package/src/plugins/eventual-consistency/garbage-collection.ts +7 -6
  174. package/src/plugins/queue-consumer.plugin.ts +21 -19
  175. package/src/plugins/recon/managers/scheduler-manager.ts +7 -5
  176. package/src/plugins/recon/stages/recker-asn-stage.ts +385 -0
  177. package/src/plugins/recon/stages/recker-dns-stage.ts +360 -0
  178. package/src/plugins/recon/stages/recker-scrape-stage.ts +509 -0
  179. package/src/plugins/replicator.plugin.ts +41 -35
  180. package/src/plugins/replicators/base-replicator.class.ts +17 -23
  181. package/src/plugins/spider/recker-link-discoverer.ts +645 -0
  182. package/src/plugins/spider/recker-llms-validator.ts +500 -0
  183. package/src/plugins/spider/recker-robots-validator.ts +473 -0
  184. package/src/plugins/spider/recker-security-adapter.ts +489 -0
  185. package/src/plugins/spider/recker-seo-adapter.ts +605 -0
  186. package/src/plugins/spider/recker-sitemap-validator.ts +621 -0
  187. package/src/resource.class.ts +2 -0
  188. package/src/stream/resource-reader.class.ts +10 -8
  189. package/src/stream/resource-writer.class.ts +10 -8
  190. package/src/tasks/tasks-pool.class.ts +46 -0
@@ -0,0 +1,621 @@
1
+ import { gunzipSync } from 'zlib';
2
+ import { createHttpClient } from '#src/concerns/http-client.js';
3
+ import type { CrawlContext } from './crawl-context.js';
4
+ import type {
5
+ SitemapParserConfig,
6
+ SitemapEntry,
7
+ SitemapImage,
8
+ SitemapVideo,
9
+ ParseOptions,
10
+ SitemapStats,
11
+ ProbeResult,
12
+ FetcherResult
13
+ } from './sitemap-parser.js';
14
+
15
+ type ReckerSitemapUrl = { // Shape of one element of ReckerSitemapParseResult.urls; converted to SitemapEntry by _mapReckerUrlsToEntries
16
+ loc: string; // becomes SitemapEntry.url
17
+ lastmod?: string;
18
+ changefreq?: 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';
19
+ priority?: number;
20
+ images?: Array<{ // mapped to SitemapImage[] (loc -> url)
21
+ loc: string;
22
+ caption?: string;
23
+ title?: string;
24
+ }>;
25
+ videos?: Array<{ // mapped to SitemapVideo[]; url is contentLoc, else playerLoc, else null
26
+ thumbnailLoc: string;
27
+ title: string;
28
+ description: string;
29
+ contentLoc?: string;
30
+ playerLoc?: string;
31
+ }>;
32
+ news?: { // copied unchanged onto SitemapEntryExtended.news when present
33
+ publicationName: string;
34
+ publicationLanguage: string;
35
+ publicationDate: string;
36
+ title: string;
37
+ };
38
+ alternates?: Array<{ // copied onto SitemapEntryExtended.alternates only when non-empty
39
+ hreflang: string;
40
+ href: string;
41
+ }>;
42
+ };
43
+
44
+ type ReckerSitemapIndex = { // Shape of one element of ReckerSitemapParseResult.sitemaps (a child sitemap reference)
45
+ loc: string; // URL passed to parse() when the index is followed
46
+ lastmod?: string;
47
+ };
48
+
49
+ type ReckerSitemapParseResult = { // Return shape of ReckerParseSitemap; also stored in CacheEntry.parseResult
50
+ type: 'urlset' | 'sitemapindex' | 'unknown'; // 'sitemapindex' makes parse() follow `sitemaps`; otherwise `urls` is mapped
51
+ valid: boolean;
52
+ errors: string[];
53
+ warnings: string[];
54
+ urls: ReckerSitemapUrl[];
55
+ sitemaps: ReckerSitemapIndex[];
56
+ urlCount: number;
57
+ size: number; // NOTE(review): unit is not visible in this file - presumably bytes, confirm against recker
58
+ compressed: boolean;
59
+ };
60
+
61
+ type ReckerSitemapValidationIssue = { // One finding inside ReckerSitemapValidationResult.issues; returned by getValidationIssues()
62
+ type: 'error' | 'warning' | 'info'; // severity level
63
+ code: string;
64
+ message: string;
65
+ url?: string;
66
+ recommendation?: string;
67
+ };
68
+
69
+ type ReckerSitemapValidationResult = { // Return shape of ReckerValidateSitemap; also stored in CacheEntry.validationResult
70
+ valid: boolean;
71
+ issues: ReckerSitemapValidationIssue[];
72
+ parseResult: ReckerSitemapParseResult;
73
+ };
74
+
75
+ type ReckerParseSitemap = (content: string, compressed?: boolean) => ReckerSitemapParseResult; // signature expected of the `parseSitemap` export of 'recker/seo/validators/sitemap'
76
+ type ReckerValidateSitemap = (content: string, baseUrl?: string) => ReckerSitemapValidationResult; // signature expected of the `validateSitemap` export of the same module
77
+ type ReckerDiscoverSitemaps = ( // signature expected of the `discoverSitemaps` export; resolves to sitemap URLs
78
+ baseUrl: string,
79
+ robotsTxtContent?: string,
80
+ fetcher?: (url: string) => Promise<{ status: number; text: string }>
81
+ ) => Promise<string[]>;
82
+ type ReckerFetchAndValidateSitemap = ( // signature expected of the `fetchAndValidateSitemap` export; stored on the class but not called in this file
83
+ url: string,
84
+ fetcher?: (url: string) => Promise<{ status: number; text: string; headers?: Record<string, string> }>
85
+ ) => Promise<ReckerSitemapValidationResult & { exists: boolean; status?: number }>;
86
+
87
+ interface CacheEntry { // Value stored in ReckerSitemapValidator.cache, keyed by sitemap URL
88
+ entries: SitemapEntry[];
89
+ parseResult: ReckerSitemapParseResult | null;
90
+ validationResult: ReckerSitemapValidationResult | null;
91
+ timestamp: number; // Date.now() at insertion; compared against config.cacheTimeout on lookup
92
+ format: string; // set to parseResult.type by parse()
93
+ }
94
+
95
+ interface HttpClient { // Minimal view of the client returned by createHttpClient; only get() is used in this file
96
+ get(url: string): Promise<HttpResponse>;
97
+ }
98
+
99
+ interface HttpResponse { // Minimal response surface consumed by _fetch
100
+ ok: boolean; // when false, _fetch throws `HTTP <status>`
101
+ status: number;
102
+ headers: Headers;
103
+ text(): Promise<string>; // used for bodies not treated as gzip
104
+ arrayBuffer(): Promise<ArrayBuffer>; // used when the URL ends with .gz or content-type mentions gzip
105
+ }
106
+
107
+ interface Headers { // Module-local minimal header accessor; only get('content-type') is read in this file
108
+ get(name: string): string | null;
109
+ }
110
+
111
+ export interface SitemapValidationDetails { // Flattened result of validate(): validation verdict plus selected parse-result fields
112
+ valid: boolean; // from validationResult.valid
113
+ issues: ReckerSitemapValidationIssue[]; // from validationResult.issues
114
+ type: 'urlset' | 'sitemapindex' | 'unknown'; // this and the fields below come from parseResult
115
+ urlCount: number;
116
+ size: number;
117
+ compressed: boolean;
118
+ errors: string[];
119
+ warnings: string[];
120
+ }
121
+
122
+ export interface SitemapEntryExtended extends SitemapEntry { // SitemapEntry plus the optional news/alternates data carried by ReckerSitemapUrl
123
+ news?: { // filtered on by getNewsEntries()
124
+ publicationName: string;
125
+ publicationLanguage: string;
126
+ publicationDate: string;
127
+ title: string;
128
+ };
129
+ alternates?: Array<{ // grouped by hreflang in getAlternateLanguages()
130
+ hreflang: string;
131
+ href: string;
132
+ }>;
133
+ }
134
+
135
/**
 * Sitemap parser and validator backed by the optional `recker` package.
 *
 * On first use the class tries to import `recker/seo/validators/sitemap`.
 * When the import succeeds, parsing, validation and discovery go through
 * recker's helpers. When it fails, `parse`, `discoverFromRobotsTxt` and
 * `probeCommonLocations` delegate to the local `SitemapParser`, and the
 * validation-only methods return `null` (or an empty list).
 *
 * Parsed sitemaps are cached per URL for `config.cacheTimeout` milliseconds.
 */
export class ReckerSitemapValidator {
  private config: Required<Omit<SitemapParserConfig, 'context' | 'fetcher'>> & {
    context: CrawlContext | null;
    fetcher: ((url: string) => Promise<FetcherResult>) | null;
  };
  private _context: CrawlContext | null;
  private cache: Map<string, CacheEntry>;
  private fetcher: ((url: string) => Promise<FetcherResult>) | null;
  private _httpClient: HttpClient | null;
  private stats: {
    sitemapsParsed: number;
    urlsExtracted: number;
    errors: number;
  };

  // `null` means "not probed yet"; set once by _checkReckerAvailability().
  private reckerAvailable: boolean | null = null;
  private parseSitemap: ReckerParseSitemap | null = null;
  private validateSitemap: ReckerValidateSitemap | null = null;
  private discoverSitemaps: ReckerDiscoverSitemaps | null = null;
  private fetchAndValidateSitemap: ReckerFetchAndValidateSitemap | null = null;
  private fallbackParser: import('./sitemap-parser.js').SitemapParser | null = null;

  /**
   * @param config Parser options. Falsy values (including `0` and `''`) fall
   *   back to the defaults below; `followSitemapIndex` defaults to `true`
   *   unless it is explicitly `false`.
   */
  constructor(config: SitemapParserConfig = {}) {
    this.config = {
      userAgent: config.userAgent || 's3db-spider',
      fetchTimeout: config.fetchTimeout || 30000,
      maxSitemaps: config.maxSitemaps || 50,
      maxUrls: config.maxUrls || 50000,
      followSitemapIndex: config.followSitemapIndex !== false,
      cacheTimeout: config.cacheTimeout || 3600000,
      context: config.context || null,
      fetcher: config.fetcher || null
    };

    this._context = this.config.context;
    this.cache = new Map();
    this.fetcher = this.config.fetcher;
    this._httpClient = null;

    this.stats = {
      sitemapsParsed: 0,
      urlsExtracted: 0,
      errors: 0
    };
  }

  /**
   * Lazily imports recker's sitemap module and memoizes the outcome.
   *
   * @returns `true` when the module was imported and its helpers were stored.
   */
  private async _checkReckerAvailability(): Promise<boolean> {
    if (this.reckerAvailable !== null) {
      return this.reckerAvailable;
    }

    try {
      const sitemapModule = await import('recker/seo/validators/sitemap');
      this.parseSitemap = sitemapModule.parseSitemap;
      this.validateSitemap = sitemapModule.validateSitemap;
      this.discoverSitemaps = sitemapModule.discoverSitemaps;
      this.fetchAndValidateSitemap = sitemapModule.fetchAndValidateSitemap;
      this.reckerAvailable = true;
    } catch {
      // Any import failure is treated as "recker not installed".
      this.reckerAvailable = false;
    }
    return this.reckerAvailable;
  }

  /** Lazily creates the local `SitemapParser` used when recker is unavailable. */
  private async _getFallbackParser(): Promise<import('./sitemap-parser.js').SitemapParser> {
    if (!this.fallbackParser) {
      const { SitemapParser } = await import('./sitemap-parser.js');
      this.fallbackParser = new SitemapParser(this.config);
    }
    return this.fallbackParser;
  }

  /**
   * Replaces the custom fetcher, forwarding it to the fallback parser if one
   * has already been created.
   */
  setFetcher(fetcher: (url: string) => Promise<FetcherResult>): void {
    this.fetcher = fetcher;
    this.fallbackParser?.setFetcher(fetcher);
  }

  /**
   * Fetches and parses a sitemap (or sitemap index) and returns its entries.
   *
   * - Results are served from the cache while they are younger than
   *   `config.cacheTimeout`.
   * - Returns `[]` when `_depth` exceeds `maxDepth` or the `maxUrls` budget is
   *   already used up.
   * - For a sitemap index, child sitemaps are followed unless
   *   `options.recursive` is `false`, in which case the child sitemap
   *   references themselves are returned.
   *
   * @throws Re-throws any fetch/parse error after counting it in `stats.errors`.
   */
  async parse(sitemapUrl: string, options: ParseOptions = {}): Promise<SitemapEntry[]> {
    if (!(await this._checkReckerAvailability())) {
      const fallback = await this._getFallbackParser();
      return fallback.parse(sitemapUrl, options);
    }

    const opts = {
      recursive: options.recursive !== false,
      maxDepth: options.maxDepth || 3,
      _depth: options._depth || 0
    };

    const cached = this.cache.get(sitemapUrl);
    if (cached && Date.now() - cached.timestamp < this.config.cacheTimeout) {
      return cached.entries;
    }

    if (opts._depth > opts.maxDepth) {
      return [];
    }

    if (this.stats.urlsExtracted >= this.config.maxUrls) {
      return [];
    }

    try {
      const { content, compressed } = await this._fetch(sitemapUrl);

      const parseResult = this.parseSitemap!(content, compressed);
      const validationResult = this.validateSitemap!(content, sitemapUrl);

      let entries: SitemapEntry[];

      if (parseResult.type === 'sitemapindex') {
        // _parseReckerIndex handles both modes: it follows the children when
        // recursive, otherwise it lists them. URLs found in children are
        // counted by the child parse() calls, so they are not counted again here.
        entries = await this._parseReckerIndex(parseResult, opts);
      } else {
        entries = this._mapReckerUrlsToEntries(parseResult.urls, 'sitemap');
        this.stats.urlsExtracted += entries.length;
      }

      this.stats.sitemapsParsed++;

      this.cache.set(sitemapUrl, {
        entries,
        parseResult,
        validationResult,
        timestamp: Date.now(),
        format: parseResult.type
      });

      return entries;
    } catch (error) {
      this.stats.errors++;
      throw error;
    }
  }

  /**
   * Resolves a sitemap index into entries.
   *
   * Non-recursive: returns one `type: 'sitemap'` entry per child reference.
   * Recursive: parses up to `config.maxSitemaps` children sequentially,
   * stopping early once `config.maxUrls` URLs have been extracted. A failing
   * child is skipped; its error is already counted by `parse()`.
   */
  private async _parseReckerIndex(
    parseResult: ReckerSitemapParseResult,
    opts: { recursive: boolean; maxDepth: number; _depth: number }
  ): Promise<SitemapEntry[]> {
    if (!opts.recursive) {
      return parseResult.sitemaps.map(s => ({
        url: s.loc,
        lastmod: s.lastmod || null,
        source: 'sitemap-index',
        type: 'sitemap'
      }));
    }

    const collected: SitemapEntry[] = [];
    const children = parseResult.sitemaps.slice(0, this.config.maxSitemaps);

    for (const child of children) {
      if (this.stats.urlsExtracted >= this.config.maxUrls) break;

      try {
        const childEntries = await this.parse(child.loc, {
          ...opts,
          _depth: opts._depth + 1
        });
        collected.push(...childEntries);
      } catch {
        // Best effort: parse() has already incremented stats.errors for this
        // child, so incrementing again here would double-count the failure.
      }
    }

    return collected;
  }

  /**
   * Converts recker URL records into `SitemapEntry` objects, truncated to the
   * remaining `maxUrls` budget.
   */
  private _mapReckerUrlsToEntries(urls: ReckerSitemapUrl[], source: string): SitemapEntry[] {
    // Clamp at 0: a negative slice end would count from the end of the array
    // instead of yielding an empty result.
    const remaining = Math.max(0, this.config.maxUrls - this.stats.urlsExtracted);

    return urls.slice(0, remaining).map(url => {
      const entry: SitemapEntryExtended = {
        url: url.loc,
        lastmod: url.lastmod || null,
        changefreq: url.changefreq || null,
        priority: url.priority ?? null,
        source,
        images: url.images?.map(img => ({
          url: img.loc,
          title: img.title || null,
          caption: img.caption || null
        })) as SitemapImage[],
        videos: url.videos?.map(vid => ({
          url: vid.contentLoc || vid.playerLoc || null,
          thumbnailUrl: vid.thumbnailLoc || null,
          title: vid.title || null,
          description: vid.description || null
        })) as SitemapVideo[]
      };

      if (url.news) {
        entry.news = url.news;
      }

      if (url.alternates && url.alternates.length > 0) {
        entry.alternates = url.alternates;
      }

      return entry;
    });
  }

  /**
   * Lazily builds the HTTP client used when no custom fetcher is set.
   * Base options come from the crawl context when present, otherwise only a
   * `User-Agent` header is set.
   */
  private async _getHttpClient(): Promise<HttpClient> {
    if (!this._httpClient) {
      const baseConfig = this._context
        ? this._context.getHttpClientConfig('https://example.com')
        : {
            headers: {
              'User-Agent': this.config.userAgent
            }
          };

      this._httpClient = await createHttpClient({
        ...baseConfig,
        timeout: this.config.fetchTimeout,
        retry: {
          maxAttempts: 2,
          delay: 1000,
          backoff: 'exponential',
          retryAfter: true,
          retryOn: [429, 500, 502, 503, 504]
        }
      }) as unknown as HttpClient;
    }
    return this._httpClient;
  }

  /**
   * Retrieves a document as text, transparently gunzipping gzip payloads.
   *
   * Uses the custom fetcher when one is set, otherwise the HTTP client.
   * Binary bodies are only decompressed when they start with the gzip magic
   * bytes (0x1f 0x8b); a `.gz` URL whose body is not gzip data is decoded as
   * UTF-8 text instead of failing.
   *
   * @returns The text content and whether it had to be decompressed.
   * @throws Error `HTTP <status>` for non-OK responses, or a decompression error.
   */
  private async _fetch(url: string): Promise<{ content: string; compressed: boolean }> {
    let content: string | Buffer;

    if (this.fetcher) {
      const result = await this.fetcher(url);
      // Fetchers may return either `{ content }` or the raw body itself.
      content = result.content || (result as unknown as string);
    } else {
      const client = await this._getHttpClient();
      const response = await client.get(url);

      if (this._context) {
        this._context.processResponse(
          response as unknown as Parameters<typeof this._context.processResponse>[0],
          url
        );
      }

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }

      const contentType = response.headers.get('content-type') || '';
      // Ignore any query string / fragment when looking at the extension.
      const path = url.split(/[?#]/, 1)[0] ?? url;

      content = path.endsWith('.gz') || contentType.includes('gzip')
        ? Buffer.from(await response.arrayBuffer())
        : await response.text();
    }

    if (Buffer.isBuffer(content)) {
      if (content[0] === 0x1f && content[1] === 0x8b) {
        return { content: this._decompress(content), compressed: true };
      }
      return { content: content.toString('utf-8'), compressed: false };
    }

    return { content: content as string, compressed: false };
  }

  /**
   * Gunzips a buffer into a UTF-8 string.
   *
   * @throws Error prefixed with `Failed to decompress gzip:` on invalid data.
   */
  private _decompress(buffer: Buffer): string {
    try {
      return gunzipSync(buffer).toString('utf-8');
    } catch (error) {
      throw new Error(`Failed to decompress gzip: ${(error as Error).message}`);
    }
  }

  /** Returns a snapshot of the counters plus the current cache size. */
  getStats(): SitemapStats {
    return {
      ...this.stats,
      cacheSize: this.cache.size
    };
  }

  /** Drops one cached URL, or the whole cache when no URL is given. */
  clearCache(url?: string): void {
    if (url) {
      this.cache.delete(url);
    } else {
      this.cache.clear();
    }
  }

  /** Resets all counters to zero. */
  resetStats(): void {
    this.stats = {
      sitemapsParsed: 0,
      urlsExtracted: 0,
      errors: 0
    };
  }

  /**
   * Discovers sitemap URLs starting from a robots.txt URL.
   *
   * @returns The discovered sitemap URLs, or `[]` when fetching or discovery fails.
   */
  async discoverFromRobotsTxt(robotsTxtUrl: string): Promise<string[]> {
    if (!(await this._checkReckerAvailability())) {
      const fallback = await this._getFallbackParser();
      return fallback.discoverFromRobotsTxt(robotsTxtUrl);
    }

    try {
      const baseUrl = new URL(robotsTxtUrl).origin;
      const { content } = await this._fetch(robotsTxtUrl);

      return await this.discoverSitemaps!(baseUrl, content, async (url) => {
        // _fetch throws on non-OK responses, so reaching this point means success.
        const { content: text } = await this._fetch(url);
        return { status: 200, text };
      });
    } catch {
      return [];
    }
  }

  /**
   * Probes a fixed list of conventional sitemap/feed paths under `baseUrl`.
   *
   * A location is reported as existing when it can be fetched and handed to
   * the parser without throwing; `format` is the parser's detected type.
   */
  async probeCommonLocations(baseUrl: string): Promise<ProbeResult[]> {
    if (!(await this._checkReckerAvailability())) {
      const fallback = await this._getFallbackParser();
      return fallback.probeCommonLocations(baseUrl);
    }

    const commonPaths = [
      '/sitemap.xml',
      '/sitemap_index.xml',
      '/sitemap.xml.gz',
      '/sitemaps/sitemap.xml',
      '/sitemap.txt',
      '/feed.xml',
      '/rss.xml',
      '/atom.xml',
      '/feed',
      '/rss'
    ];

    const origin = baseUrl.replace(/\/$/, '');
    const results: ProbeResult[] = [];

    for (const path of commonPaths) {
      const url = origin + path;

      try {
        const { content, compressed } = await this._fetch(url);
        const parseResult = this.parseSitemap!(content, compressed);

        results.push({ url, exists: true, format: parseResult.type });
      } catch {
        results.push({ url, exists: false });
      }
    }

    return results;
  }

  /**
   * Validates a sitemap URL, parsing it first when no fresh cache entry exists.
   *
   * @returns Flattened validation details, or `null` when recker is
   *   unavailable, parsing fails, or no validation data was cached.
   */
  async validate(sitemapUrl: string): Promise<SitemapValidationDetails | null> {
    if (!(await this._checkReckerAvailability())) {
      return null;
    }

    try {
      let cached = this.cache.get(sitemapUrl);

      if (!cached || Date.now() - cached.timestamp >= this.config.cacheTimeout) {
        await this.parse(sitemapUrl);
        cached = this.cache.get(sitemapUrl);
      }

      if (!cached?.validationResult || !cached?.parseResult) {
        return null;
      }

      const { validationResult, parseResult } = cached;

      return {
        valid: validationResult.valid,
        issues: validationResult.issues,
        type: parseResult.type,
        urlCount: parseResult.urlCount,
        size: parseResult.size,
        compressed: parseResult.compressed,
        errors: parseResult.errors,
        warnings: parseResult.warnings
      };
    } catch {
      return null;
    }
  }

  /** Validates raw sitemap content; `null` when recker is unavailable. */
  async validateContent(content: string, baseUrl?: string): Promise<ReckerSitemapValidationResult | null> {
    const isReckerAvailable = await this._checkReckerAvailability();

    if (!isReckerAvailable || !this.validateSitemap) {
      return null;
    }

    return this.validateSitemap(content, baseUrl);
  }

  /**
   * Synchronously parses raw sitemap content.
   *
   * Returns `null` until recker has been loaded by one of the async methods
   * (this method does not trigger the import itself).
   */
  parseContent(content: string, compressed?: boolean): ReckerSitemapParseResult | null {
    if (!this.reckerAvailable || !this.parseSitemap) {
      return null;
    }

    return this.parseSitemap(content, compressed);
  }

  /** Returns the validation issues for a sitemap URL, or `[]` when unavailable. */
  async getValidationIssues(sitemapUrl: string): Promise<ReckerSitemapValidationIssue[]> {
    const validation = await this.validate(sitemapUrl);
    return validation?.issues || [];
  }

  /** Parses the sitemap and keeps only entries that carry `news` data. */
  async getNewsEntries(sitemapUrl: string): Promise<SitemapEntryExtended[]> {
    const entries = await this.parse(sitemapUrl);
    return (entries as SitemapEntryExtended[]).filter(e => e.news);
  }

  /**
   * Parses the sitemap and groups entries by the `hreflang` of each of their
   * alternates. An entry appears once per alternate it declares.
   */
  async getAlternateLanguages(sitemapUrl: string): Promise<Map<string, SitemapEntryExtended[]>> {
    const entries = await this.parse(sitemapUrl);
    const byLanguage = new Map<string, SitemapEntryExtended[]>();

    for (const entry of entries as SitemapEntryExtended[]) {
      for (const alt of entry.alternates ?? []) {
        const bucket = byLanguage.get(alt.hreflang);
        if (bucket) {
          bucket.push(entry);
        } else {
          byLanguage.set(alt.hreflang, [entry]);
        }
      }
    }

    return byLanguage;
  }

  /**
   * Runs robots.txt discovery and common-location probing concurrently.
   *
   * @returns Both raw result sets plus `all`, the de-duplicated union of the
   *   robots.txt URLs and the probed URLs that exist.
   */
  async discoverAll(baseUrl: string): Promise<{
    fromRobots: string[];
    fromProbing: ProbeResult[];
    all: string[];
  }> {
    const robotsTxtUrl = `${baseUrl.replace(/\/$/, '')}/robots.txt`;

    const [fromRobots, fromProbing] = await Promise.all([
      this.discoverFromRobotsTxt(robotsTxtUrl),
      this.probeCommonLocations(baseUrl)
    ]);

    const foundFromProbing = fromProbing
      .filter(p => p.exists)
      .map(p => p.url);

    const all = [...new Set([...fromRobots, ...foundFromProbing])];

    return { fromRobots, fromProbing, all };
  }

  /** `true` only after recker has been successfully imported. */
  isReckerEnabled(): boolean {
    return this.reckerAvailable === true;
  }
}
620
+
621
+ export default ReckerSitemapValidator;
@@ -22,6 +22,7 @@ import tryFn, { tryFnSync } from './concerns/try-fn.js';
22
22
  import { ResourceReader, ResourceWriter } from './stream/index.js';
23
23
  import { getBehavior, DEFAULT_BEHAVIOR } from './behaviors/index.js';
24
24
  import { idGenerator as defaultIdGenerator } from './concerns/id.js';
25
+ import { validateS3KeySegment } from './concerns/s3-key.js';
25
26
  import { ResourceError, PartitionError } from './errors.js';
26
27
  import { createLogger, type Logger, type LogLevel as LoggerLogLevel } from './concerns/logger.js';
27
28
  import { validateResourceConfig } from './core/resource-config-validator.js';
@@ -782,6 +783,7 @@ export class Resource extends AsyncEventEmitter implements Disposable {
782
783
  }
783
784
 
784
785
  getResourceKey(id: string): string {
786
+ validateS3KeySegment(id, 'id');
785
787
  const key = join('resource=' + this.name, 'data', `id=${id}`);
786
788
  return key;
787
789
  }
@@ -1,8 +1,8 @@
1
1
  import EventEmitter from "events";
2
2
  import { Transform, TransformCallback } from "stream";
3
- import { PromisePool } from "@supercharge/promise-pool";
4
3
 
5
4
  import { ResourceIdsPageReader } from "./resource-ids-page-reader.class.js";
5
+ import { TasksPool } from '../tasks/tasks-pool.class.js';
6
6
  import tryFn from "../concerns/try-fn.js";
7
7
  import { StreamError } from '../errors.js';
8
8
 
@@ -86,16 +86,18 @@ export class ResourceReader extends EventEmitter {
86
86
 
87
87
  async _transform(chunk: string[], _encoding: BufferEncoding, callback: TransformCallback): Promise<void> {
88
88
  const [, err] = await tryFn(async () => {
89
- await PromisePool.for(chunk)
90
- .withConcurrency(this.concurrency)
91
- .handleError(async (error, content) => {
92
- this.emit("error", error, content);
93
- })
94
- .process(async (id) => {
89
+ await TasksPool.map(
90
+ chunk,
91
+ async (id) => {
95
92
  const data = await this.resource.get(id);
96
93
  this.transform.push(data);
97
94
  return data;
98
- });
95
+ },
96
+ {
97
+ concurrency: this.concurrency,
98
+ onItemError: (error, id) => this.emit("error", error, id)
99
+ }
100
+ );
99
101
  });
100
102
  callback(err as Error | null | undefined);
101
103
  }
@@ -1,6 +1,6 @@
1
1
  import EventEmitter from "events";
2
2
  import { Writable, WritableOptions } from 'stream';
3
- import { PromisePool } from '@supercharge/promise-pool';
3
+ import { TasksPool } from '../tasks/tasks-pool.class.js';
4
4
  import tryFn from "../concerns/try-fn.js";
5
5
 
6
6
  interface S3Client {
@@ -81,12 +81,9 @@ export class ResourceWriter extends EventEmitter {
81
81
  while (this.buffer.length > 0) {
82
82
  const batch = this.buffer.splice(0, this.batchSize);
83
83
  const [ok, err] = await tryFn(async () => {
84
- await PromisePool.for(batch)
85
- .withConcurrency(this.concurrency)
86
- .handleError(async (error, content) => {
87
- this.emit("error", error, content);
88
- })
89
- .process(async (item) => {
84
+ await TasksPool.map(
85
+ batch,
86
+ async (item) => {
90
87
  const [insertOk, insertErr, result] = await tryFn(async () => {
91
88
  const res = await this.resource.insert(item);
92
89
  return res;
@@ -96,7 +93,12 @@ export class ResourceWriter extends EventEmitter {
96
93
  return null;
97
94
  }
98
95
  return result;
99
- });
96
+ },
97
+ {
98
+ concurrency: this.concurrency,
99
+ onItemError: (error, item) => this.emit("error", error, item)
100
+ }
101
+ );
100
102
  });
101
103
  if (!ok) {
102
104
  this.emit('error', err);