@jambudipa/spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +21 -0
  2. package/README.md +426 -0
  3. package/dist/index.d.ts +33 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +4681 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
  8. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
  9. package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
  10. package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
  11. package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
  12. package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
  13. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
  14. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
  15. package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
  16. package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
  17. package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
  18. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
  19. package/dist/lib/HttpClient/index.d.ts +8 -0
  20. package/dist/lib/HttpClient/index.d.ts.map +1 -0
  21. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
  22. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
  23. package/dist/lib/LinkExtractor/index.d.ts +37 -0
  24. package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
  25. package/dist/lib/Logging/FetchLogger.d.ts +8 -0
  26. package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
  27. package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
  28. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
  29. package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
  30. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
  31. package/dist/lib/PageData/PageData.d.ts +28 -0
  32. package/dist/lib/PageData/PageData.d.ts.map +1 -0
  33. package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
  34. package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
  35. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
  36. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
  37. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
  38. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
  39. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
  40. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
  41. package/dist/lib/Resumability/index.d.ts +51 -0
  42. package/dist/lib/Resumability/index.d.ts.map +1 -0
  43. package/dist/lib/Resumability/strategies.d.ts +76 -0
  44. package/dist/lib/Resumability/strategies.d.ts.map +1 -0
  45. package/dist/lib/Resumability/types.d.ts +201 -0
  46. package/dist/lib/Resumability/types.d.ts.map +1 -0
  47. package/dist/lib/Robots/Robots.service.d.ts +78 -0
  48. package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
  49. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
  50. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
  51. package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
  52. package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
  53. package/dist/lib/Spider/Spider.service.d.ts +194 -0
  54. package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
  55. package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
  56. package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
  57. package/dist/lib/StateManager/index.d.ts +5 -0
  58. package/dist/lib/StateManager/index.d.ts.map +1 -0
  59. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
  60. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
  61. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
  62. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
  63. package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
  64. package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
  65. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
  66. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
  67. package/dist/lib/api-facades.d.ts +313 -0
  68. package/dist/lib/api-facades.d.ts.map +1 -0
  69. package/dist/lib/errors.d.ts +99 -0
  70. package/dist/lib/errors.d.ts.map +1 -0
  71. package/package.json +108 -0
package/dist/index.js ADDED
@@ -0,0 +1,4681 @@
1
+ import { Effect, Layer, MutableHashSet, Schema, Data, Context, Console, MutableHashMap, Option, Queue, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Ref } from "effect";
2
+ import * as cheerio from "cheerio";
3
+ import * as fs from "fs";
4
+ import * as path from "path";
5
+ import * as fs$1 from "fs/promises";
6
+ import { CookieJar } from "tough-cookie";
7
+ class SpiderConfig extends Effect.Service()(
8
+ "@jambudipa.io/SpiderConfig",
9
+ {
10
+ effect: Effect.sync(() => makeSpiderConfig({}))
11
+ }
12
+ ) {
13
+ /**
14
+ * Creates a Layer that provides SpiderConfig with custom options
15
+ * @param config - The configuration options or a pre-made SpiderConfigService
16
+ */
17
+ static Live = (config) => Layer.effect(
18
+ SpiderConfig,
19
+ Effect.succeed("getOptions" in config ? config : makeSpiderConfig(config))
20
+ );
21
+ }
22
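// A minimal usage sketch of the layer above (not part of the published
// bundle; assumes the rest of this module is in scope). Option names are
// taken from the defaults defined later in makeSpiderConfig.
const configLayer = SpiderConfig.Live({
  maxConcurrentWorkers: 2,
  requestDelayMs: 500,
  userAgent: "MyCrawler/1.0"
});
const readConfig = Effect.gen(function* () {
  const config = yield* SpiderConfig;
  return yield* config.getOptions();
});
Effect.runSync(Effect.provide(readConfig, configLayer)).userAgent;
// => "MyCrawler/1.0"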
+ const FILE_EXTENSION_CATEGORIES = {
23
+ /** Archive files (8 extensions) */
24
+ archives: [".7z", ".7zip", ".bz2", ".rar", ".tar", ".tar.gz", ".xz", ".zip"],
25
+ /** Image files (20 extensions) */
26
+ images: [
27
+ ".mng",
28
+ ".pct",
29
+ ".bmp",
30
+ ".gif",
31
+ ".jpg",
32
+ ".jpeg",
33
+ ".png",
34
+ ".pst",
35
+ ".psp",
36
+ ".tif",
37
+ ".tiff",
38
+ ".ai",
39
+ ".drw",
40
+ ".dxf",
41
+ ".eps",
42
+ ".ps",
43
+ ".svg",
44
+ ".cdr",
45
+ ".ico",
46
+ ".webp"
47
+ ],
48
+ /** Audio files (9 extensions) */
49
+ audio: [
50
+ ".mp3",
51
+ ".wma",
52
+ ".ogg",
53
+ ".wav",
54
+ ".ra",
55
+ ".aac",
56
+ ".mid",
57
+ ".au",
58
+ ".aiff"
59
+ ],
60
+ /** Video files (15 extensions) */
61
+ video: [
62
+ ".3gp",
63
+ ".asf",
64
+ ".asx",
65
+ ".avi",
66
+ ".mov",
67
+ ".mp4",
68
+ ".mpg",
69
+ ".qt",
70
+ ".rm",
71
+ ".swf",
72
+ ".wmv",
73
+ ".m4a",
74
+ ".m4v",
75
+ ".flv",
76
+ ".webm"
77
+ ],
78
+ /** Office documents (21 extensions) */
79
+ officeDocuments: [
80
+ ".xls",
81
+ ".xlsm",
82
+ ".xlsx",
83
+ ".xltm",
84
+ ".xltx",
85
+ ".potm",
86
+ ".potx",
87
+ ".ppt",
88
+ ".pptm",
89
+ ".pptx",
90
+ ".pps",
91
+ ".doc",
92
+ ".docb",
93
+ ".docm",
94
+ ".docx",
95
+ ".dotm",
96
+ ".dotx",
97
+ ".odt",
98
+ ".ods",
99
+ ".odg",
100
+ ".odp"
101
+ ],
102
+ /** Other files (18 extensions) */
103
+ other: [
104
+ ".css",
105
+ ".pdf",
106
+ ".exe",
107
+ ".bin",
108
+ ".rss",
109
+ ".dmg",
110
+ ".iso",
111
+ ".apk",
112
+ ".jar",
113
+ ".sh",
114
+ ".rb",
115
+ ".js",
116
+ ".hta",
117
+ ".bat",
118
+ ".cpl",
119
+ ".msi",
120
+ ".msp",
121
+ ".py"
122
+ ]
123
+ };
124
+ const generateSkipExtensions = (filters) => {
125
+ const skipExtensions = [];
126
+ if (filters.filterArchives) {
127
+ skipExtensions.push(...FILE_EXTENSION_CATEGORIES.archives);
128
+ }
129
+ if (filters.filterImages) {
130
+ skipExtensions.push(...FILE_EXTENSION_CATEGORIES.images);
131
+ }
132
+ if (filters.filterAudio) {
133
+ skipExtensions.push(...FILE_EXTENSION_CATEGORIES.audio);
134
+ }
135
+ if (filters.filterVideo) {
136
+ skipExtensions.push(...FILE_EXTENSION_CATEGORIES.video);
137
+ }
138
+ if (filters.filterOfficeDocuments) {
139
+ skipExtensions.push(...FILE_EXTENSION_CATEGORIES.officeDocuments);
140
+ }
141
+ if (filters.filterOther) {
142
+ skipExtensions.push(...FILE_EXTENSION_CATEGORIES.other);
143
+ }
144
+ return skipExtensions;
145
+ };
146
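// A minimal sketch (not part of the published bundle): the boolean filter
// flags expand into a flat list of extensions to skip.
const imageAndAudioSkips = generateSkipExtensions({
  filterImages: true,
  filterAudio: true
});
// imageAndAudioSkips contains the image and audio categories concatenated
// ([".mng", ".pct", ..., ".webp", ".mp3", ..., ".aiff"]); omitted flags
// contribute nothing.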
+ const makeSpiderConfig = (options = {}) => {
147
+ const defaultFileExtensionFilters = {
148
+ filterArchives: true,
149
+ filterImages: true,
150
+ filterAudio: true,
151
+ filterVideo: true,
152
+ filterOfficeDocuments: true,
153
+ filterOther: true
154
+ };
155
+ const defaultTechnicalFilters = {
156
+ filterUnsupportedSchemes: true,
157
+ filterLongUrls: true,
158
+ maxUrlLength: 2083,
159
+ // Scrapy's default
160
+ filterMalformedUrls: true
161
+ };
162
+ const defaultOptions = {
163
+ ignoreRobotsTxt: false,
164
+ maxConcurrentWorkers: 5,
165
+ concurrency: 4,
166
+ requestDelayMs: 1e3,
167
+ maxRobotsCrawlDelayMs: 2e3,
168
+ // Maximum 2 seconds for robots.txt crawl delay
169
+ userAgent: "JambudipaSpider/1.0",
170
+ allowedProtocols: ["http:", "https:", "file:", "ftp:"],
171
+ // Scrapy's allowed schemes
172
+ followRedirects: true,
173
+ respectNoFollow: true,
174
+ fileExtensionFilters: defaultFileExtensionFilters,
175
+ technicalFilters: defaultTechnicalFilters,
176
+ maxConcurrentRequests: 10,
177
+ maxRequestsPerSecondPerDomain: 2,
178
+ normalizeUrlsForDeduplication: true,
179
+ enableResumability: false
180
+ };
181
+ const config = {
182
+ ...defaultOptions,
183
+ ...options,
184
+ // Merge nested objects properly
185
+ fileExtensionFilters: options.fileExtensionFilters ? {
186
+ ...defaultOptions.fileExtensionFilters,
187
+ ...options.fileExtensionFilters
188
+ } : defaultOptions.fileExtensionFilters,
189
+ technicalFilters: options.technicalFilters ? {
190
+ ...defaultOptions.technicalFilters,
191
+ ...options.technicalFilters
192
+ } : defaultOptions.technicalFilters
193
+ };
194
+ const skipExtensions = config.skipFileExtensions || generateSkipExtensions(
195
+ config.fileExtensionFilters ?? defaultFileExtensionFilters
196
+ );
197
+ return {
198
+ getOptions: () => Effect.succeed(config),
199
+ shouldFollowUrl: (urlString, fromUrl, restrictToStartingDomain) => Effect.sync(() => {
200
+ try {
201
+ const url = new URL(urlString);
202
+ const fromUrlParsed = fromUrl ? new URL(fromUrl) : void 0;
203
+ const techFilters = config.technicalFilters ?? defaultTechnicalFilters;
204
+ if (restrictToStartingDomain) {
205
+ const startingDomain = new URL(restrictToStartingDomain).hostname;
206
+ const isAllowedDomain = url.hostname === startingDomain || url.hostname.endsWith(`.${startingDomain}`);
207
+ if (!isAllowedDomain) {
208
+ return {
209
+ follow: false,
210
+ reason: `Domain ${url.hostname} restricted to starting domain ${startingDomain}`
211
+ };
212
+ }
213
+ }
214
+ if (techFilters.filterLongUrls && urlString.length > techFilters.maxUrlLength) {
215
+ return {
216
+ follow: false,
217
+ reason: `URL length ${urlString.length} exceeds maximum ${techFilters.maxUrlLength}`
218
+ };
219
+ }
220
+ if (techFilters.filterUnsupportedSchemes && !config.allowedProtocols.includes(url.protocol)) {
221
+ return {
222
+ follow: false,
223
+ reason: `Protocol ${url.protocol} not in allowed schemes: ${config.allowedProtocols.join(", ")}`
224
+ };
225
+ }
226
+ if (config.allowedDomains && config.allowedDomains.length > 0) {
227
+ const isDomainAllowed = config.allowedDomains.some(
228
+ (domain) => url.hostname === domain || url.hostname.endsWith(`.${domain}`)
229
+ );
230
+ if (!isDomainAllowed) {
231
+ return {
232
+ follow: false,
233
+ reason: `Domain ${url.hostname} not in allowlist`
234
+ };
235
+ }
236
+ }
237
+ if (config.blockedDomains && config.blockedDomains.length > 0) {
238
+ const isDomainBlocked = config.blockedDomains.some(
239
+ (domain) => url.hostname === domain || url.hostname.endsWith(`.${domain}`)
240
+ );
241
+ if (isDomainBlocked) {
242
+ return {
243
+ follow: false,
244
+ reason: `Domain ${url.hostname} is blocked`
245
+ };
246
+ }
247
+ }
248
+ if (config.customUrlFilters && config.customUrlFilters.length > 0) {
249
+ for (const pattern of config.customUrlFilters) {
250
+ if (pattern.test(urlString)) {
251
+ return {
252
+ follow: false,
253
+ reason: `URL matches custom filter pattern: ${pattern}`
254
+ };
255
+ }
256
+ }
257
+ }
258
+ if (fromUrlParsed && url.hostname === fromUrlParsed.hostname && url.pathname === fromUrlParsed.pathname && url.search === fromUrlParsed.search && url.hash) {
259
+ return {
260
+ follow: false,
261
+ reason: "Fragment-only link to same page"
262
+ };
263
+ }
264
+ const pathname = url.pathname.toLowerCase();
265
+ if (skipExtensions.some((ext) => pathname.endsWith(ext.toLowerCase()))) {
266
+ const filterReasons = [];
267
+ if (config.fileExtensionFilters?.filterArchives && FILE_EXTENSION_CATEGORIES.archives.some(
268
+ (ext) => pathname.endsWith(ext.toLowerCase())
269
+ )) {
270
+ filterReasons.push("archive");
271
+ }
272
+ if (config.fileExtensionFilters?.filterImages && FILE_EXTENSION_CATEGORIES.images.some(
273
+ (ext) => pathname.endsWith(ext.toLowerCase())
274
+ )) {
275
+ filterReasons.push("image");
276
+ }
277
+ if (config.fileExtensionFilters?.filterAudio && FILE_EXTENSION_CATEGORIES.audio.some(
278
+ (ext) => pathname.endsWith(ext.toLowerCase())
279
+ )) {
280
+ filterReasons.push("audio");
281
+ }
282
+ if (config.fileExtensionFilters?.filterVideo && FILE_EXTENSION_CATEGORIES.video.some(
283
+ (ext) => pathname.endsWith(ext.toLowerCase())
284
+ )) {
285
+ filterReasons.push("video");
286
+ }
287
+ if (config.fileExtensionFilters?.filterOfficeDocuments && FILE_EXTENSION_CATEGORIES.officeDocuments.some(
288
+ (ext) => pathname.endsWith(ext.toLowerCase())
289
+ )) {
290
+ filterReasons.push("office document");
291
+ }
292
+ if (config.fileExtensionFilters?.filterOther && FILE_EXTENSION_CATEGORIES.other.some(
293
+ (ext) => pathname.endsWith(ext.toLowerCase())
294
+ )) {
295
+ filterReasons.push("other file type");
296
+ }
297
+ const reason = filterReasons.length > 0 ? `Filtered ${filterReasons.join("/")} file extension` : "File extension not suitable for crawling";
298
+ return {
299
+ follow: false,
300
+ reason
301
+ };
302
+ }
303
+ return { follow: true };
304
+ } catch (error) {
305
+ if (config.technicalFilters?.filterMalformedUrls) {
306
+ return {
307
+ follow: false,
308
+ reason: `Malformed URL: ${error instanceof Error ? error.message : "Unknown parsing error"}`
309
+ };
310
+ } else {
311
+ return { follow: true };
312
+ }
313
+ }
314
+ }),
315
+ getUserAgent: () => Effect.succeed(config.userAgent),
316
+ getRequestDelay: () => Effect.succeed(config.requestDelayMs),
317
+ getMaxRobotsCrawlDelay: () => Effect.succeed(config.maxRobotsCrawlDelayMs),
318
+ shouldIgnoreRobotsTxt: () => Effect.succeed(config.ignoreRobotsTxt),
319
+ getMaxConcurrentWorkers: () => Effect.succeed(config.maxConcurrentWorkers),
320
+ getMaxDepth: () => Effect.succeed(config.maxDepth),
321
+ getMaxPages: () => Effect.succeed(config.maxPages),
322
+ shouldFollowRedirects: () => Effect.succeed(config.followRedirects),
323
+ shouldRespectNoFollow: () => Effect.succeed(config.respectNoFollow),
324
+ getSkipFileExtensions: () => Effect.succeed(config.skipFileExtensions || []),
325
+ getMaxConcurrentRequests: () => Effect.succeed(config.maxConcurrentRequests),
326
+ getMaxRequestsPerSecondPerDomain: () => Effect.succeed(config.maxRequestsPerSecondPerDomain),
327
+ shouldNormalizeUrlsForDeduplication: () => Effect.succeed(config.normalizeUrlsForDeduplication),
328
+ getConcurrency: () => Effect.succeed(config.concurrency),
329
+ isResumabilityEnabled: () => Effect.succeed(config.enableResumability)
330
+ };
331
+ };
332
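// A minimal sketch (not part of the published bundle): querying the URL
// filter through the service. shouldFollowUrl resolves to { follow, reason? }.
const followCheck = Effect.gen(function* () {
  const config = yield* SpiderConfig;
  return yield* config.shouldFollowUrl(
    "https://ads.example.com/banner",
    "https://example.com/",
    undefined
  );
});
const decision = Effect.runSync(
  Effect.provide(
    followCheck,
    SpiderConfig.Live({ blockedDomains: ["ads.example.com"] })
  )
);
// decision => { follow: false, reason: "Domain ads.example.com is blocked" }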
+ class UrlDeduplicatorService extends Effect.Service()(
333
+ "@jambudipa.io/UrlDeduplicatorService",
334
+ {
335
+ effect: Effect.gen(function* () {
336
+ const config = yield* SpiderConfig;
337
+ const shouldNormalize = yield* config.shouldNormalizeUrlsForDeduplication();
338
+ const seenUrls = MutableHashSet.empty();
339
+ const mutex = yield* Effect.makeSemaphore(1);
340
+ const normalizeUrl = (url) => {
341
+ if (!shouldNormalize) {
342
+ return url;
343
+ }
344
+ try {
345
+ const parsed = new URL(url);
346
+ let normalizedPath = parsed.pathname.replace(/\/+/g, "/").replace(/\/$/, "");
347
+ if (normalizedPath === "") {
348
+ normalizedPath = "/";
349
+ }
350
+ parsed.pathname = normalizedPath;
351
+ parsed.hash = "";
352
+ if (parsed.protocol === "http:" && parsed.port === "80" || parsed.protocol === "https:" && parsed.port === "443") {
353
+ parsed.port = "";
354
+ }
355
+ if (parsed.search) {
356
+ const params = new URLSearchParams(parsed.search);
357
+ const sortedParams = new URLSearchParams();
358
+ Array.from(params.keys()).sort().forEach((key) => {
359
+ params.getAll(key).forEach((value) => {
360
+ sortedParams.append(key, value);
361
+ });
362
+ });
363
+ parsed.search = sortedParams.toString();
364
+ }
365
+ return parsed.toString();
366
+ } catch {
367
+ return url;
368
+ }
369
+ };
370
+ return {
371
+ tryAdd: (url) => mutex.withPermits(1)(
372
+ Effect.sync(() => {
373
+ const normalizedUrl = normalizeUrl(url);
374
+ if (MutableHashSet.has(seenUrls, normalizedUrl)) {
375
+ return false;
376
+ }
377
+ MutableHashSet.add(seenUrls, normalizedUrl);
378
+ return true;
379
+ })
380
+ ),
381
+ contains: (url) => mutex.withPermits(1)(
382
+ Effect.sync(() => {
383
+ const normalizedUrl = normalizeUrl(url);
384
+ return MutableHashSet.has(seenUrls, normalizedUrl);
385
+ })
386
+ ),
387
+ size: () => mutex.withPermits(1)(
388
+ Effect.sync(() => MutableHashSet.size(seenUrls))
389
+ ),
390
+ clear: () => mutex.withPermits(1)(
391
+ Effect.sync(() => MutableHashSet.clear(seenUrls))
392
+ )
393
+ };
394
+ }),
395
+ dependencies: [SpiderConfig.Default]
396
+ }
397
+ ) {
398
+ }
399
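// A minimal sketch (not part of the published bundle): with URL
// normalization enabled (the default), fragments, trailing slashes and
// default ports all collapse to one canonical form before deduplication.
const dedupDemo = Effect.gen(function* () {
  const dedup = yield* UrlDeduplicatorService;
  const first = yield* dedup.tryAdd("https://example.com/a/");
  const second = yield* dedup.tryAdd("https://example.com:443/a#top");
  return [first, second];
});
Effect.runSync(Effect.provide(dedupDemo, UrlDeduplicatorService.Default));
// => [true, false]  (the second URL normalizes to the same key)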
+ const PageDataSchema = Schema.Struct({
400
+ url: Schema.String.pipe(
401
+ Schema.filter(
402
+ (s) => {
403
+ try {
404
+ new URL(s);
405
+ return true;
406
+ } catch {
407
+ return false;
408
+ }
409
+ },
410
+ {
411
+ message: () => "Invalid URL format"
412
+ }
413
+ )
414
+ ),
415
+ html: Schema.String,
416
+ title: Schema.optional(Schema.String),
417
+ /** All available metadata from meta tags */
418
+ metadata: Schema.Record({ key: Schema.String, value: Schema.String }),
419
+ /** Commonly used metadata fields for convenience */
420
+ commonMetadata: Schema.optional(
421
+ Schema.Struct({
422
+ description: Schema.optional(Schema.String),
423
+ keywords: Schema.optional(Schema.String),
424
+ author: Schema.optional(Schema.String),
425
+ robots: Schema.optional(Schema.String)
426
+ })
427
+ ),
428
+ statusCode: Schema.Number.pipe(Schema.int(), Schema.between(100, 599)),
429
+ /** All response headers */
430
+ headers: Schema.Record({ key: Schema.String, value: Schema.String }),
431
+ /** When the fetch operation started */
432
+ fetchedAt: Schema.DateFromSelf,
433
+ /** How long the entire fetch and parse operation took in milliseconds */
434
+ scrapeDurationMs: Schema.Number,
435
+ /** The crawl depth (number of hops from the starting URL) */
436
+ depth: Schema.Number.pipe(Schema.int(), Schema.greaterThanOrEqualTo(0)),
437
+ /** Optional extracted data from the page */
438
+ extractedData: Schema.optional(
439
+ Schema.Record({ key: Schema.String, value: Schema.Unknown })
440
+ )
441
+ });
442
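// A minimal sketch (not part of the published bundle): decoding a raw
// object against PageDataSchema. The url filter rejects anything that
// `new URL()` cannot parse.
const decodedPage = Schema.decodeUnknownSync(PageDataSchema)({
  url: "https://example.com/",
  html: "<html></html>",
  metadata: {},
  statusCode: 200,
  headers: {},
  fetchedAt: new Date(),
  scrapeDurationMs: 12,
  depth: 0
});
// Replacing url with "not a url" would fail decoding with "Invalid URL format".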
+ class NetworkError extends Data.TaggedError("NetworkError") {
443
+ static fromCause(url, cause) {
444
+ return new NetworkError({
445
+ url,
446
+ cause,
447
+ message: `Failed to fetch ${url}: ${cause}`
448
+ });
449
+ }
450
+ }
451
+ class ResponseError extends Data.TaggedError("ResponseError") {
452
+ static fromCause(url, cause) {
453
+ return new ResponseError({
454
+ url,
455
+ cause,
456
+ message: `Failed to read response from ${url}: ${cause}`
457
+ });
458
+ }
459
+ }
460
+ class RobotsTxtError extends Data.TaggedError("RobotsTxtError") {
461
+ static fromCause(url, cause) {
462
+ return new RobotsTxtError({
463
+ url,
464
+ cause,
465
+ message: `Failed to fetch robots.txt: ${cause}`
466
+ });
467
+ }
468
+ }
469
+ class ConfigurationError extends Data.TaggedError("ConfigurationError") {
470
+ }
471
+ class MiddlewareError extends Data.TaggedError("MiddlewareError") {
472
+ static transform(middlewareName, cause) {
473
+ return new MiddlewareError({
474
+ phase: "transform",
475
+ middlewareName,
476
+ cause,
477
+ message: `Middleware '${middlewareName}' failed during transform: ${cause}`
478
+ });
479
+ }
480
+ static error(middlewareName, cause) {
481
+ return new MiddlewareError({
482
+ phase: "error",
483
+ middlewareName,
484
+ cause,
485
+ message: `Middleware '${middlewareName}' failed during error handling: ${cause}`
486
+ });
487
+ }
488
+ }
489
+ class FileSystemError extends Data.TaggedError("FileSystemError") {
490
+ static write(path2, cause) {
491
+ return new FileSystemError({
492
+ operation: "write",
493
+ path: path2,
494
+ cause,
495
+ message: `Failed to write file ${path2}: ${cause}`
496
+ });
497
+ }
498
+ static create(path2, cause) {
499
+ return new FileSystemError({
500
+ operation: "create",
501
+ path: path2,
502
+ cause,
503
+ message: `Failed to create directory ${path2}: ${cause}`
504
+ });
505
+ }
506
+ }
507
+ let PersistenceError$1 = class PersistenceError extends Data.TaggedError("PersistenceError") {
508
+ static save(cause, key) {
509
+ return new PersistenceError({
510
+ operation: "save",
511
+ key,
512
+ cause,
513
+ message: key ? `Failed to save state for key ${key}: ${cause}` : `Failed to save state: ${cause}`
514
+ });
515
+ }
516
+ static load(cause, key) {
517
+ return new PersistenceError({
518
+ operation: "load",
519
+ key,
520
+ cause,
521
+ message: key ? `Failed to load state for key ${key}: ${cause}` : `Failed to load state: ${cause}`
522
+ });
523
+ }
524
+ static delete(cause, key) {
525
+ return new PersistenceError({
526
+ operation: "delete",
527
+ key,
528
+ cause,
529
+ message: key ? `Failed to delete state for key ${key}: ${cause}` : `Failed to delete state: ${cause}`
530
+ });
531
+ }
532
+ };
533
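// A minimal sketch (not part of the published bundle): the errors above are
// Data.TaggedError subclasses, so they can be recovered by tag on the
// Effect error channel.
const recovered = Effect.fail(
  NetworkError.fromCause("https://example.com", new Error("ECONNRESET"))
).pipe(
  Effect.catchTag("NetworkError", (e) => Effect.succeed(e.message))
);
Effect.runSync(recovered);
// => "Failed to fetch https://example.com: Error: ECONNRESET"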
+ const SpiderLogger = Context.GenericTag("SpiderLogger");
534
+ const makeSpiderLogger = (logDir = "./spider-logs") => {
535
+ if (!fs.existsSync(logDir)) {
536
+ fs.mkdirSync(logDir, { recursive: true });
537
+ }
538
+ const logFileName = `spider-${(/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-")}.jsonl`;
539
+ const logFilePath = path.join(logDir, logFileName);
540
+ const summaryFilePath = path.join(logDir, "spider-summary.json");
541
+ const writeLogEvent = (event) => Effect.sync(() => {
542
+ const logLine = JSON.stringify(event) + "\n";
543
+ fs.appendFileSync(logFilePath, logLine);
544
+ const importantTypes = [
545
+ "domain_start",
546
+ "domain_complete",
547
+ "spider_lifecycle",
548
+ "domain_error"
549
+ ];
550
+ if (importantTypes.includes(event.type)) {
551
+ const prefix = `[${event.type}]`;
552
+ const domainInfo = event.domain ? ` [${event.domain}]` : "";
553
+ Console.log(`${prefix}${domainInfo} ${event.message}`).pipe(
554
+ Effect.runSync
555
+ );
556
+ }
557
+ });
558
+ const updateSummary = (update) => Effect.sync(() => {
559
+ let summary = {};
560
+ if (fs.existsSync(summaryFilePath)) {
561
+ const content = fs.readFileSync(summaryFilePath, "utf-8");
562
+ try {
563
+ const parsed = JSON.parse(content);
564
+ summary = typeof parsed === "object" && parsed !== null ? parsed : {};
565
+ } catch {
566
+ summary = {};
567
+ }
568
+ }
569
+ summary = update(summary);
570
+ fs.writeFileSync(summaryFilePath, JSON.stringify(summary, null, 2));
571
+ });
572
+ return {
573
+ logEvent: (event) => Effect.gen(function* () {
574
+ const fullEvent = {
575
+ ...event,
576
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
577
+ };
578
+ yield* writeLogEvent(fullEvent);
579
+ }),
580
+ logDomainStart: (domain, startUrl) => Effect.gen(function* () {
581
+ yield* writeLogEvent({
582
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
583
+ type: "domain_start",
584
+ domain,
585
+ url: startUrl,
586
+ message: `Starting crawl for domain: ${domain}`,
587
+ details: { startUrl }
588
+ });
589
+ yield* updateSummary((summary) => ({
590
+ ...summary,
591
+ domains: {
592
+ ...summary.domains || {},
593
+ [domain]: {
594
+ status: "running",
595
+ startTime: (/* @__PURE__ */ new Date()).toISOString(),
596
+ startUrl,
597
+ pagesScraped: 0
598
+ }
599
+ }
600
+ }));
601
+ }),
602
+ logDomainComplete: (domain, pagesScraped, reason) => Effect.gen(function* () {
603
+ yield* writeLogEvent({
604
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
605
+ type: "domain_complete",
606
+ domain,
607
+ message: `Domain ${domain} completed: ${pagesScraped} pages scraped (reason: ${reason})`,
608
+ details: { pagesScraped, reason }
609
+ });
610
+ yield* updateSummary((summary) => {
611
+ const domains = summary.domains || {};
612
+ const existingDomain = domains[domain] || {};
613
+ return {
614
+ ...summary,
615
+ domains: {
616
+ ...domains,
617
+ [domain]: {
618
+ ...existingDomain,
619
+ status: "completed",
620
+ endTime: (/* @__PURE__ */ new Date()).toISOString(),
621
+ pagesScraped,
622
+ completionReason: reason
623
+ }
624
+ }
625
+ };
626
+ });
627
+ }),
628
+ logPageScraped: (url, domain, pageNumber) => Effect.gen(function* () {
629
+ yield* writeLogEvent({
630
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
631
+ type: "page_scraped",
632
+ domain,
633
+ url,
634
+ message: `Scraped page #${pageNumber} from ${domain}`,
635
+ details: { pageNumber }
636
+ });
637
+ yield* updateSummary((summary) => {
638
+ const domains = summary.domains || {};
639
+ const existingDomain = domains[domain] || {};
640
+ return {
641
+ ...summary,
642
+ domains: {
643
+ ...domains,
644
+ [domain]: {
645
+ ...existingDomain,
646
+ pagesScraped: pageNumber
647
+ }
648
+ }
649
+ };
650
+ });
651
+ }),
652
+ logQueueStatus: (domain, queueSize, activeWorkers) => Effect.gen(function* () {
653
+ yield* writeLogEvent({
654
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
655
+ type: "queue_status",
656
+ domain,
657
+ message: `Queue status - size: ${queueSize}, active workers: ${activeWorkers}`,
658
+ details: { queueSize, activeWorkers }
659
+ });
660
+ }),
661
+ logRateLimit: (domain, requestsInWindow) => Effect.gen(function* () {
662
+ yield* writeLogEvent({
663
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
664
+ type: "rate_limit",
665
+ domain,
666
+ message: `Rate limit applied - ${requestsInWindow} requests in window`,
667
+ details: { requestsInWindow }
668
+ });
669
+ }),
670
+ logSpiderLifecycle: (event, details) => Effect.gen(function* () {
671
+ yield* writeLogEvent({
672
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
673
+ type: "spider_lifecycle",
674
+ message: `Spider ${event}`,
675
+ details
676
+ });
677
+ if (event === "start") {
678
+ yield* updateSummary((summary) => ({
679
+ ...summary,
680
+ spiderStartTime: (/* @__PURE__ */ new Date()).toISOString(),
681
+ status: "running"
682
+ }));
683
+ } else if (event === "complete" || event === "error") {
684
+ yield* updateSummary((summary) => ({
685
+ ...summary,
686
+ spiderEndTime: (/* @__PURE__ */ new Date()).toISOString(),
687
+ status: event === "complete" ? "completed" : "error",
688
+ ...details && { finalDetails: details }
689
+ }));
690
+ }
691
+ }),
692
+ // Enhanced diagnostic logging methods
693
+ logWorkerLifecycle: (workerId, domain, event, reason, details) => Effect.gen(function* () {
694
+ yield* writeLogEvent({
695
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
696
+ type: "worker_lifecycle",
697
+ domain,
698
+ workerId,
699
+ message: `[WORKER_LIFECYCLE] Worker ${workerId} ${event}${reason ? ` - reason: ${reason}` : ""} (domain: ${domain})`,
700
+ details: { event, reason, ...details }
701
+ });
702
+ }),
703
+ logWorkerState: (workerId, domain, event, details) => Effect.gen(function* () {
704
+ yield* writeLogEvent({
705
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
706
+ type: "worker_state",
707
+ domain,
708
+ workerId,
709
+ message: `[WORKER_STATE] Worker ${workerId} ${event} (domain: ${domain})`,
710
+ details: { event, ...details }
711
+ });
712
+ }),
713
+ logCompletionMonitor: (domain, checkCount, queueSize, activeWorkers, stableCount, maxPagesReached, decision) => Effect.gen(function* () {
714
+ yield* writeLogEvent({
715
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
716
+ type: "completion_monitor",
717
+ domain,
718
+ message: `[COMPLETION_MONITOR] Check #${checkCount}: queue=${queueSize}, active=${activeWorkers}, stable=${stableCount}, maxPages=${maxPagesReached} -> ${decision}`,
719
+ details: {
720
+ checkCount,
721
+ queueSize,
722
+ activeWorkers,
723
+ stableCount,
724
+ maxPagesReached,
725
+ decision
726
+ }
727
+ });
728
+ }),
729
+ logEdgeCase: (domain, caseType, details) => Effect.gen(function* () {
730
+ yield* writeLogEvent({
731
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
732
+ type: "edge_case",
733
+ domain,
734
+ message: `[EDGE_CASE] ${caseType} (domain: ${domain})`,
735
+ details: { case: caseType, ...details }
736
+ });
737
+ }),
738
+ logDomainStatus: (domain, status) => Effect.gen(function* () {
739
+ yield* writeLogEvent({
740
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
741
+ type: "domain_start",
742
+ // Reuse existing type for now
743
+ domain,
744
+ message: `[DOMAIN_STATUS] ${domain}: ${status.pagesScraped} pages, queue=${status.queueSize}, workers=${status.activeWorkers}/${status.maxWorkers}`,
745
+ details: status
746
+ });
747
+ yield* updateSummary((summary) => {
748
+ const domains = summary.domains || {};
749
+ const existingDomain = domains[domain] || {};
750
+ return {
751
+ ...summary,
752
+ domains: {
753
+ ...domains,
754
+ [domain]: {
755
+ ...existingDomain,
756
+ pagesScraped: Math.max(0, status.pagesScraped || 0),
757
+ queueSize: Math.max(0, status.queueSize || 0),
758
+ activeWorkers: Math.max(0, status.activeWorkers || 0),
759
+ maxWorkers: Math.max(1, status.maxWorkers || 5)
760
+ }
761
+ }
762
+ };
763
+ });
764
+ })
765
+ };
766
+ };
767
+ const SpiderLoggerLive = Layer.succeed(SpiderLogger, makeSpiderLogger());
768
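// A minimal sketch (not part of the published bundle): SpiderLoggerLive
// writes to ./spider-logs; a custom directory can be provided the same way.
const CustomLoggerLive = Layer.succeed(SpiderLogger, makeSpiderLogger("./my-logs"));
const logDemo = Effect.gen(function* () {
  const log = yield* SpiderLogger;
  yield* log.logDomainStart("example.com", "https://example.com/");
});
Effect.runSync(Effect.provide(logDemo, CustomLoggerLive));
// Appends a domain_start event to ./my-logs/spider-<timestamp>.jsonl
// and updates ./my-logs/spider-summary.json.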
+ class ScraperService extends Effect.Service()(
769
+ "@jambudipa.io/ScraperService",
770
+ {
771
+ effect: Effect.sync(() => ({
772
+ /**
773
+ * Fetches a URL and parses the HTML to extract basic page information.
774
+ *
775
+ * This method performs the following operations:
776
+ * 1. Fetches the URL with a fixed 30-second timeout
777
+ * 2. Validates content type (skips binary files)
778
+ * 3. Parses HTML content with cheerio
779
+ * 4. Extracts basic page metadata (title, description, etc.)
780
+ * 5. Returns structured PageData object
781
+ *
782
+ * The method uses AbortController for proper timeout handling to prevent
783
+ * workers from hanging on malformed URLs or slow responses.
784
+ *
785
+ * @param url - The URL to fetch and parse
786
+ * @param depth - The crawl depth for logging purposes (default: 0)
787
+ * @returns Effect containing PageData with extracted information
788
+ * @throws NetworkError for network-related failures
789
+ * @throws ResponseError for HTTP error responses
790
+ *
791
+ * @example
792
+ * Basic usage:
793
+ * ```typescript
794
+ * const pageData = yield* scraper.fetchAndParse('https://example.com');
795
+ * console.log(`Page title: ${pageData.title}`);
796
+ * ```
797
+ *
798
+ * With depth tracking:
799
+ * ```typescript
800
+ * const pageData = yield* scraper.fetchAndParse('https://example.com/page', 2);
801
+ * ```
802
+ *
803
+ * Error handling:
804
+ * ```typescript
805
+ * const result = yield* scraper.fetchAndParse('https://example.com').pipe(
806
+ * Effect.catchTags({
807
+ * NetworkError: (error) => {
808
+ * console.log('Network error:', error.message);
809
+ * return Effect.succeed(null);
810
+ * },
811
+ * ResponseError: (error) => {
812
+ * console.log('HTTP error:', error.statusCode);
813
+ * return Effect.succeed(null);
814
+ * }
815
+ * })
816
+ * );
817
+ * ```
818
+ *
819
+ * @performance
820
+ * - Request timeout: 30 seconds
821
+ * - Response parsing timeout: 10 seconds
822
+ * - Memory usage: ~2-5MB per page depending on content size
823
+ *
824
+ * @security
825
+ * - Validates content types to prevent processing binary files
826
+ * - Uses AbortController to prevent hanging requests
827
+ * - No execution of JavaScript content (static HTML parsing only)
828
+ */
829
+ fetchAndParse: (url, depth = 0) => Effect.gen(function* () {
830
+ const startTime = yield* Effect.sync(() => /* @__PURE__ */ new Date());
831
+ const startMs = startTime.getTime();
832
+ const logger = yield* SpiderLogger;
833
+ const domain = new URL(url).hostname;
834
+ const controller = new AbortController();
835
+ const timeoutMs = 3e4;
836
+ const timeoutId = setTimeout(() => {
837
+ const duration = Date.now() - startMs;
838
+ Effect.runSync(
839
+ logger.logEdgeCase(domain, "fetch_abort_triggered", {
840
+ url,
841
+ durationMs: duration,
842
+ reason: "timeout",
843
+ timeoutMs
844
+ })
845
+ );
846
+ controller.abort();
847
+ }, timeoutMs);
848
+ const response = yield* Effect.tryPromise({
849
+ try: async () => {
850
+ try {
851
+ const resp = await fetch(url, { signal: controller.signal });
852
+ clearTimeout(timeoutId);
853
+ const contentType = resp.headers.get("content-type") || "";
854
+ if (!contentType.includes("text/html") && !contentType.includes("application/xhtml") && !contentType.includes("text/") && contentType !== "") {
855
+ throw new Error(`Skipping non-HTML content: ${contentType}`);
856
+ }
857
+ return resp;
858
+ } catch (error) {
859
+ clearTimeout(timeoutId);
860
+ if (error instanceof Error && error.name === "AbortError") {
861
+ throw new Error(
862
+ `Request aborted after ${Date.now() - startMs}ms`
863
+ );
864
+ }
865
+ throw error;
866
+ }
867
+ },
868
+ catch: (error) => NetworkError.fromCause(url, error)
869
+ });
870
+ const textController = new AbortController();
871
+ const textTimeoutMs = 1e4;
872
+ const textTimeoutId = setTimeout(() => {
873
+ const duration = Date.now() - startMs;
874
+ Effect.runSync(
875
+ logger.logEdgeCase(domain, "response_text_abort_triggered", {
876
+ url,
877
+ durationMs: duration,
878
+ reason: "timeout",
879
+ timeoutMs: textTimeoutMs
880
+ })
881
+ );
882
+ textController.abort();
883
+ }, textTimeoutMs);
884
+ const html = yield* Effect.tryPromise({
885
+ try: async () => {
886
+ try {
887
+ const reader = response.body?.getReader();
888
+ if (!reader) throw new Error("No response body");
889
+ const decoder = new TextDecoder();
890
+ let html2 = "";
891
+ while (true) {
892
+ const { done, value } = await reader.read();
893
+ if (done) break;
894
+ html2 += decoder.decode(value, { stream: true });
895
+ if (textController.signal.aborted) {
896
+ reader.cancel();
897
+ throw new Error("Response parsing aborted");
898
+ }
899
+ }
900
+ clearTimeout(textTimeoutId);
901
+ return html2;
902
+ } catch (error) {
903
+ clearTimeout(textTimeoutId);
904
+ throw error;
905
+ }
906
+ },
907
+ catch: (error) => ResponseError.fromCause(url, error)
908
+ });
909
+ const $ = cheerio.load(html);
910
+ const metadata = {};
911
+ $("meta").each((_, element) => {
912
+ const $meta = $(element);
913
+ const name = $meta.attr("name") || $meta.attr("property") || $meta.attr("http-equiv");
914
+ const content = $meta.attr("content");
915
+ if (name && content) {
916
+ metadata[name] = content;
917
+ }
918
+ });
919
+ const commonMetadata = {
920
+ description: metadata["description"],
921
+ keywords: metadata["keywords"],
922
+ author: metadata["author"],
923
+ robots: metadata["robots"]
924
+ };
925
+ const headers = {};
926
+ response.headers.forEach((value, key) => {
927
+ headers[key] = value;
928
+ });
929
+ const endTime = yield* Effect.sync(() => /* @__PURE__ */ new Date());
930
+ const durationMs = endTime.getTime() - startTime.getTime();
931
+ const pageData = {
932
+ url,
933
+ html,
934
+ title: $("title").text() || void 0,
935
+ metadata,
936
+ commonMetadata: Object.values(commonMetadata).some((v) => v) ? commonMetadata : void 0,
937
+ statusCode: response.status,
938
+ headers,
939
+ fetchedAt: startTime,
940
+ scrapeDurationMs: durationMs,
941
+ depth
942
+ };
943
+ return yield* Schema.decode(PageDataSchema)(pageData);
944
+ })
945
+ }))
946
+ }
947
+ ) {
948
+ }
949
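// A minimal sketch (not part of the published bundle): fetchAndParse reads
// SpiderLogger from context, so both layers must be provided.
const scrapeDemo = Effect.gen(function* () {
  const scraper = yield* ScraperService;
  const page = yield* scraper.fetchAndParse("https://example.com/");
  return page.title;
});
// Performs real network I/O, hence runPromise:
// Effect.runPromise(
//   Effect.provide(scrapeDemo, Layer.merge(ScraperService.Default, SpiderLoggerLive))
// ).then(console.log);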
+ class RobotsService extends Effect.Service()(
950
+ "@jambudipa.io/RobotsService",
951
+ {
952
+ effect: Effect.sync(() => {
953
+ const robotsCache = MutableHashMap.empty();
954
+ const parseRobotsTxt = (content, userAgent = "*") => {
955
+ const lines = content.split("\n");
956
+ const rules = {
957
+ disallowedPaths: /* @__PURE__ */ new Set(),
958
+ userAgent
959
+ };
960
+ let currentUserAgent = "";
961
+ let isRelevantSection = false;
962
+ for (const line of lines) {
963
+ const trimmed = line.trim();
964
+ if (trimmed.startsWith("#") || !trimmed) continue;
965
+ const [directive, ...valueParts] = trimmed.split(":");
966
+ const value = valueParts.join(":").trim();
967
+ if (directive.toLowerCase() === "user-agent") {
968
+ currentUserAgent = value;
969
+ isRelevantSection = currentUserAgent === "*" || currentUserAgent.toLowerCase() === userAgent.toLowerCase();
970
+ } else if (isRelevantSection) {
971
+ if (directive.toLowerCase() === "disallow" && value) {
972
+ rules.disallowedPaths.add(value);
973
+ } else if (directive.toLowerCase() === "crawl-delay") {
974
+ rules.crawlDelay = parseInt(value);
975
+ }
976
+ }
977
+ }
978
+ return rules;
979
+ };
980
+ const fetchRobotsTxt = (baseUrl) => {
981
+ const robotsUrl = new URL("/robots.txt", baseUrl);
982
+ return Effect.tryPromise({
983
+ try: async () => {
984
+ const response = await fetch(robotsUrl.toString());
985
+ if (!response.ok) {
986
+ return null;
987
+ }
988
+ return await response.text();
989
+ },
990
+ catch: (error) => RobotsTxtError.fromCause(robotsUrl.toString(), error)
991
+ });
992
+ };
993
+ const isPathAllowed = (url, rules) => {
994
+ const path2 = url.pathname;
995
+ for (const disallowedPath of rules.disallowedPaths) {
996
+ if (disallowedPath === "/") return false;
997
+ try {
998
+ const pattern = disallowedPath.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\\\*/g, ".*");
999
+ if (new RegExp(`^${pattern}`).test(path2)) {
1000
+ return false;
1001
+ }
1002
+ } catch {
1003
+ if (disallowedPath.endsWith("*")) {
1004
+ const prefix = disallowedPath.slice(0, -1);
1005
+ if (path2.startsWith(prefix)) {
1006
+ return false;
1007
+ }
1008
+ } else if (path2.startsWith(disallowedPath)) {
1009
+ return false;
1010
+ }
1011
+ }
1012
+ }
1013
+ return true;
1014
+ };
1015
+ return {
1016
+ checkUrl: (urlString) => Effect.gen(function* () {
1017
+ let url;
1018
+ let baseUrl;
1019
+ try {
1020
+ url = new URL(urlString);
1021
+ baseUrl = new URL(`${url.protocol}//${url.host}`);
1022
+ } catch (error) {
1023
+ yield* Effect.logWarning(
1024
+ `Invalid URL "${urlString}": ${error instanceof Error ? error.message : String(error)}. Allowing access.`
1025
+ );
1026
+ return { allowed: true };
1027
+ }
1028
+ const cacheKey = baseUrl.toString();
1029
+ const cachedRules = MutableHashMap.get(robotsCache, cacheKey);
1030
+ let rules;
1031
+ if (Option.isNone(cachedRules)) {
1032
+ const robotsContent = yield* fetchRobotsTxt(baseUrl).pipe(
1033
+ Effect.catchAll(
1034
+ (error) => Effect.logWarning(
1035
+ `Failed to fetch robots.txt for ${baseUrl}: ${error.message}. Allowing access.`
1036
+ ).pipe(Effect.map(() => null))
1037
+ )
1038
+ );
1039
+ if (robotsContent) {
1040
+ try {
1041
+ rules = parseRobotsTxt(robotsContent);
1042
+ } catch {
1043
+ rules = { disallowedPaths: /* @__PURE__ */ new Set(), userAgent: "*" };
1044
+ }
1045
+ } else {
1046
+ rules = { disallowedPaths: /* @__PURE__ */ new Set(), userAgent: "*" };
1047
+ }
1048
+ MutableHashMap.set(robotsCache, cacheKey, rules);
1049
+ } else {
1050
+ rules = cachedRules.value;
1051
+ }
1052
+ return {
1053
+ allowed: isPathAllowed(url, rules),
1054
+ crawlDelay: rules.crawlDelay
1055
+ };
1056
+ }),
1057
+ getRules: (domain) => Effect.sync(() => {
1058
+ const baseUrl = new URL(domain);
1059
+ const cacheKey = baseUrl.toString();
1060
+ return MutableHashMap.get(robotsCache, cacheKey);
1061
+ })
1062
+ };
1063
+ })
1064
+ }
1065
+ ) {
1066
+ }
1067
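// A minimal sketch (not part of the published bundle): checkUrl fetches and
// caches /robots.txt once per origin, then answers { allowed, crawlDelay? }.
// A missing or unparseable robots.txt fails open (everything allowed).
const robotsDemo = Effect.gen(function* () {
  const robots = yield* RobotsService;
  return yield* robots.checkUrl("https://example.com/private/page");
});
// Effect.runPromise(Effect.provide(robotsDemo, RobotsService.Default))
//   .then(({ allowed, crawlDelay }) => console.log(allowed, crawlDelay));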
+ class LinkExtractionError extends Data.TaggedError(
1068
+ "LinkExtractionError"
1069
+ ) {
1070
+ }
1071
+ const DEFAULT_CONFIG = {
1072
+ restrictCss: [],
1073
+ tags: ["a", "area", "form", "frame", "iframe", "link"],
1074
+ attrs: ["href", "action", "src"],
1075
+ extractFromInputs: false
1076
+ };
1077
+ class LinkExtractorService extends Effect.Service()(
1078
+ "@jambudipa.io/LinkExtractorService",
1079
+ {
1080
+ effect: Effect.succeed({
1081
+ extractLinks: (html, config) => Effect.gen(function* () {
1082
+ const finalConfig = { ...DEFAULT_CONFIG, ...config };
1083
+ try {
1084
+ const result = extractRawLinks(html, finalConfig);
1085
+ return result;
1086
+ } catch (error) {
1087
+ return yield* Effect.fail(
1088
+ new LinkExtractionError({
1089
+ message: `Failed to extract links from HTML: ${error instanceof Error ? error.message : String(error)}`,
1090
+ cause: error
1091
+ })
1092
+ );
1093
+ }
1094
+ })
1095
+ })
1096
+ }
1097
+ ) {
1098
+ }
1099
+ const LinkExtractorServiceLayer = LinkExtractorService.Default;
1100
+ const extractRawLinks = (html, config) => {
1101
+ const $ = cheerio.load(html);
1102
+ const foundUrls = [];
1103
+ const extractionBreakdown = {};
1104
+ let totalElementsProcessed = 0;
1105
+ const extractUrlFromAttribute = (element, attr) => {
1106
+ const value = $(element).attr(attr);
1107
+ if (!value || !value.trim()) return null;
1108
+ return value.trim();
1109
+ };
1110
+ const trackExtraction = (elementType, url) => {
1111
+ totalElementsProcessed++;
1112
+ if (url) {
1113
+ foundUrls.push(url);
1114
+ extractionBreakdown[elementType] = (extractionBreakdown[elementType] || 0) + 1;
1115
+ }
1116
+ };
1117
+ if (config.restrictCss.length > 0) {
1118
+ config.restrictCss.forEach((cssSelector) => {
1119
+ $(cssSelector).each((_, element) => {
1120
+ const tagName = element.name?.toLowerCase() || "unknown";
1121
+ config.attrs.forEach((attr) => {
1122
+ const url = extractUrlFromAttribute(element, attr);
1123
+ if (url) trackExtraction(tagName, url);
1124
+ });
1125
+ });
1126
+ });
1127
+ } else {
1128
+ config.tags.forEach((tag) => {
1129
+ config.attrs.forEach((attr) => {
1130
+ $(`${tag}[${attr}]`).each((_, element) => {
1131
+ const url = extractUrlFromAttribute(element, attr);
1132
+ trackExtraction(tag, url);
1133
+ });
1134
+ });
1135
+ });
1136
+ }
1137
+ if (config.extractFromInputs) {
1138
+ $('input[type="hidden"]').each((_, element) => {
1139
+ const name = $(element).attr("name")?.toLowerCase() || "";
1140
+ const value = $(element).attr("value");
1141
+ if ((name.includes("url") || name.includes("redirect") || name.includes("next")) && value?.trim()) {
1142
+ trackExtraction("input", value.trim());
1143
+ }
1144
+ });
1145
+ }
1146
+ return {
1147
+ links: foundUrls,
1148
+ totalElementsProcessed,
1149
+ extractionBreakdown
1150
+ };
1151
+ };
1152
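// A minimal sketch (not part of the published bundle): with restrictCss set,
// only elements matching the selectors are scanned for the configured attributes.
const sampleHtml =
  '<main><a href="/a">A</a></main><footer><a href="/b">B</a></footer>';
const extractDemo = Effect.gen(function* () {
  const extractor = yield* LinkExtractorService;
  return yield* extractor.extractLinks(sampleHtml, { restrictCss: ["main a"] });
});
Effect.runSync(Effect.provide(extractDemo, LinkExtractorService.Default));
// => { links: ["/a"], totalElementsProcessed: 1, extractionBreakdown: { a: 1 } }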
+ class SpiderStateKey extends Schema.Class(
1153
+ "SpiderStateKey"
1154
+ )({
1155
+ /** Unique identifier for the session */
1156
+ id: Schema.String,
1157
+ /** When the session was created */
1158
+ timestamp: Schema.Date,
1159
+ /** Human-readable name for the session */
1160
+ name: Schema.String
1161
+ }) {
1162
+ }
1163
+ class PriorityRequest extends Schema.Class(
1164
+ "PriorityRequest"
1165
+ )({
1166
+ /** The crawl task containing URL and depth information */
1167
+ request: Schema.Struct({
1168
+ url: Schema.String,
1169
+ depth: Schema.Number,
1170
+ fromUrl: Schema.optional(Schema.String)
1171
+ }),
1172
+ /** Priority level (higher numbers processed first) */
1173
+ priority: Schema.Number,
1174
+ /** When this request was created */
1175
+ timestamp: Schema.Date,
1176
+ /** Unique fingerprint for deduplication */
1177
+ fingerprint: Schema.String
1178
+ }) {
1179
+ }
1180
+ class SpiderState extends Schema.Class("SpiderState")({
1181
+ /** The state key identifying this session */
1182
+ key: SpiderStateKey,
1183
+ /** All requests waiting to be processed */
1184
+ pendingRequests: Schema.Array(PriorityRequest),
1185
+ /** Fingerprints of URLs already visited (for deduplication) */
1186
+ visitedFingerprints: Schema.Array(Schema.String),
1187
+ /** Total number of requests processed so far */
1188
+ totalProcessed: Schema.Number
1189
+ }) {
1190
+ }
1191
+ class SpiderSchedulerService extends Effect.Service()(
1192
+ "@jambudipa.io/SpiderSchedulerService",
1193
+ {
1194
+ effect: Effect.gen(function* () {
1195
+ const config = yield* SpiderConfig;
1196
+ const shouldNormalizeUrls = yield* config.shouldNormalizeUrlsForDeduplication();
1197
+ const memoryQueue = yield* Queue.unbounded();
1198
+ const seenFingerprints = MutableHashMap.empty();
1199
+ const pendingRequestsForPersistence = [];
1200
+ let totalProcessed = 0;
1201
+ let persistenceLayer = null;
1202
+ let currentStateKey = null;
1203
+ const normalizeUrl = (url) => {
1204
+ if (!shouldNormalizeUrls) {
1205
+ return url;
1206
+ }
1207
+ try {
1208
+ const parsed = new URL(url);
1209
+ let normalizedPath = parsed.pathname.replace(/\/+/g, "/").replace(/\/$/, "");
1210
+ if (normalizedPath === "") {
1211
+ normalizedPath = "/";
1212
+ }
1213
+ parsed.pathname = normalizedPath;
1214
+ parsed.hash = "";
1215
+ if (parsed.protocol === "http:" && parsed.port === "80" || parsed.protocol === "https:" && parsed.port === "443") {
1216
+ parsed.port = "";
1217
+ }
1218
+ if (parsed.search) {
1219
+ const params = new URLSearchParams(parsed.search);
1220
+ const sortedParams = new URLSearchParams();
1221
+ Array.from(params.keys()).sort().forEach((key) => {
1222
+ params.getAll(key).forEach((value) => {
1223
+ sortedParams.append(key, value);
1224
+ });
1225
+ });
1226
+ parsed.search = sortedParams.toString();
1227
+ }
1228
+ return parsed.toString();
1229
+ } catch {
1230
+ return url;
1231
+ }
1232
+ };
1233
+ const generateFingerprint = (request) => {
1234
+ const normalizedUrl = normalizeUrl(request.url);
1235
+ return `${normalizedUrl}:${request.depth}`;
1236
+ };
1237
+ const createPriorityRequest = (request, priority) => new PriorityRequest({
1238
+ request,
1239
+ priority,
1240
+ timestamp: /* @__PURE__ */ new Date(),
1241
+ fingerprint: generateFingerprint(request)
1242
+ });
1243
+ const persistState = () => Effect.gen(function* () {
1244
+ if (!persistenceLayer || !currentStateKey) {
1245
+ return;
1246
+ }
1247
+ const state = new SpiderState({
1248
+ key: currentStateKey,
1249
+ pendingRequests: [...pendingRequestsForPersistence],
1250
+ visitedFingerprints: Array.from(
1251
+ MutableHashMap.keys(seenFingerprints)
1252
+ ),
1253
+ totalProcessed
1254
+ });
1255
+ yield* persistenceLayer.saveState(currentStateKey, state);
1256
+ });
1257
+ const restoreFromStateImpl = (state) => Effect.gen(function* () {
1258
+ const currentSize = yield* Queue.size(memoryQueue);
1259
+ for (let i = 0; i < currentSize; i++) {
1260
+ yield* Queue.take(memoryQueue).pipe(Effect.ignore);
1261
+ }
1262
+ MutableHashMap.clear(seenFingerprints);
1263
+ pendingRequestsForPersistence.length = 0;
1264
+ state.visitedFingerprints.forEach((fp) => {
1265
+ MutableHashMap.set(seenFingerprints, fp, true);
1266
+ });
1267
+ const sortedRequests = [...state.pendingRequests].sort(
1268
+ (a, b) => b.priority - a.priority
1269
+ );
1270
+ pendingRequestsForPersistence.push(...sortedRequests);
1271
+ yield* Effect.forEach(
1272
+ sortedRequests,
1273
+ (req) => Queue.offer(memoryQueue, req)
1274
+ );
1275
+ totalProcessed = state.totalProcessed;
1276
+ currentStateKey = state.key;
1277
+ });
1278
+ return {
1279
+ // Configure persistence layer for resumable scraping
1280
+ configurePersistence: (persistence, stateKey) => Effect.sync(() => {
1281
+ persistenceLayer = persistence;
1282
+ currentStateKey = stateKey;
1283
+ }),
1284
+ // Remove persistence configuration
1285
+ clearPersistence: () => Effect.sync(() => {
1286
+ persistenceLayer = null;
1287
+ currentStateKey = null;
1288
+ }),
1289
+ // Enqueue a request with priority
1290
+ enqueue: (request, priority = 0) => Effect.gen(function* () {
1291
+ const fingerprint = generateFingerprint(request);
1292
+ if (MutableHashMap.has(seenFingerprints, fingerprint)) {
1293
+ return false;
1294
+ }
1295
+ MutableHashMap.set(seenFingerprints, fingerprint, true);
1296
+ const priorityRequest = createPriorityRequest(request, priority);
1297
+ yield* Queue.offer(memoryQueue, priorityRequest);
1298
+ pendingRequestsForPersistence.push(priorityRequest);
1299
+ if (persistenceLayer && currentStateKey) {
1300
+ yield* persistState();
1301
+ }
1302
+ return true;
1303
+ }),
1304
+ // Dequeue highest priority request
1305
+ dequeue: () => Effect.gen(function* () {
1306
+ const request = yield* Queue.take(memoryQueue);
1307
+ totalProcessed++;
1308
+ const index = pendingRequestsForPersistence.findIndex(
1309
+ (r) => r.fingerprint === request.fingerprint
1310
+ );
1311
+ if (index !== -1) {
1312
+ pendingRequestsForPersistence.splice(index, 1);
1313
+ }
1314
+ if (persistenceLayer && currentStateKey) {
1315
+ yield* persistState();
1316
+ }
1317
+ return request;
1318
+ }),
1319
+ // Get queue size
1320
+ size: () => Queue.size(memoryQueue),
1321
+ // Check if queue is empty
1322
+ isEmpty: () => Queue.size(memoryQueue).pipe(Effect.map((size) => size === 0)),
1323
+ // Get current state for persistence
1324
+ getState: () => Effect.gen(function* () {
1325
+ if (!currentStateKey) {
1326
+ return yield* Effect.fail(
1327
+ new ConfigurationError({
1328
+ message: "No state key configured",
1329
+ details: "State key is required for persistence operations"
1330
+ })
1331
+ );
1332
+ }
1333
+ return new SpiderState({
1334
+ key: currentStateKey,
1335
+ pendingRequests: [...pendingRequestsForPersistence],
1336
+ visitedFingerprints: Array.from(
1337
+ MutableHashMap.keys(seenFingerprints)
1338
+ ),
1339
+ totalProcessed
1340
+ });
1341
+ }),
1342
+ // Restore from state
1343
+ restoreFromState: restoreFromStateImpl,
1344
+ // Generic restore method that can work with any persistence implementation
1345
+ restore: (persistence, stateKey) => Effect.gen(function* () {
1346
+ const state = yield* persistence.loadState(stateKey);
1347
+ if (state) {
1348
+ persistenceLayer = persistence;
1349
+ yield* restoreFromStateImpl(state);
1350
+ return true;
1351
+ }
1352
+ return false;
1353
+ })
1354
+ };
1355
+ }),
1356
+ dependencies: [SpiderConfig.Default]
1357
+ }
1358
+ ) {
1359
+ }
1360
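// A minimal sketch (not part of the published bundle): fingerprints are
// "normalizedUrl:depth", so re-enqueueing an equivalent URL at the same
// depth is rejected.
const scheduleDemo = Effect.gen(function* () {
  const scheduler = yield* SpiderSchedulerService;
  const a = yield* scheduler.enqueue({ url: "https://example.com/", depth: 0 }, 10);
  const b = yield* scheduler.enqueue({ url: "https://example.com/#top", depth: 0 });
  const next = yield* scheduler.dequeue();
  return [a, b, next.request.url];
});
Effect.runSync(Effect.provide(scheduleDemo, SpiderSchedulerService.Default));
// => [true, false, "https://example.com/"]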
+ const SpiderScheduler_service = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Object.defineProperty({
1361
+ __proto__: null,
1362
+ PriorityRequest,
1363
+ SpiderSchedulerService,
1364
+ SpiderState,
1365
+ SpiderStateKey
1366
+ }, Symbol.toStringTag, { value: "Module" }));
1367
+ class SpiderService extends Effect.Service()(
1368
+ "@jambudipa.io/Spider",
1369
+ {
1370
+ effect: Effect.gen(function* () {
1371
+ const robots = yield* RobotsService;
1372
+ const scraper = yield* ScraperService;
1373
+ const logger = yield* SpiderLogger;
1374
+ const linkExtractor = yield* LinkExtractorService;
1375
+ const maybeScheduler = yield* Effect.serviceOption(
1376
+ SpiderSchedulerService
1377
+ );
1378
+ const scheduler = Option.isSome(maybeScheduler) ? maybeScheduler.value : null;
1379
+ const self = {
1380
+ /**
1381
+ * Starts crawling from the specified URL and processes results through the provided sink.
1382
+ *
1383
+ * This method:
1384
+ * 1. Validates the starting URL against configuration rules
1385
+ * 2. Starts a configurable number of worker fibers
1386
+ * 3. Each worker processes URLs from a shared queue
1387
+ * 4. Results are streamed through the provided sink
1388
+ * 5. New URLs discovered are queued for processing
1389
+ *
1390
+ * @param startingUrls - The starting URL(s) for crawling (single string or array)
1391
+ * @param sink - Sink to process crawl results as they're produced
1392
+ * @param options - Optional enhanced link extraction configuration
1393
+ * @returns Effect containing crawl statistics (total pages, completion status)
1394
+ *
1395
+ * @example
1396
+ * Basic usage:
1397
+ * ```typescript
1398
+ * const collectSink = Sink.forEach<CrawlResult>(result =>
1399
+ * Effect.sync(() => console.log(`Found: ${result.pageData.title}`))
1400
+ * );
1401
+ *
1402
+ * const stats = yield* spider.crawl('https://example.com', collectSink);
1403
+ * ```
1404
+ *
1405
+ * With multiple starting URLs:
1406
+ * ```typescript
1407
+ * const stats = yield* spider.crawl([
1408
+ * 'https://example.com',
1409
+ * 'https://other-domain.com'
1410
+ * ], collectSink);
1411
+ * ```
1412
+ *
1413
+ * With enhanced link extraction:
1414
+ * ```typescript
1415
+ * const stats = yield* spider.crawl('https://example.com', collectSink, {
1416
+ * useEnhancedExtraction: true,
1417
+ * linkExtractorConfig: {
1418
+ * allowPatterns: [/\/articles\//],
1419
+ * restrictCss: ['.content a']
1420
+ * }
1421
+ * });
1422
+ * ```
1423
+ */
1424
+ crawl: (startingUrls, sink, options) => Effect.gen(function* () {
1425
+ const config = yield* SpiderConfig;
1426
+ if (!config) {
1427
+ return yield* Effect.fail(
1428
+ new Error("SpiderConfig is required for crawling operations")
1429
+ );
1430
+ }
1431
+ const normalizeUrlInput = (input) => {
1432
+ if (typeof input === "string") {
1433
+ return [{ url: input }];
1434
+ }
1435
+ if (Array.isArray(input)) {
1436
+ return input.map(
1437
+ (item) => typeof item === "string" ? { url: item } : item
1438
+ );
1439
+ }
1440
+ return [input];
1441
+ };
1442
+ const urlsWithMetadata = normalizeUrlInput(startingUrls);
1443
+ const domainMap = /* @__PURE__ */ new Map();
1444
+ for (const urlObj of urlsWithMetadata) {
1445
+ try {
1446
+ const url = new URL(urlObj.url);
1447
+ const domain = url.hostname.toLowerCase();
1448
+ const normalizedDomain = domain.replace(/^www\./, "");
1449
+ if (!domainMap.has(normalizedDomain)) {
1450
+ domainMap.set(normalizedDomain, urlObj);
1451
+ } else {
1452
+ console.warn(
1453
+ `Skipping duplicate domain: ${domain} (normalized: ${normalizedDomain}, URL: ${urlObj.url})`
1454
+ );
1455
+ }
1456
+ } catch (e) {
1457
+ console.error(`Invalid URL skipped: ${urlObj.url}`, e);
1458
+ }
1459
+ }
1460
+ const deduplicatedUrls = Array.from(domainMap.values());
1461
+ const concurrency = yield* config.getConcurrency();
1462
+ if (deduplicatedUrls.length > 1) {
1463
+ const configOptions = yield* config.getOptions();
1464
+ if (configOptions.allowedDomains || configOptions.blockedDomains) {
1465
+ console.warn(
1466
+ "Warning: Multiple starting URLs detected with allowedDomains/blockedDomains configured. Domain restrictions will be ignored - each URL will be restricted to its own domain instead."
1467
+ );
1468
+ }
1469
+ }
1470
+ yield* logger.logSpiderLifecycle("start", {
1471
+ totalUrls: deduplicatedUrls.length,
1472
+ urls: deduplicatedUrls.map((u) => u.url),
1473
+ originalCount: urlsWithMetadata.length,
1474
+ deduplicatedCount: deduplicatedUrls.length
1475
+ });
1476
+ const restrictToStartingDomain = true;
1477
+ const results = yield* Effect.all(
1478
+ deduplicatedUrls.map(
1479
+ ({ url, metadata }) => self.crawlSingle(
1480
+ url,
1481
+ sink,
1482
+ options,
1483
+ metadata,
1484
+ restrictToStartingDomain
1485
+ )
1486
+ ),
1487
+ { concurrency }
1488
+ );
1489
+ yield* logger.logSpiderLifecycle("complete", {
1490
+ totalDomains: results.length,
1491
+ totalPages: results.reduce(
1492
+ (sum, r) => sum + (r.pagesScraped || 0),
1493
+ 0
1494
+ )
1495
+ });
1496
+ return {
1497
+ completed: true
1498
+ };
1499
+ }),
+ // Single URL crawling - each gets its own queue, workers, and deduplicator
+ crawlSingle: (urlString, sink, options, initialMetadata, restrictToStartingDomain) => Effect.gen(function* () {
+ const config = yield* SpiderConfig;
+ let domain;
+ try {
+ const url = new URL(urlString);
+ domain = url.hostname;
+ } catch {
+ domain = "invalid-url";
+ }
+ yield* logger.logDomainStart(domain, urlString);
+ const localDeduplicator = yield* Effect.provide(
+ UrlDeduplicatorService,
+ UrlDeduplicatorService.Default
+ );
+ const urlQueue = yield* Queue.unbounded();
+ const resultPubSub = yield* PubSub.unbounded();
+ const activeWorkers = MutableRef.make(0);
+ const maxPagesReached = MutableRef.make(false);
+ const domainCompleted = MutableRef.make(false);
+ const queueMutex = yield* Effect.makeSemaphore(1);
+ const workerHealthChecks = MutableRef.make(
+ /* @__PURE__ */ new Map()
+ );
+ const reportWorkerHealth = (workerId) => Effect.sync(() => {
+ const healthMap = MutableRef.get(workerHealthChecks);
+ healthMap.set(workerId, /* @__PURE__ */ new Date());
+ return healthMap;
+ });
+ const workerHealthMonitor = Effect.gen(function* () {
+ const healthMap = MutableRef.get(workerHealthChecks);
+ const now = Date.now();
+ const staleThreshold = 6e4;
+ for (const [workerId, lastCheck] of healthMap) {
+ const elapsed = now - lastCheck.getTime();
+ if (elapsed > staleThreshold) {
+ yield* logger.logEdgeCase(domain, "worker_death_detected", {
+ workerId,
+ lastSeen: elapsed + "ms ago",
+ message: `DEAD WORKER: ${workerId} - No heartbeat for ${Math.round(elapsed / 1e3)}s`
+ });
+ healthMap.delete(workerId);
+ }
+ }
+ }).pipe(
+ Effect.repeat(Schedule.fixed("15 seconds"))
+ // Check every 15 seconds
+ );
+ const queueManager = {
+ // Atomic take: either returns task and increments active count, or detects completion
+ takeTaskOrComplete: queueMutex.withPermits(1)(
+ Effect.gen(function* () {
+ const isCompleted = MutableRef.get(domainCompleted);
+ if (isCompleted) {
+ return {
+ type: "completed",
+ reason: "already_completed",
+ wasFirstToComplete: false
+ };
+ }
+ const hasMaxPages = MutableRef.get(maxPagesReached);
+ if (hasMaxPages) {
+ const wasCompleted = MutableRef.compareAndSet(
+ domainCompleted,
+ false,
+ true
+ );
+ return {
+ type: "completed",
+ reason: "max_pages",
+ wasFirstToComplete: wasCompleted
+ };
+ }
+ const pollResult = yield* Queue.poll(urlQueue);
+ if (pollResult._tag === "Some") {
+ const activeCount = MutableRef.updateAndGet(
+ activeWorkers,
+ (n) => n + 1
+ );
+ return {
+ type: "task",
+ task: pollResult.value,
+ activeCount
+ };
+ } else {
+ const currentActive = MutableRef.get(activeWorkers);
+ if (currentActive === 0) {
+ const wasCompleted = MutableRef.compareAndSet(
+ domainCompleted,
+ false,
+ true
+ );
+ return {
+ type: "completed",
+ reason: "no_more_urls",
+ wasFirstToComplete: wasCompleted
+ };
+ } else {
+ return {
+ type: "empty_but_active",
+ activeWorkers: currentActive
+ };
+ }
+ }
+ })
+ ),
+ // Add task to queue
+ addTask: (task) => Queue.offer(urlQueue, task),
+ // Mark worker as idle (decrement active count with bounds checking)
+ markIdle: () => Effect.sync(
+ () => MutableRef.updateAndGet(
+ activeWorkers,
+ (n) => Math.max(0, n - 1)
+ )
+ ),
+ // Get queue size for logging (with defensive bounds checking)
+ size: () => Effect.map(Queue.size(urlQueue), (size) => Math.max(0, size))
+ };
+ const generateWorkerId = () => Effect.gen(function* () {
+ const random = yield* Random.nextIntBetween(1e3, 9999);
+ return `${domain}-worker-${random}`;
+ });
+ const worker = (workerId) => Effect.gen(function* () {
+ yield* logger.logWorkerLifecycle(
+ workerId,
+ domain,
+ "entering_loop"
+ );
+ while (true) {
+ yield* reportWorkerHealth(workerId);
+ const queueSize = yield* queueManager.size();
+ const memUsage = process.memoryUsage();
+ if (memUsage.heapUsed > 1024 * 1024 * 1024) {
+ yield* logger.logEdgeCase(domain, "high_memory_usage", {
+ workerId,
+ heapUsed: Math.round(memUsage.heapUsed / 1024 / 1024) + "MB",
+ heapTotal: Math.round(memUsage.heapTotal / 1024 / 1024) + "MB",
+ queueSize
+ });
+ }
+ if (queueSize > 1e4) {
+ yield* logger.logEdgeCase(domain, "excessive_queue_size", {
+ workerId,
+ queueSize,
+ message: "Queue size exceeds 10,000 items - potential memory issue"
+ });
+ }
+ yield* logger.logWorkerState(
+ workerId,
+ domain,
+ "taking_task",
+ {
+ queueSize
+ }
+ );
+ const result = yield* queueManager.takeTaskOrComplete.pipe(
+ Effect.timeout("10 seconds"),
+ Effect.tap(
+ () => logger.logEdgeCase(domain, "task_acquisition_success", {
+ workerId,
+ message: "Task acquired successfully"
+ })
+ ),
+ Effect.tapError(
+ (error) => logger.logEdgeCase(domain, "deadlock_detected", {
+ workerId,
+ error: String(error),
+ message: "DEADLOCK: Task acquisition timed out - worker stuck in atomic operation",
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
+ })
+ ),
+ Effect.catchAll(
+ (error) => Effect.gen(function* () {
+ yield* logger.logEdgeCase(
+ domain,
+ "task_acquisition_failed",
+ {
+ workerId,
+ error: String(error),
+ isTimeout: error?.name === "TimeoutException",
+ message: "Task acquisition failed, marking worker as idle and retrying"
+ }
+ );
+ yield* queueManager.markIdle();
+ return {
+ type: "empty_but_active",
+ activeWorkers: 0
+ };
+ })
+ )
+ );
+ if (result.type === "completed") {
+ if ("wasFirstToComplete" in result && result.wasFirstToComplete) {
+ const reason = result.reason || "unknown";
+ yield* logger.logEvent({
+ type: "domain_complete",
+ domain,
+ message: `Worker ${workerId} detected domain completion - ${reason}`,
+ details: { reason }
+ });
+ }
+ yield* logger.logWorkerLifecycle(
+ workerId,
+ domain,
+ "exiting_loop",
+ "detected_completion"
+ );
+ break;
+ } else if (result.type === "empty_but_active") {
+ const backoffMs = Math.min(
+ 1e3 * Math.pow(2, Math.floor(Math.random() * 3)),
+ 5e3
+ );
+ yield* Effect.sleep(`${backoffMs} millis`);
+ continue;
+ } else if (result.type === "task") {
+ const task2 = result.task;
+ yield* logger.logWorkerState(
+ workerId,
+ domain,
+ "marked_active",
+ {
+ taskUrl: task2.url,
+ activeWorkers: result.activeCount
+ }
+ );
+ const wasAdded = yield* localDeduplicator.tryAdd(task2.url);
+ if (!wasAdded) {
+ const postIdleCount = yield* queueManager.markIdle();
+ yield* logger.logWorkerState(
+ workerId,
+ domain,
+ "marked_idle",
+ {
+ taskUrl: task2.url,
+ activeWorkers: postIdleCount,
+ reason: "duplicate_url"
+ }
+ );
+ continue;
+ }
+ } else {
+ yield* Effect.sleep("1 second");
+ continue;
+ }
+ const task = result.task;
+ yield* logger.logEdgeCase(domain, "before_shouldFollowUrl", {
+ workerId,
+ url: task.url,
+ message: "About to check shouldFollowUrl"
+ });
+ const shouldFollow = yield* config.shouldFollowUrl(
+ task.url,
+ task.fromUrl,
+ restrictToStartingDomain ? urlString : void 0
+ );
+ yield* logger.logEdgeCase(domain, "after_shouldFollowUrl", {
+ workerId,
+ url: task.url,
+ follow: shouldFollow.follow,
+ reason: shouldFollow.reason,
+ message: "Completed shouldFollowUrl check"
+ });
+ if (!shouldFollow.follow) {
+ const newIdleCount2 = yield* queueManager.markIdle();
+ yield* logger.logWorkerState(
+ workerId,
+ domain,
+ "marked_idle",
+ {
+ reason: "shouldNotFollow",
+ activeWorkers: newIdleCount2
+ }
+ );
+ continue;
+ }
+ const ignoreRobots = yield* config.shouldIgnoreRobotsTxt();
+ if (!ignoreRobots) {
+ yield* logger.logEdgeCase(domain, "before_robots_check", {
+ workerId,
+ url: task.url,
+ message: "About to check robots.txt"
+ });
+ const robotsCheck = yield* robots.checkUrl(task.url);
+ yield* logger.logEdgeCase(domain, "after_robots_check", {
+ workerId,
+ url: task.url,
+ allowed: robotsCheck.allowed,
+ crawlDelay: robotsCheck.crawlDelay,
+ message: "Completed robots.txt check"
+ });
+ if (!robotsCheck.allowed) {
+ const newIdleCount2 = yield* queueManager.markIdle();
+ yield* logger.logWorkerState(
+ workerId,
+ domain,
+ "marked_idle",
+ {
+ reason: "robotsBlocked",
+ activeWorkers: newIdleCount2
+ }
+ );
+ continue;
+ }
+ if (robotsCheck.crawlDelay) {
+ const maxCrawlDelayMs = yield* config.getMaxRobotsCrawlDelay();
+ const maxCrawlDelaySeconds = maxCrawlDelayMs / 1e3;
+ const effectiveCrawlDelay = Math.min(
+ robotsCheck.crawlDelay,
+ maxCrawlDelaySeconds
+ );
+ if (effectiveCrawlDelay < robotsCheck.crawlDelay) {
+ yield* logger.logEvent({
+ type: "crawl_delay_capped",
+ domain,
+ workerId,
+ message: `[CRAWL_DELAY] Capping robots.txt delay from ${robotsCheck.crawlDelay}s to ${effectiveCrawlDelay}s`,
+ details: {
+ robotsCrawlDelay: robotsCheck.crawlDelay,
+ maxCrawlDelay: maxCrawlDelaySeconds,
+ effectiveDelay: effectiveCrawlDelay
+ }
+ });
+ }
+ yield* Effect.sleep(`${effectiveCrawlDelay} seconds`);
+ }
+ }
+ const requestDelay = yield* config.getRequestDelay();
+ yield* Effect.sleep(`${requestDelay} millis`);
+ const fetchStartTime = Date.now();
+ yield* logger.logEdgeCase(domain, "before_fetch", {
+ workerId,
+ url: task.url,
+ depth: task.depth,
+ message: "About to fetch and parse page",
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+ fetchStartMs: fetchStartTime
+ });
+ const pageData = yield* scraper.fetchAndParse(task.url, task.depth).pipe(
+ // Add overall timeout to prevent workers from hanging
+ Effect.timeout("45 seconds"),
+ Effect.retry({
+ times: 2,
+ // Reduced retries to prevent long hangs
+ schedule: Schedule.exponential("1 second")
+ }),
+ Effect.catchAll(
+ (error) => Effect.gen(function* () {
+ const fetchDuration = Date.now() - fetchStartTime;
+ if (error?.name === "TimeoutException") {
+ yield* logger.logEdgeCase(domain, "fetch_timeout", {
+ workerId,
+ url: task.url,
+ message: `Fetch operation timed out after ${fetchDuration}ms`,
+ durationMs: fetchDuration,
+ timeoutExpectedMs: 45e3
+ });
+ } else {
+ yield* logger.logEdgeCase(domain, "fetch_error", {
+ workerId,
+ url: task.url,
+ error: String(error),
+ errorName: error?.name || "Unknown",
+ message: `Fetch operation failed after ${fetchDuration}ms`,
+ durationMs: fetchDuration
+ });
+ }
+ return null;
+ })
+ )
+ );
+ if (pageData) {
+ const fetchDuration = Date.now() - fetchStartTime;
+ if (task.extractData) {
+ const extractedData = yield* Effect.sync(() => {
+ const $ = cheerio.load(pageData.html);
+ const result2 = {};
+ for (const [fieldName, fieldConfig] of Object.entries(
+ task.extractData
+ )) {
+ if (typeof fieldConfig === "string") {
+ result2[fieldName] = $(fieldConfig).text().trim() || void 0;
+ } else if (typeof fieldConfig === "object") {
+ const fc = fieldConfig;
+ const {
+ selector,
+ text,
+ attribute,
+ multiple,
+ exists
+ } = fc;
+ if (exists) {
+ result2[fieldName] = $(selector).length > 0;
+ } else if (multiple) {
+ const elements = $(selector);
+ const values = [];
+ elements.each((_, el) => {
+ const $el = $(el);
+ if (fc.fields) {
+ const nestedResult = {};
+ for (const [
+ nestedName,
+ nestedConfig
+ ] of Object.entries(fc.fields)) {
+ if (typeof nestedConfig === "object") {
+ const nc = nestedConfig;
+ const $nested = $el.find(nc.selector);
+ if (nc.attribute) {
+ nestedResult[nestedName] = $nested.attr(
+ nc.attribute
+ );
+ } else {
+ nestedResult[nestedName] = $nested.text().trim();
+ }
+ }
+ }
+ values.push(nestedResult);
+ } else if (attribute) {
+ values.push($el.attr(attribute));
+ } else {
+ values.push($el.text().trim());
+ }
+ });
+ result2[fieldName] = values.length > 0 ? values : void 0;
+ } else {
+ const $el = $(selector);
+ if (attribute) {
+ result2[fieldName] = $el.attr(attribute);
+ } else {
+ result2[fieldName] = $el.text().trim() || void 0;
+ }
+ }
+ }
+ }
+ return result2;
+ });
+ pageData.extractedData = extractedData;
+ }
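+ /*
+  * Illustrative sketch of the `extractData` shapes handled above: a field is
+  * either a bare CSS selector string or an object whose `exists`, `multiple`,
+  * `attribute` and nested `fields` keys map onto the branches of the
+  * extraction loop. The selectors themselves are hypothetical.
+  *
+  *   const extractData = {
+  *     title: 'h1',                                            // text of first match
+  *     canonical: { selector: 'link[rel=canonical]', attribute: 'href' },
+  *     hasPrice: { selector: '.price', exists: true },         // boolean presence check
+  *     items: {                                                // one object per match
+  *       selector: 'li.product',
+  *       multiple: true,
+  *       fields: {
+  *         name: { selector: '.name' },
+  *         link: { selector: 'a', attribute: 'href' }
+  *       }
+  *     }
+  *   };
+  */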
+ const currentPageCount = yield* localDeduplicator.size();
+ yield* logger.logEdgeCase(domain, "fetch_success", {
+ workerId,
+ url: task.url,
+ message: `Fetch completed successfully`,
+ durationMs: fetchDuration
+ });
+ yield* logger.logPageScraped(
+ task.url,
+ domain,
+ currentPageCount
+ );
+ yield* PubSub.publish(resultPubSub, {
+ pageData,
+ depth: task.depth,
+ timestamp: /* @__PURE__ */ new Date(),
+ metadata: task.metadata
+ });
+ const maxDepth = yield* config.getMaxDepth();
+ if (!maxDepth || task.depth < maxDepth) {
+ let linksToProcess = [];
+ const extractionResult = linkExtractor ? yield* (() => {
+ const extractorConfig = options?.linkExtractorConfig || {};
+ return linkExtractor.extractLinks(pageData.html, extractorConfig).pipe(
+ Effect.catchAll(
+ () => Effect.succeed({
+ links: [],
+ totalElementsProcessed: 0,
+ extractionBreakdown: {}
+ })
+ )
+ );
+ })() : {
+ links: []
+ };
+ linksToProcess = extractionResult.links.map((url) => {
+ try {
+ return new URL(url, pageData.url).toString();
+ } catch {
+ return null;
+ }
+ }).filter((url) => url !== null);
+ for (const link of linksToProcess) {
+ const linkShouldFollow = yield* config.shouldFollowUrl(
+ link,
+ task.url,
+ restrictToStartingDomain ? urlString : void 0
+ );
+ if (!linkShouldFollow.follow) {
+ continue;
+ }
+ const alreadySeen = yield* localDeduplicator.contains(link);
+ if (!alreadySeen) {
+ yield* queueManager.addTask({
+ url: link,
+ depth: task.depth + 1,
+ fromUrl: task.url,
+ metadata: task.metadata
+ });
+ const newQueueSize = yield* queueManager.size();
+ if (newQueueSize % 10 === 0 || newQueueSize <= 5) {
+ yield* logger.logEvent({
+ type: "queue_status",
+ domain,
+ workerId,
+ message: `[QUEUE_STATE] URL added to queue: ${link}`,
+ details: {
+ queueSize: newQueueSize,
+ addedUrl: link,
+ fromUrl: task.url
+ }
+ });
+ }
+ }
+ }
+ }
+ }
+ const newIdleCount = yield* queueManager.markIdle();
+ yield* logger.logWorkerState(
+ workerId,
+ domain,
+ "task_completed",
+ {
+ taskUrl: task.url,
+ activeWorkers: newIdleCount,
+ pageProcessed: !!pageData
+ }
+ );
+ const maxPages2 = yield* config.getMaxPages();
+ if (maxPages2) {
+ const currentPageCount = yield* localDeduplicator.size();
+ if (currentPageCount >= maxPages2) {
+ const wasFirstToReachMax = MutableRef.compareAndSet(
+ maxPagesReached,
+ false,
+ true
+ );
+ if (wasFirstToReachMax) {
+ yield* logger.logPageScraped(
+ task.url,
+ domain,
+ currentPageCount
+ );
+ yield* logger.logEvent({
+ type: "domain_complete",
+ domain,
+ message: `Domain ${domain} reached max pages limit: ${currentPageCount}`,
+ details: {
+ currentPageCount,
+ maxPages: maxPages2,
+ reason: "max_pages_reached"
+ }
+ });
+ }
+ yield* logger.logWorkerLifecycle(
+ workerId,
+ domain,
+ "exiting_loop",
+ "max_pages_reached",
+ {
+ currentPageCount,
+ maxPages: maxPages2
+ }
+ );
+ break;
+ }
+ }
+ const pageCount = yield* localDeduplicator.size();
+ if (pageCount % 10 === 0) {
+ const queueSize2 = yield* queueManager.size();
+ const activeCount = MutableRef.get(activeWorkers);
+ const maxWorkers2 = yield* config.getMaxConcurrentWorkers();
+ yield* logger.logDomainStatus(domain, {
+ pagesScraped: pageCount,
+ queueSize: queueSize2,
+ activeWorkers: activeCount,
+ maxWorkers: maxWorkers2
+ });
+ }
+ }
+ yield* logger.logWorkerLifecycle(
+ workerId,
+ domain,
+ "exiting_loop",
+ "normal_completion"
+ );
+ }).pipe(
+ // Ensure this runs even if the worker is interrupted/crashes
+ Effect.ensuring(
+ logger.logWorkerLifecycle(
+ workerId,
+ domain,
+ "exiting_loop",
+ "effect_ensuring_cleanup"
+ )
+ ),
+ // Add catchAll to handle any unhandled errors
+ Effect.catchAll(
+ (error) => Effect.gen(function* () {
+ yield* logger.logEdgeCase(domain, "worker_crash", {
+ workerId,
+ error: String(error),
+ message: `Worker ${workerId} crashed with error: ${error}`,
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
+ });
+ yield* logger.logWorkerLifecycle(
+ workerId,
+ domain,
+ "exiting_loop",
+ "error_exit"
+ );
+ })
+ )
+ );
+ yield* queueManager.addTask({
+ url: urlString,
+ depth: 0,
+ metadata: initialMetadata,
+ extractData: options?.extractData
+ });
+ yield* logger.logEvent({
+ type: "queue_status",
+ domain,
+ message: `[QUEUE_STATE] Initial URL queued: ${urlString}`,
+ details: { queueSize: 1, initialUrl: urlString }
+ });
+ const maxWorkers = yield* config.getMaxConcurrentWorkers();
+ const workerFibers = [];
+ for (let i = 0; i < maxWorkers; i++) {
+ const workerId = yield* generateWorkerId();
+ yield* logger.logWorkerLifecycle(
+ workerId,
+ domain,
+ "created",
+ void 0,
+ {
+ workerIndex: i,
+ totalWorkers: maxWorkers
+ }
+ );
+ const fiber = yield* Effect.fork(worker(workerId));
+ workerFibers.push(fiber);
+ }
+ const healthMonitorFiber = yield* Effect.fork(workerHealthMonitor);
+ const resultStream = Stream.fromPubSub(resultPubSub);
+ const sinkFiber = yield* Effect.fork(
+ Stream.run(resultStream, sink)
+ );
+ const failureDetector = Effect.gen(function* () {
+ let lastPageCount = 0;
+ let stuckIterations = 0;
+ while (!MutableRef.get(domainCompleted)) {
+ yield* Effect.sleep("30 seconds");
+ const pageCount = yield* localDeduplicator.size();
+ const queueSize = yield* queueManager.size();
+ const activeCount = MutableRef.get(activeWorkers);
+ const hasQueueItems = queueSize > 0;
+ const hasNoActiveWorkers = activeCount === 0;
+ const hasNegativeQueue = queueSize < 0;
+ const noProgressMade = pageCount === lastPageCount;
+ if (hasNegativeQueue) {
+ yield* logger.logEdgeCase(domain, "negative_queue_detected", {
+ queueSize,
+ activeWorkers: activeCount,
+ pageCount
+ });
+ }
+ const criticalFailures = [
+ hasNoActiveWorkers && hasQueueItems && pageCount > 0,
+ // 0 workers with queue items
+ hasNegativeQueue,
+ // Invalid queue state
+ activeCount === 0 && pageCount <= 1 && stuckIterations >= 2
+ // Completely stuck
+ ];
+ if (criticalFailures.some(Boolean)) {
+ const reason = hasNoActiveWorkers && hasQueueItems ? "no_workers_with_queue_items" : hasNegativeQueue ? "negative_queue_size" : "no_progress_for_60s";
+ yield* logger.logEdgeCase(
+ domain,
+ "critical_failure_detected",
+ {
+ timeElapsed: `${(stuckIterations + 1) * 30}s`,
+ pageCount,
+ queueSize,
+ activeWorkers: activeCount,
+ reason
+ }
+ );
+ const wasCompleted = MutableRef.compareAndSet(
+ domainCompleted,
+ false,
+ true
+ );
+ if (wasCompleted) {
+ yield* logger.logDomainComplete(domain, pageCount, "error");
+ }
+ break;
+ }
+ if (noProgressMade) {
+ stuckIterations++;
+ } else {
+ stuckIterations = 0;
+ lastPageCount = pageCount;
+ }
+ }
+ });
+ const failureDetectorFiber = yield* Effect.fork(failureDetector);
+ yield* Effect.all(
+ workerFibers.map((f) => Fiber.join(f)),
+ { concurrency: "unbounded" }
+ );
+ yield* Fiber.interrupt(failureDetectorFiber).pipe(Effect.ignore);
+ yield* Fiber.interrupt(healthMonitorFiber).pipe(Effect.ignore);
+ yield* logger.logEvent({
+ type: "queue_status",
+ domain,
+ message: `[QUEUE_STATE] Shutting down queue for domain completion`,
+ details: { finalQueueSize: yield* queueManager.size() }
+ });
+ const finalPageCount = yield* localDeduplicator.size();
+ const maxPages = yield* config.getMaxPages();
+ const completionReason = maxPages && finalPageCount >= maxPages ? "max_pages" : "queue_empty";
+ yield* logger.logDomainComplete(
+ domain,
+ finalPageCount,
+ completionReason
+ );
+ yield* PubSub.shutdown(resultPubSub);
+ yield* logger.logEvent({
+ type: "spider_lifecycle",
+ domain,
+ message: `Waiting for sink to process remaining results...`
+ });
+ yield* Fiber.join(sinkFiber);
+ yield* logger.logEvent({
+ type: "spider_lifecycle",
+ domain,
+ message: `Sink processing complete. All ${finalPageCount} pages saved.`
+ });
+ return {
+ completed: true,
+ pagesScraped: finalPageCount,
+ domain
+ };
+ }),
+ /**
+ * Resume a previous crawling session from persistent storage.
+ *
+ * This method requires resumability to be enabled in the SpiderConfig and
+ * a StatePersistence implementation to be configured. It will restore the
+ * crawling state and continue processing from where it left off.
+ *
+ * @param stateKey - The unique identifier for the session to resume
+ * @param sink - Sink to process crawl results as they're produced
+ * @param persistence - Optional persistence implementation (uses configured one if not provided)
+ * @returns Effect containing crawl statistics
+ *
+ * @example
+ * ```typescript
+ * const stateKey = new SpiderStateKey({
+ * id: 'my-crawl-session',
+ * timestamp: new Date('2024-01-01'),
+ * name: 'Example Crawl'
+ * });
+ *
+ * const collectSink = Sink.forEach<CrawlResult>(result =>
+ * Effect.sync(() => console.log(`Resumed: ${result.pageData.title}`))
+ * );
+ *
+ * const stats = yield* spider.resume(stateKey, collectSink);
+ * ```
+ */
+ resume: (stateKey, _sink, _persistence) => Effect.gen(function* () {
+ if (!scheduler) {
+ return yield* Effect.fail(
+ new Error(
+ "Resume functionality requires SpiderSchedulerService to be available. Make sure resumability is enabled in SpiderConfig and SpiderSchedulerService is provided."
+ )
+ );
+ }
+ const config = yield* SpiderConfig;
+ if (!config) {
+ return yield* Effect.fail(
+ new Error(
+ "SpiderConfig is required for resumability operations"
+ )
+ );
+ }
+ const resumabilityEnabled = yield* config.isResumabilityEnabled();
+ if (!resumabilityEnabled) {
+ return yield* Effect.fail(
+ new Error(
+ "Resume functionality requires resumability to be enabled in SpiderConfig. Set enableResumability: true in your spider configuration."
+ )
+ );
+ }
+ // NOTE: full state restoration is not wired up here yet; the checks above
+ // validate the configuration and the session is then reported as resumed.
+ console.log(`Resuming session: ${stateKey.id}`);
+ return {
+ completed: true,
+ resumed: true
+ };
+ }),
+ /**
+ * Returns the list of URLs that have been visited during crawling.
+ *
+ * @returns Effect containing array of visited URLs
+ *
+ * @remarks
+ * This is currently a placeholder implementation. In a future version,
+ * this will return the actual list of visited URLs from the current session.
+ */
+ getVisitedUrls: () => Effect.sync(() => [])
+ };
+ return self;
+ }),
+ dependencies: [
+ RobotsService.Default,
+ ScraperService.Default,
+ UrlDeduplicatorService.Default,
+ SpiderConfig.Default,
+ LinkExtractorService.Default,
+ SpiderLoggerLive
+ ]
+ }
+ ) {
+ }
+ class MiddlewareManager extends Effect.Service()(
+ "@jambudipa.io/MiddlewareManager",
+ {
+ effect: Effect.sync(() => ({
+ /**
+ * Processes a request through the middleware pipeline.
+ *
+ * Middleware are executed in order from first to last, with each middleware
+ * receiving the output of the previous middleware as input.
+ *
+ * @param request - The initial request to process
+ * @param middlewares - Array of middleware to apply
+ * @returns Effect containing the processed request
+ */
+ processRequest: (request, middlewares) => Effect.reduce(
+ middlewares,
+ request,
+ (req, middleware) => middleware.processRequest ? middleware.processRequest(req) : Effect.succeed(req)
+ ),
+ /**
+ * Processes a response through the middleware pipeline in reverse order.
+ *
+ * Middleware are executed in reverse order (last to first) to provide
+ * proper nesting of response processing.
+ *
+ * @param response - The response to process
+ * @param request - The original request (for context)
+ * @param middlewares - Array of middleware to apply
+ * @returns Effect containing the processed response
+ */
+ processResponse: (response, request, middlewares) => Effect.reduce(
+ middlewares.slice().reverse(),
+ response,
+ (res, middleware) => middleware.processResponse ? middleware.processResponse(res, request) : Effect.succeed(res)
+ ),
+ /**
+ * Processes an exception through the middleware pipeline in reverse order.
+ *
+ * Middleware are given a chance to handle or recover from exceptions.
+ * If a middleware returns a SpiderResponse, it indicates successful recovery.
+ * If it returns null, the exception continues to propagate.
+ *
+ * @param error - The error that occurred
+ * @param request - The request that caused the error
+ * @param middlewares - Array of middleware to apply
+ * @returns Effect containing a recovered response or null
+ */
+ processException: (error, request, middlewares) => Effect.reduce(
+ middlewares.slice().reverse(),
+ null,
+ // Once a middleware has recovered with a response, stop delegating so the
+ // recovery is not discarded by later middleware (matches the doc above)
+ (res, middleware) => res !== null ? Effect.succeed(res) : middleware.processException ? middleware.processException(error, request) : Effect.succeed(res)
+ )
+ }))
+ }
+ ) {
+ }
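+ /*
+  * Illustrative sketch: requests flow first-to-last, responses and exceptions
+  * last-to-first, mirroring the reduce calls above. The middleware object is
+  * hypothetical.
+  *
+  *   const manager = yield* MiddlewareManager;
+  *   const middlewares = [{
+  *     processRequest: (req) =>
+  *       Effect.succeed({ ...req, headers: { ...req.headers, 'X-Trace': '1' } })
+  *   }];
+  *   const req = yield* manager.processRequest(request, middlewares);
+  *   const res = yield* manager.processResponse(response, req, middlewares);
+  */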
+ class RateLimitMiddleware extends Effect.Service()(
+ "@jambudipa.io/RateLimitMiddleware",
+ {
+ effect: Effect.sync(() => {
+ const domainLastRequest = MutableHashMap.empty();
+ const domainRequestCount = MutableHashMap.empty();
+ const domainWindowStart = MutableHashMap.empty();
+ return {
+ create: (config) => ({
+ processRequest: (request) => Effect.gen(function* () {
+ const url = new URL(request.task.url);
+ const domain = url.hostname;
+ const now = Date.now();
+ if (config.requestDelayMs) {
+ yield* Effect.sleep(`${config.requestDelayMs} millis`);
+ }
+ const windowDuration = 1e3;
+ const windowStart = Option.getOrElse(
+ MutableHashMap.get(domainWindowStart, domain),
+ () => now
+ );
+ const currentCount = Option.getOrElse(
+ MutableHashMap.get(domainRequestCount, domain),
+ () => 0
+ );
+ if (now - windowStart >= windowDuration) {
+ MutableHashMap.set(domainWindowStart, domain, now);
+ MutableHashMap.set(domainRequestCount, domain, 0);
+ } else if (currentCount >= config.maxRequestsPerSecondPerDomain) {
+ const waitTime = windowDuration - (now - windowStart);
+ yield* Effect.sleep(`${waitTime} millis`);
+ MutableHashMap.set(domainWindowStart, domain, Date.now());
+ MutableHashMap.set(domainRequestCount, domain, 0);
+ }
+ const newCount = Option.getOrElse(
+ MutableHashMap.get(domainRequestCount, domain),
+ () => 0
+ ) + 1;
+ MutableHashMap.set(domainRequestCount, domain, newCount);
+ MutableHashMap.set(domainLastRequest, domain, Date.now());
+ yield* Effect.logDebug(
+ `Rate limit: ${domain} - ${newCount}/${config.maxRequestsPerSecondPerDomain} requests in window`
+ );
+ return request;
+ })
+ })
+ };
+ })
+ }
+ ) {
+ }
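+ /*
+  * Illustrative sketch: the factory above yields a per-hostname limiter.
+  * Counts reset on a fixed 1-second window, and `requestDelayMs` adds a flat
+  * delay before the window check. The values shown are hypothetical.
+  *
+  *   const rateLimit = yield* RateLimitMiddleware;
+  *   const middleware = rateLimit.create({
+  *     requestDelayMs: 250,              // flat delay per request
+  *     maxRequestsPerSecondPerDomain: 2  // cap per window per hostname
+  *   });
+  */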
+ class LoggingMiddleware extends Effect.Service()(
+ "@jambudipa.io/LoggingMiddleware",
+ {
+ effect: Effect.sync(() => ({
+ create: (config = {}) => {
+ const {
+ logRequests = true,
+ logResponses = true,
+ logErrors = true,
+ logLevel = "info"
+ } = config;
+ return {
+ processRequest: (request) => Effect.gen(function* () {
+ if (logRequests) {
+ const logMessage = `Processing request: ${request.task.url} (depth: ${request.task.depth})`;
+ switch (logLevel) {
+ case "debug":
+ yield* Effect.logDebug(logMessage);
+ break;
+ case "info":
+ yield* Effect.logInfo(logMessage);
+ break;
+ case "warn":
+ yield* Effect.logWarning(logMessage);
+ break;
+ case "error":
+ yield* Effect.logError(logMessage);
+ break;
+ }
+ }
+ return request;
+ }),
+ processResponse: (response, request) => Effect.gen(function* () {
+ if (logResponses) {
+ const logMessage = `Received response: ${request.task.url} (status: ${response.statusCode || "unknown"}, size: ${response.pageData.html.length} bytes)`;
+ switch (logLevel) {
+ case "debug":
+ yield* Effect.logDebug(logMessage);
+ break;
+ case "info":
+ yield* Effect.logInfo(logMessage);
+ break;
+ case "warn":
+ yield* Effect.logWarning(logMessage);
+ break;
+ case "error":
+ yield* Effect.logError(logMessage);
+ break;
+ }
+ }
+ return response;
+ }),
+ processException: (error, request) => Effect.gen(function* () {
+ if (logErrors) {
+ const logMessage = `Error processing request: ${request.task.url} - ${error.message}`;
+ yield* Effect.logError(logMessage);
+ }
+ return null;
+ })
+ };
+ }
+ }))
+ }
+ ) {
+ }
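+ /*
+  * Illustrative sketch: the options shown are exactly the defaults
+  * destructured above, so `create({})` behaves the same as this explicit form.
+  *
+  *   const logging = yield* LoggingMiddleware;
+  *   const middleware = logging.create({
+  *     logRequests: true,
+  *     logResponses: true,
+  *     logErrors: true,
+  *     logLevel: 'info'
+  *   });
+  */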
+ class UserAgentMiddleware extends Effect.Service()(
+ "@jambudipa.io/UserAgentMiddleware",
+ {
+ effect: Effect.sync(() => ({
+ create: (userAgent) => ({
+ processRequest: (request) => Effect.succeed({
+ ...request,
+ headers: {
+ ...request.headers,
+ "User-Agent": userAgent
+ }
+ })
+ })
+ }))
+ }
+ ) {
+ }
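+ /*
+  * Illustrative sketch: the created middleware only rewrites headers, so it
+  * can sit anywhere in the pipeline. The user-agent string is hypothetical.
+  *
+  *   const ua = yield* UserAgentMiddleware;
+  *   const middleware = ua.create('MyCrawler/1.0 (+https://example.com/bot)');
+  */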
+ class StatsMiddleware extends Effect.Service()(
+ "@jambudipa.io/StatsMiddleware",
+ {
+ effect: Effect.sync(() => ({
+ create: () => {
+ const stats = MutableHashMap.empty();
+ const startTime = Date.now();
+ const incr = (key, count = 1) => {
+ const current = Option.getOrElse(
+ MutableHashMap.get(stats, key),
+ () => 0
+ );
+ MutableHashMap.set(stats, key, current + count);
+ };
+ return {
+ middleware: {
+ processRequest: (request) => Effect.sync(() => {
+ incr("requests_processed");
+ incr(`requests_depth_${request.task.depth}`);
+ return request;
+ }),
+ processResponse: (response) => Effect.sync(() => {
+ incr("responses_received");
+ if (response.statusCode) {
+ incr(`status_${response.statusCode}`);
+ if (response.statusCode >= 200 && response.statusCode < 300) {
+ incr("responses_success");
+ } else if (response.statusCode >= 400) {
+ incr("responses_error");
+ }
+ }
+ incr("bytes_downloaded", response.pageData.html.length);
+ return response;
+ }),
+ processException: (error) => Effect.sync(() => {
+ incr("exceptions");
+ incr(`exception_${error.constructor.name}`);
+ return null;
+ })
+ },
+ getStats: () => Effect.sync(() => ({
+ ...Object.fromEntries(Array.from(stats)),
+ runtime_seconds: (Date.now() - startTime) / 1e3
+ }))
+ };
+ }
+ }))
+ }
+ ) {
+ }
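+ /*
+  * Illustrative sketch: `create()` returns both the middleware to install and
+  * a `getStats` accessor over the same counters. The sample output keys match
+  * the `incr` calls above; the numbers are hypothetical.
+  *
+  *   const statsFactory = yield* StatsMiddleware;
+  *   const { middleware, getStats } = statsFactory.create();
+  *   // ...after a crawl:
+  *   const stats = yield* getStats();
+  *   // e.g. { requests_processed: 42, responses_success: 40, runtime_seconds: 12.3 }
+  */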
+ class StateDelta extends Schema.Class("StateDelta")({
+ /** Session this delta applies to */
+ stateKey: Schema.String,
+ /** Sequence number for ordering deltas */
+ sequence: Schema.Number,
+ /** When this delta was created */
+ timestamp: Schema.Date,
+ /** The operation that created this delta */
+ operation: Schema.Union(
+ Schema.Struct({
+ type: Schema.Literal("enqueue"),
+ request: PriorityRequest
+ }),
+ Schema.Struct({
+ type: Schema.Literal("dequeue"),
+ fingerprint: Schema.String
+ }),
+ Schema.Struct({
+ type: Schema.Literal("mark_visited"),
+ fingerprint: Schema.String
+ })
+ )
+ }) {
+ }
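+ /*
+  * Illustrative sketch: an `enqueue` delta as accepted by the schema above.
+  * `sequence` must increase monotonically within a session so replay stays
+  * ordered; the PriorityRequest value is assumed to exist elsewhere.
+  *
+  *   const delta = new StateDelta({
+  *     stateKey: 'my-crawl-session',
+  *     sequence: 42,
+  *     timestamp: new Date(),
+  *     operation: { type: 'enqueue', request: priorityRequest }
+  *   });
+  */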
+ class PersistenceError2 extends Data.TaggedError("PersistenceError") {
+ }
+ const DEFAULT_HYBRID_CONFIG = {
+ snapshotInterval: 1e3,
+ maxDeltasBeforeSnapshot: 500,
+ compactionEnabled: true,
+ batchDeltas: true,
+ deltaBatchSize: 10
+ };
+ class FullStatePersistence {
+ constructor(backend) {
+ this.backend = backend;
+ }
+ persist = (operation) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!self.backend.saveState) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support full state persistence`,
+ operation: "persist"
+ })
+ );
+ }
+ yield* self.backend.saveState(
+ operation.resultingState.key,
+ operation.resultingState
+ );
+ });
+ };
+ restore = (key) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!self.backend.loadState) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support state loading`,
+ operation: "restore"
+ })
+ );
+ }
+ return yield* self.backend.loadState(key);
+ });
+ };
+ cleanup = (key) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!self.backend.deleteState) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support state deletion`,
+ operation: "cleanup"
+ })
+ );
+ }
+ yield* self.backend.deleteState(key);
+ });
+ };
+ getInfo = () => ({
+ name: "FullStatePersistence",
+ description: "Saves complete state on every operation. Simple but potentially inefficient for large crawls.",
+ capabilities: ["full-state-save", "full-state-restore", "simple-cleanup"]
+ });
+ }
+ class DeltaPersistence {
+ constructor(backend) {
+ this.backend = backend;
+ }
+ persist = (operation) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!self.backend.saveDelta) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support delta persistence`,
+ operation: "persist"
+ })
+ );
+ }
+ yield* self.backend.saveDelta(operation.delta);
+ });
+ };
+ restore = (key) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!self.backend.loadDeltas) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support delta loading`,
+ operation: "restore"
+ })
+ );
+ }
+ const deltas = yield* self.backend.loadDeltas(key);
+ if (deltas.length === 0) {
+ return null;
+ }
+ return yield* self.reconstructStateFromDeltas(key, deltas);
+ });
+ };
+ cleanup = (key) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!self.backend.loadDeltas || !self.backend.compactDeltas) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support delta cleanup`,
+ operation: "cleanup"
+ })
+ );
+ }
+ const deltas = yield* self.backend.loadDeltas(key);
+ if (deltas.length > 0) {
+ const maxSequence = Math.max(...deltas.map((d) => d.sequence));
+ yield* self.backend.compactDeltas(key, maxSequence + 1);
+ }
+ });
+ };
+ reconstructStateFromDeltas = (key, deltas) => Effect.gen(function* () {
+ const sortedDeltas = [...deltas].sort((a, b) => a.sequence - b.sequence);
+ const pendingRequests = [];
+ const visitedFingerprints = [];
+ let totalProcessed = 0;
+ for (const delta of sortedDeltas) {
+ switch (delta.operation.type) {
+ case "enqueue":
+ pendingRequests.push(delta.operation.request);
+ break;
+ case "dequeue": {
+ const operation = delta.operation;
+ if (operation.type === "dequeue") {
+ const dequeueIndex = pendingRequests.findIndex(
+ (req) => req.fingerprint === operation.fingerprint
+ );
+ if (dequeueIndex >= 0) {
+ pendingRequests.splice(dequeueIndex, 1);
+ totalProcessed++;
+ }
+ }
+ break;
+ }
+ case "mark_visited": {
+ const operation = delta.operation;
+ if (operation.type === "mark_visited") {
+ if (!visitedFingerprints.includes(operation.fingerprint)) {
+ visitedFingerprints.push(operation.fingerprint);
+ }
+ }
+ break;
+ }
+ }
+ }
+ return yield* Effect.tryPromise({
+ try: async () => {
+ const { SpiderState: SpiderState2 } = await Promise.resolve().then(() => SpiderScheduler_service);
+ return new SpiderState2({
+ key,
+ pendingRequests,
+ visitedFingerprints,
+ totalProcessed
+ });
+ },
+ catch: (error) => new PersistenceError2({
+ message: "Failed to import SpiderState",
+ cause: error,
+ operation: "reconstructStateFromDeltas"
+ })
+ });
+ });
+ getInfo = () => ({
+ name: "DeltaPersistence",
+ description: "Saves only incremental changes. Efficient for large crawls but requires delta replay.",
+ capabilities: ["delta-save", "delta-restore", "state-reconstruction"]
+ });
+ }
+ class HybridPersistence {
+ constructor(backend, config = DEFAULT_HYBRID_CONFIG) {
+ this.backend = backend;
+ this.config = config;
+ }
+ operationCount = 0;
+ lastSnapshotSequence = 0;
+ pendingDeltas = [];
+ persist = (operation) => {
+ const self = this;
+ return Effect.gen(function* () {
+ self.operationCount++;
+ if (self.config.batchDeltas) {
+ self.pendingDeltas.push(operation.delta);
+ }
+ const shouldSnapshot = operation.shouldSnapshot || self.operationCount % self.config.snapshotInterval === 0 || self.operationCount - self.lastSnapshotSequence >= self.config.maxDeltasBeforeSnapshot;
+ if (shouldSnapshot) {
+ yield* self.saveSnapshot(operation);
+ } else {
+ yield* self.saveDelta(operation);
+ }
+ if (self.config.batchDeltas && self.pendingDeltas.length >= self.config.deltaBatchSize) {
+ yield* self.flushPendingDeltas();
+ }
+ });
+ };
+ saveSnapshot = (operation) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!self.backend.saveSnapshot) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support snapshots`,
+ operation: "saveSnapshot"
+ })
+ );
+ }
+ yield* self.backend.saveSnapshot(
+ operation.resultingState.key,
+ operation.resultingState,
+ operation.delta.sequence
+ );
+ self.lastSnapshotSequence = operation.delta.sequence;
+ if (self.config.compactionEnabled && self.backend.compactDeltas) {
+ yield* self.backend.compactDeltas(
+ operation.resultingState.key,
+ operation.delta.sequence
+ );
+ }
+ self.pendingDeltas = [];
+ });
+ };
+ saveDelta = (operation) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!self.config.batchDeltas) {
+ if (!self.backend.saveDelta) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support delta persistence`,
+ operation: "saveDelta"
+ })
+ );
+ }
+ yield* self.backend.saveDelta(operation.delta);
+ }
+ });
+ };
+ flushPendingDeltas = () => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (self.pendingDeltas.length === 0) return;
+ if (self.backend.saveDeltas) {
+ yield* self.backend.saveDeltas([...self.pendingDeltas]);
+ } else if (self.backend.saveDelta) {
+ for (const delta of self.pendingDeltas) {
+ yield* self.backend.saveDelta(delta);
+ }
+ } else {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support delta persistence`,
+ operation: "flushPendingDeltas"
+ })
+ );
+ }
+ self.pendingDeltas = [];
+ });
+ };
+ restore = (key) => {
+ const self = this;
+ return Effect.gen(function* () {
+ let baseState = null;
+ let fromSequence = 0;
+ if (self.backend.loadLatestSnapshot) {
+ const snapshot = yield* self.backend.loadLatestSnapshot(key);
+ if (snapshot) {
+ baseState = snapshot.state;
+ fromSequence = snapshot.sequence + 1;
+ }
+ }
+ if (!self.backend.loadDeltas) {
+ if (baseState) {
+ return baseState;
+ }
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${self.backend.name} does not support delta loading`,
+ operation: "restore"
+ })
+ );
+ }
+ const deltas = yield* self.backend.loadDeltas(key, fromSequence);
+ if (!baseState && deltas.length === 0) {
+ return null;
+ }
+ if (deltas.length === 0) {
+ return baseState;
+ }
+ return yield* self.applyDeltasToState(key, baseState, deltas);
+ });
+ };
+ applyDeltasToState = (key, baseState, deltas) => {
+ const self = this;
+ return Effect.gen(function* () {
+ if (!baseState) {
+ const deltaStrategy = new DeltaPersistence(self.backend);
+ return yield* deltaStrategy.reconstructStateFromDeltas(key, deltas);
+ }
+ const sortedDeltas = [...deltas].sort((a, b) => a.sequence - b.sequence);
+ const pendingRequests = [...baseState.pendingRequests];
+ const visitedFingerprints = [...baseState.visitedFingerprints];
+ let totalProcessed = baseState.totalProcessed;
+ for (const delta of sortedDeltas) {
+ switch (delta.operation.type) {
+ case "enqueue":
+ pendingRequests.push(delta.operation.request);
+ break;
+ case "dequeue": {
+ const operation = delta.operation;
+ if (operation.type === "dequeue") {
+ const dequeueIndex = pendingRequests.findIndex(
+ (req) => req.fingerprint === operation.fingerprint
+ );
+ if (dequeueIndex >= 0) {
+ pendingRequests.splice(dequeueIndex, 1);
+ totalProcessed++;
+ }
+ }
+ break;
+ }
+ case "mark_visited": {
+ const operation = delta.operation;
+ if (operation.type === "mark_visited") {
+ if (!visitedFingerprints.includes(operation.fingerprint)) {
+ visitedFingerprints.push(operation.fingerprint);
+ }
+ }
+ break;
+ }
+ }
+ }
+ return yield* Effect.tryPromise({
+ try: async () => {
+ const { SpiderState: SpiderState2 } = await Promise.resolve().then(() => SpiderScheduler_service);
+ return new SpiderState2({
+ key,
+ pendingRequests,
+ visitedFingerprints,
+ totalProcessed
+ });
+ },
+ catch: (error) => new PersistenceError2({
+ message: "Failed to import SpiderState",
+ cause: error,
+ operation: "applyDeltasToState"
+ })
+ });
+ });
+ };
+ cleanup = (key) => {
+ const self = this;
+ return Effect.gen(function* () {
+ yield* self.flushPendingDeltas();
+ if (self.backend.deleteState) {
+ yield* self.backend.deleteState(key);
+ }
+ if (self.backend.compactDeltas) {
+ yield* self.backend.compactDeltas(key, Number.MAX_SAFE_INTEGER);
+ }
+ });
+ };
+ getInfo = () => ({
+ name: "HybridPersistence",
+ description: "Combines deltas and snapshots for optimal performance and recovery speed.",
+ capabilities: [
+ "delta-save",
+ "snapshot-save",
+ "batch-deltas",
+ "fast-recovery",
+ "automatic-compaction"
+ ]
+ });
+ }
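+ /*
+  * Illustrative sketch: with DEFAULT_HYBRID_CONFIG, persist() snapshots on
+  * every 1000th operation or once 500 operations have passed since the last
+  * snapshot (or when the operation itself requests one), and flushes batched
+  * deltas in groups of 10.
+  *
+  *   const persistence = new HybridPersistence(backend, {
+  *     snapshotInterval: 1000,
+  *     maxDeltasBeforeSnapshot: 500,
+  *     compactionEnabled: true,
+  *     batchDeltas: true,
+  *     deltaBatchSize: 10
+  *   });
+  *   yield* persistence.persist(operation); // snapshot or delta, per the cadence above
+  */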
+ class ResumabilityService extends Effect.Service()(
+ "@jambudipa.io/ResumabilityService",
+ {
+ effect: Effect.gen(function* () {
+ let strategy = null;
+ let backend = null;
+ const service = {
+ /**
+ * Configure the resumability service with a specific strategy and backend.
+ *
+ * This method initializes the storage backend and creates the appropriate
+ * persistence strategy based on the configuration.
+ *
+ * @param config - Resumability configuration
+ * @returns Effect that completes when configuration is applied
+ */
+ configure: (config) => Effect.gen(function* () {
+ backend = config.backend;
+ yield* backend.initialize();
+ strategy = yield* createStrategy(config);
+ }),
+ /**
+ * Persist a state operation using the configured strategy.
+ *
+ * @param operation - State operation to persist
+ * @returns Effect that completes when operation is persisted
+ */
+ persistOperation: (operation) => Effect.gen(function* () {
+ if (!strategy) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: "ResumabilityService not configured. Call configure() first.",
+ operation: "persistOperation"
+ })
+ );
+ }
+ yield* strategy.persist(operation);
+ }),
+ /**
+ * Restore spider state from persistent storage.
+ *
+ * @param key - State key identifying the session to restore
+ * @returns Effect containing the restored state, or null if not found
+ */
+ restore: (key) => Effect.gen(function* () {
+ if (!strategy) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: "ResumabilityService not configured. Call configure() first.",
+ operation: "restore"
+ })
+ );
+ }
+ return yield* strategy.restore(key);
+ }),
+ /**
+ * Clean up old state data for a session.
+ *
+ * @param key - State key identifying the session to clean up
+ * @returns Effect that completes when cleanup is finished
+ */
+ cleanup: (key) => Effect.gen(function* () {
+ if (!strategy) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: "ResumabilityService not configured. Call configure() first.",
+ operation: "cleanup"
+ })
+ );
+ }
+ yield* strategy.cleanup(key);
+ }),
+ /**
+ * List all available sessions in storage.
+ *
+ * @returns Effect containing array of session keys
+ */
+ listSessions: () => Effect.gen(function* () {
+ if (!backend) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: "ResumabilityService not configured. Call configure() first.",
+ operation: "listSessions"
+ })
+ );
+ }
+ if (!backend.listSessions) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Backend ${backend.name} does not support listing sessions`,
+ operation: "listSessions"
+ })
+ );
+ }
+ return yield* backend.listSessions();
+ }),
+ /**
+ * Get information about the current configuration.
+ *
+ * @returns Information about strategy and backend
+ */
+ getInfo: () => Effect.gen(function* () {
+ if (!strategy || !backend) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: "ResumabilityService not configured. Call configure() first.",
+ operation: "getInfo"
+ })
+ );
+ }
+ return {
+ strategy: strategy.getInfo(),
+ backend: {
+ name: backend.name,
+ capabilities: backend.capabilities
+ }
+ };
+ }),
+ /**
+ * Reconfigure the service with new settings.
+ *
+ * This will clean up the current backend and reinitialize with new config.
+ *
+ * @param config - New configuration
+ * @returns Effect that completes when reconfiguration is finished
+ */
+ reconfigure: (config) => Effect.gen(function* () {
+ if (backend) {
+ yield* backend.cleanup();
+ }
+ yield* service.configure(config);
+ })
+ };
+ return service;
+ })
+ }
+ ) {
+ /**
+ * Create a ResumabilityService layer from configuration.
+ *
+ * This is the primary way to create and configure the ResumabilityService.
+ *
+ * @param config - Resumability configuration
+ * @returns Effect yielding the configured ResumabilityService
+ */
+ static fromConfig = (config) => Effect.gen(function* () {
+ const service = yield* ResumabilityService;
+ yield* service.configure(config);
+ return service;
+ }).pipe(Effect.provide(ResumabilityService.Default));
+ }
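+ /*
+  * Illustrative sketch: wiring the service from a file-backed configuration.
+  * `ResumabilityConfigs.file` (defined below) defaults to the 'auto' strategy,
+  * which picks hybrid, delta or full-state from the backend's capabilities.
+  *
+  *   const service = yield* ResumabilityService.fromConfig(
+  *     ResumabilityConfigs.file('./spider-state')
+  *   );
+  *   const sessions = yield* service.listSessions();
+  */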
+ const createStrategy = (config) => Effect.gen(function* () {
+ const { strategy: strategyType, backend, hybridConfig } = config;
+ switch (strategyType) {
+ case "full-state":
+ return new FullStatePersistence(backend);
+ case "delta":
+ return new DeltaPersistence(backend);
+ case "hybrid":
+ return new HybridPersistence(
+ backend,
+ hybridConfig || DEFAULT_HYBRID_CONFIG
+ );
+ case "auto": {
+ const capabilities = backend.capabilities;
+ if (capabilities.supportsDelta && capabilities.supportsSnapshot) {
+ return new HybridPersistence(
+ backend,
+ hybridConfig || DEFAULT_HYBRID_CONFIG
+ );
+ } else if (capabilities.supportsDelta) {
+ return new DeltaPersistence(backend);
+ } else {
+ return new FullStatePersistence(backend);
+ }
+ }
+ default:
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Unknown strategy type: ${strategyType}`,
+ operation: "createStrategy"
+ })
+ );
+ }
+ });
+ const createStateOperation = (delta, resultingState, shouldSnapshot = false) => ({
+ delta,
+ resultingState,
+ shouldSnapshot
+ });
+ const ResumabilityConfigs = {
+ /**
+ * Create a file-based configuration.
+ *
+ * @param baseDir - Directory to store state files
+ * @param strategy - Persistence strategy (defaults to 'auto')
+ * @returns ResumabilityConfig
+ */
+ file: (baseDir, strategy = "auto") => ({
+ strategy,
+ backend: new (require("./backends/FileStorageBackend.js")).FileStorageBackend(
+ baseDir
+ )
+ }),
+ /**
+ * Create a Redis-based configuration.
+ *
+ * @param redisClient - Redis client instance
+ * @param strategy - Persistence strategy (defaults to 'hybrid')
+ * @param keyPrefix - Redis key prefix (defaults to 'spider')
+ * @returns ResumabilityConfig
+ */
+ redis: (redisClient, strategy = "hybrid", keyPrefix = "spider") => ({
+ strategy,
+ backend: new (require("./backends/RedisStorageBackend.js")).RedisStorageBackend(
+ redisClient,
+ keyPrefix
+ )
+ }),
+ /**
+ * Create a PostgreSQL-based configuration.
+ *
+ * @param dbClient - Database client instance
+ * @param strategy - Persistence strategy (defaults to 'hybrid')
+ * @param config - PostgreSQL configuration
+ * @returns ResumabilityConfig
+ */
+ postgres: (dbClient, strategy = "hybrid", config) => ({
+ strategy,
+ backend: new (require("./backends/PostgresStorageBackend.js")).PostgresStorageBackend(
+ dbClient,
+ config
+ )
+ })
+ };
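+ /*
+  * Illustrative sketch: the three factories above differ mainly in default
+  * strategy and the client they expect. The client instances and the
+  * PostgreSQL options object are hypothetical.
+  *
+  *   const fileConfig = ResumabilityConfigs.file('./spider-state');               // strategy: 'auto'
+  *   const redisConfig = ResumabilityConfigs.redis(redisClient);                  // strategy: 'hybrid'
+  *   const pgConfig = ResumabilityConfigs.postgres(pgClient, 'hybrid', pgOptions); // strategy: 'hybrid'
+  */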
+ class FileStorageBackend {
+ constructor(baseDir) {
+ this.baseDir = baseDir;
+ }
+ capabilities = {
+ supportsDelta: true,
+ supportsSnapshot: true,
+ supportsStreaming: false,
+ supportsConcurrency: false,
+ // File system isn't great for concurrent access
+ latency: "low"
+ };
+ name = "FileStorageBackend";
+ initialize = () => {
+ const self = this;
+ return Effect.gen(function* () {
+ yield* Effect.tryPromise({
+ try: () => fs$1.mkdir(self.baseDir, { recursive: true }),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to initialize file storage: ${error}`,
+ cause: error,
+ operation: "initialize"
+ })
+ });
+ yield* Effect.tryPromise({
+ try: () => fs$1.mkdir(path.join(self.baseDir, "sessions"), { recursive: true }),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to initialize file storage: ${error}`,
+ cause: error,
+ operation: "initialize"
+ })
+ });
+ });
+ };
+ cleanup = () => Effect.succeed(void 0);
+ // No cleanup needed for file backend
+ // Full state operations
+ saveState = (key, state) => {
+ const self = this;
+ return Effect.gen(function* () {
+ const sessionDir = self.getSessionDir(key);
+ const statePath = path.join(sessionDir, "state.json");
+ yield* Effect.tryPromise({
+ try: () => fs$1.mkdir(sessionDir, { recursive: true }),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to create session directory: ${error}`,
+ cause: error,
+ operation: "saveState"
+ })
+ });
+ const encoded = Schema.encodeSync(SpiderState)(state);
+ yield* Effect.tryPromise({
+ try: () => fs$1.writeFile(statePath, JSON.stringify(encoded, null, 2), "utf8"),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to save state: ${error}`,
+ cause: error,
+ operation: "saveState"
+ })
+ });
+ });
+ };
+ loadState = (key) => {
+ const self = this;
+ return Effect.gen(function* () {
+ const sessionDir = self.getSessionDir(key);
+ const statePath = path.join(sessionDir, "state.json");
+ const result = yield* Effect.tryPromise(
+ () => fs$1.readFile(statePath, "utf8")
+ ).pipe(
+ Effect.catchAll((error) => {
+ if (error.code === "ENOENT") {
+ return Effect.succeed(null);
+ }
+ return Effect.fail(
+ new PersistenceError2({
+ message: `Failed to load state: ${error}`,
+ cause: error,
+ operation: "loadState"
+ })
+ );
+ })
+ );
+ if (result === null) {
+ return null;
+ }
+ try {
+ const parsed = JSON.parse(result);
+ const decoded = Schema.decodeUnknownSync(SpiderState)(parsed);
+ return decoded;
+ } catch (error) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Failed to parse state: ${error}`,
+ cause: error,
+ operation: "loadState"
+ })
+ );
+ }
+ });
+ };
+ deleteState = (key) => {
+ const self = this;
+ return Effect.gen(function* () {
+ const sessionDir = self.getSessionDir(key);
+ yield* Effect.tryPromise({
+ try: () => fs$1.rm(sessionDir, { recursive: true, force: true }),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to delete state: ${error}`,
+ cause: error,
+ operation: "deleteState"
+ })
+ });
+ });
+ };
+ // Delta operations
+ saveDelta = (delta) => {
+ const self = this;
+ return Effect.gen(function* () {
+ const sessionDir = path.join(self.baseDir, "sessions", delta.stateKey);
+ const deltasDir = path.join(sessionDir, "deltas");
+ const deltaPath = path.join(
+ deltasDir,
+ `${delta.sequence.toString().padStart(6, "0")}.json`
+ );
+ yield* Effect.tryPromise({
+ try: () => fs$1.mkdir(deltasDir, { recursive: true }),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to create deltas directory: ${error}`,
+ cause: error,
+ operation: "saveDelta"
+ })
+ });
+ const encoded = Schema.encodeSync(StateDelta)(delta);
+ yield* Effect.tryPromise({
+ try: () => fs$1.writeFile(deltaPath, JSON.stringify(encoded, null, 2), "utf8"),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to save delta: ${error}`,
+ cause: error,
+ operation: "saveDelta"
+ })
+ });
+ });
+ };
+ saveDeltas = (deltas) => {
+ const self = this;
+ return Effect.gen(function* () {
+ for (const delta of deltas) {
+ yield* self.saveDelta(delta);
+ }
+ });
+ };
+ loadDeltas = (key, fromSequence = 0) => {
+ const self = this;
+ return Effect.gen(function* () {
+ const deltasDir = path.join(self.getSessionDir(key), "deltas");
+ const files = yield* Effect.tryPromise(() => fs$1.readdir(deltasDir)).pipe(
+ Effect.catchAll((error) => {
+ if (error.code === "ENOENT") {
+ return Effect.succeed([]);
+ }
+ return Effect.fail(
+ new PersistenceError2({
+ message: `Failed to read deltas directory: ${error}`,
+ cause: error,
+ operation: "loadDeltas"
+ })
+ );
+ })
+ );
+ if (files.length === 0) {
+ return [];
+ }
+ const deltaFiles = files.filter((f) => f.endsWith(".json")).map((f) => ({
+ file: f,
+ sequence: parseInt(f.replace(".json", ""), 10)
+ })).filter(({ sequence }) => sequence >= fromSequence).sort((a, b) => a.sequence - b.sequence);
+ const deltas = [];
+ for (const { file } of deltaFiles) {
+ const content = yield* Effect.tryPromise({
+ try: () => fs$1.readFile(path.join(deltasDir, file), "utf8"),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to read delta file ${file}: ${error}`,
+ cause: error,
+ operation: "loadDeltas"
+ })
+ });
+ try {
+ const parsed = JSON.parse(content);
+ const decoded = Schema.decodeUnknownSync(StateDelta)(parsed);
+ deltas.push(decoded);
+ } catch (error) {
+ return yield* Effect.fail(
+ new PersistenceError2({
+ message: `Failed to parse delta file ${file}: ${error}`,
+ cause: error,
+ operation: "loadDeltas"
+ })
+ );
+ }
+ }
+ return deltas;
+ });
+ };
+ // Snapshot operations
+ saveSnapshot = (key, state, sequence) => {
+ const self = this;
+ return Effect.gen(function* () {
+ const sessionDir = self.getSessionDir(key);
+ const snapshotPath = path.join(sessionDir, "snapshot.json");
+ yield* Effect.tryPromise({
+ try: () => fs$1.mkdir(sessionDir, { recursive: true }),
+ catch: (error) => new PersistenceError2({
+ message: `Failed to create session directory: ${error}`,
+ cause: error,
+ operation: "saveSnapshot"
+ })
+ });
+ const snapshotData = {
+ state: Schema.encodeSync(SpiderState)(state),
+ sequence,
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
3421
+ };
3422
+ yield* Effect.tryPromise({
3423
+ try: () => fs$1.writeFile(
3424
+ snapshotPath,
3425
+ JSON.stringify(snapshotData, null, 2),
3426
+ "utf8"
3427
+ ),
3428
+ catch: (error) => new PersistenceError2({
3429
+ message: `Failed to save snapshot: ${error}`,
3430
+ cause: error,
3431
+ operation: "saveSnapshot"
3432
+ })
3433
+ });
3434
+ });
3435
+ };
3436
+ loadLatestSnapshot = (key) => {
3437
+ const self = this;
3438
+ return Effect.gen(function* () {
3439
+ const sessionDir = self.getSessionDir(key);
3440
+ const snapshotPath = path.join(sessionDir, "snapshot.json");
3441
+ const content = yield* Effect.tryPromise(
3442
+ () => fs$1.readFile(snapshotPath, "utf8")
3443
+ ).pipe(
3444
+ Effect.catchAll((error) => {
3445
+ if (error.code === "ENOENT") {
3446
+ return Effect.succeed(null);
3447
+ }
3448
+ return Effect.fail(
3449
+ new PersistenceError2({
3450
+ message: `Failed to load snapshot: ${error}`,
3451
+ cause: error,
3452
+ operation: "loadLatestSnapshot"
3453
+ })
3454
+ );
3455
+ })
3456
+ );
3457
+ if (content === null) {
3458
+ return null;
3459
+ }
3460
+ try {
3461
+ const parsed = JSON.parse(content);
3462
+ const state = Schema.decodeUnknownSync(SpiderState)(parsed.state);
3463
+ return {
3464
+ state,
3465
+ sequence: Number(parsed.sequence)
3466
+ };
3467
+ } catch (error) {
3468
+ return yield* Effect.fail(
3469
+ new PersistenceError2({
3470
+ message: `Failed to parse snapshot: ${error}`,
3471
+ cause: error,
3472
+ operation: "loadLatestSnapshot"
3473
+ })
3474
+ );
3475
+ }
3476
+ });
3477
+ };
3478
+ // Cleanup operations
3479
+ compactDeltas = (key, beforeSequence) => {
3480
+ const self = this;
3481
+ return Effect.gen(function* () {
3482
+ const deltasDir = path.join(self.getSessionDir(key), "deltas");
3483
+ const files = yield* Effect.tryPromise(() => fs$1.readdir(deltasDir)).pipe(
3484
+ Effect.catchAll((error) => {
3485
+ if (error.code === "ENOENT") {
3486
+ return Effect.succeed([]);
3487
+ }
3488
+ return Effect.fail(
3489
+ new PersistenceError2({
3490
+ message: `Failed to read deltas directory: ${error}`,
3491
+ cause: error,
3492
+ operation: "compactDeltas"
3493
+ })
3494
+ );
3495
+ })
3496
+ );
3497
+ if (files.length === 0) {
3498
+ return;
3499
+ }
3500
+ const deltaFiles = files.filter((f) => f.endsWith(".json")).map((f) => ({
3501
+ file: f,
3502
+ sequence: parseInt(f.replace(".json", ""), 10)
3503
+ })).filter(({ sequence }) => sequence < beforeSequence);
3504
+ for (const { file } of deltaFiles) {
3505
+ yield* Effect.tryPromise({
3506
+ try: () => fs$1.unlink(path.join(deltasDir, file)),
3507
+ catch: (error) => new PersistenceError2({
3508
+ message: `Failed to delete delta file ${file}: ${error}`,
3509
+ cause: error,
3510
+ operation: "compactDeltas"
3511
+ })
3512
+ });
3513
+ }
3514
+ });
3515
+ };
3516
+ listSessions = () => {
3517
+ const self = this;
3518
+ return Effect.gen(function* () {
3519
+ const sessionsDir = path.join(self.baseDir, "sessions");
3520
+ const dirs = yield* Effect.tryPromise(() => fs$1.readdir(sessionsDir)).pipe(
3521
+ Effect.catchAll((error) => {
3522
+ if (error.code === "ENOENT") {
3523
+ return Effect.succeed([]);
3524
+ }
3525
+ return Effect.fail(
3526
+ new PersistenceError2({
3527
+ message: `Failed to read sessions directory: ${error}`,
3528
+ cause: error,
3529
+ operation: "listSessions"
3530
+ })
3531
+ );
3532
+ })
3533
+ );
3534
+ if (dirs.length === 0) {
3535
+ return [];
3536
+ }
3537
+ const sessions = [];
3538
+ for (const dir of dirs) {
3539
+ const sessionDir = path.join(sessionsDir, dir);
3540
+ const statePath = path.join(sessionDir, "state.json");
3541
+ const content = yield* Effect.tryPromise(
3542
+ () => fs$1.readFile(statePath, "utf8")
3543
+ ).pipe(Effect.catchAll(() => Effect.succeed(null)));
3544
+ if (content === null) {
3545
+ continue;
3546
+ }
3547
+ try {
3548
+ const parsed = JSON.parse(content);
3549
+ Schema.decodeUnknownSync(SpiderState)(parsed);
3550
+ sessions.push({ id: dir, name: dir, timestamp: /* @__PURE__ */ new Date() });
3551
+ } catch {
3552
+ continue;
3553
+ }
3554
+ }
3555
+ return sessions;
3556
+ });
3557
+ };
3558
+ getSessionDir = (key) => {
3559
+ return path.join(this.baseDir, "sessions", key.id);
3560
+ };
3561
+ }
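// --- Editorial example (not part of the published bundle) ---
// A minimal resumability sketch for FileStorageBackend. It assumes the constructor
// takes the base directory (defined earlier in this file) and that a state key only
// needs the `id` field that getSessionDir() reads; the real SpiderStateKey may carry
// more fields.
import { Effect } from "effect";
import { FileStorageBackend } from "@jambudipa/spider";

const inspectSessions = Effect.gen(function* () {
  const backend = new FileStorageBackend("./.spider-state"); // baseDir: assumed ctor arg
  yield* backend.initialize();                    // creates <baseDir>/ and <baseDir>/sessions/
  const sessions = yield* backend.listSessions(); // one entry per sessions/<id>/state.json
  console.log(sessions.map((s) => s.id));
  const state = yield* backend.loadState({ id: "my-crawl" }); // null when no state file exists
  console.log(state ? "resumable state found" : "fresh start");
});

Effect.runPromise(inspectSessions);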
3562
+ class CookieManager extends Context.Tag("CookieManager")() {
3563
+ }
3564
+ const makeCookieManager = () => Effect.gen(function* () {
3565
+ const jar = new CookieJar();
3566
+ const jarRef = yield* Ref.make(jar);
3567
+ return {
3568
+ setCookie: (cookieString, url) => Effect.gen(function* () {
3569
+ const jar2 = yield* Ref.get(jarRef);
3570
+ yield* Effect.tryPromise({
3571
+ try: () => new Promise((resolve, reject) => {
3572
+ jar2.setCookie(cookieString, url, (err) => {
3573
+ if (err) reject(err);
3574
+ else resolve();
3575
+ });
3576
+ }),
3577
+ catch: (error) => new Error(`Failed to set cookie: ${error}`)
3578
+ });
3579
+ }),
3580
+ getCookies: (url) => Effect.gen(function* () {
3581
+ const jar2 = yield* Ref.get(jarRef);
3582
+ const cookies = yield* Effect.tryPromise({
3583
+ try: () => new Promise((resolve, reject) => {
3584
+ jar2.getCookies(url, (err, cookies2) => {
3585
+ if (err) reject(err);
3586
+ else resolve(cookies2 || []);
3587
+ });
3588
+ }),
3589
+ catch: () => new Error(`Failed to get cookies for ${url}`)
3590
+ });
3591
+ return cookies.map((cookie) => cookie.toString());
3592
+ }).pipe(Effect.orElseSucceed(() => [])),
3593
+ getCookieHeader: (url) => Effect.gen(function* () {
3594
+ const jar2 = yield* Ref.get(jarRef);
3595
+ const cookieHeader = yield* Effect.tryPromise({
3596
+ try: () => new Promise((resolve, reject) => {
3597
+ jar2.getCookieString(url, (err, cookies) => {
3598
+ if (err) reject(err);
3599
+ else resolve(cookies || null);
3600
+ });
3601
+ }),
3602
+ catch: () => null
3603
+ });
3604
+ return cookieHeader;
3605
+ }).pipe(Effect.orElseSucceed(() => null)),
3606
+ clearCookies: () => Effect.gen(function* () {
3607
+ const newJar = new CookieJar();
3608
+ yield* Ref.set(jarRef, newJar);
3609
+ }),
3610
+ serialize: () => Effect.gen(function* () {
3611
+ const jar2 = yield* Ref.get(jarRef);
3612
+ const serialized = yield* Effect.tryPromise({
3613
+ try: () => new Promise((resolve, reject) => {
3614
+ jar2.serialize((err, serializedObject) => {
3615
+ if (err) reject(err);
3616
+ else resolve(serializedObject);
3617
+ });
3618
+ }),
3619
+ catch: () => new Error("Failed to serialize cookies")
3620
+ });
3621
+ return JSON.stringify(serialized);
3622
+ }).pipe(Effect.orElseSucceed(() => "{}")),
3623
+ deserialize: (data) => Effect.gen(function* () {
3624
+ try {
3625
+ const parsed = JSON.parse(data);
3626
+ const newJar = CookieJar.deserialize(parsed);
3627
+ yield* Effect.tryPromise({
3628
+ try: () => Promise.resolve(newJar),
3629
+ catch: () => new Error("Failed to deserialize cookie jar")
3630
+ }).pipe(Effect.flatMap((jar2) => Ref.set(jarRef, jar2)));
3631
+ } catch (error) {
3632
+ yield* Effect.fail(new Error(`Invalid cookie data: ${error}`));
3633
+ }
3634
+ })
3635
+ };
3636
+ });
3637
+ const CookieManagerLive = Layer.effect(
3638
+ CookieManager,
3639
+ makeCookieManager()
3640
+ );
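// --- Editorial example (not part of the published bundle) ---
// The cookie jar as a standalone service: makeCookieManager has no dependencies, so
// the factory can be run directly. The URL and cookie values are illustrative.
import { Effect } from "effect";
import { makeCookieManager } from "@jambudipa/spider";

const cookieDemo = Effect.gen(function* () {
  const cookies = yield* makeCookieManager();
  yield* cookies.setCookie("theme=dark; Path=/", "https://example.com/");
  console.log(yield* cookies.getCookieHeader("https://example.com/")); // "theme=dark"
  const snapshot = yield* cookies.serialize(); // JSON string, restorable via deserialize()
  yield* cookies.clearCookies();
  yield* cookies.deserialize(snapshot);        // jar is back to the snapshotted state
});

Effect.runPromise(cookieDemo);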
3641
+ class EnhancedHttpClient extends Context.Tag("EnhancedHttpClient")() {
3642
+ }
3643
+ const makeEnhancedHttpClient = Effect.gen(function* () {
3644
+ const logger = yield* SpiderLogger;
3645
+ const cookieManager = yield* CookieManager;
3646
+ const makeRequest = (url, options = {}) => Effect.gen(function* () {
3647
+ const startMs = Date.now();
3648
+ const domain = new URL(url).hostname;
3649
+ const cookieHeader = yield* cookieManager.getCookieHeader(url);
3650
+ const headers = {
3651
+ "User-Agent": "Mozilla/5.0 (compatible; Spider/1.0)",
3652
+ ...options.headers
3653
+ };
3654
+ if (cookieHeader && !headers["Cookie"]) {
3655
+ headers["Cookie"] = cookieHeader;
3656
+ }
3657
+ if (options.method === "POST" && options.body && !headers["Content-Type"]) {
3658
+ if (typeof options.body === "string") {
3659
+ try {
3660
+ JSON.parse(options.body);
3661
+ headers["Content-Type"] = "application/json";
3662
+ } catch {
3663
+ headers["Content-Type"] = "application/x-www-form-urlencoded";
3664
+ }
3665
+ } else if (options.body instanceof FormData) ;
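  // (editorial note: the empty branch above is intentional; leaving Content-Type unset for FormData bodies lets fetch add the multipart boundary itself)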
3666
+ else if (options.body instanceof URLSearchParams) {
3667
+ headers["Content-Type"] = "application/x-www-form-urlencoded";
3668
+ }
3669
+ }
3670
+ const controller = new AbortController();
3671
+ const timeoutMs = options.timeout || 3e4;
3672
+ const timeoutId = setTimeout(() => {
3673
+ const duration = Date.now() - startMs;
3674
+ Effect.runSync(
3675
+ logger.logEdgeCase(domain, "http_request_abort", {
3676
+ url,
3677
+ method: options.method || "GET",
3678
+ durationMs: duration,
3679
+ reason: "timeout",
3680
+ timeoutMs
3681
+ })
3682
+ );
3683
+ controller.abort();
3684
+ }, timeoutMs);
3685
+ const response = yield* Effect.tryPromise({
3686
+ try: async () => {
3687
+ const resp = await fetch(url, {
3688
+ method: options.method || "GET",
3689
+ headers,
3690
+ body: options.body,
3691
+ signal: controller.signal,
3692
+ redirect: options.followRedirects === false ? "manual" : "follow",
3693
+ credentials: options.credentials || "same-origin"
3694
+ });
3695
+ clearTimeout(timeoutId);
3696
+ return resp;
3697
+ },
3698
+ catch: (error) => {
3699
+ clearTimeout(timeoutId);
3700
+ return NetworkError.fromCause(url, error);
3701
+ }
3702
+ });
3703
+ const body = yield* Effect.tryPromise({
3704
+ try: () => response.text(),
3705
+ catch: (error) => ResponseError.fromCause(url, error)
3706
+ });
3707
+ const setCookieHeaders = response.headers.getSetCookie ? response.headers.getSetCookie() : response.headers.get("set-cookie")?.split(", ") || [];
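  // (editorial note: the split(", ") fallback is for runtimes without Headers.getSetCookie(); it can mis-split cookies whose Expires attribute itself contains a comma)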
3708
+ for (const cookieString of setCookieHeaders) {
3709
+ if (cookieString) {
3710
+ yield* cookieManager.setCookie(cookieString, url).pipe(Effect.catchAll(() => Effect.void));
3711
+ }
3712
+ }
3713
+ const responseHeaders = {};
3714
+ response.headers.forEach((value, key) => {
3715
+ responseHeaders[key] = value;
3716
+ });
3717
+ return {
3718
+ url: response.url,
3719
+ status: response.status,
3720
+ statusText: response.statusText,
3721
+ headers: responseHeaders,
3722
+ body,
3723
+ cookies: setCookieHeaders
3724
+ };
3725
+ });
3726
+ return {
3727
+ get: (url, options) => makeRequest(url, { ...options, method: "GET" }),
3728
+ post: (url, data, options) => Effect.gen(function* () {
3729
+ let body;
3730
+ if (data) {
3731
+ if (typeof data === "string" || data instanceof FormData || data instanceof URLSearchParams) {
3732
+ body = data;
3733
+ } else {
3734
+ body = JSON.stringify(data);
3735
+ }
3736
+ }
3737
+ return yield* makeRequest(url, { ...options, method: "POST", body });
3738
+ }),
3739
+ request: makeRequest,
3740
+ submitForm: (url, formData, options) => Effect.gen(function* () {
3741
+ const params = new URLSearchParams();
3742
+ for (const [key, value] of Object.entries(formData)) {
3743
+ params.append(key, value);
3744
+ }
3745
+ return yield* makeRequest(url, {
3746
+ ...options,
3747
+ method: "POST",
3748
+ body: params,
3749
+ headers: {
3750
+ "Content-Type": "application/x-www-form-urlencoded",
3751
+ ...options?.headers
3752
+ }
3753
+ });
3754
+ })
3755
+ };
3756
+ });
3757
+ const EnhancedHttpClientLive = Layer.effect(
3758
+ EnhancedHttpClient,
3759
+ makeEnhancedHttpClient
3760
+ );
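// --- Editorial example (not part of the published bundle) ---
// A cookie-aware GET through the Layer graph. EnhancedHttpClientLive reads SpiderLogger
// and CookieManager from context, so both are provided; this assumes SpiderLoggerLive
// needs no further context.
import { Effect, Layer } from "effect";
import {
  CookieManagerLive,
  EnhancedHttpClient,
  EnhancedHttpClientLive,
  SpiderLoggerLive
} from "@jambudipa/spider";

const fetchOnce = Effect.gen(function* () {
  const http = yield* EnhancedHttpClient;
  const res = yield* http.get("https://example.com/", { timeout: 10_000 });
  console.log(res.status, res.headers["content-type"]);
  // any Set-Cookie headers were already folded into the shared jar for later requests
});

const HttpLive = EnhancedHttpClientLive.pipe(
  Layer.provide(Layer.mergeAll(CookieManagerLive, SpiderLoggerLive))
);

Effect.runPromise(Effect.provide(fetchOnce, HttpLive));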
3761
+ class SessionStore extends Context.Tag("SessionStore")() {
3762
+ }
3763
+ const makeSessionStore = Effect.gen(function* () {
3764
+ const cookieManager = yield* CookieManager;
3765
+ const sessions = yield* Ref.make(/* @__PURE__ */ new Map());
3766
+ const currentSessionId = yield* Ref.make(
3767
+ Option.none()
3768
+ );
3769
+ const generateSessionId = () => `session_${Date.now()}_${Math.random().toString(36).substring(2, 9)}`;
3770
+ return {
3771
+ createSession: (id) => Effect.gen(function* () {
3772
+ const sessionId = id || generateSessionId();
3773
+ const cookiesString = yield* cookieManager.serialize();
3774
+ const session = {
3775
+ id: sessionId,
3776
+ cookies: cookiesString,
3777
+ tokens: /* @__PURE__ */ new Map(),
3778
+ createdAt: /* @__PURE__ */ new Date(),
3779
+ lastUsedAt: /* @__PURE__ */ new Date(),
3780
+ expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1e3)
3781
+ // 24 hours
3782
+ };
3783
+ const sessionsMap = yield* Ref.get(sessions);
3784
+ sessionsMap.set(sessionId, session);
3785
+ yield* Ref.set(sessions, sessionsMap);
3786
+ yield* Ref.set(currentSessionId, Option.some(sessionId));
3787
+ return session;
3788
+ }),
3789
+ getCurrentSession: () => Effect.gen(function* () {
3790
+ const sessionId = yield* Ref.get(currentSessionId);
3791
+ if (Option.isNone(sessionId)) {
3792
+ return Option.none();
3793
+ }
3794
+ const sessionsMap = yield* Ref.get(sessions);
3795
+ const session = sessionsMap.get(sessionId.value);
3796
+ if (!session) {
3797
+ return Option.none();
3798
+ }
3799
+ session.lastUsedAt = /* @__PURE__ */ new Date();
3800
+ sessionsMap.set(sessionId.value, session);
3801
+ yield* Ref.set(sessions, sessionsMap);
3802
+ return Option.some(session);
3803
+ }),
3804
+ loadSession: (id) => Effect.gen(function* () {
3805
+ const sessionsMap = yield* Ref.get(sessions);
3806
+ const session = sessionsMap.get(id);
3807
+ if (!session) {
3808
+ return yield* Effect.fail(new Error(`Session ${id} not found`));
3809
+ }
3810
+ if (session.expiresAt && session.expiresAt < /* @__PURE__ */ new Date()) {
3811
+ return yield* Effect.fail(new Error(`Session ${id} has expired`));
3812
+ }
3813
+ yield* cookieManager.deserialize(session.cookies);
3814
+ yield* Ref.set(currentSessionId, Option.some(id));
3815
+ session.lastUsedAt = /* @__PURE__ */ new Date();
3816
+ sessionsMap.set(id, session);
3817
+ yield* Ref.set(sessions, sessionsMap);
3818
+ }),
3819
+ saveSession: () => Effect.gen(function* () {
3820
+ const sessionId = yield* Ref.get(currentSessionId);
3821
+ if (Option.isNone(sessionId)) {
3822
+ const newSession = yield* Effect.sync(() => generateSessionId());
3823
+ yield* Ref.set(currentSessionId, Option.some(newSession));
3824
+ const session2 = yield* Effect.succeed({
3825
+ id: newSession,
3826
+ cookies: yield* cookieManager.serialize(),
3827
+ tokens: /* @__PURE__ */ new Map(),
3828
+ createdAt: /* @__PURE__ */ new Date(),
3829
+ lastUsedAt: /* @__PURE__ */ new Date(),
3830
+ expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1e3)
3831
+ });
3832
+ const sessionsMap2 = yield* Ref.get(sessions);
3833
+ sessionsMap2.set(newSession, session2);
3834
+ yield* Ref.set(sessions, sessionsMap2);
3835
+ return newSession;
3836
+ }
3837
+ const sessionsMap = yield* Ref.get(sessions);
3838
+ const session = sessionsMap.get(sessionId.value);
3839
+ if (!session) {
3840
+ return yield* Effect.fail(new Error("No active session to save"));
3841
+ }
3842
+ session.cookies = yield* cookieManager.serialize();
3843
+ session.lastUsedAt = /* @__PURE__ */ new Date();
3844
+ sessionsMap.set(sessionId.value, session);
3845
+ yield* Ref.set(sessions, sessionsMap);
3846
+ return sessionId.value;
3847
+ }),
3848
+ clearSession: () => Effect.gen(function* () {
3849
+ const sessionId = yield* Ref.get(currentSessionId);
3850
+ if (Option.isSome(sessionId)) {
3851
+ const sessionsMap = yield* Ref.get(sessions);
3852
+ sessionsMap.delete(sessionId.value);
3853
+ yield* Ref.set(sessions, sessionsMap);
3854
+ }
3855
+ yield* Ref.set(currentSessionId, Option.none());
3856
+ yield* cookieManager.clearCookies();
3857
+ }),
3858
+ isSessionValid: () => Effect.gen(function* () {
3859
+ const session = yield* Effect.gen(function* () {
3860
+ const sessionId = yield* Ref.get(currentSessionId);
3861
+ if (Option.isNone(sessionId)) return null;
3862
+ const sessionsMap = yield* Ref.get(sessions);
3863
+ return sessionsMap.get(sessionId.value) || null;
3864
+ });
3865
+ if (!session) return false;
3866
+ if (session.expiresAt && session.expiresAt < /* @__PURE__ */ new Date()) {
3867
+ return false;
3868
+ }
3869
+ return true;
3870
+ }),
3871
+ updateSessionData: (data) => Effect.gen(function* () {
3872
+ const sessionId = yield* Ref.get(currentSessionId);
3873
+ if (Option.isNone(sessionId)) {
3874
+ return yield* Effect.fail(new Error("No active session"));
3875
+ }
3876
+ const sessionsMap = yield* Ref.get(sessions);
3877
+ const session = sessionsMap.get(sessionId.value);
3878
+ if (!session) {
3879
+ return yield* Effect.fail(new Error("Session not found"));
3880
+ }
3881
+ session.userData = { ...session.userData, ...data };
3882
+ session.lastUsedAt = /* @__PURE__ */ new Date();
3883
+ sessionsMap.set(sessionId.value, session);
3884
+ yield* Ref.set(sessions, sessionsMap);
3885
+ }),
3886
+ exportSession: () => Effect.gen(function* () {
3887
+ const sessionId = yield* Ref.get(currentSessionId);
3888
+ if (Option.isNone(sessionId)) {
3889
+ return yield* Effect.fail(new Error("No active session to export"));
3890
+ }
3891
+ const sessionsMap = yield* Ref.get(sessions);
3892
+ const session = sessionsMap.get(sessionId.value);
3893
+ if (!session) {
3894
+ return yield* Effect.fail(new Error("Session not found"));
3895
+ }
3896
+ const tokensArray = Array.from(session.tokens.entries());
3897
+ return JSON.stringify({
3898
+ ...session,
3899
+ tokens: tokensArray
3900
+ });
3901
+ }),
3902
+ importSession: (data) => Effect.gen(function* () {
3903
+ try {
3904
+ const parsed = JSON.parse(data);
3905
+ const session = {
3906
+ ...parsed,
3907
+ tokens: new Map(parsed.tokens || []),
3908
+ createdAt: new Date(parsed.createdAt),
3909
+ lastUsedAt: new Date(parsed.lastUsedAt),
3910
+ expiresAt: parsed.expiresAt ? new Date(parsed.expiresAt) : void 0
3911
+ };
3912
+ const sessionsMap = yield* Ref.get(sessions);
3913
+ sessionsMap.set(session.id, session);
3914
+ yield* Ref.set(sessions, sessionsMap);
3915
+ yield* Effect.gen(function* () {
3916
+ yield* cookieManager.deserialize(session.cookies);
3917
+ yield* Ref.set(currentSessionId, Option.some(session.id));
3918
+ });
3919
+ } catch (error) {
3920
+ yield* Effect.fail(new Error(`Invalid session data: ${error}`));
3921
+ }
3922
+ })
3923
+ };
3924
+ });
3925
+ const SessionStoreLive = Layer.effect(SessionStore, makeSessionStore);
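// --- Editorial example (not part of the published bundle) ---
// An export/import round-trip. SessionStoreLive needs only CookieManager in context;
// the 24-hour expiry comes from the code above.
import { Effect, Layer } from "effect";
import { CookieManagerLive, SessionStore, SessionStoreLive } from "@jambudipa/spider";

const roundTrip = Effect.gen(function* () {
  const store = yield* SessionStore;
  yield* store.createSession("crawl-1");         // snapshots the current cookie jar
  const exported = yield* store.exportSession(); // JSON: session fields + [type, token] pairs
  yield* store.clearSession();                   // drops the session and empties the jar
  yield* store.importSession(exported);          // restores cookies, becomes current again
  console.log(yield* store.isSessionValid());    // true until expiresAt (24h) passes
});

Effect.runPromise(
  Effect.provide(roundTrip, SessionStoreLive.pipe(Layer.provide(CookieManagerLive)))
);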
3926
+ var TokenType = /* @__PURE__ */ ((TokenType2) => {
3927
+ TokenType2["CSRF"] = "csrf";
3928
+ TokenType2["API"] = "api";
3929
+ TokenType2["AUTH"] = "auth";
3930
+ TokenType2["REFRESH"] = "refresh";
3931
+ return TokenType2;
3932
+ })(TokenType || {});
3933
+ class StateManager extends Context.Tag("StateManager")() {
3934
+ }
3935
+ const makeStateManager = () => Effect.gen(function* () {
3936
+ const tokens = yield* Ref.make(/* @__PURE__ */ new Map());
3937
+ const localStorage = yield* Ref.make(/* @__PURE__ */ new Map());
3938
+ const sessionStorage = yield* Ref.make(/* @__PURE__ */ new Map());
3939
+ return {
3940
+ extractCSRFToken: (html) => Effect.gen(function* () {
3941
+ const $ = cheerio.load(html);
3942
+ const csrfSelectors = [
3943
+ 'meta[name="csrf-token"]',
3944
+ 'meta[name="_csrf"]',
3945
+ 'meta[name="csrf_token"]',
3946
+ 'meta[name="authenticity_token"]',
3947
+ 'input[name="csrf_token"]',
3948
+ 'input[name="_csrf"]',
3949
+ 'input[name="authenticity_token"]',
3950
+ 'input[name="__RequestVerificationToken"]'
3951
+ ];
3952
+ for (const selector of csrfSelectors) {
3953
+ const element = $(selector);
3954
+ if (element.length > 0) {
3955
+ const token = element.attr("content") || element.attr("value");
3956
+ if (token) {
3957
+ return token;
3958
+ }
3959
+ }
3960
+ }
3961
+ const scriptTags = $("script:not([src])");
3962
+ const scriptContent = scriptTags.map((_, el) => $(el).html()).get().join("\n");
3963
+ const patterns = [
3964
+ /window\.csrfToken\s*=\s*["']([^"']+)["']/,
3965
+ /csrf[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
3966
+ /_token["']?\s*[:=]\s*["']([^"']+)["']/,
3967
+ /authenticity_token["']?\s*[:=]\s*["']([^"']+)["']/,
3968
+ /X-CSRF-Token["']?\s*[:=]\s*["']([^"']+)["']/
3969
+ ];
3970
+ for (const pattern of patterns) {
3971
+ const match = scriptContent.match(pattern);
3972
+ if (match && match[1]) {
3973
+ return match[1];
3974
+ }
3975
+ }
3976
+ return yield* Effect.fail(new Error("CSRF token not found in HTML"));
3977
+ }),
3978
+ extractAPIToken: (scripts) => Effect.gen(function* () {
3979
+ const scriptContent = scripts.join("\n");
3980
+ const patterns = [
3981
+ /api[_-]?key["']?\s*[:=]\s*["']([^"']+)["']/i,
3982
+ /api[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
3983
+ /X-Secret-Token["']?\s*[:=]\s*["']([^"']+)["']/,
3984
+ /authorization["']?\s*[:=]\s*["']Bearer\s+([^"']+)["']/i,
3985
+ /access[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
3986
+ /secret[_-]?key["']?\s*[:=]\s*["']([^"']+)["']/i
3987
+ ];
3988
+ for (const pattern of patterns) {
3989
+ const match = scriptContent.match(pattern);
3990
+ if (match && match[1]) {
3991
+ return match[1];
3992
+ }
3993
+ }
3994
+ const windowPattern = /window\[["']([^"']*[Tt]oken[^"']*)["']\]\s*=\s*["']([^"']+)["']/g;
3995
+ let windowMatch;
3996
+ while ((windowMatch = windowPattern.exec(scriptContent)) !== null) {
3997
+ if (windowMatch[2]) {
3998
+ return windowMatch[2];
3999
+ }
4000
+ }
4001
+ return yield* Effect.fail(
4002
+ new Error("API token not found in scripts")
4003
+ );
4004
+ }),
4005
+ storeToken: (type, value, expiry) => Effect.gen(function* () {
4006
+ const token = {
4007
+ type,
4008
+ value,
4009
+ expiry
4010
+ };
4011
+ const tokensMap = yield* Ref.get(tokens);
4012
+ tokensMap.set(type, token);
4013
+ yield* Ref.set(tokens, tokensMap);
4014
+ }),
4015
+ getToken: (type) => Effect.gen(function* () {
4016
+ const tokensMap = yield* Ref.get(tokens);
4017
+ const token = tokensMap.get(type);
4018
+ if (!token) {
4019
+ return yield* Effect.fail(
4020
+ new Error(`Token of type ${type} not found`)
4021
+ );
4022
+ }
4023
+ if (token.expiry && token.expiry < /* @__PURE__ */ new Date()) {
4024
+ return yield* Effect.fail(
4025
+ new Error(`Token of type ${type} has expired`)
4026
+ );
4027
+ }
4028
+ return token.value;
4029
+ }),
4030
+ isTokenValid: (type) => Effect.gen(function* () {
4031
+ const tokensMap = yield* Ref.get(tokens);
4032
+ const token = tokensMap.get(type);
4033
+ if (!token) {
4034
+ return false;
4035
+ }
4036
+ if (token.expiry && token.expiry < /* @__PURE__ */ new Date()) {
4037
+ return false;
4038
+ }
4039
+ return true;
4040
+ }),
4041
+ setLocalStorage: (key, value) => Effect.gen(function* () {
4042
+ const storage = yield* Ref.get(localStorage);
4043
+ storage.set(key, value);
4044
+ yield* Ref.set(localStorage, storage);
4045
+ }),
4046
+ getLocalStorage: (key) => Effect.gen(function* () {
4047
+ const storage = yield* Ref.get(localStorage);
4048
+ const value = storage.get(key);
4049
+ if (!value) {
4050
+ return yield* Effect.fail(
4051
+ new Error(`Local storage key '${key}' not found`)
4052
+ );
4053
+ }
4054
+ return value;
4055
+ }),
4056
+ clearLocalStorage: () => Effect.gen(function* () {
4057
+ yield* Ref.set(localStorage, /* @__PURE__ */ new Map());
4058
+ }),
4059
+ setSessionStorage: (key, value) => Effect.gen(function* () {
4060
+ const storage = yield* Ref.get(sessionStorage);
4061
+ storage.set(key, value);
4062
+ yield* Ref.set(sessionStorage, storage);
4063
+ }),
4064
+ getSessionStorage: (key) => Effect.gen(function* () {
4065
+ const storage = yield* Ref.get(sessionStorage);
4066
+ const value = storage.get(key);
4067
+ if (!value) {
4068
+ return yield* Effect.fail(
4069
+ new Error(`Session storage key '${key}' not found`)
4070
+ );
4071
+ }
4072
+ return value;
4073
+ }),
4074
+ clearSessionStorage: () => Effect.gen(function* () {
4075
+ yield* Ref.set(sessionStorage, /* @__PURE__ */ new Map());
4076
+ }),
4077
+ clearState: () => Effect.gen(function* () {
4078
+ yield* Ref.set(tokens, /* @__PURE__ */ new Map());
4079
+ yield* Ref.set(localStorage, /* @__PURE__ */ new Map());
4080
+ yield* Ref.set(sessionStorage, /* @__PURE__ */ new Map());
4081
+ })
4082
+ };
4083
+ });
4084
+ const StateManagerLive = Layer.effect(StateManager, makeStateManager());
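// --- Editorial example (not part of the published bundle) ---
// CSRF extraction plus the in-memory token store. makeStateManager has no dependencies,
// so the factory runs directly; the HTML snippet matches the first selector above.
import { Effect } from "effect";
import { TokenType, makeStateManager } from "@jambudipa/spider";

const tokenDemo = Effect.gen(function* () {
  const state = yield* makeStateManager();
  const html = '<meta name="csrf-token" content="abc123">';
  const csrf = yield* state.extractCSRFToken(html); // falls back to inline-script regexes
  yield* state.storeToken(TokenType.CSRF, csrf, new Date(Date.now() + 3_600_000));
  console.log(yield* state.getToken(TokenType.CSRF)); // "abc123" until the expiry passes
});

Effect.runPromise(tokenDemo);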
4085
+ class TokenExtractor extends Context.Tag("TokenExtractor")() {
4086
+ }
4087
+ const makeTokenExtractor = Effect.gen(function* () {
4088
+ const stateManager = yield* StateManager;
4089
+ const httpClient = yield* EnhancedHttpClient;
4090
+ const logger = yield* SpiderLogger;
4091
+ const extractFromHTML = (html) => {
4092
+ const tokens = [];
4093
+ const $ = cheerio.load(html);
4094
+ const csrfSelectors = [
4095
+ { selector: 'meta[name="csrf-token"]', attr: "content" },
4096
+ { selector: 'meta[name="_csrf"]', attr: "content" },
4097
+ { selector: 'meta[name="csrf_token"]', attr: "content" },
4098
+ { selector: 'meta[name="authenticity_token"]', attr: "content" },
4099
+ { selector: 'input[name="csrf_token"]', attr: "value" },
4100
+ { selector: 'input[name="_csrf"]', attr: "value" },
4101
+ { selector: 'input[name="authenticity_token"]', attr: "value" },
4102
+ { selector: 'input[name="__RequestVerificationToken"]', attr: "value" }
4103
+ ];
4104
+ for (const { selector, attr } of csrfSelectors) {
4105
+ const element = $(selector);
4106
+ if (element.length > 0) {
4107
+ const value = element.attr(attr);
4108
+ if (value) {
4109
+ tokens.push({
4110
+ type: TokenType.CSRF,
4111
+ value,
4112
+ source: "html",
4113
+ selector
4114
+ });
4115
+ }
4116
+ }
4117
+ }
4118
+ const apiSelectors = [
4119
+ { selector: 'meta[name="api-key"]', attr: "content" },
4120
+ { selector: 'meta[name="api_key"]', attr: "content" },
4121
+ { selector: 'meta[name="api-token"]', attr: "content" },
4122
+ { selector: 'meta[name="access-token"]', attr: "content" }
4123
+ ];
4124
+ for (const { selector, attr } of apiSelectors) {
4125
+ const element = $(selector);
4126
+ if (element.length > 0) {
4127
+ const value = element.attr(attr);
4128
+ if (value) {
4129
+ tokens.push({
4130
+ type: TokenType.API,
4131
+ value,
4132
+ source: "html",
4133
+ selector
4134
+ });
4135
+ }
4136
+ }
4137
+ }
4138
+ return tokens;
4139
+ };
4140
+ const extractFromScripts = (html) => {
4141
+ const tokens = [];
4142
+ const $ = cheerio.load(html);
4143
+ const scriptTags = $("script:not([src])");
4144
+ const scriptContent = scriptTags.map((_, el) => $(el).html()).get().join("\n");
4145
+ const csrfPatterns = [
4146
+ {
4147
+ pattern: /window\.csrfToken\s*=\s*["']([^"']+)["']/,
4148
+ name: "window.csrfToken"
4149
+ },
4150
+ {
4151
+ pattern: /csrf[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
4152
+ name: "csrf_token"
4153
+ },
4154
+ { pattern: /_token["']?\s*[:=]\s*["']([^"']+)["']/, name: "_token" },
4155
+ {
4156
+ pattern: /authenticity_token["']?\s*[:=]\s*["']([^"']+)["']/,
4157
+ name: "authenticity_token"
4158
+ },
4159
+ {
4160
+ pattern: /X-CSRF-Token["']?\s*[:=]\s*["']([^"']+)["']/,
4161
+ name: "X-CSRF-Token"
4162
+ }
4163
+ ];
4164
+ for (const { pattern, name } of csrfPatterns) {
4165
+ const match = scriptContent.match(pattern);
4166
+ if (match && match[1]) {
4167
+ tokens.push({
4168
+ type: TokenType.CSRF,
4169
+ value: match[1],
4170
+ source: "script",
4171
+ pattern: name
4172
+ });
4173
+ }
4174
+ }
4175
+ const apiPatterns = [
4176
+ {
4177
+ pattern: /api[_-]?key["']?\s*[:=]\s*["']([^"']+)["']/i,
4178
+ name: "api_key"
4179
+ },
4180
+ {
4181
+ pattern: /api[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
4182
+ name: "api_token"
4183
+ },
4184
+ {
4185
+ pattern: /X-Secret-Token["']?\s*[:=]\s*["']([^"']+)["']/,
4186
+ name: "X-Secret-Token"
4187
+ },
4188
+ {
4189
+ pattern: /authorization["']?\s*[:=]\s*["']Bearer\s+([^"']+)["']/i,
4190
+ name: "authorization"
4191
+ },
4192
+ {
4193
+ pattern: /access[_-]?token["']?\s*[:=]\s*["']([^"']+)["']/i,
4194
+ name: "access_token"
4195
+ },
4196
+ {
4197
+ pattern: /secret[_-]?key["']?\s*[:=]\s*["']([^"']+)["']/i,
4198
+ name: "secret_key"
4199
+ }
4200
+ ];
4201
+ for (const { pattern, name } of apiPatterns) {
4202
+ const match = scriptContent.match(pattern);
4203
+ if (match && match[1]) {
4204
+ tokens.push({
4205
+ type: TokenType.API,
4206
+ value: match[1],
4207
+ source: "script",
4208
+ pattern: name
4209
+ });
4210
+ }
4211
+ }
4212
+ const windowPattern = /window\[["']([^"']*[Tt]oken[^"']*)["']\]\s*=\s*["']([^"']+)["']/g;
4213
+ let windowMatch;
4214
+ while ((windowMatch = windowPattern.exec(scriptContent)) !== null) {
4215
+ if (windowMatch[2]) {
4216
+ const keyLower = windowMatch[1].toLowerCase();
4217
+ const type = keyLower.includes("csrf") || keyLower.includes("authenticity") ? TokenType.CSRF : TokenType.API;
4218
+ tokens.push({
4219
+ type,
4220
+ value: windowMatch[2],
4221
+ source: "script",
4222
+ pattern: `window['${windowMatch[1]}']`
4223
+ });
4224
+ }
4225
+ }
4226
+ return tokens;
4227
+ };
4228
+ const extractFromHeaders = (headers) => {
4229
+ const tokens = [];
4230
+ const headerPatterns = [
4231
+ { header: "x-csrf-token", type: TokenType.CSRF },
4232
+ { header: "x-auth-token", type: TokenType.AUTH },
4233
+ { header: "x-api-key", type: TokenType.API },
4234
+ { header: "authorization", type: TokenType.AUTH },
4235
+ { header: "x-access-token", type: TokenType.AUTH }
4236
+ ];
4237
+ for (const { header, type } of headerPatterns) {
4238
+ const value = headers[header] || headers[header.toLowerCase()];
4239
+ if (value) {
4240
+ tokens.push({
4241
+ type,
4242
+ value,
4243
+ source: "header",
4244
+ pattern: header
4245
+ });
4246
+ }
4247
+ }
4248
+ return tokens;
4249
+ };
4250
+ const service = {
4251
+ extractTokensFromResponse: (response) => Effect.gen(function* () {
4252
+ const tokens = [];
4253
+ tokens.push(...extractFromHTML(response.body));
4254
+ tokens.push(...extractFromScripts(response.body));
4255
+ tokens.push(...extractFromHeaders(response.headers));
4256
+ const uniqueTokens = /* @__PURE__ */ new Map();
4257
+ for (const token of tokens) {
4258
+ const key = `${token.type}:${token.value}`;
4259
+ if (!uniqueTokens.has(key)) {
4260
+ uniqueTokens.set(key, token);
4261
+ yield* stateManager.storeToken(
4262
+ token.type,
4263
+ token.value,
4264
+ new Date(Date.now() + 36e5)
4265
+ // 1 hour expiry
4266
+ );
4267
+ yield* logger.logEdgeCase(
4268
+ new URL(response.url).hostname,
4269
+ "token_found",
4270
+ {
4271
+ type: token.type,
4272
+ source: token.source,
4273
+ pattern: token.pattern || token.selector
4274
+ }
4275
+ );
4276
+ }
4277
+ }
4278
+ return Array.from(uniqueTokens.values());
4279
+ }),
4280
+ extractCSRFFromResponse: (response) => Effect.gen(function* () {
4281
+ const tokens = yield* Effect.succeed([
4282
+ ...extractFromHTML(response.body),
4283
+ ...extractFromScripts(response.body)
4284
+ ]);
4285
+ const csrfToken = tokens.find((t) => t.type === TokenType.CSRF);
4286
+ if (csrfToken) {
4287
+ yield* stateManager.storeToken(
4288
+ TokenType.CSRF,
4289
+ csrfToken.value,
4290
+ new Date(Date.now() + 36e5)
4291
+ );
4292
+ return csrfToken.value;
4293
+ }
4294
+ return null;
4295
+ }),
4296
+ extractAPIFromResponse: (response) => Effect.gen(function* () {
4297
+ const tokens = yield* Effect.succeed([
4298
+ ...extractFromScripts(response.body),
4299
+ ...extractFromHeaders(response.headers)
4300
+ ]);
4301
+ const apiToken = tokens.find((t) => t.type === TokenType.API);
4302
+ if (apiToken) {
4303
+ yield* stateManager.storeToken(
4304
+ TokenType.API,
4305
+ apiToken.value,
4306
+ new Date(Date.now() + 36e5)
4307
+ );
4308
+ return apiToken.value;
4309
+ }
4310
+ return null;
4311
+ }),
4312
+ authenticatedRequest: (url, options = {}) => Effect.gen(function* () {
4313
+ const headers = { ...options.customHeaders };
4314
+ if (options.requireCSRF) {
4315
+ const isValid = yield* stateManager.isTokenValid(TokenType.CSRF);
4316
+ if (!isValid) {
4317
+ const baseUrl = new URL(url).origin;
4318
+ const baseResponse = yield* httpClient.get(baseUrl);
4319
+ yield* Effect.succeed(extractFromHTML(baseResponse.body)).pipe(
4320
+ Effect.flatMap((tokens) => {
4321
+ const csrfToken2 = tokens.find((t) => t.type === TokenType.CSRF);
4322
+ if (csrfToken2) {
4323
+ return stateManager.storeToken(
4324
+ TokenType.CSRF,
4325
+ csrfToken2.value,
4326
+ new Date(Date.now() + 36e5)
4327
+ );
4328
+ }
4329
+ return Effect.void;
4330
+ })
4331
+ );
4332
+ }
4333
+ const csrfToken = yield* stateManager.getToken(TokenType.CSRF).pipe(Effect.catchAll(() => Effect.succeed(null)));
4334
+ if (csrfToken) {
4335
+ headers["X-CSRF-Token"] = csrfToken;
4336
+ headers["X-Requested-With"] = "XMLHttpRequest";
4337
+ }
4338
+ }
4339
+ if (options.requireAPI) {
4340
+ const isValid = yield* stateManager.isTokenValid(TokenType.API);
4341
+ if (!isValid) {
4342
+ return yield* Effect.fail(
4343
+ new Error("API token not available or expired")
4344
+ );
4345
+ }
4346
+ const apiToken = yield* stateManager.getToken(TokenType.API);
4347
+ headers["Authorization"] = `Bearer ${apiToken}`;
4348
+ headers["X-API-Key"] = apiToken;
4349
+ }
4350
+ const response = yield* httpClient.request(url, { headers });
4351
+ if (options.requireCSRF) {
4352
+ const currentCSRF = yield* stateManager.getToken(TokenType.CSRF).pipe(Effect.catchAll(() => Effect.succeed("")));
4353
+ if (currentCSRF) {
4354
+ yield* service.detectTokenRotation(
4355
+ currentCSRF,
4356
+ response,
4357
+ TokenType.CSRF
4358
+ );
4359
+ }
4360
+ }
4361
+ if (options.requireAPI) {
4362
+ const currentAPI = yield* stateManager.getToken(TokenType.API).pipe(Effect.catchAll(() => Effect.succeed("")));
4363
+ if (currentAPI) {
4364
+ yield* service.detectTokenRotation(
4365
+ currentAPI,
4366
+ response,
4367
+ TokenType.API
4368
+ );
4369
+ }
4370
+ }
4371
+ return response;
4372
+ }),
4373
+ detectTokenRotation: (oldToken, response, type) => Effect.gen(function* () {
4374
+ const tokens = yield* Effect.succeed([
4375
+ ...extractFromHTML(response.body),
4376
+ ...extractFromScripts(response.body),
4377
+ ...extractFromHeaders(response.headers)
4378
+ ]);
4379
+ const newToken = tokens.find(
4380
+ (t) => t.type === type && t.value !== oldToken
4381
+ );
4382
+ if (newToken) {
4383
+ yield* stateManager.storeToken(
4384
+ type,
4385
+ newToken.value,
4386
+ new Date(Date.now() + 36e5)
4387
+ );
4388
+ yield* logger.logEdgeCase(
4389
+ new URL(response.url).hostname,
4390
+ "token_rotated",
4391
+ {
4392
+ type,
4393
+ oldToken: oldToken.substring(0, 8) + "...",
4394
+ newToken: newToken.value.substring(0, 8) + "..."
4395
+ }
4396
+ );
4397
+ return true;
4398
+ }
4399
+ return false;
4400
+ }),
4401
+ refreshToken: (type, refreshUrl) => Effect.gen(function* () {
4402
+ if (!refreshUrl) {
4403
+ return yield* Effect.fail(new Error("No refresh URL provided"));
4404
+ }
4405
+ const response = yield* httpClient.get(refreshUrl);
4406
+ const tokens = yield* Effect.succeed([
4407
+ ...extractFromHTML(response.body),
4408
+ ...extractFromScripts(response.body),
4409
+ ...extractFromHeaders(response.headers)
4410
+ ]);
4411
+ const newToken = tokens.find((t) => t.type === type);
4412
+ if (!newToken) {
4413
+ return yield* Effect.fail(
4414
+ new Error(`Failed to refresh ${type} token`)
4415
+ );
4416
+ }
4417
+ yield* stateManager.storeToken(
4418
+ type,
4419
+ newToken.value,
4420
+ new Date(Date.now() + 36e5)
4421
+ );
4422
+ return newToken.value;
4423
+ })
4424
+ };
4425
+ return service;
4426
+ });
4427
+ const TokenExtractorLive = Layer.effect(
4428
+ TokenExtractor,
4429
+ makeTokenExtractor
4430
+ );
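// --- Editorial example (not part of the published bundle) ---
// Scanning a single response for tokens. The response object is hand-built here in the
// shape EnhancedHttpClient returns (url/status/statusText/headers/body/cookies); the
// layer wiring mirrors the dependencies yielded by makeTokenExtractor. Token values are fake.
import { Effect, Layer } from "effect";
import {
  CookieManagerLive,
  EnhancedHttpClientLive,
  SpiderLoggerLive,
  StateManagerLive,
  TokenExtractor,
  TokenExtractorLive
} from "@jambudipa/spider";

const HttpLive = EnhancedHttpClientLive.pipe(
  Layer.provide(Layer.mergeAll(CookieManagerLive, SpiderLoggerLive))
);
const ExtractorLive = TokenExtractorLive.pipe(
  Layer.provide(Layer.mergeAll(StateManagerLive, HttpLive, SpiderLoggerLive))
);

const scan = Effect.gen(function* () {
  const extractor = yield* TokenExtractor;
  const tokens = yield* extractor.extractTokensFromResponse({
    url: "https://example.com/login",
    status: 200,
    statusText: "OK",
    headers: { "x-api-key": "k-fake-999" },
    body: '<script>window.csrfToken = "abc123";</script>',
    cookies: []
  });
  console.log(tokens.map((t) => `${t.type} from ${t.source}`)); // ["csrf from script", "api from header"]
});

Effect.runPromise(Effect.provide(scan, ExtractorLive));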
4431
+ class WebScrapingEngine extends Context.Tag("WebScrapingEngine")() {
4432
+ }
4433
+ const makeWebScrapingEngine = Effect.gen(function* () {
4434
+ const httpClient = yield* EnhancedHttpClient;
4435
+ const cookieManager = yield* CookieManager;
4436
+ const sessionStore = yield* SessionStore;
4437
+ const tokenExtractor = yield* TokenExtractor;
4438
+ const stateManager = yield* StateManager;
4439
+ const logger = yield* SpiderLogger;
4440
+ yield* ScraperService;
4441
+ const service = {
4442
+ login: (credentials) => Effect.gen(function* () {
4443
+ const domain = new URL(credentials.loginUrl).hostname;
4444
+ yield* logger.logEdgeCase(domain, "login_start", {
4445
+ url: credentials.loginUrl,
4446
+ username: credentials.username
4447
+ });
4448
+ const loginPageResponse = yield* httpClient.get(credentials.loginUrl);
4449
+ const csrfToken = yield* tokenExtractor.extractCSRFFromResponse(loginPageResponse);
4450
+ const formData = {
4451
+ [credentials.usernameField || "username"]: credentials.username,
4452
+ [credentials.passwordField || "password"]: credentials.password,
4453
+ ...credentials.additionalFields
4454
+ };
4455
+ if (csrfToken) {
4456
+ const csrfFieldNames = [
4457
+ "csrf_token",
4458
+ "_csrf",
4459
+ "authenticity_token",
4460
+ "__RequestVerificationToken"
4461
+ ];
4462
+ const csrfFieldName = csrfFieldNames.find(
4463
+ (name) => loginPageResponse.body.includes(`name="${name}"`)
4464
+ ) || "csrf_token";
4465
+ formData[csrfFieldName] = csrfToken;
4466
+ yield* logger.logEdgeCase(domain, "csrf_token_added", {
4467
+ field: csrfFieldName
4468
+ });
4469
+ }
4470
+ const loginResponse = yield* httpClient.submitForm(
4471
+ credentials.loginUrl,
4472
+ formData
4473
+ );
4474
+ const isAuthenticated = loginResponse.status === 200 || loginResponse.status === 302 || loginResponse.headers["location"] !== void 0;
4475
+ if (!isAuthenticated) {
4476
+ return yield* Effect.fail(
4477
+ new Error(`Login failed with status ${loginResponse.status}`)
4478
+ );
4479
+ }
4480
+ yield* tokenExtractor.extractTokensFromResponse(loginResponse);
4481
+ const session = yield* sessionStore.createSession();
4482
+ yield* sessionStore.updateSessionData({
4483
+ authenticated: true,
4484
+ username: credentials.username,
4485
+ loginTime: /* @__PURE__ */ new Date()
4486
+ });
4487
+ const tokens = /* @__PURE__ */ new Map();
4488
+ for (const type of [TokenType.CSRF, TokenType.API, TokenType.AUTH]) {
4489
+ const token = yield* stateManager.getToken(type).pipe(Effect.catchAll(() => Effect.succeed(null)));
4490
+ if (token) {
4491
+ tokens.set(type, token);
4492
+ }
4493
+ }
4494
+ yield* logger.logEdgeCase(domain, "login_success", {
4495
+ sessionId: session.id,
4496
+ tokensFound: Array.from(tokens.keys())
4497
+ });
4498
+ return {
4499
+ id: session.id,
4500
+ authenticated: true,
4501
+ tokens,
4502
+ startTime: /* @__PURE__ */ new Date()
4503
+ };
4504
+ }),
4505
+ fetchAuthenticated: (url) => Effect.gen(function* () {
4506
+ const isValid = yield* sessionStore.isSessionValid();
4507
+ if (!isValid) {
4508
+ return yield* Effect.fail(
4509
+ new Error("No valid session. Please login first.")
4510
+ );
4511
+ }
4512
+ return yield* httpClient.get(url);
4513
+ }),
4514
+ submitFormWithCSRF: (url, formData, csrfUrl) => Effect.gen(function* () {
4515
+ const domain = new URL(url).hostname;
4516
+ let csrfToken = null;
4517
+ const isValid = yield* stateManager.isTokenValid(TokenType.CSRF);
4518
+ if (!isValid && csrfUrl) {
4519
+ const csrfResponse = yield* httpClient.get(csrfUrl);
4520
+ csrfToken = yield* tokenExtractor.extractCSRFFromResponse(csrfResponse);
4521
+ } else if (isValid) {
4522
+ csrfToken = yield* stateManager.getToken(TokenType.CSRF).pipe(Effect.catchAll(() => Effect.succeed(null)));
4523
+ }
4524
+ if (!csrfToken && !csrfUrl) {
4525
+ const formPageResponse = yield* httpClient.get(url);
4526
+ csrfToken = yield* tokenExtractor.extractCSRFFromResponse(formPageResponse);
4527
+ }
4528
+ const enhancedFormData = { ...formData };
4529
+ if (csrfToken) {
4530
+ const csrfFieldNames = [
4531
+ "csrf_token",
4532
+ "_csrf",
4533
+ "authenticity_token",
4534
+ "__RequestVerificationToken"
4535
+ ];
4536
+ const csrfFieldName = csrfFieldNames[0];
4537
+ enhancedFormData[csrfFieldName] = csrfToken;
4538
+ yield* logger.logEdgeCase(domain, "csrf_protected_form", {
4539
+ url,
4540
+ csrfField: csrfFieldName
4541
+ });
4542
+ }
4543
+ const response = yield* httpClient.submitForm(url, enhancedFormData);
4544
+ if (csrfToken) {
4545
+ yield* tokenExtractor.detectTokenRotation(
4546
+ csrfToken,
4547
+ response,
4548
+ TokenType.CSRF
4549
+ );
4550
+ }
4551
+ return response;
4552
+ }),
4553
+ makeAPIRequest: (url, method = "GET", data) => Effect.gen(function* () {
4554
+ const response = yield* tokenExtractor.authenticatedRequest(url, {
4555
+ requireAPI: true,
4556
+ customHeaders: {
4557
+ "Content-Type": "application/json",
4558
+ Accept: "application/json"
4559
+ }
4560
+ }).pipe(
4561
+ Effect.catchAll((error) => {
4562
+ if (method === "GET") {
4563
+ return httpClient.get(url);
4564
+ } else {
4565
+ return httpClient.post(url, data);
4566
+ }
4567
+ })
4568
+ );
4569
+ return response;
4570
+ }),
4571
+ createSession: (id) => Effect.gen(function* () {
4572
+ const session = yield* sessionStore.createSession(id);
4573
+ const tokens = /* @__PURE__ */ new Map();
4574
+ for (const type of [TokenType.CSRF, TokenType.API, TokenType.AUTH]) {
4575
+ const token = yield* stateManager.getToken(type).pipe(Effect.catchAll(() => Effect.succeed(null)));
4576
+ if (token) {
4577
+ tokens.set(type, token);
4578
+ }
4579
+ }
4580
+ return {
4581
+ id: session.id,
4582
+ authenticated: false,
4583
+ tokens,
4584
+ startTime: session.createdAt
4585
+ };
4586
+ }),
4587
+ loadSession: (id) => Effect.gen(function* () {
4588
+ yield* sessionStore.loadSession(id);
4589
+ const session = yield* sessionStore.getCurrentSession();
4590
+ if (session._tag === "None") {
4591
+ return yield* Effect.fail(new Error("Failed to load session"));
4592
+ }
4593
+ const tokens = /* @__PURE__ */ new Map();
4594
+ for (const type of [TokenType.CSRF, TokenType.API, TokenType.AUTH]) {
4595
+ const token = yield* stateManager.getToken(type).pipe(Effect.catchAll(() => Effect.succeed(null)));
4596
+ if (token) {
4597
+ tokens.set(type, token);
4598
+ }
4599
+ }
4600
+ return {
4601
+ id: session.value.id,
4602
+ authenticated: session.value.userData?.authenticated || false,
4603
+ tokens,
4604
+ startTime: session.value.createdAt
4605
+ };
4606
+ }),
4607
+ exportSession: () => sessionStore.exportSession(),
4608
+ importSession: (data) => sessionStore.importSession(data),
4609
+ clearAll: () => Effect.gen(function* () {
4610
+ yield* sessionStore.clearSession();
4611
+ yield* cookieManager.clearCookies();
4612
+ yield* stateManager.clearState();
4613
+ })
4614
+ };
4615
+ return service;
4616
+ });
4617
+ const WebScrapingEngineLive = Layer.effect(
4618
+ WebScrapingEngine,
4619
+ makeWebScrapingEngine
4620
+ );
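// --- Editorial example (not part of the published bundle) ---
// The login flow, call shape only. Layer wiring is elided because WebScrapingEngineLive
// also requires ScraperService, whose live layer is not in this file's export list;
// compose the Live layers above with a ScraperService implementation before running.
// Credentials and URLs are placeholders.
import { Effect } from "effect";
import { WebScrapingEngine } from "@jambudipa/spider";

const crawlBehindLogin = Effect.gen(function* () {
  const engine = yield* WebScrapingEngine;
  const session = yield* engine.login({
    loginUrl: "https://example.com/login",
    username: "user@example.com",
    password: "hunter2",
    usernameField: "email" // defaults to "username" when omitted
  });
  console.log(session.id, [...session.tokens.keys()]); // tokens captured during login
  const page = yield* engine.fetchAuthenticated("https://example.com/account");
  console.log(page.status);
});

// Effect.runPromise(Effect.provide(crawlBehindLogin, /* composed layers, see note above */));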
4621
+ export {
4622
+ ConfigurationError,
4623
+ CookieManager,
4624
+ CookieManagerLive,
4625
+ DEFAULT_HYBRID_CONFIG,
4626
+ DeltaPersistence,
4627
+ EnhancedHttpClient,
4628
+ EnhancedHttpClientLive,
4629
+ FileStorageBackend,
4630
+ FileSystemError,
4631
+ FullStatePersistence,
4632
+ HybridPersistence,
4633
+ LinkExtractionError,
4634
+ LinkExtractorService,
4635
+ LinkExtractorServiceLayer,
4636
+ LoggingMiddleware,
4637
+ MiddlewareError,
4638
+ MiddlewareManager,
4639
+ NetworkError,
4640
+ PageDataSchema,
4641
+ PersistenceError$1 as PersistenceError,
4642
+ PriorityRequest,
4643
+ RateLimitMiddleware,
4644
+ ResponseError,
4645
+ ResumabilityConfigs,
4646
+ PersistenceError2 as ResumabilityError,
4647
+ ResumabilityService,
4648
+ RobotsService,
4649
+ RobotsTxtError,
4650
+ ScraperService,
4651
+ SessionStore,
4652
+ SessionStoreLive,
4653
+ SpiderConfig,
4654
+ SpiderLoggerLive,
4655
+ SpiderLogger as SpiderLoggerTag,
4656
+ SpiderSchedulerService,
4657
+ SpiderService,
4658
+ SpiderState,
4659
+ SpiderStateKey,
4660
+ StateDelta,
4661
+ StateManager,
4662
+ StateManagerLive,
4663
+ StatsMiddleware,
4664
+ TokenExtractor,
4665
+ TokenExtractorLive,
4666
+ TokenType,
4667
+ UrlDeduplicatorService,
4668
+ UserAgentMiddleware,
4669
+ WebScrapingEngine,
4670
+ WebScrapingEngineLive,
4671
+ createStateOperation,
4672
+ makeCookieManager,
4673
+ makeEnhancedHttpClient,
4674
+ makeSessionStore,
4675
+ makeSpiderConfig,
4676
+ makeSpiderLogger,
4677
+ makeStateManager,
4678
+ makeTokenExtractor,
4679
+ makeWebScrapingEngine
4680
+ };
4681
+ //# sourceMappingURL=index.js.map