@jambudipa/spider 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/README.md +117 -69
  2. package/dist/index.js +835 -114
  3. package/dist/index.js.map +1 -1
  4. package/package.json +12 -7
  5. package/dist/index.d.ts +0 -33
  6. package/dist/index.d.ts.map +0 -1
  7. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +0 -57
  8. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +0 -1
  9. package/dist/lib/Config/SpiderConfig.service.d.ts +0 -256
  10. package/dist/lib/Config/SpiderConfig.service.d.ts.map +0 -1
  11. package/dist/lib/HttpClient/CookieManager.d.ts +0 -44
  12. package/dist/lib/HttpClient/CookieManager.d.ts.map +0 -1
  13. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +0 -88
  14. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +0 -1
  15. package/dist/lib/HttpClient/SessionStore.d.ts +0 -82
  16. package/dist/lib/HttpClient/SessionStore.d.ts.map +0 -1
  17. package/dist/lib/HttpClient/TokenExtractor.d.ts +0 -58
  18. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +0 -1
  19. package/dist/lib/HttpClient/index.d.ts +0 -8
  20. package/dist/lib/HttpClient/index.d.ts.map +0 -1
  21. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +0 -166
  22. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +0 -1
  23. package/dist/lib/LinkExtractor/index.d.ts +0 -37
  24. package/dist/lib/LinkExtractor/index.d.ts.map +0 -1
  25. package/dist/lib/Logging/FetchLogger.d.ts +0 -8
  26. package/dist/lib/Logging/FetchLogger.d.ts.map +0 -1
  27. package/dist/lib/Logging/SpiderLogger.service.d.ts +0 -34
  28. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +0 -1
  29. package/dist/lib/Middleware/SpiderMiddleware.d.ts +0 -276
  30. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +0 -1
  31. package/dist/lib/PageData/PageData.d.ts +0 -28
  32. package/dist/lib/PageData/PageData.d.ts.map +0 -1
  33. package/dist/lib/Resumability/Resumability.service.d.ts +0 -176
  34. package/dist/lib/Resumability/Resumability.service.d.ts.map +0 -1
  35. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +0 -47
  36. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +0 -1
  37. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +0 -95
  38. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +0 -1
  39. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +0 -92
  40. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +0 -1
  41. package/dist/lib/Resumability/index.d.ts +0 -51
  42. package/dist/lib/Resumability/index.d.ts.map +0 -1
  43. package/dist/lib/Resumability/strategies.d.ts +0 -76
  44. package/dist/lib/Resumability/strategies.d.ts.map +0 -1
  45. package/dist/lib/Resumability/types.d.ts +0 -201
  46. package/dist/lib/Resumability/types.d.ts.map +0 -1
  47. package/dist/lib/Robots/Robots.service.d.ts +0 -78
  48. package/dist/lib/Robots/Robots.service.d.ts.map +0 -1
  49. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +0 -211
  50. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +0 -1
  51. package/dist/lib/Scraper/Scraper.service.d.ts +0 -123
  52. package/dist/lib/Scraper/Scraper.service.d.ts.map +0 -1
  53. package/dist/lib/Spider/Spider.service.d.ts +0 -194
  54. package/dist/lib/Spider/Spider.service.d.ts.map +0 -1
  55. package/dist/lib/StateManager/StateManager.service.d.ts +0 -68
  56. package/dist/lib/StateManager/StateManager.service.d.ts.map +0 -1
  57. package/dist/lib/StateManager/index.d.ts +0 -5
  58. package/dist/lib/StateManager/index.d.ts.map +0 -1
  59. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +0 -58
  60. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +0 -1
  61. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +0 -77
  62. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +0 -1
  63. package/dist/lib/WebScrapingEngine/index.d.ts +0 -5
  64. package/dist/lib/WebScrapingEngine/index.d.ts.map +0 -1
  65. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +0 -39
  66. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +0 -1
  67. package/dist/lib/api-facades.d.ts +0 -313
  68. package/dist/lib/api-facades.d.ts.map +0 -1
  69. package/dist/lib/errors.d.ts +0 -99
  70. package/dist/lib/errors.d.ts.map +0 -1
package/dist/index.js CHANGED
@@ -1,11 +1,11 @@
1
- import { Effect, Layer, MutableHashSet, Schema, Data, Context, Console, MutableHashMap, Option, Queue, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Ref } from "effect";
1
+ import { Effect, Layer, MutableHashSet, Schema, Data, Context, Console, MutableHashMap, Option, Queue, Ref, pipe, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Duration } from "effect";
2
2
  import * as cheerio from "cheerio";
3
3
  import * as fs from "fs";
4
4
  import * as path from "path";
5
5
  import * as fs$1 from "fs/promises";
6
6
  import { CookieJar } from "tough-cookie";
7
7
  class SpiderConfig extends Effect.Service()(
8
- "@jambudipa.io/SpiderConfig",
8
+ "@jambudipa/spiderConfig",
9
9
  {
10
10
  effect: Effect.sync(() => makeSpiderConfig({}))
11
11
  }
@@ -337,7 +337,7 @@ class UrlDeduplicatorService extends Effect.Service()(
337
337
  const shouldNormalize = yield* config.shouldNormalizeUrlsForDeduplication();
338
338
  const seenUrls = MutableHashSet.empty();
339
339
  const mutex = yield* Effect.makeSemaphore(1);
340
- const normalizeUrl = (url) => {
340
+ const normalizeUrl2 = (url) => {
341
341
  if (!shouldNormalize) {
342
342
  return url;
343
343
  }
@@ -370,7 +370,7 @@ class UrlDeduplicatorService extends Effect.Service()(
370
370
  return {
371
371
  tryAdd: (url) => mutex.withPermits(1)(
372
372
  Effect.sync(() => {
373
- const normalizedUrl = normalizeUrl(url);
373
+ const normalizedUrl = normalizeUrl2(url);
374
374
  if (MutableHashSet.has(seenUrls, normalizedUrl)) {
375
375
  return false;
376
376
  }
@@ -380,7 +380,7 @@ class UrlDeduplicatorService extends Effect.Service()(
380
380
  ),
381
381
  contains: (url) => mutex.withPermits(1)(
382
382
  Effect.sync(() => {
383
- const normalizedUrl = normalizeUrl(url);
383
+ const normalizedUrl = normalizeUrl2(url);
384
384
  return MutableHashSet.has(seenUrls, normalizedUrl);
385
385
  })
386
386
  ),
@@ -439,7 +439,7 @@ const PageDataSchema = Schema.Struct({
439
439
  Schema.Record({ key: Schema.String, value: Schema.Unknown })
440
440
  )
441
441
  });
442
- class NetworkError extends Data.TaggedError("NetworkError") {
442
+ let NetworkError$1 = class NetworkError extends Data.TaggedError("NetworkError") {
443
443
  static fromCause(url, cause) {
444
444
  return new NetworkError({
445
445
  url,
@@ -447,7 +447,7 @@ class NetworkError extends Data.TaggedError("NetworkError") {
447
447
  message: `Failed to fetch ${url}: ${cause}`
448
448
  });
449
449
  }
450
- }
450
+ };
451
451
  class ResponseError extends Data.TaggedError("ResponseError") {
452
452
  static fromCause(url, cause) {
453
453
  return new ResponseError({
@@ -468,7 +468,7 @@ class RobotsTxtError extends Data.TaggedError("RobotsTxtError") {
468
468
  }
469
469
  class ConfigurationError extends Data.TaggedError("ConfigurationError") {
470
470
  }
471
- class MiddlewareError extends Data.TaggedError("MiddlewareError") {
471
+ let MiddlewareError$1 = class MiddlewareError extends Data.TaggedError("MiddlewareError") {
472
472
  static transform(middlewareName, cause) {
473
473
  return new MiddlewareError({
474
474
  phase: "transform",
@@ -485,8 +485,8 @@ class MiddlewareError extends Data.TaggedError("MiddlewareError") {
485
485
  message: `Middleware '${middlewareName}' failed during error handling: ${cause}`
486
486
  });
487
487
  }
488
- }
489
- class FileSystemError extends Data.TaggedError("FileSystemError") {
488
+ };
489
+ let FileSystemError$1 = class FileSystemError extends Data.TaggedError("FileSystemError") {
490
490
  static write(path2, cause) {
491
491
  return new FileSystemError({
492
492
  operation: "write",
@@ -503,7 +503,7 @@ class FileSystemError extends Data.TaggedError("FileSystemError") {
503
503
  message: `Failed to create directory ${path2}: ${cause}`
504
504
  });
505
505
  }
506
- }
506
+ };
507
507
  let PersistenceError$1 = class PersistenceError extends Data.TaggedError("PersistenceError") {
508
508
  static save(cause, key) {
509
509
  return new PersistenceError({
@@ -530,6 +530,61 @@ let PersistenceError$1 = class PersistenceError extends Data.TaggedError("Persis
530
530
  });
531
531
  }
532
532
  };
533
+ class ContentTypeError extends Data.TaggedError("ContentTypeError") {
534
+ static create(url, contentType, expectedTypes) {
535
+ return new ContentTypeError({
536
+ url,
537
+ contentType,
538
+ expectedTypes,
539
+ message: `Invalid content type '${contentType}' for ${url}. Expected one of: ${expectedTypes.join(", ")}`
540
+ });
541
+ }
542
+ }
543
+ class RequestAbortError extends Data.TaggedError("RequestAbortError") {
544
+ static timeout(url, duration) {
545
+ return new RequestAbortError({
546
+ url,
547
+ duration,
548
+ reason: "timeout",
549
+ message: `Request to ${url} aborted after ${duration}ms due to timeout`
550
+ });
551
+ }
552
+ static cancelled(url, duration) {
553
+ return new RequestAbortError({
554
+ url,
555
+ duration,
556
+ reason: "cancelled",
557
+ message: `Request to ${url} cancelled after ${duration}ms`
558
+ });
559
+ }
560
+ }
561
+ class AdapterNotInitialisedError extends Data.TaggedError("AdapterNotInitialisedError") {
562
+ static create(adapterId, operation) {
563
+ return new AdapterNotInitialisedError({
564
+ adapterId,
565
+ operation,
566
+ message: `Adapter '${adapterId}' not initialised. Cannot perform operation: ${operation}`
567
+ });
568
+ }
569
+ }
570
+ class BrowserCleanupError extends Data.TaggedError("BrowserCleanupError") {
571
+ static context(id, cause) {
572
+ return new BrowserCleanupError({
573
+ resourceType: "context",
574
+ resourceId: id,
575
+ cause,
576
+ message: `Failed to close browser context '${id}': ${cause}`
577
+ });
578
+ }
579
+ static browser(id, cause) {
580
+ return new BrowserCleanupError({
581
+ resourceType: "browser",
582
+ resourceId: id,
583
+ cause,
584
+ message: `Failed to close browser '${id}': ${cause}`
585
+ });
586
+ }
587
+ }
533
588
  const SpiderLogger = Context.GenericTag("SpiderLogger");
534
589
  const makeSpiderLogger = (logDir = "./spider-logs") => {
535
590
  if (!fs.existsSync(logDir)) {
@@ -852,20 +907,22 @@ class ScraperService extends Effect.Service()(
852
907
  clearTimeout(timeoutId);
853
908
  const contentType = resp.headers.get("content-type") || "";
854
909
  if (!contentType.includes("text/html") && !contentType.includes("application/xhtml") && !contentType.includes("text/") && contentType !== "") {
855
- throw new Error(`Skipping non-HTML content: ${contentType}`);
910
+ throw ContentTypeError.create(
911
+ url,
912
+ contentType,
913
+ ["text/html", "application/xhtml+xml", "text/*"]
914
+ );
856
915
  }
857
916
  return resp;
858
917
  } catch (error) {
859
918
  clearTimeout(timeoutId);
860
919
  if (error instanceof Error && error.name === "AbortError") {
861
- throw new Error(
862
- `Request aborted after ${Date.now() - startMs}ms`
863
- );
920
+ throw RequestAbortError.timeout(url, Date.now() - startMs);
864
921
  }
865
922
  throw error;
866
923
  }
867
924
  },
868
- catch: (error) => NetworkError.fromCause(url, error)
925
+ catch: (error) => NetworkError$1.fromCause(url, error)
869
926
  });
870
927
  const textController = new AbortController();
871
928
  const textTimeoutMs = 1e4;
@@ -885,7 +942,7 @@ class ScraperService extends Effect.Service()(
885
942
  try: async () => {
886
943
  try {
887
944
  const reader = response.body?.getReader();
888
- if (!reader) throw new Error("No response body");
945
+ if (!reader) throw ResponseError.fromCause(url, "No response body");
889
946
  const decoder = new TextDecoder();
890
947
  let html2 = "";
891
948
  while (true) {
@@ -894,7 +951,7 @@ class ScraperService extends Effect.Service()(
894
951
  html2 += decoder.decode(value, { stream: true });
895
952
  if (textController.signal.aborted) {
896
953
  reader.cancel();
897
- throw new Error("Response parsing aborted");
954
+ throw RequestAbortError.cancelled(url, Date.now() - startMs);
898
955
  }
899
956
  }
900
957
  clearTimeout(textTimeoutId);
@@ -1189,7 +1246,7 @@ class SpiderState extends Schema.Class("SpiderState")({
1189
1246
  }) {
1190
1247
  }
1191
1248
  class SpiderSchedulerService extends Effect.Service()(
1192
- "@jambudipa.io/SpiderSchedulerService",
1249
+ "@jambudipa/spiderSchedulerService",
1193
1250
  {
1194
1251
  effect: Effect.gen(function* () {
1195
1252
  const config = yield* SpiderConfig;
@@ -1200,7 +1257,7 @@ class SpiderSchedulerService extends Effect.Service()(
1200
1257
  let totalProcessed = 0;
1201
1258
  let persistenceLayer = null;
1202
1259
  let currentStateKey = null;
1203
- const normalizeUrl = (url) => {
1260
+ const normalizeUrl2 = (url) => {
1204
1261
  if (!shouldNormalizeUrls) {
1205
1262
  return url;
1206
1263
  }
@@ -1231,7 +1288,7 @@ class SpiderSchedulerService extends Effect.Service()(
1231
1288
  }
1232
1289
  };
1233
1290
  const generateFingerprint = (request) => {
1234
- const normalizedUrl = normalizeUrl(request.url);
1291
+ const normalizedUrl = normalizeUrl2(request.url);
1235
1292
  return `${normalizedUrl}:${request.depth}`;
1236
1293
  };
1237
1294
  const createPriorityRequest = (request, priority) => new PriorityRequest({
@@ -1364,8 +1421,270 @@ const SpiderScheduler_service = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Ob
1364
1421
  SpiderState,
1365
1422
  SpiderStateKey
1366
1423
  }, Symbol.toStringTag, { value: "Module" }));
1424
+ class SpiderError extends Data.TaggedError("SpiderError") {
1425
+ get message() {
1426
+ return `Spider operation '${this.operation}' failed${this.details ? `: ${JSON.stringify(this.details)}` : ""}`;
1427
+ }
1428
+ }
1429
+ class NetworkError2 extends Data.TaggedError("NetworkError") {
1430
+ get message() {
1431
+ return `Network request to ${this.url} failed${this.statusCode ? ` with status ${this.statusCode}` : ""}`;
1432
+ }
1433
+ static fromResponse(url, response) {
1434
+ return new NetworkError2({
1435
+ url,
1436
+ statusCode: response.status,
1437
+ method: "GET"
1438
+ });
1439
+ }
1440
+ static fromCause(url, cause) {
1441
+ return new NetworkError2({ url, cause });
1442
+ }
1443
+ }
1444
+ class TimeoutError extends Data.TaggedError("TimeoutError") {
1445
+ get message() {
1446
+ return `Operation '${this.operation}' timed out after ${this.timeoutMs}ms for ${this.url}`;
1447
+ }
1448
+ }
1449
+ class ParseError extends Data.TaggedError("ParseError") {
1450
+ get message() {
1451
+ return `Failed to parse ${this.expected}${this.input ? ` from input: ${this.input.substring(0, 100)}...` : ""}`;
1452
+ }
1453
+ static json(input, cause) {
1454
+ return new ParseError({
1455
+ input,
1456
+ expected: "JSON",
1457
+ cause
1458
+ });
1459
+ }
1460
+ static html(input, cause) {
1461
+ return new ParseError({
1462
+ input,
1463
+ expected: "HTML",
1464
+ cause
1465
+ });
1466
+ }
1467
+ }
1468
+ class ValidationError extends Data.TaggedError("ValidationError") {
1469
+ get message() {
1470
+ return `Validation failed for field '${this.field}': ${this.constraint}`;
1471
+ }
1472
+ static url(url) {
1473
+ return new ValidationError({
1474
+ field: "url",
1475
+ value: url,
1476
+ constraint: "Invalid URL format"
1477
+ });
1478
+ }
1479
+ }
1480
+ class BrowserError extends Data.TaggedError("BrowserError") {
1481
+ get message() {
1482
+ return `Browser operation '${this.operation}' failed${this.browserId ? ` for browser ${this.browserId}` : ""}`;
1483
+ }
1484
+ static notLaunched() {
1485
+ return new BrowserError({
1486
+ operation: "access",
1487
+ cause: "Browser not launched"
1488
+ });
1489
+ }
1490
+ static launchFailed(cause) {
1491
+ return new BrowserError({
1492
+ operation: "launch",
1493
+ cause
1494
+ });
1495
+ }
1496
+ }
1497
+ class PageError extends Data.TaggedError("PageError") {
1498
+ get message() {
1499
+ return `Page operation '${this.operation}' failed for ${this.url}${this.selector ? ` with selector '${this.selector}'` : ""}`;
1500
+ }
1501
+ }
1502
+ class StateError extends Data.TaggedError("StateError") {
1503
+ get message() {
1504
+ return `State ${this.operation} operation failed${this.stateKey ? ` for key '${this.stateKey}'` : ""}`;
1505
+ }
1506
+ }
1507
+ class SessionError extends Data.TaggedError("SessionError") {
1508
+ get message() {
1509
+ return `Session operation '${this.operation}' failed${this.sessionId ? ` for session ${this.sessionId}` : ""}`;
1510
+ }
1511
+ static noActiveSession() {
1512
+ return new SessionError({
1513
+ operation: "access",
1514
+ cause: "No active session"
1515
+ });
1516
+ }
1517
+ }
1518
+ class FileSystemError2 extends Data.TaggedError("FileSystemError") {
1519
+ get message() {
1520
+ return `File system ${this.operation} operation failed for path: ${this.path}`;
1521
+ }
1522
+ }
1523
+ class CrawlError extends Data.TaggedError("CrawlError") {
1524
+ get message() {
1525
+ return `Failed to crawl ${this.url} at depth ${this.depth}: ${this.reason}`;
1526
+ }
1527
+ static maxDepthReached(url, depth) {
1528
+ return new CrawlError({
1529
+ url,
1530
+ depth,
1531
+ reason: "Maximum depth reached"
1532
+ });
1533
+ }
1534
+ static robotsBlocked(url) {
1535
+ return new CrawlError({
1536
+ url,
1537
+ depth: 0,
1538
+ reason: "Blocked by robots.txt"
1539
+ });
1540
+ }
1541
+ }
1542
+ class QueueError extends Data.TaggedError("QueueError") {
1543
+ get message() {
1544
+ return `Queue ${this.operation} operation failed${this.queueSize !== void 0 ? ` (queue size: ${this.queueSize})` : ""}`;
1545
+ }
1546
+ }
1547
+ class ConfigError extends Data.TaggedError("ConfigError") {
1548
+ get message() {
1549
+ return `Configuration error for '${this.field}': ${this.reason}`;
1550
+ }
1551
+ static invalid(field, value, expected) {
1552
+ return new ConfigError({
1553
+ field,
1554
+ value,
1555
+ reason: `Expected ${expected}, got ${typeof value}`
1556
+ });
1557
+ }
1558
+ }
1559
+ class MiddlewareError2 extends Data.TaggedError("MiddlewareError") {
1560
+ get message() {
1561
+ return `Middleware '${this.middlewareName}' failed during ${this.phase} phase`;
1562
+ }
1563
+ }
1564
+ const DEFAULT_DEDUPLICATION_STRATEGY = {
1565
+ wwwHandling: "ignore",
1566
+ protocolHandling: "prefer-https",
1567
+ trailingSlashHandling: "ignore",
1568
+ queryParamHandling: "preserve",
1569
+ fragmentHandling: "ignore"
1570
+ };
1571
+ const parseUrl = (url) => Effect.try({
1572
+ try: () => new URL(url),
1573
+ catch: () => ValidationError.url(url)
1574
+ });
1575
+ const normalizeUrl = (url, strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Effect.gen(function* () {
1576
+ const parsed = yield* parseUrl(url);
1577
+ if (strategy.protocolHandling === "prefer-https") {
1578
+ parsed.protocol = "https:";
1579
+ }
1580
+ let domain = parsed.hostname.toLowerCase();
1581
+ const hasWww = domain.startsWith("www.");
1582
+ const domainWithoutWww = hasWww ? domain.substring(4) : domain;
1583
+ switch (strategy.wwwHandling) {
1584
+ case "ignore":
1585
+ case "prefer-non-www":
1586
+ domain = domainWithoutWww;
1587
+ parsed.hostname = domain;
1588
+ break;
1589
+ case "prefer-www":
1590
+ if (!hasWww) {
1591
+ domain = `www.${domain}`;
1592
+ parsed.hostname = domain;
1593
+ }
1594
+ break;
1595
+ }
1596
+ if (strategy.trailingSlashHandling === "ignore") {
1597
+ parsed.pathname = parsed.pathname.replace(/\/$/, "") || "/";
1598
+ }
1599
+ if (strategy.queryParamHandling === "ignore") {
1600
+ parsed.search = "";
1601
+ } else if (strategy.queryParamHandling === "sort") {
1602
+ const params = new URLSearchParams(parsed.search);
1603
+ const sorted = Array.from(params.entries()).sort(([a], [b]) => a.localeCompare(b));
1604
+ parsed.search = new URLSearchParams(sorted).toString();
1605
+ }
1606
+ if (strategy.fragmentHandling === "ignore") {
1607
+ parsed.hash = "";
1608
+ }
1609
+ return {
1610
+ original: url,
1611
+ normalized: parsed.toString(),
1612
+ domain: domainWithoutWww
1613
+ };
1614
+ });
1615
+ const deduplicateUrls = (urls, strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Effect.gen(function* () {
1616
+ const domainMap = yield* Ref.make(/* @__PURE__ */ new Map());
1617
+ const skipped = yield* Ref.make([]);
1618
+ let invalidCount = 0;
1619
+ yield* Effect.all(
1620
+ urls.map(
1621
+ (urlObj) => pipe(
1622
+ normalizeUrl(urlObj.url, strategy),
1623
+ Effect.tap(
1624
+ (normalized) => Effect.gen(function* () {
1625
+ const currentMap = yield* Ref.get(domainMap);
1626
+ const key = strategy.wwwHandling === "preserve" ? normalized.normalized : normalized.domain;
1627
+ if (!currentMap.has(key)) {
1628
+ currentMap.set(key, urlObj);
1629
+ yield* Ref.set(domainMap, currentMap);
1630
+ } else {
1631
+ const existing = currentMap.get(key);
1632
+ let shouldReplace = false;
1633
+ if (strategy.wwwHandling === "prefer-www") {
1634
+ const existingHasWww = existing.url.includes("://www.");
1635
+ const newHasWww = urlObj.url.includes("://www.");
1636
+ shouldReplace = !existingHasWww && newHasWww;
1637
+ } else if (strategy.wwwHandling === "prefer-non-www") {
1638
+ const existingHasWww = existing.url.includes("://www.");
1639
+ const newHasWww = urlObj.url.includes("://www.");
1640
+ shouldReplace = existingHasWww && !newHasWww;
1641
+ }
1642
+ if (shouldReplace) {
1643
+ currentMap.set(key, urlObj);
1644
+ yield* Ref.set(domainMap, currentMap);
1645
+ yield* Ref.update(skipped, (arr) => [
1646
+ ...arr,
1647
+ { url: existing.url, reason: `Replaced by preferred variant: ${urlObj.url}` }
1648
+ ]);
1649
+ } else {
1650
+ yield* Ref.update(skipped, (arr) => [
1651
+ ...arr,
1652
+ { url: urlObj.url, reason: `Duplicate of: ${existing.url}` }
1653
+ ]);
1654
+ }
1655
+ }
1656
+ })
1657
+ ),
1658
+ Effect.catchAll(
1659
+ (error) => Effect.gen(function* () {
1660
+ invalidCount++;
1661
+ yield* Ref.update(skipped, (arr) => [
1662
+ ...arr,
1663
+ { url: urlObj.url, reason: `Invalid URL: ${error.message}` }
1664
+ ]);
1665
+ yield* Effect.logWarning(`Invalid URL skipped: ${urlObj.url}`);
1666
+ })
1667
+ )
1668
+ )
1669
+ ),
1670
+ { concurrency: "unbounded" }
1671
+ );
1672
+ const finalMap = yield* Ref.get(domainMap);
1673
+ const finalSkipped = yield* Ref.get(skipped);
1674
+ const deduplicated = Array.from(finalMap.values());
1675
+ return {
1676
+ deduplicated,
1677
+ skipped: finalSkipped,
1678
+ stats: {
1679
+ total: urls.length,
1680
+ unique: deduplicated.length,
1681
+ duplicates: finalSkipped.filter((s) => s.reason.startsWith("Duplicate")).length,
1682
+ invalid: invalidCount
1683
+ }
1684
+ };
1685
+ });
1367
1686
  class SpiderService extends Effect.Service()(
1368
- "@jambudipa.io/Spider",
1687
+ "@jambudipa/spider",
1369
1688
  {
1370
1689
  effect: Effect.gen(function* () {
1371
1690
  const robots = yield* RobotsService;
@@ -1375,7 +1694,7 @@ class SpiderService extends Effect.Service()(
1375
1694
  const maybeScheduler = yield* Effect.serviceOption(
1376
1695
  SpiderSchedulerService
1377
1696
  );
1378
- const scheduler = Option.isSome(maybeScheduler) ? maybeScheduler.value : null;
1697
+ Option.isSome(maybeScheduler) ? maybeScheduler.value : null;
1379
1698
  const self = {
1380
1699
  /**
1381
1700
  * Starts crawling from the specified URL and processes results through the provided sink.
@@ -1440,24 +1759,27 @@ class SpiderService extends Effect.Service()(
1440
1759
  return [input];
1441
1760
  };
1442
1761
  const urlsWithMetadata = normalizeUrlInput(startingUrls);
1443
- const domainMap = /* @__PURE__ */ new Map();
1444
- for (const urlObj of urlsWithMetadata) {
1445
- try {
1446
- const url = new URL(urlObj.url);
1447
- const domain = url.hostname.toLowerCase();
1448
- const normalizedDomain = domain.replace(/^www\./, "");
1449
- if (!domainMap.has(normalizedDomain)) {
1450
- domainMap.set(normalizedDomain, urlObj);
1451
- } else {
1452
- console.warn(
1453
- `Skipping duplicate domain: ${domain} (normalized: ${normalizedDomain}, URL: ${urlObj.url})`
1454
- );
1455
- }
1456
- } catch (e) {
1457
- console.error(`Invalid URL skipped: ${urlObj.url}`, e);
1762
+ const deduplicationResult = yield* deduplicateUrls(
1763
+ urlsWithMetadata,
1764
+ {
1765
+ // Strategy: Treat www and non-www as the same domain by default
1766
+ // This can be configured via Spider options if needed
1767
+ wwwHandling: "ignore",
1768
+ protocolHandling: "prefer-https",
1769
+ trailingSlashHandling: "ignore",
1770
+ queryParamHandling: "preserve",
1771
+ fragmentHandling: "ignore"
1458
1772
  }
1773
+ );
1774
+ const deduplicatedUrls = deduplicationResult.deduplicated;
1775
+ if (deduplicationResult.stats.duplicates > 0) {
1776
+ yield* Effect.logInfo(
1777
+ `URL deduplication: ${deduplicationResult.stats.total} total, ${deduplicationResult.stats.unique} unique, ${deduplicationResult.stats.duplicates} duplicates removed`
1778
+ );
1779
+ }
1780
+ for (const skipped of deduplicationResult.skipped) {
1781
+ yield* Effect.logDebug(`Skipped URL: ${skipped.url} - Reason: ${skipped.reason}`);
1459
1782
  }
1460
- const deduplicatedUrls = Array.from(domainMap.values());
1461
1783
  const concurrency = yield* config.getConcurrency();
1462
1784
  if (deduplicatedUrls.length > 1) {
1463
1785
  const configOptions = yield* config.getOptions();
@@ -2268,13 +2590,6 @@ class SpiderService extends Effect.Service()(
2268
2590
  * ```
2269
2591
  */
2270
2592
  resume: (stateKey, _sink, _persistence) => Effect.gen(function* () {
2271
- if (!scheduler) {
2272
- return yield* Effect.fail(
2273
- new Error(
2274
- "Resume functionality requires SpiderSchedulerService to be available. Make sure resumability is enabled in SpiderConfig and SpiderSchedulerService is provided."
2275
- )
2276
- );
2277
- }
2278
2593
  const config = yield* SpiderConfig;
2279
2594
  if (!config) {
2280
2595
  return yield* Effect.fail(
@@ -2291,10 +2606,75 @@ class SpiderService extends Effect.Service()(
2291
2606
  )
2292
2607
  );
2293
2608
  }
2294
- console.log(`Resuming session: ${stateKey.id}`);
2609
+ const scheduler2 = yield* SpiderSchedulerService;
2610
+ const logger2 = yield* SpiderLogger;
2611
+ yield* logger2.logSpiderLifecycle("start", {
2612
+ sessionId: stateKey.id,
2613
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
2614
+ });
2615
+ const savedState = yield* Effect.tryPromise({
2616
+ try: async () => {
2617
+ return scheduler2.getState ? await Effect.runPromise(scheduler2.getState()) : null;
2618
+ },
2619
+ catch: (error) => new StateError({
2620
+ operation: "load",
2621
+ stateKey: stateKey.id,
2622
+ cause: error
2623
+ })
2624
+ });
2625
+ if (!savedState) {
2626
+ return yield* Effect.fail(
2627
+ new StateError({
2628
+ operation: "load",
2629
+ stateKey: stateKey.id,
2630
+ cause: "No saved state found for session"
2631
+ })
2632
+ );
2633
+ }
2634
+ const restoredUrls = yield* Effect.try({
2635
+ try: () => {
2636
+ const urls = [];
2637
+ if (savedState && typeof savedState === "object") {
2638
+ if ("pendingUrls" in savedState && Array.isArray(savedState.pendingUrls)) {
2639
+ urls.push(...savedState.pendingUrls);
2640
+ }
2641
+ if ("visitedUrls" in savedState && Array.isArray(savedState.visitedUrls)) ;
2642
+ }
2643
+ return urls;
2644
+ },
2645
+ catch: (error) => new ParseError({
2646
+ input: "saved state",
2647
+ expected: "crawl state",
2648
+ cause: error
2649
+ })
2650
+ });
2651
+ yield* logger2.logSpiderLifecycle("start", {
2652
+ sessionId: stateKey.id,
2653
+ pendingUrls: restoredUrls.length,
2654
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
2655
+ });
2656
+ if (restoredUrls.length > 0) {
2657
+ const crawlResult = yield* self.crawl(
2658
+ restoredUrls,
2659
+ _sink,
2660
+ {}
2661
+ );
2662
+ yield* logger2.logSpiderLifecycle("complete", {
2663
+ sessionId: stateKey.id,
2664
+ urlsProcessed: restoredUrls.length,
2665
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
2666
+ });
2667
+ return {
2668
+ ...crawlResult,
2669
+ resumed: true,
2670
+ sessionId: stateKey.id
2671
+ };
2672
+ }
2295
2673
  return {
2296
2674
  completed: true,
2297
- resumed: true
2675
+ resumed: true,
2676
+ sessionId: stateKey.id,
2677
+ urlsProcessed: 0
2298
2678
  };
2299
2679
  }),
2300
2680
  /**
@@ -2321,6 +2701,80 @@ class SpiderService extends Effect.Service()(
2321
2701
  }
2322
2702
  ) {
2323
2703
  }
2704
+ class SpiderRequest extends Data.Class {
2705
+ /**
2706
+ * Create a SpiderRequest from a CrawlTask
2707
+ */
2708
+ static fromTask(task, headers, meta) {
2709
+ return new SpiderRequest({
2710
+ task,
2711
+ headers: Option.fromNullable(headers),
2712
+ meta: Option.fromNullable(meta)
2713
+ });
2714
+ }
2715
+ /**
2716
+ * Add or update headers
2717
+ */
2718
+ withHeaders(headers) {
2719
+ const existingHeaders = Option.getOrElse(this.headers, () => ({}));
2720
+ return new SpiderRequest({
2721
+ ...this,
2722
+ headers: Option.some({ ...existingHeaders, ...headers })
2723
+ });
2724
+ }
2725
+ /**
2726
+ * Add or update metadata
2727
+ */
2728
+ withMeta(meta) {
2729
+ const existingMeta = Option.getOrElse(this.meta, () => ({}));
2730
+ return new SpiderRequest({
2731
+ ...this,
2732
+ meta: Option.some({ ...existingMeta, ...meta })
2733
+ });
2734
+ }
2735
+ }
2736
+ class SpiderResponse extends Data.Class {
2737
+ /**
2738
+ * Create a SpiderResponse from PageData
2739
+ */
2740
+ static fromPageData(pageData, statusCode, headers, meta) {
2741
+ return new SpiderResponse({
2742
+ pageData,
2743
+ statusCode: Option.fromNullable(statusCode),
2744
+ headers: Option.fromNullable(headers),
2745
+ meta: Option.fromNullable(meta)
2746
+ });
2747
+ }
2748
+ /**
2749
+ * Update the page data
2750
+ */
2751
+ withPageData(pageData) {
2752
+ return new SpiderResponse({
2753
+ ...this,
2754
+ pageData
2755
+ });
2756
+ }
2757
+ /**
2758
+ * Add or update metadata
2759
+ */
2760
+ withMeta(meta) {
2761
+ const existingMeta = Option.getOrElse(this.meta, () => ({}));
2762
+ return new SpiderResponse({
2763
+ ...this,
2764
+ meta: Option.some({ ...existingMeta, ...meta })
2765
+ });
2766
+ }
2767
+ /**
2768
+ * Check if the response was successful (2xx status code)
2769
+ */
2770
+ isSuccessful() {
2771
+ return Option.match(this.statusCode, {
2772
+ onNone: () => true,
2773
+ // Assume success if no status code
2774
+ onSome: (code) => code >= 200 && code < 300
2775
+ });
2776
+ }
2777
+ }
2324
2778
  class MiddlewareManager extends Effect.Service()(
2325
2779
  "@jambudipa.io/MiddlewareManager",
2326
2780
  {
@@ -2498,13 +2952,9 @@ class UserAgentMiddleware extends Effect.Service()(
2498
2952
  {
2499
2953
  effect: Effect.sync(() => ({
2500
2954
  create: (userAgent) => ({
2501
- processRequest: (request) => Effect.succeed({
2502
- ...request,
2503
- headers: {
2504
- ...request.headers,
2505
- "User-Agent": userAgent
2506
- }
2507
- })
2955
+ processRequest: (request) => Effect.succeed(
2956
+ request.withHeaders({ "User-Agent": userAgent })
2957
+ )
2508
2958
  })
2509
2959
  }))
2510
2960
  }
@@ -2533,14 +2983,18 @@ class StatsMiddleware extends Effect.Service()(
2533
2983
  }),
2534
2984
  processResponse: (response) => Effect.sync(() => {
2535
2985
  incr("responses_received");
2536
- if (response.statusCode) {
2537
- incr(`status_${response.statusCode}`);
2538
- if (response.statusCode >= 200 && response.statusCode < 300) {
2539
- incr("responses_success");
2540
- } else if (response.statusCode >= 400) {
2541
- incr("responses_error");
2986
+ Option.match(response.statusCode, {
2987
+ onNone: () => {
2988
+ },
2989
+ onSome: (statusCode) => {
2990
+ incr(`status_${statusCode}`);
2991
+ if (statusCode >= 200 && statusCode < 300) {
2992
+ incr("responses_success");
2993
+ } else if (statusCode >= 400) {
2994
+ incr("responses_error");
2995
+ }
2542
2996
  }
2543
- }
2997
+ });
2544
2998
  incr("bytes_downloaded", response.pageData.html.length);
2545
2999
  return response;
2546
3000
  }),
@@ -3559,6 +4013,207 @@ class FileStorageBackend {
3559
4013
  return path.join(this.baseDir, "sessions", key.id);
3560
4014
  };
3561
4015
  }
4016
/**
 * Tagged error produced when a value cannot be parsed as JSON.
 *
 * Constructed with `{ input, cause }`: `input` is the value that was handed
 * to the parser and `cause` is whatever the parser threw.
 */
class JsonParseError extends Data.TaggedError("JsonParseError") {
  /**
   * Human-readable description containing the cause and at most the first
   * 100 characters of the offending input.
   */
  get message() {
    let text;
    if (typeof this.input === "string") {
      text = this.input;
    } else {
      // JSON.parse accepts any value, so `input` can be undefined or null
      // (e.g. when a clone of `undefined` is attempted). Building the
      // message must never throw itself, or the original failure is hidden.
      try {
        text = String(this.input);
      } catch {
        text = Object.prototype.toString.call(this.input);
      }
    }
    const preview = text.length > 100 ? `${text.substring(0, 100)}...` : text;
    return `Failed to parse JSON: ${this.cause}. Input: "${preview}"`;
  }
}
4022
/**
 * Tagged error produced when a value cannot be serialized to JSON.
 *
 * Constructed with `{ input, cause }`: `input` is the value that was being
 * serialized and `cause` is whatever the serializer threw.
 */
class JsonStringifyError extends Data.TaggedError("JsonStringifyError") {
  /**
   * Human-readable description naming the type of the value and the cause.
   * For objects the constructor name is reported, falling back to "Object"
   * when no constructor name is available.
   */
  get message() {
    const value = this.input;
    let typeInfo = typeof value;
    if (typeInfo === "object") {
      typeInfo = value?.constructor?.name || "Object";
    }
    return `Failed to stringify value of type ${typeInfo}: ${this.cause}`;
  }
}
4028
/**
 * Tagged error produced when parsed JSON does not satisfy a schema.
 *
 * The message reads the `schemaName` and `cause` fields of the instance.
 */
class JsonSchemaValidationError extends Data.TaggedError("JsonSchemaValidationError") {
  /**
   * Human-readable description naming the schema and the validation cause.
   */
  get message() {
    const { schemaName, cause } = this;
    return `JSON validation failed for schema "${schemaName}": ${cause}`;
  }
}
4033
/**
 * Effect-based helpers for parsing, serializing and reshaping JSON values.
 * Every member returns an Effect; failures surface as tagged errors instead
 * of thrown exceptions.
 */
const JsonUtils = {
  /**
   * Safely parse JSON string
   *
   * Fails with JsonParseError when `JSON.parse` throws.
   *
   * @example
   * ```ts
   * const result = yield* JsonUtils.parse('{"name": "test"}');
   * // result: { name: "test" }
   * ```
   */
  parse: (input) => Effect.try({
    try: () => JSON.parse(input),
    catch: (cause) => new JsonParseError({ input, cause })
  }),
  /**
   * Parse JSON with schema validation
   *
   * Fails with JsonParseError when the text is not JSON, and with
   * JsonSchemaValidationError when decoding against `schema` throws.
   * `options` are merged over the default `{ errors: "all" }`.
   *
   * @example
   * ```ts
   * const UserSchema = Schema.Struct({
   *   name: Schema.String,
   *   age: Schema.Number
   * });
   *
   * const user = yield* JsonUtils.parseWithSchema(
   *   '{"name": "Alice", "age": 30}',
   *   UserSchema
   * );
   * ```
   */
  parseWithSchema: (input, schema, options) => Effect.gen(function* () {
    const parsed = yield* JsonUtils.parse(input);
    return yield* Effect.try({
      try: () => Schema.decodeUnknownSync(schema, {
        errors: "all",
        ...options
      })(parsed),
      catch: (cause) => new JsonSchemaValidationError({
        input: parsed,
        schemaName: schema.ast._tag || "Unknown",
        cause
      })
    });
  }),
  /**
   * Safely stringify value to JSON
   *
   * Fails with JsonStringifyError when `JSON.stringify` throws. Note the
   * argument order: `space` comes before `replacer`.
   *
   * @example
   * ```ts
   * const json = yield* JsonUtils.stringify({ name: "test" });
   * // json: '{"name":"test"}'
   *
   * const pretty = yield* JsonUtils.stringify({ name: "test" }, 2);
   * // pretty: '{\n  "name": "test"\n}'
   * ```
   */
  stringify: (value, space, replacer) => Effect.try({
    try: () => JSON.stringify(value, replacer, space),
    catch: (cause) => new JsonStringifyError({ input: value, cause })
  }),
  /**
   * Parse JSON with fallback value
   *
   * Any parse failure is replaced by `defaultValue`.
   *
   * @example
   * ```ts
   * const config = yield* JsonUtils.parseOrDefault(
   *   configStr,
   *   { debug: false }
   * );
   * ```
   */
  parseOrDefault: (input, defaultValue) => JsonUtils.parse(input).pipe(
    Effect.catchAll(() => Effect.succeed(defaultValue))
  ),
  /**
   * Parse JSON and return null on failure
   *
   * @example
   * ```ts
   * const data = yield* JsonUtils.parseOrNull(input);
   * if (data !== null) {
   *   // Use parsed data
   * }
   * ```
   */
  parseOrNull: (input) => JsonUtils.parse(input).pipe(
    Effect.catchAll(() => Effect.succeed(null))
  ),
  /**
   * Try to parse JSON and return boolean success
   *
   * @example
   * ```ts
   * const isValid = yield* JsonUtils.isValid('{"valid": true}');
   * // isValid: true
   * ```
   */
  isValid: (input) => JsonUtils.parse(input).pipe(
    Effect.map(() => true),
    Effect.catchAll(() => Effect.succeed(false))
  ),
  /**
   * Pretty print JSON with indentation
   *
   * @example
   * ```ts
   * const pretty = yield* JsonUtils.prettyPrint({ complex: { data: true } });
   * ```
   */
  prettyPrint: (value, indent = 2) => JsonUtils.stringify(value, indent),
  /**
   * Deep clone an object via JSON serialization
   * Note: This will lose functions, undefined values, symbols, etc.
   *
   * @example
   * ```ts
   * const clone = yield* JsonUtils.deepClone(originalObject);
   * ```
   */
  deepClone: (value) => Effect.gen(function* () {
    const json = yield* JsonUtils.stringify(value);
    return yield* JsonUtils.parse(json);
  }),
  /**
   * Merge two JSON objects
   *
   * Both inputs are deep-cloned first; the merge itself is shallow, with
   * top-level keys of `source` overriding those of `target`.
   *
   * @example
   * ```ts
   * const merged = yield* JsonUtils.merge(
   *   { a: 1 },
   *   { b: 2 }
   * );
   * // merged: { a: 1, b: 2 }
   * ```
   */
  merge: (target, source) => Effect.gen(function* () {
    const clonedTarget = yield* JsonUtils.deepClone(target);
    const clonedSource = yield* JsonUtils.deepClone(source);
    return { ...clonedTarget, ...clonedSource };
  }),
  /**
   * Extract a subset of JSON properties
   *
   * Only own properties of `obj` are copied; inherited members such as
   * `toString` are ignored, and requested keys that are absent are skipped.
   *
   * @example
   * ```ts
   * const subset = yield* JsonUtils.pick(
   *   { a: 1, b: 2, c: 3 },
   *   ['a', 'c']
   * );
   * // subset: { a: 1, c: 3 }
   * ```
   */
  pick: (obj, keys) => Effect.succeed(
    // Object.fromEntries defines each key as an own data property, so a
    // "__proto__" key (as produced by JSON.parse) is copied as data instead
    // of silently replacing the result's prototype.
    Object.fromEntries(
      keys
        .filter((key) => Object.prototype.hasOwnProperty.call(obj, key))
        .map((key) => [key, obj[key]])
    )
  ),
  /**
   * Omit properties from JSON object
   *
   * Copies every own enumerable property of `obj` whose key is not listed
   * in `keys`.
   *
   * @example
   * ```ts
   * const result = yield* JsonUtils.omit(
   *   { a: 1, b: 2, c: 3 },
   *   ['b']
   * );
   * // result: { a: 1, c: 3 }
   * ```
   */
  omit: (obj, keys) => {
    // Build the lookup once rather than scanning `keys` for every property.
    const excluded = new Set(keys);
    return Effect.succeed(
      // See pick: fromEntries keeps a "__proto__" key as plain data.
      Object.fromEntries(
        Object.keys(obj)
          .filter((key) => !excluded.has(key))
          .map((key) => [key, obj[key]])
      )
    );
  }
};
3562
4217
  class CookieManager extends Context.Tag("CookieManager")() {
3563
4218
  }
3564
4219
  const makeCookieManager = () => Effect.gen(function* () {
@@ -3618,19 +4273,17 @@ const makeCookieManager = () => Effect.gen(function* () {
3618
4273
  }),
3619
4274
  catch: () => new Error("Failed to serialize cookies")
3620
4275
  });
3621
- return JSON.stringify(serialized);
4276
+ return yield* JsonUtils.stringify(serialized);
3622
4277
  }).pipe(Effect.orElseSucceed(() => "{}")),
3623
4278
  deserialize: (data) => Effect.gen(function* () {
3624
- try {
3625
- const parsed = JSON.parse(data);
3626
- const newJar = CookieJar.deserialize(parsed);
3627
- yield* Effect.tryPromise({
3628
- try: () => Promise.resolve(newJar),
3629
- catch: () => new Error("Failed to deserialize cookie jar")
3630
- }).pipe(Effect.flatMap((jar2) => Ref.set(jarRef, jar2)));
3631
- } catch (error) {
3632
- yield* Effect.fail(new Error(`Invalid cookie data: ${error}`));
3633
- }
4279
+ const parsed = yield* JsonUtils.parse(data).pipe(
4280
+ Effect.mapError((error) => new Error(`Invalid cookie JSON format: ${error.message}`))
4281
+ );
4282
+ const newJar = yield* Effect.tryPromise({
4283
+ try: () => CookieJar.deserialize(parsed),
4284
+ catch: (error) => new Error(`Failed to deserialize cookie jar: ${error}`)
4285
+ });
4286
+ yield* Ref.set(jarRef, newJar);
3634
4287
  })
3635
4288
  };
3636
4289
  });
@@ -3643,7 +4296,7 @@ class EnhancedHttpClient extends Context.Tag("EnhancedHttpClient")() {
3643
4296
  const makeEnhancedHttpClient = Effect.gen(function* () {
3644
4297
  const logger = yield* SpiderLogger;
3645
4298
  const cookieManager = yield* CookieManager;
3646
- const makeRequest = (url, options = {}) => Effect.gen(function* () {
4299
+ const makeRequest = (url, options = {}) => Effect.gen(function* (_) {
3647
4300
  const startMs = Date.now();
3648
4301
  const domain = new URL(url).hostname;
3649
4302
  const cookieHeader = yield* cookieManager.getCookieHeader(url);
@@ -3656,12 +4309,15 @@ const makeEnhancedHttpClient = Effect.gen(function* () {
3656
4309
  }
3657
4310
  if (options.method === "POST" && options.body && !headers["Content-Type"]) {
3658
4311
  if (typeof options.body === "string") {
3659
- try {
3660
- JSON.parse(options.body);
3661
- headers["Content-Type"] = "application/json";
3662
- } catch {
3663
- headers["Content-Type"] = "application/x-www-form-urlencoded";
3664
- }
4312
+ const isJson = yield* Effect.succeed((() => {
4313
+ try {
4314
+ JSON.parse(options.body);
4315
+ return true;
4316
+ } catch {
4317
+ return false;
4318
+ }
4319
+ })());
4320
+ headers["Content-Type"] = isJson ? "application/json" : "application/x-www-form-urlencoded";
3665
4321
  } else if (options.body instanceof FormData) ;
3666
4322
  else if (options.body instanceof URLSearchParams) {
3667
4323
  headers["Content-Type"] = "application/x-www-form-urlencoded";
@@ -3697,34 +4353,93 @@ const makeEnhancedHttpClient = Effect.gen(function* () {
3697
4353
  },
3698
4354
  catch: (error) => {
3699
4355
  clearTimeout(timeoutId);
3700
- return NetworkError.fromCause(url, error);
4356
+ if (error instanceof Error && error.name === "AbortError") {
4357
+ return new TimeoutError({
4358
+ operation: `HTTP ${options.method || "GET"}`,
4359
+ timeoutMs,
4360
+ url
4361
+ });
4362
+ }
4363
+ return new NetworkError2({
4364
+ url,
4365
+ method: options.method || "GET",
4366
+ cause: error
4367
+ });
3701
4368
  }
3702
4369
  });
4370
+ if (!response.ok) {
4371
+ return yield* Effect.fail(new NetworkError2({
4372
+ url: response.url,
4373
+ statusCode: response.status,
4374
+ method: options.method || "GET",
4375
+ cause: `HTTP ${response.status}: ${response.statusText}`
4376
+ }));
4377
+ }
3703
4378
  const body = yield* Effect.tryPromise({
3704
4379
  try: () => response.text(),
3705
- catch: (error) => ResponseError.fromCause(url, error)
4380
+ catch: (error) => new ParseError({
4381
+ input: url,
4382
+ expected: "text",
4383
+ cause: error
4384
+ })
3706
4385
  });
3707
4386
  const setCookieHeaders = response.headers.getSetCookie ? response.headers.getSetCookie() : response.headers.get("set-cookie")?.split(", ") || [];
3708
4387
  for (const cookieString of setCookieHeaders) {
3709
4388
  if (cookieString) {
3710
- yield* cookieManager.setCookie(cookieString, url).pipe(Effect.catchAll(() => Effect.void));
4389
+ yield* cookieManager.setCookie(cookieString, url).pipe(Effect.catchAll(() => Effect.succeed(void 0)));
3711
4390
  }
3712
4391
  }
3713
4392
  const responseHeaders = {};
3714
4393
  response.headers.forEach((value, key) => {
3715
4394
  responseHeaders[key] = value;
3716
4395
  });
3717
- return {
4396
+ const result = {
3718
4397
  url: response.url,
3719
4398
  status: response.status,
3720
4399
  statusText: response.statusText,
3721
4400
  headers: responseHeaders,
3722
4401
  body,
3723
- cookies: setCookieHeaders
4402
+ cookies: setCookieHeaders.length > 0 ? setCookieHeaders : void 0
3724
4403
  };
4404
+ return result;
3725
4405
  });
4406
+ const makeRequestWithRetry = (url, options = {}) => {
4407
+ const retries = options.retries ?? 3;
4408
+ const retryDelay = options.retryDelay ?? 1e3;
4409
+ const retrySchedule = Schedule.exponential(Duration.millis(retryDelay), 2).pipe(
4410
+ Schedule.compose(Schedule.recurs(retries)),
4411
+ Schedule.tapInput(
4412
+ (error) => Effect.gen(function* () {
4413
+ yield* logger.logEdgeCase(
4414
+ new URL(url).hostname,
4415
+ "http_request_retry",
4416
+ {
4417
+ url,
4418
+ method: options.method || "GET",
4419
+ error: error instanceof Error ? error.message : String(error),
4420
+ attempt: retries
4421
+ }
4422
+ );
4423
+ })
4424
+ )
4425
+ );
4426
+ return makeRequest(url, options).pipe(
4427
+ Effect.retry({
4428
+ schedule: retrySchedule,
4429
+ while: (error) => {
4430
+ if (error instanceof NetworkError2) {
4431
+ if (error.statusCode && error.statusCode >= 400 && error.statusCode < 500) {
4432
+ return false;
4433
+ }
4434
+ return true;
4435
+ }
4436
+ return error instanceof TimeoutError;
4437
+ }
4438
+ })
4439
+ );
4440
+ };
3726
4441
  return {
3727
- get: (url, options) => makeRequest(url, { ...options, method: "GET" }),
4442
+ get: (url, options) => makeRequestWithRetry(url, { ...options, method: "GET" }),
3728
4443
  post: (url, data, options) => Effect.gen(function* () {
3729
4444
  let body;
3730
4445
  if (data) {
@@ -3734,15 +4449,15 @@ const makeEnhancedHttpClient = Effect.gen(function* () {
3734
4449
  body = JSON.stringify(data);
3735
4450
  }
3736
4451
  }
3737
- return yield* makeRequest(url, { ...options, method: "POST", body });
4452
+ return yield* makeRequestWithRetry(url, { ...options, method: "POST", body });
3738
4453
  }),
3739
- request: makeRequest,
4454
+ request: makeRequestWithRetry,
3740
4455
  submitForm: (url, formData, options) => Effect.gen(function* () {
3741
4456
  const params = new URLSearchParams();
3742
4457
  for (const [key, value] of Object.entries(formData)) {
3743
4458
  params.append(key, value);
3744
4459
  }
3745
- return yield* makeRequest(url, {
4460
+ return yield* makeRequestWithRetry(url, {
3746
4461
  ...options,
3747
4462
  method: "POST",
3748
4463
  body: params,
@@ -3856,17 +4571,23 @@ const makeSessionStore = Effect.gen(function* () {
3856
4571
  yield* cookieManager.clearCookies();
3857
4572
  }),
3858
4573
  isSessionValid: () => Effect.gen(function* () {
3859
- const session = yield* Effect.gen(function* () {
4574
+ const sessionOption = yield* Effect.gen(function* () {
3860
4575
  const sessionId = yield* Ref.get(currentSessionId);
3861
- if (Option.isNone(sessionId)) return null;
4576
+ if (Option.isNone(sessionId)) {
4577
+ return Option.none();
4578
+ }
3862
4579
  const sessionsMap = yield* Ref.get(sessions);
3863
- return sessionsMap.get(sessionId.value) || null;
4580
+ return Option.fromNullable(sessionsMap.get(sessionId.value));
4581
+ });
4582
+ return Option.match(sessionOption, {
4583
+ onNone: () => false,
4584
+ onSome: (session) => {
4585
+ if (session.expiresAt && session.expiresAt < /* @__PURE__ */ new Date()) {
4586
+ return false;
4587
+ }
4588
+ return true;
4589
+ }
3864
4590
  });
3865
- if (!session) return false;
3866
- if (session.expiresAt && session.expiresAt < /* @__PURE__ */ new Date()) {
3867
- return false;
3868
- }
3869
- return true;
3870
4591
  }),
3871
4592
  updateSessionData: (data) => Effect.gen(function* () {
3872
4593
  const sessionId = yield* Ref.get(currentSessionId);
@@ -3900,25 +4621,25 @@ const makeSessionStore = Effect.gen(function* () {
3900
4621
  });
3901
4622
  }),
3902
4623
  importSession: (data) => Effect.gen(function* () {
3903
- try {
3904
- const parsed = JSON.parse(data);
3905
- const session = {
4624
+ const parsed = yield* Effect.try({
4625
+ try: () => JSON.parse(data),
4626
+ catch: (error) => new Error(`Invalid session JSON: ${error}`)
4627
+ });
4628
+ const session = yield* Effect.try({
4629
+ try: () => ({
3906
4630
  ...parsed,
3907
4631
  tokens: new Map(parsed.tokens || []),
3908
4632
  createdAt: new Date(parsed.createdAt),
3909
4633
  lastUsedAt: new Date(parsed.lastUsedAt),
3910
4634
  expiresAt: parsed.expiresAt ? new Date(parsed.expiresAt) : void 0
3911
- };
3912
- const sessionsMap = yield* Ref.get(sessions);
3913
- sessionsMap.set(session.id, session);
3914
- yield* Ref.set(sessions, sessionsMap);
3915
- yield* Effect.gen(function* () {
3916
- yield* cookieManager.deserialize(session.cookies);
3917
- yield* Ref.set(currentSessionId, Option.some(session.id));
3918
- });
3919
- } catch (error) {
3920
- yield* Effect.fail(new Error(`Invalid session data: ${error}`));
3921
- }
4635
+ }),
4636
+ catch: (error) => new Error(`Failed to reconstruct session: ${error}`)
4637
+ });
4638
+ const sessionsMap = yield* Ref.get(sessions);
4639
+ sessionsMap.set(session.id, session);
4640
+ yield* Ref.set(sessions, sessionsMap);
4641
+ yield* cookieManager.deserialize(session.cookies);
4642
+ yield* Ref.set(currentSessionId, Option.some(session.id));
3922
4643
  })
3923
4644
  };
3924
4645
  });
@@ -4627,16 +5348,16 @@ export {
4627
5348
  EnhancedHttpClient,
4628
5349
  EnhancedHttpClientLive,
4629
5350
  FileStorageBackend,
4630
- FileSystemError,
5351
+ FileSystemError$1 as FileSystemError,
4631
5352
  FullStatePersistence,
4632
5353
  HybridPersistence,
4633
5354
  LinkExtractionError,
4634
5355
  LinkExtractorService,
4635
5356
  LinkExtractorServiceLayer,
4636
5357
  LoggingMiddleware,
4637
- MiddlewareError,
5358
+ MiddlewareError$1 as MiddlewareError,
4638
5359
  MiddlewareManager,
4639
- NetworkError,
5360
+ NetworkError$1 as NetworkError,
4640
5361
  PageDataSchema,
4641
5362
  PersistenceError$1 as PersistenceError,
4642
5363
  PriorityRequest,