@jambudipa/spider 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  export type { ISpider, ISpiderScheduler, IMiddlewareManager, IRateLimitMiddleware, ILoggingMiddleware, IUserAgentMiddleware, IStatsMiddleware, } from './lib/api-facades.js';
2
2
  export * from './lib/Spider/Spider.service.js';
3
+ export { SPIDER_DEFAULTS } from './lib/Spider/Spider.defaults.js';
3
4
  export * from './lib/Robots/Robots.service.js';
4
5
  export * from './lib/Scraper/Scraper.service.js';
5
6
  export * from './lib/PageData/PageData.js';
@@ -20,16 +21,24 @@ export { StateDelta, PersistenceError as ResumabilityError, DEFAULT_HYBRID_CONFI
20
21
  export { ResumabilityService, ResumabilityConfigs, createStateOperation, } from './lib/Resumability/Resumability.service.js';
21
22
  export { FullStatePersistence, DeltaPersistence, HybridPersistence, } from './lib/Resumability/strategies.js';
22
23
  export { FileStorageBackend } from './lib/Resumability/backends/FileStorageBackend.js';
24
+ export type { DatabaseClientInterface, PostgresStorageConfig, } from './lib/Resumability/backends/PostgresStorageBackend.js';
25
+ export { PostgresStorageBackend } from './lib/Resumability/backends/PostgresStorageBackend.js';
23
26
  export { NetworkError, ResponseError, RobotsTxtError, ConfigurationError, MiddlewareError, FileSystemError, PersistenceError, ContentTypeError, RequestAbortError, AdapterNotInitialisedError, BrowserError, BrowserCleanupError, TimeoutError, ParseError, ValidationError, PageError, StateError, SessionError, CrawlError, QueueError, ConfigError, isSpiderError, isNetworkError, isBrowserError, } from './lib/errors/effect-errors.js';
24
27
  export type { SpiderError, AllSpiderErrors } from './lib/errors/effect-errors.js';
25
28
  export type { SpiderLogEvent, SpiderLogger, } from './lib/Logging/SpiderLogger.service.js';
26
29
  export { SpiderLogger as SpiderLoggerTag, makeSpiderLogger, SpiderLoggerLive, } from './lib/Logging/SpiderLogger.service.js';
30
+ export type { LoggingFetchFn } from './lib/Logging/FetchLogger.js';
31
+ export { FetchError, makeLoggingFetch, LoggingFetch, } from './lib/Logging/FetchLogger.js';
27
32
  export type { CookieManagerService, EnhancedHttpClientService, HttpRequestOptions, HttpResponse, Session, Credentials, SessionStoreService, TokenInfo, TokenExtractorService, } from './lib/HttpClient/index.js';
28
33
  export { CookieManager, makeCookieManager, CookieManagerLive, EnhancedHttpClient, makeEnhancedHttpClient, EnhancedHttpClientLive, SessionStore, makeSessionStore, SessionStoreLive, TokenExtractor, makeTokenExtractor, TokenExtractorLive, } from './lib/HttpClient/index.js';
29
34
  export type { Token, StateManagerService } from './lib/StateManager/index.js';
30
- export { TokenType, StateManager, makeStateManager, StateManagerLive, } from './lib/StateManager/index.js';
35
+ export { TokenType, StateManager, makeStateManager, StateManagerLive, CSRFTokenNotFoundError, APITokenNotFoundError, TokenNotFoundError, TokenExpiredError, StorageKeyNotFoundError, } from './lib/StateManager/index.js';
31
36
  export type { BrowserEngineConfig, BrowserEngineServiceInterface, PageElement, } from './lib/BrowserEngine/BrowserEngine.service.js';
32
37
  export { BrowserEngineService, BrowserEngineLive, BrowserEngineWithConfig, withBrowser, } from './lib/BrowserEngine/BrowserEngine.service.js';
33
- export type { LoginCredentials, ScrapingSession, WebScrapingEngineService, } from './lib/WebScrapingEngine/index.js';
34
- export { WebScrapingEngine, makeWebScrapingEngine, WebScrapingEngineLive, } from './lib/WebScrapingEngine/index.js';
38
+ export type { LoginCredentials, ScrapingSession, WebScrapingEngineService, WebScrapingEngineError, HttpOperationError, HttpPostOperationError, } from './lib/WebScrapingEngine/index.js';
39
+ export { WebScrapingEngine, makeWebScrapingEngine, WebScrapingEngineLive, LoginError, SessionNotValidError, SessionLoadError, } from './lib/WebScrapingEngine/index.js';
40
+ export { WorkerHealthMonitor } from './lib/WorkerHealth/WorkerHealthMonitor.service.js';
41
+ export type { DeduplicationStrategy, UrlWithMetadata, NormalizedUrl, } from './lib/utils/url-deduplication.js';
42
+ export { DEFAULT_DEDUPLICATION_STRATEGY, parseUrl, normalizeUrl, deduplicateUrls, createUrlDeduplicator, } from './lib/utils/url-deduplication.js';
43
+ export { safeJsonParse, toOption, fromPromise, cleanupResources, matchOption, } from './lib/utils/effect-migration.js';
35
44
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,YAAY,EACV,OAAO,EACP,gBAAgB,EAChB,kBAAkB,EAClB,oBAAoB,EACpB,kBAAkB,EAClB,oBAAoB,EACpB,gBAAgB,GACjB,MAAM,sBAAsB,CAAC;AAG9B,cAAc,gCAAgC,CAAC;AAC/C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,kCAAkC,CAAC;AACjD,cAAc,4BAA4B,CAAC;AAG3C,YAAY,EACV,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,YAAY,EACZ,gBAAgB,GACjB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EAAE,gBAAgB,EAAE,MAAM,kDAAkD,CAAC;AACzF,OAAO,EAAE,sBAAsB,EAAE,MAAM,kDAAkD,CAAC;AAG1F,YAAY,EAAE,gBAAgB,EAAE,MAAM,4CAA4C,CAAC;AACnF,OAAO,EACL,sBAAsB,EACtB,cAAc,EACd,eAAe,EACf,WAAW,GACZ,MAAM,4CAA4C,CAAC;AAGpD,YAAY,EACV,gBAAgB,EAChB,aAAa,EACb,cAAc,GACf,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,iBAAiB,EACjB,mBAAmB,EACnB,iBAAiB,EACjB,mBAAmB,EACnB,eAAe,GAChB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EACV,mBAAmB,EACnB,oBAAoB,EACpB,6BAA6B,GAC9B,MAAM,8CAA8C,CAAC;AACtD,OAAO,EACL,oBAAoB,EACpB,yBAAyB,EACzB,mBAAmB,GACpB,MAAM,8CAA8C,CAAC;AAGtD,YAAY,EACV,WAAW,EACX,SAAS,EACT,2BAA2B,GAC5B,MAAM,gCAAgC,CAAC;AAGxC,YAAY,EACV,mBAAmB,EACnB,cAAc,EACd,cAAc,EACd,mBAAmB,EACnB,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AACrC,YAAY,EAAE,kBAAkB,EAAE,MAAM,4CAA4C,CAAC;AACrF,OAAO,EACL,UAAU,EACV,gBAAgB,IAAI,iBAAiB,EACrC,qBAAqB,GACtB,MAAM,6BAA6B,CAAC;AACrC,OAAO,EACL,mBAAmB,EACnB,mBAAmB,EACnB,oBAAoB,GACrB,MAAM,4CAA4C,CAAC;AACpD,OAAO,EACL,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,kBAAkB,EAAE,MAAM,mDAAmD,CAAC;AAGvF,OAAO,EACL,YAAY,EACZ,aAAa,EACb,cAAc,EACd,kBAAkB,EAClB,eAAe,EACf,eAAe,EACf,gBAAgB,EAChB,gBAAgB,EAChB,iBAAiB,EACjB,0BAA0B,EAC1B,YAAY,EACZ,mBAAmB,EACnB,YAAY,EACZ,UAAU,EACV,eAAe,EACf,SAAS,EACT,UAAU,EACV,YAAY,EACZ,UAAU,EACV,UAAU,EACV,WAAW,EACX,aAAa,EACb,cAAc,EACd,cAAc,GACf,MAAM,+BAA+B,CAAC;AACvC,YAAY,EAAE,WAAW,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAGlF,YAAY,EACV,cAAc,EACd,YAAY,GACb,MAAM,uCAAuC,CAAC;AAC/C,OAAO,EACL,YAAY,IAAI,eAAe,EAC/B,gBAAgB,EAChB,gBAAgB,GACjB,MAAM,uCAAuC,CAAC;AAG/C,YAAY,EACV,oBAAoB,EACpB,yBAAyB,EACzB,kBAAkB,EAClB,YAAY,EACZ,OAAO,EACP,WAAW,EACX,mBAAmB,EACnB,SAAS
,EACT,qBAAqB,GACtB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EACL,aAAa,EACb,iBAAiB,EACjB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EACtB,sBAAsB,EACtB,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,EAChB,cAAc,EACd,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,2BAA2B,CAAC;AAGnC,YAAY,EAAE,KAAK,EAAE,mBAAmB,EAAE,MAAM,6BAA6B,CAAC;AAC9E,OAAO,EACL,SAAS,EACT,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,GACjB,MAAM,6BAA6B,CAAC;AAGrC,YAAY,EACV,mBAAmB,EACnB,6BAA6B,EAC7B,WAAW,GACZ,MAAM,8CAA8C,CAAC;AACtD,OAAO,EACL,oBAAoB,EACpB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,GACZ,MAAM,8CAA8C,CAAC;AAGtD,YAAY,EACV,gBAAgB,EAChB,eAAe,EACf,wBAAwB,GACzB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EACL,iBAAiB,EACjB,qBAAqB,EACrB,qBAAqB,GACtB,MAAM,kCAAkC,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,YAAY,EACV,OAAO,EACP,gBAAgB,EAChB,kBAAkB,EAClB,oBAAoB,EACpB,kBAAkB,EAClB,oBAAoB,EACpB,gBAAgB,GACjB,MAAM,sBAAsB,CAAC;AAG9B,cAAc,gCAAgC,CAAC;AAC/C,OAAO,EAAE,eAAe,EAAE,MAAM,iCAAiC,CAAC;AAClE,cAAc,gCAAgC,CAAC;AAC/C,cAAc,kCAAkC,CAAC;AACjD,cAAc,4BAA4B,CAAC;AAG3C,YAAY,EACV,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,YAAY,EACZ,gBAAgB,GACjB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EAAE,gBAAgB,EAAE,MAAM,kDAAkD,CAAC;AACzF,OAAO,EAAE,sBAAsB,EAAE,MAAM,kDAAkD,CAAC;AAG1F,YAAY,EAAE,gBAAgB,EAAE,MAAM,4CAA4C,CAAC;AACnF,OAAO,EACL,sBAAsB,EACtB,cAAc,EACd,eAAe,EACf,WAAW,GACZ,MAAM,4CAA4C,CAAC;AAGpD,YAAY,EACV,gBAAgB,EAChB,aAAa,EACb,cAAc,GACf,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,iBAAiB,EACjB,mBAAmB,EACnB,iBAAiB,EACjB,mBAAmB,EACnB,eAAe,GAChB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EACV,mBAAmB,EACnB,oBAAoB,EACpB,6BAA6B,GAC9B,MAAM,8CAA8C,CAAC;AACtD,OAAO,EACL,oBAAoB,EACpB,yBAAyB,EACzB,mBAAmB,GACpB,MAAM,8CAA8C,CAAC;AAGtD,YAAY,EACV,WAAW,EACX,SAAS,EACT,2BAA2B,GAC5B,MAAM,gCAAgC,CAAC;AAGxC,YAAY,EACV,mBAAmB,EACnB,cAAc,EACd,cAAc,EACd,mBAAmB,EACnB,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AACrC,YAAY,EAAE,kBAAkB,EAAE,MAAM,4CAA4C,CAAC;AACrF,OAAO,EACL,UAAU,EACV,gBAAgB,IAAI,iBAAiB,EACrC,qBAAqB,GACtB,MAAM,6BAA6B,CAAC;AACrC,OAAO,EACL,mBAAmB,EACnB,mBAAmB,EACnB,oBAAoB,GACrB,MAAM,4CAA4C,CAAC;AACpD,OAAO,EACL,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,kBAAkB,EAAE,MAAM,mDAAmD,CAAC;AACvF,YAAY,EACV,uBAAuB,EACvB,qBAAqB,GACtB,MAAM,uDAAuD,CAAC;AAC/D,OAAO,EAAE,sBAAsB,EAAE,MAAM,uDAAuD,CAAC;AAG/F,OAAO,EACL,YAAY,EACZ,aAAa,EACb,cAAc,EACd,kBAAkB,EAClB,eAAe,EACf,eAAe,EACf,gBAAgB,EAChB,gBAAgB,EAChB,iBAAiB,EACjB,0BAA0B,EAC1B,YAAY,EACZ,mBAAmB,EACnB,YAAY,EACZ,UAAU,EACV,eAAe,EACf,SAAS,EACT,UAAU,EACV,YAAY,EACZ,UAAU,EACV,UAAU,EACV,WAAW,EACX,aAAa,EACb,cAAc,EACd,cAAc,GACf,MAAM,+BAA+B,CAAC;AACvC,YAAY,EAAE,WAAW,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAGlF,YAAY,EACV,cAAc,EACd,YAAY,GACb,MAAM,uCAAuC,CAAC;AAC/C,OAAO,EACL,YAAY,IAAI,eAAe,EAC/
B,gBAAgB,EAChB,gBAAgB,GACjB,MAAM,uCAAuC,CAAC;AAG/C,YAAY,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AACnE,OAAO,EACL,UAAU,EACV,gBAAgB,EAChB,YAAY,GACb,MAAM,8BAA8B,CAAC;AAGtC,YAAY,EACV,oBAAoB,EACpB,yBAAyB,EACzB,kBAAkB,EAClB,YAAY,EACZ,OAAO,EACP,WAAW,EACX,mBAAmB,EACnB,SAAS,EACT,qBAAqB,GACtB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EACL,aAAa,EACb,iBAAiB,EACjB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EACtB,sBAAsB,EACtB,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,EAChB,cAAc,EACd,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,2BAA2B,CAAC;AAGnC,YAAY,EAAE,KAAK,EAAE,mBAAmB,EAAE,MAAM,6BAA6B,CAAC;AAC9E,OAAO,EACL,SAAS,EACT,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,EAChB,sBAAsB,EACtB,qBAAqB,EACrB,kBAAkB,EAClB,iBAAiB,EACjB,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AAGrC,YAAY,EACV,mBAAmB,EACnB,6BAA6B,EAC7B,WAAW,GACZ,MAAM,8CAA8C,CAAC;AACtD,OAAO,EACL,oBAAoB,EACpB,iBAAiB,EACjB,uBAAuB,EACvB,WAAW,GACZ,MAAM,8CAA8C,CAAC;AAGtD,YAAY,EACV,gBAAgB,EAChB,eAAe,EACf,wBAAwB,EACxB,sBAAsB,EACtB,kBAAkB,EAClB,sBAAsB,GACvB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EACL,iBAAiB,EACjB,qBAAqB,EACrB,qBAAqB,EACrB,UAAU,EACV,oBAAoB,EACpB,gBAAgB,GACjB,MAAM,kCAAkC,CAAC;AAG1C,OAAO,EAAE,mBAAmB,EAAE,MAAM,mDAAmD,CAAC;AAGxF,YAAY,EACV,qBAAqB,EACrB,eAAe,EACf,aAAa,GACd,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EACL,8BAA8B,EAC9B,QAAQ,EACR,YAAY,EACZ,eAAe,EACf,qBAAqB,GACtB,MAAM,kCAAkC,CAAC;AAG1C,OAAO,EACL,aAAa,EACb,QAAQ,EACR,WAAW,EACX,gBAAgB,EAChB,WAAW,GACZ,MAAM,iCAAiC,CAAC"}
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
1
- import { Effect, Layer, Option, Chunk, MutableHashSet, Schema, Data, pipe, Context, DateTime, Console, Duration, MutableHashMap, Queue, HashMap, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Struct, Ref } from "effect";
1
+ import { Effect, Layer, Option, Chunk, MutableHashSet, Schema, Data, pipe, Context, DateTime, Console, Duration, MutableHashMap, Queue, HashMap, Ref, HashSet, PubSub, MutableRef, Schedule, Stream, Fiber, Random, Struct } from "effect";
2
2
  import * as cheerio from "cheerio";
3
3
  import * as fs from "fs";
4
4
  import * as path from "path";
@@ -1747,6 +1747,62 @@ const deduplicateUrls = (urls, strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Eff
1747
1747
  }
1748
1748
  };
1749
1749
  });
1750
/**
 * Build a stateful URL deduplicator.
 *
 * Each URL is passed through `normalizeUrl` with the supplied strategy and is
 * tracked by the resulting `normalized` string, so URLs that normalize to the
 * same string are treated as one.
 *
 * @param strategy - Strategy forwarded to `normalizeUrl`; defaults to
 *   `DEFAULT_DEDUPLICATION_STRATEGY`.
 * @returns An Effect yielding `{ hasSeenUrl, markUrlSeen, getStats, reset }`.
 */
const createUrlDeduplicator = (strategy = DEFAULT_DEDUPLICATION_STRATEGY) => Effect.gen(function* () {
  const seenUrls = yield* Ref.make(HashSet.empty());
  const urlStats = yield* Ref.make({
    processed: 0,
    unique: 0,
    duplicates: 0
  });
  return {
    /**
     * Check if a URL has been seen (after normalization)
     */
    hasSeenUrl: (url) => Effect.gen(function* () {
      const normalized = yield* normalizeUrl(url, strategy);
      const seen = yield* Ref.get(seenUrls);
      return HashSet.has(seen, normalized.normalized);
    }),
    /**
     * Add a URL to the seen set.
     *
     * Resolves to `true` when the URL was new and `false` when it was a
     * duplicate. The membership test and the insertion happen in a single
     * `Ref.modify`, so two fibers marking the same URL cannot both observe
     * it as new.
     */
    markUrlSeen: (url) => Effect.gen(function* () {
      const normalized = yield* normalizeUrl(url, strategy);
      const key = normalized.normalized;
      // Ref.modify takes a function returning [result, nextState].
      const isNew = yield* Ref.modify(
        seenUrls,
        (seen) => HashSet.has(seen, key) ? [false, seen] : [true, HashSet.add(seen, key)]
      );
      yield* Ref.update(urlStats, (stats) => ({
        ...stats,
        processed: stats.processed + 1,
        unique: stats.unique + (isNew ? 1 : 0),
        duplicates: stats.duplicates + (isNew ? 0 : 1)
      }));
      return isNew;
    }),
    /**
     * Get deduplication statistics
     */
    getStats: () => Ref.get(urlStats),
    /**
     * Reset the deduplicator: clears the seen set and zeroes the counters.
     */
    reset: () => Effect.gen(function* () {
      yield* Ref.set(seenUrls, HashSet.empty());
      yield* Ref.set(urlStats, {
        processed: 0,
        unique: 0,
        duplicates: 0
      });
    })
  };
});
1750
1806
  const SPIDER_DEFAULTS = Object.freeze({
1751
1807
  /** Threshold in ms after which a worker is considered stale (60s) */
1752
1808
  STALE_WORKER_THRESHOLD_MS: 6e4,
@@ -5080,6 +5136,120 @@ const ResumabilityConfigs = {
5080
5136
  backend: new PostgresStorageBackend(dbClient, config)
5081
5137
  })
5082
5138
  };
5139
/**
 * Tagged error (`_tag` of "FetchError") raised when a fetch does not produce
 * a response. The message is derived from the instance's `url`, `reason`
 * and `durationMs` fields.
 */
class FetchError extends Data.TaggedError("FetchError") {
  /** One-line description: target URL, failure reason and elapsed time. */
  get message() {
    const { url, reason, durationMs } = this;
    return `Fetch failed for ${url}: ${reason} after ${durationMs}ms`;
  }
}
5144
/**
 * Effect that builds a logging wrapper around `globalThis.fetch`.
 *
 * Requires `SpiderLogger`. The produced function `(url, options)` returns an
 * Effect that logs a start event, performs the request with a 30 second
 * timeout, logs the outcome, and either succeeds with the `Response` or fails
 * with a `FetchError` whose `reason` is "network" or "timeout".
 */
const makeLoggingFetch = Effect.gen(function* () {
  const logger = yield* SpiderLogger;
  const timeoutDuration = Duration.seconds(30);

  // The hostname only labels log events. An unparsable URL must not crash the
  // generator as a defect; the fetch call itself reports it as a FetchError.
  const hostnameOf = (url) => {
    try {
      return new URL(url).hostname;
    } catch {
      return "unknown";
    }
  };

  // Run fetch so that it is cancelled when EITHER the Effect runtime's
  // interruption signal or the caller's own `options.signal` aborts.
  const fetchAbortable = (url, options, interruptSignal) => {
    const callerSignal = options?.signal;
    if (callerSignal == null) {
      return globalThis.fetch(url, { ...options, signal: interruptSignal });
    }
    const controller = new AbortController();
    const onInterrupt = () => controller.abort(interruptSignal.reason);
    const onCallerAbort = () => controller.abort(callerSignal.reason);
    if (interruptSignal.aborted) {
      onInterrupt();
    } else if (callerSignal.aborted) {
      onCallerAbort();
    }
    interruptSignal.addEventListener("abort", onInterrupt, { once: true });
    callerSignal.addEventListener("abort", onCallerAbort, { once: true });
    // Detach listeners once the request settles so a long-lived caller signal
    // does not accumulate one listener per request.
    return globalThis.fetch(url, { ...options, signal: controller.signal }).finally(() => {
      interruptSignal.removeEventListener("abort", onInterrupt);
      callerSignal.removeEventListener("abort", onCallerAbort);
    });
  };

  return (url, options) => Effect.gen(function* () {
    const startTime = yield* DateTime.now;
    const startMs = DateTime.toEpochMillis(startTime);
    const domain = hostnameOf(url);
    // Milliseconds elapsed since the request started, sampled when run.
    const elapsedMs = Effect.map(DateTime.now, (t) => DateTime.toEpochMillis(t) - startMs);
    const optionDetails = Option.fromNullable(options).pipe(
      Option.map((opts) => ({
        method: opts.method,
        headers: Object.keys(opts.headers ?? {})
      }))
    );
    yield* logger.logEvent({
      type: "edge_case",
      domain,
      url,
      message: "[FETCH_START] Starting fetch request",
      details: {
        case: "fetch_start",
        url,
        timestamp: DateTime.formatIso(startTime),
        options: Option.getOrUndefined(optionDetails)
      }
    });
    const fetchEffect = Effect.tryPromise({
      // tryPromise supplies an AbortSignal that fires on interruption (e.g.
      // the timeout below); forwarding it actually cancels the HTTP request.
      try: (signal) => fetchAbortable(url, options, signal),
      catch: (error) => new FetchError({
        url,
        reason: "network",
        // Placeholder; the real duration is filled in by the error handler.
        durationMs: 0,
        cause: error
      })
    });
    const fetchWithTimeout = fetchEffect.pipe(
      Effect.timeoutOption(timeoutDuration),
      Effect.flatMap(
        (maybeResponse) => Option.match(maybeResponse, {
          onNone: () => Effect.gen(function* () {
            const durationMs2 = yield* elapsedMs;
            yield* logger.logEvent({
              type: "edge_case",
              domain,
              url,
              message: `[FETCH_ABORT] Aborting fetch after ${durationMs2}ms`,
              details: {
                case: "fetch_abort",
                url,
                durationMs: durationMs2,
                reason: "timeout"
              }
            });
            return yield* Effect.fail(
              new FetchError({
                url,
                reason: "timeout",
                durationMs: Number(durationMs2)
              })
            );
          }),
          onSome: (response2) => Effect.succeed(response2)
        })
      )
    );
    const response = yield* fetchWithTimeout.pipe(
      Effect.catchAll(
        (error) => Effect.gen(function* () {
          const durationMs2 = yield* elapsedMs;
          // Network errors are created before the duration is known; rebuild
          // them with the measured value so the error (and its message) no
          // longer reports "after 0ms". Anything else passes through as-is.
          const failure = error._tag === "FetchError" && error.reason === "network" ? new FetchError({
            url: error.url,
            reason: error.reason,
            durationMs: durationMs2,
            cause: error.cause
          }) : error;
          yield* logger.logEvent({
            type: "edge_case",
            domain,
            url,
            message: `[FETCH_ERROR] Failed after ${durationMs2}ms`,
            details: {
              case: "fetch_failed",
              url,
              durationMs: durationMs2,
              error: failure._tag,
              message: failure.message,
              isAborted: failure.reason === "timeout"
            }
          });
          return yield* Effect.fail(failure);
        })
      )
    );
    const durationMs = yield* elapsedMs;
    yield* logger.logEvent({
      type: "edge_case",
      domain,
      url,
      message: `[FETCH_SUCCESS] Got response in ${durationMs}ms`,
      details: {
        case: "fetch_success",
        url,
        durationMs,
        status: response.status,
        statusText: response.statusText,
        contentType: response.headers.get("content-type")
      }
    });
    return response;
  });
});
/** Context tag under which a logging fetch function can be provided. */
const LoggingFetch = Context.GenericTag("LoggingFetch");
5083
5253
  class JsonParseError extends Data.TaggedError("JsonParseError") {
5084
5254
  get message() {
5085
5255
  const preview = this.input.length > 100 ? `${this.input.substring(0, 100)}...` : this.input;
@@ -6980,23 +7150,153 @@ const WebScrapingEngineLive = Layer.effect(
6980
7150
  WebScrapingEngine,
6981
7151
  makeWebScrapingEngine
6982
7152
  );
7153
/**
 * Effect service that tracks the last activity of each worker and reports
 * workers that have been inactive for longer than 60 seconds.
 *
 * Requires `SpiderLogger`. Worker state is held in a `Ref` of a `HashMap`
 * keyed by worker id.
 */
class WorkerHealthMonitor extends Effect.Service()(
  "@jambudipa.io/WorkerHealthMonitor",
  {
    effect: Effect.gen(function* () {
      const logger = yield* SpiderLogger;
      const workers = yield* Ref.make(HashMap.empty());
      const stuckThresholdMs = 6e4;

      // Milliseconds between a worker's last recorded activity and `now`.
      const inactivityMs = (status, now) => DateTime.toEpochMillis(now) - DateTime.toEpochMillis(status.lastActivity);

      // Single definition of the stuck-worker scan, shared by the public
      // `getStuckWorkers` and by the monitoring loop.
      const getStuckWorkers = Effect.gen(function* () {
        const now = DateTime.unsafeNow();
        const workerMap = yield* Ref.get(workers);
        const stuck = [];
        for (const [, status] of workerMap) {
          if (inactivityMs(status, now) > stuckThresholdMs) {
            stuck.push(status);
          }
        }
        return stuck;
      });

      // One monitoring pass: log an edge case for every stuck worker.
      const reportStuckWorkers = Effect.gen(function* () {
        const stuck = yield* getStuckWorkers;
        for (const worker of stuck) {
          const inactiveMs = inactivityMs(worker, DateTime.unsafeNow());
          yield* logger.logEdgeCase(
            worker.domain,
            "worker_stuck_detected",
            {
              workerId: worker.workerId,
              currentUrl: worker.currentUrl,
              lastActivity: DateTime.formatIso(worker.lastActivity),
              inactiveMs,
              fetchStartTime: Option.fromNullable(worker.fetchStartTime).pipe(
                Option.map(DateTime.formatIso),
                Option.getOrElse(() => "N/A")
              )
            }
          );
        }
      });

      return {
        /**
         * Register a worker's activity.
         *
         * Creates the worker's entry on first use, refreshes `domain` and
         * `lastActivity`, and keeps the previous `currentUrl` /
         * `fetchStartTime` unless `activity.url` / `activity.fetchStart`
         * supply new values.
         */
        recordActivity: (workerId, domain, activity) => Effect.gen(function* () {
          const now = DateTime.unsafeNow();
          yield* Ref.update(workers, (map) => {
            const current = Option.getOrElse(HashMap.get(map, workerId), () => ({
              workerId,
              domain,
              lastActivity: now
            }));
            const updated = {
              ...current,
              domain,
              lastActivity: now,
              currentUrl: activity.url ?? current.currentUrl,
              fetchStartTime: activity.fetchStart ? now : current.fetchStartTime
            };
            return HashMap.set(map, workerId, updated);
          });
        }),
        /**
         * Remove a worker from monitoring
         */
        removeWorker: (workerId) => Ref.update(workers, (map) => HashMap.remove(map, workerId)),
        /**
         * Get stuck workers: those whose last activity is more than
         * `stuckThresholdMs` in the past.
         */
        getStuckWorkers,
        /**
         * Monitor workers and log stuck ones, repeating on a fixed
         * 30 second schedule.
         */
        startMonitoring: Effect.gen(function* () {
          yield* Effect.repeat(
            reportStuckWorkers,
            Schedule.fixed(Duration.seconds(30))
          );
        })
      };
    })
  }
) {
}
7253
/**
 * Decode `data` as JSON into an unknown value.
 *
 * @param data - Input handed to the JSON schema decoder.
 * @param onError - Maps the decoder's failure into the caller's error type.
 * @returns An Effect with the parsed value, or the mapped error.
 */
const safeJsonParse = (data, onError) => {
  const decodeJson = Schema.decodeUnknown(Schema.parseJson(Schema.Unknown));
  return Effect.mapError(decodeJson(data), onError);
};
7256
/**
 * Wrap a possibly-nullish value in an `Option`, inside an Effect.
 *
 * @param value - Value to wrap; `null`/`undefined` become `None`.
 * @param logContext - When truthy and the value is nullish, a debug line
 *   mentioning this context is logged before the `None` is returned.
 * @returns An Effect that succeeds with the `Option`.
 */
const toOption = (value, logContext) => {
  const wrapped = Option.fromNullable(value);
  const shouldLog = Boolean(logContext) && Option.isNone(wrapped);
  if (!shouldLog) {
    return Effect.succeed(wrapped);
  }
  return Effect.map(
    Effect.logDebug(`[Migration] Null value encountered: ${logContext}`),
    () => wrapped
  );
};
7265
/**
 * Lift a promise-returning thunk into an Effect.
 *
 * @param promise - Function that produces the promise when the Effect runs.
 * @param onError - Converts a rejection (or thrown value) into the error.
 * @returns The Effect built by `Effect.tryPromise` from those two handlers.
 */
const fromPromise = (promise, onError) => {
  const handlers = { try: promise, catch: onError };
  return Effect.tryPromise(handlers);
};
7269
/**
 * Run the `cleanup` of every resource and collect each outcome.
 *
 * @param resources - Array of `{ id, cleanup, onError }`; `cleanup` returns a
 *   promise and `onError(id, error)` builds the failure value for that entry.
 * @returns `Effect.all` over the cleanup effects in "either" mode.
 */
const cleanupResources = (resources) => {
  const toCleanupEffect = (resource) => {
    const { id, cleanup, onError } = resource;
    return Effect.tryPromise({
      try: cleanup,
      catch: (cause) => onError(id, cause)
    });
  };
  const attempts = resources.map((resource) => toCleanupEffect(resource));
  return Effect.all(attempts, { mode: "either" });
};
7278
/**
 * Positional-argument convenience over `Option.match`.
 *
 * @param option - The Option to inspect.
 * @param onNone - Called when the Option is empty.
 * @param onSome - Called with the contained value otherwise.
 * @returns Whatever the selected handler returns.
 */
const matchOption = (option, onNone, onSome) => {
  const handlers = { onNone, onSome };
  return Option.match(option, handlers);
};
6983
7279
  export {
7280
+ APITokenNotFoundError,
6984
7281
  AdapterNotInitialisedError,
6985
7282
  BrowserCleanupError,
6986
7283
  BrowserEngineLive,
6987
7284
  BrowserEngineService,
6988
7285
  BrowserEngineWithConfig,
6989
7286
  BrowserError,
7287
+ CSRFTokenNotFoundError,
6990
7288
  ConfigError,
6991
7289
  ConfigurationError,
6992
7290
  ContentTypeError,
6993
7291
  CookieManager,
6994
7292
  CookieManagerLive,
6995
7293
  CrawlError,
7294
+ DEFAULT_DEDUPLICATION_STRATEGY,
6996
7295
  DEFAULT_HYBRID_CONFIG,
6997
7296
  DeltaPersistence,
6998
7297
  EnhancedHttpClient,
6999
7298
  EnhancedHttpClientLive,
7299
+ FetchError,
7000
7300
  FileStorageBackend,
7001
7301
  FileSystemError,
7002
7302
  FullStatePersistence,
@@ -7004,7 +7304,9 @@ export {
7004
7304
  LinkExtractionError,
7005
7305
  LinkExtractorService,
7006
7306
  LinkExtractorServiceLayer,
7307
+ LoggingFetch,
7007
7308
  LoggingMiddleware,
7309
+ LoginError,
7008
7310
  MiddlewareError,
7009
7311
  MiddlewareManager,
7010
7312
  NetworkError,
@@ -7012,6 +7314,7 @@ export {
7012
7314
  PageError,
7013
7315
  ParseError,
7014
7316
  PersistenceError$1 as PersistenceError,
7317
+ PostgresStorageBackend,
7015
7318
  PriorityRequest,
7016
7319
  QueueError,
7017
7320
  RateLimitMiddleware,
@@ -7022,8 +7325,11 @@ export {
7022
7325
  ResumabilityService,
7023
7326
  RobotsService,
7024
7327
  RobotsTxtError,
7328
+ SPIDER_DEFAULTS,
7025
7329
  ScraperService,
7026
7330
  SessionError$1 as SessionError,
7331
+ SessionLoadError,
7332
+ SessionNotValidError,
7027
7333
  SessionStore,
7028
7334
  SessionStoreLive,
7029
7335
  SpiderConfig,
@@ -7038,27 +7344,41 @@ export {
7038
7344
  StateManager,
7039
7345
  StateManagerLive,
7040
7346
  StatsMiddleware,
7347
+ StorageKeyNotFoundError,
7041
7348
  TimeoutError,
7349
+ TokenExpiredError,
7042
7350
  TokenExtractor,
7043
7351
  TokenExtractorLive,
7352
+ TokenNotFoundError,
7044
7353
  TokenType,
7045
7354
  UrlDeduplicatorService,
7046
7355
  UserAgentMiddleware,
7047
7356
  ValidationError,
7048
7357
  WebScrapingEngine,
7049
7358
  WebScrapingEngineLive,
7359
+ WorkerHealthMonitor,
7360
+ cleanupResources,
7050
7361
  createStateOperation,
7362
+ createUrlDeduplicator,
7363
+ deduplicateUrls,
7364
+ fromPromise,
7051
7365
  isBrowserError,
7052
7366
  isNetworkError,
7053
7367
  isSpiderError,
7054
7368
  makeCookieManager,
7055
7369
  makeEnhancedHttpClient,
7370
+ makeLoggingFetch,
7056
7371
  makeSessionStore,
7057
7372
  makeSpiderConfig,
7058
7373
  makeSpiderLogger,
7059
7374
  makeStateManager,
7060
7375
  makeTokenExtractor,
7061
7376
  makeWebScrapingEngine,
7377
+ matchOption,
7378
+ normalizeUrl,
7379
+ parseUrl,
7380
+ safeJsonParse,
7381
+ toOption,
7062
7382
  withBrowser
7063
7383
  };
7064
7384
  //# sourceMappingURL=index.js.map