@jambudipa/spider 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +10 -16
  2. package/dist/index.d.ts +33 -0
  3. package/dist/index.d.ts.map +1 -0
  4. package/dist/index.js +3091 -1657
  5. package/dist/index.js.map +1 -1
  6. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +107 -0
  7. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
  8. package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
  9. package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
  10. package/dist/lib/HttpClient/CookieManager.d.ts +58 -0
  11. package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
  12. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +63 -0
  13. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
  14. package/dist/lib/HttpClient/SessionStore.d.ts +114 -0
  15. package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
  16. package/dist/lib/HttpClient/TokenExtractor.d.ts +83 -0
  17. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
  18. package/dist/lib/HttpClient/index.d.ts +8 -0
  19. package/dist/lib/HttpClient/index.d.ts.map +1 -0
  20. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
  21. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
  22. package/dist/lib/LinkExtractor/index.d.ts +37 -0
  23. package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
  24. package/dist/lib/Logging/FetchLogger.d.ts +24 -0
  25. package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
  26. package/dist/lib/Logging/SpiderLogger.service.d.ts +37 -0
  27. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
  28. package/dist/lib/Middleware/SpiderMiddleware.d.ts +239 -0
  29. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
  30. package/dist/lib/Middleware/types.d.ts +99 -0
  31. package/dist/lib/Middleware/types.d.ts.map +1 -0
  32. package/dist/lib/PageData/PageData.d.ts +28 -0
  33. package/dist/lib/PageData/PageData.d.ts.map +1 -0
  34. package/dist/lib/Resumability/Resumability.service.d.ts +178 -0
  35. package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
  36. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
  37. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
  38. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
  39. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
  40. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
  41. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
  42. package/dist/lib/Resumability/index.d.ts +51 -0
  43. package/dist/lib/Resumability/index.d.ts.map +1 -0
  44. package/dist/lib/Resumability/strategies.d.ts +76 -0
  45. package/dist/lib/Resumability/strategies.d.ts.map +1 -0
  46. package/dist/lib/Resumability/types.d.ts +201 -0
  47. package/dist/lib/Resumability/types.d.ts.map +1 -0
  48. package/dist/lib/Robots/Robots.service.d.ts +78 -0
  49. package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
  50. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
  51. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
  52. package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
  53. package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
  54. package/dist/lib/Spider/Spider.defaults.d.ts +24 -0
  55. package/dist/lib/Spider/Spider.defaults.d.ts.map +1 -0
  56. package/dist/lib/Spider/Spider.service.d.ts +239 -0
  57. package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
  58. package/dist/lib/StateManager/StateManager.service.d.ts +107 -0
  59. package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
  60. package/dist/lib/StateManager/index.d.ts +5 -0
  61. package/dist/lib/StateManager/index.d.ts.map +1 -0
  62. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
  63. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
  64. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +109 -0
  65. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
  66. package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
  67. package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
  68. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
  69. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
  70. package/dist/lib/api-facades.d.ts +313 -0
  71. package/dist/lib/api-facades.d.ts.map +1 -0
  72. package/dist/lib/errors/effect-errors.d.ts +312 -0
  73. package/dist/lib/errors/effect-errors.d.ts.map +1 -0
  74. package/dist/lib/utils/FileUtils.d.ts +284 -0
  75. package/dist/lib/utils/FileUtils.d.ts.map +1 -0
  76. package/dist/lib/utils/JsonUtils.d.ts +196 -0
  77. package/dist/lib/utils/JsonUtils.d.ts.map +1 -0
  78. package/dist/lib/utils/RegexUtils.d.ts +257 -0
  79. package/dist/lib/utils/RegexUtils.d.ts.map +1 -0
  80. package/dist/lib/utils/SchemaUtils.d.ts +251 -0
  81. package/dist/lib/utils/SchemaUtils.d.ts.map +1 -0
  82. package/dist/lib/utils/UrlUtils.d.ts +223 -0
  83. package/dist/lib/utils/UrlUtils.d.ts.map +1 -0
  84. package/dist/lib/utils/effect-migration.d.ts +31 -0
  85. package/dist/lib/utils/effect-migration.d.ts.map +1 -0
  86. package/dist/lib/utils/index.d.ts +15 -0
  87. package/dist/lib/utils/index.d.ts.map +1 -0
  88. package/dist/lib/utils/url-deduplication.d.ts +108 -0
  89. package/dist/lib/utils/url-deduplication.d.ts.map +1 -0
  90. package/package.json +22 -13
@@ -0,0 +1,109 @@
1
+ /**
2
+ * Web Scraping Engine Service
3
+ * Orchestrates all scraping capabilities including authentication, token management, and session handling
4
+ */
5
+ import { Context, DateTime, Effect, HashMap, Layer } from 'effect';
6
+ import { EnhancedHttpClient, type HttpResponse } from '../HttpClient/EnhancedHttpClient.js';
7
+ import { CookieManager } from '../HttpClient/CookieManager.js';
8
+ import { SessionStore, SessionError } from '../HttpClient/SessionStore.js';
9
+ import { TokenExtractor } from '../HttpClient/TokenExtractor.js';
10
+ import { StateManager, TokenType } from '../StateManager/StateManager.service.js';
11
+ import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
12
+ import { NetworkError, ParseError, TimeoutError } from '../errors/effect-errors.js';
13
+ import { JsonStringifyError } from '../utils/JsonUtils.js';
14
+ declare const LoginError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
15
+ readonly _tag: "LoginError";
16
+ } & Readonly<A>;
17
+ export declare class LoginError extends LoginError_base<{
18
+ readonly status: number;
19
+ readonly message: string;
20
+ }> {
21
+ }
22
+ declare const SessionNotValidError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
23
+ readonly _tag: "SessionNotValidError";
24
+ } & Readonly<A>;
25
+ export declare class SessionNotValidError extends SessionNotValidError_base<{
26
+ readonly message: string;
27
+ }> {
28
+ }
29
+ declare const SessionLoadError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
30
+ readonly _tag: "SessionLoadError";
31
+ } & Readonly<A>;
32
+ export declare class SessionLoadError extends SessionLoadError_base<{
33
+ readonly message: string;
34
+ }> {
35
+ }
36
+ export type WebScrapingEngineError = LoginError | SessionNotValidError | SessionLoadError;
37
+ /**
38
+ * Combined error types for HTTP operations
39
+ */
40
+ export type HttpOperationError = NetworkError | ParseError | TimeoutError;
41
+ /**
42
+ * Combined error types for POST operations
43
+ */
44
+ export type HttpPostOperationError = HttpOperationError | JsonStringifyError;
45
+ export interface LoginCredentials {
46
+ username: string;
47
+ password: string;
48
+ loginUrl: string;
49
+ usernameField?: string;
50
+ passwordField?: string;
51
+ additionalFields?: Record<string, string>;
52
+ }
53
+ export interface ScrapingSession {
54
+ id: string;
55
+ authenticated: boolean;
56
+ tokens: HashMap.HashMap<TokenType, string>;
57
+ startTime: DateTime.Utc;
58
+ }
59
+ export interface WebScrapingEngineService {
60
+ /**
61
+ * Perform login with form submission
62
+ */
63
+ login: (_credentials: LoginCredentials) => Effect.Effect<ScrapingSession, HttpOperationError | SessionError | LoginError>;
64
+ /**
65
+ * Fetch authenticated content
66
+ */
67
+ fetchAuthenticated: (_url: string) => Effect.Effect<HttpResponse, HttpOperationError | SessionNotValidError>;
68
+ /**
69
+ * Submit form with CSRF protection
70
+ */
71
+ submitFormWithCSRF: (_url: string, _formData: Record<string, string>, _csrfUrl?: string) => Effect.Effect<HttpResponse, HttpOperationError>;
72
+ /**
73
+ * Make API request with token
74
+ */
75
+ makeAPIRequest: (_url: string, _method?: 'GET' | 'POST' | 'PUT' | 'DELETE', _data?: Record<string, unknown>) => Effect.Effect<HttpResponse, HttpPostOperationError>;
76
+ /**
77
+ * Create and save a scraping session
78
+ */
79
+ createSession: (_id?: string) => Effect.Effect<ScrapingSession>;
80
+ /**
81
+ * Load existing session
82
+ */
83
+ loadSession: (_id: string) => Effect.Effect<ScrapingSession, SessionError | SessionLoadError>;
84
+ /**
85
+ * Export session for persistence
86
+ */
87
+ exportSession: () => Effect.Effect<string, SessionError>;
88
+ /**
89
+ * Import session from persistence
90
+ */
91
+ importSession: (_data: string) => Effect.Effect<void, SessionError>;
92
+ /**
93
+ * Clear all state and sessions
94
+ */
95
+ clearAll: () => Effect.Effect<void>;
96
+ }
97
+ declare const WebScrapingEngine_base: Context.TagClass<WebScrapingEngine, "WebScrapingEngine", WebScrapingEngineService>;
98
+ export declare class WebScrapingEngine extends WebScrapingEngine_base {
99
+ }
100
+ /**
101
+ * Create a WebScrapingEngine service implementation
102
+ */
103
+ export declare const makeWebScrapingEngine: Effect.Effect<WebScrapingEngineService, never, SpiderLogger | CookieManager | EnhancedHttpClient | StateManager | SessionStore | TokenExtractor>;
104
+ /**
105
+ * WebScrapingEngine Layer; requires SpiderLogger, CookieManager, EnhancedHttpClient, StateManager, SessionStore and TokenExtractor to be provided
106
+ */
107
+ export declare const WebScrapingEngineLive: Layer.Layer<WebScrapingEngine, never, SpiderLogger | CookieManager | EnhancedHttpClient | StateManager | SessionStore | TokenExtractor>;
108
+ export {};
109
+ //# sourceMappingURL=WebScrapingEngine.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"WebScrapingEngine.service.d.ts","sourceRoot":"","sources":["../../../src/lib/WebScrapingEngine/WebScrapingEngine.service.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAQ,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAU,MAAM,QAAQ,CAAC;AACjF,OAAO,EACL,kBAAkB,EAClB,KAAK,YAAY,EAClB,MAAM,qCAAqC,CAAC;AAC7C,OAAO,EAAE,aAAa,EAAE,MAAM,gCAAgC,CAAC;AAC/D,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,+BAA+B,CAAC;AAC3E,OAAO,EAAE,cAAc,EAAE,MAAM,iCAAiC,CAAC;AACjE,OAAO,EACL,YAAY,EACZ,SAAS,EACV,MAAM,yCAAyC,CAAC;AACjD,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;AAClE,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AACpF,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAC;;;;AAM3D,qBAAa,UAAW,SAAQ,gBAA+B;IAC7D,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;;;;AAEL,qBAAa,oBAAqB,SAAQ,0BAAyC;IACjF,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;;;;AAEL,qBAAa,gBAAiB,SAAQ,sBAAqC;IACzE,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;AAEL,MAAM,MAAM,sBAAsB,GAAG,UAAU,GAAG,oBAAoB,GAAG,gBAAgB,CAAC;AAE1F;;GAEG;AACH,MAAM,MAAM,kBAAkB,GAAG,YAAY,GAAG,UAAU,GAAG,YAAY,CAAC;AAE1E;;GAEG;AACH,MAAM,MAAM,sBAAsB,GAAG,kBAAkB,GAAG,kBAAkB,CAAC;AAE7E,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC3C;AAED,MAAM,WAAW,eAAe;IAC9B,EAAE,EAAE,MAAM,CAAC;IACX,aAAa,EAAE,OAAO,CAAC;IACvB,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAC3C,SAAS,EAAE,QAAQ,CAAC,GAAG,CAAC;CACzB;AAED,MAAM,WAAW,wBAAwB;IACvC;;OAEG;IACH,KAAK,EAAE,CACL,YAAY,EAAE,gBAAgB,KAC3B,MAAM,CAAC,MAAM,CAAC,eAAe,EAAE,kBAAkB,GAAG,YAAY,GAAG,UAAU,CAAC,CAAC;IAEpF;;OAEG;IACH,kBAAkB,EAAE,CAClB,IAAI,EAAE,MAAM,KACT,MAAM,CAAC,MAAM,CAAC,YAAY,EAAE,kBAAkB,GAAG,oBAAoB,CAAC,CAAC;IAE5E;;OAEG;IACH,kBAAkB,EAAE,CAClB,IAAI,EAAE,MAAM,EACZ,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EACjC,QAAQ,CAAC,EAAE,MAAM,KACd,MAAM,CAAC,MAAM,CAAC,YAAY,EAAE,kBAAkB,CAAC,CAAC;IAErD;;OAEG;IACH
,cAAc,EAAE,CACd,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,KAAK,GAAG,MAAM,GAAG,KAAK,GAAG,QAAQ,EAC3C,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC5B,MAAM,CAAC,MAAM,CAAC,YAAY,EAAE,sBAAsB,CAAC,CAAC;IAEzD;;OAEG;IACH,aAAa,EAAE,CAAC,GAAG,CAAC,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;IAEhE;;OAEG;IACH,WAAW,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,eAAe,EAAE,YAAY,GAAG,gBAAgB,CAAC,CAAC;IAE9F;;OAEG;IACH,aAAa,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;IAEzD;;OAEG;IACH,aAAa,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;IAEpE;;OAEG;IACH,QAAQ,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CACrC;;AAED,qBAAa,iBAAkB,SAAQ,sBAGpC;CAAG;AAEN;;GAEG;AACH,eAAO,MAAM,qBAAqB,kJA+RhC,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,qBAAqB,yIAGjC,CAAC"}
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Web Scraping Engine module exports
3
+ */
4
+ export * from './WebScrapingEngine.service.js';
5
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/WebScrapingEngine/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,gCAAgC,CAAC"}
@@ -0,0 +1,39 @@
1
+ import { DateTime, Effect } from 'effect';
2
+ import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
3
+ interface WorkerStatus {
4
+ workerId: string;
5
+ domain: string;
6
+ currentUrl?: string;
7
+ lastActivity: DateTime.Utc;
8
+ fetchStartTime?: DateTime.Utc;
9
+ }
10
+ declare const WorkerHealthMonitor_base: Effect.Service.Class<WorkerHealthMonitor, "@jambudipa.io/WorkerHealthMonitor", {
11
+ readonly effect: Effect.Effect<{
12
+ /**
13
+ * Register a worker's activity
14
+ */
15
+ recordActivity: (workerId: string, domain: string, activity: {
16
+ url?: string;
17
+ fetchStart?: boolean;
18
+ }) => Effect.Effect<void, never, never>;
19
+ /**
20
+ * Remove a worker from monitoring
21
+ */
22
+ removeWorker: (workerId: string) => Effect.Effect<void, never, never>;
23
+ /**
24
+ * Get stuck workers
25
+ */
26
+ getStuckWorkers: Effect.Effect<WorkerStatus[], never, never>;
27
+ /**
28
+ * Monitor workers and log stuck ones
29
+ */
30
+ startMonitoring: Effect.Effect<void, never, never>;
31
+ }, never, SpiderLogger>;
32
+ }>;
33
+ /**
34
+ * Monitors worker health and detects stuck workers (startMonitoring is documented as logging them; no kill operation is declared on this service)
35
+ */
36
+ export declare class WorkerHealthMonitor extends WorkerHealthMonitor_base {
37
+ }
38
+ export {};
39
+ //# sourceMappingURL=WorkerHealthMonitor.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"WorkerHealthMonitor.service.d.ts","sourceRoot":"","sources":["../../../src/lib/WorkerHealth/WorkerHealthMonitor.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAY,MAAM,EAAkC,MAAM,QAAQ,CAAC;AACpF,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;AAElE,UAAU,YAAY;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,QAAQ,CAAC,GAAG,CAAC;IAC3B,cAAc,CAAC,EAAE,QAAQ,CAAC,GAAG,CAAC;CAC/B;;;QAcO;;WAEG;mCAES,MAAM,UACR,MAAM,YACJ;YAAE,GAAG,CAAC,EAAE,MAAM,CAAC;YAAC,UAAU,CAAC,EAAE,OAAO,CAAA;SAAE;QA4BlD;;WAEG;iCACsB,MAAM;QAG/B;;WAEG;;QAgBH;;WAEG;;;;AAxEX;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,wBAwHxC;CAAG"}
@@ -0,0 +1,313 @@
1
+ /**
2
+ * Clean API facades that hide Effect.Service implementation details.
3
+ *
4
+ * These interfaces provide clean documentation without exposing
5
+ * internal Effect service machinery.
6
+ *
7
+ * @group Services
8
+ */
9
+ import { Effect, Sink } from 'effect';
10
+ import { CrawlResult, CrawlTask } from './Spider/Spider.service.js';
11
+ import { PriorityRequest, SpiderState, SpiderStateKey, StatePersistence } from './Scheduler/SpiderScheduler.service.js';
12
+ import { SpiderMiddleware, SpiderRequest, SpiderResponse } from './Middleware/SpiderMiddleware.js';
13
+ import { MiddlewareError } from './errors/effect-errors.js';
14
+ /**
15
+ * The main Spider service interface for web crawling.
16
+ *
17
+ * Orchestrates the entire crawling process including URL validation,
18
+ * robots.txt checking, concurrent processing, and result streaming.
19
+ *
20
+ * @example
21
+ * ```typescript
22
+ * const program = Effect.gen(function* () {
23
+ * const spider = yield* Spider;
24
+ * const collectSink = Sink.forEach<CrawlResult>(result =>
25
+ * Effect.sync(() => console.log(result.pageData.url))
26
+ * );
27
+ *
28
+ * const stats = yield* spider.crawl('https://example.com', collectSink);
29
+ * console.log(`Crawled ${stats.totalPages} pages`);
30
+ * });
31
+ * ```
32
+ *
33
+ * @group Services
34
+ * @public
35
+ */
36
+ export interface ISpider {
37
+ /**
38
+ * Starts crawling from the specified URL and processes results through the provided sink.
39
+ *
40
+ * @param _urlString - The starting URL for crawling
41
+ * @param _sink - Sink to process crawl results as they're produced
42
+ * @returns Effect containing crawl statistics (total pages, completion status)
43
+ */
44
+ crawl<A, E, R>(_urlString: string, _sink: Sink.Sink<A, CrawlResult, E, R>): Effect.Effect<{
45
+ totalPages: number;
46
+ completed: boolean;
47
+ }, Error>;
48
+ /**
49
+ * Returns the list of URLs that have been visited during crawling.
50
+ *
51
+ * @returns Effect containing array of visited URLs
52
+ */
53
+ getVisitedUrls(): Effect.Effect<string[]>;
54
+ }
55
+ /**
56
+ * The SpiderSchedulerService interface for request scheduling and persistence.
57
+ *
58
+ * Manages request queuing, prioritization, and state persistence for
59
+ * resumable crawling operations.
60
+ *
61
+ * @example
62
+ * ```typescript
63
+ * const program = Effect.gen(function* () {
64
+ * const scheduler = yield* SpiderSchedulerService;
65
+ *
66
+ * // Configure persistence
67
+ * const stateKey = new SpiderStateKey({
68
+ * id: 'my-crawl',
69
+ * timestamp: new Date(),
70
+ * name: 'Example Crawl'
71
+ * });
72
+ *
73
+ * yield* scheduler.configurePersistence(persistence, stateKey);
74
+ *
75
+ * // Queue requests with priority
76
+ * yield* scheduler.enqueue({ url: 'https://example.com', depth: 0 }, 10);
77
+ *
78
+ * // Process requests
79
+ * const request = yield* scheduler.dequeue();
80
+ * console.log(`Processing: ${request.request.url}`);
81
+ * });
82
+ * ```
83
+ *
84
+ * @group Services
85
+ * @public
86
+ */
87
+ export interface ISpiderScheduler {
88
+ /**
89
+ * Configures the scheduler to use a specific persistence layer with a state key.
90
+ *
91
+ * @param _persistence - Implementation of StatePersistence interface
92
+ * @param _stateKey - Unique identifier for the crawl session
93
+ */
94
+ configurePersistence(_persistence: StatePersistence, _stateKey: SpiderStateKey): Effect.Effect<void>;
95
+ /**
96
+ * Removes persistence configuration, disabling state saving.
97
+ */
98
+ clearPersistence(): Effect.Effect<void>;
99
+ /**
100
+ * Adds a crawl task to the processing queue with optional priority.
101
+ *
102
+ * @param _request - Crawl task containing URL and depth
103
+ * @param _priority - Optional priority (higher numbers = higher priority, default: 0)
104
+ * @returns Effect containing boolean indicating if task was added (false if duplicate)
105
+ */
106
+ enqueue(_request: CrawlTask, _priority?: number): Effect.Effect<boolean>;
107
+ /**
108
+ * Retrieves the next highest-priority task from the queue.
109
+ *
110
+ * @returns Effect containing the next priority request
111
+ */
112
+ dequeue(): Effect.Effect<PriorityRequest>;
113
+ /**
114
+ * Returns the current number of tasks in the queue.
115
+ */
116
+ size(): Effect.Effect<number>;
117
+ /**
118
+ * Checks if the queue is empty.
119
+ */
120
+ isEmpty(): Effect.Effect<boolean>;
121
+ /**
122
+ * Returns the current scheduler state for persistence.
123
+ */
124
+ getState(): Effect.Effect<SpiderState>;
125
+ /**
126
+ * Restores the scheduler from a previously saved state.
127
+ *
128
+ * @param _state - Complete state to restore from
129
+ */
130
+ restoreFromState(_state: SpiderState): Effect.Effect<void>;
131
+ /**
132
+ * Attempts to restore state from a persistence layer.
133
+ *
134
+ * @param _persistence - Persistence layer to load from
135
+ * @param _stateKey - State key to restore
136
+ * @returns Effect containing boolean indicating if state was successfully restored
137
+ */
138
+ restore(_persistence: StatePersistence, _stateKey: SpiderStateKey): Effect.Effect<boolean>;
139
+ }
140
+ /**
141
+ * The MiddlewareManager service interface for pipeline processing.
142
+ *
143
+ * Orchestrates the execution of middleware in the correct order for
144
+ * request processing, response handling, and error recovery.
145
+ *
146
+ * @example
147
+ * ```typescript
148
+ * const program = Effect.gen(function* () {
149
+ * const manager = yield* MiddlewareManager;
150
+ *
151
+ * const middleware = [
152
+ * rateLimitMiddleware,
153
+ * loggingMiddleware,
154
+ * userAgentMiddleware
155
+ * ];
156
+ *
157
+ * const processedRequest = yield* manager.processRequest(request, middleware);
158
+ * console.log('Request processed through middleware pipeline');
159
+ * });
160
+ * ```
161
+ *
162
+ * @group Services
163
+ * @public
164
+ */
165
+ export interface IMiddlewareManager {
166
+ /**
167
+ * Processes a request through the middleware pipeline.
168
+ *
169
+ * @param _request - The initial request to process
170
+ * @param _middlewares - Array of middleware to apply
171
+ * @returns Effect containing the processed request
172
+ */
173
+ processRequest(_request: SpiderRequest, _middlewares: SpiderMiddleware[]): Effect.Effect<SpiderRequest, MiddlewareError>;
174
+ /**
175
+ * Processes a response through the middleware pipeline in reverse order.
176
+ *
177
+ * @param _response - The response to process
178
+ * @param _request - The original request (for context)
179
+ * @param _middlewares - Array of middleware to apply
180
+ * @returns Effect containing the processed response
181
+ */
182
+ processResponse(_response: SpiderResponse, _request: SpiderRequest, _middlewares: SpiderMiddleware[]): Effect.Effect<SpiderResponse, MiddlewareError>;
183
+ /**
184
+ * Processes an exception through the middleware pipeline in reverse order.
185
+ *
186
+ * @param _error - The error that occurred
187
+ * @param _request - The request that caused the error
188
+ * @param _middlewares - Array of middleware to apply
189
+ * @returns Effect containing a recovered response or null
190
+ */
191
+ processException(_error: Error, _request: SpiderRequest, _middlewares: SpiderMiddleware[]): Effect.Effect<SpiderResponse | null, MiddlewareError>;
192
+ }
193
+ /**
194
+ * Rate limiting middleware service interface.
195
+ *
196
+ * Provides rate limiting functionality for respectful crawling,
197
+ * controlling request frequency at both global and per-domain levels.
198
+ *
199
+ * @group Middleware
200
+ * @public
201
+ */
202
+ export interface IRateLimitMiddleware {
203
+ /**
204
+ * Creates a rate limiting middleware with the specified configuration.
205
+ *
206
+ * @param _config - Rate limiting configuration options
207
+ * @returns Configured middleware instance
208
+ *
209
+ * @example
210
+ * ```typescript
211
+ * const rateLimiter = yield* RateLimitMiddleware;
212
+ * const middleware = rateLimiter.create({
213
+ * maxConcurrentRequests: 5,
214
+ * maxRequestsPerSecondPerDomain: 2,
215
+ * requestDelayMs: 250
216
+ * });
217
+ * ```
218
+ */
219
+ create(_config: {
220
+ maxConcurrentRequests: number;
221
+ maxRequestsPerSecondPerDomain: number;
222
+ requestDelayMs?: number;
223
+ }): SpiderMiddleware;
224
+ }
225
+ /**
226
+ * Logging middleware service interface.
227
+ *
228
+ * Provides logging functionality using Effect.Logger for debugging
229
+ * and monitoring crawling operations.
230
+ *
231
+ * @group Middleware
232
+ * @public
233
+ */
234
+ export interface ILoggingMiddleware {
235
+ /**
236
+ * Creates a logging middleware with optional configuration.
237
+ *
238
+ * @param _config - Optional logging configuration
239
+ * @returns Configured middleware instance
240
+ *
241
+ * @example
242
+ * ```typescript
243
+ * const logger = yield* LoggingMiddleware;
244
+ * const middleware = logger.create({
245
+ * logRequests: true,
246
+ * logResponses: true,
247
+ * logLevel: 'info'
248
+ * });
249
+ * ```
250
+ */
251
+ create(_config?: {
252
+ logRequests?: boolean;
253
+ logResponses?: boolean;
254
+ logErrors?: boolean;
255
+ logLevel?: 'debug' | 'info' | 'warn' | 'error';
256
+ }): SpiderMiddleware;
257
+ }
258
+ /**
259
+ * User agent middleware service interface.
260
+ *
261
+ * Adds consistent User-Agent headers to all requests for
262
+ * proper identification of your crawler.
263
+ *
264
+ * @group Middleware
265
+ * @public
266
+ */
267
+ export interface IUserAgentMiddleware {
268
+ /**
269
+ * Creates a User-Agent middleware with the specified user agent string.
270
+ *
271
+ * @param _userAgent - User agent string to add to requests
272
+ * @returns Configured middleware instance
273
+ *
274
+ * @example
275
+ * ```typescript
276
+ * const userAgent = yield* UserAgentMiddleware;
277
+ * const middleware = userAgent.create('MyBot/1.0 (+https://example.com)');
278
+ * ```
279
+ */
280
+ create(_userAgent: string): SpiderMiddleware;
281
+ }
282
+ /**
283
+ * Statistics middleware service interface.
284
+ *
285
+ * Collects comprehensive metrics about crawling activity including
286
+ * request counts, response codes, and performance statistics.
287
+ *
288
+ * @group Middleware
289
+ * @public
290
+ */
291
+ export interface IStatsMiddleware {
292
+ /**
293
+ * Creates a statistics middleware and returns both the middleware and a stats getter.
294
+ *
295
+ * @returns Object containing the middleware instance and statistics retrieval function
296
+ *
297
+ * @example
298
+ * ```typescript
299
+ * const statsService = yield* StatsMiddleware;
300
+ * const { middleware, getStats } = statsService.create();
301
+ *
302
+ * // Use middleware in your pipeline
303
+ * // Later get statistics
304
+ * const stats = yield* getStats();
305
+ * console.log(`Processed ${stats.requests_processed} requests`);
306
+ * ```
307
+ */
308
+ create(): {
309
+ middleware: SpiderMiddleware;
310
+ getStats: () => Effect.Effect<Record<string, number>>;
311
+ };
312
+ }
313
+ //# sourceMappingURL=api-facades.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"api-facades.d.ts","sourceRoot":"","sources":["../../src/lib/api-facades.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,4BAA4B,CAAC;AACpE,OAAO,EACL,eAAe,EACf,WAAW,EACX,cAAc,EACd,gBAAgB,EACjB,MAAM,wCAAwC,CAAC;AAChD,OAAO,EACL,gBAAgB,EAChB,aAAa,EACb,cAAc,EACf,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,eAAe,EAAE,MAAM,2BAA2B,CAAC;AAE5D;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,MAAM,WAAW,OAAO;IACtB;;;;;;OAMG;IACH,KAAK,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EACX,UAAU,EAAE,MAAM,EAClB,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,GACrC,MAAM,CAAC,MAAM,CAAC;QAAE,UAAU,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,OAAO,CAAA;KAAE,EAAE,KAAK,CAAC,CAAC;IAEpE;;;;OAIG;IACH,cAAc,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;CAC3C;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;OAKG;IACH,oBAAoB,CAClB,YAAY,EAAE,gBAAgB,EAC9B,SAAS,EAAE,cAAc,GACxB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAEvB;;OAEG;IACH,gBAAgB,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAExC;;;;;;OAMG;IACH,OAAO,CAAC,QAAQ,EAAE,SAAS,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEzE;;;;OAIG;IACH,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;IAE1C;;OAEG;IACH,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAE9B;;OAEG;IACH,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAElC;;OAEG;IACH,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;IAEvC;;;;OAIG;IACH,gBAAgB,CAAC,MAAM,EAAE,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE3D;;;;;;OAMG;IACH,OAAO,CACL,YAAY,EAAE,gBAAgB,EAC9B,SAAS,EAAE,cAAc,GACxB,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;CAC3B;AAED;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,WAAW,kBAAkB;IACjC;;;;;;OAMG;IACH,cAAc,CACZ,QAAQ,EAAE,aAAa,EACvB,YAAY,EAAE,gBAAgB,EAAE,GAC/B,MAAM,CAAC,MAAM,CAAC,aAAa,EAAE,eAAe,CAAC,CAAC;IAEjD;;;;;;;OAOG;IACH,eAAe,CACb,SAAS,EAAE,cAAc,EACzB,QAAQ,EAAE,aAAa,EACvB,YAAY,EAAE,gBAAgB,EAAE,GAC/B,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,eAAe,CAAC,CAAC;IAElD;;;;;;;OAOG;IACH,gBAAgB,CACd,MAAM,EAAE,KAAK,EACb,QAAQ,EAAE
,aAAa,EACvB,YAAY,EAAE,gBAAgB,EAAE,GAC/B,MAAM,CAAC,MAAM,CAAC,cAAc,GAAG,IAAI,EAAE,eAAe,CAAC,CAAC;CAC1D;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;;;;;;;;;;;;;OAeG;IACH,MAAM,CAAC,OAAO,EAAE;QACd,qBAAqB,EAAE,MAAM,CAAC;QAC9B,6BAA6B,EAAE,MAAM,CAAC;QACtC,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,GAAG,gBAAgB,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,kBAAkB;IACjC;;;;;;;;;;;;;;;OAeG;IACH,MAAM,CAAC,OAAO,CAAC,EAAE;QACf,WAAW,CAAC,EAAE,OAAO,CAAC;QACtB,YAAY,CAAC,EAAE,OAAO,CAAC;QACvB,SAAS,CAAC,EAAE,OAAO,CAAC;QACpB,QAAQ,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;KAChD,GAAG,gBAAgB,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;;;;;;;;;OAWG;IACH,MAAM,CAAC,UAAU,EAAE,MAAM,GAAG,gBAAgB,CAAC;CAC9C;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;;;;;;;;;;OAeG;IACH,MAAM,IAAI;QACR,UAAU,EAAE,gBAAgB,CAAC;QAC7B,QAAQ,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;KACvD,CAAC;CACH"}