@jambudipa/spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +21 -0
  2. package/README.md +426 -0
  3. package/dist/index.d.ts +33 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +4681 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
  8. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
  9. package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
  10. package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
  11. package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
  12. package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
  13. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
  14. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
  15. package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
  16. package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
  17. package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
  18. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
  19. package/dist/lib/HttpClient/index.d.ts +8 -0
  20. package/dist/lib/HttpClient/index.d.ts.map +1 -0
  21. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
  22. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
  23. package/dist/lib/LinkExtractor/index.d.ts +37 -0
  24. package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
  25. package/dist/lib/Logging/FetchLogger.d.ts +8 -0
  26. package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
  27. package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
  28. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
  29. package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
  30. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
  31. package/dist/lib/PageData/PageData.d.ts +28 -0
  32. package/dist/lib/PageData/PageData.d.ts.map +1 -0
  33. package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
  34. package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
  35. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
  36. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
  37. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
  38. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
  39. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
  40. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
  41. package/dist/lib/Resumability/index.d.ts +51 -0
  42. package/dist/lib/Resumability/index.d.ts.map +1 -0
  43. package/dist/lib/Resumability/strategies.d.ts +76 -0
  44. package/dist/lib/Resumability/strategies.d.ts.map +1 -0
  45. package/dist/lib/Resumability/types.d.ts +201 -0
  46. package/dist/lib/Resumability/types.d.ts.map +1 -0
  47. package/dist/lib/Robots/Robots.service.d.ts +78 -0
  48. package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
  49. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
  50. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
  51. package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
  52. package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
  53. package/dist/lib/Spider/Spider.service.d.ts +194 -0
  54. package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
  55. package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
  56. package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
  57. package/dist/lib/StateManager/index.d.ts +5 -0
  58. package/dist/lib/StateManager/index.d.ts.map +1 -0
  59. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
  60. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
  61. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
  62. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
  63. package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
  64. package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
  65. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
  66. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
  67. package/dist/lib/api-facades.d.ts +313 -0
  68. package/dist/lib/api-facades.d.ts.map +1 -0
  69. package/dist/lib/errors.d.ts +99 -0
  70. package/dist/lib/errors.d.ts.map +1 -0
  71. package/package.json +108 -0
@@ -0,0 +1,313 @@
1
+ /**
2
+ * Clean API facades that hide Effect.Service implementation details.
3
+ *
4
+ * These interfaces provide clean documentation without exposing
5
+ * internal Effect.js service machinery.
6
+ *
7
+ * @group Services
8
+ */
9
+ import { Effect, Sink } from 'effect';
10
+ import { CrawlResult, CrawlTask } from './Spider/Spider.service.js';
11
+ import { PriorityRequest, SpiderState, SpiderStateKey, StatePersistence } from './Scheduler/SpiderScheduler.service.js';
12
+ import { SpiderMiddleware, SpiderRequest, SpiderResponse } from './Middleware/SpiderMiddleware.js';
13
+ import { MiddlewareError } from './errors.js';
14
+ /**
15
+ * The main Spider service interface for web crawling.
16
+ *
17
+ * Orchestrates the entire crawling process including URL validation,
18
+ * robots.txt checking, concurrent processing, and result streaming.
19
+ *
20
+ * @example
21
+ * ```typescript
22
+ * const program = Effect.gen(function* () {
23
+ * const spider = yield* Spider;
24
+ * const collectSink = Sink.forEach<CrawlResult>(result =>
25
+ * Effect.sync(() => console.log(result.pageData.url))
26
+ * );
27
+ *
28
+ * const stats = yield* spider.crawl('https://example.com', collectSink);
29
+ * console.log(`Crawled ${stats.totalPages} pages`);
30
+ * });
31
+ * ```
32
+ *
33
+ * @group Services
34
+ * @public
35
+ */
36
+ export interface ISpider {
37
+ /**
38
+ * Starts crawling from the specified URL and processes results through the provided sink.
39
+ *
40
+ * @param urlString - The starting URL for crawling
41
+ * @param sink - Sink to process crawl results (`CrawlResult` values) as they're produced. NOTE(review): Effect 3.x declares `Sink<A, In, L, E, R>`, so the `E` and `R` arguments passed here occupy the leftover (`L`) and error (`E`) slots — confirm this is intended.
42
+ * @returns Effect containing crawl statistics (`totalPages`, `completed`); its error channel is typed as `Error`
43
+ */
44
+ crawl<A, E, R>(urlString: string, sink: Sink.Sink<A, CrawlResult, E, R>): Effect.Effect<{
45
+ totalPages: number;
46
+ completed: boolean;
47
+ }, Error>;
48
+ /**
49
+ * Returns the list of URLs that have been visited during crawling.
50
+ *
51
+ * @returns Effect containing a `string[]` of visited URLs
52
+ */
53
+ getVisitedUrls(): Effect.Effect<string[]>;
54
+ }
55
+ /**
56
+ * The SpiderSchedulerService interface for request scheduling and persistence.
57
+ *
58
+ * Manages request queuing, prioritization, and state persistence for
59
+ * resumable crawling operations.
60
+ *
61
+ * @example
62
+ * ```typescript
63
+ * const program = Effect.gen(function* () {
64
+ * const scheduler = yield* SpiderSchedulerService;
65
+ *
66
+ * // Configure persistence
67
+ * const stateKey = new SpiderStateKey({
68
+ * id: 'my-crawl',
69
+ * timestamp: new Date(),
70
+ * name: 'Example Crawl'
71
+ * });
72
+ *
73
+ * yield* scheduler.configurePersistence(persistence, stateKey);
74
+ *
75
+ * // Queue requests with priority
76
+ * yield* scheduler.enqueue({ url: 'https://example.com', depth: 0 }, 10);
77
+ *
78
+ * // Process requests
79
+ * const request = yield* scheduler.dequeue();
80
+ * console.log(`Processing: ${request.request.url}`);
81
+ * });
82
+ * ```
83
+ *
84
+ * @group Services
85
+ * @public
86
+ */
87
+ export interface ISpiderScheduler {
88
+ /**
89
+ * Configures the scheduler to use a specific persistence layer with a state key.
90
+ *
91
+ * @param persistence - Implementation of the `StatePersistence` interface
92
+ * @param stateKey - `SpiderStateKey` identifying the crawl session
93
+ */
94
+ configurePersistence(persistence: StatePersistence, stateKey: SpiderStateKey): Effect.Effect<void>;
95
+ /**
96
+ * Removes persistence configuration, disabling state saving.
97
+ */
98
+ clearPersistence(): Effect.Effect<void>;
99
+ /**
100
+ * Adds a crawl task to the processing queue with optional priority.
101
+ *
102
+ * @param request - Crawl task containing URL and depth
103
+ * @param priority - Optional priority (higher numbers = higher priority, default: 0)
104
+ * @returns Effect containing boolean indicating if task was added (false if duplicate)
105
+ */
106
+ enqueue(request: CrawlTask, priority?: number): Effect.Effect<boolean>;
107
+ /**
108
+ * Retrieves the next highest-priority task from the queue.
109
+ *
110
+ * @returns Effect containing the next `PriorityRequest`. NOTE(review): behaviour on an empty queue is not visible from this declaration — confirm against the implementation.
111
+ */
112
+ dequeue(): Effect.Effect<PriorityRequest>;
113
+ /**
114
+ * Returns the current number of tasks in the queue.
115
+ */
116
+ size(): Effect.Effect<number>;
117
+ /**
118
+ * Checks whether the queue is empty.
119
+ */
120
+ isEmpty(): Effect.Effect<boolean>;
121
+ /**
122
+ * Returns the current scheduler state as a `SpiderState` for persistence.
123
+ */
124
+ getState(): Effect.Effect<SpiderState>;
125
+ /**
126
+ * Restores the scheduler from a previously saved state.
127
+ *
128
+ * @param state - Complete `SpiderState` to restore from
129
+ */
130
+ restoreFromState(state: SpiderState): Effect.Effect<void>;
131
+ /**
132
+ * Attempts to restore state from a persistence layer.
133
+ *
134
+ * @param persistence - Persistence layer to load from
135
+ * @param stateKey - State key to restore
136
+ * @returns Effect containing boolean indicating if state was successfully restored
137
+ */
138
+ restore(persistence: StatePersistence, stateKey: SpiderStateKey): Effect.Effect<boolean>;
139
+ }
140
+ /**
141
+ * The MiddlewareManager service interface for pipeline processing.
142
+ *
143
+ * Orchestrates the execution of middleware in the correct order for
144
+ * request processing, response handling, and error recovery.
145
+ *
146
+ * @example
147
+ * ```typescript
148
+ * const program = Effect.gen(function* () {
149
+ * const manager = yield* MiddlewareManager;
150
+ *
151
+ * const middleware = [
152
+ * rateLimitMiddleware,
153
+ * loggingMiddleware,
154
+ * userAgentMiddleware
155
+ * ];
156
+ *
157
+ * const processedRequest = yield* manager.processRequest(request, middleware);
158
+ * console.log('Request processed through middleware pipeline');
159
+ * });
160
+ * ```
161
+ *
162
+ * @group Services
163
+ * @public
164
+ */
165
+ export interface IMiddlewareManager {
166
+ /**
167
+ * Processes a request through the middleware pipeline.
168
+ *
169
+ * @param request - The initial request to process
170
+ * @param middlewares - Array of middleware to apply
171
+ * @returns Effect containing the processed request; its error channel is `MiddlewareError`
172
+ */
173
+ processRequest(request: SpiderRequest, middlewares: SpiderMiddleware[]): Effect.Effect<SpiderRequest, MiddlewareError>;
174
+ /**
175
+ * Processes a response through the middleware pipeline in reverse order.
176
+ *
177
+ * @param response - The response to process
178
+ * @param request - The original request (for context)
179
+ * @param middlewares - Array of middleware to apply
180
+ * @returns Effect containing the processed response; its error channel is `MiddlewareError`
181
+ */
182
+ processResponse(response: SpiderResponse, request: SpiderRequest, middlewares: SpiderMiddleware[]): Effect.Effect<SpiderResponse, MiddlewareError>;
183
+ /**
184
+ * Processes an exception through the middleware pipeline in reverse order.
185
+ *
186
+ * @param error - The error that occurred
187
+ * @param request - The request that caused the error
188
+ * @param middlewares - Array of middleware to apply
189
+ * @returns Effect containing a recovered `SpiderResponse` or `null`; its error channel is `MiddlewareError`
190
+ */
191
+ processException(error: Error, request: SpiderRequest, middlewares: SpiderMiddleware[]): Effect.Effect<SpiderResponse | null, MiddlewareError>;
192
+ }
193
+ /**
194
+ * Rate limiting middleware service interface.
195
+ *
196
+ * Provides rate limiting functionality for respectful crawling,
197
+ * controlling request frequency at both global and per-domain levels.
198
+ *
199
+ * @group Middleware
200
+ * @public
201
+ */
202
+ export interface IRateLimitMiddleware {
203
+ /**
204
+ * Creates a rate limiting middleware with the specified configuration.
205
+ *
206
+ * @param config - Rate limiting configuration options (`maxConcurrentRequests` and `maxRequestsPerSecondPerDomain` are required; `requestDelayMs` is optional)
207
+ * @returns Configured `SpiderMiddleware` instance
208
+ *
209
+ * @example
210
+ * ```typescript
211
+ * const rateLimiter = yield* RateLimitMiddleware;
212
+ * const middleware = rateLimiter.create({
213
+ * maxConcurrentRequests: 5,
214
+ * maxRequestsPerSecondPerDomain: 2,
215
+ * requestDelayMs: 250
216
+ * });
217
+ * ```
218
+ */
219
+ create(config: {
220
+ maxConcurrentRequests: number;
221
+ maxRequestsPerSecondPerDomain: number;
222
+ requestDelayMs?: number;
223
+ }): SpiderMiddleware;
224
+ }
225
+ /**
226
+ * Logging middleware service interface.
227
+ *
228
+ * Provides logging functionality using Effect.Logger for debugging
229
+ * and monitoring crawling operations.
230
+ *
231
+ * @group Middleware
232
+ * @public
233
+ */
234
+ export interface ILoggingMiddleware {
235
+ /**
236
+ * Creates a logging middleware with optional configuration.
237
+ *
238
+ * @param config - Optional logging configuration; every field (`logRequests`, `logResponses`, `logErrors`, `logLevel`) is itself optional
239
+ * @returns Configured `SpiderMiddleware` instance
240
+ *
241
+ * @example
242
+ * ```typescript
243
+ * const logger = yield* LoggingMiddleware;
244
+ * const middleware = logger.create({
245
+ * logRequests: true,
246
+ * logResponses: true,
247
+ * logLevel: 'info'
248
+ * });
249
+ * ```
250
+ */
251
+ create(config?: {
252
+ logRequests?: boolean;
253
+ logResponses?: boolean;
254
+ logErrors?: boolean;
255
+ logLevel?: 'debug' | 'info' | 'warn' | 'error';
256
+ }): SpiderMiddleware;
257
+ }
258
+ /**
259
+ * User agent middleware service interface.
260
+ *
261
+ * Adds consistent User-Agent headers to all requests for
262
+ * proper identification of your crawler.
263
+ *
264
+ * @group Middleware
265
+ * @public
266
+ */
267
+ export interface IUserAgentMiddleware {
268
+ /**
269
+ * Creates a User-Agent middleware with the specified user agent string.
270
+ *
271
+ * @param userAgent - User agent string to add to requests
272
+ * @returns Configured `SpiderMiddleware` instance
273
+ *
274
+ * @example
275
+ * ```typescript
276
+ * const userAgent = yield* UserAgentMiddleware;
277
+ * const middleware = userAgent.create('MyBot/1.0 (+https://example.com)');
278
+ * ```
279
+ */
280
+ create(userAgent: string): SpiderMiddleware;
281
+ }
282
+ /**
283
+ * Statistics middleware service interface.
284
+ *
285
+ * Collects comprehensive metrics about crawling activity including
286
+ * request counts, response codes, and performance statistics.
287
+ *
288
+ * @group Middleware
289
+ * @public
290
+ */
291
+ export interface IStatsMiddleware {
292
+ /**
293
+ * Creates a statistics middleware and returns both the middleware and a stats getter.
294
+ *
295
+ * @returns Object containing the `middleware` instance and `getStats`, a function returning an Effect of `Record<string, number>`
296
+ *
297
+ * @example
298
+ * ```typescript
299
+ * const statsService = yield* StatsMiddleware;
300
+ * const { middleware, getStats } = statsService.create();
301
+ *
302
+ * // Use middleware in your pipeline
303
+ * // Later get statistics
304
+ * const stats = yield* getStats();
305
+ * console.log(`Processed ${stats.requests_processed} requests`);
306
+ * ```
307
+ */
308
+ create(): {
309
+ middleware: SpiderMiddleware;
310
+ getStats: () => Effect.Effect<Record<string, number>>;
311
+ };
312
+ }
313
+ //# sourceMappingURL=api-facades.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"api-facades.d.ts","sourceRoot":"","sources":["../../src/lib/api-facades.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,4BAA4B,CAAC;AACpE,OAAO,EACL,eAAe,EACf,WAAW,EACX,cAAc,EACd,gBAAgB,EACjB,MAAM,wCAAwC,CAAC;AAChD,OAAO,EACL,gBAAgB,EAChB,aAAa,EACb,cAAc,EACf,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAE9C;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,MAAM,WAAW,OAAO;IACtB;;;;;;OAMG;IACH,KAAK,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EACX,SAAS,EAAE,MAAM,EACjB,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,GACpC,MAAM,CAAC,MAAM,CAAC;QAAE,UAAU,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,OAAO,CAAA;KAAE,EAAE,KAAK,CAAC,CAAC;IAEpE;;;;OAIG;IACH,cAAc,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;CAC3C;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;OAKG;IACH,oBAAoB,CAClB,WAAW,EAAE,gBAAgB,EAC7B,QAAQ,EAAE,cAAc,GACvB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAEvB;;OAEG;IACH,gBAAgB,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAExC;;;;;;OAMG;IACH,OAAO,CAAC,OAAO,EAAE,SAAS,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEvE;;;;OAIG;IACH,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;IAE1C;;OAEG;IACH,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAE9B;;OAEG;IACH,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAElC;;OAEG;IACH,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;IAEvC;;;;OAIG;IACH,gBAAgB,CAAC,KAAK,EAAE,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE1D;;;;;;OAMG;IACH,OAAO,CACL,WAAW,EAAE,gBAAgB,EAC7B,QAAQ,EAAE,cAAc,GACvB,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;CAC3B;AAED;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,WAAW,kBAAkB;IACjC;;;;;;OAMG;IACH,cAAc,CACZ,OAAO,EAAE,aAAa,EACtB,WAAW,EAAE,gBAAgB,EAAE,GAC9B,MAAM,CAAC,MAAM,CAAC,aAAa,EAAE,eAAe,CAAC,CAAC;IAEjD;;;;;;;OAOG;IACH,eAAe,CACb,QAAQ,EAAE,cAAc,EACxB,OAAO,EAAE,aAAa,EACtB,WAAW,EAAE,gBAAgB,EAAE,GAC9B,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,eAAe,CAAC,CAAC;IAElD;;;;;;;OAOG;IACH,gBAAgB,CACd,KAAK,EAAE,KAAK,EACZ,OAAO,EAAE,a
AAa,EACtB,WAAW,EAAE,gBAAgB,EAAE,GAC9B,MAAM,CAAC,MAAM,CAAC,cAAc,GAAG,IAAI,EAAE,eAAe,CAAC,CAAC;CAC1D;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;;;;;;;;;;;;;OAeG;IACH,MAAM,CAAC,MAAM,EAAE;QACb,qBAAqB,EAAE,MAAM,CAAC;QAC9B,6BAA6B,EAAE,MAAM,CAAC;QACtC,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,GAAG,gBAAgB,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,kBAAkB;IACjC;;;;;;;;;;;;;;;OAeG;IACH,MAAM,CAAC,MAAM,CAAC,EAAE;QACd,WAAW,CAAC,EAAE,OAAO,CAAC;QACtB,YAAY,CAAC,EAAE,OAAO,CAAC;QACvB,SAAS,CAAC,EAAE,OAAO,CAAC;QACpB,QAAQ,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;KAChD,GAAG,gBAAgB,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;;;;;;;;;OAWG;IACH,MAAM,CAAC,SAAS,EAAE,MAAM,GAAG,gBAAgB,CAAC;CAC7C;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;;;;;;;;;;OAeG;IACH,MAAM,IAAI;QACR,UAAU,EAAE,gBAAgB,CAAC;QAC7B,QAAQ,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;KACvD,CAAC;CACH"}
@@ -0,0 +1,99 @@
1
+ declare const NetworkError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
2
+ readonly _tag: "NetworkError";
3
+ } & Readonly<A>;
4
+ /**
5
+ * Network-related errors (fetch failures, timeouts, etc.). Tagged `NetworkError`; carries the `url`, a `message` and an optional `cause`. The static `fromCause(url, cause)` returns a `NetworkError`.
6
+ */
7
+ export declare class NetworkError extends NetworkError_base<{
8
+ readonly url: string;
9
+ readonly cause?: unknown;
10
+ readonly message: string;
11
+ }> {
12
+ static fromCause(url: string, cause: unknown): NetworkError;
13
+ }
14
+ declare const ResponseError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
15
+ readonly _tag: "ResponseError";
16
+ } & Readonly<A>;
17
+ /**
18
+ * Response processing errors (invalid content, parsing failures). Tagged `ResponseError`; carries the `url`, a `message` and an optional `cause`. The static `fromCause(url, cause)` returns a `ResponseError`.
19
+ */
20
+ export declare class ResponseError extends ResponseError_base<{
21
+ readonly url: string;
22
+ readonly cause?: unknown;
23
+ readonly message: string;
24
+ }> {
25
+ static fromCause(url: string, cause: unknown): ResponseError;
26
+ }
27
+ declare const RobotsTxtError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
28
+ readonly _tag: "RobotsTxtError";
29
+ } & Readonly<A>;
30
+ /**
31
+ * Robots.txt fetching errors. Tagged `RobotsTxtError`; carries the `url`, a `message` and an optional `cause`. The static `fromCause(url, cause)` returns a `RobotsTxtError`.
32
+ */
33
+ export declare class RobotsTxtError extends RobotsTxtError_base<{
34
+ readonly url: string;
35
+ readonly cause?: unknown;
36
+ readonly message: string;
37
+ }> {
38
+ static fromCause(url: string, cause: unknown): RobotsTxtError;
39
+ }
40
+ declare const ConfigurationError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
41
+ readonly _tag: "ConfigurationError";
42
+ } & Readonly<A>;
43
+ /**
44
+ * Configuration errors. Tagged `ConfigurationError`; carries a `message` and optional `details`. No static helpers are declared.
45
+ */
46
+ export declare class ConfigurationError extends ConfigurationError_base<{
47
+ readonly message: string;
48
+ readonly details?: unknown;
49
+ }> {
50
+ }
51
+ declare const MiddlewareError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
52
+ readonly _tag: "MiddlewareError";
53
+ } & Readonly<A>;
54
+ /**
55
+ * Middleware processing errors. Tagged `MiddlewareError`; records the `phase` (`'transform'` or `'error'`), the `middlewareName`, a `message` and an optional `cause`. The statics `transform` and `error` each return a `MiddlewareError` (presumably with the matching `phase` — confirm in the implementation).
56
+ */
57
+ export declare class MiddlewareError extends MiddlewareError_base<{
58
+ readonly phase: 'transform' | 'error';
59
+ readonly middlewareName: string;
60
+ readonly cause?: unknown;
61
+ readonly message: string;
62
+ }> {
63
+ static transform(middlewareName: string, cause: unknown): MiddlewareError;
64
+ static error(middlewareName: string, cause: unknown): MiddlewareError;
65
+ }
66
+ declare const FileSystemError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
67
+ readonly _tag: "FileSystemError";
68
+ } & Readonly<A>;
69
+ /**
70
+ * File system errors. Tagged `FileSystemError`; records the `operation` (`'read'`, `'write'`, `'create'` or `'delete'`), the `path`, a `message` and an optional `cause`. Only `write` and `create` have static helpers declared here.
71
+ */
72
+ export declare class FileSystemError extends FileSystemError_base<{
73
+ readonly operation: 'read' | 'write' | 'create' | 'delete';
74
+ readonly path: string;
75
+ readonly cause?: unknown;
76
+ readonly message: string;
77
+ }> {
78
+ static write(path: string, cause: unknown): FileSystemError;
79
+ static create(path: string, cause: unknown): FileSystemError;
80
+ }
81
+ declare const PersistenceError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
82
+ readonly _tag: "PersistenceError";
83
+ } & Readonly<A>;
84
+ /**
85
+ * Persistence layer errors. Tagged `PersistenceError`; records the `operation` (`'save'`, `'load'` or `'delete'`), an optional `key`, a `message` and an optional `cause`. Note that the static helpers take `cause` first and the optional `key` second.
86
+ */
87
+ export declare class PersistenceError extends PersistenceError_base<{
88
+ readonly operation: 'save' | 'load' | 'delete';
89
+ readonly key?: string;
90
+ readonly cause?: unknown;
91
+ readonly message: string;
92
+ }> {
93
+ static save(cause: unknown, key?: string): PersistenceError;
94
+ static load(cause: unknown, key?: string): PersistenceError;
95
+ static delete(cause: unknown, key?: string): PersistenceError;
96
+ }
97
+ /** Union of the seven tagged error classes declared in this file. */ export type SpiderError = NetworkError | ResponseError | RobotsTxtError | ConfigurationError | MiddlewareError | FileSystemError | PersistenceError;
98
+ export {};
99
+ //# sourceMappingURL=errors.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"errors.d.ts","sourceRoot":"","sources":["../../src/lib/errors.ts"],"names":[],"mappings":";;;AAEA;;GAEG;AACH,qBAAa,YAAa,SAAQ,kBAAiC;IACjE,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,YAAY;CAO5D;;;;AAED;;GAEG;AACH,qBAAa,aAAc,SAAQ,mBAAkC;IACnE,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,aAAa;CAO7D;;;;AAED;;GAEG;AACH,qBAAa,cAAe,SAAQ,oBAAmC;IACrE,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,cAAc;CAO9D;;;;AAED;;GAEG;AACH,qBAAa,kBAAmB,SAAQ,wBAAuC;IAC7E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,OAAO,CAAC,EAAE,OAAO,CAAC;CAC5B,CAAC;CAAG;;;;AAEL;;GAEG;AACH,qBAAa,eAAgB,SAAQ,qBAAoC;IACvE,QAAQ,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC;IACtC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,SAAS,CAAC,cAAc,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,eAAe;IASzE,MAAM,CAAC,KAAK,CAAC,cAAc,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,eAAe;CAQtE;;;;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,qBAAoC;IACvE,QAAQ,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAC;IAC3D,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,eAAe;IAS3D,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,eAAe;CAQ7D;;;;AAED;;GAEG;AACH,qBAAa,gBAAiB,SAAQ,sBAAqC;IACzE,QAAQ,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,GAAG,QAAQ,CAAC;IAC/C,QAAQ,CAAC,GAAG,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,CAAC,
EAAE,MAAM,GAAG,gBAAgB;IAW3D,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,gBAAgB;IAW3D,MAAM,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,gBAAgB;CAU9D;AAGD,MAAM,MAAM,WAAW,GACnB,YAAY,GACZ,aAAa,GACb,cAAc,GACd,kBAAkB,GAClB,eAAe,GACf,eAAe,GACf,gBAAgB,CAAC"}
package/package.json ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "name": "@jambudipa/spider",
3
+ "version": "0.1.0",
4
+ "description": "A comprehensive web scraping library with resumable operations, middleware support, and built-in rate limiting",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "types": "./dist/index.d.ts",
12
+ "import": "./dist/index.js",
13
+ "default": "./dist/index.js"
14
+ },
15
+ "./package.json": "./package.json"
16
+ },
17
+ "files": [
18
+ "dist",
19
+ "README.md",
20
+ "LICENSE"
21
+ ],
22
+ "engines": {
23
+ "node": ">=18.0.0"
24
+ },
25
+ "scripts": {
26
+ "build": "vite build && tsc --emitDeclarationOnly --outDir dist",
27
+ "dev": "vite build --watch",
28
+ "test": "vitest",
29
+ "test:ui": "vitest --ui",
30
+ "test:coverage": "vitest --coverage",
31
+ "test:run": "vitest run",
32
+ "lint": "eslint src",
33
+ "lint:fix": "eslint src --fix",
34
+ "format": "prettier --write \"src/**/*.ts\"",
35
+ "format:check": "prettier --check \"src/**/*.ts\"",
36
+ "typecheck": "tsc --noEmit",
37
+ "typecheck:test": "tsc --noEmit --skipLibCheck --downlevelIteration --project tsconfig.test.json",
38
+ "typecheck:examples": "tsc --noEmit --project tsconfig.examples.json",
39
+ "clean": "rm -rf dist",
40
+ "prepublishOnly": "npm run clean && npm run build && npm run typecheck",
41
+ "prepack": "npm run build",
42
+ "changeset": "changeset",
43
+ "version": "changeset version",
44
+ "release": "npm run build && changeset publish",
45
+ "docs:api": "typedoc",
46
+ "docs:build": "npm run docs:api && npm run examples:validate",
47
+ "examples:run": "tsx",
48
+ "examples:validate": "tsx scripts/analyse-exports.ts",
49
+ "examples:run-all": "tsx examples/run-all.ts",
50
+ "test:examples": "vitest run tests/examples.test.ts",
51
+ "test:core": "vitest run tests/core",
52
+ "test:all": "npm run test:examples && npm run test:core"
53
+ },
54
+ "keywords": [
55
+ "web-scraping",
56
+ "crawler",
57
+ "spider",
58
+ "http",
59
+ "middleware",
60
+ "resumable",
61
+ "rate-limiting",
62
+ "typescript"
63
+ ],
64
+ "author": "Jambudipa.io",
65
+ "license": "MIT",
66
+ "repository": {
67
+ "type": "git",
68
+ "url": "https://github.com/jambudipa/spider.git"
69
+ },
70
+ "bugs": {
71
+ "url": "https://github.com/jambudipa/spider/issues"
72
+ },
73
+ "homepage": "https://github.com/jambudipa/spider#readme",
74
+ "peerDependencies": {
75
+ "@types/tough-cookie": "^4.0.5",
76
+ "cheerio": "^1.1.2",
77
+ "domhandler": "^5.0.3",
78
+ "effect": "^3.16.16",
79
+ "tough-cookie": "^6.0.0",
80
+ "tslib": "^2.3.0"
81
+ },
82
+ "devDependencies": {
83
+ "@changesets/cli": "^2.29.5",
84
+ "@types/node": "^20.0.0",
85
+ "@typescript-eslint/eslint-plugin": "^7.0.0",
86
+ "@typescript-eslint/parser": "^7.0.0",
87
+ "@typescript-eslint/typescript-estree": "^8.38.0",
88
+ "@vitest/coverage-v8": "^1.6.0",
89
+ "@vitest/ui": "^1.6.0",
90
+ "@types/tough-cookie": "^4.0.5",
91
+ "cheerio": "^1.1.2",
92
+ "domhandler": "^5.0.3",
93
+ "effect": "^3.16.16",
94
+ "eslint": "^8.57.0",
95
+ "prettier": "^3.6.2",
96
+ "tough-cookie": "^6.0.0",
97
+ "tsx": "^4.20.3",
98
+ "typedoc": "^0.28.9",
99
+ "typedoc-plugin-markdown": "^4.8.0",
100
+ "tslib": "^2.3.0",
101
+ "typescript": "^5.4.0",
102
+ "vite": "^5.0.0",
103
+ "vitest": "^1.6.0"
104
+ },
105
+ "publishConfig": {
106
+ "access": "public"
107
+ }
108
+ }