@jambudipa/spider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +426 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4681 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
- package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
- package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
- package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
- package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
- package/dist/lib/HttpClient/index.d.ts +8 -0
- package/dist/lib/HttpClient/index.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/index.d.ts +37 -0
- package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
- package/dist/lib/Logging/FetchLogger.d.ts +8 -0
- package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
- package/dist/lib/PageData/PageData.d.ts +28 -0
- package/dist/lib/PageData/PageData.d.ts.map +1 -0
- package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
- package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/index.d.ts +51 -0
- package/dist/lib/Resumability/index.d.ts.map +1 -0
- package/dist/lib/Resumability/strategies.d.ts +76 -0
- package/dist/lib/Resumability/strategies.d.ts.map +1 -0
- package/dist/lib/Resumability/types.d.ts +201 -0
- package/dist/lib/Resumability/types.d.ts.map +1 -0
- package/dist/lib/Robots/Robots.service.d.ts +78 -0
- package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
- package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +194 -0
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
- package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
- package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
- package/dist/lib/StateManager/index.d.ts +5 -0
- package/dist/lib/StateManager/index.d.ts.map +1 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
- package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
- package/dist/lib/api-facades.d.ts +313 -0
- package/dist/lib/api-facades.d.ts.map +1 -0
- package/dist/lib/errors.d.ts +99 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/package.json +108 -0
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clean API facades that hide Effect.Service implementation details.
|
|
3
|
+
*
|
|
4
|
+
* These interfaces provide clean documentation without exposing
|
|
5
|
+
* internal Effect.js service machinery.
|
|
6
|
+
*
|
|
7
|
+
* @group Services
|
|
8
|
+
*/
|
|
9
|
+
import { Effect, Sink } from 'effect';
|
|
10
|
+
import { CrawlResult, CrawlTask } from './Spider/Spider.service.js';
|
|
11
|
+
import { PriorityRequest, SpiderState, SpiderStateKey, StatePersistence } from './Scheduler/SpiderScheduler.service.js';
|
|
12
|
+
import { SpiderMiddleware, SpiderRequest, SpiderResponse } from './Middleware/SpiderMiddleware.js';
|
|
13
|
+
import { MiddlewareError } from './errors.js';
|
|
14
|
+
/**
 * Public contract of the Spider web-crawling service.
 *
 * Coordinates a complete crawl: URL validation, robots.txt checking,
 * concurrent processing, and streaming of results.
 *
 * @example
 * ```typescript
 * const program = Effect.gen(function* () {
 *   const spider = yield* Spider;
 *   const collectSink = Sink.forEach<CrawlResult>(result =>
 *     Effect.sync(() => console.log(result.pageData.url))
 *   );
 *
 *   const stats = yield* spider.crawl('https://example.com', collectSink);
 *   console.log(`Crawled ${stats.totalPages} pages`);
 * });
 * ```
 *
 * @group Services
 * @public
 */
export interface ISpider {
  /**
   * Begins a crawl at the given URL and feeds every result into `sink`.
   *
   * @param urlString - URL at which the crawl starts
   * @param sink - Sink that consumes crawl results as they are produced
   * @returns Effect yielding crawl statistics: the total page count and
   *   the completion status
   */
  crawl<A, E, R>(
    urlString: string,
    sink: Sink.Sink<A, CrawlResult, E, R>
  ): Effect.Effect<
    {
      totalPages: number;
      completed: boolean;
    },
    Error
  >;

  /**
   * Lists the URLs that were visited during crawling.
   *
   * @returns Effect yielding an array of visited URLs
   */
  getVisitedUrls(): Effect.Effect<string[]>;
}
|
|
55
|
+
/**
 * Public contract of the SpiderSchedulerService: request scheduling and
 * persistence.
 *
 * Handles request queuing, prioritisation, and state persistence so that
 * crawling operations can be resumed.
 *
 * @example
 * ```typescript
 * const program = Effect.gen(function* () {
 *   const scheduler = yield* SpiderSchedulerService;
 *
 *   // Configure persistence
 *   const stateKey = new SpiderStateKey({
 *     id: 'my-crawl',
 *     timestamp: new Date(),
 *     name: 'Example Crawl'
 *   });
 *
 *   yield* scheduler.configurePersistence(persistence, stateKey);
 *
 *   // Queue requests with priority
 *   yield* scheduler.enqueue({ url: 'https://example.com', depth: 0 }, 10);
 *
 *   // Process requests
 *   const request = yield* scheduler.dequeue();
 *   console.log(`Processing: ${request.request.url}`);
 * });
 * ```
 *
 * @group Services
 * @public
 */
export interface ISpiderScheduler {
  /**
   * Attaches a persistence layer, identified by a state key, to the scheduler.
   *
   * @param persistence - Implementation of the StatePersistence interface
   * @param stateKey - Unique identifier of the crawl session
   */
  configurePersistence(
    persistence: StatePersistence,
    stateKey: SpiderStateKey
  ): Effect.Effect<void>;

  /**
   * Drops the persistence configuration, which disables state saving.
   */
  clearPersistence(): Effect.Effect<void>;

  /**
   * Puts a crawl task on the processing queue, optionally with a priority.
   *
   * @param request - Crawl task holding the URL and depth
   * @param priority - Optional priority (higher numbers = higher priority, default: 0)
   * @returns Effect yielding `true` when the task was added and `false`
   *   when it was a duplicate
   */
  enqueue(request: CrawlTask, priority?: number): Effect.Effect<boolean>;

  /**
   * Takes the next highest-priority task off the queue.
   *
   * @returns Effect yielding the next priority request
   */
  dequeue(): Effect.Effect<PriorityRequest>;

  /**
   * Reports how many tasks are currently queued.
   */
  size(): Effect.Effect<number>;

  /**
   * Reports whether the queue is empty.
   */
  isEmpty(): Effect.Effect<boolean>;

  /**
   * Produces the current scheduler state, for use in persistence.
   */
  getState(): Effect.Effect<SpiderState>;

  /**
   * Rebuilds the scheduler from a state that was saved earlier.
   *
   * @param state - Complete state to restore from
   */
  restoreFromState(state: SpiderState): Effect.Effect<void>;

  /**
   * Tries to load and restore state from a persistence layer.
   *
   * @param persistence - Persistence layer to load from
   * @param stateKey - State key to restore
   * @returns Effect yielding a boolean that indicates whether the state
   *   was successfully restored
   */
  restore(
    persistence: StatePersistence,
    stateKey: SpiderStateKey
  ): Effect.Effect<boolean>;
}
|
|
140
|
+
/**
 * Public contract of the MiddlewareManager service, which drives the
 * middleware pipeline.
 *
 * Runs middleware in the correct order for request processing, response
 * handling, and error recovery.
 *
 * @example
 * ```typescript
 * const program = Effect.gen(function* () {
 *   const manager = yield* MiddlewareManager;
 *
 *   const middleware = [
 *     rateLimitMiddleware,
 *     loggingMiddleware,
 *     userAgentMiddleware
 *   ];
 *
 *   const processedRequest = yield* manager.processRequest(request, middleware);
 *   console.log('Request processed through middleware pipeline');
 * });
 * ```
 *
 * @group Services
 * @public
 */
export interface IMiddlewareManager {
  /**
   * Runs a request through the middleware pipeline.
   *
   * @param request - Initial request to process
   * @param middlewares - Middleware to apply
   * @returns Effect yielding the processed request
   */
  processRequest(
    request: SpiderRequest,
    middlewares: SpiderMiddleware[]
  ): Effect.Effect<SpiderRequest, MiddlewareError>;

  /**
   * Runs a response through the middleware pipeline in reverse order.
   *
   * @param response - Response to process
   * @param request - Original request, supplied for context
   * @param middlewares - Middleware to apply
   * @returns Effect yielding the processed response
   */
  processResponse(
    response: SpiderResponse,
    request: SpiderRequest,
    middlewares: SpiderMiddleware[]
  ): Effect.Effect<SpiderResponse, MiddlewareError>;

  /**
   * Runs an exception through the middleware pipeline in reverse order.
   *
   * @param error - Error that occurred
   * @param request - Request that caused the error
   * @param middlewares - Middleware to apply
   * @returns Effect yielding a recovered response, or `null`
   */
  processException(
    error: Error,
    request: SpiderRequest,
    middlewares: SpiderMiddleware[]
  ): Effect.Effect<SpiderResponse | null, MiddlewareError>;
}
|
|
193
|
+
/**
 * Public contract of the rate-limiting middleware service.
 *
 * Supports respectful crawling by controlling request frequency, both
 * globally and per domain.
 *
 * @group Middleware
 * @public
 */
export interface IRateLimitMiddleware {
  /**
   * Builds a rate-limiting middleware from the supplied configuration.
   *
   * @param config - Rate limiting configuration options
   * @returns Configured middleware instance
   *
   * @example
   * ```typescript
   * const rateLimiter = yield* RateLimitMiddleware;
   * const middleware = rateLimiter.create({
   *   maxConcurrentRequests: 5,
   *   maxRequestsPerSecondPerDomain: 2,
   *   requestDelayMs: 250
   * });
   * ```
   */
  create(config: {
    maxConcurrentRequests: number;
    maxRequestsPerSecondPerDomain: number;
    requestDelayMs?: number;
  }): SpiderMiddleware;
}
|
|
225
|
+
/**
 * Public contract of the logging middleware service.
 *
 * Offers logging through Effect.Logger for debugging and monitoring
 * crawling operations.
 *
 * @group Middleware
 * @public
 */
export interface ILoggingMiddleware {
  /**
   * Builds a logging middleware; the configuration may be omitted.
   *
   * @param config - Optional logging configuration
   * @returns Configured middleware instance
   *
   * @example
   * ```typescript
   * const logger = yield* LoggingMiddleware;
   * const middleware = logger.create({
   *   logRequests: true,
   *   logResponses: true,
   *   logLevel: 'info'
   * });
   * ```
   */
  create(config?: {
    logRequests?: boolean;
    logResponses?: boolean;
    logErrors?: boolean;
    logLevel?: 'debug' | 'info' | 'warn' | 'error';
  }): SpiderMiddleware;
}
|
|
258
|
+
/**
 * Public contract of the user-agent middleware service.
 *
 * Puts a consistent User-Agent header on all requests so that your
 * crawler is properly identified.
 *
 * @group Middleware
 * @public
 */
export interface IUserAgentMiddleware {
  /**
   * Builds a User-Agent middleware for the given user agent string.
   *
   * @param userAgent - User agent string to add to requests
   * @returns Configured middleware instance
   *
   * @example
   * ```typescript
   * const userAgent = yield* UserAgentMiddleware;
   * const middleware = userAgent.create('MyBot/1.0 (+https://example.com)');
   * ```
   */
  create(userAgent: string): SpiderMiddleware;
}
|
|
282
|
+
/**
 * Public contract of the statistics middleware service.
 *
 * Gathers metrics about crawling activity, including request counts,
 * response codes, and performance statistics.
 *
 * @group Middleware
 * @public
 */
export interface IStatsMiddleware {
  /**
   * Builds a statistics middleware together with a getter for its stats.
   *
   * @returns Object holding the middleware instance and the function that
   *   retrieves the statistics
   *
   * @example
   * ```typescript
   * const statsService = yield* StatsMiddleware;
   * const { middleware, getStats } = statsService.create();
   *
   * // Use middleware in your pipeline
   * // Later get statistics
   * const stats = yield* getStats();
   * console.log(`Processed ${stats.requests_processed} requests`);
   * ```
   */
  create(): {
    middleware: SpiderMiddleware;
    getStats: () => Effect.Effect<Record<string, number>>;
  };
}
|
|
313
|
+
//# sourceMappingURL=api-facades.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"api-facades.d.ts","sourceRoot":"","sources":["../../src/lib/api-facades.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,QAAQ,CAAC;AACtC,OAAO,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,4BAA4B,CAAC;AACpE,OAAO,EACL,eAAe,EACf,WAAW,EACX,cAAc,EACd,gBAAgB,EACjB,MAAM,wCAAwC,CAAC;AAChD,OAAO,EACL,gBAAgB,EAChB,aAAa,EACb,cAAc,EACf,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAE9C;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,MAAM,WAAW,OAAO;IACtB;;;;;;OAMG;IACH,KAAK,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EACX,SAAS,EAAE,MAAM,EACjB,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,GACpC,MAAM,CAAC,MAAM,CAAC;QAAE,UAAU,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,OAAO,CAAA;KAAE,EAAE,KAAK,CAAC,CAAC;IAEpE;;;;OAIG;IACH,cAAc,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;CAC3C;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;OAKG;IACH,oBAAoB,CAClB,WAAW,EAAE,gBAAgB,EAC7B,QAAQ,EAAE,cAAc,GACvB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAEvB;;OAEG;IACH,gBAAgB,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAExC;;;;;;OAMG;IACH,OAAO,CAAC,OAAO,EAAE,SAAS,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEvE;;;;OAIG;IACH,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,eAAe,CAAC,CAAC;IAE1C;;OAEG;IACH,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAE9B;;OAEG;IACH,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAElC;;OAEG;IACH,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC;IAEvC;;;;OAIG;IACH,gBAAgB,CAAC,KAAK,EAAE,WAAW,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE1D;;;;;;OAMG;IACH,OAAO,CACL,WAAW,EAAE,gBAAgB,EAC7B,QAAQ,EAAE,cAAc,GACvB,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;CAC3B;AAED;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,WAAW,kBAAkB;IACjC;;;;;;OAMG;IACH,cAAc,CACZ,OAAO,EAAE,aAAa,EACtB,WAAW,EAAE,gBAAgB,EAAE,GAC9B,MAAM,CAAC,MAAM,CAAC,aAAa,EAAE,eAAe,CAAC,CAAC;IAEjD;;;;;;;OAOG;IACH,eAAe,CACb,QAAQ,EAAE,cAAc,EACxB,OAAO,EAAE,aAAa,EACtB,WAAW,EAAE,gBAAgB,EAAE,GAC9B,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,eAAe,CAAC,CAAC;IAElD;;;;;;;OAOG;IACH,gBAAgB,CACd,KAAK,EAAE,KAAK,EACZ,OAAO,EAAE,aAA
a,EACtB,WAAW,EAAE,gBAAgB,EAAE,GAC9B,MAAM,CAAC,MAAM,CAAC,cAAc,GAAG,IAAI,EAAE,eAAe,CAAC,CAAC;CAC1D;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;;;;;;;;;;;;;OAeG;IACH,MAAM,CAAC,MAAM,EAAE;QACb,qBAAqB,EAAE,MAAM,CAAC;QAC9B,6BAA6B,EAAE,MAAM,CAAC;QACtC,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,GAAG,gBAAgB,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,kBAAkB;IACjC;;;;;;;;;;;;;;;OAeG;IACH,MAAM,CAAC,MAAM,CAAC,EAAE;QACd,WAAW,CAAC,EAAE,OAAO,CAAC;QACtB,YAAY,CAAC,EAAE,OAAO,CAAC;QACvB,SAAS,CAAC,EAAE,OAAO,CAAC;QACpB,QAAQ,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;KAChD,GAAG,gBAAgB,CAAC;CACtB;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;;;;;;;;;OAWG;IACH,MAAM,CAAC,SAAS,EAAE,MAAM,GAAG,gBAAgB,CAAC;CAC7C;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;;;;;;;;;;OAeG;IACH,MAAM,IAAI;QACR,UAAU,EAAE,gBAAgB,CAAC;QAC7B,QAAQ,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;KACvD,CAAC;CACH"}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
// Constructor type of the tagged base class. Instances are `YieldableError`s
// carrying `_tag: "NetworkError"` plus the readonly fields of `A`. The
// constructor takes no argument when `A` is `{}`, otherwise the fields of `A`
// without `_tag`.
declare const NetworkError_base: new <A extends Record<string, any> = {}>(
  args: import("effect/Types").Equals<A, {}> extends true
    ? void
    : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P] }
) => import("effect/Cause").YieldableError & {
  readonly _tag: "NetworkError";
} & Readonly<A>;
/**
 * Error for network-related failures (fetch failures, timeouts, etc.).
 */
export declare class NetworkError extends NetworkError_base<{
  readonly url: string;
  readonly cause?: unknown;
  readonly message: string;
}> {
  /**
   * Factory taking a URL and an underlying cause.
   * NOTE(review): the implementation is not visible in this declaration
   * file; how `message` is derived should be confirmed in the source.
   */
  static fromCause(url: string, cause: unknown): NetworkError;
}
|
|
14
|
+
// Constructor type of the tagged base class. Instances are `YieldableError`s
// carrying `_tag: "ResponseError"` plus the readonly fields of `A`. The
// constructor takes no argument when `A` is `{}`, otherwise the fields of `A`
// without `_tag`.
declare const ResponseError_base: new <A extends Record<string, any> = {}>(
  args: import("effect/Types").Equals<A, {}> extends true
    ? void
    : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P] }
) => import("effect/Cause").YieldableError & {
  readonly _tag: "ResponseError";
} & Readonly<A>;
/**
 * Error for response-processing failures (invalid content, parsing failures).
 */
export declare class ResponseError extends ResponseError_base<{
  readonly url: string;
  readonly cause?: unknown;
  readonly message: string;
}> {
  /**
   * Factory taking a URL and an underlying cause.
   * NOTE(review): the implementation is not visible in this declaration
   * file; how `message` is derived should be confirmed in the source.
   */
  static fromCause(url: string, cause: unknown): ResponseError;
}
|
|
27
|
+
// Constructor type of the tagged base class. Instances are `YieldableError`s
// carrying `_tag: "RobotsTxtError"` plus the readonly fields of `A`. The
// constructor takes no argument when `A` is `{}`, otherwise the fields of `A`
// without `_tag`.
declare const RobotsTxtError_base: new <A extends Record<string, any> = {}>(
  args: import("effect/Types").Equals<A, {}> extends true
    ? void
    : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P] }
) => import("effect/Cause").YieldableError & {
  readonly _tag: "RobotsTxtError";
} & Readonly<A>;
/**
 * Error for failures while fetching robots.txt.
 */
export declare class RobotsTxtError extends RobotsTxtError_base<{
  readonly url: string;
  readonly cause?: unknown;
  readonly message: string;
}> {
  /**
   * Factory taking a URL and an underlying cause.
   * NOTE(review): the implementation is not visible in this declaration
   * file; how `message` is derived should be confirmed in the source.
   */
  static fromCause(url: string, cause: unknown): RobotsTxtError;
}
|
|
40
|
+
// Constructor type of the tagged base class. Instances are `YieldableError`s
// carrying `_tag: "ConfigurationError"` plus the readonly fields of `A`. The
// constructor takes no argument when `A` is `{}`, otherwise the fields of `A`
// without `_tag`.
declare const ConfigurationError_base: new <A extends Record<string, any> = {}>(
  args: import("effect/Types").Equals<A, {}> extends true
    ? void
    : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P] }
) => import("effect/Cause").YieldableError & {
  readonly _tag: "ConfigurationError";
} & Readonly<A>;
/**
 * Error for configuration problems; carries a message and optional details.
 */
export declare class ConfigurationError extends ConfigurationError_base<{
  readonly message: string;
  readonly details?: unknown;
}> {}
|
|
51
|
+
// Constructor type of the tagged base class. Instances are `YieldableError`s
// carrying `_tag: "MiddlewareError"` plus the readonly fields of `A`. The
// constructor takes no argument when `A` is `{}`, otherwise the fields of `A`
// without `_tag`.
declare const MiddlewareError_base: new <A extends Record<string, any> = {}>(
  args: import("effect/Types").Equals<A, {}> extends true
    ? void
    : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P] }
) => import("effect/Cause").YieldableError & {
  readonly _tag: "MiddlewareError";
} & Readonly<A>;
/**
 * Error for failures during middleware processing. `phase` is either
 * `'transform'` or `'error'`, and `middlewareName` names the middleware.
 */
export declare class MiddlewareError extends MiddlewareError_base<{
  readonly phase: 'transform' | 'error';
  readonly middlewareName: string;
  readonly cause?: unknown;
  readonly message: string;
}> {
  /**
   * Factory taking a middleware name and an underlying cause.
   * NOTE(review): presumably sets `phase` to `'transform'` — confirm in the source.
   */
  static transform(middlewareName: string, cause: unknown): MiddlewareError;

  /**
   * Factory taking a middleware name and an underlying cause.
   * NOTE(review): presumably sets `phase` to `'error'` — confirm in the source.
   */
  static error(middlewareName: string, cause: unknown): MiddlewareError;
}
|
|
66
|
+
// Constructor type of the tagged base class. Instances are `YieldableError`s
// carrying `_tag: "FileSystemError"` plus the readonly fields of `A`. The
// constructor takes no argument when `A` is `{}`, otherwise the fields of `A`
// without `_tag`.
declare const FileSystemError_base: new <A extends Record<string, any> = {}>(
  args: import("effect/Types").Equals<A, {}> extends true
    ? void
    : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P] }
) => import("effect/Cause").YieldableError & {
  readonly _tag: "FileSystemError";
} & Readonly<A>;
/**
 * Error for file system failures. `operation` is one of `'read'`, `'write'`,
 * `'create'` or `'delete'`, and `path` is the path involved.
 */
export declare class FileSystemError extends FileSystemError_base<{
  readonly operation: 'read' | 'write' | 'create' | 'delete';
  readonly path: string;
  readonly cause?: unknown;
  readonly message: string;
}> {
  /**
   * Factory taking a path and an underlying cause.
   * NOTE(review): presumably sets `operation` to `'write'` — confirm in the source.
   */
  static write(path: string, cause: unknown): FileSystemError;

  /**
   * Factory taking a path and an underlying cause.
   * NOTE(review): presumably sets `operation` to `'create'` — confirm in the source.
   */
  static create(path: string, cause: unknown): FileSystemError;
}
|
|
81
|
+
// Constructor type of the tagged base class. Instances are `YieldableError`s
// carrying `_tag: "PersistenceError"` plus the readonly fields of `A`. The
// constructor takes no argument when `A` is `{}`, otherwise the fields of `A`
// without `_tag`.
declare const PersistenceError_base: new <A extends Record<string, any> = {}>(
  args: import("effect/Types").Equals<A, {}> extends true
    ? void
    : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P] }
) => import("effect/Cause").YieldableError & {
  readonly _tag: "PersistenceError";
} & Readonly<A>;
/**
 * Error for persistence layer failures. `operation` is one of `'save'`,
 * `'load'` or `'delete'`; `key` is optional.
 */
export declare class PersistenceError extends PersistenceError_base<{
  readonly operation: 'save' | 'load' | 'delete';
  readonly key?: string;
  readonly cause?: unknown;
  readonly message: string;
}> {
  /**
   * Factory taking an underlying cause and an optional key.
   * NOTE(review): presumably sets `operation` to `'save'` — confirm in the source.
   */
  static save(cause: unknown, key?: string): PersistenceError;

  /**
   * Factory taking an underlying cause and an optional key.
   * NOTE(review): presumably sets `operation` to `'load'` — confirm in the source.
   */
  static load(cause: unknown, key?: string): PersistenceError;

  /**
   * Factory taking an underlying cause and an optional key.
   * NOTE(review): presumably sets `operation` to `'delete'` — confirm in the source.
   */
  static delete(cause: unknown, key?: string): PersistenceError;
}
|
|
97
|
+
/** Union of the error classes exported by this module. */
export type SpiderError =
  | NetworkError
  | ResponseError
  | RobotsTxtError
  | ConfigurationError
  | MiddlewareError
  | FileSystemError
  | PersistenceError;
|
|
98
|
+
export {};
|
|
99
|
+
//# sourceMappingURL=errors.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"errors.d.ts","sourceRoot":"","sources":["../../src/lib/errors.ts"],"names":[],"mappings":";;;AAEA;;GAEG;AACH,qBAAa,YAAa,SAAQ,kBAAiC;IACjE,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,YAAY;CAO5D;;;;AAED;;GAEG;AACH,qBAAa,aAAc,SAAQ,mBAAkC;IACnE,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,aAAa;CAO7D;;;;AAED;;GAEG;AACH,qBAAa,cAAe,SAAQ,oBAAmC;IACrE,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,cAAc;CAO9D;;;;AAED;;GAEG;AACH,qBAAa,kBAAmB,SAAQ,wBAAuC;IAC7E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,OAAO,CAAC,EAAE,OAAO,CAAC;CAC5B,CAAC;CAAG;;;;AAEL;;GAEG;AACH,qBAAa,eAAgB,SAAQ,qBAAoC;IACvE,QAAQ,CAAC,KAAK,EAAE,WAAW,GAAG,OAAO,CAAC;IACtC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,SAAS,CAAC,cAAc,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,eAAe;IASzE,MAAM,CAAC,KAAK,CAAC,cAAc,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,eAAe;CAQtE;;;;AAED;;GAEG;AACH,qBAAa,eAAgB,SAAQ,qBAAoC;IACvE,QAAQ,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAC;IAC3D,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,eAAe;IAS3D,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,GAAG,eAAe;CAQ7D;;;;AAED;;GAEG;AACH,qBAAa,gBAAiB,SAAQ,sBAAqC;IACzE,QAAQ,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,GAAG,QAAQ,CAAC;IAC/C,QAAQ,CAAC,GAAG,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;IACA,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,CAAC,EA
AE,MAAM,GAAG,gBAAgB;IAW3D,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,gBAAgB;IAW3D,MAAM,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,gBAAgB;CAU9D;AAGD,MAAM,MAAM,WAAW,GACnB,YAAY,GACZ,aAAa,GACb,cAAc,GACd,kBAAkB,GAClB,eAAe,GACf,eAAe,GACf,gBAAgB,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@jambudipa/spider",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "A comprehensive web scraping library with resumable operations, middleware support, and built-in rate limiting",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"module": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"types": "./dist/index.d.ts",
|
|
12
|
+
"import": "./dist/index.js",
|
|
13
|
+
"default": "./dist/index.js"
|
|
14
|
+
},
|
|
15
|
+
"./package.json": "./package.json"
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"README.md",
|
|
20
|
+
"LICENSE"
|
|
21
|
+
],
|
|
22
|
+
"engines": {
|
|
23
|
+
"node": ">=18.0.0"
|
|
24
|
+
},
|
|
25
|
+
"scripts": {
|
|
26
|
+
"build": "vite build && tsc --emitDeclarationOnly --outDir dist",
|
|
27
|
+
"dev": "vite build --watch",
|
|
28
|
+
"test": "vitest",
|
|
29
|
+
"test:ui": "vitest --ui",
|
|
30
|
+
"test:coverage": "vitest --coverage",
|
|
31
|
+
"test:run": "vitest run",
|
|
32
|
+
"lint": "eslint src",
|
|
33
|
+
"lint:fix": "eslint src --fix",
|
|
34
|
+
"format": "prettier --write \"src/**/*.ts\"",
|
|
35
|
+
"format:check": "prettier --check \"src/**/*.ts\"",
|
|
36
|
+
"typecheck": "tsc --noEmit",
|
|
37
|
+
"typecheck:test": "tsc --noEmit --skipLibCheck --downlevelIteration --project tsconfig.test.json",
|
|
38
|
+
"typecheck:examples": "tsc --noEmit --project tsconfig.examples.json",
|
|
39
|
+
"clean": "rm -rf dist",
|
|
40
|
+
"prepublishOnly": "npm run clean && npm run build && npm run typecheck",
|
|
41
|
+
"prepack": "npm run build",
|
|
42
|
+
"changeset": "changeset",
|
|
43
|
+
"version": "changeset version",
|
|
44
|
+
"release": "npm run build && changeset publish",
|
|
45
|
+
"docs:api": "typedoc",
|
|
46
|
+
"docs:build": "npm run docs:api && npm run examples:validate",
|
|
47
|
+
"examples:run": "tsx",
|
|
48
|
+
"examples:validate": "tsx scripts/analyse-exports.ts",
|
|
49
|
+
"examples:run-all": "tsx examples/run-all.ts",
|
|
50
|
+
"test:examples": "vitest run tests/examples.test.ts",
|
|
51
|
+
"test:core": "vitest run tests/core",
|
|
52
|
+
"test:all": "npm run test:examples && npm run test:core"
|
|
53
|
+
},
|
|
54
|
+
"keywords": [
|
|
55
|
+
"web-scraping",
|
|
56
|
+
"crawler",
|
|
57
|
+
"spider",
|
|
58
|
+
"http",
|
|
59
|
+
"middleware",
|
|
60
|
+
"resumable",
|
|
61
|
+
"rate-limiting",
|
|
62
|
+
"typescript"
|
|
63
|
+
],
|
|
64
|
+
"author": "Jambudipa.io",
|
|
65
|
+
"license": "MIT",
|
|
66
|
+
"repository": {
|
|
67
|
+
"type": "git",
|
|
68
|
+
"url": "https://github.com/jambudipa/spider.git"
|
|
69
|
+
},
|
|
70
|
+
"bugs": {
|
|
71
|
+
"url": "https://github.com/jambudipa/spider/issues"
|
|
72
|
+
},
|
|
73
|
+
"homepage": "https://github.com/jambudipa/spider#readme",
|
|
74
|
+
"peerDependencies": {
|
|
75
|
+
"@types/tough-cookie": "^4.0.5",
|
|
76
|
+
"cheerio": "^1.1.2",
|
|
77
|
+
"domhandler": "^5.0.3",
|
|
78
|
+
"effect": "^3.16.16",
|
|
79
|
+
"tough-cookie": "^6.0.0",
|
|
80
|
+
"tslib": "^2.3.0"
|
|
81
|
+
},
|
|
82
|
+
"devDependencies": {
|
|
83
|
+
"@changesets/cli": "^2.29.5",
|
|
84
|
+
"@types/node": "^20.0.0",
|
|
85
|
+
"@typescript-eslint/eslint-plugin": "^7.0.0",
|
|
86
|
+
"@typescript-eslint/parser": "^7.0.0",
|
|
87
|
+
"@typescript-eslint/typescript-estree": "^8.38.0",
|
|
88
|
+
"@vitest/coverage-v8": "^1.6.0",
|
|
89
|
+
"@vitest/ui": "^1.6.0",
|
|
90
|
+
"@types/tough-cookie": "^4.0.5",
|
|
91
|
+
"cheerio": "^1.1.2",
|
|
92
|
+
"domhandler": "^5.0.3",
|
|
93
|
+
"effect": "^3.16.16",
|
|
94
|
+
"eslint": "^8.57.0",
|
|
95
|
+
"prettier": "^3.6.2",
|
|
96
|
+
"tough-cookie": "^6.0.0",
|
|
97
|
+
"tsx": "^4.20.3",
|
|
98
|
+
"typedoc": "^0.28.9",
|
|
99
|
+
"typedoc-plugin-markdown": "^4.8.0",
|
|
100
|
+
"tslib": "^2.3.0",
|
|
101
|
+
"typescript": "^5.4.0",
|
|
102
|
+
"vite": "^5.0.0",
|
|
103
|
+
"vitest": "^1.6.0"
|
|
104
|
+
},
|
|
105
|
+
"publishConfig": {
|
|
106
|
+
"access": "public"
|
|
107
|
+
}
|
|
108
|
+
}
|