@jambudipa/spider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +426 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4681 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
- package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
- package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
- package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
- package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
- package/dist/lib/HttpClient/index.d.ts +8 -0
- package/dist/lib/HttpClient/index.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/index.d.ts +37 -0
- package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
- package/dist/lib/Logging/FetchLogger.d.ts +8 -0
- package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
- package/dist/lib/PageData/PageData.d.ts +28 -0
- package/dist/lib/PageData/PageData.d.ts.map +1 -0
- package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
- package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/index.d.ts +51 -0
- package/dist/lib/Resumability/index.d.ts.map +1 -0
- package/dist/lib/Resumability/strategies.d.ts +76 -0
- package/dist/lib/Resumability/strategies.d.ts.map +1 -0
- package/dist/lib/Resumability/types.d.ts +201 -0
- package/dist/lib/Resumability/types.d.ts.map +1 -0
- package/dist/lib/Robots/Robots.service.d.ts +78 -0
- package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
- package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +194 -0
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
- package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
- package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
- package/dist/lib/StateManager/index.d.ts +5 -0
- package/dist/lib/StateManager/index.d.ts.map +1 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
- package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
- package/dist/lib/api-facades.d.ts +313 -0
- package/dist/lib/api-facades.d.ts.map +1 -0
- package/dist/lib/errors.d.ts +99 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/package.json +108 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Jambudipa.io

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

package/README.md
ADDED
@@ -0,0 +1,426 @@
# @jambudipa/spider

A powerful, Effect.js-based web crawling framework for modern TypeScript applications. Built for type safety, composability, and enterprise-scale crawling operations.

## ✨ Key Features

- **🔥 Effect.js Foundation**: Type-safe, functional composition with robust error handling
- **⚡ High Performance**: Concurrent crawling with intelligent worker pool management
- **🤖 Robots.txt Compliant**: Automatic robots.txt parsing and compliance checking
- **🔄 Resumable Crawls**: State persistence and crash recovery capabilities
- **🛡️ Middleware System**: Extensible middleware for rate limiting, authentication, and custom processing
- **📊 Built-in Monitoring**: Comprehensive logging and performance monitoring
- **🎯 TypeScript First**: Full type safety with excellent IntelliSense support

## 🚀 Getting Started

### Installation

```bash
npm install @jambudipa/spider effect
```

### Your First Crawl

```typescript
import { SpiderService } from '@jambudipa/spider'
import { Effect, Sink } from 'effect'

const program = Effect.gen(function* () {
  // Create spider instance
  const spider = yield* SpiderService

  // Set up result collection
  const collectSink = Sink.forEach(result =>
    Effect.sync(() => console.log(`Found: ${result.pageData.title}`))
  )

  // Start crawling
  yield* spider.crawl('https://example.com', collectSink)
})

// Run with default configuration
Effect.runPromise(program.pipe(
  Effect.provide(SpiderService.Default)
))
```

## 🎯 What's Next?

### 🆕 New to Spider?
- **[Getting Started Guide](./docs/guides/getting-started.md)** - Complete setup and first crawl
- **[Configuration Guide](./docs/guides/configuration.md)** - Customise Spider for your needs
- **[Basic Examples](./docs/examples/basic-crawling.md)** - Working examples to get you started

### 🔄 Migrating from Another Library?
- **[Migration Guide](./docs/guides/migration.md)** - Move from Puppeteer, Playwright, or Scrapy
- **[Advanced Patterns](./docs/guides/advanced-patterns.md)** - Implement sophisticated crawling logic
- **[Performance Guide](./docs/guides/performance.md)** - Optimise for your use case

### 🏭 Building Production Systems?
- **[Enterprise Patterns](./docs/examples/enterprise-patterns.md)** - Production-ready crawling solutions
- **[Monitoring Guide](./docs/features/monitoring.md)** - Set up observability and alerting
- **[API Reference](./docs/api/)** - Complete technical documentation

## 🛠️ Quick Configuration

```typescript
import { makeSpiderConfig } from '@jambudipa/spider'

const config = makeSpiderConfig({
  maxDepth: 3,
  maxPages: 100,
  maxConcurrentWorkers: 5,
  ignoreRobotsTxt: false, // Respect robots.txt
  requestDelayMs: 1000
})
```

## Core Concepts

### Spider Configuration

The spider can be configured for different scraping scenarios:

```typescript
import { makeSpiderConfig } from '@jambudipa/spider';

const config = makeSpiderConfig({
  // Basic settings
  maxDepth: 5,
  maxPages: 1000,
  respectRobotsTxt: true,

  // Rate limiting
  rateLimitDelay: 2000,
  maxConcurrentRequests: 3,

  // Content handling
  followRedirects: true,
  maxRedirects: 5,

  // Timeouts
  requestTimeout: 30000,

  // User agent
  userAgent: 'MyBot/1.0'
});
```

### Middleware System

Add custom processing with middleware:

```typescript
import {
  makeSpiderConfig,
  MiddlewareManager,
  LoggingMiddleware,
  RateLimitMiddleware,
  UserAgentMiddleware
} from '@jambudipa/spider';

const middlewares = new MiddlewareManager()
  .use(new LoggingMiddleware({ level: 'info' }))
  .use(new RateLimitMiddleware({ delay: 1000 }))
  .use(new UserAgentMiddleware({
    userAgent: 'MyBot/1.0 (+https://example.com/bot)'
  }));

// Use with spider configuration
const config = makeSpiderConfig({
  middleware: middlewares
});
```

### Resumable Scraping

Resume interrupted scraping sessions:

```typescript
import {
  SpiderService,
  ResumabilityService,
  FileStorageBackend
} from '@jambudipa/spider';
import { Effect, Layer } from 'effect';

// Configure resumability with file storage
const resumabilityLayer = Layer.succeed(
  ResumabilityService,
  ResumabilityService.of({
    strategy: 'hybrid',
    backend: new FileStorageBackend('./spider-state')
  })
);

const program = Effect.gen(function* () {
  const spider = yield* SpiderService;
  const resumability = yield* ResumabilityService;

  // Configure session
  const sessionKey = 'my-scraping-session';

  // Check for existing session
  const existingState = yield* resumability.restore(sessionKey);

  if (existingState) {
    console.log('Resuming previous session...');
    // Resume from saved state
    yield* spider.resumeFromState(existingState);
  }

  // Start or continue crawling
  const result = yield* spider.crawl({
    url: 'https://example.com',
    sessionKey,
    saveState: true
  });

  return result;
}).pipe(
  Effect.provide(Layer.mergeAll(
    SpiderService.Default,
    resumabilityLayer
  ))
);
```

### Link Extraction

Extract and process links from pages:

```typescript
import { LinkExtractorService } from '@jambudipa/spider';
import { Effect } from 'effect';

const program = Effect.gen(function* () {
  const linkExtractor = yield* LinkExtractorService;

  const result = yield* linkExtractor.extractLinks({
    html: '<html>...</html>',
    baseUrl: 'https://example.com',
    filters: {
      allowedDomains: ['example.com', 'sub.example.com'],
      excludePatterns: ['/admin', '/private']
    }
  });

  console.log(`Found ${result.links.length} links`);
  return result;
}).pipe(
  Effect.provide(LinkExtractorService.Default)
);
```

## API Reference

### Core Services

- **SpiderService**: Main spider service for web crawling
- **SpiderSchedulerService**: Manages crawling queue and prioritisation
- **LinkExtractorService**: Extracts and filters links from HTML content
- **ResumabilityService**: Handles state persistence and resumption
- **ScraperService**: Low-level HTTP scraping functionality

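Each service follows the same Effect pattern used throughout the examples above: `yield*` the service tag inside `Effect.gen`, then provide its layer when running. A minimal sketch combining two services follows; `SpiderService.Default` appears in the examples above, while `SpiderSchedulerService.Default` is an assumption here:

```typescript
import { SpiderService, SpiderSchedulerService } from '@jambudipa/spider';
import { Effect, Layer } from 'effect';

const program = Effect.gen(function* () {
  // Resolve services from the Effect context
  const spider = yield* SpiderService;
  const scheduler = yield* SpiderSchedulerService;
  // ...crawl with spider, inspect the queue via scheduler...
});

Effect.runPromise(program.pipe(
  Effect.provide(Layer.mergeAll(
    SpiderService.Default,
    SpiderSchedulerService.Default // assumption: a Default layer exists
  ))
));
```
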
### Configuration

- **SpiderConfig**: Main configuration interface
- **makeSpiderConfig()**: Factory function for creating configurations

### Middleware

- **MiddlewareManager**: Manages middleware chain
- **LoggingMiddleware**: Logs requests and responses
- **RateLimitMiddleware**: Implements rate limiting
- **UserAgentMiddleware**: Sets custom user agents
- **StatsMiddleware**: Collects scraping statistics

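These compose into a single chain via `MiddlewareManager.use()`, as in the Middleware System section above. A hedged sketch combining all four built-in middlewares; the `StatsMiddleware` constructor is not documented above, so the no-argument call is an assumption:

```typescript
import {
  MiddlewareManager,
  LoggingMiddleware,
  RateLimitMiddleware,
  UserAgentMiddleware,
  StatsMiddleware
} from '@jambudipa/spider';

const middlewares = new MiddlewareManager()
  .use(new LoggingMiddleware({ level: 'info' }))  // log each request/response
  .use(new RateLimitMiddleware({ delay: 1000 }))  // throttle requests
  .use(new UserAgentMiddleware({ userAgent: 'MyBot/1.0' }))
  .use(new StatsMiddleware());                    // assumption: no-argument constructor
```
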
### Storage Backends

- **FileStorageBackend**: File-based state storage
- **PostgresStorageBackend**: PostgreSQL storage (requires database)
- **RedisStorageBackend**: Redis storage (requires Redis server)

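Note that only `FileStorageBackend` is re-exported from the package root (see `dist/index.d.ts` below); the Postgres and Redis backends ship as separate modules under `lib/Resumability/backends/`. A minimal sketch wiring the file backend into the resumability layer from the Resumable Scraping section:

```typescript
import { ResumabilityService, FileStorageBackend } from '@jambudipa/spider';
import { Layer } from 'effect';

// Persist crawl state as files under ./spider-state
const resumabilityLayer = Layer.succeed(
  ResumabilityService,
  ResumabilityService.of({
    strategy: 'hybrid', // as in the Resumable Scraping example
    backend: new FileStorageBackend('./spider-state')
  })
);
```
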
## Configuration Options

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `maxDepth` | number | 3 | Maximum crawling depth |
| `maxPages` | number | 100 | Maximum pages to crawl |
| `respectRobotsTxt` | boolean | true | Follow robots.txt rules |
| `rateLimitDelay` | number | 1000 | Delay between requests (ms) |
| `maxConcurrentRequests` | number | 1 | Maximum concurrent requests |
| `requestTimeout` | number | 30000 | Request timeout (ms) |
| `followRedirects` | boolean | true | Follow HTTP redirects |
| `maxRedirects` | number | 5 | Maximum redirect hops |
| `userAgent` | string | Auto-generated | Custom user agent string |

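Every option in the table has a default, so `makeSpiderConfig` can be called with only the values you want to change, as the varying subsets in the examples above suggest. A small sketch that keeps the defaults and overrides just the crawl budget:

```typescript
import { makeSpiderConfig } from '@jambudipa/spider';

// Unlisted options keep their defaults from the table:
// respectRobotsTxt: true, rateLimitDelay: 1000, requestTimeout: 30000, ...
const config = makeSpiderConfig({
  maxDepth: 5,     // default: 3
  maxPages: 1000   // default: 100
});
```
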
## Error Handling

The library uses Effect for comprehensive error handling:

```typescript
import { SpiderService, NetworkError, ResponseError, RobotsTxtError } from '@jambudipa/spider';
import { Effect } from 'effect';

const program = Effect.gen(function* () {
  const spider = yield* SpiderService;

  const result = yield* spider.crawl({
    url: 'https://example.com'
  }).pipe(
    Effect.catchTags({
      NetworkError: (error) => {
        console.log('Network issue:', error.message);
        return Effect.succeed(null);
      },
      ResponseError: (error) => {
        console.log('HTTP error:', error.statusCode);
        return Effect.succeed(null);
      },
      RobotsTxtError: (error) => {
        console.log('Robots.txt blocked:', error.message);
        return Effect.succeed(null);
      }
    })
  );

  return result;
});
```

## Advanced Usage

### Custom Middleware

Create custom middleware for specific needs:

```typescript
import {
  SpiderMiddleware,
  SpiderRequest,
  SpiderResponse,
  MiddlewareManager
} from '@jambudipa/spider';
import { Effect } from 'effect';

class CustomAuthMiddleware implements SpiderMiddleware {
  constructor(private apiKey: string) {}

  processRequest(request: SpiderRequest): Effect.Effect<SpiderRequest, never> {
    return Effect.succeed({
      ...request,
      headers: {
        ...request.headers,
        'Authorization': `Bearer ${this.apiKey}`
      }
    });
  }

  processResponse(response: SpiderResponse): Effect.Effect<SpiderResponse, never> {
    return Effect.succeed(response);
  }
}

// Use in middleware chain
const middlewares = new MiddlewareManager()
  .use(new CustomAuthMiddleware('your-api-key'));
```

### Performance Monitoring

Monitor scraping performance:

```typescript
import { WorkerHealthMonitorService } from '@jambudipa/spider';
import { Effect } from 'effect';

const program = Effect.gen(function* () {
  const healthMonitor = yield* WorkerHealthMonitorService;

  // Start monitoring
  yield* healthMonitor.startMonitoring();

  // Your scraping code here...

  // Get health metrics
  const metrics = yield* healthMonitor.getMetrics();

  console.log('Performance metrics:', {
    requestsPerMinute: metrics.requestsPerMinute,
    averageResponseTime: metrics.averageResponseTime,
    errorRate: metrics.errorRate
  });
});
```

## Contributing

1. Fork the repository
2. Create a feature branch: `git checkout -b feature/new-feature`
3. Make your changes
4. Add tests for new functionality
5. Run tests: `npm test`
6. Run linting: `npm run lint`
7. Commit changes: `git commit -am 'Add new feature'`
8. Push to branch: `git push origin feature/new-feature`
9. Submit a pull request

## Development

```bash
# Install dependencies
npm install

# Build the package
npm run build

# Run tests
npm test

# Run tests with coverage
npm run test:coverage

# Type checking
npm run typecheck

# Linting
npm run lint

# Format code
npm run format
```

## License

MIT License - see [LICENSE](LICENSE) file for details.

## Changelog

### 0.1.0
- Initial standalone release
- Migrated from monorepo structure
- Full TypeScript support
- Comprehensive middleware system
- Resumable scraping functionality
- Multiple storage backends
- Rate limiting and performance monitoring

## 📚 Documentation

Comprehensive documentation is available in the [`/docs`](./docs) directory:

### 🚀 Quick Links
- **[Getting Started Guide](./docs/guides/getting-started.md)** - Installation, setup, and first crawl
- **[API Reference](./docs/api/)** - Complete API documentation
- **[Configuration Guide](./docs/guides/configuration.md)** - Configuration options and patterns
- **[Examples](./docs/examples/)** - Working examples for common use cases

### 📖 Complete Documentation
- **[Documentation Index](./docs/README.md)** - Overview of all available documentation
- **[User Guides](./docs/guides/)** - Step-by-step tutorials and best practices
- **[Feature Documentation](./docs/features/)** - Deep dives into key capabilities
- **[Advanced Examples](./docs/examples/)** - Real-world usage patterns

## Support

- [GitHub Issues](https://github.com/jambudipa-io/spider/issues)
- [Complete Documentation](./docs/)
- [Working Examples](./docs/examples/)

---

Built with ❤️ by [Jambudipa.io](https://jambudipa.io)

package/dist/index.d.ts
ADDED
@@ -0,0 +1,33 @@
export type { ISpider, ISpiderScheduler, IMiddlewareManager, IRateLimitMiddleware, ILoggingMiddleware, IUserAgentMiddleware, IStatsMiddleware, } from './lib/api-facades.js';
export * from './lib/Spider/Spider.service.js';
export * from './lib/Robots/Robots.service.js';
export * from './lib/Scraper/Scraper.service.js';
export * from './lib/PageData/PageData.js';
export type { SpiderConfigOptions, SpiderConfigService, } from './lib/Config/SpiderConfig.service.js';
export { SpiderConfig, makeSpiderConfig, } from './lib/Config/SpiderConfig.service.js';
export type { IUrlDeduplicator } from './lib/UrlDeduplicator/UrlDeduplicator.service.js';
export { UrlDeduplicatorService } from './lib/UrlDeduplicator/UrlDeduplicator.service.js';
export type { StatePersistence } from './lib/Scheduler/SpiderScheduler.service.js';
export { SpiderSchedulerService, SpiderStateKey, PriorityRequest, SpiderState, } from './lib/Scheduler/SpiderScheduler.service.js';
export type { SpiderMiddleware, SpiderRequest, SpiderResponse, } from './lib/Middleware/SpiderMiddleware.js';
export { MiddlewareManager, RateLimitMiddleware, LoggingMiddleware, UserAgentMiddleware, StatsMiddleware, } from './lib/Middleware/SpiderMiddleware.js';
export type { LinkExtractorConfig, LinkExtractionResult, LinkExtractorServiceInterface, } from './lib/LinkExtractor/LinkExtractor.service.js';
export { LinkExtractorService, LinkExtractorServiceLayer, LinkExtractionError, } from './lib/LinkExtractor/LinkExtractor.service.js';
export type { CrawlResult, CrawlTask, SpiderLinkExtractionOptions, } from './lib/Spider/Spider.service.js';
export type { PersistenceStrategy, StateOperation, StorageBackend, StorageCapabilities, HybridPersistenceConfig, } from './lib/Resumability/types.js';
export type { ResumabilityConfig } from './lib/Resumability/Resumability.service.js';
export { StateDelta, PersistenceError as ResumabilityError, DEFAULT_HYBRID_CONFIG, } from './lib/Resumability/types.js';
export { ResumabilityService, ResumabilityConfigs, createStateOperation, } from './lib/Resumability/Resumability.service.js';
export { FullStatePersistence, DeltaPersistence, HybridPersistence, } from './lib/Resumability/strategies.js';
export { FileStorageBackend } from './lib/Resumability/backends/FileStorageBackend.js';
export { NetworkError, ResponseError, RobotsTxtError, ConfigurationError, MiddlewareError, FileSystemError, PersistenceError, } from './lib/errors.js';
export type { SpiderError } from './lib/errors.js';
export type { SpiderLogEvent, SpiderLogger, } from './lib/Logging/SpiderLogger.service.js';
export { SpiderLogger as SpiderLoggerTag, makeSpiderLogger, SpiderLoggerLive, } from './lib/Logging/SpiderLogger.service.js';
export type { CookieManagerService, EnhancedHttpClientService, HttpRequestOptions, HttpResponse, Session, Credentials, SessionStoreService, TokenInfo, TokenExtractorService, } from './lib/HttpClient/index.js';
export { CookieManager, makeCookieManager, CookieManagerLive, EnhancedHttpClient, makeEnhancedHttpClient, EnhancedHttpClientLive, SessionStore, makeSessionStore, SessionStoreLive, TokenExtractor, makeTokenExtractor, TokenExtractorLive, } from './lib/HttpClient/index.js';
export type { Token, StateManagerService } from './lib/StateManager/index.js';
export { TokenType, StateManager, makeStateManager, StateManagerLive, } from './lib/StateManager/index.js';
export type { LoginCredentials, ScrapingSession, WebScrapingEngineService, } from './lib/WebScrapingEngine/index.js';
export { WebScrapingEngine, makeWebScrapingEngine, WebScrapingEngineLive, } from './lib/WebScrapingEngine/index.js';
//# sourceMappingURL=index.d.ts.map

package/dist/index.d.ts.map
ADDED
@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,YAAY,EACV,OAAO,EACP,gBAAgB,EAChB,kBAAkB,EAClB,oBAAoB,EACpB,kBAAkB,EAClB,oBAAoB,EACpB,gBAAgB,GACjB,MAAM,sBAAsB,CAAC;AAG9B,cAAc,gCAAgC,CAAC;AAC/C,cAAc,gCAAgC,CAAC;AAC/C,cAAc,kCAAkC,CAAC;AACjD,cAAc,4BAA4B,CAAC;AAG3C,YAAY,EACV,mBAAmB,EACnB,mBAAmB,GACpB,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,YAAY,EACZ,gBAAgB,GACjB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EAAE,gBAAgB,EAAE,MAAM,kDAAkD,CAAC;AACzF,OAAO,EAAE,sBAAsB,EAAE,MAAM,kDAAkD,CAAC;AAG1F,YAAY,EAAE,gBAAgB,EAAE,MAAM,4CAA4C,CAAC;AACnF,OAAO,EACL,sBAAsB,EACtB,cAAc,EACd,eAAe,EACf,WAAW,GACZ,MAAM,4CAA4C,CAAC;AAGpD,YAAY,EACV,gBAAgB,EAChB,aAAa,EACb,cAAc,GACf,MAAM,sCAAsC,CAAC;AAC9C,OAAO,EACL,iBAAiB,EACjB,mBAAmB,EACnB,iBAAiB,EACjB,mBAAmB,EACnB,eAAe,GAChB,MAAM,sCAAsC,CAAC;AAG9C,YAAY,EACV,mBAAmB,EACnB,oBAAoB,EACpB,6BAA6B,GAC9B,MAAM,8CAA8C,CAAC;AACtD,OAAO,EACL,oBAAoB,EACpB,yBAAyB,EACzB,mBAAmB,GACpB,MAAM,8CAA8C,CAAC;AAGtD,YAAY,EACV,WAAW,EACX,SAAS,EACT,2BAA2B,GAC5B,MAAM,gCAAgC,CAAC;AAGxC,YAAY,EACV,mBAAmB,EACnB,cAAc,EACd,cAAc,EACd,mBAAmB,EACnB,uBAAuB,GACxB,MAAM,6BAA6B,CAAC;AACrC,YAAY,EAAE,kBAAkB,EAAE,MAAM,4CAA4C,CAAC;AACrF,OAAO,EACL,UAAU,EACV,gBAAgB,IAAI,iBAAiB,EACrC,qBAAqB,GACtB,MAAM,6BAA6B,CAAC;AACrC,OAAO,EACL,mBAAmB,EACnB,mBAAmB,EACnB,oBAAoB,GACrB,MAAM,4CAA4C,CAAC;AACpD,OAAO,EACL,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAAE,kBAAkB,EAAE,MAAM,mDAAmD,CAAC;AAGvF,OAAO,EACL,YAAY,EACZ,aAAa,EACb,cAAc,EACd,kBAAkB,EAClB,eAAe,EACf,eAAe,EACf,gBAAgB,GACjB,MAAM,iBAAiB,CAAC;AACzB,YAAY,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAGnD,YAAY,EACV,cAAc,EACd,YAAY,GACb,MAAM,uCAAuC,CAAC;AAC/C,OAAO,EACL,YAAY,IAAI,eAAe,EAC/B,gBAAgB,EAChB,gBAAgB,GACjB,MAAM,uCAAuC,CAAC;AAG/C,YAAY,EACV,oBAAoB,EACpB,yBAAyB,EACzB,kBAAkB,EAClB,YAAY,EACZ,OAAO,EACP,WAAW,EACX,mBAAmB,EACnB,SAAS,EACT,qBAAqB,GACtB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EACL,aAAa,EACb,iBAAiB,EACjB,iBAAiB,EACjB,kBAAkB,EAClB,sBAAsB,EACtB,sBAAsB,EACtB,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,EAChB,cAAc,EACd,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,2BAA2B,CAAC;AAGnC,YAAY,EAAE,KAAK,EAAE,mBAAmB,EAAE,MAAM,6BAA6B,CAAC;AAC9E,OAAO,EACL,SAAS,EACT,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,GACjB,MAAM,6BAA6B,CAAC;AAGrC,YAAY,EACV,gBAAgB,EAChB,eAAe,EACf,wBAAwB,GACzB,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EACL,iBAAiB,EACjB,qBAAqB,EACrB,qBAAqB,GACtB,MAAM,kCAAkC,CAAC"}