@jambudipa/spider 0.1.0 → 0.2.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
- package/README.md +45 -43
- package/dist/index.js +3 -3
- package/dist/index.js.map +1 -1
- package/package.json +9 -6
- package/dist/index.d.ts +0 -33
- package/dist/index.d.ts.map +0 -1
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +0 -57
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +0 -1
- package/dist/lib/Config/SpiderConfig.service.d.ts +0 -256
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +0 -1
- package/dist/lib/HttpClient/CookieManager.d.ts +0 -44
- package/dist/lib/HttpClient/CookieManager.d.ts.map +0 -1
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +0 -88
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +0 -1
- package/dist/lib/HttpClient/SessionStore.d.ts +0 -82
- package/dist/lib/HttpClient/SessionStore.d.ts.map +0 -1
- package/dist/lib/HttpClient/TokenExtractor.d.ts +0 -58
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +0 -1
- package/dist/lib/HttpClient/index.d.ts +0 -8
- package/dist/lib/HttpClient/index.d.ts.map +0 -1
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +0 -166
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +0 -1
- package/dist/lib/LinkExtractor/index.d.ts +0 -37
- package/dist/lib/LinkExtractor/index.d.ts.map +0 -1
- package/dist/lib/Logging/FetchLogger.d.ts +0 -8
- package/dist/lib/Logging/FetchLogger.d.ts.map +0 -1
- package/dist/lib/Logging/SpiderLogger.service.d.ts +0 -34
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +0 -1
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +0 -276
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +0 -1
- package/dist/lib/PageData/PageData.d.ts +0 -28
- package/dist/lib/PageData/PageData.d.ts.map +0 -1
- package/dist/lib/Resumability/Resumability.service.d.ts +0 -176
- package/dist/lib/Resumability/Resumability.service.d.ts.map +0 -1
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +0 -47
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +0 -1
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +0 -95
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +0 -1
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +0 -92
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +0 -1
- package/dist/lib/Resumability/index.d.ts +0 -51
- package/dist/lib/Resumability/index.d.ts.map +0 -1
- package/dist/lib/Resumability/strategies.d.ts +0 -76
- package/dist/lib/Resumability/strategies.d.ts.map +0 -1
- package/dist/lib/Resumability/types.d.ts +0 -201
- package/dist/lib/Resumability/types.d.ts.map +0 -1
- package/dist/lib/Robots/Robots.service.d.ts +0 -78
- package/dist/lib/Robots/Robots.service.d.ts.map +0 -1
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +0 -211
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +0 -1
- package/dist/lib/Scraper/Scraper.service.d.ts +0 -123
- package/dist/lib/Scraper/Scraper.service.d.ts.map +0 -1
- package/dist/lib/Spider/Spider.service.d.ts +0 -194
- package/dist/lib/Spider/Spider.service.d.ts.map +0 -1
- package/dist/lib/StateManager/StateManager.service.d.ts +0 -68
- package/dist/lib/StateManager/StateManager.service.d.ts.map +0 -1
- package/dist/lib/StateManager/index.d.ts +0 -5
- package/dist/lib/StateManager/index.d.ts.map +0 -1
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +0 -58
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +0 -1
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +0 -77
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +0 -1
- package/dist/lib/WebScrapingEngine/index.d.ts +0 -5
- package/dist/lib/WebScrapingEngine/index.d.ts.map +0 -1
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +0 -39
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +0 -1
- package/dist/lib/api-facades.d.ts +0 -313
- package/dist/lib/api-facades.d.ts.map +0 -1
- package/dist/lib/errors.d.ts +0 -99
- package/dist/lib/errors.d.ts.map +0 -1
package/README.md
CHANGED
|
@@ -1,14 +1,40 @@
|
|
|
1
|
-
# @jambudipa
|
|
1
|
+
# @jambudipa/spider
|
|
2
2
|
|
|
3
3
|
A powerful, Effect.js-based web crawling framework for modern TypeScript applications. Built for type safety, composability, and enterprise-scale crawling operations.
|
|
4
4
|
|
|
5
|
+
## 🏆 **Battle-Tested Against Real-World Scenarios**
|
|
6
|
+
|
|
7
|
+
**Spider successfully handles ALL 16 https://web-scraping.dev challenge scenarios** - the most comprehensive web scraping test suite available:
|
|
8
|
+
|
|
9
|
+
| ✅ Scenario | Description | Complexity |
|
|
10
|
+
|-------------|-------------|------------|
|
|
11
|
+
| **Static Paging** | Traditional pagination navigation | Basic |
|
|
12
|
+
| **Endless Scroll** | Infinite scroll content loading | Dynamic |
|
|
13
|
+
| **Button Loading** | Dynamic content via button clicks | Dynamic |
|
|
14
|
+
| **GraphQL Requests** | Background API data fetching | Advanced |
|
|
15
|
+
| **Hidden Data** | Extracting non-visible content | Intermediate |
|
|
16
|
+
| **Product Markup** | Structured data extraction | Intermediate |
|
|
17
|
+
| **Local Storage** | Browser storage interaction | Advanced |
|
|
18
|
+
| **Secret API Tokens** | Authentication handling | Security |
|
|
19
|
+
| **CSRF Protection** | Token-based security bypass | Security |
|
|
20
|
+
| **Cookie Authentication** | Session-based access control | Security |
|
|
21
|
+
| **PDF Downloads** | Binary file handling | Special |
|
|
22
|
+
| **Cookie Popups** | Modal interaction handling | Special |
|
|
23
|
+
| **New Tab Links** | Multi-tab navigation | Special |
|
|
24
|
+
| **Block Pages** | Anti-bot detection handling | Anti-Block |
|
|
25
|
+
| **Invalid Referer Blocking** | Header-based access control | Anti-Block |
|
|
26
|
+
| **Persistent Cookie Blocking** | Long-term blocking mechanisms | Anti-Block |
|
|
27
|
+
|
|
28
|
+
🎯 **[View Live Test Results](https://github.com/jambudipa/spider/actions)** | 📊 **100% Test Pass Rate** | 🚀 **Production Ready**
|
|
29
|
+
|
|
5
30
|
## ✨ Key Features
|
|
6
31
|
|
|
7
32
|
- **🔥 Effect.js Foundation**: Type-safe, functional composition with robust error handling
|
|
8
33
|
- **⚡ High Performance**: Concurrent crawling with intelligent worker pool management
|
|
9
34
|
- **🤖 Robots.txt Compliant**: Automatic robots.txt parsing and compliance checking
|
|
10
35
|
- **🔄 Resumable Crawls**: State persistence and crash recovery capabilities
|
|
11
|
-
- **🛡️
|
|
36
|
+
- **🛡️ Anti-Bot Bypass**: Handles complex blocking mechanisms and security measures
|
|
37
|
+
- **🌐 Browser Automation**: Playwright integration for JavaScript-heavy sites
|
|
12
38
|
- **📊 Built-in Monitoring**: Comprehensive logging and performance monitoring
|
|
13
39
|
- **🎯 TypeScript First**: Full type safety with excellent IntelliSense support
|
|
14
40
|
|
|
@@ -49,18 +75,18 @@ Effect.runPromise(program.pipe(
|
|
|
49
75
|
|
|
50
76
|
### 🆕 New to Spider?
|
|
51
77
|
- **[Getting Started Guide](./docs/guides/getting-started.md)** - Complete setup and first crawl
|
|
52
|
-
- **[
|
|
53
|
-
- **[Basic
|
|
78
|
+
- **[Examples](./docs/examples/)** - Working examples to get you started
|
|
79
|
+
- **[Basic Configuration](./docs/guides/configuration.md)** - Configuration options
|
|
54
80
|
|
|
55
|
-
### 🔄
|
|
56
|
-
- **[
|
|
57
|
-
- **[
|
|
58
|
-
- **[
|
|
81
|
+
### 🔄 Advanced Usage
|
|
82
|
+
- **[Browser Automation](./docs/guides/browser-automation.md)** - Handle dynamic content
|
|
83
|
+
- **[Anti-Bot Protection](./docs/guides/anti-bot.md)** - Bypass blocking mechanisms
|
|
84
|
+
- **[Security Handling](./docs/guides/security.md)** - Authentication and sessions
|
|
59
85
|
|
|
60
86
|
### 🏭 Building Production Systems?
|
|
61
|
-
- **[
|
|
62
|
-
- **[Monitoring Guide](./docs/features/monitoring.md)** - Set up observability and alerting
|
|
87
|
+
- **[Performance Guide](./docs/guides/performance.md)** - Scale your crawling operations
|
|
63
88
|
- **[API Reference](./docs/api/)** - Complete technical documentation
|
|
89
|
+
- **[Enterprise Patterns](./docs/examples/enterprise-patterns.md)** - Production-ready patterns
|
|
64
90
|
|
|
65
91
|
## 🛠️ Quick Configuration
|
|
66
92
|
|
|
@@ -83,7 +109,7 @@ const config = makeSpiderConfig({
|
|
|
83
109
|
The spider can be configured for different scraping scenarios:
|
|
84
110
|
|
|
85
111
|
```typescript
|
|
86
|
-
import { makeSpiderConfig } from '@jambudipa
|
|
112
|
+
import { makeSpiderConfig } from '@jambudipa/spider';
|
|
87
113
|
|
|
88
114
|
const config = makeSpiderConfig({
|
|
89
115
|
// Basic settings
|
|
@@ -118,7 +144,7 @@ import {
|
|
|
118
144
|
LoggingMiddleware,
|
|
119
145
|
RateLimitMiddleware,
|
|
120
146
|
UserAgentMiddleware
|
|
121
|
-
} from '@jambudipa
|
|
147
|
+
} from '@jambudipa/spider';
|
|
122
148
|
|
|
123
149
|
const middlewares = new MiddlewareManager()
|
|
124
150
|
.use(new LoggingMiddleware({ level: 'info' }))
|
|
@@ -142,7 +168,7 @@ import {
|
|
|
142
168
|
SpiderService,
|
|
143
169
|
ResumabilityService,
|
|
144
170
|
FileStorageBackend
|
|
145
|
-
} from '@jambudipa
|
|
171
|
+
} from '@jambudipa/spider';
|
|
146
172
|
import { Effect, Layer } from 'effect';
|
|
147
173
|
|
|
148
174
|
// Configure resumability with file storage
|
|
@@ -191,7 +217,7 @@ const program = Effect.gen(function* () {
|
|
|
191
217
|
Extract and process links from pages:
|
|
192
218
|
|
|
193
219
|
```typescript
|
|
194
|
-
import { LinkExtractorService } from '@jambudipa
|
|
220
|
+
import { LinkExtractorService } from '@jambudipa/spider';
|
|
195
221
|
|
|
196
222
|
const program = Effect.gen(function* () {
|
|
197
223
|
const linkExtractor = yield* LinkExtractorService;
|
|
@@ -260,7 +286,7 @@ const program = Effect.gen(function* () {
|
|
|
260
286
|
The library uses Effect for comprehensive error handling:
|
|
261
287
|
|
|
262
288
|
```typescript
|
|
263
|
-
import { NetworkError, ResponseError, RobotsTxtError } from '@jambudipa
|
|
289
|
+
import { NetworkError, ResponseError, RobotsTxtError } from '@jambudipa/spider';
|
|
264
290
|
|
|
265
291
|
const program = Effect.gen(function* () {
|
|
266
292
|
const spider = yield* SpiderService;
|
|
@@ -295,7 +321,7 @@ const program = Effect.gen(function* () {
|
|
|
295
321
|
Create custom middleware for specific needs:
|
|
296
322
|
|
|
297
323
|
```typescript
|
|
298
|
-
import { SpiderMiddleware, SpiderRequest, SpiderResponse } from '@jambudipa
|
|
324
|
+
import { SpiderMiddleware, SpiderRequest, SpiderResponse } from '@jambudipa/spider';
|
|
299
325
|
import { Effect } from 'effect';
|
|
300
326
|
|
|
301
327
|
class CustomAuthMiddleware implements SpiderMiddleware {
|
|
@@ -326,7 +352,7 @@ const middlewares = new MiddlewareManager()
|
|
|
326
352
|
Monitor scraping performance:
|
|
327
353
|
|
|
328
354
|
```typescript
|
|
329
|
-
import { WorkerHealthMonitorService } from '@jambudipa
|
|
355
|
+
import { WorkerHealthMonitorService } from '@jambudipa/spider';
|
|
330
356
|
|
|
331
357
|
const program = Effect.gen(function* () {
|
|
332
358
|
const healthMonitor = yield* WorkerHealthMonitorService;
|
|
@@ -347,18 +373,6 @@ const program = Effect.gen(function* () {
|
|
|
347
373
|
});
|
|
348
374
|
```
|
|
349
375
|
|
|
350
|
-
## Contributing
|
|
351
|
-
|
|
352
|
-
1. Fork the repository
|
|
353
|
-
2. Create a feature branch: `git checkout -b feature/new-feature`
|
|
354
|
-
3. Make your changes
|
|
355
|
-
4. Add tests for new functionality
|
|
356
|
-
5. Run tests: `npm test`
|
|
357
|
-
6. Run linting: `npm run lint`
|
|
358
|
-
7. Commit changes: `git commit -am 'Add new feature'`
|
|
359
|
-
8. Push to branch: `git push origin feature/new-feature`
|
|
360
|
-
9. Submit a pull request
|
|
361
|
-
|
|
362
376
|
## Development
|
|
363
377
|
|
|
364
378
|
```bash
|
|
@@ -388,17 +402,6 @@ npm run format
|
|
|
388
402
|
|
|
389
403
|
MIT License - see [LICENSE](LICENSE) file for details.
|
|
390
404
|
|
|
391
|
-
## Changelog
|
|
392
|
-
|
|
393
|
-
### 1.0.0
|
|
394
|
-
- Initial standalone release
|
|
395
|
-
- Migrated from monorepo structure
|
|
396
|
-
- Full TypeScript support
|
|
397
|
-
- Comprehensive middleware system
|
|
398
|
-
- Resumable scraping functionality
|
|
399
|
-
- Multiple storage backends
|
|
400
|
-
- Rate limiting and performance monitoring
|
|
401
|
-
|
|
402
405
|
## 📚 Documentation
|
|
403
406
|
|
|
404
407
|
Comprehensive documentation is available in the [`/docs`](./docs) directory:
|
|
@@ -406,7 +409,6 @@ Comprehensive documentation is available in the [`/docs`](./docs) directory:
|
|
|
406
409
|
### 🚀 Quick Links
|
|
407
410
|
- **[Getting Started Guide](./docs/guides/getting-started.md)** - Installation, setup, and first crawl
|
|
408
411
|
- **[API Reference](./docs/api/)** - Complete API documentation
|
|
409
|
-
- **[Configuration Guide](./docs/guides/configuration.md)** - Configuration options and patterns
|
|
410
412
|
- **[Examples](./docs/examples/)** - Working examples for common use cases
|
|
411
413
|
|
|
412
414
|
### 📖 Complete Documentation
|
|
@@ -417,10 +419,10 @@ Comprehensive documentation is available in the [`/docs`](./docs) directory:
|
|
|
417
419
|
|
|
418
420
|
## Support
|
|
419
421
|
|
|
420
|
-
- [GitHub Issues](https://github.com/jambudipa
|
|
422
|
+
- [GitHub Issues](https://github.com/jambudipa/spider/issues)
|
|
421
423
|
- [Complete Documentation](./docs/)
|
|
422
424
|
- [Working Examples](./docs/examples/)
|
|
423
425
|
|
|
424
426
|
---
|
|
425
427
|
|
|
426
|
-
Built with ❤️ by [
|
|
428
|
+
Built with ❤️ by [JAMBUDIPA](https://jambudipa.io)
|
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import * as path from "path";
|
|
|
5
5
|
import * as fs$1 from "fs/promises";
|
|
6
6
|
import { CookieJar } from "tough-cookie";
|
|
7
7
|
class SpiderConfig extends Effect.Service()(
|
|
8
|
-
"@jambudipa
|
|
8
|
+
"@jambudipa/spiderConfig",
|
|
9
9
|
{
|
|
10
10
|
effect: Effect.sync(() => makeSpiderConfig({}))
|
|
11
11
|
}
|
|
@@ -1189,7 +1189,7 @@ class SpiderState extends Schema.Class("SpiderState")({
|
|
|
1189
1189
|
}) {
|
|
1190
1190
|
}
|
|
1191
1191
|
class SpiderSchedulerService extends Effect.Service()(
|
|
1192
|
-
"@jambudipa
|
|
1192
|
+
"@jambudipa/spiderSchedulerService",
|
|
1193
1193
|
{
|
|
1194
1194
|
effect: Effect.gen(function* () {
|
|
1195
1195
|
const config = yield* SpiderConfig;
|
|
@@ -1365,7 +1365,7 @@ const SpiderScheduler_service = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Ob
|
|
|
1365
1365
|
SpiderStateKey
|
|
1366
1366
|
}, Symbol.toStringTag, { value: "Module" }));
|
|
1367
1367
|
class SpiderService extends Effect.Service()(
|
|
1368
|
-
"@jambudipa
|
|
1368
|
+
"@jambudipa/spider",
|
|
1369
1369
|
{
|
|
1370
1370
|
effect: Effect.gen(function* () {
|
|
1371
1371
|
const robots = yield* RobotsService;
|