@jambudipa/spider 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/README.md +45 -43
  2. package/dist/index.js +3 -3
  3. package/dist/index.js.map +1 -1
  4. package/package.json +9 -6
  5. package/dist/index.d.ts +0 -33
  6. package/dist/index.d.ts.map +0 -1
  7. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +0 -57
  8. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +0 -1
  9. package/dist/lib/Config/SpiderConfig.service.d.ts +0 -256
  10. package/dist/lib/Config/SpiderConfig.service.d.ts.map +0 -1
  11. package/dist/lib/HttpClient/CookieManager.d.ts +0 -44
  12. package/dist/lib/HttpClient/CookieManager.d.ts.map +0 -1
  13. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +0 -88
  14. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +0 -1
  15. package/dist/lib/HttpClient/SessionStore.d.ts +0 -82
  16. package/dist/lib/HttpClient/SessionStore.d.ts.map +0 -1
  17. package/dist/lib/HttpClient/TokenExtractor.d.ts +0 -58
  18. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +0 -1
  19. package/dist/lib/HttpClient/index.d.ts +0 -8
  20. package/dist/lib/HttpClient/index.d.ts.map +0 -1
  21. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +0 -166
  22. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +0 -1
  23. package/dist/lib/LinkExtractor/index.d.ts +0 -37
  24. package/dist/lib/LinkExtractor/index.d.ts.map +0 -1
  25. package/dist/lib/Logging/FetchLogger.d.ts +0 -8
  26. package/dist/lib/Logging/FetchLogger.d.ts.map +0 -1
  27. package/dist/lib/Logging/SpiderLogger.service.d.ts +0 -34
  28. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +0 -1
  29. package/dist/lib/Middleware/SpiderMiddleware.d.ts +0 -276
  30. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +0 -1
  31. package/dist/lib/PageData/PageData.d.ts +0 -28
  32. package/dist/lib/PageData/PageData.d.ts.map +0 -1
  33. package/dist/lib/Resumability/Resumability.service.d.ts +0 -176
  34. package/dist/lib/Resumability/Resumability.service.d.ts.map +0 -1
  35. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +0 -47
  36. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +0 -1
  37. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +0 -95
  38. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +0 -1
  39. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +0 -92
  40. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +0 -1
  41. package/dist/lib/Resumability/index.d.ts +0 -51
  42. package/dist/lib/Resumability/index.d.ts.map +0 -1
  43. package/dist/lib/Resumability/strategies.d.ts +0 -76
  44. package/dist/lib/Resumability/strategies.d.ts.map +0 -1
  45. package/dist/lib/Resumability/types.d.ts +0 -201
  46. package/dist/lib/Resumability/types.d.ts.map +0 -1
  47. package/dist/lib/Robots/Robots.service.d.ts +0 -78
  48. package/dist/lib/Robots/Robots.service.d.ts.map +0 -1
  49. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +0 -211
  50. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +0 -1
  51. package/dist/lib/Scraper/Scraper.service.d.ts +0 -123
  52. package/dist/lib/Scraper/Scraper.service.d.ts.map +0 -1
  53. package/dist/lib/Spider/Spider.service.d.ts +0 -194
  54. package/dist/lib/Spider/Spider.service.d.ts.map +0 -1
  55. package/dist/lib/StateManager/StateManager.service.d.ts +0 -68
  56. package/dist/lib/StateManager/StateManager.service.d.ts.map +0 -1
  57. package/dist/lib/StateManager/index.d.ts +0 -5
  58. package/dist/lib/StateManager/index.d.ts.map +0 -1
  59. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +0 -58
  60. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +0 -1
  61. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +0 -77
  62. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +0 -1
  63. package/dist/lib/WebScrapingEngine/index.d.ts +0 -5
  64. package/dist/lib/WebScrapingEngine/index.d.ts.map +0 -1
  65. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +0 -39
  66. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +0 -1
  67. package/dist/lib/api-facades.d.ts +0 -313
  68. package/dist/lib/api-facades.d.ts.map +0 -1
  69. package/dist/lib/errors.d.ts +0 -99
  70. package/dist/lib/errors.d.ts.map +0 -1
package/README.md CHANGED
@@ -1,14 +1,40 @@
1
- # @jambudipa.io/spider
1
+ # @jambudipa/spider
2
2
 
3
3
  A powerful, Effect.js-based web crawling framework for modern TypeScript applications. Built for type safety, composability, and enterprise-scale crawling operations.
4
4
 
5
+ ## 🏆 **Battle-Tested Against Real-World Scenarios**
6
+
7
+ **Spider successfully handles ALL 16 https://web-scraping.dev challenge scenarios** - the most comprehensive web scraping test suite available:
8
+
9
+ | ✅ Scenario | Description | Complexity |
10
+ |-------------|-------------|------------|
11
+ | **Static Paging** | Traditional pagination navigation | Basic |
12
+ | **Endless Scroll** | Infinite scroll content loading | Dynamic |
13
+ | **Button Loading** | Dynamic content via button clicks | Dynamic |
14
+ | **GraphQL Requests** | Background API data fetching | Advanced |
15
+ | **Hidden Data** | Extracting non-visible content | Intermediate |
16
+ | **Product Markup** | Structured data extraction | Intermediate |
17
+ | **Local Storage** | Browser storage interaction | Advanced |
18
+ | **Secret API Tokens** | Authentication handling | Security |
19
+ | **CSRF Protection** | Token-based security bypass | Security |
20
+ | **Cookie Authentication** | Session-based access control | Security |
21
+ | **PDF Downloads** | Binary file handling | Special |
22
+ | **Cookie Popups** | Modal interaction handling | Special |
23
+ | **New Tab Links** | Multi-tab navigation | Special |
24
+ | **Block Pages** | Anti-bot detection handling | Anti-Block |
25
+ | **Invalid Referer Blocking** | Header-based access control | Anti-Block |
26
+ | **Persistent Cookie Blocking** | Long-term blocking mechanisms | Anti-Block |
27
+
28
+ 🎯 **[View Live Test Results](https://github.com/jambudipa/spider/actions)** | 📊 **100% Test Pass Rate** | 🚀 **Production Ready**
29
+
5
30
  ## ✨ Key Features
6
31
 
7
32
  - **🔥 Effect.js Foundation**: Type-safe, functional composition with robust error handling
8
33
  - **⚡ High Performance**: Concurrent crawling with intelligent worker pool management
9
34
  - **🤖 Robots.txt Compliant**: Automatic robots.txt parsing and compliance checking
10
35
  - **🔄 Resumable Crawls**: State persistence and crash recovery capabilities
11
- - **🛡️ Middleware System**: Extensible middleware for rate limiting, authentication, and custom processing
36
+ - **🛡️ Anti-Bot Bypass**: Handles complex blocking mechanisms and security measures
37
+ - **🌐 Browser Automation**: Playwright integration for JavaScript-heavy sites
12
38
  - **📊 Built-in Monitoring**: Comprehensive logging and performance monitoring
13
39
  - **🎯 TypeScript First**: Full type safety with excellent IntelliSense support
14
40
 
@@ -49,18 +75,18 @@ Effect.runPromise(program.pipe(
49
75
 
50
76
  ### 🆕 New to Spider?
51
77
  - **[Getting Started Guide](./docs/guides/getting-started.md)** - Complete setup and first crawl
52
- - **[Configuration Guide](./docs/guides/configuration.md)** - Customise Spider for your needs
53
- - **[Basic Examples](./docs/examples/basic-crawling.md)** - Working examples to get you started
78
+ - **[Examples](./docs/examples/)** - Working examples to get you started
79
+ - **[Basic Configuration](./docs/guides/configuration.md)** - Configuration options
54
80
 
55
- ### 🔄 Migrating from Another Library?
56
- - **[Migration Guide](./docs/guides/migration.md)** - Move from Puppeteer, Playwright, or Scrapy
57
- - **[Advanced Patterns](./docs/guides/advanced-patterns.md)** - Implement sophisticated crawling logic
58
- - **[Performance Guide](./docs/guides/performance.md)** - Optimise for your use case
81
+ ### 🔄 Advanced Usage
82
+ - **[Browser Automation](./docs/guides/browser-automation.md)** - Handle dynamic content
83
+ - **[Anti-Bot Protection](./docs/guides/anti-bot.md)** - Bypass blocking mechanisms
84
+ - **[Security Handling](./docs/guides/security.md)** - Authentication and sessions
59
85
 
60
86
  ### 🏭 Building Production Systems?
61
- - **[Enterprise Patterns](./docs/examples/enterprise-patterns.md)** - Production-ready crawling solutions
62
- - **[Monitoring Guide](./docs/features/monitoring.md)** - Set up observability and alerting
87
+ - **[Performance Guide](./docs/guides/performance.md)** - Scale your crawling operations
63
88
  - **[API Reference](./docs/api/)** - Complete technical documentation
89
+ - **[Enterprise Patterns](./docs/examples/enterprise-patterns.md)** - Production-ready patterns
64
90
 
65
91
  ## 🛠️ Quick Configuration
66
92
 
@@ -83,7 +109,7 @@ const config = makeSpiderConfig({
83
109
  The spider can be configured for different scraping scenarios:
84
110
 
85
111
  ```typescript
86
- import { makeSpiderConfig } from '@jambudipa.io/spider';
112
+ import { makeSpiderConfig } from '@jambudipa/spider';
87
113
 
88
114
  const config = makeSpiderConfig({
89
115
  // Basic settings
@@ -118,7 +144,7 @@ import {
118
144
  LoggingMiddleware,
119
145
  RateLimitMiddleware,
120
146
  UserAgentMiddleware
121
- } from '@jambudipa.io/spider';
147
+ } from '@jambudipa/spider';
122
148
 
123
149
  const middlewares = new MiddlewareManager()
124
150
  .use(new LoggingMiddleware({ level: 'info' }))
@@ -142,7 +168,7 @@ import {
142
168
  SpiderService,
143
169
  ResumabilityService,
144
170
  FileStorageBackend
145
- } from '@jambudipa.io/spider';
171
+ } from '@jambudipa/spider';
146
172
  import { Effect, Layer } from 'effect';
147
173
 
148
174
  // Configure resumability with file storage
@@ -191,7 +217,7 @@ const program = Effect.gen(function* () {
191
217
  Extract and process links from pages:
192
218
 
193
219
  ```typescript
194
- import { LinkExtractorService } from '@jambudipa.io/spider';
220
+ import { LinkExtractorService } from '@jambudipa/spider';
195
221
 
196
222
  const program = Effect.gen(function* () {
197
223
  const linkExtractor = yield* LinkExtractorService;
@@ -260,7 +286,7 @@ const program = Effect.gen(function* () {
260
286
  The library uses Effect for comprehensive error handling:
261
287
 
262
288
  ```typescript
263
- import { NetworkError, ResponseError, RobotsTxtError } from '@jambudipa.io/spider';
289
+ import { NetworkError, ResponseError, RobotsTxtError } from '@jambudipa/spider';
264
290
 
265
291
  const program = Effect.gen(function* () {
266
292
  const spider = yield* SpiderService;
@@ -295,7 +321,7 @@ const program = Effect.gen(function* () {
295
321
  Create custom middleware for specific needs:
296
322
 
297
323
  ```typescript
298
- import { SpiderMiddleware, SpiderRequest, SpiderResponse } from '@jambudipa.io/spider';
324
+ import { SpiderMiddleware, SpiderRequest, SpiderResponse } from '@jambudipa/spider';
299
325
  import { Effect } from 'effect';
300
326
 
301
327
  class CustomAuthMiddleware implements SpiderMiddleware {
@@ -326,7 +352,7 @@ const middlewares = new MiddlewareManager()
326
352
  Monitor scraping performance:
327
353
 
328
354
  ```typescript
329
- import { WorkerHealthMonitorService } from '@jambudipa.io/spider';
355
+ import { WorkerHealthMonitorService } from '@jambudipa/spider';
330
356
 
331
357
  const program = Effect.gen(function* () {
332
358
  const healthMonitor = yield* WorkerHealthMonitorService;
@@ -347,18 +373,6 @@ const program = Effect.gen(function* () {
347
373
  });
348
374
  ```
349
375
 
350
- ## Contributing
351
-
352
- 1. Fork the repository
353
- 2. Create a feature branch: `git checkout -b feature/new-feature`
354
- 3. Make your changes
355
- 4. Add tests for new functionality
356
- 5. Run tests: `npm test`
357
- 6. Run linting: `npm run lint`
358
- 7. Commit changes: `git commit -am 'Add new feature'`
359
- 8. Push to branch: `git push origin feature/new-feature`
360
- 9. Submit a pull request
361
-
362
376
  ## Development
363
377
 
364
378
  ```bash
@@ -388,17 +402,6 @@ npm run format
388
402
 
389
403
  MIT License - see [LICENSE](LICENSE) file for details.
390
404
 
391
- ## Changelog
392
-
393
- ### 1.0.0
394
- - Initial standalone release
395
- - Migrated from monorepo structure
396
- - Full TypeScript support
397
- - Comprehensive middleware system
398
- - Resumable scraping functionality
399
- - Multiple storage backends
400
- - Rate limiting and performance monitoring
401
-
402
405
  ## 📚 Documentation
403
406
 
404
407
  Comprehensive documentation is available in the [`/docs`](./docs) directory:
@@ -406,7 +409,6 @@ Comprehensive documentation is available in the [`/docs`](./docs) directory:
406
409
  ### 🚀 Quick Links
407
410
  - **[Getting Started Guide](./docs/guides/getting-started.md)** - Installation, setup, and first crawl
408
411
  - **[API Reference](./docs/api/)** - Complete API documentation
409
- - **[Configuration Guide](./docs/guides/configuration.md)** - Configuration options and patterns
410
412
  - **[Examples](./docs/examples/)** - Working examples for common use cases
411
413
 
412
414
  ### 📖 Complete Documentation
@@ -417,10 +419,10 @@ Comprehensive documentation is available in the [`/docs`](./docs) directory:
417
419
 
418
420
  ## Support
419
421
 
420
- - [GitHub Issues](https://github.com/jambudipa-io/spider/issues)
422
+ - [GitHub Issues](https://github.com/jambudipa/spider/issues)
421
423
  - [Complete Documentation](./docs/)
422
424
  - [Working Examples](./docs/examples/)
423
425
 
424
426
  ---
425
427
 
426
- Built with ❤️ by [Jambudipa.io](https://jambudipa.io)
428
+ Built with ❤️ by [JAMBUDIPA](https://jambudipa.io)
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ import * as path from "path";
5
5
  import * as fs$1 from "fs/promises";
6
6
  import { CookieJar } from "tough-cookie";
7
7
  class SpiderConfig extends Effect.Service()(
8
- "@jambudipa.io/SpiderConfig",
8
+ "@jambudipa/spiderConfig",
9
9
  {
10
10
  effect: Effect.sync(() => makeSpiderConfig({}))
11
11
  }
@@ -1189,7 +1189,7 @@ class SpiderState extends Schema.Class("SpiderState")({
1189
1189
  }) {
1190
1190
  }
1191
1191
  class SpiderSchedulerService extends Effect.Service()(
1192
- "@jambudipa.io/SpiderSchedulerService",
1192
+ "@jambudipa/spiderSchedulerService",
1193
1193
  {
1194
1194
  effect: Effect.gen(function* () {
1195
1195
  const config = yield* SpiderConfig;
@@ -1365,7 +1365,7 @@ const SpiderScheduler_service = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Ob
1365
1365
  SpiderStateKey
1366
1366
  }, Symbol.toStringTag, { value: "Module" }));
1367
1367
  class SpiderService extends Effect.Service()(
1368
- "@jambudipa.io/Spider",
1368
+ "@jambudipa/spider",
1369
1369
  {
1370
1370
  effect: Effect.gen(function* () {
1371
1371
  const robots = yield* RobotsService;