@jambudipa/spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +426 -0
  3. package/dist/index.d.ts +33 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +4681 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
  8. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
  9. package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
  10. package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
  11. package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
  12. package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
  13. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
  14. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
  15. package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
  16. package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
  17. package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
  18. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
  19. package/dist/lib/HttpClient/index.d.ts +8 -0
  20. package/dist/lib/HttpClient/index.d.ts.map +1 -0
  21. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
  22. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
  23. package/dist/lib/LinkExtractor/index.d.ts +37 -0
  24. package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
  25. package/dist/lib/Logging/FetchLogger.d.ts +8 -0
  26. package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
  27. package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
  28. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
  29. package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
  30. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
  31. package/dist/lib/PageData/PageData.d.ts +28 -0
  32. package/dist/lib/PageData/PageData.d.ts.map +1 -0
  33. package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
  34. package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
  35. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
  36. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
  37. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
  38. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
  39. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
  40. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
  41. package/dist/lib/Resumability/index.d.ts +51 -0
  42. package/dist/lib/Resumability/index.d.ts.map +1 -0
  43. package/dist/lib/Resumability/strategies.d.ts +76 -0
  44. package/dist/lib/Resumability/strategies.d.ts.map +1 -0
  45. package/dist/lib/Resumability/types.d.ts +201 -0
  46. package/dist/lib/Resumability/types.d.ts.map +1 -0
  47. package/dist/lib/Robots/Robots.service.d.ts +78 -0
  48. package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
  49. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
  50. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
  51. package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
  52. package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
  53. package/dist/lib/Spider/Spider.service.d.ts +194 -0
  54. package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
  55. package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
  56. package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
  57. package/dist/lib/StateManager/index.d.ts +5 -0
  58. package/dist/lib/StateManager/index.d.ts.map +1 -0
  59. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
  60. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
  61. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
  62. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
  63. package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
  64. package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
  65. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
  66. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
  67. package/dist/lib/api-facades.d.ts +313 -0
  68. package/dist/lib/api-facades.d.ts.map +1 -0
  69. package/dist/lib/errors.d.ts +99 -0
  70. package/dist/lib/errors.d.ts.map +1 -0
  71. package/package.json +108 -0
@@ -0,0 +1,166 @@
1
+ import { Effect } from 'effect';
2
+ /**
3
+ * Configuration for link extraction behavior.
4
+ *
5
+ * Focuses purely on HOW to extract links from HTML documents,
6
+ * not on processing or validating the extracted URLs.
7
+ *
8
+ * @example
9
+ * ```typescript
10
+ * // Extract from specific CSS selectors
11
+ * const config: LinkExtractorConfig = {
12
+ * restrictCss: ['a.product-link', 'form[action]'],
13
+ * tags: ['a', 'form'],
14
+ * attrs: ['href', 'action']
15
+ * };
16
+ *
17
+ * // Extract from all standard elements
18
+ * const standardConfig: LinkExtractorConfig = {
19
+ * tags: ['a', 'area', 'form', 'frame', 'iframe'],
20
+ * attrs: ['href', 'action', 'src']
21
+ * };
22
+ * ```
23
+ *
24
+ * @group LinkExtractor
25
+ * @public
26
+ */
27
+ export interface LinkExtractorConfig {
28
+ /**
29
+ * CSS selectors to restrict extraction to specific elements.
30
+ * If specified, only elements matching these selectors will be processed.
31
+ *
32
+ * @example
33
+ * ```typescript
34
+ * restrictCss: [
35
+ * 'a.product-link', // Only product links
36
+ * '.content a', // Links within content area
37
+ * 'form[method="post"]' // POST forms only
38
+ * ]
39
+ * ```
40
+ */
41
+ readonly restrictCss?: string[];
42
+ /**
43
+ * HTML tag names to extract links from.
44
+ * Defaults to common link-containing elements.
45
+ *
46
+ * @example ['a', 'area', 'form', 'frame', 'iframe', 'link']
47
+ */
48
+ readonly tags?: string[];
49
+ /**
50
+ * HTML attributes to extract URLs from.
51
+ * Defaults to common URL-containing attributes.
52
+ *
53
+ * @example ['href', 'action', 'src', 'data-url']
54
+ */
55
+ readonly attrs?: string[];
56
+ /**
57
+ * Whether to extract URLs from form input elements.
58
+ * Looks for hidden inputs with URL-like names/values.
59
+ *
60
+ * @default false
61
+ */
62
+ readonly extractFromInputs?: boolean;
63
+ }
64
+ /**
65
+ * Result of link extraction from an HTML document.
66
+ *
67
+ * Contains the raw extracted URLs without any processing or validation.
68
+ *
69
+ * @group LinkExtractor
70
+ * @public
71
+ */
72
+ export interface LinkExtractionResult {
73
+ /**
74
+ * Raw URLs extracted from the HTML document.
75
+ * These are unprocessed and may be relative URLs, fragments, etc.
76
+ */
77
+ readonly links: string[];
78
+ /**
79
+ * Total number of potential URL-containing elements found.
80
+ * Includes elements that didn't yield valid URLs.
81
+ */
82
+ readonly totalElementsProcessed: number;
83
+ /**
84
+ * Breakdown of extraction by element type.
85
+ * Maps element types to the number of URLs extracted from them.
86
+ */
87
+ readonly extractionBreakdown: Record<string, number>;
88
+ }
89
+ declare const LinkExtractionError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
90
+ readonly _tag: "LinkExtractionError";
91
+ } & Readonly<A>;
92
+ /**
93
+ * Error that can occur during link extraction.
94
+ *
95
+ * @group Errors
96
+ * @public
97
+ */
98
+ export declare class LinkExtractionError extends LinkExtractionError_base<{
99
+ readonly message: string;
100
+ readonly cause?: unknown;
101
+ }> {
102
+ }
103
+ /**
104
+ * Service interface for extracting links from HTML documents.
105
+ *
106
+ * This service focuses purely on extraction - it does not process,
107
+ * validate, or filter the extracted URLs in any way.
108
+ *
109
+ * @group Services
110
+ * @public
111
+ */
112
+ export interface LinkExtractorServiceInterface {
113
+ /**
114
+ * Extracts all URLs from an HTML document based on configuration.
115
+ *
116
+ * This method only extracts URLs from the HTML - it does not:
117
+ * - Validate URLs
118
+ * - Resolve relative URLs to absolute URLs
119
+ * - Apply domain or pattern filtering
120
+ * - Canonicalize URLs
121
+ *
122
+ * URL processing should be handled separately by the consumer.
123
+ *
124
+ * @param html - The HTML content to extract links from
125
+ * @param config - Configuration for extraction behavior
126
+ * @returns Effect containing the extraction result
127
+ *
128
+ * @example
129
+ * ```typescript
130
+ * const extractor = yield* LinkExtractorService;
131
+ * const result = yield* extractor.extractLinks(htmlContent, {
132
+ * tags: ['a', 'form'],
133
+ * attrs: ['href', 'action'],
134
+ * restrictCss: ['.content a']
135
+ * });
136
+ *
137
+ * console.log(`Found ${result.links.length} raw URLs`);
138
+ * // URLs may be relative, absolute, fragments, etc.
139
+ * ```
140
+ */
141
+ extractLinks(html: string, config?: LinkExtractorConfig): Effect.Effect<LinkExtractionResult, LinkExtractionError>;
142
+ }
143
+ declare const LinkExtractorService_base: Effect.Service.Class<LinkExtractorService, "@jambudipa.io/LinkExtractorService", {
144
+ readonly effect: Effect.Effect<{
145
+ extractLinks: (html: string, config?: LinkExtractorConfig) => Effect.Effect<LinkExtractionResult, LinkExtractionError, never>;
146
+ }, never, never>;
147
+ }>;
148
+ /**
149
+ * Implementation of the LinkExtractorService.
150
+ *
151
+ * Provides pure HTML link extraction without any URL processing.
152
+ *
153
+ * @group Services
154
+ * @public
155
+ */
156
+ export declare class LinkExtractorService extends LinkExtractorService_base {
157
+ }
158
+ /**
159
+ * Default layer for LinkExtractorService.
160
+ *
161
+ * @group Layers
162
+ * @public
163
+ */
164
+ export declare const LinkExtractorServiceLayer: import("effect/Layer").Layer<LinkExtractorService, never, never>;
165
+ export {};
166
+ //# sourceMappingURL=LinkExtractor.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"LinkExtractor.service.d.ts","sourceRoot":"","sources":["../../../src/lib/LinkExtractor/LinkExtractor.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAQ,MAAM,EAAE,MAAM,QAAQ,CAAC;AAItC;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,MAAM,WAAW,mBAAmB;IAClC;;;;;;;;;;;;OAYG;IACH,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IAEhC;;;;;OAKG;IACH,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAEzB;;;;;OAKG;IACH,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;IAE1B;;;;;OAKG;IACH,QAAQ,CAAC,iBAAiB,CAAC,EAAE,OAAO,CAAC;CACtC;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,oBAAoB;IACnC;;;OAGG;IACH,QAAQ,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC;IAEzB;;;OAGG;IACH,QAAQ,CAAC,sBAAsB,EAAE,MAAM,CAAC;IAExC;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACtD;;;;AAED;;;;;GAKG;AACH,qBAAa,mBAAoB,SAAQ,yBAEvC;IACA,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;CAC1B,CAAC;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,6BAA6B;IAC5C;;;;;;;;;;;;;;;;;;;;;;;;;;;OA2BG;IACH,YAAY,CACV,IAAI,EAAE,MAAM,EACZ,MAAM,CAAC,EAAE,mBAAmB,GAC3B,MAAM,CAAC,MAAM,CAAC,oBAAoB,EAAE,mBAAmB,CAAC,CAAC;CAC7D;;;6BAyB0B,MAAM,WAAW,mBAAmB;;;AAZ/D;;;;;;;GAOG;AACH,qBAAa,oBAAqB,SAAQ,yBAsBzC;CAAG;AAEJ;;;;;GAKG;AACH,eAAO,MAAM,yBAAyB,kEAA+B,CAAC"}
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Advanced link extraction functionality for the Spider framework.
3
+ *
4
+ * This module provides raw link extraction from HTML documents with support for:
5
+ * - CSS selector-based extraction (`restrictCss`)
6
+ * - Configurable tag selection (`tags`)
7
+ * - Configurable attribute selection (`attrs`)
8
+ * - Optional extraction from form input elements (`extractFromInputs`)
9
+ * - Per-element-type extraction statistics (`extractionBreakdown`)
10
+ * - Raw output only: URLs are not validated, resolved, filtered, or canonicalized
11
+ *
12
+ * @example
13
+ * ```typescript
14
+ * import { LinkExtractorService, type LinkExtractorConfig } from '@jambudipa/spider/LinkExtractor';
15
+ *
16
+ * const program = Effect.gen(function* () {
17
+ * const extractor = yield* LinkExtractorService;
18
+ *
19
+ * const result = yield* extractor.extractLinks(
20
+ * htmlContent,
21
+ * {
22
+ * tags: ['a', 'form'],
23
+ * attrs: ['href', 'action'],
24
+ * restrictCss: ['.content a'],
25
+ * extractFromInputs: false
26
+ * }
27
+ * );
28
+ *
29
+ * console.log(`Extracted ${result.links.length} links`);
30
+ * });
31
+ * ```
32
+ *
33
+ * @group LinkExtractor
34
+ * @public
35
+ */
36
+ export { LinkExtractorService, LinkExtractorServiceLayer, type LinkExtractorConfig, type LinkExtractionResult, type LinkExtractorServiceInterface, LinkExtractionError, } from './LinkExtractor.service.js';
37
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/LinkExtractor/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAEH,OAAO,EACL,oBAAoB,EACpB,yBAAyB,EACzB,KAAK,mBAAmB,EACxB,KAAK,oBAAoB,EACzB,KAAK,6BAA6B,EAClC,mBAAmB,GACpB,MAAM,4BAA4B,CAAC"}
@@ -0,0 +1,8 @@
1
+ import { Context, Effect } from 'effect';
2
+ import { SpiderLogger } from './SpiderLogger.service.js';
3
+ /**
4
+ * Wrapper for fetch that adds comprehensive logging
5
+ */
6
+ export declare const makeLoggingFetch: Effect.Effect<(url: string, options?: RequestInit) => Promise<Response>, never, SpiderLogger>;
7
+ export declare const LoggingFetch: Context.Tag<(url: string, options?: RequestInit) => Promise<Response>, (url: string, options?: RequestInit) => Promise<Response>>;
8
+ //# sourceMappingURL=FetchLogger.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FetchLogger.d.ts","sourceRoot":"","sources":["../../../src/lib/Logging/FetchLogger.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AACzC,OAAO,EAAE,YAAY,EAAE,MAAM,2BAA2B,CAAC;AAEzD;;GAEG;AACH,eAAO,MAAM,gBAAgB,sBAGd,MAAM,YAAY,WAAW,4CAgG1C,CAAC;AAEH,eAAO,MAAM,YAAY,oBACE,MAAM,YAAY,WAAW,KAAK,OAAO,CAAC,QAAQ,CAAC,QAAnD,MAAM,YAAY,WAAW,KAAK,OAAO,CAAC,QAAQ,CAAC,CAE3E,CAAC"}
@@ -0,0 +1,34 @@
1
+ import { Context, Effect, Layer } from 'effect';
2
+ export interface SpiderLogEvent {
3
+ timestamp: string;
4
+ type: 'domain_start' | 'domain_complete' | 'domain_error' | 'page_scraped' | 'queue_status' | 'worker_status' | 'rate_limit' | 'spider_lifecycle' | 'worker_lifecycle' | 'worker_state' | 'completion_monitor' | 'edge_case' | 'crawl_delay_capped';
5
+ domain?: string;
6
+ url?: string;
7
+ workerId?: string;
8
+ fiberId?: string;
9
+ message: string;
10
+ details?: Record<string, unknown>;
11
+ }
12
+ export interface SpiderLogger {
13
+ readonly logEvent: (event: Omit<SpiderLogEvent, 'timestamp'>) => Effect.Effect<void>;
14
+ readonly logDomainStart: (domain: string, startUrl: string) => Effect.Effect<void>;
15
+ readonly logDomainComplete: (domain: string, pagesScraped: number, reason: 'max_pages' | 'queue_empty' | 'error') => Effect.Effect<void>;
16
+ readonly logPageScraped: (url: string, domain: string, pageNumber: number) => Effect.Effect<void>;
17
+ readonly logQueueStatus: (domain: string, queueSize: number, activeWorkers: number) => Effect.Effect<void>;
18
+ readonly logRateLimit: (domain: string, requestsInWindow: number) => Effect.Effect<void>;
19
+ readonly logSpiderLifecycle: (event: 'start' | 'complete' | 'error', details?: Record<string, unknown>) => Effect.Effect<void>;
20
+ readonly logWorkerLifecycle: (workerId: string, domain: string, event: 'created' | 'entering_loop' | 'exiting_loop', reason?: string, details?: Record<string, unknown>) => Effect.Effect<void>;
21
+ readonly logWorkerState: (workerId: string, domain: string, event: 'taking_task' | 'marked_active' | 'marked_idle' | 'task_completed', details?: Record<string, unknown>) => Effect.Effect<void>;
22
+ readonly logCompletionMonitor: (domain: string, checkCount: number, queueSize: number, activeWorkers: number, stableCount: number, maxPagesReached: boolean, decision: string) => Effect.Effect<void>;
23
+ readonly logEdgeCase: (domain: string, caseType: string, details?: Record<string, unknown>) => Effect.Effect<void>;
24
+ readonly logDomainStatus: (domain: string, status: {
25
+ pagesScraped: number;
26
+ queueSize: number;
27
+ activeWorkers: number;
28
+ maxWorkers: number;
29
+ }) => Effect.Effect<void>;
30
+ }
31
+ export declare const SpiderLogger: Context.Tag<SpiderLogger, SpiderLogger>;
32
+ export declare const makeSpiderLogger: (logDir?: string) => SpiderLogger;
33
+ export declare const SpiderLoggerLive: Layer.Layer<SpiderLogger, never, never>;
34
+ //# sourceMappingURL=SpiderLogger.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"SpiderLogger.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Logging/SpiderLogger.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAW,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,QAAQ,CAAC;AAIzD,MAAM,WAAW,cAAc;IAC7B,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EACA,cAAc,GACd,iBAAiB,GACjB,cAAc,GACd,cAAc,GACd,cAAc,GACd,eAAe,GACf,YAAY,GACZ,kBAAkB,GAClB,kBAAkB,GAClB,cAAc,GACd,oBAAoB,GACpB,WAAW,GACX,oBAAoB,CAAC;IACzB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC;AAED,MAAM,WAAW,YAAY;IAC3B,QAAQ,CAAC,QAAQ,EAAE,CACjB,KAAK,EAAE,IAAI,CAAC,cAAc,EAAE,WAAW,CAAC,KACrC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,MAAM,KACb,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,iBAAiB,EAAE,CAC1B,MAAM,EAAE,MAAM,EACd,YAAY,EAAE,MAAM,EACpB,MAAM,EAAE,WAAW,GAAG,aAAa,GAAG,OAAO,KAC1C,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,GAAG,EAAE,MAAM,EACX,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,KACf,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,MAAM,EAAE,MAAM,EACd,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,KAClB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,YAAY,EAAE,CACrB,MAAM,EAAE,MAAM,EACd,gBAAgB,EAAE,MAAM,KACrB,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,kBAAkB,EAAE,CAC3B,KAAK,EAAE,OAAO,GAAG,UAAU,GAAG,OAAO,EACrC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAGzB,QAAQ,CAAC,kBAAkB,EAAE,CAC3B,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,SAAS,GAAG,eAAe,GAAG,cAAc,EACnD,MAAM,CAAC,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,cAAc,EAAE,CACvB,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,KAAK,EAAE,aAAa,GAAG,eAAe,GAAG,aAAa,GAAG,gBAAgB,EACzE,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CA
AC,oBAAoB,EAAE,CAC7B,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,EAClB,SAAS,EAAE,MAAM,EACjB,aAAa,EAAE,MAAM,EACrB,WAAW,EAAE,MAAM,EACnB,eAAe,EAAE,OAAO,EACxB,QAAQ,EAAE,MAAM,KACb,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,WAAW,EAAE,CACpB,MAAM,EAAE,MAAM,EACd,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAC9B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,QAAQ,CAAC,eAAe,EAAE,CACxB,MAAM,EAAE,MAAM,EACd,MAAM,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,SAAS,EAAE,MAAM,CAAC;QAClB,aAAa,EAAE,MAAM,CAAC;QACtB,UAAU,EAAE,MAAM,CAAC;KACpB,KACE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CAC1B;AAED,eAAO,MAAM,YAAY,yCAAmD,CAAC;AAE7E,eAAO,MAAM,gBAAgB,GAAI,eAAwB,KAAG,YA2R3D,CAAC;AAEF,eAAO,MAAM,gBAAgB,yCAAkD,CAAC"}
@@ -0,0 +1,276 @@
1
+ import { Effect } from 'effect';
2
+ import { CrawlTask } from '../Spider/Spider.service.js';
3
+ import { PageData } from '../PageData/PageData.js';
4
+ import { MiddlewareError } from '../errors.js';
5
+ /**
6
+ * Request object used in the middleware pipeline.
7
+ *
8
+ * Contains the crawl task along with optional headers and metadata
9
+ * that can be modified by middleware during processing.
10
+ *
11
+ * @group Interfaces
12
+ * @public
13
+ */
14
+ export interface SpiderRequest {
15
+ /** The crawl task containing URL and depth information */
16
+ task: CrawlTask;
17
+ /** HTTP headers to include with the request */
18
+ headers?: Record<string, string>;
19
+ /** Additional metadata that can be used by middleware */
20
+ meta?: Record<string, unknown>;
21
+ }
22
+ /**
23
+ * Response object used in the middleware pipeline.
24
+ *
25
+ * Contains the extracted page data along with optional HTTP response
26
+ * information and metadata from middleware processing.
27
+ *
28
+ * @group Interfaces
29
+ * @public
30
+ */
31
+ export interface SpiderResponse {
32
+ /** The extracted page data including content, links, and metadata */
33
+ pageData: PageData;
34
+ /** HTTP status code of the response */
35
+ statusCode?: number;
36
+ /** HTTP response headers */
37
+ headers?: Record<string, string>;
38
+ /** Additional metadata added by middleware */
39
+ meta?: Record<string, unknown>;
40
+ }
41
+ /**
42
+ * Interface for implementing custom middleware components.
43
+ *
44
+ * Middleware can intercept and modify requests before they're sent,
45
+ * responses after they're received, and handle exceptions that occur
46
+ * during processing. All methods are optional.
47
+ *
48
+ * @example
49
+ * ```typescript
50
+ * const loggingMiddleware: SpiderMiddleware = {
51
+ * processRequest: (request) => Effect.gen(function* () {
52
+ * console.log(`Requesting: ${request.task.url}`);
53
+ * return request;
54
+ * }),
55
+ *
56
+ * processResponse: (response, request) => Effect.gen(function* () {
57
+ * console.log(`Response: ${response.statusCode} for ${request.task.url}`);
58
+ * return response;
59
+ * }),
60
+ *
61
+ * processException: (error, request) => Effect.gen(function* () {
62
+ * console.error(`Error processing ${request.task.url}: ${error.message}`);
63
+ * return null; // Let the error propagate
64
+ * })
65
+ * };
66
+ * ```
67
+ *
68
+ * @group Interfaces
69
+ * @public
70
+ */
71
+ export interface SpiderMiddleware {
72
+ /**
73
+ * Process a request before it's sent to the target server.
74
+ * Can modify headers, metadata, or reject the request entirely.
75
+ */
76
+ processRequest?: (request: SpiderRequest) => Effect.Effect<SpiderRequest, MiddlewareError>;
77
+ /**
78
+ * Process a response after it's received from the target server.
79
+ * Can modify the response data or metadata.
80
+ */
81
+ processResponse?: (response: SpiderResponse, request: SpiderRequest) => Effect.Effect<SpiderResponse, MiddlewareError>;
82
+ /**
83
+ * Handle exceptions that occur during request processing.
84
+ * Can attempt recovery by returning a SpiderResponse, or return null to propagate the error.
85
+ */
86
+ processException?: (error: Error, request: SpiderRequest) => Effect.Effect<SpiderResponse | null, MiddlewareError>;
87
+ }
88
+ declare const MiddlewareManager_base: Effect.Service.Class<MiddlewareManager, "@jambudipa.io/MiddlewareManager", {
89
+ readonly effect: Effect.Effect<{
90
+ /**
91
+ * Processes a request through the middleware pipeline.
92
+ *
93
+ * Middleware are executed in order from first to last, with each middleware
94
+ * receiving the output of the previous middleware as input.
95
+ *
96
+ * @param request - The initial request to process
97
+ * @param middlewares - Array of middleware to apply
98
+ * @returns Effect containing the processed request
99
+ */
100
+ processRequest: (request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<SpiderRequest, MiddlewareError, never>;
101
+ /**
102
+ * Processes a response through the middleware pipeline in reverse order.
103
+ *
104
+ * Middleware are executed in reverse order (last to first) to provide
105
+ * proper nesting of response processing.
106
+ *
107
+ * @param response - The response to process
108
+ * @param request - The original request (for context)
109
+ * @param middlewares - Array of middleware to apply
110
+ * @returns Effect containing the processed response
111
+ */
112
+ processResponse: (response: SpiderResponse, request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<SpiderResponse, MiddlewareError, never>;
113
+ /**
114
+ * Processes an exception through the middleware pipeline in reverse order.
115
+ *
116
+ * Middleware are given a chance to handle or recover from exceptions.
117
+ * If a middleware returns a SpiderResponse, it indicates successful recovery.
118
+ * If it returns null, the exception continues to propagate.
119
+ *
120
+ * @param error - The error that occurred
121
+ * @param request - The request that caused the error
122
+ * @param middlewares - Array of middleware to apply
123
+ * @returns Effect containing a recovered response or null
124
+ */
125
+ processException: (error: Error, request: SpiderRequest, middlewares: SpiderMiddleware[]) => Effect.Effect<SpiderResponse | null, MiddlewareError, never>;
126
+ }, never, never>;
127
+ }>;
128
+ /**
129
+ * Manages the middleware pipeline for request and response processing.
130
+ *
131
+ * The MiddlewareManager orchestrates the execution of middleware in the correct order:
132
+ * - Requests are processed forward through the middleware array
133
+ * - Responses are processed in reverse order (last middleware first)
134
+ * - Exceptions are processed in reverse order for proper error handling
135
+ *
136
+ * @example
137
+ * ```typescript
138
+ * const program = Effect.gen(function* () {
139
+ * const manager = yield* MiddlewareManager;
140
+ *
141
+ * const middleware = [
142
+ * rateLimitMiddleware,
143
+ * loggingMiddleware,
144
+ * userAgentMiddleware
145
+ * ];
146
+ *
147
+ * const request: SpiderRequest = {
148
+ * task: { url: 'https://example.com', depth: 0 },
149
+ * headers: {}
150
+ * };
151
+ *
152
+ * const processedRequest = yield* manager.processRequest(request, middleware);
153
+ * console.log('Request processed through middleware pipeline');
154
+ * });
155
+ * ```
156
+ *
157
+ * @group Services
158
+ * @public
159
+ */
160
+ export declare class MiddlewareManager extends MiddlewareManager_base {
161
+ }
162
+ declare const RateLimitMiddleware_base: Effect.Service.Class<RateLimitMiddleware, "@jambudipa.io/RateLimitMiddleware", {
163
+ readonly effect: Effect.Effect<{
164
+ create: (config: {
165
+ maxConcurrentRequests: number;
166
+ maxRequestsPerSecondPerDomain: number;
167
+ requestDelayMs?: number;
168
+ }) => SpiderMiddleware;
169
+ }, never, never>;
170
+ }>;
171
+ /**
172
+ * Provides rate limiting functionality for respectful crawling.
173
+ *
174
+ * Controls request frequency at both global and per-domain levels to prevent
175
+ * overwhelming target servers and avoid being blocked.
176
+ *
177
+ * @example
178
+ * ```typescript
179
+ * const rateLimiter = yield* RateLimitMiddleware;
180
+ * const middleware = rateLimiter.create({
181
+ * maxConcurrentRequests: 5,
182
+ * maxRequestsPerSecondPerDomain: 2,
183
+ * requestDelayMs: 250
184
+ * });
185
+ * ```
186
+ *
187
+ * @group Middleware
188
+ * @public
189
+ */
190
+ export declare class RateLimitMiddleware extends RateLimitMiddleware_base {
191
+ }
192
+ declare const LoggingMiddleware_base: Effect.Service.Class<LoggingMiddleware, "@jambudipa.io/LoggingMiddleware", {
193
+ readonly effect: Effect.Effect<{
194
+ create: (config?: {
195
+ logRequests?: boolean;
196
+ logResponses?: boolean;
197
+ logErrors?: boolean;
198
+ logLevel?: "debug" | "info" | "warn" | "error";
199
+ }) => SpiderMiddleware;
200
+ }, never, never>;
201
+ }>;
202
+ /**
203
+ * Provides logging functionality using Effect.Logger.
204
+ *
205
+ * Logs requests, responses, and errors at configurable levels for debugging
206
+ * and monitoring purposes.
207
+ *
208
+ * @example
209
+ * ```typescript
210
+ * const logger = yield* LoggingMiddleware;
211
+ * const middleware = logger.create({
212
+ * logRequests: true,
213
+ * logResponses: true,
214
+ * logLevel: 'info'
215
+ * });
216
+ * ```
217
+ *
218
+ * @group Middleware
219
+ * @public
220
+ */
221
+ export declare class LoggingMiddleware extends LoggingMiddleware_base {
222
+ }
223
+ declare const UserAgentMiddleware_base: Effect.Service.Class<UserAgentMiddleware, "@jambudipa.io/UserAgentMiddleware", {
224
+ readonly effect: Effect.Effect<{
225
+ create: (userAgent: string) => SpiderMiddleware;
226
+ }, never, never>;
227
+ }>;
228
+ /**
229
+ * Adds User-Agent headers to requests.
230
+ *
231
+ * Sets a consistent User-Agent string for all requests to identify
232
+ * your crawler to web servers.
233
+ *
234
+ * @example
235
+ * ```typescript
236
+ * const userAgent = yield* UserAgentMiddleware;
237
+ * const middleware = userAgent.create('MyBot/1.0 (+https://example.com)');
238
+ * ```
239
+ *
240
+ * @group Middleware
241
+ * @public
242
+ */
243
+ export declare class UserAgentMiddleware extends UserAgentMiddleware_base {
244
+ }
245
+ declare const StatsMiddleware_base: Effect.Service.Class<StatsMiddleware, "@jambudipa.io/StatsMiddleware", {
246
+ readonly effect: Effect.Effect<{
247
+ create: () => {
248
+ middleware: SpiderMiddleware;
249
+ getStats: () => Effect.Effect<Record<string, number>>;
250
+ };
251
+ }, never, never>;
252
+ }>;
253
+ /**
254
+ * Collects statistics about crawling activity.
255
+ *
256
+ * Tracks various metrics including requests processed, response codes,
257
+ * bytes downloaded, and processing times for monitoring and optimization.
258
+ *
259
+ * @example
260
+ * ```typescript
261
+ * const statsService = yield* StatsMiddleware;
262
+ * const { middleware, getStats } = statsService.create();
263
+ *
264
+ * // Use middleware in your pipeline
265
+ * // Later get statistics
266
+ * const stats = yield* getStats();
267
+ * console.log(`Processed ${stats.requests_processed} requests`);
268
+ * ```
269
+ *
270
+ * @group Middleware
271
+ * @public
272
+ */
273
+ export declare class StatsMiddleware extends StatsMiddleware_base {
274
+ }
275
+ export {};
276
+ //# sourceMappingURL=SpiderMiddleware.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"SpiderMiddleware.d.ts","sourceRoot":"","sources":["../../../src/lib/Middleware/SpiderMiddleware.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAA0B,MAAM,QAAQ,CAAC;AACxD,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,eAAe,EAAE,MAAM,cAAc,CAAC;AAE/C;;;;;;;;GAQG;AACH,MAAM,WAAW,aAAa;IAC5B,0DAA0D;IAC1D,IAAI,EAAE,SAAS,CAAC;IAChB,+CAA+C;IAC/C,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,yDAAyD;IACzD,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAChC;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,cAAc;IAC7B,qEAAqE;IACrE,QAAQ,EAAE,QAAQ,CAAC;IACnB,uCAAuC;IACvC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,4BAA4B;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,8CAA8C;IAC9C,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAChC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;OAGG;IACH,cAAc,CAAC,EAAE,CACf,OAAO,EAAE,aAAa,KACnB,MAAM,CAAC,MAAM,CAAC,aAAa,EAAE,eAAe,CAAC,CAAC;IAEnD;;;OAGG;IACH,eAAe,CAAC,EAAE,CAChB,QAAQ,EAAE,cAAc,EACxB,OAAO,EAAE,aAAa,KACnB,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,eAAe,CAAC,CAAC;IAEpD;;;OAGG;IACH,gBAAgB,CAAC,EAAE,CACjB,KAAK,EAAE,KAAK,EACZ,OAAO,EAAE,aAAa,KACnB,MAAM,CAAC,MAAM,CAAC,cAAc,GAAG,IAAI,EAAE,eAAe,CAAC,CAAC;CAC5D;;;QAsCK;;;;;;;;;WASG;kCAEQ,aAAa,eACT,gBAAgB,EAAE;QAQjC;;;;;;;;;;WAUG;oCAES,cAAc,WACf,aAAa,eACT,gBAAgB,EAAE;QAWjC;;;;;;;;;;;WAWG;kCAEM,KAAK,WACH,aAAa,eACT,gBAAgB,EAAE;;;AAhGvC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,qBAAa,iBAAkB,SAAQ,sBA4EtC;CAAG;;;yBA8BqB;YACf,qBAAqB,EAAE,MAAM,CAAC;YAC9B,6BAA6B,EAAE,MAAM,CAAC;YACtC,cAAc,CAAC,EAAE,MAAM,CAAC;SACzB,KAAG,gBAAgB;;;AAhC5B;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,mBAAoB,SAAQ,wBAmExC;CAAG;;;0BA0BY;YACN,WAAW,CAAC,EAAE,OAAO,CAAC;YACtB,YAAY,CAAC,EAAE,OAAO,CAAC;YACvB,SAAS,CAAC,EAAE,OAAO,CAAC;YACpB,QAAQ,CAAC,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;SAChD,KACA,gBAAgB;;;AA9BzB;;;;;;;;;;;;;;;;;;GAkBG;AACH,qBAAa,iBAAkB,SAAQ,sBA4EtC;CAAG;;;4BAqBsB,MAAM,KAAG,gBAAgB;;;AAnBnD;;;;;;;;;;;;;;GAcG;AACH,qBAAa,mBAAoB,SAAQ,wBAgBxC;CAAG
;;;sBA0Bc;YACV,UAAU,EAAE,gBAAgB,CAAC;YAC7B,QAAQ,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;SACvD;;;AA3BP;;;;;;;;;;;;;;;;;;;GAmBG;AACH,qBAAa,eAAgB,SAAQ,oBA4DpC;CAAG"}
@@ -0,0 +1,28 @@
1
+ import { Schema } from 'effect';
2
+ export declare const PageDataSchema: Schema.Struct<{
3
+ url: Schema.filter<typeof Schema.String>;
4
+ html: typeof Schema.String;
5
+ title: Schema.optional<typeof Schema.String>;
6
+ /** All available metadata from meta tags */
7
+ metadata: Schema.Record$<typeof Schema.String, typeof Schema.String>;
8
+ /** Commonly used metadata fields for convenience */
9
+ commonMetadata: Schema.optional<Schema.Struct<{
10
+ description: Schema.optional<typeof Schema.String>;
11
+ keywords: Schema.optional<typeof Schema.String>;
12
+ author: Schema.optional<typeof Schema.String>;
13
+ robots: Schema.optional<typeof Schema.String>;
14
+ }>>;
15
+ statusCode: Schema.filter<Schema.filter<typeof Schema.Number>>;
16
+ /** All response headers */
17
+ headers: Schema.Record$<typeof Schema.String, typeof Schema.String>;
18
+ /** When the fetch operation started */
19
+ fetchedAt: typeof Schema.DateFromSelf;
20
+ /** How long the entire fetch and parse operation took in milliseconds */
21
+ scrapeDurationMs: typeof Schema.Number;
22
+ /** The crawl depth (number of hops from the starting URL) */
23
+ depth: Schema.filter<Schema.filter<typeof Schema.Number>>;
24
+ /** Optional extracted data from the page */
25
+ extractedData: Schema.optional<Schema.Record$<typeof Schema.String, typeof Schema.Unknown>>;
26
+ }>;
27
+ export type PageData = Schema.Schema.Type<typeof PageDataSchema>;
28
+ //# sourceMappingURL=PageData.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"PageData.d.ts","sourceRoot":"","sources":["../../../src/lib/PageData/PageData.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAEhC,eAAO,MAAM,cAAc;;;;IAkBzB,4CAA4C;;IAE5C,oDAAoD;;;;;;;;IAUpD,2BAA2B;;IAE3B,uCAAuC;;IAEvC,yEAAyE;;IAEzE,6DAA6D;;IAE7D,4CAA4C;;EAI5C,CAAC;AAEH,MAAM,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,cAAc,CAAC,CAAC"}