@jambudipa/spider 0.2.0 → 0.2.3

This diff compares the content of publicly available package versions released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
Files changed (116)
  1. package/README.md +75 -35
  2. package/dist/browser/BrowserManager.d.ts +63 -0
  3. package/dist/browser/BrowserManager.d.ts.map +1 -0
  4. package/dist/browser/PlaywrightAdapter.d.ts +166 -0
  5. package/dist/browser/PlaywrightAdapter.d.ts.map +1 -0
  6. package/dist/examples/01-basic-crawl-working.d.ts +13 -0
  7. package/dist/examples/01-basic-crawl-working.d.ts.map +1 -0
  8. package/dist/examples/02-multiple-urls-working.d.ts +13 -0
  9. package/dist/examples/02-multiple-urls-working.d.ts.map +1 -0
  10. package/dist/examples/03-url-filtering.d.ts +13 -0
  11. package/dist/examples/03-url-filtering.d.ts.map +1 -0
  12. package/dist/examples/04-robots-compliance.d.ts +14 -0
  13. package/dist/examples/04-robots-compliance.d.ts.map +1 -0
  14. package/dist/examples/05-link-extraction-selectors.d.ts +14 -0
  15. package/dist/examples/05-link-extraction-selectors.d.ts.map +1 -0
  16. package/dist/examples/06-custom-middleware.d.ts +18 -0
  17. package/dist/examples/06-custom-middleware.d.ts.map +1 -0
  18. package/dist/examples/07-resumability-demo.d.ts +14 -0
  19. package/dist/examples/07-resumability-demo.d.ts.map +1 -0
  20. package/dist/examples/08-worker-monitoring.d.ts +15 -0
  21. package/dist/examples/08-worker-monitoring.d.ts.map +1 -0
  22. package/dist/examples/09-error-handling-recovery.d.ts +15 -0
  23. package/dist/examples/09-error-handling-recovery.d.ts.map +1 -0
  24. package/dist/index.d.ts +33 -0
  25. package/dist/index.d.ts.map +1 -0
  26. package/dist/index.js +3596 -1440
  27. package/dist/index.js.map +1 -1
  28. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +107 -0
  29. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
  30. package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
  31. package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
  32. package/dist/lib/HttpClient/CookieManager.d.ts +58 -0
  33. package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
  34. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +63 -0
  35. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
  36. package/dist/lib/HttpClient/SessionStore.d.ts +114 -0
  37. package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
  38. package/dist/lib/HttpClient/TokenExtractor.d.ts +83 -0
  39. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
  40. package/dist/lib/HttpClient/index.d.ts +8 -0
  41. package/dist/lib/HttpClient/index.d.ts.map +1 -0
  42. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
  43. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
  44. package/dist/lib/LinkExtractor/index.d.ts +37 -0
  45. package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
  46. package/dist/lib/Logging/FetchLogger.d.ts +24 -0
  47. package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
  48. package/dist/lib/Logging/SpiderLogger.service.d.ts +37 -0
  49. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
  50. package/dist/lib/Middleware/SpiderMiddleware.d.ts +239 -0
  51. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
  52. package/dist/lib/Middleware/types.d.ts +99 -0
  53. package/dist/lib/Middleware/types.d.ts.map +1 -0
  54. package/dist/lib/PageData/PageData.d.ts +28 -0
  55. package/dist/lib/PageData/PageData.d.ts.map +1 -0
  56. package/dist/lib/Resumability/Resumability.service.d.ts +178 -0
  57. package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
  58. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
  59. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
  60. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
  61. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
  62. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
  63. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
  64. package/dist/lib/Resumability/index.d.ts +51 -0
  65. package/dist/lib/Resumability/index.d.ts.map +1 -0
  66. package/dist/lib/Resumability/strategies.d.ts +76 -0
  67. package/dist/lib/Resumability/strategies.d.ts.map +1 -0
  68. package/dist/lib/Resumability/types.d.ts +201 -0
  69. package/dist/lib/Resumability/types.d.ts.map +1 -0
  70. package/dist/lib/Robots/Robots.service.d.ts +78 -0
  71. package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
  72. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
  73. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
  74. package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
  75. package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
  76. package/dist/lib/Spider/Spider.service.d.ts +249 -0
  77. package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
  78. package/dist/lib/StateManager/StateManager.service.d.ts +107 -0
  79. package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
  80. package/dist/lib/StateManager/index.d.ts +5 -0
  81. package/dist/lib/StateManager/index.d.ts.map +1 -0
  82. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
  83. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
  84. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +110 -0
  85. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
  86. package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
  87. package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
  88. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
  89. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
  90. package/dist/lib/api-facades.d.ts +313 -0
  91. package/dist/lib/api-facades.d.ts.map +1 -0
  92. package/dist/lib/errors/effect-errors.d.ts +179 -0
  93. package/dist/lib/errors/effect-errors.d.ts.map +1 -0
  94. package/dist/lib/errors.d.ts +172 -0
  95. package/dist/lib/errors.d.ts.map +1 -0
  96. package/dist/lib/utils/FileUtils.d.ts +284 -0
  97. package/dist/lib/utils/FileUtils.d.ts.map +1 -0
  98. package/dist/lib/utils/JsonUtils.d.ts +196 -0
  99. package/dist/lib/utils/JsonUtils.d.ts.map +1 -0
  100. package/dist/lib/utils/RegexUtils.d.ts +257 -0
  101. package/dist/lib/utils/RegexUtils.d.ts.map +1 -0
  102. package/dist/lib/utils/SchemaUtils.d.ts +251 -0
  103. package/dist/lib/utils/SchemaUtils.d.ts.map +1 -0
  104. package/dist/lib/utils/UrlUtils.d.ts +223 -0
  105. package/dist/lib/utils/UrlUtils.d.ts.map +1 -0
  106. package/dist/lib/utils/effect-migration.d.ts +31 -0
  107. package/dist/lib/utils/effect-migration.d.ts.map +1 -0
  108. package/dist/lib/utils/index.d.ts +15 -0
  109. package/dist/lib/utils/index.d.ts.map +1 -0
  110. package/dist/lib/utils/url-deduplication.d.ts +108 -0
  111. package/dist/lib/utils/url-deduplication.d.ts.map +1 -0
  112. package/dist/lib/utils/url-deduplication.test.d.ts +5 -0
  113. package/dist/lib/utils/url-deduplication.test.d.ts.map +1 -0
  114. package/dist/test/infrastructure/EffectTestUtils.d.ts +167 -0
  115. package/dist/test/infrastructure/EffectTestUtils.d.ts.map +1 -0
  116. package/package.json +23 -9
@@ -0,0 +1,123 @@
+ import { Effect } from 'effect';
+ import { NetworkError, ResponseError, ContentTypeError, RequestAbortError } from '../errors.js';
+ import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
+ declare const ScraperService_base: Effect.Service.Class<ScraperService, "@jambudipa.io/ScraperService", {
+     readonly effect: Effect.Effect<{
+         /**
+          * Fetches a URL and parses the HTML to extract basic page information.
+          *
+          * This method performs the following operations:
+          * 1. Fetches the URL with configurable timeout (30 seconds)
+          * 2. Validates content type (skips binary files)
+          * 3. Parses HTML content with cheerio
+          * 4. Extracts basic page metadata (title, description, etc.)
+          * 5. Returns structured PageData object
+          *
+          * The method uses AbortController for proper timeout handling to prevent
+          * workers from hanging on malformed URLs or slow responses.
+          *
+          * @param url - The URL to fetch and parse
+          * @param depth - The crawl depth for logging purposes (default: 0)
+          * @returns Effect containing PageData with extracted information
+          * @throws NetworkError for network-related failures
+          * @throws ResponseError for HTTP error responses
+          *
+          * @example
+          * Basic usage:
+          * ```typescript
+          * const pageData = yield* scraper.fetchAndParse('https://example.com');
+          * console.log(`Page title: ${pageData.title}`);
+          * ```
+          *
+          * With depth tracking:
+          * ```typescript
+          * const pageData = yield* scraper.fetchAndParse('https://example.com/page', 2);
+          * ```
+          *
+          * Error handling:
+          * ```typescript
+          * const result = yield* scraper.fetchAndParse('https://example.com').pipe(
+          *   Effect.catchTags({
+          *     NetworkError: (error) => {
+          *       console.log('Network error:', error.message);
+          *       return Effect.succeed(null);
+          *     },
+          *     ResponseError: (error) => {
+          *       console.log('HTTP error:', error.statusCode);
+          *       return Effect.succeed(null);
+          *     }
+          *   })
+          * );
+          * ```
+          *
+          * @performance
+          * - Request timeout: 30 seconds
+          * - Response parsing timeout: 10 seconds
+          * - Memory usage: ~2-5MB per page depending on content size
+          *
+          * @security
+          * - Validates content types to prevent processing binary files
+          * - Uses AbortController to prevent hanging requests
+          * - No execution of JavaScript content (static HTML parsing only)
+          */
+         fetchAndParse: (url: string, depth?: number) => Effect.Effect<{
+             readonly url: string;
+             readonly html: string;
+             readonly title?: string | undefined;
+             readonly metadata: {
+                 readonly [x: string]: string;
+             };
+             readonly commonMetadata?: {
+                 readonly description?: string | undefined;
+                 readonly keywords?: string | undefined;
+                 readonly author?: string | undefined;
+                 readonly robots?: string | undefined;
+             } | undefined;
+             readonly statusCode: number;
+             readonly headers: {
+                 readonly [x: string]: string;
+             };
+             readonly fetchedAt: Date;
+             readonly scrapeDurationMs: number;
+             readonly depth: number;
+             readonly extractedData?: {
+                 readonly [x: string]: unknown;
+             } | undefined;
+         }, NetworkError | ResponseError | ContentTypeError | RequestAbortError | import("effect/ParseResult").ParseError, SpiderLogger>;
+     }, never, never>;
+ }>;
+ /**
+  * Service responsible for fetching HTML content and parsing basic page information.
+  *
+  * The ScraperService handles the core HTTP fetching and HTML parsing functionality
+  * for the Spider framework. It provides robust error handling, timeout management,
+  * and content type validation to ensure reliable data extraction.
+  *
+  * **Key Features:**
+  * - Automatic timeout handling with AbortController
+  * - Content type validation (skips binary files)
+  * - Comprehensive error handling with typed errors
+  * - Performance monitoring and logging
+  * - Effect integration for composability
+  *
+  * **Note:** This service focuses solely on fetching and parsing HTML content.
+  * Link extraction is handled separately by LinkExtractorService for better
+  * separation of concerns and modularity.
+  *
+  * @example
+  * ```typescript
+  * const program = Effect.gen(function* () {
+  *   const scraper = yield* ScraperService;
+  *   const pageData = yield* scraper.fetchAndParse('https://example.com', 0);
+  *   console.log(`Title: ${pageData.title}`);
+  *   console.log(`Content length: ${pageData.html.length}`);
+  * });
+  * ```
+  *
+  * @group Services
+  * @public
+  */
+ export declare class ScraperService extends ScraperService_base {
+ }
+ export {};
+ //# sourceMappingURL=Scraper.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"Scraper.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Scraper/Scraper.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAsB,MAAM,EAAkB,MAAM,QAAQ,CAAC;AAGpE,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,cAAc,CAAC;AAChG,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;;;QAqC5D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDG;6BACkB,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;AA5FjC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,qBAAa,cAAe,SAAQ,mBAoOnC;CAAG"}
@@ -0,0 +1,249 @@
+ import { Effect, Sink } from 'effect';
+ import { UrlDeduplicatorService } from '../UrlDeduplicator/UrlDeduplicator.service.js';
+ import { ScraperService } from '../Scraper/Scraper.service.js';
+ import { PageData } from '../PageData/PageData.js';
+ import { RobotsService } from '../Robots/Robots.service.js';
+ import { type LinkExtractorConfig, LinkExtractorService } from '../LinkExtractor/index.js';
+ import { SpiderSchedulerService } from '../Scheduler/SpiderScheduler.service.js';
+ import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
+ /**
+  * Configuration for extracting a nested field from an element.
+  *
+  * @group Data Types
+  * @public
+  */
+ interface NestedFieldConfig {
+     /** CSS selector to find the nested element */
+     readonly selector: string;
+     /** HTML attribute to extract (if not specified, extracts text content) */
+     readonly attribute?: string;
+ }
+ /**
+  * Configuration for extracting a single field from the page.
+  *
+  * @group Data Types
+  * @public
+  */
+ interface FieldExtractionConfig {
+     /** CSS selector to find the element */
+     readonly selector: string;
+     /** Extract text content (default: true) */
+     readonly text?: boolean;
+     /** HTML attribute to extract instead of text */
+     readonly attribute?: string;
+     /** Extract multiple matching elements */
+     readonly multiple?: boolean;
+     /** Check if element exists (returns boolean) */
+     readonly exists?: boolean;
+     /** Nested fields to extract from each matched element */
+     readonly fields?: Record<string, NestedFieldConfig>;
+ }
+ /**
+  * Data extraction configuration - either a simple CSS selector string
+  * or a detailed field extraction configuration.
+  *
+  * @group Data Types
+  * @public
+  */
+ type DataExtractionFieldConfig = string | FieldExtractionConfig;
+ /**
+  * Configuration for extracting structured data from pages.
+  *
+  * @group Data Types
+  * @public
+  */
+ type DataExtractionConfig = Record<string, DataExtractionFieldConfig>;
+ /**
+  * Represents a single crawling task with URL and depth information.
+  *
+  * @group Data Types
+  * @public
+  */
+ interface CrawlTask {
+     /** The URL to be crawled */
+     url: string;
+     /** The depth level of this URL relative to the starting URL */
+     depth: number;
+     /** The URL from which this URL was discovered (optional) */
+     fromUrl?: string;
+     /** Optional metadata to be passed through to the result */
+     metadata?: Record<string, unknown>;
+     /** Optional data extraction configuration */
+     extractData?: DataExtractionConfig;
+ }
+ /**
+  * The result of a successful crawl operation.
+  *
+  * Contains all extracted information from a crawled page along with
+  * metadata about when and at what depth it was processed.
+  *
+  * @group Data Types
+  * @public
+  */
+ interface CrawlResult {
+     /** The extracted page data including content, links, and metadata */
+     pageData: PageData;
+     /** The depth at which this page was crawled */
+     depth: number;
+     /** When this page was crawled */
+     timestamp: Date;
+     /** Optional metadata passed through from the original request */
+     metadata?: Record<string, unknown>;
+ }
+ /**
+  * The main Spider service that orchestrates web crawling operations.
+  *
+  * This service provides the core functionality for crawling websites, including:
+  * - URL validation and filtering based on configuration
+  * - Robots.txt compliance checking
+  * - Concurrent crawling with configurable worker pools
+  * - Request scheduling and rate limiting
+  * - Result streaming through Effect sinks
+  *
+  * @example
+  * ```typescript
+  * const program = Effect.gen(function* () {
+  *   const spider = yield* Spider;
+  *   const collectSink = Sink.forEach<CrawlResult>(result =>
+  *     Effect.sync(() => console.log(result.pageData.url))
+  *   );
+  *
+  *   const stats = yield* spider.crawl('https://example.com', collectSink);
+  *   console.log(`Crawled ${stats.totalPages} pages`);
+  * });
+  * ```
+  *
+  * @group Services
+  * @public
+  */
+ /**
+  * Options for enhanced link extraction during crawling.
+  *
+  * @group Configuration
+  * @public
+  */
+ export interface SpiderLinkExtractionOptions {
+     /** Configuration for the LinkExtractorService */
+     readonly linkExtractorConfig?: LinkExtractorConfig;
+     /** Whether to use enhanced extraction in addition to basic extraction (default: false) */
+     readonly useEnhancedExtraction?: boolean;
+     /** Whether to replace basic extraction with enhanced extraction (default: true) */
+     readonly replaceBasicExtraction?: boolean;
+     /** Data extraction configuration for structured data extraction */
+     readonly extractData?: DataExtractionConfig;
+ }
+ declare const SpiderService_base: Effect.Service.Class<SpiderService, "@jambudipa/spider", {
+     readonly effect: Effect.Effect<{
+         /**
+          * Starts crawling from the specified URL and processes results through the provided sink.
+          *
+          * This method:
+          * 1. Validates the starting URL against configuration rules
+          * 2. Starts a configurable number of worker fibers
+          * 3. Each worker processes URLs from a shared queue
+          * 4. Results are streamed through the provided sink
+          * 5. New URLs discovered are queued for processing
+          *
+          * @param startingUrls - The starting URL(s) for crawling (single string or array)
+          * @param sink - Sink to process crawl results as they're produced
+          * @param options - Optional enhanced link extraction configuration
+          * @returns Effect containing crawl statistics (total pages, completion status)
+          *
+          * @example
+          * Basic usage:
+          * ```typescript
+          * const collectSink = Sink.forEach<CrawlResult>(result =>
+          *   Effect.sync(() => console.log(`Found: ${result.pageData.title}`))
+          * );
+          *
+          * const stats = yield* spider.crawl('https://example.com', collectSink);
+          * ```
+          *
+          * With multiple starting URLs:
+          * ```typescript
+          * const stats = yield* spider.crawl([
+          *   'https://example.com',
+          *   'https://other-domain.com'
+          * ], collectSink);
+          * ```
+          *
+          * With enhanced link extraction:
+          * ```typescript
+          * const stats = yield* spider.crawl('https://example.com', collectSink, {
+          *   useEnhancedExtraction: true,
+          *   linkExtractorConfig: {
+          *     allowPatterns: [/\/articles\//],
+          *     restrictCss: ['.content a']
+          *   }
+          * });
+          * ```
+          */
+         crawl: <A, E, R>(startingUrls: string | string[] | {
+             url: string;
+             metadata?: Record<string, unknown>;
+         } | {
+             url: string;
+             metadata?: Record<string, unknown>;
+         }[], sink: Sink.Sink<A, CrawlResult, E, R>, options?: SpiderLinkExtractionOptions) => Effect.Effect<{
+             completed: boolean;
+         }, unknown, import("../Config/SpiderConfig.service.js").SpiderConfigService | SpiderLogger>;
+         crawlSingle: <A, E, R>(urlString: string, sink: Sink.Sink<A, CrawlResult, E, R>, options?: SpiderLinkExtractionOptions, initialMetadata?: Record<string, unknown>, restrictToStartingDomain?: boolean) => Effect.Effect<{
+             completed: boolean;
+             pagesScraped: number;
+             domain: string;
+         }, unknown, import("../Config/SpiderConfig.service.js").SpiderConfigService | SpiderLogger>;
+         /**
+          * Resume a previous crawling session from persistent storage.
+          *
+          * This method requires resumability to be enabled in the SpiderConfig and
+          * a StatePersistence implementation to be configured. It will restore the
+          * crawling state and continue processing from where it left off.
+          *
+          * @param stateKey - The unique identifier for the session to resume
+          * @param sink - Sink to process crawl results as they're produced
+          * @param persistence - Optional persistence implementation (uses configured one if not provided)
+          * @returns Effect containing crawl statistics
+          *
+          * @example
+          * ```typescript
+          * const stateKey = new SpiderStateKey({
+          *   id: 'my-crawl-session',
+          *   timestamp: new Date('2024-01-01'),
+          *   name: 'Example Crawl'
+          * });
+          *
+          * const collectSink = Sink.forEach<CrawlResult>(result =>
+          *   Effect.sync(() => console.log(`Resumed: ${result.pageData.title}`))
+          * );
+          *
+          * const stats = yield* spider.resume(stateKey, collectSink);
+          * ```
+          */
+         resume: <A, E, R>(stateKey: import("../Scheduler/SpiderScheduler.service.js").SpiderStateKey, resumeSink: Sink.Sink<A, CrawlResult, E, R>, _persistence?: import("../Scheduler/SpiderScheduler.service.js").StatePersistence) => Effect.Effect<{
+             resumed: boolean;
+             sessionId: string;
+             completed: boolean;
+             urlsProcessed?: undefined;
+         } | {
+             completed: boolean;
+             resumed: boolean;
+             sessionId: string;
+             urlsProcessed: number;
+         }, unknown, import("../Config/SpiderConfig.service.js").SpiderConfigService | SpiderLogger | SpiderSchedulerService>;
+         /**
+          * Returns the list of URLs that have been visited during crawling.
+          *
+          * @returns Effect containing array of visited URLs
+          *
+          * @remarks
+          * This is currently a placeholder implementation. In a future version,
+          * this will return the actual list of visited URLs from the current session.
+          */
+         getVisitedUrls: () => Effect.Effect<string[]>;
+     }, never, SpiderLogger | ScraperService | RobotsService | LinkExtractorService>;
+     readonly dependencies: readonly [import("effect/Layer").Layer<RobotsService, never, never>, import("effect/Layer").Layer<ScraperService, never, never>, import("effect/Layer").Layer<UrlDeduplicatorService, never, never>, import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>, import("effect/Layer").Layer<LinkExtractorService, never, never>, import("effect/Layer").Layer<SpiderLogger, never, never>];
+ }>;
+ export declare class SpiderService extends SpiderService_base {
+ }
+ export type { CrawlResult, CrawlTask };
+ //# sourceMappingURL=Spider.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"Spider.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Spider/Spider.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAGL,MAAM,EASN,IAAI,EAEL,MAAM,QAAQ,CAAC;AAIhB,OAAO,EAAE,sBAAsB,EAAE,MAAM,+CAA+C,CAAC;AACvF,OAAO,EAAE,cAAc,EAAE,MAAM,+BAA+B,CAAC;AAC/D,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EACL,KAAK,mBAAmB,EACxB,oBAAoB,EACrB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EAAE,sBAAsB,EAAE,MAAM,yCAAyC,CAAC;AAEjF,OAAO,EACL,YAAY,EAEb,MAAM,oCAAoC,CAAC;AAG5C;;;;;GAKG;AACH,UAAU,iBAAiB;IACzB,8CAA8C;IAC9C,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,0EAA0E;IAC1E,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;CAC7B;AAED;;;;;GAKG;AACH,UAAU,qBAAqB;IAC7B,uCAAuC;IACvC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,2CAA2C;IAC3C,QAAQ,CAAC,IAAI,CAAC,EAAE,OAAO,CAAC;IACxB,gDAAgD;IAChD,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAC5B,yCAAyC;IACzC,QAAQ,CAAC,QAAQ,CAAC,EAAE,OAAO,CAAC;IAC5B,gDAAgD;IAChD,QAAQ,CAAC,MAAM,CAAC,EAAE,OAAO,CAAC;IAC1B,yDAAyD;IACzD,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,iBAAiB,CAAC,CAAC;CACrD;AAED;;;;;;GAMG;AACH,KAAK,yBAAyB,GAAG,MAAM,GAAG,qBAAqB,CAAC;AAEhE;;;;;GAKG;AACH,KAAK,oBAAoB,GAAG,MAAM,CAAC,MAAM,EAAE,yBAAyB,CAAC,CAAC;AAEtE;;;;;GAKG;AACH,UAAU,SAAS;IACjB,4BAA4B;IAC5B,GAAG,EAAE,MAAM,CAAC;IACZ,+DAA+D;IAC/D,KAAK,EAAE,MAAM,CAAC;IACd,4DAA4D;IAC5D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,2DAA2D;IAC3D,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACnC,6CAA6C;IAC7C,WAAW,CAAC,EAAE,oBAAoB,CAAC;CACpC;AAED;;;;;;;;GAQG;AACH,UAAU,WAAW;IACnB,qEAAqE;IACrE,QAAQ,EAAE,QAAQ,CAAC;IACnB,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAC;IACd,iCAAiC;IACjC,SAAS,EAAE,IAAI,CAAC;IAChB,iEAAiE;IACjE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH;;;;;GAKG;AACH,MAAM,WAAW,2BAA2B;IAC1C,iDAAiD;IACjD,QAAQ,CAAC,mBAAmB,CAAC,EAAE,mBAAmB,CAAC;IACnD,0FAA0F;IAC1F,QAAQ,CAAC,qBAAqB,CAAC,EAAE,OAAO,CAAC;IACzC,mFAAmF;IACnF,QAAQ,CAAC,sBAAsB,CAAC,EAAE,OAAO,CAAC;IAC1C,mEAAmE;IACnE,QAAQ,CAAC,WAAW,CAAC,EAAE,oBAAoB,CAAC;CAC7C;;;QAqBO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WA2CG;gBACK,CAAC,EAAE,CAAC,EAAE,CAAC,gBAET,MAAM,GACN,MAAM,EAAE,GACR;YAAE,GAAG,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;SAAE,GACnD;YAAE,GAAG,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;SAAE,EAAE,QACnD,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,YAC3B,2BAA2B;;;sBA0HzB,CAAC,EAAE,CAAC,EAAE,CAAC,aACR,MAAM,QACX,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,YAC3B,2BAA2B,oBACnB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,6BACd,OAAO;;;;;QA6gCpC;;;;;;;;;;;;;;;;;;;;;;;;;;WA0BG;iBACM,CAAC,EAAE,CAAC,EAAE,CAAC,YACJ,OAAO,yCAAyC,EAAE,cAAc,cAC9D,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,iBAC5B,OAAO,yCAAyC,EAAE,gBAAgB;;;;;;;;;;;QAyInF;;;;;;;;WAQG;8BACiB,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;;;;AAl4CnD,qBAAa,aAAc,SAAQ,kBAg5ClC;CAAG;AAEJ,YAAY,EAAE,WAAW,EAAE,SAAS,EAAE,CAAC"}
@@ -0,0 +1,107 @@
+ /**
+  * State Manager Service
+  * Manages tokens, sessions, and client-side storage simulation
+  */
+ import { Context, Effect, Layer } from 'effect';
+ declare const CSRFTokenNotFoundError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+     readonly _tag: "CSRFTokenNotFoundError";
+ } & Readonly<A>;
+ export declare class CSRFTokenNotFoundError extends CSRFTokenNotFoundError_base<{
+     readonly message: string;
+ }> {
+ }
+ declare const APITokenNotFoundError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+     readonly _tag: "APITokenNotFoundError";
+ } & Readonly<A>;
+ export declare class APITokenNotFoundError extends APITokenNotFoundError_base<{
+     readonly message: string;
+ }> {
+ }
+ declare const TokenNotFoundError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+     readonly _tag: "TokenNotFoundError";
+ } & Readonly<A>;
+ export declare class TokenNotFoundError extends TokenNotFoundError_base<{
+     readonly message: string;
+     readonly tokenType: TokenType;
+ }> {
+ }
+ declare const TokenExpiredError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+     readonly _tag: "TokenExpiredError";
+ } & Readonly<A>;
+ export declare class TokenExpiredError extends TokenExpiredError_base<{
+     readonly message: string;
+     readonly tokenType: TokenType;
+ }> {
+ }
+ declare const StorageKeyNotFoundError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+     readonly _tag: "StorageKeyNotFoundError";
+ } & Readonly<A>;
+ export declare class StorageKeyNotFoundError extends StorageKeyNotFoundError_base<{
+     readonly message: string;
+     readonly key: string;
+     readonly storageType: 'local' | 'session';
+ }> {
+ }
+ export declare enum TokenType {
+     CSRF = "csrf",
+     API = "api",
+     AUTH = "auth",
+     REFRESH = "refresh"
+ }
+ export interface Token {
+     type: TokenType;
+     value: string;
+     expiry?: Date;
+     scope?: string[];
+ }
+ export interface StateManagerService {
+     /**
+      * Extract CSRF token from HTML
+      */
+     extractCSRFToken: (html: string) => Effect.Effect<string, CSRFTokenNotFoundError>;
+     /**
+      * Extract API token from JavaScript
+      */
+     extractAPIToken: (scripts: string[]) => Effect.Effect<string, APITokenNotFoundError>;
+     /**
+      * Store a token
+      */
+     storeToken: (type: TokenType, token: string, expiry?: Date) => Effect.Effect<void>;
+     /**
+      * Get a stored token
+      */
+     getToken: (type: TokenType) => Effect.Effect<string, TokenNotFoundError | TokenExpiredError>;
+     /**
+      * Check if token is valid (not expired)
+      */
+     isTokenValid: (type: TokenType) => Effect.Effect<boolean>;
+     /**
+      * Simulate local storage
+      */
+     setLocalStorage: (key: string, value: string) => Effect.Effect<void>;
+     getLocalStorage: (key: string) => Effect.Effect<string, StorageKeyNotFoundError>;
+     clearLocalStorage: () => Effect.Effect<void>;
+     /**
+      * Simulate session storage
+      */
+     setSessionStorage: (key: string, value: string) => Effect.Effect<void>;
+     getSessionStorage: (key: string) => Effect.Effect<string, StorageKeyNotFoundError>;
+     clearSessionStorage: () => Effect.Effect<void>;
+     /**
+      * Clear all state
+      */
+     clearState: () => Effect.Effect<void>;
+ }
+ declare const StateManager_base: Context.TagClass<StateManager, "StateManager", StateManagerService>;
+ export declare class StateManager extends StateManager_base {
+ }
+ /**
+  * Create a StateManager service implementation
+  */
+ export declare const makeStateManager: () => Effect.Effect<StateManagerService>;
+ /**
+  * StateManager Layer
+  */
+ export declare const StateManagerLive: Layer.Layer<StateManager, never, never>;
+ export {};
+ //# sourceMappingURL=StateManager.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"StateManager.service.d.ts","sourceRoot":"","sources":["../../../src/lib/StateManager/StateManager.service.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAkB,MAAM,EAAW,KAAK,EAAe,MAAM,QAAQ,CAAC;;;;AAItF,qBAAa,sBAAuB,SAAQ,4BAA2C;IACrF,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;;;;AAEL,qBAAa,qBAAsB,SAAQ,2BAA0C;IACnF,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B,CAAC;CAAG;;;;AAEL,qBAAa,kBAAmB,SAAQ,wBAAuC;IAC7E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,SAAS,EAAE,SAAS,CAAC;CAC/B,CAAC;CAAG;;;;AAEL,qBAAa,iBAAkB,SAAQ,uBAAsC;IAC3E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,SAAS,EAAE,SAAS,CAAC;CAC/B,CAAC;CAAG;;;;AAEL,qBAAa,uBAAwB,SAAQ,6BAA4C;IACvF,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,WAAW,EAAE,OAAO,GAAG,SAAS,CAAC;CAC3C,CAAC;CAAG;AAEL,oBAAY,SAAS;IACnB,IAAI,SAAS;IACb,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,OAAO,YAAY;CACpB;AAED,MAAM,WAAW,KAAK;IACpB,IAAI,EAAE,SAAS,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,IAAI,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;CAClB;AAED,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,gBAAgB,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,sBAAsB,CAAC,CAAC;IAElF;;OAEG;IACH,eAAe,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,qBAAqB,CAAC,CAAC;IAErF;;OAEG;IACH,UAAU,EAAE,CACV,IAAI,EAAE,SAAS,EACf,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,IAAI,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAEzB;;OAEG;IACH,QAAQ,EAAE,CAAC,IAAI,EAAE,SAAS,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,kBAAkB,GAAG,iBAAiB,CAAC,CAAC;IAE7F;;OAEG;IACH,YAAY,EAAE,CAAC,IAAI,EAAE,SAAS,KAAK,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE1D;;OAEG;IACH,eAAe,EAAE,CACf,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,eAAe,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,uBAAuB,CAAC,CAAC;IACjF,iBAAiB,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE7C;;OAEG;IACH,iBAAiB,EAAE,CACjB,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACzB,iBAAiB,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,uBAAuB,CAAC,CAAC;IACnF,mBAAmB,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IAE/C;;OAEG;IACH,UAAU,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CACvC;;AAED,qBAAa,YAAa,SAAQ,iBAG/B;CAAG;AAEN;;GAEG;AACH,eAAO,MAAM,gBAAgB,QAAO,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAqNjE,CAAC;AAEL;;GAEG;AACH,eAAO,MAAM,gBAAgB,yCAAiD,CAAC"}
@@ -0,0 +1,5 @@
+ /**
+  * State Manager module exports
+  */
+ export * from './StateManager.service.js';
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/StateManager/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,2BAA2B,CAAC"}
@@ -0,0 +1,58 @@
+ import { Effect, MutableHashSet } from 'effect';
+ /**
+  * Thread-safe URL deduplication service with built-in normalization.
+  *
+  * Provides atomic operations for checking and adding URLs to prevent
+  * race conditions in concurrent environments. URLs are normalized
+  * before storage to ensure consistent deduplication.
+  *
+  * @group Services
+  * @public
+  */
+ export interface IUrlDeduplicator {
+     /**
+      * Attempts to add a URL to the deduplication set.
+      *
+      * @param url - The URL to add
+      * @returns Effect containing boolean - true if URL was added (first time seen), false if already exists
+      */
+     tryAdd(_url: string): Effect.Effect<boolean>;
+     /**
+      * Checks if a URL has already been seen.
+      *
+      * @param url - The URL to check
+      * @returns Effect containing boolean - true if URL exists, false otherwise
+      */
+     contains(_url: string): Effect.Effect<boolean>;
+     /**
+      * Returns the current number of unique URLs in the set.
+      *
+      * @returns Effect containing the count
+      */
+     size(): Effect.Effect<number>;
+     /**
+      * Clears all URLs from the deduplication set.
+      *
+      * @returns Effect containing void
+      */
+     clear(): Effect.Effect<void>;
+ }
+ declare const UrlDeduplicatorService_base: Effect.Service.Class<UrlDeduplicatorService, "@jambudipa.io/UrlDeduplicatorService", {
+     readonly effect: Effect.Effect<{
+         tryAdd: (url: string) => Effect.Effect<boolean, never, never>;
+         contains: (url: string) => Effect.Effect<boolean, never, never>;
+         size: () => Effect.Effect<number, never, never>;
+         clear: () => Effect.Effect<MutableHashSet.MutableHashSet<string>, never, never>;
+     }, never, import("../Config/SpiderConfig.service.js").SpiderConfigService>;
+     readonly dependencies: readonly [import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>];
+ }>;
+ /**
+  * URL deduplication service as an Effect Service.
+  *
+  * @group Services
+  * @public
+  */
+ export declare class UrlDeduplicatorService extends UrlDeduplicatorService_base {
+ }
+ export {};
+ //# sourceMappingURL=UrlDeduplicator.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"UrlDeduplicator.service.d.ts","sourceRoot":"","sources":["../../../src/lib/UrlDeduplicator/UrlDeduplicator.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,QAAQ,CAAC;AAGhD;;;;;;;;;GASG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;OAKG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE7C;;;;;OAKG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE/C;;;;OAIG;IACH,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAE9B;;;;OAIG;IACH,KAAK,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CAC9B;;;sBA4EqB,MAAM;wBAcJ,MAAM;;;;;;AAxF9B;;;;;GAKG;AACH,qBAAa,sBAAuB,SAAQ,2BAuG3C;CAAG"}