@jambudipa/spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +21 -0
  2. package/README.md +426 -0
  3. package/dist/index.d.ts +33 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +4681 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
  8. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
  9. package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
  10. package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
  11. package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
  12. package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
  13. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
  14. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
  15. package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
  16. package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
  17. package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
  18. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
  19. package/dist/lib/HttpClient/index.d.ts +8 -0
  20. package/dist/lib/HttpClient/index.d.ts.map +1 -0
  21. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
  22. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
  23. package/dist/lib/LinkExtractor/index.d.ts +37 -0
  24. package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
  25. package/dist/lib/Logging/FetchLogger.d.ts +8 -0
  26. package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
  27. package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
  28. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
  29. package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
  30. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
  31. package/dist/lib/PageData/PageData.d.ts +28 -0
  32. package/dist/lib/PageData/PageData.d.ts.map +1 -0
  33. package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
  34. package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
  35. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
  36. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
  37. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
  38. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
  39. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
  40. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
  41. package/dist/lib/Resumability/index.d.ts +51 -0
  42. package/dist/lib/Resumability/index.d.ts.map +1 -0
  43. package/dist/lib/Resumability/strategies.d.ts +76 -0
  44. package/dist/lib/Resumability/strategies.d.ts.map +1 -0
  45. package/dist/lib/Resumability/types.d.ts +201 -0
  46. package/dist/lib/Resumability/types.d.ts.map +1 -0
  47. package/dist/lib/Robots/Robots.service.d.ts +78 -0
  48. package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
  49. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
  50. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
  51. package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
  52. package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
  53. package/dist/lib/Spider/Spider.service.d.ts +194 -0
  54. package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
  55. package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
  56. package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
  57. package/dist/lib/StateManager/index.d.ts +5 -0
  58. package/dist/lib/StateManager/index.d.ts.map +1 -0
  59. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
  60. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
  61. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
  62. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
  63. package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
  64. package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
  65. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
  66. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
  67. package/dist/lib/api-facades.d.ts +313 -0
  68. package/dist/lib/api-facades.d.ts.map +1 -0
  69. package/dist/lib/errors.d.ts +99 -0
  70. package/dist/lib/errors.d.ts.map +1 -0
  71. package/package.json +108 -0
@@ -0,0 +1,194 @@
+ import { Effect, Sink } from 'effect';
+ import { UrlDeduplicatorService } from '../UrlDeduplicator/UrlDeduplicator.service.js';
+ import { ScraperService } from '../Scraper/Scraper.service.js';
+ import { PageData } from '../PageData/PageData.js';
+ import { RobotsService } from '../Robots/Robots.service.js';
+ import { type LinkExtractorConfig, LinkExtractorService } from '../LinkExtractor/index.js';
+ import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
+ /**
+ * Represents a single crawling task with URL and depth information.
+ *
+ * @group Data Types
+ * @public
+ */
+ interface CrawlTask {
+ /** The URL to be crawled */
+ url: string;
+ /** The depth level of this URL relative to the starting URL */
+ depth: number;
+ /** The URL from which this URL was discovered (optional) */
+ fromUrl?: string;
+ /** Optional metadata to be passed through to the result */
+ metadata?: Record<string, unknown>;
+ /** Optional data extraction configuration */
+ extractData?: Record<string, any>;
+ }
+ /**
+ * The result of a successful crawl operation.
+ *
+ * Contains all extracted information from a crawled page along with
+ * metadata about when and at what depth it was processed.
+ *
+ * @group Data Types
+ * @public
+ */
+ interface CrawlResult {
+ /** The extracted page data including content, links, and metadata */
+ pageData: PageData;
+ /** The depth at which this page was crawled */
+ depth: number;
+ /** When this page was crawled */
+ timestamp: Date;
+ /** Optional metadata passed through from the original request */
+ metadata?: Record<string, unknown>;
+ }
+ /**
+ * The main Spider service that orchestrates web crawling operations.
+ *
+ * This service provides the core functionality for crawling websites, including:
+ * - URL validation and filtering based on configuration
+ * - Robots.txt compliance checking
+ * - Concurrent crawling with configurable worker pools
+ * - Request scheduling and rate limiting
+ * - Result streaming through Effect sinks
+ *
+ * @example
+ * ```typescript
+ * const program = Effect.gen(function* () {
+ * const spider = yield* Spider;
+ * const collectSink = Sink.forEach<CrawlResult>(result =>
+ * Effect.sync(() => console.log(result.pageData.url))
+ * );
+ *
+ * const stats = yield* spider.crawl('https://example.com', collectSink);
+ * console.log(`Crawled ${stats.totalPages} pages`);
+ * });
+ * ```
+ *
+ * @group Services
+ * @public
+ */
+ /**
+ * Options for enhanced link extraction during crawling.
+ *
+ * @group Configuration
+ * @public
+ */
+ export interface SpiderLinkExtractionOptions {
+ /** Configuration for the LinkExtractorService */
+ readonly linkExtractorConfig?: LinkExtractorConfig;
+ /** Whether to use enhanced extraction in addition to basic extraction (default: false) */
+ readonly useEnhancedExtraction?: boolean;
+ /** Whether to replace basic extraction with enhanced extraction (default: true) */
+ readonly replaceBasicExtraction?: boolean;
+ /** Data extraction configuration for structured data extraction */
+ readonly extractData?: Record<string, any>;
+ }
+ declare const SpiderService_base: Effect.Service.Class<SpiderService, "@jambudipa.io/Spider", {
+ readonly effect: Effect.Effect<{
+ /**
+ * Starts crawling from the specified URL and processes results through the provided sink.
+ *
+ * This method:
+ * 1. Validates the starting URL against configuration rules
+ * 2. Starts a configurable number of worker fibers
+ * 3. Each worker processes URLs from a shared queue
+ * 4. Results are streamed through the provided sink
+ * 5. New URLs discovered are queued for processing
+ *
+ * @param startingUrls - The starting URL(s) for crawling (single string or array)
+ * @param sink - Sink to process crawl results as they're produced
+ * @param options - Optional enhanced link extraction configuration
+ * @returns Effect containing crawl statistics (total pages, completion status)
+ *
+ * @example
+ * Basic usage:
+ * ```typescript
+ * const collectSink = Sink.forEach<CrawlResult>(result =>
+ * Effect.sync(() => console.log(`Found: ${result.pageData.title}`))
+ * );
+ *
+ * const stats = yield* spider.crawl('https://example.com', collectSink);
+ * ```
+ *
+ * With multiple starting URLs:
+ * ```typescript
+ * const stats = yield* spider.crawl([
+ * 'https://example.com',
+ * 'https://other-domain.com'
+ * ], collectSink);
+ * ```
+ *
+ * With enhanced link extraction:
+ * ```typescript
+ * const stats = yield* spider.crawl('https://example.com', collectSink, {
+ * useEnhancedExtraction: true,
+ * linkExtractorConfig: {
+ * allowPatterns: [/\/articles\//],
+ * restrictCss: ['.content a']
+ * }
+ * });
+ * ```
+ */
+ crawl: <A, E, R>(startingUrls: string | string[] | {
+ url: string;
+ metadata?: Record<string, unknown>;
+ } | {
+ url: string;
+ metadata?: Record<string, unknown>;
+ }[], sink: Sink.Sink<A, CrawlResult, E, R>, options?: SpiderLinkExtractionOptions) => Effect.Effect<{
+ completed: boolean;
+ }, unknown, import("../Config/SpiderConfig.service.js").SpiderConfigService | SpiderLogger>;
+ crawlSingle: <A, E, R>(urlString: string, sink: Sink.Sink<A, CrawlResult, E, R>, options?: SpiderLinkExtractionOptions, initialMetadata?: Record<string, unknown>, restrictToStartingDomain?: boolean) => Effect.Effect<{
+ completed: boolean;
+ pagesScraped: number;
+ domain: string;
+ }, unknown, import("../Config/SpiderConfig.service.js").SpiderConfigService | SpiderLogger>;
+ /**
+ * Resume a previous crawling session from persistent storage.
+ *
+ * This method requires resumability to be enabled in the SpiderConfig and
+ * a StatePersistence implementation to be configured. It will restore the
+ * crawling state and continue processing from where it left off.
+ *
+ * @param stateKey - The unique identifier for the session to resume
+ * @param sink - Sink to process crawl results as they're produced
+ * @param persistence - Optional persistence implementation (uses configured one if not provided)
+ * @returns Effect containing crawl statistics
+ *
+ * @example
+ * ```typescript
+ * const stateKey = new SpiderStateKey({
+ * id: 'my-crawl-session',
+ * timestamp: new Date('2024-01-01'),
+ * name: 'Example Crawl'
+ * });
+ *
+ * const collectSink = Sink.forEach<CrawlResult>(result =>
+ * Effect.sync(() => console.log(`Resumed: ${result.pageData.title}`))
+ * );
+ *
+ * const stats = yield* spider.resume(stateKey, collectSink);
+ * ```
+ */
+ resume: <A, E, R>(stateKey: import("../Scheduler/SpiderScheduler.service.js").SpiderStateKey, _sink: Sink.Sink<A, CrawlResult, E, R>, _persistence?: import("../Scheduler/SpiderScheduler.service.js").StatePersistence) => Effect.Effect<{
+ completed: boolean;
+ resumed: boolean;
+ }, Error, import("../Config/SpiderConfig.service.js").SpiderConfigService>;
+ /**
+ * Returns the list of URLs that have been visited during crawling.
+ *
+ * @returns Effect containing array of visited URLs
+ *
+ * @remarks
+ * This is currently a placeholder implementation. In a future version,
+ * this will return the actual list of visited URLs from the current session.
+ */
+ getVisitedUrls: () => Effect.Effect<string[], never, never>;
+ }, never, SpiderLogger | ScraperService | RobotsService | LinkExtractorService>;
+ readonly dependencies: readonly [import("effect/Layer").Layer<RobotsService, never, never>, import("effect/Layer").Layer<ScraperService, never, never>, import("effect/Layer").Layer<UrlDeduplicatorService, never, never>, import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>, import("effect/Layer").Layer<LinkExtractorService, never, never>, import("effect/Layer").Layer<SpiderLogger, never, never>];
+ }>;
+ export declare class SpiderService extends SpiderService_base {
+ }
+ export type { CrawlResult, CrawlTask };
+ //# sourceMappingURL=Spider.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"Spider.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Spider/Spider.service.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,MAAM,EAQN,IAAI,EAEL,MAAM,QAAQ,CAAC;AAGhB,OAAO,EAAE,sBAAsB,EAAE,MAAM,+CAA+C,CAAC;AACvF,OAAO,EAAE,cAAc,EAAE,MAAM,+BAA+B,CAAC;AAC/D,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EACL,KAAK,mBAAmB,EACxB,oBAAoB,EACrB,MAAM,2BAA2B,CAAC;AAEnC,OAAO,EACL,YAAY,EAEb,MAAM,oCAAoC,CAAC;AAE5C;;;;;GAKG;AACH,UAAU,SAAS;IACjB,4BAA4B;IAC5B,GAAG,EAAE,MAAM,CAAC;IACZ,+DAA+D;IAC/D,KAAK,EAAE,MAAM,CAAC;IACd,4DAA4D;IAC5D,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,2DAA2D;IAC3D,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACnC,6CAA6C;IAC7C,WAAW,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CACnC;AAED;;;;;;;;GAQG;AACH,UAAU,WAAW;IACnB,qEAAqE;IACrE,QAAQ,EAAE,QAAQ,CAAC;IACnB,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAC;IACd,iCAAiC;IACjC,SAAS,EAAE,IAAI,CAAC;IAChB,iEAAiE;IACjE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH;;;;;GAKG;AACH,MAAM,WAAW,2BAA2B;IAC1C,iDAAiD;IACjD,QAAQ,CAAC,mBAAmB,CAAC,EAAE,mBAAmB,CAAC;IACnD,0FAA0F;IAC1F,QAAQ,CAAC,qBAAqB,CAAC,EAAE,OAAO,CAAC;IACzC,mFAAmF;IACnF,QAAQ,CAAC,sBAAsB,CAAC,EAAE,OAAO,CAAC;IAC1C,mEAAmE;IACnE,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC5C;;;QAuBO;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WA2CG;gBACK,CAAC,EAAE,CAAC,EAAE,CAAC,gBAET,MAAM,GACN,MAAM,EAAE,GACR;YAAE,GAAG,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;SAAE,GACnD;YAAE,GAAG,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;SAAE,EAAE,QACnD,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,YAC3B,2BAA2B;;;sBA+HzB,CAAC,EAAE,CAAC,EAAE,CAAC,aACR,MAAM,QACX,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,YAC3B,2BAA2B,oBACnB,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,6BACd,OAAO;;;;;QA28BpC;;;;;;;;;;;;;;;;;;;;;;;;;;WA0BG;iBACM,CAAC,EAAE,CAAC,EAAE,CAAC,YACJ,OAAO,yCAAyC,EAAE,cAAc,SAEnE,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC,iBAEvB,OAAO,yCAAyC,EAAE,gBAAgB;;;;QA0CnF;;;;;;;;WAQG;;;;;AAzuCX,qBAAa,aAAc,SAAQ,kBAwvClC;CAAG;AAEJ,YAAY,EAAE,WAAW,EAAE,SAAS,EAAE,CAAC"}
@@ -0,0 +1,68 @@
+ /**
+ * State Manager Service
+ * Manages tokens, sessions, and client-side storage simulation
+ */
+ import { Context, Effect, Layer } from 'effect';
+ export declare enum TokenType {
+ CSRF = "csrf",
+ API = "api",
+ AUTH = "auth",
+ REFRESH = "refresh"
+ }
+ export interface Token {
+ type: TokenType;
+ value: string;
+ expiry?: Date;
+ scope?: string[];
+ }
+ export interface StateManagerService {
+ /**
+ * Extract CSRF token from HTML
+ */
+ extractCSRFToken: (html: string) => Effect.Effect<string, Error, never>;
+ /**
+ * Extract API token from JavaScript
+ */
+ extractAPIToken: (scripts: string[]) => Effect.Effect<string, Error, never>;
+ /**
+ * Store a token
+ */
+ storeToken: (type: TokenType, token: string, expiry?: Date) => Effect.Effect<void, never, never>;
+ /**
+ * Get a stored token
+ */
+ getToken: (type: TokenType) => Effect.Effect<string, Error, never>;
+ /**
+ * Check if token is valid (not expired)
+ */
+ isTokenValid: (type: TokenType) => Effect.Effect<boolean, never, never>;
+ /**
+ * Simulate local storage
+ */
+ setLocalStorage: (key: string, value: string) => Effect.Effect<void, never, never>;
+ getLocalStorage: (key: string) => Effect.Effect<string, Error, never>;
+ clearLocalStorage: () => Effect.Effect<void, never, never>;
+ /**
+ * Simulate session storage
+ */
+ setSessionStorage: (key: string, value: string) => Effect.Effect<void, never, never>;
+ getSessionStorage: (key: string) => Effect.Effect<string, Error, never>;
+ clearSessionStorage: () => Effect.Effect<void, never, never>;
+ /**
+ * Clear all state
+ */
+ clearState: () => Effect.Effect<void, never, never>;
+ }
+ declare const StateManager_base: Context.TagClass<StateManager, "StateManager", StateManagerService>;
+ export declare class StateManager extends StateManager_base {
+ }
+ /**
+ * Create a StateManager service implementation
+ */
+ export declare const makeStateManager: () => Effect.Effect<StateManagerService, never, never>;
+ /**
+ * StateManager Layer
+ */
+ export declare const StateManagerLive: Layer.Layer<StateManager, never, never>;
+ export {};
+ //# sourceMappingURL=StateManager.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"StateManager.service.d.ts","sourceRoot":"","sources":["../../../src/lib/StateManager/StateManager.service.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAO,MAAM,QAAQ,CAAC;AAGrD,oBAAY,SAAS;IACnB,IAAI,SAAS;IACb,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,OAAO,YAAY;CACpB;AAED,MAAM,WAAW,KAAK;IACpB,IAAI,EAAE,SAAS,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,IAAI,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;CAClB;AAED,MAAM,WAAW,mBAAmB;IAClC;;OAEG;IACH,gBAAgB,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAExE;;OAEG;IACH,eAAe,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE5E;;OAEG;IACH,UAAU,EAAE,CACV,IAAI,EAAE,SAAS,EACf,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,IAAI,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAEvC;;OAEG;IACH,QAAQ,EAAE,CAAC,IAAI,EAAE,SAAS,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAEnE;;OAEG;IACH,YAAY,EAAE,CAAC,IAAI,EAAE,SAAS,KAAK,MAAM,CAAC,MAAM,CAAC,OAAO,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAExE;;OAEG;IACH,eAAe,EAAE,CACf,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IACvC,eAAe,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IACtE,iBAAiB,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE3D;;OAEG;IACH,iBAAiB,EAAE,CACjB,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,KACV,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IACvC,iBAAiB,EAAE,CAAC,GAAG,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IACxE,mBAAmB,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE7D;;OAEG;IACH,UAAU,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;CACrD;;AAED,qBAAa,YAAa,SAAQ,iBAG/B;CAAG;AAEN;;GAEG;AACH,eAAO,MAAM,gBAAgB,QAAO,MAAM,CAAC,MAAM,CAC/C,mBAAmB,EACnB,KAAK,EACL,KAAK,CAgNH,CAAC;AAEL;;GAEG;AACH,eAAO,MAAM,gBAAgB,yCAAiD,CAAC"}
@@ -0,0 +1,5 @@
+ /**
+ * State Manager module exports
+ */
+ export * from './StateManager.service.js';
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/StateManager/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,2BAA2B,CAAC"}
@@ -0,0 +1,58 @@
+ import { Effect, MutableHashSet } from 'effect';
+ /**
+ * Thread-safe URL deduplication service with built-in normalization.
+ *
+ * Provides atomic operations for checking and adding URLs to prevent
+ * race conditions in concurrent environments. URLs are normalized
+ * before storage to ensure consistent deduplication.
+ *
+ * @group Services
+ * @public
+ */
+ export interface IUrlDeduplicator {
+ /**
+ * Attempts to add a URL to the deduplication set.
+ *
+ * @param url - The URL to add
+ * @returns Effect containing boolean - true if URL was added (first time seen), false if already exists
+ */
+ tryAdd(url: string): Effect.Effect<boolean>;
+ /**
+ * Checks if a URL has already been seen.
+ *
+ * @param url - The URL to check
+ * @returns Effect containing boolean - true if URL exists, false otherwise
+ */
+ contains(url: string): Effect.Effect<boolean>;
+ /**
+ * Returns the current number of unique URLs in the set.
+ *
+ * @returns Effect containing the count
+ */
+ size(): Effect.Effect<number>;
+ /**
+ * Clears all URLs from the deduplication set.
+ *
+ * @returns Effect containing void
+ */
+ clear(): Effect.Effect<void>;
+ }
+ declare const UrlDeduplicatorService_base: Effect.Service.Class<UrlDeduplicatorService, "@jambudipa.io/UrlDeduplicatorService", {
+ readonly effect: Effect.Effect<{
+ tryAdd: (url: string) => Effect.Effect<boolean, never, never>;
+ contains: (url: string) => Effect.Effect<boolean, never, never>;
+ size: () => Effect.Effect<number, never, never>;
+ clear: () => Effect.Effect<MutableHashSet.MutableHashSet<string>, never, never>;
+ }, never, import("../Config/SpiderConfig.service.js").SpiderConfigService>;
+ readonly dependencies: readonly [import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>];
+ }>;
+ /**
+ * URL deduplication service as an Effect Service.
+ *
+ * @group Services
+ * @public
+ */
+ export declare class UrlDeduplicatorService extends UrlDeduplicatorService_base {
+ }
+ export {};
+ //# sourceMappingURL=UrlDeduplicator.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"UrlDeduplicator.service.d.ts","sourceRoot":"","sources":["../../../src/lib/UrlDeduplicator/UrlDeduplicator.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,QAAQ,CAAC;AAGhD;;;;;;;;;GASG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;OAKG;IACH,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE5C;;;;;OAKG;IACH,QAAQ,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAE9C;;;;OAIG;IACH,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAE9B;;;;OAIG;IACH,KAAK,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;CAC9B;;;sBA2EqB,MAAM;wBAcJ,MAAM;;;;;;AAvF9B;;;;;GAKG;AACH,qBAAa,sBAAuB,SAAQ,2BAsG3C;CAAG"}
@@ -0,0 +1,77 @@
+ /**
+ * Web Scraping Engine Service
+ * Orchestrates all scraping capabilities including authentication, token management, and session handling
+ */
+ import { Context, Effect, Layer } from 'effect';
+ import { ScraperService } from '../Scraper/Scraper.service.js';
+ import { EnhancedHttpClient, type HttpResponse } from '../HttpClient/EnhancedHttpClient.js';
+ import { CookieManager } from '../HttpClient/CookieManager.js';
+ import { SessionStore } from '../HttpClient/SessionStore.js';
+ import { TokenExtractor } from '../HttpClient/TokenExtractor.js';
+ import { StateManager, TokenType } from '../StateManager/StateManager.service.js';
+ import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
+ export interface LoginCredentials {
+ username: string;
+ password: string;
+ loginUrl: string;
+ usernameField?: string;
+ passwordField?: string;
+ additionalFields?: Record<string, string>;
+ }
+ export interface ScrapingSession {
+ id: string;
+ authenticated: boolean;
+ tokens: Map<TokenType, string>;
+ startTime: Date;
+ }
+ export interface WebScrapingEngineService {
+ /**
+ * Perform login with form submission
+ */
+ login: (credentials: LoginCredentials) => Effect.Effect<ScrapingSession, Error, never>;
+ /**
+ * Fetch authenticated content
+ */
+ fetchAuthenticated: (url: string) => Effect.Effect<HttpResponse, Error, never>;
+ /**
+ * Submit form with CSRF protection
+ */
+ submitFormWithCSRF: (url: string, formData: Record<string, string>, csrfUrl?: string) => Effect.Effect<HttpResponse, Error, never>;
+ /**
+ * Make API request with token
+ */
+ makeAPIRequest: (url: string, method?: 'GET' | 'POST' | 'PUT' | 'DELETE', data?: any) => Effect.Effect<HttpResponse, Error, never>;
+ /**
+ * Create and save a scraping session
+ */
+ createSession: (id?: string) => Effect.Effect<ScrapingSession, Error, never>;
+ /**
+ * Load existing session
+ */
+ loadSession: (id: string) => Effect.Effect<ScrapingSession, Error, never>;
+ /**
+ * Export session for persistence
+ */
+ exportSession: () => Effect.Effect<string, Error, never>;
+ /**
+ * Import session from persistence
+ */
+ importSession: (data: string) => Effect.Effect<void, Error, never>;
+ /**
+ * Clear all state and sessions
+ */
+ clearAll: () => Effect.Effect<void, never, never>;
+ }
+ declare const WebScrapingEngine_base: Context.TagClass<WebScrapingEngine, "WebScrapingEngine", WebScrapingEngineService>;
+ export declare class WebScrapingEngine extends WebScrapingEngine_base {
+ }
+ /**
+ * Create a WebScrapingEngine service implementation
+ */
+ export declare const makeWebScrapingEngine: Effect.Effect<WebScrapingEngineService, never, SpiderLogger | ScraperService | CookieManager | EnhancedHttpClient | StateManager | SessionStore | TokenExtractor>;
+ /**
+ * WebScrapingEngine Layer with all dependencies
+ */
+ export declare const WebScrapingEngineLive: Layer.Layer<WebScrapingEngine, never, SpiderLogger | ScraperService | CookieManager | EnhancedHttpClient | StateManager | SessionStore | TokenExtractor>;
+ export {};
+ //# sourceMappingURL=WebScrapingEngine.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"WebScrapingEngine.service.d.ts","sourceRoot":"","sources":["../../../src/lib/WebScrapingEngine/WebScrapingEngine.service.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,QAAQ,CAAC;AAChD,OAAO,EAAE,cAAc,EAAE,MAAM,+BAA+B,CAAC;AAC/D,OAAO,EACL,kBAAkB,EAClB,KAAK,YAAY,EAClB,MAAM,qCAAqC,CAAC;AAC7C,OAAO,EAAE,aAAa,EAAE,MAAM,gCAAgC,CAAC;AAC/D,OAAO,EAAE,YAAY,EAAE,MAAM,+BAA+B,CAAC;AAC7D,OAAO,EAAE,cAAc,EAAE,MAAM,iCAAiC,CAAC;AACjE,OAAO,EACL,YAAY,EACZ,SAAS,EACV,MAAM,yCAAyC,CAAC;AACjD,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;AAElE,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,gBAAgB,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CAC3C;AAED,MAAM,WAAW,eAAe;IAC9B,EAAE,EAAE,MAAM,CAAC;IACX,aAAa,EAAE,OAAO,CAAC;IACvB,MAAM,EAAE,GAAG,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAC/B,SAAS,EAAE,IAAI,CAAC;CACjB;AAED,MAAM,WAAW,wBAAwB;IACvC;;OAEG;IACH,KAAK,EAAE,CACL,WAAW,EAAE,gBAAgB,KAC1B,MAAM,CAAC,MAAM,CAAC,eAAe,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAElD;;OAEG;IACH,kBAAkB,EAAE,CAClB,GAAG,EAAE,MAAM,KACR,MAAM,CAAC,MAAM,CAAC,YAAY,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE/C;;OAEG;IACH,kBAAkB,EAAE,CAClB,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAChC,OAAO,CAAC,EAAE,MAAM,KACb,MAAM,CAAC,MAAM,CAAC,YAAY,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE/C;;OAEG;IACH,cAAc,EAAE,CACd,GAAG,EAAE,MAAM,EACX,MAAM,CAAC,EAAE,KAAK,GAAG,MAAM,GAAG,KAAK,GAAG,QAAQ,EAC1C,IAAI,CAAC,EAAE,GAAG,KACP,MAAM,CAAC,MAAM,CAAC,YAAY,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE/C;;OAEG;IACH,aAAa,EAAE,CAAC,EAAE,CAAC,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,eAAe,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE7E;;OAEG;IACH,WAAW,EAAE,CAAC,EAAE,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,eAAe,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE1E;;OAEG;IACH,aAAa,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAEzD;;OAEG;IACH,aAAa,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAEnE;;OAEG;IACH,QAAQ,EAAE,MAAM,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;CACnD;;AAED,qBAAa,iBAAkB,SAAQ,sBAGpC;CAAG;AAEN;;GAEG;AACH,eAAO,MAAM,qBAAqB,mKAoRhC,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,qBAAqB,0JAGjC,CAAC"}
@@ -0,0 +1,5 @@
+ /**
+ * Web Scraping Engine module exports
+ */
+ export * from './WebScrapingEngine.service.js';
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/lib/WebScrapingEngine/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,gCAAgC,CAAC"}
@@ -0,0 +1,39 @@
+ import { Effect } from 'effect';
+ import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
+ interface WorkerStatus {
+ workerId: string;
+ domain: string;
+ currentUrl?: string;
+ lastActivity: Date;
+ fetchStartTime?: Date;
+ }
+ declare const WorkerHealthMonitor_base: Effect.Service.Class<WorkerHealthMonitor, "@jambudipa.io/WorkerHealthMonitor", {
+ readonly effect: Effect.Effect<{
+ /**
+ * Register a worker's activity
+ */
+ recordActivity: (workerId: string, domain: string, activity: {
+ url?: string;
+ fetchStart?: boolean;
+ }) => Effect.Effect<void, never, never>;
+ /**
+ * Remove a worker from monitoring
+ */
+ removeWorker: (workerId: string) => Effect.Effect<void, never, never>;
+ /**
+ * Get stuck workers
+ */
+ getStuckWorkers: Effect.Effect<WorkerStatus[], never, never>;
+ /**
+ * Monitor workers and log stuck ones
+ */
+ startMonitoring: Effect.Effect<void, never, never>;
+ }, never, SpiderLogger>;
+ }>;
+ /**
+ * Monitors worker health and kills stuck workers
+ */
+ export declare class WorkerHealthMonitor extends WorkerHealthMonitor_base {
+ }
+ export {};
+ //# sourceMappingURL=WorkerHealthMonitor.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"WorkerHealthMonitor.service.d.ts","sourceRoot":"","sources":["../../../src/lib/WorkerHealth/WorkerHealthMonitor.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAY,MAAM,EAAiB,MAAM,QAAQ,CAAC;AACzD,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;AAElE,UAAU,YAAY;IACpB,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,IAAI,CAAC;IACnB,cAAc,CAAC,EAAE,IAAI,CAAC;CACvB;;;QAcO;;WAEG;mCAES,MAAM,UACR,MAAM,YACJ;YAAE,GAAG,CAAC,EAAE,MAAM,CAAC;YAAC,UAAU,CAAC,EAAE,OAAO,CAAA;SAAE;QAuBlD;;WAEG;iCACsB,MAAM;QAO/B;;WAEG;;QAgBH;;WAEG;;;;AAvEX;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,wBAmHxC;CAAG"}