@jambudipa/spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/LICENSE +21 -0
  2. package/README.md +426 -0
  3. package/dist/index.d.ts +33 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +4681 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
  8. package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
  9. package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
  10. package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
  11. package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
  12. package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
  13. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
  14. package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
  15. package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
  16. package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
  17. package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
  18. package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
  19. package/dist/lib/HttpClient/index.d.ts +8 -0
  20. package/dist/lib/HttpClient/index.d.ts.map +1 -0
  21. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
  22. package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
  23. package/dist/lib/LinkExtractor/index.d.ts +37 -0
  24. package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
  25. package/dist/lib/Logging/FetchLogger.d.ts +8 -0
  26. package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
  27. package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
  28. package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
  29. package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
  30. package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
  31. package/dist/lib/PageData/PageData.d.ts +28 -0
  32. package/dist/lib/PageData/PageData.d.ts.map +1 -0
  33. package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
  34. package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
  35. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
  36. package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
  37. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
  38. package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
  39. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
  40. package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
  41. package/dist/lib/Resumability/index.d.ts +51 -0
  42. package/dist/lib/Resumability/index.d.ts.map +1 -0
  43. package/dist/lib/Resumability/strategies.d.ts +76 -0
  44. package/dist/lib/Resumability/strategies.d.ts.map +1 -0
  45. package/dist/lib/Resumability/types.d.ts +201 -0
  46. package/dist/lib/Resumability/types.d.ts.map +1 -0
  47. package/dist/lib/Robots/Robots.service.d.ts +78 -0
  48. package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
  49. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
  50. package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
  51. package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
  52. package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
  53. package/dist/lib/Spider/Spider.service.d.ts +194 -0
  54. package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
  55. package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
  56. package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
  57. package/dist/lib/StateManager/index.d.ts +5 -0
  58. package/dist/lib/StateManager/index.d.ts.map +1 -0
  59. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
  60. package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
  61. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
  62. package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
  63. package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
  64. package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
  65. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
  66. package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
  67. package/dist/lib/api-facades.d.ts +313 -0
  68. package/dist/lib/api-facades.d.ts.map +1 -0
  69. package/dist/lib/errors.d.ts +99 -0
  70. package/dist/lib/errors.d.ts.map +1 -0
  71. package/package.json +108 -0
package/dist/lib/Resumability/types.d.ts
@@ -0,0 +1,201 @@
+ import { Effect, Schema } from 'effect';
+ import { PriorityRequest, SpiderState, SpiderStateKey } from '../Scheduler/SpiderScheduler.service.js';
+ export { SpiderStateKey, PriorityRequest, SpiderState };
+ declare const StateDelta_base: Schema.Class<StateDelta, {
+     /** Session this delta applies to */
+     stateKey: typeof Schema.String;
+     /** Sequence number for ordering deltas */
+     sequence: typeof Schema.Number;
+     /** When this delta was created */
+     timestamp: typeof Schema.Date;
+     /** The operation that created this delta */
+     operation: Schema.Union<[Schema.Struct<{
+         type: Schema.Literal<["enqueue"]>;
+         request: typeof PriorityRequest;
+     }>, Schema.Struct<{
+         type: Schema.Literal<["dequeue"]>;
+         fingerprint: typeof Schema.String;
+     }>, Schema.Struct<{
+         type: Schema.Literal<["mark_visited"]>;
+         fingerprint: typeof Schema.String;
+     }>]>;
+ }, Schema.Struct.Encoded<{
+     /** Session this delta applies to */
+     stateKey: typeof Schema.String;
+     /** Sequence number for ordering deltas */
+     sequence: typeof Schema.Number;
+     /** When this delta was created */
+     timestamp: typeof Schema.Date;
+     /** The operation that created this delta */
+     operation: Schema.Union<[Schema.Struct<{
+         type: Schema.Literal<["enqueue"]>;
+         request: typeof PriorityRequest;
+     }>, Schema.Struct<{
+         type: Schema.Literal<["dequeue"]>;
+         fingerprint: typeof Schema.String;
+     }>, Schema.Struct<{
+         type: Schema.Literal<["mark_visited"]>;
+         fingerprint: typeof Schema.String;
+     }>]>;
+ }>, never, {
+     readonly operation: {
+         readonly type: "enqueue";
+         readonly request: PriorityRequest;
+     } | {
+         readonly type: "dequeue";
+         readonly fingerprint: string;
+     } | {
+         readonly type: "mark_visited";
+         readonly fingerprint: string;
+     };
+ } & {
+     readonly timestamp: Date;
+ } & {
+     readonly stateKey: string;
+ } & {
+     readonly sequence: number;
+ }, {}, {}>;
+ /**
+  * Delta operation that represents a single state change.
+  *
+  * Used for incremental persistence instead of saving the entire state
+  * on every operation, which is much more efficient for large crawls.
+  *
+  * @group Delta Updates
+  * @public
+  */
+ export declare class StateDelta extends StateDelta_base {
+ }
+ /**
+  * Represents a state change operation with both the delta and resulting state.
+  *
+  * This allows persistence strategies to choose whether to save deltas,
+  * full state, or both depending on their optimization needs.
+  *
+  * @group Operations
+  * @public
+  */
+ export interface StateOperation {
+     /** The incremental change */
+     readonly delta: StateDelta;
+     /** The complete state after applying this operation */
+     readonly resultingState: SpiderState;
+     /** Whether this operation should trigger a snapshot */
+     readonly shouldSnapshot: boolean;
+ }
+ declare const PersistenceError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
+     readonly _tag: "PersistenceError";
+ } & Readonly<A>;
+ /**
+  * Error that can occur during persistence operations.
+  *
+  * @group Errors
+  * @public
+  */
+ export declare class PersistenceError extends PersistenceError_base<{
+     readonly message: string;
+     readonly cause?: unknown;
+     readonly operation?: string;
+ }> {
+ }
+ /**
+  * Storage backend capabilities that determine optimal persistence strategy.
+  *
+  * Backends advertise their capabilities so the ResumabilityService can
+  * choose the best strategy automatically.
+  *
+  * @group Storage
+  * @public
+  */
+ export interface StorageCapabilities {
+     /** Can efficiently store and retrieve delta operations */
+     readonly supportsDelta: boolean;
+     /** Can efficiently store full state snapshots */
+     readonly supportsSnapshot: boolean;
+     /** Can handle streaming/batch operations */
+     readonly supportsStreaming: boolean;
+     /** Can handle concurrent access safely */
+     readonly supportsConcurrency: boolean;
+     /** Estimated latency category */
+     readonly latency: 'low' | 'medium' | 'high';
+ }
+ /**
+  * Generic storage backend interface that persistence strategies use.
+  *
+  * Backends implement the storage operations they support best.
+  * Not all methods need to be implemented - strategies will adapt.
+  *
+  * @group Storage
+  * @public
+  */
+ export interface StorageBackend {
+     /** Backend capabilities for strategy selection */
+     readonly capabilities: StorageCapabilities;
+     /** Storage backend identifier */
+     readonly name: string;
+     /** Initialize the backend (create tables, connections, etc.) */
+     initialize(): Effect.Effect<void, PersistenceError, never>;
+     /** Cleanup backend resources */
+     cleanup(): Effect.Effect<void, PersistenceError, never>;
+     saveState?(key: SpiderStateKey, state: SpiderState): Effect.Effect<void, PersistenceError, never>;
+     loadState?(key: SpiderStateKey): Effect.Effect<SpiderState | null, PersistenceError, never>;
+     deleteState?(key: SpiderStateKey): Effect.Effect<void, PersistenceError, never>;
+     saveDelta?(delta: StateDelta): Effect.Effect<void, PersistenceError, never>;
+     saveDeltas?(deltas: StateDelta[]): Effect.Effect<void, PersistenceError, never>;
+     loadDeltas?(key: SpiderStateKey, fromSequence?: number): Effect.Effect<StateDelta[], PersistenceError, never>;
+     saveSnapshot?(key: SpiderStateKey, state: SpiderState, sequence: number): Effect.Effect<void, PersistenceError, never>;
+     loadLatestSnapshot?(key: SpiderStateKey): Effect.Effect<{
+         state: SpiderState;
+         sequence: number;
+     } | null, PersistenceError, never>;
+     compactDeltas?(key: SpiderStateKey, beforeSequence: number): Effect.Effect<void, PersistenceError, never>;
+     listSessions?(): Effect.Effect<SpiderStateKey[], PersistenceError, never>;
+ }
+ /**
+  * Core strategy interface for different persistence approaches.
+  *
+  * Strategies implement the logic for when and how to persist state,
+  * using the storage backend for actual I/O operations.
+  *
+  * @group Strategies
+  * @public
+  */
+ export interface PersistenceStrategy {
+     /** Persist a state operation */
+     persist(operation: StateOperation): Effect.Effect<void, PersistenceError, never>;
+     /** Restore state from storage */
+     restore(key: SpiderStateKey): Effect.Effect<SpiderState | null, PersistenceError, never>;
+     /** Clean up old data */
+     cleanup(key: SpiderStateKey): Effect.Effect<void, PersistenceError, never>;
+     /** Get strategy information */
+     getInfo(): {
+         readonly name: string;
+         readonly description: string;
+         readonly capabilities: string[];
+     };
+ }
+ /**
+  * Configuration for hybrid persistence strategy.
+  *
+  * Controls when to save snapshots vs deltas for optimal performance.
+  *
+  * @group Configuration
+  * @public
+  */
+ export interface HybridPersistenceConfig {
+     /** Save a full snapshot every N operations */
+     readonly snapshotInterval: number;
+     /** Maximum deltas to accumulate before forcing a snapshot */
+     readonly maxDeltasBeforeSnapshot: number;
+     /** Whether to compact old deltas after snapshots */
+     readonly compactionEnabled: boolean;
+     /** Batch multiple deltas together for efficiency */
+     readonly batchDeltas: boolean;
+     /** Batch size for delta operations */
+     readonly deltaBatchSize: number;
+ }
+ /**
+  * Default hybrid persistence configuration.
+  */
+ export declare const DEFAULT_HYBRID_CONFIG: HybridPersistenceConfig;
+ //# sourceMappingURL=types.d.ts.map
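
Worth noting for reviewers: `StorageBackend` only requires `capabilities`, `name`, `initialize` and `cleanup`; every storage method is optional, and per the JSDoc the strategies adapt to whatever a backend advertises. A minimal snapshot-only backend might look like the sketch below. It assumes the Resumability types are re-exported from the package root, which this diff doesn't confirm.

```typescript
import { Effect } from 'effect';
import type {
  SpiderState,
  SpiderStateKey,
  StorageBackend,
  StorageCapabilities
} from '@jambudipa/spider'; // assumed root re-export

// Snapshot-only backend backed by an in-process Map. The delta, streaming
// and snapshot-sequence methods are simply omitted; the strategy layer is
// expected to adapt to the advertised capabilities.
class InMemoryBackend implements StorageBackend {
  readonly name = 'in-memory';
  readonly capabilities: StorageCapabilities = {
    supportsDelta: false,
    supportsSnapshot: true,
    supportsStreaming: false,
    supportsConcurrency: false,
    latency: 'low'
  };

  private readonly states = new Map<string, SpiderState>();

  initialize() {
    return Effect.void; // nothing to set up for an in-process Map
  }

  cleanup() {
    return Effect.sync(() => this.states.clear());
  }

  saveState(key: SpiderStateKey, state: SpiderState) {
    return Effect.sync(() => void this.states.set(key.id, state));
  }

  loadState(key: SpiderStateKey) {
    return Effect.sync(() => this.states.get(key.id) ?? null);
  }

  deleteState(key: SpiderStateKey) {
    return Effect.sync(() => void this.states.delete(key.id));
  }
}
```

With `supportsDelta: false`, a strategy driven by `HybridPersistenceConfig` would presumably fall back to full snapshots on every persist rather than batching deltas.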
package/dist/lib/Resumability/types.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/lib/Resumability/types.ts"],"names":[],"mappings":"AAAA,OAAO,EAAQ,MAAM,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EACL,eAAe,EACf,WAAW,EACX,cAAc,EACf,MAAM,yCAAyC,CAAC;AAGjD,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,WAAW,EAAE,CAAC;;IAYtD,oCAAoC;;IAEpC,0CAA0C;;IAE1C,kCAAkC;;IAElC,4CAA4C;;;;;;;;;;;;IAN5C,oCAAoC;;IAEpC,0CAA0C;;IAE1C,kCAAkC;;IAElC,4CAA4C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAhB9C;;;;;;;;GAQG;AACH,qBAAa,UAAW,SAAQ,eAsB9B;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,cAAc;IAC7B,6BAA6B;IAC7B,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;IAC3B,uDAAuD;IACvD,QAAQ,CAAC,cAAc,EAAE,WAAW,CAAC;IACrC,uDAAuD;IACvD,QAAQ,CAAC,cAAc,EAAE,OAAO,CAAC;CAClC;;;;AAED;;;;;GAKG;AACH,qBAAa,gBAAiB,SAAQ,sBAAqC;IACzE,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;CAC7B,CAAC;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,mBAAmB;IAClC,0DAA0D;IAC1D,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC;IAChC,iDAAiD;IACjD,QAAQ,CAAC,gBAAgB,EAAE,OAAO,CAAC;IACnC,4CAA4C;IAC5C,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IACpC,0CAA0C;IAC1C,QAAQ,CAAC,mBAAmB,EAAE,OAAO,CAAC;IACtC,iCAAiC;IACjC,QAAQ,CAAC,OAAO,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;CAC7C;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,cAAc;IAC7B,kDAAkD;IAClD,QAAQ,CAAC,YAAY,EAAE,mBAAmB,CAAC;IAE3C,iCAAiC;IACjC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAEtB,gEAAgE;IAChE,UAAU,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAE3D,gCAAgC;IAChC,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAGxD,SAAS,CAAC,CACR,GAAG,EAAE,cAAc,EACnB,KAAK,EAAE,WAAW,GACjB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAChD,SAAS,CAAC,CACR,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAC9D,WAAW,CAAC,CACV,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAGhD,SAAS,CAAC,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAC5E,UAAU,CAAC,CACT,MAAM,EAAE,UAAU,EAAE,GACnB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAChD,UAAU,CAAC,CACT,GAAG,EAAE,cAAc,EACnB,YAAY,CAAC,EAAE,MAAM,GACpB,MAAM,CAAC,MAAM,CAAC,UAAU,EAAE,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAGxD,YAAY,CAAC,CACX,GAAG,EAAE,cAAc,EACnB,KAAK,EAAE,WAAW,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAChD,kBAAkB,CAAC,CACjB,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CACd;QAAE,KAAK,EAAE,WAAW,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,EAC/C,gBAAgB,EAChB,KAAK,CACN,CAAC;IAGF,aAAa,CAAC,CACZ,GAAG,EAAE,cAAc,EACnB,cAAc,EAAE,MAAM,GACrB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAChD,YAAY,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;CAC3E;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,mBAAmB;IAClC,gCAAgC;IAChC,OAAO,CACL,SAAS,EAAE,cAAc,GACxB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAEhD,iCAAiC;IACjC,OAAO,CACL,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAE9D,wBAAwB;IACxB,OAAO,CAAC,GAAG,EAAE,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAE3E,+BAA+B;IAC/B,OAAO,IAAI;QACT,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;QACtB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;QAC7B,QAAQ,CAAC,YAAY,EAAE,MAAM,EAAE,CAAC;KACjC,CAAC;CACH;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,uBAAuB;IACtC,8CAA8C;IAC9C,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAC;IAClC,6DAA6D;IAC7D,QAAQ,CAAC,uBAAuB,EAAE,MAAM,CAAC;IACzC,oDAAoD;IACpD,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IACpC,oDAAoD;IACpD,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC;IAC9B,sCAAsC;IACtC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;CACjC;AAED;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,uBAMnC,CAAC"}
package/dist/lib/Robots/Robots.service.d.ts
@@ -0,0 +1,78 @@
+ import { Effect, Option } from 'effect';
+ /**
+  * Parsed robots.txt rules for a specific user agent.
+  *
+  * Contains the disallowed paths and crawl delay settings extracted
+  * from a robots.txt file for a particular user agent string.
+  *
+  * @group Data Types
+  * @internal
+  */
+ interface RobotsRules {
+     /** Set of URL paths that are disallowed for this user agent */
+     disallowedPaths: Set<string>;
+     /** Optional crawl delay in seconds specified in robots.txt */
+     crawlDelay?: number;
+     /** The user agent these rules apply to */
+     userAgent: string;
+ }
+ declare const RobotsService_base: Effect.Service.Class<RobotsService, "@jambudipa.io/RobotsService", {
+     readonly effect: Effect.Effect<{
+         checkUrl: (urlString: string) => Effect.Effect<{
+             allowed: boolean;
+             crawlDelay?: undefined;
+         } | {
+             allowed: boolean;
+             crawlDelay: number | undefined;
+         }, never, never>;
+         getRules: (domain: string) => Effect.Effect<Option.Option<RobotsRules>, never, never>;
+     }, never, never>;
+ }>;
+ /**
+  * Service for parsing and enforcing robots.txt compliance.
+  *
+  * The RobotsService handles fetching, parsing, and caching robots.txt files
+  * to ensure compliant web crawling. It provides efficient URL checking with
+  * automatic caching to minimise network requests.
+  *
+  * **Key Features:**
+  * - Automatic robots.txt fetching and parsing
+  * - Intelligent caching to reduce redundant requests
+  * - User agent-specific rule enforcement
+  * - Crawl delay extraction and enforcement
+  * - Graceful error handling for malformed robots.txt files
+  *
+  * **Standards Compliance:**
+  * - Follows the Robots Exclusion Standard (RFC 9309)
+  * - Supports User-agent, Disallow, and Crawl-delay directives
+  * - Handles wildcard (*) user agent specifications
+  * - Case-insensitive user agent matching
+  *
+  * @example
+  * ```typescript
+  * const program = Effect.gen(function* () {
+  *   const robots = yield* RobotsService;
+  *
+  *   // Check if URL is allowed
+  *   const check = yield* robots.checkUrl('https://example.com/admin');
+  *   if (!check.allowed) {
+  *     console.log('URL blocked by robots.txt');
+  *     return;
+  *   }
+  *
+  *   // Apply crawl delay if specified
+  *   if (check.crawlDelay) {
+  *     yield* Effect.sleep(`${check.crawlDelay} seconds`);
+  *   }
+  *
+  *   // Proceed with crawling...
+  * });
+  * ```
+  *
+  * @group Services
+  * @public
+  */
+ export declare class RobotsService extends RobotsService_base {
+ }
+ export {};
+ //# sourceMappingURL=Robots.service.d.ts.map
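
The generated signature above also exposes `getRules`, which the JSDoc example doesn't cover. A small sketch of using it, assuming `RobotsService` is re-exported from the package root:

```typescript
import { Effect, Option } from 'effect';
import { RobotsService } from '@jambudipa/spider'; // assumed root re-export

// getRules returns an Option, since rules for a domain may not have been
// fetched yet (or the site may have no robots.txt).
const inspectRules = (domain: string) =>
  Effect.gen(function* () {
    const robots = yield* RobotsService;
    const rules = yield* robots.getRules(domain);
    return Option.match(rules, {
      onNone: () => `${domain}: no cached rules`,
      onSome: (r) =>
        `${domain}: ${r.disallowedPaths.size} disallowed path(s) for "${r.userAgent}"`
    });
  });
```

Note that `RobotsRules` itself is marked `@internal`, so the shape of the `onSome` value is not part of the public API.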
package/dist/lib/Robots/Robots.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"Robots.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Robots/Robots.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAkB,MAAM,EAAE,MAAM,QAAQ,CAAC;AAGxD;;;;;;;;GAQG;AACH,UAAU,WAAW;IACnB,+DAA+D;IAC/D,eAAe,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IAC7B,8DAA8D;IAC9D,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;CACnB;;;8BA6I6B,MAAM;;;;;;;2BAqDT,MAAM;;;AAhMjC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2CG;AACH,qBAAa,aAAc,SAAQ,kBA6JlC;CAAG"}
package/dist/lib/Scheduler/SpiderScheduler.service.d.ts
@@ -0,0 +1,211 @@
+ import { Effect, Schema } from 'effect';
+ import { CrawlTask } from '../Spider/Spider.service.js';
+ import { ConfigurationError } from '../errors.js';
+ declare const SpiderStateKey_base: Schema.Class<SpiderStateKey, {
+     /** Unique identifier for the session */
+     id: typeof Schema.String;
+     /** When the session was created */
+     timestamp: typeof Schema.Date;
+     /** Human-readable name for the session */
+     name: typeof Schema.String;
+ }, Schema.Struct.Encoded<{
+     /** Unique identifier for the session */
+     id: typeof Schema.String;
+     /** When the session was created */
+     timestamp: typeof Schema.Date;
+     /** Human-readable name for the session */
+     name: typeof Schema.String;
+ }>, never, {
+     readonly name: string;
+ } & {
+     readonly timestamp: Date;
+ } & {
+     readonly id: string;
+ }, {}, {}>;
+ /**
+  * Unique identifier for a spider crawling session.
+  *
+  * Used to identify and restore specific crawl sessions when using
+  * persistent storage. Each crawl session should have a unique key.
+  *
+  * @group Data Types
+  * @public
+  */
+ export declare class SpiderStateKey extends SpiderStateKey_base {
+ }
+ declare const PriorityRequest_base: Schema.Class<PriorityRequest, {
+     /** The crawl task containing URL and depth information */
+     request: Schema.Struct<{
+         url: typeof Schema.String;
+         depth: typeof Schema.Number;
+         fromUrl: Schema.optional<typeof Schema.String>;
+     }>;
+     /** Priority level (higher numbers processed first) */
+     priority: typeof Schema.Number;
+     /** When this request was created */
+     timestamp: typeof Schema.Date;
+     /** Unique fingerprint for deduplication */
+     fingerprint: typeof Schema.String;
+ }, Schema.Struct.Encoded<{
+     /** The crawl task containing URL and depth information */
+     request: Schema.Struct<{
+         url: typeof Schema.String;
+         depth: typeof Schema.Number;
+         fromUrl: Schema.optional<typeof Schema.String>;
+     }>;
+     /** Priority level (higher numbers processed first) */
+     priority: typeof Schema.Number;
+     /** When this request was created */
+     timestamp: typeof Schema.Date;
+     /** Unique fingerprint for deduplication */
+     fingerprint: typeof Schema.String;
+ }>, never, {
+     readonly timestamp: Date;
+ } & {
+     readonly priority: number;
+ } & {
+     readonly fingerprint: string;
+ } & {
+     readonly request: {
+         readonly url: string;
+         readonly depth: number;
+         readonly fromUrl?: string | undefined;
+     };
+ }, {}, {}>;
+ /**
+  * A crawl request with priority and metadata for scheduling.
+  *
+  * Requests are processed in priority order (higher numbers first),
+  * with FIFO ordering within the same priority level.
+  *
+  * @group Data Types
+  * @public
+  */
+ export declare class PriorityRequest extends PriorityRequest_base {
+ }
+ declare const SpiderState_base: Schema.Class<SpiderState, {
+     /** The state key identifying this session */
+     key: typeof SpiderStateKey;
+     /** All requests waiting to be processed */
+     pendingRequests: Schema.Array$<typeof PriorityRequest>;
+     /** Fingerprints of URLs already visited (for deduplication) */
+     visitedFingerprints: Schema.Array$<typeof Schema.String>;
+     /** Total number of requests processed so far */
+     totalProcessed: typeof Schema.Number;
+ }, Schema.Struct.Encoded<{
+     /** The state key identifying this session */
+     key: typeof SpiderStateKey;
+     /** All requests waiting to be processed */
+     pendingRequests: Schema.Array$<typeof PriorityRequest>;
+     /** Fingerprints of URLs already visited (for deduplication) */
+     visitedFingerprints: Schema.Array$<typeof Schema.String>;
+     /** Total number of requests processed so far */
+     totalProcessed: typeof Schema.Number;
+ }>, never, {
+     readonly key: SpiderStateKey;
+ } & {
+     readonly totalProcessed: number;
+ } & {
+     readonly pendingRequests: readonly PriorityRequest[];
+ } & {
+     readonly visitedFingerprints: readonly string[];
+ }, {}, {}>;
+ /**
+  * Complete state snapshot of a spider crawling session.
+  *
+  * This contains all information needed to resume a crawl session,
+  * including pending requests, visited URLs, and progress counters.
+  *
+  * @group Data Types
+  * @public
+  */
+ export declare class SpiderState extends SpiderState_base {
+ }
+ /**
+  * Generic interface for persisting spider state.
+  *
+  * Implementations can use any storage backend (filesystem, database, etc.)
+  * to save and restore crawling sessions. All operations are Effect-based
+  * for composability and error handling.
+  *
+  * @example
+  * ```typescript
+  * class FilePersistence implements StatePersistence {
+  *   saveState = (key: SpiderStateKey, state: SpiderState) =>
+  *     Effect.tryPromise(() => fs.writeFile(key.id + '.json', JSON.stringify(state)))
+  *
+  *   loadState = (key: SpiderStateKey) =>
+  *     Effect.tryPromise(() => fs.readFile(key.id + '.json').then(JSON.parse))
+  *
+  *   deleteState = (key: SpiderStateKey) =>
+  *     Effect.tryPromise(() => fs.unlink(key.id + '.json'))
+  * }
+  * ```
+  *
+  * @group Interfaces
+  * @public
+  */
+ export interface StatePersistence {
+     /** Saves the complete spider state to persistent storage */
+     saveState: (key: SpiderStateKey, state: SpiderState) => Effect.Effect<void, Error>;
+     /** Loads spider state from persistent storage, returns null if not found */
+     loadState: (key: SpiderStateKey) => Effect.Effect<SpiderState | null, Error>;
+     /** Deletes spider state from persistent storage */
+     deleteState: (key: SpiderStateKey) => Effect.Effect<void, Error>;
+ }
+ declare const SpiderSchedulerService_base: Effect.Service.Class<SpiderSchedulerService, "@jambudipa.io/SpiderSchedulerService", {
+     readonly effect: Effect.Effect<{
+         configurePersistence: (persistence: StatePersistence, stateKey: SpiderStateKey) => Effect.Effect<void, never, never>;
+         clearPersistence: () => Effect.Effect<void, never, never>;
+         enqueue: (request: CrawlTask, priority?: number) => Effect.Effect<boolean, Error, never>;
+         dequeue: () => Effect.Effect<PriorityRequest, Error, never>;
+         size: () => Effect.Effect<number, never, never>;
+         isEmpty: () => Effect.Effect<boolean, never, never>;
+         getState: () => Effect.Effect<SpiderState, ConfigurationError, never>;
+         restoreFromState: (state: SpiderState) => Effect.Effect<void, Error>;
+         restore: (persistence: StatePersistence, stateKey: SpiderStateKey) => Effect.Effect<boolean, Error, never>;
+     }, never, import("../Config/SpiderConfig.service.js").SpiderConfigService>;
+     readonly dependencies: readonly [import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>];
+ }>;
+ /**
+  * Manages request scheduling, prioritization, and state persistence for web crawling.
+  *
+  * The SpiderSchedulerService provides a priority-based request queue with optional persistence
+  * capabilities. It handles:
+  * - Request deduplication via fingerprinting
+  * - Priority-based scheduling (higher numbers processed first)
+  * - State persistence for resumable crawling
+  * - Atomic state operations
+  *
+  * @example
+  * ```typescript
+  * const program = Effect.gen(function* () {
+  *   const scheduler = yield* SpiderSchedulerService;
+  *
+  *   // Configure persistence
+  *   const persistence = new FilePersistence('./state');
+  *   const stateKey = new SpiderStateKey({
+  *     id: 'my-crawl',
+  *     timestamp: new Date(),
+  *     name: 'Example Crawl'
+  *   });
+  *
+  *   yield* scheduler.configurePersistence(persistence, stateKey);
+  *
+  *   // Queue requests
+  *   yield* scheduler.enqueue({ url: 'https://example.com', depth: 0 }, 10);
+  *   yield* scheduler.enqueue({ url: 'https://example.com/about', depth: 1 }, 5);
+  *
+  *   // Process requests
+  *   const request = yield* scheduler.dequeue();
+  *   console.log(`Processing: ${request.request.url}`);
+  * });
+  * ```
+  *
+  * @group Services
+  * @public
+  */
+ export declare class SpiderSchedulerService extends SpiderSchedulerService_base {
+ }
+ export {};
+ //# sourceMappingURL=SpiderScheduler.service.d.ts.map
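
The `restore` method combines `configurePersistence` with state loading in one call, which together with `StatePersistence` gives a resumable crawl loop. A sketch follows, with two assumptions the diff doesn't confirm: that the service is re-exported from the package root, and that `restore` resolves to `true` when saved state was found (the boolean's meaning isn't documented here). `FilePersistence` is the hypothetical implementation from the `StatePersistence` JSDoc above.

```typescript
import { Effect } from 'effect';
import { SpiderSchedulerService, SpiderStateKey } from '@jambudipa/spider';
// Hypothetical StatePersistence implementation, as sketched in the
// StatePersistence JSDoc above.
import { FilePersistence } from './file-persistence.js';

const resumeOrStart = Effect.gen(function* () {
  const scheduler = yield* SpiderSchedulerService;
  const stateKey = new SpiderStateKey({
    id: 'my-crawl',
    timestamp: new Date(),
    name: 'Example Crawl'
  });

  // Assumption: restore() yields true when persisted state was loaded.
  const resumed = yield* scheduler.restore(
    new FilePersistence('./state'),
    stateKey
  );
  if (!resumed) {
    yield* scheduler.enqueue({ url: 'https://example.com', depth: 0 }, 10);
  }

  // Drain the queue in priority order (higher numbers first).
  while (!(yield* scheduler.isEmpty())) {
    const next = yield* scheduler.dequeue();
    console.log(`Processing: ${next.request.url}`);
  }
});
```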
package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"SpiderScheduler.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Scheduler/SpiderScheduler.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAyB,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC/D,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;;IAehD,wCAAwC;;IAExC,mCAAmC;;IAEnC,0CAA0C;;;IAJ1C,wCAAwC;;IAExC,mCAAmC;;IAEnC,0CAA0C;;;;;;;;;AAhB5C;;;;;;;;GAQG;AACH,qBAAa,cAAe,SAAQ,mBASlC;CAAG;;IAcH,0DAA0D;;;;;;IAM1D,sDAAsD;;IAEtD,oCAAoC;;IAEpC,2CAA2C;;;IAV3C,0DAA0D;;;;;;IAM1D,sDAAsD;;IAEtD,oCAAoC;;IAEpC,2CAA2C;;;;;;;;;;;;;;;AAtB7C;;;;;;;;GAQG;AACH,qBAAa,eAAgB,SAAQ,oBAenC;CAAG;;IAYH,6CAA6C;;IAE7C,2CAA2C;;IAE3C,+DAA+D;;IAE/D,gDAAgD;;;IANhD,6CAA6C;;IAE7C,2CAA2C;;IAE3C,+DAA+D;;IAE/D,gDAAgD;;;;;;;;;;;AAhBlD;;;;;;;;GAQG;AACH,qBAAa,WAAY,SAAQ,gBAS/B;CAAG;AAEL;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4DAA4D;IAC5D,SAAS,EAAE,CACT,GAAG,EAAE,cAAc,EACnB,KAAK,EAAE,WAAW,KACf,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;IAChC,4EAA4E;IAC5E,SAAS,EAAE,CAAC,GAAG,EAAE,cAAc,KAAK,MAAM,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,EAAE,KAAK,CAAC,CAAC;IAC7E,mDAAmD;IACnD,WAAW,EAAE,CAAC,GAAG,EAAE,cAAc,KAAK,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;CAClE;;;4CA6LsB,gBAAgB,YACnB,cAAc;;2BAeP,SAAS;;;;;kCAhDrB,WAAW,KACjB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC;+BA4HJ,gBAAgB,YAAY,cAAc;;;;AAxRzE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AACH,qBAAa,sBAAuB,SAAQ,2BAgQ3C;CAAG"}
package/dist/lib/Scraper/Scraper.service.d.ts
@@ -0,0 +1,123 @@
+ import { Effect } from 'effect';
+ import { NetworkError, ResponseError } from '../errors.js';
+ import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
+ declare const ScraperService_base: Effect.Service.Class<ScraperService, "@jambudipa.io/ScraperService", {
+     readonly effect: Effect.Effect<{
+         /**
+          * Fetches a URL and parses the HTML to extract basic page information.
+          *
+          * This method performs the following operations:
+          * 1. Fetches the URL with configurable timeout (30 seconds)
+          * 2. Validates content type (skips binary files)
+          * 3. Parses HTML content with cheerio
+          * 4. Extracts basic page metadata (title, description, etc.)
+          * 5. Returns structured PageData object
+          *
+          * The method uses AbortController for proper timeout handling to prevent
+          * workers from hanging on malformed URLs or slow responses.
+          *
+          * @param url - The URL to fetch and parse
+          * @param depth - The crawl depth for logging purposes (default: 0)
+          * @returns Effect containing PageData with extracted information
+          * @throws NetworkError for network-related failures
+          * @throws ResponseError for HTTP error responses
+          *
+          * @example
+          * Basic usage:
+          * ```typescript
+          * const pageData = yield* scraper.fetchAndParse('https://example.com');
+          * console.log(`Page title: ${pageData.title}`);
+          * ```
+          *
+          * With depth tracking:
+          * ```typescript
+          * const pageData = yield* scraper.fetchAndParse('https://example.com/page', 2);
+          * ```
+          *
+          * Error handling:
+          * ```typescript
+          * const result = yield* scraper.fetchAndParse('https://example.com').pipe(
+          *   Effect.catchTags({
+          *     NetworkError: (error) => {
+          *       console.log('Network error:', error.message);
+          *       return Effect.succeed(null);
+          *     },
+          *     ResponseError: (error) => {
+          *       console.log('HTTP error:', error.statusCode);
+          *       return Effect.succeed(null);
+          *     }
+          *   })
+          * );
+          * ```
+          *
+          * @performance
+          * - Request timeout: 30 seconds
+          * - Response parsing timeout: 10 seconds
+          * - Memory usage: ~2-5MB per page depending on content size
+          *
+          * @security
+          * - Validates content types to prevent processing binary files
+          * - Uses AbortController to prevent hanging requests
+          * - No execution of JavaScript content (static HTML parsing only)
+          */
+         fetchAndParse: (url: string, depth?: number) => Effect.Effect<{
+             readonly url: string;
+             readonly html: string;
+             readonly title?: string | undefined;
+             readonly metadata: {
+                 readonly [x: string]: string;
+             };
+             readonly commonMetadata?: {
+                 readonly description?: string | undefined;
+                 readonly keywords?: string | undefined;
+                 readonly author?: string | undefined;
+                 readonly robots?: string | undefined;
+             } | undefined;
+             readonly statusCode: number;
+             readonly headers: {
+                 readonly [x: string]: string;
+             };
+             readonly fetchedAt: Date;
+             readonly scrapeDurationMs: number;
+             readonly depth: number;
+             readonly extractedData?: {
+                 readonly [x: string]: unknown;
+             } | undefined;
+         }, NetworkError | ResponseError | import("effect/ParseResult").ParseError, SpiderLogger>;
+     }, never, never>;
+ }>;
+ /**
+  * Service responsible for fetching HTML content and parsing basic page information.
+  *
+  * The ScraperService handles the core HTTP fetching and HTML parsing functionality
+  * for the Spider framework. It provides robust error handling, timeout management,
+  * and content type validation to ensure reliable data extraction.
+  *
+  * **Key Features:**
+  * - Automatic timeout handling with AbortController
+  * - Content type validation (skips binary files)
+  * - Comprehensive error handling with typed errors
+  * - Performance monitoring and logging
+  * - Effect.js integration for composability
+  *
+  * **Note:** This service focuses solely on fetching and parsing HTML content.
+  * Link extraction is handled separately by LinkExtractorService for better
+  * separation of concerns and modularity.
+  *
+  * @example
+  * ```typescript
+  * const program = Effect.gen(function* () {
+  *   const scraper = yield* ScraperService;
+  *   const pageData = yield* scraper.fetchAndParse('https://example.com', 0);
+  *   console.log(`Title: ${pageData.title}`);
+  *   console.log(`Content length: ${pageData.html.length}`);
+  * });
+  * ```
+  *
+  * @group Services
+  * @public
+  */
+ export declare class ScraperService extends ScraperService_base {
+ }
+ export {};
+ //# sourceMappingURL=Scraper.service.d.ts.map
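
One detail that is easy to miss in the JSDoc examples: the Effect returned by `fetchAndParse` carries `SpiderLogger` in its requirements channel, so callers must provide a logger layer as well as the scraper itself. A sketch, assuming both classes follow the `Effect.Service` convention of exposing a static `Default` layer and are re-exported from the package root (neither is visible in this diff):

```typescript
import { Effect } from 'effect';
import { ScraperService, SpiderLogger } from '@jambudipa/spider'; // assumed root re-exports

const program = Effect.gen(function* () {
  const scraper = yield* ScraperService;
  const page = yield* scraper.fetchAndParse('https://example.com', 0);
  console.log(page.statusCode, page.title ?? '(no title)');
  console.log(`Fetched in ${page.scrapeDurationMs}ms`);
}).pipe(
  // Assumption: Effect.Service-generated classes expose a Default layer.
  Effect.provide(ScraperService.Default),
  Effect.provide(SpiderLogger.Default)
);
```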
package/dist/lib/Scraper/Scraper.service.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"Scraper.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Scraper/Scraper.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAU,MAAM,QAAQ,CAAC;AAGxC,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAC3D,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;;;QAqC5D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDG;6BACkB,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;AA5FjC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,qBAAa,cAAe,SAAQ,mBAqOnC;CAAG"}