@jambudipa/spider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +426 -0
- package/dist/index.d.ts +33 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4681 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts +57 -0
- package/dist/lib/BrowserEngine/BrowserEngine.service.d.ts.map +1 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts +256 -0
- package/dist/lib/Config/SpiderConfig.service.d.ts.map +1 -0
- package/dist/lib/HttpClient/CookieManager.d.ts +44 -0
- package/dist/lib/HttpClient/CookieManager.d.ts.map +1 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts +88 -0
- package/dist/lib/HttpClient/EnhancedHttpClient.d.ts.map +1 -0
- package/dist/lib/HttpClient/SessionStore.d.ts +82 -0
- package/dist/lib/HttpClient/SessionStore.d.ts.map +1 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts +58 -0
- package/dist/lib/HttpClient/TokenExtractor.d.ts.map +1 -0
- package/dist/lib/HttpClient/index.d.ts +8 -0
- package/dist/lib/HttpClient/index.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts +166 -0
- package/dist/lib/LinkExtractor/LinkExtractor.service.d.ts.map +1 -0
- package/dist/lib/LinkExtractor/index.d.ts +37 -0
- package/dist/lib/LinkExtractor/index.d.ts.map +1 -0
- package/dist/lib/Logging/FetchLogger.d.ts +8 -0
- package/dist/lib/Logging/FetchLogger.d.ts.map +1 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts +34 -0
- package/dist/lib/Logging/SpiderLogger.service.d.ts.map +1 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts +276 -0
- package/dist/lib/Middleware/SpiderMiddleware.d.ts.map +1 -0
- package/dist/lib/PageData/PageData.d.ts +28 -0
- package/dist/lib/PageData/PageData.d.ts.map +1 -0
- package/dist/lib/Resumability/Resumability.service.d.ts +176 -0
- package/dist/lib/Resumability/Resumability.service.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts +47 -0
- package/dist/lib/Resumability/backends/FileStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts +95 -0
- package/dist/lib/Resumability/backends/PostgresStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts +92 -0
- package/dist/lib/Resumability/backends/RedisStorageBackend.d.ts.map +1 -0
- package/dist/lib/Resumability/index.d.ts +51 -0
- package/dist/lib/Resumability/index.d.ts.map +1 -0
- package/dist/lib/Resumability/strategies.d.ts +76 -0
- package/dist/lib/Resumability/strategies.d.ts.map +1 -0
- package/dist/lib/Resumability/types.d.ts +201 -0
- package/dist/lib/Resumability/types.d.ts.map +1 -0
- package/dist/lib/Robots/Robots.service.d.ts +78 -0
- package/dist/lib/Robots/Robots.service.d.ts.map +1 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts +211 -0
- package/dist/lib/Scheduler/SpiderScheduler.service.d.ts.map +1 -0
- package/dist/lib/Scraper/Scraper.service.d.ts +123 -0
- package/dist/lib/Scraper/Scraper.service.d.ts.map +1 -0
- package/dist/lib/Spider/Spider.service.d.ts +194 -0
- package/dist/lib/Spider/Spider.service.d.ts.map +1 -0
- package/dist/lib/StateManager/StateManager.service.d.ts +68 -0
- package/dist/lib/StateManager/StateManager.service.d.ts.map +1 -0
- package/dist/lib/StateManager/index.d.ts +5 -0
- package/dist/lib/StateManager/index.d.ts.map +1 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts +58 -0
- package/dist/lib/UrlDeduplicator/UrlDeduplicator.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts +77 -0
- package/dist/lib/WebScrapingEngine/WebScrapingEngine.service.d.ts.map +1 -0
- package/dist/lib/WebScrapingEngine/index.d.ts +5 -0
- package/dist/lib/WebScrapingEngine/index.d.ts.map +1 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts +39 -0
- package/dist/lib/WorkerHealth/WorkerHealthMonitor.service.d.ts.map +1 -0
- package/dist/lib/api-facades.d.ts +313 -0
- package/dist/lib/api-facades.d.ts.map +1 -0
- package/dist/lib/errors.d.ts +99 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/package.json +108 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import { Effect, Schema } from 'effect';
|
|
2
|
+
import { PriorityRequest, SpiderState, SpiderStateKey } from '../Scheduler/SpiderScheduler.service.js';
|
|
3
|
+
export { SpiderStateKey, PriorityRequest, SpiderState };
|
|
4
|
+
declare const StateDelta_base: Schema.Class<StateDelta, {
|
|
5
|
+
/** Session this delta applies to */
|
|
6
|
+
stateKey: typeof Schema.String;
|
|
7
|
+
/** Sequence number for ordering deltas */
|
|
8
|
+
sequence: typeof Schema.Number;
|
|
9
|
+
/** When this delta was created */
|
|
10
|
+
timestamp: typeof Schema.Date;
|
|
11
|
+
/** The operation that created this delta */
|
|
12
|
+
operation: Schema.Union<[Schema.Struct<{
|
|
13
|
+
type: Schema.Literal<["enqueue"]>;
|
|
14
|
+
request: typeof PriorityRequest;
|
|
15
|
+
}>, Schema.Struct<{
|
|
16
|
+
type: Schema.Literal<["dequeue"]>;
|
|
17
|
+
fingerprint: typeof Schema.String;
|
|
18
|
+
}>, Schema.Struct<{
|
|
19
|
+
type: Schema.Literal<["mark_visited"]>;
|
|
20
|
+
fingerprint: typeof Schema.String;
|
|
21
|
+
}>]>;
|
|
22
|
+
}, Schema.Struct.Encoded<{
|
|
23
|
+
/** Session this delta applies to */
|
|
24
|
+
stateKey: typeof Schema.String;
|
|
25
|
+
/** Sequence number for ordering deltas */
|
|
26
|
+
sequence: typeof Schema.Number;
|
|
27
|
+
/** When this delta was created */
|
|
28
|
+
timestamp: typeof Schema.Date;
|
|
29
|
+
/** The operation that created this delta */
|
|
30
|
+
operation: Schema.Union<[Schema.Struct<{
|
|
31
|
+
type: Schema.Literal<["enqueue"]>;
|
|
32
|
+
request: typeof PriorityRequest;
|
|
33
|
+
}>, Schema.Struct<{
|
|
34
|
+
type: Schema.Literal<["dequeue"]>;
|
|
35
|
+
fingerprint: typeof Schema.String;
|
|
36
|
+
}>, Schema.Struct<{
|
|
37
|
+
type: Schema.Literal<["mark_visited"]>;
|
|
38
|
+
fingerprint: typeof Schema.String;
|
|
39
|
+
}>]>;
|
|
40
|
+
}>, never, {
|
|
41
|
+
readonly operation: {
|
|
42
|
+
readonly type: "enqueue";
|
|
43
|
+
readonly request: PriorityRequest;
|
|
44
|
+
} | {
|
|
45
|
+
readonly type: "dequeue";
|
|
46
|
+
readonly fingerprint: string;
|
|
47
|
+
} | {
|
|
48
|
+
readonly type: "mark_visited";
|
|
49
|
+
readonly fingerprint: string;
|
|
50
|
+
};
|
|
51
|
+
} & {
|
|
52
|
+
readonly timestamp: Date;
|
|
53
|
+
} & {
|
|
54
|
+
readonly stateKey: string;
|
|
55
|
+
} & {
|
|
56
|
+
readonly sequence: number;
|
|
57
|
+
}, {}, {}>;
|
|
58
|
+
/**
|
|
59
|
+
* Delta operation that represents a single state change.
|
|
60
|
+
*
|
|
61
|
+
* Used for incremental persistence instead of saving the entire state
|
|
62
|
+
* on every operation, which is much more efficient for large crawls.
|
|
63
|
+
*
|
|
64
|
+
* @group Delta Updates
|
|
65
|
+
* @public
|
|
66
|
+
*/
|
|
67
|
+
export declare class StateDelta extends StateDelta_base {
|
|
68
|
+
}
|
|
69
|
+
/**
|
|
70
|
+
* Represents a state change operation with both the delta and resulting state.
|
|
71
|
+
*
|
|
72
|
+
* This allows persistence strategies to choose whether to save deltas,
|
|
73
|
+
* full state, or both depending on their optimization needs.
|
|
74
|
+
*
|
|
75
|
+
* @group Operations
|
|
76
|
+
* @public
|
|
77
|
+
*/
|
|
78
|
+
export interface StateOperation {
|
|
79
|
+
/** The incremental change */
|
|
80
|
+
readonly delta: StateDelta;
|
|
81
|
+
/** The complete state after applying this operation */
|
|
82
|
+
readonly resultingState: SpiderState;
|
|
83
|
+
/** Whether this operation should trigger a snapshot */
|
|
84
|
+
readonly shouldSnapshot: boolean;
|
|
85
|
+
}
|
|
86
|
+
declare const PersistenceError_base: new <A extends Record<string, any> = {}>(args: import("effect/Types").Equals<A, {}> extends true ? void : { readonly [P in keyof A as P extends "_tag" ? never : P]: A[P]; }) => import("effect/Cause").YieldableError & {
|
|
87
|
+
readonly _tag: "PersistenceError";
|
|
88
|
+
} & Readonly<A>;
|
|
89
|
+
/**
|
|
90
|
+
* Error that can occur during persistence operations.
|
|
91
|
+
*
|
|
92
|
+
* @group Errors
|
|
93
|
+
* @public
|
|
94
|
+
*/
|
|
95
|
+
export declare class PersistenceError extends PersistenceError_base<{
|
|
96
|
+
readonly message: string;
|
|
97
|
+
readonly cause?: unknown;
|
|
98
|
+
readonly operation?: string;
|
|
99
|
+
}> {
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Storage backend capabilities that determine optimal persistence strategy.
|
|
103
|
+
*
|
|
104
|
+
* Backends advertise their capabilities so the ResumabilityService can
|
|
105
|
+
* choose the best strategy automatically.
|
|
106
|
+
*
|
|
107
|
+
* @group Storage
|
|
108
|
+
* @public
|
|
109
|
+
*/
|
|
110
|
+
export interface StorageCapabilities {
|
|
111
|
+
/** Can efficiently store and retrieve delta operations */
|
|
112
|
+
readonly supportsDelta: boolean;
|
|
113
|
+
/** Can efficiently store full state snapshots */
|
|
114
|
+
readonly supportsSnapshot: boolean;
|
|
115
|
+
/** Can handle streaming/batch operations */
|
|
116
|
+
readonly supportsStreaming: boolean;
|
|
117
|
+
/** Can handle concurrent access safely */
|
|
118
|
+
readonly supportsConcurrency: boolean;
|
|
119
|
+
/** Estimated latency category */
|
|
120
|
+
readonly latency: 'low' | 'medium' | 'high';
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Generic storage backend interface that persistence strategies use.
|
|
124
|
+
*
|
|
125
|
+
* Backends implement the storage operations they support best.
|
|
126
|
+
* Not all methods need to be implemented - strategies will adapt.
|
|
127
|
+
*
|
|
128
|
+
* @group Storage
|
|
129
|
+
* @public
|
|
130
|
+
*/
|
|
131
|
+
export interface StorageBackend {
|
|
132
|
+
/** Backend capabilities for strategy selection */
|
|
133
|
+
readonly capabilities: StorageCapabilities;
|
|
134
|
+
/** Storage backend identifier */
|
|
135
|
+
readonly name: string;
|
|
136
|
+
/** Initialize the backend (create tables, connections, etc.) */
|
|
137
|
+
initialize(): Effect.Effect<void, PersistenceError, never>;
|
|
138
|
+
/** Cleanup backend resources */
|
|
139
|
+
cleanup(): Effect.Effect<void, PersistenceError, never>;
|
|
140
|
+
/** Optional. Takes a session key and a full SpiderState; resolves to void or fails with PersistenceError. Presumably stores the complete state under that key - confirm against the backend implementations. */
saveState?(key: SpiderStateKey, state: SpiderState): Effect.Effect<void, PersistenceError, never>;
|
|
141
|
+
/** Optional. Resolves to the SpiderState for the given key, or null (presumably when nothing is stored for it - confirm). */
loadState?(key: SpiderStateKey): Effect.Effect<SpiderState | null, PersistenceError, never>;
|
|
142
|
+
/** Optional. Takes a session key and resolves to void; presumably removes the state stored for that key - confirm. */
deleteState?(key: SpiderStateKey): Effect.Effect<void, PersistenceError, never>;
|
|
143
|
+
/** Optional. Takes a single StateDelta and resolves to void or fails with PersistenceError. */
saveDelta?(delta: StateDelta): Effect.Effect<void, PersistenceError, never>;
|
|
144
|
+
/** Optional. Array counterpart of saveDelta: takes several StateDelta values in one call and resolves to void. */
saveDeltas?(deltas: StateDelta[]): Effect.Effect<void, PersistenceError, never>;
|
|
145
|
+
/** Optional. Resolves to the StateDelta list for the given key. fromSequence presumably limits the result by sequence number; whether the bound is inclusive is not visible here - confirm. */
loadDeltas?(key: SpiderStateKey, fromSequence?: number): Effect.Effect<StateDelta[], PersistenceError, never>;
|
|
146
|
+
/** Optional. Takes a session key, a full SpiderState and the sequence number associated with it; resolves to void or fails with PersistenceError. */
saveSnapshot?(key: SpiderStateKey, state: SpiderState, sequence: number): Effect.Effect<void, PersistenceError, never>;
|
|
147
|
+
/** Optional. Resolves to a { state, sequence } pair for the given key, or null (presumably when no snapshot exists - confirm). */
loadLatestSnapshot?(key: SpiderStateKey): Effect.Effect<{
|
|
148
|
+
state: SpiderState;
|
|
149
|
+
sequence: number;
|
|
150
|
+
} | null, PersistenceError, never>;
|
|
151
|
+
/** Optional. Takes a session key and a beforeSequence bound and resolves to void; presumably discards deltas older than that sequence - confirm, including whether the bound is inclusive. */
compactDeltas?(key: SpiderStateKey, beforeSequence: number): Effect.Effect<void, PersistenceError, never>;
|
|
152
|
+
/** Optional. Resolves to an array of SpiderStateKey values (presumably one per stored session - confirm). */
listSessions?(): Effect.Effect<SpiderStateKey[], PersistenceError, never>;
|
|
153
|
+
}
|
|
154
|
+
/**
|
|
155
|
+
* Core strategy interface for different persistence approaches.
|
|
156
|
+
*
|
|
157
|
+
* Strategies implement the logic for when and how to persist state,
|
|
158
|
+
* using the storage backend for actual I/O operations.
|
|
159
|
+
*
|
|
160
|
+
* @group Strategies
|
|
161
|
+
* @public
|
|
162
|
+
*/
|
|
163
|
+
export interface PersistenceStrategy {
|
|
164
|
+
/** Persist a state operation */
|
|
165
|
+
persist(operation: StateOperation): Effect.Effect<void, PersistenceError, never>;
|
|
166
|
+
/** Restore state from storage */
|
|
167
|
+
restore(key: SpiderStateKey): Effect.Effect<SpiderState | null, PersistenceError, never>;
|
|
168
|
+
/** Clean up old data */
|
|
169
|
+
cleanup(key: SpiderStateKey): Effect.Effect<void, PersistenceError, never>;
|
|
170
|
+
/** Get strategy information */
|
|
171
|
+
getInfo(): {
|
|
172
|
+
readonly name: string;
|
|
173
|
+
readonly description: string;
|
|
174
|
+
readonly capabilities: string[];
|
|
175
|
+
};
|
|
176
|
+
}
|
|
177
|
+
/**
|
|
178
|
+
* Configuration for hybrid persistence strategy.
|
|
179
|
+
*
|
|
180
|
+
* Controls when to save snapshots vs deltas for optimal performance.
|
|
181
|
+
*
|
|
182
|
+
* @group Configuration
|
|
183
|
+
* @public
|
|
184
|
+
*/
|
|
185
|
+
export interface HybridPersistenceConfig {
|
|
186
|
+
/** Save a full snapshot every N operations */
|
|
187
|
+
readonly snapshotInterval: number;
|
|
188
|
+
/** Maximum deltas to accumulate before forcing a snapshot */
|
|
189
|
+
readonly maxDeltasBeforeSnapshot: number;
|
|
190
|
+
/** Whether to compact old deltas after snapshots */
|
|
191
|
+
readonly compactionEnabled: boolean;
|
|
192
|
+
/** Batch multiple deltas together for efficiency */
|
|
193
|
+
readonly batchDeltas: boolean;
|
|
194
|
+
/** Batch size for delta operations */
|
|
195
|
+
readonly deltaBatchSize: number;
|
|
196
|
+
}
|
|
197
|
+
/**
|
|
198
|
+
* Default hybrid persistence configuration.
|
|
199
|
+
*/
|
|
200
|
+
export declare const DEFAULT_HYBRID_CONFIG: HybridPersistenceConfig;
|
|
201
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/lib/Resumability/types.ts"],"names":[],"mappings":"AAAA,OAAO,EAAQ,MAAM,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EACL,eAAe,EACf,WAAW,EACX,cAAc,EACf,MAAM,yCAAyC,CAAC;AAGjD,OAAO,EAAE,cAAc,EAAE,eAAe,EAAE,WAAW,EAAE,CAAC;;IAYtD,oCAAoC;;IAEpC,0CAA0C;;IAE1C,kCAAkC;;IAElC,4CAA4C;;;;;;;;;;;;IAN5C,oCAAoC;;IAEpC,0CAA0C;;IAE1C,kCAAkC;;IAElC,4CAA4C;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAhB9C;;;;;;;;GAQG;AACH,qBAAa,UAAW,SAAQ,eAsB9B;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,cAAc;IAC7B,6BAA6B;IAC7B,QAAQ,CAAC,KAAK,EAAE,UAAU,CAAC;IAC3B,uDAAuD;IACvD,QAAQ,CAAC,cAAc,EAAE,WAAW,CAAC;IACrC,uDAAuD;IACvD,QAAQ,CAAC,cAAc,EAAE,OAAO,CAAC;CAClC;;;;AAED;;;;;GAKG;AACH,qBAAa,gBAAiB,SAAQ,sBAAqC;IACzE,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC;IACzB,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;CAC7B,CAAC;CAAG;AAEL;;;;;;;;GAQG;AACH,MAAM,WAAW,mBAAmB;IAClC,0DAA0D;IAC1D,QAAQ,CAAC,aAAa,EAAE,OAAO,CAAC;IAChC,iDAAiD;IACjD,QAAQ,CAAC,gBAAgB,EAAE,OAAO,CAAC;IACnC,4CAA4C;IAC5C,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IACpC,0CAA0C;IAC1C,QAAQ,CAAC,mBAAmB,EAAE,OAAO,CAAC;IACtC,iCAAiC;IACjC,QAAQ,CAAC,OAAO,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;CAC7C;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,cAAc;IAC7B,kDAAkD;IAClD,QAAQ,CAAC,YAAY,EAAE,mBAAmB,CAAC;IAE3C,iCAAiC;IACjC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAEtB,gEAAgE;IAChE,UAAU,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAE3D,gCAAgC;IAChC,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAGxD,SAAS,CAAC,CACR,GAAG,EAAE,cAAc,EACnB,KAAK,EAAE,WAAW,GACjB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAChD,SAAS,CAAC,CACR,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAC9D,WAAW,CAAC,CACV,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAGhD,SAAS,CAAC,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAC5E,UAAU,CAAC,CACT,MAAM,EAAE,UAAU,EAAE,GACnB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAA
E,KAAK,CAAC,CAAC;IAChD,UAAU,CAAC,CACT,GAAG,EAAE,cAAc,EACnB,YAAY,CAAC,EAAE,MAAM,GACpB,MAAM,CAAC,MAAM,CAAC,UAAU,EAAE,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAGxD,YAAY,CAAC,CACX,GAAG,EAAE,cAAc,EACnB,KAAK,EAAE,WAAW,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAChD,kBAAkB,CAAC,CACjB,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CACd;QAAE,KAAK,EAAE,WAAW,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI,EAC/C,gBAAgB,EAChB,KAAK,CACN,CAAC;IAGF,aAAa,CAAC,CACZ,GAAG,EAAE,cAAc,EACnB,cAAc,EAAE,MAAM,GACrB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAChD,YAAY,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;CAC3E;AAED;;;;;;;;GAQG;AACH,MAAM,WAAW,mBAAmB;IAClC,gCAAgC;IAChC,OAAO,CACL,SAAS,EAAE,cAAc,GACxB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAEhD,iCAAiC;IACjC,OAAO,CACL,GAAG,EAAE,cAAc,GAClB,MAAM,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAE9D,wBAAwB;IACxB,OAAO,CAAC,GAAG,EAAE,cAAc,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,gBAAgB,EAAE,KAAK,CAAC,CAAC;IAE3E,+BAA+B;IAC/B,OAAO,IAAI;QACT,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;QACtB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;QAC7B,QAAQ,CAAC,YAAY,EAAE,MAAM,EAAE,CAAC;KACjC,CAAC;CACH;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,uBAAuB;IACtC,8CAA8C;IAC9C,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAC;IAClC,6DAA6D;IAC7D,QAAQ,CAAC,uBAAuB,EAAE,MAAM,CAAC;IACzC,oDAAoD;IACpD,QAAQ,CAAC,iBAAiB,EAAE,OAAO,CAAC;IACpC,oDAAoD;IACpD,QAAQ,CAAC,WAAW,EAAE,OAAO,CAAC;IAC9B,sCAAsC;IACtC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;CACjC;AAED;;GAEG;AACH,eAAO,MAAM,qBAAqB,EAAE,uBAMnC,CAAC"}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { Effect, Option } from 'effect';
|
|
2
|
+
/**
|
|
3
|
+
* Parsed robots.txt rules for a specific user agent.
|
|
4
|
+
*
|
|
5
|
+
* Contains the disallowed paths and crawl delay settings extracted
|
|
6
|
+
* from a robots.txt file for a particular user agent string.
|
|
7
|
+
*
|
|
8
|
+
* @group Data Types
|
|
9
|
+
* @internal
|
|
10
|
+
*/
|
|
11
|
+
interface RobotsRules {
|
|
12
|
+
/** Set of URL paths that are disallowed for this user agent */
|
|
13
|
+
disallowedPaths: Set<string>;
|
|
14
|
+
/** Optional crawl delay in seconds specified in robots.txt */
|
|
15
|
+
crawlDelay?: number;
|
|
16
|
+
/** The user agent these rules apply to */
|
|
17
|
+
userAgent: string;
|
|
18
|
+
}
|
|
19
|
+
declare const RobotsService_base: Effect.Service.Class<RobotsService, "@jambudipa.io/RobotsService", {
|
|
20
|
+
readonly effect: Effect.Effect<{
|
|
21
|
+
checkUrl: (urlString: string) => Effect.Effect<{
|
|
22
|
+
allowed: boolean;
|
|
23
|
+
crawlDelay?: undefined;
|
|
24
|
+
} | {
|
|
25
|
+
allowed: boolean;
|
|
26
|
+
crawlDelay: number | undefined;
|
|
27
|
+
}, never, never>;
|
|
28
|
+
getRules: (domain: string) => Effect.Effect<Option.Option<RobotsRules>, never, never>;
|
|
29
|
+
}, never, never>;
|
|
30
|
+
}>;
|
|
31
|
+
/**
|
|
32
|
+
* Service for parsing and enforcing robots.txt compliance.
|
|
33
|
+
*
|
|
34
|
+
* The RobotsService handles fetching, parsing, and caching robots.txt files
|
|
35
|
+
* to ensure compliant web crawling. It provides efficient URL checking with
|
|
36
|
+
* automatic caching to minimise network requests.
|
|
37
|
+
*
|
|
38
|
+
* **Key Features:**
|
|
39
|
+
* - Automatic robots.txt fetching and parsing
|
|
40
|
+
* - Intelligent caching to reduce redundant requests
|
|
41
|
+
* - User agent-specific rule enforcement
|
|
42
|
+
* - Crawl delay extraction and enforcement
|
|
43
|
+
* - Graceful error handling for malformed robots.txt files
|
|
44
|
+
*
|
|
45
|
+
* **Standards Compliance:**
|
|
46
|
+
* - Follows the Robots Exclusion Protocol (RFC 9309)
|
|
47
|
+
* - Supports User-agent and Disallow, plus the non-standard Crawl-delay directive
|
|
48
|
+
* - Handles wildcard (*) user agent specifications
|
|
49
|
+
* - Case-insensitive user agent matching
|
|
50
|
+
*
|
|
51
|
+
* @example
|
|
52
|
+
* ```typescript
|
|
53
|
+
* const program = Effect.gen(function* () {
|
|
54
|
+
* const robots = yield* RobotsService;
|
|
55
|
+
*
|
|
56
|
+
* // Check if URL is allowed
|
|
57
|
+
* const check = yield* robots.checkUrl('https://example.com/admin');
|
|
58
|
+
* if (!check.allowed) {
|
|
59
|
+
* console.log('URL blocked by robots.txt');
|
|
60
|
+
* return;
|
|
61
|
+
* }
|
|
62
|
+
*
|
|
63
|
+
* // Apply crawl delay if specified
|
|
64
|
+
* if (check.crawlDelay) {
|
|
65
|
+
* yield* Effect.sleep(`${check.crawlDelay} seconds`);
|
|
66
|
+
* }
|
|
67
|
+
*
|
|
68
|
+
* // Proceed with crawling...
|
|
69
|
+
* });
|
|
70
|
+
* ```
|
|
71
|
+
*
|
|
72
|
+
* @group Services
|
|
73
|
+
* @public
|
|
74
|
+
*/
|
|
75
|
+
export declare class RobotsService extends RobotsService_base {
|
|
76
|
+
}
|
|
77
|
+
export {};
|
|
78
|
+
//# sourceMappingURL=Robots.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"Robots.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Robots/Robots.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAkB,MAAM,EAAE,MAAM,QAAQ,CAAC;AAGxD;;;;;;;;GAQG;AACH,UAAU,WAAW;IACnB,+DAA+D;IAC/D,eAAe,EAAE,GAAG,CAAC,MAAM,CAAC,CAAC;IAC7B,8DAA8D;IAC9D,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,0CAA0C;IAC1C,SAAS,EAAE,MAAM,CAAC;CACnB;;;8BA6I6B,MAAM;;;;;;;2BAqDT,MAAM;;;AAhMjC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2CG;AACH,qBAAa,aAAc,SAAQ,kBA6JlC;CAAG"}
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
import { Effect, Schema } from 'effect';
|
|
2
|
+
import { CrawlTask } from '../Spider/Spider.service.js';
|
|
3
|
+
import { ConfigurationError } from '../errors.js';
|
|
4
|
+
declare const SpiderStateKey_base: Schema.Class<SpiderStateKey, {
|
|
5
|
+
/** Unique identifier for the session */
|
|
6
|
+
id: typeof Schema.String;
|
|
7
|
+
/** When the session was created */
|
|
8
|
+
timestamp: typeof Schema.Date;
|
|
9
|
+
/** Human-readable name for the session */
|
|
10
|
+
name: typeof Schema.String;
|
|
11
|
+
}, Schema.Struct.Encoded<{
|
|
12
|
+
/** Unique identifier for the session */
|
|
13
|
+
id: typeof Schema.String;
|
|
14
|
+
/** When the session was created */
|
|
15
|
+
timestamp: typeof Schema.Date;
|
|
16
|
+
/** Human-readable name for the session */
|
|
17
|
+
name: typeof Schema.String;
|
|
18
|
+
}>, never, {
|
|
19
|
+
readonly name: string;
|
|
20
|
+
} & {
|
|
21
|
+
readonly timestamp: Date;
|
|
22
|
+
} & {
|
|
23
|
+
readonly id: string;
|
|
24
|
+
}, {}, {}>;
|
|
25
|
+
/**
|
|
26
|
+
* Unique identifier for a spider crawling session.
|
|
27
|
+
*
|
|
28
|
+
* Used to identify and restore specific crawl sessions when using
|
|
29
|
+
* persistent storage. Each crawl session should have a unique key.
|
|
30
|
+
*
|
|
31
|
+
* @group Data Types
|
|
32
|
+
* @public
|
|
33
|
+
*/
|
|
34
|
+
export declare class SpiderStateKey extends SpiderStateKey_base {
|
|
35
|
+
}
|
|
36
|
+
declare const PriorityRequest_base: Schema.Class<PriorityRequest, {
|
|
37
|
+
/** The crawl task containing URL and depth information */
|
|
38
|
+
request: Schema.Struct<{
|
|
39
|
+
url: typeof Schema.String;
|
|
40
|
+
depth: typeof Schema.Number;
|
|
41
|
+
fromUrl: Schema.optional<typeof Schema.String>;
|
|
42
|
+
}>;
|
|
43
|
+
/** Priority level (higher numbers processed first) */
|
|
44
|
+
priority: typeof Schema.Number;
|
|
45
|
+
/** When this request was created */
|
|
46
|
+
timestamp: typeof Schema.Date;
|
|
47
|
+
/** Unique fingerprint for deduplication */
|
|
48
|
+
fingerprint: typeof Schema.String;
|
|
49
|
+
}, Schema.Struct.Encoded<{
|
|
50
|
+
/** The crawl task containing URL and depth information */
|
|
51
|
+
request: Schema.Struct<{
|
|
52
|
+
url: typeof Schema.String;
|
|
53
|
+
depth: typeof Schema.Number;
|
|
54
|
+
fromUrl: Schema.optional<typeof Schema.String>;
|
|
55
|
+
}>;
|
|
56
|
+
/** Priority level (higher numbers processed first) */
|
|
57
|
+
priority: typeof Schema.Number;
|
|
58
|
+
/** When this request was created */
|
|
59
|
+
timestamp: typeof Schema.Date;
|
|
60
|
+
/** Unique fingerprint for deduplication */
|
|
61
|
+
fingerprint: typeof Schema.String;
|
|
62
|
+
}>, never, {
|
|
63
|
+
readonly timestamp: Date;
|
|
64
|
+
} & {
|
|
65
|
+
readonly priority: number;
|
|
66
|
+
} & {
|
|
67
|
+
readonly fingerprint: string;
|
|
68
|
+
} & {
|
|
69
|
+
readonly request: {
|
|
70
|
+
readonly url: string;
|
|
71
|
+
readonly depth: number;
|
|
72
|
+
readonly fromUrl?: string | undefined;
|
|
73
|
+
};
|
|
74
|
+
}, {}, {}>;
|
|
75
|
+
/**
|
|
76
|
+
* A crawl request with priority and metadata for scheduling.
|
|
77
|
+
*
|
|
78
|
+
* Requests are processed in priority order (higher numbers first),
|
|
79
|
+
* with FIFO ordering within the same priority level.
|
|
80
|
+
*
|
|
81
|
+
* @group Data Types
|
|
82
|
+
* @public
|
|
83
|
+
*/
|
|
84
|
+
export declare class PriorityRequest extends PriorityRequest_base {
|
|
85
|
+
}
|
|
86
|
+
declare const SpiderState_base: Schema.Class<SpiderState, {
|
|
87
|
+
/** The state key identifying this session */
|
|
88
|
+
key: typeof SpiderStateKey;
|
|
89
|
+
/** All requests waiting to be processed */
|
|
90
|
+
pendingRequests: Schema.Array$<typeof PriorityRequest>;
|
|
91
|
+
/** Fingerprints of URLs already visited (for deduplication) */
|
|
92
|
+
visitedFingerprints: Schema.Array$<typeof Schema.String>;
|
|
93
|
+
/** Total number of requests processed so far */
|
|
94
|
+
totalProcessed: typeof Schema.Number;
|
|
95
|
+
}, Schema.Struct.Encoded<{
|
|
96
|
+
/** The state key identifying this session */
|
|
97
|
+
key: typeof SpiderStateKey;
|
|
98
|
+
/** All requests waiting to be processed */
|
|
99
|
+
pendingRequests: Schema.Array$<typeof PriorityRequest>;
|
|
100
|
+
/** Fingerprints of URLs already visited (for deduplication) */
|
|
101
|
+
visitedFingerprints: Schema.Array$<typeof Schema.String>;
|
|
102
|
+
/** Total number of requests processed so far */
|
|
103
|
+
totalProcessed: typeof Schema.Number;
|
|
104
|
+
}>, never, {
|
|
105
|
+
readonly key: SpiderStateKey;
|
|
106
|
+
} & {
|
|
107
|
+
readonly totalProcessed: number;
|
|
108
|
+
} & {
|
|
109
|
+
readonly pendingRequests: readonly PriorityRequest[];
|
|
110
|
+
} & {
|
|
111
|
+
readonly visitedFingerprints: readonly string[];
|
|
112
|
+
}, {}, {}>;
|
|
113
|
+
/**
|
|
114
|
+
* Complete state snapshot of a spider crawling session.
|
|
115
|
+
*
|
|
116
|
+
* This contains all information needed to resume a crawl session,
|
|
117
|
+
* including pending requests, visited URLs, and progress counters.
|
|
118
|
+
*
|
|
119
|
+
* @group Data Types
|
|
120
|
+
* @public
|
|
121
|
+
*/
|
|
122
|
+
export declare class SpiderState extends SpiderState_base {
|
|
123
|
+
}
|
|
124
|
+
/**
|
|
125
|
+
* Generic interface for persisting spider state.
|
|
126
|
+
*
|
|
127
|
+
* Implementations can use any storage backend (filesystem, database, etc.)
|
|
128
|
+
* to save and restore crawling sessions. All operations are Effect-based
|
|
129
|
+
* for composability and error handling.
|
|
130
|
+
*
|
|
131
|
+
* @example
|
|
132
|
+
* ```typescript
|
|
133
|
+
* class FilePersistence implements StatePersistence {
|
|
134
|
+
* saveState = (key: SpiderStateKey, state: SpiderState) =>
|
|
135
|
+
* Effect.tryPromise(() => fs.writeFile(key.id + '.json', JSON.stringify(state)))
|
|
136
|
+
*
|
|
137
|
+
* loadState = (key: SpiderStateKey) =>
|
|
138
|
+
* Effect.tryPromise(() => fs.readFile(key.id + '.json', 'utf8').then(JSON.parse))
|
|
139
|
+
*
|
|
140
|
+
* deleteState = (key: SpiderStateKey) =>
|
|
141
|
+
* Effect.tryPromise(() => fs.unlink(key.id + '.json'))
|
|
142
|
+
* }
|
|
143
|
+
* ```
|
|
144
|
+
*
|
|
145
|
+
* @group Interfaces
|
|
146
|
+
* @public
|
|
147
|
+
*/
|
|
148
|
+
export interface StatePersistence {
|
|
149
|
+
/** Saves the complete spider state to persistent storage */
|
|
150
|
+
saveState: (key: SpiderStateKey, state: SpiderState) => Effect.Effect<void, Error>;
|
|
151
|
+
/** Loads spider state from persistent storage, returns null if not found */
|
|
152
|
+
loadState: (key: SpiderStateKey) => Effect.Effect<SpiderState | null, Error>;
|
|
153
|
+
/** Deletes spider state from persistent storage */
|
|
154
|
+
deleteState: (key: SpiderStateKey) => Effect.Effect<void, Error>;
|
|
155
|
+
}
|
|
156
|
+
declare const SpiderSchedulerService_base: Effect.Service.Class<SpiderSchedulerService, "@jambudipa.io/SpiderSchedulerService", {
|
|
157
|
+
readonly effect: Effect.Effect<{
|
|
158
|
+
configurePersistence: (persistence: StatePersistence, stateKey: SpiderStateKey) => Effect.Effect<void, never, never>;
|
|
159
|
+
clearPersistence: () => Effect.Effect<void, never, never>;
|
|
160
|
+
enqueue: (request: CrawlTask, priority?: number) => Effect.Effect<boolean, Error, never>;
|
|
161
|
+
dequeue: () => Effect.Effect<PriorityRequest, Error, never>;
|
|
162
|
+
size: () => Effect.Effect<number, never, never>;
|
|
163
|
+
isEmpty: () => Effect.Effect<boolean, never, never>;
|
|
164
|
+
getState: () => Effect.Effect<SpiderState, ConfigurationError, never>;
|
|
165
|
+
restoreFromState: (state: SpiderState) => Effect.Effect<void, Error>;
|
|
166
|
+
restore: (persistence: StatePersistence, stateKey: SpiderStateKey) => Effect.Effect<boolean, Error, never>;
|
|
167
|
+
}, never, import("../Config/SpiderConfig.service.js").SpiderConfigService>;
|
|
168
|
+
readonly dependencies: readonly [import("effect/Layer").Layer<import("../Config/SpiderConfig.service.js").SpiderConfigService, never, never>];
|
|
169
|
+
}>;
|
|
170
|
+
/**
|
|
171
|
+
* Manages request scheduling, prioritization, and state persistence for web crawling.
|
|
172
|
+
*
|
|
173
|
+
* The SpiderSchedulerService provides a priority-based request queue with optional persistence
|
|
174
|
+
* capabilities. It handles:
|
|
175
|
+
* - Request deduplication via fingerprinting
|
|
176
|
+
* - Priority-based scheduling (higher numbers processed first)
|
|
177
|
+
* - State persistence for resumable crawling
|
|
178
|
+
* - Atomic state operations
|
|
179
|
+
*
|
|
180
|
+
* @example
|
|
181
|
+
* ```typescript
|
|
182
|
+
* const program = Effect.gen(function* () {
|
|
183
|
+
* const scheduler = yield* SpiderSchedulerService;
|
|
184
|
+
*
|
|
185
|
+
* // Configure persistence
|
|
186
|
+
* const persistence = new FilePersistence('./state');
|
|
187
|
+
* const stateKey = new SpiderStateKey({
|
|
188
|
+
* id: 'my-crawl',
|
|
189
|
+
* timestamp: new Date(),
|
|
190
|
+
* name: 'Example Crawl'
|
|
191
|
+
* });
|
|
192
|
+
*
|
|
193
|
+
* yield* scheduler.configurePersistence(persistence, stateKey);
|
|
194
|
+
*
|
|
195
|
+
* // Queue requests
|
|
196
|
+
* yield* scheduler.enqueue({ url: 'https://example.com', depth: 0 }, 10);
|
|
197
|
+
* yield* scheduler.enqueue({ url: 'https://example.com/about', depth: 1 }, 5);
|
|
198
|
+
*
|
|
199
|
+
* // Process requests
|
|
200
|
+
* const request = yield* scheduler.dequeue();
|
|
201
|
+
* console.log(`Processing: ${request.request.url}`);
|
|
202
|
+
* });
|
|
203
|
+
* ```
|
|
204
|
+
*
|
|
205
|
+
* @group Services
|
|
206
|
+
* @public
|
|
207
|
+
*/
|
|
208
|
+
export declare class SpiderSchedulerService extends SpiderSchedulerService_base {
|
|
209
|
+
}
|
|
210
|
+
export {};
|
|
211
|
+
//# sourceMappingURL=SpiderScheduler.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SpiderScheduler.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Scheduler/SpiderScheduler.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAyB,MAAM,EAAE,MAAM,QAAQ,CAAC;AAC/D,OAAO,EAAE,SAAS,EAAE,MAAM,6BAA6B,CAAC;AACxD,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;;IAehD,wCAAwC;;IAExC,mCAAmC;;IAEnC,0CAA0C;;;IAJ1C,wCAAwC;;IAExC,mCAAmC;;IAEnC,0CAA0C;;;;;;;;;AAhB5C;;;;;;;;GAQG;AACH,qBAAa,cAAe,SAAQ,mBASlC;CAAG;;IAcH,0DAA0D;;;;;;IAM1D,sDAAsD;;IAEtD,oCAAoC;;IAEpC,2CAA2C;;;IAV3C,0DAA0D;;;;;;IAM1D,sDAAsD;;IAEtD,oCAAoC;;IAEpC,2CAA2C;;;;;;;;;;;;;;;AAtB7C;;;;;;;;GAQG;AACH,qBAAa,eAAgB,SAAQ,oBAenC;CAAG;;IAYH,6CAA6C;;IAE7C,2CAA2C;;IAE3C,+DAA+D;;IAE/D,gDAAgD;;;IANhD,6CAA6C;;IAE7C,2CAA2C;;IAE3C,+DAA+D;;IAE/D,gDAAgD;;;;;;;;;;;AAhBlD;;;;;;;;GAQG;AACH,qBAAa,WAAY,SAAQ,gBAS/B;CAAG;AAEL;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,MAAM,WAAW,gBAAgB;IAC/B,4DAA4D;IAC5D,SAAS,EAAE,CACT,GAAG,EAAE,cAAc,EACnB,KAAK,EAAE,WAAW,KACf,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;IAChC,4EAA4E;IAC5E,SAAS,EAAE,CAAC,GAAG,EAAE,cAAc,KAAK,MAAM,CAAC,MAAM,CAAC,WAAW,GAAG,IAAI,EAAE,KAAK,CAAC,CAAC;IAC7E,mDAAmD;IACnD,WAAW,EAAE,CAAC,GAAG,EAAE,cAAc,KAAK,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;CAClE;;;4CA6LsB,gBAAgB,YACnB,cAAc;;2BAeP,SAAS;;;;;kCAhDrB,WAAW,KACjB,MAAM,CAAC,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC;+BA4HJ,gBAAgB,YAAY,cAAc;;;;AAxRzE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AACH,qBAAa,sBAAuB,SAAQ,2BAgQ3C;CAAG"}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { Effect } from 'effect';
|
|
2
|
+
import { NetworkError, ResponseError } from '../errors.js';
|
|
3
|
+
import { SpiderLogger } from '../Logging/SpiderLogger.service.js';
|
|
4
|
+
declare const ScraperService_base: Effect.Service.Class<ScraperService, "@jambudipa.io/ScraperService", { // Effect.Service base class extended by ScraperService; the string literal is the service key
|
|
5
|
+
readonly effect: Effect.Effect<{ // effect that builds the service object exposing fetchAndParse
|
|
6
|
+
/**
|
|
7
|
+
* Fetches a URL and parses the HTML to extract basic page information.
|
|
8
|
+
*
|
|
9
|
+
* This method performs the following operations:
|
|
10
|
+
* 1. Fetches the URL with configurable timeout (30 seconds)
|
|
11
|
+
* 2. Validates content type (skips binary files)
|
|
12
|
+
* 3. Parses HTML content with cheerio
|
|
13
|
+
* 4. Extracts basic page metadata (title, description, etc.)
|
|
14
|
+
* 5. Returns structured PageData object
|
|
15
|
+
*
|
|
16
|
+
* The method uses AbortController for proper timeout handling to prevent
|
|
17
|
+
* workers from hanging on malformed URLs or slow responses.
|
|
18
|
+
*
|
|
19
|
+
* @param url - The URL to fetch and parse
|
|
20
|
+
* @param depth - The crawl depth for logging purposes (default: 0)
|
|
21
|
+
* @returns Effect that succeeds with the page data (url, html, optional title, metadata, status code, headers, fetch time, scrape duration, depth) and requires SpiderLogger
|
|
22
|
+
* @throws NetworkError for network-related failures (surfaced in the Effect error channel)
|
|
23
|
+
* @throws ResponseError for HTTP error responses; the declared error channel also includes ParseError from effect/ParseResult
|
|
24
|
+
*
|
|
25
|
+
* @example
|
|
26
|
+
* Basic usage:
|
|
27
|
+
* ```typescript
|
|
28
|
+
* const pageData = yield* scraper.fetchAndParse('https://example.com');
|
|
29
|
+
* console.log(`Page title: ${pageData.title}`);
|
|
30
|
+
* ```
|
|
31
|
+
*
|
|
32
|
+
* With depth tracking:
|
|
33
|
+
* ```typescript
|
|
34
|
+
* const pageData = yield* scraper.fetchAndParse('https://example.com/page', 2);
|
|
35
|
+
* ```
|
|
36
|
+
*
|
|
37
|
+
* Error handling:
|
|
38
|
+
* ```typescript
|
|
39
|
+
* const result = yield* scraper.fetchAndParse('https://example.com').pipe(
|
|
40
|
+
* Effect.catchTags({
|
|
41
|
+
* NetworkError: (error) => {
|
|
42
|
+
* console.log('Network error:', error.message);
|
|
43
|
+
* return Effect.succeed(null);
|
|
44
|
+
* },
|
|
45
|
+
* ResponseError: (error) => {
|
|
46
|
+
* console.log('HTTP error:', error.statusCode);
|
|
47
|
+
* return Effect.succeed(null);
|
|
48
|
+
* }
|
|
49
|
+
* })
|
|
50
|
+
* );
|
|
51
|
+
* ```
|
|
52
|
+
*
|
|
53
|
+
* @performance
|
|
54
|
+
* - Request timeout: 30 seconds
|
|
55
|
+
* - Response parsing timeout: 10 seconds
|
|
56
|
+
* - Memory usage: ~2-5MB per page depending on content size
|
|
57
|
+
*
|
|
58
|
+
* @security
|
|
59
|
+
* - Validates content types to prevent processing binary files
|
|
60
|
+
* - Uses AbortController to prevent hanging requests
|
|
61
|
+
* - No execution of JavaScript content (static HTML parsing only)
|
|
62
|
+
*/
|
|
63
|
+
fetchAndParse: (url: string, depth?: number) => Effect.Effect<{
|
|
64
|
+
readonly url: string;
|
|
65
|
+
readonly html: string;
|
|
66
|
+
readonly title?: string | undefined;
|
|
67
|
+
readonly metadata: {
|
|
68
|
+
readonly [x: string]: string;
|
|
69
|
+
};
|
|
70
|
+
readonly commonMetadata?: {
|
|
71
|
+
readonly description?: string | undefined;
|
|
72
|
+
readonly keywords?: string | undefined;
|
|
73
|
+
readonly author?: string | undefined;
|
|
74
|
+
readonly robots?: string | undefined;
|
|
75
|
+
} | undefined;
|
|
76
|
+
readonly statusCode: number;
|
|
77
|
+
readonly headers: {
|
|
78
|
+
readonly [x: string]: string;
|
|
79
|
+
};
|
|
80
|
+
readonly fetchedAt: Date;
|
|
81
|
+
readonly scrapeDurationMs: number;
|
|
82
|
+
readonly depth: number;
|
|
83
|
+
readonly extractedData?: {
|
|
84
|
+
readonly [x: string]: unknown;
|
|
85
|
+
} | undefined;
|
|
86
|
+
}, NetworkError | ResponseError | import("effect/ParseResult").ParseError, SpiderLogger>; // second argument: typed failures; third: the required SpiderLogger service
|
|
87
|
+
}, never, never>; // building the service has no typed failure and no requirements
|
|
88
|
+
}>;
|
|
89
|
+
/**
|
|
90
|
+
* Service responsible for fetching HTML content and parsing basic page information.
|
|
91
|
+
*
|
|
92
|
+
* The ScraperService handles the core HTTP fetching and HTML parsing functionality
|
|
93
|
+
* for the Spider framework. It provides robust error handling, timeout management,
|
|
94
|
+
* and content type validation to ensure reliable data extraction.
|
|
95
|
+
*
|
|
96
|
+
* **Key Features:**
|
|
97
|
+
* - Automatic timeout handling with AbortController
|
|
98
|
+
* - Content type validation (skips binary files)
|
|
99
|
+
* - Comprehensive error handling with typed errors
|
|
100
|
+
* - Performance monitoring and logging
|
|
101
|
+
* - Effect.js integration for composability
|
|
102
|
+
*
|
|
103
|
+
* **Note:** This service focuses solely on fetching and parsing HTML content.
|
|
104
|
+
* Link extraction is handled separately by LinkExtractorService for better
|
|
105
|
+
* separation of concerns and modularity.
|
|
106
|
+
*
|
|
107
|
+
* @example
|
|
108
|
+
* ```typescript
|
|
109
|
+
* const program = Effect.gen(function* () {
|
|
110
|
+
* const scraper = yield* ScraperService;
|
|
111
|
+
* const pageData = yield* scraper.fetchAndParse('https://example.com', 0);
|
|
112
|
+
* console.log(`Title: ${pageData.title}`);
|
|
113
|
+
* console.log(`Content length: ${pageData.html.length}`);
|
|
114
|
+
* });
|
|
115
|
+
* ```
|
|
116
|
+
*
|
|
117
|
+
* @group Services
|
|
118
|
+
* @public
|
|
119
|
+
*/
|
|
120
|
+
export declare class ScraperService extends ScraperService_base { // declares no members of its own; fetchAndParse comes from ScraperService_base
|
|
121
|
+
}
|
|
122
|
+
export {};
|
|
123
|
+
//# sourceMappingURL=Scraper.service.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"Scraper.service.d.ts","sourceRoot":"","sources":["../../../src/lib/Scraper/Scraper.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAU,MAAM,QAAQ,CAAC;AAGxC,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAC3D,OAAO,EAAE,YAAY,EAAE,MAAM,oCAAoC,CAAC;;;QAqC5D;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;WAwDG;6BACkB,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;AA5FjC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,qBAAa,cAAe,SAAQ,mBAqOnC;CAAG"}
|